import numpy as np
import pandas as pd

def generate_captcha_behavior_dataset_v1(n_samples: int = 50000,
                                         random_seed: int = 42,
                                         noise_level: float = 0.5,
                                         *,
                                         include_latent: bool = False) -> pd.DataFrame:
    """
    Simulate human vs bot behavior on a CAPTCHA flow.

    Each row is one simulated session. A binary label is drawn first, then a
    discrete latent user type, then three continuous latent traits; every
    observed feature is a noisy function of those latents.

    Parameters
    ----------
    n_samples
        Number of rows to generate. Must be non-negative.
    random_seed
        Seed passed to ``np.random.default_rng``; the same seed reproduces
        the same dataset.
    noise_level
        Non-negative multiplier on the extra noise added to the latent traits
        and to most observed features (History_Captcha_Count uses a fixed
        noise scale).
    include_latent
        When True, append the latent columns listed below to the result.
        Defaults to False, which returns only the observed columns and label.

    Returns
    -------
    pandas.DataFrame
        Columns, in order:
          - Check_Time
          - Challenge_Time
          - Challenge_Errors
          - Mouse_Path_Entropy
          - Click_Speed
          - Scroll_Count
          - History_Captcha_Success
          - History_Captcha_Count
          - IP_Suspicious
          - Device_Trust_Score
          - Is_Human

        Plus, when ``include_latent`` is True (for debugging):
          - user_type (0..2 for humans, 3..5 for bots)
          - motor_skill
          - visual_skill
          - patience
        The trait columns hold the final values (including the global trait
        noise) that were used to derive the observed features.

    Raises
    ------
    ValueError
        If ``n_samples`` or ``noise_level`` is negative.
    """
    # Fail early with a clear message instead of an opaque NumPy error from
    # deep inside the sampling code (NumPy raises ValueError for both cases).
    if n_samples < 0:
        raise ValueError(f"n_samples must be non-negative, got {n_samples}")
    if noise_level < 0:
        raise ValueError(f"noise_level must be non-negative, got {noise_level}")

    rng = np.random.default_rng(random_seed)

    # --------------------------
    # 1. Label
    # --------------------------
    # Slightly more humans than bots
    is_human = rng.choice([0, 1], size=n_samples, p=[0.4, 0.6])

    # --------------------------
    # 2. Latent discrete user type
    # --------------------------
    # For humans: 0 = careful, 1 = rushed, 2 = distracted
    # For bots:   3 = script, 4 = stealth, 5 = solver
    user_type = np.empty(n_samples, dtype=int)

    human_mask = is_human == 1
    user_type[human_mask] = rng.choice(
        [0, 1, 2],
        size=human_mask.sum(),
        p=[0.4, 0.4, 0.2],  # mostly careful/rushed
    )

    bot_mask = is_human == 0
    user_type[bot_mask] = rng.choice(
        [3, 4, 5],
        size=bot_mask.sum(),
        p=[0.5, 0.3, 0.2],  # mostly script; some stealth/solver
    )

    # --------------------------
    # 3. Latent continuous traits
    # --------------------------
    motor_skill = np.zeros(n_samples)
    visual_skill = np.zeros(n_samples)
    patience = np.zeros(n_samples)

    # user_type -> (motor mean, visual mean, patience mean).
    # Iteration order matters: it fixes the order of RNG draws and therefore
    # the exact dataset produced for a given seed.
    trait_means = {
        0: (1.2, 1.2, 1.3),   # careful humans: good motor, good visual, high patience
        1: (1.0, 1.1, 0.2),   # rushed humans: decent motor, good visual, low patience
        2: (0.8, 0.8, 0.5),   # distracted humans: okay motor, okay visual
        3: (0.3, 0.2, -0.2),  # script bots: poor motor, low visual, very low patience
        4: (1.1, 0.7, 0.8),   # stealth bots: good motor, moderate visual, fake patience
        5: (0.9, 1.4, 0.1),   # solver bots: okay motor, very good visual, low patience
    }
    trait_sd = 0.5
    for type_id, (m_mean, v_mean, p_mean) in trait_means.items():
        type_mask = user_type == type_id
        type_count = type_mask.sum()
        motor_skill[type_mask] = rng.normal(m_mean, trait_sd, type_count)
        visual_skill[type_mask] = rng.normal(v_mean, trait_sd, type_count)
        patience[type_mask] = rng.normal(p_mean, trait_sd, type_count)

    # add global noise to traits
    motor_skill += rng.normal(0, 0.2 * noise_level, n_samples)
    visual_skill += rng.normal(0, 0.2 * noise_level, n_samples)
    patience += rng.normal(0, 0.2 * noise_level, n_samples)

    # --------------------------
    # 4. Observed features
    # --------------------------

    # 4.1 Check_Time: time to click "I'm not a robot"
    base_check = (
        1.5
        - 0.5 * motor_skill    # better motor -> quicker
        - 0.4 * patience       # more patient -> quicker
        + 0.3 * (user_type == 2)  # distracted humans sometimes slower
        + 0.3 * (user_type == 3)  # script-bot offset; capped toward a minimum below
    )
    check_time = base_check + rng.normal(0, 0.3 + 0.3*noise_level, n_samples)
    check_time = np.clip(check_time, 0.05, None)

    # Script bots often act at near-minimum time: take the smaller of the
    # modelled time and a draw around 0.15, keeping the 0.05 floor.
    script_mask = user_type == 3
    check_time[script_mask] = np.minimum(
        check_time[script_mask],
        rng.normal(0.15, 0.05, script_mask.sum())
    ).clip(0.05, None)

    # 4.2 Challenge_Time: time on the image grid challenge (if shown)
    # high visual_skill & patience -> lower time
    base_challenge = (
        4.0
        - 0.8 * visual_skill
        - 0.3 * patience
        + 0.3 * (user_type == 2)    # distracted humans slower
        + 0.2 * (user_type == 4)    # stealth bots artificially slow sometimes
    )
    challenge_time = base_challenge + rng.normal(0, 0.8 + 0.5*noise_level, n_samples)
    challenge_time = np.clip(challenge_time, 0.3, None)

    # 4.3 Challenge_Errors: number of wrong tiles clicked
    # low visual_skill -> more errors; solver bots very good
    error_rate = np.exp(
        -0.7 * visual_skill
        + 0.4 * (user_type == 3)  # script
        - 0.6 * (user_type == 5)  # solver
        + rng.normal(0, 0.3*noise_level, n_samples)
    )
    error_rate = np.clip(error_rate, 0.05, 4.0)
    challenge_errors = rng.poisson(error_rate)

    # 4.4 Mouse_Path_Entropy: higher for humans & stealth bots,
    # low for script bots (straight lines).
    base_entropy = (
        1.0
        + 0.6 * motor_skill
        + 0.4 * patience
        - 0.8 * (user_type == 3)   # script bots: low entropy
        + 0.3 * (user_type == 4)   # stealth bots: extra "wobble"
    )
    mouse_path_entropy = base_entropy + rng.normal(0, 0.3 + 0.3*noise_level, n_samples)
    mouse_path_entropy = np.clip(mouse_path_entropy, 0.1, None)

    # 4.5 Click_Speed: clicks per second during challenge
    base_click_speed = (
        1.5
        + 0.4 * motor_skill
        - 0.3 * patience
        + 0.5 * (user_type == 3)  # script: very quick
        + 0.2 * (user_type == 5)  # solver: efficient
    )
    click_speed = base_click_speed + rng.normal(0, 0.4 + 0.3*noise_level, n_samples)
    click_speed = np.clip(click_speed, 0.2, None)

    # 4.6 Scroll_Count: number of scroll events in the session
    base_scroll = (
        1.0
        + 1.0 * patience
        + 0.2 * (user_type == 2)  # distracted humans scroll more
        - 0.7 * (user_type == 3)  # script bots barely scroll
    )
    # Log-linear Poisson rate, bounded to [0.2, 20].
    scroll_lambda = np.clip(
        np.exp(base_scroll + rng.normal(0, 0.4*noise_level, n_samples)),
        0.2, 20
    )
    scroll_count = rng.poisson(scroll_lambda)

    # 4.7 History_Captcha_Count: how many captchas seen before by this entity
    # more for power users, also for persistent bots
    base_hist_count = (
        5
        + 4 * (user_type == 0)   # careful humans often longstanding accounts
        + 3 * (user_type == 1)
        + 1 * (user_type == 2)
        + 5 * (user_type == 4)   # stealth bots reused
        + 7 * (user_type == 5)   # solver services
        + rng.normal(0, 3.0, n_samples)
    )
    # Floor at zero, then truncate to whole counts.
    history_captcha_count = np.clip(base_hist_count, 0, None).astype(int)

    # 4.8 History_Captcha_Success: empirical success rate
    # depends on latent type but also noisy and overlapping
    success_logit = (
        0.4 * visual_skill
        + 0.3 * motor_skill
        + 0.1 * patience
        - 0.4 * (user_type == 3)  # script
        + 0.3 * (user_type == 0)  # careful humans
        + rng.normal(0, 1.0 + 0.4*noise_level, n_samples)
    )
    success_prob = 1 / (1 + np.exp(-success_logit))
    success_prob = np.clip(success_prob, 0.05, 0.95)

    # "observed" success rate: noisy estimate from binomial over historical count.
    # At least one trial is used so the ratio is defined when the count is 0.
    hist_trials = np.maximum(history_captcha_count, 1)
    successes = rng.binomial(hist_trials, success_prob)
    history_captcha_success = successes / hist_trials

    # 4.9 IP_Suspicious: from external risk system
    # depends on user_type but with lots of noise and overlap
    ip_logit = (
        -0.7 * is_human
        + 0.3 * (user_type == 3)
        + 0.2 * (user_type == 5)
        + 0.1 * (user_type == 4)
        + rng.normal(0, 1.0 + 0.5*noise_level, n_samples)
    )
    ip_prob = 1 / (1 + np.exp(-ip_logit))
    ip_prob = np.clip(ip_prob, 0.05, 0.95)
    ip_suspicious = (rng.random(n_samples) < ip_prob).astype(int)

    # 4.10 Device_Trust_Score: noisy 1D feature (unbounded, may be negative)
    device_trust = (
        0.8 * is_human
        - 0.5 * (user_type == 3)
        - 0.2 * (user_type == 5)
        + 0.3 * (user_type == 0)
        + rng.normal(0, 0.8 + 0.4*noise_level, n_samples)
    )

    data = pd.DataFrame({
        'Check_Time': check_time,
        'Challenge_Time': challenge_time,
        'Challenge_Errors': challenge_errors,
        'Mouse_Path_Entropy': mouse_path_entropy,
        'Click_Speed': click_speed,
        'Scroll_Count': scroll_count,
        'History_Captcha_Success': history_captcha_success,
        'History_Captcha_Count': history_captcha_count,
        'IP_Suspicious': ip_suspicious,
        'Device_Trust_Score': device_trust,
        'Is_Human': is_human
    })

    # Latent columns are appended after the label so the default column
    # layout is untouched for existing callers.
    if include_latent:
        data['user_type'] = user_type
        data['motor_skill'] = motor_skill
        data['visual_skill'] = visual_skill
        data['patience'] = patience

    return data

# Example usage:
if __name__ == "__main__":
    # Build a 5000-row sample with a fixed seed and save it as CSV,
    # leaving the DataFrame index out of the file.
    output_path = "captcha_behavior_dataset_v1.csv"
    sample_frame = generate_captcha_behavior_dataset_v1(random_seed=0, n_samples=5000)
    sample_frame.to_csv(output_path, index=False)