"""Inter-annotator / rule-vs-expert agreement statistics.

v0 ships `rule_vs_expert_kappa` — Cohen's κ between the scripted analyzer's
binary is_scam predictions and the human-curated ground-truth labels. This
is NOT full inter-rater reliability (that requires two independent human
annotators). It IS a legitimate agreement measure between a rule-based
detector and the expert labels, and it provides a reproducible consistency
number for v0.2.

Full human IRR (Cohen's κ between two human labelers on a 30-scenario
sample) is deferred to v0.3.
"""

from __future__ import annotations

import argparse
import json
from pathlib import Path

from chakravyuh_env.agents.analyzer import ScriptedAnalyzer
from chakravyuh_env.schemas import ChatMessage, Observation


def cohens_kappa(a: list[int], b: list[int]) -> float:
    """Compute Cohen's κ between two equal-length sequences of 0/1 labels.

    κ = (p_o - p_e) / (1 - p_e), where p_o is the observed fraction of
    matching positions and p_e is the agreement expected by chance given
    each sequence's own positive rate.

    Returns 0.0 for empty input, and 1.0 when chance agreement is exactly
    1.0 (both sequences constant with the same label), where the formula's
    denominator would be zero.

    Raises:
        ValueError: if the two sequences differ in length.
    """
    size = len(a)
    if size != len(b):
        raise ValueError(f"length mismatch: {len(a)} vs {len(b)}")
    if not size:
        return 0.0

    # Observed agreement: share of positions where both labels coincide.
    matches = 0
    for left, right in zip(a, b):
        if left == right:
            matches += 1
    observed = matches / size

    # Chance agreement: both say 1, or both say 0, assuming independence.
    rate_a = sum(a) / size
    rate_b = sum(b) / size
    expected = rate_a * rate_b + (1 - rate_a) * (1 - rate_b)

    return 1.0 if expected == 1.0 else (observed - expected) / (1 - expected)


def scripted_label(text: str, threshold: float = 0.5) -> int:
    """Binarize the scripted analyzer's verdict on a single message.

    A fresh ``ScriptedAnalyzer`` is built with ``flag_threshold=threshold``
    and shown an observation whose chat history holds one message: ``text``
    attributed to the scammer at turn 1.

    Returns 1 when the ``score`` attribute of the analyzer's result is at
    least ``threshold``, otherwise 0.
    """
    detector = ScriptedAnalyzer(flag_threshold=threshold)
    message = ChatMessage(sender="scammer", turn=1, text=text)
    observation = Observation(
        agent_role="analyzer",
        turn=1,
        chat_history=[message],
    )
    verdict = detector.act(observation)
    if verdict.score >= threshold:
        return 1
    return 0


def _load_scenarios(path: str) -> list:
    """Parse a JSONL file into a list of records, skipping blank lines."""
    # Explicit UTF-8 keeps parsing independent of the platform's locale
    # default; the context manager guarantees the handle is closed.
    with Path(path).open(encoding="utf-8") as handle:
        return [json.loads(line) for line in handle if line.strip()]


def _landis_koch_band(kappa: float) -> str:
    """Return the interpretation label for a κ value."""
    if kappa < 0:
        return "worse than chance"
    for upper, label in (
        (0.20, "slight"),
        (0.40, "fair"),
        (0.60, "moderate"),
        (0.80, "substantial"),
    ):
        if kappa < upper:
            return label
    return "almost perfect"


def main() -> None:
    """CLI entry point: print rule-vs-expert agreement for a scenario file.

    Command-line options:
        --dataset: path to a JSONL file with one scenario per line.
        --threshold: value forwarded to ``scripted_label`` for each scenario.

    For every scenario that has at least some scammer text, the expert
    label (``ground_truth.is_scam``) is paired with the label from
    ``scripted_label``. Raw agreement, both positive rates, Cohen's κ and
    its Landis-Koch band are printed to stdout.

    Raises:
        SystemExit: when no scenario contributes scammer text, since every
            reported ratio would divide by zero.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--dataset",
        default="data/chakravyuh-bench-v0/scenarios.jsonl",
    )
    parser.add_argument("--threshold", type=float, default=0.5)
    args = parser.parse_args()

    scenarios = _load_scenarios(args.dataset)

    expert_labels: list[int] = []
    rule_labels: list[int] = []

    for scenario in scenarios:
        # Concatenate all scammer turns (for multi-turn, we give the rule
        # detector the full sequence — same privilege the expert had).
        scammer_text = " ".join(
            step["text"]
            for step in scenario["attack_sequence"]
            if step["sender"] == "scammer"
        )
        if not scammer_text:
            # Nothing for the rule detector to read; leave it out of both lists.
            continue
        expert_labels.append(1 if scenario["ground_truth"]["is_scam"] else 0)
        rule_labels.append(scripted_label(scammer_text, args.threshold))

    n = len(expert_labels)

    print(f"Dataset: {args.dataset}")
    print(f"N compared: {n}")

    if n == 0:
        # Bail out with a clear message instead of a ZeroDivisionError below.
        raise SystemExit(
            f"No scenarios with scammer text found in {args.dataset}; "
            "cannot compute agreement."
        )

    agree = sum(1 for x, y in zip(expert_labels, rule_labels) if x == y)
    kappa = cohens_kappa(expert_labels, rule_labels)

    print(f"Raw agreement: {agree}/{n} = {agree/n:.3f}")
    print(f"Expert positive rate: {sum(expert_labels)/n:.3f}")
    print(f"Rule positive rate: {sum(rule_labels)/n:.3f}")
    print(f"Cohen's κ (rule vs expert): {kappa:.3f}")

    interp = _landis_koch_band(kappa)
    print(f"Landis-Koch interpretation: {interp} agreement")


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()