"""Inter-annotator / rule-vs-expert agreement statistics. v0 ships `rule_vs_expert_kappa` — Cohen's κ between the scripted analyzer's binary is_scam predictions and the human-curated ground-truth labels. This is NOT full inter-rater reliability (that requires two independent human annotators). It IS a legitimate agreement measure between a rule-based detector and the expert labels, and it provides a reproducible consistency number for v0.2. Full human IRR (Cohen's κ between two human labelers on a 30-scenario sample) is deferred to v0.3. """ from __future__ import annotations import argparse import json from pathlib import Path from chakravyuh_env.agents.analyzer import ScriptedAnalyzer from chakravyuh_env.schemas import ChatMessage, Observation def cohens_kappa(a: list[int], b: list[int]) -> float: """Cohen's κ for two binary label sequences of equal length.""" if len(a) != len(b): raise ValueError(f"length mismatch: {len(a)} vs {len(b)}") n = len(a) if n == 0: return 0.0 agree = sum(1 for x, y in zip(a, b) if x == y) p_o = agree / n # Expected agreement by chance pa1 = sum(a) / n pb1 = sum(b) / n p_e = pa1 * pb1 + (1 - pa1) * (1 - pb1) if p_e == 1.0: return 1.0 return (p_o - p_e) / (1 - p_e) def scripted_label(text: str, threshold: float = 0.5) -> int: """Run the scripted analyzer on a message, return 1 if flagged, 0 else.""" analyzer = ScriptedAnalyzer(flag_threshold=threshold) obs = Observation( agent_role="analyzer", turn=1, chat_history=[ChatMessage(sender="scammer", turn=1, text=text)], ) score = analyzer.act(obs) return 1 if score.score >= threshold else 0 def main() -> None: parser = argparse.ArgumentParser() parser.add_argument( "--dataset", default="data/chakravyuh-bench-v0/scenarios.jsonl", ) parser.add_argument("--threshold", type=float, default=0.5) args = parser.parse_args() scenarios = [json.loads(l) for l in Path(args.dataset).open()] expert_labels: list[int] = [] rule_labels: list[int] = [] for s in scenarios: # Concatenate all scammer turns (for multi-turn, we give the rule # detector the full sequence — same privilege the expert had). scammer_text = " ".join( step["text"] for step in s["attack_sequence"] if step["sender"] == "scammer" ) if not scammer_text: continue expert_labels.append(1 if s["ground_truth"]["is_scam"] else 0) rule_labels.append(scripted_label(scammer_text, args.threshold)) n = len(expert_labels) agree = sum(1 for x, y in zip(expert_labels, rule_labels) if x == y) kappa = cohens_kappa(expert_labels, rule_labels) print(f"Dataset: {args.dataset}") print(f"N compared: {n}") print(f"Raw agreement: {agree}/{n} = {agree/n:.3f}") print(f"Expert positive rate: {sum(expert_labels)/n:.3f}") print(f"Rule positive rate: {sum(rule_labels)/n:.3f}") print(f"Cohen's κ (rule vs expert): {kappa:.3f}") # Interpretation band if kappa < 0: interp = "worse than chance" elif kappa < 0.20: interp = "slight" elif kappa < 0.40: interp = "fair" elif kappa < 0.60: interp = "moderate" elif kappa < 0.80: interp = "substantial" else: interp = "almost perfect" print(f"Landis-Koch interpretation: {interp} agreement") if __name__ == "__main__": main()