"""Tests for Mode C evaluation pipeline + stats utilities.""" from __future__ import annotations import pytest from eval.bootstrap_ci import bootstrap_ci, cohens_d, permutation_test from eval.mode_c_real_cases import ( DEFAULT_DATASET, ScriptedAnalyzerAdapter, aggregate, load_dataset, per_category_breakdown, run_eval, ) @pytest.mark.unit def test_dataset_loads_and_is_expected_size(): data = load_dataset(DEFAULT_DATASET) # Lower bound — may grow over time as community contributes assert len(data) >= 130 @pytest.mark.unit def test_dataset_has_scam_and_benign(): data = load_dataset(DEFAULT_DATASET) scam = sum(1 for s in data if s["ground_truth"]["is_scam"]) benign = sum(1 for s in data if not s["ground_truth"]["is_scam"]) assert scam > 0 assert benign > 0 assert scam + benign == len(data) @pytest.mark.unit def test_dataset_has_novel_subset_for_temporal_eval(): """Temporal generalization eval requires n>=30 for statistical power.""" data = load_dataset(DEFAULT_DATASET) novel = [s for s in data if s["ground_truth"]["difficulty"] == "novel"] assert len(novel) >= 30, f"Only {len(novel)} novel scenarios — need 30+ for CI" @pytest.mark.unit def test_bootstrap_ci_reasonable(): # 80% positive rate in a large sample → CI should bracket 0.8 samples = [1.0] * 80 + [0.0] * 20 point, lo, hi = bootstrap_ci(samples, n_resamples=1000, seed=42) assert abs(point - 0.8) < 0.001 assert lo < 0.8 < hi assert (hi - lo) < 0.2 # reasonably tight CI @pytest.mark.unit def test_permutation_test_detects_difference(): group_a = [1.0] * 50 group_b = [0.0] * 50 p = permutation_test(group_a, group_b, n_permutations=1000, seed=42) assert p < 0.05 @pytest.mark.unit def test_permutation_test_no_difference(): group_a = [0.5, 0.5, 0.5, 0.5] group_b = [0.5, 0.5, 0.5, 0.5] p = permutation_test(group_a, group_b, n_permutations=500, seed=42) assert p > 0.05 @pytest.mark.unit def test_cohens_d_large_effect(): group_a = [10.0, 11.0, 9.0, 10.5, 11.5, 9.5] group_b = [1.0, 2.0, 0.5, 1.5, 2.5, 0.0] d = cohens_d(group_a, group_b) assert d > 0.8 # large effect @pytest.mark.integration def test_scripted_analyzer_catches_majority_of_scams(): data = load_dataset(DEFAULT_DATASET) analyzer = ScriptedAnalyzerAdapter() results = run_eval(analyzer, data, threshold=0.5) metrics = aggregate(results) # Baseline scripted analyzer must catch at least 40% of scams (sanity bound) assert metrics.detection_rate >= 0.40 # Benign cases should have SOME false positives (rule-based is crude) # but should NOT be > 60% (otherwise the analyzer is useless) assert metrics.false_positive_rate <= 0.60 @pytest.mark.integration def test_per_category_covers_all_scam_categories(): data = load_dataset(DEFAULT_DATASET) analyzer = ScriptedAnalyzerAdapter() results = run_eval(analyzer, data) by_cat = per_category_breakdown(results) for required_cat in ( "otp_theft", "kyc_fraud", "loan_app_fraud", "investment_fraud", "impersonation", "benign", ): assert required_cat in by_cat, f"Missing category: {required_cat}"