| """Tests for Mode C evaluation pipeline + stats utilities.""" | |
| from __future__ import annotations | |
| import pytest | |
| from eval.bootstrap_ci import bootstrap_ci, cohens_d, permutation_test | |
| from eval.mode_c_real_cases import ( | |
| DEFAULT_DATASET, | |
| ScriptedAnalyzerAdapter, | |
| aggregate, | |
| load_dataset, | |
| per_category_breakdown, | |
| run_eval, | |
| ) | |
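
# NOTE: The assertions below assume each scenario is a dict whose
# "ground_truth" mapping carries at least "is_scam" (bool), "difficulty"
# (str, e.g. "novel"), and a category label. This mirrors how the tests
# index the data; it is not a schema guarantee documented here.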


def test_dataset_loads_and_is_expected_size():
    data = load_dataset(DEFAULT_DATASET)
    # Lower bound - may grow over time as the community contributes
    assert len(data) >= 130


def test_dataset_has_scam_and_benign():
    data = load_dataset(DEFAULT_DATASET)
    scam = sum(1 for s in data if s["ground_truth"]["is_scam"])
    benign = sum(1 for s in data if not s["ground_truth"]["is_scam"])
    assert scam > 0
    assert benign > 0
    assert scam + benign == len(data)


def test_dataset_has_novel_subset_for_temporal_eval():
    """Temporal generalization eval requires n >= 30 for statistical power."""
    data = load_dataset(DEFAULT_DATASET)
    novel = [s for s in data if s["ground_truth"]["difficulty"] == "novel"]
    assert len(novel) >= 30, f"Only {len(novel)} novel scenarios; need 30+ for CI"


def test_bootstrap_ci_reasonable():
    # 80% positive rate in a large sample → CI should bracket 0.8
    samples = [1.0] * 80 + [0.0] * 20
    point, lo, hi = bootstrap_ci(samples, n_resamples=1000, seed=42)
    assert abs(point - 0.8) < 0.001
    assert lo < 0.8 < hi
    assert (hi - lo) < 0.2  # reasonably tight CI
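    # The width bound is comfortable assuming bootstrap_ci returns a
    # two-sided percentile interval at roughly the 95% level (an assumption
    # about the helper, not asserted here): with n=100 and p=0.8 the standard
    # error is sqrt(0.8 * 0.2 / 100) ≈ 0.04, so the interval spans about ±0.08.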


def test_permutation_test_detects_difference():
    group_a = [1.0] * 50
    group_b = [0.0] * 50
    p = permutation_test(group_a, group_b, n_permutations=1000, seed=42)
    assert p < 0.05
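    # With completely separated groups, virtually no random relabelling
    # reproduces a mean difference this extreme, so p should sit near the
    # 1 / (n_permutations + 1) floor (assuming a difference-in-means
    # statistic), well below 0.05.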


def test_permutation_test_no_difference():
    group_a = [0.5, 0.5, 0.5, 0.5]
    group_b = [0.5, 0.5, 0.5, 0.5]
    p = permutation_test(group_a, group_b, n_permutations=500, seed=42)
    assert p > 0.05
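    # Identical groups: every permutation yields the same statistic as the
    # observed one, so p should come out as 1.0.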


def test_cohens_d_large_effect():
    group_a = [10.0, 11.0, 9.0, 10.5, 11.5, 9.5]
    group_b = [1.0, 2.0, 0.5, 1.5, 2.5, 0.0]
    d = cohens_d(group_a, group_b)
    assert d > 0.8  # large effect
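    # Cohen's d = (mean_a - mean_b) / pooled_std. Here the means differ by
    # 9.0 with a within-group SD near 0.9, giving d on the order of 10 under
    # either the sample or population pooling convention, far past the
    # conventional 0.8 "large effect" threshold.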


def test_scripted_analyzer_catches_majority_of_scams():
    data = load_dataset(DEFAULT_DATASET)
    analyzer = ScriptedAnalyzerAdapter()
    results = run_eval(analyzer, data, threshold=0.5)
    metrics = aggregate(results)
    # Baseline scripted analyzer must catch at least 40% of scams (sanity bound)
    assert metrics.detection_rate >= 0.40
    # Rule-based detection is crude, so some false positives on benign cases
    # are expected, but a rate above 60% would make the baseline useless.
    assert metrics.false_positive_rate <= 0.60
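    # detection_rate is read here as recall on scam scenarios at the 0.5
    # threshold and false_positive_rate as the share of benign scenarios
    # flagged; both names come from the metrics object returned by aggregate().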


def test_per_category_covers_all_scam_categories():
    data = load_dataset(DEFAULT_DATASET)
    analyzer = ScriptedAnalyzerAdapter()
    results = run_eval(analyzer, data)
    by_cat = per_category_breakdown(results)
    for required_cat in (
        "otp_theft",
        "kyc_fraud",
        "loan_app_fraud",
        "investment_fraud",
        "impersonation",
        "benign",
    ):
        assert required_cat in by_cat, f"Missing category: {required_cat}"