"""Tests for Mode C evaluation pipeline + stats utilities."""
from __future__ import annotations

import pytest

from eval.bootstrap_ci import bootstrap_ci, cohens_d, permutation_test
from eval.mode_c_real_cases import (
DEFAULT_DATASET,
ScriptedAnalyzerAdapter,
aggregate,
load_dataset,
per_category_breakdown,
run_eval,
)
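
# Signatures assumed by the tests below (inferred from usage here, not from
# the eval.bootstrap_ci source):
#   bootstrap_ci(samples, n_resamples=..., seed=...) -> (point_estimate, ci_lo, ci_hi)
#   permutation_test(a, b, n_permutations=..., seed=...) -> p-value
#   cohens_d(a, b) -> standardized mean difference (pooled-SD convention assumed)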


@pytest.mark.unit
def test_dataset_loads_and_is_expected_size():
data = load_dataset(DEFAULT_DATASET)
    # Lower bound; the dataset may grow as the community contributes scenarios
assert len(data) >= 130


@pytest.mark.unit
def test_dataset_has_scam_and_benign():
data = load_dataset(DEFAULT_DATASET)
scam = sum(1 for s in data if s["ground_truth"]["is_scam"])
benign = sum(1 for s in data if not s["ground_truth"]["is_scam"])
assert scam > 0
assert benign > 0
assert scam + benign == len(data)


@pytest.mark.unit
def test_dataset_has_novel_subset_for_temporal_eval():
"""Temporal generalization eval requires n>=30 for statistical power."""
data = load_dataset(DEFAULT_DATASET)
novel = [s for s in data if s["ground_truth"]["difficulty"] == "novel"]
assert len(novel) >= 30, f"Only {len(novel)} novel scenarios — need 30+ for CI"


@pytest.mark.unit
def test_bootstrap_ci_reasonable():
# 80% positive rate in a large sample → CI should bracket 0.8
samples = [1.0] * 80 + [0.0] * 20
point, lo, hi = bootstrap_ci(samples, n_resamples=1000, seed=42)
assert abs(point - 0.8) < 0.001
assert lo < 0.8 < hi
assert (hi - lo) < 0.2 # reasonably tight CI


@pytest.mark.unit
def test_permutation_test_detects_difference():
group_a = [1.0] * 50
group_b = [0.0] * 50
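    # Fully separated groups: the observed gap is the maximum possible, so
    # essentially no permutation reproduces it and p should land near
    # 1 / (n_permutations + 1) under the usual add-one p-value convention.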
p = permutation_test(group_a, group_b, n_permutations=1000, seed=42)
assert p < 0.05


@pytest.mark.unit
def test_permutation_test_no_difference():
group_a = [0.5, 0.5, 0.5, 0.5]
group_b = [0.5, 0.5, 0.5, 0.5]
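    # Every value is identical, so every permutation yields the same statistic;
    # any reasonable implementation should report p at (or near) 1.0.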
p = permutation_test(group_a, group_b, n_permutations=500, seed=42)
assert p > 0.05


@pytest.mark.unit
def test_cohens_d_large_effect():
group_a = [10.0, 11.0, 9.0, 10.5, 11.5, 9.5]
group_b = [1.0, 2.0, 0.5, 1.5, 2.5, 0.0]
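    # Worked by hand: mean_a = 10.25, mean_b = 1.25, pooled SD ≈ 0.94,
    # so d ≈ 9.0 / 0.94 ≈ 9.6, far beyond Cohen's 0.8 cutoff for "large".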
d = cohens_d(group_a, group_b)
assert d > 0.8 # large effect


@pytest.mark.integration
def test_scripted_analyzer_catches_majority_of_scams():
data = load_dataset(DEFAULT_DATASET)
analyzer = ScriptedAnalyzerAdapter()
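    # threshold is assumed to be the score cutoff above which a case is flagged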
results = run_eval(analyzer, data, threshold=0.5)
metrics = aggregate(results)
# Baseline scripted analyzer must catch at least 40% of scams (sanity bound)
assert metrics.detection_rate >= 0.40
    # A crude rule-based analyzer will inevitably flag some benign cases,
    # but a false-positive rate above 60% would make it useless
assert metrics.false_positive_rate <= 0.60


@pytest.mark.integration
def test_per_category_covers_all_scam_categories():
data = load_dataset(DEFAULT_DATASET)
analyzer = ScriptedAnalyzerAdapter()
results = run_eval(analyzer, data)
by_cat = per_category_breakdown(results)
for required_cat in (
"otp_theft",
"kyc_fraud",
"loan_app_fraud",
"investment_fraud",
"impersonation",
"benign",
):
assert required_cat in by_cat, f"Missing category: {required_cat}"
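

# ---------------------------------------------------------------------------
# Reference sketch only (NOT the eval.bootstrap_ci implementation): a minimal
# percentile bootstrap with the (point, lo, hi) shape the tests above assume.
# Kept here purely as documentation of the expected semantics.
def _percentile_bootstrap_sketch(samples, n_resamples=1000, seed=0, alpha=0.05):
    import random
    import statistics

    rng = random.Random(seed)
    point = statistics.fmean(samples)
    # Resample with replacement, take the mean of each resample, and read the
    # CI bounds off the sorted bootstrap distribution.
    means = sorted(
        statistics.fmean(rng.choices(samples, k=len(samples)))
        for _ in range(n_resamples)
    )
    lo = means[int((alpha / 2) * n_resamples)]
    hi = means[int((1 - alpha / 2) * n_resamples) - 1]
    return point, lo, hi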