| """Tests for Mode C evaluation pipeline + stats utilities.""" | |
| from __future__ import annotations | |
| import pytest | |
| from eval.bootstrap_ci import bootstrap_ci, cohens_d, permutation_test | |
| from eval.mode_c_real_cases import ( | |
| DEFAULT_DATASET, | |
| ScriptedAnalyzerAdapter, | |
| aggregate, | |
| load_dataset, | |
| per_category_breakdown, | |
| run_eval, | |
| ) | |
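
# NOTE: The assertions below assume each scenario is a dict whose
# "ground_truth" mapping carries at least "is_scam" (bool), "difficulty"
# (str, e.g. "novel"), and a category label. This mirrors how the tests
# index the data; it is not a schema guarantee documented here.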


def test_dataset_loads_and_is_expected_size():
    data = load_dataset(DEFAULT_DATASET)
    # Lower bound - may grow over time as the community contributes
    assert len(data) >= 130


def test_dataset_has_scam_and_benign():
    data = load_dataset(DEFAULT_DATASET)
    scam = sum(1 for s in data if s["ground_truth"]["is_scam"])
    benign = sum(1 for s in data if not s["ground_truth"]["is_scam"])
    assert scam > 0
    assert benign > 0
    assert scam + benign == len(data)


def test_dataset_has_novel_subset_for_temporal_eval():
    """Temporal generalization eval requires n >= 30 for statistical power."""
    data = load_dataset(DEFAULT_DATASET)
    novel = [s for s in data if s["ground_truth"]["difficulty"] == "novel"]
    assert len(novel) >= 30, f"Only {len(novel)} novel scenarios; need 30+ for CI"


def test_bootstrap_ci_reasonable():
    # 80% positive rate in a large sample → CI should bracket 0.8
    samples = [1.0] * 80 + [0.0] * 20
    point, lo, hi = bootstrap_ci(samples, n_resamples=1000, seed=42)
    assert abs(point - 0.8) < 0.001
    assert lo < 0.8 < hi
    assert (hi - lo) < 0.2  # reasonably tight CI
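    # The width bound is comfortable assuming bootstrap_ci returns a
    # two-sided percentile interval at roughly the 95% level (an assumption
    # about the helper, not asserted here): with n=100 and p=0.8 the standard
    # error is sqrt(0.8 * 0.2 / 100) ≈ 0.04, so the interval spans about ±0.08.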


def test_permutation_test_detects_difference():
    group_a = [1.0] * 50
    group_b = [0.0] * 50
    p = permutation_test(group_a, group_b, n_permutations=1000, seed=42)
    assert p < 0.05
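    # With completely separated groups, virtually no random relabelling
    # reproduces a mean difference this extreme, so p should sit near the
    # 1 / (n_permutations + 1) floor (assuming a difference-in-means
    # statistic), well below 0.05.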


def test_permutation_test_no_difference():
    group_a = [0.5, 0.5, 0.5, 0.5]
    group_b = [0.5, 0.5, 0.5, 0.5]
    p = permutation_test(group_a, group_b, n_permutations=500, seed=42)
    assert p > 0.05
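    # Identical groups: every permutation yields the same statistic as the
    # observed one, so p should come out as 1.0.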


def test_cohens_d_large_effect():
    group_a = [10.0, 11.0, 9.0, 10.5, 11.5, 9.5]
    group_b = [1.0, 2.0, 0.5, 1.5, 2.5, 0.0]
    d = cohens_d(group_a, group_b)
    assert d > 0.8  # large effect
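    # Cohen's d = (mean_a - mean_b) / pooled_std. Here the means differ by
    # 9.0 with a within-group SD near 0.9, giving d on the order of 10 under
    # either the sample or population pooling convention, far past the
    # conventional 0.8 "large effect" threshold.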


def test_scripted_analyzer_catches_majority_of_scams():
    data = load_dataset(DEFAULT_DATASET)
    analyzer = ScriptedAnalyzerAdapter()
    results = run_eval(analyzer, data, threshold=0.5)
    metrics = aggregate(results)
    # Baseline scripted analyzer must catch at least 40% of scams (sanity bound)
    assert metrics.detection_rate >= 0.40
    # Rule-based detection is crude, so some false positives on benign cases
    # are expected, but a rate above 60% would make the baseline useless.
    assert metrics.false_positive_rate <= 0.60
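    # detection_rate is read here as recall on scam scenarios at the 0.5
    # threshold and false_positive_rate as the share of benign scenarios
    # flagged; both names come from the metrics object returned by aggregate().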


def test_per_category_covers_all_scam_categories():
    data = load_dataset(DEFAULT_DATASET)
    analyzer = ScriptedAnalyzerAdapter()
    results = run_eval(analyzer, data)
    by_cat = per_category_breakdown(results)
    for required_cat in (
        "otp_theft",
        "kyc_fraud",
        "loan_app_fraud",
        "investment_fraud",
        "impersonation",
        "benign",
    ):
        assert required_cat in by_cat, f"Missing category: {required_cat}"