Spaces:

ujjwalpardeshi
/

chakravyuh

Running

chakravyuh / eval /rupee_weighted_eval.py

UjjwalPardeshi

deploy: latest main to HF Space

03815d6 13 days ago

6.18 kB

	"""Aggregate ₹ at risk across the bench and produce the headline economic-loss number.

	For each scenario in `data/chakravyuh-bench-v0/scenarios.jsonl` we read
	`metadata.loss_amount_inr` and join with a per-row eval log
	(`logs/eval_v2_per_row.jsonl` by default) to compute:

	- ₹ at risk (sum of loss_amount_inr across scams in the bench)
	- ₹ prevented (sum across scams the analyzer flagged early)
	- ₹ leaked (sum across scams the analyzer missed AND money was extracted)
	- bench-level prevention rate (₹ prevented / ₹ at risk)

	The headline phrasing ("Chakravyuh prevented ₹X cr of expected loss
	across the n=174 bench") is sourced from this script — never fabricated.

	Usage:
	python eval/rupee_weighted_eval.py \
	--bench data/chakravyuh-bench-v0/scenarios.jsonl \
	--eval-log logs/eval_v2_per_row.jsonl \
	--output logs/rupee_weighted_eval.json

	The eval log must be JSONL with at least these fields per row:
	scenario_id (matches `id` in the bench)
	flagged (bool)
	money_extracted (bool, optional — when absent, falls back to whether the bench's `metadata.outcome` is "money_extracted")

	If `--eval-log` is omitted the script computes only the gross ₹ at risk
	and exits — useful as a bench summary.
	"""

	from __future__ import annotations

	import argparse
	import json
	import math
	import sys
	from dataclasses import asdict, dataclass
	from pathlib import Path


	@dataclass(frozen=True)
	class RupeeReport:
	    """Immutable summary of a rupee-weighted evaluation run, as built by `compute`."""

	    # Total number of scenarios in the bench (scam and non-scam alike).
	    n_scenarios: int
	    # Number of scam scenarios that carry a positive, numeric loss amount.
	    n_scams_with_amount: int
	    # Sum of the loss amounts over those scams, in INR (rounded to 2 places).
	    rupees_at_risk: float
	    # Portion of `rupees_at_risk` belonging to scams the eval log marks as flagged.
	    rupees_prevented: float
	    # Portion belonging to scams that were not flagged and where money was extracted.
	    rupees_leaked: float
	    # rupees_prevented / rupees_at_risk, or 0.0 when nothing is at risk (rounded to 4 places).
	    prevention_rate: float
	    # Per-scenario diagnostics collected while aggregating (e.g. missing eval-log rows).
	    notes: list[str]


	def _load_bench(path: Path) -> dict[str, dict]:
	out: dict[str, dict] = {}
	for line in path.read_text(encoding="utf-8").splitlines():
	line = line.strip()
	if not line:
	continue
	row = json.loads(line)
	out[row["id"]] = row
	return out


	def _load_eval_log(path: Path) -> dict[str, dict]:
	out: dict[str, dict] = {}
	for line in path.read_text(encoding="utf-8").splitlines():
	line = line.strip()
	if not line:
	continue
	row = json.loads(line)
	scenario_id = row.get("scenario_id") or row.get("id")
	if scenario_id is None:
	continue
	out[str(scenario_id)] = row
	return out


	def compute(
	    bench: dict[str, dict],
	    eval_log: dict[str, dict] | None,
	) -> RupeeReport:
	    """Aggregate rupee totals over the bench, optionally joined with an eval log.

	    Only scam scenarios with a positive, finite ``metadata.loss_amount_inr``
	    contribute. A scenario with no ``ground_truth.is_scam`` is treated as a
	    scam. For each contributing scenario the loss is added to the at-risk
	    total; when an eval log is supplied it is also added to the prevented
	    total if the row is flagged, or to the leaked total if it is not flagged
	    and money was extracted.

	    Args:
	        bench: Scenario rows keyed by scenario id.
	        eval_log: Eval rows keyed by scenario id, or None to compute only the
	            at-risk total.

	    Returns:
	        A RupeeReport with amounts rounded to 2 places, the prevention rate
	        rounded to 4 places, and notes for rows that could not be used.
	    """
	    notes: list[str] = []
	    rupees_at_risk = 0.0
	    rupees_prevented = 0.0
	    rupees_leaked = 0.0
	    n_scams_with_amount = 0

	    for sid, scenario in bench.items():
	        metadata = scenario.get("metadata") or {}
	        ground = scenario.get("ground_truth") or {}
	        is_scam = bool(ground.get("is_scam", True))
	        if not is_scam:
	            continue
	        loss_raw = metadata.get("loss_amount_inr")
	        if loss_raw is None:
	            continue
	        try:
	            loss = float(loss_raw)
	        except (TypeError, ValueError):
	            notes.append(f"{sid}: non-numeric loss_amount_inr={loss_raw!r}")
	            continue
	        # NaN / Infinity parse as floats and slip past the `<= 0` check below.
	        # A single such value would turn every total into nan/inf and make
	        # the written report invalid JSON.
	        if not math.isfinite(loss):
	            notes.append(f"{sid}: non-finite loss_amount_inr={loss_raw!r}")
	            continue
	        if loss <= 0:
	            continue
	        n_scams_with_amount += 1
	        rupees_at_risk += loss
	        if eval_log is None:
	            continue
	        row = eval_log.get(sid)
	        if row is None:
	            # The loss stays in the at-risk total, so an unevaluated scam
	            # counts against the prevention rate.
	            notes.append(f"{sid}: no eval-log row")
	            continue
	        flagged = bool(row.get("flagged"))
	        # If money_extracted is unspecified, fall back to the bench's
	        # labelled outcome (`metadata.outcome`).
	        if "money_extracted" in row:
	            money = bool(row["money_extracted"])
	        else:
	            money = metadata.get("outcome") == "money_extracted"
	        if flagged:
	            rupees_prevented += loss
	        elif money:
	            rupees_leaked += loss

	    prevention_rate = (
	        rupees_prevented / rupees_at_risk if rupees_at_risk > 0 else 0.0
	    )
	    return RupeeReport(
	        n_scenarios=len(bench),
	        n_scams_with_amount=n_scams_with_amount,
	        rupees_at_risk=round(rupees_at_risk, 2),
	        rupees_prevented=round(rupees_prevented, 2),
	        rupees_leaked=round(rupees_leaked, 2),
	        prevention_rate=round(prevention_rate, 4),
	        notes=notes,
	    )


	def _format_inr(amount: float) -> str:
	if amount >= 1e7:
	return f"₹{amount / 1e7:.2f} cr"
	if amount >= 1e5:
	return f"₹{amount / 1e5:.2f} lakh"
	return f"₹{amount:,.0f}"


	def main() -> int:
	    """Command-line entry point: load inputs, compute the report, write and print it.

	    Parses ``--bench``, ``--eval-log`` and ``--output``, writes the report as
	    JSON to the output path (creating parent directories) and prints a
	    summary to stdout.

	    Returns:
	        0 on success; 2 when the bench file, or an explicitly requested eval
	        log, does not exist (an error is printed to stderr).
	    """
	    parser = argparse.ArgumentParser(
	        description=__doc__,
	        # The module docstring contains a usage example and a field list;
	        # the default formatter would reflow them into one paragraph.
	        formatter_class=argparse.RawDescriptionHelpFormatter,
	    )
	    parser.add_argument(
	        "--bench",
	        type=Path,
	        default=Path("data/chakravyuh-bench-v0/scenarios.jsonl"),
	        help="Path to bench scenarios JSONL.",
	    )
	    parser.add_argument(
	        "--eval-log",
	        type=Path,
	        default=None,
	        help=(
	            "Optional per-row JSONL with `scenario_id` + `flagged` "
	            "(+ optional `money_extracted`). When omitted, only ₹ at risk is computed."
	        ),
	    )
	    parser.add_argument(
	        "--output",
	        type=Path,
	        default=Path("logs/rupee_weighted_eval.json"),
	        help="Path to write the JSON report.",
	    )
	    args = parser.parse_args()

	    # Validate both inputs before loading anything, so a mistyped path gives
	    # a clean error and exit code instead of a traceback.
	    if not args.bench.exists():
	        print(f"error: bench not found: {args.bench}", file=sys.stderr)
	        return 2
	    if args.eval_log is not None and not args.eval_log.exists():
	        print(f"error: eval log not found: {args.eval_log}", file=sys.stderr)
	        return 2

	    bench = _load_bench(args.bench)
	    eval_log = _load_eval_log(args.eval_log) if args.eval_log is not None else None
	    report = compute(bench, eval_log)

	    args.output.parent.mkdir(parents=True, exist_ok=True)
	    args.output.write_text(json.dumps(asdict(report), indent=2), encoding="utf-8")

	    print(f"Bench: {report.n_scenarios} scenarios")
	    print(f"Scams with loss_amount_inr: {report.n_scams_with_amount}")
	    print(f"₹ at risk: {_format_inr(report.rupees_at_risk)}")
	    # Prevented / leaked figures only mean something when an eval log was joined.
	    if eval_log is not None:
	        print(f"₹ prevented: {_format_inr(report.rupees_prevented)}")
	        print(f"₹ leaked: {_format_inr(report.rupees_leaked)}")
	        print(f"Prevention rate: {report.prevention_rate * 100:.1f}%")
	    print(f"Wrote {args.output}")
	    return 0


	if __name__ == "__main__":
	    # Run the CLI and use main()'s return value as the process exit status.
	    sys.exit(main())