from __future__ import annotations import json import sys from pathlib import Path ROOT = Path(__file__).resolve().parents[1] sys.path.insert(0, str(ROOT)) from app.decision_engine import build_candidates, extract_evidence # noqa: E402 def load(path: Path) -> dict: return json.loads(path.read_text(encoding="utf-8")) def ensure_expanded() -> Path: out = ROOT / "expanded" if not (out / "test_pairs.json").exists(): import subprocess subprocess.run([sys.executable, "dataset/generate_dataset.py", "--seed-dir", "dataset", "--out", "expanded"], cwd=ROOT, check=True) return out def score_pair(out: Path, pair: dict) -> tuple[int, dict]: merchant = load(out / "merchants" / f"{pair['merchant_id']}.json") category = load(out / "categories" / f"{merchant['category_slug']}.json") trigger = load(out / "triggers" / f"{pair['trigger_id']}.json") customer = load(out / "customers" / f"{pair['customer_id']}.json") if pair.get("customer_id") else None evidence = extract_evidence(category, merchant, trigger, customer) candidates = build_candidates(category, merchant, trigger, customer, evidence) if not candidates: return 0, {"reason": "no candidates"} best = max(candidates, key=lambda c: c.total_score) return best.total_score, { "scores": best.rubric_scores, "signal": best.primary_signal, "lever": best.selected_lever, "body": best.body, } def main() -> int: threshold = int(sys.argv[1]) if len(sys.argv) > 1 else 36 out = ensure_expanded() pairs = load(out / "test_pairs.json")["pairs"] failures = [] totals = [] for pair in pairs: total, detail = score_pair(out, pair) totals.append(total) if total < threshold: failures.append((pair["test_id"], total, detail)) avg = sum(totals) / max(1, len(totals)) print(f"proxy scored {len(totals)} canonical pairs; avg={avg:.1f}/50 min={min(totals) if totals else 0}/50 threshold={threshold}") if failures: print("proxy score failures:") for test_id, total, detail in failures: print(f"- {test_id}: {total}/50 {detail}") return 1 return 0 if __name__ == "__main__": raise SystemExit(main())