File size: 2,274 Bytes
aec2fdf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
from __future__ import annotations

import json
import sys
from pathlib import Path

# Directory one level above the folder that contains this file.
ROOT = Path(__file__).resolve().parents[1]
# Put ROOT first on the import search path before the `app` import below;
# presumably the `app` package lives directly under ROOT — TODO confirm.
sys.path.insert(0, str(ROOT))

# Imported after the sys.path change above, hence the E402 suppression.
from app.decision_engine import build_candidates, extract_evidence  # noqa: E402


def load(path: Path) -> dict:
    """Parse the UTF-8 encoded JSON file at *path* and return the result."""
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)


def ensure_expanded() -> Path:
    """Return the ``expanded`` directory under ROOT, generating it if needed.

    The dataset generator script is run, with ROOT as its working directory,
    only when ``expanded/test_pairs.json`` does not exist yet.

    Raises:
        subprocess.CalledProcessError: If the generator exits with a non-zero
            status (``check=True``).
    """
    expanded_dir = ROOT / "expanded"
    marker = expanded_dir / "test_pairs.json"
    if marker.exists():
        return expanded_dir

    # Imported lazily: only needed when the dataset has to be generated.
    import subprocess

    command = [
        sys.executable,
        "dataset/generate_dataset.py",
        "--seed-dir",
        "dataset",
        "--out",
        "expanded",
    ]
    subprocess.run(command, cwd=ROOT, check=True)
    return expanded_dir


def score_pair(out: Path, pair: dict) -> tuple[int, dict]:
    """Score one test pair and describe its highest-scoring candidate.

    Args:
        out: Directory holding the ``merchants``, ``categories``, ``triggers``
            and ``customers`` JSON sub-directories.
        pair: Mapping with ``merchant_id`` and ``trigger_id`` keys and an
            optional ``customer_id``; a missing or falsy ``customer_id`` means
            no customer record is loaded.

    Returns:
        ``(total_score, detail)`` for the candidate with the largest
        ``total_score``, or ``(0, {"reason": "no candidates"})`` when no
        candidates were built.
    """

    def read_record(folder: str, record_id) -> dict:
        # Every record is stored as <out>/<folder>/<record_id>.json.
        return load(out / folder / f"{record_id}.json")

    merchant_doc = read_record("merchants", pair["merchant_id"])
    # The category is looked up through the merchant, not through the pair.
    category_doc = read_record("categories", merchant_doc["category_slug"])
    trigger_doc = read_record("triggers", pair["trigger_id"])
    if pair.get("customer_id"):
        customer_doc = read_record("customers", pair["customer_id"])
    else:
        customer_doc = None

    evidence = extract_evidence(category_doc, merchant_doc, trigger_doc, customer_doc)
    candidates = build_candidates(
        category_doc, merchant_doc, trigger_doc, customer_doc, evidence
    )
    if candidates:
        top = max(candidates, key=lambda cand: cand.total_score)
        return top.total_score, dict(
            scores=top.rubric_scores,
            signal=top.primary_signal,
            lever=top.selected_lever,
            body=top.body,
        )
    return 0, {"reason": "no candidates"}


def main() -> int:
    """Score every canonical test pair and report those below the threshold.

    The pass threshold is read from the first command-line argument and
    defaults to 36 when no argument is given.

    Returns:
        0 when every pair scores at or above the threshold, 1 when at least
        one pair scores below it, and 2 when the threshold argument is not a
        valid integer.
    """
    threshold = 36
    if len(sys.argv) > 1:
        try:
            threshold = int(sys.argv[1])
        except ValueError:
            # Report a malformed argument as a usage error rather than an
            # uncaught traceback, and keep it distinguishable from a real
            # scoring failure (exit status 1).
            print(
                f"threshold must be an integer, got {sys.argv[1]!r}",
                file=sys.stderr,
            )
            return 2

    out = ensure_expanded()
    pairs = load(out / "test_pairs.json")["pairs"]
    failures = []
    totals = []
    for pair in pairs:
        total, detail = score_pair(out, pair)
        totals.append(total)
        if total < threshold:
            failures.append((pair["test_id"], total, detail))

    # Guard both aggregates so an empty pair list prints zeros instead of
    # raising ZeroDivisionError / ValueError.
    avg = sum(totals) / max(1, len(totals))
    lowest = min(totals) if totals else 0
    print(f"proxy scored {len(totals)} canonical pairs; avg={avg:.1f}/50 min={lowest}/50 threshold={threshold}")
    if failures:
        print("proxy score failures:")
        for test_id, total, detail in failures:
            print(f"- {test_id}: {total}/50 {detail}")
        return 1
    return 0


# Run the check when executed as a script; main()'s return value becomes the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())