""" train/generate_eval_report.py Regenerates `reports/eval_report.json` and `reports/eval_report.md` from a live DebateFloor environment using the canonical `inference_debatefloor.py:STRATEGIES`. Why this exists (NEW-1 / FATAL-4): - The previous reports/eval_report.json was 3 weeks old and had `variant_id: 0` and `evidence_quality: 0.0` for every row, contradicting the FATAL-3 + FATAL-4 server-side fixes. - PLAN.md mentioned `pre_validation_script.py --output ... --seeds ...` but those flags were never implemented in that script. - This is the dedicated regeneration tool. What it does: - Sweeps every task registered in inference_debatefloor.STRATEGIES (currently 5 — clean_claim, contradictory_claim, distribution_shift_claim, coordinated_fraud, identity_fraud) × 5 distinct seeds (7, 11, 13, 19, 25) covering all 5 variant_ids (variant_id = abs(seed) % 5 — see app/tasks.py:548). - Per row captures: task_id, seed, done, reward, variant_id, evidence_quality, exploit_penalty. - Writes JSON (schema-compatible with the previous file) + Markdown. Usage: $ python train/generate_eval_report.py [--base-url http://localhost:7860] """ from __future__ import annotations import argparse import json import sys from datetime import datetime, timezone from pathlib import Path from statistics import mean # Make the inference baseline importable from the repo root. REPO_ROOT = Path(__file__).resolve().parent.parent sys.path.insert(0, str(REPO_ROOT)) from inference_debatefloor import ( # noqa: E402 DebateFloorClient, STRATEGIES, ) # Seeds chosen so that abs(seed) % 5 covers all 5 variants: # 7 -> 2, 11 -> 1, 13 -> 3, 19 -> 4, 25 -> 0 SEEDS = [7, 11, 13, 19, 25] TASKS = list(STRATEGIES.keys()) def run_one(client: DebateFloorClient, task_id: str, seed: int) -> dict: obs = client.reset(task_id=task_id, seed=seed) actions = STRATEGIES[task_id](client, obs) last = None steps = 0 for action in actions: try: last = client.step(action) steps += 1 if last.get("done"): break except Exception as exc: return { "task_id": task_id, "seed": seed, "done": False, "reward": 0.0, "variant_id": None, "evidence_quality": 0.0, "exploit_penalty": 0.0, "error": str(exc), } if last is None: return { "task_id": task_id, "seed": seed, "done": False, "reward": 0.0, "variant_id": None, "evidence_quality": 0.0, "exploit_penalty": 0.0, "error": "no steps executed", } obs = last.get("observation", {}) metadata = obs.get("metadata", {}) or {} breakdown = obs.get("reward_breakdown", {}) or {} return { "task_id": task_id, "seed": seed, "done": bool(last.get("done", False)), "reward": round(float(last.get("reward", 0.0)), 4), "variant_id": int(metadata.get("variant_id", 0)), "evidence_quality": round(float(breakdown.get("evidence_quality_score", 0.0)), 4), "exploit_penalty": round(float(metadata.get("exploit_penalty", 0.0)), 4), "steps": steps, } def write_markdown(payload: dict, path: Path) -> None: rows = payload["rows"] lines = [ "# Evaluation Report", "", f"Generated at: {payload['generated_at']}", f"Base URL: {payload['base_url']}", f"Tasks: {', '.join(sorted({r['task_id'] for r in rows}))}", f"Seeds: {', '.join(str(s) for s in sorted({r['seed'] for r in rows}))}", f"Distinct variant_ids: {sorted({r['variant_id'] for r in rows if r['variant_id'] is not None})}", "", "| Task | Seed | Variant | Steps | Done | Reward | Evidence Quality | Exploit Penalty |", "|---|---:|---:|---:|:---:|---:|---:|---:|", ] for r in sorted(rows, key=lambda x: (x["task_id"], x["seed"])): done_glyph = "yes" if r["done"] else "no" lines.append( f"| {r['task_id']} | 


def run_one(client: DebateFloorClient, task_id: str, seed: int) -> dict:
    obs = client.reset(task_id=task_id, seed=seed)
    actions = STRATEGIES[task_id](client, obs)
    last = None
    steps = 0
    for action in actions:
        try:
            last = client.step(action)
            steps += 1
            if last.get("done"):
                break
        except Exception as exc:
            return {
                "task_id": task_id,
                "seed": seed,
                "done": False,
                "reward": 0.0,
                "variant_id": None,
                "evidence_quality": 0.0,
                "exploit_penalty": 0.0,
                "error": str(exc),
            }
    if last is None:
        return {
            "task_id": task_id,
            "seed": seed,
            "done": False,
            "reward": 0.0,
            "variant_id": None,
            "evidence_quality": 0.0,
            "exploit_penalty": 0.0,
            "error": "no steps executed",
        }
    obs = last.get("observation", {})
    metadata = obs.get("metadata", {}) or {}
    breakdown = obs.get("reward_breakdown", {}) or {}
    return {
        "task_id": task_id,
        "seed": seed,
        "done": bool(last.get("done", False)),
        "reward": round(float(last.get("reward", 0.0)), 4),
        "variant_id": int(metadata.get("variant_id", 0)),
        "evidence_quality": round(float(breakdown.get("evidence_quality_score", 0.0)), 4),
        "exploit_penalty": round(float(metadata.get("exploit_penalty", 0.0)), 4),
        "steps": steps,
    }


def write_markdown(payload: dict, path: Path) -> None:
    rows = payload["rows"]
    lines = [
        "# Evaluation Report",
        "",
        f"Generated at: {payload['generated_at']}",
        f"Base URL: {payload['base_url']}",
        f"Tasks: {', '.join(sorted({r['task_id'] for r in rows}))}",
        f"Seeds: {', '.join(str(s) for s in sorted({r['seed'] for r in rows}))}",
        f"Distinct variant_ids: {sorted({r['variant_id'] for r in rows if r['variant_id'] is not None})}",
        "",
        "| Task | Seed | Variant | Steps | Done | Reward | Evidence Quality | Exploit Penalty |",
        "|---|---:|---:|---:|:---:|---:|---:|---:|",
    ]
    for r in sorted(rows, key=lambda x: (x["task_id"], x["seed"])):
        done_glyph = "yes" if r["done"] else "no"
        lines.append(
            f"| {r['task_id']} | {r['seed']} | {r['variant_id']} | "
            f"{r.get('steps', '-')} | {done_glyph} | "
            f"{r['reward']:.4f} | {r['evidence_quality']:.4f} | "
            f"{r['exploit_penalty']:.4f} |"
        )
    lines += [
        "",
        f"Average Reward: {payload['average_reward']:.4f}",
        f"Completion Rate: {payload['completion_rate'] * 100:.2f}%",
        "",
    ]
    path.write_text("\n".join(lines), encoding="utf-8")


def main() -> int:
    parser = argparse.ArgumentParser(description="Regenerate reports/eval_report.{json,md}")
    parser.add_argument("--base-url", default="http://localhost:7860")
    parser.add_argument(
        "--output-json",
        default=str(REPO_ROOT / "reports" / "eval_report.json"),
    )
    parser.add_argument(
        "--output-md",
        default=str(REPO_ROOT / "reports" / "eval_report.md"),
    )
    args = parser.parse_args()

    print(f"Generating eval report against {args.base_url}")
    print(f"Tasks: {TASKS}")
    print(f"Seeds: {SEEDS} (variant_ids: {sorted({abs(s) % 5 for s in SEEDS})})")
    print()

    rows = []
    for task_id in TASKS:
        for seed in SEEDS:
            client = DebateFloorClient(args.base_url)
            row = run_one(client, task_id, seed)
            rows.append(row)
            print(
                f"  {task_id:<28s} seed={seed:>3d} variant={row['variant_id']} "
                f"reward={row['reward']:.4f} ev_q={row['evidence_quality']:.4f} "
                f"exp_pen={row['exploit_penalty']:.4f} done={row['done']}"
            )

    completed = [r for r in rows if r.get("done")]
    payload = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "base_url": args.base_url,
        "rows": rows,
        "average_reward": round(mean(r["reward"] for r in completed) if completed else 0.0, 4),
        "completion_rate": round(len(completed) / len(rows) if rows else 0.0, 4),
    }

    out_json = Path(args.output_json)
    out_md = Path(args.output_md)
    out_json.parent.mkdir(parents=True, exist_ok=True)
    out_json.write_text(json.dumps(payload, indent=2), encoding="utf-8")
    write_markdown(payload, out_md)

    print()
    print(f"Wrote {out_json} ({len(rows)} rows)")
    print(f"Wrote {out_md}")
    print(f"Average reward: {payload['average_reward']:.4f}")
    print(f"Completion rate: {payload['completion_rate'] * 100:.2f}%")

    distinct_variants = sorted({r["variant_id"] for r in rows if r["variant_id"] is not None})
    distinct_rewards = sorted({r["reward"] for r in rows})
    nonzero_evidence = sum(1 for r in rows if r["evidence_quality"] > 0.0)

    print()
    print("Invariants (the FATAL-3 / FATAL-4 acceptance criteria):")
    print(f"  distinct variant_ids           : {distinct_variants} (expected: > 1 distinct)")
    print(f"  distinct rewards               : {len(distinct_rewards)} unique values")
    print(f"  rows with evidence_quality > 0 : {nonzero_evidence} / {len(rows)}")

    failed = []
    if len(distinct_variants) <= 1:
        failed.append("FATAL-4 invariant: variant_ids still constant")
    if nonzero_evidence == 0:
        failed.append("FATAL-3 invariant: evidence_quality still zero everywhere")
    if len(distinct_rewards) <= 1:
        failed.append("rewards are constant; investigate")

    if failed:
        for f in failed:
            print(f"  FAIL: {f}")
        return 1

    print("  PASS: all invariants hold")
    return 0


if __name__ == "__main__":
    sys.exit(main())
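
# For reference, each element of payload["rows"] (as written to
# reports/eval_report.json) has the shape produced by run_one(); the values
# below are illustrative placeholders, not real output:
#
#   {
#     "task_id": "clean_claim",
#     "seed": 7,
#     "done": true,
#     "reward": 0.8123,
#     "variant_id": 2,
#     "evidence_quality": 0.6457,
#     "exploit_penalty": 0.0,
#     "steps": 6
#   }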