"""
train/generate_eval_report.py
Regenerates `reports/eval_report.json` and `reports/eval_report.md` from a
live DebateFloor environment using the canonical
`inference_debatefloor.py:STRATEGIES`.
Why this exists (NEW-1 / FATAL-4):
- The previous reports/eval_report.json was 3 weeks old and had
`variant_id: 0` and `evidence_quality: 0.0` for every row, contradicting
the FATAL-3 + FATAL-4 server-side fixes.
- PLAN.md mentioned `pre_validation_script.py --output ... --seeds ...`
but those flags were never implemented in that script.
- This is the dedicated regeneration tool.
What it does:
- Sweeps every task registered in inference_debatefloor.STRATEGIES
(currently 5 — clean_claim, contradictory_claim, distribution_shift_claim,
coordinated_fraud, identity_fraud) × 5 distinct seeds
(7, 11, 13, 19, 25) covering all 5 variant_ids
(variant_id = abs(seed) % 5 — see app/tasks.py:548).
- Per row captures: task_id, seed, done, reward, variant_id,
evidence_quality, exploit_penalty.
- Writes JSON (schema-compatible with the previous file) + Markdown.
Usage:
$ python train/generate_eval_report.py [--base-url http://localhost:7860]
"""
from __future__ import annotations
import argparse
import json
import sys
from datetime import datetime, timezone
from pathlib import Path
from statistics import mean

# Make the inference baseline importable from the repo root.
REPO_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(REPO_ROOT))
from inference_debatefloor import ( # noqa: E402
DebateFloorClient,
STRATEGIES,
)

# Seeds chosen so that abs(seed) % 5 covers all 5 variants:
# 7 -> 2, 11 -> 1, 13 -> 3, 19 -> 4, 25 -> 0
SEEDS = [7, 11, 13, 19, 25]
TASKS = list(STRATEGIES.keys())


def run_one(client: DebateFloorClient, task_id: str, seed: int) -> dict:
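    """Run a single (task_id, seed) episode and return one report row.

    A failed step yields an error row (`done: False`, zeroed scores,
    `variant_id: None`) so the sweep can continue past it.
    """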
obs = client.reset(task_id=task_id, seed=seed)
actions = STRATEGIES[task_id](client, obs)
last = None
steps = 0
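    # Drive the strategy's actions until the environment reports done;
    # any exception during a step is converted into an error row instead
    # of aborting the whole sweep.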
for action in actions:
try:
last = client.step(action)
steps += 1
if last.get("done"):
break
except Exception as exc:
return {
"task_id": task_id,
"seed": seed,
"done": False,
"reward": 0.0,
"variant_id": None,
"evidence_quality": 0.0,
"exploit_penalty": 0.0,
"error": str(exc),
}
if last is None:
return {
"task_id": task_id,
"seed": seed,
"done": False,
"reward": 0.0,
"variant_id": None,
"evidence_quality": 0.0,
"exploit_penalty": 0.0,
"error": "no steps executed",
}
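    # Pull metrics from the final observation; the `or {}` fallbacks guard
    # against the server returning explicit nulls.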
obs = last.get("observation", {})
metadata = obs.get("metadata", {}) or {}
breakdown = obs.get("reward_breakdown", {}) or {}
return {
"task_id": task_id,
"seed": seed,
"done": bool(last.get("done", False)),
"reward": round(float(last.get("reward", 0.0)), 4),
"variant_id": int(metadata.get("variant_id", 0)),
"evidence_quality": round(float(breakdown.get("evidence_quality_score", 0.0)), 4),
"exploit_penalty": round(float(metadata.get("exploit_penalty", 0.0)), 4),
"steps": steps,
}


def write_markdown(payload: dict, path: Path) -> None:
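    """Render the report payload as a Markdown document with a per-row table."""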
rows = payload["rows"]
lines = [
"# Evaluation Report",
"",
f"Generated at: {payload['generated_at']}",
f"Base URL: {payload['base_url']}",
f"Tasks: {', '.join(sorted({r['task_id'] for r in rows}))}",
f"Seeds: {', '.join(str(s) for s in sorted({r['seed'] for r in rows}))}",
f"Distinct variant_ids: {sorted({r['variant_id'] for r in rows if r['variant_id'] is not None})}",
"",
"| Task | Seed | Variant | Steps | Done | Reward | Evidence Quality | Exploit Penalty |",
"|---|---:|---:|---:|:---:|---:|---:|---:|",
]
for r in sorted(rows, key=lambda x: (x["task_id"], x["seed"])):
done_glyph = "yes" if r["done"] else "no"
lines.append(
f"| {r['task_id']} | {r['seed']} | {r['variant_id']} | "
f"{r.get('steps', '-')} | {done_glyph} | "
f"{r['reward']:.4f} | {r['evidence_quality']:.4f} | "
f"{r['exploit_penalty']:.4f} |"
)
lines += [
"",
f"Average Reward: {payload['average_reward']:.4f}",
f"Completion Rate: {payload['completion_rate'] * 100:.2f}%",
"",
]
path.write_text("\n".join(lines), encoding="utf-8")


def main() -> int:
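    """Sweep all tasks and seeds, write both reports, and verify invariants."""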
parser = argparse.ArgumentParser(description="Regenerate reports/eval_report.{json,md}")
parser.add_argument("--base-url", default="http://localhost:7860")
parser.add_argument(
"--output-json",
default=str(REPO_ROOT / "reports" / "eval_report.json"),
)
parser.add_argument(
"--output-md",
default=str(REPO_ROOT / "reports" / "eval_report.md"),
)
args = parser.parse_args()
print(f"Generating eval report against {args.base_url}")
print(f"Tasks: {TASKS}")
print(f"Seeds: {SEEDS} (variant_ids: {sorted({abs(s) % 5 for s in SEEDS})})")
print()
rows = []
for task_id in TASKS:
for seed in SEEDS:
client = DebateFloorClient(args.base_url)
row = run_one(client, task_id, seed)
rows.append(row)
print(
f" {task_id:<28s} seed={seed:>3d} variant={row['variant_id']} "
f"reward={row['reward']:.4f} ev_q={row['evidence_quality']:.4f} "
f"exp_pen={row['exploit_penalty']:.4f} done={row['done']}"
)
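    # Aggregate: average reward is taken over completed episodes only;
    # completion rate is over all rows, error rows included.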
completed = [r for r in rows if r.get("done")]
payload = {
"generated_at": datetime.now(timezone.utc).isoformat(),
"base_url": args.base_url,
"rows": rows,
"average_reward": round(mean(r["reward"] for r in completed) if completed else 0.0, 4),
"completion_rate": round(len(completed) / len(rows) if rows else 0.0, 4),
}
out_json = Path(args.output_json)
out_md = Path(args.output_md)
out_json.parent.mkdir(parents=True, exist_ok=True)
out_json.write_text(json.dumps(payload, indent=2), encoding="utf-8")
write_markdown(payload, out_md)
print()
print(f"Wrote {out_json} ({len(rows)} rows)")
print(f"Wrote {out_md}")
print(f"Average reward: {payload['average_reward']:.4f}")
print(f"Completion rate: {payload['completion_rate'] * 100:.2f}%")
distinct_variants = sorted({r["variant_id"] for r in rows if r["variant_id"] is not None})
distinct_rewards = sorted({r["reward"] for r in rows})
nonzero_evidence = sum(1 for r in rows if r["evidence_quality"] > 0.0)
print()
print("Invariants (the FATAL-3 / FATAL-4 acceptance criteria):")
print(f" distinct variant_ids : {distinct_variants} (expected: > 1 distinct)")
print(f" distinct rewards : {len(distinct_rewards)} unique values")
print(f" rows with evidence_quality > 0 : {nonzero_evidence} / {len(rows)}")
failed = []
if len(distinct_variants) <= 1:
failed.append("FATAL-4 invariant: variant_ids still constant")
if nonzero_evidence == 0:
failed.append("FATAL-3 invariant: evidence_quality still zero everywhere")
if len(distinct_rewards) <= 1:
failed.append("rewards are constant — investigate")
if failed:
for f in failed:
print(f" FAIL: {f}")
return 1
print(" PASS: all invariants hold")
return 0


if __name__ == "__main__":
sys.exit(main())