#!/usr/bin/env python3
"""Compare baseline and candidate reports to show measurable improvement."""
from __future__ import annotations

import argparse
import json
from pathlib import Path


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Compare two run reports.")
    parser.add_argument("--baseline", required=True)
    parser.add_argument("--candidate", required=True)
    parser.add_argument("--output", default="outputs/reports/improvement_report.json")
    return parser.parse_args()


def _load(path: Path) -> dict:
    """Read a JSON report, returning an empty dict when the file is missing."""
    if not path.exists():
        return {}
    return json.loads(path.read_text(encoding="utf-8"))


def _metric(payload: dict, key: str) -> float:
    """Resolve a metric value, falling back across known report layouts."""
    # A top-level key wins outright.
    if key in payload:
        return float(payload.get(key, 0.0))
    # Alternate report layouts nest metrics under these sections; guard each
    # lookup so a malformed payload degrades to {} instead of raising.
    offline = payload.get("offline_policy_eval", {}) if isinstance(payload.get("offline_policy_eval"), dict) else {}
    process = payload.get("process_eval", {}) if isinstance(payload.get("process_eval"), dict) else {}
    ablations = payload.get("policy_stack_ablations", {}) if isinstance(payload.get("policy_stack_ablations"), dict) else {}
    llm_bandit = (
        payload.get("ablations", {}).get("llm_bandit", {})
        if isinstance(payload.get("ablations"), dict)
        else {}
    )
    # Candidate locations per metric, in priority order; the first non-None wins.
    mapping = {
        "avg_reward": [
            offline.get("avg_reward"),
            llm_bandit.get("avg_reward"),
            ablations.get("llm+bandit", {}).get("avg_reward"),
        ],
        "legality_rate": [
            offline.get("legal_rate"),
            llm_bandit.get("legality_rate"),
            ablations.get("llm+bandit", {}).get("legality_rate"),
        ],
        "success_rate": [offline.get("success_rate"), llm_bandit.get("success_rate")],
        "avg_process_fidelity": [process.get("process_fidelity"), llm_bandit.get("avg_process_fidelity")],
        "timeout_rate": [payload.get("timeout_rate"), llm_bandit.get("timeout_rate")],
        "failure_visible_rate": [payload.get("failure_visible_rate"), llm_bandit.get("failure_visible_rate")],
    }
    for value in mapping.get(key, []):
        if value is not None:
            return float(value)
    return 0.0


def main() -> None:
    args = parse_args()
    baseline = _load(Path(args.baseline))
    candidate = _load(Path(args.candidate))
    keys = [
        "avg_reward",
        "legality_rate",
        "success_rate",
        "avg_process_fidelity",
        "timeout_rate",
        "failure_visible_rate",
    ]
    # A positive delta means the candidate improved on that metric.
    deltas = {}
    for key in keys:
        b = _metric(baseline, key)
        c = _metric(candidate, key)
        deltas[key] = round(c - b, 6)
    # The gate requires non-regression on the three headline metrics only.
    gate = {
        "avg_reward_up": deltas["avg_reward"] >= 0.0,
        "legality_up": deltas["legality_rate"] >= 0.0,
        "success_up": deltas["success_rate"] >= 0.0,
    }
    payload = {
        "status": "ok",
        "baseline": str(args.baseline),
        "candidate": str(args.candidate),
        "deltas": deltas,
        "gate": gate,
        "improved": all(gate.values()),
    }
    out = Path(args.output)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps(payload, ensure_ascii=True, indent=2), encoding="utf-8")
    print("evaluate_compare_runs_done")


if __name__ == "__main__":
    main()
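
# Example invocation (a sketch; the script filename and report paths are
# assumptions, not given in this file):
#
#   python evaluate_compare_runs.py \
#       --baseline outputs/reports/baseline_report.json \
#       --candidate outputs/reports/candidate_report.json
#
# The written improvement_report.json has the shape of `payload` above,
# e.g. (illustrative values):
#
#   {
#     "status": "ok",
#     "baseline": "outputs/reports/baseline_report.json",
#     "candidate": "outputs/reports/candidate_report.json",
#     "deltas": {"avg_reward": 0.012, "legality_rate": 0.0, ...},
#     "gate": {"avg_reward_up": true, "legality_up": true, "success_up": true},
#     "improved": true
#   }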