"""Compare baseline and candidate reports to show measurable improvement."""

from __future__ import annotations

import argparse
import json
from pathlib import Path
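# Example invocation (the script name is assumed from the completion marker
# printed at the end of main(); the report paths are illustrative):
#
#   python evaluate_compare_runs.py \
#       --baseline outputs/reports/baseline_report.json \
#       --candidate outputs/reports/candidate_report.json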
def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Compare two run reports.")
    parser.add_argument("--baseline", required=True)
    parser.add_argument("--candidate", required=True)
    parser.add_argument("--output", default="outputs/reports/improvement_report.json")
    return parser.parse_args()


def _load(path: Path) -> dict:
    # A missing report is treated as an empty payload so every metric
    # falls back to 0.0 instead of raising.
    if not path.exists():
        return {}
    return json.loads(path.read_text(encoding="utf-8"))
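# The metric lookup below tolerates several report layouts. An illustrative
# payload, with the shape inferred from the lookup order and made-up values:
#
#   {
#       "offline_policy_eval": {"avg_reward": 0.41, "legal_rate": 0.97, "success_rate": 0.62},
#       "process_eval": {"process_fidelity": 0.88},
#       "policy_stack_ablations": {"llm+bandit": {"avg_reward": 0.39, "legality_rate": 0.95}},
#       "ablations": {"llm_bandit": {"avg_reward": 0.38, "avg_process_fidelity": 0.84}},
#       "timeout_rate": 0.02,
#       "failure_visible_rate": 0.01
#   }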
def _metric(payload: dict, key: str) -> float:
    # Prefer a usable top-level value; a null entry falls through to the
    # nested sections instead of raising on float(None).
    direct = payload.get(key)
    if direct is not None:
        return float(direct)

    def _section(container: dict, name: str) -> dict:
        value = container.get(name)
        return value if isinstance(value, dict) else {}

    offline = _section(payload, "offline_policy_eval")
    process = _section(payload, "process_eval")
    ablations = _section(payload, "policy_stack_ablations")
    llm_bandit = _section(_section(payload, "ablations"), "llm_bandit")
    stack = _section(ablations, "llm+bandit")

    # Fallback candidates per metric, highest priority first.
    mapping = {
        "avg_reward": [offline.get("avg_reward"), llm_bandit.get("avg_reward"), stack.get("avg_reward")],
        "legality_rate": [offline.get("legal_rate"), llm_bandit.get("legality_rate"), stack.get("legality_rate")],
        "success_rate": [offline.get("success_rate"), llm_bandit.get("success_rate")],
        "avg_process_fidelity": [process.get("process_fidelity"), llm_bandit.get("avg_process_fidelity")],
        "timeout_rate": [payload.get("timeout_rate"), llm_bandit.get("timeout_rate")],
        "failure_visible_rate": [payload.get("failure_visible_rate"), llm_bandit.get("failure_visible_rate")],
    }
    # The first non-null candidate wins; absent metrics default to 0.0.
    for value in mapping.get(key, []):
        if value is not None:
            return float(value)
    return 0.0
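# With the illustrative payload sketched above:
#   _metric(report, "avg_reward")            -> 0.41  (offline_policy_eval wins)
#   _metric(report, "avg_process_fidelity")  -> 0.88  (process_eval)
#   _metric(report, "timeout_rate")          -> 0.02  (top-level key, returned directly)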
def main() -> None:
    args = parse_args()
    baseline = _load(Path(args.baseline))
    candidate = _load(Path(args.candidate))

    keys = [
        "avg_reward",
        "legality_rate",
        "success_rate",
        "avg_process_fidelity",
        "timeout_rate",
        "failure_visible_rate",
    ]
    deltas = {}
    for key in keys:
        b = _metric(baseline, key)
        c = _metric(candidate, key)
        deltas[key] = round(c - b, 6)

    # The gate only asks that reward, legality, and success do not regress;
    # timeout_rate and failure_visible_rate are reported but not gated.
    gate = {
        "avg_reward_up": deltas["avg_reward"] >= 0.0,
        "legality_up": deltas["legality_rate"] >= 0.0,
        "success_up": deltas["success_rate"] >= 0.0,
    }

    payload = {
        "status": "ok",
        "baseline": str(args.baseline),
        "candidate": str(args.candidate),
        "deltas": deltas,
        "gate": gate,
        "improved": all(gate.values()),
    }
    out = Path(args.output)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps(payload, ensure_ascii=True, indent=2), encoding="utf-8")
    print("evaluate_compare_runs_done")
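# Illustrative improvement_report.json written by main() (all values made up):
#
#   {
#     "status": "ok",
#     "baseline": "outputs/reports/baseline_report.json",
#     "candidate": "outputs/reports/candidate_report.json",
#     "deltas": {
#       "avg_reward": 0.03,
#       "legality_rate": 0.0,
#       "success_rate": 0.01,
#       "avg_process_fidelity": 0.04,
#       "timeout_rate": -0.01,
#       "failure_visible_rate": 0.0
#     },
#     "gate": {"avg_reward_up": true, "legality_up": true, "success_up": true},
#     "improved": true
#   }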
if __name__ == "__main__":
    main()