#!/usr/bin/env python3
"""Compare baseline and candidate reports to show measurable improvement."""
from __future__ import annotations

import argparse
import json
from pathlib import Path
def parse_args() -> argparse.Namespace:
    """Parse command-line options: baseline/candidate report paths and output path."""
    cli = argparse.ArgumentParser(description="Compare two run reports.")
    cli.add_argument("--baseline", required=True)
    cli.add_argument("--candidate", required=True)
    cli.add_argument("--output", default="outputs/reports/improvement_report.json")
    return cli.parse_args()
| def _load(path: Path) -> dict: | |
| if not path.exists(): | |
| return {} | |
| return json.loads(path.read_text(encoding="utf-8")) | |
| def _metric(payload: dict, key: str) -> float: | |
| if key in payload: | |
| return float(payload.get(key, 0.0)) | |
| offline = payload.get("offline_policy_eval", {}) if isinstance(payload.get("offline_policy_eval"), dict) else {} | |
| process = payload.get("process_eval", {}) if isinstance(payload.get("process_eval"), dict) else {} | |
| ablations = payload.get("policy_stack_ablations", {}) if isinstance(payload.get("policy_stack_ablations"), dict) else {} | |
| llm_bandit = ( | |
| payload.get("ablations", {}).get("llm_bandit", {}) | |
| if isinstance(payload.get("ablations"), dict) | |
| else {} | |
| ) | |
| mapping = { | |
| "avg_reward": [offline.get("avg_reward"), llm_bandit.get("avg_reward"), ablations.get("llm+bandit", {}).get("avg_reward")], | |
| "legality_rate": [offline.get("legal_rate"), llm_bandit.get("legality_rate"), ablations.get("llm+bandit", {}).get("legality_rate")], | |
| "success_rate": [offline.get("success_rate"), llm_bandit.get("success_rate")], | |
| "avg_process_fidelity": [process.get("process_fidelity"), llm_bandit.get("avg_process_fidelity")], | |
| "timeout_rate": [payload.get("timeout_rate"), llm_bandit.get("timeout_rate")], | |
| "failure_visible_rate": [payload.get("failure_visible_rate"), llm_bandit.get("failure_visible_rate")], | |
| } | |
| for value in mapping.get(key, []): | |
| if value is not None: | |
| return float(value) | |
| return 0.0 | |
def main() -> None:
    """Compare the baseline and candidate reports, then write the summary JSON."""
    args = parse_args()
    baseline = _load(Path(args.baseline))
    candidate = _load(Path(args.candidate))

    metric_keys = (
        "avg_reward",
        "legality_rate",
        "success_rate",
        "avg_process_fidelity",
        "timeout_rate",
        "failure_visible_rate",
    )
    # Per-metric candidate-minus-baseline deltas, rounded for a stable report.
    deltas = {
        key: round(_metric(candidate, key) - _metric(baseline, key), 6)
        for key in metric_keys
    }
    # Gate: the headline metrics must not regress.
    gate = {
        "avg_reward_up": deltas["avg_reward"] >= 0.0,
        "legality_up": deltas["legality_rate"] >= 0.0,
        "success_up": deltas["success_rate"] >= 0.0,
    }
    report = {
        "status": "ok",
        "baseline": str(args.baseline),
        "candidate": str(args.candidate),
        "deltas": deltas,
        "gate": gate,
        "improved": all(gate.values()),
    }
    destination = Path(args.output)
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(json.dumps(report, ensure_ascii=True, indent=2), encoding="utf-8")
    print("evaluate_compare_runs_done")
if __name__ == "__main__":  # script entry point
    main()