Spaces:
Running
Running
| """Benchmark report generation.""" | |
| from __future__ import annotations | |
| import json | |
| from pathlib import Path | |
| from app.evaluation.abstention_eval import abstention_eval | |
| from app.evaluation.calibration_eval import calibration_eval | |
| from app.evaluation.dosing_eval import dosing_eval | |
| from app.evaluation.offline_policy_eval import offline_policy_eval | |
| from app.evaluation.process_eval import process_eval | |
| from app.evaluation.robustness_eval import robustness_eval | |
| from app.evaluation.safety_eval import safety_eval | |
| from app.evaluation.subgroup_eval import subgroup_eval | |
| from app.evaluation.explainability_eval import explainability_eval | |
def build_benchmark_report(out_path: Path) -> dict:
    """Run every evaluation suite and persist the combined report as JSON.

    Args:
        out_path: Destination file for the JSON report; parent directories
            are created if missing.

    Returns:
        The report dict mapping each suite name to its evaluation results.
    """
    # Table of suite name -> evaluator; dict order fixes the report key order.
    suites = {
        "offline_policy_eval": offline_policy_eval,
        "safety_eval": safety_eval,
        "dosing_eval": dosing_eval,
        "robustness_eval": robustness_eval,
        "calibration_eval": calibration_eval,
        "abstention_eval": abstention_eval,
        "process_eval": process_eval,
        "subgroup_eval": subgroup_eval,
        "explainability_eval": explainability_eval,
    }
    report = {name: run() for name, run in suites.items()}
    out_path.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(report, ensure_ascii=True, indent=2)
    out_path.write_text(serialized, encoding="utf-8")
    return report