"""Benchmark report generation.""" from __future__ import annotations import json from pathlib import Path from app.evaluation.abstention_eval import abstention_eval from app.evaluation.calibration_eval import calibration_eval from app.evaluation.dosing_eval import dosing_eval from app.evaluation.offline_policy_eval import offline_policy_eval from app.evaluation.process_eval import process_eval from app.evaluation.robustness_eval import robustness_eval from app.evaluation.safety_eval import safety_eval from app.evaluation.subgroup_eval import subgroup_eval from app.evaluation.explainability_eval import explainability_eval def build_benchmark_report(out_path: Path) -> dict: report = { "offline_policy_eval": offline_policy_eval(), "safety_eval": safety_eval(), "dosing_eval": dosing_eval(), "robustness_eval": robustness_eval(), "calibration_eval": calibration_eval(), "abstention_eval": abstention_eval(), "process_eval": process_eval(), "subgroup_eval": subgroup_eval(), "explainability_eval": explainability_eval(), } out_path.parent.mkdir(parents=True, exist_ok=True) out_path.write_text(json.dumps(report, ensure_ascii=True, indent=2), encoding="utf-8") return report