File size: 1,278 Bytes
877add7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
"""Benchmark report generation."""

from __future__ import annotations

import json
from pathlib import Path

from app.evaluation.abstention_eval import abstention_eval
from app.evaluation.calibration_eval import calibration_eval
from app.evaluation.dosing_eval import dosing_eval
from app.evaluation.offline_policy_eval import offline_policy_eval
from app.evaluation.process_eval import process_eval
from app.evaluation.robustness_eval import robustness_eval
from app.evaluation.safety_eval import safety_eval
from app.evaluation.subgroup_eval import subgroup_eval
from app.evaluation.explainability_eval import explainability_eval


def build_benchmark_report(out_path: Path) -> dict:
    """Run every benchmark evaluation and save the combined results as JSON.

    Each evaluation function is called once, with no arguments, in the order
    listed below; its return value is stored under the matching key.

    Args:
        out_path: Destination file for the JSON report. Missing parent
            directories are created before writing.

    Returns:
        The report mapping (evaluation name -> evaluation result) that was
        serialized to ``out_path``.
    """
    # Order matters: it fixes both the call sequence and the key order in the
    # serialized report.
    evaluations = (
        ("offline_policy_eval", offline_policy_eval),
        ("safety_eval", safety_eval),
        ("dosing_eval", dosing_eval),
        ("robustness_eval", robustness_eval),
        ("calibration_eval", calibration_eval),
        ("abstention_eval", abstention_eval),
        ("process_eval", process_eval),
        ("subgroup_eval", subgroup_eval),
        ("explainability_eval", explainability_eval),
    )
    results = {label: run_eval() for label, run_eval in evaluations}

    # Make sure the target directory exists, then write ASCII-escaped,
    # 2-space-indented JSON encoded as UTF-8.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(results, ensure_ascii=True, indent=2)
    out_path.write_text(serialized, encoding="utf-8")
    return results