#!/usr/bin/env python3
"""Print a compact summary for an InvoiceOps eval JSON artifact."""
from __future__ import annotations

import argparse
import json
from pathlib import Path
from statistics import mean
from typing import Any

# Fixed display order for per-task scores.
TASK_ORDER = ("easy", "medium", "medium_plus", "hard")


def find_latest_eval() -> Path:
    """Return the lexicographically latest eval JSON under outputs/evals/."""
    candidates = sorted(Path("outputs/evals").glob("*.json"))
    if not candidates:
        raise FileNotFoundError("No eval JSON files found under outputs/evals/.")
    return candidates[-1]


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Print a compact summary for an InvoiceOps eval JSON artifact."
    )
    parser.add_argument(
        "paths",
        nargs="*",
        help="Optional eval JSON paths. Defaults to the latest file under outputs/evals/.",
    )
    return parser.parse_args()


def _safe_mean(values: list[float]) -> float | None:
    """Mean rounded to 4 places, or None for an empty list."""
    return round(mean(values), 4) if values else None


def _request_error_count(result: dict[str, Any]) -> int:
    """Count model attempts that recorded a request error."""
    attempts = result.get("model_attempts") or []
    return sum(
        1
        for attempt in attempts
        if isinstance(attempt, dict) and attempt.get("request_error")
    )


def summarize_eval(path: Path) -> dict[str, Any]:
    """Reduce one eval artifact to the scalar fields the report prints."""
    payload = json.loads(path.read_text(encoding="utf-8"))
    results = payload.get("results") or []

    task_scores: dict[str, float] = {}
    resolution_scores: list[float] = []
    evidence_scores: list[float] = []
    documentation_scores: list[float] = []
    efficiency_scores: list[float] = []
    steps: list[float] = []
    reward_lengths: list[float] = []
    fallback_count = 0
    parse_failure_count = 0
    request_error_count = 0

    for result in results:
        task_id = result.get("task_id")
        score = result.get("score")
        if isinstance(task_id, str) and isinstance(score, (int, float)):
            task_scores[task_id] = round(float(score), 4)
        if result.get("used_fallback") is True:
            fallback_count += 1
        if result.get("decision_parsed") is False:
            parse_failure_count += 1
        request_error_count += _request_error_count(result)
        if isinstance(result.get("steps_used"), (int, float)):
            steps.append(float(result["steps_used"]))
        reward_trace = result.get("reward_trace")
        if isinstance(reward_trace, list):
            reward_lengths.append(float(len(reward_trace)))
        report = result.get("submission_report")
        if not isinstance(report, dict):
            continue
        # Collect each component score that is present and numeric.
        for source, bucket in (
            ("resolution_score", resolution_scores),
            ("evidence_score", evidence_scores),
            ("documentation_score", documentation_scores),
            ("efficiency_score", efficiency_scores),
        ):
            value = report.get(source)
            if isinstance(value, (int, float)):
                bucket.append(float(value))

    return {
        "path": str(path),
        "run_id": payload.get("run_id"),
        "model_name": payload.get("model_name"),
        "mean_score": payload.get("mean_score"),
        "raw_mean_score": payload.get("raw_mean_score"),
        "strict_baseline_scoring": payload.get("strict_baseline_scoring"),
        "task_scores": task_scores,
        "fallback_count": fallback_count,
        "parse_failure_count": parse_failure_count,
        "request_error_count": request_error_count,
        "avg_resolution_score": _safe_mean(resolution_scores),
        "avg_evidence_score": _safe_mean(evidence_scores),
        "avg_documentation_score": _safe_mean(documentation_scores),
        "avg_efficiency_score": _safe_mean(efficiency_scores),
        "avg_steps_used": _safe_mean(steps),
        "avg_reward_trace_len": _safe_mean(reward_lengths),
    }


def _render(value: Any) -> str:
    """Format a numeric field to 4 places, rendering missing values as '-'."""
    return "-" if value is None else f"{value:.4f}"


def print_summary(summary: dict[str, Any]) -> None:
    print(f"path: {summary['path']}")
    print(f"run_id: {summary['run_id']}")
    print(f"model: {summary['model_name']}")
    # Guard against partial artifacts: a missing mean renders as '-' instead of
    # letting the float format spec raise TypeError on None.
    print(
        f"mean_score: {_render(summary['mean_score'])} "
        f"(raw_mean_score={_render(summary['raw_mean_score'])}, "
        f"strict_baseline_scoring={summary['strict_baseline_scoring']})"
    )
    print("tasks:")
    for task_id in TASK_ORDER:
        print(f"  {task_id}: {_render(summary['task_scores'].get(task_id))}")
    print("components:")
    for label in (
        "avg_resolution_score",
        "avg_evidence_score",
        "avg_documentation_score",
        "avg_efficiency_score",
        "avg_steps_used",
        "avg_reward_trace_len",
    ):
        print(f"  {label}: {_render(summary[label])}")
    print("health:")
    print(f"  fallbacks: {summary['fallback_count']}")
    print(f"  parse_failures: {summary['parse_failure_count']}")
    print(f"  request_errors: {summary['request_error_count']}")


def main() -> None:
    args = parse_args()
    paths = [Path(value) for value in args.paths] if args.paths else [find_latest_eval()]
    for index, path in enumerate(paths):
        if index:
            print()  # Blank line between summaries when multiple paths are given.
        print_summary(summarize_eval(path))


if __name__ == "__main__":
    main()
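
# The sketch below is illustrative, not part of the script: a minimal eval
# artifact shaped the way summarize_eval() reads it. The field names are taken
# from the lookups above; real InvoiceOps artifacts may carry additional keys,
# and every value shown is made up. The file name summarize_eval.py in the
# invocation is likewise hypothetical.
#
#   python summarize_eval.py                 # latest file under outputs/evals/
#   python summarize_eval.py run_a.json run_b.json
#
# {
#   "run_id": "demo-run",
#   "model_name": "example-model",
#   "mean_score": 0.7812,
#   "raw_mean_score": 0.7650,
#   "strict_baseline_scoring": true,
#   "results": [
#     {
#       "task_id": "easy",
#       "score": 0.9,
#       "used_fallback": false,
#       "decision_parsed": true,
#       "steps_used": 12,
#       "reward_trace": [0.1, 0.4, 0.4],
#       "model_attempts": [{"request_error": null}],
#       "submission_report": {
#         "resolution_score": 1.0,
#         "evidence_score": 0.8,
#         "documentation_score": 0.75,
#         "efficiency_score": 0.6
#       }
#     }
#   ]
# }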