Spaces:
Running
Running
| """Process-fidelity evaluation.""" | |
| from __future__ import annotations | |
| from app.evaluation.simulator_rollouts import run_rollouts | |
def process_eval(episodes: int = 8, difficulty: str = "medium") -> dict[str, float]:
    """Average process-fidelity metrics over a batch of simulator rollouts.

    Args:
        episodes: Number of episodes requested from ``run_rollouts``.
        difficulty: Difficulty label forwarded unchanged to ``run_rollouts``.

    Returns:
        A dict with two keys, each rounded to six decimal places:

        * ``process_fidelity`` -- mean of every row's
          ``reward_breakdown["process_fidelity_score"]``.
        * ``avg_invalid_actions`` -- mean of every row's
          ``invalid_action_count``.

        Fields that are missing or ``None`` count as zero. Both values are
        ``0.0`` when the rollout produced no rows, so callers always see the
        same set of keys.
    """
    # Materialise once: the rows are truth-tested and then traversed twice,
    # which would silently misbehave on a one-shot iterator. ``or []`` keeps
    # the original tolerance for a ``None`` result.
    rows = list(run_rollouts(episodes=episodes, difficulty=difficulty) or [])
    if not rows:
        return {"process_fidelity": 0.0, "avg_invalid_actions": 0.0}

    # ``or`` fallbacks treat an explicit ``None`` the same as a missing key,
    # matching how ``reward_breakdown`` itself was already handled.
    fidelity_scores = [
        float((row.get("reward_breakdown") or {}).get("process_fidelity_score") or 0.0)
        for row in rows
    ]
    invalid_actions = [float(row.get("invalid_action_count") or 0) for row in rows]

    # ``rows`` is non-empty here, so the division is safe.
    count = len(rows)
    return {
        "process_fidelity": round(sum(fidelity_scores) / count, 6),
        "avg_invalid_actions": round(sum(invalid_actions) / count, 6),
    }