# Helpers that turn an episode transcript (or a JSON result payload) into a
# single clamped score and print it as a compact JSON grade line.
from __future__ import annotations

import json
import re
import sys
from pathlib import Path
from typing import Any

# Every reported score is clamped into [MIN_SCORE, MAX_SCORE].
MIN_SCORE = 0.01
MAX_SCORE = 0.99

# Matches "score=<number>" following an "[END]" marker on the same line
# ('.' does not cross newlines).  The number may carry a decimal part and an
# exponent, so "score=1e-05" is read as 1e-05 rather than as "1".
END_SCORE_RE = re.compile(
    r"\[END\].*?\bscore=([0-9]+(?:\.[0-9]+)?(?:[eE][+-]?[0-9]+)?)"
)
# Captures the whitespace-delimited token after "[START] task=".
START_TASK_RE = re.compile(r"\[START\]\s+task=([^\s]+)")

# Keys whose value is taken directly as the score, in priority order.
_DIRECT_SCORE_KEYS = (
    "score",
    "benchmark_score",
    "final_score",
    "task_score",
)
# Keys whose value is searched recursively for a score, in priority order.
_NESTED_SCORE_KEYS = (
    "success_metrics",
    "observation",
    "final_observation",
    "result",
    "metrics",
)


def clamp_score(score: float) -> float:
    """Return *score* limited to [MIN_SCORE, MAX_SCORE], rounded to 4 places."""
    return round(min(MAX_SCORE, max(MIN_SCORE, score)), 4)


def read_payload_text() -> str:
    """Return the text to grade.

    If a first command-line argument is given and names an existing path,
    that file's contents are returned; otherwise standard input is read.
    """
    if len(sys.argv) > 1:
        path = Path(sys.argv[1])
        if path.exists():
            return path.read_text()
    return sys.stdin.read()


def _as_float(value: Any) -> float | None:
    """Return *value* as a float if it is a real number, else ``None``.

    ``bool`` is a subclass of ``int``; it is rejected here so that a JSON
    ``true``/``false`` flag is never mistaken for a score of 1.0/0.0.
    """
    if isinstance(value, bool):
        return None
    if isinstance(value, (int, float)):
        return float(value)
    return None


def _lookup_score(value: Any) -> float | None:
    """Search a decoded JSON value for a numeric score.

    A bare number is returned as-is.  In a dict, the direct score keys are
    checked first, then the nested container keys are searched recursively.
    In a list, the first item that yields a score wins.  Returns ``None``
    when nothing numeric is found.
    """
    number = _as_float(value)
    if number is not None:
        return number

    if isinstance(value, dict):
        for key in _DIRECT_SCORE_KEYS:
            direct = _as_float(value.get(key))
            if direct is not None:
                return direct
        for key in _NESTED_SCORE_KEYS:
            candidate = value.get(key)
            if candidate is None:
                continue
            nested = _lookup_score(candidate)
            if nested is not None:
                return nested

    if isinstance(value, list):
        for item in value:
            nested = _lookup_score(item)
            if nested is not None:
                return nested

    return None


def extract_score(text: str) -> float:
    """Extract a clamped score from *text*.

    The first "[END] ... score=<number>" marker takes precedence.  Without
    one, the whole text is parsed as JSON and searched for a score.  Empty
    input, undecodable JSON, or a payload with no score yields ``MIN_SCORE``.
    """
    stripped = text.strip()
    if not stripped:
        return MIN_SCORE

    match = END_SCORE_RE.search(stripped)
    if match:
        return clamp_score(float(match.group(1)))

    try:
        payload = json.loads(stripped)
    except json.JSONDecodeError:
        return MIN_SCORE

    score = _lookup_score(payload)
    if score is None:
        return MIN_SCORE
    return clamp_score(score)


def extract_started_task(text: str) -> str | None:
    """Return the task named by the first "[START] task=..." marker, if any."""
    match = START_TASK_RE.search(text)
    return match.group(1) if match else None


def emit_grade(expected_task: str) -> int:
    """Grade the payload and print ``{"task_id":...,"score":...}`` to stdout.

    The score is forced to ``MIN_SCORE`` when the payload names a started
    task that differs from *expected_task*.  A payload with no "[START]"
    marker is not penalised.  Always returns 0.
    """
    text = read_payload_text()
    observed_task = extract_started_task(text)
    score = extract_score(text)

    if observed_task is not None and observed_task != expected_task:
        score = MIN_SCORE

    print(
        json.dumps(
            {
                "task_id": expected_task,
                "score": score,
            },
            separators=(",", ":"),
        )
    )
    return 0