Spaces:
Running
Running
| from __future__ import annotations | |
| import json | |
| import re | |
| import sys | |
| from pathlib import Path | |
| from typing import Any | |
# Lower and upper bounds that clamp_score() applies to every score; MIN_SCORE
# is also the fallback returned by extract_score() when no score is found.
MIN_SCORE = 0.01
MAX_SCORE = 0.99
# Captures the unsigned decimal number following "score=" that appears after
# an "[END]" marker. "." does not match newlines here (no DOTALL flag), so the
# marker and the score must be on the same line.
END_SCORE_RE = re.compile(r"\[END\].*?\bscore=([0-9]+(?:\.[0-9]+)?)")
# Captures the run of non-whitespace characters following "[START] task=".
START_TASK_RE = re.compile(r"\[START\]\s+task=([^\s]+)")
def clamp_score(score: float) -> float:
    """Bound *score* to [MIN_SCORE, MAX_SCORE] and round it to 4 decimals."""
    # Apply the floor first and the ceiling second; the order is kept as-is
    # because it determines the result for non-comparable values such as NaN.
    floored = max(MIN_SCORE, score)
    bounded = min(MAX_SCORE, floored)
    return round(bounded, 4)
def read_payload_text() -> str:
    """Return the payload text to grade.

    If a first command-line argument is given and names a regular file, that
    file's contents are returned, decoded as UTF-8. In every other case (no
    argument, a missing path, or a path that is not a regular file) the
    payload is read from standard input instead.

    Returns:
        The full payload as a single string.
    """
    if len(sys.argv) > 1:
        path = Path(sys.argv[1])
        # is_file() rather than exists(): a directory "exists" but cannot be
        # read as text, so it must take the stdin fallback instead of raising.
        if path.is_file():
            # Explicit encoding so the result does not depend on the locale.
            return path.read_text(encoding="utf-8")
    return sys.stdin.read()
def _lookup_score(value: Any) -> float | None:
    """Search a decoded JSON value for a numeric score.

    The search proceeds as follows:

    * A bare number (int or float, but not bool) is returned as a float.
    * For a dict, the keys "score", "benchmark_score", "final_score" and
      "task_score" are checked in that order and the first numeric value
      wins. If none is numeric, the values under "success_metrics",
      "observation", "final_observation", "result" and "metrics" are
      searched recursively, in that order.
    * For a list, the elements are searched recursively in order and the
      first score found wins.

    Args:
        value: Any value produced by decoding JSON.

    Returns:
        The first score found as a float, or None if there is none.
    """
    # bool is a subclass of int, so it must be rejected explicitly: a JSON
    # true/false is a flag, not a score of 1.0/0.0.
    if isinstance(value, bool):
        return None
    if isinstance(value, (int, float)):
        return float(value)

    if isinstance(value, dict):
        # Pass 1: keys whose value is itself expected to be the score.
        for key in ("score", "benchmark_score", "final_score", "task_score"):
            candidate = value.get(key)
            if isinstance(candidate, (int, float)) and not isinstance(
                candidate, bool
            ):
                return float(candidate)
        # Pass 2: keys that may wrap a nested structure holding the score.
        for key in (
            "success_metrics",
            "observation",
            "final_observation",
            "result",
            "metrics",
        ):
            candidate = value.get(key)
            if candidate is not None:
                nested = _lookup_score(candidate)
                if nested is not None:
                    return nested
        return None

    if isinstance(value, list):
        for item in value:
            nested = _lookup_score(item)
            if nested is not None:
                return nested

    return None
def extract_score(text: str) -> float:
    """Extract a clamped score from *text*.

    The text is stripped first. An "[END] ... score=<number>" marker takes
    precedence; otherwise the whole text is parsed as JSON and searched with
    _lookup_score(). MIN_SCORE is returned when the text is empty, is not
    valid JSON, or contains no score.
    """
    body = text.strip()
    if body == "":
        return MIN_SCORE

    end_marker = END_SCORE_RE.search(body)
    if end_marker is not None:
        return clamp_score(float(end_marker.group(1)))

    try:
        decoded = json.loads(body)
    except json.JSONDecodeError:
        return MIN_SCORE

    found = _lookup_score(decoded)
    return MIN_SCORE if found is None else clamp_score(found)
def extract_started_task(text: str) -> str | None:
    """Return the task name from the first "[START] task=<name>" marker.

    Returns None when *text* contains no such marker.
    """
    found = START_TASK_RE.search(text)
    return found.group(1) if found else None
def emit_grade(expected_task: str) -> int:
    """Grade the payload for *expected_task* and print the result as JSON.

    The payload is obtained from read_payload_text(). If the payload declares
    a started task that differs from *expected_task*, the score is forced to
    MIN_SCORE; a payload with no started-task marker keeps its extracted
    score. One compact JSON object with "task_id" and "score" is printed to
    standard output.

    Returns:
        Always 0.
    """
    raw = read_payload_text()
    started = extract_started_task(raw)
    grade = extract_score(raw)

    task_mismatch = started is not None and started != expected_task
    if task_mismatch:
        grade = MIN_SCORE

    report = {"task_id": expected_task, "score": grade}
    print(json.dumps(report, separators=(",", ":")))
    return 0