workflow_arena / grade /common.py
Cyber-Machine's picture
feat: implement grading system with task definitions and score extraction
b522b5c verified
from __future__ import annotations
import json
import re
import sys
from pathlib import Path
from typing import Any
# Inclusive bounds that clamp_score() confines every emitted score to.
MIN_SCORE = 0.01
MAX_SCORE = 0.99

# Captures the numeric value of the first ``score=`` field that follows an
# ``[END]`` marker on the same line ('.' does not cross newlines).  An optional
# exponent is accepted so that a value rendered in scientific notation
# (e.g. ``score=1e-05``) is captured whole instead of being truncated to its
# mantissa, which would otherwise be read as a score of 1.
END_SCORE_RE = re.compile(
    r"\[END\].*?\bscore=([0-9]+(?:\.[0-9]+)?(?:[eE][+-]?[0-9]+)?)"
)

# Captures the whitespace-delimited identifier of a ``[START] task=...`` marker.
START_TASK_RE = re.compile(r"\[START\]\s+task=([^\s]+)")
def clamp_score(score: float) -> float:
    """Confine *score* to [MIN_SCORE, MAX_SCORE] and round it to 4 decimals."""
    # Apply the lower bound first, then the upper bound, then round.
    floored = max(MIN_SCORE, score)
    capped = min(MAX_SCORE, floored)
    return round(capped, 4)
def read_payload_text() -> str:
    """Return the raw text to grade, from a file argument or from stdin.

    When the first command-line argument names an existing regular file, that
    file's contents are returned.  Otherwise (no argument, a missing path, or
    a path that is not a regular file, such as a directory) everything
    available on standard input is returned.

    The file is decoded as UTF-8 with undecodable bytes replaced, so the
    result does not depend on the platform's locale encoding and a stray
    byte in the payload cannot abort the read.
    """
    if len(sys.argv) > 1:
        path = Path(sys.argv[1])
        # is_file() rather than exists(): a directory exists but cannot be
        # read as text, so it falls through to stdin instead of raising.
        if path.is_file():
            return path.read_text(encoding="utf-8", errors="replace")
    return sys.stdin.read()
def _lookup_score(value: Any) -> float | None:
    """Search a decoded JSON value for a numeric score.

    Lookup order:

    1. A bare number is returned as a float.
    2. For a dict, the first of ``score``, ``benchmark_score``,
       ``final_score``, ``task_score`` holding a number wins; failing that,
       the containers ``success_metrics``, ``observation``,
       ``final_observation``, ``result``, ``metrics`` are searched
       recursively, in that order.
    3. For a list, items are searched recursively in order.

    Booleans are never treated as scores: ``bool`` is a subclass of ``int``,
    so without the explicit exclusion a JSON ``true``/``false`` flag would be
    reported as 1.0/0.0 and could shadow a real score found later.

    Returns:
        The first score found as a float, or ``None`` when there is none.
    """
    if isinstance(value, bool):
        return None
    if isinstance(value, (int, float)):
        return float(value)

    if isinstance(value, dict):
        # Direct score fields take priority over anything nested.
        for key in (
            "score",
            "benchmark_score",
            "final_score",
            "task_score",
        ):
            candidate = value.get(key)
            if isinstance(candidate, (int, float)) and not isinstance(
                candidate, bool
            ):
                return float(candidate)
        # Otherwise descend into the known container fields.
        for key in (
            "success_metrics",
            "observation",
            "final_observation",
            "result",
            "metrics",
        ):
            candidate = value.get(key)
            if candidate is not None:
                nested = _lookup_score(candidate)
                if nested is not None:
                    return nested
        return None

    if isinstance(value, list):
        for item in value:
            nested = _lookup_score(item)
            if nested is not None:
                return nested

    return None
def extract_score(text: str) -> float:
    """Derive a clamped score from a raw payload.

    The payload is stripped of surrounding whitespace.  If END_SCORE_RE
    matches, the captured number is used.  Otherwise the payload is parsed
    as JSON and searched with _lookup_score().  An empty payload, invalid
    JSON, or JSON without a score all yield MIN_SCORE.
    """
    body = text.strip()
    if body == "":
        return MIN_SCORE

    # A score on an [END] marker line takes precedence over JSON content.
    end_marker = END_SCORE_RE.search(body)
    if end_marker is not None:
        return clamp_score(float(end_marker.group(1)))

    try:
        decoded = json.loads(body)
    except json.JSONDecodeError:
        return MIN_SCORE

    found = _lookup_score(decoded)
    return MIN_SCORE if found is None else clamp_score(found)
def extract_started_task(text: str) -> str | None:
    """Return the task identifier captured by START_TASK_RE, or None if absent."""
    found = START_TASK_RE.search(text)
    return found.group(1) if found else None
def emit_grade(expected_task: str) -> int:
    """Grade the payload for *expected_task* and print the result as JSON.

    The payload comes from read_payload_text().  If it declares a started
    task that differs from *expected_task*, the score is forced to
    MIN_SCORE; a payload with no declared task keeps its extracted score.
    One compact JSON object with ``task_id`` and ``score`` is printed to
    stdout.

    Returns:
        Always 0.
    """
    payload_text = read_payload_text()
    started = extract_started_task(payload_text)
    grade = extract_score(payload_text)

    # Only an explicitly declared, different task invalidates the score.
    wrong_task = started is not None and started != expected_task
    report = {
        "task_id": expected_task,
        "score": MIN_SCORE if wrong_task else grade,
    }
    print(json.dumps(report, separators=(",", ":")))
    return 0