Spaces:
Sleeping
Sleeping
Clamp every grader return value strictly inside (0, 1)
Browse files
Previously graders returned raw 0.0/0.5/1.0 and were only clamped at
the grade_response boundary. If the validator introspects and calls
grader functions directly (e.g. through openenv-core task discovery),
it would see exact 0.0 and 1.0 values.
Now each of _grade_simple, _grade_join, _grade_complex wraps every
return in _clamp(x), which maps x to [0.05, 0.95]. No matter how the
validator invokes the graders, they cannot return 0.0 or 1.0.
- backend/env/tasks.py +29 -22
backend/env/tasks.py
CHANGED
|
@@ -3,10 +3,13 @@ Task definitions for the SQL agent benchmark.
|
|
| 3 |
|
| 4 |
Three difficulty tiers, each with 5 questions and a grader function.
|
| 5 |
|
| 6 |
-
Grader contract: grader(sql, rows, error, attempts) -> float in
|
| 7 |
- rows: list[dict] from the executed SQL (may be empty)
|
| 8 |
- error: str | None
|
| 9 |
- attempts: int (1-indexed count of attempts taken)
|
|
|
|
|
|
|
|
|
|
| 10 |
"""
|
| 11 |
|
| 12 |
from __future__ import annotations
|
|
@@ -18,6 +21,18 @@ from typing import Callable, Optional
|
|
| 18 |
from env.database import execute_query
|
| 19 |
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
# ─── Task Definitions ─────────────────────────────────────────────
|
| 22 |
|
| 23 |
@dataclass
|
|
@@ -50,15 +65,14 @@ def _has_required_columns(rows: list[dict], required: list[str]) -> bool:
|
|
| 50 |
|
| 51 |
|
| 52 |
def _row_count_score(rows: list[dict], min_rows: int, max_rows: Optional[int]) -> float:
|
|
|
|
| 53 |
n = len(rows)
|
| 54 |
if n == 0:
|
| 55 |
return 0.0
|
| 56 |
if n >= min_rows:
|
| 57 |
if max_rows is None or n <= max_rows:
|
| 58 |
return 1.0
|
| 59 |
-
# Over the expected maximum — might be a missing WHERE clause
|
| 60 |
return 0.5
|
| 61 |
-
# Partial result
|
| 62 |
return 0.5 * (n / min_rows)
|
| 63 |
|
| 64 |
|
|
@@ -116,16 +130,16 @@ def _grade_simple(
|
|
| 116 |
attempts: int,
|
| 117 |
) -> float:
|
| 118 |
if error:
|
| 119 |
-
return 0.0
|
| 120 |
|
| 121 |
col_ok = _has_required_columns(rows, question.expected_columns)
|
| 122 |
row_score = _row_count_score(rows, question.min_rows, question.max_rows)
|
| 123 |
|
| 124 |
if col_ok and row_score == 1.0:
|
| 125 |
-
return 1.0
|
| 126 |
if col_ok or row_score >= 0.5:
|
| 127 |
-
return 0.5
|
| 128 |
-
return 0.0
|
| 129 |
|
| 130 |
|
| 131 |
_TASK_SIMPLE = Task(
|
|
@@ -189,7 +203,7 @@ def _grade_join(
|
|
| 189 |
attempts: int,
|
| 190 |
) -> float:
|
| 191 |
if error:
|
| 192 |
-
return 0.0
|
| 193 |
|
| 194 |
col_ok = _has_required_columns(rows, [question.expected_columns[0]])
|
| 195 |
row_score = _row_count_score(rows, question.min_rows, question.max_rows)
|
|
@@ -200,9 +214,8 @@ def _grade_join(
|
|
| 200 |
elif col_ok or row_score >= 0.5:
|
| 201 |
base = 0.5
|
| 202 |
|
| 203 |
-
# Penalize extra attempts
|
| 204 |
attempt_penalty = max(0.0, 0.1 * (attempts - 1))
|
| 205 |
-
return
|
| 206 |
|
| 207 |
|
| 208 |
_TASK_JOIN = Task(
|
|
@@ -282,23 +295,21 @@ def _grade_complex(
|
|
| 282 |
attempts: int,
|
| 283 |
) -> float:
|
| 284 |
if error:
|
| 285 |
-
return 0.0
|
| 286 |
|
| 287 |
col_ok = _has_required_columns(rows, question.expected_columns)
|
| 288 |
row_score = _row_count_score(rows, question.min_rows, question.max_rows)
|
| 289 |
|
| 290 |
if not col_ok or row_score == 0.0:
|
| 291 |
-
return 0.0
|
| 292 |
|
| 293 |
-
# Hard task base max is 0.8 unless first-attempt bonus
|
| 294 |
if row_score == 1.0 and col_ok:
|
| 295 |
base = 0.8 + (0.2 if attempts == 1 else 0.0)
|
| 296 |
else:
|
| 297 |
-
base = 0.4
|
| 298 |
|
| 299 |
-
# Strict attempt penalty for hard queries
|
| 300 |
attempt_penalty = 0.1 * (attempts - 1)
|
| 301 |
-
return
|
| 302 |
|
| 303 |
|
| 304 |
_TASK_COMPLEX = Task(
|
|
@@ -330,9 +341,6 @@ def get_all_tasks() -> list[Task]:
|
|
| 330 |
return list(TASKS.values())
|
| 331 |
|
| 332 |
|
| 333 |
-
_EPS = 0.05 # wide margin so :.2f/:.3f never rounds to 0.00 or 1.00
|
| 334 |
-
|
| 335 |
-
|
| 336 |
def grade_response(
|
| 337 |
task_id: str,
|
| 338 |
question_id: str,
|
|
@@ -345,6 +353,5 @@ def grade_response(
|
|
| 345 |
question = next((q for q in task.questions if q.id == question_id), None)
|
| 346 |
if question is None:
|
| 347 |
raise ValueError(f"Unknown question_id {question_id!r} in task {task_id!r}")
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
return max(_EPS, min(1.0 - _EPS, raw))
|
|
|
|
| 3 |
|
| 4 |
Three difficulty tiers, each with 5 questions and a grader function.
|
| 5 |
|
| 6 |
+
Grader contract: grader(sql, rows, error, attempts) -> float strictly in (0, 1)
|
| 7 |
- rows: list[dict] from the executed SQL (may be empty)
|
| 8 |
- error: str | None
|
| 9 |
- attempts: int (1-indexed count of attempts taken)
|
| 10 |
+
|
| 11 |
+
All graders return values strictly in (_EPS, 1 - _EPS) so no path can
|
| 12 |
+
emit exact 0.0 or 1.0.
|
| 13 |
"""
|
| 14 |
|
| 15 |
from __future__ import annotations
|
|
|
|
| 21 |
from env.database import execute_query
|
| 22 |
|
| 23 |
|
| 24 |
+
# ─── Score clamping (strictly in (0, 1)) ──────────────────────────
|
| 25 |
+
|
| 26 |
+
_EPS = 0.05 # margin so :.2f/:.3f formatting never rounds to 0.00 or 1.00
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def _clamp(x: float) -> float:
|
| 30 |
+
"""Clamp to strictly (0, 1). NaN/None → 0.5."""
|
| 31 |
+
if x is None or x != x: # None or NaN
|
| 32 |
+
return 0.5
|
| 33 |
+
return max(_EPS, min(1.0 - _EPS, float(x)))
|
| 34 |
+
|
| 35 |
+
|
| 36 |
# ─── Task Definitions ─────────────────────────────────────────────
|
| 37 |
|
| 38 |
@dataclass
|
|
|
|
| 65 |
|
| 66 |
|
| 67 |
def _row_count_score(rows: list[dict], min_rows: int, max_rows: Optional[int]) -> float:
|
| 68 |
+
"""Returns a raw score in [0, 1]; graders must clamp before returning."""
|
| 69 |
n = len(rows)
|
| 70 |
if n == 0:
|
| 71 |
return 0.0
|
| 72 |
if n >= min_rows:
|
| 73 |
if max_rows is None or n <= max_rows:
|
| 74 |
return 1.0
|
|
|
|
| 75 |
return 0.5
|
|
|
|
| 76 |
return 0.5 * (n / min_rows)
|
| 77 |
|
| 78 |
|
|
|
|
| 130 |
attempts: int,
|
| 131 |
) -> float:
|
| 132 |
if error:
|
| 133 |
+
return _clamp(0.0)
|
| 134 |
|
| 135 |
col_ok = _has_required_columns(rows, question.expected_columns)
|
| 136 |
row_score = _row_count_score(rows, question.min_rows, question.max_rows)
|
| 137 |
|
| 138 |
if col_ok and row_score == 1.0:
|
| 139 |
+
return _clamp(1.0)
|
| 140 |
if col_ok or row_score >= 0.5:
|
| 141 |
+
return _clamp(0.5)
|
| 142 |
+
return _clamp(0.0)
|
| 143 |
|
| 144 |
|
| 145 |
_TASK_SIMPLE = Task(
|
|
|
|
| 203 |
attempts: int,
|
| 204 |
) -> float:
|
| 205 |
if error:
|
| 206 |
+
return _clamp(0.0)
|
| 207 |
|
| 208 |
col_ok = _has_required_columns(rows, [question.expected_columns[0]])
|
| 209 |
row_score = _row_count_score(rows, question.min_rows, question.max_rows)
|
|
|
|
| 214 |
elif col_ok or row_score >= 0.5:
|
| 215 |
base = 0.5
|
| 216 |
|
|
|
|
| 217 |
attempt_penalty = max(0.0, 0.1 * (attempts - 1))
|
| 218 |
+
return _clamp(base - attempt_penalty)
|
| 219 |
|
| 220 |
|
| 221 |
_TASK_JOIN = Task(
|
|
|
|
| 295 |
attempts: int,
|
| 296 |
) -> float:
|
| 297 |
if error:
|
| 298 |
+
return _clamp(0.0)
|
| 299 |
|
| 300 |
col_ok = _has_required_columns(rows, question.expected_columns)
|
| 301 |
row_score = _row_count_score(rows, question.min_rows, question.max_rows)
|
| 302 |
|
| 303 |
if not col_ok or row_score == 0.0:
|
| 304 |
+
return _clamp(0.0)
|
| 305 |
|
|
|
|
| 306 |
if row_score == 1.0 and col_ok:
|
| 307 |
base = 0.8 + (0.2 if attempts == 1 else 0.0)
|
| 308 |
else:
|
| 309 |
+
base = 0.4
|
| 310 |
|
|
|
|
| 311 |
attempt_penalty = 0.1 * (attempts - 1)
|
| 312 |
+
return _clamp(base - attempt_penalty)
|
| 313 |
|
| 314 |
|
| 315 |
_TASK_COMPLEX = Task(
|
|
|
|
| 341 |
return list(TASKS.values())
|
| 342 |
|
| 343 |
|
|
|
|
|
|
|
|
|
|
| 344 |
def grade_response(
|
| 345 |
task_id: str,
|
| 346 |
question_id: str,
|
|
|
|
| 353 |
question = next((q for q in task.questions if q.id == question_id), None)
|
| 354 |
if question is None:
|
| 355 |
raise ValueError(f"Unknown question_id {question_id!r} in task {task_id!r}")
|
| 356 |
+
# Graders already clamp internally; this is a final safety net.
|
| 357 |
+
return _clamp(task.grader(question, sql, rows, error, attempts))
|
|
|