Spaces:

ar9av
/

sql-agent-openenv

Sleeping

ar9avg committed 9 days ago

Commit

98b87b7

1 Parent(s): 263261a

Clamp every grader return value strictly inside (0, 1)

Previously graders returned raw 0.0/0.5/1.0 and were only clamped at
the grade_response boundary. If the validator introspects and calls
grader functions directly (e.g. through openenv-core task discovery),
it would see exact 0.0 and 1.0 values.

Now each of _grade_simple, _grade_join, _grade_complex wraps every
return in _clamp(x), which maps x to [0.05, 0.95]. No matter how the
validator invokes the graders, they cannot return 0.0 or 1.0.

Files changed (1) hide show

backend/env/tasks.py +29 -22

backend/env/tasks.py CHANGED Viewed

@@ -3,10 +3,13 @@ Task definitions for the SQL agent benchmark.
 Three difficulty tiers, each with 5 questions and a grader function.
-Grader contract: grader(sql, rows, error, attempts) -> float in [0.0, 1.0]
   - rows: list[dict] from the executed SQL (may be empty)
   - error: str | None
   - attempts: int (1-indexed count of attempts taken)
 """
 from __future__ import annotations
@@ -18,6 +21,18 @@ from typing import Callable, Optional
 from env.database import execute_query
 # ─── Task Definitions ─────────────────────────────────────────────
 @dataclass
@@ -50,15 +65,14 @@ def _has_required_columns(rows: list[dict], required: list[str]) -> bool:
 def _row_count_score(rows: list[dict], min_rows: int, max_rows: Optional[int]) -> float:
     n = len(rows)
     if n == 0:
         return 0.0
     if n >= min_rows:
         if max_rows is None or n <= max_rows:
             return 1.0
-        # Over the expected maximum — might be a missing WHERE clause
         return 0.5
-    # Partial result
     return 0.5 * (n / min_rows)
@@ -116,16 +130,16 @@ def _grade_simple(
     attempts: int,
 ) -> float:
     if error:
-        return 0.0
     col_ok = _has_required_columns(rows, question.expected_columns)
     row_score = _row_count_score(rows, question.min_rows, question.max_rows)
     if col_ok and row_score == 1.0:
-        return 1.0
     if col_ok or row_score >= 0.5:
-        return 0.5
-    return 0.0
 _TASK_SIMPLE = Task(
@@ -189,7 +203,7 @@ def _grade_join(
     attempts: int,
 ) -> float:
     if error:
-        return 0.0
     col_ok = _has_required_columns(rows, [question.expected_columns[0]])
     row_score = _row_count_score(rows, question.min_rows, question.max_rows)
@@ -200,9 +214,8 @@ def _grade_join(
     elif col_ok or row_score >= 0.5:
         base = 0.5
-    # Penalize extra attempts
     attempt_penalty = max(0.0, 0.1 * (attempts - 1))
-    return max(0.0, base - attempt_penalty)
 _TASK_JOIN = Task(
@@ -282,23 +295,21 @@ def _grade_complex(
     attempts: int,
 ) -> float:
     if error:
-        return 0.0
     col_ok = _has_required_columns(rows, question.expected_columns)
     row_score = _row_count_score(rows, question.min_rows, question.max_rows)
     if not col_ok or row_score == 0.0:
-        return 0.0
-    # Hard task base max is 0.8 unless first-attempt bonus
     if row_score == 1.0 and col_ok:
         base = 0.8 + (0.2 if attempts == 1 else 0.0)
     else:
-        base = 0.4  # partial
-    # Strict attempt penalty for hard queries
     attempt_penalty = 0.1 * (attempts - 1)
-    return max(0.0, base - attempt_penalty)
 _TASK_COMPLEX = Task(
@@ -330,9 +341,6 @@ def get_all_tasks() -> list[Task]:
     return list(TASKS.values())
-_EPS = 0.05  # wide margin so :.2f/:.3f never rounds to 0.00 or 1.00
 def grade_response(
     task_id: str,
     question_id: str,
@@ -345,6 +353,5 @@ def grade_response(
     question = next((q for q in task.questions if q.id == question_id), None)
     if question is None:
         raise ValueError(f"Unknown question_id {question_id!r} in task {task_id!r}")
-    raw = task.grader(question, sql, rows, error, attempts)
-    # Score must be strictly in (0, 1) exclusive per OpenEnv spec
-    return max(_EPS, min(1.0 - _EPS, raw))

 Three difficulty tiers, each with 5 questions and a grader function.
+Grader contract: grader(question, sql, rows, error, attempts) -> float strictly in (0, 1)
   - rows: list[dict] from the executed SQL (may be empty)
   - error: str | None
   - attempts: int (1-indexed count of attempts taken)
+All graders clamp their result to [_EPS, 1 - _EPS] (endpoints included), so
+no path can emit exact 0.0 or 1.0.
 """
 from __future__ import annotations
 from env.database import execute_query
+# ─── Score clamping (strictly in (0, 1)) ──────────────────────────
+_EPS = 0.05  # margin so :.2f/:.3f formatting never rounds to 0.00 or 1.00
+def _clamp(x: float) -> float:
+    """Clamp x into [_EPS, 1 - _EPS], which lies strictly inside (0, 1). NaN/None → 0.5."""
+    if x is None or x != x:  # None or NaN
+        return 0.5
+    return max(_EPS, min(1.0 - _EPS, float(x)))
 # ─── Task Definitions ─────────────────────────────────────────────
 @dataclass
 def _row_count_score(rows: list[dict], min_rows: int, max_rows: Optional[int]) -> float:
+    """Returns a raw score in [0, 1]; graders must clamp before returning."""
     n = len(rows)
     if n == 0:
         return 0.0
     if n >= min_rows:
         if max_rows is None or n <= max_rows:
             return 1.0
         return 0.5
     return 0.5 * (n / min_rows)
     attempts: int,
 ) -> float:
     if error:
+        return _clamp(0.0)
     col_ok = _has_required_columns(rows, question.expected_columns)
     row_score = _row_count_score(rows, question.min_rows, question.max_rows)
     if col_ok and row_score == 1.0:
+        return _clamp(1.0)
     if col_ok or row_score >= 0.5:
+        return _clamp(0.5)
+    return _clamp(0.0)
 _TASK_SIMPLE = Task(
     attempts: int,
 ) -> float:
     if error:
+        return _clamp(0.0)
     col_ok = _has_required_columns(rows, [question.expected_columns[0]])
     row_score = _row_count_score(rows, question.min_rows, question.max_rows)
     elif col_ok or row_score >= 0.5:
         base = 0.5
     attempt_penalty = max(0.0, 0.1 * (attempts - 1))
+    return _clamp(base - attempt_penalty)
 _TASK_JOIN = Task(
     attempts: int,
 ) -> float:
     if error:
+        return _clamp(0.0)
     col_ok = _has_required_columns(rows, question.expected_columns)
     row_score = _row_count_score(rows, question.min_rows, question.max_rows)
     if not col_ok or row_score == 0.0:
+        return _clamp(0.0)
     if row_score == 1.0 and col_ok:
         base = 0.8 + (0.2 if attempts == 1 else 0.0)
     else:
+        base = 0.4
     attempt_penalty = 0.1 * (attempts - 1)
+    return _clamp(base - attempt_penalty)
 _TASK_COMPLEX = Task(
     return list(TASKS.values())
 def grade_response(
     task_id: str,
     question_id: str,
     question = next((q for q in task.questions if q.id == question_id), None)
     if question is None:
         raise ValueError(f"Unknown question_id {question_id!r} in task {task_id!r}")
+    # Graders already clamp internally; this is a final safety net.
+    return _clamp(task.grader(question, sql, rows, error, attempts))