ar9avg committed on
Commit
98b87b7
·
1 Parent(s): 263261a

Clamp every grader return value strictly inside (0, 1)

Browse files

Previously graders returned raw 0.0/0.5/1.0 and were only clamped at
the grade_response boundary. If the validator introspects and calls
grader functions directly (e.g. through openenv-core task discovery),
it would see exact 0.0 and 1.0 values.

Now each of _grade_simple, _grade_join, _grade_complex wraps every
return in _clamp(x), which maps x to [0.05, 0.95]. No matter how the
validator invokes the graders, they cannot return 0.0 or 1.0.

Files changed (1) hide show
  1. backend/env/tasks.py +29 -22
backend/env/tasks.py CHANGED
@@ -3,10 +3,13 @@ Task definitions for the SQL agent benchmark.
3
 
4
  Three difficulty tiers, each with 5 questions and a grader function.
5
 
6
- Grader contract: grader(sql, rows, error, attempts) -> float in [0.0, 1.0]
7
  - rows: list[dict] from the executed SQL (may be empty)
8
  - error: str | None
9
  - attempts: int (1-indexed count of attempts taken)
 
 
 
10
  """
11
 
12
  from __future__ import annotations
@@ -18,6 +21,18 @@ from typing import Callable, Optional
18
  from env.database import execute_query
19
 
20
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  # ─── Task Definitions ─────────────────────────────────────────────
22
 
23
  @dataclass
@@ -50,15 +65,14 @@ def _has_required_columns(rows: list[dict], required: list[str]) -> bool:
50
 
51
 
52
  def _row_count_score(rows: list[dict], min_rows: int, max_rows: Optional[int]) -> float:
 
53
  n = len(rows)
54
  if n == 0:
55
  return 0.0
56
  if n >= min_rows:
57
  if max_rows is None or n <= max_rows:
58
  return 1.0
59
- # Over the expected maximum — might be a missing WHERE clause
60
  return 0.5
61
- # Partial result
62
  return 0.5 * (n / min_rows)
63
 
64
 
@@ -116,16 +130,16 @@ def _grade_simple(
116
  attempts: int,
117
  ) -> float:
118
  if error:
119
- return 0.0
120
 
121
  col_ok = _has_required_columns(rows, question.expected_columns)
122
  row_score = _row_count_score(rows, question.min_rows, question.max_rows)
123
 
124
  if col_ok and row_score == 1.0:
125
- return 1.0
126
  if col_ok or row_score >= 0.5:
127
- return 0.5
128
- return 0.0
129
 
130
 
131
  _TASK_SIMPLE = Task(
@@ -189,7 +203,7 @@ def _grade_join(
189
  attempts: int,
190
  ) -> float:
191
  if error:
192
- return 0.0
193
 
194
  col_ok = _has_required_columns(rows, [question.expected_columns[0]])
195
  row_score = _row_count_score(rows, question.min_rows, question.max_rows)
@@ -200,9 +214,8 @@ def _grade_join(
200
  elif col_ok or row_score >= 0.5:
201
  base = 0.5
202
 
203
- # Penalize extra attempts
204
  attempt_penalty = max(0.0, 0.1 * (attempts - 1))
205
- return max(0.0, base - attempt_penalty)
206
 
207
 
208
  _TASK_JOIN = Task(
@@ -282,23 +295,21 @@ def _grade_complex(
282
  attempts: int,
283
  ) -> float:
284
  if error:
285
- return 0.0
286
 
287
  col_ok = _has_required_columns(rows, question.expected_columns)
288
  row_score = _row_count_score(rows, question.min_rows, question.max_rows)
289
 
290
  if not col_ok or row_score == 0.0:
291
- return 0.0
292
 
293
- # Hard task base max is 0.8 unless first-attempt bonus
294
  if row_score == 1.0 and col_ok:
295
  base = 0.8 + (0.2 if attempts == 1 else 0.0)
296
  else:
297
- base = 0.4 # partial
298
 
299
- # Strict attempt penalty for hard queries
300
  attempt_penalty = 0.1 * (attempts - 1)
301
- return max(0.0, base - attempt_penalty)
302
 
303
 
304
  _TASK_COMPLEX = Task(
@@ -330,9 +341,6 @@ def get_all_tasks() -> list[Task]:
330
  return list(TASKS.values())
331
 
332
 
333
- _EPS = 0.05 # wide margin so :.2f/:.3f never rounds to 0.00 or 1.00
334
-
335
-
336
  def grade_response(
337
  task_id: str,
338
  question_id: str,
@@ -345,6 +353,5 @@ def grade_response(
345
  question = next((q for q in task.questions if q.id == question_id), None)
346
  if question is None:
347
  raise ValueError(f"Unknown question_id {question_id!r} in task {task_id!r}")
348
- raw = task.grader(question, sql, rows, error, attempts)
349
- # Score must be strictly in (0, 1) exclusive per OpenEnv spec
350
- return max(_EPS, min(1.0 - _EPS, raw))
 
3
 
4
  Three difficulty tiers, each with 5 questions and a grader function.
5
 
6
+ Grader contract: grader(question, sql, rows, error, attempts) -> float strictly in (0, 1)
7
  - rows: list[dict] from the executed SQL (may be empty)
8
  - error: str | None
9
  - attempts: int (1-indexed count of attempts taken)
10
+
11
+ All graders return values in the closed range [_EPS, 1 - _EPS] so no path can
12
+ emit exact 0.0 or 1.0.
13
  """
14
 
15
  from __future__ import annotations
 
21
  from env.database import execute_query
22
 
23
 
24
+ # ─── Score clamping (strictly in (0, 1)) ──────────────────────────
25
+
26
+ _EPS = 0.05 # margin so :.2f/:.3f formatting never rounds to 0.00 or 1.00
27
+
28
+
29
+ def _clamp(x: float) -> float:
30
+ """Clamp to strictly (0, 1). NaN/None → 0.5."""
31
+ if x is None or x != x: # None or NaN
32
+ return 0.5
33
+ return max(_EPS, min(1.0 - _EPS, float(x)))
34
+
35
+
36
  # ─── Task Definitions ─────────────────────────────────────────────
37
 
38
  @dataclass
 
65
 
66
 
67
  def _row_count_score(rows: list[dict], min_rows: int, max_rows: Optional[int]) -> float:
68
+ """Returns a raw score in [0, 1]; graders must clamp before returning."""
69
  n = len(rows)
70
  if n == 0:
71
  return 0.0
72
  if n >= min_rows:
73
  if max_rows is None or n <= max_rows:
74
  return 1.0
 
75
  return 0.5
 
76
  return 0.5 * (n / min_rows)
77
 
78
 
 
130
  attempts: int,
131
  ) -> float:
132
  if error:
133
+ return _clamp(0.0)
134
 
135
  col_ok = _has_required_columns(rows, question.expected_columns)
136
  row_score = _row_count_score(rows, question.min_rows, question.max_rows)
137
 
138
  if col_ok and row_score == 1.0:
139
+ return _clamp(1.0)
140
  if col_ok or row_score >= 0.5:
141
+ return _clamp(0.5)
142
+ return _clamp(0.0)
143
 
144
 
145
  _TASK_SIMPLE = Task(
 
203
  attempts: int,
204
  ) -> float:
205
  if error:
206
+ return _clamp(0.0)
207
 
208
  col_ok = _has_required_columns(rows, [question.expected_columns[0]])
209
  row_score = _row_count_score(rows, question.min_rows, question.max_rows)
 
214
  elif col_ok or row_score >= 0.5:
215
  base = 0.5
216
 
 
217
  attempt_penalty = max(0.0, 0.1 * (attempts - 1))
218
+ return _clamp(base - attempt_penalty)
219
 
220
 
221
  _TASK_JOIN = Task(
 
295
  attempts: int,
296
  ) -> float:
297
  if error:
298
+ return _clamp(0.0)
299
 
300
  col_ok = _has_required_columns(rows, question.expected_columns)
301
  row_score = _row_count_score(rows, question.min_rows, question.max_rows)
302
 
303
  if not col_ok or row_score == 0.0:
304
+ return _clamp(0.0)
305
 
 
306
  if row_score == 1.0 and col_ok:
307
  base = 0.8 + (0.2 if attempts == 1 else 0.0)
308
  else:
309
+ base = 0.4
310
 
 
311
  attempt_penalty = 0.1 * (attempts - 1)
312
+ return _clamp(base - attempt_penalty)
313
 
314
 
315
  _TASK_COMPLEX = Task(
 
341
  return list(TASKS.values())
342
 
343
 
 
 
 
344
  def grade_response(
345
  task_id: str,
346
  question_id: str,
 
353
  question = next((q for q in task.questions if q.id == question_id), None)
354
  if question is None:
355
  raise ValueError(f"Unknown question_id {question_id!r} in task {task_id!r}")
356
+ # Graders already clamp internally; this is a final safety net.
357
+ return _clamp(task.grader(question, sql, rows, error, attempts))