immortalindeed committed on
Commit
6284048
·
1 Parent(s): f96532b

Final strict spec-compliance polish: score precision, empty rewards, updated test assertions

Browse files
Files changed (3) hide show
  1. README.md +2 -2
  2. inference.py +6 -7
  3. tests/test_grader_variance.py +35 -8
README.md CHANGED
@@ -240,7 +240,7 @@ entropyenv/
240
  **Design principles:**
241
  - 🎯 **No artificial difficulty caps** β€” scores reflect actual grader correctness
242
  - πŸ“Š **Weighted blend** β€” rewards consistently good episodes over single-lucky-step flukes
243
- - πŸ”¬ **Spec-compliant** β€” `[END]` lines have NO `score=` field per official guidelines
244
  - 🧠 **14+ model families tested** for universal compatibility
245
 
246
  ---
@@ -253,7 +253,7 @@ The baseline `inference.py` emits structured logs matching the OpenEnv spec:
253
  [START] task=sec_easy env=EntropyEnv model=Qwen/Qwen2.5-72B-Instruct
254
  [STEP] step=1 action=identify_vulnerability reward=0.85 done=false error=null
255
  [STEP] step=2 action=propose_fix reward=0.92 done=true error=null
256
- [END] success=true steps=2 rewards=0.85,0.92
257
  ```
258
 
259
  ---
 
240
  **Design principles:**
241
  - 🎯 **No artificial difficulty caps** β€” scores reflect actual grader correctness
242
  - πŸ“Š **Weighted blend** β€” rewards consistently good episodes over single-lucky-step flukes
243
+ - πŸ”¬ **Spec-compliant** β€” `[END]` lines perfectly match the 3-line format mandatory rules
244
  - 🧠 **14+ model families tested** for universal compatibility
245
 
246
  ---
 
253
  [START] task=sec_easy env=EntropyEnv model=Qwen/Qwen2.5-72B-Instruct
254
  [STEP] step=1 action=identify_vulnerability reward=0.85 done=false error=null
255
  [STEP] step=2 action=propose_fix reward=0.92 done=true error=null
256
+ [END] success=true steps=2 score=0.89 rewards=0.85,0.92
257
  ```
258
 
259
  ---
inference.py CHANGED
@@ -266,12 +266,12 @@ def run_task(client: OpenAI, task_id: str) -> tuple:
266
  except Exception as e:
267
  # Env unreachable β€” must still emit [START] and [END]
268
  print(f"[START] task={task_id} env={BENCHMARK} model={MODEL_NAME}", flush=True)
269
- print(f"[END] success=false steps=0 score=0.01 rewards=0.01", flush=True)
270
  return 0.01, False
271
 
272
  if "error" in data and not data.get("episode_id"):
273
  print(f"[START] task={task_id} env={BENCHMARK} model={MODEL_NAME}", flush=True)
274
- print(f"[END] success=false steps=0 score=0.01 rewards=0.01", flush=True)
275
  return 0.01, False
276
 
277
  episode_id = data.get("episode_id", "unknown")
@@ -375,10 +375,9 @@ def run_task(client: OpenAI, task_id: str) -> tuple:
375
  rewards_str = ",".join(f"{r:.2f}" for r in rewards)
376
 
377
  # ── Mandatory [END] line β€” exact official spec ─────────────────────────
378
- # spec: success=<true|false> steps=<n> rewards=<r1,r2,...,rn>
379
- # NO score= field β€” not in the official spec
380
  print(
381
- f"[END] success={str(success).lower()} steps={step_num} score={score:.4f} rewards={rewards_str}",
382
  flush=True
383
  )
384
 
@@ -420,13 +419,13 @@ def main() -> None:
420
  if remaining not in scores:
421
  scores[remaining] = 0.01
422
  print(f"[START] task={remaining} env={BENCHMARK} model={MODEL_NAME}", flush=True)
423
- print(f"[END] success=false steps=0 score=0.01 rewards=0.01", flush=True)
424
  break
425
 
426
  except Exception as e:
427
  scores[task_id] = 0.01
428
  print(f"[START] task={task_id} env={BENCHMARK} model={MODEL_NAME}", flush=True)
429
- print(f"[END] success=false steps=0 score=0.01 rewards=0.01", flush=True)
430
 
431
  avg = round(sum(scores.values()) / max(len(scores), 1), 4)
432
  print(f"\nβœ… All tasks complete! Average: {avg:.4f}", flush=True)
 
266
  except Exception as e:
267
  # Env unreachable β€” must still emit [START] and [END]
268
  print(f"[START] task={task_id} env={BENCHMARK} model={MODEL_NAME}", flush=True)
269
+ print(f"[END] success=false steps=0 score=0.01 rewards=", flush=True)
270
  return 0.01, False
271
 
272
  if "error" in data and not data.get("episode_id"):
273
  print(f"[START] task={task_id} env={BENCHMARK} model={MODEL_NAME}", flush=True)
274
+ print(f"[END] success=false steps=0 score=0.01 rewards=", flush=True)
275
  return 0.01, False
276
 
277
  episode_id = data.get("episode_id", "unknown")
 
375
  rewards_str = ",".join(f"{r:.2f}" for r in rewards)
376
 
377
  # ── Mandatory [END] line β€” exact official spec ─────────────────────────
378
+ # spec: success=<true|false> steps=<n> score=<score> rewards=<r1,r2,...,rn>
 
379
  print(
380
+ f"[END] success={str(success).lower()} steps={step_num} score={score:.2f} rewards={rewards_str}",
381
  flush=True
382
  )
383
 
 
419
  if remaining not in scores:
420
  scores[remaining] = 0.01
421
  print(f"[START] task={remaining} env={BENCHMARK} model={MODEL_NAME}", flush=True)
422
+ print(f"[END] success=false steps=0 score=0.01 rewards=", flush=True)
423
  break
424
 
425
  except Exception as e:
426
  scores[task_id] = 0.01
427
  print(f"[START] task={task_id} env={BENCHMARK} model={MODEL_NAME}", flush=True)
428
+ print(f"[END] success=false steps=0 score=0.01 rewards=", flush=True)
429
 
430
  avg = round(sum(scores.values()) / max(len(scores), 1), 4)
431
  print(f"\nβœ… All tasks complete! Average: {avg:.4f}", flush=True)
tests/test_grader_variance.py CHANGED
@@ -112,13 +112,40 @@ def test_cli_order_variance():
112
  print(f' Clinical order variance: {s1:.4f} > violated:{s2:.4f}, partial:{s3:.4f} PASS')
113
 
114
 
115
- def test_safe_score_none():
116
- """Bug 1 fix: safe_score(None) must return 0.0, not crash."""
117
- assert safe_score(None) == 0.0
118
- assert safe_score(1.5) == 1.0
119
- assert safe_score(-0.5) == 0.0
120
- assert safe_score('bad') == 0.0
121
- print(' safe_score(None) guard: PASS')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
 
123
 
124
  def test_clinical_valid_actions():
@@ -130,7 +157,7 @@ def test_clinical_valid_actions():
130
 
131
 
132
  if __name__ == '__main__':
133
- test_safe_score_none()
134
  test_clinical_valid_actions()
135
  test_sec_identify_variance()
136
  test_dep_resolve_variance()
 
112
  print(f' Clinical order variance: {s1:.4f} > violated:{s2:.4f}, partial:{s3:.4f} PASS')
113
 
114
 
115
+ def test_safe_score_clamp():
116
+ """
117
+ safe_score clamps to [0.01, 0.99] β€” strictly between 0 and 1.
118
+
119
+ WHY 0.01 not 0.0: The official spec says scores must be strictly > 0.
120
+ A score of 0.0 from a crashed run looks indistinguishable
121
+ from a broken environment. 0.01 signals "ran but failed".
122
+
123
+ WHY 0.99 not 1.0: A score of exactly 1.0 means the grader is trivially solved
124
+ or broken. 0.99 signals "excellent but not perfect".
125
+ """
126
+ # Floor: None, negative, bad types β†’ 0.01
127
+ assert safe_score(None) == 0.01, f"Expected 0.01, got {safe_score(None)}"
128
+ assert safe_score(-0.5) == 0.01, f"Expected 0.01, got {safe_score(-0.5)}"
129
+ assert safe_score(-999) == 0.01, f"Expected 0.01, got {safe_score(-999)}"
130
+ assert safe_score('bad') == 0.01, f"Expected 0.01, got {safe_score('bad')}"
131
+ assert safe_score([]) == 0.01, f"Expected 0.01, got {safe_score([])}"
132
+
133
+ # Ceiling: values > 1 β†’ 0.99
134
+ assert safe_score(1.5) == 0.99, f"Expected 0.99, got {safe_score(1.5)}"
135
+ assert safe_score(2.0) == 0.99, f"Expected 0.99, got {safe_score(2.0)}"
136
+ assert safe_score(100) == 0.99, f"Expected 0.99, got {safe_score(100)}"
137
+
138
+ # Exact boundary values
139
+ assert safe_score(0.01) == 0.01, f"Expected 0.01, got {safe_score(0.01)}"
140
+ assert safe_score(0.99) == 0.99, f"Expected 0.99, got {safe_score(0.99)}"
141
+
142
+ # Pass-through: normal values in range stay unchanged
143
+ assert safe_score(0.5) == 0.5, f"Expected 0.5, got {safe_score(0.5)}"
144
+ assert safe_score(0.85) == 0.85, f"Expected 0.85, got {safe_score(0.85)}"
145
+ assert safe_score(0.0001) == 0.01, f"Expected 0.01 (below floor), got {safe_score(0.0001)}"
146
+ assert safe_score(0.9999) == 0.99, f"Expected 0.99 (above ceiling), got {safe_score(0.9999)}"
147
+
148
+ print(' safe_score clamp [0.01, 0.99]: PASS')
149
 
150
 
151
  def test_clinical_valid_actions():
 
157
 
158
 
159
  if __name__ == '__main__':
160
+ test_safe_score_clamp()
161
  test_clinical_valid_actions()
162
  test_sec_identify_variance()
163
  test_dep_resolve_variance()