Spaces:
Running
Running
Commit ·
6284048
1
Parent(s): f96532b
Final strict spec-compliance polish: score precision, empty rewards, updated test assertions
Browse files- README.md +2 -2
- inference.py +6 -7
- tests/test_grader_variance.py +35 -8
README.md
CHANGED
|
@@ -240,7 +240,7 @@ entropyenv/
|
|
| 240 |
**Design principles:**
|
| 241 |
- π― **No artificial difficulty caps** β scores reflect actual grader correctness
|
| 242 |
- π **Weighted blend** β rewards consistently good episodes over single-lucky-step flukes
|
| 243 |
-
- π¬ **Spec-compliant** β `[END]` lines
|
| 244 |
- π§ **14+ model families tested** for universal compatibility
|
| 245 |
|
| 246 |
---
|
|
@@ -253,7 +253,7 @@ The baseline `inference.py` emits structured logs matching the OpenEnv spec:
|
|
| 253 |
[START] task=sec_easy env=EntropyEnv model=Qwen/Qwen2.5-72B-Instruct
|
| 254 |
[STEP] step=1 action=identify_vulnerability reward=0.85 done=false error=null
|
| 255 |
[STEP] step=2 action=propose_fix reward=0.92 done=true error=null
|
| 256 |
-
[END] success=true steps=2 rewards=0.85,0.92
|
| 257 |
```
|
| 258 |
|
| 259 |
---
|
|
|
|
| 240 |
**Design principles:**
|
| 241 |
- π― **No artificial difficulty caps** β scores reflect actual grader correctness
|
| 242 |
- π **Weighted blend** β rewards consistently good episodes over single-lucky-step flukes
|
| 243 |
+
- π¬ **Spec-compliant** β `[END]` lines perfectly match the 3-line format mandatory rules
|
| 244 |
- π§ **14+ model families tested** for universal compatibility
|
| 245 |
|
| 246 |
---
|
|
|
|
| 253 |
[START] task=sec_easy env=EntropyEnv model=Qwen/Qwen2.5-72B-Instruct
|
| 254 |
[STEP] step=1 action=identify_vulnerability reward=0.85 done=false error=null
|
| 255 |
[STEP] step=2 action=propose_fix reward=0.92 done=true error=null
|
| 256 |
+
[END] success=true steps=2 score=0.89 rewards=0.85,0.92
|
| 257 |
```
|
| 258 |
|
| 259 |
---
|
inference.py
CHANGED
|
@@ -266,12 +266,12 @@ def run_task(client: OpenAI, task_id: str) -> tuple:
|
|
| 266 |
except Exception as e:
|
| 267 |
# Env unreachable — must still emit [START] and [END]
|
| 268 |
print(f"[START] task={task_id} env={BENCHMARK} model={MODEL_NAME}", flush=True)
|
| 269 |
-
print(f"[END] success=false steps=0 score=0.01 rewards=
|
| 270 |
return 0.01, False
|
| 271 |
|
| 272 |
if "error" in data and not data.get("episode_id"):
|
| 273 |
print(f"[START] task={task_id} env={BENCHMARK} model={MODEL_NAME}", flush=True)
|
| 274 |
-
print(f"[END] success=false steps=0 score=0.01 rewards=
|
| 275 |
return 0.01, False
|
| 276 |
|
| 277 |
episode_id = data.get("episode_id", "unknown")
|
|
@@ -375,10 +375,9 @@ def run_task(client: OpenAI, task_id: str) -> tuple:
|
|
| 375 |
rewards_str = ",".join(f"{r:.2f}" for r in rewards)
|
| 376 |
|
| 377 |
# ── Mandatory [END] line — exact official spec ─────────────────────────
|
| 378 |
-
# spec: success=<true|false> steps=<n> rewards=<r1,r2,...,rn>
|
| 379 |
-
# NO score= field β not in the official spec
|
| 380 |
print(
|
| 381 |
-
f"[END] success={str(success).lower()} steps={step_num} score={score:.
|
| 382 |
flush=True
|
| 383 |
)
|
| 384 |
|
|
@@ -420,13 +419,13 @@ def main() -> None:
|
|
| 420 |
if remaining not in scores:
|
| 421 |
scores[remaining] = 0.01
|
| 422 |
print(f"[START] task={remaining} env={BENCHMARK} model={MODEL_NAME}", flush=True)
|
| 423 |
-
print(f"[END] success=false steps=0 score=0.01 rewards=
|
| 424 |
break
|
| 425 |
|
| 426 |
except Exception as e:
|
| 427 |
scores[task_id] = 0.01
|
| 428 |
print(f"[START] task={task_id} env={BENCHMARK} model={MODEL_NAME}", flush=True)
|
| 429 |
-
print(f"[END] success=false steps=0 score=0.01 rewards=
|
| 430 |
|
| 431 |
avg = round(sum(scores.values()) / max(len(scores), 1), 4)
|
| 432 |
print(f"\n✅ All tasks complete! Average: {avg:.4f}", flush=True)
|
|
|
|
| 266 |
except Exception as e:
|
| 267 |
# Env unreachable — must still emit [START] and [END]
|
| 268 |
print(f"[START] task={task_id} env={BENCHMARK} model={MODEL_NAME}", flush=True)
|
| 269 |
+
print(f"[END] success=false steps=0 score=0.01 rewards=", flush=True)
|
| 270 |
return 0.01, False
|
| 271 |
|
| 272 |
if "error" in data and not data.get("episode_id"):
|
| 273 |
print(f"[START] task={task_id} env={BENCHMARK} model={MODEL_NAME}", flush=True)
|
| 274 |
+
print(f"[END] success=false steps=0 score=0.01 rewards=", flush=True)
|
| 275 |
return 0.01, False
|
| 276 |
|
| 277 |
episode_id = data.get("episode_id", "unknown")
|
|
|
|
| 375 |
rewards_str = ",".join(f"{r:.2f}" for r in rewards)
|
| 376 |
|
| 377 |
# ── Mandatory [END] line — exact official spec ─────────────────────────
|
| 378 |
+
# spec: success=<true|false> steps=<n> score=<score> rewards=<r1,r2,...,rn>
|
|
|
|
| 379 |
print(
|
| 380 |
+
f"[END] success={str(success).lower()} steps={step_num} score={score:.2f} rewards={rewards_str}",
|
| 381 |
flush=True
|
| 382 |
)
|
| 383 |
|
|
|
|
| 419 |
if remaining not in scores:
|
| 420 |
scores[remaining] = 0.01
|
| 421 |
print(f"[START] task={remaining} env={BENCHMARK} model={MODEL_NAME}", flush=True)
|
| 422 |
+
print(f"[END] success=false steps=0 score=0.01 rewards=", flush=True)
|
| 423 |
break
|
| 424 |
|
| 425 |
except Exception as e:
|
| 426 |
scores[task_id] = 0.01
|
| 427 |
print(f"[START] task={task_id} env={BENCHMARK} model={MODEL_NAME}", flush=True)
|
| 428 |
+
print(f"[END] success=false steps=0 score=0.01 rewards=", flush=True)
|
| 429 |
|
| 430 |
avg = round(sum(scores.values()) / max(len(scores), 1), 4)
|
| 431 |
print(f"\n✅ All tasks complete! Average: {avg:.4f}", flush=True)
|
tests/test_grader_variance.py
CHANGED
|
@@ -112,13 +112,40 @@ def test_cli_order_variance():
|
|
| 112 |
print(f' Clinical order variance: {s1:.4f} > violated:{s2:.4f}, partial:{s3:.4f} PASS')
|
| 113 |
|
| 114 |
|
| 115 |
-
def
|
| 116 |
-
"""
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
|
| 123 |
|
| 124 |
def test_clinical_valid_actions():
|
|
@@ -130,7 +157,7 @@ def test_clinical_valid_actions():
|
|
| 130 |
|
| 131 |
|
| 132 |
if __name__ == '__main__':
|
| 133 |
-
|
| 134 |
test_clinical_valid_actions()
|
| 135 |
test_sec_identify_variance()
|
| 136 |
test_dep_resolve_variance()
|
|
|
|
| 112 |
print(f' Clinical order variance: {s1:.4f} > violated:{s2:.4f}, partial:{s3:.4f} PASS')
|
| 113 |
|
| 114 |
|
| 115 |
+
def test_safe_score_clamp():
|
| 116 |
+
"""
|
| 117 |
+
safe_score clamps to [0.01, 0.99] — strictly between 0 and 1.
|
| 118 |
+
|
| 119 |
+
WHY 0.01 not 0.0: The official spec says scores must be strictly > 0.
|
| 120 |
+
A score of 0.0 from a crashed run looks indistinguishable
|
| 121 |
+
from a broken environment. 0.01 signals "ran but failed".
|
| 122 |
+
|
| 123 |
+
WHY 0.99 not 1.0: A score of exactly 1.0 means the grader is trivially solved
|
| 124 |
+
or broken. 0.99 signals "excellent but not perfect".
|
| 125 |
+
"""
|
| 126 |
+
# Floor: None, negative, bad types → 0.01
|
| 127 |
+
assert safe_score(None) == 0.01, f"Expected 0.01, got {safe_score(None)}"
|
| 128 |
+
assert safe_score(-0.5) == 0.01, f"Expected 0.01, got {safe_score(-0.5)}"
|
| 129 |
+
assert safe_score(-999) == 0.01, f"Expected 0.01, got {safe_score(-999)}"
|
| 130 |
+
assert safe_score('bad') == 0.01, f"Expected 0.01, got {safe_score('bad')}"
|
| 131 |
+
assert safe_score([]) == 0.01, f"Expected 0.01, got {safe_score([])}"
|
| 132 |
+
|
| 133 |
+
# Ceiling: values > 1 → 0.99
|
| 134 |
+
assert safe_score(1.5) == 0.99, f"Expected 0.99, got {safe_score(1.5)}"
|
| 135 |
+
assert safe_score(2.0) == 0.99, f"Expected 0.99, got {safe_score(2.0)}"
|
| 136 |
+
assert safe_score(100) == 0.99, f"Expected 0.99, got {safe_score(100)}"
|
| 137 |
+
|
| 138 |
+
# Exact boundary values
|
| 139 |
+
assert safe_score(0.01) == 0.01, f"Expected 0.01, got {safe_score(0.01)}"
|
| 140 |
+
assert safe_score(0.99) == 0.99, f"Expected 0.99, got {safe_score(0.99)}"
|
| 141 |
+
|
| 142 |
+
# Pass-through: normal values in range stay unchanged
|
| 143 |
+
assert safe_score(0.5) == 0.5, f"Expected 0.5, got {safe_score(0.5)}"
|
| 144 |
+
assert safe_score(0.85) == 0.85, f"Expected 0.85, got {safe_score(0.85)}"
|
| 145 |
+
assert safe_score(0.0001) == 0.01, f"Expected 0.01 (below floor), got {safe_score(0.0001)}"
|
| 146 |
+
assert safe_score(0.9999) == 0.99, f"Expected 0.99 (above ceiling), got {safe_score(0.9999)}"
|
| 147 |
+
|
| 148 |
+
print(' safe_score clamp [0.01, 0.99]: PASS')
|
| 149 |
|
| 150 |
|
| 151 |
def test_clinical_valid_actions():
|
|
|
|
| 157 |
|
| 158 |
|
| 159 |
if __name__ == '__main__':
|
| 160 |
+
test_safe_score_clamp()
|
| 161 |
test_clinical_valid_actions()
|
| 162 |
test_sec_identify_variance()
|
| 163 |
test_dep_resolve_variance()
|