Spaces:

Cooked4riyal
/

EntropyEnv

Running

App Files Community

immortalindeed committed 8 days ago

Commit

6284048

1 Parent(s): f96532b

Final strict spec-compliance polish: score precision, empty rewards, updated test assertions

Browse files

Files changed (3) hide show

README.md +2 -2
inference.py +6 -7
tests/test_grader_variance.py +35 -8

README.md CHANGED Viewed

@@ -240,7 +240,7 @@ entropyenv/
 **Design principles:**
 - 🎯 **No artificial difficulty caps** — scores reflect actual grader correctness
 - 📊 **Weighted blend** — rewards consistently good episodes over single-lucky-step flukes
-- 🔬 **Spec-compliant** — `[END]` lines have NO `score=` field per official guidelines
 - 🧠 **14+ model families tested** for universal compatibility
 ---
@@ -253,7 +253,7 @@ The baseline `inference.py` emits structured logs matching the OpenEnv spec:
 [START] task=sec_easy env=EntropyEnv model=Qwen/Qwen2.5-72B-Instruct
 [STEP] step=1 action=identify_vulnerability reward=0.85 done=false error=null
 [STEP] step=2 action=propose_fix reward=0.92 done=true error=null
-[END] success=true steps=2 rewards=0.85,0.92
 ```
 ---

 **Design principles:**
 - 🎯 **No artificial difficulty caps** — scores reflect actual grader correctness
 - 📊 **Weighted blend** — rewards consistently good episodes over single-lucky-step flukes
+- 🔬 **Spec-compliant** — `[END]` lines exactly match the mandatory rules of the 3-line log format
 - 🧠 **14+ model families tested** for universal compatibility
 ---
 [START] task=sec_easy env=EntropyEnv model=Qwen/Qwen2.5-72B-Instruct
 [STEP] step=1 action=identify_vulnerability reward=0.85 done=false error=null
 [STEP] step=2 action=propose_fix reward=0.92 done=true error=null
+[END] success=true steps=2 score=0.89 rewards=0.85,0.92
 ```
 ---

inference.py CHANGED Viewed

@@ -266,12 +266,12 @@ def run_task(client: OpenAI, task_id: str) -> tuple:
     except Exception as e:
         # Env unreachable — must still emit [START] and [END]
         print(f"[START] task={task_id} env={BENCHMARK} model={MODEL_NAME}", flush=True)
-        print(f"[END] success=false steps=0 score=0.01 rewards=0.01", flush=True)
         return 0.01, False
     if "error" in data and not data.get("episode_id"):
         print(f"[START] task={task_id} env={BENCHMARK} model={MODEL_NAME}", flush=True)
-        print(f"[END] success=false steps=0 score=0.01 rewards=0.01", flush=True)
         return 0.01, False
     episode_id = data.get("episode_id", "unknown")
@@ -375,10 +375,9 @@ def run_task(client: OpenAI, task_id: str) -> tuple:
     rewards_str = ",".join(f"{r:.2f}" for r in rewards)
     # ── Mandatory [END] line — exact official spec ─────────────────────────
-    # spec: success=<true|false> steps=<n> rewards=<r1,r2,...,rn>
-    # NO score= field — not in the official spec
     print(
-        f"[END] success={str(success).lower()} steps={step_num} score={score:.4f} rewards={rewards_str}",
         flush=True
     )
@@ -420,13 +419,13 @@ def main() -> None:
                     if remaining not in scores:
                         scores[remaining] = 0.01
                         print(f"[START] task={remaining} env={BENCHMARK} model={MODEL_NAME}", flush=True)
-                        print(f"[END] success=false steps=0 score=0.01 rewards=0.01", flush=True)
                 break
         except Exception as e:
             scores[task_id] = 0.01
             print(f"[START] task={task_id} env={BENCHMARK} model={MODEL_NAME}", flush=True)
-            print(f"[END] success=false steps=0 score=0.01 rewards=0.01", flush=True)
     avg = round(sum(scores.values()) / max(len(scores), 1), 4)
     print(f"\n✅ All tasks complete! Average: {avg:.4f}", flush=True)

     except Exception as e:
         # Env unreachable — must still emit [START] and [END]
         print(f"[START] task={task_id} env={BENCHMARK} model={MODEL_NAME}", flush=True)
+        print(f"[END] success=false steps=0 score=0.01 rewards=", flush=True)
         return 0.01, False
     if "error" in data and not data.get("episode_id"):
         print(f"[START] task={task_id} env={BENCHMARK} model={MODEL_NAME}", flush=True)
+        print(f"[END] success=false steps=0 score=0.01 rewards=", flush=True)
         return 0.01, False
     episode_id = data.get("episode_id", "unknown")
     rewards_str = ",".join(f"{r:.2f}" for r in rewards)
     # ── Mandatory [END] line — exact official spec ─────────────────────────
+    # spec: success=<true|false> steps=<n> score=<score> rewards=<r1,r2,...,rn>
     print(
+        f"[END] success={str(success).lower()} steps={step_num} score={score:.2f} rewards={rewards_str}",
         flush=True
     )
                     if remaining not in scores:
                         scores[remaining] = 0.01
                         print(f"[START] task={remaining} env={BENCHMARK} model={MODEL_NAME}", flush=True)
+                        print(f"[END] success=false steps=0 score=0.01 rewards=", flush=True)
                 break
         except Exception as e:
             scores[task_id] = 0.01
             print(f"[START] task={task_id} env={BENCHMARK} model={MODEL_NAME}", flush=True)
+            print(f"[END] success=false steps=0 score=0.01 rewards=", flush=True)
     avg = round(sum(scores.values()) / max(len(scores), 1), 4)
     print(f"\n✅ All tasks complete! Average: {avg:.4f}", flush=True)

tests/test_grader_variance.py CHANGED Viewed

@@ -112,13 +112,40 @@ def test_cli_order_variance():
     print(f'  Clinical order variance: {s1:.4f} > violated:{s2:.4f}, partial:{s3:.4f} PASS')
-def test_safe_score_none():
-    """Bug 1 fix: safe_score(None) must return 0.0, not crash."""
-    assert safe_score(None) == 0.0
-    assert safe_score(1.5) == 1.0
-    assert safe_score(-0.5) == 0.0
-    assert safe_score('bad') == 0.0
-    print('  safe_score(None) guard: PASS')
 def test_clinical_valid_actions():
@@ -130,7 +157,7 @@ def test_clinical_valid_actions():
 if __name__ == '__main__':
-    test_safe_score_none()
     test_clinical_valid_actions()
     test_sec_identify_variance()
     test_dep_resolve_variance()

     print(f'  Clinical order variance: {s1:.4f} > violated:{s2:.4f}, partial:{s3:.4f} PASS')
+def test_safe_score_clamp():
+    """
+    safe_score clamps to [0.01, 0.99] — strictly between 0 and 1.
+    WHY 0.01 not 0.0:  The official spec says scores must be strictly > 0.
+                        A score of 0.0 from a crashed run looks indistinguishable
+                        from a broken environment. 0.01 signals "ran but failed".
+    WHY 0.99 not 1.0:  A score of exactly 1.0 means the grader is trivially solved
+                        or broken. 0.99 signals "excellent but not perfect".
+    """
+    # Floor: None, negative, bad types → 0.01
+    assert safe_score(None)   == 0.01, f"Expected 0.01, got {safe_score(None)}"
+    assert safe_score(-0.5)   == 0.01, f"Expected 0.01, got {safe_score(-0.5)}"
+    assert safe_score(-999)   == 0.01, f"Expected 0.01, got {safe_score(-999)}"
+    assert safe_score('bad')  == 0.01, f"Expected 0.01, got {safe_score('bad')}"
+    assert safe_score([])     == 0.01, f"Expected 0.01, got {safe_score([])}"
+    # Ceiling: values > 1 → 0.99
+    assert safe_score(1.5)    == 0.99, f"Expected 0.99, got {safe_score(1.5)}"
+    assert safe_score(2.0)    == 0.99, f"Expected 0.99, got {safe_score(2.0)}"
+    assert safe_score(100)    == 0.99, f"Expected 0.99, got {safe_score(100)}"
+    # Exact boundary values
+    assert safe_score(0.01)   == 0.01, f"Expected 0.01, got {safe_score(0.01)}"
+    assert safe_score(0.99)   == 0.99, f"Expected 0.99, got {safe_score(0.99)}"
+    # Pass-through: in-range values stay unchanged (the final two checks confirm near-boundary out-of-range values are still clamped)
+    assert safe_score(0.5)    == 0.5,  f"Expected 0.5, got {safe_score(0.5)}"
+    assert safe_score(0.85)   == 0.85, f"Expected 0.85, got {safe_score(0.85)}"
+    assert safe_score(0.0001) == 0.01, f"Expected 0.01 (below floor), got {safe_score(0.0001)}"
+    assert safe_score(0.9999) == 0.99, f"Expected 0.99 (above ceiling), got {safe_score(0.9999)}"
+    print('  safe_score clamp [0.01, 0.99]: PASS')
 def test_clinical_valid_actions():
 if __name__ == '__main__':
+    test_safe_score_clamp()
     test_clinical_valid_actions()
     test_sec_identify_variance()
     test_dep_resolve_variance()