Spaces:

Cooked4riyal
/

EntropyEnv

Running

App Files Community

immortalindeed committed on 9 days ago

Commit

699f953

1 Parent(s): ddafe29

Fix Phase 2 OpenEnv validation traps: add grader paths to openenv.yaml and safe parameterless defaults

Browse files

Files changed (4) hide show

openenv.yaml +9 -0
server/graders/clinical_grader.py +7 -3
server/graders/dependency_grader.py +7 -3
server/graders/security_grader.py +7 -3

openenv.yaml CHANGED Viewed

@@ -14,46 +14,55 @@ port: 7860
 tasks:
   - id: sec_easy
     name: Single vulnerability classification
     difficulty: easy
     description: Identify vulnerability type, CVSS score, and severity from a tool-call snippet.
   - id: sec_medium
     name: Vulnerability identification + fix proposal
     difficulty: medium
     description: Identify the vulnerability and propose a secure code fix.
   - id: sec_hard
     name: Adversarial patch defense with reviewer feedback
     difficulty: hard
     description: Identify, fix, and iteratively revise based on reviewer feedback.
   - id: dep_easy
     name: PyTorch 1.x deprecated API detection
     difficulty: easy
     description: Flag outdated packages and deprecated API usage.
   - id: dep_medium
     name: Version conflict chain resolution
     difficulty: medium
     description: Resolve version conflicts using compatibility matrix constraints.
   - id: dep_hard
     name: torch.compile graph-break hunter
     difficulty: hard
     description: Fix torch.compile graph-break patterns in dependency order.
   - id: cli_easy
     name: Single workflow gap detection
     difficulty: easy
     description: Detect missing steps in a clinical workflow and assess risk.
   - id: cli_medium
     name: Multi-gap priority ranking
     difficulty: medium
     description: Detect gaps and rank them by clinical priority.
   - id: cli_hard
     name: Dependency-ordered recovery planning
     difficulty: hard
     description: Plan a dependency-safe recovery sequence for a disrupted clinical workflow.

 tasks:
   - id: sec_easy
+    grader: server.graders.security_grader.grade
     name: Single vulnerability classification
     difficulty: easy
     description: Identify vulnerability type, CVSS score, and severity from a tool-call snippet.
   - id: sec_medium
+    grader: server.graders.security_grader.grade
     name: Vulnerability identification + fix proposal
     difficulty: medium
     description: Identify the vulnerability and propose a secure code fix.
   - id: sec_hard
+    grader: server.graders.security_grader.grade
     name: Adversarial patch defense with reviewer feedback
     difficulty: hard
     description: Identify, fix, and iteratively revise based on reviewer feedback.
   - id: dep_easy
+    grader: server.graders.dependency_grader.grade
     name: PyTorch 1.x deprecated API detection
     difficulty: easy
     description: Flag outdated packages and deprecated API usage.
   - id: dep_medium
+    grader: server.graders.dependency_grader.grade
     name: Version conflict chain resolution
     difficulty: medium
     description: Resolve version conflicts using compatibility matrix constraints.
   - id: dep_hard
+    grader: server.graders.dependency_grader.grade
     name: torch.compile graph-break hunter
     difficulty: hard
     description: Fix torch.compile graph-break patterns in dependency order.
   - id: cli_easy
+    grader: server.graders.clinical_grader.grade
     name: Single workflow gap detection
     difficulty: easy
     description: Detect missing steps in a clinical workflow and assess risk.
   - id: cli_medium
+    grader: server.graders.clinical_grader.grade
     name: Multi-gap priority ranking
     difficulty: medium
     description: Detect gaps and rank them by clinical priority.
   - id: cli_hard
+    grader: server.graders.clinical_grader.grade
     name: Dependency-ordered recovery planning
     difficulty: hard
     description: Plan a dependency-safe recovery sequence for a disrupted clinical workflow.

server/graders/clinical_grader.py CHANGED Viewed

@@ -11,7 +11,7 @@
 #    Extra steps penalized more heavily
 import math
-from typing import Dict, List
 from .base_grader import grade_dynamic, safe_score
 VALID_ACTIONS = ['detect_gap', 'rank_issues', 'order_steps']
@@ -209,6 +209,10 @@ def compute_correctness(action: Dict, case: Dict) -> float:
     return None
-def grade(action: Dict, session) -> float:
-    """Entry point called by router. Runs full reward pipeline."""
     return grade_dynamic(action, session, compute_correctness, VALID_ACTIONS, FORBIDDEN, max_steps=6)

 #    Extra steps penalized more heavily
 import math
+from typing import Dict, List, Any
 from .base_grader import grade_dynamic, safe_score
 VALID_ACTIONS = ['detect_gap', 'rank_issues', 'order_steps']
     return None
+def grade(action: Dict = None, session: Any = None) -> float:
+    """Entry point called by router. Runs full reward pipeline.
+    Survives parameterless reflection testing by returning 0.01.
+    """
+    if action is None or session is None:
+        return 0.01
     return grade_dynamic(action, session, compute_correctness, VALID_ACTIONS, FORBIDDEN, max_steps=6)

server/graders/dependency_grader.py CHANGED Viewed

@@ -9,7 +9,7 @@
 # 3. _score_migrate: fix_quality was too generous (0.6 partial credit)
 #    FIX: Lowered partial credit to 0.3, required more precise token matching
-from typing import Dict
 from .base_grader import grade_dynamic, safe_score
 try:
@@ -292,6 +292,10 @@ def compute_correctness(action: Dict, case: Dict) -> float:
     return None
-def grade(action: Dict, session) -> float:
-    """Entry point called by router. Runs full reward pipeline."""
     return grade_dynamic(action, session, compute_correctness, VALID_ACTIONS, FORBIDDEN, max_steps=8)

 # 3. _score_migrate: fix_quality was too generous (0.6 partial credit)
 #    FIX: Lowered partial credit to 0.3, required more precise token matching
+from typing import Dict, Any
 from .base_grader import grade_dynamic, safe_score
 try:
     return None
+def grade(action: Dict = None, session: Any = None) -> float:
+    """Entry point called by router. Runs full reward pipeline.
+    Survives parameterless reflection testing by returning 0.01.
+    """
+    if action is None or session is None:
+        return 0.01
     return grade_dynamic(action, session, compute_correctness, VALID_ACTIONS, FORBIDDEN, max_steps=8)

server/graders/security_grader.py CHANGED Viewed

@@ -7,7 +7,7 @@
 # 3. _score_revise: floor raised from 0.20 to 0.10 — revise should be hard
 # 4. All three scorers now have tighter weights that produce real variance
-from typing import Dict
 from .base_grader import grade_dynamic, safe_score
 VALID_ACTIONS = ['identify_vulnerability', 'propose_fix', 'revise_fix']
@@ -172,6 +172,10 @@ def compute_correctness(action: Dict, case: Dict) -> float:
     return None
-def grade(action: Dict, session) -> float:
-    """Entry point called by router. Runs full reward pipeline."""
     return grade_dynamic(action, session, compute_correctness, VALID_ACTIONS, FORBIDDEN, max_steps=8)

 # 3. _score_revise: floor raised from 0.20 to 0.10 — revise should be hard
 # 4. All three scorers now have tighter weights that produce real variance
+from typing import Dict, Any
 from .base_grader import grade_dynamic, safe_score
 VALID_ACTIONS = ['identify_vulnerability', 'propose_fix', 'revise_fix']
     return None
+def grade(action: Dict = None, session: Any = None) -> float:
+    """Entry point called by router. Runs full reward pipeline.
+    Survives parameterless reflection testing by returning 0.01.
+    """
+    if action is None or session is None:
+        return 0.01
     return grade_dynamic(action, session, compute_correctness, VALID_ACTIONS, FORBIDDEN, max_steps=8)