immortalindeed committed on
Commit
699f953
·
1 Parent(s): ddafe29

Fix Phase 2 OpenEnv validation traps: add grader paths to openenv.yaml and safe parameterless defaults

Browse files
openenv.yaml CHANGED
@@ -14,46 +14,55 @@ port: 7860
14
 
15
  tasks:
16
  - id: sec_easy
 
17
  name: Single vulnerability classification
18
  difficulty: easy
19
  description: Identify vulnerability type, CVSS score, and severity from a tool-call snippet.
20
 
21
  - id: sec_medium
 
22
  name: Vulnerability identification + fix proposal
23
  difficulty: medium
24
  description: Identify the vulnerability and propose a secure code fix.
25
 
26
  - id: sec_hard
 
27
  name: Adversarial patch defense with reviewer feedback
28
  difficulty: hard
29
  description: Identify, fix, and iteratively revise based on reviewer feedback.
30
 
31
  - id: dep_easy
 
32
  name: PyTorch 1.x deprecated API detection
33
  difficulty: easy
34
  description: Flag outdated packages and deprecated API usage.
35
 
36
  - id: dep_medium
 
37
  name: Version conflict chain resolution
38
  difficulty: medium
39
  description: Resolve version conflicts using compatibility matrix constraints.
40
 
41
  - id: dep_hard
 
42
  name: torch.compile graph-break hunter
43
  difficulty: hard
44
  description: Fix torch.compile graph-break patterns in dependency order.
45
 
46
  - id: cli_easy
 
47
  name: Single workflow gap detection
48
  difficulty: easy
49
  description: Detect missing steps in a clinical workflow and assess risk.
50
 
51
  - id: cli_medium
 
52
  name: Multi-gap priority ranking
53
  difficulty: medium
54
  description: Detect gaps and rank them by clinical priority.
55
 
56
  - id: cli_hard
 
57
  name: Dependency-ordered recovery planning
58
  difficulty: hard
59
  description: Plan a dependency-safe recovery sequence for a disrupted clinical workflow.
 
14
 
15
  tasks:
16
  - id: sec_easy
17
+ grader: server.graders.security_grader.grade
18
  name: Single vulnerability classification
19
  difficulty: easy
20
  description: Identify vulnerability type, CVSS score, and severity from a tool-call snippet.
21
 
22
  - id: sec_medium
23
+ grader: server.graders.security_grader.grade
24
  name: Vulnerability identification + fix proposal
25
  difficulty: medium
26
  description: Identify the vulnerability and propose a secure code fix.
27
 
28
  - id: sec_hard
29
+ grader: server.graders.security_grader.grade
30
  name: Adversarial patch defense with reviewer feedback
31
  difficulty: hard
32
  description: Identify, fix, and iteratively revise based on reviewer feedback.
33
 
34
  - id: dep_easy
35
+ grader: server.graders.dependency_grader.grade
36
  name: PyTorch 1.x deprecated API detection
37
  difficulty: easy
38
  description: Flag outdated packages and deprecated API usage.
39
 
40
  - id: dep_medium
41
+ grader: server.graders.dependency_grader.grade
42
  name: Version conflict chain resolution
43
  difficulty: medium
44
  description: Resolve version conflicts using compatibility matrix constraints.
45
 
46
  - id: dep_hard
47
+ grader: server.graders.dependency_grader.grade
48
  name: torch.compile graph-break hunter
49
  difficulty: hard
50
  description: Fix torch.compile graph-break patterns in dependency order.
51
 
52
  - id: cli_easy
53
+ grader: server.graders.clinical_grader.grade
54
  name: Single workflow gap detection
55
  difficulty: easy
56
  description: Detect missing steps in a clinical workflow and assess risk.
57
 
58
  - id: cli_medium
59
+ grader: server.graders.clinical_grader.grade
60
  name: Multi-gap priority ranking
61
  difficulty: medium
62
  description: Detect gaps and rank them by clinical priority.
63
 
64
  - id: cli_hard
65
+ grader: server.graders.clinical_grader.grade
66
  name: Dependency-ordered recovery planning
67
  difficulty: hard
68
  description: Plan a dependency-safe recovery sequence for a disrupted clinical workflow.
server/graders/clinical_grader.py CHANGED
@@ -11,7 +11,7 @@
11
  # Extra steps penalized more heavily
12
 
13
  import math
14
- from typing import Dict, List
15
  from .base_grader import grade_dynamic, safe_score
16
 
17
  VALID_ACTIONS = ['detect_gap', 'rank_issues', 'order_steps']
@@ -209,6 +209,10 @@ def compute_correctness(action: Dict, case: Dict) -> float:
209
  return None
210
 
211
 
212
- def grade(action: Dict, session) -> float:
213
- """Entry point called by router. Runs full reward pipeline."""
 
 
 
 
214
  return grade_dynamic(action, session, compute_correctness, VALID_ACTIONS, FORBIDDEN, max_steps=6)
 
11
  # Extra steps penalized more heavily
12
 
13
  import math
14
+ from typing import Dict, List, Any
15
  from .base_grader import grade_dynamic, safe_score
16
 
17
  VALID_ACTIONS = ['detect_gap', 'rank_issues', 'order_steps']
 
209
  return None
210
 
211
 
212
+ def grade(action: Dict = None, session: Any = None) -> float:
213
+ """Entry point called by router. Runs full reward pipeline.
214
+ Survives parameterless reflection testing by returning 0.01.
215
+ """
216
+ if action is None or session is None:
217
+ return 0.01
218
  return grade_dynamic(action, session, compute_correctness, VALID_ACTIONS, FORBIDDEN, max_steps=6)
server/graders/dependency_grader.py CHANGED
@@ -9,7 +9,7 @@
9
  # 3. _score_migrate: fix_quality was too generous (0.6 partial credit)
10
  # FIX: Lowered partial credit to 0.3, required more precise token matching
11
 
12
- from typing import Dict
13
  from .base_grader import grade_dynamic, safe_score
14
 
15
  try:
@@ -292,6 +292,10 @@ def compute_correctness(action: Dict, case: Dict) -> float:
292
  return None
293
 
294
 
295
- def grade(action: Dict, session) -> float:
296
- """Entry point called by router. Runs full reward pipeline."""
 
 
 
 
297
  return grade_dynamic(action, session, compute_correctness, VALID_ACTIONS, FORBIDDEN, max_steps=8)
 
9
  # 3. _score_migrate: fix_quality was too generous (0.6 partial credit)
10
  # FIX: Lowered partial credit to 0.3, required more precise token matching
11
 
12
+ from typing import Dict, Any
13
  from .base_grader import grade_dynamic, safe_score
14
 
15
  try:
 
292
  return None
293
 
294
 
295
+ def grade(action: Dict = None, session: Any = None) -> float:
296
+ """Entry point called by router. Runs full reward pipeline.
297
+ Survives parameterless reflection testing by returning 0.01.
298
+ """
299
+ if action is None or session is None:
300
+ return 0.01
301
  return grade_dynamic(action, session, compute_correctness, VALID_ACTIONS, FORBIDDEN, max_steps=8)
server/graders/security_grader.py CHANGED
@@ -7,7 +7,7 @@
7
  # 3. _score_revise: floor lowered from 0.20 to 0.10 — revise should be hard
8
  # 4. All three scorers now have tighter weights that produce real variance
9
 
10
- from typing import Dict
11
  from .base_grader import grade_dynamic, safe_score
12
 
13
  VALID_ACTIONS = ['identify_vulnerability', 'propose_fix', 'revise_fix']
@@ -172,6 +172,10 @@ def compute_correctness(action: Dict, case: Dict) -> float:
172
  return None
173
 
174
 
175
- def grade(action: Dict, session) -> float:
176
- """Entry point called by router. Runs full reward pipeline."""
 
 
 
 
177
  return grade_dynamic(action, session, compute_correctness, VALID_ACTIONS, FORBIDDEN, max_steps=8)
 
7
  # 3. _score_revise: floor lowered from 0.20 to 0.10 — revise should be hard
8
  # 4. All three scorers now have tighter weights that produce real variance
9
 
10
+ from typing import Dict, Any
11
  from .base_grader import grade_dynamic, safe_score
12
 
13
  VALID_ACTIONS = ['identify_vulnerability', 'propose_fix', 'revise_fix']
 
172
  return None
173
 
174
 
175
+ def grade(action: Dict = None, session: Any = None) -> float:
176
+ """Entry point called by router. Runs full reward pipeline.
177
+ Survives parameterless reflection testing by returning 0.01.
178
+ """
179
+ if action is None or session is None:
180
+ return 0.01
181
  return grade_dynamic(action, session, compute_correctness, VALID_ACTIONS, FORBIDDEN, max_steps=8)