SimranShaikh commited on
Commit
bdbb08b
Β·
verified Β·
1 Parent(s): 220b4a7
Files changed (1) hide show
  1. environment/graders.py +259 -0
environment/graders.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Deterministic graders for each task.
3
+ All graders return a score in [0.0, 1.0] and a feedback message.
4
+ """
5
+ from typing import List, Tuple
6
+ from environment.models import CodeReviewAction, Issue
7
+
8
+
9
+ # ─────────────────────────────────────────────────────────────
10
+ # Helpers
11
+ # ─────────────────────────────────────────────────────────────
12
+
13
+ def _keyword_hit(text: str, keywords: List[str]) -> bool:
14
+ """Case-insensitive check β€” does `text` contain any keyword?"""
15
+ text_lower = text.lower()
16
+ return any(kw.lower() in text_lower for kw in keywords)
17
+
18
+
19
+ def _keyword_score(text: str, keywords: List[str]) -> float:
20
+ """Fraction of keywords found in text (0.0 – 1.0)."""
21
+ if not keywords:
22
+ return 0.0
23
+ hits = sum(1 for kw in keywords if kw.lower() in text.lower())
24
+ return hits / len(keywords)
25
+
26
+
27
def _issue_text(issues: List[Issue]) -> str:
    """Flatten every issue's fields into one lowercase string for keyword matching.

    Includes issue_type, description, and (when present) the line number,
    space-joined and lowercased.
    """
    pieces: List[str] = []
    for issue in issues:
        pieces.append(issue.issue_type)
        pieces.append(issue.description)
        line_no = issue.line_number
        if line_no is not None:
            pieces.append(str(line_no))
    return " ".join(pieces).lower()
36
+
37
+
38
+ # ─────────────────────────────────────────────────────────────
39
+ # Easy grader: syntax error detection
40
+ # ─────────────────────────────────────────────────────────────
41
+
42
def grade_easy(action: CodeReviewAction, ground_truth: dict) -> Tuple[float, str]:
    """
    Rubric (total 1.0):
      0.35 — identified issue_type == "syntax_error"
      0.35 — description mentions the relevant keywords (colon / if / syntax)
      0.30 — suggested_fix contains the corrected line
    """
    total = 0.0
    lines: List[str] = []

    found_types = [issue.issue_type.lower() for issue in action.identified_issues]
    combined = _issue_text(action.identified_issues) + " " + (action.explanation or "")

    # 1) Issue type check
    if "syntax_error" in found_types:
        total += 0.35
        lines.append("✅ Correctly identified as a syntax error (+0.35)")
    else:
        lines.append(
            f"❌ Expected issue_type='syntax_error', got {found_types} (+0.00)"
        )

    # 2) Description keyword check — half the keywords already earn full marks
    kw_frac = _keyword_score(combined, ground_truth["keywords"])
    desc_points = round(0.35 * min(kw_frac * 2, 1.0), 3)
    total += desc_points
    marker = "✅" if desc_points > 0.1 else "❌"
    lines.append(
        f"{marker} Description accuracy: "
        f"{desc_points:.2f}/0.35 (keyword match {kw_frac:.0%})"
    )

    # 3) Fix quality check
    if _keyword_hit(action.suggested_fix or "", ground_truth["fix_keywords"]):
        total += 0.30
        lines.append("✅ Suggested fix contains the correct patch (+0.30)")
    else:
        lines.append("❌ Suggested fix missing or incorrect (+0.00)")

    return round(min(total, 1.0), 4), "\n".join(lines)
83
+
84
+
85
+ # ─────────────────────────────────────────────────────────────
86
+ # Medium grader: logic bug detection
87
+ # ─────────────────────────────────────────────────────────────
88
+
89
+ def _run_is_palindrome(code: str) -> List[Tuple[str, bool, bool]]:
90
+ """
91
+ Execute the patched `is_palindrome` function in a subprocess-safe sandbox.
92
+ Returns list of (input, expected, actual).
93
+ """
94
+ import subprocess, sys, json, textwrap
95
+
96
+ test_driver = textwrap.dedent(f"""
97
+ {code}
98
+
99
+ import json, sys
100
+ cases = [
101
+ ("racecar", True),
102
+ ("hello", False),
103
+ ("amanaplanacanalpanama", True),
104
+ ("abba", True),
105
+ ("abc", False),
106
+ ]
107
+ results = []
108
+ for inp, exp in cases:
109
+ try:
110
+ got = is_palindrome(inp)
111
+ results.append([inp, exp, bool(got)])
112
+ except Exception as e:
113
+ results.append([inp, exp, None])
114
+ print(json.dumps(results))
115
+ """)
116
+ try:
117
+ out = subprocess.run(
118
+ [sys.executable, "-c", test_driver],
119
+ capture_output=True, text=True, timeout=5
120
+ )
121
+ if out.returncode != 0:
122
+ return []
123
+ return [tuple(r) for r in json.loads(out.stdout.strip())]
124
+ except Exception:
125
+ return []
126
+
127
+
128
def grade_medium(action: CodeReviewAction, ground_truth: dict) -> Tuple[float, str]:
    """
    Rubric (total 1.0):
      0.25 — identified issue_type == "logic_bug"
      0.25 — description mentions index / off-by-one keywords
      0.50 — suggested fix passes all 5 test cases (0.10 each)
    """
    total = 0.0
    lines: List[str] = []

    found_types = [issue.issue_type.lower() for issue in action.identified_issues]
    combined = _issue_text(action.identified_issues) + " " + (action.explanation or "")

    # 1) Issue type
    if "logic_bug" in found_types:
        total += 0.25
        lines.append("✅ Correctly identified as a logic bug (+0.25)")
    else:
        lines.append(f"❌ Expected 'logic_bug', got {found_types} (+0.00)")

    # 2) Description accuracy — 40% of the keywords already earn full marks
    kw_frac = _keyword_score(combined, ground_truth["keywords"])
    desc_pts = round(0.25 * min(kw_frac * 2.5, 1.0), 3)
    total += desc_pts
    marker = "✅" if desc_pts > 0.08 else "❌"
    lines.append(
        f"{marker} Description accuracy: "
        f"{desc_pts:.2f}/0.25 (keyword match {kw_frac:.0%})"
    )

    # 3) Fix execution test — run the candidate code against the fixed cases
    fix_code = action.suggested_fix or ""
    if not fix_code.strip():
        lines.append("❌ No suggested fix provided (+0.00)")
    else:
        results = _run_is_palindrome(fix_code)
        if not results:
            lines.append("❌ Fix code could not be executed (+0.00)")
        else:
            passed = sum(1 for _, exp, got in results if got == exp)
            pts = round(0.50 * (passed / len(results)), 3)
            total += pts
            overall = "✅" if passed == len(results) else "⚠️"
            lines.append(
                f"{overall} Fix passed "
                f"{passed}/{len(results)} test cases → +{pts:.2f}/0.50"
            )
            for inp, exp, got in results:
                status = "✅" if got == exp else "❌"
                lines.append(f"  {status} is_palindrome({inp!r}) → {got} (expected {exp})")

    return round(min(total, 1.0), 4), "\n".join(lines)
179
+
180
+
181
+ # ─────────────────────────────────────────────────────────────
182
+ # Hard grader: security vulnerability detection
183
+ # ─────────────────────────────────────────────────────────────
184
+
185
def grade_hard(action: CodeReviewAction, ground_truth: dict) -> Tuple[float, str]:
    """
    Rubric (total 1.0) — 3 vulnerabilities, each worth ~0.33:
      Per vulnerability:
        0.15 — identified as security_vulnerability
        0.10 — description mentions relevant keywords
        0.08 — fix mentions remediation keywords
      Bonus 0.05 for finding all 3 and providing a complete fixed file.
    """
    vulns = ground_truth["vulnerabilities"]
    share = 1.0 / len(vulns)

    issue_blob = _issue_text(action.identified_issues) + " " + (action.explanation or "")
    fix_blob = (action.suggested_fix or "") + " " + (action.explanation or "")

    # NOTE(review): this check does not vary per vulnerability — the grader
    # awards the "flagged" share for EVERY vuln as soon as any issue has a
    # security issue_type. Preserved as-is; confirm intent before changing.
    flagged_security = any(
        "security" in issue.issue_type.lower() for issue in action.identified_issues
    )

    total = 0.0
    lines: List[str] = []
    found = 0

    for vuln in vulns:
        name = vuln["name"]
        lines.append(f"\n🔍 Checking: {name}")

        # a) issue_type == security_vulnerability
        if flagged_security:
            total += share * 0.45
            lines.append(f"  ✅ Flagged as security vulnerability (+{share*0.45:.3f})")
        else:
            lines.append("  ❌ Not flagged as security vulnerability (+0.00)")

        # b) description keyword match (a third of the keywords = full marks)
        if _keyword_hit(issue_blob, vuln["keywords"]):
            frac = _keyword_score(issue_blob, vuln["keywords"])
            pts = round(share * 0.30 * min(frac * 3, 1.0), 4)
            total += pts
            lines.append(f"  ✅ Identified '{name}' in description (+{pts:.3f})")
            found += 1
        else:
            lines.append(f"  ❌ '{name}' not mentioned in description (+0.00)")

        # c) fix keyword match
        if _keyword_hit(fix_blob, vuln["fix_keywords"]):
            total += share * 0.25
            lines.append(f"  ✅ Fix addresses '{name}' (+{share*0.25:.3f})")
        else:
            lines.append(f"  ❌ Fix doesn't address '{name}' (+0.00)")

    # Bonus: all vulnerabilities were identified in the description
    if found == len(vulns):
        total = min(total + 0.05, 1.0)
        lines.append("\n🎯 Bonus: All 3 vulnerabilities identified! (+0.05)")

    return round(min(total, 1.0), 4), "\n".join(lines)
245
+
246
+
247
+ # ────────────────────────────────────────────────��────────────
248
+ # Dispatcher
249
+ # ─────────────────────────────────────────────────────────────
250
+
251
def grade(task_id: str, action: CodeReviewAction, ground_truth: dict) -> Tuple[float, str]:
    """Dispatch to the grader for `task_id`; raise ValueError for unknown tasks."""
    graders = {
        "easy_syntax": grade_easy,
        "medium_logic": grade_medium,
        "hard_security": grade_hard,
    }
    try:
        grader = graders[task_id]
    except KeyError:
        raise ValueError(f"No grader for task: {task_id}") from None
    return grader(action, ground_truth)