darshanajudiya7 committed on
Commit
b5cee2e
·
verified ·
1 Parent(s): a9df702

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. models.py +11 -6
  2. server/python_env_environment.py +12 -12
models.py CHANGED
@@ -178,23 +178,28 @@ class RewardSummary(BaseModel):
178
  class PythonReviewAction(Action):
179
  """Structured review action emitted by a model or trainer."""
180
 
181
- action_type: ActionType
 
 
 
 
 
 
182
  line_number: Optional[int] = Field(default=None, ge=1)
183
  issue_type: Optional[IssueType] = None
184
  severity: Optional[Severity] = None
185
  comment: Optional[str] = None
186
  suggestion: Optional[str] = None
187
  question: Optional[str] = None
188
-
189
- # Template compatibility
190
- operation: str = "submit_findings"
191
- findings: List[ReviewFinding] = Field(default_factory=list)
192
- patched_code: Optional[str] = None
193
 
194
  @model_validator(mode="after")
195
  def validate_action_shape(self) -> "PythonReviewAction":
196
  """Require the right fields for each action type."""
197
 
 
 
 
 
198
  if self.action_type == ActionType.ADD_COMMENT:
199
  missing = []
200
  if self.line_number is None:
 
178
  class PythonReviewAction(Action):
179
  """Structured review action emitted by a model or trainer."""
180
 
181
+ # Primary UI Fields (matches CodingEnv style)
182
+ operation: str = Field(default="submit_findings", description="The operation to perform.")
183
+ code: Optional[str] = Field(default=None, description="The fixed source code.")
184
+ findings: List[ReviewFinding] = Field(default_factory=list, description="The structured findings list.")
185
+
186
+ # Optional Review Fields (for benchmark compatibility)
187
+ action_type: ActionType = ActionType.ADD_COMMENT
188
  line_number: Optional[int] = Field(default=None, ge=1)
189
  issue_type: Optional[IssueType] = None
190
  severity: Optional[Severity] = None
191
  comment: Optional[str] = None
192
  suggestion: Optional[str] = None
193
  question: Optional[str] = None
 
 
 
 
 
194
 
195
  @model_validator(mode="after")
196
  def validate_action_shape(self) -> "PythonReviewAction":
197
  """Require the right fields for each action type."""
198
 
199
+ # Bypass benchmark validation if using the template 'operation' style (e.g. submit_findings)
200
+ if self.operation != ActionType.ADD_COMMENT:
201
+ return self
202
+
203
  if self.action_type == ActionType.ADD_COMMENT:
204
  missing = []
205
  if self.line_number is None:
server/python_env_environment.py CHANGED
@@ -64,7 +64,7 @@ class ReviewTask:
64
  descriptor: TaskDescriptor
65
  references: tuple[ReferenceFinding, ...]
66
  hint: str
67
- patched_code: Optional[str] = None
68
 
69
 
70
  TASK_BANK: Dict[str, ReviewTask] = {
@@ -96,7 +96,7 @@ TASK_BANK: Dict[str, ReviewTask] = {
96
  ),
97
  ),
98
  hint="Look for state that survives between separate function calls.",
99
- patched_code=(
100
  "def add_tag(tag, tags=None):\n"
101
  " if tags is None:\n"
102
  " tags = []\n"
@@ -132,7 +132,7 @@ TASK_BANK: Dict[str, ReviewTask] = {
132
  ),
133
  ),
134
  hint="Check how external commands are invoked and whether user input is escaped.",
135
- patched_code=(
136
  "import subprocess\n\n"
137
  "def run_backup(path):\n"
138
  " subprocess.run([\"tar\", \"-czf\", \"backup.tgz\", path], check=True)\n"
@@ -184,7 +184,7 @@ TASK_BANK: Dict[str, ReviewTask] = {
184
  ),
185
  ),
186
  hint="Consider what happens to the final error after the retry loop finishes.",
187
- patched_code=(
188
  "import time\n\n"
189
  "def fetch_with_retry(client, url, retries=3):\n"
190
  " last_error = None\n"
@@ -274,12 +274,12 @@ class PythonEnvironment(Environment[PythonAction, PythonObservation, State]):
274
  if operation == "request_hint":
275
  self._hints_used += 1
276
  feedback = self._current_task.hint
277
- evaluation = self._evaluate(self._submitted_findings, action.patched_code)
278
  reward = evaluation.score
279
  else:
280
  if action.findings:
281
  self._submitted_findings.extend(action.findings)
282
- evaluation = self._evaluate(self._submitted_findings, action.patched_code)
283
  reward = evaluation.score
284
  if operation == "finalize":
285
  done = True
@@ -307,7 +307,7 @@ class PythonEnvironment(Environment[PythonAction, PythonObservation, State]):
307
  feedback=feedback,
308
  reward=reward,
309
  done=done,
310
- patched_code=action.patched_code,
311
  )
312
 
313
  def _build_observation(
@@ -316,10 +316,10 @@ class PythonEnvironment(Environment[PythonAction, PythonObservation, State]):
316
  feedback: str,
317
  reward: float,
318
  done: bool,
319
- patched_code: Optional[str] = None,
320
  ) -> PythonObservation:
321
  assert self._current_task is not None
322
- evaluation = self._evaluate(self._submitted_findings, patched_code)
323
  attempts_remaining = max(
324
  self._max_steps() - self._state.step_count,
325
  0,
@@ -345,7 +345,7 @@ class PythonEnvironment(Environment[PythonAction, PythonObservation, State]):
345
  def _evaluate(
346
  self,
347
  findings: Iterable[ReviewFinding],
348
- patched_code: Optional[str],
349
  ) -> TaskEvaluation:
350
  assert self._current_task is not None
351
 
@@ -372,9 +372,9 @@ class PythonEnvironment(Environment[PythonAction, PythonObservation, State]):
372
  weighted_recall = min(matched_weight / total_weight, 1.0)
373
 
374
  patch_score = 0.0
375
- if self._current_task.patched_code and patched_code:
376
  patch_score = float(
377
- _normalize_code(patched_code) == _normalize_code(self._current_task.patched_code)
378
  )
379
 
380
  raw_score = (
 
64
  descriptor: TaskDescriptor
65
  references: tuple[ReferenceFinding, ...]
66
  hint: str
67
+ code: Optional[str] = None
68
 
69
 
70
  TASK_BANK: Dict[str, ReviewTask] = {
 
96
  ),
97
  ),
98
  hint="Look for state that survives between separate function calls.",
99
+ code=(
100
  "def add_tag(tag, tags=None):\n"
101
  " if tags is None:\n"
102
  " tags = []\n"
 
132
  ),
133
  ),
134
  hint="Check how external commands are invoked and whether user input is escaped.",
135
+ code=(
136
  "import subprocess\n\n"
137
  "def run_backup(path):\n"
138
  " subprocess.run([\"tar\", \"-czf\", \"backup.tgz\", path], check=True)\n"
 
184
  ),
185
  ),
186
  hint="Consider what happens to the final error after the retry loop finishes.",
187
+ code=(
188
  "import time\n\n"
189
  "def fetch_with_retry(client, url, retries=3):\n"
190
  " last_error = None\n"
 
274
  if operation == "request_hint":
275
  self._hints_used += 1
276
  feedback = self._current_task.hint
277
+ evaluation = self._evaluate(self._submitted_findings, action.code)
278
  reward = evaluation.score
279
  else:
280
  if action.findings:
281
  self._submitted_findings.extend(action.findings)
282
+ evaluation = self._evaluate(self._submitted_findings, action.code)
283
  reward = evaluation.score
284
  if operation == "finalize":
285
  done = True
 
307
  feedback=feedback,
308
  reward=reward,
309
  done=done,
310
+ code=action.code,
311
  )
312
 
313
  def _build_observation(
 
316
  feedback: str,
317
  reward: float,
318
  done: bool,
319
+ code: Optional[str] = None,
320
  ) -> PythonObservation:
321
  assert self._current_task is not None
322
+ evaluation = self._evaluate(self._submitted_findings, code)
323
  attempts_remaining = max(
324
  self._max_steps() - self._state.step_count,
325
  0,
 
345
  def _evaluate(
346
  self,
347
  findings: Iterable[ReviewFinding],
348
+ code: Optional[str],
349
  ) -> TaskEvaluation:
350
  assert self._current_task is not None
351
 
 
372
  weighted_recall = min(matched_weight / total_weight, 1.0)
373
 
374
  patch_score = 0.0
375
+ if self._current_task.code and code:
376
  patch_score = float(
377
+ _normalize_code(code) == _normalize_code(self._current_task.code)
378
  )
379
 
380
  raw_score = (