uvpatel7271 committed on
Commit
605cd75
·
verified ·
1 Parent(s): a954add

Upload folder using huggingface_hub

Browse files
inference.py CHANGED
@@ -170,8 +170,7 @@ def run_task_episode(
170
 
171
  if verbose:
172
  print(f"Step {step_count}: {action.action_type}")
173
- print(f" Reward: {step_reward:+.4f}")
174
- print(f" Done: {observation.done}")
175
  if step_reward != 0 or observation.reward_details.reason:
176
  print(f" Reward Details: {observation.reward_details.reason}")
177
  if observation.last_action_status:
 
170
 
171
  if verbose:
172
  print(f"Step {step_count}: {action.action_type}")
173
+ print(f" Reward: {step_reward:+.4f} Done: {observation.done}")
 
174
  if step_reward != 0 or observation.reward_details.reason:
175
  print(f" Reward Details: {observation.reward_details.reason}")
176
  if observation.last_action_status:
pytest-cache-files-le0qcl0z/CACHEDIR.TAG ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Signature: 8a477f597d28d172789f06886806bc55
2
+ # This file is a cache directory tag created by pytest.
3
+ # For information about cache directory tags, see:
4
+ # https://bford.info/cachedir/spec.html
pytest-cache-files-le0qcl0z/README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # pytest cache directory #
2
+
3
+ This directory contains data from the pytest's cache plugin,
4
+ which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
5
+
6
+ **Do not** commit this to version control.
7
+
8
+ See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
pytest-cache-files-qm8xzmpt/CACHEDIR.TAG ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Signature: 8a477f597d28d172789f06886806bc55
2
+ # This file is a cache directory tag created by pytest.
3
+ # For information about cache directory tags, see:
4
+ # https://bford.info/cachedir/spec.html
pytest-cache-files-qm8xzmpt/README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # pytest cache directory #
2
+
3
+ This directory contains data from the pytest's cache plugin,
4
+ which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
5
+
6
+ **Do not** commit this to version control.
7
+
8
+ See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
server/env.py CHANGED
@@ -20,9 +20,13 @@ from models import (
20
  from tasks import TaskSpec, get_task, list_task_descriptors, list_task_summaries, task_ids
21
 
22
 
23
- # Reward shaping constants
24
- INVALID_ACTION_PENALTY = 0.1
25
- QUALITY_BONUS_SCALE = 0.15
 
 
 
 
26
 
27
 
28
  class PythonCodeReviewEnvironment(
@@ -197,11 +201,11 @@ class PythonCodeReviewEnvironment(
197
  },
198
  )
199
 
200
- def _handle_analyze(self) -> tuple[RewardDetails, str]:
201
- """Analyze code for errors and test status."""
202
- if self._task is None:
203
- return RewardDetails(value=0.0, reason="Invalid state"), "Error: task not loaded"
204
-
205
  grade = grade_task(self._state.current_code, self._task, include_hidden=False)
206
  error = grade.details.get("compile_error", "")
207
 
@@ -209,21 +213,22 @@ class PythonCodeReviewEnvironment(
209
  self._state.errors = error
210
  self._state.test_results = "Compilation failed. Fix syntax first."
211
  summary = f"Syntax error detected: {error}"
212
- else:
213
- self._state.errors = ""
214
- if self._task.task_kind == "syntax_fix":
215
- self._state.test_results = "Code compiles successfully."
216
- summary = "Code compiles. Ready to submit."
217
- else:
218
- visible_total = len(self._task.visible_tests)
219
- visible_passed = grade.tests_passed
220
- self._state.test_results = f"Test run: {visible_passed}/{visible_total} passing."
221
- summary = self._state.test_results
222
-
223
- reward = RewardDetails(value=0.0, reason=summary)
224
- self._append_history("analyze_code", summary, reward.value)
225
- self._sync_score(include_hidden=False)
226
- return reward, summary
 
227
 
228
  def _handle_edit(self, action: PythonCodeReviewAction) -> tuple[RewardDetails, str]:
229
  """Edit the code and compute reward for progress."""
@@ -241,74 +246,80 @@ class PythonCodeReviewEnvironment(
241
  self._append_history("edit_code", status, reward.value)
242
  return reward, status
243
 
244
- # Grade before and after
245
- previous_grade = grade_task(self._state.current_code, self._task, include_hidden=False)
246
- new_grade = grade_task(code, self._task, include_hidden=False)
247
- self._state.current_code = code
248
 
249
  # Update state
250
  self._state.errors = new_grade.details.get("compile_error", "")
251
  self._state.test_results = self._format_test_results(new_grade)
252
 
253
- # Compute reward with shaping
254
- syntax_reward = 0.0
255
- if previous_grade.syntax_score < 1.0 and new_grade.syntax_score == 1.0:
256
- syntax_reward = 0.2
257
- self._syntax_reward_awarded = True
258
-
259
- quality_delta = max(new_grade.quality_score - self._best_quality_score, 0.0)
260
- quality_bonus = 0.0
261
- if quality_delta > 0:
262
- quality_bonus = min(quality_delta * QUALITY_BONUS_SCALE, 0.1)
263
- self._best_quality_score = new_grade.quality_score
264
-
265
- test_delta = 0.0
266
- if new_grade.tests_total > 0:
267
- current_test_fraction = new_grade.tests_passed / new_grade.tests_total
268
- test_delta = max(current_test_fraction - self._best_visible_test_fraction, 0.0)
269
- self._best_visible_test_fraction = max(self._best_visible_test_fraction, current_test_fraction)
270
-
271
- reward_value = syntax_reward + quality_bonus + (0.15 * test_delta)
272
-
273
- status = "Code updated."
274
- if self._state.errors:
275
- status = f"Code updated with syntax issues: {self._state.errors}"
276
- elif new_grade.tests_total > 0:
277
  status = self._state.test_results
278
 
279
- reward = RewardDetails(
280
- value=reward_value,
281
- syntax_reward=syntax_reward,
282
- quality_bonus=quality_bonus,
283
- test_reward=0.15 * test_delta,
284
- reason=status,
285
- )
286
- self._append_history("edit_code", status, reward_value)
287
- self._sync_score(include_hidden=False)
288
- return reward, status
 
289
 
290
  def _handle_run_tests(self) -> tuple[RewardDetails, str]:
291
  """Run tests and provide feedback."""
292
  if self._task is None:
293
  return RewardDetails(value=0.0, reason="Invalid state"), "Error: task not loaded"
294
 
295
- grade = grade_task(self._state.current_code, self._task, include_hidden=False)
296
- self._state.errors = grade.details.get("compile_error", "")
297
- self._state.test_results = self._format_test_results(grade)
298
-
299
- if grade.tests_total > 0:
300
- current_fraction = grade.tests_passed / grade.tests_total
301
- test_delta = max(current_fraction - self._best_visible_test_fraction, 0.0)
302
- self._best_visible_test_fraction = max(self._best_visible_test_fraction, current_fraction)
303
- test_reward = 0.15 * test_delta
304
- else:
305
- test_reward = 0.0
306
-
307
- status = self._state.test_results if not self._state.errors else self._state.errors
308
- reward = RewardDetails(value=test_reward, test_reward=test_reward, reason=status)
309
- self._append_history("run_tests", status, reward.value)
310
- self._sync_score(include_hidden=False)
311
- return reward, status
 
 
 
 
 
 
312
 
313
  def _handle_submit(self) -> tuple[RewardDetails, str]:
314
  """Submit solution and finalize episode."""
@@ -319,23 +330,25 @@ class PythonCodeReviewEnvironment(
319
  self._state.errors = grade.details.get("compile_error", "")
320
  self._state.test_results = self._format_test_results(grade)
321
 
322
- # Compute final reward bonuses
323
- correctness_bonus = 0.0
324
- if grade.score >= 0.999999 and not self._full_correctness_awarded:
325
- correctness_bonus = 0.5
326
- self._full_correctness_awarded = True
327
-
328
- reward_value = correctness_bonus
329
- self._finalize_episode(auto_submit=False, grade=grade)
330
- status = f"Solution submitted. Final score: {grade.score:.3f}"
331
-
332
- reward = RewardDetails(
333
- value=reward_value,
334
- correctness_bonus=correctness_bonus,
335
- reason=status,
336
- )
337
- self._append_history("submit_solution", status, reward_value)
338
- return reward, status
 
 
339
 
340
  def _finalize_episode(self, auto_submit: bool, grade: Optional[TaskGrade] = None) -> None:
341
  """Mark episode as done and set final score."""
@@ -350,8 +363,8 @@ class PythonCodeReviewEnvironment(
350
  self._done = True
351
  self._state.done = True
352
 
353
- if auto_submit:
354
- self._last_status = f"Step budget exhausted. Final score: {grade.score:.3f}"
355
 
356
  def _sync_score(self, include_hidden: bool) -> None:
357
  """Update visible score based on current code."""
@@ -380,6 +393,17 @@ class PythonCodeReviewEnvironment(
380
  )
381
  self._state.history.append(entry)
382
 
 
 
 
 
 
 
 
 
 
 
 
383
 
384
  # Backwards-compatible aliases used elsewhere in the repo.
385
  PythonEnvironment = PythonCodeReviewEnvironment
 
20
  from tasks import TaskSpec, get_task, list_task_descriptors, list_task_summaries, task_ids
21
 
22
 
23
+ # Reward shaping constants
24
+ INVALID_ACTION_PENALTY = 0.1
25
+ QUALITY_BONUS_SCALE = 0.15
26
+ ANALYZE_FAILURE_PENALTY = 0.05
27
+ RUN_FAILURE_PENALTY = 0.05
28
+ TIMEOUT_PENALTY = 0.1
29
+ SUBMIT_BASE_SCALE = 0.1
30
 
31
 
32
  class PythonCodeReviewEnvironment(
 
201
  },
202
  )
203
 
204
+ def _handle_analyze(self) -> tuple[RewardDetails, str]:
205
+ """Analyze code for errors and test status."""
206
+ if self._task is None:
207
+ return RewardDetails(value=0.0, reason="Invalid state"), "Error: task not loaded"
208
+
209
  grade = grade_task(self._state.current_code, self._task, include_hidden=False)
210
  error = grade.details.get("compile_error", "")
211
 
 
213
  self._state.errors = error
214
  self._state.test_results = "Compilation failed. Fix syntax first."
215
  summary = f"Syntax error detected: {error}"
216
+ else:
217
+ self._state.errors = ""
218
+ if self._task.task_kind == "syntax_fix":
219
+ self._state.test_results = "Code compiles successfully."
220
+ summary = "Code compiles. Ready to submit."
221
+ else:
222
+ visible_total = len(self._task.visible_tests)
223
+ visible_passed = grade.tests_passed
224
+ self._state.test_results = f"Test run: {visible_passed}/{visible_total} passing."
225
+ summary = self._state.test_results
226
+
227
+ reward_value = round((grade.score * 0.05) - self._grade_penalty(grade, failure_penalty=ANALYZE_FAILURE_PENALTY), 6)
228
+ reward = RewardDetails(value=reward_value, reason=summary)
229
+ self._append_history("analyze_code", summary, reward.value)
230
+ self._sync_score(include_hidden=False)
231
+ return reward, summary
232
 
233
  def _handle_edit(self, action: PythonCodeReviewAction) -> tuple[RewardDetails, str]:
234
  """Edit the code and compute reward for progress."""
 
246
  self._append_history("edit_code", status, reward.value)
247
  return reward, status
248
 
249
+ # Grade before and after
250
+ previous_grade = grade_task(self._state.current_code, self._task, include_hidden=False)
251
+ new_grade = grade_task(code, self._task, include_hidden=False)
252
+ self._state.current_code = code
253
 
254
  # Update state
255
  self._state.errors = new_grade.details.get("compile_error", "")
256
  self._state.test_results = self._format_test_results(new_grade)
257
 
258
+ # Compute reward with shaping
259
+ syntax_reward = 0.0
260
+ if previous_grade.syntax_score < 1.0 and new_grade.syntax_score == 1.0:
261
+ syntax_reward = 0.2
262
+ self._syntax_reward_awarded = True
263
+
264
+ quality_delta = new_grade.quality_score - previous_grade.quality_score
265
+ quality_bonus = max(min(quality_delta * QUALITY_BONUS_SCALE, 0.1), -0.1)
266
+ if new_grade.quality_score > self._best_quality_score:
267
+ self._best_quality_score = new_grade.quality_score
268
+
269
+ progress_reward = 0.2 * (new_grade.score - previous_grade.score)
270
+ if new_grade.tests_total > 0:
271
+ current_test_fraction = new_grade.tests_passed / new_grade.tests_total
272
+ self._best_visible_test_fraction = max(self._best_visible_test_fraction, current_test_fraction)
273
+
274
+ penalty = self._grade_penalty(new_grade)
275
+ reward_value = round(progress_reward + syntax_reward + quality_bonus - penalty, 6)
276
+
277
+ status = "Code updated."
278
+ if self._state.errors:
279
+ status = f"Code updated with syntax issues: {self._state.errors}"
280
+ elif new_grade.tests_total > 0:
 
281
  status = self._state.test_results
282
 
283
+ reward = RewardDetails(
284
+ value=reward_value,
285
+ syntax_reward=syntax_reward,
286
+ quality_bonus=round(quality_bonus, 6),
287
+ test_reward=round(progress_reward, 6),
288
+ timeout_penalty=TIMEOUT_PENALTY if new_grade.timed_out else 0.0,
289
+ reason=status,
290
+ )
291
+ self._append_history("edit_code", status, reward_value)
292
+ self._sync_score(include_hidden=False)
293
+ return reward, status
294
 
295
  def _handle_run_tests(self) -> tuple[RewardDetails, str]:
296
  """Run tests and provide feedback."""
297
  if self._task is None:
298
  return RewardDetails(value=0.0, reason="Invalid state"), "Error: task not loaded"
299
 
300
+ grade = grade_task(self._state.current_code, self._task, include_hidden=False)
301
+ self._state.errors = grade.details.get("compile_error", "")
302
+ self._state.test_results = self._format_test_results(grade)
303
+
304
+ previous_score = self._state.score
305
+ progress_reward = 0.2 * (grade.score - previous_score)
306
+ completion_bonus = 0.05 if grade.tests_total > 0 and grade.tests_passed == grade.tests_total else 0.0
307
+ penalty = self._grade_penalty(grade, failure_penalty=RUN_FAILURE_PENALTY)
308
+ reward_value = round(progress_reward + completion_bonus - penalty, 6)
309
+ if grade.tests_total > 0:
310
+ current_fraction = grade.tests_passed / grade.tests_total
311
+ self._best_visible_test_fraction = max(self._best_visible_test_fraction, current_fraction)
312
+
313
+ status = self._state.test_results if not self._state.errors else self._state.errors
314
+ reward = RewardDetails(
315
+ value=reward_value,
316
+ test_reward=round(progress_reward + completion_bonus, 6),
317
+ timeout_penalty=TIMEOUT_PENALTY if grade.timed_out else 0.0,
318
+ reason=status,
319
+ )
320
+ self._append_history("run_tests", status, reward.value)
321
+ self._sync_score(include_hidden=False)
322
+ return reward, status
323
 
324
  def _handle_submit(self) -> tuple[RewardDetails, str]:
325
  """Submit solution and finalize episode."""
 
330
  self._state.errors = grade.details.get("compile_error", "")
331
  self._state.test_results = self._format_test_results(grade)
332
 
333
+ # Compute final reward bonuses
334
+ correctness_bonus = 0.0
335
+ if grade.score >= 0.999999 and not self._full_correctness_awarded:
336
+ correctness_bonus = 0.5
337
+ self._full_correctness_awarded = True
338
+
339
+ penalty = self._grade_penalty(grade, failure_penalty=RUN_FAILURE_PENALTY)
340
+ reward_value = round((grade.score * SUBMIT_BASE_SCALE) + correctness_bonus - penalty, 6)
341
+ self._finalize_episode(auto_submit=False, grade=grade)
342
+ status = f"Solution submitted. Final score: {grade.score:.3f}"
343
+
344
+ reward = RewardDetails(
345
+ value=reward_value,
346
+ correctness_bonus=correctness_bonus,
347
+ timeout_penalty=TIMEOUT_PENALTY if grade.timed_out else 0.0,
348
+ reason=status,
349
+ )
350
+ self._append_history("submit_solution", status, reward_value)
351
+ return reward, status
352
 
353
  def _finalize_episode(self, auto_submit: bool, grade: Optional[TaskGrade] = None) -> None:
354
  """Mark episode as done and set final score."""
 
363
  self._done = True
364
  self._state.done = True
365
 
366
+ if auto_submit:
367
+ self._last_status = f"Step budget exhausted. Final score: {grade.score:.3f}"
368
 
369
  def _sync_score(self, include_hidden: bool) -> None:
370
  """Update visible score based on current code."""
 
393
  )
394
  self._state.history.append(entry)
395
 
396
+ def _grade_penalty(self, grade: TaskGrade, failure_penalty: float = RUN_FAILURE_PENALTY) -> float:
397
+ """Return a negative signal when the action leads to an obviously bad result."""
398
+ penalty = 0.0
399
+ if grade.details.get("compile_error"):
400
+ penalty += failure_penalty + grade.score
401
+ if grade.timed_out:
402
+ penalty += TIMEOUT_PENALTY
403
+ if grade.tests_total > 0 and grade.tests_passed == 0:
404
+ penalty += failure_penalty
405
+ return round(penalty, 6)
406
+
407
 
408
  # Backwards-compatible aliases used elsewhere in the repo.
409
  PythonEnvironment = PythonCodeReviewEnvironment