uvpatel7271 committed
Commit 6266f5f · verified · 1 Parent(s): 605cd75

Upload folder using huggingface_hub

Files changed (2):
  1. models.py +29 -21
  2. server/env.py +347 -203
models.py CHANGED
@@ -31,9 +31,17 @@ class RewardDetails(BaseModel):
     test_reward: float = Field(default=0.0, description="Reward from passing tests")
     quality_bonus: float = Field(default=0.0, description="Bonus for code quality improvements")
     correctness_bonus: float = Field(default=0.0, description="Bonus for full correctness")
+    progress_delta: float = Field(default=0.0, description="Reward from score improvement")
+    stagnation_penalty: float = Field(default=0.0, description="Penalty for code not changing")
+    regression_penalty: float = Field(default=0.0, description="Penalty for score decline")
     invalid_action_penalty: float = Field(default=0.0, description="Penalty for invalid actions")
     timeout_penalty: float = Field(default=0.0, description="Penalty for timeouts")
     reason: str = Field(..., description="Explanation of reward")
+
+    # Debug info
+    prev_score: float = Field(default=0.0, description="Score before this step")
+    curr_score: float = Field(default=0.0, description="Score after this step")
+    code_changed: bool = Field(default=False, description="Whether code was modified")
 
 
 class PythonCodeReviewAction(Action):
@@ -43,26 +51,26 @@ class PythonCodeReviewAction(Action):
     code: Optional[str] = Field(default=None, description="New code for edit_code actions")
 
 
-class PythonCodeReviewObservation(Observation):
-    """Observation returned by reset() and step()."""
-
-    task_id: str = Field(..., description="Current task identifier")
-    title: str = Field(default="", description="Human-readable task title")
-    difficulty: Difficulty = Field(..., description="Task difficulty level")
-    task_kind: Optional[TaskKind] = Field(default=None, description="Task type")
-    task_description: str = Field(..., description="Detailed task description")
-    current_code: str = Field(..., description="Current code state")
-    errors: str = Field(..., description="Syntax/compilation errors, if any")
-    test_results: str = Field(..., description="Results from test execution")
-    visible_tests: List[str] = Field(default_factory=list, description="Public test cases")
-    history: List[HistoryEntry] = Field(default_factory=list, description="Action history")
-    attempts_remaining: int = Field(..., ge=0, description="Actions left in episode")
-    last_action_status: str = Field(default="", description="Outcome message from the last action")
-    score: float = Field(..., ge=0.0, le=1.0, description="Current episode score")
-    reward_details: RewardDetails = Field(
-        default_factory=lambda: RewardDetails(value=0.0, reason="Reset"),
-        description="Detailed reward breakdown for the last action",
-    )
+class PythonCodeReviewObservation(Observation):
+    """Observation returned by reset() and step()."""
+
+    task_id: str = Field(..., description="Current task identifier")
+    title: str = Field(default="", description="Human-readable task title")
+    difficulty: Difficulty = Field(..., description="Task difficulty level")
+    task_kind: Optional[TaskKind] = Field(default=None, description="Task type")
+    task_description: str = Field(..., description="Detailed task description")
+    current_code: str = Field(..., description="Current code state")
+    errors: str = Field(..., description="Syntax/compilation errors, if any")
+    test_results: str = Field(..., description="Results from test execution")
+    visible_tests: List[str] = Field(default_factory=list, description="Public test cases")
+    history: List[HistoryEntry] = Field(default_factory=list, description="Action history")
+    attempts_remaining: int = Field(..., ge=0, description="Actions left in episode")
+    last_action_status: str = Field(default="", description="Outcome message from the last action")
+    score: float = Field(..., ge=0.0, le=1.0, description="Current episode score")
+    reward_details: RewardDetails = Field(
+        default_factory=lambda: RewardDetails(value=0.0, reason="Reset"),
+        description="Detailed reward breakdown for the last action",
+    )
 
 
 class PythonCodeReviewState(State):
@@ -112,4 +120,4 @@ class HealthResponse(BaseModel):
 
     status: Literal["ok"] = "ok"
     environment: str = "python_code_review_env"
-    task_count: int = Field(default=0, ge=0)
+    task_count: int = Field(default=0, ge=0)
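The expanded RewardDetails model makes the scalar value auditable against its components. A minimal sketch of that audit, assuming bonuses are positive and the penalty fields hold positive magnitudes (field names come from the diff above; the numbers are invented):

    from models import RewardDetails

    details = RewardDetails(
        value=0.4,
        syntax_reward=0.35,
        progress_delta=0.1,
        regression_penalty=0.05,
        reason="example step",
        prev_score=0.2,
        curr_score=0.6,
        code_changed=True,
    )
    bonuses = (details.progress_delta + details.syntax_reward + details.test_reward
               + details.quality_bonus + details.correctness_bonus)
    penalties = (details.stagnation_penalty + details.regression_penalty
                 + details.invalid_action_penalty + details.timeout_penalty)
    assert abs(details.value - (bonuses - penalties)) < 1e-6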
server/env.py CHANGED
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import sys
 from typing import List, Optional
 from uuid import uuid4
 
@@ -20,13 +21,16 @@ from models import (
 from tasks import TaskSpec, get_task, list_task_descriptors, list_task_summaries, task_ids
 
 
-# Reward shaping constants
-INVALID_ACTION_PENALTY = 0.1
-QUALITY_BONUS_SCALE = 0.15
-ANALYZE_FAILURE_PENALTY = 0.05
-RUN_FAILURE_PENALTY = 0.05
-TIMEOUT_PENALTY = 0.1
-SUBMIT_BASE_SCALE = 0.1
+# Reward shaping constants (balanced for meaningful variation)
+SYNTAX_FIX_BONUS = 0.35           # One-time bonus for first making the code compile
+TEST_PASS_REWARD_SCALE = 0.30     # Scales improvement in the visible test pass rate
+QUALITY_BONUS_SCALE = 0.15        # Scales code quality improvement
+PROGRESS_SCALE = 0.25             # Scales overall score improvement
+COMPLETION_BONUS = 0.50           # One-time bonus for full correctness
+INVALID_ACTION_PENALTY = 0.15
+STAGNATION_PENALTY = 0.10         # Code unchanged while a compile error persists
+REGRESSION_PENALTY_SCALE = 0.20   # Scales with the size of a score decline
+TIMEOUT_PENALTY = 0.15
 
 
 class PythonCodeReviewEnvironment(
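These constants are combined by _compute_reward_components later in this diff: bonuses add, penalties subtract, and the total is clamped to [-1.0, 1.0]. A standalone sketch of that composition rule with made-up inputs (the helper name combine is illustrative only):

    def combine(progress: float, syntax: float, test: float, quality: float,
                stagnation: float, regression: float, timeout: float) -> float:
        # Mirror the clamped sum used in _compute_reward_components below.
        total = progress + syntax + test + quality - stagnation - regression - timeout
        return max(-1.0, min(1.0, round(total, 6)))

    # Fixing syntax (0.35) plus a small score improvement (0.25 * 0.4 = 0.1):
    assert combine(0.1, 0.35, 0.0, 0.0, 0.0, 0.0, 0.0) == 0.45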
@@ -36,17 +40,22 @@ class PythonCodeReviewEnvironment(
 
     SUPPORTS_CONCURRENT_SESSIONS = True
 
-    def __init__(self) -> None:
-        super().__init__()
-        self._task_order = list(task_ids())
-        self._task_cursor = -1
-        self._task: Optional[TaskSpec] = None
-        self._state = PythonCodeReviewState(episode_id=str(uuid4()))
-        self._done = False
-        self._last_status = "Call reset() to start."
-        self._last_reward = RewardDetails(value=0.0, reason="Environment initialized.")
-        self._best_visible_test_fraction = 0.0
-        self._best_quality_score = 0.0
+    def __init__(self, verbose: bool = True) -> None:
+        super().__init__()
+        self._task_order = list(task_ids())
+        self._task_cursor = -1
+        self._task: Optional[TaskSpec] = None
+        self._state = PythonCodeReviewState(episode_id=str(uuid4()))
+        self._done = False
+        self._last_status = "Call reset() to start."
+        self._last_reward = RewardDetails(value=0.0, reason="Environment initialized.")
+        self._verbose = verbose
+
+        # Progress tracking
+        self._previous_score = 0.0
+        self._previous_code = ""
+        self._best_visible_test_fraction = 0.0
+        self._best_quality_score = 0.0
         self._full_correctness_awarded = False
         self._syntax_reward_awarded = False
 
@@ -69,14 +78,16 @@ class PythonCodeReviewEnvironment(
         self._task_cursor = (self._task_cursor + 1) % len(self._task_order)
         self._task = get_task(self._task_order[self._task_cursor])
 
-        # Reset episode state
+        # Reset episode state and tracking
         self._done = False
+        self._previous_score = 0.0
+        self._previous_code = self._task.starter_code
         self._best_visible_test_fraction = 0.0
         self._best_quality_score = 0.0
         self._full_correctness_awarded = False
         self._syntax_reward_awarded = False
         self._last_status = "Inspect the code, edit it, run tests, then submit."
-        self._last_reward = RewardDetails(value=0.0, reason="Episode reset.")
+        self._last_reward = RewardDetails(value=0.0, reason="Episode reset.", prev_score=0.0, curr_score=0.0)
 
         self._state = PythonCodeReviewState(
             episode_id=episode_id or str(uuid4()),
@@ -93,6 +104,11 @@
             done=False,
         )
 
+        if self._verbose:
+            print(f"\n{'='*70}")
+            print(f"RESET: Task {self._task.task_id} ({self._task.difficulty})")
+            print(f"{'='*70}")
+
         return self._build_observation()
 
     def step(
@@ -148,6 +164,10 @@
             self._finalize_episode(auto_submit=True)
             self._state.done = True
 
+        # Debug logging
+        if self._verbose:
+            self._log_debug_step(reward)
+
         return self._build_observation()
 
     @property
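With verbose defaulting to True, reset() now prints a task banner and every step() is followed by a reward-breakdown line. A hypothetical smoke-test loop; the import paths and the action-type literals are assumed from the handler names in this diff rather than shown verbatim in it:

    from server.env import PythonCodeReviewEnvironment
    from models import PythonCodeReviewAction

    env = PythonCodeReviewEnvironment(verbose=True)
    obs = env.reset()  # prints the RESET banner
    for action_type in ("analyze_code", "run_tests", "submit_solution"):
        obs = env.step(PythonCodeReviewAction(action_type=action_type))
        if env.state.done:
            break
    print(obs.score, obs.reward_details.reason)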
@@ -155,13 +175,9 @@ class PythonCodeReviewEnvironment(
         """Return the current environment state."""
         return self._state.model_copy(deep=True)
 
-    def list_task_summaries(self) -> List[object]:
-        """Return public task metadata."""
-        return list_task_summaries()
-
-    def list_tasks(self) -> List[object]:
-        """Return all public task descriptors."""
-        return list_task_descriptors()
+    def list_task_summaries(self) -> List[object]:
+        """Return public task metadata."""
+        return list_task_summaries()
 
     def get_task(self, task_id: str) -> object:
         """Return a single task descriptor."""
@@ -175,60 +191,79 @@ class PythonCodeReviewEnvironment(
         """Expose deterministic grading outside of an active episode."""
         return grade_task(code, get_task(task_id), include_hidden=True)
 
-    def _build_observation(self) -> PythonCodeReviewObservation:
-        """Build current observation from state."""
-        return PythonCodeReviewObservation(
-            task_id=self._state.task_id or "",
-            title=self._task.title if self._task else "",
-            difficulty=self._state.difficulty or "easy",
-            task_kind=self._state.task_kind,
-            task_description=self._task.task_description if self._task else "",
-            current_code=self._state.current_code,
-            errors=self._state.errors,
-            test_results=self._state.test_results,
-            visible_tests=self._task.visible_tests if self._task else [],
-            history=self._state.history,
-            attempts_remaining=self._state.attempts_remaining,
-            last_action_status=self._last_status,
-            score=self._state.score,
-            reward=self._last_reward.value,
-            reward_details=self._last_reward,
-            done=self._done,
-            metadata={
-                "episode_id": self._state.episode_id,
-                "step_count": self._state.step_count,
-                "task_kind": self._state.task_kind,
-            },
-        )
-
-    def _handle_analyze(self) -> tuple[RewardDetails, str]:
-        """Analyze code for errors and test status."""
-        if self._task is None:
-            return RewardDetails(value=0.0, reason="Invalid state"), "Error: task not loaded"
-
-        grade = grade_task(self._state.current_code, self._task, include_hidden=False)
-        error = grade.details.get("compile_error", "")
+    def _build_observation(self) -> PythonCodeReviewObservation:
+        """Build current observation from state."""
+        return PythonCodeReviewObservation(
+            task_id=self._state.task_id or "",
+            title=self._task.title if self._task else "",
+            difficulty=self._state.difficulty or "easy",
+            task_kind=self._state.task_kind,
+            task_description=self._task.task_description if self._task else "",
+            current_code=self._state.current_code,
+            errors=self._state.errors,
+            test_results=self._state.test_results,
+            visible_tests=self._task.visible_tests if self._task else [],
+            history=self._state.history,
+            attempts_remaining=self._state.attempts_remaining,
+            last_action_status=self._last_status,
+            score=self._state.score,
+            reward_details=self._last_reward,
+        )
+
+    def _handle_analyze(self) -> tuple[RewardDetails, str]:
+        """Analyze code for errors and test status."""
+        if self._task is None:
+            return RewardDetails(value=0.0, reason="Invalid state"), "Error: task not loaded"
 
+        # Grade current code
+        curr_grade = grade_task(self._state.current_code, self._task, include_hidden=False)
+        curr_score = curr_grade.score
+        error = curr_grade.details.get("compile_error", "")
+
+        # Compute reward components
+        reward_components = self._compute_reward_components(
+            curr_score=curr_score,
+            prev_score=self._previous_score,
+            curr_grade=curr_grade,
+            code_changed=False,  # analyze doesn't change code
+        )
+
+        # Status message
         if error:
             self._state.errors = error
             self._state.test_results = "Compilation failed. Fix syntax first."
             summary = f"Syntax error detected: {error}"
+        else:
+            self._state.errors = ""
+            if self._task.task_kind == "syntax_fix":
+                self._state.test_results = "Code compiles successfully."
+                summary = "Code compiles. Ready to submit."
+            else:
+                visible_total = len(self._task.visible_tests)
+                visible_passed = curr_grade.tests_passed
+                self._state.test_results = f"Test run: {visible_passed}/{visible_total} passing."
+                summary = self._state.test_results
+
+        reward = RewardDetails(
+            value=reward_components["total"],
+            progress_delta=reward_components["progress"],
+            syntax_reward=reward_components["syntax"],
+            test_reward=reward_components["test"],
+            quality_bonus=reward_components["quality"],
+            stagnation_penalty=reward_components["stagnation"],
+            regression_penalty=reward_components["regression"],
+            timeout_penalty=TIMEOUT_PENALTY if curr_grade.timed_out else 0.0,
+            reason=summary,
+            prev_score=round(self._previous_score, 4),
+            curr_score=round(curr_score, 4),
+            code_changed=False,
+        )
+
+        # Update state
+        self._state.score = curr_score
+        self._state.errors = curr_grade.details.get("compile_error", "")
+        self._append_history("analyze_code", summary, reward.value)
+        return reward, summary
 
     def _handle_edit(self, action: PythonCodeReviewAction) -> tuple[RewardDetails, str]:
         """Edit the code and compute reward for progress."""
@@ -246,109 +281,218 @@ class PythonCodeReviewEnvironment(
             self._append_history("edit_code", status, reward.value)
             return reward, status
 
-        # Grade before and after
-        previous_grade = grade_task(self._state.current_code, self._task, include_hidden=False)
-        new_grade = grade_task(code, self._task, include_hidden=False)
-        self._state.current_code = code
-
+        # Detect code change
+        code_changed = (code != self._previous_code)
+
+        # Grade the proposed code
+        curr_grade = grade_task(code, self._task, include_hidden=False)
+        curr_score = curr_grade.score
+
+        # Compute reward components
+        reward_components = self._compute_reward_components(
+            curr_score=curr_score,
+            prev_score=self._previous_score,
+            curr_grade=curr_grade,
+            code_changed=code_changed,
+            prev_syntax_score=grade_task(self._previous_code, self._task, include_hidden=False).syntax_score,
+        )
+
         # Update state
-        self._state.errors = new_grade.details.get("compile_error", "")
-        self._state.test_results = self._format_test_results(new_grade)
-
-        # Compute reward with shaping
-        syntax_reward = 0.0
-        if previous_grade.syntax_score < 1.0 and new_grade.syntax_score == 1.0:
-            syntax_reward = 0.2
-            self._syntax_reward_awarded = True
-
-        quality_delta = new_grade.quality_score - previous_grade.quality_score
-        quality_bonus = max(min(quality_delta * QUALITY_BONUS_SCALE, 0.1), -0.1)
-        if new_grade.quality_score > self._best_quality_score:
-            self._best_quality_score = new_grade.quality_score
-
-        progress_reward = 0.2 * (new_grade.score - previous_grade.score)
-        if new_grade.tests_total > 0:
-            current_test_fraction = new_grade.tests_passed / new_grade.tests_total
-            self._best_visible_test_fraction = max(self._best_visible_test_fraction, current_test_fraction)
-
-        penalty = self._grade_penalty(new_grade)
-        reward_value = round(progress_reward + syntax_reward + quality_bonus - penalty, 6)
-
-        status = "Code updated."
-        if self._state.errors:
-            status = f"Code updated with syntax issues: {self._state.errors}"
-        elif new_grade.tests_total > 0:
+        self._state.current_code = code
+        self._previous_code = code
+        self._state.errors = curr_grade.details.get("compile_error", "")
+        self._state.test_results = self._format_test_results(curr_grade)
+        self._state.score = curr_score
+
+        status = "Code updated."
+        if self._state.errors:
+            status = f"Code updated, but syntax issues remain: {self._state.errors}"
+        elif curr_grade.tests_total > 0:
             status = self._state.test_results
 
-        reward = RewardDetails(
-            value=reward_value,
-            syntax_reward=syntax_reward,
-            quality_bonus=round(quality_bonus, 6),
-            test_reward=round(progress_reward, 6),
-            timeout_penalty=TIMEOUT_PENALTY if new_grade.timed_out else 0.0,
-            reason=status,
-        )
-        self._append_history("edit_code", status, reward_value)
-        self._sync_score(include_hidden=False)
-        return reward, status
+        reward = RewardDetails(
+            value=reward_components["total"],
+            progress_delta=reward_components["progress"],
+            syntax_reward=reward_components["syntax"],
+            test_reward=reward_components["test"],
+            quality_bonus=reward_components["quality"],
+            stagnation_penalty=reward_components["stagnation"],
+            regression_penalty=reward_components["regression"],
+            timeout_penalty=TIMEOUT_PENALTY if curr_grade.timed_out else 0.0,
+            reason=status,
+            prev_score=round(self._previous_score, 4),
+            curr_score=round(curr_score, 4),
+            code_changed=code_changed,
+        )
+        self._previous_score = curr_score
+
+        self._append_history("edit_code", status, reward.value)
+        return reward, status
 
     def _handle_run_tests(self) -> tuple[RewardDetails, str]:
         """Run tests and provide feedback."""
         if self._task is None:
             return RewardDetails(value=0.0, reason="Invalid state"), "Error: task not loaded"
 
-        grade = grade_task(self._state.current_code, self._task, include_hidden=False)
-        self._state.errors = grade.details.get("compile_error", "")
-        self._state.test_results = self._format_test_results(grade)
-
-        previous_score = self._state.score
-        progress_reward = 0.2 * (grade.score - previous_score)
-        completion_bonus = 0.05 if grade.tests_total > 0 and grade.tests_passed == grade.tests_total else 0.0
-        penalty = self._grade_penalty(grade, failure_penalty=RUN_FAILURE_PENALTY)
-        reward_value = round(progress_reward + completion_bonus - penalty, 6)
-        if grade.tests_total > 0:
-            current_fraction = grade.tests_passed / grade.tests_total
-            self._best_visible_test_fraction = max(self._best_visible_test_fraction, current_fraction)
-
-        status = self._state.test_results if not self._state.errors else self._state.errors
-        reward = RewardDetails(
-            value=reward_value,
-            test_reward=round(progress_reward + completion_bonus, 6),
-            timeout_penalty=TIMEOUT_PENALTY if grade.timed_out else 0.0,
-            reason=status,
-        )
-        self._append_history("run_tests", status, reward.value)
-        self._sync_score(include_hidden=False)
-        return reward, status
+        curr_grade = grade_task(self._state.current_code, self._task, include_hidden=False)
+        curr_score = curr_grade.score
+
+        # Compute reward components
+        reward_components = self._compute_reward_components(
+            curr_score=curr_score,
+            prev_score=self._previous_score,
+            curr_grade=curr_grade,
+            code_changed=True,  # treat every test run as fresh signal
+        )
+
+        # Update state
+        self._state.errors = curr_grade.details.get("compile_error", "")
+        self._state.test_results = self._format_test_results(curr_grade)
+        self._state.score = curr_score
+
+        # Add a completion bonus if all visible tests pass
+        completion_bonus = 0.0
+        if curr_grade.tests_total > 0 and curr_grade.tests_passed == curr_grade.tests_total:
+            completion_bonus = 0.20
+
+        status = self._state.test_results if not self._state.errors else self._state.errors
+        reward = RewardDetails(
+            value=reward_components["total"] + completion_bonus,
+            progress_delta=reward_components["progress"],
+            test_reward=reward_components["test"] + completion_bonus,
+            syntax_reward=reward_components["syntax"],
+            quality_bonus=reward_components["quality"],
+            stagnation_penalty=reward_components["stagnation"],
+            regression_penalty=reward_components["regression"],
+            timeout_penalty=TIMEOUT_PENALTY if curr_grade.timed_out else 0.0,
+            reason=status,
+            prev_score=round(self._previous_score, 4),
+            curr_score=round(curr_score, 4),
+            code_changed=True,
+        )
+        self._previous_score = curr_score
+
+        self._append_history("run_tests", status, reward.value)
+        return reward, status
 
     def _handle_submit(self) -> tuple[RewardDetails, str]:
         """Submit solution and finalize episode."""
         if self._task is None:
             return RewardDetails(value=0.0, reason="Invalid state"), "Error: task not loaded"
 
-        grade = grade_task(self._state.current_code, self._task, include_hidden=True)
-        self._state.errors = grade.details.get("compile_error", "")
-        self._state.test_results = self._format_test_results(grade)
-
-        # Compute final reward bonuses
-        correctness_bonus = 0.0
-        if grade.score >= 0.999999 and not self._full_correctness_awarded:
-            correctness_bonus = 0.5
-            self._full_correctness_awarded = True
-
-        penalty = self._grade_penalty(grade, failure_penalty=RUN_FAILURE_PENALTY)
-        reward_value = round((grade.score * SUBMIT_BASE_SCALE) + correctness_bonus - penalty, 6)
-        self._finalize_episode(auto_submit=False, grade=grade)
-        status = f"Solution submitted. Final score: {grade.score:.3f}"
-
-        reward = RewardDetails(
-            value=reward_value,
-            correctness_bonus=correctness_bonus,
-            timeout_penalty=TIMEOUT_PENALTY if grade.timed_out else 0.0,
-            reason=status,
-        )
-        self._append_history("submit_solution", status, reward_value)
-        return reward, status
+        curr_grade = grade_task(self._state.current_code, self._task, include_hidden=True)
+        curr_score = curr_grade.score
+
+        # Compute reward components
+        reward_components = self._compute_reward_components(
+            curr_score=curr_score,
+            prev_score=self._previous_score,
+            curr_grade=curr_grade,
+            code_changed=False,  # submit doesn't change code
+        )
+
+        # Bonus for full correctness (one-time only)
+        correctness_bonus = 0.0
+        if curr_score >= 0.9999 and not self._full_correctness_awarded:
+            correctness_bonus = COMPLETION_BONUS
+            self._full_correctness_awarded = True
+
+        # Update state
+        self._state.errors = curr_grade.details.get("compile_error", "")
+        self._state.test_results = self._format_test_results(curr_grade)
+        self._state.score = curr_score
+        self._finalize_episode(auto_submit=False, grade=curr_grade)
+
+        reward_value = max(-1.0, min(1.0, reward_components["total"] + correctness_bonus))
+        status = f"Solution submitted. Final score: {curr_score:.3f}"
+
+        reward = RewardDetails(
+            value=reward_value,
+            progress_delta=reward_components["progress"],
+            correctness_bonus=correctness_bonus,
+            syntax_reward=reward_components["syntax"],
+            test_reward=reward_components["test"],
+            quality_bonus=reward_components["quality"],
+            stagnation_penalty=reward_components["stagnation"],
+            regression_penalty=reward_components["regression"],
+            timeout_penalty=TIMEOUT_PENALTY if curr_grade.timed_out else 0.0,
+            reason=status,
+            prev_score=round(self._previous_score, 4),
+            curr_score=round(curr_score, 4),
+            code_changed=False,
+        )
+
+        self._append_history("submit_solution", status, reward_value)
+        return reward, status
+
+    def _compute_reward_components(
+        self,
+        curr_score: float,
+        prev_score: float,
+        curr_grade: TaskGrade,
+        code_changed: bool,
+        prev_syntax_score: float = 0.0,
+    ) -> dict:
+        """Compute all reward components and return them as a dict."""
+        components = {
+            "progress": 0.0,
+            "syntax": 0.0,
+            "test": 0.0,
+            "quality": 0.0,
+            "stagnation": 0.0,
+            "regression": 0.0,
+            "timeout": 0.0,
+            "total": 0.0,
+        }
+
+        # 1. Progress reward: score improvement
+        score_delta = curr_score - prev_score
+        if score_delta > 0:
+            components["progress"] = min(PROGRESS_SCALE * score_delta, 0.25)
+
+        # 2. Syntax reward: one-time bonus for the first time the code compiles
+        if not self._syntax_reward_awarded and curr_grade.syntax_score >= 0.99:
+            if prev_syntax_score < 0.99:
+                components["syntax"] = SYNTAX_FIX_BONUS
+                self._syntax_reward_awarded = True
+
+        # 3. Test reward: improvement over the best visible pass rate so far
+        if curr_grade.tests_total > 0:
+            curr_test_frac = curr_grade.tests_passed / curr_grade.tests_total
+            test_delta = curr_test_frac - self._best_visible_test_fraction
+            if test_delta > 0:
+                components["test"] = min(TEST_PASS_REWARD_SCALE * test_delta, 0.20)
+            self._best_visible_test_fraction = max(self._best_visible_test_fraction, curr_test_frac)
+
+        # 4. Quality reward: improvement over the best quality score so far
+        quality_delta = curr_grade.quality_score - self._best_quality_score
+        if quality_delta > 0:
+            components["quality"] = min(QUALITY_BONUS_SCALE * quality_delta, 0.15)
+        self._best_quality_score = max(self._best_quality_score, curr_grade.quality_score)
+
+        # 5. Stagnation penalty: code unchanged while a compile error persists
+        if not code_changed and curr_grade.details.get("compile_error"):
+            components["stagnation"] = STAGNATION_PENALTY
+
+        # 6. Regression penalty: score decreased
+        if score_delta < 0:
+            components["regression"] = REGRESSION_PENALTY_SCALE * abs(score_delta)
+
+        # 7. Timeout penalty
+        if curr_grade.timed_out:
+            components["timeout"] = TIMEOUT_PENALTY
+
+        # Penalties are stored as positive magnitudes; subtract them, then clamp to [-1.0, 1.0]
+        total = (
+            components["progress"]
+            + components["syntax"]
+            + components["test"]
+            + components["quality"]
+            - components["stagnation"]
+            - components["regression"]
+            - components["timeout"]
+        )
+        components["total"] = max(-1.0, min(1.0, round(total, 6)))
+
+        return components
 
     def _finalize_episode(self, auto_submit: bool, grade: Optional[TaskGrade] = None) -> None:
         """Mark episode as done and set final score."""
@@ -356,24 +500,10 @@ class PythonCodeReviewEnvironment(
         if self._task is None:
             return
         grade = grade_task(self._state.current_code, self._task, include_hidden=True)
-        self._state.errors = grade.details.get("compile_error", "")
-        self._state.test_results = self._format_test_results(grade)
 
         self._state.score = grade.score
         self._done = True
         self._state.done = True
-
-        if auto_submit:
-            self._last_status = f"Step budget exhausted. Final score: {grade.score:.3f}"
-
-    def _sync_score(self, include_hidden: bool) -> None:
-        """Update visible score based on current code."""
-        if self._task is None:
-            return
-        grade = grade_task(self._state.current_code, self._task, include_hidden=include_hidden)
-        # For visible runs, use a soft score; for hidden, it will be finalized on submit
-        if not include_hidden:
-            self._state.score = grade.score
 
     def _format_test_results(self, grade: TaskGrade) -> str:
         """Format test results for display."""
@@ -383,28 +513,42 @@ class PythonCodeReviewEnvironment(
             return "Test execution timed out."
         return f"Tests: {grade.tests_passed}/{grade.tests_total} passing"
 
-    def _append_history(self, action_type: str, status: str, reward: float) -> None:
-        """Append action to history."""
-        entry = HistoryEntry(
-            step=self._state.step_count,
-            action_type=action_type,
-            status=status,
-            reward=reward,
-        )
-        self._state.history.append(entry)
-
-    def _grade_penalty(self, grade: TaskGrade, failure_penalty: float = RUN_FAILURE_PENALTY) -> float:
-        """Return a negative signal when the action leads to an obviously bad result."""
-        penalty = 0.0
-        if grade.details.get("compile_error"):
-            penalty += failure_penalty + grade.score
-        if grade.timed_out:
-            penalty += TIMEOUT_PENALTY
-        if grade.tests_total > 0 and grade.tests_passed == 0:
-            penalty += failure_penalty
-        return round(penalty, 6)
-
-
-# Backwards-compatible aliases used elsewhere in the repo.
-PythonEnvironment = PythonCodeReviewEnvironment
-CodeReviewEnvironment = PythonCodeReviewEnvironment
+    def _append_history(self, action_type: str, status: str, reward: float) -> None:
+        """Append action to history."""
+        entry = HistoryEntry(
+            step=self._state.step_count,
+            action_type=action_type,
+            status=status,
+            reward=reward,
+        )
+        self._state.history.append(entry)
+
+    def _log_debug_step(self, reward: RewardDetails) -> None:
+        """Log step details for debugging."""
+        print(
+            f"\nStep {self._state.step_count:2d} | "
+            f"Score: {reward.curr_score:.3f} | "
+            f"Delta: {(reward.curr_score - reward.prev_score):+.3f} | "
+            f"Reward: {reward.value:+.4f} | "
+            f"Changed: {reward.code_changed}"
+        )
+
+        # Log components if non-zero
+        components = [
+            ("Progress", reward.progress_delta),
+            ("Syntax", reward.syntax_reward),
+            ("Test", reward.test_reward),
+            ("Quality", reward.quality_bonus),
+            ("Stagnation", -reward.stagnation_penalty),
+            ("Regression", -reward.regression_penalty),
+        ]
+
+        non_zero = [f"{name}={val:+.3f}" for name, val in components if abs(val) > 0.001]
+        if non_zero:
+            print(f" | {' | '.join(non_zero)}")
+        print(f" | Reason: {reward.reason}")
+
+
+# Backwards-compatible aliases used elsewhere in the repo.
+PythonEnvironment = PythonCodeReviewEnvironment
+CodeReviewEnvironment = PythonCodeReviewEnvironment
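The restored aliases keep older imports working. For reference, a verbose step logged by _log_debug_step comes out roughly as in the comments below (the import path is assumed from the file layout; spacing follows the f-strings above and the values are invented):

    from server.env import CodeReviewEnvironment  # alias for PythonCodeReviewEnvironment

    env = CodeReviewEnvironment()  # verbose=True by default
    env.reset()
    # Step  3 | Score: 0.600 | Delta: +0.300 | Reward: +0.2250 | Changed: True
    #  | Progress=+0.075 | Test=+0.150
    #  | Reason: Tests: 3/4 passing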