uvpatel7271 commited on
Commit
1595dbc
·
verified ·
1 Parent(s): 9fa2f22

Upload folder using huggingface_hub

Browse files
__init__.py CHANGED
@@ -1,35 +1,40 @@
1
- """Public package API for the Python code review OpenEnv benchmark."""
2
-
3
- from .client import CodeReviewEnv, MyEnv, PythonEnv
4
- from .models import (
5
- HealthResponse,
6
- HistoryEntry,
7
- PythonCodeReviewAction,
8
- PythonCodeReviewObservation,
9
- PythonCodeReviewState,
10
- PythonReviewAction,
11
- PythonReviewObservation,
12
- PythonReviewReward,
13
- PythonReviewState,
14
- RewardDetails,
15
- TaskDescriptor,
16
- TaskGrade,
17
- )
18
-
19
- __all__ = [
20
- "PythonEnv",
21
- "CodeReviewEnv",
22
- "MyEnv",
23
- "PythonCodeReviewAction",
24
- "PythonCodeReviewObservation",
25
- "PythonCodeReviewState",
26
- "PythonReviewAction",
27
- "PythonReviewObservation",
28
- "PythonReviewReward",
29
- "PythonReviewState",
30
- "RewardDetails",
31
- "HistoryEntry",
32
- "TaskDescriptor",
33
- "TaskGrade",
34
- "HealthResponse",
35
- ]
 
 
 
 
 
 
1
+ """Public package API for the Python code review OpenEnv benchmark."""
2
+
3
+ try:
4
+ from .client import CodeReviewEnv, MyEnv, PythonEnv
5
+ from .models import (
6
+ HealthResponse,
7
+ HistoryEntry,
8
+ PythonCodeReviewAction,
9
+ PythonCodeReviewObservation,
10
+ PythonCodeReviewState,
11
+ RewardDetails,
12
+ TaskDescriptor,
13
+ TaskGrade,
14
+ )
15
+ except ImportError: # pragma: no cover
16
+ from client import CodeReviewEnv, MyEnv, PythonEnv
17
+ from models import (
18
+ HealthResponse,
19
+ HistoryEntry,
20
+ PythonCodeReviewAction,
21
+ PythonCodeReviewObservation,
22
+ PythonCodeReviewState,
23
+ RewardDetails,
24
+ TaskDescriptor,
25
+ TaskGrade,
26
+ )
27
+
28
+ __all__ = [
29
+ "PythonEnv",
30
+ "CodeReviewEnv",
31
+ "MyEnv",
32
+ "PythonCodeReviewAction",
33
+ "PythonCodeReviewObservation",
34
+ "PythonCodeReviewState",
35
+ HealthResponse,
36
+ HistoryEntry,
37
+ RewardDetails,
38
+ TaskDescriptor,
39
+ TaskGrade,
40
+ ]
pytest-cache-files-1f62ra1g/CACHEDIR.TAG ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Signature: 8a477f597d28d172789f06886806bc55
2
+ # This file is a cache directory tag created by pytest.
3
+ # For information about cache directory tags, see:
4
+ # https://bford.info/cachedir/spec.html
pytest-cache-files-1f62ra1g/README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # pytest cache directory #
2
+
3
+ This directory contains data from the pytest's cache plugin,
4
+ which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
5
+
6
+ **Do not** commit this to version control.
7
+
8
+ See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
pytest-cache-files-i2cpw3zw/CACHEDIR.TAG ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Signature: 8a477f597d28d172789f06886806bc55
2
+ # This file is a cache directory tag created by pytest.
3
+ # For information about cache directory tags, see:
4
+ # https://bford.info/cachedir/spec.html
pytest-cache-files-i2cpw3zw/README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # pytest cache directory #
2
+
3
+ This directory contains data from the pytest's cache plugin,
4
+ which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
5
+
6
+ **Do not** commit this to version control.
7
+
8
+ See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
pytest-cache-files-qun9v98v/CACHEDIR.TAG ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Signature: 8a477f597d28d172789f06886806bc55
2
+ # This file is a cache directory tag created by pytest.
3
+ # For information about cache directory tags, see:
4
+ # https://bford.info/cachedir/spec.html
pytest-cache-files-qun9v98v/README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # pytest cache directory #
2
+
3
+ This directory contains data from the pytest's cache plugin,
4
+ which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
5
+
6
+ **Do not** commit this to version control.
7
+
8
+ See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
pytest-cache-files-srp2otxc/CACHEDIR.TAG ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Signature: 8a477f597d28d172789f06886806bc55
2
+ # This file is a cache directory tag created by pytest.
3
+ # For information about cache directory tags, see:
4
+ # https://bford.info/cachedir/spec.html
pytest-cache-files-srp2otxc/README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # pytest cache directory #
2
+
3
+ This directory contains data from the pytest's cache plugin,
4
+ which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
5
+
6
+ **Do not** commit this to version control.
7
+
8
+ See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
pytest-cache-files-u6t7g29i/CACHEDIR.TAG ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Signature: 8a477f597d28d172789f06886806bc55
2
+ # This file is a cache directory tag created by pytest.
3
+ # For information about cache directory tags, see:
4
+ # https://bford.info/cachedir/spec.html
pytest-cache-files-u6t7g29i/README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # pytest cache directory #
2
+
3
+ This directory contains data from the pytest's cache plugin,
4
+ which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
5
+
6
+ **Do not** commit this to version control.
7
+
8
+ See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
pytest-cache-files-x1yzwik9/CACHEDIR.TAG ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Signature: 8a477f597d28d172789f06886806bc55
2
+ # This file is a cache directory tag created by pytest.
3
+ # For information about cache directory tags, see:
4
+ # https://bford.info/cachedir/spec.html
pytest-cache-files-x1yzwik9/README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # pytest cache directory #
2
+
3
+ This directory contains data from the pytest's cache plugin,
4
+ which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
5
+
6
+ **Do not** commit this to version control.
7
+
8
+ See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
server/env.py CHANGED
@@ -40,11 +40,12 @@ Always bounded in [-1.0, +1.0] for interpretability and learning stability.
40
  See RewardDetails in models.py for all fields returned with each reward.
41
  """
42
 
43
- from __future__ import annotations
44
-
45
- import sys
46
- from typing import List, Optional
47
- from uuid import uuid4
 
48
 
49
  from openenv.core.env_server.interfaces import Environment
50
 
@@ -127,10 +128,12 @@ class PythonCodeReviewEnvironment(
127
  # Progress tracking
128
  self._previous_score = 0.0
129
  self._previous_code = ""
130
- self._best_visible_test_fraction = 0.0
131
- self._best_quality_score = 0.0
132
- self._full_correctness_awarded = False
133
- self._syntax_reward_awarded = False
 
 
134
 
135
  def reset(
136
  self,
@@ -155,12 +158,14 @@ class PythonCodeReviewEnvironment(
155
  self._done = False
156
  self._previous_score = 0.0
157
  self._previous_code = self._task.starter_code
158
- self._best_visible_test_fraction = 0.0
159
- self._best_quality_score = 0.0
160
- self._full_correctness_awarded = False
161
- self._syntax_reward_awarded = False
162
- self._last_status = "Inspect the code, edit it, run tests, then submit."
163
- self._last_reward = RewardDetails(value=0.0, reason="Episode reset.", prev_score=0.0, curr_score=0.0)
 
 
164
 
165
  self._state = PythonCodeReviewState(
166
  episode_id=episode_id or str(uuid4()),
@@ -266,80 +271,195 @@ class PythonCodeReviewEnvironment(
266
 
267
  def _build_observation(self) -> PythonCodeReviewObservation:
268
  """Build current observation from state."""
269
- return PythonCodeReviewObservation(
270
- task_id=self._state.task_id or "",
271
- title=self._task.title if self._task else "",
272
- difficulty=self._state.difficulty or "easy",
273
- task_kind=self._state.task_kind,
274
  task_description=self._task.task_description if self._task else "",
275
  current_code=self._state.current_code,
276
  errors=self._state.errors,
277
  test_results=self._state.test_results,
278
  visible_tests=self._task.visible_tests if self._task else [],
279
  history=self._state.history,
280
- attempts_remaining=self._state.attempts_remaining,
281
- last_action_status=self._last_status,
282
- score=self._state.score,
283
- reward_details=self._last_reward,
284
- )
285
-
286
- def _handle_analyze(self) -> tuple[RewardDetails, str]:
287
- """Analyze code for errors and test status."""
288
- if self._task is None:
289
- return RewardDetails(value=0.0, reason="Invalid state"), "Error: task not loaded"
290
-
291
- # Grade current code
292
- curr_grade = grade_task(self._state.current_code, self._task, include_hidden=False)
293
- curr_score = curr_grade.score
294
- error = curr_grade.details.get("compile_error", "")
295
-
296
- # Compute reward components
297
- reward_components = self._compute_reward_components(
298
- curr_score=curr_score,
299
- prev_score=self._previous_score,
300
- curr_grade=curr_grade,
301
- code_changed=False, # analyze doesn't change code
302
- )
303
-
304
- # Status message
305
- if error:
306
- self._state.errors = error
307
- self._state.test_results = "Compilation failed. Fix syntax first."
308
- summary = f"Syntax error detected: {error}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
309
  else:
310
  self._state.errors = ""
311
  if self._task.task_kind == "syntax_fix":
312
  self._state.test_results = "Code compiles successfully."
313
  summary = "Code compiles. Ready to submit."
314
  else:
315
- visible_total = len(self._task.visible_tests)
316
- visible_passed = curr_grade.tests_passed
317
- self._state.test_results = f"Test run: {visible_passed}/{visible_total} passing."
318
- summary = self._state.test_results
319
-
320
- reward = RewardDetails(
321
- value=reward_components["total"],
322
- progress_delta=reward_components["progress"],
323
- syntax_reward=reward_components["syntax"],
324
- test_reward=reward_components["test"],
325
- quality_bonus=reward_components["quality"],
326
- stagnation_penalty=reward_components["stagnation"],
327
- regression_penalty=reward_components["regression"],
328
- timeout_penalty=TIMEOUT_PENALTY if curr_grade.timed_out else 0.0,
329
- reason=summary,
330
- prev_score=round(self._previous_score, 4),
331
- curr_score=round(curr_score, 4),
332
- code_changed=False,
333
- )
334
-
335
- # Update state
336
- self._state.score = curr_score
337
- self._state.errors = curr_grade.details.get("compile_error", "")
338
- self._append_history("analyze_code", summary, reward.value)
339
- return reward, summary
340
-
341
- def _handle_edit(self, action: PythonCodeReviewAction) -> tuple[RewardDetails, str]:
342
- """Edit the code and compute reward for progress."""
343
  if self._task is None:
344
  return RewardDetails(value=0.0, reason="Invalid state"), "Error: task not loaded"
345
 
@@ -350,153 +470,108 @@ class PythonCodeReviewEnvironment(
350
  invalid_action_penalty=INVALID_ACTION_PENALTY,
351
  reason="Edit action requires non-empty code.",
352
  )
353
- status = "Invalid: edit_code requires code parameter."
354
- self._append_history("edit_code", status, reward.value)
355
- return reward, status
356
-
357
- # Detect code change
358
- code_changed = (code != self._previous_code)
359
-
360
- # Grade before and after
361
- curr_grade = grade_task(code, self._task, include_hidden=False)
362
- curr_score = curr_grade.score
363
-
364
- # Compute reward components
365
- reward_components = self._compute_reward_components(
366
- curr_score=curr_score,
367
- prev_score=self._previous_score,
368
- curr_grade=curr_grade,
369
- code_changed=code_changed,
370
- prev_grade_score=grade_task(self._previous_code, self._task, include_hidden=False).syntax_score,
371
- )
372
-
373
- # Update state
374
- self._state.current_code = code
375
- self._previous_code = code
376
- self._previous_score = curr_score
377
- self._state.errors = curr_grade.details.get("compile_error", "")
378
- self._state.test_results = self._format_test_results(curr_grade)
379
- self._state.score = curr_score
380
-
381
- status = "Code updated."
382
  if self._state.errors:
383
  status = f"Code updated, but syntax issues remain: {self._state.errors}"
384
- elif curr_grade.tests_total > 0:
385
- status = self._state.test_results
386
-
387
- reward = RewardDetails(
388
- value=reward_components["total"],
389
- progress_delta=reward_components["progress"],
390
- syntax_reward=reward_components["syntax"],
391
- test_reward=reward_components["test"],
392
- quality_bonus=reward_components["quality"],
393
- stagnation_penalty=reward_components["stagnation"],
394
- regression_penalty=reward_components["regression"],
395
- timeout_penalty=TIMEOUT_PENALTY if curr_grade.timed_out else 0.0,
396
- reason=status,
397
- prev_score=round(self._previous_score - curr_score + self._previous_score, 4) if curr_score != self._previous_score else round(self._previous_score, 4),
398
- curr_score=round(curr_score, 4),
399
- code_changed=code_changed,
400
- )
401
-
402
- self._append_history("edit_code", status, reward.value)
403
- return reward, status
404
-
405
- def _handle_run_tests(self) -> tuple[RewardDetails, str]:
406
- """Run tests and provide feedback."""
407
- if self._task is None:
408
- return RewardDetails(value=0.0, reason="Invalid state"), "Error: task not loaded"
409
-
410
- curr_grade = grade_task(self._state.current_code, self._task, include_hidden=False)
411
- curr_score = curr_grade.score
412
-
413
- # Compute reward components
414
- reward_components = self._compute_reward_components(
415
- curr_score=curr_score,
416
- prev_score=self._previous_score,
417
- curr_grade=curr_grade,
418
- code_changed=True, # Consider any test run as "changed behavior"
419
- )
420
-
421
- # Update state
422
- self._state.errors = curr_grade.details.get("compile_error", "")
423
- self._state.test_results = self._format_test_results(curr_grade)
424
- self._state.score = curr_score
425
- self._previous_score = curr_score
426
-
427
- # Add completion bonus if all visible tests pass
428
- completion_bonus = 0.0
429
- if curr_grade.tests_total > 0 and curr_grade.tests_passed == curr_grade.tests_total:
430
- completion_bonus = 0.20
431
-
432
- status = self._state.test_results if not self._state.errors else self._state.errors
433
- reward = RewardDetails(
434
- value=reward_components["total"] + completion_bonus,
435
- progress_delta=reward_components["progress"],
436
- test_reward=reward_components["test"] + completion_bonus,
437
- syntax_reward=reward_components["syntax"],
438
- quality_bonus=reward_components["quality"],
439
- stagnation_penalty=reward_components["stagnation"],
440
- regression_penalty=reward_components["regression"],
441
- timeout_penalty=TIMEOUT_PENALTY if curr_grade.timed_out else 0.0,
442
- reason=status,
443
- prev_score=round(self._previous_score - curr_score + self._previous_score, 4) if curr_score != self._previous_score else round(self._previous_score, 4),
444
- curr_score=round(curr_score, 4),
445
- code_changed=True,
446
- )
447
-
448
- self._append_history("run_tests", status, reward.value)
449
- return reward, status
450
-
451
- def _handle_submit(self) -> tuple[RewardDetails, str]:
452
- """Submit solution and finalize episode."""
453
- if self._task is None:
454
- return RewardDetails(value=0.0, reason="Invalid state"), "Error: task not loaded"
455
-
456
- curr_grade = grade_task(self._state.current_code, self._task, include_hidden=True)
457
- curr_score = curr_grade.score
458
-
459
- # Compute reward components
460
- reward_components = self._compute_reward_components(
461
- curr_score=curr_score,
462
- prev_score=self._previous_score,
463
- curr_grade=curr_grade,
464
- code_changed=False, # Submit doesn't change code
465
- )
466
-
467
- # Bonus for full correctness (one-time only)
468
- correctness_bonus = 0.0
469
- if curr_score >= 0.9999 and not self._full_correctness_awarded:
470
- correctness_bonus = COMPLETION_BONUS
471
- self._full_correctness_awarded = True
472
-
473
- # Update state
474
- self._state.errors = curr_grade.details.get("compile_error", "")
475
- self._state.test_results = self._format_test_results(curr_grade)
476
- self._state.score = curr_score
477
- self._finalize_episode(auto_submit=False, grade=curr_grade)
478
-
479
- reward_value = max(-1.0, min(1.0, reward_components["total"] + correctness_bonus))
480
- status = f"Solution submitted. Final score: {curr_score:.3f}"
481
-
482
- reward = RewardDetails(
483
- value=reward_value,
484
- progress_delta=reward_components["progress"],
485
- correctness_bonus=correctness_bonus,
486
- syntax_reward=reward_components["syntax"],
487
- test_reward=reward_components["test"],
488
- quality_bonus=reward_components["quality"],
489
- stagnation_penalty=reward_components["stagnation"],
490
- regression_penalty=reward_components["regression"],
491
- timeout_penalty=TIMEOUT_PENALTY if curr_grade.timed_out else 0.0,
492
- reason=status,
493
- prev_score=round(self._previous_score, 4),
494
- curr_score=round(curr_score, 4),
495
- code_changed=False,
496
- )
497
-
498
- self._append_history("submit_solution", status, reward_value)
499
- return reward, status
500
 
501
  def _compute_reward_components(
502
  self,
@@ -697,65 +772,17 @@ class PythonCodeReviewEnvironment(
697
  )
698
  self._state.history.append(entry)
699
 
700
- def _log_debug_step(self, reward: RewardDetails) -> None:
701
- """Log step details for debugging and agent understanding.
702
-
703
- When verbose=True during initialization, this method prints detailed
704
- information about each step, including:
705
-
706
- - Step number in episode
707
- - Score before and after (and delta)
708
- - Final reward value (bounded in [-1.0, +1.0])
709
- - Whether code was modified
710
- - Component breakdown (only non-zero components shown)
711
- - Human-readable reason/explanation
712
-
713
- This output is designed to help:
714
- - Monitor agent learning trajectory
715
- - Debug why rewards are what they are
716
- - Verify reward system is functioning correctly
717
- - Understand what agent actions are incentivized
718
-
719
- Example output:
720
- -----
721
- Step 1 | Score: 0.698 | Delta: +0.698 | Reward: +0.4239 | Changed: False
722
- | Progress=+0.174 | Quality=+0.149 | Stagnation=+0.100
723
- | Reason: Syntax error detected: '(' was never closed
724
-
725
- Step 2 | Score: 1.000 | Delta: +0.302 | Reward: +0.6006 | Changed: True
726
- | Progress=+0.250 | Syntax=+0.350
727
- | Reason: Code updated.
728
- -----
729
-
730
- Args:
731
- reward: RewardDetails object containing all reward information
732
- """
733
- # Print main step summary line
734
- print(
735
- f"\nStep {self._state.step_count:2d} | "
736
- f"Score: {reward.curr_score:.3f} | "
737
- f"Delta: {(reward.curr_score - reward.prev_score):+.3f} | "
738
- f"Reward: {reward.value:+.4f} | "
739
- f"Changed: {reward.code_changed}"
740
- )
741
-
742
- # Build list of all reward components (only show non-zero)
743
- components = [
744
- ("Progress", reward.progress_delta),
745
- ("Syntax", reward.syntax_reward),
746
- ("Test", reward.test_reward),
747
- ("Quality", reward.quality_bonus),
748
- ("Stagnation", -reward.stagnation_penalty),
749
- ("Regression", -reward.regression_penalty),
750
- ]
751
-
752
- # Filter to only non-zero components for clarity
753
- non_zero = [f"{name}={val:+.3f}" for name, val in components if abs(val) > 0.001]
754
- if non_zero:
755
- print(f" | {' | '.join(non_zero)}")
756
-
757
- # Print human-readable explanation
758
- print(f" | Reason: {reward.reason}")
759
 
760
 
761
  # Backwards-compatible aliases used elsewhere in the repo.
 
40
  See RewardDetails in models.py for all fields returned with each reward.
41
  """
42
 
43
+ from __future__ import annotations
44
+
45
+ import random
46
+ import sys
47
+ from typing import List, Optional
48
+ from uuid import uuid4
49
 
50
  from openenv.core.env_server.interfaces import Environment
51
 
 
128
  # Progress tracking
129
  self._previous_score = 0.0
130
  self._previous_code = ""
131
+ self._best_visible_test_fraction = 0.0
132
+ self._best_quality_score = 0.0
133
+ self._full_correctness_awarded = False
134
+ self._syntax_reward_awarded = False
135
+ self.last_code = ""
136
+ self.reward_history: list[float] = []
137
 
138
  def reset(
139
  self,
 
158
  self._done = False
159
  self._previous_score = 0.0
160
  self._previous_code = self._task.starter_code
161
+ self._best_visible_test_fraction = 0.0
162
+ self._best_quality_score = 0.0
163
+ self._full_correctness_awarded = False
164
+ self._syntax_reward_awarded = False
165
+ self.last_code = ""
166
+ self.reward_history = []
167
+ self._last_status = "Inspect the code, edit it, run tests, then submit."
168
+ self._last_reward = RewardDetails(value=0.0, reason="Episode reset.", prev_score=0.0, curr_score=0.0)
169
 
170
  self._state = PythonCodeReviewState(
171
  episode_id=episode_id or str(uuid4()),
 
271
 
272
    def _build_observation(self) -> PythonCodeReviewObservation:
        """Build current observation from state.

        Packages the task description, the current code, the latest grading
        feedback, and the most recent reward into a single observation object.
        Falls back to empty/default values whenever no task is loaded.
        """
        return PythonCodeReviewObservation(
            task_id=self._state.task_id or "",
            title=self._task.title if self._task else "",
            difficulty=self._state.difficulty or "easy",
            task_kind=self._state.task_kind,
            task_description=self._task.task_description if self._task else "",
            current_code=self._state.current_code,
            errors=self._state.errors,
            test_results=self._state.test_results,
            visible_tests=self._task.visible_tests if self._task else [],
            history=self._state.history,
            attempts_remaining=self._state.attempts_remaining,
            last_action_status=self._last_status,
            score=self._state.score,
            reward_details=self._last_reward,
            # Scalar reward mirrored at the top level for RL-style clients.
            reward=self._last_reward.value,
            done=self._state.done,
            # Raw before/after scores exposed for convenience alongside the
            # full reward breakdown above.
            metadata={
                "prev_score": self._last_reward.prev_score,
                "curr_score": self._last_reward.curr_score,
            },
        )
296
+
297
+ def apply_action(self, action: PythonCodeReviewAction) -> str:
298
+ """Return the code candidate produced by an action."""
299
+ if action.action_type == "edit_code":
300
+ return (action.code or "").strip() or self._state.current_code
301
+ return self._state.current_code
302
+
303
+ def run_tests(
304
+ self,
305
+ code: str,
306
+ include_hidden: bool = False,
307
+ ) -> tuple[float, dict[str, int], TaskGrade]:
308
+ """Grade code and return score plus simple test statistics."""
309
+ if self._task is None:
310
+ empty_results = {"passed": 0, "total": 0}
311
+ return 0.0, empty_results, TaskGrade(score=0.0)
312
+
313
+ grade = grade_task(code, self._task, include_hidden=include_hidden)
314
+ test_results = {
315
+ "passed": grade.tests_passed,
316
+ "total": grade.tests_total,
317
+ }
318
+ return grade.score, test_results, grade
319
+
320
+ def compute_reward(self, old_code, new_code, prev_score, curr_score, test_results):
321
+ # progress
322
+ progress = curr_score - prev_score
323
+
324
+ # test score
325
+ passed = test_results["passed"]
326
+ total = test_results["total"]
327
+ test_ratio = passed / total if total > 0 else 0
328
+
329
+ # syntax score
330
+ try:
331
+ compile(new_code, "<string>", "exec")
332
+ syntax_score = 1.0
333
+ except:
334
+ syntax_score = 0.0
335
+
336
+ # stagnation penalty
337
+ stagnation_penalty = 0.2 if new_code.strip() == old_code.strip() else 0.0
338
+
339
+ # regression penalty
340
+ regression_penalty = max(0.0, prev_score - curr_score)
341
+
342
+ # repetition penalty (track last 3 actions)
343
+ repetition_penalty = 0.1 if new_code == self.last_code else 0.0
344
+
345
+ # quality (simple heuristic)
346
+ length_penalty = 0.0
347
+ if len(new_code) > len(old_code) * 1.5:
348
+ length_penalty = 0.1
349
+
350
+ # final reward
351
+ reward = (
352
+ 0.4 * progress
353
+ + 0.3 * test_ratio
354
+ + 0.2 * syntax_score
355
+ - stagnation_penalty
356
+ - regression_penalty
357
+ - repetition_penalty
358
+ - length_penalty
359
+ )
360
+
361
+ # clamp
362
+ reward = max(-1.0, min(1.0, reward))
363
+
364
+ return reward
365
+
366
+ def _apply_reward_randomization(self, reward: float) -> float:
367
+ """Break repeated static rewards while keeping the result bounded."""
368
+ reward = max(-1.0, min(1.0, reward))
369
+ self.reward_history.append(reward)
370
+ if len(self.reward_history) >= 3 and len(set(self.reward_history[-3:])) == 1:
371
+ reward += random.uniform(-0.05, 0.05)
372
+ reward = max(-1.0, min(1.0, reward))
373
+ self.reward_history[-1] = reward
374
+ return reward
375
+
376
+ def _build_reward_details(
377
+ self,
378
+ old_code: str,
379
+ new_code: str,
380
+ prev_score: float,
381
+ curr_score: float,
382
+ test_results: dict[str, int],
383
+ reward_value: float,
384
+ reason: str,
385
+ ) -> RewardDetails:
386
+ """Build a reward payload that matches the scalar reward computation."""
387
+ passed = test_results["passed"]
388
+ total = test_results["total"]
389
+ test_ratio = passed / total if total > 0 else 0.0
390
+ try:
391
+ compile(new_code, "<string>", "exec")
392
+ syntax_score = 1.0
393
+ except SyntaxError:
394
+ syntax_score = 0.0
395
+
396
+ stagnation_penalty = 0.2 if new_code.strip() == old_code.strip() else 0.0
397
+ regression_penalty = max(0.0, prev_score - curr_score)
398
+ repetition_penalty = 0.1 if new_code == self.last_code else 0.0
399
+ length_penalty = 0.1 if len(new_code) > len(old_code) * 1.5 else 0.0
400
+
401
+ return RewardDetails(
402
+ value=reward_value,
403
+ progress_delta=0.4 * (curr_score - prev_score),
404
+ syntax_reward=0.2 * syntax_score,
405
+ test_reward=0.3 * test_ratio,
406
+ quality_bonus=-length_penalty,
407
+ stagnation_penalty=stagnation_penalty,
408
+ regression_penalty=regression_penalty + repetition_penalty,
409
+ reason=reason,
410
+ prev_score=round(prev_score, 6),
411
+ curr_score=round(curr_score, 6),
412
+ code_changed=new_code.strip() != old_code.strip(),
413
+ )
414
+
415
    def _handle_analyze(self) -> tuple[RewardDetails, str]:
        """Analyze code for errors and test status.

        Read-only action: grades the current code against the visible tests,
        updates the state's error/test feedback fields, and returns a reward
        plus a human-readable summary. The code itself is never modified.
        """
        if self._task is None:
            return RewardDetails(value=0.0, reason="Invalid state"), "Error: task not loaded"

        old_code = self._state.current_code
        prev_score = self._previous_score
        curr_score, test_results, curr_grade = self.run_tests(old_code, include_hidden=False)
        error = curr_grade.details.get("compile_error", "")

        # Status message
        if error:
            self._state.errors = error
            self._state.test_results = "Compilation failed. Fix syntax first."
            summary = f"Syntax error detected: {error}"
        else:
            self._state.errors = ""
            if self._task.task_kind == "syntax_fix":
                self._state.test_results = "Code compiles successfully."
                summary = "Code compiles. Ready to submit."
            else:
                visible_total = len(self._task.visible_tests)
                visible_passed = curr_grade.tests_passed
                self._state.test_results = f"Test run: {visible_passed}/{visible_total} passing."
                summary = self._state.test_results

        # old_code is passed as both "old" and "new" because analyze leaves
        # the code untouched; the stagnation penalty therefore applies.
        reward_value = self.compute_reward(old_code, old_code, prev_score, curr_score, test_results)
        reward_value = self._apply_reward_randomization(reward_value)
        reward = self._build_reward_details(
            old_code=old_code,
            new_code=old_code,
            prev_score=prev_score,
            curr_score=curr_score,
            test_results=test_results,
            reward_value=reward_value,
            reason=summary,
        )

        # Update state
        self._state.score = curr_score
        self._state.errors = curr_grade.details.get("compile_error", "")
        self._previous_score = curr_score
        self.last_code = old_code
        self._append_history("analyze_code", summary, reward.value)
        return reward, summary
460
+
461
+ def _handle_edit(self, action: PythonCodeReviewAction) -> tuple[RewardDetails, str]:
462
+ """Edit the code and compute reward for progress."""
 
463
  if self._task is None:
464
  return RewardDetails(value=0.0, reason="Invalid state"), "Error: task not loaded"
465
 
 
470
  invalid_action_penalty=INVALID_ACTION_PENALTY,
471
  reason="Edit action requires non-empty code.",
472
  )
473
+ status = "Invalid: edit_code requires code parameter."
474
+ self._append_history("edit_code", status, reward.value)
475
+ return reward, status
476
+
477
+ old_code = self._state.current_code
478
+ prev_score = self._previous_score
479
+ curr_score, test_results, curr_grade = self.run_tests(code, include_hidden=False)
480
+
481
+ # Update state
482
+ self._state.current_code = code
483
+ self._previous_code = code
484
+ self._state.errors = curr_grade.details.get("compile_error", "")
485
+ self._state.test_results = self._format_test_results(curr_grade)
486
+ self._state.score = curr_score
487
+
488
+ status = "Code updated."
 
 
 
 
 
 
 
 
 
 
 
 
 
489
  if self._state.errors:
490
  status = f"Code updated, but syntax issues remain: {self._state.errors}"
491
+ elif curr_grade.tests_total > 0:
492
+ status = self._state.test_results
493
+
494
+ reward_value = self.compute_reward(old_code, code, prev_score, curr_score, test_results)
495
+ reward_value = self._apply_reward_randomization(reward_value)
496
+ reward = self._build_reward_details(
497
+ old_code=old_code,
498
+ new_code=code,
499
+ prev_score=prev_score,
500
+ curr_score=curr_score,
501
+ test_results=test_results,
502
+ reward_value=reward_value,
503
+ reason=status,
504
+ )
505
+
506
+ self._previous_score = curr_score
507
+ self.last_code = code
508
+ self._append_history("edit_code", status, reward.value)
509
+ return reward, status
510
+
511
    def _handle_run_tests(self) -> tuple[RewardDetails, str]:
        """Run tests and provide feedback.

        Read-only action: grades the current code against the visible tests
        only (hidden tests are used exclusively by submit), refreshes the
        state's feedback fields, and returns a reward plus a status message.
        """
        if self._task is None:
            return RewardDetails(value=0.0, reason="Invalid state"), "Error: task not loaded"

        old_code = self._state.current_code
        prev_score = self._previous_score
        curr_score, test_results, curr_grade = self.run_tests(old_code, include_hidden=False)

        # Update state
        self._state.errors = curr_grade.details.get("compile_error", "")
        self._state.test_results = self._format_test_results(curr_grade)
        self._state.score = curr_score

        # Surface compile errors in preference to test output.
        status = self._state.test_results if not self._state.errors else self._state.errors
        # Code is unchanged by this action, so old_code is both "old" and "new".
        reward_value = self.compute_reward(old_code, old_code, prev_score, curr_score, test_results)
        reward_value = self._apply_reward_randomization(reward_value)
        reward = self._build_reward_details(
            old_code=old_code,
            new_code=old_code,
            prev_score=prev_score,
            curr_score=curr_score,
            test_results=test_results,
            reward_value=reward_value,
            reason=status,
        )

        self._previous_score = curr_score
        self.last_code = old_code
        self._append_history("run_tests", status, reward.value)
        return reward, status
542
+
543
    def _handle_submit(self) -> tuple[RewardDetails, str]:
        """Submit solution and finalize episode.

        Grades the current code with the hidden test suite included,
        finalizes the episode, and returns the reward plus a status message.
        """
        if self._task is None:
            return RewardDetails(value=0.0, reason="Invalid state"), "Error: task not loaded"

        old_code = self._state.current_code
        prev_score = self._previous_score
        # include_hidden=True: final grading also runs the hidden tests.
        curr_score, test_results, curr_grade = self.run_tests(old_code, include_hidden=True)

        # Update state
        self._state.errors = curr_grade.details.get("compile_error", "")
        self._state.test_results = self._format_test_results(curr_grade)
        self._state.score = curr_score
        self._previous_score = curr_score
        self.last_code = old_code
        self._finalize_episode(auto_submit=False, grade=curr_grade)

        # Submit leaves the code untouched, so old_code is "old" and "new".
        reward_value = self.compute_reward(old_code, old_code, prev_score, curr_score, test_results)
        reward_value = self._apply_reward_randomization(reward_value)
        status = f"Solution submitted. Final score: {curr_score:.3f}"
        reward = self._build_reward_details(
            old_code=old_code,
            new_code=old_code,
            prev_score=prev_score,
            curr_score=curr_score,
            test_results=test_results,
            reward_value=reward_value,
            reason=status,
        )

        self._append_history("submit_solution", status, reward_value)
        return reward, status
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
575
 
576
  def _compute_reward_components(
577
  self,
 
772
  )
773
  self._state.history.append(entry)
774
 
775
    def _log_debug_step(self, reward: RewardDetails) -> None:
        """Log the scalar reward signal in a compact RL-friendly format.

        Prints only the before/after scores, the reward value, and their
        delta; individual reward components are not broken out here.
        """
        print(
            f"""
        Step Debug:
        Prev Score: {reward.prev_score}
        Curr Score: {reward.curr_score}
        Reward: {reward.value}
        Progress: {reward.curr_score - reward.prev_score}
        """
        )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
786
 
787
 
788
  # Backwards-compatible aliases used elsewhere in the repo.
tests/test_reward_dynamics.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from models import PythonCodeReviewAction
from server.env import PythonCodeReviewEnvironment


# Known-good replacement code used by the edit_code step below.
# NOTE(review): assumes the "syntax-fix-easy" task expects this function
# body -- confirm against the task definition.
FIXED_SYNTAX_CODE = """def normalize_username(raw_name: str) -> str:
    cleaned = raw_name.strip().lower()
    if not cleaned:
        return "anonymous"
    return cleaned.replace(" ", "_")
"""


def test_reward_changes_across_five_steps():
    """Rewards over a five-step episode vary, stay bounded, and mix signs."""
    env = PythonCodeReviewEnvironment(verbose=False)
    env.reset(task_id="syntax-fix-easy")

    actions = [
        PythonCodeReviewAction(action_type="analyze_code"),
        PythonCodeReviewAction(action_type="analyze_code"),
        PythonCodeReviewAction(action_type="run_tests"),
        PythonCodeReviewAction(action_type="edit_code", code=FIXED_SYNTAX_CODE),
        PythonCodeReviewAction(action_type="submit_solution"),
    ]

    rewards = []
    for action in actions:
        observation = env.step(action)
        rewards.append(float(observation.reward or 0.0))

    # Every reward must respect the documented [-1, +1] bound.
    assert all(-1.0 <= reward <= 1.0 for reward in rewards)
    # Rewards must not collapse to a single static value.
    assert len(set(rewards)) > 1
    assert any(reward > 0 for reward in rewards)
    assert any(reward < 0 for reward in rewards)
    # No three consecutive identical rewards (anti-stagnation guarantee).
    assert not any(
        rewards[index] == rewards[index + 1] == rewards[index + 2]
        for index in range(len(rewards) - 2)
    )