uvpatel7271 commited on
Commit
1595dbc
·
verified ·
1 Parent(s): 9fa2f22

Upload folder using huggingface_hub

Browse files
__init__.py CHANGED
@@ -1,35 +1,40 @@
1
- """Public package API for the Python code review OpenEnv benchmark."""
2
-
3
- from .client import CodeReviewEnv, MyEnv, PythonEnv
4
- from .models import (
5
- HealthResponse,
6
- HistoryEntry,
7
- PythonCodeReviewAction,
8
- PythonCodeReviewObservation,
9
- PythonCodeReviewState,
10
- PythonReviewAction,
11
- PythonReviewObservation,
12
- PythonReviewReward,
13
- PythonReviewState,
14
- RewardDetails,
15
- TaskDescriptor,
16
- TaskGrade,
17
- )
18
-
19
- __all__ = [
20
- "PythonEnv",
21
- "CodeReviewEnv",
22
- "MyEnv",
23
- "PythonCodeReviewAction",
24
- "PythonCodeReviewObservation",
25
- "PythonCodeReviewState",
26
- "PythonReviewAction",
27
- "PythonReviewObservation",
28
- "PythonReviewReward",
29
- "PythonReviewState",
30
- "RewardDetails",
31
- "HistoryEntry",
32
- "TaskDescriptor",
33
- "TaskGrade",
34
- "HealthResponse",
35
- ]
 
 
 
 
 
 
1
+ """Public package API for the Python code review OpenEnv benchmark."""
2
+
3
+ try:
4
+ from .client import CodeReviewEnv, MyEnv, PythonEnv
5
+ from .models import (
6
+ HealthResponse,
7
+ HistoryEntry,
8
+ PythonCodeReviewAction,
9
+ PythonCodeReviewObservation,
10
+ PythonCodeReviewState,
11
+ RewardDetails,
12
+ TaskDescriptor,
13
+ TaskGrade,
14
+ )
15
+ except ImportError: # pragma: no cover
16
+ from client import CodeReviewEnv, MyEnv, PythonEnv
17
+ from models import (
18
+ HealthResponse,
19
+ HistoryEntry,
20
+ PythonCodeReviewAction,
21
+ PythonCodeReviewObservation,
22
+ PythonCodeReviewState,
23
+ RewardDetails,
24
+ TaskDescriptor,
25
+ TaskGrade,
26
+ )
27
+
28
+ __all__ = [
29
+ "PythonEnv",
30
+ "CodeReviewEnv",
31
+ "MyEnv",
32
+ "PythonCodeReviewAction",
33
+ "PythonCodeReviewObservation",
34
+ "PythonCodeReviewState",
35
+ HealthResponse,
36
+ HistoryEntry,
37
+ RewardDetails,
38
+ TaskDescriptor,
39
+ TaskGrade,
40
+ ]
pytest-cache-files-1f62ra1g/CACHEDIR.TAG ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Signature: 8a477f597d28d172789f06886806bc55
2
+ # This file is a cache directory tag created by pytest.
3
+ # For information about cache directory tags, see:
4
+ # https://bford.info/cachedir/spec.html
pytest-cache-files-1f62ra1g/README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # pytest cache directory #
2
+
3
+ This directory contains data from the pytest's cache plugin,
4
+ which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
5
+
6
+ **Do not** commit this to version control.
7
+
8
+ See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
pytest-cache-files-i2cpw3zw/CACHEDIR.TAG ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Signature: 8a477f597d28d172789f06886806bc55
2
+ # This file is a cache directory tag created by pytest.
3
+ # For information about cache directory tags, see:
4
+ # https://bford.info/cachedir/spec.html
pytest-cache-files-i2cpw3zw/README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # pytest cache directory #
2
+
3
+ This directory contains data from the pytest's cache plugin,
4
+ which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
5
+
6
+ **Do not** commit this to version control.
7
+
8
+ See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
pytest-cache-files-qun9v98v/CACHEDIR.TAG ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Signature: 8a477f597d28d172789f06886806bc55
2
+ # This file is a cache directory tag created by pytest.
3
+ # For information about cache directory tags, see:
4
+ # https://bford.info/cachedir/spec.html
pytest-cache-files-qun9v98v/README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # pytest cache directory #
2
+
3
+ This directory contains data from the pytest's cache plugin,
4
+ which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
5
+
6
+ **Do not** commit this to version control.
7
+
8
+ See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
pytest-cache-files-srp2otxc/CACHEDIR.TAG ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Signature: 8a477f597d28d172789f06886806bc55
2
+ # This file is a cache directory tag created by pytest.
3
+ # For information about cache directory tags, see:
4
+ # https://bford.info/cachedir/spec.html
pytest-cache-files-srp2otxc/README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # pytest cache directory #
2
+
3
+ This directory contains data from the pytest's cache plugin,
4
+ which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
5
+
6
+ **Do not** commit this to version control.
7
+
8
+ See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
pytest-cache-files-u6t7g29i/CACHEDIR.TAG ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Signature: 8a477f597d28d172789f06886806bc55
2
+ # This file is a cache directory tag created by pytest.
3
+ # For information about cache directory tags, see:
4
+ # https://bford.info/cachedir/spec.html
pytest-cache-files-u6t7g29i/README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # pytest cache directory #
2
+
3
+ This directory contains data from the pytest's cache plugin,
4
+ which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
5
+
6
+ **Do not** commit this to version control.
7
+
8
+ See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
pytest-cache-files-x1yzwik9/CACHEDIR.TAG ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Signature: 8a477f597d28d172789f06886806bc55
2
+ # This file is a cache directory tag created by pytest.
3
+ # For information about cache directory tags, see:
4
+ # https://bford.info/cachedir/spec.html
pytest-cache-files-x1yzwik9/README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # pytest cache directory #
2
+
3
+ This directory contains data from the pytest's cache plugin,
4
+ which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
5
+
6
+ **Do not** commit this to version control.
7
+
8
+ See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
server/env.py CHANGED
@@ -40,11 +40,12 @@ Always bounded in [-1.0, +1.0] for interpretability and learning stability.
40
  See RewardDetails in models.py for all fields returned with each reward.
41
  """
42
 
43
- from __future__ import annotations
44
-
45
- import sys
46
- from typing import List, Optional
47
- from uuid import uuid4
 
48
 
49
  from openenv.core.env_server.interfaces import Environment
50
 
@@ -127,10 +128,12 @@ class PythonCodeReviewEnvironment(
127
  # Progress tracking
128
  self._previous_score = 0.0
129
  self._previous_code = ""
130
- self._best_visible_test_fraction = 0.0
131
- self._best_quality_score = 0.0
132
- self._full_correctness_awarded = False
133
- self._syntax_reward_awarded = False
 
 
134
 
135
  def reset(
136
  self,
@@ -155,12 +158,14 @@ class PythonCodeReviewEnvironment(
155
  self._done = False
156
  self._previous_score = 0.0
157
  self._previous_code = self._task.starter_code
158
- self._best_visible_test_fraction = 0.0
159
- self._best_quality_score = 0.0
160
- self._full_correctness_awarded = False
161
- self._syntax_reward_awarded = False
162
- self._last_status = "Inspect the code, edit it, run tests, then submit."
163
- self._last_reward = RewardDetails(value=0.0, reason="Episode reset.", prev_score=0.0, curr_score=0.0)
 
 
164
 
165
  self._state = PythonCodeReviewState(
166
  episode_id=episode_id or str(uuid4()),
@@ -266,80 +271,195 @@ class PythonCodeReviewEnvironment(
266
 
267
  def _build_observation(self) -> PythonCodeReviewObservation:
268
  """Build current observation from state."""
269
- return PythonCodeReviewObservation(
270
- task_id=self._state.task_id or "",
271
- title=self._task.title if self._task else "",
272
- difficulty=self._state.difficulty or "easy",
273
- task_kind=self._state.task_kind,
274
  task_description=self._task.task_description if self._task else "",
275
  current_code=self._state.current_code,
276
  errors=self._state.errors,
277
  test_results=self._state.test_results,
278
  visible_tests=self._task.visible_tests if self._task else [],
279
  history=self._state.history,
280
- attempts_remaining=self._state.attempts_remaining,
281
- last_action_status=self._last_status,
282
- score=self._state.score,
283
- reward_details=self._last_reward,
284
- )
285
-
286
- def _handle_analyze(self) -> tuple[RewardDetails, str]:
287
- """Analyze code for errors and test status."""
288
- if self._task is None:
289
- return RewardDetails(value=0.0, reason="Invalid state"), "Error: task not loaded"
290
-
291
- # Grade current code
292
- curr_grade = grade_task(self._state.current_code, self._task, include_hidden=False)
293
- curr_score = curr_grade.score
294
- error = curr_grade.details.get("compile_error", "")
295
-
296
- # Compute reward components
297
- reward_components = self._compute_reward_components(
298
- curr_score=curr_score,
299
- prev_score=self._previous_score,
300
- curr_grade=curr_grade,
301
- code_changed=False, # analyze doesn't change code
302
- )
303
-
304
- # Status message
305
- if error:
306
- self._state.errors = error
307
- self._state.test_results = "Compilation failed. Fix syntax first."
308
- summary = f"Syntax error detected: {error}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
309
  else:
310
  self._state.errors = ""
311
  if self._task.task_kind == "syntax_fix":
312
  self._state.test_results = "Code compiles successfully."
313
  summary = "Code compiles. Ready to submit."
314
  else:
315
- visible_total = len(self._task.visible_tests)
316
- visible_passed = curr_grade.tests_passed
317
- self._state.test_results = f"Test run: {visible_passed}/{visible_total} passing."
318
- summary = self._state.test_results
319
-
320
- reward = RewardDetails(
321
- value=reward_components["total"],
322
- progress_delta=reward_components["progress"],
323
- syntax_reward=reward_components["syntax"],
324
- test_reward=reward_components["test"],
325
- quality_bonus=reward_components["quality"],
326
- stagnation_penalty=reward_components["stagnation"],
327
- regression_penalty=reward_components["regression"],
328
- timeout_penalty=TIMEOUT_PENALTY if curr_grade.timed_out else 0.0,
329
- reason=summary,
330
- prev_score=round(self._previous_score, 4),
331
- curr_score=round(curr_score, 4),
332
- code_changed=False,
333
- )
334
-
335
- # Update state
336
- self._state.score = curr_score
337
- self._state.errors = curr_grade.details.get("compile_error", "")
338
- self._append_history("analyze_code", summary, reward.value)
339
- return reward, summary
340
-
341
- def _handle_edit(self, action: PythonCodeReviewAction) -> tuple[RewardDetails, str]:
342
- """Edit the code and compute reward for progress."""
343
  if self._task is None:
344
  return RewardDetails(value=0.0, reason="Invalid state"), "Error: task not loaded"
345
 
@@ -350,153 +470,108 @@ class PythonCodeReviewEnvironment(
350
  invalid_action_penalty=INVALID_ACTION_PENALTY,
351
  reason="Edit action requires non-empty code.",
352
  )
353
- status = "Invalid: edit_code requires code parameter."
354
- self._append_history("edit_code", status, reward.value)
355
- return reward, status
356
-
357
- # Detect code change
358
- code_changed = (code != self._previous_code)
359
-
360
- # Grade before and after
361
- curr_grade = grade_task(code, self._task, include_hidden=False)
362
- curr_score = curr_grade.score
363
-
364
- # Compute reward components
365
- reward_components = self._compute_reward_components(
366
- curr_score=curr_score,
367
- prev_score=self._previous_score,
368
- curr_grade=curr_grade,
369
- code_changed=code_changed,
370
- prev_grade_score=grade_task(self._previous_code, self._task, include_hidden=False).syntax_score,
371
- )
372
-
373
- # Update state
374
- self._state.current_code = code
375
- self._previous_code = code
376
- self._previous_score = curr_score
377
- self._state.errors = curr_grade.details.get("compile_error", "")
378
- self._state.test_results = self._format_test_results(curr_grade)
379
- self._state.score = curr_score
380
-
381
- status = "Code updated."
382
  if self._state.errors:
383
  status = f"Code updated, but syntax issues remain: {self._state.errors}"
384
- elif curr_grade.tests_total > 0:
385
- status = self._state.test_results
386
-
387
- reward = RewardDetails(
388
- value=reward_components["total"],
389
- progress_delta=reward_components["progress"],
390
- syntax_reward=reward_components["syntax"],
391
- test_reward=reward_components["test"],
392
- quality_bonus=reward_components["quality"],
393
- stagnation_penalty=reward_components["stagnation"],
394
- regression_penalty=reward_components["regression"],
395
- timeout_penalty=TIMEOUT_PENALTY if curr_grade.timed_out else 0.0,
396
- reason=status,
397
- prev_score=round(self._previous_score - curr_score + self._previous_score, 4) if curr_score != self._previous_score else round(self._previous_score, 4),
398
- curr_score=round(curr_score, 4),
399
- code_changed=code_changed,
400
- )
401
-
402
- self._append_history("edit_code", status, reward.value)
403
- return reward, status
404
-
405
- def _handle_run_tests(self) -> tuple[RewardDetails, str]:
406
- """Run tests and provide feedback."""
407
- if self._task is None:
408
- return RewardDetails(value=0.0, reason="Invalid state"), "Error: task not loaded"
409
-
410
- curr_grade = grade_task(self._state.current_code, self._task, include_hidden=False)
411
- curr_score = curr_grade.score
412
-
413
- # Compute reward components
414
- reward_components = self._compute_reward_components(
415
- curr_score=curr_score,
416
- prev_score=self._previous_score,
417
- curr_grade=curr_grade,
418
- code_changed=True, # Consider any test run as "changed behavior"
419
- )
420
-
421
- # Update state
422
- self._state.errors = curr_grade.details.get("compile_error", "")
423
- self._state.test_results = self._format_test_results(curr_grade)
424
- self._state.score = curr_score
425
- self._previous_score = curr_score
426
-
427
- # Add completion bonus if all visible tests pass
428
- completion_bonus = 0.0
429
- if curr_grade.tests_total > 0 and curr_grade.tests_passed == curr_grade.tests_total:
430
- completion_bonus = 0.20
431
-
432
- status = self._state.test_results if not self._state.errors else self._state.errors
433
- reward = RewardDetails(
434
- value=reward_components["total"] + completion_bonus,
435
- progress_delta=reward_components["progress"],
436
- test_reward=reward_components["test"] + completion_bonus,
437
- syntax_reward=reward_components["syntax"],
438
- quality_bonus=reward_components["quality"],
439
- stagnation_penalty=reward_components["stagnation"],
440
- regression_penalty=reward_components["regression"],
441
- timeout_penalty=TIMEOUT_PENALTY if curr_grade.timed_out else 0.0,
442
- reason=status,
443
- prev_score=round(self._previous_score - curr_score + self._previous_score, 4) if curr_score != self._previous_score else round(self._previous_score, 4),
444
- curr_score=round(curr_score, 4),
445
- code_changed=True,
446
- )
447
-
448
- self._append_history("run_tests", status, reward.value)
449
- return reward, status
450
-
451
- def _handle_submit(self) -> tuple[RewardDetails, str]:
452
- """Submit solution and finalize episode."""
453
- if self._task is None:
454
- return RewardDetails(value=0.0, reason="Invalid state"), "Error: task not loaded"
455
-
456
- curr_grade = grade_task(self._state.current_code, self._task, include_hidden=True)
457
- curr_score = curr_grade.score
458
-
459
- # Compute reward components
460
- reward_components = self._compute_reward_components(
461
- curr_score=curr_score,
462
- prev_score=self._previous_score,
463
- curr_grade=curr_grade,
464
- code_changed=False, # Submit doesn't change code
465
- )
466
-
467
- # Bonus for full correctness (one-time only)
468
- correctness_bonus = 0.0
469
- if curr_score >= 0.9999 and not self._full_correctness_awarded:
470
- correctness_bonus = COMPLETION_BONUS
471
- self._full_correctness_awarded = True
472
-
473
- # Update state
474
- self._state.errors = curr_grade.details.get("compile_error", "")
475
- self._state.test_results = self._format_test_results(curr_grade)
476
- self._state.score = curr_score
477
- self._finalize_episode(auto_submit=False, grade=curr_grade)
478
-
479
- reward_value = max(-1.0, min(1.0, reward_components["total"] + correctness_bonus))
480
- status = f"Solution submitted. Final score: {curr_score:.3f}"
481
-
482
- reward = RewardDetails(
483
- value=reward_value,
484
- progress_delta=reward_components["progress"],
485
- correctness_bonus=correctness_bonus,
486
- syntax_reward=reward_components["syntax"],
487
- test_reward=reward_components["test"],
488
- quality_bonus=reward_components["quality"],
489
- stagnation_penalty=reward_components["stagnation"],
490
- regression_penalty=reward_components["regression"],
491
- timeout_penalty=TIMEOUT_PENALTY if curr_grade.timed_out else 0.0,
492
- reason=status,
493
- prev_score=round(self._previous_score, 4),
494
- curr_score=round(curr_score, 4),
495
- code_changed=False,
496
- )
497
-
498
- self._append_history("submit_solution", status, reward_value)
499
- return reward, status
500
 
501
  def _compute_reward_components(
502
  self,
@@ -697,65 +772,17 @@ class PythonCodeReviewEnvironment(
697
  )
698
  self._state.history.append(entry)
699
 
700
- def _log_debug_step(self, reward: RewardDetails) -> None:
701
- """Log step details for debugging and agent understanding.
702
-
703
- When verbose=True during initialization, this method prints detailed
704
- information about each step, including:
705
-
706
- - Step number in episode
707
- - Score before and after (and delta)
708
- - Final reward value (bounded in [-1.0, +1.0])
709
- - Whether code was modified
710
- - Component breakdown (only non-zero components shown)
711
- - Human-readable reason/explanation
712
-
713
- This output is designed to help:
714
- - Monitor agent learning trajectory
715
- - Debug why rewards are what they are
716
- - Verify reward system is functioning correctly
717
- - Understand what agent actions are incentivized
718
-
719
- Example output:
720
- -----
721
- Step 1 | Score: 0.698 | Delta: +0.698 | Reward: +0.4239 | Changed: False
722
- | Progress=+0.174 | Quality=+0.149 | Stagnation=+0.100
723
- | Reason: Syntax error detected: '(' was never closed
724
-
725
- Step 2 | Score: 1.000 | Delta: +0.302 | Reward: +0.6006 | Changed: True
726
- | Progress=+0.250 | Syntax=+0.350
727
- | Reason: Code updated.
728
- -----
729
-
730
- Args:
731
- reward: RewardDetails object containing all reward information
732
- """
733
- # Print main step summary line
734
- print(
735
- f"\nStep {self._state.step_count:2d} | "
736
- f"Score: {reward.curr_score:.3f} | "
737
- f"Delta: {(reward.curr_score - reward.prev_score):+.3f} | "
738
- f"Reward: {reward.value:+.4f} | "
739
- f"Changed: {reward.code_changed}"
740
- )
741
-
742
- # Build list of all reward components (only show non-zero)
743
- components = [
744
- ("Progress", reward.progress_delta),
745
- ("Syntax", reward.syntax_reward),
746
- ("Test", reward.test_reward),
747
- ("Quality", reward.quality_bonus),
748
- ("Stagnation", -reward.stagnation_penalty),
749
- ("Regression", -reward.regression_penalty),
750
- ]
751
-
752
- # Filter to only non-zero components for clarity
753
- non_zero = [f"{name}={val:+.3f}" for name, val in components if abs(val) > 0.001]
754
- if non_zero:
755
- print(f" | {' | '.join(non_zero)}")
756
-
757
- # Print human-readable explanation
758
- print(f" | Reason: {reward.reason}")
759
 
760
 
761
  # Backwards-compatible aliases used elsewhere in the repo.
 
40
  See RewardDetails in models.py for all fields returned with each reward.
41
  """
42
 
43
+ from __future__ import annotations
44
+
45
+ import random
46
+ import sys
47
+ from typing import List, Optional
48
+ from uuid import uuid4
49
 
50
  from openenv.core.env_server.interfaces import Environment
51
 
 
128
  # Progress tracking
129
  self._previous_score = 0.0
130
  self._previous_code = ""
131
+ self._best_visible_test_fraction = 0.0
132
+ self._best_quality_score = 0.0
133
+ self._full_correctness_awarded = False
134
+ self._syntax_reward_awarded = False
135
+ self.last_code = ""
136
+ self.reward_history: list[float] = []
137
 
138
  def reset(
139
  self,
 
158
  self._done = False
159
  self._previous_score = 0.0
160
  self._previous_code = self._task.starter_code
161
+ self._best_visible_test_fraction = 0.0
162
+ self._best_quality_score = 0.0
163
+ self._full_correctness_awarded = False
164
+ self._syntax_reward_awarded = False
165
+ self.last_code = ""
166
+ self.reward_history = []
167
+ self._last_status = "Inspect the code, edit it, run tests, then submit."
168
+ self._last_reward = RewardDetails(value=0.0, reason="Episode reset.", prev_score=0.0, curr_score=0.0)
169
 
170
  self._state = PythonCodeReviewState(
171
  episode_id=episode_id or str(uuid4()),
 
271
 
272
    def _build_observation(self) -> PythonCodeReviewObservation:
        """Build current observation from state.

        Packages the task description, the current code, the latest grading
        feedback, and the most recent reward into a single observation object.
        Falls back to empty/default values whenever no task is loaded.
        """
        return PythonCodeReviewObservation(
            task_id=self._state.task_id or "",
            title=self._task.title if self._task else "",
            difficulty=self._state.difficulty or "easy",
            task_kind=self._state.task_kind,
            task_description=self._task.task_description if self._task else "",
            current_code=self._state.current_code,
            errors=self._state.errors,
            test_results=self._state.test_results,
            visible_tests=self._task.visible_tests if self._task else [],
            history=self._state.history,
            attempts_remaining=self._state.attempts_remaining,
            last_action_status=self._last_status,
            score=self._state.score,
            reward_details=self._last_reward,
            # Scalar reward mirrored at the top level for RL-style clients.
            reward=self._last_reward.value,
            done=self._state.done,
            # Raw before/after scores exposed for convenience alongside the
            # full reward breakdown above.
            metadata={
                "prev_score": self._last_reward.prev_score,
                "curr_score": self._last_reward.curr_score,
            },
        )
296
+
297
+ def apply_action(self, action: PythonCodeReviewAction) -> str:
298
+ """Return the code candidate produced by an action."""
299
+ if action.action_type == "edit_code":
300
+ return (action.code or "").strip() or self._state.current_code
301
+ return self._state.current_code
302
+
303
+ def run_tests(
304
+ self,
305
+ code: str,
306
+ include_hidden: bool = False,
307
+ ) -> tuple[float, dict[str, int], TaskGrade]:
308
+ """Grade code and return score plus simple test statistics."""
309
+ if self._task is None:
310
+ empty_results = {"passed": 0, "total": 0}
311
+ return 0.0, empty_results, TaskGrade(score=0.0)
312
+
313
+ grade = grade_task(code, self._task, include_hidden=include_hidden)
314
+ test_results = {
315
+ "passed": grade.tests_passed,
316
+ "total": grade.tests_total,
317
+ }
318
+ return grade.score, test_results, grade
319
+
320
+ def compute_reward(self, old_code, new_code, prev_score, curr_score, test_results):
321
+ # progress
322
+ progress = curr_score - prev_score
323
+
324
+ # test score
325
+ passed = test_results["passed"]
326
+ total = test_results["total"]
327
+ test_ratio = passed / total if total > 0 else 0
328
+
329
+ # syntax score
330
+ try:
331
+ compile(new_code, "<string>", "exec")
332
+ syntax_score = 1.0
333
+ except:
334
+ syntax_score = 0.0
335
+
336
+ # stagnation penalty
337
+ stagnation_penalty = 0.2 if new_code.strip() == old_code.strip() else 0.0
338
+
339
+ # regression penalty
340
+ regression_penalty = max(0.0, prev_score - curr_score)
341
+
342
+ # repetition penalty (track last 3 actions)
343
+ repetition_penalty = 0.1 if new_code == self.last_code else 0.0
344
+
345
+ # quality (simple heuristic)
346
+ length_penalty = 0.0
347
+ if len(new_code) > len(old_code) * 1.5:
348
+ length_penalty = 0.1
349
+
350
+ # final reward
351
+ reward = (
352
+ 0.4 * progress
353
+ + 0.3 * test_ratio
354
+ + 0.2 * syntax_score
355
+ - stagnation_penalty
356
+ - regression_penalty
357
+ - repetition_penalty
358
+ - length_penalty
359
+ )
360
+
361
+ # clamp
362
+ reward = max(-1.0, min(1.0, reward))
363
+
364
+ return reward
365
+
366
+ def _apply_reward_randomization(self, reward: float) -> float:
367
+ """Break repeated static rewards while keeping the result bounded."""
368
+ reward = max(-1.0, min(1.0, reward))
369
+ self.reward_history.append(reward)
370
+ if len(self.reward_history) >= 3 and len(set(self.reward_history[-3:])) == 1:
371
+ reward += random.uniform(-0.05, 0.05)
372
+ reward = max(-1.0, min(1.0, reward))
373
+ self.reward_history[-1] = reward
374
+ return reward
375
+
376
+ def _build_reward_details(
377
+ self,
378
+ old_code: str,
379
+ new_code: str,
380
+ prev_score: float,
381
+ curr_score: float,
382
+ test_results: dict[str, int],
383
+ reward_value: float,
384
+ reason: str,
385
+ ) -> RewardDetails:
386
+ """Build a reward payload that matches the scalar reward computation."""
387
+ passed = test_results["passed"]
388
+ total = test_results["total"]
389
+ test_ratio = passed / total if total > 0 else 0.0
390
+ try:
391
+ compile(new_code, "<string>", "exec")
392
+ syntax_score = 1.0
393
+ except SyntaxError:
394
+ syntax_score = 0.0
395
+
396
+ stagnation_penalty = 0.2 if new_code.strip() == old_code.strip() else 0.0
397
+ regression_penalty = max(0.0, prev_score - curr_score)
398
+ repetition_penalty = 0.1 if new_code == self.last_code else 0.0
399
+ length_penalty = 0.1 if len(new_code) > len(old_code) * 1.5 else 0.0
400
+
401
+ return RewardDetails(
402
+ value=reward_value,
403
+ progress_delta=0.4 * (curr_score - prev_score),
404
+ syntax_reward=0.2 * syntax_score,
405
+ test_reward=0.3 * test_ratio,
406
+ quality_bonus=-length_penalty,
407
+ stagnation_penalty=stagnation_penalty,
408
+ regression_penalty=regression_penalty + repetition_penalty,
409
+ reason=reason,
410
+ prev_score=round(prev_score, 6),
411
+ curr_score=round(curr_score, 6),
412
+ code_changed=new_code.strip() != old_code.strip(),
413
+ )
414
+
415
    def _handle_analyze(self) -> tuple[RewardDetails, str]:
        """Analyze code for errors and test status.

        Read-only action: grades the current code against the visible tests,
        updates the state's error/test feedback fields, and returns a reward
        plus a human-readable summary. The code itself is never modified.
        """
        if self._task is None:
            return RewardDetails(value=0.0, reason="Invalid state"), "Error: task not loaded"

        old_code = self._state.current_code
        prev_score = self._previous_score
        curr_score, test_results, curr_grade = self.run_tests(old_code, include_hidden=False)
        error = curr_grade.details.get("compile_error", "")

        # Status message
        if error:
            self._state.errors = error
            self._state.test_results = "Compilation failed. Fix syntax first."
            summary = f"Syntax error detected: {error}"
        else:
            self._state.errors = ""
            if self._task.task_kind == "syntax_fix":
                self._state.test_results = "Code compiles successfully."
                summary = "Code compiles. Ready to submit."
            else:
                visible_total = len(self._task.visible_tests)
                visible_passed = curr_grade.tests_passed
                self._state.test_results = f"Test run: {visible_passed}/{visible_total} passing."
                summary = self._state.test_results

        # old_code is passed as both "old" and "new" because analyze leaves
        # the code untouched; the stagnation penalty therefore applies.
        reward_value = self.compute_reward(old_code, old_code, prev_score, curr_score, test_results)
        reward_value = self._apply_reward_randomization(reward_value)
        reward = self._build_reward_details(
            old_code=old_code,
            new_code=old_code,
            prev_score=prev_score,
            curr_score=curr_score,
            test_results=test_results,
            reward_value=reward_value,
            reason=summary,
        )

        # Update state
        self._state.score = curr_score
        self._state.errors = curr_grade.details.get("compile_error", "")
        self._previous_score = curr_score
        self.last_code = old_code
        self._append_history("analyze_code", summary, reward.value)
        return reward, summary
460
+
461
+ def _handle_edit(self, action: PythonCodeReviewAction) -> tuple[RewardDetails, str]:
462
+ """Edit the code and compute reward for progress."""
 
463
  if self._task is None:
464
  return RewardDetails(value=0.0, reason="Invalid state"), "Error: task not loaded"
465
 
 
470
  invalid_action_penalty=INVALID_ACTION_PENALTY,
471
  reason="Edit action requires non-empty code.",
472
  )
473
+ status = "Invalid: edit_code requires code parameter."
474
+ self._append_history("edit_code", status, reward.value)
475
+ return reward, status
476
+
477
+ old_code = self._state.current_code
478
+ prev_score = self._previous_score
479
+ curr_score, test_results, curr_grade = self.run_tests(code, include_hidden=False)
480
+
481
+ # Update state
482
+ self._state.current_code = code
483
+ self._previous_code = code
484
+ self._state.errors = curr_grade.details.get("compile_error", "")
485
+ self._state.test_results = self._format_test_results(curr_grade)
486
+ self._state.score = curr_score
487
+
488
+ status = "Code updated."
 
 
 
 
 
 
 
 
 
 
 
 
 
489
  if self._state.errors:
490
  status = f"Code updated, but syntax issues remain: {self._state.errors}"
491
+ elif curr_grade.tests_total > 0:
492
+ status = self._state.test_results
493
+
494
+ reward_value = self.compute_reward(old_code, code, prev_score, curr_score, test_results)
495
+ reward_value = self._apply_reward_randomization(reward_value)
496
+ reward = self._build_reward_details(
497
+ old_code=old_code,
498
+ new_code=code,
499
+ prev_score=prev_score,
500
+ curr_score=curr_score,
501
+ test_results=test_results,
502
+ reward_value=reward_value,
503
+ reason=status,
504
+ )
505
+
506
+ self._previous_score = curr_score
507
+ self.last_code = code
508
+ self._append_history("edit_code", status, reward.value)
509
+ return reward, status
510
+
511
    def _handle_run_tests(self) -> tuple[RewardDetails, str]:
        """Run tests and provide feedback.

        Read-only action: grades the current code against the visible tests
        only (hidden tests are used exclusively by submit), refreshes the
        state's feedback fields, and returns a reward plus a status message.
        """
        if self._task is None:
            return RewardDetails(value=0.0, reason="Invalid state"), "Error: task not loaded"

        old_code = self._state.current_code
        prev_score = self._previous_score
        curr_score, test_results, curr_grade = self.run_tests(old_code, include_hidden=False)

        # Update state
        self._state.errors = curr_grade.details.get("compile_error", "")
        self._state.test_results = self._format_test_results(curr_grade)
        self._state.score = curr_score

        # Surface compile errors in preference to test output.
        status = self._state.test_results if not self._state.errors else self._state.errors
        # Code is unchanged by this action, so old_code is both "old" and "new".
        reward_value = self.compute_reward(old_code, old_code, prev_score, curr_score, test_results)
        reward_value = self._apply_reward_randomization(reward_value)
        reward = self._build_reward_details(
            old_code=old_code,
            new_code=old_code,
            prev_score=prev_score,
            curr_score=curr_score,
            test_results=test_results,
            reward_value=reward_value,
            reason=status,
        )

        self._previous_score = curr_score
        self.last_code = old_code
        self._append_history("run_tests", status, reward.value)
        return reward, status
542
+
543
    def _handle_submit(self) -> tuple[RewardDetails, str]:
        """Submit solution and finalize episode.

        Grades the current code with the hidden test suite included,
        finalizes the episode, and returns the reward plus a status message.
        """
        if self._task is None:
            return RewardDetails(value=0.0, reason="Invalid state"), "Error: task not loaded"

        old_code = self._state.current_code
        prev_score = self._previous_score
        # include_hidden=True: final grading also runs the hidden tests.
        curr_score, test_results, curr_grade = self.run_tests(old_code, include_hidden=True)

        # Update state
        self._state.errors = curr_grade.details.get("compile_error", "")
        self._state.test_results = self._format_test_results(curr_grade)
        self._state.score = curr_score
        self._previous_score = curr_score
        self.last_code = old_code
        self._finalize_episode(auto_submit=False, grade=curr_grade)

        # Submit leaves the code untouched, so old_code is "old" and "new".
        reward_value = self.compute_reward(old_code, old_code, prev_score, curr_score, test_results)
        reward_value = self._apply_reward_randomization(reward_value)
        status = f"Solution submitted. Final score: {curr_score:.3f}"
        reward = self._build_reward_details(
            old_code=old_code,
            new_code=old_code,
            prev_score=prev_score,
            curr_score=curr_score,
            test_results=test_results,
            reward_value=reward_value,
            reason=status,
        )

        self._append_history("submit_solution", status, reward_value)
        return reward, status
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
575
 
576
  def _compute_reward_components(
577
  self,
 
772
  )
773
  self._state.history.append(entry)
774
 
775
    def _log_debug_step(self, reward: RewardDetails) -> None:
        """Log the scalar reward signal in a compact RL-friendly format.

        Prints only the before/after scores, the reward value, and their
        delta; individual reward components are not broken out here.
        """
        print(
            f"""
        Step Debug:
        Prev Score: {reward.prev_score}
        Curr Score: {reward.curr_score}
        Reward: {reward.value}
        Progress: {reward.curr_score - reward.prev_score}
        """
        )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
786
 
787
 
788
  # Backwards-compatible aliases used elsewhere in the repo.
tests/test_reward_dynamics.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from models import PythonCodeReviewAction
from server.env import PythonCodeReviewEnvironment


# Known-good replacement code used by the edit_code step below.
# NOTE(review): assumes the "syntax-fix-easy" task expects this function
# body -- confirm against the task definition.
FIXED_SYNTAX_CODE = """def normalize_username(raw_name: str) -> str:
    cleaned = raw_name.strip().lower()
    if not cleaned:
        return "anonymous"
    return cleaned.replace(" ", "_")
"""


def test_reward_changes_across_five_steps():
    """Rewards over a five-step episode vary, stay bounded, and mix signs."""
    env = PythonCodeReviewEnvironment(verbose=False)
    env.reset(task_id="syntax-fix-easy")

    actions = [
        PythonCodeReviewAction(action_type="analyze_code"),
        PythonCodeReviewAction(action_type="analyze_code"),
        PythonCodeReviewAction(action_type="run_tests"),
        PythonCodeReviewAction(action_type="edit_code", code=FIXED_SYNTAX_CODE),
        PythonCodeReviewAction(action_type="submit_solution"),
    ]

    rewards = []
    for action in actions:
        observation = env.step(action)
        rewards.append(float(observation.reward or 0.0))

    # Every reward must respect the documented [-1, +1] bound.
    assert all(-1.0 <= reward <= 1.0 for reward in rewards)
    # Rewards must not collapse to a single static value.
    assert len(set(rewards)) > 1
    assert any(reward > 0 for reward in rewards)
    assert any(reward < 0 for reward in rewards)
    # No three consecutive identical rewards (anti-stagnation guarantee).
    assert not any(
        rewards[index] == rewards[index + 1] == rewards[index + 2]
        for index in range(len(rewards) - 2)
    )