| """Visible verifier: the immediate reward signal the Repair Agent sees. |
| |
| 4 weighted components, summed to a scalar. This is what drives the Repair |
| Agent's GRPO updates each rollout. Multiple independent components were |
| chosen on purpose, per the reward-engineering survey (arxiv 2408.10215) |
| and software-tasks survey (arxiv 2601.19100): a single scalar is far |
| easier to game than a composable rubric. |
| """ |
| from __future__ import annotations |
|
|
| import re |
|
|
| from forgeenv.tasks.models import ExecutionResult, Task |
|
|
# Per-component multipliers applied by compute_visible_reward before summing.
WEIGHTS: dict[str, float] = {
    # 1.0 when the script's exit code is 0, else 0.0.
    "script_executes": 1.0,
    # Fraction of consecutive loss readings in stdout that strictly decreased.
    "loss_decreased": 0.5,
    # 1.0 when the execution result reports that a checkpoint exists.
    "checkpoint_appeared": 0.3,
    # The component itself is <= 0, so this weight scales a penalty.
    "diff_size_penalty": 0.2,
}
|
|
|
|
def compute_visible_reward(
    result: ExecutionResult, task: Task
) -> tuple[float, dict[str, float]]:
    """Score one execution result against its task.

    Args:
        result: Outcome of running the script. This function reads its
            ``exit_code``, ``stdout``, ``checkpoint_exists`` and
            ``script_content`` attributes.
        task: The task being attempted; only ``script_content`` (the
            original script text) is read.

    Returns:
        A ``(total, components)`` pair. ``components`` maps each component
        name to its unweighted value, and ``total`` is the sum of every
        component multiplied by its entry in ``WEIGHTS``.
    """
    # Evaluate the three signal components in a fixed order so the returned
    # breakdown always lists them the same way.
    ran_cleanly = float(result.exit_code == 0)
    loss_score = _check_loss_trend(result.stdout)
    has_checkpoint = float(bool(result.checkpoint_exists))

    # Size change is measured in line counts only, relative to the original
    # script. The baseline is clamped to 1 to avoid dividing by zero.
    baseline = max(len(task.script_content.splitlines()), 1)
    if result.script_content:
        edited = len(result.script_content.splitlines())
    else:
        # NOTE(review): a missing *or empty* script is treated as "unchanged",
        # so it never incurs a size penalty -- confirm this is intended.
        edited = baseline
    change_ratio = abs(edited - baseline) / baseline

    # Only changes larger than half the original line count are penalised;
    # the penalty grows linearly with the ratio beyond that point.
    if change_ratio > 0.5:
        size_penalty = -1.0 * change_ratio
    else:
        size_penalty = 0.0

    components: dict[str, float] = {
        "script_executes": ran_cleanly,
        "loss_decreased": loss_score,
        "checkpoint_appeared": has_checkpoint,
        "diff_size_penalty": size_penalty,
    }

    total = sum(value * WEIGHTS[name] for name, value in components.items())
    return total, components
|
|
|
|
| def _check_loss_trend(stdout: str) -> float: |
| """Parse stdout for `loss=...` patterns and return the fraction of |
| consecutive steps where loss strictly decreased.""" |
|
|
| losses: list[float] = [] |
| for line in stdout.splitlines(): |
| match = re.search(r"loss[=:\s]+([\d.]+)", line, re.IGNORECASE) |
| if match: |
| try: |
| losses.append(float(match.group(1))) |
| except ValueError: |
| continue |
|
|
| if len(losses) < 2: |
| return 0.0 |
|
|
| decreasing_steps = sum( |
| 1 for i in range(1, len(losses)) if losses[i] < losses[i - 1] |
| ) |
| return decreasing_steps / (len(losses) - 1) |
|
|