| """Visible verifier: the immediate reward signal the Repair Agent sees.
|
|
|
| 4 weighted components, summed to a scalar. This is what drives the Repair
|
| Agent's GRPO updates each rollout. Multiple independent components were
|
| chosen on purpose, per the reward-engineering survey (arxiv 2408.10215)
|
| and software-tasks survey (arxiv 2601.19100): a single scalar is far
|
| easier to game than a composable rubric.
|
| """
|
| from __future__ import annotations
|
|
|
| import re
|
|
|
| from forgeenv.tasks.models import ExecutionResult, Task
|
|
|
# Multiplier applied to each rubric component when compute_visible_reward
# folds the per-component values into a single scalar. The keys must match
# the component names that function produces.
WEIGHTS: dict[str, float] = {
    # 1.0 when the script exits with code 0, otherwise 0.0.
    "script_executes": 1.0,
    # Fraction of consecutive logged loss values that strictly decreased.
    "loss_decreased": 0.5,
    # 1.0 when the execution result reports a checkpoint, otherwise 0.0.
    "checkpoint_appeared": 0.3,
    # The component itself is <= 0, so this positive weight scales a penalty.
    "diff_size_penalty": 0.2,
}
|
|
|
|
|
def compute_visible_reward(
    result: ExecutionResult, task: Task
) -> tuple[float, dict[str, float]]:
    """Score one execution against the visible rubric.

    Four unweighted components are produced:

    * ``script_executes`` -- 1.0 when ``result.exit_code`` is 0, else 0.0.
    * ``loss_decreased`` -- whatever ``_check_loss_trend`` returns for
      ``result.stdout``.
    * ``checkpoint_appeared`` -- 1.0 when ``result.checkpoint_exists`` is
      truthy, else 0.0.
    * ``diff_size_penalty`` -- the negated relative change in line count
      between ``task.script_content`` and ``result.script_content`` when
      that change exceeds 50% of the original, else 0.0.

    Args:
        result: Outcome of running the (possibly edited) script.
        task: The task whose ``script_content`` is the baseline script.

    Returns:
        ``(total, components)``: ``total`` is the sum of every component
        multiplied by its entry in ``WEIGHTS``; ``components`` maps each
        component name to its unweighted value.
    """
    ran_cleanly = 1.0 if result.exit_code == 0 else 0.0
    loss_score = _check_loss_trend(result.stdout)
    has_checkpoint = 1.0 if result.checkpoint_exists else 0.0

    # Clamp the baseline to at least one line so the ratio never divides by 0.
    baseline_len = max(len(task.script_content.splitlines()), 1)
    if result.script_content:
        edited_len = len(result.script_content.splitlines())
    else:
        # No script text on the result: treat the script as unchanged.
        edited_len = baseline_len
    change_ratio = abs(edited_len - baseline_len) / baseline_len

    # Only line-count changes above 50% are penalised; smaller edits are free.
    size_penalty = -1.0 * change_ratio if change_ratio > 0.5 else 0.0

    components: dict[str, float] = {
        "script_executes": ran_cleanly,
        "loss_decreased": loss_score,
        "checkpoint_appeared": has_checkpoint,
        "diff_size_penalty": size_penalty,
    }
    total = sum(value * WEIGHTS[name] for name, value in components.items())
    return total, components
|
|
|
|
|
# ``loss`` followed by ``=``, ``:`` or whitespace, then a decimal number with
# an optional sign and optional exponent (e.g. ``0.25``, ``-1.3``, ``3.2e-05``).
# The previous ``[\d.]+`` capture truncated ``1.2e-01`` to ``1.2`` and dropped
# signed values, which corrupted the trend for small or negative losses.
_LOSS_PATTERN = re.compile(
    r"loss[=:\s]+([-+]?(?:\d+\.?\d*|\.\d+)(?:e[-+]?\d+)?)",
    re.IGNORECASE,
)


def _check_loss_trend(stdout: str) -> float:
    """Return the fraction of consecutive logged losses that strictly decreased.

    Each line of ``stdout`` is searched (case-insensitively) for ``loss``
    followed by ``=``, ``:`` or whitespace and a number. Only the first match
    on a line is used. Plain decimals, signed values and scientific notation
    are all accepted; trailing punctuation after the number is ignored.

    Args:
        stdout: Captured standard output of the executed script. A falsy
            value (empty string or ``None``) is treated as "no losses".

    Returns:
        ``decreasing_pairs / (number_of_losses - 1)`` as a float in
        ``[0.0, 1.0]``, or ``0.0`` when fewer than two losses were found.
    """
    if not stdout:
        return 0.0

    matches = (_LOSS_PATTERN.search(line) for line in stdout.splitlines())
    # The pattern only captures text that float() accepts, so no ValueError
    # handling is needed here.
    losses = [float(m.group(1)) for m in matches if m]

    if len(losses) < 2:
        return 0.0

    # Pair each loss with its successor and count strict decreases.
    decreasing_steps = sum(
        1 for previous, current in zip(losses, losses[1:]) if current < previous
    )
    return decreasing_steps / (len(losses) - 1)
|
|
|