"""Visible verifier: the immediate reward signal the Repair Agent sees. 4 weighted components, summed to a scalar. This is what drives the Repair Agent's GRPO updates each rollout. Multiple independent components were chosen on purpose, per the reward-engineering survey (arxiv 2408.10215) and software-tasks survey (arxiv 2601.19100): a single scalar is far easier to game than a composable rubric. """ from __future__ import annotations import re from forgeenv.tasks.models import ExecutionResult, Task WEIGHTS: dict[str, float] = { "script_executes": 1.0, "loss_decreased": 0.5, "checkpoint_appeared": 0.3, "diff_size_penalty": 0.2, # multiplied with a non-positive component value } def compute_visible_reward( result: ExecutionResult, task: Task ) -> tuple[float, dict[str, float]]: """Compute scalar visible reward and per-component breakdown.""" components: dict[str, float] = {} components["script_executes"] = 1.0 if result.exit_code == 0 else 0.0 components["loss_decreased"] = _check_loss_trend(result.stdout) components["checkpoint_appeared"] = 1.0 if result.checkpoint_exists else 0.0 original_lines = max(len(task.script_content.splitlines()), 1) current_lines = ( len(result.script_content.splitlines()) if result.script_content else original_lines ) diff_ratio = abs(current_lines - original_lines) / original_lines components["diff_size_penalty"] = -1.0 * diff_ratio if diff_ratio > 0.5 else 0.0 total = sum(components[k] * WEIGHTS[k] for k in components) return total, components def _check_loss_trend(stdout: str) -> float: """Parse stdout for `loss=...` patterns and return the fraction of consecutive steps where loss strictly decreased.""" losses: list[float] = [] for line in stdout.splitlines(): match = re.search(r"loss[=:\s]+([\d.]+)", line, re.IGNORECASE) if match: try: losses.append(float(match.group(1))) except ValueError: continue if len(losses) < 2: return 0.0 decreasing_steps = sum( 1 for i in range(1, len(losses)) if losses[i] < losses[i - 1] ) return decreasing_steps / (len(losses) - 1)