# server/reward_calculator.py
"""
DebugRewardCalculator - multi-component reward system for AgentDebuggerEnv.

Reward taxonomy follows:
- Ibrahim et al. (2024) "Comprehensive Overview of Reward Engineering and Shaping" (Paper 1)
  -> potential-based shaping for the efficiency component, preserving policy invariance
- Masud et al. (2026) "Reward Engineering for RL in Software Tasks" (Paper 2)
  -> their execution-based + process-based + semantic-similarity taxonomy for the
     remaining components

Design principle: GRPO learns by comparing completions WITHIN a group, so relative
reward differences matter more than absolute values. Therefore: be generous with
partial credit so the model gets a differentiated signal even when nothing fully works.
"""
import difflib
import re
from dataclasses import dataclass
from typing import Optional
from server.models import StructuredAgentOutput
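
# Illustration of the design principle above, assuming GRPO's usual group-relative
# advantage A_i = (r_i - mean(r)) / std(r). This helper is a sketch for readers and
# is not used by DebugRewardCalculator.
def _group_relative_advantages(rewards: list[float]) -> list[float]:
    """
    Illustrative only - not called anywhere in this module.

    Example: rewards [0.05, 0.30, 0.30, 0.90] (partial credit spreads the values)
    map to advantages of roughly [-1.08, -0.28, -0.28, 1.64], so even completions
    that never solve the bug still produce a differentiated learning signal.
    """
    mean = sum(rewards) / len(rewards)
    variance = sum((r - mean) ** 2 for r in rewards) / len(rewards)
    std = variance ** 0.5
    if std == 0.0:
        std = 1.0  # all rewards identical -> zero advantage for every completion
    return [(r - mean) / std for r in rewards]
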
@dataclass
class RewardBreakdown:
    format_compliance: float  # fires every turn - gives early training signal
hypothesis_quality: float # process-based reward (Paper 2 taxonomy)
localization: float # execution-based proxy
fix_quality: float # execution-based reward (primary terminal signal)
semantic_similarity: float # semantic reward (Paper 2 taxonomy)
efficiency_potential: float # potential-based shaping (Paper 1)
penalties: float
total: float
class DebugRewardCalculator:
"""
Reward weights (must sum to 1.0 excluding penalties):
        format_compliance:    0.10 - fires every turn, drives early curve movement
        hypothesis_quality:   0.20 - process-based, independent of fix success
        localization:         0.15 - did the agent find the right place?
        fix_quality:          0.35 - execution-based, primary terminal signal (sparse)
        semantic_similarity:  0.10 - how close to the canonical fix?
        efficiency_potential: 0.10 - potential-based shaping across turns
IMPORTANT NOTE ON SPARSITY vs DENSITY:
    The fix_quality reward (0.35) is sparse - it only fires when tests pass.
    The format, hypothesis, and localization rewards are dense - they fire every turn.
This combination is intentional: dense rewards carry gradient signal while the
model is still learning to fix bugs; sparse rewards dominate once it gets good.
This directly implements Ibrahim et al.'s recommendation to combine reward
shaping with terminal rewards to solve the sparse reward problem.
"""
MAX_TURNS = 5
def compute_turn_reward(
self,
agent_output: StructuredAgentOutput,
ground_truth: dict,
test_results: dict,
turn_number: int,
) -> RewardBreakdown:
"""
Compute reward for a single agent turn.
Args:
agent_output: parsed structured output from the agent
ground_truth: {
"bug_function": str, # name of function containing the bug
"bug_line": int, # line number of the bug
"bug_type": str, # category of bug
"canonical_fix_code": str, # the correct minimal fix
}
test_results: {
"passed": int,
"failed": int,
"total": int,
"newly_broken": int, # tests that passed before but fail after fix
}
turn_number: 0-indexed turn number within the episode
Returns:
RewardBreakdown with total and all component scores
"""
# ── COMPONENT 1: FORMAT COMPLIANCE ────────────────────────────────
# This fires EVERY turn. Gives the model early training signal before
# it learns to fix bugs. Drives curve movement in first 50-100 steps.
if agent_output.valid:
format_score = 0.10
else:
# Partial credit: how many fields were present?
fields_present = sum([
len(agent_output.observation) > 5,
len(agent_output.hypothesis) > 10,
agent_output.confidence in {"low", "medium", "high"},
agent_output.action in {"inspect_lines", "run_tests", "propose_fix",
"request_context", "give_up"},
len(agent_output.detail) > 0,
])
format_score = -0.25 + (fields_present * 0.04) # -0.25 to -0.05
# ── COMPONENT 2: HYPOTHESIS QUALITY (Process-based, Paper 2) ──────
# Score reasoning quality INDEPENDENTLY from whether the fix works.
# A correct diagnosis that leads to a wrong fix still gets rewarded here.
# This trains the model to reason carefully even when uncertain.
hypothesis_score = 0.0
hypothesis = agent_output.hypothesis
if len(hypothesis.split()) >= 20:
hypothesis_score += 0.05 # not a one-liner
# References specific code elements (backticks, quotes, or operators)
if re.search(r'[`\'"<>!=+\-*/]', hypothesis):
hypothesis_score += 0.05
        # Mentions a specific number (e.g. a line number); any standalone integer
        # counts, in line with the generous-partial-credit design above
        if re.search(r'\b\d+\b', hypothesis):
hypothesis_score += 0.05
# Logically consistent: OBSERVATION and HYPOTHESIS reference same code area
obs_words = set(agent_output.observation.lower().split())
hyp_words = set(hypothesis.lower().split())
overlap = len(obs_words & hyp_words) / max(len(obs_words), 1)
if overlap > 0.15:
hypothesis_score += 0.05
# Confidence calibration: rewards correct confidence, penalizes overconfidence
# High confidence + correct = bonus, High confidence + wrong = penalty
if agent_output.action == "propose_fix":
            # Only count as passing when tests actually ran (total > 0)
            tests_pass = (
                test_results.get("total", 0) > 0
                and test_results.get("passed", 0) == test_results.get("total", 0)
            )
if agent_output.confidence == "high" and tests_pass:
hypothesis_score += 0.05 # well-calibrated
elif agent_output.confidence == "high" and not tests_pass:
hypothesis_score -= 0.05 # overconfident
elif agent_output.confidence == "low" and tests_pass:
hypothesis_score += 0.02 # humble but correct
hypothesis_score = max(0.0, min(hypothesis_score, 0.20))
# ── COMPONENT 3: LOCALIZATION (Execution-based proxy) ─────────────
# Did the agent identify WHERE the bug is, independently of fixing it?
localization_score = 0.0
bug_function = ground_truth.get("bug_function", "").lower()
bug_line = str(ground_truth.get("bug_line", -1))
combined_text = (agent_output.hypothesis + " " + agent_output.detail).lower()
if bug_function and bug_function in combined_text:
localization_score += 0.08
if bug_line != "-1" and bug_line in agent_output.hypothesis:
localization_score += 0.07
localization_score = min(localization_score, 0.15)
# ── COMPONENT 4: FIX QUALITY (Execution-based, Paper 2 primary) ───
# This is the dominant signal. Sparse but high value.
# Paper 1: combine with shaping (components 1-3) to solve sparse problem.
total_tests = test_results.get("total", 0)
passed_tests = test_results.get("passed", 0)
fix_score = 0.0
if total_tests > 0 and agent_output.action == "propose_fix":
pass_rate = passed_tests / total_tests
if pass_rate == 1.0:
                fix_score = 0.35  # full solve - this is what we're training for
elif pass_rate >= 0.75:
fix_score = 0.20 # most tests pass
elif pass_rate >= 0.50:
fix_score = 0.12 # more than half pass
elif pass_rate > 0.0:
fix_score = 0.05 # at least something works
# 0.0 if nothing passes β€” no credit for non-fix actions
# ── COMPONENT 5: SEMANTIC SIMILARITY (Paper 2 taxonomy) ───────────
# How structurally close is the proposed fix to the canonical fix?
        # Uses difflib - no heavy NLP dependencies needed.
semantic_score = 0.0
proposed = agent_output.detail
canonical = ground_truth.get("canonical_fix_code", "")
if proposed and canonical and agent_output.action == "propose_fix":
similarity = difflib.SequenceMatcher(None, proposed, canonical).ratio()
if similarity >= 0.85:
semantic_score = 0.10
elif similarity >= 0.65:
semantic_score = 0.05
elif similarity >= 0.40:
semantic_score = 0.02
# No reward below 0.40 similarity β€” prevents gaming with partial matches
# ── COMPONENT 6: EFFICIENCY POTENTIAL (Potential-based, Paper 1) ──
        # Potential function: Φ(state) = 0.02 * remaining turns.
        # Strict potential-based shaping would add F(s, a, s') = γΦ(s') - Φ(s), the
        # form proven policy-invariant by Ibrahim et al. (Theorem 1); here the
        # potential itself is added as a dense per-turn bonus that rewards finishing
        # early and accelerates convergence (see the _potential_shaping_delta sketch
        # below for the strict form).
remaining_turns = self.MAX_TURNS - turn_number
efficiency_potential = 0.02 * remaining_turns # max 0.10 on turn 0
# ── PENALTIES ─────────────────────────────────────────────────────
penalties = 0.0
        # Regression: fix breaks previously-passing tests - severe
if test_results.get("newly_broken", 0) > 0:
penalties -= 0.20
# Give up: agent chose to give_up
if agent_output.action == "give_up":
penalties -= 0.15
# Invalid action: not one of the 5 valid actions
if agent_output.action == "invalid":
penalties -= 0.10
# Invalid format (already captured in format_score, add extra penalty)
if not agent_output.valid:
penalties -= 0.10
# ── TOTAL ─────────────────────────────────────────────────────────
raw_total = (
format_score
+ hypothesis_score
+ localization_score
+ fix_score
+ semantic_score
+ efficiency_potential
+ penalties
)
# Floor at -0.5 to prevent reward death spiral (Ibrahim et al.)
total = max(raw_total, -0.5)
return RewardBreakdown(
format_compliance=round(format_score, 4),
hypothesis_quality=round(hypothesis_score, 4),
localization=round(localization_score, 4),
fix_quality=round(fix_score, 4),
semantic_similarity=round(semantic_score, 4),
efficiency_potential=round(efficiency_potential, 4),
penalties=round(penalties, 4),
total=round(total, 4),
)
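    # Sketch of the strict potential-based shaping term referenced in Component 6.
    # Illustrative only - it is NOT called by compute_turn_reward. It assumes
    # Φ(s) = 0.02 * remaining turns and returns F(s, a, s') = γΦ(s') - Φ(s),
    # the form Ibrahim et al. prove leaves the optimal policy unchanged.
    def _potential_shaping_delta(self, turn_number: int, gamma: float = 0.9) -> float:
        """Illustrative strict shaping delta between this turn and the next."""
        phi_current = 0.02 * (self.MAX_TURNS - turn_number)
        phi_next = 0.02 * max(self.MAX_TURNS - (turn_number + 1), 0)
        return gamma * phi_next - phi_current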
def compute_episode_reward(self, trajectory: list[dict]) -> float:
"""
Aggregate turn rewards across an episode.
        Uses a 0.9 discount factor - later turns are worth slightly less.
Adds solve bonus if bug was fixed before max turns.
"""
if not trajectory:
return 0.0
total = 0.0
discount = 1.0
for turn in trajectory:
total += discount * turn["reward"].total
discount *= 0.9
# Solve bonus: incentivizes actually solving the bug
solved = any(t["reward"].fix_quality >= 0.35 for t in trajectory)
if solved:
total += 0.20
return round(total, 4)
def get_reward_breakdown_for_logging(self, trajectory: list[dict]) -> dict:
"""Returns per-component averages across episode for W&B logging."""
if not trajectory:
return {}
components = [
"format_compliance", "hypothesis_quality", "localization",
"fix_quality", "semantic_similarity", "efficiency_potential", "penalties"
]
return {
f"reward/{c}": round(
sum(t["reward"].__dict__[c] for t in trajectory) / len(trajectory), 4
)
for c in components
}
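
# Minimal smoke test / usage sketch (assuming the server package is importable).
# The trajectory dicts below are hypothetical: each turn only needs a "reward" key
# holding a RewardBreakdown, which is all compute_episode_reward and the logging
# helper actually read.
if __name__ == "__main__":
    calc = DebugRewardCalculator()
    trajectory = [
        # Turn 0: agent inspects code, no fix proposed yet.
        {"reward": RewardBreakdown(0.10, 0.15, 0.08, 0.0, 0.0, 0.10, 0.0, 0.43)},
        # Turn 1: agent proposes a fix and all tests pass.
        {"reward": RewardBreakdown(0.10, 0.20, 0.15, 0.35, 0.10, 0.08, 0.0, 0.98)},
    ]
    # 0.43 + 0.9 * 0.98 = 1.312, plus the 0.20 solve bonus -> 1.512
    print(calc.compute_episode_reward(trajectory))
    print(calc.get_reward_breakdown_for_logging(trajectory))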