---
# Reward-shaping overlay for an SFT-warmstarted GRPO run.
# Starts from grpo_small.yaml (via `extends`) and supplies the dense-training
# reward settings listed below.
#
# NOTE(review): this file was recovered from a copy whose line breaks and
# indentation had been collapsed. The nesting below (all reward components
# under `reward:`) is reconstructed from the key names -- confirm it against
# the config loader's expected schema before relying on it.
#
# Entry conventions used throughout:
#   value        fixed reward (positive) or penalty (negative)
#   cap          bound on the accumulated amount for that entry
#   early/middle/late/final
#                per-stage values, used where an entry varies by stage
#   description  human-readable rationale; not a numeric setting
extends: grpo_small.yaml

reward:
  mode: dense_train
  training_mode: dense_train
  stage: early

  # --- Global bounds and stage weighting -----------------------------------
  progressive_cap:
    value: 8.0
    description: "Higher shaping budget for SFT-warmstarted GRPO so early correct workflow actions separate from random exploration."
  penalty_floor:
    value: -4.0
    description: "Less severe dense floor for fast policy learning while terminal verifier penalties still apply."
  train_cap:
    value: 26.0
    description: "Allows strong progressive and terminal rewards in the same episode."
  shaping_weight:
    early: 1.4
    middle: 1.1
    late: 0.8
    final: 0.25
    description: "Emphasizes workflow shaping early, then anneals toward terminal verifier reward."

  # --- Progressive (workflow) rewards --------------------------------------
  policy_inspected:
    value: 0.80
    description: "Stronger reward for starting with the policy graph, matching the SFT oracle trace."
  route_map_inspected:
    value: 0.45
    cap: 0.90
    description: "Rewards route discovery without making route-list loops attractive."
  relevant_file_inspected:
    value: 0.90
    cap: 1.40
    description: "Rewards reading or searching authorization-relevant code before patching."
  local_evidence_found:
    value: 2.20
    cap: 2.20
    description: "Prioritizes local evidence of the authorization failure before diagnosis."
  diagnosis_correct:
    value: 2.00
    description: "Large reward for correct bug class, route, policy rule, and local evidence."
  patch_applies:
    value: 1.20
    description: "Rewards applying a concrete patch after diagnosis."
  app_boots_after_patch:
    value: 1.00
    description: "Rewards keeping the generated app bootable after patching."
  visible_tests_improved:
    value: 1.20
    cap: 1.20
    description: "Rewards visible test success after the patch."
  public_routes_visible_pass:
    value: 0.70
    description: "Rewards preserving intentionally public routes."

  # --- Episode-length and verbosity pressure -------------------------------
  step_penalty:
    early: -0.002
    middle: -0.004
    late: -0.008
    final: 0.0
    cap: -0.35
    description: "Keeps mild pressure toward concise episodes without discouraging exploration."
  speed_bonus:
    value: 0.5
    description: "Small terminal success speed bonus; shaping carries early learning."
  token_penalty:
    target_tokens: 110
    early: -0.002
    middle: -0.0025
    late: -0.003
    final: 0.0
    cap: -0.45
    description: "Penalizes clipped or verbose tool calls immediately in SFT-warmstarted GRPO."

  # --- Invalid / low-value action penalties --------------------------------
  invalid_action:
    value: -0.60
    description: "Clear penalty for invalid tool calls, schema errors, or phase violations."
  repeated_invalid_action:
    value: -0.80
    description: "Stronger penalty for repeating invalid behavior."
  repeated_low_value_action:
    value: -0.45
    description: "Discourages repeated valid actions that add no new progress."
  no_progress_action:
    value: -0.20
    description: "Penalizes valid but unhelpful actions after useful progress has already been collected."
  noop_action:
    value: -0.10
    description: "Discourages no-op completions."
  repeated_file_read:
    value: -0.25
    description: "Discourages rereading the same file without a patch change."
  repeated_local_request:
    value: -0.25
    description: "Discourages repeated identical requests after evidence is known."
  repeated_visible_tests:
    value: -0.15
    description: "Discourages rerunning visible tests without a new patch."

  # --- Workflow-ordering penalties -----------------------------------------
  patch_before_policy:
    value: -0.60
    description: "Strongly discourages patching before policy inspection."
  submit_without_patch:
    value: -1.00
    description: "Strongly discourages terminal submission without a patch."
  submit_without_visible_tests:
    value: -0.60
    description: "Discourages submitting a patch before visible tests."