# Cyber_analyst-round1/training/configs/sft_warmstart_fast.yaml
# Author (from repository page header): Humanlearning
# Commit 1b6d30b: feat: introduce GRPO GPU fallback support, enhance training
# script with warmstart tagging, and add learning rate parameter for improved
# training flexibility
# Reward configuration for SFT-warmstarted GRPO runs.
# Inherits from grpo_small.yaml and overrides the reward settings below.
#
# NOTE(review): the nesting here was reconstructed from a copy whose
# indentation had been flattened (which made `value`, `description`, `cap`,
# and the per-stage keys collide as duplicates). Each component's children are
# unambiguous; placing every component under `reward:` is presumed -- confirm
# against the config loader / grpo_small.yaml.
extends: grpo_small.yaml

reward:
  mode: dense_train
  training_mode: dense_train
  stage: early

  # --- Overall reward bounds -------------------------------------------------
  progressive_cap:
    value: 8.0
    description: "Higher shaping budget for SFT-warmstarted GRPO so early correct workflow actions separate from random exploration."
  penalty_floor:
    value: -4.0
    description: "Less severe dense floor for fast policy learning while terminal verifier penalties still apply."
  train_cap:
    value: 26.0
    description: "Allows strong progressive and terminal rewards in the same episode."

  # --- Per-stage shaping weight (early / middle / late / final) --------------
  shaping_weight:
    early: 1.4
    middle: 1.1
    late: 0.8
    final: 0.25
    description: "Emphasizes workflow shaping early, then anneals toward terminal verifier reward."

  # --- Positive progress rewards ---------------------------------------------
  # Entries with a `cap` carry both a `value` and a `cap`; how the two are
  # combined is defined by the consumer, not by this file.
  policy_inspected:
    value: 0.80
    description: "Stronger reward for starting with the policy graph, matching the SFT oracle trace."
  route_map_inspected:
    value: 0.45
    cap: 0.90
    description: "Rewards route discovery without making route-list loops attractive."
  relevant_file_inspected:
    value: 0.90
    cap: 1.40
    description: "Rewards reading or searching authorization-relevant code before patching."
  local_evidence_found:
    value: 2.20
    cap: 2.20
    description: "Prioritizes local evidence of the authorization failure before diagnosis."
  diagnosis_correct:
    value: 2.00
    description: "Large reward for correct bug class, route, policy rule, and local evidence."
  patch_applies:
    value: 1.20
    description: "Rewards applying a concrete patch after diagnosis."
  app_boots_after_patch:
    value: 1.00
    description: "Rewards keeping the generated app bootable after patching."
  visible_tests_improved:
    value: 1.20
    cap: 1.20
    description: "Rewards visible test success after the patch."
  public_routes_visible_pass:
    value: 0.70
    description: "Rewards preserving intentionally public routes."

  # --- Episode-length and verbosity pressure ---------------------------------
  step_penalty:
    early: -0.002
    middle: -0.004
    late: -0.008
    final: 0.0
    cap: -0.35
    description: "Keeps mild pressure toward concise episodes without discouraging exploration."
  speed_bonus:
    value: 0.5
    description: "Small terminal success speed bonus; shaping carries early learning."
  token_penalty:
    target_tokens: 110
    early: -0.002
    middle: -0.0025
    late: -0.003
    final: 0.0
    cap: -0.45
    description: "Penalizes clipped or verbose tool calls immediately in SFT-warmstarted GRPO."

  # --- Penalties for invalid or unproductive actions -------------------------
  invalid_action:
    value: -0.60
    description: "Clear penalty for invalid tool calls, schema errors, or phase violations."
  repeated_invalid_action:
    value: -0.80
    description: "Stronger penalty for repeating invalid behavior."
  repeated_low_value_action:
    value: -0.45
    description: "Discourages repeated valid actions that add no new progress."
  no_progress_action:
    value: -0.20
    description: "Penalizes valid but unhelpful actions after useful progress has already been collected."
  noop_action:
    value: -0.10
    description: "Discourages no-op completions."
  repeated_file_read:
    value: -0.25
    description: "Discourages rereading the same file without a patch change."
  repeated_local_request:
    value: -0.25
    description: "Discourages repeated identical requests after evidence is known."
  repeated_visible_tests:
    value: -0.15
    description: "Discourages rerunning visible tests without a new patch."

  # --- Penalties for out-of-order workflow steps -----------------------------
  patch_before_policy:
    value: -0.60
    description: "Strongly discourages patching before policy inspection."
  submit_without_patch:
    value: -1.00
    description: "Strongly discourages terminal submission without a patch."
  submit_without_visible_tests:
    value: -0.60
    description: "Discourages submitting a patch before visible tests."