Spaces:

Humanlearning
/

Cyber_analyst-round1

Sleeping

Cyber_analyst-round1 / training / configs / sft_warmstart_fast.yaml

feat: introduce GRPO GPU fallback support, enhance training script with warmstart tagging, and add learning rate parameter for improved training flexibility

1b6d30b 28 days ago

raw

history blame contribute delete

3.72 kB

	extends: grpo_small.yaml
	reward:
	mode: dense_train
	training_mode: dense_train
	stage: early
	progressive_cap:
	value: 8.0
	description: "Higher shaping budget for SFT-warmstarted GRPO so early correct workflow actions separate from random exploration."
	penalty_floor:
	value: -4.0
	description: "Less severe dense floor for fast policy learning while terminal verifier penalties still apply."
	train_cap:
	value: 26.0
	description: "Allows strong progressive and terminal rewards in the same episode."
	shaping_weight:
	early: 1.4
	middle: 1.1
	late: 0.8
	final: 0.25
	description: "Emphasizes workflow shaping early, then anneals toward terminal verifier reward."
	policy_inspected:
	value: 0.80
	description: "Stronger reward for starting with the policy graph, matching the SFT oracle trace."
	route_map_inspected:
	value: 0.45
	cap: 0.90
	description: "Rewards route discovery without making route-list loops attractive."
	relevant_file_inspected:
	value: 0.90
	cap: 1.40
	description: "Rewards reading or searching authorization-relevant code before patching."
	local_evidence_found:
	value: 2.20
	cap: 2.20
	description: "Prioritizes local evidence of the authorization failure before diagnosis."
	diagnosis_correct:
	value: 2.00
	description: "Large reward for correct bug class, route, policy rule, and local evidence."
	patch_applies:
	value: 1.20
	description: "Rewards applying a concrete patch after diagnosis."
	app_boots_after_patch:
	value: 1.00
	description: "Rewards keeping the generated app bootable after patching."
	visible_tests_improved:
	value: 1.20
	cap: 1.20
	description: "Rewards visible test success after the patch."
	public_routes_visible_pass:
	value: 0.70
	description: "Rewards preserving intentionally public routes."
	step_penalty:
	early: -0.002
	middle: -0.004
	late: -0.008
	final: 0.0
	cap: -0.35
	description: "Keeps mild pressure toward concise episodes without discouraging exploration."
	speed_bonus:
	value: 0.5
	description: "Small terminal success speed bonus; shaping carries early learning."
	token_penalty:
	target_tokens: 110
	early: -0.002
	middle: -0.0025
	late: -0.003
	final: 0.0
	cap: -0.45
	description: "Penalizes clipped or verbose tool calls immediately in SFT-warmstarted GRPO."
	invalid_action:
	value: -0.60
	description: "Clear penalty for invalid tool calls, schema errors, or phase violations."
	repeated_invalid_action:
	value: -0.80
	description: "Stronger penalty for repeating invalid behavior."
	repeated_low_value_action:
	value: -0.45
	description: "Discourages repeated valid actions that add no new progress."
	no_progress_action:
	value: -0.20
	description: "Penalizes valid but unhelpful actions after useful progress has already been collected."
	noop_action:
	value: -0.10
	description: "Discourages no-op completions."
	repeated_file_read:
	value: -0.25
	description: "Discourages rereading the same file without a patch change."
	repeated_local_request:
	value: -0.25
	description: "Discourages repeated identical requests after evidence is known."
	repeated_visible_tests:
	value: -0.15
	description: "Discourages rerunning visible tests without a new patch."
	patch_before_policy:
	value: -0.60
	description: "Strongly discourages patching before policy inspection."
	submit_without_patch:
	value: -1.00
	description: "Strongly discourages terminal submission without a patch."
	submit_without_visible_tests:
	value: -0.60
	description: "Discourages submitting a patch before visible tests."