| """ |
| AgentDebuggerEnv β GRPO Training Script |
| Model: Qwen2.5-Coder-7B-Instruct (float16/bfloat16 + LoRA, no quantization) |
| Algorithm: GRPO (Group Relative Policy Optimization) via HuggingFace TRL |
| GPU: auto-detected at runtime (A100/H100 β bfloat16+large batch, T4/V100 β float16+small batch) |
| |
| Usage: |
| # Local reward sanity-check (no GPU, no model loading): |
| python training/train_grpo.py --test-local |
| |
| # Test run (Colab/GPU, 10 steps): |
| python training/train_grpo.py --test |
| |
| # Full training run: |
| python training/train_grpo.py |
| |
| # Resume from checkpoint: |
| python training/train_grpo.py --resume ./checkpoints/checkpoint-400 |
| """ |
|
|
| import os |
| import sys |
| import json |
| import argparse |
| import random |
| import subprocess |
| import tempfile |
| import shutil |
| from importlib import metadata |
|
|
| |


parser = argparse.ArgumentParser()
parser.add_argument("--test", action="store_true", help="Run 10 steps for testing (Colab/GPU)")
parser.add_argument("--test-local", action="store_true", dest="test_local",
                    help="Sanity-check reward function locally without any model or GPU")
parser.add_argument("--resume", type=str, default=None, help="Path to checkpoint")
parser.add_argument("--max_steps", type=int, default=500)
args = parser.parse_args()
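

# Bootstrap phase: heavy ML dependencies are installed (and later imported) only for
# real GPU runs. --test-local skips all of this so the reward function can be checked
# on any machine that has the repo's server/ modules available.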
if not args.test_local:
    # Ensure a CUDA build of torch is present before anything imports it.
    import importlib.util

    _needs_cuda_torch = True
    if importlib.util.find_spec("torch") is not None:
        import torch as _t
        if _t.cuda.is_available():
            _needs_cuda_torch = False
        del _t
    if _needs_cuda_torch:
        print("Installing CUDA-enabled torch (cu121)...", flush=True)
        _r = os.system(
            f"{sys.executable} -m pip install -q --no-cache-dir "
            "torch --index-url https://download.pytorch.org/whl/cu121"
        )
        if _r != 0:
            print("ERROR: CUDA torch install failed.", flush=True)
            sys.exit(1)
        print("CUDA torch installed.", flush=True)

    _TRAIN_DEPS = [
        "wandb==0.18.7",
        "datasets==3.0.2",
        "transformers==4.48.3",
        "accelerate==1.0.1",
        "trl==0.15.2",
        "peft==0.13.2",
    ]
    print("Installing training dependencies...", flush=True)
    ret = os.system(
        f"{sys.executable} -m pip install -q --no-cache-dir " + " ".join(f'"{d}"' for d in _TRAIN_DEPS)
    )
    if ret != 0:
        print("ERROR: pip install failed. Training cannot continue.", flush=True)
        sys.exit(1)
    print("Dependencies installed.", flush=True)
if not args.test_local:
    import torch
    import wandb
    from datasets import Dataset
    from transformers import (
        AutoModelForCausalLM, AutoTokenizer, TrainerCallback
    )
    from peft import get_peft_model, LoraConfig, TaskType
    from trl import GRPOTrainer, GRPOConfig


def _pkg_ver(name: str) -> str:
    try:
        return metadata.version(name)
    except metadata.PackageNotFoundError:
        return "not-installed"


print(
    "Runtime package versions | "
    f"python={sys.version.split()[0]} "
    f"torch={_pkg_ver('torch')} "
    f"transformers={_pkg_ver('transformers')} "
    f"trl={_pkg_ver('trl')} "
    f"accelerate={_pkg_ver('accelerate')} "
    f"peft={_pkg_ver('peft')}"
)


sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from server.reward_calculator import DebugRewardCalculator
from server.models import parse_agent_output


MODEL_NAME = "Qwen/Qwen2.5-Coder-3B-Instruct"
HF_REPO = "shashaank0707/AgentDebugger-trained"
MAX_STEPS = 10 if args.test else args.max_steps
CHECKPOINT_DIR = "./checkpoints"


WANDB_API_KEY = os.environ.get("WANDB_API_KEY", "") if not args.test_local else ""
if WANDB_API_KEY:
    wandb.init(
        project="AgentDebuggerEnv",
        name=f"grpo-qwen-3b-{'test' if args.test else 'full'}",
        config={
            "model": MODEL_NAME,
            "algorithm": "GRPO",
            "curriculum": "tier1->tier2->tier3",
            "max_steps": MAX_STEPS,
            "reward_components": ["format", "hypothesis", "localization", "fix", "semantic", "efficiency"],
            "paper_citations": ["Masud et al. 2026", "Ibrahim et al. 2024"],
        }
    )


SYSTEM_PROMPT = """You are an expert Python debugger. You reason through bugs systematically.

You MUST respond in EXACTLY this format - no exceptions, no extra text:

OBSERVATION: [Specific observations about the code and error. Reference exact line numbers.]
HYPOTHESIS: [Your theory about the root cause. Must be at least 2 sentences. Reference specific variable names, operators, or logic.]
CONFIDENCE: [low | medium | high]
ACTION: [One of: inspect_lines | run_tests | propose_fix | request_context | give_up]
DETAIL: [For propose_fix: the complete corrected function code. For inspect_lines: line numbers. For others: specific details.]

Rules:
- Never omit any field
- HYPOTHESIS must explain WHY the bug causes the observed failure
- If proposing a fix, DETAIL must contain the complete function, not just the changed line
- Give up only if you have exhausted all reasonable hypotheses"""
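

# The labeled fields above define the contract for server.models.parse_agent_output,
# and format compliance is itself one of the reward components listed in the wandb
# config - which is why MOCK_BAD below must score strictly below MOCK_GOOD.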
def load_bugs(tier: int) -> list[dict]:
    path = f"data/bugs_tier{tier}.jsonl"
    if not os.path.exists(path):
        print(f"WARNING: {path} not found. Run data/generate_bugs.py first.")
        return []
    with open(path) as f:
        return [json.loads(line) for line in f if line.strip()]


def get_bugs_for_step(step: int) -> list[dict]:
    # Thresholds mirror CurriculumCallback below, which advances at steps 150 and 350.
    tier1 = load_bugs(1)
    if step < 150:
        return tier1
    elif step < 350:
        return tier1 + load_bugs(2)
    return tier1 + load_bugs(2) + load_bugs(3)
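

# Prompts use Qwen's ChatML markers directly (rather than tokenizer.apply_chat_template)
# and end with an open assistant turn, so generation continues straight into the
# OBSERVATION/.../DETAIL response format.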
def bug_to_prompt(bug: dict) -> str:
    return (
        f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"
        f"<|im_start|>user\n"
        f"Debug this Python function:\n\n```python\n{bug['buggy_code']}\n```\n\n"
        f"Initial failure: {bug.get('initial_error', 'Some tests are failing.')}\n"
        f"<|im_end|>\n"
        f"<|im_start|>assistant\n"
    )


def _run_fix(proposed_code: str, bug: dict) -> dict:
    """Safely run the proposed fix against the bug's test cases in a subprocess with a timeout."""
    test_cases = bug.get("test_cases", [])
    func_name = bug.get("function_name", "")
    if not proposed_code or not test_cases or not func_name:
        return {"passed": 0, "failed": 0, "total": len(test_cases), "newly_broken": 0}

    passed = 0
    for test in test_cases:
        args_str = ", ".join(repr(x) for x in test["input"])
        script = (
            f"{proposed_code}\n"
            f"try:\n"
            f"    r = {func_name}({args_str})\n"
            f"    print('PASS' if r == {repr(test['expected_output'])} else 'FAIL')\n"
            f"except Exception as e:\n"
            f"    print(f'ERROR: {{e}}')\n"
        )
        fname = None
        try:
            with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
                f.write(script)
                fname = f.name
            python = shutil.which("python3") or shutil.which("python") or sys.executable
            r = subprocess.run([python, fname], capture_output=True, text=True, timeout=5)
            if "PASS" in r.stdout:
                passed += 1
        except Exception:
            pass  # crashes and 5s timeouts simply count as failed tests
        finally:
            if fname and os.path.exists(fname):
                os.unlink(fname)  # clean up even when the subprocess times out

    return {"passed": passed, "failed": len(test_cases) - passed, "total": len(test_cases), "newly_broken": 0}
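

# Canned completions for --test-local: MOCK_GOOD follows the required format and
# proposes a plausible fix, MOCK_BAD violates it entirely. Any sane reward function
# must rank the first strictly above the second.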
MOCK_GOOD = """
OBSERVATION: The loop condition on line 4 uses <= instead of <
HYPOTHESIS: This causes an off-by-one error because Python lists are
0-indexed, so the last valid index is len(arr)-1 not len(arr)
CONFIDENCE: high
ACTION: propose_fix
DETAIL: def binary_search(arr, target):
    left, right = 0, len(arr) - 1
    while left < right:
        mid = (left + right) // 2
        if arr[mid] == target:
            return mid
        elif arr[mid] < target:
            left = mid + 1
        else:
            right = mid - 1
    return -1
"""


MOCK_BAD = """
I think there might be a bug somewhere in the code.
Let me try fixing it.
"""


if args.test_local:
    print("=" * 60)
    print("LOCAL TEST MODE - no model loaded, testing reward function only")
    print("=" * 60)

    bugs = load_bugs(1)
    if not bugs:
        print("ERROR: No bugs found in data/bugs_tier1.jsonl. Run data/generate_bugs.py first.")
        sys.exit(1)

    bug = bugs[0]
    print(f"\nUsing bug: {bug.get('function_name', '?')} - {bug.get('bug_type', '?')}\n")

    calculator_local = DebugRewardCalculator()

    def _score(label: str, completion: str) -> float:
        try:
            agent_output = parse_agent_output(completion)
            test_results = {"passed": 0, "failed": 0, "total": 0, "newly_broken": 0}
            if agent_output.action == "propose_fix":
                test_results = _run_fix(agent_output.detail, bug)
            breakdown = calculator_local.compute_turn_reward(
                agent_output=agent_output,
                ground_truth={
                    "bug_function": bug.get("bug_location", {}).get("function", ""),
                    "bug_line": bug.get("bug_location", {}).get("line_start", -1),
                    "bug_type": bug.get("bug_type", ""),
                    "canonical_fix_code": bug.get("original_code", ""),
                },
                test_results=test_results,
                turn_number=0,
            )
            print(f"--- {label} reward breakdown ---")
            for field, value in breakdown.__dict__.items():
                print(f"  {field}: {value}")
            print(f"  TOTAL: {breakdown.total}\n")
            return breakdown.total
        except Exception as e:
            print(f"Reward error for {label}: {e}")
            return -0.3

    good_score = _score("MOCK_GOOD", MOCK_GOOD)
    bad_score = _score("MOCK_BAD", MOCK_BAD)

    print(f"MOCK_GOOD score: {good_score:.4f}")
    print(f"MOCK_BAD score: {bad_score:.4f}")

    assert good_score > bad_score, (
        f"ASSERTION FAILED: MOCK_GOOD ({good_score:.4f}) should be > MOCK_BAD ({bad_score:.4f})"
    )
    print("\nLOCAL TEST PASSED")
    sys.exit(0)


_gpu_vram_gb = 0
_is_ampere_plus = False
if torch.cuda.is_available():
    _props = torch.cuda.get_device_properties(0)
    _gpu_vram_gb = _props.total_memory / 1e9
    _is_ampere_plus = _props.major >= 8  # compute capability 8.x+ (Ampere or newer) supports bfloat16
    print(f"GPU: {_props.name} | VRAM: {_gpu_vram_gb:.1f}GB | "
          f"Compute cap: {_props.major}.{_props.minor} | "
          f"bfloat16: {'yes' if _is_ampere_plus else 'no'}")

COMPUTE_DTYPE = torch.bfloat16 if _is_ampere_plus else torch.float16
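

# VRAM-tiered hyperparameters. Roughly: 70GB+ covers an A100-80GB/H100, 40GB+ an
# A100-40GB, 20GB+ cards like the L4/A10, and anything smaller (T4/V100 16GB) gets
# the most conservative settings. _num_gen is the GRPO group size; the batch size
# stays a multiple of it in every tier.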
if _gpu_vram_gb >= 70:
    _batch = 8
    _grad_accum = 1
    _num_gen = 8
    _max_comp = 256
    _lora_r = 16
elif _gpu_vram_gb >= 40:
    _batch = 4
    _grad_accum = 2
    _num_gen = 4
    _max_comp = 256
    _lora_r = 16
elif _gpu_vram_gb >= 20:
    _batch = 2
    _grad_accum = 4
    _num_gen = 2
    _max_comp = 192
    _lora_r = 8
else:
    _batch = 2
    _grad_accum = 4
    _num_gen = 2
    _max_comp = 160
    _lora_r = 8

print(f"Training config: batch={_batch} grad_accum={_grad_accum} "
      f"num_gen={_num_gen} max_comp={_max_comp} lora_r={_lora_r} "
      f"dtype={COMPUTE_DTYPE}")


print(f"Loading {MODEL_NAME} in {COMPUTE_DTYPE}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"  # left-padding so generation continues from the prompt's end

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=COMPUTE_DTYPE,
)
model.config.use_cache = False  # KV cache is incompatible with gradient checkpointing

lora_config = LoraConfig(
    r=_lora_r,
    lora_alpha=_lora_r * 2,  # common heuristic: alpha = 2 * r
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.0,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)
model = get_peft_model(model, lora_config)
model.enable_input_require_grads()  # required for gradient checkpointing with frozen base weights
model.gradient_checkpointing_enable()
print(f"Trainable params: {model.num_parameters(only_trainable=True):,}")
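

# Some runtimes report CUDA as available but fail on the first kernel launch
# (stale drivers, half-initialized Colab sessions). Probe with a tiny allocation
# before committing to a device instead of crashing mid-generation.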
def _select_runtime_device(model) -> str:
    """
    Pick the safest generation device without forcing CUDA init on broken drivers.
    """
    def _cuda_usable() -> bool:
        try:
            if not torch.cuda.is_available():
                return False
            # A one-element allocation forces lazy CUDA init and surfaces driver errors.
            _ = torch.zeros(1, device="cuda")
            return True
        except Exception as e:
            print(f"WARNING: CUDA initialization failed ({e}). Falling back to CPU.")
            return False

    # Prefer whichever device the model's weights actually landed on.
    try:
        model_device = str(next(model.parameters()).device)
        if model_device.startswith("cuda") and not _cuda_usable():
            return "cpu"
        return model_device
    except Exception:
        pass

    return "cuda" if _cuda_usable() else "cpu"


RUNTIME_DEVICE = _select_runtime_device(model)
print(f"Using generation/training runtime device: {RUNTIME_DEVICE}")


calculator = DebugRewardCalculator()
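

# Reward pipeline per completion:
#   text -> parse_agent_output -> (_run_fix in a subprocess, for propose_fix only)
#        -> DebugRewardCalculator.compute_turn_reward -> scalar total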
def reward_fn(completions: list[str], prompts: list[str], **kwargs) -> list[float]:
    """
    GRPO reward function. Called on groups of completions for the same prompt.
    GRPO learns from RELATIVE differences within each group, so the ranking the
    rewards induce matters more than their absolute scale.
    """
    rewards = []
    # Extra dataset columns arrive via kwargs; bug_metadata is JSON-encoded per row.
    bugs_raw = kwargs.get("bug_metadata", [{}] * len(completions))
    bugs = [json.loads(b) if isinstance(b, str) else b for b in bugs_raw]

    for completion, bug in zip(completions, bugs):
        try:
            agent_output = parse_agent_output(completion)

            test_results = {"passed": 0, "failed": 0, "total": 0, "newly_broken": 0}
            if agent_output.action == "propose_fix" and bug:
                test_results = _run_fix(agent_output.detail, bug)

            breakdown = calculator.compute_turn_reward(
                agent_output=agent_output,
                ground_truth={
                    "bug_function": bug.get("bug_location", {}).get("function", ""),
                    "bug_line": bug.get("bug_location", {}).get("line_start", -1),
                    "bug_type": bug.get("bug_type", ""),
                    "canonical_fix_code": bug.get("original_code", ""),
                },
                test_results=test_results,
                turn_number=0,
            )

            if WANDB_API_KEY:
                wandb.log(dict(breakdown.__dict__))

            rewards.append(breakdown.total)

        except Exception as e:
            # Unparseable output earns a flat penalty instead of crashing training.
            print(f"Reward error: {e}")
            rewards.append(-0.3)

    return rewards


def run_baseline(n: int = 20) -> dict:
    print("\nRunning baseline evaluation on UNTRAINED model...")
    model.eval()
    bugs = load_bugs(1)[:n]
    rewards = []
    solved = 0
    for bug in bugs:
        prompt = bug_to_prompt(bug)
        inputs = tokenizer(prompt, return_tensors="pt").to(RUNTIME_DEVICE)
        with torch.no_grad():
            out = model.generate(**inputs, max_new_tokens=200, do_sample=False)
        completion = tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
        r = reward_fn([completion], [prompt], bug_metadata=[bug])
        rewards.append(r[0])
        if r[0] > 0.20:  # same "solved" threshold as the post-training evaluation
            solved += 1

    result = {
        "solve_rate": solved / max(len(bugs), 1),
        "avg_reward": sum(rewards) / max(len(rewards), 1),
        "rewards": rewards,
    }
    with open("baseline_results.json", "w") as f:
        json.dump(result, f)
    print(f"Baseline: solve_rate={result['solve_rate']:.1%}, avg_reward={result['avg_reward']:.3f}")
    if WANDB_API_KEY:
        wandb.log({"baseline/solve_rate": result["solve_rate"], "baseline/avg_reward": result["avg_reward"]})
    return result


baseline = run_baseline()
model.train()


def make_dataset(step: int) -> Dataset:
    bugs = get_bugs_for_step(step)
    # Metadata is JSON-encoded so the column stays a plain string through Arrow conversion.
    return Dataset.from_list([{"prompt": bug_to_prompt(b), "bug_metadata": json.dumps(b)} for b in bugs])


config = GRPOConfig(
    output_dir=CHECKPOINT_DIR,
    max_steps=MAX_STEPS,
    per_device_train_batch_size=_batch,
    gradient_accumulation_steps=_grad_accum,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_steps=10 if args.test else 30,
    num_generations=_num_gen,
    max_completion_length=_max_comp,
    temperature=0.9,
    logging_steps=5,
    save_steps=50,
    report_to="wandb" if WANDB_API_KEY else "none",
)
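

# For each prompt, GRPOTrainer samples num_generations completions at temperature 0.9,
# scores them with reward_fn, and derives advantages from each completion's reward
# relative to its group; no separate value model is needed.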
trainer = GRPOTrainer(
    model=model,
    args=config,
    train_dataset=make_dataset(0),
    reward_funcs=reward_fn,
    processing_class=tokenizer,
)


class CurriculumCallback(TrainerCallback):
    def on_step_end(self, args, state, control, **kwargs):
        step = state.global_step
        if step in (150, 350):
            # Advance to the next tier mix; thresholds mirror get_bugs_for_step.
            # Note: the Trainer only picks up the new dataset when its dataloader
            # is rebuilt.
            trainer.train_dataset = make_dataset(step)
            print(f"\nCurriculum advanced at step {step}!")
            if WANDB_API_KEY:
                wandb.log({"curriculum/step": step})


trainer.add_callback(CurriculumCallback())
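

# trainer.train() restores model, optimizer, and LR-scheduler state when --resume
# points at a checkpoint directory saved by an earlier run (save_steps=50 above).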
| print(f"\nStarting GRPO training. Max steps: {MAX_STEPS}") |
| print(f"Baseline solve rate: {baseline['solve_rate']:.1%} β target: >60% after training") |
| trainer.train(resume_from_checkpoint=args.resume) |
|
|
| |


# Post-training evaluation: same 20 tier-1 bugs, greedy decoding, and solve
# threshold as the baseline pass, so the two rates are directly comparable.
model.eval()
bugs = load_bugs(1)[:20]
post_rewards = []
post_solved = 0
for bug in bugs:
    prompt = bug_to_prompt(bug)
    inputs = tokenizer(prompt, return_tensors="pt").to(RUNTIME_DEVICE)
    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=200, do_sample=False)
    completion = tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    r = reward_fn([completion], [prompt], bug_metadata=[bug])
    post_rewards.append(r[0])
    if r[0] > 0.20:
        post_solved += 1

post_solve_rate = post_solved / max(len(bugs), 1)
print(f"\n{'='*60}")
print("RESULTS:")
print(f"Before training: {baseline['solve_rate']:.1%} solve rate")
print(f"After training:  {post_solve_rate:.1%} solve rate")
print(f"Improvement:     +{post_solve_rate - baseline['solve_rate']:.1%}")
print(f"{'='*60}")


if WANDB_API_KEY:
    wandb.log({"final/solve_rate": post_solve_rate, "final/improvement": post_solve_rate - baseline["solve_rate"]})
    wandb.finish()


model.save_pretrained("./final_model")
tokenizer.save_pretrained("./final_model")
HF_TOKEN = os.environ.get("HF_TOKEN")
if HF_TOKEN and not args.test:
    model.push_to_hub(HF_REPO, token=HF_TOKEN, private=True)
    tokenizer.push_to_hub(HF_REPO, token=HF_TOKEN, private=True)
    print(f"Pushed to https://huggingface.co/{HF_REPO}")