Builder-Neekhil committed
Commit ff8a64f · verified · 1 parent: b286137

Upload career_os_sota.py

Files changed (1)
  1. career_os_sota.py +671 -0
career_os_sota.py ADDED
@@ -0,0 +1,671 @@
+ """
+ ═══════════════════════════════════════════════════════════════════════════════
+ CAREER OS — SOTA 3-Stage Training Pipeline
+ SFT (High-Rank LoRA) → DPO (Preference Tuning) → GRPO (Self-Improvement)
+ ═══════════════════════════════════════════════════════════════════════════════
+
+ For A100 40GB (Google Colab Pro):
+     Runtime → GPU → A100
+     Estimated total time: ~3-4 hours for all 3 stages
+
+ Architecture:
+     Stage 1: SFT on ~12K career conversations + reasoning data
+         LoRA r=256, target_modules="all-linear"
+     Stage 2: DPO on preference pairs generated by Stage 1 model
+         Custom career-quality reward (helpfulness + structure + specificity)
+     Stage 3: GRPO with multi-component reward function
+         Rule-based rewards: JSON-correctness, career-relevance, actionability
+
+ Inspired by:
+     - Self-Rewarding LLMs (arXiv:2401.10020): iterative DPO pipeline
+     - iGRPO (arXiv:2602.09000): iterative GRPO with self-feedback
+     - LoRA Without Regret: r=256 for SFT, lower for RL
+     - AgentOrchestra (arXiv:2506.12508): multi-agent orchestration pattern
+
+ Before running:
+     1. Set A100 runtime in Colab
+     2. Add HF_TOKEN to Secrets for push_to_hub
+     3. pip install transformers trl datasets peft torch accelerate trackio
+ ═══════════════════════════════════════════════════════════════════════════════
+ """
+
+ import json, random, os, gc, re
+ from typing import List
+
+ import torch
+ from datasets import load_dataset, Dataset, concatenate_datasets
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ from peft import LoraConfig, TaskType, PeftModel, get_peft_model
+ from trl import SFTTrainer, SFTConfig, DPOTrainer, DPOConfig, GRPOTrainer, GRPOConfig
+
+ # ── CONFIG ───────────────────────────────────────────────────────────────────
+ MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
+ # For A100 40GB: upgrade to "Qwen/Qwen2.5-7B-Instruct" or "meta-llama/Llama-3.1-8B-Instruct"
+ OUTPUT_HUB = "Builder-Neekhil/career-agent-v1"
+ DATASET_HUB = "Builder-Neekhil/career-agent-dataset-v1"
+
+ STAGE = os.environ.get("CAREER_STAGE", "all")  # "sft", "dpo", "grpo", or "all"
+
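+ # Usage (STAGE gates the pipeline; see main() below):
+ #   CAREER_STAGE=sft  python career_os_sota.py   # Stage 1 only
+ #   CAREER_STAGE=dpo  python career_os_sota.py   # Stage 2 (runs Stage 1 first if no adapter exists)
+ #   CAREER_STAGE=grpo python career_os_sota.py   # Stage 3 (runs earlier stages if adapters missing)
+ #   CAREER_STAGE=all  python career_os_sota.py   # full pipeline + merge + push (default)
+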
+ CAREER_SYSTEM = (
+     "You are a seasoned career advising expert with 15 years of experience helping professionals "
+     "navigate their careers. You excel at: reviewing and tailoring resumes, assessing job fit, "
+     "generating interview questions, suggesting career paths, and optimizing for ATS keywords. "
+     "Be specific, honest, actionable, and concise. When appropriate, provide structured JSON outputs."
+ )
+
+ # ── DATASET BUILDERS (same as before, refactored for speed) ─────────────────
+
+ def build_resume_job_fit():
+     ds = load_dataset("cnamuangtoun/resume-job-description-fit", split="train")
+     out = []
+     for ex in ds:
+         resume, job = ex["resume_text"].strip()[:2000], ex["job_description_text"].strip()[:2000]
+         fit = "Good Fit" if ex["label"] in ("Fit", "Good Fit", "Good") else "No Fit"
+         score = 75 if fit == "Good Fit" else 25
+         prompt = (f"Resume:\n{resume}\n\nJob Description:\n{job}\n\n"
+                   f"Task: Assess how well this resume matches the job. Return a JSON with fields: "
+                   f"'fit_assessment', 'score', 'strengths', 'gaps', 'suggestions'.")
+         answer = json.dumps({
+             "fit_assessment": fit, "score": score,
+             "strengths": ["Relevant experience present"],
+             "gaps": ["Tailor keywords to job description"],
+             "suggestions": ["Add measurable achievements", "Mirror JD language in skills section"]
+         }, indent=2)
+         out.append({"messages": [{"role": "system", "content": CAREER_SYSTEM},
+                                  {"role": "user", "content": prompt},
+                                  {"role": "assistant", "content": answer}]})
+     return Dataset.from_list(out)
+
+
+ def build_resume_review():
+     ds = load_dataset("opensporks/resumes", split="train")
+     out = []
+     for ex in ds:
+         resume, cat = ex["Resume_str"].strip()[:2000], ex["Category"]
+         out.append({"messages": [
+             {"role": "system", "content": CAREER_SYSTEM},
+             {"role": "user", "content": f"Please review my resume and give me actionable feedback:\n\n{resume}"},
+             {"role": "assistant", "content": (
+                 f"## Resume Review Feedback ({cat})\n\n"
+                 f"**Strengths:**\n- Clear professional summary\n- Relevant work experience\n\n"
+                 f"**Improvements:**\n1. Add quantifiable achievements\n"
+                 f"2. Tailor skills to target roles\n3. Use active voice\n"
+                 f"4. Remove outdated experience\n\n"
+                 f"**ATS:** Include keywords from target job descriptions.")}
+         ]})
+     return Dataset.from_list(out)
+
+
+ def build_resume_json():
+     ds = load_dataset("sandeeppanem/resume-json-extraction-5k", split="train")
+     out = []
+     for ex in ds:
+         raw = ex["text"][:3000]
+         out.append({"messages": [
+             {"role": "system", "content": CAREER_SYSTEM},
+             {"role": "user", "content": "Parse this resume into structured JSON."},
+             {"role": "assistant", "content": raw}
+         ]})
+     return Dataset.from_list(out)
+
+
+ def build_synthetic_coaching(n=800):
+     topics = [
+         ("salary negotiation",
+          "I'm about to negotiate my salary for a new offer. What strategies should I use?",
+          "## Salary Negotiation Strategy\n\n1. **Research:** Know market rate (Glassdoor, Levels.fyi)\n"
+          "2. **Anchor high:** State 10-15% above target\n"
+          "3. **Total comp:** Include base + bonus + equity + benefits\n"
+          "4. **Leverage:** Competing offers or business case\n"
+          "5. **Practice:** Rehearse out loud\n\n"
+          "**Script:** 'I'm excited about this role. Based on my research, I was expecting $X. Is there flexibility?'"),
+         ("career pivot",
+          "I want to pivot from marketing to data science. What's my roadmap?",
+          "## Career Pivot: Marketing → Data Science\n\n"
+          "**Phase 1 (0-3 mo):** Learn Python + pandas + SQL\n"
+          "**Phase 2 (3-6 mo):** Build 3 portfolio projects with real datasets\n"
+          "**Phase 3 (6-9 mo):** Freelance / intern for real experience\n"
+          "**Phase 4 (9-12 mo):** Apply to hybrid roles (marketing analytics)\n\n"
+          "**Leverage:** A/B testing, segmentation, ROI analysis are transferable."),
+         ("networking",
+          "How do I effectively network on LinkedIn without being awkward?",
+          "## LinkedIn Networking\n\n"
+          "1. **Personalize invites:** Mention specific post or project\n"
+          "2. **Give before asking:** Share their content first\n"
+          "3. **Follow up:** Brief thank-you + one question\n"
+          "4. **Consistency:** Comment on 3-5 posts weekly\n"
+          "5. **Informational calls:** Request 15-min chats\n\n"
+          "**Template:** 'Hi [Name], I enjoyed your post on X. I'm exploring Y. Would you be open to a brief chat?'"),
+         ("resume gap",
+          "I have a 2-year employment gap. How do I address it?",
+          "## Addressing Employment Gaps\n\n"
+          "**On Resume:**\n- Use functional/hybrid format\n"
+          "- Include freelance/volunteer work\n"
+          "- Omit months for gaps <2 years\n\n"
+          "**In Interviews:**\n- Be honest but brief\n"
+          "- Pivot to what you did: courses, certifications\n"
+          "- Emphasize readiness\n\n"
+          "**Mindset:** How you frame gaps matters more than the gap."),
+         ("promotion",
+          "What should I do in the next 6 months to position myself for a promotion?",
+          "## Promotion Strategy (6-Month Plan)\n\n"
+          "**Month 1-2: Visibility**\n- Document wins in a 'brag document'\n"
+          "- Volunteer for high-visibility projects\n\n"
+          "**Month 3-4: Skill Gaps**\n- Identify next-level competencies\n"
+          "- Seek cross-functional exposure\n\n"
+          "**Month 5: Feedback**\n- Ask: 'What would it take to be ready for [next level]?'\n\n"
+          "**Month 6: The Ask**\n- Present documented impact aligned with company goals\n\n"
+          "**Key:** Promotions happen when you perform at the next level."),
+         ("interview prep",
+          "I'm interviewing for a Senior Software Engineer role. How should I prepare?",
+          "## Senior SWE Interview Prep\n\n"
+          "**Technical:**\n- System design: scalability, microservices, caching\n"
+          "- Coding: LeetCode medium/hard, focus on trees, graphs, DP\n"
+          "- Behavioral: STAR method for all answers\n\n"
+          "**Leadership:**\n- Cross-team collaboration examples\n"
+          "- Mentorship stories\n"
+          "- Conflict resolution\n\n"
+          "**Questions to ask them:**\n- Team structure and tech stack\n"
+          "- On-call expectations\n"
+          "- Growth trajectory\n\n"
+          "**Mock:** Practice with friends or Pramp 3-5 times before the real thing."),
+     ]
+     out = []
+     for _ in range(n):
+         _, u, a = random.choice(topics)
+         out.append({"messages": [
+             {"role": "system", "content": CAREER_SYSTEM},
+             {"role": "user", "content": u},
+             {"role": "assistant", "content": a}
+         ]})
+     return Dataset.from_list(out)
+
+
+ def build_reasoning_boost(n=400):
+     """Add general reasoning data to improve instruction-following quality."""
+     reasoning_data = [
+         ("A candidate has 3 years Python, 2 years ML, and wants a Senior Data Scientist role requiring 5+ years and deep learning. Assess fit step by step.",
+          "Let me analyze this systematically.\n\n"
+          "**Step 1 - Experience Gap:** The role requires 5+ years; the candidate has 3 years Python and 2 years ML. Total relevant experience: ~3 years (Python is foundational, ML is specialized). This is a gap of ~2 years.\n\n"
+          "**Step 2 - Skills Gap:** The role requires deep learning. The candidate has ML experience, but deep learning is a subset. Need to verify: does their ML include neural networks, transformers, or just classical ML?\n\n"
+          "**Step 3 - Mitigating Factors:**\n- If they have strong project outcomes (deployed models, business impact)\n"
+          "- If they've used deep learning frameworks (PyTorch, TensorFlow)\n"
+          "- If they have publications or Kaggle achievements\n\n"
+          "**Step 4 - Fit Assessment:** Partial Fit (60/100). The experience gap is the main blocker, but a strong project portfolio could compensate.\n\n"
+          "**Recommendations:**\n1. Apply to Mid-Level Data Scientist roles instead\n"
+          "2. Build 2-3 deep learning portfolio projects\n"
+          "3. Highlight any leadership or mentoring experience to show senior potential\n"
+          "4. Consider companies that value skills over years of experience (startups, high-growth)"),
+         ("Compare two resumes for a Product Manager role. Resume A: 4 years PM at FAANG, launched 3 features with $10M+ revenue. Resume B: 6 years PM at startups, launched 8 features, $2M total revenue. Who is the stronger candidate and why?",
+          "Let me compare these candidates systematically.\n\n"
+          "**Dimension 1 - Impact per Year:**\n- Resume A: $10M / 4 years = $2.5M/year average impact\n"
+          "- Resume B: $2M / 6 years = $0.33M/year average impact\n"
+          "Resume A shows significantly higher revenue impact per year.\n\n"
+          "**Dimension 2 - Feature Complexity:**\n"
+          "FAANG products typically have billions of users; a single feature can impact millions. Startup features may serve thousands. Resume A likely dealt with higher complexity (scale, stakeholder management, regulatory).\n\n"
+          "**Dimension 3 - Breadth vs Depth:**\n"
+          "Resume B launched more features (8 vs 3), suggesting breadth. Resume A suggests depth with fewer, higher-impact launches. For a senior PM role at a large company, depth is valued. For an early-stage startup, breadth might be preferred.\n\n"
+          "**Dimension 4 - Career Trajectory:**\n"
+          "Resume A at FAANG indicates passing a rigorous hiring bar and structured PM training. Resume B shows entrepreneurial resilience but less institutional training.\n\n"
+          "**Verdict:** Resume A is the stronger candidate for most corporate PM roles. Resume B might be preferred for an early-stage startup where scrappiness and speed matter more than scale experience.\n\n"
+          "**Recommendation:** When presenting Resume B, emphasize the breadth as 'adaptability across product stages' and frame the smaller revenue as 'efficient resource utilization in resource-constrained environments.'"),
+     ]
+     out = []
+     for i in range(n):
+         q, a = reasoning_data[i % len(reasoning_data)]
+         out.append({"messages": [
+             {"role": "system", "content": CAREER_SYSTEM},
+             {"role": "user", "content": q},
+             {"role": "assistant", "content": a}
+         ]})
+     return Dataset.from_list(out)
+
+
+ def build_full_dataset():
+     """Build and cache the complete training dataset."""
+     print("\n[DATASET] Building training corpus...")
+     ds1 = build_resume_job_fit()
+     ds2 = build_resume_review()
+     ds3 = build_resume_json()
+     ds4 = build_synthetic_coaching()
+     ds5 = build_reasoning_boost()
+     full = concatenate_datasets([ds1, ds2, ds3, ds4, ds5]).shuffle(seed=42)
+     print(f"[DATASET] Total: {len(full)} examples")
+     print(f"  - Resume-Job Fit: {len(ds1)}")
+     print(f"  - Resume Review: {len(ds2)}")
+     print(f"  - Resume JSON Parse: {len(ds3)}")
+     print(f"  - Synthetic Coaching: {len(ds4)}")
+     print(f"  - Reasoning Boost: {len(ds5)}")
+     return full
+
+
+ # ── REWARD FUNCTIONS FOR GRPO ────────────────────────────────────────────
+
+ def career_reward_function(completions: List[str], prompts: List[str], **kwargs) -> List[float]:
+     """
+     Multi-component reward function for career agent outputs.
+     Returns a score from 0 to 1 for each completion.
+     """
+     scores = []
+     for completion, prompt in zip(completions, prompts):
+         # With conversational datasets, GRPOTrainer passes completions/prompts as
+         # lists of message dicts; normalize both formats to plain strings.
+         if isinstance(completion, list):
+             completion = completion[-1]["content"]
+         if isinstance(prompt, list):
+             prompt = prompt[-1]["content"]
+         score = 0.0
+         text = completion.strip()
+
+         # Component 1: Structure reward (headers, bullet points, sections)
+         structure_score = 0.0
+         if "##" in text or "**" in text or "1." in text:
+             structure_score = 0.25
+         elif "- " in text or "\n\n" in text:
+             structure_score = 0.15
+         score += structure_score
+
+         # Component 2: JSON correctness (if JSON requested)
+         if "json" in prompt.lower():
+             try:
+                 # Find JSON in text
+                 json_match = re.search(r'\{.*\}', text, re.DOTALL)
+                 if json_match:
+                     json.loads(json_match.group())
+                     score += 0.25
+             except (json.JSONDecodeError, ValueError):
+                 pass
+         else:
+             score += 0.15  # Non-JSON prompts get partial credit
+
+         # Component 3: Actionability (presence of actionable verbs)
+         action_words = ["add", "remove", "build", "create", "learn", "practice",
+                         "apply", "network", "research", "tailor", "optimize", "update"]
+         action_count = sum(1 for w in action_words if w.lower() in text.lower())
+         score += min(0.25, action_count * 0.05)
+
+         # Component 4: Career relevance (mentions of career-specific terms)
+         career_terms = ["resume", "interview", "career", "job", "skills",
+                         "experience", "promotion", "salary", "negotiation",
+                         "ATS", "keywords", "hiring", "manager", "role"]
+         career_count = sum(1 for t in career_terms if t.lower() in text.lower())
+         score += min(0.25, career_count * 0.03)
+
+         scores.append(min(1.0, max(0.0, score)))
+     return scores
+
+
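+ # Minimal sanity check for the reward shaping (illustrative only, not part of
+ # training): a structured, actionable answer should outscore a vague one.
+ # CAREER_REWARD_DEBUG is a hypothetical env-var guard added here so this never
+ # runs during a normal training launch.
+ if os.environ.get("CAREER_REWARD_DEBUG"):
+     _good = "## Plan\n1. **Tailor** your resume keywords to the job\n2. Practice interview answers"
+     _vague = "Good luck with the job search."
+     print(career_reward_function([_good, _vague], ["How do I prepare for a job interview?"] * 2))
+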
+ # ── DPO DATASET GENERATION ─────────────────────────────────────────────────
+
+ def generate_dpo_dataset(model, tokenizer, sft_dataset: Dataset, n_pairs: int = 500) -> Dataset:
+     """
+     Generate preference pairs for DPO training by sampling the SFT model twice
+     and scoring outputs with the career reward function.
+     """
+     print(f"\n[DPO] Generating {n_pairs} preference pairs from SFT model...")
+     dpo_examples = []
+     model.eval()
+
+     # Sample prompts from the SFT dataset
+     indices = random.sample(range(len(sft_dataset)), min(n_pairs * 2, len(sft_dataset)))
+
+     for idx in indices[:n_pairs]:
+         ex = sft_dataset[idx]
+         messages = ex["messages"]
+         # Extract user message as prompt
+         user_msg = None
+         for m in messages:
+             if m["role"] == "user":
+                 user_msg = m["content"]
+                 break
+         if not user_msg:
+             continue
+
+         # Keep the system prompt so generation matches the SFT training distribution
+         prompt_messages = [{"role": "system", "content": CAREER_SYSTEM},
+                            {"role": "user", "content": user_msg}]
+
+         # Generate two completions
+         inputs = tokenizer.apply_chat_template(
+             prompt_messages,
+             tokenize=True, return_tensors="pt", add_generation_prompt=True
+         ).to(model.device)
+
+         completions_list = []
+         for _ in range(2):
+             with torch.no_grad():
+                 out = model.generate(
+                     inputs,
+                     max_new_tokens=512,
+                     temperature=0.8,
+                     do_sample=True,
+                     top_p=0.9,
+                     pad_token_id=tokenizer.eos_token_id,
+                 )
+             resp = tokenizer.decode(out[0][inputs.shape[1]:], skip_special_tokens=True)
+             completions_list.append(resp)
+
+         # Score both
+         scores = career_reward_function(completions_list, [user_msg, user_msg])
+
+         if scores[0] != scores[1]:
+             # Create preference pair
+             chosen_idx = 0 if scores[0] > scores[1] else 1
+             rejected_idx = 1 - chosen_idx
+             dpo_examples.append({
+                 "prompt": prompt_messages,
+                 "chosen": [{"role": "assistant", "content": completions_list[chosen_idx]}],
+                 "rejected": [{"role": "assistant", "content": completions_list[rejected_idx]}],
+             })
+
+     print(f"[DPO] Generated {len(dpo_examples)} valid preference pairs")
+     return Dataset.from_list(dpo_examples)
+
+
+ # ── STAGE 1: SFT ───────────────────────────────────────────────────────────
+
+ def run_sft(dataset: Dataset, output_dir: str = "./stage1_sft") -> str:
+     """Stage 1: Supervised Fine-Tuning with high-rank LoRA."""
+     print("\n" + "=" * 60)
+     print("STAGE 1: SFT (Supervised Fine-Tuning)")
+     print("=" * 60)
+
+     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+
+     # SOTA LoRA config: r=256 for SFT (high capacity)
+     peft_config = LoraConfig(
+         r=256,
+         lora_alpha=512,
+         lora_dropout=0.05,
+         bias="none",
+         task_type=TaskType.CAUSAL_LM,
+         target_modules="all-linear",  # Target all linear layers (SOTA recipe)
+         use_rslora=True,  # Rank-Stabilized LoRA for better large-rank training
+     )
+
+     args = SFTConfig(
+         output_dir=output_dir,
+         num_train_epochs=2,
+         per_device_train_batch_size=2,
+         gradient_accumulation_steps=4,
+         learning_rate=2e-4,
+         lr_scheduler_type="cosine",
+         warmup_ratio=0.03,
+         logging_steps=10,
+         logging_first_step=True,
+         save_steps=500,
+         save_total_limit=2,
+         max_length=2048,
+         bf16=True,
+         gradient_checkpointing=True,
+         # Masking non-assistant tokens requires a chat template with
+         # {% generation %} markers; set to False if the tokenizer's template lacks them
+         assistant_only_loss=True,
+         remove_unused_columns=False,
+         report_to="none",  # Use custom viz instead
+     )
+
+     trainer = SFTTrainer(
+         model=MODEL_ID,
+         processing_class=tokenizer,  # recent TRL renamed `tokenizer` to `processing_class`
+         train_dataset=dataset,
+         args=args,
+         peft_config=peft_config,
+     )
+
+     print("\n[STAGE 1] Training SFT for 2 epochs...")
+     trainer.train()
+
+     # Save adapter
+     sft_adapter_path = os.path.join(output_dir, "final_adapter")
+     trainer.save_model(sft_adapter_path)
+     print(f"[STAGE 1] SFT adapter saved to {sft_adapter_path}")
+
+     del trainer
+     gc.collect()
+     torch.cuda.empty_cache()
+
+     return sft_adapter_path
+
+
+ # ── STAGE 2: DPO ───────────────────────────────────────────────────────────
+
+ def run_dpo(sft_adapter_path: str, dpo_dataset: Dataset, output_dir: str = "./stage2_dpo") -> str:
+     """Stage 2: Direct Preference Optimization on career-quality pairs."""
+     print("\n" + "=" * 60)
+     print("STAGE 2: DPO (Direct Preference Optimization)")
+     print("=" * 60)
+
+     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+
+     # Load the SFT model and fold its adapter into the base weights, so the new
+     # DPO adapter trains on top of the SFT model rather than stacking PEFT wrappers
+     base_model = AutoModelForCausalLM.from_pretrained(
+         MODEL_ID, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True
+     )
+     model = PeftModel.from_pretrained(base_model, sft_adapter_path).merge_and_unload()
+
+     # DPO with lower-rank LoRA (RL typically needs less capacity)
+     peft_config = LoraConfig(
+         r=64,
+         lora_alpha=128,
+         lora_dropout=0.05,
+         bias="none",
+         task_type=TaskType.CAUSAL_LM,
+         target_modules="all-linear",
+     )
+     model = get_peft_model(model, peft_config)
+
+     args = DPOConfig(
+         output_dir=output_dir,
+         num_train_epochs=1,
+         per_device_train_batch_size=1,
+         gradient_accumulation_steps=8,
+         learning_rate=5e-7,
+         lr_scheduler_type="cosine",
+         warmup_ratio=0.1,
+         logging_steps=10,
+         logging_first_step=True,
+         save_steps=200,
+         save_total_limit=2,
+         max_length=2048,
+         max_prompt_length=1024,
+         bf16=True,
+         gradient_checkpointing=True,
+         beta=0.1,  # DPO temperature
+         remove_unused_columns=False,
+         report_to="none",
+     )
+
+     trainer = DPOTrainer(
+         model=model,
+         ref_model=None,  # PEFT model: reference is the model with adapters disabled
+         processing_class=tokenizer,
+         train_dataset=dpo_dataset,
+         args=args,
+     )
+
+     print(f"\n[STAGE 2] Training DPO for 1 epoch on {len(dpo_dataset)} pairs...")
+     trainer.train()
+
+     dpo_adapter_path = os.path.join(output_dir, "final_adapter")
+     trainer.save_model(dpo_adapter_path)
+     print(f"[STAGE 2] DPO adapter saved to {dpo_adapter_path}")
+
+     del trainer, model, base_model
+     gc.collect()
+     torch.cuda.empty_cache()
+
+     return dpo_adapter_path
+
+
+ # ── STAGE 3: GRPO ──────────────────────────────────────────────────────────
+
+ def run_grpo(dpo_adapter_path: str, grpo_dataset: Dataset, output_dir: str = "./stage3_grpo") -> str:
+     """Stage 3: GRPO with custom multi-component reward function."""
+     print("\n" + "=" * 60)
+     print("STAGE 3: GRPO (Group Relative Policy Optimization)")
+     print("=" * 60)
+
+     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+
+     # Load the DPO model and fold its adapter in before adding a fresh GRPO adapter
+     base_model = AutoModelForCausalLM.from_pretrained(
+         MODEL_ID, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True
+     )
+     model = PeftModel.from_pretrained(base_model, dpo_adapter_path).merge_and_unload()
+
+     # GRPO config: even lower rank, very small LR
+     peft_config = LoraConfig(
+         r=32,
+         lora_alpha=64,
+         lora_dropout=0.05,
+         bias="none",
+         task_type=TaskType.CAUSAL_LM,
+         target_modules="all-linear",
+     )
+     model = get_peft_model(model, peft_config)
+
+     # Prepare GRPO dataset: prompt-only format
+     grpo_prompts = []
+     for ex in grpo_dataset:
+         for m in ex["messages"]:
+             if m["role"] == "user":
+                 grpo_prompts.append({"prompt": [{"role": "user", "content": m["content"]}]})
+                 break
+
+     grpo_ds = Dataset.from_list(grpo_prompts[:1000])  # Use subset for GRPO
+
+     args = GRPOConfig(
+         output_dir=output_dir,
+         num_train_epochs=1,
+         per_device_train_batch_size=1,
+         gradient_accumulation_steps=4,
+         learning_rate=1e-6,
+         lr_scheduler_type="cosine",
+         warmup_ratio=0.1,
+         logging_steps=10,
+         logging_first_step=True,
+         save_steps=200,
+         save_total_limit=2,
+         max_completion_length=512,
+         bf16=True,
+         gradient_checkpointing=True,
+         report_to="none",
+         # Group size for relative advantage; TRL requires the effective generation
+         # batch (per-device batch x accumulation x world size) to be divisible by this
+         num_generations=4,
+     )
+
+     trainer = GRPOTrainer(
+         model=model,
+         reward_funcs=career_reward_function,
+         processing_class=tokenizer,
+         train_dataset=grpo_ds,
+         args=args,
+     )
+
+     print(f"\n[STAGE 3] Training GRPO for 1 epoch on {len(grpo_ds)} prompts...")
+     trainer.train()
+
+     grpo_adapter_path = os.path.join(output_dir, "final_adapter")
+     trainer.save_model(grpo_adapter_path)
+     print(f"[STAGE 3] GRPO adapter saved to {grpo_adapter_path}")
+
+     del trainer, model, base_model
+     gc.collect()
+     torch.cuda.empty_cache()
+
+     return grpo_adapter_path
+
+
+ # ── MERGE & PUSH ───────────────────────────────────────────────────────────
+
+ def merge_and_push(base_model_id: str, adapter_paths: List[str], hub_repo: str):
+     """Merge the stage adapters into the base model and push to the Hub.
+
+     Each stage trained its adapter on top of the previous stages merged into
+     the base, so the adapters must be merged sequentially, in training order.
+     """
+     print("\n[MERGE] Loading base model and merging adapters...")
+     model = AutoModelForCausalLM.from_pretrained(
+         base_model_id, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True
+     )
+     for adapter_path in adapter_paths:
+         model = PeftModel.from_pretrained(model, adapter_path).merge_and_unload()
+
+     tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+
+     print(f"[MERGE] Pushing merged model to {hub_repo}...")
+     model.push_to_hub(hub_repo, safe_serialization=True)
+     tokenizer.push_to_hub(hub_repo)
+     print(f"[MERGE] ✅ Model pushed to https://huggingface.co/{hub_repo}")
+
+     del model
+     gc.collect()
+     torch.cuda.empty_cache()
+
+
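+ # Minimal inference sketch for the merged model (illustrative; assumes the push
+ # above succeeded and the repo is accessible):
+ #
+ #   from transformers import AutoModelForCausalLM, AutoTokenizer
+ #   tok = AutoTokenizer.from_pretrained(OUTPUT_HUB)
+ #   lm = AutoModelForCausalLM.from_pretrained(OUTPUT_HUB, torch_dtype="bfloat16", device_map="auto")
+ #   msgs = [{"role": "system", "content": CAREER_SYSTEM},
+ #           {"role": "user", "content": "Review my resume: ..."}]
+ #   ids = tok.apply_chat_template(msgs, add_generation_prompt=True, return_tensors="pt").to(lm.device)
+ #   print(tok.decode(lm.generate(ids, max_new_tokens=256)[0, ids.shape[1]:], skip_special_tokens=True))
+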
+ # ── MAIN ORCHESTRATOR ──────────────────────────────────────────────────────
+
+ def main():
+     stage = STAGE.lower()
+
+     # Always build dataset first
+     dataset = build_full_dataset()
+
+     # Cache dataset to Hub for reuse
+     try:
+         dataset.push_to_hub(DATASET_HUB, private=False)
+         print(f"[DATASET] Cached to https://huggingface.co/datasets/{DATASET_HUB}")
+     except Exception as e:
+         print(f"[DATASET] Push skipped: {e}")
+
+     sft_path = "./stage1_sft/final_adapter"
+     dpo_path = "./stage2_dpo/final_adapter"
+     grpo_path = "./stage3_grpo/final_adapter"
+
+     # STAGE 1: SFT
+     if stage in ("all", "sft", "dpo", "grpo"):
+         if not os.path.exists(sft_path) or stage == "sft":
+             sft_path = run_sft(dataset, output_dir="./stage1_sft")
+         else:
+             print(f"\n[SKIP] SFT adapter found at {sft_path}")
+
+     # STAGE 2: DPO
+     if stage in ("all", "dpo", "grpo"):
+         if not os.path.exists(dpo_path) or stage == "dpo":
+             # Generate preference pairs from SFT model
+             tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+             if tokenizer.pad_token is None:
+                 tokenizer.pad_token = tokenizer.eos_token
+             base = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16,
+                                                         device_map="auto", trust_remote_code=True)
+             sft_model = PeftModel.from_pretrained(base, sft_path)
+
+             dpo_dataset = generate_dpo_dataset(sft_model, tokenizer, dataset, n_pairs=500)
+
+             del sft_model, base
+             gc.collect()
+             torch.cuda.empty_cache()
+
+             dpo_path = run_dpo(sft_path, dpo_dataset, output_dir="./stage2_dpo")
+         else:
+             print(f"\n[SKIP] DPO adapter found at {dpo_path}")
+
+     # STAGE 3: GRPO
+     if stage in ("all", "grpo"):
+         if not os.path.exists(grpo_path) or stage == "grpo":
+             grpo_path = run_grpo(dpo_path, dataset, output_dir="./stage3_grpo")
+         else:
+             print(f"\n[SKIP] GRPO adapter found at {grpo_path}")
+
+     # MERGE & PUSH
+     if stage == "all":
+         print("\n" + "=" * 60)
+         print("FINAL: Merging & Pushing")
+         print("=" * 60)
+         # Merge the adapter chain in training order (SFT → DPO → GRPO)
+         merge_and_push(MODEL_ID, [sft_path, dpo_path, grpo_path], OUTPUT_HUB)
+         print(f"\n🎉 COMPLETE! Your Career Agent is at https://huggingface.co/{OUTPUT_HUB}")
+
+     print("\n" + "=" * 60)
+     print("Training artifacts:")
+     print(f"  SFT adapter: {sft_path}")
+     print(f"  DPO adapter: {dpo_path}")
+     print(f"  GRPO adapter: {grpo_path}")
+     print("=" * 60)
+
+
+ if __name__ == "__main__":
+     main()