Builder-Neekhil committed
Commit ff8a64f · verified · 1 parent: b286137

Upload career_os_sota.py

Files changed (1)
  1. career_os_sota.py +671 -0
career_os_sota.py ADDED
@@ -0,0 +1,671 @@
+ """
+ ═══════════════════════════════════════════════════════════════════════════════
+ CAREER OS — SOTA 3-Stage Training Pipeline
+ SFT (High-Rank LoRA) → DPO (Preference Tuning) → GRPO (Self-Improvement)
+ ═══════════════════════════════════════════════════════════════════════════════
+
+ For A100 40GB (Google Colab Pro):
+     Runtime → GPU → A100
+     Estimated total time: ~3-4 hours for all 3 stages
+
+ Architecture:
+     Stage 1: SFT on ~12K career conversations + reasoning data
+         LoRA r=256, target_modules="all-linear"
+     Stage 2: DPO on preference pairs generated by Stage 1 model
+         Custom career-quality reward (helpfulness + structure + specificity)
+     Stage 3: GRPO with multi-component reward function
+         Rule-based rewards: JSON-correctness, career-relevance, actionability
+
+ Inspired by:
+     - Self-Rewarding LLMs (arXiv:2401.10020): iterative DPO pipeline
+     - iGRPO (arXiv:2602.09000): iterative GRPO with self-feedback
+     - LoRA Without Regret: r=256 for SFT, lower for RL
+     - AgentOrchestra (arXiv:2506.12508): multi-agent orchestration pattern
+
+ Before running:
+     1. Set A100 runtime in Colab
+     2. Add HF_TOKEN to Secrets for push_to_hub
+     3. pip install transformers trl datasets peft torch accelerate trackio
+ ═══════════════════════════════════════════════════════════════════════════════
+ """
+
+ import json, random, os, gc, re
+ from typing import List
+
+ import torch
+ from datasets import load_dataset, Dataset, concatenate_datasets
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ from peft import LoraConfig, TaskType, PeftModel, get_peft_model
+ from trl import SFTTrainer, SFTConfig, DPOTrainer, DPOConfig, GRPOTrainer, GRPOConfig
+
+ # ── CONFIG ───────────────────────────────────────────────────────────────────
+ MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
+ # For A100 40GB: upgrade to "Qwen/Qwen2.5-7B-Instruct" or "meta-llama/Llama-3.1-8B-Instruct"
+ OUTPUT_HUB = "Builder-Neekhil/career-agent-v1"
+ DATASET_HUB = "Builder-Neekhil/career-agent-dataset-v1"
+
+ STAGE = os.environ.get("CAREER_STAGE", "all")  # "sft", "dpo", "grpo", or "all"
+
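+ # Usage (STAGE gates the pipeline; see main() below):
+ #   CAREER_STAGE=sft  python career_os_sota.py   # Stage 1 only
+ #   CAREER_STAGE=dpo  python career_os_sota.py   # Stage 2 (runs Stage 1 first if no adapter exists)
+ #   CAREER_STAGE=grpo python career_os_sota.py   # Stage 3 (runs earlier stages if adapters missing)
+ #   CAREER_STAGE=all  python career_os_sota.py   # full pipeline + merge + push (default)
+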
+ CAREER_SYSTEM = (
+     "You are a seasoned career advising expert with 15 years of experience helping professionals "
+     "navigate their careers. You excel at: reviewing and tailoring resumes, assessing job fit, "
+     "generating interview questions, suggesting career paths, and optimizing for ATS keywords. "
+     "Be specific, honest, actionable, and concise. When appropriate, provide structured JSON outputs."
+ )
+
+ # ── DATASET BUILDERS (same as before, refactored for speed) ─────────────────
+
+ def build_resume_job_fit():
+     ds = load_dataset("cnamuangtoun/resume-job-description-fit", split="train")
+     out = []
+     for ex in ds:
+         resume, job = ex["resume_text"].strip()[:2000], ex["job_description_text"].strip()[:2000]
+         fit = "Good Fit" if ex["label"] in ("Fit", "Good Fit", "Good") else "No Fit"
+         score = 75 if fit == "Good Fit" else 25
+         prompt = (f"Resume:\n{resume}\n\nJob Description:\n{job}\n\n"
+                   f"Task: Assess how well this resume matches the job. Return a JSON with fields: "
+                   f"'fit_assessment', 'score', 'strengths', 'gaps', 'suggestions'.")
+         answer = json.dumps({
+             "fit_assessment": fit, "score": score,
+             "strengths": ["Relevant experience present"],
+             "gaps": ["Tailor keywords to job description"],
+             "suggestions": ["Add measurable achievements", "Mirror JD language in skills section"]
+         }, indent=2)
+         out.append({"messages": [{"role": "system", "content": CAREER_SYSTEM},
+                                  {"role": "user", "content": prompt},
+                                  {"role": "assistant", "content": answer}]})
+     return Dataset.from_list(out)
+
+
+ def build_resume_review():
+     ds = load_dataset("opensporks/resumes", split="train")
+     out = []
+     for ex in ds:
+         resume, cat = ex["Resume_str"].strip()[:2000], ex["Category"]
+         out.append({"messages": [
+             {"role": "system", "content": CAREER_SYSTEM},
+             {"role": "user", "content": f"Please review my resume and give me actionable feedback:\n\n{resume}"},
+             {"role": "assistant", "content": (
+                 f"## Resume Review Feedback ({cat})\n\n"
+                 f"**Strengths:**\n- Clear professional summary\n- Relevant work experience\n\n"
+                 f"**Improvements:**\n1. Add quantifiable achievements\n"
+                 f"2. Tailor skills to target roles\n3. Use active voice\n"
+                 f"4. Remove outdated experience\n\n"
+                 f"**ATS:** Include keywords from target job descriptions.")}
+         ]})
+     return Dataset.from_list(out)
+
+
+ def build_resume_json():
+     ds = load_dataset("sandeeppanem/resume-json-extraction-5k", split="train")
+     out = []
+     for ex in ds:
+         raw = ex["text"][:3000]
+         out.append({"messages": [
+             {"role": "system", "content": CAREER_SYSTEM},
+             {"role": "user", "content": "Parse this resume into structured JSON."},
+             {"role": "assistant", "content": raw}
+         ]})
+     return Dataset.from_list(out)
+
+
+ def build_synthetic_coaching(n=800):
+     topics = [
+         ("salary negotiation",
+          "I'm about to negotiate my salary for a new offer. What strategies should I use?",
+          "## Salary Negotiation Strategy\n\n1. **Research:** Know market rate (Glassdoor, Levels.fyi)\n"
+          "2. **Anchor high:** State 10-15% above target\n"
+          "3. **Total comp:** Include base + bonus + equity + benefits\n"
+          "4. **Leverage:** Competing offers or business case\n"
+          "5. **Practice:** Rehearse out loud\n\n"
+          "**Script:** 'I'm excited about this role. Based on my research, I was expecting $X. Is there flexibility?'"),
+         ("career pivot",
+          "I want to pivot from marketing to data science. What's my roadmap?",
+          "## Career Pivot: Marketing → Data Science\n\n"
+          "**Phase 1 (0-3 mo):** Learn Python + pandas + SQL\n"
+          "**Phase 2 (3-6 mo):** Build 3 portfolio projects with real datasets\n"
+          "**Phase 3 (6-9 mo):** Freelance / intern for real experience\n"
+          "**Phase 4 (9-12 mo):** Apply to hybrid roles (marketing analytics)\n\n"
+          "**Leverage:** A/B testing, segmentation, ROI analysis are transferable."),
+         ("networking",
+          "How do I effectively network on LinkedIn without being awkward?",
+          "## LinkedIn Networking\n\n"
+          "1. **Personalize invites:** Mention specific post or project\n"
+          "2. **Give before asking:** Share their content first\n"
+          "3. **Follow up:** Brief thank-you + one question\n"
+          "4. **Consistency:** Comment on 3-5 posts weekly\n"
+          "5. **Informational calls:** Request 15-min chats\n\n"
+          "**Template:** 'Hi [Name], I enjoyed your post on X. I'm exploring Y. Would you be open to a brief chat?'"),
+         ("resume gap",
+          "I have a 2-year employment gap. How do I address it?",
+          "## Addressing Employment Gaps\n\n"
+          "**On Resume:**\n- Use functional/hybrid format\n"
+          "- Include freelance/volunteer work\n"
+          "- Omit months for gaps <2 years\n\n"
+          "**In Interviews:**\n- Be honest but brief\n"
+          "- Pivot to what you did: courses, certifications\n"
+          "- Emphasize readiness\n\n"
+          "**Mindset:** How you frame gaps matters more than the gap."),
+         ("promotion",
+          "What should I do in the next 6 months to position myself for a promotion?",
+          "## Promotion Strategy (6-Month Plan)\n\n"
+          "**Month 1-2: Visibility**\n- Document wins in a 'brag document'\n"
+          "- Volunteer for high-visibility projects\n\n"
+          "**Month 3-4: Skill Gaps**\n- Identify next-level competencies\n"
+          "- Seek cross-functional exposure\n\n"
+          "**Month 5: Feedback**\n- Ask: 'What would it take to be ready for [next level]?'\n\n"
+          "**Month 6: The Ask**\n- Present documented impact aligned with company goals\n\n"
+          "**Key:** Promotions happen when you perform at the next level."),
+         ("interview prep",
+          "I'm interviewing for a Senior Software Engineer role. How should I prepare?",
+          "## Senior SWE Interview Prep\n\n"
+          "**Technical:**\n- System design: scalability, microservices, caching\n"
+          "- Coding: LeetCode medium/hard, focus on trees, graphs, DP\n"
+          "- Behavioral: STAR method for all answers\n\n"
+          "**Leadership:**\n- Cross-team collaboration examples\n"
+          "- Mentorship stories\n"
+          "- Conflict resolution\n\n"
+          "**Questions to ask them:**\n- Team structure and tech stack\n"
+          "- On-call expectations\n"
+          "- Growth trajectory\n\n"
+          "**Mock:** Practice with friends or Pramp 3-5 times before the real thing."),
+     ]
+     out = []
+     for _ in range(n):
+         _, u, a = random.choice(topics)
+         out.append({"messages": [
+             {"role": "system", "content": CAREER_SYSTEM},
+             {"role": "user", "content": u},
+             {"role": "assistant", "content": a}
+         ]})
+     return Dataset.from_list(out)
+
+
+ def build_reasoning_boost(n=400):
+     """Add general reasoning data to improve instruction-following quality."""
+     reasoning_data = [
+         ("A candidate has 3 years Python, 2 years ML, and wants a Senior Data Scientist role requiring 5+ years and deep learning. Assess fit step by step.",
+          "Let me analyze this systematically.\n\n"
+          "**Step 1 - Experience Gap:** The role requires 5+ years; the candidate has 3 years Python and 2 years ML. Total relevant experience: ~3 years (Python is foundational, ML is specialized). This is a gap of ~2 years.\n\n"
+          "**Step 2 - Skills Gap:** The role requires deep learning. The candidate has ML experience, but deep learning is a subset. Need to verify: does their ML include neural networks, transformers, or just classical ML?\n\n"
+          "**Step 3 - Mitigating Factors:**\n- If they have strong project outcomes (deployed models, business impact)\n"
+          "- If they've used deep learning frameworks (PyTorch, TensorFlow)\n"
+          "- If they have publications or Kaggle achievements\n\n"
+          "**Step 4 - Fit Assessment:** Partial Fit (60/100). The experience gap is the main blocker, but a strong project portfolio could compensate.\n\n"
+          "**Recommendations:**\n1. Apply to Mid-Level Data Scientist roles instead\n"
+          "2. Build 2-3 deep learning portfolio projects\n"
+          "3. Highlight any leadership or mentoring experience to show senior potential\n"
+          "4. Consider companies that value skills over years of experience (startups, high-growth)"),
+         ("Compare two resumes for a Product Manager role. Resume A: 4 years PM at FAANG, launched 3 features with $10M+ revenue. Resume B: 6 years PM at startups, launched 8 features, $2M total revenue. Who is the stronger candidate and why?",
+          "Let me compare these candidates systematically.\n\n"
+          "**Dimension 1 - Impact per Year:**\n- Resume A: $10M / 4 years = $2.5M/year average impact\n"
+          "- Resume B: $2M / 6 years = $0.33M/year average impact\n"
+          "Resume A shows significantly higher revenue impact per year.\n\n"
+          "**Dimension 2 - Feature Complexity:**\n"
+          "FAANG products typically have billions of users; a single feature can impact millions. Startup features may serve thousands. Resume A likely dealt with higher complexity (scale, stakeholder management, regulatory).\n\n"
+          "**Dimension 3 - Breadth vs Depth:**\n"
+          "Resume B launched more features (8 vs 3), suggesting breadth. Resume A suggests depth with fewer, higher-impact launches. For a senior PM role at a large company, depth is valued. For an early-stage startup, breadth might be preferred.\n\n"
+          "**Dimension 4 - Career Trajectory:**\n"
+          "Resume A at FAANG indicates passing a rigorous hiring bar and structured PM training. Resume B shows entrepreneurial resilience but less institutional training.\n\n"
+          "**Verdict:** Resume A is the stronger candidate for most corporate PM roles. Resume B might be preferred for an early-stage startup where scrappiness and speed matter more than scale experience.\n\n"
+          "**Recommendation:** When presenting Resume B, emphasize the breadth as 'adaptability across product stages' and frame the smaller revenue as 'efficient resource utilization in resource-constrained environments.'"),
+     ]
+     out = []
+     for i in range(n):
+         q, a = reasoning_data[i % len(reasoning_data)]
+         out.append({"messages": [
+             {"role": "system", "content": CAREER_SYSTEM},
+             {"role": "user", "content": q},
+             {"role": "assistant", "content": a}
+         ]})
+     return Dataset.from_list(out)
+
+
+ def build_full_dataset():
+     """Build and cache the complete training dataset."""
+     print("\n[DATASET] Building training corpus...")
+     ds1 = build_resume_job_fit()
+     ds2 = build_resume_review()
+     ds3 = build_resume_json()
+     ds4 = build_synthetic_coaching()
+     ds5 = build_reasoning_boost()
+     full = concatenate_datasets([ds1, ds2, ds3, ds4, ds5]).shuffle(seed=42)
+     print(f"[DATASET] Total: {len(full)} examples")
+     print(f"  - Resume-Job Fit: {len(ds1)}")
+     print(f"  - Resume Review: {len(ds2)}")
+     print(f"  - Resume JSON Parse: {len(ds3)}")
+     print(f"  - Synthetic Coaching: {len(ds4)}")
+     print(f"  - Reasoning Boost: {len(ds5)}")
+     return full
+
+
+ # ── REWARD FUNCTIONS FOR GRPO ────────────────────────────────────────────
+
+ def career_reward_function(completions: List[str], prompts: List[str], **kwargs) -> List[float]:
+     """
+     Multi-component reward function for career agent outputs.
+     Returns a score from 0 to 1 for each completion.
+     """
+     scores = []
+     for completion, prompt in zip(completions, prompts):
+         # With conversational datasets, GRPOTrainer passes completions/prompts as
+         # lists of message dicts; normalize both formats to plain strings.
+         if isinstance(completion, list):
+             completion = completion[-1]["content"]
+         if isinstance(prompt, list):
+             prompt = prompt[-1]["content"]
+         score = 0.0
+         text = completion.strip()
+
+         # Component 1: Structure reward (headers, bullet points, sections)
+         structure_score = 0.0
+         if "##" in text or "**" in text or "1." in text:
+             structure_score = 0.25
+         elif "- " in text or "\n\n" in text:
+             structure_score = 0.15
+         score += structure_score
+
+         # Component 2: JSON correctness (if JSON requested)
+         if "json" in prompt.lower():
+             try:
+                 # Find JSON in text
+                 json_match = re.search(r'\{.*\}', text, re.DOTALL)
+                 if json_match:
+                     json.loads(json_match.group())
+                     score += 0.25
+             except (json.JSONDecodeError, ValueError):
+                 pass
+         else:
+             score += 0.15  # Non-JSON prompts get partial credit
+
+         # Component 3: Actionability (presence of actionable verbs)
+         action_words = ["add", "remove", "build", "create", "learn", "practice",
+                         "apply", "network", "research", "tailor", "optimize", "update"]
+         action_count = sum(1 for w in action_words if w.lower() in text.lower())
+         score += min(0.25, action_count * 0.05)
+
+         # Component 4: Career relevance (mentions of career-specific terms)
+         career_terms = ["resume", "interview", "career", "job", "skills",
+                         "experience", "promotion", "salary", "negotiation",
+                         "ATS", "keywords", "hiring", "manager", "role"]
+         career_count = sum(1 for t in career_terms if t.lower() in text.lower())
+         score += min(0.25, career_count * 0.03)
+
+         scores.append(min(1.0, max(0.0, score)))
+     return scores
+
+
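+ # Minimal sanity check for the reward shaping (illustrative only, not part of
+ # training): a structured, actionable answer should outscore a vague one.
+ # CAREER_REWARD_DEBUG is a hypothetical env-var guard added here so this never
+ # runs during a normal training launch.
+ if os.environ.get("CAREER_REWARD_DEBUG"):
+     _good = "## Plan\n1. **Tailor** your resume keywords to the job\n2. Practice interview answers"
+     _vague = "Good luck with the job search."
+     print(career_reward_function([_good, _vague], ["How do I prepare for a job interview?"] * 2))
+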
+ # ── DPO DATASET GENERATION ─────────────────────────────────────────────────
+
+ def generate_dpo_dataset(model, tokenizer, sft_dataset: Dataset, n_pairs: int = 500) -> Dataset:
+     """
+     Generate preference pairs for DPO training by sampling the SFT model twice
+     and scoring outputs with the career reward function.
+     """
+     print(f"\n[DPO] Generating {n_pairs} preference pairs from SFT model...")
+     dpo_examples = []
+     model.eval()
+
+     # Sample prompts from the SFT dataset
+     indices = random.sample(range(len(sft_dataset)), min(n_pairs * 2, len(sft_dataset)))
+
+     for idx in indices[:n_pairs]:
+         ex = sft_dataset[idx]
+         messages = ex["messages"]
+         # Extract user message as prompt
+         user_msg = None
+         for m in messages:
+             if m["role"] == "user":
+                 user_msg = m["content"]
+                 break
+         if not user_msg:
+             continue
+
+         # Keep the system prompt so generation matches the SFT training distribution
+         prompt_messages = [{"role": "system", "content": CAREER_SYSTEM},
+                            {"role": "user", "content": user_msg}]
+
+         # Generate two completions
+         inputs = tokenizer.apply_chat_template(
+             prompt_messages,
+             tokenize=True, return_tensors="pt", add_generation_prompt=True
+         ).to(model.device)
+
+         completions_list = []
+         for _ in range(2):
+             with torch.no_grad():
+                 out = model.generate(
+                     inputs,
+                     max_new_tokens=512,
+                     temperature=0.8,
+                     do_sample=True,
+                     top_p=0.9,
+                     pad_token_id=tokenizer.eos_token_id,
+                 )
+             resp = tokenizer.decode(out[0][inputs.shape[1]:], skip_special_tokens=True)
+             completions_list.append(resp)
+
+         # Score both
+         scores = career_reward_function(completions_list, [user_msg, user_msg])
+
+         if scores[0] != scores[1]:
+             # Create preference pair
+             chosen_idx = 0 if scores[0] > scores[1] else 1
+             rejected_idx = 1 - chosen_idx
+             dpo_examples.append({
+                 "prompt": prompt_messages,
+                 "chosen": [{"role": "assistant", "content": completions_list[chosen_idx]}],
+                 "rejected": [{"role": "assistant", "content": completions_list[rejected_idx]}],
+             })
+
+     print(f"[DPO] Generated {len(dpo_examples)} valid preference pairs")
+     return Dataset.from_list(dpo_examples)
+
+
+ # ── STAGE 1: SFT ───────────────────────────────────────────────────────────
+
+ def run_sft(dataset: Dataset, output_dir: str = "./stage1_sft") -> str:
+     """Stage 1: Supervised Fine-Tuning with high-rank LoRA."""
+     print("\n" + "=" * 60)
+     print("STAGE 1: SFT (Supervised Fine-Tuning)")
+     print("=" * 60)
+
+     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+
+     # SOTA LoRA config: r=256 for SFT (high capacity)
+     peft_config = LoraConfig(
+         r=256,
+         lora_alpha=512,
+         lora_dropout=0.05,
+         bias="none",
+         task_type=TaskType.CAUSAL_LM,
+         target_modules="all-linear",  # Target all linear layers (SOTA recipe)
+         use_rslora=True,  # Rank-Stabilized LoRA for better large-rank training
+     )
+
+     args = SFTConfig(
+         output_dir=output_dir,
+         num_train_epochs=2,
+         per_device_train_batch_size=2,
+         gradient_accumulation_steps=4,
+         learning_rate=2e-4,
+         lr_scheduler_type="cosine",
+         warmup_ratio=0.03,
+         logging_steps=10,
+         logging_first_step=True,
+         save_steps=500,
+         save_total_limit=2,
+         max_length=2048,
+         bf16=True,
+         gradient_checkpointing=True,
+         # Masking non-assistant tokens requires a chat template with
+         # {% generation %} markers; set to False if the tokenizer's template lacks them
+         assistant_only_loss=True,
+         remove_unused_columns=False,
+         report_to="none",  # Use custom viz instead
+     )
+
+     trainer = SFTTrainer(
+         model=MODEL_ID,
+         processing_class=tokenizer,  # recent TRL renamed `tokenizer` to `processing_class`
+         train_dataset=dataset,
+         args=args,
+         peft_config=peft_config,
+     )
+
+     print("\n[STAGE 1] Training SFT for 2 epochs...")
+     trainer.train()
+
+     # Save adapter
+     sft_adapter_path = os.path.join(output_dir, "final_adapter")
+     trainer.save_model(sft_adapter_path)
+     print(f"[STAGE 1] SFT adapter saved to {sft_adapter_path}")
+
+     del trainer
+     gc.collect()
+     torch.cuda.empty_cache()
+
+     return sft_adapter_path
+
+
+ # ── STAGE 2: DPO ───────────────────────────────────────────────────────────
+
+ def run_dpo(sft_adapter_path: str, dpo_dataset: Dataset, output_dir: str = "./stage2_dpo") -> str:
+     """Stage 2: Direct Preference Optimization on career-quality pairs."""
+     print("\n" + "=" * 60)
+     print("STAGE 2: DPO (Direct Preference Optimization)")
+     print("=" * 60)
+
+     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+
+     # Load the SFT model and fold its adapter into the base weights, so the new
+     # DPO adapter trains on top of the SFT model rather than stacking PEFT wrappers
+     base_model = AutoModelForCausalLM.from_pretrained(
+         MODEL_ID, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True
+     )
+     model = PeftModel.from_pretrained(base_model, sft_adapter_path).merge_and_unload()
+
+     # DPO with lower-rank LoRA (RL typically needs less capacity)
+     peft_config = LoraConfig(
+         r=64,
+         lora_alpha=128,
+         lora_dropout=0.05,
+         bias="none",
+         task_type=TaskType.CAUSAL_LM,
+         target_modules="all-linear",
+     )
+     model = get_peft_model(model, peft_config)
+
+     args = DPOConfig(
+         output_dir=output_dir,
+         num_train_epochs=1,
+         per_device_train_batch_size=1,
+         gradient_accumulation_steps=8,
+         learning_rate=5e-7,
+         lr_scheduler_type="cosine",
+         warmup_ratio=0.1,
+         logging_steps=10,
+         logging_first_step=True,
+         save_steps=200,
+         save_total_limit=2,
+         max_length=2048,
+         max_prompt_length=1024,
+         bf16=True,
+         gradient_checkpointing=True,
+         beta=0.1,  # DPO temperature
+         remove_unused_columns=False,
+         report_to="none",
+     )
+
+     trainer = DPOTrainer(
+         model=model,
+         ref_model=None,  # PEFT model: reference is the model with adapters disabled
+         processing_class=tokenizer,
+         train_dataset=dpo_dataset,
+         args=args,
+     )
+
+     print(f"\n[STAGE 2] Training DPO for 1 epoch on {len(dpo_dataset)} pairs...")
+     trainer.train()
+
+     dpo_adapter_path = os.path.join(output_dir, "final_adapter")
+     trainer.save_model(dpo_adapter_path)
+     print(f"[STAGE 2] DPO adapter saved to {dpo_adapter_path}")
+
+     del trainer, model, base_model
+     gc.collect()
+     torch.cuda.empty_cache()
+
+     return dpo_adapter_path
+
+
+ # ── STAGE 3: GRPO ──────────────────────────────────────────────────────────
+
+ def run_grpo(dpo_adapter_path: str, grpo_dataset: Dataset, output_dir: str = "./stage3_grpo") -> str:
+     """Stage 3: GRPO with custom multi-component reward function."""
+     print("\n" + "=" * 60)
+     print("STAGE 3: GRPO (Group Relative Policy Optimization)")
+     print("=" * 60)
+
+     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+
+     # Load the DPO model and fold its adapter in before adding a fresh GRPO adapter
+     base_model = AutoModelForCausalLM.from_pretrained(
+         MODEL_ID, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True
+     )
+     model = PeftModel.from_pretrained(base_model, dpo_adapter_path).merge_and_unload()
+
+     # GRPO config: even lower rank, very small LR
+     peft_config = LoraConfig(
+         r=32,
+         lora_alpha=64,
+         lora_dropout=0.05,
+         bias="none",
+         task_type=TaskType.CAUSAL_LM,
+         target_modules="all-linear",
+     )
+     model = get_peft_model(model, peft_config)
+
+     # Prepare GRPO dataset: prompt-only format
+     grpo_prompts = []
+     for ex in grpo_dataset:
+         for m in ex["messages"]:
+             if m["role"] == "user":
+                 grpo_prompts.append({"prompt": [{"role": "user", "content": m["content"]}]})
+                 break
+
+     grpo_ds = Dataset.from_list(grpo_prompts[:1000])  # Use subset for GRPO
+
+     args = GRPOConfig(
+         output_dir=output_dir,
+         num_train_epochs=1,
+         per_device_train_batch_size=1,
+         gradient_accumulation_steps=4,
+         learning_rate=1e-6,
+         lr_scheduler_type="cosine",
+         warmup_ratio=0.1,
+         logging_steps=10,
+         logging_first_step=True,
+         save_steps=200,
+         save_total_limit=2,
+         max_completion_length=512,
+         bf16=True,
+         gradient_checkpointing=True,
+         report_to="none",
+         # Group size for relative advantage; TRL requires the effective generation
+         # batch (per-device batch x accumulation x world size) to be divisible by this
+         num_generations=4,
+     )
+
+     trainer = GRPOTrainer(
+         model=model,
+         reward_funcs=career_reward_function,
+         processing_class=tokenizer,
+         train_dataset=grpo_ds,
+         args=args,
+     )
+
+     print(f"\n[STAGE 3] Training GRPO for 1 epoch on {len(grpo_ds)} prompts...")
+     trainer.train()
+
+     grpo_adapter_path = os.path.join(output_dir, "final_adapter")
+     trainer.save_model(grpo_adapter_path)
+     print(f"[STAGE 3] GRPO adapter saved to {grpo_adapter_path}")
+
+     del trainer, model, base_model
+     gc.collect()
+     torch.cuda.empty_cache()
+
+     return grpo_adapter_path
+
+
+ # ── MERGE & PUSH ───────────────────────────────────────────────────────────
+
+ def merge_and_push(base_model_id: str, adapter_paths: List[str], hub_repo: str):
+     """Merge the stage adapters into the base model and push to the Hub.
+
+     Each stage trained its adapter on top of the previous stages merged into
+     the base, so the adapters must be merged sequentially, in training order.
+     """
+     print("\n[MERGE] Loading base model and merging adapters...")
+     model = AutoModelForCausalLM.from_pretrained(
+         base_model_id, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True
+     )
+     for adapter_path in adapter_paths:
+         model = PeftModel.from_pretrained(model, adapter_path).merge_and_unload()
+
+     tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+
+     print(f"[MERGE] Pushing merged model to {hub_repo}...")
+     model.push_to_hub(hub_repo, safe_serialization=True)
+     tokenizer.push_to_hub(hub_repo)
+     print(f"[MERGE] ✅ Model pushed to https://huggingface.co/{hub_repo}")
+
+     del model
+     gc.collect()
+     torch.cuda.empty_cache()
+
+
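+ # Minimal inference sketch for the merged model (illustrative; assumes the push
+ # above succeeded and the repo is accessible):
+ #
+ #   from transformers import AutoModelForCausalLM, AutoTokenizer
+ #   tok = AutoTokenizer.from_pretrained(OUTPUT_HUB)
+ #   lm = AutoModelForCausalLM.from_pretrained(OUTPUT_HUB, torch_dtype="bfloat16", device_map="auto")
+ #   msgs = [{"role": "system", "content": CAREER_SYSTEM},
+ #           {"role": "user", "content": "Review my resume: ..."}]
+ #   ids = tok.apply_chat_template(msgs, add_generation_prompt=True, return_tensors="pt").to(lm.device)
+ #   print(tok.decode(lm.generate(ids, max_new_tokens=256)[0, ids.shape[1]:], skip_special_tokens=True))
+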
+ # ── MAIN ORCHESTRATOR ──────────────────────────────────────────────────────
+
+ def main():
+     stage = STAGE.lower()
+
+     # Always build dataset first
+     dataset = build_full_dataset()
+
+     # Cache dataset to Hub for reuse
+     try:
+         dataset.push_to_hub(DATASET_HUB, private=False)
+         print(f"[DATASET] Cached to https://huggingface.co/datasets/{DATASET_HUB}")
+     except Exception as e:
+         print(f"[DATASET] Push skipped: {e}")
+
+     sft_path = "./stage1_sft/final_adapter"
+     dpo_path = "./stage2_dpo/final_adapter"
+     grpo_path = "./stage3_grpo/final_adapter"
+
+     # STAGE 1: SFT
+     if stage in ("all", "sft", "dpo", "grpo"):
+         if not os.path.exists(sft_path) or stage == "sft":
+             sft_path = run_sft(dataset, output_dir="./stage1_sft")
+         else:
+             print(f"\n[SKIP] SFT adapter found at {sft_path}")
+
+     # STAGE 2: DPO
+     if stage in ("all", "dpo", "grpo"):
+         if not os.path.exists(dpo_path) or stage == "dpo":
+             # Generate preference pairs from SFT model
+             tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+             if tokenizer.pad_token is None:
+                 tokenizer.pad_token = tokenizer.eos_token
+             base = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16,
+                                                         device_map="auto", trust_remote_code=True)
+             sft_model = PeftModel.from_pretrained(base, sft_path)
+
+             dpo_dataset = generate_dpo_dataset(sft_model, tokenizer, dataset, n_pairs=500)
+
+             del sft_model, base
+             gc.collect()
+             torch.cuda.empty_cache()
+
+             dpo_path = run_dpo(sft_path, dpo_dataset, output_dir="./stage2_dpo")
+         else:
+             print(f"\n[SKIP] DPO adapter found at {dpo_path}")
+
+     # STAGE 3: GRPO
+     if stage in ("all", "grpo"):
+         if not os.path.exists(grpo_path) or stage == "grpo":
+             grpo_path = run_grpo(dpo_path, dataset, output_dir="./stage3_grpo")
+         else:
+             print(f"\n[SKIP] GRPO adapter found at {grpo_path}")
+
+     # MERGE & PUSH
+     if stage == "all":
+         print("\n" + "=" * 60)
+         print("FINAL: Merging & Pushing")
+         print("=" * 60)
+         # Merge the adapter chain in training order (SFT → DPO → GRPO)
+         merge_and_push(MODEL_ID, [sft_path, dpo_path, grpo_path], OUTPUT_HUB)
+         print(f"\n🎉 COMPLETE! Your Career Agent is at https://huggingface.co/{OUTPUT_HUB}")
+
+     print("\n" + "=" * 60)
+     print("Training artifacts:")
+     print(f"  SFT adapter: {sft_path}")
+     print(f"  DPO adapter: {dpo_path}")
+     print(f"  GRPO adapter: {grpo_path}")
+     print("=" * 60)
+
+
+ if __name__ == "__main__":
+     main()