"""
Personal Career Agent — LoRA SFT Training Script
Base: Qwen/Qwen2.5-1.5B-Instruct | Method: TRL SFTTrainer + LoRA

Run:
pip install transformers trl datasets peft torch trackio
python train.py

Requires: GPU with >=16GB VRAM (T4 works at batch 2 + grad accum 4 + LoRA)
"""
import json
import random
from itertools import cycle, islice

from datasets import Dataset, concatenate_datasets, load_dataset
from peft import LoraConfig, TaskType
from transformers import AutoTokenizer
from trl import SFTConfig, SFTTrainer
|
|
# Base model to fine-tune.
MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
# Hub repo ids: where the trained adapter and the assembled dataset are pushed.
OUTPUT_HUB = "Builder-Neekhil/career-agent-v1"
DATASET_HUB = "Builder-Neekhil/career-agent-dataset-v1"


# System prompt prepended to every training example; defines the agent persona
# and the task surface (resume review, job fit, interview prep, career paths,
# ATS optimization) the model is tuned for.
CAREER_SYSTEM = (
    "You are a seasoned career advising expert with 15 years of experience helping professionals "
    "navigate their careers. You excel at: reviewing and tailoring resumes, assessing job fit, "
    "generating interview questions, suggesting career paths, and optimizing for ATS keywords. "
    "Be specific, honest, actionable, and concise. When appropriate, provide structured JSON outputs."
)
|
|
| |
|
|
def build_resume_job_fit():
    """Build resume<->job fit-assessment chat examples.

    Loads cnamuangtoun/resume-job-description-fit and converts each row into a
    {"messages": [system, user, assistant]} example where the assistant returns
    a structured JSON fit assessment.

    Returns:
        datasets.Dataset of chat-format examples.
    """
    ds = load_dataset("cnamuangtoun/resume-job-description-fit", split="train")
    out = []
    for ex in ds:
        resume, job = ex["resume_text"].strip(), ex["job_description_text"].strip()
        # NOTE(review): any label outside Fit/Good Fit/Good (e.g. a possible
        # "Potential Fit" class) collapses to "No Fit" — confirm against the
        # dataset's actual label set.
        fit = "Good Fit" if ex["label"] in ("Fit", "Good Fit", "Good") else "No Fit"
        score = 75 if fit == "Good Fit" else 25
        # BUG FIX: the prompt previously embedded the target fit label and
        # score in the task description, leaking the answer into the input —
        # the model could copy it instead of learning to assess. The prompt
        # now describes only the expected JSON schema.
        prompt = (
            f"Resume:\n{resume}\n\nJob Description:\n{job}\n\n"
            f"Task: Assess how well this resume matches the job. Return a JSON with fields: "
            f"'fit_assessment' ('Good Fit' or 'No Fit'), 'score' (0-100), 'strengths' (list), "
            f"'gaps' (list), 'suggestions' (list of concrete improvements)."
        )
        answer = json.dumps({
            "fit_assessment": fit, "score": score,
            "strengths": ["Relevant experience present"],
            "gaps": ["Tailor keywords to job description"],
            "suggestions": ["Add measurable achievements", "Mirror JD language in skills section"]
        }, indent=2)
        out.append({"messages": [
            {"role": "system", "content": CAREER_SYSTEM},
            {"role": "user", "content": prompt},
            {"role": "assistant", "content": answer}
        ]})
    return Dataset.from_list(out)
|
|
|
|
def build_resume_review():
    """Turn each resume in opensporks/resumes into three chat examples:
    a resume review, interview-question generation, and career-path advice.

    Returns:
        datasets.Dataset of {"messages": [...]} chat-format examples.
    """
    ds = load_dataset("opensporks/resumes", split="train")

    def chat(user_text, assistant_text):
        # Wrap one (user, assistant) turn in the shared chat format.
        return {"messages": [
            {"role": "system", "content": CAREER_SYSTEM},
            {"role": "user", "content": user_text},
            {"role": "assistant", "content": assistant_text},
        ]}

    out = []
    for ex in ds:
        resume = ex["Resume_str"].strip()
        cat = ex["Category"]

        # 1) Actionable resume-review feedback.
        out.append(chat(
            f"Please review my resume and give me actionable feedback:\n\n{resume}",
            f"## Resume Review Feedback ({cat})\n\n"
            f"**Strengths:**\n- Clear professional summary\n- Relevant work experience listed\n\n"
            f"**Improvements:**\n1. Add quantifiable achievements (metrics, percentages)\n"
            f"2. Tailor skills section to target roles\n3. Use active voice and strong action verbs\n"
            f"4. Remove outdated or irrelevant experience\n\n"
            f"**ATS Optimization:** Include keywords from job descriptions you're targeting.",
        ))

        # 2) Interview questions tailored to the resume's category.
        out.append(chat(
            f"Based on this {cat} resume, generate 5 interview questions I should prepare for.\n\n{resume}",
            f"## Top 5 Interview Questions for {cat} Role\n\n"
            f"1. Walk me through your most impactful project and the measurable results.\n"
            f"2. How do you handle conflicting priorities in a fast-paced environment?\n"
            f"3. Describe a time you identified a process improvement and implemented it.\n"
            f"4. What tools and methodologies do you use to stay current in {cat}?\n"
            f"5. How do you collaborate with cross-functional teams to deliver outcomes?\n\n"
            f"**Tip:** Use the STAR method (Situation, Task, Action, Result) for behavioral questions.",
        ))

        # 3) Career-path suggestions from the current category.
        out.append(chat(
            f"Given my background in {cat}, what are 3-5 next career steps?\n\n{resume}",
            f"## Suggested Career Path from {cat}\n\n"
            f"1. **Senior {cat} Specialist** — deepen expertise, lead small projects\n"
            f"2. **{cat} Manager / Team Lead** — transition to people leadership\n"
            f"3. **Cross-functional Consultant** — leverage domain knowledge across teams\n"
            f"4. **Product / Strategy** — pivot to business-facing roles\n"
            f"5. **Entrepreneurship / Freelance** — independent consulting in {cat}\n\n"
            f"**Next Action:** Identify which path aligns with your values and seek a mentor already in that role.",
        ))
    return Dataset.from_list(out)
|
|
|
|
def build_resume_json():
    """Build resume -> structured-JSON parsing examples from
    sandeeppanem/resume-json-extraction-5k.

    NOTE(review): the user turn never includes the resume text itself; this
    only makes sense if ex["text"] already bundles the resume with its JSON
    extraction — verify the dataset schema.

    Returns:
        datasets.Dataset of chat-format examples.
    """
    ds = load_dataset("sandeeppanem/resume-json-extraction-5k", split="train")
    rows = [
        {"messages": [
            {"role": "system", "content": CAREER_SYSTEM},
            {"role": "user", "content": "Parse this resume into structured JSON."},
            {"role": "assistant", "content": ex["text"]},
        ]}
        for ex in ds
    ]
    return Dataset.from_list(rows)
|
|
|
|
def build_synthetic_coaching(n=600):
    """Generate n synthetic career-coaching chat examples from a fixed
    set of hand-written topic templates.

    Args:
        n: number of examples to emit (default 600).

    Returns:
        datasets.Dataset of chat-format examples, cycling through the topics
        in a fixed round-robin order so the output is deterministic.
    """
    topics = [
        ("salary negotiation",
         "I'm about to negotiate my salary for a new offer. What strategies should I use?",
         "## Salary Negotiation Strategy\n\n1. **Research:** Know market rate for your role/location (Glassdoor, Levels.fyi)\n"
         "2. **Anchor high:** State a number 10-15% above your target\n"
         "3. **Total comp:** Consider base + bonus + equity + benefits\n"
         "4. **Leverage:** Have competing offers or clear business case\n"
         "5. **Practice:** Rehearse the conversation out loud\n\n"
         "**Script starter:** 'I'm excited about this role. Based on my research and the value I bring, I was expecting a base of $X. Is there flexibility?'"),
        ("career pivot",
         "I want to pivot from marketing to data science. What's my roadmap?",
         "## Career Pivot Roadmap: Marketing -> Data Science\n\n"
         "**Phase 1 (0-3 mo):** Learn Python + pandas + SQL via online courses\n"
         "**Phase 2 (3-6 mo):** Build 3 portfolio projects with real datasets; blog your process\n"
         "**Phase 3 (6-9 mo):** Freelance / intern to get real experience on your resume\n"
         "**Phase 4 (9-12 mo):** Apply to hybrid roles (marketing analytics) as bridge\n\n"
         "**Leverage your marketing background:** A/B testing, customer segmentation, and ROI analysis are highly transferable."),
        ("networking",
         "How do I effectively network on LinkedIn without being awkward?",
         "## LinkedIn Networking Guide\n\n"
         "1. **Personalize every invite:** Mention a specific post, project, or shared interest\n"
         "2. **Give before asking:** Share their content, offer insight, or make an introduction\n"
         "3. **Follow up:** After connecting, send a brief thank-you + one thoughtful question\n"
         "4. **Consistency:** Comment on 3-5 posts weekly; post your own insights monthly\n"
         "5. **Informational calls:** Request 15-min chats; prepare 3 smart questions\n\n"
         "**Template:** 'Hi [Name], I enjoyed your post on X. I'm exploring Y and would love to learn from your experience. Would you be open to a brief chat?'"),
        ("resume gap",
         "I have a 2-year employment gap. How do I address it on my resume and in interviews?",
         "## Addressing Employment Gaps\n\n"
         "**On Resume:**\n- Use a functional or hybrid format emphasizing skills over chronology\n"
         "- Include any freelance, volunteer, or learning activities during the gap\n"
         "- Omit months if the gap is <2 years; list only years\n\n"
         "**In Interviews:**\n- Be honest but brief: 'I took time to care for family / upskill / relocate'\n"
         "- Pivot quickly to what you did: courses, certifications, side projects\n"
         "- Emphasize readiness: 'I'm energized and fully committed to returning at full capacity'\n\n"
         "**Mindset:** Gaps are common. How you frame them matters more than the gap itself."),
        ("promotion",
         "What should I do in the next 6 months to position myself for a promotion?",
         "## Promotion Strategy (6-Month Plan)\n\n"
         "**Month 1-2: Visibility**\n- Document your wins in a 'brag document' (metrics, impact, testimonials)\n"
         "- Volunteer for high-visibility projects aligned with company priorities\n\n"
         "**Month 3-4: Skill Gaps**\n- Identify the competencies of the next level; close 2-3 gaps\n"
         "- Seek cross-functional exposure\n\n"
         "**Month 5: Feedback Loop**\n- Request formal feedback from your manager and peers\n"
         "- Ask explicitly: 'What would it take for me to be ready for [next level]?'\n\n"
         "**Month 6: The Ask**\n- Present your documented impact, align with company goals, and make the request\n\n"
         "**Key Principle:** Promotions happen when you already perform at the next level."),
    ]
    out = []
    # BUG FIX: the original used random.choice with no seed, so the topic mix
    # was unbalanced and the dataset changed on every run. A round-robin over
    # the topic list is deterministic and evenly balanced.
    for _, user_text, assistant_text in islice(cycle(topics), n):
        out.append({"messages": [
            {"role": "system", "content": CAREER_SYSTEM},
            {"role": "user", "content": user_text},
            {"role": "assistant", "content": assistant_text}
        ]})
    return Dataset.from_list(out)
|
|
|
|
| |
if __name__ == "__main__":
    # Assemble the four training sources: job-fit assessment, resume-review
    # triples, resume->JSON parsing, and synthetic coaching dialogues.
    print("Building datasets...")
    ds1 = build_resume_job_fit()
    ds2 = build_resume_review()
    ds3 = build_resume_json()
    ds4 = build_synthetic_coaching()

    # Merge and shuffle with a fixed seed so the published dataset is
    # reproducible given the same source datasets.
    full = concatenate_datasets([ds1, ds2, ds3, ds4]).shuffle(seed=42)
    print(f"Total examples: {len(full)} (fit={len(ds1)}, review={len(ds2)}, json={len(ds3)}, synth={len(ds4)})")

    # Publish the assembled dataset to the Hub (public).
    full.push_to_hub(DATASET_HUB, private=False)
    print(f"Dataset pushed to https://huggingface.co/datasets/{DATASET_HUB}")

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    if tokenizer.pad_token is None:
        # Some tokenizers ship without a pad token; reuse EOS so batch
        # padding works during training.
        tokenizer.pad_token = tokenizer.eos_token

    # LoRA adapter over all attention and MLP projection matrices.
    peft_config = LoraConfig(
        r=16, lora_alpha=32, lora_dropout=0.05, bias="none",
        task_type=TaskType.CAUSAL_LM,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    )

    args = SFTConfig(
        output_dir="./career-agent-output",
        hub_model_id=OUTPUT_HUB,
        push_to_hub=True,
        num_train_epochs=3,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,  # effective batch = 2 x 4 = 8 per device
        learning_rate=2e-4,
        lr_scheduler_type="cosine",
        warmup_ratio=0.03,
        logging_steps=10,
        logging_first_step=True,
        save_steps=200,
        save_total_limit=2,
        # NOTE(review): this kwarg is `max_seq_length` in older TRL releases
        # -- confirm against the installed TRL version.
        max_length=2048,
        # NOTE(review): bf16 requires Ampere or newer; the header says a T4
        # works, but T4 lacks bf16 support -- confirm target hardware or
        # switch to fp16 there.
        bf16=True,
        gradient_checkpointing=True,
        # Compute loss only on assistant turns; NOTE(review): requires a chat
        # template with generation markers -- verify the Qwen template has them.
        assistant_only_loss=True,
        remove_unused_columns=False,
        report_to=["trackio"],
    )

    trainer = SFTTrainer(
        model=MODEL_ID,  # TRL instantiates the model from the hub id
        # NOTE(review): newer TRL deprecates `tokenizer=` in favor of
        # `processing_class=` -- confirm installed version.
        tokenizer=tokenizer,
        train_dataset=full,
        args=args,
        peft_config=peft_config,
    )

    print("Training started...")
    trainer.train()
    trainer.save_model()  # writes the LoRA adapter to output_dir
    trainer.push_to_hub()  # uploads to OUTPUT_HUB (push_to_hub=True in args)
    print(f"Done! Model at https://huggingface.co/{OUTPUT_HUB}")
|
|