rafiakedir committed on
Commit
c4b7766
·
verified ·
1 Parent(s): e1374e8

fix: corrected Colab notebook (judge-format) + fixed ablation HF loader

Browse files
Files changed (2) hide show
  1. ablations/run_ablations.py +420 -0
  2. run_on_colab.ipynb +104 -75
ablations/run_ablations.py ADDED
@@ -0,0 +1,420 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Day 6 — Run ablations on held-out partition.
4
+
5
+ Three conditions:
6
+ Condition 1 (baseline): Week 10 raw scoring_evaluator output, no judge
7
+ Condition 2 (trained): Trained judge (LoRA merged into the base weights), loaded from the HuggingFace Hub (HF_JUDGE_MODEL)
8
+ Condition 3 (prompt_only): Qwen3-30B-A3B via OpenRouter with no fine-tuning, best prompt
9
+
10
+ Writes:
11
+ ablations/ablation_results.json
12
+ ablations/held_out_traces.jsonl
13
+ """
14
+
15
+ import json
16
+ import os
17
+ import sys
18
+ import time
19
+ import datetime
20
+ import statistics
21
+ from pathlib import Path
22
+
23
+ import requests
24
+
25
+ from bootstrap_test import paired_bootstrap
26
+
27
+ ROOT = Path(__file__).parent.parent
28
+ sys.path.insert(0, str(ROOT))
29
+ from scoring_evaluator import score_task
30
+
31
+ HELD_OUT_PATH = ROOT / "tenacious_bench_v0.1/held_out/tasks.jsonl"
32
+ HF_JUDGE_MODEL = "rafiakedir/tenacious-bench-adapter" # merged model on HuggingFace
33
+ TRACES_PATH = Path(__file__).parent / "held_out_traces.jsonl"
34
+ RESULTS_PATH = Path(__file__).parent / "ablation_results.json"
35
+ COST_LOG = ROOT / "cost_log.csv"
36
+
37
+ OPENROUTER_KEY = ""
38
+ DEEPSEEK_MODEL = "deepseek/deepseek-chat-v3-0324"
39
+
40
+ JUDGE_SYSTEM_PROMPT = """You are a Tenacious Consulting sales judge. Evaluate the following B2B outbound sales email on these four rubric dimensions:
41
+
42
+ 1. SIGNAL_GROUNDING_FIDELITY: Does the email only assert claims supported by the hiring signal brief? Use assertive language for high-confidence signals, questions for low-confidence ones.
43
+ 2. BENCH_COMMITMENT_HONESTY: Does the email avoid committing more engineers than available on bench?
44
+ 3. ICP_SEGMENT_APPROPRIATENESS: Does the email use language matching the correct ICP segment (growth/scale for Segment 1, cost/efficiency for Segment 2, transition/leadership for Segment 3, hedged for ABSTAIN)?
45
+ 4. TONE_PRESERVATION: Does the email avoid banned re-engagement clichés, over-apologetic language, and include a 30-minute scoping CTA with a direct opener?
46
+
47
+ Additional Tenacious tone markers:
48
+ - No "just wanted to circle back", "touching base", "hope you're doing well"
49
+ - No "I apologize for taking your time", "we may not be the best fit"
50
+ - Must include calendar CTA: "30-minute scoping conversation" or equivalent
51
+ - No fabricated competitor claims
52
+
53
+ Score each dimension 0.0 to 1.0. Return ONLY a JSON object:
54
+ {"signal_grounding": <0-1>, "bench_honesty": <0-1>, "icp_segment": <0-1>, "tone": <0-1>, "overall": <0-1>, "reasoning": "<one sentence>"}"""
55
+
56
+
57
def _load_env():
    """Copy KEY=VALUE pairs from ROOT/.env into os.environ.

    Variables that are already set in the environment are left untouched
    (``setdefault``). Lines starting with ``#`` and lines without ``=`` are
    ignored. Surrounding double quotes are stripped from values.
    """
    dotenv = ROOT / ".env"
    if not dotenv.exists():
        return
    for raw in dotenv.read_text().splitlines():
        entry = raw.strip()
        if entry.startswith("#") or "=" not in entry:
            continue
        # Split on the first "=" only, so values may themselves contain "=".
        name, _, value = entry.partition("=")
        os.environ.setdefault(name.strip(), value.strip().strip('"'))
65
+
66
+
67
def call_openrouter(messages: list, model: str, max_tokens: int = 200) -> tuple[str, int, float]:
    """Send a chat-completion request to OpenRouter.

    Args:
        messages: Chat messages as a list of ``{"role", "content"}`` dicts.
        model: OpenRouter model identifier.
        max_tokens: Completion token cap sent with the request.

    Returns:
        ``(text, latency_ms, cost)``. ``text`` is the stripped completion, or
        the sentinel ``"[failed]"`` when the response could not be parsed.
        ``cost`` is estimated from the token usage reported in the response.

    Raises:
        requests.RequestException: transport-level failures from
            ``requests.post`` are not caught here; callers handle them.
    """
    url = "https://openrouter.ai/api/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {os.environ.get('OPENROUTER_API_KEY', '')}",
        "Content-Type": "application/json",
        "HTTP-Referer": "https://github.com/rafiakedir/tenacious-bench",
    }
    body = {"model": model, "messages": messages, "max_tokens": max_tokens, "temperature": 0.0}

    t0 = time.time()
    resp = requests.post(url, headers=headers, json=body, timeout=60)
    latency_ms = int((time.time() - t0) * 1000)

    # Cost is computed before the content lookup so that a response which
    # reports usage but has a malformed "choices" payload is still accounted.
    cost = 0.0
    try:
        data = resp.json()
        usage = data.get("usage") or {}  # tolerate an explicit "usage": null
        prompt_toks = usage.get("prompt_tokens") or 0
        comp_toks = usage.get("completion_tokens") or 0
        # Rates are applied per million tokens (presumably USD -- TODO confirm
        # against current provider pricing).
        if "deepseek" in model.lower():
            in_rate, out_rate = 0.14, 0.28
        else:
            in_rate, out_rate = 0.40, 0.40
        cost = (prompt_toks * in_rate + comp_toks * out_rate) / 1000000
        return data["choices"][0]["message"]["content"].strip(), latency_ms, cost
    except Exception as exc:
        # Best-effort by design: keep returning the sentinel, but say why so a
        # bad API key or rate limit is not mistaken for a model output.
        print(
            f"[call_openrouter] {model} failed (HTTP {resp.status_code}): {exc!r}",
            file=sys.stderr,
        )
        return "[failed]", latency_ms, cost
91
+
92
+
93
def load_held_out_tasks():
    """Load the held-out tasks from HELD_OUT_PATH (one JSON object per line).

    Blank lines (for example a trailing newline at the end of the file) are
    skipped instead of raising ``json.JSONDecodeError``.

    Returns:
        A list with one parsed object per non-empty line, in file order.
    """
    # JSON text is UTF-8 by specification; do not rely on the platform default.
    with open(HELD_OUT_PATH, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]
99
+
100
+
101
def generate_candidate_if_missing(task: dict) -> tuple[str, float]:
    """Return ``(candidate_email, generation_cost)`` for *task*.

    A non-empty ``candidate_output`` already present on the task is returned
    as-is with a cost of 0.0. Otherwise an email is requested from
    DEEPSEEK_MODEL, prompted with the task's hiring-signal brief (or, failing
    that, its bench summary).
    """
    existing = task.get("candidate_output")
    if existing:
        return existing, 0.0

    task_input = task.get("input", {})
    context = (
        task_input.get("hiring_signal_brief")
        or task_input.get("bench_summary")
        or {}
    )
    kind = task.get("task_type", "email_generation")
    brief_text = json.dumps(context, indent=2)[:800]  # cap prompt size

    system_msg = {
        "role": "system",
        "content": "You are a Tenacious Consulting sales agent writing B2B outreach emails.",
    }
    user_msg = {
        "role": "user",
        "content": f"Write a {kind} email for this prospect:\n{brief_text}\n\nKeep it under 120 words with a 30-minute scoping CTA.",
    }
    try:
        email, _latency, spend = call_openrouter([system_msg, user_msg], DEEPSEEK_MODEL, max_tokens=300)
    except Exception as err:
        return f"[generation failed: {err}]", 0.0
    return email, spend
121
+
122
+
123
def score_with_evaluator(task: dict, candidate_output: str) -> dict:
    """Condition 1: score with the machine-verifiable scoring_evaluator only.

    ``score_task`` yields a single ``score``; that same value is reported
    under every per-dimension key as well as ``overall`` and ``rubric_score``.
    """
    verdict = score_task({**task, "candidate_output": candidate_output})
    rubric = verdict.get("score", 0.0)
    dimensions = ("signal_grounding", "bench_honesty", "icp_segment", "tone", "overall")
    report = dict.fromkeys(dimensions, rubric)
    report["passed"] = verdict.get("passed", False)
    report["rubric_score"] = rubric
    return report
136
+
137
+
138
def _parse_judge_json(text: str):
    """Return the first JSON object embedded in *text*, or None if there is none."""
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            parsed, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict):
            return parsed
        start = text.find("{", start + 1)
    return None


def _clamp_judge_score(value, default: float = 0.5) -> float:
    """Coerce a model-reported score to a float in [0.0, 1.0]; *default* if unusable."""
    try:
        score = float(value)
    except (TypeError, ValueError):
        return default
    if score != score:  # NaN never compares equal to itself
        return default
    return min(1.0, max(0.0, score))


def score_with_prompt_judge(task: dict, candidate_output: str) -> tuple[dict, int, float]:
    """Condition 3: zero-shot Qwen judge via OpenRouter (Qwen3-30B).

    Args:
        task: Held-out task; its ``input`` supplies the brief shown to the judge.
        candidate_output: The email to score (truncated to 600 characters).

    Returns:
        ``(scores, latency_ms, cost)``. ``scores`` always carries
        ``raw_response`` when the API call succeeded. If no JSON object can be
        extracted it is ``{"overall": 0.5, "reasoning": "parse_error", ...}``,
        and the real latency and cost of the call are still reported. If the
        API call itself raises, the result is
        ``({"overall": 0.5, "error": ...}, 0, 0.0)``.
    """
    inp = task.get("input", {})
    brief = json.dumps(inp.get("hiring_signal_brief") or inp.get("bench_summary") or {})[:600]
    prompt = f"""TASK INPUT:
{brief}

CANDIDATE EMAIL:
{candidate_output[:600]}

Score this email on all four rubric dimensions."""

    msg = [
        {"role": "system", "content": JUDGE_SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
    ]
    # Only the network call is guarded here; parsing is handled explicitly
    # below so a malformed reply does not discard the latency/cost of a call
    # that was actually made.
    try:
        text, latency_ms, cost = call_openrouter(msg, "qwen/qwen3-30b-a3b", max_tokens=200)
    except Exception as e:
        return {"overall": 0.5, "error": str(e)}, 0, 0.0

    # A real JSON decoder (rather than a "{...}" regex) copes with "}" inside
    # the reasoning string and with nested objects.
    scores = _parse_judge_json(text)
    if scores is None:
        scores = {"overall": 0.5, "reasoning": "parse_error"}
    elif "overall" in scores:
        # Callers do arithmetic on "overall"; never hand them a str/None.
        scores["overall"] = _clamp_judge_score(scores["overall"])
    scores["raw_response"] = text[:200]
    return scores, latency_ms, cost
167
+
168
+
169
+ TRAINED_MODEL = None
170
+ TRAINED_TOKENIZER = None
171
+
172
+ JUDGE_SYSTEM_FOR_TRAINED = (
173
+ "You are a rubric-aware judge for Tenacious Consulting B2B outbound sales emails. "
174
+ "Given a task context and a candidate email, score the email on the specified rubric "
175
+ "dimension. Respond with a JSON object only:\n"
176
+ '{"dimension": "<dim>", "score": <0.0-1.0>, "pass": <true|false>, "reasoning": "<one sentence>"}'
177
+ )
178
+
179
+
180
def _load_trained_model():
    """Load merged judge model from HuggingFace (once, cached in module globals).

    Returns:
        ``(model, tokenizer)`` on success, ``(None, None)`` if loading failed.
        A failed attempt is remembered on the function object, so later calls
        return ``(None, None)`` immediately instead of retrying the load (and
        re-printing the error) for every task.
    """
    global TRAINED_MODEL, TRAINED_TOKENIZER
    if TRAINED_MODEL is not None:
        return TRAINED_MODEL, TRAINED_TOKENIZER
    if getattr(_load_trained_model, "_load_failed", False):
        return None, None
    try:
        # Imported lazily so the other conditions run without torch/transformers.
        import torch
        from transformers import AutoTokenizer, AutoModelForCausalLM
        print(f" Loading trained judge from {HF_JUDGE_MODEL}...")
        tokenizer = AutoTokenizer.from_pretrained(HF_JUDGE_MODEL)
        model = AutoModelForCausalLM.from_pretrained(
            HF_JUDGE_MODEL,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        model.eval()
    except Exception as e:
        _load_trained_model._load_failed = True
        print(f" Could not load trained judge from HF: {e}")
        return None, None

    # Publish both globals together, only after everything loaded, so the
    # module never holds a tokenizer without its model.
    TRAINED_MODEL, TRAINED_TOKENIZER = model, tokenizer
    print(" Trained judge loaded")
    return TRAINED_MODEL, TRAINED_TOKENIZER
201
+
202
+
203
def _parse_dimension_json(text: str):
    """Return the first JSON object embedded in *text*, or None if there is none."""
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            parsed, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict):
            return parsed
        start = text.find("{", start + 1)
    return None


def _clamp_dimension_score(value, default: float = 0.5) -> float:
    """Coerce a model-reported score to a float in [0.0, 1.0]; *default* if unusable."""
    try:
        score = float(value)
    except (TypeError, ValueError):
        return default
    if score != score:  # NaN never compares equal to itself
        return default
    return min(1.0, max(0.0, score))


def score_with_trained_judge(task: dict, candidate_output: str) -> tuple[dict, int, float]:
    """Condition 2: merged judge model loaded from HuggingFace.

    Args:
        task: Held-out task; ``dimension`` selects the rubric dimension
            (default ``signal_grounding_fidelity``) and ``input`` supplies the
            context shown to the judge.
        candidate_output: The email to score (truncated to 500 characters).

    Returns:
        ``(scores, latency_ms, cost)``; cost is always 0.0. ``scores["overall"]``
        mirrors the judge's ``score`` field as a float in [0, 1]. When the
        model is unavailable or its output has no JSON object, ``overall`` is
        0.5 and the dict carries an ``error`` or ``reasoning`` marker.
    """
    model, tokenizer = _load_trained_model()
    if model is None:
        # Graceful fallback — mark clearly so results aren't confused with trained scores
        return {"overall": 0.5, "error": "hf_model_unavailable", "note": "judge not loaded"}, 0, 0.0

    # Imported only after the fallback above, so a machine without torch
    # reaches the fallback instead of raising ImportError here.
    import torch

    dim = task.get("dimension", "signal_grounding_fidelity")
    inp = task.get("input", {})
    brief = json.dumps(
        inp.get("hiring_signal_brief") or inp.get("bench_summary") or {}
    )[:600]

    user_content = (
        f"EVALUATION DIMENSION: {dim}\n\n"
        f"TASK CONTEXT:\n{brief}\n\n"
        f"CANDIDATE EMAIL:\n{candidate_output.strip()[:500]}\n\n"
        f"Score this email on the {dim} dimension."
    )
    msgs = [
        {"role": "system", "content": JUDGE_SYSTEM_FOR_TRAINED},
        {"role": "user", "content": user_content},
    ]
    text = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    # NOTE(review): sampling (do_sample=True) makes this condition
    # non-deterministic, while the prompt-only judge is called with
    # temperature 0.0 -- confirm this asymmetry is intended.
    t0 = time.time()
    with torch.no_grad():
        output = model.generate(
            **inputs, max_new_tokens=150, temperature=0.1, do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )
    latency_ms = int((time.time() - t0) * 1000)

    # Decode only the newly generated tokens (skip the echoed prompt).
    generated = tokenizer.decode(output[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)

    # A real JSON decoder (rather than a "{...}" regex) copes with "}" inside
    # the reasoning string.
    scores = _parse_dimension_json(generated)
    if scores is None:
        return {"overall": 0.5, "reasoning": "parse_error", "raw": generated[:200]}, latency_ms, 0.0
    # Callers do arithmetic on "overall"; never hand them a str/None.
    scores["overall"] = _clamp_dimension_score(scores.get("score", 0.5))
    return scores, latency_ms, 0.0
250
+
251
+
252
def append_trace(entry: dict):
    """Append *entry* to TRACES_PATH as one JSON document on its own line."""
    with open(TRACES_PATH, "a") as sink:
        print(json.dumps(entry), file=sink)
255
+
256
+
257
def condition_baseline(tasks: list) -> list:
    """Condition 1: scoring_evaluator only, no judge.

    Writes one trace per task and returns the per-task ``overall`` scores in
    task order. The recorded latency covers candidate generation plus scoring.
    """
    print("\n=== CONDITION 1: Baseline (scoring_evaluator) ===")
    total = len(tasks)
    overall_scores = []
    for position, task in enumerate(tasks, start=1):
        started = time.time()
        candidate, cost_gen = generate_candidate_if_missing(task)
        scores = score_with_evaluator(task, candidate)
        elapsed_ms = int((time.time() - started) * 1000)

        append_trace({
            "task_id": task["task_id"],
            "condition": "baseline",
            "candidate_output": candidate[:300],
            "score": scores,
            "latency_ms": elapsed_ms,
            "cost_usd": cost_gen,
        })
        overall_scores.append(scores.get("overall", 0.0))
        print(f" [{position}/{total}] {task['task_id']} score={scores.get('overall',0):.3f}")

    return overall_scores
280
+
281
+
282
def condition_trained_judge(tasks: list) -> list:
    """Condition 2: trained LoRA adapter.

    For every task the trained judge's ``overall`` score is blended 60/40 with
    the machine scorer's ``overall``. One trace is written per task.

    Returns:
        The blended overall score for each task, in task order.
    """
    print("\n=== CONDITION 2: Trained Judge (LoRA adapter) ===")
    results = []
    for i, task in enumerate(tasks):
        candidate, cost_gen = generate_candidate_if_missing(task)
        # NOTE(review): latency_ms here is the judge call only, whereas the
        # baseline condition times generation + scoring -- confirm the
        # per-condition p95 latencies are meant to be compared.
        scores, latency_ms, cost_judge = score_with_trained_judge(task, candidate)

        # Blend with machine scorer for reliability
        machine_scores = score_with_evaluator(task, candidate)
        blended_overall = 0.6 * scores.get("overall", 0.5) + 0.4 * machine_scores.get("overall", 0.5)
        scores["blended_overall"] = round(blended_overall, 4)
        scores["machine_score"] = machine_scores.get("overall", 0.5)

        entry = {
            "task_id": task["task_id"],
            "condition": "trained",
            "candidate_output": candidate[:300],
            "score": scores,
            "latency_ms": latency_ms,
            "cost_usd": cost_gen + cost_judge,
        }
        append_trace(entry)
        results.append(blended_overall)
        print(f" [{i+1}/{len(tasks)}] {task['task_id']} overall={blended_overall:.3f}")

    return results
310
+
311
+
312
def condition_prompt_only(tasks: list) -> list:
    """Condition 3: Qwen3 with prompt-engineered judge, no training.

    For every task the prompt-only judge's ``overall`` score is blended 60/40
    with the machine scorer's ``overall``. One trace is written per task.

    Returns:
        The blended overall score for each task, in task order.
    """
    print("\n=== CONDITION 3: Prompt-Only Judge (Qwen3-30B) ===")
    results = []
    for i, task in enumerate(tasks):
        candidate, cost_gen = generate_candidate_if_missing(task)
        # NOTE(review): latency_ms here is the judge call only, whereas the
        # baseline condition times generation + scoring -- confirm the
        # per-condition p95 latencies are meant to be compared.
        scores, latency_ms, cost_judge = score_with_prompt_judge(task, candidate)

        # Blend with machine scorer
        machine_scores = score_with_evaluator(task, candidate)
        blended_overall = 0.6 * scores.get("overall", 0.5) + 0.4 * machine_scores.get("overall", 0.5)
        scores["blended_overall"] = round(blended_overall, 4)
        scores["machine_score"] = machine_scores.get("overall", 0.5)

        entry = {
            "task_id": task["task_id"],
            "condition": "prompt_only",
            "candidate_output": candidate[:300],
            "score": scores,
            "latency_ms": latency_ms,
            "cost_usd": cost_gen + cost_judge,
        }
        append_trace(entry)
        results.append(blended_overall)
        print(f" [{i+1}/{len(tasks)}] {task['task_id']} overall={blended_overall:.3f}")

    return results
340
+
341
+
342
def main():
    """Run the three ablation conditions and write the summary files.

    Loads the held-out tasks, runs the baseline, trained-judge and prompt-only
    conditions (each appends per-task traces to TRACES_PATH), then writes
    per-condition summary statistics and paired-bootstrap deltas to
    RESULTS_PATH and prints a short report.

    Exits with an error message if no held-out tasks were loaded, because
    there would be nothing to score and no trace file to read back.
    """
    _load_env()

    tasks = load_held_out_tasks()
    print(f"Loaded {len(tasks)} held-out tasks")
    if not tasks:
        sys.exit(f"No held-out tasks found in {HELD_OUT_PATH}; nothing to evaluate")

    # Clear traces file
    TRACES_PATH.unlink(missing_ok=True)

    # NOTE(review): each condition calls generate_candidate_if_missing() on
    # its own, so a task without candidate_output gets a separately generated
    # email per condition -- confirm that is acceptable for the paired tests.
    baseline_scores = condition_baseline(tasks)
    trained_scores = condition_trained_judge(tasks)
    prompt_scores = condition_prompt_only(tasks)

    def p95(values: list):
        """Value at index int(0.95 * n) of the sorted, non-empty sample."""
        return sorted(values)[int(0.95 * len(values))]

    def summarize(scores: list) -> dict:
        """Mean/std/min/max/p95/n of *scores*; all zeros for an empty list."""
        if not scores:
            return {"mean": 0, "std": 0, "min": 0, "max": 0, "p95": 0, "n": 0}
        return {
            "mean": round(statistics.mean(scores), 4),
            # stdev() needs at least two data points
            "std": round(statistics.stdev(scores) if len(scores) > 1 else 0, 4),
            "min": round(min(scores), 4),
            "max": round(max(scores), 4),
            "p95": round(p95(scores), 4),
            "n": len(scores),
        }

    # Compute latencies from traces
    traces = []
    if TRACES_PATH.exists():
        with open(TRACES_PATH) as f:
            traces = [json.loads(line) for line in f if line.strip()]

    def latency_p95(condition: str) -> int:
        lats = [t["latency_ms"] for t in traces if t["condition"] == condition]
        return p95(lats) if lats else 0

    def cost_p95(condition: str) -> float:
        costs = [t.get("cost_usd", 0.0) for t in traces if t["condition"] == condition]
        return round(p95(costs), 5) if costs else 0.0

    delta_a_boot = paired_bootstrap(trained_scores, baseline_scores)
    delta_a_boot["description"] = "trained judge vs baseline"

    delta_b_boot = paired_bootstrap(trained_scores, prompt_scores)
    delta_b_boot["description"] = "trained judge vs prompt-only"

    delta_c_boot = paired_bootstrap(prompt_scores, baseline_scores)
    delta_c_boot["description"] = "prompt-only vs baseline"

    results = {
        "timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat(),
        "held_out_task_count": len(tasks),
        "baseline": {**summarize(baseline_scores), "p95_latency_ms": latency_p95("baseline"), "p95_cost_usd": cost_p95("baseline")},
        "trained": {**summarize(trained_scores), "p95_latency_ms": latency_p95("trained"), "p95_cost_usd": cost_p95("trained")},
        "prompt_only": {**summarize(prompt_scores), "p95_latency_ms": latency_p95("prompt_only"), "p95_cost_usd": cost_p95("prompt_only")},
        "delta_a": delta_a_boot,
        "delta_b": delta_b_boot,
        "delta_c": delta_c_boot,
    }

    with open(RESULTS_PATH, "w") as f:
        json.dump(results, f, indent=2)

    print("\n=== ABLATION RESULTS ===")
    print(f"Baseline mean: {results['baseline']['mean']:.4f}")
    print(f"Trained mean: {results['trained']['mean']:.4f}")
    print(f"Prompt mean: {results['prompt_only']['mean']:.4f}")
    print(f"Delta A (trained vs baseline): {results['delta_a']['mean_diff']:+.4f} (p={results['delta_a']['p_value']:.4f})")
    print(f"Delta B (trained vs prompt): {results['delta_b']['mean_diff']:+.4f} (p={results['delta_b']['p_value']:.4f})")
    print(f"Delta C (prompt vs baseline): {results['delta_c']['mean_diff']:+.4f} (p={results['delta_c']['p_value']:.4f})")
    print(f"\nResults written to {RESULTS_PATH}")
    print(f"Traces written to {TRACES_PATH}")
417
+
418
+
419
+ if __name__ == "__main__":
420
+ main()
run_on_colab.ipynb CHANGED
@@ -1,77 +1,106 @@
1
  {
2
- "nbformat": 4,
3
- "nbformat_minor": 0,
4
- "metadata": {
5
- "kernelspec": {"display_name": "Python 3", "name": "python3"},
6
- "language_info": {"name": "python"},
7
- "accelerator": "GPU",
8
- "colab": {"provenance": [], "gpuType": "T4", "name": "tenacious_bench_orpo_training.ipynb"}
9
- },
10
- "cells": [
11
- {
12
- "cell_type": "markdown",
13
- "metadata": {},
14
- "source": ["# Tenacious-Bench ORPO Judge Training\n\n**Trains Qwen2.5-1.5B-Instruct** with LoRA using ORPO on Tenacious-specific rubric preference pairs.\n\nRuntime: T4 GPU (Colab free tier) \nExpected training time: ~45-90 minutes for 3 epochs\n\n## Setup\n1. Set HF_TOKEN and OPENROUTER_API_KEY in Colab Secrets (key icon in left sidebar)\n2. Run all cells in order\n"]
15
  },
16
- {
17
- "cell_type": "code",
18
- "metadata": {},
19
- "source": ["# Step 1: Check GPU\nimport subprocess\nresult = subprocess.run(['nvidia-smi'], capture_output=True, text=True)\nprint(result.stdout[:500])"]
20
- },
21
- {
22
- "cell_type": "code",
23
- "metadata": {},
24
- "source": ["# Step 2: Install Unsloth and dependencies (pinned versions)\n!pip install -q 'unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git'\n!pip install -q trl==0.12.2 peft==0.14.0 transformers==4.47.1 datasets==3.2.0\n!pip install -q accelerate==1.2.1 bitsandbytes==0.45.0\nprint('Installation complete')"]
25
- },
26
- {
27
- "cell_type": "code",
28
- "metadata": {},
29
- "source": ["# Step 3: Clone the repo\nimport os\nfrom google.colab import userdata\n\nHF_TOKEN = userdata.get('HF_TOKEN')\nOPENROUTER_API_KEY = userdata.get('OPENROUTER_API_KEY')\n\nos.environ['HF_TOKEN'] = HF_TOKEN\nos.environ['OPENROUTER_API_KEY'] = OPENROUTER_API_KEY\n\n!git clone https://huggingface.co/datasets/rafiakedir/tenacious-bench-v0.1 /content/tenacious-bench-data\nprint('Repo cloned')"]
30
- },
31
- {
32
- "cell_type": "code",
33
- "metadata": {},
34
- "source": ["# Step 4: Load preference pairs\nimport json\nfrom pathlib import Path\n\npairs_path = Path('/content/tenacious-bench-data/training_data/preference_pairs.jsonl')\npairs = []\nwith open(pairs_path) as f:\n for line in f:\n p = json.loads(line)\n pairs.append({'prompt': p['prompt'], 'chosen': p['chosen'], 'rejected': p['rejected']})\n\nprint(f'Loaded {len(pairs)} preference pairs')\nprint(f'Sample pair task context (first 200 chars of prompt):')\nprint(pairs[0]['prompt'][:200])"]
35
- },
36
- {
37
- "cell_type": "code",
38
- "metadata": {},
39
- "source": ["# Step 5: Load Unsloth model with 4-bit quantization\nfrom unsloth import FastLanguageModel\nimport torch\n\nMAX_SEQ_LENGTH = 1024\nDTYPE = None # auto-detect\nLOAD_IN_4BIT = True\n\nmodel, tokenizer = FastLanguageModel.from_pretrained(\n model_name='unsloth/Qwen2.5-1.5B-Instruct',\n max_seq_length=MAX_SEQ_LENGTH,\n dtype=DTYPE,\n load_in_4bit=LOAD_IN_4BIT,\n)\nprint('Model loaded')"]
40
- },
41
- {
42
- "cell_type": "code",
43
- "metadata": {},
44
- "source": ["# Step 6: Apply LoRA\nmodel = FastLanguageModel.get_peft_model(\n model,\n r=16,\n target_modules=['q_proj', 'v_proj'],\n lora_alpha=32,\n lora_dropout=0.05,\n bias='none',\n use_gradient_checkpointing='unsloth',\n random_state=42,\n)\nprint('LoRA applied')"]
45
- },
46
- {
47
- "cell_type": "code",
48
- "metadata": {},
49
- "source": ["# Step 7: Set up ORPO trainer\nimport random\nimport numpy as np\nfrom datasets import Dataset\nfrom trl import ORPOConfig, ORPOTrainer\n\n# Fixed seed\nrandom.seed(42)\nnp.random.seed(42)\ntorch.manual_seed(42)\n\n# Detect precision\ncap = torch.cuda.get_device_capability()\nuse_fp16 = cap[0] < 8 # T4 uses fp16\nuse_bf16 = cap[0] >= 8 # A100/4090 use bf16\nprint(f'GPU compute capability: {cap}, fp16={use_fp16}, bf16={use_bf16}')\n\ndataset = Dataset.from_list(pairs)\n\ntraining_args = ORPOConfig(\n output_dir='/content/tenacious-adapter',\n learning_rate=8e-6,\n per_device_train_batch_size=2,\n gradient_accumulation_steps=4,\n num_train_epochs=3,\n warmup_ratio=0.1,\n lr_scheduler_type='cosine',\n beta=0.1,\n max_length=1024,\n max_prompt_length=512,\n logging_steps=10,\n save_steps=50,\n seed=42,\n fp16=use_fp16,\n bf16=use_bf16,\n report_to='none',\n remove_unused_columns=False,\n)\n\ntrainer = ORPOTrainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n tokenizer=tokenizer,\n)\nprint('Trainer initialized')"]
50
- },
51
- {
52
- "cell_type": "code",
53
- "metadata": {},
54
- "source": ["# Step 8: Train\nprint('Starting ORPO training...')\ntrain_result = trainer.train()\nprint(f'Training complete!')\nprint(f'Metrics: {train_result.metrics}')"]
55
- },
56
- {
57
- "cell_type": "code",
58
- "metadata": {},
59
- "source": ["# Step 9: Plot loss curve\nimport matplotlib.pyplot as plt\n\nlog_history = trainer.state.log_history\nsteps = [x['step'] for x in log_history if 'loss' in x]\nlosses = [x['loss'] for x in log_history if 'loss' in x]\n\nif steps:\n plt.figure(figsize=(10, 5))\n plt.plot(steps, losses, 'b-', linewidth=2, label='Training Loss')\n plt.xlabel('Step')\n plt.ylabel('Loss')\n plt.title('ORPO Training Loss — Tenacious Judge')\n plt.legend()\n plt.grid(True, alpha=0.3)\n plt.savefig('/content/loss_curve.png', dpi=150, bbox_inches='tight')\n plt.show()\n print(f'Initial loss: {losses[0]:.4f}')\n print(f'Final loss: {losses[-1]:.4f}')\n print(f'Loss decrease: {losses[0] - losses[-1]:.4f} ({(1-losses[-1]/losses[0])*100:.1f}%)')\nelse:\n print('No loss history available')"]
60
- },
61
- {
62
- "cell_type": "code",
63
- "metadata": {},
64
- "source": ["# Step 10: Save adapter locally and push to HuggingFace\nADAPTER_DIR = '/content/tenacious-adapter'\n\nmodel.save_pretrained(ADAPTER_DIR)\ntokenizer.save_pretrained(ADAPTER_DIR)\nprint(f'Adapter saved to {ADAPTER_DIR}')\n\n# Push to HuggingFace\nHUB_MODEL_ID = 'rafiakedir/tenacious-bench-adapter'\nprint(f'Pushing to {HUB_MODEL_ID}...')\nmodel.push_to_hub(HUB_MODEL_ID, token=HF_TOKEN)\ntokenizer.push_to_hub(HUB_MODEL_ID, token=HF_TOKEN)\nprint(f'Adapter pushed to https://huggingface.co/{HUB_MODEL_ID}')"]
65
- },
66
- {
67
- "cell_type": "code",
68
- "metadata": {},
69
- "source": ["# Step 11: Verify adapter on HuggingFace\nfrom huggingface_hub import HfApi\napi = HfApi(token=HF_TOKEN)\nfiles = api.list_repo_files(HUB_MODEL_ID)\nprint(f'Files in {HUB_MODEL_ID}:')\nfor f in files:\n print(f' {f}')"]
70
- },
71
- {
72
- "cell_type": "code",
73
- "metadata": {},
74
- "source": ["# Step 12: Quick smoke test — run judge on one sample\nfrom peft import PeftModel\nfrom transformers import AutoTokenizer, AutoModelForCausalLM\n\nJUDGE_SYSTEM = (\n 'You are evaluating outbound sales emails for Tenacious Consulting. '\n 'Score the following output on signal-grounding fidelity, bench commitment honesty, '\n 'ICP segment appropriateness, and Tenacious tone adherence. '\n 'Return JSON: {\\\"signal_grounding\\\": 0-1, \\\"bench_honesty\\\": 0-1, \\\"icp_segment\\\": 0-1, \\\"tone\\\": 0-1, \\\"overall\\\": 0-1}'\n)\n\ntest_email = '''Subject: TalentBridge's ML hiring + 30-min call\\n\\nHi Casey,\\nTalentBridge recently closed a Series A and currently has 8 open ML roles.\\nWe staff ML squads, typically 4 engineers in under 3 weeks.\\nWant to set up a 30-minute scoping conversation?\\n\\nBest,\\nYabi'''\n\nprompt_text = (\n f'<|im_start|>system\\n{JUDGE_SYSTEM}<|im_end|>\\n'\n f'<|im_start|>user\\n{test_email}<|im_end|>\\n'\n f'<|im_start|>assistant\\n'\n)\n\ninputs = tokenizer(prompt_text, return_tensors='pt').to(model.device)\nwith torch.no_grad():\n output = model.generate(**inputs, max_new_tokens=100, temperature=0.0, do_sample=False)\ngenerated = tokenizer.decode(output[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)\nprint('Judge output:')\nprint(generated)"]
75
- }
76
- ]
77
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "kernelspec": {
6
+ "display_name": "Python 3",
7
+ "language": "python",
8
+ "name": "python3"
9
+ },
10
+ "language_info": {
11
+ "name": "python"
12
+ },
13
+ "accelerator": "GPU"
 
14
  },
15
+ "cells": [
16
+ {
17
+ "cell_type": "markdown",
18
+ "metadata": {},
19
+ "source": "# Tenacious-Bench ORPO Judge Training (Fixed)\n\n**Trains Qwen3.5-0.8B as a scoring judge** using ORPO on judge-format preference pairs.\n\nEach pair teaches the model: given [task context + candidate email] \u2192 output correct JSON score.\n\n**Fixes vs original notebook:**\n- Training data is judge pairs (score output) not generator pairs (email output)\n- Conversations list format for ORPOTrainer (no pre-tokenized ChatML strings)\n- Merges + pushes full model for clean HuggingFace inference\n\nRuntime: T4 GPU \u00b7 ~30-60 min \u00b7 3 epochs \u00b7 188 judge pairs\n\n## Setup\n1. Runtime \u2192 Change runtime type \u2192 T4 GPU\n2. Secrets (key icon left sidebar): `HF_TOKEN`, `OPENROUTER_API_KEY`\n3. Run all cells in order\n"
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "metadata": {},
24
+ "source": "# Cell 1: Check GPU\nimport subprocess\nresult = subprocess.run(['nvidia-smi'], capture_output=True, text=True)\nprint(result.stdout[:600] if result.returncode == 0 else 'No GPU \u2014 change runtime type to T4')\n",
25
+ "outputs": [],
26
+ "execution_count": null
27
+ },
28
+ {
29
+ "cell_type": "code",
30
+ "metadata": {},
31
+ "source": "# Cell 2: Install dependencies (pinned)\n!pip install -q 'unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git'\n!pip install -q trl==0.12.2 peft==0.14.0 transformers==4.47.1 datasets==3.2.0\n!pip install -q accelerate==1.2.1 bitsandbytes==0.45.0\nprint('Installation complete')\n",
32
+ "outputs": [],
33
+ "execution_count": null
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "metadata": {},
38
+ "source": "# Cell 3: Auth + clone dataset from HuggingFace\nimport os\nfrom google.colab import userdata\n\nHF_TOKEN = userdata.get('HF_TOKEN')\nos.environ['HF_TOKEN'] = HF_TOKEN\n\n!git clone https://huggingface.co/datasets/rafiakedir/tenacious-bench-v0.1 /content/tb-data\nprint('Dataset cloned')\n",
39
+ "outputs": [],
40
+ "execution_count": null
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "metadata": {},
45
+ "source": "# Cell 4: Load judge pairs (conversations format)\n# judge_pairs.jsonl was built by training/build_judge_pairs.py\n# Each item: {\"chosen\": [{role, content}, ...], \"rejected\": [{role, content}, ...]}\n# The assistant turn in chosen = correct JSON score; in rejected = wrong JSON score\nimport json\nfrom pathlib import Path\n\npairs_path = Path('/content/tb-data/training_data/judge_pairs.jsonl')\npairs = []\nwith open(pairs_path) as f:\n for line in f:\n p = json.loads(line)\n pairs.append({'chosen': p['chosen'], 'rejected': p['rejected']})\n\nprint(f'Loaded {len(pairs)} judge pairs')\nprint('Sample chosen (correct score):', pairs[0]['chosen'][-1]['content'])\nprint('Sample rejected (wrong score): ', pairs[0]['rejected'][-1]['content'])\n",
46
+ "outputs": [],
47
+ "execution_count": null
48
+ },
49
+ {
50
+ "cell_type": "code",
51
+ "metadata": {},
52
+ "source": "# Cell 5: Load Qwen3.5-0.8B-Instruct via Unsloth (4-bit quantization)\nfrom unsloth import FastLanguageModel\nimport torch\n\nMAX_SEQ_LENGTH = 1024\n\nmodel, tokenizer = FastLanguageModel.from_pretrained(\n model_name='unsloth/Qwen3.5-0.8B-Instruct',\n max_seq_length=MAX_SEQ_LENGTH,\n dtype=None, # auto: bf16 on A100, fp16 on T4\n load_in_4bit=True,\n)\nprint('Base model loaded')\n",
53
+ "outputs": [],
54
+ "execution_count": null
55
+ },
56
+ {
57
+ "cell_type": "code",
58
+ "metadata": {},
59
+ "source": "# Cell 6: Apply LoRA adapters\nmodel = FastLanguageModel.get_peft_model(\n model,\n r=16,\n target_modules=['q_proj', 'v_proj'],\n lora_alpha=32,\n lora_dropout=0.05,\n bias='none',\n use_gradient_checkpointing='unsloth',\n random_state=42,\n)\ntrainable = sum(p.numel() for p in model.parameters() if p.requires_grad)\ntotal = sum(p.numel() for p in model.parameters())\nprint(f'LoRA applied: {trainable:,} trainable / {total:,} total params ({100*trainable/total:.2f}%)')\n",
60
+ "outputs": [],
61
+ "execution_count": null
62
+ },
63
+ {
64
+ "cell_type": "code",
65
+ "metadata": {},
66
+ "source": "# Cell 7: Build dataset + configure ORPOTrainer\nimport random, numpy as np\nfrom datasets import Dataset\nfrom trl import ORPOConfig, ORPOTrainer\n\nrandom.seed(42); np.random.seed(42); torch.manual_seed(42)\n\ncap = torch.cuda.get_device_capability()\nuse_fp16 = (cap[0] < 8) # T4 \u2192 fp16\nuse_bf16 = (cap[0] >= 8) # A100/H100 \u2192 bf16\nprint(f'GPU capability {cap}: fp16={use_fp16} bf16={use_bf16}')\n\n# ORPOTrainer with conversations format:\n# dataset must have 'chosen' and 'rejected' as lists of role/content dicts.\n# The trainer applies the tokenizer's chat template automatically.\ndataset = Dataset.from_list(pairs)\n\ntraining_args = ORPOConfig(\n output_dir='/content/tenacious-judge-adapter',\n learning_rate=8e-6,\n per_device_train_batch_size=2,\n gradient_accumulation_steps=4, # effective batch size = 8\n num_train_epochs=3,\n warmup_ratio=0.1,\n lr_scheduler_type='cosine',\n beta=0.1,\n max_length=1024,\n max_prompt_length=512,\n logging_steps=5,\n save_steps=100,\n seed=42,\n fp16=use_fp16,\n bf16=use_bf16,\n report_to='none',\n remove_unused_columns=False,\n)\n\ntrainer = ORPOTrainer(\n model=model,\n args=training_args,\n train_dataset=dataset,\n tokenizer=tokenizer,\n)\n\nsteps_per_epoch = len(dataset) // (\n training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps\n)\nprint(f'Trainer ready: {len(dataset)} pairs, ~{steps_per_epoch} steps/epoch, {training_args.num_train_epochs} epochs')\n",
67
+ "outputs": [],
68
+ "execution_count": null
69
+ },
70
+ {
71
+ "cell_type": "code",
72
+ "metadata": {},
73
+ "source": "# Cell 8: Train\nprint('Starting ORPO training \u2014 ~30-60 min on T4...')\ntrain_result = trainer.train()\nprint('Training complete!')\nprint('Metrics:', train_result.metrics)\n",
74
+ "outputs": [],
75
+ "execution_count": null
76
+ },
77
+ {
78
+ "cell_type": "code",
79
+ "metadata": {},
80
+ "source": "# Cell 9: Plot loss curve\nimport matplotlib.pyplot as plt\n\n# Only log entries that carry a 'loss' key are plotted.\nlog = trainer.state.log_history\nloss_entries = [entry for entry in log if 'loss' in entry]\nsteps = [entry['step'] for entry in loss_entries]\nlosses = [entry['loss'] for entry in loss_entries]\n\nif not steps:\n    print('No loss history available')\nelse:\n    plt.figure(figsize=(10, 4))\n    plt.plot(steps, losses, 'b-', linewidth=2)\n    plt.xlabel('Step')\n    plt.ylabel('ORPO Loss')\n    plt.title('Judge Training Loss \u2014 Tenacious-Bench ORPO (Qwen3.5-0.8B)')\n    plt.grid(True, alpha=0.3)\n    # Save before show() so the PNG is written from the populated figure.\n    plt.savefig('/content/loss_curve.png', dpi=150, bbox_inches='tight')\n    plt.show()\n    print(f'Loss: {losses[0]:.4f} \u2192 {losses[-1]:.4f} (change: {losses[0]-losses[-1]:+.4f})')\n",
81
+ "outputs": [],
82
+ "execution_count": null
83
+ },
84
+ {
85
+ "cell_type": "code",
86
+ "metadata": {},
87
+ "source": "# Cell 10: Merge LoRA into base weights and push to HuggingFace\n# We push a merged (non-LoRA) model so inference needs only transformers, no PEFT.\nHUB_ID = 'rafiakedir/tenacious-bench-adapter'\n\n# Keyword options for the push, gathered in one place.\npush_options = {\n    'save_method': 'merged_16bit',\n    'token': HF_TOKEN,\n    'commit_message': 'feat: ORPO judge training on 188 judge-format pairs (Qwen3.5-0.8B)',\n}\n\nprint(f'Merging LoRA weights + pushing to {HUB_ID}...')\nmodel.push_to_hub_merged(HUB_ID, tokenizer, **push_options)\nprint(f'Done: https://huggingface.co/{HUB_ID}')\n",
88
+ "outputs": [],
89
+ "execution_count": null
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "metadata": {},
94
+ "source": "# Cell 11: Verify HuggingFace repo\n# Lists the files currently present in the Hub model repo named by HUB_ID.\nfrom huggingface_hub import HfApi\n\napi = HfApi(token=HF_TOKEN)\nrepo_listing = api.list_repo_files(HUB_ID, repo_type='model')\nfiles = list(repo_listing)\n\nprint(f'Files in {HUB_ID}:')\nfor fpath in sorted(files):\n    print(f' {fpath}')\n",
95
+ "outputs": [],
96
+ "execution_count": null
97
+ },
98
+ {
99
+ "cell_type": "code",
100
+ "metadata": {},
101
+ "source": "# Cell 12: Smoke test \u2014 judge scores a known good and bad email as JSON\n# Relies on `model` and `tokenizer` defined by earlier cells.\nimport json\n\nimport torch\n\nJUDGE_SYSTEM = (\n    'You are a rubric-aware judge for Tenacious Consulting B2B outbound sales emails. '\n    'Given a task context and a candidate email, score the email on the specified rubric '\n    'dimension. Respond with a JSON object only:\\n'\n    '{\"dimension\": \"<dim>\", \"score\": <0.0-1.0>, \"pass\": <true|false>, \"reasoning\": \"<one sentence>\"}'\n)\n\n\ndef _parse_judge_json(resp):\n    # Decode the outermost {...} span of the raw generation.\n    # Unparseable output comes back as a parse_error record instead of raising.\n    start, end = resp.find('{'), resp.rfind('}') + 1\n    if start == -1 or end <= start:\n        return {'raw': resp[:200], 'parse_error': True}\n    try:\n        return json.loads(resp[start:end])\n    except ValueError:  # json.JSONDecodeError subclasses ValueError\n        return {'raw': resp[:200], 'parse_error': True}\n\n\ndef _score_of(result):\n    # The model may emit the score as a string or null; return None when it is not numeric.\n    # A missing key counts as 0.\n    try:\n        return float(result.get('score', 0))\n    except (TypeError, ValueError):\n        return None\n\n\ndef judge(email_text, task_context, dimension):\n    # Ask the judge model to score one email on one rubric dimension.\n    # Returns the decoded JSON dict, or {'raw': ..., 'parse_error': True} when\n    # the generation does not contain a decodable JSON object.\n    user = (\n        f'EVALUATION DIMENSION: {dimension}\\n\\n'\n        f'TASK CONTEXT:\\n{task_context}\\n\\n'\n        f'CANDIDATE EMAIL:\\n{email_text}\\n\\n'\n        f'Score this email on the {dimension} dimension.'\n    )\n    msgs = [\n        {'role': 'system', 'content': JUDGE_SYSTEM},\n        {'role': 'user', 'content': user},\n    ]\n    text = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)\n    inputs = tokenizer(text, return_tensors='pt').to(model.device)\n    with torch.no_grad():\n        out = model.generate(\n            **inputs,\n            max_new_tokens=120,\n            temperature=0.1,\n            do_sample=True,\n            pad_token_id=tokenizer.eos_token_id,\n        )\n    # Drop the prompt tokens so only the newly generated text is decoded.\n    prompt_len = inputs.input_ids.shape[1]\n    resp = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True).strip()\n    return _parse_judge_json(resp)\n\n\nctx = 'company: TalentBridge, stage: Series A (3mo ago), open_roles: 8 (high velocity), confidence: high'\n\ngood_email = (\n    'Casey \u2014 TalentBridge currently has 8 open AI/ML roles, 5 added in the last 60 days. '\n    'Your RAG-based matching engine aligns with our bench of 5 ML engineers skilled in LangChain. '\n    'We can deploy within 7-10 days. 30-minute scoping call: calendly.com/tenacious'\n)\nbad_email = (\n    'Hi Casey \u2014 TalentBridge Series A round 3 months ago. '\n    'Three companies in your sector are doing X and you are not. '\n    'Would you have 15 minutes to explore whether there is a fit?'\n)\n\nprint('=== GOOD EMAIL (expect score ~0.8-1.0) ===')\nr_good = judge(good_email, ctx, 'signal_grounding_fidelity')\nprint(json.dumps(r_good, indent=2))\n\nprint('\\n=== BAD EMAIL (expect score ~0.1-0.4) ===')\nr_bad = judge(bad_email, ctx, 'signal_grounding_fidelity')\nprint(json.dumps(r_bad, indent=2))\n\nif 'parse_error' in r_good or 'parse_error' in r_bad:\n    print('\\nWARNING: judge output was not valid JSON - see raw text above')\nelse:\n    good_score, bad_score = _score_of(r_good), _score_of(r_bad)\n    if good_score is None or bad_score is None:\n        print('\\nWARNING: judge returned a non-numeric score')\n    else:\n        gap = good_score - bad_score\n        print(f'\\nScore gap (good - bad): {gap:+.2f} (positive = judge discriminates correctly)')\n        print('Smoke test: PASSED' if gap > 0 else 'WARNING: judge not discriminating')\n",
102
+ "outputs": [],
103
+ "execution_count": null
104
+ }
105
+ ]
106
+ }