rafiakedir
/

tenacious-bench-adapter

+#!/usr/bin/env python3
+"""
+Day 6 — Run ablations on held-out partition.
+Three conditions:
+  Condition 1 (baseline): Week 10 raw scoring_evaluator output, no judge
+  Condition 2 (trained):  Trained judge (merged LoRA model) loaded from the HuggingFace repo named in HF_JUDGE_MODEL
+  Condition 3 (prompt_only): Qwen3-30B-A3B via OpenRouter with no LoRA, best prompt
+Writes:
+  ablations/ablation_results.json
+  ablations/held_out_traces.jsonl
+"""
+import json
+import os
+import sys
+import time
+import datetime
+import statistics
+from pathlib import Path
+import requests
+from bootstrap_test import paired_bootstrap
+# Repository root: two directory levels above this script file.
+ROOT = Path(__file__).parent.parent
+# Put the repository root on sys.path so the scoring_evaluator import below resolves.
+sys.path.insert(0, str(ROOT))
+from scoring_evaluator import score_task
+# Held-out partition, read one JSON task per line by load_held_out_tasks().
+HELD_OUT_PATH = ROOT / "tenacious_bench_v0.1/held_out/tasks.jsonl"
+HF_JUDGE_MODEL = "rafiakedir/tenacious-bench-adapter"  # merged model on HuggingFace
+# Output files are written next to this script.
+TRACES_PATH = Path(__file__).parent / "held_out_traces.jsonl"
+RESULTS_PATH = Path(__file__).parent / "ablation_results.json"
+# NOTE(review): COST_LOG is not referenced anywhere in the visible code — confirm it is still needed.
+COST_LOG = ROOT / "cost_log.csv"
+# NOTE(review): OPENROUTER_KEY is not referenced in the visible code; call_openrouter reads
+# OPENROUTER_API_KEY from the environment instead.
+OPENROUTER_KEY = ""
+# Model id passed to OpenRouter when a task has no candidate_output and one must be generated.
+DEEPSEEK_MODEL = "deepseek/deepseek-chat-v3-0324"
+# System prompt for the prompt-only judge (condition 3). It asks for a single flat JSON object
+# with four per-dimension scores, an overall score and a one-sentence reasoning string.
+JUDGE_SYSTEM_PROMPT = """You are a Tenacious Consulting sales judge. Evaluate the following B2B outbound sales email on these four rubric dimensions:
+1. SIGNAL_GROUNDING_FIDELITY: Does the email only assert claims supported by the hiring signal brief? Use assertive language for high-confidence signals, questions for low-confidence ones.
+2. BENCH_COMMITMENT_HONESTY: Does the email avoid committing more engineers than available on bench?
+3. ICP_SEGMENT_APPROPRIATENESS: Does the email use language matching the correct ICP segment (growth/scale for Segment 1, cost/efficiency for Segment 2, transition/leadership for Segment 3, hedged for ABSTAIN)?
+4. TONE_PRESERVATION: Does the email avoid banned re-engagement clichés, over-apologetic language, and include a 30-minute scoping CTA with a direct opener?
+Additional Tenacious tone markers:
+- No "just wanted to circle back", "touching base", "hope you're doing well"
+- No "I apologize for taking your time", "we may not be the best fit"
+- Must include calendar CTA: "30-minute scoping conversation" or equivalent
+- No fabricated competitor claims
+Score each dimension 0.0 to 1.0. Return ONLY a JSON object:
+{"signal_grounding": <0-1>, "bench_honesty": <0-1>, "icp_segment": <0-1>, "tone": <0-1>, "overall": <0-1>, "reasoning": "<one sentence>"}"""
+def _load_env():
+    """Populate os.environ from ROOT/.env without overriding existing variables.
+
+    Each non-comment ``KEY=VALUE`` line is applied with ``os.environ.setdefault``,
+    so values already present in the environment win. An optional leading
+    ``export `` on the key is dropped, a value wrapped in a matching pair of
+    single quotes is unwrapped, and surrounding double quotes are stripped.
+    Lines with an empty key are skipped. Does nothing when the file is absent.
+    """
+    env_path = ROOT / ".env"
+    if not env_path.exists():
+        return
+    for raw_line in env_path.read_text(encoding="utf-8").splitlines():
+        line = raw_line.strip()
+        if not line or line.startswith("#") or "=" not in line:
+            continue
+        key, _, value = line.partition("=")
+        key = key.strip()
+        if key.startswith("export "):
+            # Shell-style ".env" files often prefix assignments with "export".
+            key = key[len("export "):].strip()
+        value = value.strip()
+        if len(value) >= 2 and value[0] == value[-1] == "'":
+            value = value[1:-1]
+        else:
+            value = value.strip('"')
+        if key:
+            os.environ.setdefault(key, value)
+def call_openrouter(messages: list, model: str, max_tokens: int = 200) -> tuple[str, int, float]:
+    """Send one chat-completion request to OpenRouter and estimate its cost.
+
+    Args:
+        messages: Chat messages (role/content dicts) forwarded as the request body.
+        model: OpenRouter model id.
+        max_tokens: Completion token limit sent with the request.
+
+    Returns:
+        ``(text, latency_ms, cost_usd)``. The cost is estimated from the token
+        usage reported in the response using hard-coded per-million-token rates.
+        When the response cannot be used (non-JSON body, missing ``choices``,
+        non-string content) the text is the sentinel ``"[failed]"`` with cost
+        ``0.0`` and the reason is printed to stderr.
+
+    Raises:
+        Whatever ``requests.post`` raises (timeouts, connection errors); those
+        are not caught here.
+    """
+    url = "https://openrouter.ai/api/v1/chat/completions"
+    headers = {
+        "Authorization": f"Bearer {os.environ.get('OPENROUTER_API_KEY', '')}",
+        "Content-Type": "application/json",
+        "HTTP-Referer": "https://github.com/rafiakedir/tenacious-bench",
+    }
+    body = {"model": model, "messages": messages, "max_tokens": max_tokens, "temperature": 0.0}
+    t0 = time.time()
+    resp = requests.post(url, headers=headers, json=body, timeout=60)
+    latency_ms = int((time.time() - t0) * 1000)
+    try:
+        data = resp.json()
+        content = data["choices"][0]["message"]["content"]
+    except (ValueError, KeyError, IndexError, TypeError) as exc:
+        # Error payloads (bad key, rate limit, ...) carry no "choices"; report why instead of hiding it.
+        print(
+            f"  [openrouter] unusable response from {model} (HTTP {resp.status_code}): {exc!r}",
+            file=sys.stderr,
+        )
+        return "[failed]", latency_ms, 0.0
+    if not isinstance(content, str):
+        print(f"  [openrouter] response from {model} has no text content", file=sys.stderr)
+        return "[failed]", latency_ms, 0.0
+    usage = data.get("usage")
+    if not isinstance(usage, dict):
+        usage = {}
+    prompt_toks = usage.get("prompt_tokens", 0) or 0
+    comp_toks = usage.get("completion_tokens", 0) or 0
+    # USD per million tokens as (prompt, completion).
+    prompt_rate, comp_rate = (0.14, 0.28) if "deepseek" in model.lower() else (0.40, 0.40)
+    cost = (prompt_toks * prompt_rate + comp_toks * comp_rate) / 1000000
+    return content.strip(), latency_ms, cost
+def load_held_out_tasks():
+    """Read the held-out tasks from HELD_OUT_PATH, one JSON object per line.
+
+    Blank lines (for example a trailing newline at the end of the file) are
+    skipped instead of being handed to ``json.loads``. The file is read as UTF-8.
+    """
+    with open(HELD_OUT_PATH, encoding="utf-8") as f:
+        return [json.loads(line) for line in f if line.strip()]
+# Candidates generated during this run, keyed by task_id, so that every ablation
+# condition scores the same email for a given task.
+_CANDIDATE_CACHE: dict = {}
+def generate_candidate_if_missing(task: dict) -> tuple[str, float]:
+    """Return the candidate email for a task, generating one with DeepSeek if needed.
+
+    Returns ``(candidate_text, generation_cost_usd)``. A task that already has a
+    ``candidate_output`` is returned as-is at cost ``0.0``. A generated
+    candidate is remembered per ``task_id``: later calls for the same task
+    return the identical text at cost ``0.0``, so the paired comparison between
+    conditions is made on the same email and generation is paid for once.
+    Failed generations (an exception, or the ``"[failed]"`` sentinel) are
+    returned but not remembered, so a later call can retry.
+    """
+    if task.get("candidate_output"):
+        return task["candidate_output"], 0.0
+    task_id = task.get("task_id")
+    cacheable = isinstance(task_id, (str, int))
+    if cacheable and task_id in _CANDIDATE_CACHE:
+        return _CANDIDATE_CACHE[task_id], 0.0
+    inp = task.get("input", {})
+    hsb = inp.get("hiring_signal_brief")
+    bs = inp.get("bench_summary")
+    task_type = task.get("task_type", "email_generation")
+    # Truncate the brief to keep the generation prompt small.
+    brief_text = json.dumps(hsb or bs or {}, indent=2)[:800]
+    msg = [
+        {"role": "system", "content": "You are a Tenacious Consulting sales agent writing B2B outreach emails."},
+        {"role": "user", "content": f"Write a {task_type} email for this prospect:\n{brief_text}\n\nKeep it under 120 words with a 30-minute scoping CTA."},
+    ]
+    try:
+        text, _, cost = call_openrouter(msg, DEEPSEEK_MODEL, max_tokens=300)
+    except Exception as e:
+        return f"[generation failed: {e}]", 0.0
+    if cacheable and text != "[failed]":
+        _CANDIDATE_CACHE[task_id] = text
+    return text, cost
+def score_with_evaluator(task: dict, candidate_output: str) -> dict:
+    """Condition 1: score a candidate with the scoring_evaluator alone.
+
+    The evaluator's single ``score`` is reported under every rubric dimension
+    key as well as ``overall`` and ``rubric_score``; ``passed`` is forwarded
+    from the evaluator result (``False`` when absent).
+    """
+    verdict = score_task({**task, "candidate_output": candidate_output})
+    rubric = verdict.get("score", 0.0)
+    dimension_keys = ("signal_grounding", "bench_honesty", "icp_segment", "tone", "overall")
+    summary = dict.fromkeys(dimension_keys, rubric)
+    summary["passed"] = verdict.get("passed", False)
+    summary["rubric_score"] = rubric
+    return summary
+def score_with_prompt_judge(task: dict, candidate_output: str) -> tuple[dict, int, float]:
+    """Condition 3: zero-shot prompt-only judge (qwen/qwen3-30b-a3b via OpenRouter).
+
+    Returns ``(scores, latency_ms, cost_usd)``. ``scores`` always carries a
+    numeric ``overall`` clamped to [0, 1]; when the reply holds no parseable
+    JSON object it is ``{"overall": 0.5, "reasoning": "parse_error"}``. The
+    latency and cost of the request are reported even when parsing fails.
+    Only a failure of the request itself yields latency ``0`` and cost ``0.0``.
+    """
+    import re
+    inp = task.get("input", {})
+    brief = json.dumps(inp.get("hiring_signal_brief") or inp.get("bench_summary") or {})[:600]
+    prompt = f"""TASK INPUT:
+{brief}
+CANDIDATE EMAIL:
+{candidate_output[:600]}
+Score this email on all four rubric dimensions."""
+    msg = [
+        {"role": "system", "content": JUDGE_SYSTEM_PROMPT},
+        {"role": "user", "content": prompt},
+    ]
+    try:
+        text, latency_ms, cost = call_openrouter(msg, "qwen/qwen3-30b-a3b", max_tokens=200)
+    except Exception as e:
+        return {"overall": 0.5, "error": str(e)}, 0, 0.0
+    # Extract the first flat JSON object from the response.
+    scores = None
+    json_match = re.search(r'\{[^}]+\}', text, re.DOTALL)
+    if json_match:
+        try:
+            parsed = json.loads(json_match.group())
+        except json.JSONDecodeError:
+            parsed = None
+        if isinstance(parsed, dict):
+            scores = parsed
+    if scores is None:
+        scores = {"overall": 0.5, "reasoning": "parse_error"}
+    # Callers do arithmetic on "overall", so make sure it is a float in [0, 1].
+    try:
+        scores["overall"] = min(1.0, max(0.0, float(scores.get("overall", 0.5))))
+    except (TypeError, ValueError):
+        scores["overall"] = 0.5
+    scores["raw_response"] = text[:200]
+    return scores, latency_ms, cost
+# Module-level cache filled in by _load_trained_model(); both stay None until a load succeeds.
+TRAINED_MODEL = None
+TRAINED_TOKENIZER = None
+# System prompt for the trained judge (condition 2): requests one JSON verdict
+# for a single rubric dimension rather than all four at once.
+JUDGE_SYSTEM_FOR_TRAINED = (
+    "You are a rubric-aware judge for Tenacious Consulting B2B outbound sales emails. "
+    "Given a task context and a candidate email, score the email on the specified rubric "
+    "dimension. Respond with a JSON object only:\n"
+    '{"dimension": "<dim>", "score": <0.0-1.0>, "pass": <true|false>, "reasoning": "<one sentence>"}'
+)
+def _load_trained_model():
+    """Load the merged judge model from HuggingFace once and cache it in module globals.
+
+    Returns ``(model, tokenizer)``, or ``(None, None)`` when loading fails. A
+    failure is remembered on the function object, so later calls return
+    ``(None, None)`` immediately instead of retrying the load for every task.
+    The globals are only assigned after both the tokenizer and the model have
+    loaded, so they are never left half-populated.
+    """
+    global TRAINED_MODEL, TRAINED_TOKENIZER
+    if TRAINED_MODEL is not None:
+        return TRAINED_MODEL, TRAINED_TOKENIZER
+    if getattr(_load_trained_model, "_load_failed", False):
+        return None, None
+    try:
+        import torch
+        from transformers import AutoTokenizer, AutoModelForCausalLM
+        print(f"  Loading trained judge from {HF_JUDGE_MODEL}...")
+        tokenizer = AutoTokenizer.from_pretrained(HF_JUDGE_MODEL)
+        model = AutoModelForCausalLM.from_pretrained(
+            HF_JUDGE_MODEL,
+            torch_dtype=torch.float16,
+            device_map="auto",
+        )
+        model.eval()
+    except Exception as e:
+        print(f"  Could not load trained judge from HF: {e}")
+        _load_trained_model._load_failed = True
+        return None, None
+    TRAINED_MODEL, TRAINED_TOKENIZER = model, tokenizer
+    print(f"  Trained judge loaded")
+    return TRAINED_MODEL, TRAINED_TOKENIZER
+def score_with_trained_judge(task: dict, candidate_output: str) -> tuple[dict, int, float]:
+    """Condition 2: score a candidate with the merged judge model from HuggingFace.
+
+    The judge is asked about the task's ``dimension`` (default
+    ``signal_grounding_fidelity``) and its ``score`` is copied to ``overall``
+    as a float clamped to [0, 1]. Decoding is greedy so repeated runs over the
+    same held-out set give the same scores.
+
+    Returns ``(scores, latency_ms, cost_usd)``; the cost is always ``0.0``.
+    When the model cannot be loaded, or its reply holds no usable JSON score,
+    ``overall`` is the neutral placeholder ``0.5`` and the dict is marked with
+    an ``error`` or ``reasoning: parse_error`` entry.
+    """
+    import re
+    import torch
+    model, tokenizer = _load_trained_model()
+    if model is None:
+        # Graceful fallback — mark clearly so results aren't confused with trained scores
+        return {"overall": 0.5, "error": "hf_model_unavailable", "note": "judge not loaded"}, 0, 0.0
+    dim = task.get("dimension", "signal_grounding_fidelity")
+    inp = task.get("input", {})
+    brief = json.dumps(
+        inp.get("hiring_signal_brief") or inp.get("bench_summary") or {}
+    )[:600]
+    user_content = (
+        f"EVALUATION DIMENSION: {dim}\n\n"
+        f"TASK CONTEXT:\n{brief}\n\n"
+        f"CANDIDATE EMAIL:\n{candidate_output.strip()[:500]}\n\n"
+        f"Score this email on the {dim} dimension."
+    )
+    msgs = [
+        {"role": "system", "content": JUDGE_SYSTEM_FOR_TRAINED},
+        {"role": "user",   "content": user_content},
+    ]
+    text = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
+    inputs = tokenizer(text, return_tensors="pt").to(model.device)
+    t0 = time.time()
+    with torch.no_grad():
+        # Greedy decoding: sampling would make the ablation scores vary from run to run.
+        output = model.generate(
+            **inputs, max_new_tokens=150, do_sample=False,
+            pad_token_id=tokenizer.eos_token_id,
+        )
+    latency_ms = int((time.time() - t0) * 1000)
+    # Decode only the newly generated tokens, not the echoed prompt.
+    generated = tokenizer.decode(output[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
+    json_match = re.search(r'\{[^}]+\}', generated, re.DOTALL)
+    if json_match:
+        try:
+            scores = json.loads(json_match.group())
+            scores["overall"] = min(1.0, max(0.0, float(scores.get("score", 0.5))))
+            return scores, latency_ms, 0.0
+        except (ValueError, TypeError):
+            # Malformed JSON (JSONDecodeError is a ValueError) or a non-numeric score.
+            pass
+    return {"overall": 0.5, "reasoning": "parse_error", "raw": generated[:200]}, latency_ms, 0.0
+def append_trace(entry: dict):
+    """Append one trace record to TRACES_PATH as a single line of JSON."""
+    with open(TRACES_PATH, "a") as sink:
+        print(json.dumps(entry), file=sink)
+def condition_baseline(tasks: list) -> list:
+    """Condition 1: score every task with scoring_evaluator only (no judge).
+
+    Writes one trace record per task and returns the list of ``overall``
+    scores in task order. The recorded latency spans candidate lookup or
+    generation plus scoring; the recorded cost is the generation cost.
+    """
+    print("\n=== CONDITION 1: Baseline (scoring_evaluator) ===")
+    total = len(tasks)
+    overall_scores = []
+    for position, task in enumerate(tasks, start=1):
+        started = time.time()
+        candidate, generation_cost = generate_candidate_if_missing(task)
+        scores = score_with_evaluator(task, candidate)
+        elapsed_ms = int((time.time() - started) * 1000)
+        append_trace({
+            "task_id": task["task_id"],
+            "condition": "baseline",
+            "candidate_output": candidate[:300],
+            "score": scores,
+            "latency_ms": elapsed_ms,
+            "cost_usd": generation_cost,
+        })
+        overall_scores.append(scores.get("overall", 0.0))
+        print(f"  [{position}/{total}] {task['task_id']} score={scores.get('overall',0):.3f}")
+    return overall_scores
+def condition_trained_judge(tasks: list) -> list:
+    """Condition 2: trained judge blended with the machine scorer.
+
+    For each task the judge's ``overall`` (weight 0.6) is blended with the
+    scoring_evaluator's ``overall`` (weight 0.4). A missing or non-numeric
+    judge ``overall`` counts as the neutral 0.5 rather than crashing the run.
+    Writes one trace record per task (latency is the judge latency only) and
+    returns the blended scores in task order.
+    """
+    print("\n=== CONDITION 2: Trained Judge (LoRA adapter) ===")
+    results = []
+    for i, task in enumerate(tasks):
+        candidate, cost_gen = generate_candidate_if_missing(task)
+        scores, latency_ms, cost_judge = score_with_trained_judge(task, candidate)
+        judge_overall = scores.get("overall", 0.5)
+        if isinstance(judge_overall, bool) or not isinstance(judge_overall, (int, float)):
+            judge_overall = 0.5
+        # Blend with machine scorer for reliability
+        machine_scores = score_with_evaluator(task, candidate)
+        blended_overall = 0.6 * judge_overall + 0.4 * machine_scores.get("overall", 0.5)
+        scores["blended_overall"] = round(blended_overall, 4)
+        scores["machine_score"] = machine_scores.get("overall", 0.5)
+        entry = {
+            "task_id": task["task_id"],
+            "condition": "trained",
+            "candidate_output": candidate[:300],
+            "score": scores,
+            "latency_ms": latency_ms,
+            "cost_usd": cost_gen + cost_judge,
+        }
+        append_trace(entry)
+        results.append(blended_overall)
+        print(f"  [{i+1}/{len(tasks)}] {task['task_id']} overall={blended_overall:.3f}")
+    return results
+def condition_prompt_only(tasks: list) -> list:
+    """Condition 3: prompt-engineered Qwen3 judge (no training) blended with the machine scorer.
+
+    For each task the judge's ``overall`` (weight 0.6) is blended with the
+    scoring_evaluator's ``overall`` (weight 0.4). A missing or non-numeric
+    judge ``overall`` counts as the neutral 0.5 rather than crashing the run.
+    Writes one trace record per task (latency is the judge latency only) and
+    returns the blended scores in task order.
+    """
+    print("\n=== CONDITION 3: Prompt-Only Judge (Qwen3-30B) ===")
+    results = []
+    for i, task in enumerate(tasks):
+        candidate, cost_gen = generate_candidate_if_missing(task)
+        scores, latency_ms, cost_judge = score_with_prompt_judge(task, candidate)
+        judge_overall = scores.get("overall", 0.5)
+        if isinstance(judge_overall, bool) or not isinstance(judge_overall, (int, float)):
+            judge_overall = 0.5
+        # Blend with machine scorer
+        machine_scores = score_with_evaluator(task, candidate)
+        blended_overall = 0.6 * judge_overall + 0.4 * machine_scores.get("overall", 0.5)
+        scores["blended_overall"] = round(blended_overall, 4)
+        scores["machine_score"] = machine_scores.get("overall", 0.5)
+        entry = {
+            "task_id": task["task_id"],
+            "condition": "prompt_only",
+            "candidate_output": candidate[:300],
+            "score": scores,
+            "latency_ms": latency_ms,
+            "cost_usd": cost_gen + cost_judge,
+        }
+        append_trace(entry)
+        results.append(blended_overall)
+        print(f"  [{i+1}/{len(tasks)}] {task['task_id']} overall={blended_overall:.3f}")
+    return results
+def main():
+    """Run the three ablation conditions on the held-out tasks and write the results.
+
+    Loads the environment and tasks, clears the previous traces file, runs the
+    baseline, trained-judge and prompt-only conditions, then writes per-condition
+    summaries (score statistics, p95 latency, p95 cost) and the three paired
+    bootstrap comparisons to RESULTS_PATH and prints a short report.
+    """
+    _load_env()
+    tasks = load_held_out_tasks()
+    print(f"Loaded {len(tasks)} held-out tasks")
+    # Clear traces file
+    TRACES_PATH.unlink(missing_ok=True)
+    baseline_scores = condition_baseline(tasks)
+    trained_scores = condition_trained_judge(tasks)
+    prompt_scores = condition_prompt_only(tasks)
+    def p95(values: list):
+        """Return the element at index int(0.95 * n) of the sorted, non-empty values."""
+        ordered = sorted(values)
+        return ordered[int(0.95 * len(ordered))]
+    def summarize(scores: list) -> dict:
+        if not scores:
+            # Same keys as the non-empty case so consumers can rely on "n".
+            return {"mean": 0, "std": 0, "min": 0, "max": 0, "p95": 0, "n": 0}
+        return {
+            "mean": round(statistics.mean(scores), 4),
+            "std": round(statistics.stdev(scores) if len(scores) > 1 else 0, 4),
+            "min": round(min(scores), 4),
+            "max": round(max(scores), 4),
+            "p95": round(p95(scores), 4),
+            "n": len(scores),
+        }
+    # Compute latencies from traces. The file does not exist when no task was
+    # processed, because it is only created by the first append.
+    traces = []
+    if TRACES_PATH.exists():
+        with open(TRACES_PATH) as f:
+            traces = [json.loads(line) for line in f if line.strip()]
+    def latency_p95(condition: str) -> int:
+        lats = [t["latency_ms"] for t in traces if t["condition"] == condition]
+        if not lats:
+            return 0
+        return p95(lats)
+    def cost_p95(condition: str) -> float:
+        costs = [t.get("cost_usd", 0.0) for t in traces if t["condition"] == condition]
+        if not costs:
+            return 0.0
+        return round(p95(costs), 5)
+    delta_a_boot = paired_bootstrap(trained_scores, baseline_scores)
+    delta_a_boot["description"] = "trained judge vs baseline"
+    delta_b_boot = paired_bootstrap(trained_scores, prompt_scores)
+    delta_b_boot["description"] = "trained judge vs prompt-only"
+    delta_c_boot = paired_bootstrap(prompt_scores, baseline_scores)
+    delta_c_boot["description"] = "prompt-only vs baseline"
+    results = {
+        "timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat(),
+        "held_out_task_count": len(tasks),
+        "baseline": {**summarize(baseline_scores), "p95_latency_ms": latency_p95("baseline"), "p95_cost_usd": cost_p95("baseline")},
+        "trained": {**summarize(trained_scores), "p95_latency_ms": latency_p95("trained"), "p95_cost_usd": cost_p95("trained")},
+        "prompt_only": {**summarize(prompt_scores), "p95_latency_ms": latency_p95("prompt_only"), "p95_cost_usd": cost_p95("prompt_only")},
+        "delta_a": delta_a_boot,
+        "delta_b": delta_b_boot,
+        "delta_c": delta_c_boot,
+    }
+    with open(RESULTS_PATH, "w") as f:
+        json.dump(results, f, indent=2)
+    print(f"\n=== ABLATION RESULTS ===")
+    print(f"Baseline mean: {results['baseline']['mean']:.4f}")
+    print(f"Trained mean:  {results['trained']['mean']:.4f}")
+    print(f"Prompt mean:   {results['prompt_only']['mean']:.4f}")
+    print(f"Delta A (trained vs baseline): {results['delta_a']['mean_diff']:+.4f} (p={results['delta_a']['p_value']:.4f})")
+    print(f"Delta B (trained vs prompt):   {results['delta_b']['mean_diff']:+.4f} (p={results['delta_b']['p_value']:.4f})")
+    print(f"Delta C (prompt vs baseline):  {results['delta_c']['mean_diff']:+.4f} (p={results['delta_c']['p_value']:.4f})")
+    print(f"\nResults written to {RESULTS_PATH}")
+    print(f"Traces written to {TRACES_PATH}")
+if __name__ == "__main__":
+    main()

run_on_colab.ipynb CHANGED Viewed

@@ -1,77 +1,106 @@
 {
- "nbformat": 4,
- "nbformat_minor": 0,
- "metadata": {
-  "kernelspec": {"display_name": "Python 3", "name": "python3"},
-  "language_info": {"name": "python"},
-  "accelerator": "GPU",
-  "colab": {"provenance": [], "gpuType": "T4", "name": "tenacious_bench_orpo_training.ipynb"}
- },
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": ["# Tenacious-Bench ORPO Judge Training\n\n**Trains Qwen2.5-1.5B-Instruct** with LoRA using ORPO on Tenacious-specific rubric preference pairs.\n\nRuntime: T4 GPU (Colab free tier)  \nExpected training time: ~45-90 minutes for 3 epochs\n\n## Setup\n1. Set HF_TOKEN and OPENROUTER_API_KEY in Colab Secrets (key icon in left sidebar)\n2. Run all cells in order\n"]
   },
-  {
-   "cell_type": "code",
-   "metadata": {},
-   "source": ["# Step 1: Check GPU\nimport subprocess\nresult = subprocess.run(['nvidia-smi'], capture_output=True, text=True)\nprint(result.stdout[:500])"]
-  },
-  {
-   "cell_type": "code",
-   "metadata": {},
-   "source": ["# Step 2: Install Unsloth and dependencies (pinned versions)\n!pip install -q 'unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git'\n!pip install -q trl==0.12.2 peft==0.14.0 transformers==4.47.1 datasets==3.2.0\n!pip install -q accelerate==1.2.1 bitsandbytes==0.45.0\nprint('Installation complete')"]
-  },
-  {
-   "cell_type": "code",
-   "metadata": {},
-   "source": ["# Step 3: Clone the repo\nimport os\nfrom google.colab import userdata\n\nHF_TOKEN = userdata.get('HF_TOKEN')\nOPENROUTER_API_KEY = userdata.get('OPENROUTER_API_KEY')\n\nos.environ['HF_TOKEN'] = HF_TOKEN\nos.environ['OPENROUTER_API_KEY'] = OPENROUTER_API_KEY\n\n!git clone https://huggingface.co/datasets/rafiakedir/tenacious-bench-v0.1 /content/tenacious-bench-data\nprint('Repo cloned')"]
-  },
-  {
-   "cell_type": "code",
-   "metadata": {},
-   "source": ["# Step 4: Load preference pairs\nimport json\nfrom pathlib import Path\n\npairs_path = Path('/content/tenacious-bench-data/training_data/preference_pairs.jsonl')\npairs = []\nwith open(pairs_path) as f:\n    for line in f:\n        p = json.loads(line)\n        pairs.append({'prompt': p['prompt'], 'chosen': p['chosen'], 'rejected': p['rejected']})\n\nprint(f'Loaded {len(pairs)} preference pairs')\nprint(f'Sample pair task context (first 200 chars of prompt):')\nprint(pairs[0]['prompt'][:200])"]
-  },
-  {
-   "cell_type": "code",
-   "metadata": {},
-   "source": ["# Step 5: Load Unsloth model with 4-bit quantization\nfrom unsloth import FastLanguageModel\nimport torch\n\nMAX_SEQ_LENGTH = 1024\nDTYPE = None  # auto-detect\nLOAD_IN_4BIT = True\n\nmodel, tokenizer = FastLanguageModel.from_pretrained(\n    model_name='unsloth/Qwen2.5-1.5B-Instruct',\n    max_seq_length=MAX_SEQ_LENGTH,\n    dtype=DTYPE,\n    load_in_4bit=LOAD_IN_4BIT,\n)\nprint('Model loaded')"]
-  },
-  {
-   "cell_type": "code",
-   "metadata": {},
-   "source": ["# Step 6: Apply LoRA\nmodel = FastLanguageModel.get_peft_model(\n    model,\n    r=16,\n    target_modules=['q_proj', 'v_proj'],\n    lora_alpha=32,\n    lora_dropout=0.05,\n    bias='none',\n    use_gradient_checkpointing='unsloth',\n    random_state=42,\n)\nprint('LoRA applied')"]
-  },
-  {
-   "cell_type": "code",
-   "metadata": {},
-   "source": ["# Step 7: Set up ORPO trainer\nimport random\nimport numpy as np\nfrom datasets import Dataset\nfrom trl import ORPOConfig, ORPOTrainer\n\n# Fixed seed\nrandom.seed(42)\nnp.random.seed(42)\ntorch.manual_seed(42)\n\n# Detect precision\ncap = torch.cuda.get_device_capability()\nuse_fp16 = cap[0] < 8  # T4 uses fp16\nuse_bf16 = cap[0] >= 8  # A100/4090 use bf16\nprint(f'GPU compute capability: {cap}, fp16={use_fp16}, bf16={use_bf16}')\n\ndataset = Dataset.from_list(pairs)\n\ntraining_args = ORPOConfig(\n    output_dir='/content/tenacious-adapter',\n    learning_rate=8e-6,\n    per_device_train_batch_size=2,\n    gradient_accumulation_steps=4,\n    num_train_epochs=3,\n    warmup_ratio=0.1,\n    lr_scheduler_type='cosine',\n    beta=0.1,\n    max_length=1024,\n    max_prompt_length=512,\n    logging_steps=10,\n    save_steps=50,\n    seed=42,\n    fp16=use_fp16,\n    bf16=use_bf16,\n    report_to='none',\n    remove_unused_columns=False,\n)\n\ntrainer = ORPOTrainer(\n    model=model,\n    args=training_args,\n    train_dataset=dataset,\n    tokenizer=tokenizer,\n)\nprint('Trainer initialized')"]
-  },
-  {
-   "cell_type": "code",
-   "metadata": {},
-   "source": ["# Step 8: Train\nprint('Starting ORPO training...')\ntrain_result = trainer.train()\nprint(f'Training complete!')\nprint(f'Metrics: {train_result.metrics}')"]
-  },
-  {
-   "cell_type": "code",
-   "metadata": {},
-   "source": ["# Step 9: Plot loss curve\nimport matplotlib.pyplot as plt\n\nlog_history = trainer.state.log_history\nsteps = [x['step'] for x in log_history if 'loss' in x]\nlosses = [x['loss'] for x in log_history if 'loss' in x]\n\nif steps:\n    plt.figure(figsize=(10, 5))\n    plt.plot(steps, losses, 'b-', linewidth=2, label='Training Loss')\n    plt.xlabel('Step')\n    plt.ylabel('Loss')\n    plt.title('ORPO Training Loss — Tenacious Judge')\n    plt.legend()\n    plt.grid(True, alpha=0.3)\n    plt.savefig('/content/loss_curve.png', dpi=150, bbox_inches='tight')\n    plt.show()\n    print(f'Initial loss: {losses[0]:.4f}')\n    print(f'Final loss:   {losses[-1]:.4f}')\n    print(f'Loss decrease: {losses[0] - losses[-1]:.4f} ({(1-losses[-1]/losses[0])*100:.1f}%)')\nelse:\n    print('No loss history available')"]
-  },
-  {
-   "cell_type": "code",
-   "metadata": {},
-   "source": ["# Step 10: Save adapter locally and push to HuggingFace\nADAPTER_DIR = '/content/tenacious-adapter'\n\nmodel.save_pretrained(ADAPTER_DIR)\ntokenizer.save_pretrained(ADAPTER_DIR)\nprint(f'Adapter saved to {ADAPTER_DIR}')\n\n# Push to HuggingFace\nHUB_MODEL_ID = 'rafiakedir/tenacious-bench-adapter'\nprint(f'Pushing to {HUB_MODEL_ID}...')\nmodel.push_to_hub(HUB_MODEL_ID, token=HF_TOKEN)\ntokenizer.push_to_hub(HUB_MODEL_ID, token=HF_TOKEN)\nprint(f'Adapter pushed to https://huggingface.co/{HUB_MODEL_ID}')"]
-  },
-  {
-   "cell_type": "code",
-   "metadata": {},
-   "source": ["# Step 11: Verify adapter on HuggingFace\nfrom huggingface_hub import HfApi\napi = HfApi(token=HF_TOKEN)\nfiles = api.list_repo_files(HUB_MODEL_ID)\nprint(f'Files in {HUB_MODEL_ID}:')\nfor f in files:\n    print(f'  {f}')"]
-  },
-  {
-   "cell_type": "code",
-   "metadata": {},
-   "source": ["# Step 12: Quick smoke test — run judge on one sample\nfrom peft import PeftModel\nfrom transformers import AutoTokenizer, AutoModelForCausalLM\n\nJUDGE_SYSTEM = (\n    'You are evaluating outbound sales emails for Tenacious Consulting. '\n    'Score the following output on signal-grounding fidelity, bench commitment honesty, '\n    'ICP segment appropriateness, and Tenacious tone adherence. '\n    'Return JSON: {\\\"signal_grounding\\\": 0-1, \\\"bench_honesty\\\": 0-1, \\\"icp_segment\\\": 0-1, \\\"tone\\\": 0-1, \\\"overall\\\": 0-1}'\n)\n\ntest_email = '''Subject: TalentBridge's ML hiring + 30-min call\\n\\nHi Casey,\\nTalentBridge recently closed a Series A and currently has 8 open ML roles.\\nWe staff ML squads, typically 4 engineers in under 3 weeks.\\nWant to set up a 30-minute scoping conversation?\\n\\nBest,\\nYabi'''\n\nprompt_text = (\n    f'<|im_start|>system\\n{JUDGE_SYSTEM}<|im_end|>\\n'\n    f'<|im_start|>user\\n{test_email}<|im_end|>\\n'\n    f'<|im_start|>assistant\\n'\n)\n\ninputs = tokenizer(prompt_text, return_tensors='pt').to(model.device)\nwith torch.no_grad():\n    output = model.generate(**inputs, max_new_tokens=100, temperature=0.0, do_sample=False)\ngenerated = tokenizer.decode(output[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)\nprint('Judge output:')\nprint(generated)"]
-  }
- ]
-}

 {
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    },
+    "accelerator": "GPU"
   },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": "# Tenacious-Bench ORPO Judge Training (Fixed)\n\n**Trains Qwen3.5-0.8B as a scoring judge** using ORPO on judge-format preference pairs.\n\nEach pair teaches the model: given [task context + candidate email] \u2192 output correct JSON score.\n\n**Fixes vs original notebook:**\n- Training data is judge pairs (score output) not generator pairs (email output)\n- Conversations list format for ORPOTrainer (no pre-tokenized ChatML strings)\n- Merges + pushes full model for clean HuggingFace inference\n\nRuntime: T4 GPU \u00b7 ~30-60 min \u00b7 3 epochs \u00b7 188 judge pairs\n\n## Setup\n1. Runtime \u2192 Change runtime type \u2192 T4 GPU\n2. Secrets (key icon left sidebar): `HF_TOKEN`, `OPENROUTER_API_KEY`\n3. Run all cells in order\n"
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": "# Cell 1: Check GPU\nimport subprocess\nresult = subprocess.run(['nvidia-smi'], capture_output=True, text=True)\nprint(result.stdout[:600] if result.returncode == 0 else 'No GPU \u2014 change runtime type to T4')\n",
+      "outputs": [],
+      "execution_count": null
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": "# Cell 2: Install dependencies (pinned)\n!pip install -q 'unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git'\n!pip install -q trl==0.12.2 peft==0.14.0 transformers==4.47.1 datasets==3.2.0\n!pip install -q accelerate==1.2.1 bitsandbytes==0.45.0\nprint('Installation complete')\n",
+      "outputs": [],
+      "execution_count": null
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": "# Cell 3: Auth + clone dataset from HuggingFace\nimport os\nfrom google.colab import userdata\n\nHF_TOKEN = userdata.get('HF_TOKEN')\nos.environ['HF_TOKEN'] = HF_TOKEN\n\n!git clone https://huggingface.co/datasets/rafiakedir/tenacious-bench-v0.1 /content/tb-data\nprint('Dataset cloned')\n",
+      "outputs": [],
+      "execution_count": null
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": "# Cell 4: Load judge pairs (conversations format)\n# judge_pairs.jsonl was built by training/build_judge_pairs.py\n# Each item: {\"chosen\": [{role, content}, ...], \"rejected\": [{role, content}, ...]}\n# The assistant turn in chosen = correct JSON score; in rejected = wrong JSON score\nimport json\nfrom pathlib import Path\n\npairs_path = Path('/content/tb-data/training_data/judge_pairs.jsonl')\npairs = []\nwith open(pairs_path) as f:\n    for line in f:\n        p = json.loads(line)\n        pairs.append({'chosen': p['chosen'], 'rejected': p['rejected']})\n\nprint(f'Loaded {len(pairs)} judge pairs')\nprint('Sample chosen  (correct score):', pairs[0]['chosen'][-1]['content'])\nprint('Sample rejected (wrong score): ', pairs[0]['rejected'][-1]['content'])\n",
+      "outputs": [],
+      "execution_count": null
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": "# Cell 5: Load Qwen3.5-0.8B-Instruct via Unsloth (4-bit quantization)\nfrom unsloth import FastLanguageModel\nimport torch\n\nMAX_SEQ_LENGTH = 1024\n\nmodel, tokenizer = FastLanguageModel.from_pretrained(\n    model_name='unsloth/Qwen3.5-0.8B-Instruct',\n    max_seq_length=MAX_SEQ_LENGTH,\n    dtype=None,        # auto: bf16 on A100, fp16 on T4\n    load_in_4bit=True,\n)\nprint('Base model loaded')\n",
+      "outputs": [],
+      "execution_count": null
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": "# Cell 6: Apply LoRA adapters\nmodel = FastLanguageModel.get_peft_model(\n    model,\n    r=16,\n    target_modules=['q_proj', 'v_proj'],\n    lora_alpha=32,\n    lora_dropout=0.05,\n    bias='none',\n    use_gradient_checkpointing='unsloth',\n    random_state=42,\n)\ntrainable = sum(p.numel() for p in model.parameters() if p.requires_grad)\ntotal     = sum(p.numel() for p in model.parameters())\nprint(f'LoRA applied: {trainable:,} trainable / {total:,} total params ({100*trainable/total:.2f}%)')\n",
+      "outputs": [],
+      "execution_count": null
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": "# Cell 7: Build dataset + configure ORPOTrainer\nimport random, numpy as np\nfrom datasets import Dataset\nfrom trl import ORPOConfig, ORPOTrainer\n\nrandom.seed(42); np.random.seed(42); torch.manual_seed(42)\n\ncap = torch.cuda.get_device_capability()\nuse_fp16 = (cap[0] < 8)   # T4 \u2192 fp16\nuse_bf16 = (cap[0] >= 8)  # A100/H100 \u2192 bf16\nprint(f'GPU capability {cap}: fp16={use_fp16} bf16={use_bf16}')\n\n# ORPOTrainer with conversations format:\n# dataset must have 'chosen' and 'rejected' as lists of role/content dicts.\n# The trainer applies the tokenizer's chat template automatically.\ndataset = Dataset.from_list(pairs)\n\ntraining_args = ORPOConfig(\n    output_dir='/content/tenacious-judge-adapter',\n    learning_rate=8e-6,\n    per_device_train_batch_size=2,\n    gradient_accumulation_steps=4,     # effective batch size = 8\n    num_train_epochs=3,\n    warmup_ratio=0.1,\n    lr_scheduler_type='cosine',\n    beta=0.1,\n    max_length=1024,\n    max_prompt_length=512,\n    logging_steps=5,\n    save_steps=100,\n    seed=42,\n    fp16=use_fp16,\n    bf16=use_bf16,\n    report_to='none',\n    remove_unused_columns=False,\n)\n\ntrainer = ORPOTrainer(\n    model=model,\n    args=training_args,\n    train_dataset=dataset,\n    tokenizer=tokenizer,\n)\n\nsteps_per_epoch = len(dataset) // (\n    training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps\n)\nprint(f'Trainer ready: {len(dataset)} pairs, ~{steps_per_epoch} steps/epoch, {training_args.num_train_epochs} epochs')\n",
+      "outputs": [],
+      "execution_count": null
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": "# Cell 8: Train\nprint('Starting ORPO training \u2014 ~30-60 min on T4...')\ntrain_result = trainer.train()\nprint('Training complete!')\nprint('Metrics:', train_result.metrics)\n",
+      "outputs": [],
+      "execution_count": null
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": "# Cell 9: Plot loss curve\nimport matplotlib.pyplot as plt\n\nlog = trainer.state.log_history\nsteps  = [x['step'] for x in log if 'loss' in x]\nlosses = [x['loss']  for x in log if 'loss' in x]\n\nif steps:\n    plt.figure(figsize=(10, 4))\n    plt.plot(steps, losses, 'b-', linewidth=2)\n    plt.xlabel('Step'); plt.ylabel('ORPO Loss')\n    plt.title('Judge Training Loss \u2014 Tenacious-Bench ORPO (Qwen3.5-0.8B)')\n    plt.grid(True, alpha=0.3)\n    plt.savefig('/content/loss_curve.png', dpi=150, bbox_inches='tight')\n    plt.show()\n    print(f'Loss: {losses[0]:.4f} \u2192 {losses[-1]:.4f}  (change: {losses[0]-losses[-1]:+.4f})')\nelse:\n    print('No loss history available')\n",
+      "outputs": [],
+      "execution_count": null
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": "# Cell 10: Merge LoRA into base weights and push to HuggingFace\n# We push a merged (non-LoRA) model so inference needs only transformers, no PEFT.\nHUB_ID = 'rafiakedir/tenacious-bench-adapter'\n\nprint(f'Merging LoRA weights + pushing to {HUB_ID}...')\nmodel.push_to_hub_merged(\n    HUB_ID,\n    tokenizer,\n    save_method='merged_16bit',\n    token=HF_TOKEN,\n    commit_message='feat: ORPO judge training on 188 judge-format pairs (Qwen3.5-0.8B)',\n)\nprint(f'Done: https://huggingface.co/{HUB_ID}')\n",
+      "outputs": [],
+      "execution_count": null
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": "# Cell 11: Verify HuggingFace repo\nfrom huggingface_hub import HfApi\napi = HfApi(token=HF_TOKEN)\nfiles = list(api.list_repo_files(HUB_ID, repo_type='model'))\nprint(f'Files in {HUB_ID}:')\nfor fpath in sorted(files):\n    print(f'  {fpath}')\n",
+      "outputs": [],
+      "execution_count": null
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": "# Cell 12: Smoke test \u2014 judge scores a known good and bad email as JSON\nimport json, torch\n\nJUDGE_SYSTEM = (\n    'You are a rubric-aware judge for Tenacious Consulting B2B outbound sales emails. '\n    'Given a task context and a candidate email, score the email on the specified rubric '\n    'dimension. Respond with a JSON object only:\\n'\n    '{\"dimension\": \"<dim>\", \"score\": <0.0-1.0>, \"pass\": <true|false>, \"reasoning\": \"<one sentence>\"}'\n)\n\ndef judge(email_text, task_context, dimension):\n    user = (\n        f'EVALUATION DIMENSION: {dimension}\\n\\n'\n        f'TASK CONTEXT:\\n{task_context}\\n\\n'\n        f'CANDIDATE EMAIL:\\n{email_text}\\n\\n'\n        f'Score this email on the {dimension} dimension.'\n    )\n    msgs = [{'role': 'system', 'content': JUDGE_SYSTEM},\n            {'role': 'user',   'content': user}]\n    text   = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)\n    inputs = tokenizer(text, return_tensors='pt').to(model.device)\n    with torch.no_grad():\n        out = model.generate(**inputs, max_new_tokens=120, temperature=0.1, do_sample=True,\n                             pad_token_id=tokenizer.eos_token_id)\n    resp = tokenizer.decode(out[0][inputs.input_ids.shape[1]:], skip_special_tokens=True).strip()\n    try:\n        s, e = resp.find('{'), resp.rfind('}') + 1\n        return json.loads(resp[s:e])\n    except Exception:\n        return {'raw': resp[:200], 'parse_error': True}\n\nctx = 'company: TalentBridge, stage: Series A (3mo ago), open_roles: 8 (high velocity), confidence: high'\n\ngood_email = (\n    'Casey \u2014 TalentBridge currently has 8 open AI/ML roles, 5 added in the last 60 days. '\n    'Your RAG-based matching engine aligns with our bench of 5 ML engineers skilled in LangChain. '\n    'We can deploy within 7-10 days. 30-minute scoping call: calendly.com/tenacious'\n)\nbad_email = (\n    'Hi Casey \u2014 TalentBridge Series A round 3 months ago. 
'\n    'Three companies in your sector are doing X and you are not. '\n    'Would you have 15 minutes to explore whether there is a fit?'\n)\n\nprint('=== GOOD EMAIL (expect score ~0.8-1.0) ===')\nr_good = judge(good_email, ctx, 'signal_grounding_fidelity')\nprint(json.dumps(r_good, indent=2))\n\nprint('\\n=== BAD EMAIL (expect score ~0.1-0.4) ===')\nr_bad = judge(bad_email, ctx, 'signal_grounding_fidelity')\nprint(json.dumps(r_bad, indent=2))\n\nif 'parse_error' not in r_good and 'parse_error' not in r_bad:\n    gap = r_good.get('score', 0) - r_bad.get('score', 0)\n    print(f'\\nScore gap (good - bad): {gap:+.2f}  (positive = judge discriminates correctly)')\n    print('Smoke test: PASSED' if gap > 0 else 'WARNING: judge not discriminating')\n",
+      "outputs": [],
+      "execution_count": null
+    }
+  ]
+}