rtferraz committed
Commit 0c9199c · verified · 1 Parent(s): b1be31c

add: base vs tuned comparison cell for V4.2 final evaluation

notebooks/cell_comparison_base_vs_tuned.py ADDED
# ══════════════════════════════════════════════════════════════════════════════
# V4.2 FINAL: Base Model vs GRPO-Tuned Comparison
# ══════════════════════════════════════════════════════════════════════════════
#
# Run AFTER: Cells 1-5 (deps, GPU, config, model load, token verify) + Cell 7 (reward fns)
# Run AFTER: Cell 10 (dataset preparation — loads eval_v2_stratified.jsonl)
#
# This cell evaluates BOTH models on the same 65 stratified eval prompts:
#   1. Base model (no adapter — raw Tucano2-qwen-0.5B-Instruct)
#   2. GRPO-tuned model (best_checkpoint from V4.2 training)
#
# Output: side-by-side comparison table + per-task delta + sample outputs
# ══════════════════════════════════════════════════════════════════════════════

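# NOTE (editorial): this cell is not self-contained. Besides the imports below,
# it assumes these names are already in scope from the cells listed above
# (the cell-to-name mapping is our reading of the header, not verified):
#   model, tokenizer, FastLanguageModel                         <- model load
#   DATA_DIR, ADAPTER_DIR, MODEL_ID, EVAL_TOTAL, CURRENT_SEED   <- config
#   commerce_reward_fn_raw                                      <- Cell 7 (reward fns)
#   _classify_task_type, inject_task_system_prompt, strip_think <- dataset prep / helpers
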
import json   # used below for the eval set and results file; usually already imported
import torch  # used below for no_grad(); usually already imported by earlier cells
import numpy as np
from scipy.stats import wilcoxon

COMPARISON_TEMP = 0.1        # near-deterministic decoding for a fair comparison
COMPARISON_MAX_TOKENS = 512

# ══════════════════════════════════════════════════════════════════════════════
# STEP 1: Load eval prompts from stratified eval set
# ══════════════════════════════════════════════════════════════════════════════

eval_v2_stratified_path = DATA_DIR / "pairs" / "eval_v2_stratified.jsonl"
assert eval_v2_stratified_path.exists(), f"Eval set not found: {eval_v2_stratified_path}"

eval_prompts = []
eval_task_types = []
with open(eval_v2_stratified_path) as f:
    for line in f:
        rec = json.loads(line)
        prompt_msgs = rec["prompt_msgs"]
        user_text = " ".join(m["content"] for m in prompt_msgs if m["role"] == "user")
        task = _classify_task_type(user_text)
        # Inject task-specific system prompt
        prepared = inject_task_system_prompt(prompt_msgs, task)
        eval_prompts.append(prepared)
        eval_task_types.append(task)

assert len(eval_prompts) == EVAL_TOTAL, f"Expected {EVAL_TOTAL} eval prompts, got {len(eval_prompts)}"
print(f"✓ Loaded {len(eval_prompts)} eval prompts")
print(f"  Distribution: {dict(zip(*np.unique(eval_task_types, return_counts=True)))}")

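# For reference, each eval record is expected to provide at least a
# "prompt_msgs" list of chat messages, e.g. (shape only; the values are
# illustrative, not taken from the dataset):
#   {"prompt_msgs": [{"role": "user", "content": "..."}]}
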
# ══════════════════════════════════════════════════════════════════════════════
# STEP 2: Helper — generate completions for all eval prompts
# ══════════════════════════════════════════════════════════════════════════════

def generate_eval_completions(model_obj, label="model"):
    """Generate completions for all eval prompts, return texts + rewards."""
    FastLanguageModel.for_inference(model_obj)
    completions = []
    rewards = []

    for i, (msgs, task) in enumerate(zip(eval_prompts, eval_task_types)):
        text = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
        inputs = tokenizer(text, return_tensors="pt").to(model_obj.device)

        with torch.no_grad():
            out = model_obj.generate(
                **inputs,
                max_new_tokens=COMPARISON_MAX_TOKENS,
                temperature=COMPARISON_TEMP,
                do_sample=True,
                repetition_penalty=1.0,
            )
        resp = tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
        completions.append(resp)

        # Score with raw reward function
        r = commerce_reward_fn_raw([resp], [text])[0]
        rewards.append(r)

        if (i + 1) % 20 == 0:
            print(f"  [{label}] {i+1}/{len(eval_prompts)} done...")

    return completions, rewards

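# Design note (editorial): temperature 0.1 with do_sample=True is near-greedy
# but not exactly reproducible run-to-run. For a strictly deterministic
# comparison, do_sample=False (greedy decoding) is the stronger choice; the
# trade-off is a higher chance of degenerate repetition on small models.
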
# ══════════════════════════════════════════════════════════════════════════════
# STEP 3: Evaluate BASE model (disable adapter)
# ══════════════════════════════════════════════════════════════════════════════

print("\n" + "=" * 70)
print("EVALUATING BASE MODEL (no adapter)")
print("=" * 70)

# Disable the LoRA adapter layers so generation reflects the frozen base model;
# PEFT makes the adapters pass-through here rather than unloading them
model.disable_adapter_layers()
base_completions, base_rewards = generate_eval_completions(model, label="base")
model.enable_adapter_layers()

print(f"  ✓ Base model: {len(base_rewards)} completions, mean reward = {np.mean(base_rewards):.3f}")

# ══════════════════════════════════════════════════════════════════════════════
# STEP 4: Evaluate TUNED model (load best checkpoint adapter)
# ══════════════════════════════════════════════════════════════════════════════

print("\n" + "=" * 70)
print("EVALUATING TUNED MODEL (best_checkpoint, step 1100)")
print("=" * 70)

# Load the best checkpoint adapter
best_ckpt_path = ADAPTER_DIR / "best_checkpoint"
assert best_ckpt_path.exists(), f"Best checkpoint not found: {best_ckpt_path}"

# Load adapter weights from the best checkpoint and write them into the live
# PEFT model in place (the adapter config must match the checkpoint)
from peft import set_peft_model_state_dict
import safetensors.torch

adapter_weights = safetensors.torch.load_file(str(best_ckpt_path / "adapter_model.safetensors"))
set_peft_model_state_dict(model, adapter_weights)
print(f"  ✓ Loaded adapter from {best_ckpt_path}")

tuned_completions, tuned_rewards = generate_eval_completions(model, label="tuned")
print(f"  ✓ Tuned model: {len(tuned_rewards)} completions, mean reward = {np.mean(tuned_rewards):.3f}")

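# Editorial alternative (assuming PEFT's standard API, not verified here):
# model.load_adapter(str(best_ckpt_path), adapter_name="best") would register
# the checkpoint as a separate named adapter instead of overwriting the active
# adapter's weights in place as above.
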
# ══════════════════════════════════════════════════════════════════════════════
# STEP 5: Comparison analysis
# ══════════════════════════════════════════════════════════════════════════════

print("\n" + "=" * 70)
print("V4.2 FINAL COMPARISON: BASE vs GRPO-TUNED")
print("=" * 70)

# Per-task breakdown
tasks_unique = ["extraction", "sql_qa", "insights", "push"]

print(f"\n{'Task':<14s} {'Base':>8s} {'Tuned':>8s} {'Δ':>8s} {'Δ%':>8s} {'N':>4s}")
print(f"{'─' * 52}")

task_results = {}
for task in tasks_unique:
    indices = [i for i, t in enumerate(eval_task_types) if t == task]
    base_task = [base_rewards[i] for i in indices]
    tuned_task = [tuned_rewards[i] for i in indices]

    base_mean = np.mean(base_task)
    tuned_mean = np.mean(tuned_task)
    delta = tuned_mean - base_mean
    delta_pct = (delta / base_mean * 100) if base_mean > 0 else float('inf')

    task_results[task] = {
        "base": base_mean, "tuned": tuned_mean,
        "delta": delta, "delta_pct": delta_pct,
        "n": len(indices),
        "base_scores": base_task, "tuned_scores": tuned_task,
    }

    arrow = "↑" if delta > 0.01 else ("↓" if delta < -0.01 else "→")
    print(f"{task:<14s} {base_mean:>8.3f} {tuned_mean:>8.3f} {delta:>+8.3f} {delta_pct:>+7.1f}% {len(indices):>4d} {arrow}")

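# Editorial caveat: delta_pct falls back to inf when base_mean <= 0. If the
# reward function can return negative values, the Δ% column is not meaningful
# for those tasks; read the absolute Δ column instead.
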
# Overall
base_overall = np.mean(base_rewards)
tuned_overall = np.mean(tuned_rewards)
delta_overall = tuned_overall - base_overall
delta_pct_overall = (delta_overall / base_overall * 100) if base_overall > 0 else float('inf')

print(f"{'─' * 52}")
print(f"{'OVERALL':<14s} {base_overall:>8.3f} {tuned_overall:>8.3f} {delta_overall:>+8.3f} {delta_pct_overall:>+7.1f}% {len(base_rewards):>4d}")

# Statistical significance (Wilcoxon signed-rank test — paired samples)
print(f"\n{'─' * 52}")
print("Statistical Significance (Wilcoxon signed-rank, paired)")
print(f"{'─' * 52}")

try:
    stat, p_val = wilcoxon(tuned_rewards, base_rewards, alternative='greater')
    sig = "✅ YES (p < 0.05)" if p_val < 0.05 else "❌ NO (p ≥ 0.05)"
    print(f"  Overall: W={stat:.0f}, p={p_val:.4f} → {sig}")
except Exception as e:
    print(f"  Overall: Could not compute ({e})")

for task in tasks_unique:
    tr = task_results[task]
    try:
        # wilcoxon() raises when every paired difference is zero, so skip that case
        diffs = [t - b for t, b in zip(tr["tuned_scores"], tr["base_scores"])]
        if all(d == 0 for d in diffs):
            print(f"  {task}: all differences = 0 (identical outputs)")
        else:
            stat, p_val = wilcoxon(tr["tuned_scores"], tr["base_scores"], alternative='greater')
            sig = "p < 0.05 ✅" if p_val < 0.05 else f"p = {p_val:.3f}"
            print(f"  {task}: W={stat:.0f}, {sig}")
    except Exception as e:
        print(f"  {task}: insufficient data ({e})")

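# Editorial note on power: with n non-zero paired differences, the most extreme
# one-sided Wilcoxon outcome has probability 1/2**n, so p < 0.05 is only
# attainable once n >= 5 (1/2**5 = 0.03125). A minimal check:
#   wilcoxon([0.1] * 5, [0.0] * 5, alternative='greater')  # p = 0.03125
# The 15-20 eval samples per task here leave room to detect consistent gains.
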
# ══════════════════════════════════════════════════════════════════════════════
# STEP 6: Sample outputs — show 2 examples per task (base vs tuned)
# ══════════════════════════════════════════════════════════════════════════════

print(f"\n\n{'=' * 70}")
print("SAMPLE OUTPUTS — Base vs Tuned (2 per task)")
print("=" * 70)

for task in tasks_unique:
    indices = [i for i, t in enumerate(eval_task_types) if t == task]
    # Pick the sample with largest positive delta and one with largest negative
    deltas = [(tuned_rewards[i] - base_rewards[i], i) for i in indices]
    deltas.sort(reverse=True)

    # Show best improvement and worst regression (or 2nd best if no regression)
    show_indices = [deltas[0][1]]              # best improvement
    if deltas[-1][0] < 0:
        show_indices.append(deltas[-1][1])     # worst regression
    else:
        show_indices.append(deltas[min(1, len(deltas) - 1)][1])  # 2nd sample

    print(f"\n{'─' * 70}")
    print(f"  TASK: {task.upper()}")
    print(f"{'─' * 70}")

    for idx in show_indices:
        b_r = base_rewards[idx]
        t_r = tuned_rewards[idx]
        delta = t_r - b_r
        arrow = "↑" if delta > 0.01 else ("↓" if delta < -0.01 else "→")

        # Truncate long outputs for readability
        base_out = strip_think(base_completions[idx])[:300]
        tuned_out = strip_think(tuned_completions[idx])[:300]

        print(f"\n  Sample {idx+1}: base={b_r:.3f} → tuned={t_r:.3f} ({delta:+.3f} {arrow})")
        print(f"  BASE:  {base_out}")
        print(f"  TUNED: {tuned_out}")

# ══════════════════════════════════════════════════════════════════════════════
# STEP 7: Summary and conclusion
# ══════════════════════════════════════════════════════════════════════════════

print(f"\n\n{'═' * 70}")
print("V4.2 EXPERIMENT CONCLUSION")
print(f"{'═' * 70}")
print(f"""
Model:    Polygl0t/Tucano2-qwen-0.5B-Instruct
Method:   GRPO + LoRA (r=16, α=32) + GDPO normalization + Dynamic IWU
Training: 1,500 steps (best @ step 1100), LR=5e-6, β=0, G=16, τ=1.0
Hardware: 1× L4 (24GB), ~22h runtime
Data:     1,480 prompts (4 tasks: extraction, sql_qa, insights, push)
Eval:     65 stratified samples (20 + 15 + 15 + 15)

┌───────────────────────────────────────────────────────────────────────┐
│                            RESULTS SUMMARY                            │
├─────────────┬──────────┬──────────┬──────────┬────────────────────────┤
│ Task        │     Base │    Tuned │        Δ │ Assessment             │
├─────────────┼──────────┼──────────┼──────────┼────────────────────────┤
│ extraction  │ {task_results['extraction']['base']:8.3f} │ {task_results['extraction']['tuned']:8.3f} │ {task_results['extraction']['delta']:+8.3f} │ {('Improved' if task_results['extraction']['delta'] > 0.01 else 'Flat' if abs(task_results['extraction']['delta']) <= 0.01 else 'Regressed'):<22s} │
│ sql_qa      │ {task_results['sql_qa']['base']:8.3f} │ {task_results['sql_qa']['tuned']:8.3f} │ {task_results['sql_qa']['delta']:+8.3f} │ {('Improved' if task_results['sql_qa']['delta'] > 0.01 else 'Flat' if abs(task_results['sql_qa']['delta']) <= 0.01 else 'Regressed'):<22s} │
│ insights    │ {task_results['insights']['base']:8.3f} │ {task_results['insights']['tuned']:8.3f} │ {task_results['insights']['delta']:+8.3f} │ {('Improved' if task_results['insights']['delta'] > 0.01 else 'Flat' if abs(task_results['insights']['delta']) <= 0.01 else 'Regressed'):<22s} │
│ push        │ {task_results['push']['base']:8.3f} │ {task_results['push']['tuned']:8.3f} │ {task_results['push']['delta']:+8.3f} │ {('Improved' if task_results['push']['delta'] > 0.01 else 'Flat' if abs(task_results['push']['delta']) <= 0.01 else 'Regressed'):<22s} │
├─────────────┼──────────┼──────────┼──────────┼────────────────────────┤
│ OVERALL     │ {base_overall:8.3f} │ {tuned_overall:8.3f} │ {delta_overall:+8.3f} │ {delta_pct_overall:+21.1f}% │
└─────────────┴──────────┴──────────┴──────────┴────────────────────────┘
""")

# Save results
comparison_results = {
    "experiment": "V4.2 Base vs GRPO-Tuned Comparison",
    "model_id": MODEL_ID,
    "adapter_path": str(best_ckpt_path),
    "best_step": 1100,
    "eval_samples": EVAL_TOTAL,
    "temperature": COMPARISON_TEMP,
    "seed": CURRENT_SEED,
    "results": {
        "overall": {"base": float(base_overall), "tuned": float(tuned_overall), "delta": float(delta_overall)},
        **{task: {"base": float(tr["base"]), "tuned": float(tr["tuned"]), "delta": float(tr["delta"]), "n": tr["n"]}
           for task, tr in task_results.items()},
    },
    "per_sample": [
        {"task": eval_task_types[i], "base_reward": float(base_rewards[i]), "tuned_reward": float(tuned_rewards[i])}
        for i in range(len(base_rewards))
    ],
}

results_path = ADAPTER_DIR / "comparison_base_vs_tuned.json"
with open(results_path, "w") as f:
    json.dump(comparison_results, f, indent=2, ensure_ascii=False)
print(f"✓ Results saved to {results_path}")