# ══════════════════════════════════════════════════════════════════════════════
# V4.2 FINAL: Base Model vs GRPO-Tuned Comparison
# ══════════════════════════════════════════════════════════════════════════════
#
# Run AFTER: Cells 1-5 (deps, GPU, config, model load, token verify) + Cell 7 (reward fns)
# Run AFTER: Cell 10 (dataset preparation — loads eval_v2_stratified.jsonl)
#
# This cell evaluates BOTH models on the same 65 stratified eval prompts:
#   1. Base model (no adapter — raw Tucano2-qwen-0.5B-Instruct)
#   2. GRPO-tuned model (best_checkpoint from V4.2 training)
#
# Output: side-by-side comparison table + per-task delta + sample outputs
# ══════════════════════════════════════════════════════════════════════════════

import json

import numpy as np
import torch
from scipy.stats import wilcoxon

COMPARISON_TEMP = 0.1        # near-deterministic sampling for a fair comparison
COMPARISON_MAX_TOKENS = 512

# ══════════════════════════════════════════════════════════════════════════════
# STEP 1: Load eval prompts from stratified eval set
# ══════════════════════════════════════════════════════════════════════════════

eval_v2_stratified_path = DATA_DIR / "pairs" / "eval_v2_stratified.jsonl"
assert eval_v2_stratified_path.exists(), f"Eval set not found: {eval_v2_stratified_path}"

eval_prompts = []
eval_task_types = []
with open(eval_v2_stratified_path) as f:
    for line in f:
        rec = json.loads(line)
        prompt_msgs = rec["prompt_msgs"]
        user_text = " ".join(m["content"] for m in prompt_msgs if m["role"] == "user")
        task = _classify_task_type(user_text)
        # Inject task-specific system prompt
        prepared = inject_task_system_prompt(prompt_msgs, task)
        eval_prompts.append(prepared)
        eval_task_types.append(task)

assert len(eval_prompts) == EVAL_TOTAL, f"Expected {EVAL_TOTAL} eval prompts, got {len(eval_prompts)}"
print(f"✓ Loaded {len(eval_prompts)} eval prompts")
print(f"  Distribution: {dict(zip(*np.unique(eval_task_types, return_counts=True)))}")
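# ── Optional: seed RNGs before generation (added sketch) ──────────────────────
# Generation below uses do_sample=True, so even at temperature 0.1 repeated
# runs can differ slightly. A minimal reproducibility sketch; it assumes
# CURRENT_SEED is the seed set in the earlier config cell (the same name is
# reused in STEP 7 when results are saved).
import random

random.seed(CURRENT_SEED)
np.random.seed(CURRENT_SEED)
torch.manual_seed(CURRENT_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(CURRENT_SEED)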
# ══════════════════════════════════════════════════════════════════════════════
# STEP 2: Helper — generate completions for all eval prompts
# ══════════════════════════════════════════════════════════════════════════════

def generate_eval_completions(model_obj, label="model"):
    """Generate completions for all eval prompts; return (completions, rewards)."""
    FastLanguageModel.for_inference(model_obj)
    completions = []
    rewards = []
    for i, (msgs, task) in enumerate(zip(eval_prompts, eval_task_types)):
        text = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
        inputs = tokenizer(text, return_tensors="pt").to(model_obj.device)
        with torch.no_grad():
            out = model_obj.generate(
                **inputs,
                max_new_tokens=COMPARISON_MAX_TOKENS,
                temperature=COMPARISON_TEMP,
                do_sample=True,
                repetition_penalty=1.0,
            )
        resp = tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
        completions.append(resp)
        # Score with the raw reward function from Cell 7
        r = commerce_reward_fn_raw([resp], [text])[0]
        rewards.append(r)
        if (i + 1) % 20 == 0:
            print(f"  [{label}] {i+1}/{len(eval_prompts)} done...")
    return completions, rewards

# ══════════════════════════════════════════════════════════════════════════════
# STEP 3: Evaluate BASE model (disable adapter)
# ══════════════════════════════════════════════════════════════════════════════

print("\n" + "=" * 70)
print("EVALUATING BASE MODEL (no adapter)")
print("=" * 70)

# Disable the LoRA adapter layers to recover base-model behavior
model.disable_adapter_layers()
base_completions, base_rewards = generate_eval_completions(model, label="base")
model.enable_adapter_layers()
print(f"  ✓ Base model: {len(base_rewards)} completions, mean reward = {np.mean(base_rewards):.3f}")
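# ── Optional sanity check: does the adapter toggle change anything? (added sketch)
# A minimal sketch, not part of the original pipeline: greedily decode one
# probe prompt with the adapter enabled and then disabled. Identical token
# ids on both passes would suggest the LoRA layers are degenerate or the
# toggle is a no-op, which would undermine the base-vs-tuned comparison.
_probe = tokenizer.apply_chat_template(eval_prompts[0], tokenize=False, add_generation_prompt=True)
_inputs = tokenizer(_probe, return_tensors="pt").to(model.device)
with torch.no_grad():
    _with_adapter = model.generate(**_inputs, max_new_tokens=32, do_sample=False)
model.disable_adapter_layers()
with torch.no_grad():
    _without_adapter = model.generate(**_inputs, max_new_tokens=32, do_sample=False)
model.enable_adapter_layers()
if torch.equal(_with_adapter, _without_adapter):
    print("⚠ Adapter toggle produced identical outputs: check that LoRA is active")
else:
    print("✓ Adapter toggle changes outputs as expected")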
# ══════════════════════════════════════════════════════════════════════════════
# STEP 4: Evaluate TUNED model (load best checkpoint adapter)
# ══════════════════════════════════════════════════════════════════════════════

print("\n" + "=" * 70)
print("EVALUATING TUNED MODEL (best_checkpoint, step 1100)")
print("=" * 70)

# Load the adapter weights from the best checkpoint
from peft import set_peft_model_state_dict
import safetensors.torch

best_ckpt_path = ADAPTER_DIR / "best_checkpoint"
assert best_ckpt_path.exists(), f"Best checkpoint not found: {best_ckpt_path}"

adapter_weights = safetensors.torch.load_file(str(best_ckpt_path / "adapter_model.safetensors"))
set_peft_model_state_dict(model, adapter_weights)
print(f"  ✓ Loaded adapter from {best_ckpt_path}")

tuned_completions, tuned_rewards = generate_eval_completions(model, label="tuned")
print(f"  ✓ Tuned model: {len(tuned_rewards)} completions, mean reward = {np.mean(tuned_rewards):.3f}")

# ══════════════════════════════════════════════════════════════════════════════
# STEP 5: Comparison analysis
# ══════════════════════════════════════════════════════════════════════════════

print("\n" + "=" * 70)
print("V4.2 FINAL COMPARISON: BASE vs GRPO-TUNED")
print("=" * 70)

# Per-task breakdown
tasks_unique = ["extraction", "sql_qa", "insights", "push"]
print(f"\n{'Task':<14s} {'Base':>8s} {'Tuned':>8s} {'Δ':>8s} {'Δ%':>8s} {'N':>4s}")
print("─" * 52)

task_results = {}
for task in tasks_unique:
    indices = [i for i, t in enumerate(eval_task_types) if t == task]
    base_task = [base_rewards[i] for i in indices]
    tuned_task = [tuned_rewards[i] for i in indices]
    base_mean = np.mean(base_task)
    tuned_mean = np.mean(tuned_task)
    delta = tuned_mean - base_mean
    delta_pct = (delta / base_mean * 100) if base_mean > 0 else float("inf")
    task_results[task] = {
        "base": base_mean,
        "tuned": tuned_mean,
        "delta": delta,
        "delta_pct": delta_pct,
        "n": len(indices),
        "base_scores": base_task,
        "tuned_scores": tuned_task,
    }
    arrow = "↑" if delta > 0.01 else ("↓" if delta < -0.01 else "→")
    print(f"{task:<14s} {base_mean:>8.3f} {tuned_mean:>8.3f} {delta:>+8.3f} {delta_pct:>+7.1f}% {len(indices):>4d} {arrow}")

# Overall
base_overall = np.mean(base_rewards)
tuned_overall = np.mean(tuned_rewards)
delta_overall = tuned_overall - base_overall
delta_pct_overall = (delta_overall / base_overall * 100) if base_overall > 0 else float("inf")
print("─" * 52)
print(f"{'OVERALL':<14s} {base_overall:>8.3f} {tuned_overall:>8.3f} {delta_overall:>+8.3f} {delta_pct_overall:>+7.1f}% {len(base_rewards):>4d}")

# Statistical significance (Wilcoxon signed-rank test — paired samples)
print("\n" + "─" * 52)
print("Statistical Significance (Wilcoxon signed-rank, paired)")
print("─" * 52)
try:
    stat, p_val = wilcoxon(tuned_rewards, base_rewards, alternative="greater")
    sig = "✅ YES (p < 0.05)" if p_val < 0.05 else "❌ NO (p ≥ 0.05)"
    print(f"  Overall: W={stat:.0f}, p={p_val:.4f} → {sig}")
except Exception as e:
    print(f"  Overall: Could not compute ({e})")

for task in tasks_unique:
    tr = task_results[task]
    try:
        # The Wilcoxon test is undefined when every paired difference is
        # zero, so skip it in that case
        diffs = [t - b for t, b in zip(tr["tuned_scores"], tr["base_scores"])]
        if all(d == 0 for d in diffs):
            print(f"  {task}: all differences = 0 (identical outputs)")
        else:
            stat, p_val = wilcoxon(tr["tuned_scores"], tr["base_scores"], alternative="greater")
            sig = "p < 0.05 ✅" if p_val < 0.05 else f"p = {p_val:.3f}"
            print(f"  {task}: W={stat:.0f}, {sig}")
    except Exception as e:
        print(f"  {task}: insufficient data ({e})")
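# ── Optional: bootstrap CI for the overall reward delta (added sketch) ────────
# The Wilcoxon test above yields a p-value but no effect-size interval. A
# minimal paired-bootstrap sketch over the 65 per-sample deltas; assumes only
# numpy (already imported) and reuses CURRENT_SEED for reproducibility.
_deltas = np.asarray(tuned_rewards) - np.asarray(base_rewards)
_rng = np.random.default_rng(CURRENT_SEED)
_boot_means = np.array([
    _rng.choice(_deltas, size=len(_deltas), replace=True).mean()
    for _ in range(10_000)
])
_ci_lo, _ci_hi = np.percentile(_boot_means, [2.5, 97.5])
print(f"  Mean Δ = {_deltas.mean():+.3f}, 95% bootstrap CI [{_ci_lo:+.3f}, {_ci_hi:+.3f}]")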
# ══════════════════════════════════════════════════════════════════════════════
# STEP 6: Sample outputs — show 2 examples per task (base vs tuned)
# ══════════════════════════════════════════════════════════════════════════════

print(f"\n\n{'=' * 70}")
print("SAMPLE OUTPUTS — Base vs Tuned (2 per task)")
print("=" * 70)

for task in tasks_unique:
    indices = [i for i, t in enumerate(eval_task_types) if t == task]
    # Rank this task's samples by reward delta, largest improvement first
    deltas = [(tuned_rewards[i] - base_rewards[i], i) for i in indices]
    deltas.sort(reverse=True)
    # Show the best improvement plus the worst regression (or the 2nd-best
    # sample if nothing regressed); skip the 2nd pick for single-sample tasks
    show_indices = [deltas[0][1]]
    if len(deltas) > 1:
        if deltas[-1][0] < 0:
            show_indices.append(deltas[-1][1])  # worst regression
        else:
            show_indices.append(deltas[1][1])   # 2nd-best sample
    print(f"\n{'─' * 70}")
    print(f"  TASK: {task.upper()}")
    print("─" * 70)
    for idx in show_indices:
        b_r = base_rewards[idx]
        t_r = tuned_rewards[idx]
        delta = t_r - b_r
        arrow = "↑" if delta > 0.01 else ("↓" if delta < -0.01 else "→")
        # Truncate long outputs for readability
        base_out = strip_think(base_completions[idx])[:300]
        tuned_out = strip_think(tuned_completions[idx])[:300]
        print(f"\n  Sample {idx+1}: base={b_r:.3f} → tuned={t_r:.3f} ({delta:+.3f} {arrow})")
        print(f"    BASE:  {base_out}")
        print(f"    TUNED: {tuned_out}")

# ══════════════════════════════════════════════════════════════════════════════
# STEP 7: Summary and conclusion
# ══════════════════════════════════════════════════════════════════════════════

def _assess(delta):
    """One-word verdict for the summary table."""
    if delta > 0.01:
        return "Improved"
    if delta < -0.01:
        return "Regressed"
    return "Flat"

summary_rows = "\n".join(
    f"│ {task:<11s} │ {tr['base']:>8.3f} │ {tr['tuned']:>8.3f} │ {tr['delta']:>+8.3f} │ {_assess(tr['delta']):<21s} │"
    for task, tr in task_results.items()
)

print(f"\n\n{'═' * 70}")
print("V4.2 EXPERIMENT CONCLUSION")
print(f"{'═' * 70}")
print(f"""
Model:    Polygl0t/Tucano2-qwen-0.5B-Instruct
Method:   GRPO + LoRA (r=16, α=32) + GDPO normalization + Dynamic IWU
Training: 1,500 steps (best @ step 1100), LR=5e-6, β=0, G=16, τ=1.0
Hardware: 1× L4 (24GB), ~22h runtime
Data:     1,480 prompts (4 tasks: extraction, sql_qa, insights, push)
Eval:     65 stratified samples (20 + 15 + 15 + 15)

┌──────────────────────────────────────────────────────────────────────┐
│                           RESULTS SUMMARY                            │
├─────────────┬──────────┬──────────┬──────────┬───────────────────────┤
│ Task        │     Base │    Tuned │        Δ │ Assessment            │
├─────────────┼──────────┼──────────┼──────────┼───────────────────────┤
{summary_rows}
├─────────────┼──────────┼──────────┼──────────┼───────────────────────┤
│ OVERALL     │ {base_overall:>8.3f} │ {tuned_overall:>8.3f} │ {delta_overall:>+8.3f} │ {delta_pct_overall:>+7.1f}%              │
└─────────────┴──────────┴──────────┴──────────┴───────────────────────┘
""")

# Save results
comparison_results = {
    "experiment": "V4.2 Base vs GRPO-Tuned Comparison",
    "model_id": MODEL_ID,
    "adapter_path": str(best_ckpt_path),
    "best_step": 1100,
    "eval_samples": EVAL_TOTAL,
    "temperature": COMPARISON_TEMP,
    "seed": CURRENT_SEED,
    "results": {
        "overall": {"base": float(base_overall), "tuned": float(tuned_overall), "delta": float(delta_overall)},
        **{task: {"base": float(tr["base"]), "tuned": float(tr["tuned"]),
                  "delta": float(tr["delta"]), "n": tr["n"]}
           for task, tr in task_results.items()},
    },
    "per_sample": [
        {"task": eval_task_types[i], "base_reward": float(base_rewards[i]), "tuned_reward": float(tuned_rewards[i])}
        for i in range(len(base_rewards))
    ],
}

results_path = ADAPTER_DIR / "comparison_base_vs_tuned.json"
with open(results_path, "w") as f:
    json.dump(comparison_results, f, indent=2, ensure_ascii=False)
print(f"✓ Results saved to {results_path}")
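# ── Optional: round-trip check of the saved results file (added sketch) ───────
# A minimal sketch: reload the JSON just written and re-derive the overall
# delta from the per-sample records, guarding against serialization drift.
# Assumes nothing beyond the file written above.
with open(results_path) as f:
    _loaded = json.load(f)
_recomputed_delta = float(np.mean([
    s["tuned_reward"] - s["base_reward"] for s in _loaded["per_sample"]
]))
assert abs(_recomputed_delta - _loaded["results"]["overall"]["delta"]) < 1e-6, \
    "per-sample records disagree with the stored overall delta"
print(f"✓ Round-trip check passed: overall Δ = {_recomputed_delta:+.3f}")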