rtferraz committed
Commit 0c9199c · verified · 1 Parent(s): b1be31c

add: base vs tuned comparison cell for V4.2 final evaluation

notebooks/cell_comparison_base_vs_tuned.py ADDED
# ══════════════════════════════════════════════════════════════════════════════
# V4.2 FINAL: Base Model vs GRPO-Tuned Comparison
# ══════════════════════════════════════════════════════════════════════════════
#
# Run AFTER: Cells 1-5 (deps, GPU, config, model load, token verify) + Cell 7 (reward fns)
# Run AFTER: Cell 10 (dataset preparation — loads eval_v2_stratified.jsonl)
#
# This cell evaluates BOTH models on the same 65 stratified eval prompts:
#   1. Base model (no adapter — raw Tucano2-qwen-0.5B-Instruct)
#   2. GRPO-tuned model (best_checkpoint from V4.2 training)
#
# Output: side-by-side comparison table + per-task delta + sample outputs
# ══════════════════════════════════════════════════════════════════════════════

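# NOTE (editorial): this cell is not self-contained. Besides the imports below,
# it assumes these names are already in scope from the cells listed above
# (the cell-to-name mapping is our reading of the header, not verified):
#   model, tokenizer, FastLanguageModel                         <- model load
#   DATA_DIR, ADAPTER_DIR, MODEL_ID, EVAL_TOTAL, CURRENT_SEED   <- config
#   commerce_reward_fn_raw                                      <- Cell 7 (reward fns)
#   _classify_task_type, inject_task_system_prompt, strip_think <- dataset prep / helpers
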
import json   # used below for the eval set and results file; usually already imported
import torch  # used below for no_grad(); usually already imported by earlier cells
import numpy as np
from scipy.stats import wilcoxon

COMPARISON_TEMP = 0.1        # near-deterministic decoding for a fair comparison
COMPARISON_MAX_TOKENS = 512

# ══════════════════════════════════════════════════════════════════════════════
# STEP 1: Load eval prompts from stratified eval set
# ══════════════════════════════════════════════════════════════════════════════

eval_v2_stratified_path = DATA_DIR / "pairs" / "eval_v2_stratified.jsonl"
assert eval_v2_stratified_path.exists(), f"Eval set not found: {eval_v2_stratified_path}"

eval_prompts = []
eval_task_types = []
with open(eval_v2_stratified_path) as f:
    for line in f:
        rec = json.loads(line)
        prompt_msgs = rec["prompt_msgs"]
        user_text = " ".join(m["content"] for m in prompt_msgs if m["role"] == "user")
        task = _classify_task_type(user_text)
        # Inject task-specific system prompt
        prepared = inject_task_system_prompt(prompt_msgs, task)
        eval_prompts.append(prepared)
        eval_task_types.append(task)

assert len(eval_prompts) == EVAL_TOTAL, f"Expected {EVAL_TOTAL} eval prompts, got {len(eval_prompts)}"
print(f"✓ Loaded {len(eval_prompts)} eval prompts")
print(f"  Distribution: {dict(zip(*np.unique(eval_task_types, return_counts=True)))}")

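# For reference, each eval record is expected to provide at least a
# "prompt_msgs" list of chat messages, e.g. (shape only; the values are
# illustrative, not taken from the dataset):
#   {"prompt_msgs": [{"role": "user", "content": "..."}]}
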
# ══════════════════════════════════════════════════════════════════════════════
# STEP 2: Helper — generate completions for all eval prompts
# ══════════════════════════════════════════════════════════════════════════════

def generate_eval_completions(model_obj, label="model"):
    """Generate completions for all eval prompts, return texts + rewards."""
    FastLanguageModel.for_inference(model_obj)
    completions = []
    rewards = []

    for i, (msgs, task) in enumerate(zip(eval_prompts, eval_task_types)):
        text = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
        inputs = tokenizer(text, return_tensors="pt").to(model_obj.device)

        with torch.no_grad():
            out = model_obj.generate(
                **inputs,
                max_new_tokens=COMPARISON_MAX_TOKENS,
                temperature=COMPARISON_TEMP,
                do_sample=True,
                repetition_penalty=1.0,
            )
        resp = tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
        completions.append(resp)

        # Score with raw reward function
        r = commerce_reward_fn_raw([resp], [text])[0]
        rewards.append(r)

        if (i + 1) % 20 == 0:
            print(f"  [{label}] {i+1}/{len(eval_prompts)} done...")

    return completions, rewards

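# Design note (editorial): temperature 0.1 with do_sample=True is near-greedy
# but not exactly reproducible run-to-run. For a strictly deterministic
# comparison, do_sample=False (greedy decoding) is the stronger choice; the
# trade-off is a higher chance of degenerate repetition on small models.
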
# ══════════════════════════════════════════════════════════════════════════════
# STEP 3: Evaluate BASE model (disable adapter)
# ══════════════════════════════════════════════════════════════════════════════

print("\n" + "=" * 70)
print("EVALUATING BASE MODEL (no adapter)")
print("=" * 70)

# Disable the LoRA adapter layers so generation reflects the frozen base model;
# PEFT makes the adapters pass-through here rather than unloading them
model.disable_adapter_layers()
base_completions, base_rewards = generate_eval_completions(model, label="base")
model.enable_adapter_layers()

print(f"  ✓ Base model: {len(base_rewards)} completions, mean reward = {np.mean(base_rewards):.3f}")

# ══════════════════════════════════════════════════════════════════════════════
# STEP 4: Evaluate TUNED model (load best checkpoint adapter)
# ══════════════════════════════════════════════════════════════════════════════

print("\n" + "=" * 70)
print("EVALUATING TUNED MODEL (best_checkpoint, step 1100)")
print("=" * 70)

# Load the best checkpoint adapter
best_ckpt_path = ADAPTER_DIR / "best_checkpoint"
assert best_ckpt_path.exists(), f"Best checkpoint not found: {best_ckpt_path}"

# Load adapter weights from the best checkpoint and write them into the live
# PEFT model in place (the adapter config must match the checkpoint)
from peft import set_peft_model_state_dict
import safetensors.torch

adapter_weights = safetensors.torch.load_file(str(best_ckpt_path / "adapter_model.safetensors"))
set_peft_model_state_dict(model, adapter_weights)
print(f"  ✓ Loaded adapter from {best_ckpt_path}")

tuned_completions, tuned_rewards = generate_eval_completions(model, label="tuned")
print(f"  ✓ Tuned model: {len(tuned_rewards)} completions, mean reward = {np.mean(tuned_rewards):.3f}")

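# Editorial alternative (assuming PEFT's standard API, not verified here):
# model.load_adapter(str(best_ckpt_path), adapter_name="best") would register
# the checkpoint as a separate named adapter instead of overwriting the active
# adapter's weights in place as above.
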
# ══════════════════════════════════════════════════════════════════════════════
# STEP 5: Comparison analysis
# ══════════════════════════════════════════════════════════════════════════════

print("\n" + "=" * 70)
print("V4.2 FINAL COMPARISON: BASE vs GRPO-TUNED")
print("=" * 70)

# Per-task breakdown
tasks_unique = ["extraction", "sql_qa", "insights", "push"]

print(f"\n{'Task':<14s} {'Base':>8s} {'Tuned':>8s} {'Δ':>8s} {'Δ%':>8s} {'N':>4s}")
print(f"{'─' * 52}")

task_results = {}
for task in tasks_unique:
    indices = [i for i, t in enumerate(eval_task_types) if t == task]
    base_task = [base_rewards[i] for i in indices]
    tuned_task = [tuned_rewards[i] for i in indices]

    base_mean = np.mean(base_task)
    tuned_mean = np.mean(tuned_task)
    delta = tuned_mean - base_mean
    delta_pct = (delta / base_mean * 100) if base_mean > 0 else float('inf')

    task_results[task] = {
        "base": base_mean, "tuned": tuned_mean,
        "delta": delta, "delta_pct": delta_pct,
        "n": len(indices),
        "base_scores": base_task, "tuned_scores": tuned_task,
    }

    arrow = "↑" if delta > 0.01 else ("↓" if delta < -0.01 else "→")
    print(f"{task:<14s} {base_mean:>8.3f} {tuned_mean:>8.3f} {delta:>+8.3f} {delta_pct:>+7.1f}% {len(indices):>4d} {arrow}")

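# Editorial caveat: delta_pct falls back to inf when base_mean <= 0. If the
# reward function can return negative values, the Δ% column is not meaningful
# for those tasks; read the absolute Δ column instead.
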
# Overall
base_overall = np.mean(base_rewards)
tuned_overall = np.mean(tuned_rewards)
delta_overall = tuned_overall - base_overall
delta_pct_overall = (delta_overall / base_overall * 100) if base_overall > 0 else float('inf')

print(f"{'─' * 52}")
print(f"{'OVERALL':<14s} {base_overall:>8.3f} {tuned_overall:>8.3f} {delta_overall:>+8.3f} {delta_pct_overall:>+7.1f}% {len(base_rewards):>4d}")

# Statistical significance (Wilcoxon signed-rank test — paired samples)
print(f"\n{'─' * 52}")
print("Statistical Significance (Wilcoxon signed-rank, paired)")
print(f"{'─' * 52}")

try:
    stat, p_val = wilcoxon(tuned_rewards, base_rewards, alternative='greater')
    sig = "✅ YES (p < 0.05)" if p_val < 0.05 else "❌ NO (p ≥ 0.05)"
    print(f"  Overall: W={stat:.0f}, p={p_val:.4f} → {sig}")
except Exception as e:
    print(f"  Overall: Could not compute ({e})")

for task in tasks_unique:
    tr = task_results[task]
    try:
        # wilcoxon() raises when every paired difference is zero, so skip that case
        diffs = [t - b for t, b in zip(tr["tuned_scores"], tr["base_scores"])]
        if all(d == 0 for d in diffs):
            print(f"  {task}: all differences = 0 (identical outputs)")
        else:
            stat, p_val = wilcoxon(tr["tuned_scores"], tr["base_scores"], alternative='greater')
            sig = "p < 0.05 ✅" if p_val < 0.05 else f"p = {p_val:.3f}"
            print(f"  {task}: W={stat:.0f}, {sig}")
    except Exception as e:
        print(f"  {task}: insufficient data ({e})")

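# Editorial note on power: with n non-zero paired differences, the most extreme
# one-sided Wilcoxon outcome has probability 1/2**n, so p < 0.05 is only
# attainable once n >= 5 (1/2**5 = 0.03125). A minimal check:
#   wilcoxon([0.1] * 5, [0.0] * 5, alternative='greater')  # p = 0.03125
# The 15-20 eval samples per task here leave room to detect consistent gains.
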
# ══════════════════════════════════════════════════════════════════════════════
# STEP 6: Sample outputs — show 2 examples per task (base vs tuned)
# ══════════════════════════════════════════════════════════════════════════════

print(f"\n\n{'=' * 70}")
print("SAMPLE OUTPUTS — Base vs Tuned (2 per task)")
print("=" * 70)

for task in tasks_unique:
    indices = [i for i, t in enumerate(eval_task_types) if t == task]
    # Pick the sample with largest positive delta and one with largest negative
    deltas = [(tuned_rewards[i] - base_rewards[i], i) for i in indices]
    deltas.sort(reverse=True)

    # Show best improvement and worst regression (or 2nd best if no regression)
    show_indices = [deltas[0][1]]              # best improvement
    if deltas[-1][0] < 0:
        show_indices.append(deltas[-1][1])     # worst regression
    else:
        show_indices.append(deltas[min(1, len(deltas) - 1)][1])  # 2nd sample

    print(f"\n{'─' * 70}")
    print(f"  TASK: {task.upper()}")
    print(f"{'─' * 70}")

    for idx in show_indices:
        b_r = base_rewards[idx]
        t_r = tuned_rewards[idx]
        delta = t_r - b_r
        arrow = "↑" if delta > 0.01 else ("↓" if delta < -0.01 else "→")

        # Truncate long outputs for readability
        base_out = strip_think(base_completions[idx])[:300]
        tuned_out = strip_think(tuned_completions[idx])[:300]

        print(f"\n  Sample {idx+1}: base={b_r:.3f} → tuned={t_r:.3f} ({delta:+.3f} {arrow})")
        print(f"  BASE:  {base_out}")
        print(f"  TUNED: {tuned_out}")

# ══════════════════════════════════════════════════════════════════════════════
# STEP 7: Summary and conclusion
# ══════════════════════════════════════════════════════════════════════════════

print(f"\n\n{'═' * 70}")
print("V4.2 EXPERIMENT CONCLUSION")
print(f"{'═' * 70}")
print(f"""
Model:    Polygl0t/Tucano2-qwen-0.5B-Instruct
Method:   GRPO + LoRA (r=16, α=32) + GDPO normalization + Dynamic IWU
Training: 1,500 steps (best @ step 1100), LR=5e-6, β=0, G=16, τ=1.0
Hardware: 1× L4 (24GB), ~22h runtime
Data:     1,480 prompts (4 tasks: extraction, sql_qa, insights, push)
Eval:     65 stratified samples (20 + 15 + 15 + 15)

┌───────────────────────────────────────────────────────────────────────┐
│                            RESULTS SUMMARY                            │
├─────────────┬──────────┬──────────┬──────────┬────────────────────────┤
│ Task        │     Base │    Tuned │        Δ │ Assessment             │
├─────────────┼──────────┼──────────┼──────────┼────────────────────────┤
│ extraction  │ {task_results['extraction']['base']:8.3f} │ {task_results['extraction']['tuned']:8.3f} │ {task_results['extraction']['delta']:+8.3f} │ {('Improved' if task_results['extraction']['delta'] > 0.01 else 'Flat' if abs(task_results['extraction']['delta']) <= 0.01 else 'Regressed'):<22s} │
│ sql_qa      │ {task_results['sql_qa']['base']:8.3f} │ {task_results['sql_qa']['tuned']:8.3f} │ {task_results['sql_qa']['delta']:+8.3f} │ {('Improved' if task_results['sql_qa']['delta'] > 0.01 else 'Flat' if abs(task_results['sql_qa']['delta']) <= 0.01 else 'Regressed'):<22s} │
│ insights    │ {task_results['insights']['base']:8.3f} │ {task_results['insights']['tuned']:8.3f} │ {task_results['insights']['delta']:+8.3f} │ {('Improved' if task_results['insights']['delta'] > 0.01 else 'Flat' if abs(task_results['insights']['delta']) <= 0.01 else 'Regressed'):<22s} │
│ push        │ {task_results['push']['base']:8.3f} │ {task_results['push']['tuned']:8.3f} │ {task_results['push']['delta']:+8.3f} │ {('Improved' if task_results['push']['delta'] > 0.01 else 'Flat' if abs(task_results['push']['delta']) <= 0.01 else 'Regressed'):<22s} │
├─────────────┼──────────┼──────────┼──────────┼────────────────────────┤
│ OVERALL     │ {base_overall:8.3f} │ {tuned_overall:8.3f} │ {delta_overall:+8.3f} │ {delta_pct_overall:+21.1f}% │
└─────────────┴──────────┴──────────┴──────────┴────────────────────────┘
""")

# Save results
comparison_results = {
    "experiment": "V4.2 Base vs GRPO-Tuned Comparison",
    "model_id": MODEL_ID,
    "adapter_path": str(best_ckpt_path),
    "best_step": 1100,
    "eval_samples": EVAL_TOTAL,
    "temperature": COMPARISON_TEMP,
    "seed": CURRENT_SEED,
    "results": {
        "overall": {"base": float(base_overall), "tuned": float(tuned_overall), "delta": float(delta_overall)},
        **{task: {"base": float(tr["base"]), "tuned": float(tr["tuned"]), "delta": float(tr["delta"]), "n": tr["n"]}
           for task, tr in task_results.items()},
    },
    "per_sample": [
        {"task": eval_task_types[i], "base_reward": float(base_rewards[i]), "tuned_reward": float(tuned_rewards[i])}
        for i in range(len(base_rewards))
    ],
}

results_path = ADAPTER_DIR / "comparison_base_vs_tuned.json"
with open(results_path, "w") as f:
    json.dump(comparison_results, f, indent=2, ensure_ascii=False)
print(f"✓ Results saved to {results_path}")