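# ======================================================================
# V4.2 final comparison: base model vs GRPO-tuned adapter
# (stratified eval set, paired rewards, Wilcoxon significance tests)
# ======================================================================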
from scipy.stats import wilcoxon
import numpy as np

COMPARISON_TEMP = 0.1        # low temperature → near-greedy decoding for a stable paired comparison
COMPARISON_MAX_TOKENS = 512

# ── Load the stratified eval set ──
eval_v2_stratified_path = DATA_DIR / "pairs" / "eval_v2_stratified.jsonl"
assert eval_v2_stratified_path.exists(), f"Eval set not found: {eval_v2_stratified_path}"

eval_prompts = []
eval_task_types = []
with open(eval_v2_stratified_path) as f:
    for line in f:
        rec = json.loads(line)
        prompt_msgs = rec["prompt_msgs"]
        user_text = " ".join(m["content"] for m in prompt_msgs if m["role"] == "user")
        task = _classify_task_type(user_text)

        prepared = inject_task_system_prompt(prompt_msgs, task)
        eval_prompts.append(prepared)
        eval_task_types.append(task)

assert len(eval_prompts) == EVAL_TOTAL, f"Expected {EVAL_TOTAL} eval prompts, got {len(eval_prompts)}"
print(f"✓ Loaded {len(eval_prompts)} eval prompts")
print(f"  Distribution: {dict(zip(*np.unique(eval_task_types, return_counts=True)))}")

# ── Completion generation + reward scoring helper ──
def generate_eval_completions(model_obj, label="model"):
    """Generate completions for all eval prompts, return texts + rewards."""
    FastLanguageModel.for_inference(model_obj)
    completions = []
    rewards = []

    for i, (msgs, task) in enumerate(zip(eval_prompts, eval_task_types)):
        text = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
        inputs = tokenizer(text, return_tensors="pt").to(model_obj.device)

        with torch.no_grad():
            out = model_obj.generate(
                **inputs,
                max_new_tokens=COMPARISON_MAX_TOKENS,
                temperature=COMPARISON_TEMP,
                do_sample=True,
                repetition_penalty=1.0,
            )
        resp = tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
        completions.append(resp)

        # Score the completion with the commerce reward function
        r = commerce_reward_fn_raw([resp], [text])[0]
        rewards.append(r)

        if (i + 1) % 20 == 0:
            print(f"  [{label}] {i+1}/{len(eval_prompts)} done...")

    return completions, rewards
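
# Note (a suggestion, not part of the original run): with do_sample=True the
# paired comparison retains some sampling noise even at temperature 0.1. A
# hypothetical tweak for fully paired RNG streams would be to reseed before
# each generation inside the loop above, e.g.:
#     torch.manual_seed(CURRENT_SEED + i)  # CURRENT_SEED is defined elsewhere in this script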

# ── Evaluate the base model (adapters disabled) ──
| print("\n" + "=" * 70) |
| print("EVALUATING BASE MODEL (no adapter)") |
| print("=" * 70) |
|
|
| |
| model.disable_adapter_layers() |
| base_completions, base_rewards = generate_eval_completions(model, label="base") |
| model.enable_adapter_layers() |
|
|
| print(f" β Base model: {len(base_rewards)} completions, mean reward = {np.mean(base_rewards):.3f}") |

# ── Evaluate the tuned model (best checkpoint) ──
| print("\n" + "=" * 70) |
| print("EVALUATING TUNED MODEL (best_checkpoint, step 1100)") |
| print("=" * 70) |
|
|
| |
| best_ckpt_path = ADAPTER_DIR / "best_checkpoint" |
| assert best_ckpt_path.exists(), f"Best checkpoint not found: {best_ckpt_path}" |
|
|
| |
| from peft import set_peft_model_state_dict |
| import safetensors.torch |
|
|
| adapter_weights = safetensors.torch.load_file(str(best_ckpt_path / "adapter_model.safetensors")) |
| set_peft_model_state_dict(model, adapter_weights) |
| print(f" β Loaded adapter from {best_ckpt_path}") |
|
|
| tuned_completions, tuned_rewards = generate_eval_completions(model, label="tuned") |
| print(f" β Tuned model: {len(tuned_rewards)} completions, mean reward = {np.mean(tuned_rewards):.3f}") |

# ── Per-task and overall comparison ──
| print("\n" + "=" * 70) |
| print("V4.2 FINAL COMPARISON: BASE vs GRPO-TUNED") |
| print("=" * 70) |
|
|
| |
| tasks_unique = ["extraction", "sql_qa", "insights", "push"] |
|
|
| print(f"\n{'Task':<14s} {'Base':>8s} {'Tuned':>8s} {'Ξ':>8s} {'Ξ%':>8s} {'N':>4s}") |
| print(f"{'β' * 52}") |
|
|
| task_results = {} |
| for task in tasks_unique: |
| indices = [i for i, t in enumerate(eval_task_types) if t == task] |
| base_task = [base_rewards[i] for i in indices] |
| tuned_task = [tuned_rewards[i] for i in indices] |
| |
| base_mean = np.mean(base_task) |
| tuned_mean = np.mean(tuned_task) |
| delta = tuned_mean - base_mean |
| delta_pct = (delta / base_mean * 100) if base_mean > 0 else float('inf') |
| |
| task_results[task] = { |
| "base": base_mean, "tuned": tuned_mean, |
| "delta": delta, "delta_pct": delta_pct, |
| "n": len(indices), |
| "base_scores": base_task, "tuned_scores": tuned_task, |
| } |
| |
| arrow = "β" if delta > 0.01 else ("β" if delta < -0.01 else "β") |
| print(f"{task:<14s} {base_mean:>8.3f} {tuned_mean:>8.3f} {delta:>+8.3f} {delta_pct:>+7.1f}% {len(indices):>4d} {arrow}") |
|
|
| |
| base_overall = np.mean(base_rewards) |
| tuned_overall = np.mean(tuned_rewards) |
| delta_overall = tuned_overall - base_overall |
| delta_pct_overall = (delta_overall / base_overall * 100) if base_overall > 0 else float('inf') |
|
|
| print(f"{'β' * 52}") |
| print(f"{'OVERALL':<14s} {base_overall:>8.3f} {tuned_overall:>8.3f} {delta_overall:>+8.3f} {delta_pct_overall:>+7.1f}% {len(base_rewards):>4d}") |

print(f"\n{'─' * 52}")
print("Statistical Significance (Wilcoxon signed-rank, paired)")
print(f"{'─' * 52}")

try:
    stat, p_val = wilcoxon(tuned_rewards, base_rewards, alternative='greater')
    sig = "✅ YES (p < 0.05)" if p_val < 0.05 else "❌ NO (p ≥ 0.05)"
    print(f"  Overall: W={stat:.0f}, p={p_val:.4f} → {sig}")
except Exception as e:
    print(f"  Overall: Could not compute ({e})")

for task in tasks_unique:
    tr = task_results[task]
    try:
        # wilcoxon raises when every paired difference is zero, so check first
        diffs = [t - b for t, b in zip(tr["tuned_scores"], tr["base_scores"])]
        if all(d == 0 for d in diffs):
            print(f"  {task}: all differences = 0 (identical outputs)")
        else:
            stat, p_val = wilcoxon(tr["tuned_scores"], tr["base_scores"], alternative='greater')
            sig = "p < 0.05 ✅" if p_val < 0.05 else f"p = {p_val:.3f}"
            print(f"  {task}: W={stat:.0f}, {sig}")
    except Exception as e:
        print(f"  {task}: insufficient data ({e})")

# ── Sample outputs: base vs tuned ──
| print(f"\n\n{'=' * 70}") |
| print("SAMPLE OUTPUTS β Base vs Tuned (2 per task)") |
| print("=" * 70) |
|
|
| for task in tasks_unique: |
| indices = [i for i, t in enumerate(eval_task_types) if t == task] |
| |
| deltas = [(tuned_rewards[i] - base_rewards[i], i) for i in indices] |
| deltas.sort(reverse=True) |
| |
| |
| show_indices = [deltas[0][1]] |
| if deltas[-1][0] < 0: |
| show_indices.append(deltas[-1][1]) |
| else: |
| show_indices.append(deltas[min(1, len(deltas)-1)][1]) |
| |
| print(f"\n{'β' * 70}") |
| print(f" TASK: {task.upper()}") |
| print(f"{'β' * 70}") |
| |
| for idx in show_indices: |
| b_r = base_rewards[idx] |
| t_r = tuned_rewards[idx] |
| delta = t_r - b_r |
| arrow = "β" if delta > 0.01 else ("β" if delta < -0.01 else "β") |
| |
| |
| base_out = strip_think(base_completions[idx])[:300] |
| tuned_out = strip_think(tuned_completions[idx])[:300] |
| |
| print(f"\n Sample {idx+1}: base={b_r:.3f} β tuned={t_r:.3f} ({delta:+.3f} {arrow})") |
| print(f" BASE: {base_out}") |
| print(f" TUNED: {tuned_out}") |

# ── Conclusion summary ──
| print(f"\n\n{'β' * 70}") |
| print("V4.2 EXPERIMENT CONCLUSION") |
| print(f"{'β' * 70}") |
| print(f""" |
| Model: Polygl0t/Tucano2-qwen-0.5B-Instruct |
| Method: GRPO + LoRA (r=16, Ξ±=32) + GDPO normalization + Dynamic IWU |
| Training: 1,500 steps (best @ step 1100), LR=5e-6, Ξ²=0, G=16, Ο=1.0 |
| Hardware: 1Γ L4 (24GB), ~22h runtime |
| Data: 1,480 prompts (4 tasks: extraction, sql_qa, insights, push) |
| Eval: 65 stratified samples (20 + 15 + 15 + 15) |
| |
| βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ |
| β RESULTS SUMMARY β |
| βββββββββββββββ¬βββββββββββ¬βββββββββββ¬βββββββββββ¬ββββββββββββββββββββββββ€ |
| β Task β Base β Tuned β Ξ β Assessment β |
| βββββββββββββββΌβββββββββββΌβββββββββββΌβββββββββββΌββββββββββββββββββββββββ€ |
| β extraction β {task_results['extraction']['base']:.3f} β {task_results['extraction']['tuned']:.3f} β {task_results['extraction']['delta']:+.3f} β {'Improved' if task_results['extraction']['delta'] > 0.01 else 'Flat' if abs(task_results['extraction']['delta']) <= 0.01 else 'Regressed'} β |
| β sql_qa β {task_results['sql_qa']['base']:.3f} β {task_results['sql_qa']['tuned']:.3f} β {task_results['sql_qa']['delta']:+.3f} β {'Improved' if task_results['sql_qa']['delta'] > 0.01 else 'Flat' if abs(task_results['sql_qa']['delta']) <= 0.01 else 'Regressed'} β |
| β insights β {task_results['insights']['base']:.3f} β {task_results['insights']['tuned']:.3f} β {task_results['insights']['delta']:+.3f} β {'Improved' if task_results['insights']['delta'] > 0.01 else 'Flat' if abs(task_results['insights']['delta']) <= 0.01 else 'Regressed'} β |
| β push β {task_results['push']['base']:.3f} β {task_results['push']['tuned']:.3f} β {task_results['push']['delta']:+.3f} β {'Improved' if task_results['push']['delta'] > 0.01 else 'Flat' if abs(task_results['push']['delta']) <= 0.01 else 'Regressed'} β |
| βββββββββββββββΌβββββββββββΌβββββββββββΌβββββββββββΌββββββββββββββββββββββββ€ |
| β OVERALL β {base_overall:.3f} β {tuned_overall:.3f} β {delta_overall:+.3f} β {delta_pct_overall:+.1f}% β |
| βββββββββββββββ΄βββββββββββ΄βββββββββββ΄βββββββββββ΄ββββββββββββββββββββββββ |
| """) |

# ── Persist results ──
comparison_results = {
    "experiment": "V4.2 Base vs GRPO-Tuned Comparison",
    "model_id": MODEL_ID,
    "adapter_path": str(best_ckpt_path),
    "best_step": 1100,
    "eval_samples": EVAL_TOTAL,
    "temperature": COMPARISON_TEMP,
    "seed": CURRENT_SEED,
    "results": {
        "overall": {"base": float(base_overall), "tuned": float(tuned_overall), "delta": float(delta_overall)},
        **{task: {"base": float(tr["base"]), "tuned": float(tr["tuned"]), "delta": float(tr["delta"]), "n": tr["n"]}
           for task, tr in task_results.items()},
    },
    "per_sample": [
        {"task": eval_task_types[i], "base_reward": float(base_rewards[i]), "tuned_reward": float(tuned_rewards[i])}
        for i in range(len(base_rewards))
    ],
}

results_path = ADAPTER_DIR / "comparison_base_vs_tuned.json"
with open(results_path, "w") as f:
    json.dump(comparison_results, f, indent=2, ensure_ascii=False)
print(f"✓ Results saved to {results_path}")