""" SalesPath — Evaluate Baseline vs Trained Model Runs episodes at each difficulty level with both the base model and the trained (GRPO) model, then compares performance. Usage: python training/eval_baseline_vs_trained.py \ --base Qwen/Qwen2.5-0.5B-Instruct \ --trained ./salespath_out/grpo_final \ --env-url http://127.0.0.1:8000 \ --episodes-per-level 4 \ --output ./salespath_out/eval_results.json """ import argparse import asyncio import json import os import sys import time from pathlib import Path import torch from transformers import AutoModelForCausalLM, AutoTokenizer # Ensure project root is on path _ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) if _ROOT not in sys.path: sys.path.insert(0, _ROOT) from training.rollout import run_episode DIFFICULTIES = [1, 2, 3, 4] async def eval_model( model, tokenizer, env_url: str, episodes_per_level: int = 4, label: str = "model", ) -> dict: """Evaluate a model across all difficulty levels.""" results = { "label": label, "per_difficulty": {}, "overall": {}, } all_rewards = [] all_violations = [] all_closes = [] all_lengths = [] for difficulty in DIFFICULTIES: diff_rewards = [] diff_violations = [] diff_closes = [] diff_lengths = [] print(f" Difficulty {difficulty}...") for ep in range(episodes_per_level): result = await run_episode( model=model, tokenizer=tokenizer, env_url=env_url, difficulty=difficulty, message_timeout_s=120.0, ) trajectory = result["trajectory"] reward = result["total_reward"] violations = result["violations"] steps = result["steps_completed"] length = len(trajectory) # Did we close successfully? last_action = trajectory[-1]["action_type"] if trajectory else "" last_traj = trajectory[-1] if trajectory else {} components = last_traj.get("components", {}) r_outcome = components.get("r_outcome", 0.0) closed = last_action == "CLOSE" and r_outcome > 0 diff_rewards.append(reward) diff_violations.append(len(violations)) diff_closes.append(1 if closed else 0) diff_lengths.append(length) all_rewards.append(reward) all_violations.append(len(violations)) all_closes.append(1 if closed else 0) all_lengths.append(length) results["per_difficulty"][difficulty] = { "mean_reward": sum(diff_rewards) / len(diff_rewards) if diff_rewards else 0, "mean_violations": sum(diff_violations) / len(diff_violations) if diff_violations else 0, "close_rate": sum(diff_closes) / len(diff_closes) if diff_closes else 0, "mean_episode_length": sum(diff_lengths) / len(diff_lengths) if diff_lengths else 0, "num_episodes": len(diff_rewards), } results["overall"] = { "mean_reward": sum(all_rewards) / len(all_rewards) if all_rewards else 0, "mean_violations": sum(all_violations) / len(all_violations) if all_violations else 0, "close_rate": sum(all_closes) / len(all_closes) if all_closes else 0, "mean_episode_length": sum(all_lengths) / len(all_lengths) if all_lengths else 0, "num_episodes": len(all_rewards), } return results def load_model(model_name_or_path: str): """Load model, detecting if it's a PEFT adapter.""" print(f"Loading model: {model_name_or_path}") tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token # Check if this is a PEFT adapter directory adapter_path = Path(model_name_or_path) is_adapter = (adapter_path / "adapter_config.json").exists() if is_adapter: print(" Detected PEFT adapter — loading base model + adapter...") from peft import PeftModel # Find the base model name from adapter config import json as _json with open(adapter_path / 
"adapter_config.json") as f: adapter_cfg = _json.load(f) base_model_name = adapter_cfg.get("base_model_name_or_path", "Qwen/Qwen2.5-0.5B-Instruct") print(f" Base model: {base_model_name}") bf16_supported = torch.cuda.is_available() and torch.cuda.is_bf16_supported() base_model = AutoModelForCausalLM.from_pretrained( base_model_name, torch_dtype=torch.bfloat16 if bf16_supported else torch.float32, device_map="auto", ) model = PeftModel.from_pretrained(base_model, model_name_or_path) # Merge adapter for faster inference model = model.merge_and_unload() print(" Adapter loaded and merged ✅") else: bf16_supported = torch.cuda.is_available() and torch.cuda.is_bf16_supported() model = AutoModelForCausalLM.from_pretrained( model_name_or_path, torch_dtype=torch.bfloat16 if bf16_supported else torch.float32, device_map="auto", ) model.eval() print(f" Model on: {next(model.parameters()).device}") return model, tokenizer async def main(): parser = argparse.ArgumentParser(description="Evaluate baseline vs trained model") parser.add_argument("--base", default="Qwen/Qwen2.5-0.5B-Instruct") parser.add_argument("--trained", default="./salespath_out/grpo_final") parser.add_argument("--env-url", default="http://127.0.0.1:8000") parser.add_argument("--episodes-per-level", type=int, default=4) parser.add_argument("--output", default="./salespath_out/eval_results.json") args = parser.parse_args() print("=" * 60) print("SalesPath — Model Evaluation") print("=" * 60) print(f"Base model: {args.base}") print(f"Trained model: {args.trained}") print(f"Episodes/level: {args.episodes_per_level}") print() # Evaluate base model print("Loading base model...") base_model, base_tokenizer = load_model(args.base) print("\nEvaluating base model...") base_results = await eval_model( base_model, base_tokenizer, args.env_url, episodes_per_level=args.episodes_per_level, label="baseline", ) # Clean up del base_model if torch.cuda.is_available(): torch.cuda.empty_cache() # Evaluate trained model print("\nLoading trained model...") trained_model, trained_tokenizer = load_model(args.trained) print("\nEvaluating trained model...") trained_results = await eval_model( trained_model, trained_tokenizer, args.env_url, episodes_per_level=args.episodes_per_level, label="trained", ) # Print comparison print("\n" + "=" * 60) print("RESULTS COMPARISON") print("=" * 60) for model_results in [base_results, trained_results]: label = model_results["label"] overall = model_results["overall"] print(f"\n--- {label.upper()} ---") print(f" Mean reward: {overall['mean_reward']:.4f}") print(f" Mean violations: {overall['mean_violations']:.2f}") print(f" Close rate: {overall['close_rate']:.2%}") print(f" Mean ep. 

    # Evaluate trained model
    print("\nLoading trained model...")
    trained_model, trained_tokenizer = load_model(args.trained)
    print("\nEvaluating trained model...")
    trained_results = await eval_model(
        trained_model,
        trained_tokenizer,
        args.env_url,
        episodes_per_level=args.episodes_per_level,
        label="trained",
    )

    # Print comparison
    print("\n" + "=" * 60)
    print("RESULTS COMPARISON")
    print("=" * 60)

    for model_results in [base_results, trained_results]:
        label = model_results["label"]
        overall = model_results["overall"]
        print(f"\n--- {label.upper()} ---")
        print(f"  Mean reward:     {overall['mean_reward']:.4f}")
        print(f"  Mean violations: {overall['mean_violations']:.2f}")
        print(f"  Close rate:      {overall['close_rate']:.2%}")
        print(f"  Mean ep. length: {overall['mean_episode_length']:.1f}")
        for diff, metrics in model_results["per_difficulty"].items():
            print(
                f"    Difficulty {diff}: reward={metrics['mean_reward']:.3f}, "
                f"violations={metrics['mean_violations']:.1f}, "
                f"close={metrics['close_rate']:.0%}"
            )

    # Save results
    output = {
        "base": base_results,
        "trained": trained_results,
        "comparison": {
            "reward_delta": trained_results["overall"]["mean_reward"]
            - base_results["overall"]["mean_reward"],
            "violation_reduction": base_results["overall"]["mean_violations"]
            - trained_results["overall"]["mean_violations"],
            "close_rate_improvement": trained_results["overall"]["close_rate"]
            - base_results["overall"]["close_rate"],
        },
        "config": {
            "base_model": args.base,
            "trained_model": args.trained,
            "episodes_per_level": args.episodes_per_level,
            "difficulties": DIFFICULTIES,
        },
    }

    os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
    with open(args.output, "w") as f:
        json.dump(output, f, indent=2)
    print(f"\nResults saved to {args.output}")

    # Print comparison summary
    print("\n=== KEY METRICS ===")
    c = output["comparison"]
    print(f"  Reward delta:        {c['reward_delta']:+.4f}")
    print(f"  Violation reduction: {c['violation_reduction']:+.2f}")
    print(f"  Close rate change:   {c['close_rate_improvement']:+.2%}")


if __name__ == "__main__":
    asyncio.run(main())