"""
SalesPath — Evaluate Baseline vs Trained Model
Runs episodes at each difficulty level with both the base model
and the trained (GRPO) model, then compares performance.
Usage:
python training/eval_baseline_vs_trained.py \
--base Qwen/Qwen2.5-0.5B-Instruct \
--trained ./salespath_out/grpo_final \
--env-url http://127.0.0.1:8000 \
--episodes-per-level 4 \
--output ./salespath_out/eval_results.json
"""
import argparse
import asyncio
import json
import os
import sys
from pathlib import Path
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# Ensure project root is on path
_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
if _ROOT not in sys.path:
sys.path.insert(0, _ROOT)
from training.rollout import run_episode
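# run_episode (training/rollout.py) plays one full episode against the env and
# returns a dict with "trajectory", "total_reward", "violations", and
# "steps_completed"; the metric extraction below relies on exactly those keys.

# Difficulty levels evaluated for each model.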
DIFFICULTIES = [1, 2, 3, 4]
async def eval_model(
model,
tokenizer,
env_url: str,
episodes_per_level: int = 4,
label: str = "model",
) -> dict:
"""Evaluate a model across all difficulty levels."""
results = {
"label": label,
"per_difficulty": {},
"overall": {},
}
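    # Pooled accumulators across all difficulty levels (for the "overall" block)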
all_rewards = []
all_violations = []
all_closes = []
all_lengths = []
for difficulty in DIFFICULTIES:
diff_rewards = []
diff_violations = []
diff_closes = []
diff_lengths = []
print(f" Difficulty {difficulty}...")
for ep in range(episodes_per_level):
result = await run_episode(
model=model,
tokenizer=tokenizer,
env_url=env_url,
difficulty=difficulty,
message_timeout_s=120.0,
)
            trajectory = result["trajectory"]
            reward = result["total_reward"]
            violations = result["violations"]
            length = len(trajectory)
            # Did the episode end in a successful close? Count a close only if
            # the final action was CLOSE and its outcome reward was positive.
            last_step = trajectory[-1] if trajectory else {}
            last_action = last_step.get("action_type", "")
            r_outcome = last_step.get("components", {}).get("r_outcome", 0.0)
            closed = last_action == "CLOSE" and r_outcome > 0
diff_rewards.append(reward)
diff_violations.append(len(violations))
diff_closes.append(1 if closed else 0)
diff_lengths.append(length)
all_rewards.append(reward)
all_violations.append(len(violations))
all_closes.append(1 if closed else 0)
all_lengths.append(length)
results["per_difficulty"][difficulty] = {
"mean_reward": sum(diff_rewards) / len(diff_rewards) if diff_rewards else 0,
"mean_violations": sum(diff_violations) / len(diff_violations) if diff_violations else 0,
"close_rate": sum(diff_closes) / len(diff_closes) if diff_closes else 0,
"mean_episode_length": sum(diff_lengths) / len(diff_lengths) if diff_lengths else 0,
"num_episodes": len(diff_rewards),
}
results["overall"] = {
"mean_reward": sum(all_rewards) / len(all_rewards) if all_rewards else 0,
"mean_violations": sum(all_violations) / len(all_violations) if all_violations else 0,
"close_rate": sum(all_closes) / len(all_closes) if all_closes else 0,
"mean_episode_length": sum(all_lengths) / len(all_lengths) if all_lengths else 0,
"num_episodes": len(all_rewards),
}
return results
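# Example (hypothetical, ad-hoc usage): score a single checkpoint without the
# baseline comparison in main(). Assumes the env server is already running at
# the default URL and that "./salespath_out/grpo_final" exists:
#
#   model, tok = load_model("./salespath_out/grpo_final")
#   res = asyncio.run(eval_model(model, tok, "http://127.0.0.1:8000",
#                                episodes_per_level=2, label="checkpoint"))
#   print(json.dumps(res["overall"], indent=2))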
def load_model(model_name_or_path: str):
"""Load model, detecting if it's a PEFT adapter."""
print(f"Loading model: {model_name_or_path}")
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
    # Prefer bfloat16 on GPUs that support it; otherwise fall back to float32
    bf16_supported = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    dtype = torch.bfloat16 if bf16_supported else torch.float32
    # Check if this is a PEFT adapter directory
    adapter_path = Path(model_name_or_path)
    is_adapter = (adapter_path / "adapter_config.json").exists()
    if is_adapter:
        print(" Detected PEFT adapter — loading base model + adapter...")
        from peft import PeftModel
        # Read the base model name from the adapter config
        with open(adapter_path / "adapter_config.json") as f:
            adapter_cfg = json.load(f)
        base_model_name = adapter_cfg.get("base_model_name_or_path", "Qwen/Qwen2.5-0.5B-Instruct")
        print(f" Base model: {base_model_name}")
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            torch_dtype=dtype,
            device_map="auto",
        )
        model = PeftModel.from_pretrained(base_model, model_name_or_path)
        # Merge adapter weights into the base weights for faster inference
        model = model.merge_and_unload()
        print(" Adapter loaded and merged ✅")
    else:
        model = AutoModelForCausalLM.from_pretrained(
            model_name_or_path,
            torch_dtype=dtype,
            device_map="auto",
        )
model.eval()
print(f" Model on: {next(model.parameters()).device}")
return model, tokenizer
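# Note (PEFT behavior): merge_and_unload() folds the LoRA weights into the base
# model and returns a plain transformers model. If you instead want to keep the
# adapter separable (e.g. to A/B it against the base in one process), skip the
# merge and generate through the PeftModel directly.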
async def main():
parser = argparse.ArgumentParser(description="Evaluate baseline vs trained model")
parser.add_argument("--base", default="Qwen/Qwen2.5-0.5B-Instruct")
parser.add_argument("--trained", default="./salespath_out/grpo_final")
parser.add_argument("--env-url", default="http://127.0.0.1:8000")
parser.add_argument("--episodes-per-level", type=int, default=4)
parser.add_argument("--output", default="./salespath_out/eval_results.json")
args = parser.parse_args()
print("=" * 60)
print("SalesPath — Model Evaluation")
print("=" * 60)
print(f"Base model: {args.base}")
print(f"Trained model: {args.trained}")
print(f"Episodes/level: {args.episodes_per_level}")
print()
# Evaluate base model
print("Loading base model...")
base_model, base_tokenizer = load_model(args.base)
print("\nEvaluating base model...")
base_results = await eval_model(
base_model, base_tokenizer, args.env_url,
episodes_per_level=args.episodes_per_level,
label="baseline",
)
    # Free the base model before loading the trained one to keep peak GPU
    # memory low
    del base_model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
# Evaluate trained model
print("\nLoading trained model...")
trained_model, trained_tokenizer = load_model(args.trained)
print("\nEvaluating trained model...")
trained_results = await eval_model(
trained_model, trained_tokenizer, args.env_url,
episodes_per_level=args.episodes_per_level,
label="trained",
)
# Print comparison
print("\n" + "=" * 60)
print("RESULTS COMPARISON")
print("=" * 60)
for model_results in [base_results, trained_results]:
label = model_results["label"]
overall = model_results["overall"]
print(f"\n--- {label.upper()} ---")
print(f" Mean reward: {overall['mean_reward']:.4f}")
print(f" Mean violations: {overall['mean_violations']:.2f}")
print(f" Close rate: {overall['close_rate']:.2%}")
print(f" Mean ep. length: {overall['mean_episode_length']:.1f}")
for diff, metrics in model_results["per_difficulty"].items():
print(f" Difficulty {diff}: reward={metrics['mean_reward']:.3f}, "
f"violations={metrics['mean_violations']:.1f}, "
f"close={metrics['close_rate']:.0%}")
# Save results
output = {
"base": base_results,
"trained": trained_results,
"comparison": {
"reward_delta": trained_results["overall"]["mean_reward"] - base_results["overall"]["mean_reward"],
"violation_reduction": base_results["overall"]["mean_violations"] - trained_results["overall"]["mean_violations"],
"close_rate_improvement": trained_results["overall"]["close_rate"] - base_results["overall"]["close_rate"],
},
"config": {
"base_model": args.base,
"trained_model": args.trained,
"episodes_per_level": args.episodes_per_level,
"difficulties": DIFFICULTIES,
},
}
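    # Saved JSON shape (integer difficulty keys are serialized as strings "1".."4"):
    #   {"base": {...}, "trained": {...},
    #    "comparison": {"reward_delta", "violation_reduction", "close_rate_improvement"},
    #    "config": {...}}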
os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
with open(args.output, "w") as f:
json.dump(output, f, indent=2)
print(f"\nResults saved to {args.output}")
    # Summarize the deltas (positive reward_delta, positive violation_reduction,
    # and a positive close-rate change all favor the trained model)
print("\n=== KEY METRICS ===")
c = output["comparison"]
print(f" Reward delta: {c['reward_delta']:+.4f}")
print(f" Violation reduction: {c['violation_reduction']:+.2f}")
print(f" Close rate change: {c['close_rate_improvement']:+.2%}")
if __name__ == "__main__":
asyncio.run(main())