#!/usr/bin/env bash
# Evaluate every saved checkpoint and find the best one.
#
# RL training is bumpy: the FINAL checkpoint isn't always the best policy.
# This script runs compare_eval.py against the baseline AND each checkpoint
# in ./checkpoints/, then picks whichever scores highest on mean composite
# reward plus a 0.5 * win-rate bonus as the "best trained" submission.
#
# Usage:
#   bash scripts/eval_all_checkpoints.sh                    # 10 episodes each
#   EVAL_EPISODES=20 bash scripts/eval_all_checkpoints.sh   # more confidence

cd "$(dirname "$0")/.." || exit 1
mkdir -p eval_results

EVAL_EPISODES="${EVAL_EPISODES:-10}"
EVAL_MAX_OVERS="${EVAL_MAX_OVERS:-5}"
BASE_MODEL="${BASE_MODEL:-Qwen/Qwen3.5-4B}"

# 1. Baseline (untrained base model, no adapter)
if [ ! -f eval_results/baseline.json ]; then
  echo "=== [baseline] running ==="
  python compare_eval.py \
    --model "$BASE_MODEL" \
    --label baseline \
    --episodes "$EVAL_EPISODES" \
    --max-overs "$EVAL_MAX_OVERS" \
    --opponent-mode heuristic \
    --output eval_results/baseline.json
fi

# 2. Each checkpoint
for ckpt in ./checkpoints/stage*/checkpoint-* ./checkpoints/stage*_final; do
  if [ -d "$ckpt" ]; then
    # Verify it's a PEFT adapter dir (has adapter_config.json).
    if [ ! -f "$ckpt/adapter_config.json" ]; then
      echo "=== [$ckpt] no adapter_config.json, skipping ==="
      continue
    fi

    # Derive a filesystem-safe label from the checkpoint path.
    label=$(echo "$ckpt" | sed -e 's|[/.]|_|g' -e 's|^_*||')
    out="eval_results/${label}.json"

    if [ -f "$out" ]; then
      echo "=== [$ckpt] already evaluated, skipping ($out exists) ==="
      continue
    fi

    echo "=== [$ckpt] evaluating ==="
    python compare_eval.py \
      --model "$BASE_MODEL" \
      --adapter "$ckpt" \
      --label "$label" \
      --episodes "$EVAL_EPISODES" \
      --max-overs "$EVAL_MAX_OVERS" \
      --opponent-mode heuristic \
      --output "$out"
  fi
done

# 3. Pick the best checkpoint and run a final compare against the baseline.
echo ""
echo "=== ranking checkpoints by composite reward + 0.5 * win rate ==="
python - <<'PYEOF'
import glob
import json
import shutil

best = None
for path in glob.glob("eval_results/*.json"):
    # Skip the baseline and any best_trained.json left over from a previous run.
    if path.endswith("baseline.json") or path.endswith("best_trained.json"):
        continue
    try:
        with open(path) as f:
            data = json.load(f)
        score = data["summary"].get("mean_composite_reward", 0.0) or 0.0
        win_rate = data["summary"].get("win_rate_overall", 0.0) or 0.0
        # Ranking score: mean composite reward plus a 0.5 * win-rate bonus.
        ranking = score + 0.5 * win_rate
        print(f"  {path:50s} composite={score:.4f} win_rate={win_rate:.4f} ranking={ranking:.4f}")
        if best is None or ranking > best[0]:
            best = (ranking, path, data)
    except Exception as e:
        print(f"  {path}: skip ({e})")

if best:
    print(f"\nBEST: {best[1]} (ranking={best[0]:.4f}, label={best[2].get('label')})")
    print(f"  adapter: {best[2].get('adapter')}")
    # Copy the winning result to best_trained.json for easy reference.
    shutil.copy(best[1], "eval_results/best_trained.json")
    print("\nCopied to eval_results/best_trained.json")
else:
    print("No checkpoint results found to rank.")
PYEOF

echo ""
echo "=== final comparison: baseline vs best trained ==="
if [ -f eval_results/best_trained.json ]; then
  python compare_eval.py --compare eval_results/baseline.json eval_results/best_trained.json \
    | tee eval_results/final_comparison.txt
else
  echo "No best_trained.json found; skipping final comparison."
fi
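
# Optional sanity check: dump the winning run's summary block. This is a minimal
# sketch that assumes compare_eval.py writes a top-level "summary" object (the
# same key the ranking step above reads); adjust if your output schema differs.
if [ -f eval_results/best_trained.json ]; then
  python - <<'PYEOF'
import json

# Pretty-print just the "summary" section of the best trained result.
with open("eval_results/best_trained.json") as f:
    print(json.dumps(json.load(f).get("summary", {}), indent=2))
PYEOF
fi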