#!/usr/bin/env bash
# Evaluate every saved checkpoint and find the best one.
#
# RL training is bumpy — the FINAL checkpoint isn't always the best policy.
# This script runs compare_eval.py against the baseline AND each checkpoint
# in ./checkpoints/, then picks whichever ranks highest (mean composite
# reward plus a 0.5-weighted win-rate bonus) as the "best trained" submission.
#
# Usage:
#   bash scripts/eval_all_checkpoints.sh                     # 10 episodes each
#   EVAL_EPISODES=20 bash scripts/eval_all_checkpoints.sh    # more confidence
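#   EVAL_MAX_OVERS=10 bash scripts/eval_all_checkpoints.sh   # longer matches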
cd "$(dirname "$0")/.."
mkdir -p eval_results
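# Results are cached as JSON under eval_results/; delete a file there to
# force that evaluation to be re-run.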
EVAL_EPISODES="${EVAL_EPISODES:-10}"
EVAL_MAX_OVERS="${EVAL_MAX_OVERS:-5}"
BASE_MODEL="${BASE_MODEL:-Qwen/Qwen3-4B}"
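# The checkpoints below are assumed to be PEFT/LoRA adapters trained on top
# of BASE_MODEL; compare_eval.py loads the base weights and applies whatever
# is passed via --adapter.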
# 1. Baseline (untrained)
if [ ! -f eval_results/baseline.json ]; then
  echo "=== [baseline] running ==="
  python compare_eval.py \
    --model "$BASE_MODEL" \
    --label baseline \
    --episodes "$EVAL_EPISODES" \
    --max-overs "$EVAL_MAX_OVERS" \
    --opponent-mode heuristic \
    --output eval_results/baseline.json
fi
# 2. Each checkpoint
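# The globs cover both intermediate HF-style checkpoints (checkpoint-NNN) and
# the final per-stage adapter dirs; unmatched patterns fall through as literal
# strings and are filtered out by the -d test below.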
for ckpt in ./checkpoints/stage*/checkpoint-* ./checkpoints/stage*_final; do
  if [ -d "$ckpt" ]; then
    # Verify it's a PEFT adapter dir (has adapter_config.json)
    if [ ! -f "$ckpt/adapter_config.json" ]; then
      echo "=== [$ckpt] no adapter_config.json — skip ==="
      continue
    fi
    # Derive a filesystem-safe label from the path, e.g.
    # ./checkpoints/stage1/checkpoint-500 -> checkpoints_stage1_checkpoint-500
    label=$(echo "$ckpt" | sed 's|[/.]|_|g; s|^_*||')
    out="eval_results/${label}.json"
    if [ -f "$out" ]; then
      echo "=== [$ckpt] already evaluated, skipping ($out) ==="
      continue
    fi
echo "=== [$ckpt] evaluating ==="
python compare_eval.py \
--model "$BASE_MODEL" \
--adapter "$ckpt" \
--label "$label" \
--episodes "$EVAL_EPISODES" \
--max-overs "$EVAL_MAX_OVERS" \
--opponent-mode heuristic \
--output "$out"
fi
done
# 3. Pick the best one and run a final compare against baseline
echo ""
echo "=== picking best checkpoint by mean composite reward ==="
python - <<'PYEOF'
import glob
import json
import shutil

best = None
for path in glob.glob("eval_results/*.json"):
    # Skip the baseline and any stale copy of a previous "best" pick.
    if path.endswith(("baseline.json", "best_trained.json")):
        continue
    try:
        with open(path) as f:
            data = json.load(f)
        score = data["summary"].get("mean_composite_reward", 0.0) or 0.0
        win_rate = data["summary"].get("win_rate_overall", 0.0) or 0.0
        # Rank by mean composite reward plus a 0.5-weighted win-rate bonus, so
        # a checkpoint that wins more games can edge out one with a marginally
        # higher raw reward.
        composite_score = score + 0.5 * win_rate
        print(f"  {path:50s} composite={score:.4f} win_rate={win_rate:.4f} ranking={composite_score:.4f}")
        if best is None or composite_score > best[0]:
            best = (composite_score, path, data)
    except Exception as e:
        print(f"  {path}: skip ({e})")

if best:
    print(f"\nBEST: {best[1]} (composite_score={best[0]:.4f}, label={best[2].get('label')})")
    print(f"  adapter: {best[2].get('adapter')}")
    # Copy the winning result to a stable name for the final comparison below.
    shutil.copy(best[1], "eval_results/best_trained.json")
    print("\nCopied to eval_results/best_trained.json")
PYEOF
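# If no checkpoint produced a usable result, best_trained.json was never
# written and the comparison below would fail; bail out early instead.
if [ ! -f eval_results/best_trained.json ]; then
  echo "No trained-checkpoint results found; skipping final comparison."
  exit 1
fi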
echo ""
echo "=== final comparison: baseline vs best trained ==="
python compare_eval.py --compare eval_results/baseline.json eval_results/best_trained.json \
  | tee eval_results/final_comparison.txt