#!/usr/bin/env bash
# Evaluate every saved checkpoint and find the best one.
#
# RL training is bumpy: the FINAL checkpoint isn't always the best policy.
# This script runs compare_eval.py against the baseline AND each checkpoint
# in ./checkpoints/, then picks whichever scores highest on mean composite
# reward plus a 0.5 * win-rate bonus as the "best trained" submission.
#
# Usage:
#   bash scripts/eval_all_checkpoints.sh                    # 10 episodes each
#   EVAL_EPISODES=20 bash scripts/eval_all_checkpoints.sh   # more confidence

cd "$(dirname "$0")/.." || exit 1
mkdir -p eval_results

EVAL_EPISODES="${EVAL_EPISODES:-10}"
EVAL_MAX_OVERS="${EVAL_MAX_OVERS:-5}"
BASE_MODEL="${BASE_MODEL:-Qwen/Qwen3.5-4B}"

# 1. Baseline (untrained base model, no adapter)
if [ ! -f eval_results/baseline.json ]; then
  echo "=== [baseline] running ==="
  python compare_eval.py \
    --model "$BASE_MODEL" \
    --label baseline \
    --episodes "$EVAL_EPISODES" \
    --max-overs "$EVAL_MAX_OVERS" \
    --opponent-mode heuristic \
    --output eval_results/baseline.json
fi

# 2. Each checkpoint
for ckpt in ./checkpoints/stage*/checkpoint-* ./checkpoints/stage*_final; do
  if [ -d "$ckpt" ]; then
    # Verify it's a PEFT adapter dir (has adapter_config.json).
    if [ ! -f "$ckpt/adapter_config.json" ]; then
      echo "=== [$ckpt] no adapter_config.json, skipping ==="
      continue
    fi

    # Derive a filesystem-safe label from the checkpoint path.
    label=$(echo "$ckpt" | sed -e 's|[/.]|_|g' -e 's|^_*||')
    out="eval_results/${label}.json"

    if [ -f "$out" ]; then
      echo "=== [$ckpt] already evaluated, skipping ($out exists) ==="
      continue
    fi

    echo "=== [$ckpt] evaluating ==="
    python compare_eval.py \
      --model "$BASE_MODEL" \
      --adapter "$ckpt" \
      --label "$label" \
      --episodes "$EVAL_EPISODES" \
      --max-overs "$EVAL_MAX_OVERS" \
      --opponent-mode heuristic \
      --output "$out"
  fi
done

# 3. Pick the best checkpoint and run a final compare against the baseline.
echo ""
echo "=== ranking checkpoints by composite reward + 0.5 * win rate ==="
python - <<'PYEOF'
import glob
import json
import shutil

best = None
for path in glob.glob("eval_results/*.json"):
    # Skip the baseline and any best_trained.json left over from a previous run.
    if path.endswith("baseline.json") or path.endswith("best_trained.json"):
        continue
    try:
        with open(path) as f:
            data = json.load(f)
        score = data["summary"].get("mean_composite_reward", 0.0) or 0.0
        win_rate = data["summary"].get("win_rate_overall", 0.0) or 0.0
        # Ranking score: mean composite reward plus a 0.5 * win-rate bonus.
        ranking = score + 0.5 * win_rate
        print(f"  {path:50s} composite={score:.4f} win_rate={win_rate:.4f} ranking={ranking:.4f}")
        if best is None or ranking > best[0]:
            best = (ranking, path, data)
    except Exception as e:
        print(f"  {path}: skip ({e})")

if best:
    print(f"\nBEST: {best[1]} (ranking={best[0]:.4f}, label={best[2].get('label')})")
    print(f"  adapter: {best[2].get('adapter')}")
    # Copy the winning result to best_trained.json for easy reference.
    shutil.copy(best[1], "eval_results/best_trained.json")
    print("\nCopied to eval_results/best_trained.json")
else:
    print("No checkpoint results found to rank.")
PYEOF

echo ""
echo "=== final comparison: baseline vs best trained ==="
if [ -f eval_results/best_trained.json ]; then
  python compare_eval.py --compare eval_results/baseline.json eval_results/best_trained.json \
    | tee eval_results/final_comparison.txt
else
  echo "No best_trained.json found; skipping final comparison."
fi
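
# Optional sanity check: dump the winning run's summary block. This is a minimal
# sketch that assumes compare_eval.py writes a top-level "summary" object (the
# same key the ranking step above reads); adjust if your output schema differs.
if [ -f eval_results/best_trained.json ]; then
  python - <<'PYEOF'
import json

# Pretty-print just the "summary" section of the best trained result.
with open("eval_results/best_trained.json") as f:
    print(json.dumps(json.load(f).get("summary", {}), indent=2))
PYEOF
fi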