#!/usr/bin/env bash
# Evaluate every saved checkpoint and find the best one.
#
# RL training is bumpy: the FINAL checkpoint isn't always the best policy.
# This script runs compare_eval.py against the baseline AND each checkpoint
# in ./checkpoints/, then picks whichever has the highest mean composite
# reward to be the "best trained" submission.
#
# Usage:
#   bash scripts/eval_all_checkpoints.sh                  # 10 episodes each
#   EVAL_EPISODES=20 bash scripts/eval_all_checkpoints.sh # more confidence
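#
# Each compare_eval.py output is assumed to be JSON shaped roughly like this
# (only the fields the ranking step below actually reads are shown):
#   {
#     "label": "baseline",
#     "adapter": null,
#     "summary": {"mean_composite_reward": 0.73, "win_rate_overall": 0.40}
#   }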
cd "$(dirname "$0")/.." || exit 1
mkdir -p eval_results

EVAL_EPISODES="${EVAL_EPISODES:-10}"
EVAL_MAX_OVERS="${EVAL_MAX_OVERS:-5}"
BASE_MODEL="${BASE_MODEL:-Qwen/Qwen3.5-4B}"

# 1. Baseline (untrained)
if [ ! -f eval_results/baseline.json ]; then
  echo "=== [baseline] running ==="
  python compare_eval.py \
    --model "$BASE_MODEL" \
    --label baseline \
    --episodes "$EVAL_EPISODES" \
    --max-overs "$EVAL_MAX_OVERS" \
    --opponent-mode heuristic \
    --output eval_results/baseline.json
fi
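# (Results are cached: delete a JSON from eval_results/ to force that run
# to repeat.)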

# 2. Each checkpoint
for ckpt in ./checkpoints/stage*/checkpoint-* ./checkpoints/stage*_final; do
  if [ -d "$ckpt" ]; then
    # Verify it's a PEFT adapter dir (has adapter_config.json)
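    # (PEFT/LoRA saves adapter_config.json next to the adapter weights, e.g.
    # adapter_model.safetensors; full-model checkpoint dirs won't have it.)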
    if [ ! -f "$ckpt/adapter_config.json" ]; then
      echo "=== [$ckpt] no adapter_config.json, skipping ==="
      continue
    fi

    # Use a safe filename derived from the path
    label=$(echo "$ckpt" | sed 's|[/.]|_|g; s|^_*||')
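    # e.g. ./checkpoints/stage1/checkpoint-100 -> checkpoints_stage1_checkpoint-100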
    out="eval_results/${label}.json"
    if [ -f "$out" ]; then
      echo "=== [$ckpt] already evaluated, skipping ($out) ==="
      continue
    fi

    echo "=== [$ckpt] evaluating ==="
    python compare_eval.py \
      --model "$BASE_MODEL" \
      --adapter "$ckpt" \
      --label "$label" \
      --episodes "$EVAL_EPISODES" \
      --max-overs "$EVAL_MAX_OVERS" \
      --opponent-mode heuristic \
      --output "$out"
  fi
done

# 3. Pick the best checkpoint and run a final compare against the baseline
echo ""
echo "=== picking best checkpoint by mean composite reward ==="
python - <<'PYEOF'
import glob
import json
import shutil

best = None
for path in glob.glob("eval_results/*.json"):
    if path.endswith("baseline.json"):
        continue
    try:
        with open(path) as f:
            data = json.load(f)
        score = data["summary"].get("mean_composite_reward", 0.0) or 0.0
        win_rate = data["summary"].get("win_rate_overall", 0.0) or 0.0
        # Composite ranking: composite reward + 0.5 * win_rate
        composite_score = score + 0.5 * win_rate
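        # e.g. composite=0.82, win_rate=0.60 -> ranking = 0.82 + 0.5*0.60 = 1.12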
| print(f" {path:50s} composite={score:.4f} win_rate={win_rate:.4f} ranking={composite_score:.4f}") | |
| if best is None or composite_score > best[0]: | |
| best = (composite_score, path, data) | |
| except Exception as e: | |
| print(f" {path}: skip ({e})") | |
| if best: | |
| print(f"\nBEST: {best[1]} (composite_score={best[0]:.4f}, label={best[2].get('label')})") | |
| print(f" adapter: {best[2].get('adapter')}") | |
| # Save a symlink/copy as 'best.json' for easy reference | |
| import shutil | |
| shutil.copy(best[1], "eval_results/best_trained.json") | |
| print(f"\nCopied to eval_results/best_trained.json") | |
| PYEOF | |
| echo "" | |
| echo "=== final comparison: baseline vs best trained ===" | |
| python compare_eval.py --compare eval_results/baseline.json eval_results/best_trained.json \ | |
| | tee eval_results/final_comparison.txt | |