#!/usr/bin/env bash
# Evaluate every saved checkpoint and find the best one.
#
# RL training is bumpy: the FINAL checkpoint isn't always the best policy.
# This script runs compare_eval.py against the baseline AND each checkpoint
# in ./checkpoints/, then picks whichever has the highest mean composite
# reward to be the "best trained" submission.
#
# Usage:
#   bash scripts/eval_all_checkpoints.sh                  # 10 episodes each
#   EVAL_EPISODES=20 bash scripts/eval_all_checkpoints.sh # more confidence
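#
# Each compare_eval.py output is assumed to be JSON shaped roughly like this
# (only the fields the ranking step below actually reads are shown):
#   {
#     "label": "baseline",
#     "adapter": null,
#     "summary": {"mean_composite_reward": 0.73, "win_rate_overall": 0.40}
#   }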
cd "$(dirname "$0")/.." || exit 1
mkdir -p eval_results

EVAL_EPISODES="${EVAL_EPISODES:-10}"
EVAL_MAX_OVERS="${EVAL_MAX_OVERS:-5}"
BASE_MODEL="${BASE_MODEL:-Qwen/Qwen3.5-4B}"

# 1. Baseline (untrained)
if [ ! -f eval_results/baseline.json ]; then
  echo "=== [baseline] running ==="
  python compare_eval.py \
    --model "$BASE_MODEL" \
    --label baseline \
    --episodes "$EVAL_EPISODES" \
    --max-overs "$EVAL_MAX_OVERS" \
    --opponent-mode heuristic \
    --output eval_results/baseline.json
fi
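# (Results are cached: delete a JSON from eval_results/ to force that run
# to repeat.)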

# 2. Each checkpoint
for ckpt in ./checkpoints/stage*/checkpoint-* ./checkpoints/stage*_final; do
  if [ -d "$ckpt" ]; then
    # Verify it's a PEFT adapter dir (has adapter_config.json)
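    # (PEFT/LoRA saves adapter_config.json next to the adapter weights, e.g.
    # adapter_model.safetensors; full-model checkpoint dirs won't have it.)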
    if [ ! -f "$ckpt/adapter_config.json" ]; then
      echo "=== [$ckpt] no adapter_config.json, skipping ==="
      continue
    fi

    # Use a safe filename derived from the path
    label=$(echo "$ckpt" | sed 's|[/.]|_|g; s|^_*||')
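    # e.g. ./checkpoints/stage1/checkpoint-100 -> checkpoints_stage1_checkpoint-100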
    out="eval_results/${label}.json"
    if [ -f "$out" ]; then
      echo "=== [$ckpt] already evaluated, skipping ($out) ==="
      continue
    fi

    echo "=== [$ckpt] evaluating ==="
    python compare_eval.py \
      --model "$BASE_MODEL" \
      --adapter "$ckpt" \
      --label "$label" \
      --episodes "$EVAL_EPISODES" \
      --max-overs "$EVAL_MAX_OVERS" \
      --opponent-mode heuristic \
      --output "$out"
  fi
done

# 3. Pick the best checkpoint and run a final compare against the baseline
echo ""
echo "=== picking best checkpoint by mean composite reward ==="
python - <<'PYEOF'
import glob
import json
import shutil

best = None
for path in glob.glob("eval_results/*.json"):
    if path.endswith("baseline.json"):
        continue
    try:
        with open(path) as f:
            data = json.load(f)
        score = data["summary"].get("mean_composite_reward", 0.0) or 0.0
        win_rate = data["summary"].get("win_rate_overall", 0.0) or 0.0
        # Composite ranking: composite reward + 0.5 * win_rate
        composite_score = score + 0.5 * win_rate
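        # e.g. composite=0.82, win_rate=0.60 -> ranking = 0.82 + 0.5*0.60 = 1.12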
| print(f" {path:50s} composite={score:.4f} win_rate={win_rate:.4f} ranking={composite_score:.4f}") | |
| if best is None or composite_score > best[0]: | |
| best = (composite_score, path, data) | |
| except Exception as e: | |
| print(f" {path}: skip ({e})") | |
| if best: | |
| print(f"\nBEST: {best[1]} (composite_score={best[0]:.4f}, label={best[2].get('label')})") | |
| print(f" adapter: {best[2].get('adapter')}") | |
| # Save a symlink/copy as 'best.json' for easy reference | |
| import shutil | |
| shutil.copy(best[1], "eval_results/best_trained.json") | |
| print(f"\nCopied to eval_results/best_trained.json") | |
| PYEOF | |
| echo "" | |
| echo "=== final comparison: baseline vs best trained ===" | |
| python compare_eval.py --compare eval_results/baseline.json eval_results/best_trained.json \ | |
| | tee eval_results/final_comparison.txt | |