#!/usr/bin/env bash
#
# Re-runs the two configs that OOM'd in the original sweep, now with the
# chunked-KL fix and PYTORCH_CUDA_ALLOC_CONF=expandable_segments in distill.py.
# (Fix landed in commit eb5278f: chunked KL with checkpointing, new
# kl_chunk_size config key; verified.)
#
# Reads HF_TOKEN, HUGGING_FACE_HUB_TOKEN, WANDB_API_KEY from the calling env.
#
# Launch with:
#   nohup ./scripts/run_sweep_rerun.sh > logs/sweep_rerun_master.log 2>&1 &
# NOTE: '-e' is deliberately omitted — a failed config must not abort the
# remaining reruns; we capture each exit code explicitly instead.
set -uo pipefail

# Run from the repo root (script lives in scripts/). Guard the cd: without
# 'set -e' a failed cd would otherwise launch jobs from the wrong directory.
cd "$(dirname "$0")/.." || { echo "cannot cd to repo root" >&2; exit 1; }

# The two configs that OOM'd in the original sweep.
CONFIGS=(
  "configs/replicate_zero4.toml"
  "configs/grow40_winning.toml"
)

LOG_DIR="logs"
mkdir -p "$LOG_DIR"

for cfg in "${CONFIGS[@]}"; do
  name="$(basename "$cfg" .toml)"
  log="$LOG_DIR/$name.log"

  echo ">>> [$(date '+%F %T')] starting $name -> $log"

  # Each run's full output goes to its own per-config log file.
  .venv/bin/accelerate launch \
    --config_file configs/accelerate.yaml \
    distill.py \
    --config "$cfg" \
    > "$log" 2>&1
  rc=$?

  echo "<<< [$(date '+%F %T')] finished $name (exit=$rc)"

  # On failure, surface the tail of the per-config log in the master log
  # so the nohup output alone is enough for a first diagnosis.
  if [[ $rc -ne 0 ]]; then
    echo "    last 20 lines of $log:"
    tail -n 20 "$log" | sed 's/^/    /'
  fi
done

echo ">>> [$(date '+%F %T')] rerun complete"