distill-m-6a3lnzvb-code / scripts /run_hparam_sweep.sh
Delta-Vector's picture
add 9-config hparam sweep + new_layer_lr_mul param-groups support
3af7f4c verified
#!/usr/bin/env bash
# Hyperparameter sweep over 9 configs that try to push past grow40_winning's 0.2219.
#
# Each config grabs all 8 GPUs via accelerate, so they run sequentially.
# Output goes to logs/<run>.log; the master log goes to logs/sweep_hparam_master.log.
# Reads HF_TOKEN, HUGGING_FACE_HUB_TOKEN, WANDB_API_KEY from the calling env.
#
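# Launch in the background from the repo root (logs/ must already exist,
# because the shell opens the redirect before this script runs mkdir):
# nohup ./scripts/run_hparam_sweep.sh > logs/sweep_hparam_master.log 2>&1 &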
# -u: expanding an unset variable is an error.
# pipefail: a pipeline's status is non-zero if any stage fails.
# -e is not set; the sweep loop captures each run's exit status itself and
# continues with the next config.
set -uo pipefail

# Work from the repo root (parent of this script's directory) so the relative
# paths used below (configs/, logs/, .venv/, distill.py) resolve. Without -e a
# failed cd would otherwise go unnoticed and every run would launch from the
# wrong directory, so bail out explicitly.
cd -- "$(dirname -- "$0")/.." || {
  echo "error: cannot cd to repo root from $0" >&2
  exit 1
}

# One TOML file per sweep run; runs are executed in this order.
CONFIGS=(
  "configs/sweep/A_resume_lr1e7_cos.toml"
  "configs/sweep/B_resume_lr5e8_cos.toml"
  "configs/sweep/C_resume_lr2e8_cos.toml"
  "configs/sweep/D_resume_lr1e7_const.toml"
  "configs/sweep/E_resume_lr5e8_b95.toml"
  "configs/sweep/F_cold_lr1e7_grow40.toml"
  "configs/sweep/G_cold_lr2e7_grow40.toml"
  "configs/sweep/H_cold_lr1e7_32L.toml"
  "configs/sweep/I_cold_paramgroups_grow40.toml"
)
readonly CONFIGS

# Directory that receives one <name>.log per config.
readonly LOG_DIR="logs"
mkdir -p -- "$LOG_DIR" || {
  echo "error: cannot create log directory $LOG_DIR" >&2
  exit 1
}
# Run every config in CONFIGS sequentially. Each run's combined stdout/stderr
# goes to $LOG_DIR/<name>.log. A failing run does not stop the sweep: its exit
# status is stored in exit_codes (same index as in CONFIGS), the tail of its
# log is echoed, and the next config starts.
exit_codes=()
failed=0
for i in "${!CONFIGS[@]}"; do
  cfg="${CONFIGS[$i]}"
  name="$(basename "$cfg" .toml)"
  log="$LOG_DIR/$name.log"
  echo ">>> [$(date '+%F %T')] starting $name -> $log"
  .venv/bin/accelerate launch \
    --config_file configs/accelerate.yaml \
    distill.py \
    --config "$cfg" \
    > "$log" 2>&1
  rc=$?
  exit_codes[$i]=$rc
  # Last "Best eval KL" line the run logged; empty when there is none.
  best_line=$(grep -F "Best eval KL" "$log" | tail -1)
  echo "<<< [$(date '+%F %T')] finished $name (exit=$rc) ${best_line}"
  if [[ $rc -ne 0 ]]; then
    failed=$((failed + 1))
    echo " last 12 lines of $log:"
    tail -12 "$log" | sed 's/^/ /'
  fi
done

echo ">>> [$(date '+%F %T')] hparam sweep complete"
echo ">>> summary of best eval KLs:"
for i in "${!CONFIGS[@]}"; do
  name="$(basename "${CONFIGS[$i]}" .toml)"
  log="$LOG_DIR/$name.log"
  # Keep only the text after "Best eval KL = " from the last matching line.
  best=$(grep -F "Best eval KL" "$log" | tail -1 | sed 's/.*Best eval KL = //')
  # A run may log a KL and still exit non-zero afterwards; flag it so the
  # summary line is not mistaken for a clean result.
  status=""
  if [[ ${exit_codes[$i]} -ne 0 ]]; then
    status=" (exit=${exit_codes[$i]})"
  fi
  printf " %-32s %s%s\n" "$name" "${best:-FAILED}" "$status"
done

# Propagate failure to the caller: non-zero exit if any run exited non-zero.
if (( failed > 0 )); then
  echo ">>> $failed of ${#CONFIGS[@]} runs exited non-zero" >&2
  exit 1
fi