#!/usr/bin/env bash
# Hyperparameter sweep over 9 configs that try to push past grow40_winning's 0.2219.
#
# Each config grabs all 8 GPUs via accelerate, so they run sequentially.
# Output goes to logs/<name>.log (one file per config, named after the config's
# basename); the master log goes to logs/sweep_hparam_master.log.
# Reads HF_TOKEN, HUGGING_FACE_HUB_TOKEN, WANDB_API_KEY from the calling env.
#
# Launch in the background with:
#   nohup ./scripts/run_hparam_sweep.sh > logs/sweep_hparam_master.log 2>&1 &

# NOTE: `-e` is not set. A failing run must not abort the sweep; each run's
# exit status is captured explicitly and the loop moves on to the next config.
set -uo pipefail

# All paths below are relative to the parent of this script's directory.
# Without `-e`, a failed cd would otherwise go unnoticed and every launch
# would run from the wrong directory, so bail out explicitly.
cd "$(dirname "$0")/.." || {
  echo "error: cannot cd to $(dirname "$0")/.." >&2
  exit 1
}

CONFIGS=(
  "configs/sweep/A_resume_lr1e7_cos.toml"
  "configs/sweep/B_resume_lr5e8_cos.toml"
  "configs/sweep/C_resume_lr2e8_cos.toml"
  "configs/sweep/D_resume_lr1e7_const.toml"
  "configs/sweep/E_resume_lr5e8_b95.toml"
  "configs/sweep/F_cold_lr1e7_grow40.toml"
  "configs/sweep/G_cold_lr2e7_grow40.toml"
  "configs/sweep/H_cold_lr1e7_32L.toml"
  "configs/sweep/I_cold_paramgroups_grow40.toml"
)

LOG_DIR="logs"
# Every per-config redirect below needs this directory to exist.
mkdir -p "$LOG_DIR" || {
  echo "error: cannot create log directory $LOG_DIR" >&2
  exit 1
}

#######################################
# Print the last "Best eval KL" line found in a log file.
# Arguments: $1 - path to the log file
# Outputs:   the last matching line on stdout; nothing when the file is
#            unreadable or contains no such line
#######################################
last_best_kl_line() {
  local log_file=$1
  [[ -r "$log_file" ]] || return 0
  grep -E "Best eval KL" "$log_file" | tail -1
}

# Run the configs one after another; a failure is reported and the sweep
# continues with the next config.
for cfg in "${CONFIGS[@]}"; do
  name="$(basename "$cfg" .toml)"
  log="$LOG_DIR/$name.log"
  echo ">>> [$(date '+%F %T')] starting $name -> $log"

  # stdout and stderr of the run both go to the per-config log.
  .venv/bin/accelerate launch \
    --config_file configs/accelerate.yaml \
    distill.py \
    --config "$cfg" \
    > "$log" 2>&1
  rc=$?

  best_line=$(last_best_kl_line "$log")
  echo "<<< [$(date '+%F %T')] finished $name (exit=$rc) ${best_line}"

  # On failure, echo the tail of the run's log into this script's own output.
  if [[ $rc -ne 0 ]]; then
    echo " last 12 lines of $log:"
    tail -12 "$log" | sed 's/^/ /'
  fi
done

echo ">>> [$(date '+%F %T')] hparam sweep complete"
echo ">>> summary of best eval KLs:"
for cfg in "${CONFIGS[@]}"; do
  name="$(basename "$cfg" .toml)"
  log="$LOG_DIR/$name.log"
  # Keep only the text after "Best eval KL = "; a log without that line is
  # reported as FAILED regardless of the run's exit status.
  best=$(last_best_kl_line "$log" | sed 's/.*Best eval KL = //')
  printf " %-32s %s\n" "$name" "${best:-FAILED}"
done