add phase-2 ultra-conservative sweep (J,K,L,M) + waiter that auto-launches after phase 1 from the best ckpt
# Phase 2 sweep: waits for phase 1 to finish, then resumes from whichever
# phase 1 run achieved the lowest eval KL. All configs use very small LRs
# and constant/very-slow schedules. Goal: monotone, very slow KL descent.
#
# Launch in the background with:
#   nohup ./scripts/run_phase2_sweep.sh > logs/sweep_phase2_master.log 2>&1 &
# Strict-ish mode. `-e` is deliberately left out: one failing phase 2 run must
# not abort the remaining configs (each run's exit code is captured and
# reported by the sweep loop instead).
set -uo pipefail

# Work from the parent of this script's directory so the relative paths used
# throughout (configs/, out/, logs/, .venv/) resolve. Without `-e` a failed cd
# would otherwise go unnoticed and the sweep would run from the wrong place.
cd -- "$(dirname -- "$0")/.." || {
  echo "!!! cannot cd to the parent of $(dirname -- "$0"); aborting" >&2
  exit 1
}

# Directory that receives one log file per sweep run.
LOG_DIR="logs"
mkdir -p -- "$LOG_DIR" || {
  echo "!!! cannot create log directory '$LOG_DIR'; aborting" >&2
  exit 1
}
# 1. Hold here until phase 1 is completely gone.

# Poll every 30 s until no running process has a command line matching the
# pattern in $1 (pgrep -f matches against the full command line).
block_while_running() {
  local cmdline_re=$1
  while pgrep -f "$cmdline_re" > /dev/null; do
    sleep 30
  done
}

echo ">>> [$(date '+%F %T')] phase2 waiter: waiting for phase 1 to finish..."
# First the phase 1 driver script itself ...
block_while_running "run_hparam_sweep.sh"
# ... then any straggler distill.py procs still running a phase 1 (A-I) config.
block_while_running "distill.py --config configs/sweep/[A-I]_"
echo ">>> [$(date '+%F %T')] phase2 waiter: phase 1 done."
# 2. Find phase 1's best ckpt.
#
# Scans out/sweep/[A-I]_*/best/best.json and picks the run with the lowest
# "eval_kl". Sets:
#   PHASE1_BEST  "<dir><TAB><eval_kl>" exactly as printed by the helper
#   BEST_DIR     directory that contains the winning best.json
#   BEST_KL      that run's eval_kl
# Aborts the whole script when no usable best.json exists: without `set -e`
# the failure would otherwise be ignored and phase 2 would be launched with
# nothing to resume from.
PHASE1_BEST=$(.venv/bin/python - <<'PY'
import glob
import json
import os
import sys

best_kl = float("inf")
best_dir = None
# sorted() keeps the winner deterministic when two runs tie on eval_kl.
for path in sorted(glob.glob("out/sweep/[A-I]_*/best/best.json")):
    try:
        with open(path) as fh:
            kl = float(json.load(fh)["eval_kl"])
    except Exception as exc:
        # Best-effort scan: an unreadable or malformed file only skips that run.
        print(f"skipping {path}: {exc!r}", file=sys.stderr)
        continue
    # A NaN eval_kl compares false here, so such a run can never be selected.
    if kl < best_kl:
        best_kl = kl
        best_dir = os.path.dirname(path)
if best_dir is None:
    sys.exit("no phase 1 best found")
print(f"{best_dir}\t{best_kl}")
PY
) || {
  echo "!!! [$(date '+%F %T')] phase2 waiter: no usable phase 1 checkpoint; aborting." >&2
  exit 1
}
# Split the single "<dir><TAB><kl>" line without spawning cut.
BEST_DIR=${PHASE1_BEST%%$'\t'*}
BEST_KL=${PHASE1_BEST#*$'\t'}
echo ">>> phase 1 best: $BEST_DIR (eval_kl=$BEST_KL)"
# 3. Symlink ./out/phase1_best -> the winner so phase 2 configs can reference
#    a stable path.
#
# Everything is validated BEFORE the existing link is touched: previously an
# empty/missing BEST_DIR removed the old link, failed to create a new one, and
# phase 2 was still launched.
if [[ -z "${BEST_DIR:-}" || ! -d "${BEST_DIR:-}" ]]; then
  echo "!!! phase 1 best dir '${BEST_DIR:-}' does not exist; not starting phase 2" >&2
  exit 1
fi
best_target=$(realpath -- "$BEST_DIR") || {
  echo "!!! cannot resolve '$BEST_DIR'; not starting phase 2" >&2
  exit 1
}
mkdir -p out
if [[ -d out/phase1_best && ! -L out/phase1_best ]]; then
  # With a real directory in the way, `ln -sfn` would quietly create the link
  # inside it instead of replacing it.
  echo "!!! out/phase1_best is a real directory, not a symlink; refusing to replace it" >&2
  exit 1
fi
# -f replaces an existing link/file, -n avoids following an existing link.
ln -sfn -- "$best_target" out/phase1_best || {
  echo "!!! failed to create out/phase1_best -> $best_target" >&2
  exit 1
}
echo ">>> linked out/phase1_best -> $(readlink out/phase1_best)"
# 4. Phase 2 proper: launch each config in turn, strictly one after another.
#    A failing run is reported (exit code + log tail) but never stops the sweep.
CONFIGS=(
  "configs/sweep/J_phase2_lr5e9_const.toml"
  "configs/sweep/K_phase2_lr2e8_const.toml"
  "configs/sweep/L_phase2_lr1e8_warmup500.toml"
  "configs/sweep/M_phase2_lr2e8_largebatch.toml"
)

for cfg in "${CONFIGS[@]}"; do
  # Run name = config file name without directory and .toml suffix.
  name=${cfg##*/}
  name=${name%.toml}
  log="$LOG_DIR/$name.log"

  echo ">>> [$(date '+%F %T')] starting $name -> $log"
  launch_cmd=(
    .venv/bin/accelerate launch
    --config_file configs/accelerate.yaml
    distill.py
    --config "$cfg"
  )
  # All output of the run (stdout and stderr) goes to its own log file.
  "${launch_cmd[@]}" > "$log" 2>&1
  rc=$?

  # The last "Best eval KL" line of the log, if any, is echoed with the result.
  best_line=$(grep -E "Best eval KL" "$log" | tail -n 1)
  echo "<<< [$(date '+%F %T')] finished $name (exit=$rc) ${best_line}"

  if (( rc == 0 )); then
    continue
  fi
  # Failed run: show the end of its log so the master log explains why.
  echo " last 12 lines of $log:"
  tail -n 12 "$log" | sed 's/^/ /'
done
echo ">>> [$(date '+%F %T')] phase2 sweep complete"
echo ">>> overall summary (phase 1 + phase 2):"

#######################################
# Print one " <run name> <best KL>" row per sweep log, lowest KL first.
# A run whose log has no "Best eval KL = ..." line is shown as FAILED and
# listed after all successful runs.
# Arguments: $1 - directory holding the [A-M]_*.log files
# Outputs:   the rows on stdout; nothing when no log matches
#######################################
print_sweep_summary() {
  local log_dir=$1
  local log name best rank
  for log in "$log_dir"/[A-M]_*.log; do
    # An unmatched glob stays literal; skip it instead of reporting a bogus run.
    [[ -f "$log" ]] || continue
    name=${log##*/}
    name=${name%.log}
    best=$(grep -E "Best eval KL" "$log" 2>/dev/null | tail -1 | sed 's/.*Best eval KL = //')
    # rank 0 = has a KL value, rank 1 = failed; keeps failures at the bottom.
    if [[ -n "$best" ]]; then rank=0; else rank=1; fi
    printf '%s\t%s\t%s\n' "$rank" "$name" "${best:-FAILED}"
  done \
    | LC_ALL=C sort -t $'\t' -k1,1n -k3,3g -k2,2 \
    | while IFS=$'\t' read -r rank name best; do
        printf " %-32s %s\n" "$name" "$best"
      done
  # Sorting happens on tab-separated fields BEFORE padding: sorting the padded
  # text made the order depend on name length, and -g (general numeric) is
  # needed so values such as 1.2e-05 compare by magnitude.
}

print_sweep_summary "${LOG_DIR:-logs}"