add phase-2 ultra-conservative sweep (J,K,L,M) + waiter that auto-launches after phase 1 from the best ckpt
Browse files
configs/sweep/J_phase2_lr5e9_const.toml
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Phase 2: ultra-conservative resume from phase 1 best.
# Tiny LR, constant, zero warmup, very high beta2 for max smoothing.
# NOTE(review): betas[1] = 0.99 below is actually *lower* than AdamW's default
# 0.999, so "very high beta2" above looks stale — confirm intent. eps = 1.0e-2
# is also far above the usual 1e-8; presumably deliberate damping, but verify.

[model]
teacher = "Qwen/Qwen3.5-35B-A3B"
student = "./out/phase1_best"          # stable symlink set by run_phase2_sweep.sh
tokenizer = "Qwen/Qwen3.5-35B-A3B"

[data]
dataset = "karpathy/climbmix-400b-shuffle"
text_field = "text"
min_chars = 2560
max_seq_len = 2048
kl_start_pos = 128
seed = 6767
shuffle_buffer = 10000

[train]
seed = 6767
lr = 5.0e-9
schedule = "constant"
warmup_steps = 0
weight_decay = 0.0
grad_clip = 1.0
betas = [0.9, 0.99]
eps = 1.0e-2
samples_per_step = 4
micro_batch_size = 4
max_steps = 3000
grad_checkpointing = true
attn_implementation = "flash_attention_2"
student_dtype = "bfloat16"
teacher_dtype = "bfloat16"
mixed_precision = "bf16"
kl_chunk_size = 256
new_layer_lr_mul = 1.0

[eval]
every_steps = 50
samples = 500
seed = 4242

[log]
wandb = true
wandb_project = "distil-subnet97"
wandb_run = "J_phase2_lr5e9_const"
log_every = 1
output_dir = "./out/sweep/J_phase2_lr5e9_const"

[init]
zero_layers = []
target_num_layers = 40
configs/sweep/K_phase2_lr2e8_const.toml
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Phase 2: still conservative but a bit more LR than J (2e-8 vs 5e-9),
# and a standard-ish eps (1e-3 vs J's 1e-2). Everything else matches J.

[model]
teacher = "Qwen/Qwen3.5-35B-A3B"
student = "./out/phase1_best"          # stable symlink set by run_phase2_sweep.sh
tokenizer = "Qwen/Qwen3.5-35B-A3B"

[data]
dataset = "karpathy/climbmix-400b-shuffle"
text_field = "text"
min_chars = 2560
max_seq_len = 2048
kl_start_pos = 128
seed = 6767
shuffle_buffer = 10000

[train]
seed = 6767
lr = 2.0e-8
schedule = "constant"
warmup_steps = 0
weight_decay = 0.0
grad_clip = 1.0
betas = [0.9, 0.99]
eps = 1.0e-3
samples_per_step = 4
micro_batch_size = 4
max_steps = 3000
grad_checkpointing = true
attn_implementation = "flash_attention_2"
student_dtype = "bfloat16"
teacher_dtype = "bfloat16"
mixed_precision = "bf16"
kl_chunk_size = 256
new_layer_lr_mul = 1.0

[eval]
every_steps = 50
samples = 500
seed = 4242

[log]
wandb = true
wandb_project = "distil-subnet97"
wandb_run = "K_phase2_lr2e8_const"
log_every = 1
output_dir = "./out/sweep/K_phase2_lr2e8_const"

[init]
zero_layers = []
target_num_layers = 40
configs/sweep/L_phase2_lr1e8_warmup500.toml
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Phase 2: very gentle cosine warmup over 500 steps to avoid any LR shock.

[model]
teacher = "Qwen/Qwen3.5-35B-A3B"
student = "./out/phase1_best"          # stable symlink set by run_phase2_sweep.sh
tokenizer = "Qwen/Qwen3.5-35B-A3B"

[data]
dataset = "karpathy/climbmix-400b-shuffle"
text_field = "text"
min_chars = 2560
max_seq_len = 2048
kl_start_pos = 128
seed = 6767
shuffle_buffer = 10000

[train]
seed = 6767
lr = 1.0e-8
schedule = "cosine"
warmup_steps = 500
weight_decay = 0.0
grad_clip = 1.0
betas = [0.9, 0.99]
eps = 1.0e-3
samples_per_step = 4
micro_batch_size = 4
max_steps = 3000
grad_checkpointing = true
attn_implementation = "flash_attention_2"
student_dtype = "bfloat16"
teacher_dtype = "bfloat16"
mixed_precision = "bf16"
kl_chunk_size = 256
new_layer_lr_mul = 1.0

[eval]
every_steps = 50
samples = 500
seed = 4242

[log]
wandb = true
wandb_project = "distil-subnet97"
wandb_run = "L_phase2_lr1e8_warmup500"
log_every = 1
output_dir = "./out/sweep/L_phase2_lr1e8_warmup500"

[init]
zero_layers = []
target_num_layers = 40
configs/sweep/M_phase2_lr2e8_largebatch.toml
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Phase 2: same tiny LR but larger inner batch (16/rank -> effective 128) so the
# gradients are much smoother. Should give the smoothest descent of all.
# NOTE(review): "effective 128" assumes 8 ranks (16 samples_per_step x 8) —
# confirm against configs/accelerate.yaml.

[model]
teacher = "Qwen/Qwen3.5-35B-A3B"
student = "./out/phase1_best"          # stable symlink set by run_phase2_sweep.sh
tokenizer = "Qwen/Qwen3.5-35B-A3B"

[data]
dataset = "karpathy/climbmix-400b-shuffle"
text_field = "text"
min_chars = 2560
max_seq_len = 2048
kl_start_pos = 128
seed = 6767
shuffle_buffer = 10000

[train]
seed = 6767
lr = 2.0e-8
schedule = "constant"
warmup_steps = 0
weight_decay = 0.0
grad_clip = 1.0
betas = [0.9, 0.99]
eps = 1.0e-3
samples_per_step = 16
micro_batch_size = 1
max_steps = 2000
grad_checkpointing = true
attn_implementation = "flash_attention_2"
student_dtype = "bfloat16"
teacher_dtype = "bfloat16"
mixed_precision = "bf16"
kl_chunk_size = 256
new_layer_lr_mul = 1.0

[eval]
every_steps = 50
samples = 500
seed = 4242

[log]
wandb = true
wandb_project = "distil-subnet97"
wandb_run = "M_phase2_lr2e8_largebatch"
log_every = 1
output_dir = "./out/sweep/M_phase2_lr2e8_largebatch"

[init]
zero_layers = []
target_num_layers = 40
scripts/backup_to_hf.py
CHANGED
|
@@ -31,11 +31,16 @@ INCLUDE = [
|
|
| 31 |
"configs/sweep/G_cold_lr2e7_grow40.toml",
|
| 32 |
"configs/sweep/H_cold_lr1e7_32L.toml",
|
| 33 |
"configs/sweep/I_cold_paramgroups_grow40.toml",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
"configs/accelerate.yaml",
|
| 35 |
"scripts/backup_to_hf.py",
|
| 36 |
"scripts/run_sweep.sh",
|
| 37 |
"scripts/run_sweep_rerun.sh",
|
| 38 |
"scripts/run_hparam_sweep.sh",
|
|
|
|
| 39 |
"pyproject.toml",
|
| 40 |
"requirements.lock.txt",
|
| 41 |
]
|
|
|
|
| 31 |
"configs/sweep/G_cold_lr2e7_grow40.toml",
|
| 32 |
"configs/sweep/H_cold_lr1e7_32L.toml",
|
| 33 |
"configs/sweep/I_cold_paramgroups_grow40.toml",
|
| 34 |
+
"configs/sweep/J_phase2_lr5e9_const.toml",
|
| 35 |
+
"configs/sweep/K_phase2_lr2e8_const.toml",
|
| 36 |
+
"configs/sweep/L_phase2_lr1e8_warmup500.toml",
|
| 37 |
+
"configs/sweep/M_phase2_lr2e8_largebatch.toml",
|
| 38 |
"configs/accelerate.yaml",
|
| 39 |
"scripts/backup_to_hf.py",
|
| 40 |
"scripts/run_sweep.sh",
|
| 41 |
"scripts/run_sweep_rerun.sh",
|
| 42 |
"scripts/run_hparam_sweep.sh",
|
| 43 |
+
"scripts/run_phase2_sweep.sh",
|
| 44 |
"pyproject.toml",
|
| 45 |
"requirements.lock.txt",
|
| 46 |
]
|
scripts/run_phase2_sweep.sh
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/usr/bin/env bash
# Phase 2 sweep: waits for phase 1 to finish, then resumes from whichever
# phase 1 run achieved the lowest eval KL. All configs use very small LRs
# and constant/very-slow schedules. Goal: monotone, very slow KL descent.
#
# Launch in the background with:
#   nohup ./scripts/run_phase2_sweep.sh > logs/sweep_phase2_master.log 2>&1 &

# Deliberately no `set -e`: a single failed phase 2 run must not abort the
# remaining configs in the sweep loop below.
set -uo pipefail
cd "$(dirname "$0")/.." || exit 1

LOG_DIR="logs"
mkdir -p "$LOG_DIR"

# 1. Wait for phase 1 to finish.
# This script's own name does not match either pattern, so pgrep will not
# see itself and spin forever.
echo ">>> [$(date '+%F %T')] phase2 waiter: waiting for phase 1 to finish..."
while pgrep -f "run_hparam_sweep.sh" > /dev/null; do
  sleep 30
done
# Also wait for any straggler distill.py procs from phase 1 to die.
while pgrep -f "distill.py --config configs/sweep/[A-I]_" > /dev/null; do
  sleep 30
done
echo ">>> [$(date '+%F %T')] phase2 waiter: phase 1 done."

# 2. Find phase 1's best ckpt (lowest eval_kl across all best.json files).
# BUGFIX: the heredoc's sys.exit("no phase 1 best found") makes the command
# substitution fail, but with no `set -e` the script previously continued with
# an empty BEST_DIR and symlinked phase1_best to $(realpath "") — i.e. the
# repo root. Abort explicitly instead.
PHASE1_BEST=$(.venv/bin/python - <<'PY'
import json, glob, os, sys
best_kl = float("inf")
best_dir = None
for f in glob.glob("out/sweep/[A-I]_*/best/best.json"):
    try:
        kl = json.load(open(f))["eval_kl"]
    except Exception:
        continue
    if kl < best_kl:
        best_kl = kl
        best_dir = os.path.dirname(f)
if best_dir is None:
    sys.exit("no phase 1 best found")
print(f"{best_dir}\t{best_kl}")
PY
) || { echo "!!! phase2: no usable phase 1 best checkpoint; aborting." >&2; exit 1; }
BEST_DIR=$(echo "$PHASE1_BEST" | cut -f1)
BEST_KL=$(echo "$PHASE1_BEST" | cut -f2)
[[ -n "$BEST_DIR" ]] || { echo "!!! phase2: empty best dir; aborting." >&2; exit 1; }
echo ">>> phase 1 best: $BEST_DIR (eval_kl=$BEST_KL)"

# 3. Symlink ./out/phase1_best -> the winner so phase 2 configs can reference
#    a stable path.
mkdir -p out
rm -f out/phase1_best
ln -sfn "$(realpath "$BEST_DIR")" out/phase1_best
echo ">>> linked out/phase1_best -> $(readlink out/phase1_best)"

# 4. Run phase 2 configs sequentially.
CONFIGS=(
  "configs/sweep/J_phase2_lr5e9_const.toml"
  "configs/sweep/K_phase2_lr2e8_const.toml"
  "configs/sweep/L_phase2_lr1e8_warmup500.toml"
  "configs/sweep/M_phase2_lr2e8_largebatch.toml"
)

for cfg in "${CONFIGS[@]}"; do
  name="$(basename "$cfg" .toml)"
  log="$LOG_DIR/$name.log"
  echo ">>> [$(date '+%F %T')] starting $name -> $log"
  .venv/bin/accelerate launch \
    --config_file configs/accelerate.yaml \
    distill.py \
    --config "$cfg" \
    > "$log" 2>&1
  rc=$?
  # Summarize the run; grep may legitimately find nothing on a crashed run.
  best_line=$(grep -E "Best eval KL" "$log" | tail -1)
  echo "<<< [$(date '+%F %T')] finished $name (exit=$rc) ${best_line}"
  if [[ $rc -ne 0 ]]; then
    echo "    last 12 lines of $log:"
    tail -12 "$log" | sed 's/^/      /'
  fi
done

echo ">>> [$(date '+%F %T')] phase2 sweep complete"
echo ">>> overall summary (phase 1 + phase 2):"
# BUGFIX: quote $LOG_DIR and skip the literal pattern when nothing matches
# (bash leaves an unmatched glob as-is without nullglob, which previously
# produced a bogus "[A-M]_* FAILED" row).
for log in "$LOG_DIR"/[A-M]_*.log; do
  [[ -e "$log" ]] || continue
  name=$(basename "$log" .log)
  best=$(grep -E "Best eval KL" "$log" 2>/dev/null | tail -1 | sed 's/.*Best eval KL = //')
  printf "  %-32s %s\n" "$name" "${best:-FAILED}"
done | sort -k2