Delta-Vector committed on
Commit
3f04365
·
verified ·
1 Parent(s): f6e42f8

add grow_layers, sweep configs (replicate_zero4, grow40_winning, grow40_simple), sweep runner

Browse files
configs/base.toml CHANGED
@@ -28,6 +28,9 @@ samples_per_step = 4
28
  max_steps = 5
29
  grad_checkpointing = true
30
  attn_implementation = "flash_attention_2"
 
 
 
31
 
32
  [eval]
33
  every_steps = 5
@@ -42,4 +45,5 @@ log_every = 1
42
  output_dir = "./out/smoketest"
43
 
44
  [init]
45
- zero_layers = []
 
 
28
  max_steps = 5
29
  grad_checkpointing = true
30
  attn_implementation = "flash_attention_2"
31
+ student_dtype = "bfloat16"
32
+ teacher_dtype = "bfloat16"
33
+ mixed_precision = "bf16"
34
 
35
  [eval]
36
  every_steps = 5
 
45
  output_dir = "./out/smoketest"
46
 
47
  [init]
48
+ zero_layers = []
49
+ target_num_layers = 32
configs/grow40_simple.toml ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Grow student to 40 layers with the current (bf16, seq=640) hparams.
2
+ # Tests the architectural change in isolation without the winning hparams,
3
+ # so we can attribute any improvement.
4
+
5
+ [model]
6
+ teacher = "Qwen/Qwen3.5-35B-A3B"
7
+ student = "Troiaaa/m-6a3lnzvb"
8
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
9
+
10
+ [data]
11
+ dataset = "karpathy/climbmix-400b-shuffle"
12
+ text_field = "text"
13
+ min_chars = 2560
14
+ max_seq_len = 640
15
+ kl_start_pos = 128
16
+ seed = 42
17
+ shuffle_buffer = 10000
18
+
19
+ [train]
20
+ seed = 42
21
+ lr = 5.0e-7
22
+ schedule = "constant"
23
+ warmup_steps = 0
24
+ weight_decay = 0.0
25
+ grad_clip = 1.0
26
+ betas = [0.9, 0.95]
27
+ eps = 1.0e-8
28
+ samples_per_step = 8
29
+ max_steps = 2000
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ mixed_precision = "bf16"
35
+
36
+ [eval]
37
+ every_steps = 50
38
+ samples = 64
39
+ seed = 1234
40
+
41
+ [log]
42
+ wandb = true
43
+ wandb_project = "distil-subnet97"
44
+ wandb_run = "grow40_simple"
45
+ log_every = 1
46
+ output_dir = "./out/grow40_simple"
47
+
48
+ [init]
49
+ zero_layers = []
50
+ target_num_layers = 40
configs/grow40_winning.toml ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Grow student to 40 layers AND apply the winning hparams from zero4_long.
2
+ # New layers (32-39) are appended at the end with output projections zeroed
3
+ # (identity at init, gradients still flow). No layer zeroing.
4
+
5
+ [model]
6
+ teacher = "Qwen/Qwen3.5-35B-A3B"
7
+ student = "Troiaaa/m-6a3lnzvb"
8
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
9
+
10
+ [data]
11
+ dataset = "karpathy/climbmix-400b-shuffle"
12
+ text_field = "text"
13
+ min_chars = 2560
14
+ max_seq_len = 2048
15
+ kl_start_pos = 128
16
+ seed = 6767
17
+ shuffle_buffer = 10000
18
+
19
+ [train]
20
+ seed = 6767
21
+ lr = 5.0e-7
22
+ schedule = "cosine"
23
+ warmup_steps = 100
24
+ weight_decay = 0.0
25
+ grad_clip = 1.0
26
+ betas = [0.9, 0.999]
27
+ eps = 1.0e-3
28
+ samples_per_step = 4
29
+ max_steps = 2000
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+ student_dtype = "float32"
33
+ teacher_dtype = "bfloat16"
34
+ mixed_precision = "bf16"
35
+
36
+ [eval]
37
+ every_steps = 50
38
+ samples = 500
39
+ seed = 4242
40
+
41
+ [log]
42
+ wandb = true
43
+ wandb_project = "distil-subnet97"
44
+ wandb_run = "grow40_winning"
45
+ log_every = 1
46
+ output_dir = "./out/grow40_winning"
47
+
48
+ [init]
49
+ zero_layers = []
50
+ target_num_layers = 40
configs/replicate_zero4.toml ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Replicates wandb run "zero4_long" (mepqfry1, eval kl 0.275).
2
+ # Same hparams as that run; same 4-layer zero (14-17). 32-layer student.
3
+
4
+ [model]
5
+ teacher = "Qwen/Qwen3.5-35B-A3B"
6
+ student = "Troiaaa/m-6a3lnzvb"
7
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 2560
13
+ max_seq_len = 2048
14
+ kl_start_pos = 128
15
+ seed = 6767
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 6767
20
+ lr = 5.0e-7
21
+ schedule = "cosine"
22
+ warmup_steps = 100
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.999]
26
+ eps = 1.0e-3
27
+ samples_per_step = 4
28
+ max_steps = 2000
29
+ grad_checkpointing = true
30
+ attn_implementation = "flash_attention_2"
31
+ student_dtype = "float32"
32
+ teacher_dtype = "bfloat16"
33
+ mixed_precision = "bf16"
34
+
35
+ [eval]
36
+ every_steps = 50
37
+ samples = 500
38
+ seed = 4242
39
+
40
+ [log]
41
+ wandb = true
42
+ wandb_project = "distil-subnet97"
43
+ wandb_run = "replicate_zero4"
44
+ log_every = 1
45
+ output_dir = "./out/replicate_zero4"
46
+
47
+ [init]
48
+ zero_layers = [14, 15, 16, 17]
49
+ target_num_layers = 32
configs/zero_14_17.toml CHANGED
@@ -29,6 +29,9 @@ samples_per_step = 8
29
  max_steps = 2000
30
  grad_checkpointing = true
31
  attn_implementation = "flash_attention_2"
 
 
 
32
 
33
  [eval]
34
  every_steps = 50
@@ -43,4 +46,5 @@ log_every = 1
43
  output_dir = "./out/zero_14_17"
44
 
45
  [init]
46
- zero_layers = [14, 15, 16, 17]
 
 
29
  max_steps = 2000
30
  grad_checkpointing = true
31
  attn_implementation = "flash_attention_2"
32
+ student_dtype = "bfloat16"
33
+ teacher_dtype = "bfloat16"
34
+ mixed_precision = "bf16"
35
 
36
  [eval]
37
  every_steps = 50
 
46
  output_dir = "./out/zero_14_17"
47
 
48
  [init]
49
+ zero_layers = [14, 15, 16, 17]
50
+ target_num_layers = 32
distill.py CHANGED
@@ -62,12 +62,26 @@ REQUIRED_KEYS = {
62
  "max_steps",
63
  "grad_checkpointing",
64
  "attn_implementation",
 
 
 
65
  ),
66
  "eval": ("every_steps", "samples", "seed"),
67
  "log": ("wandb", "wandb_project", "wandb_run", "log_every", "output_dir"),
68
- "init": ("zero_layers",),
69
  }
70
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
  def load_config(path):
73
  with open(path, "rb") as f:
@@ -117,9 +131,82 @@ def zero_layers(model, layer_indices):
117
  return n
118
 
119
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  def load_student(model_id, dtype, grad_ckpt, attn_impl):
121
  from transformers import AutoModelForCausalLM
122
- log.info(f"Loading student: {model_id}")
123
  model = AutoModelForCausalLM.from_pretrained(
124
  model_id,
125
  dtype=dtype,
@@ -142,7 +229,7 @@ def load_teacher(model_id, dtype, attn_impl):
142
  archs = list(getattr(cfg, "architectures", []) or [])
143
  arch = archs[0] if archs else ""
144
  is_multimodal = "ConditionalGeneration" in arch or "ImageText" in arch
145
- log.info(f"Loading teacher: {model_id} (arch={arch}, multimodal={is_multimodal})")
146
 
147
  if is_multimodal:
148
  from transformers import AutoModelForImageTextToText
@@ -354,12 +441,13 @@ def main():
354
 
355
  cfg = load_config(args.config)
356
 
357
- accelerator = Accelerator(mixed_precision="bf16")
358
  set_seed(cfg["train"]["seed"])
359
 
360
  if accelerator.is_main_process:
361
  log.info(f"Loaded config from {args.config}")
362
  log.info(f"World size: {accelerator.num_processes}")
 
363
 
364
  # ---- Tokenizer
365
  from transformers import AutoTokenizer
@@ -368,21 +456,31 @@ def main():
368
  tokenizer.pad_token = tokenizer.eos_token
369
  pad_id = tokenizer.pad_token_id
370
 
371
- # ---- Models
372
- dtype = torch.bfloat16
 
373
  student = load_student(
374
  cfg["model"]["student"],
375
- dtype,
376
  grad_ckpt=cfg["train"]["grad_checkpointing"],
377
  attn_impl=cfg["train"]["attn_implementation"],
378
  )
379
  teacher = load_teacher(
380
  cfg["model"]["teacher"],
381
- dtype,
382
  attn_impl=cfg["train"]["attn_implementation"],
383
  )
384
 
385
- # ---- Layer modifications (post-load, pre-prepare)
 
 
 
 
 
 
 
 
 
386
  zero_idx = cfg["init"]["zero_layers"]
387
  if zero_idx:
388
  n = zero_layers(student, zero_idx)
 
62
  "max_steps",
63
  "grad_checkpointing",
64
  "attn_implementation",
65
+ "student_dtype",
66
+ "teacher_dtype",
67
+ "mixed_precision",
68
  ),
69
  "eval": ("every_steps", "samples", "seed"),
70
  "log": ("wandb", "wandb_project", "wandb_run", "log_every", "output_dir"),
71
+ "init": ("zero_layers", "target_num_layers"),
72
  }
73
 
74
# Config-string -> torch dtype lookup used by parse_dtype below.
DTYPE_MAP = {
    "float32": torch.float32,
    "bfloat16": torch.bfloat16,
}


def parse_dtype(s):
    """Translate a dtype string from the TOML config into a torch.dtype.

    Rejects anything outside DTYPE_MAP so a config typo fails loudly at
    startup instead of partway through a run.
    """
    dtype = DTYPE_MAP.get(s)
    if dtype is None:
        raise ValueError(f"unknown dtype {s!r}; must be one of {list(DTYPE_MAP)}")
    return dtype
84
+
85
 
86
  def load_config(path):
87
  with open(path, "rb") as f:
 
131
  return n
132
 
133
 
134
+ def _zero_output_projections(layer):
135
+ """Zero out attention and MLP output projections so the layer is identity
136
+ at init while still allowing gradients to flow into o_proj/down_proj first
137
+ (and from there back into the rest of the layer's params after one step).
138
+
139
+ Knows about Qwen3.5 names: self_attn.o_proj (full attention),
140
+ linear_attn.out_proj (linear attention), mlp.down_proj.
141
+ """
142
+ zeroed = []
143
+ with torch.no_grad():
144
+ if hasattr(layer, "self_attn") and hasattr(layer.self_attn, "o_proj"):
145
+ layer.self_attn.o_proj.weight.zero_()
146
+ zeroed.append("self_attn.o_proj")
147
+ if hasattr(layer, "linear_attn") and hasattr(layer.linear_attn, "out_proj"):
148
+ layer.linear_attn.out_proj.weight.zero_()
149
+ zeroed.append("linear_attn.out_proj")
150
+ if hasattr(layer, "mlp") and hasattr(layer.mlp, "down_proj"):
151
+ layer.mlp.down_proj.weight.zero_()
152
+ zeroed.append("mlp.down_proj")
153
+ return zeroed
154
+
155
+
156
def grow_layers(model, target_n):
    """Grow the student to `target_n` decoder layers by appending new ones at the end.

    New layers are constructed via the existing decoder layer class with the model's
    own _init_weights, then their output projections are zeroed so each new layer
    starts as the identity but is still trainable.

    Returns:
        (num_layers, new_layer_zeroed) where new_layer_zeroed is a list of
        (layer_idx, [zeroed module names]) pairs, empty when no layers were added.

    Raises:
        ValueError: if target_n is smaller than the current layer count.
        RuntimeError: if the text config exposes no layer_types pattern to extend.
    """
    inner = get_inner_with_layers(model)
    cur_n = len(inner.layers)
    if target_n == cur_n:
        # Fixed: this path used to return a bare int while the growth path
        # returns a 2-tuple, so any caller unpacking two values would crash
        # on the no-op case. Keep the return shape uniform.
        return cur_n, []
    if target_n < cur_n:
        raise ValueError(f"target_num_layers={target_n} < current {cur_n}; cannot shrink")

    # Locate the (text) config that the layers are built from. For multimodal
    # wrappers this lives at .text_config; for the dense student it's the same
    # object as model.config.
    cfg = model.config
    text_cfg = getattr(cfg, "text_config", cfg)

    # Extend layer_types by repeating the existing periodic pattern.
    if not getattr(text_cfg, "layer_types", None):
        raise RuntimeError("text config has no layer_types; cannot extend pattern")
    period = getattr(text_cfg, "full_attention_interval", 4)
    new_types = list(text_cfg.layer_types)
    while len(new_types) < target_n:
        # NOTE(review): assumes the attention-type pattern is aligned to
        # layer 0 with period `full_attention_interval` — confirm for Qwen3.5.
        new_types.append(new_types[len(new_types) % period])
    text_cfg.layer_types = new_types
    text_cfg.num_hidden_layers = target_n
    if hasattr(cfg, "num_hidden_layers") and cfg is not text_cfg:
        cfg.num_hidden_layers = target_n

    # Construct new layers using the same class as the existing ones,
    # placing them on the same device/dtype as the current parameters.
    layer_cls = type(inner.layers[0])
    device = next(inner.parameters()).device
    dtype = next(inner.parameters()).dtype

    new_layer_zeroed = []
    for i in range(cur_n, target_n):
        new_layer = layer_cls(text_cfg, layer_idx=i)
        # Apply the parent model's init scheme (std=initializer_range etc.)
        new_layer.apply(model._init_weights)
        new_layer.to(device=device, dtype=dtype)
        # Zero output projections -> identity at init, gradients still flow.
        zeroed = _zero_output_projections(new_layer)
        new_layer_zeroed.append((i, zeroed))
        inner.layers.append(new_layer)

    return target_n, new_layer_zeroed
205
+
206
+
207
  def load_student(model_id, dtype, grad_ckpt, attn_impl):
208
  from transformers import AutoModelForCausalLM
209
+ log.info(f"Loading student: {model_id} (dtype={dtype})")
210
  model = AutoModelForCausalLM.from_pretrained(
211
  model_id,
212
  dtype=dtype,
 
229
  archs = list(getattr(cfg, "architectures", []) or [])
230
  arch = archs[0] if archs else ""
231
  is_multimodal = "ConditionalGeneration" in arch or "ImageText" in arch
232
+ log.info(f"Loading teacher: {model_id} (arch={arch}, multimodal={is_multimodal}, dtype={dtype})")
233
 
234
  if is_multimodal:
235
  from transformers import AutoModelForImageTextToText
 
441
 
442
  cfg = load_config(args.config)
443
 
444
+ accelerator = Accelerator(mixed_precision=cfg["train"]["mixed_precision"])
445
  set_seed(cfg["train"]["seed"])
446
 
447
  if accelerator.is_main_process:
448
  log.info(f"Loaded config from {args.config}")
449
  log.info(f"World size: {accelerator.num_processes}")
450
+ log.info(f"Mixed precision: {cfg['train']['mixed_precision']}")
451
 
452
  # ---- Tokenizer
453
  from transformers import AutoTokenizer
 
456
  tokenizer.pad_token = tokenizer.eos_token
457
  pad_id = tokenizer.pad_token_id
458
 
459
+ # ---- Models (separate dtypes per config)
460
+ student_dtype = parse_dtype(cfg["train"]["student_dtype"])
461
+ teacher_dtype = parse_dtype(cfg["train"]["teacher_dtype"])
462
  student = load_student(
463
  cfg["model"]["student"],
464
+ student_dtype,
465
  grad_ckpt=cfg["train"]["grad_checkpointing"],
466
  attn_impl=cfg["train"]["attn_implementation"],
467
  )
468
  teacher = load_teacher(
469
  cfg["model"]["teacher"],
470
+ teacher_dtype,
471
  attn_impl=cfg["train"]["attn_implementation"],
472
  )
473
 
474
+ # ---- Layer modifications: grow first, then zero (composable)
475
+ target_n = cfg["init"]["target_num_layers"]
476
+ cur_n = len(get_inner_with_layers(student).layers)
477
+ if target_n != cur_n:
478
+ new_n, new_zeroed = grow_layers(student, target_n)
479
+ if accelerator.is_main_process:
480
+ log.info(f"Grew student from {cur_n} -> {new_n} layers")
481
+ for idx, names in new_zeroed:
482
+ log.info(f" layer {idx}: zeroed {names}")
483
+
484
  zero_idx = cfg["init"]["zero_layers"]
485
  if zero_idx:
486
  n = zero_layers(student, zero_idx)
scripts/backup_to_hf.py CHANGED
@@ -18,8 +18,12 @@ INCLUDE = [
18
  "distill.py",
19
  "configs/base.toml",
20
  "configs/zero_14_17.toml",
 
 
 
21
  "configs/accelerate.yaml",
22
  "scripts/backup_to_hf.py",
 
23
  "pyproject.toml",
24
  "requirements.lock.txt",
25
  ]
 
18
  "distill.py",
19
  "configs/base.toml",
20
  "configs/zero_14_17.toml",
21
+ "configs/replicate_zero4.toml",
22
+ "configs/grow40_winning.toml",
23
+ "configs/grow40_simple.toml",
24
  "configs/accelerate.yaml",
25
  "scripts/backup_to_hf.py",
26
+ "scripts/run_sweep.sh",
27
  "pyproject.toml",
28
  "requirements.lock.txt",
29
  ]
scripts/run_sweep.sh ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Sequential sweep runner.
#
# Each config grabs all 8 GPUs via accelerate, so they run back-to-back, not in
# parallel. Output goes to logs/<run>.log; the master log goes to logs/sweep_master.log.
# Reads HF_TOKEN, HUGGING_FACE_HUB_TOKEN, WANDB_API_KEY from the calling env.
#
# Launch in the background with (mkdir first: the redirect below needs logs/
# to exist before this script runs):
#   mkdir -p logs && nohup ./scripts/run_sweep.sh > logs/sweep_master.log 2>&1 &

# Deliberately no -e: a failed run must not abort the rest of the sweep.
set -uo pipefail

# Without set -e, a failed cd would silently keep running from whatever
# directory we were called in — guard it explicitly.
cd "$(dirname "$0")/.." || exit 1

readonly LAUNCHER=".venv/bin/accelerate"
if [[ ! -x "$LAUNCHER" ]]; then
  echo "error: $LAUNCHER not found or not executable" >&2
  exit 1
fi

CONFIGS=(
  "configs/replicate_zero4.toml"
  "configs/grow40_winning.toml"
  "configs/grow40_simple.toml"
)

readonly LOG_DIR="logs"
mkdir -p "$LOG_DIR" || exit 1

for cfg in "${CONFIGS[@]}"; do
  name="$(basename "$cfg" .toml)"
  log="$LOG_DIR/$name.log"
  echo ">>> [$(date '+%F %T')] starting $name -> $log"
  rc=0
  "$LAUNCHER" launch \
    --config_file configs/accelerate.yaml \
    distill.py \
    --config "$cfg" \
    > "$log" 2>&1 || rc=$?
  echo "<<< [$(date '+%F %T')] finished $name (exit=$rc)"
  if [[ $rc -ne 0 ]]; then
    echo "    last 20 lines of $log:"
    # tail -n is the portable spelling; bare 'tail -20' is obsolescent.
    tail -n 20 "$log" | sed 's/^/    /'
  fi
done

echo ">>> [$(date '+%F %T')] sweep complete"