Delta-Vector committed on
Commit
eb5278f
·
verified ·
1 Parent(s): 3f04365

fix OOM: chunked KL with checkpointing + PYTORCH_CUDA_ALLOC_CONF expandable_segments; add kl_chunk_size config key

Browse files
configs/base.toml CHANGED
@@ -31,6 +31,7 @@ attn_implementation = "flash_attention_2"
31
  student_dtype = "bfloat16"
32
  teacher_dtype = "bfloat16"
33
  mixed_precision = "bf16"
 
34
 
35
  [eval]
36
  every_steps = 5
 
31
  student_dtype = "bfloat16"
32
  teacher_dtype = "bfloat16"
33
  mixed_precision = "bf16"
34
+ kl_chunk_size = 0
35
 
36
  [eval]
37
  every_steps = 5
configs/grow40_simple.toml CHANGED
@@ -32,6 +32,7 @@ attn_implementation = "flash_attention_2"
32
  student_dtype = "bfloat16"
33
  teacher_dtype = "bfloat16"
34
  mixed_precision = "bf16"
 
35
 
36
  [eval]
37
  every_steps = 50
 
32
  student_dtype = "bfloat16"
33
  teacher_dtype = "bfloat16"
34
  mixed_precision = "bf16"
35
+ kl_chunk_size = 0
36
 
37
  [eval]
38
  every_steps = 50
configs/grow40_winning.toml CHANGED
@@ -32,6 +32,7 @@ attn_implementation = "flash_attention_2"
32
  student_dtype = "float32"
33
  teacher_dtype = "bfloat16"
34
  mixed_precision = "bf16"
 
35
 
36
  [eval]
37
  every_steps = 50
 
32
  student_dtype = "float32"
33
  teacher_dtype = "bfloat16"
34
  mixed_precision = "bf16"
35
+ kl_chunk_size = 256
36
 
37
  [eval]
38
  every_steps = 50
configs/replicate_zero4.toml CHANGED
@@ -31,6 +31,7 @@ attn_implementation = "flash_attention_2"
31
  student_dtype = "float32"
32
  teacher_dtype = "bfloat16"
33
  mixed_precision = "bf16"
 
34
 
35
  [eval]
36
  every_steps = 50
 
31
  student_dtype = "float32"
32
  teacher_dtype = "bfloat16"
33
  mixed_precision = "bf16"
34
+ kl_chunk_size = 256
35
 
36
  [eval]
37
  every_steps = 50
configs/zero_14_17.toml CHANGED
@@ -32,6 +32,7 @@ attn_implementation = "flash_attention_2"
32
  student_dtype = "bfloat16"
33
  teacher_dtype = "bfloat16"
34
  mixed_precision = "bf16"
 
35
 
36
  [eval]
37
  every_steps = 50
 
32
  student_dtype = "bfloat16"
33
  teacher_dtype = "bfloat16"
34
  mixed_precision = "bf16"
35
+ kl_chunk_size = 0
36
 
37
  [eval]
38
  every_steps = 50
distill.py CHANGED
@@ -9,6 +9,10 @@ The TOML config is the single source of truth - no hardcoded defaults in this fi
9
  The only command line argument is --config <path-to-toml>.
10
  """
11
 
 
 
 
 
12
  import argparse
13
  import gc
14
  import json
@@ -20,6 +24,7 @@ from pathlib import Path
20
 
21
  import torch
22
  import torch.nn.functional as F
 
23
  from torch.optim import AdamW
24
 
25
  from accelerate import Accelerator
@@ -65,6 +70,7 @@ REQUIRED_KEYS = {
65
  "student_dtype",
66
  "teacher_dtype",
67
  "mixed_precision",
 
68
  ),
69
  "eval": ("every_steps", "samples", "seed"),
70
  "log": ("wandb", "wandb_project", "wandb_run", "log_every", "output_dir"),
@@ -337,21 +343,44 @@ def collate_pad(token_lists, pad_id):
337
  # Loss
338
  # ----------------------------------------------------------------------------
339
 
340
- def kl_loss_masked(student_logits, teacher_logits, attention_mask, start_pos):
341
- """Forward KL(teacher || student), masked for padding & start_pos.
342
-
343
- Computed in fp32 for numerical stability.
344
- """
345
- s = student_logits[:, start_pos:, :].float()
346
- t = teacher_logits[:, start_pos:, :].detach().float()
347
- mask = attention_mask[:, start_pos:].float()
348
-
349
  t_log_p = F.log_softmax(t, dim=-1)
350
  s_log_p = F.log_softmax(s, dim=-1)
351
  t_p = t_log_p.exp()
 
 
 
352
 
353
- per_token = (t_p * (t_log_p - s_log_p)).sum(-1) # [B, T-start]
354
- return (per_token * mask).sum() / mask.sum().clamp_min(1.0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
355
 
356
 
357
  # ----------------------------------------------------------------------------
@@ -390,7 +419,7 @@ def make_scheduler(optimizer, train_cfg):
390
  # ----------------------------------------------------------------------------
391
 
392
  @torch.no_grad()
393
- def evaluate(accelerator, student, teacher, eval_batches, pad_id, kl_start_pos):
394
  student.eval()
395
  sdev = accelerator.device
396
  total = 0.0
@@ -401,7 +430,10 @@ def evaluate(accelerator, student, teacher, eval_batches, pad_id, kl_start_pos):
401
  mask = mask.to(sdev)
402
  t_logits = teacher_forward(teacher, ids, mask)
403
  s_logits = student(input_ids=ids, attention_mask=mask).logits
404
- loss = kl_loss_masked(s_logits, t_logits, mask, start_pos=kl_start_pos)
 
 
 
405
  total += loss.item()
406
  n += 1
407
  del t_logits, s_logits, loss
@@ -550,6 +582,7 @@ def main():
550
  samples_per_step = cfg["train"]["samples_per_step"]
551
  grad_clip = cfg["train"]["grad_clip"]
552
  kl_start_pos = cfg["data"]["kl_start_pos"]
 
553
  max_steps = cfg["train"]["max_steps"]
554
  eval_every = cfg["eval"]["every_steps"]
555
  log_every = cfg["log"]["log_every"]
@@ -578,7 +611,10 @@ def main():
578
  with torch.no_grad():
579
  t_logits = teacher_forward(teacher, ids, mask)
580
  s_logits = student(input_ids=ids, attention_mask=mask).logits
581
- loss = kl_loss_masked(s_logits, t_logits, mask, start_pos=kl_start_pos)
 
 
 
582
 
583
  optimizer.zero_grad()
584
  accelerator.backward(loss)
@@ -612,7 +648,8 @@ def main():
612
 
613
  if global_step % eval_every == 0:
614
  eval_kl = evaluate(
615
- accelerator, student, teacher, eval_batches, pad_id, kl_start_pos
 
616
  )
617
  if accelerator.is_main_process:
618
  log.info(
@@ -635,7 +672,8 @@ def main():
635
 
636
  # Final eval
637
  eval_kl = evaluate(
638
- accelerator, student, teacher, eval_batches, pad_id, kl_start_pos
 
639
  )
640
  if accelerator.is_main_process:
641
  log.info(f" final eval: kl={eval_kl:.6f} (best={best_kl:.6f})")
 
9
  The only command line argument is --config <path-to-toml>.
10
  """
11
 
12
+ import os
13
+ # Reduce fragmentation; large vocab + long seq creates many short-lived big tensors.
14
+ os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
15
+
16
  import argparse
17
  import gc
18
  import json
 
24
 
25
  import torch
26
  import torch.nn.functional as F
27
+ import torch.utils.checkpoint as checkpoint_utils
28
  from torch.optim import AdamW
29
 
30
  from accelerate import Accelerator
 
70
  "student_dtype",
71
  "teacher_dtype",
72
  "mixed_precision",
73
+ "kl_chunk_size",
74
  ),
75
  "eval": ("every_steps", "samples", "seed"),
76
  "log": ("wandb", "wandb_project", "wandb_run", "log_every", "output_dir"),
 
343
  # Loss
344
  # ----------------------------------------------------------------------------
345
 
346
+ def _kl_chunk_sum(s_chunk, t_chunk, m_chunk):
347
+ """Compute (sum of masked KL) over a slice. Used as a checkpointed unit so the
348
+ fp32 softmax intermediates only live for one chunk's worth of memory at a time."""
349
+ s = s_chunk.float()
350
+ t = t_chunk.float()
 
 
 
 
351
  t_log_p = F.log_softmax(t, dim=-1)
352
  s_log_p = F.log_softmax(s, dim=-1)
353
  t_p = t_log_p.exp()
354
+ per_token = (t_p * (t_log_p - s_log_p)).sum(-1)
355
+ return (per_token * m_chunk).sum()
356
+
357
 
358
def kl_loss_masked(student_logits, teacher_logits, attention_mask, start_pos, chunk_size):
    """Mean forward KL(teacher || student) over unmasked positions from start_pos on.

    Computed in fp32 for numerical stability. When 0 < chunk_size < scored
    length, the positions are processed in chunk_size-sized slices under
    gradient checkpointing, so peak activation memory is bounded by one
    slice's intermediates rather than the whole sequence's.
    """
    student_part = student_logits[:, start_pos:, :]
    teacher_part = teacher_logits[:, start_pos:, :].detach()
    mask_part = attention_mask[:, start_pos:].float()
    # Guard against an all-masked batch producing a 0/0.
    denom = mask_part.sum().clamp_min(1.0)

    seq_len = student_part.shape[1]
    if not (0 < chunk_size < seq_len):
        # Single pass: chunking is disabled or would cover everything anyway.
        return _kl_chunk_sum(student_part, teacher_part, mask_part) / denom

    pieces = []
    for lo in range(0, seq_len, chunk_size):
        hi = min(lo + chunk_size, seq_len)
        # Checkpoint each slice so its fp32 softmax buffers are freed after
        # the forward pass and recomputed during backward.
        pieces.append(
            checkpoint_utils.checkpoint(
                _kl_chunk_sum,
                student_part[:, lo:hi, :],
                teacher_part[:, lo:hi, :],
                mask_part[:, lo:hi],
                use_reentrant=False,
            )
        )
    return torch.stack(pieces).sum() / denom
384
 
385
 
386
  # ----------------------------------------------------------------------------
 
419
  # ----------------------------------------------------------------------------
420
 
421
  @torch.no_grad()
422
+ def evaluate(accelerator, student, teacher, eval_batches, pad_id, kl_start_pos, kl_chunk_size):
423
  student.eval()
424
  sdev = accelerator.device
425
  total = 0.0
 
430
  mask = mask.to(sdev)
431
  t_logits = teacher_forward(teacher, ids, mask)
432
  s_logits = student(input_ids=ids, attention_mask=mask).logits
433
+ loss = kl_loss_masked(
434
+ s_logits, t_logits, mask,
435
+ start_pos=kl_start_pos, chunk_size=kl_chunk_size,
436
+ )
437
  total += loss.item()
438
  n += 1
439
  del t_logits, s_logits, loss
 
582
  samples_per_step = cfg["train"]["samples_per_step"]
583
  grad_clip = cfg["train"]["grad_clip"]
584
  kl_start_pos = cfg["data"]["kl_start_pos"]
585
+ kl_chunk_size = cfg["train"]["kl_chunk_size"]
586
  max_steps = cfg["train"]["max_steps"]
587
  eval_every = cfg["eval"]["every_steps"]
588
  log_every = cfg["log"]["log_every"]
 
611
  with torch.no_grad():
612
  t_logits = teacher_forward(teacher, ids, mask)
613
  s_logits = student(input_ids=ids, attention_mask=mask).logits
614
+ loss = kl_loss_masked(
615
+ s_logits, t_logits, mask,
616
+ start_pos=kl_start_pos, chunk_size=kl_chunk_size,
617
+ )
618
 
619
  optimizer.zero_grad()
620
  accelerator.backward(loss)
 
648
 
649
  if global_step % eval_every == 0:
650
  eval_kl = evaluate(
651
+ accelerator, student, teacher, eval_batches,
652
+ pad_id, kl_start_pos, kl_chunk_size,
653
  )
654
  if accelerator.is_main_process:
655
  log.info(
 
672
 
673
  # Final eval
674
  eval_kl = evaluate(
675
+ accelerator, student, teacher, eval_batches,
676
+ pad_id, kl_start_pos, kl_chunk_size,
677
  )
678
  if accelerator.is_main_process:
679
  log.info(f" final eval: kl={eval_kl:.6f} (best={best_kl:.6f})")
scripts/backup_to_hf.py CHANGED
@@ -24,6 +24,7 @@ INCLUDE = [
24
  "configs/accelerate.yaml",
25
  "scripts/backup_to_hf.py",
26
  "scripts/run_sweep.sh",
 
27
  "pyproject.toml",
28
  "requirements.lock.txt",
29
  ]
 
24
  "configs/accelerate.yaml",
25
  "scripts/backup_to_hf.py",
26
  "scripts/run_sweep.sh",
27
+ "scripts/run_sweep_rerun.sh",
28
  "pyproject.toml",
29
  "requirements.lock.txt",
30
  ]
scripts/run_sweep_rerun.sh ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Re-runs the two configs that OOM'd in the original sweep, now with the
# chunked-KL fix and PYTORCH_CUDA_ALLOC_CONF=expandable_segments in distill.py.
# Reads HF_TOKEN, HUGGING_FACE_HUB_TOKEN, WANDB_API_KEY from the calling env.
#
# Launch with:
#   nohup ./scripts/run_sweep_rerun.sh > logs/sweep_rerun_master.log 2>&1 &

# No -e: a failing run must not abort the remaining configs; each run's exit
# code is captured and reported below instead.
set -uo pipefail
# Run from the repo root regardless of where the script was invoked from.
cd "$(dirname "$0")/.."

# The configs that previously OOM'd (see header comment).
CONFIGS=(
  "configs/replicate_zero4.toml"
  "configs/grow40_winning.toml"
)

LOG_DIR="logs"
mkdir -p "$LOG_DIR"

for cfg in "${CONFIGS[@]}"; do
  # One log file per config, named after the config (e.g. replicate_zero4.log).
  name="$(basename "$cfg" .toml)"
  log="$LOG_DIR/$name.log"
  echo ">>> [$(date '+%F %T')] starting $name -> $log"
  # All run output (stdout+stderr) goes to the per-config log, not the master log.
  .venv/bin/accelerate launch \
    --config_file configs/accelerate.yaml \
    distill.py \
    --config "$cfg" \
    > "$log" 2>&1
  rc=$?
  echo "<<< [$(date '+%F %T')] finished $name (exit=$rc)"
  if [[ $rc -ne 0 ]]; then
    # Surface the tail of the failing run's log in the master log for quick triage.
    echo "    last 20 lines of $log:"
    tail -20 "$log" | sed 's/^/    /'
  fi
done

echo ">>> [$(date '+%F %T')] rerun complete"