Delta-Vector committed on
Commit
f6e42f8
·
verified ·
1 Parent(s): 46f472b

initial scaffold: distill.py + base/zero_14_17 configs + accelerate yaml

Browse files
configs/accelerate.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ distributed_type: MULTI_GPU
3
+ mixed_precision: bf16
4
+ num_processes: 8
5
+ num_machines: 1
6
+ machine_rank: 0
7
+ gpu_ids: all
8
+ rdzv_backend: static
9
+ same_network: true
10
+ tpu_use_cluster: false
11
+ tpu_use_sudo: false
12
+ use_cpu: false
13
+ debug: false
14
+ enable_cpu_affinity: false
15
+ main_training_function: main
16
+ downcast_bf16: 'no'
configs/base.toml ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Base distillation config (smoketest variant).
2
+ # Every value the script reads must live in this file - no defaults in code.
3
+
4
+ [model]
5
+ teacher = "Qwen/Qwen3.5-35B-A3B"
6
+ student = "Troiaaa/m-6a3lnzvb"
7
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
8
+
9
+ [data]
10
+ dataset = "karpathy/climbmix-400b-shuffle"
11
+ text_field = "text"
12
+ min_chars = 2560
13
+ max_seq_len = 640
14
+ kl_start_pos = 128
15
+ seed = 42
16
+ shuffle_buffer = 10000
17
+
18
+ [train]
19
+ seed = 42
20
+ lr = 5.0e-7
21
+ schedule = "constant"
22
+ warmup_steps = 0
23
+ weight_decay = 0.0
24
+ grad_clip = 1.0
25
+ betas = [0.9, 0.95]
26
+ eps = 1.0e-8
27
+ samples_per_step = 4
28
+ max_steps = 5
29
+ grad_checkpointing = true
30
+ attn_implementation = "flash_attention_2"
31
+
32
+ [eval]
33
+ every_steps = 5
34
+ samples = 16
35
+ seed = 1234
36
+
37
+ [log]
38
+ wandb = true
39
+ wandb_project = "distil-subnet97"
40
+ wandb_run = "smoketest"
41
+ log_every = 1
42
+ output_dir = "./out/smoketest"
43
+
44
+ [init]
45
+ zero_layers = []
configs/zero_14_17.toml ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Layer-zero distillation: zero student layers 14-17 at init,
2
+ # constant LR 5e-7, 2000 steps. Aim: lower KL than the prior checkpoint
3
+ # despite the surgery.
4
+
5
+ [model]
6
+ teacher = "Qwen/Qwen3.5-35B-A3B"
7
+ student = "Troiaaa/m-6a3lnzvb"
8
+ tokenizer = "Qwen/Qwen3.5-35B-A3B"
9
+
10
+ [data]
11
+ dataset = "karpathy/climbmix-400b-shuffle"
12
+ text_field = "text"
13
+ min_chars = 2560
14
+ max_seq_len = 640
15
+ kl_start_pos = 128
16
+ seed = 42
17
+ shuffle_buffer = 10000
18
+
19
+ [train]
20
+ seed = 42
21
+ lr = 5.0e-7
22
+ schedule = "constant"
23
+ warmup_steps = 0
24
+ weight_decay = 0.0
25
+ grad_clip = 1.0
26
+ betas = [0.9, 0.95]
27
+ eps = 1.0e-8
28
+ samples_per_step = 8
29
+ max_steps = 2000
30
+ grad_checkpointing = true
31
+ attn_implementation = "flash_attention_2"
32
+
33
+ [eval]
34
+ every_steps = 50
35
+ samples = 64
36
+ seed = 1234
37
+
38
+ [log]
39
+ wandb = true
40
+ wandb_project = "distil-subnet97"
41
+ wandb_run = "m-6a3lnzvb-zero14_17"
42
+ log_every = 1
43
+ output_dir = "./out/zero_14_17"
44
+
45
+ [init]
46
+ zero_layers = [14, 15, 16, 17]
distill.py ADDED
@@ -0,0 +1,559 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ KL Distillation Training - TOML-driven, accelerate multi-GPU.
4
+
5
+ Run with:
6
+ accelerate launch --config_file configs/accelerate.yaml distill.py --config configs/base.toml
7
+
8
+ The TOML config is the single source of truth - no hardcoded defaults in this file.
9
+ The only command line argument is --config <path-to-toml>.
10
+ """
11
+
12
+ import argparse
13
+ import gc
14
+ import json
15
+ import logging
16
+ import shutil
17
+ import time
18
+ import tomllib
19
+ from pathlib import Path
20
+
21
+ import torch
22
+ import torch.nn.functional as F
23
+ from torch.optim import AdamW
24
+
25
+ from accelerate import Accelerator
26
+ from accelerate.utils import set_seed
27
+
28
+ logging.basicConfig(
29
+ level=logging.INFO,
30
+ format="%(asctime)s [%(levelname)s] %(message)s",
31
+ datefmt="%H:%M:%S",
32
+ )
33
+ log = logging.getLogger("distill")
34
+
35
+
36
+ # ----------------------------------------------------------------------------
37
+ # Config
38
+ # ----------------------------------------------------------------------------
39
+
40
+ REQUIRED_SECTIONS = ("model", "data", "train", "eval", "log", "init")
41
+ REQUIRED_KEYS = {
42
+ "model": ("teacher", "student", "tokenizer"),
43
+ "data": (
44
+ "dataset",
45
+ "text_field",
46
+ "min_chars",
47
+ "max_seq_len",
48
+ "kl_start_pos",
49
+ "seed",
50
+ "shuffle_buffer",
51
+ ),
52
+ "train": (
53
+ "seed",
54
+ "lr",
55
+ "schedule",
56
+ "warmup_steps",
57
+ "weight_decay",
58
+ "grad_clip",
59
+ "betas",
60
+ "eps",
61
+ "samples_per_step",
62
+ "max_steps",
63
+ "grad_checkpointing",
64
+ "attn_implementation",
65
+ ),
66
+ "eval": ("every_steps", "samples", "seed"),
67
+ "log": ("wandb", "wandb_project", "wandb_run", "log_every", "output_dir"),
68
+ "init": ("zero_layers",),
69
+ }
70
+
71
+
72
+ def load_config(path):
73
+ with open(path, "rb") as f:
74
+ cfg = tomllib.load(f)
75
+ for sec in REQUIRED_SECTIONS:
76
+ if sec not in cfg:
77
+ raise KeyError(f"config missing required section [{sec}]")
78
+ for key in REQUIRED_KEYS[sec]:
79
+ if key not in cfg[sec]:
80
+ raise KeyError(f"config missing required key [{sec}].{key}")
81
+ return cfg
82
+
83
+
84
+ # ----------------------------------------------------------------------------
85
+ # Model loading
86
+ # ----------------------------------------------------------------------------
87
+
88
def get_inner_with_layers(model):
    """Locate the first nested submodule that exposes a `.layers` attribute.

    HF checkpoints wrap the decoder stack under attributes such as `.model`,
    `.language_model`, `.transformer`, or `.base_model`; this walks those
    wrappers (cycle-safe via id tracking) until something with `.layers`
    turns up. Raises RuntimeError if nothing qualifies.
    """
    visited = set()
    pending = [model]
    while pending:
        node = pending.pop()
        if id(node) in visited:
            continue  # guard against wrapper cycles (e.g. base_model == self)
        visited.add(id(node))
        if hasattr(node, "layers"):
            return node
        pending.extend(
            child
            for attr in ("model", "language_model", "transformer", "base_model")
            if (child := getattr(node, attr, None)) is not None
        )
    raise RuntimeError(f"Could not locate `.layers` inside {type(model).__name__}")
105
+
106
+
107
def zero_layers(model, layer_indices):
    """Zero out every parameter of the selected decoder layers, in place.

    Validation happens per-index as the loop runs, so an out-of-range index
    raises IndexError at the point it is reached. Returns the total layer
    count so the caller can log it.
    """
    layers = get_inner_with_layers(model).layers
    n_layers = len(layers)
    for i in layer_indices:
        if not 0 <= i < n_layers:
            raise IndexError(f"layer {i} out of range (0..{n_layers - 1})")
        with torch.no_grad():  # surgery, not training: no grad tracking
            for param in layers[i].parameters():
                param.zero_()
    return n_layers
118
+
119
+
120
def load_student(model_id, dtype, grad_ckpt, attn_impl):
    """Load the trainable student causal LM.

    KV-caching is disabled (no generation during training) and gradient
    checkpointing, when requested, uses the non-reentrant implementation.
    """
    from transformers import AutoModelForCausalLM

    log.info(f"Loading student: {model_id}")
    student = AutoModelForCausalLM.from_pretrained(
        model_id,
        dtype=dtype,
        low_cpu_mem_usage=True,
        attn_implementation=attn_impl,
    )
    student.config.use_cache = False
    if grad_ckpt:
        ckpt_kwargs = {"use_reentrant": False}
        student.gradient_checkpointing_enable(
            gradient_checkpointing_kwargs=ckpt_kwargs
        )
    return student
135
+
136
+
137
def load_teacher(model_id, dtype, attn_impl):
    """Load the frozen teacher in eval mode.

    The architecture name from the checkpoint config decides the loader:
    names containing "ConditionalGeneration" or "ImageText" go through
    AutoModelForImageTextToText (multimodal wrappers); everything else
    through AutoModelForCausalLM. All parameters are frozen.
    """
    from transformers import AutoConfig

    config = AutoConfig.from_pretrained(model_id)
    arch_names = list(getattr(config, "architectures", []) or [])
    arch = arch_names[0] if arch_names else ""
    is_multimodal = "ConditionalGeneration" in arch or "ImageText" in arch
    log.info(f"Loading teacher: {model_id} (arch={arch}, multimodal={is_multimodal})")

    if is_multimodal:
        from transformers import AutoModelForImageTextToText as _AutoCls
    else:
        from transformers import AutoModelForCausalLM as _AutoCls
    teacher = _AutoCls.from_pretrained(
        model_id,
        dtype=dtype,
        low_cpu_mem_usage=True,
        attn_implementation=attn_impl,
    )

    teacher.config.use_cache = False
    teacher.eval()
    for param in teacher.parameters():
        param.requires_grad_(False)  # teacher is a fixed reference distribution
    return teacher
168
+
169
+
170
def teacher_forward(teacher, input_ids, attention_mask):
    """Run the teacher and return its logits.

    Works for both unimodal and multimodal wrappers, as long as the forward
    output exposes `.logits`; raises RuntimeError otherwise.
    """
    output = teacher(input_ids=input_ids, attention_mask=attention_mask)
    logits = getattr(output, "logits", None)
    if logits is None:
        raise RuntimeError("teacher forward did not return .logits")
    return logits
177
+
178
+
179
+ # ----------------------------------------------------------------------------
180
+ # Data
181
+ # ----------------------------------------------------------------------------
182
+
183
class StreamingTextLoader:
    """Rank-local view of a HF streaming dataset, yielding tokenized samples.

    All ranks shuffle with the same seed, then each takes its disjoint shard
    via `split_dataset_by_node`. Rows are filtered by character count before
    tokenization (cheap pre-filter) and by token count after.
    """

    def __init__(
        self,
        name,
        text_field,
        min_chars,
        max_seq_len,
        kl_start_pos,
        tokenizer,
        rank,
        world_size,
        seed,
        shuffle_buffer,
    ):
        from datasets import load_dataset
        from datasets.distributed import split_dataset_by_node

        stream = load_dataset(name, split="train", streaming=True)
        stream = stream.shuffle(seed=seed, buffer_size=shuffle_buffer)
        stream = split_dataset_by_node(stream, rank=rank, world_size=world_size)
        self._ds = iter(stream)
        self._text_field = text_field
        self._min_chars = min_chars
        self._max_seq_len = max_seq_len
        # Require headroom past kl_start_pos so the KL mask is never empty.
        self._min_tokens = kl_start_pos + 16
        self._tokenizer = tokenizer

    def next_batch(self, n):
        """Return up to `n` 1-D token tensors.

        Scans at most 50*n raw rows so a pathological stretch of short
        documents cannot stall a training step forever; may return fewer
        than `n` (or an empty list once the stream is exhausted).
        """
        batch = []
        seen = 0
        scan_limit = n * 50
        while len(batch) < n and seen < scan_limit:
            try:
                row = next(self._ds)
            except StopIteration:
                break
            seen += 1
            text = row.get(self._text_field, "") or ""
            if len(text) < self._min_chars:
                continue
            encoded = self._tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                max_length=self._max_seq_len,
            )
            token_ids = encoded.input_ids.squeeze(0)
            if token_ids.shape[0] >= self._min_tokens:
                batch.append(token_ids)
        return batch
234
+
235
+
236
def collate_pad(token_lists, pad_id):
    """Right-pad variable-length 1-D token tensors into a [B, max_L] batch.

    Returns (input_ids, attention_mask), both torch.long; the mask is 1 on
    real tokens and 0 on padding.
    """
    lengths = [t.shape[0] for t in token_lists]
    width = max(lengths)
    batch = len(token_lists)
    input_ids = torch.full((batch, width), pad_id, dtype=torch.long)
    attention_mask = torch.zeros((batch, width), dtype=torch.long)
    for row, (tokens, length) in enumerate(zip(token_lists, lengths)):
        input_ids[row, :length] = tokens
        attention_mask[row, :length] = 1
    return input_ids, attention_mask
247
+
248
+
249
+ # ----------------------------------------------------------------------------
250
+ # Loss
251
+ # ----------------------------------------------------------------------------
252
+
253
def kl_loss_masked(student_logits, teacher_logits, attention_mask, start_pos):
    """Token-averaged forward KL(teacher || student).

    Only positions >= start_pos contribute, and padded positions are masked
    out. Math is done in fp32 for numerical stability; the teacher side is
    detached so no gradient flows into it. The clamp on the denominator
    makes an all-masked batch return 0 instead of NaN.
    """
    mask = attention_mask[:, start_pos:].float()
    s_log_probs = F.log_softmax(student_logits[:, start_pos:, :].float(), dim=-1)
    t_log_probs = F.log_softmax(
        teacher_logits[:, start_pos:, :].detach().float(), dim=-1
    )
    t_probs = t_log_probs.exp()

    token_kl = (t_probs * (t_log_probs - s_log_probs)).sum(dim=-1)  # [B, T-start]
    denom = mask.sum().clamp_min(1.0)
    return (token_kl * mask).sum() / denom
268
+
269
+
270
+ # ----------------------------------------------------------------------------
271
+ # Optimizer / scheduler
272
+ # ----------------------------------------------------------------------------
273
+
274
def make_optimizer(model, train_cfg):
    """Build AdamW over the trainable parameters only.

    Every hyperparameter comes from the [train] config section - nothing is
    defaulted in code.
    """
    trainable = (p for p in model.parameters() if p.requires_grad)
    return AdamW(
        trainable,
        lr=train_cfg["lr"],
        betas=tuple(train_cfg["betas"]),
        eps=train_cfg["eps"],
        weight_decay=train_cfg["weight_decay"],
    )
282
+
283
+
284
def make_scheduler(optimizer, train_cfg):
    """Build the LR scheduler named by train_cfg["schedule"].

    Supported names: "constant", "cosine", "linear" - all with warmup.
    Raises ValueError for anything else. Imports are local so transformers
    is only touched for the branch actually taken.
    """
    name = train_cfg["schedule"]
    warmup = train_cfg["warmup_steps"]
    total = train_cfg["max_steps"]

    if name == "constant":
        from transformers import get_constant_schedule_with_warmup
        return get_constant_schedule_with_warmup(optimizer, warmup)
    elif name == "cosine":
        from transformers import get_cosine_schedule_with_warmup
        return get_cosine_schedule_with_warmup(optimizer, warmup, total)
    elif name == "linear":
        from transformers import get_linear_schedule_with_warmup
        return get_linear_schedule_with_warmup(optimizer, warmup, total)
    raise ValueError(f"unknown schedule: {name!r}")
299
+
300
+
301
+ # ----------------------------------------------------------------------------
302
+ # Eval
303
+ # ----------------------------------------------------------------------------
304
+
305
@torch.no_grad()
def evaluate(accelerator, student, teacher, eval_batches, pad_id, kl_start_pos):
    """Compute the mean eval KL across all ranks.

    Each rank runs its own eval samples one at a time (batch size 1), then
    the per-rank (sum, count) pairs are all-gathered and reduced to a
    sample-weighted global mean.

    Fix vs the original: the previous version gathered per-rank *means* and
    used inf for an empty rank, so one rank with no eval data poisoned the
    global mean with inf forever - best-checkpoint saving could then never
    trigger. Gathering sums and counts lets empty ranks contribute zero
    weight; inf is returned only when no rank has any eval data at all.
    """
    student.eval()
    device = accelerator.device
    total = 0.0
    count = 0
    for sample in eval_batches:
        ids, mask = collate_pad([sample], pad_id)
        ids = ids.to(device)
        mask = mask.to(device)
        t_logits = teacher_forward(teacher, ids, mask)
        s_logits = student(input_ids=ids, attention_mask=mask).logits
        loss = kl_loss_masked(s_logits, t_logits, mask, start_pos=kl_start_pos)
        total += loss.item()
        count += 1
        del t_logits, s_logits, loss  # free logits promptly; vocab dim is large
    student.train()

    # Gather raw sums and counts (one [1, 2] row per rank).
    local = torch.tensor([total, float(count)], device=device).reshape(1, 2)
    gathered = accelerator.gather(local)
    global_total = gathered[:, 0].sum().item()
    global_count = gathered[:, 1].sum().item()
    if global_count == 0:
        return float("inf")
    return global_total / global_count
328
+
329
+
330
def save_best(accelerator, student, tokenizer, output_dir, step, eval_kl):
    """Persist the current student to `<output_dir>/best` (main process only).

    The previous best directory is deleted first so the folder always holds
    exactly one checkpoint; best.json records the step and eval KL it came
    from. The surrounding barriers keep the other ranks parked while rank 0
    writes to disk.
    """
    accelerator.wait_for_everyone()
    if accelerator.is_main_process:
        out_dir = Path(output_dir) / "best"
        if out_dir.exists():
            # Overwrite semantics: drop the stale best checkpoint entirely.
            shutil.rmtree(out_dir)
        out_dir.mkdir(parents=True, exist_ok=True)
        # Unwrap the DDP/accelerate wrapper so weights save in plain HF format.
        unwrapped = accelerator.unwrap_model(student)
        unwrapped.save_pretrained(out_dir, safe_serialization=True)
        tokenizer.save_pretrained(out_dir)
        with open(out_dir / "best.json", "w") as f:
            json.dump({"step": step, "eval_kl": eval_kl}, f, indent=2)
        log.info(f" saved best @ step {step}: eval_kl={eval_kl:.6f} -> {out_dir}")
    accelerator.wait_for_everyone()
344
+
345
+
346
+ # ----------------------------------------------------------------------------
347
+ # Main
348
+ # ----------------------------------------------------------------------------
349
+
350
def main():
    """Entry point: KL-distill the student toward the frozen teacher.

    Driven entirely by the TOML config named on the command line; the only
    CLI flag is --config. Designed to run under `accelerate launch`.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--config", required=True, help="Path to TOML config")
    args = p.parse_args()

    cfg = load_config(args.config)

    accelerator = Accelerator(mixed_precision="bf16")
    set_seed(cfg["train"]["seed"])

    if accelerator.is_main_process:
        log.info(f"Loaded config from {args.config}")
        log.info(f"World size: {accelerator.num_processes}")

    # ---- Tokenizer
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(cfg["model"]["tokenizer"])
    if tokenizer.pad_token is None:
        # Fall back to EOS as pad so collate_pad always has a valid pad id.
        tokenizer.pad_token = tokenizer.eos_token
    pad_id = tokenizer.pad_token_id

    # ---- Models
    dtype = torch.bfloat16
    student = load_student(
        cfg["model"]["student"],
        dtype,
        grad_ckpt=cfg["train"]["grad_checkpointing"],
        attn_impl=cfg["train"]["attn_implementation"],
    )
    teacher = load_teacher(
        cfg["model"]["teacher"],
        dtype,
        attn_impl=cfg["train"]["attn_implementation"],
    )

    # ---- Layer modifications (post-load, pre-prepare)
    # Zeroing must happen before accelerator.prepare() wraps the model so
    # every rank starts from identical surgically-modified weights.
    zero_idx = cfg["init"]["zero_layers"]
    if zero_idx:
        n = zero_layers(student, zero_idx)
        if accelerator.is_main_process:
            log.info(f"Zeroed student layers {zero_idx} (model has {n} layers)")

    # Teacher is frozen, so it is moved to the device directly rather than
    # going through accelerator.prepare (no DDP wrapper, no optimizer state).
    teacher = teacher.to(accelerator.device)

    # ---- Optimizer / scheduler
    optimizer = make_optimizer(student, cfg["train"])
    scheduler = make_scheduler(optimizer, cfg["train"])

    student, optimizer, scheduler = accelerator.prepare(
        student, optimizer, scheduler
    )

    # ---- Output dir + config snapshot
    output_dir = Path(cfg["log"]["output_dir"])
    if accelerator.is_main_process:
        output_dir.mkdir(parents=True, exist_ok=True)
        # Snapshot the exact config used, for reproducibility.
        shutil.copy2(args.config, output_dir / "config.snapshot.toml")

    # ---- Wandb (main process only)
    use_wandb = cfg["log"]["wandb"]
    if use_wandb and accelerator.is_main_process:
        import wandb
        wandb.init(
            project=cfg["log"]["wandb_project"],
            name=cfg["log"]["wandb_run"],
            config=cfg,
        )

    # ---- Data loaders
    # Train and eval streams share the dataset but use different shuffle
    # seeds ([data].seed vs [eval].seed) so the eval set is a distinct slice.
    train_loader = StreamingTextLoader(
        name=cfg["data"]["dataset"],
        text_field=cfg["data"]["text_field"],
        min_chars=cfg["data"]["min_chars"],
        max_seq_len=cfg["data"]["max_seq_len"],
        kl_start_pos=cfg["data"]["kl_start_pos"],
        tokenizer=tokenizer,
        rank=accelerator.process_index,
        world_size=accelerator.num_processes,
        seed=cfg["data"]["seed"],
        shuffle_buffer=cfg["data"]["shuffle_buffer"],
    )
    eval_loader = StreamingTextLoader(
        name=cfg["data"]["dataset"],
        text_field=cfg["data"]["text_field"],
        min_chars=cfg["data"]["min_chars"],
        max_seq_len=cfg["data"]["max_seq_len"],
        kl_start_pos=cfg["data"]["kl_start_pos"],
        tokenizer=tokenizer,
        rank=accelerator.process_index,
        world_size=accelerator.num_processes,
        seed=cfg["eval"]["seed"],
        shuffle_buffer=cfg["data"]["shuffle_buffer"],
    )
    # Fixed eval set: drawn once per rank, reused at every eval step.
    eval_per_rank = max(1, cfg["eval"]["samples"] // accelerator.num_processes)
    eval_batches = eval_loader.next_batch(eval_per_rank)
    if accelerator.is_main_process:
        log.info(
            f"Eval set: {len(eval_batches)}/rank x {accelerator.num_processes} ranks "
            f"= {len(eval_batches) * accelerator.num_processes} samples"
        )

    # ---- Train loop
    samples_per_step = cfg["train"]["samples_per_step"]
    grad_clip = cfg["train"]["grad_clip"]
    kl_start_pos = cfg["data"]["kl_start_pos"]
    max_steps = cfg["train"]["max_steps"]
    eval_every = cfg["eval"]["every_steps"]
    log_every = cfg["log"]["log_every"]

    if accelerator.is_main_process:
        log.info(
            f"=== Training: max_steps={max_steps}, samples_per_step={samples_per_step} "
            f"(per rank), effective batch={samples_per_step * accelerator.num_processes}"
        )

    student.train()
    best_kl = float("inf")
    global_step = 0

    while global_step < max_steps:
        t0 = time.time()
        batch = train_loader.next_batch(samples_per_step)
        if not batch:
            # NOTE(review): one rank breaking alone may desync collectives if
            # other ranks still have data - assumes the stream is effectively
            # inexhaustible for the configured max_steps.
            log.warning(f"rank {accelerator.process_index}: data exhausted")
            break

        ids, mask = collate_pad(batch, pad_id)
        ids = ids.to(accelerator.device)
        mask = mask.to(accelerator.device)

        # Teacher forward needs no autograd graph; student forward does.
        with torch.no_grad():
            t_logits = teacher_forward(teacher, ids, mask)
        s_logits = student(input_ids=ids, attention_mask=mask).logits
        loss = kl_loss_masked(s_logits, t_logits, mask, start_pos=kl_start_pos)

        optimizer.zero_grad()
        accelerator.backward(loss)
        if grad_clip > 0:
            accelerator.clip_grad_norm_(student.parameters(), grad_clip)
        optimizer.step()
        scheduler.step()
        global_step += 1

        elapsed = time.time() - t0
        # Average the step KL across ranks for logging.
        kl_local = loss.detach()
        kl_avg = accelerator.gather(kl_local.unsqueeze(0)).mean().item()
        del t_logits, s_logits, loss, kl_local

        if accelerator.is_main_process and global_step % log_every == 0:
            lr_now = scheduler.get_last_lr()[0]
            log.info(
                f"step {global_step}/{max_steps} | kl {kl_avg:.4f} | "
                f"lr {lr_now:.2e} | {elapsed:.2f}s"
            )
            if use_wandb:
                import wandb
                wandb.log(
                    {
                        "train/kl": kl_avg,
                        "train/lr": lr_now,
                        "perf/step_time_s": elapsed,
                    },
                    step=global_step,
                )

        if global_step % eval_every == 0:
            eval_kl = evaluate(
                accelerator, student, teacher, eval_batches, pad_id, kl_start_pos
            )
            if accelerator.is_main_process:
                log.info(
                    f" eval @ step {global_step}: kl={eval_kl:.6f} "
                    f"(best={best_kl:.6f})"
                )
                if use_wandb:
                    import wandb
                    wandb.log({"eval/kl": eval_kl}, step=global_step)
            # All ranks see the same gathered eval_kl, so every rank takes
            # this branch together and save_best's barriers line up.
            if eval_kl < best_kl:
                best_kl = eval_kl
                save_best(
                    accelerator, student, tokenizer, output_dir, global_step, eval_kl
                )
            student.train()

        if global_step % 20 == 0:
            # Periodic cleanup to keep fragmentation and cached blocks down.
            gc.collect()
            torch.cuda.empty_cache()

    # Final eval (catches runs whose last step isn't a multiple of eval_every)
    eval_kl = evaluate(
        accelerator, student, teacher, eval_batches, pad_id, kl_start_pos
    )
    if accelerator.is_main_process:
        log.info(f" final eval: kl={eval_kl:.6f} (best={best_kl:.6f})")
        if use_wandb:
            import wandb
            wandb.log({"eval/kl": eval_kl}, step=global_step)
    if eval_kl < best_kl:
        best_kl = eval_kl
        save_best(accelerator, student, tokenizer, output_dir, global_step, eval_kl)

    if accelerator.is_main_process:
        log.info(f"Done. Best eval KL = {best_kl:.6f}")
        if use_wandb:
            import wandb
            wandb.finish()
556
+
557
+
558
+ if __name__ == "__main__":
559
+ main()
pyproject.toml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ [project]
2
+ name = "distill"
3
+ version = "0.1.0"
4
+ requires-python = ">=3.12"
5
+ dependencies = []
requirements.lock.txt ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accelerate==1.13.0
2
+ aiohappyeyeballs==2.6.1
3
+ aiohttp==3.13.5
4
+ aiosignal==1.4.0
5
+ annotated-doc==0.0.4
6
+ annotated-types==0.7.0
7
+ anyio==4.13.0
8
+ attrs==26.1.0
9
+ certifi==2026.2.25
10
+ charset-normalizer==3.4.7
11
+ click==8.3.2
12
+ cuda-bindings==12.9.4
13
+ cuda-pathfinder==1.2.2
14
+ cuda-toolkit==12.8.1
15
+ datasets==4.8.4
16
+ dill==0.4.1
17
+ einops==0.8.2
18
+ filelock==3.25.2
19
+ fla-core==0.4.2
20
+ flash-attn @ file:///tmp/flash_attn-2.8.3+cu128torch2.11-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl
21
+ flash-linear-attention==0.4.2
22
+ frozenlist==1.8.0
23
+ fsspec==2026.2.0
24
+ gitdb==4.0.12
25
+ gitpython==3.1.46
26
+ h11==0.16.0
27
+ hf-xet==1.4.3
28
+ httpcore==1.0.9
29
+ httpx==0.28.1
30
+ huggingface-hub==1.9.0
31
+ idna==3.11
32
+ jinja2==3.1.6
33
+ markdown-it-py==4.0.0
34
+ markupsafe==3.0.3
35
+ mdurl==0.1.2
36
+ mpmath==1.3.0
37
+ multidict==6.7.1
38
+ multiprocess==0.70.19
39
+ networkx==3.6.1
40
+ numpy==2.4.4
41
+ nvidia-cublas-cu12==12.8.4.1
42
+ nvidia-cuda-cupti-cu12==12.8.90
43
+ nvidia-cuda-nvrtc-cu12==12.8.93
44
+ nvidia-cuda-runtime-cu12==12.8.90
45
+ nvidia-cudnn-cu12==9.19.0.56
46
+ nvidia-cufft-cu12==11.3.3.83
47
+ nvidia-cufile-cu12==1.13.1.3
48
+ nvidia-curand-cu12==10.3.9.90
49
+ nvidia-cusolver-cu12==11.7.3.90
50
+ nvidia-cusparse-cu12==12.5.8.93
51
+ nvidia-cusparselt-cu12==0.7.1
52
+ nvidia-nccl-cu12==2.28.9
53
+ nvidia-nvjitlink-cu12==12.8.93
54
+ nvidia-nvshmem-cu12==3.4.5
55
+ nvidia-nvtx-cu12==12.8.90
56
+ packaging==26.0
57
+ pandas==3.0.2
58
+ platformdirs==4.9.4
59
+ propcache==0.4.1
60
+ protobuf==6.33.6
61
+ psutil==7.2.2
62
+ pyarrow==23.0.1
63
+ pydantic==2.12.5
64
+ pydantic-core==2.41.5
65
+ pygments==2.20.0
66
+ python-dateutil==2.9.0.post0
67
+ pyyaml==6.0.3
68
+ regex==2026.4.4
69
+ requests==2.33.1
70
+ rich==14.3.3
71
+ safetensors==0.7.0
72
+ sentencepiece==0.2.1
73
+ sentry-sdk==2.57.0
74
+ setuptools==70.2.0
75
+ shellingham==1.5.4
76
+ six==1.17.0
77
+ smmap==5.0.3
78
+ sympy==1.14.0
79
+ tokenizers==0.22.2
80
+ tomli-w==1.2.0
81
+ torch==2.11.0+cu128
82
+ tqdm==4.67.3
83
+ transformers @ git+https://github.com/huggingface/transformers.git@52cb0653b48fcb0737a74546911df77034b61732
84
+ triton==3.6.0
85
+ typer==0.24.1
86
+ typing-extensions==4.15.0
87
+ typing-inspection==0.4.2
88
+ urllib3==2.6.3
89
+ wandb==0.25.1
90
+ xxhash==3.6.0
91
+ yarl==1.23.0
scripts/backup_to_hf.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Push the distill code/configs to the HF backup repo.
3
+
4
+ Usage:
5
+ .venv/bin/python scripts/backup_to_hf.py "<commit message>"
6
+ """
7
+ import os
8
+ import sys
9
+ from pathlib import Path
10
+
11
+ from huggingface_hub import HfApi, CommitOperationAdd, create_commit
12
+
13
+ REPO_ID = "Delta-Vector/distill-m-6a3lnzvb-code"
14
+ REPO_TYPE = "model"
15
+
16
+ # Files/directories to mirror to the repo
17
+ INCLUDE = [
18
+ "distill.py",
19
+ "configs/base.toml",
20
+ "configs/zero_14_17.toml",
21
+ "configs/accelerate.yaml",
22
+ "scripts/backup_to_hf.py",
23
+ "pyproject.toml",
24
+ "requirements.lock.txt",
25
+ ]
26
+
27
+
28
def main():
    """Mirror every existing INCLUDE file to REPO_ID in a single commit.

    The commit message is argv[1] (default "update"); authentication comes
    from the HF_TOKEN environment variable, which is required.
    """
    msg = sys.argv[1] if len(sys.argv) > 1 else "update"
    token = os.environ.get("HF_TOKEN")
    if not token:
        print("HF_TOKEN env var required", file=sys.stderr)
        sys.exit(1)

    root = Path(__file__).resolve().parent.parent
    ops = []
    for rel in INCLUDE:
        local = root / rel
        if not local.exists():
            print(f" skip (missing): {rel}")
            continue
        op = CommitOperationAdd(path_in_repo=rel, path_or_fileobj=str(local))
        ops.append(op)
        print(f" add: {rel}")

    if not ops:
        print("nothing to upload")
        return

    # One atomic commit for all files rather than one push per file.
    api = HfApi(token=token)
    api.create_commit(
        repo_id=REPO_ID,
        repo_type=REPO_TYPE,
        operations=ops,
        commit_message=msg,
    )
    print(f"pushed {len(ops)} files to {REPO_ID}: {msg}")
59
+
60
+
61
+ if __name__ == "__main__":
62
+ main()