# Layer-zero distillation: zero student layers 14-17 at init,
# constant LR 5e-7, 2000 steps. Aim: lower KL than the prior checkpoint
# despite the surgery.

[model]
teacher = "Qwen/Qwen3.5-35B-A3B"
student = "Troiaaa/m-6a3lnzvb"
tokenizer = "Qwen/Qwen3.5-35B-A3B"

[data]
dataset = "karpathy/climbmix-400b-shuffle"
text_field = "text"
min_chars = 2560        # minimum document length in characters — shorter samples skipped, presumably; confirm in loader
max_seq_len = 640       # tokens
kl_start_pos = 128      # NOTE(review): looks like KL loss starts at this token position — confirm in trainer
seed = 42
shuffle_buffer = 10000

[train]
seed = 42
lr = 5.0e-7
schedule = "constant"
warmup_steps = 0
weight_decay = 0.0
grad_clip = 1.0
betas = [0.9, 0.95]
eps = 1.0e-8
samples_per_step = 8    # equals micro_batch_size, so no gradient accumulation — presumed; verify
micro_batch_size = 8
max_steps = 2000
grad_checkpointing = true
attn_implementation = "flash_attention_2"
student_dtype = "bfloat16"
teacher_dtype = "bfloat16"
mixed_precision = "bf16"
kl_chunk_size = 0       # NOTE(review): 0 presumably disables chunked KL computation — confirm
new_layer_lr_mul = 1.0  # no LR boost for the zeroed layers

[eval]
every_steps = 50
samples = 64
seed = 1234

[log]
wandb = true
wandb_project = "distil-subnet97"
wandb_run = "m-6a3lnzvb-zero14_17"
log_every = 1
output_dir = "./out/zero_14_17"

[init]
# Layer surgery: these student transformer layers are zeroed at init.
zero_layers = [14, 15, 16, 17]
target_num_layers = 32