# Model checkpoints and device placement for the distillation run.
[model]
teacher = "../models/Qwen3.5-35B-A3B"
student = "../models/sn97-text"
# Tokenizer is loaded from the teacher checkpoint and shared with the student.
tokenizer = "../models/Qwen3.5-35B-A3B"
student_device = "cuda:7"
# GPU indices the teacher is sharded across (student sits on cuda:7 above).
teacher_devices = [0, 1, 2, 3, 4, 5]
# Per-device memory cap for the teacher — NOTE(review): confirm whether the
# consumer interprets this as GB or GiB.
teacher_max_memory_gb = 70
|
|
# Streaming dataset settings for training text.
[data]
dataset = "karpathy/climbmix-400b-shuffle"
text_field = "text"
# Samples shorter than this (in characters) are presumably filtered out —
# TODO confirm against the data loader.
min_chars = 768
max_seq_len = 768
# NOTE(review): looks like the token position from which the KL loss starts
# being applied — verify semantics in the trainer.
kl_start_pos = 64
seed = 6767
shuffle_buffer = 10000
|
|
# Optimizer, batching, and precision settings.
[train]
seed = 6767
lr = 1.0e-7
schedule = "cosine"
warmup_steps = 20
weight_decay = 0.0
grad_clip = 1.0
betas = [0.9, 0.999]
# NOTE(review): 1e-3 is unusually large for an Adam-style epsilon (1e-8 is
# the common default) — confirm this is intentional.
eps = 1.0e-3
# Effective batch = samples_per_step; micro_batch_size controls per-forward
# chunking (gradient accumulation) — TODO confirm in trainer.
samples_per_step = 2
micro_batch_size = 1
max_steps = 20
grad_checkpointing = true
attn_implementation = "flash_attention_2"
student_dtype = "bfloat16"
teacher_dtype = "bfloat16"
kl_chunk_size = 128
# Presumably an LR multiplier for newly added/initialized layers — verify.
new_layer_lr_mul = 5.0
# Literal strings avoid double-escaping the regex backslashes; the parsed
# values are identical to the previous basic-string ("\\.") form.
trainable_patterns = ['^model\.layers\.(32|33|34)\.', '^lm_head', '^model\.norm']
|
|
# Periodic evaluation settings.
[eval]
every_steps = 5
samples = 32
# Separate seed so eval sampling is stable across runs regardless of train seed.
seed = 4242
cache_path = "./cache/sn97_eval_32.pt"
|
|
# Logging and output locations. W&B is disabled; project/run names are kept
# so it can be re-enabled without further edits.
[log]
wandb = false
wandb_project = "distil-subnet97"
wandb_run = "sn97_text35_warm"
log_every = 1
output_dir = "./out/sn97_text35_warm"
experiment_log = "./out/experiments.jsonl"
|
|
# Student initialization.
[init]
# Layer indices to zero-initialize; empty means none — TODO confirm semantics.
zero_layers = []
target_num_layers = 35
|
|