# Base distillation config (smoketest variant).
# Every value the script reads must live in this file - no defaults in code.

# Identifiers for the teacher, student and tokenizer.
# NOTE(review): these look like Hugging Face Hub repo ids - confirm with the loader.
[model]
teacher = "Qwen/Qwen3.5-35B-A3B"
student = "Troiaaa/m-6a3lnzvb"
# Same identifier as `teacher`.
tokenizer = "Qwen/Qwen3.5-35B-A3B"

[data]
dataset = "karpathy/climbmix-400b-shuffle"
text_field = "text"
# NOTE(review): presumably a minimum document length in characters, and
# max_seq_len a length in tokens - confirm units against the script.
min_chars = 2560
max_seq_len = 640
# NOTE(review): presumably the first sequence position that contributes to the
# KL loss - confirm against the script.
kl_start_pos = 128
# Separate key from [train].seed; both are currently 42.
seed = 42
shuffle_buffer = 10000

[train]
seed = 42
lr = 5.0e-7
schedule = "constant"
warmup_steps = 0
weight_decay = 0.0
grad_clip = 1.0
betas = [0.9, 0.95]
eps = 1.0e-8
# samples_per_step equals micro_batch_size here.
# NOTE(review): presumably this means a single micro-batch per optimizer step - confirm.
samples_per_step = 4
micro_batch_size = 4
# Very short run, in line with the "smoketest" variant named in the header.
max_steps = 5
grad_checkpointing = true
attn_implementation = "flash_attention_2"
student_dtype = "bfloat16"
teacher_dtype = "bfloat16"
mixed_precision = "bf16"
# NOTE(review): 0 presumably disables chunking of the KL computation - confirm.
kl_chunk_size = 0
# NOTE(review): presumably a multiplier applied to `lr` for newly added layers;
# verify against the optimizer setup.
new_layer_lr_mul = 1.0

[eval]
# Same value as [train].max_steps.
every_steps = 5
samples = 16
seed = 1234

[log]
wandb = true
wandb_project = "distil-subnet97"
wandb_run = "smoketest"
log_every = 1
output_dir = "./out/smoketest"

[init]
# Empty list: no layers are named here.
zero_layers = []
target_num_layers = 32