# grow40_winning v2: same hparams, but the scheduler bug is fixed in distill.py
# (we no longer prepare the scheduler with accelerate, so cosine reaches its
# minimum at step max_steps instead of step max_steps / num_processes).

[model]
teacher = "Qwen/Qwen3.5-35B-A3B"
student = "Troiaaa/m-6a3lnzvb"
tokenizer = "Qwen/Qwen3.5-35B-A3B"

[data]
dataset = "karpathy/climbmix-400b-shuffle"
text_field = "text"
min_chars = 2560
max_seq_len = 2048
kl_start_pos = 128
seed = 6767
shuffle_buffer = 10000

[train]
seed = 6767
lr = 5.0e-7
schedule = "cosine"
warmup_steps = 100
weight_decay = 0.0
grad_clip = 1.0
betas = [0.9, 0.999]
eps = 1.0e-3
samples_per_step = 4
micro_batch_size = 4
max_steps = 2000
grad_checkpointing = true
attn_implementation = "flash_attention_2"
student_dtype = "bfloat16"
teacher_dtype = "bfloat16"
mixed_precision = "bf16"
kl_chunk_size = 256
new_layer_lr_mul = 1.0

[eval]
every_steps = 50
samples = 500
seed = 4242

[log]
wandb = true
wandb_project = "distil-subnet97"
wandb_run = "grow40_winning_v2"
log_every = 1
output_dir = "./out/grow40_winning_v2"

[init]
zero_layers = []
target_num_layers = 40