# Run configuration, split into [model], [data], [train], [eval], [log], [init].
# The script that consumes this file is not visible here; anything marked
# NOTE(review) is an assumption to confirm against that script.

[model]
teacher = "../models/Qwen3.5-35B-A3B"
# Points at the `best` directory of the phase3 output tree, while [log] below
# writes to phase5. NOTE(review): confirm phase3 (not a phase4 run) is the
# intended starting checkpoint.
student = "./out/sn97_text32_long_phase3/best"
# Same path as `teacher`.
tokenizer = "../models/Qwen3.5-35B-A3B"
student_device = "cuda:7"
# Indices 0-5 are listed; with the student on cuda:7, index 6 is not
# referenced anywhere in this file.
teacher_devices = [0, 1, 2, 3, 4, 5]
# NOTE(review): presumably a per-device memory cap for the teacher - confirm.
teacher_max_memory_gb = 70

[data]
dataset = "karpathy/climbmix-400b-shuffle"
text_field = "text"
# NOTE(review): units differ from max_seq_len (characters vs. presumably
# tokens) - confirm how the two filters interact.
min_chars = 2560
max_seq_len = 2048
# NOTE(review): presumably the first sequence position that contributes to
# the KL loss - confirm.
kl_start_pos = 128
# Same value as [train].seed.
seed = 6767
shuffle_buffer = 10000

[train]
# Same value as [data].seed.
seed = 6767
# Held fixed for the whole run: constant schedule, no warmup.
lr = 2.0e-9
schedule = "constant"
warmup_steps = 0
weight_decay = 0.0
grad_clip = 1.0
betas = [0.9, 0.99]
# NOTE(review): if this feeds an Adam-style optimizer, 1e-2 is far above the
# usual 1e-8 default - presumably intentional, confirm.
eps = 1.0e-2
# NOTE(review): 16 samples per step at micro-batch 1 presumably means 16
# accumulation passes per optimizer step - confirm.
samples_per_step = 16
micro_batch_size = 1
max_steps = 20
grad_checkpointing = true
attn_implementation = "flash_attention_2"
student_dtype = "bfloat16"
teacher_dtype = "bfloat16"
kl_chunk_size = 256
new_layer_lr_mul = 1.0

[eval]
# With [train].max_steps = 20, the multiples of this interval are 10 and 20.
every_steps = 10
samples = 16
# Distinct from the data/train seed (6767).
seed = 4242
cache_path = "./cache/sn97_eval_long_16.pt"

[log]
# wandb is off; NOTE(review): wandb_project / wandb_run are presumably
# ignored while it is false - confirm.
wandb = false
wandb_project = "distil-subnet97"
wandb_run = "sn97_text32_long_phase5"
log_every = 1
# Final path component matches wandb_run.
output_dir = "./out/sn97_text32_long_phase5"
# Lives directly under ./out, outside this run's output_dir.
experiment_log = "./out/experiments.jsonl"

[init]
# Empty list. NOTE(review): presumably means no layers are zeroed - confirm.
zero_layers = []
target_num_layers = 32