# Knowledge-distillation run configuration (teacher -> student).
# NOTE(review): original file had all pairs collapsed onto one line, which is
# invalid TOML 1.0 (one key/value pair per line is required); reformatted only —
# every key and value is unchanged.

[model]
teacher = "../models/Qwen3.5-35B-A3B"
student = "../models/sn97-text"
# Tokenizer is shared with the teacher checkpoint.
tokenizer = "../models/Qwen3.5-35B-A3B"
student_device = "cuda:7"
# Teacher is sharded across these CUDA device indices.
teacher_devices = [0, 1, 2, 3, 4, 5]
teacher_max_memory_gb = 70  # per-device cap, GiB — presumably; confirm against loader

[data]
dataset = "karpathy/climbmix-400b-shuffle"
text_field = "text"
min_chars = 2560      # skip samples shorter than this many characters
max_seq_len = 2048    # tokens
kl_start_pos = 128    # NOTE(review): presumably first token position where KL loss applies — confirm
seed = 6767
shuffle_buffer = 10000

[train]
seed = 6767
lr = 1.0e-7
schedule = "constant"
warmup_steps = 0
weight_decay = 0.0
grad_clip = 1.0
betas = [0.9, 0.999]
# NOTE(review): 1.0e-3 is unusually large for an Adam-style eps (typical 1e-8);
# confirm this is intentional before reuse.
eps = 1.0e-3
samples_per_step = 4
micro_batch_size = 1  # effective grad-accum steps = samples_per_step / micro_batch_size — verify
max_steps = 1         # NOTE(review): single step — looks like a smoke-test/eval run
grad_checkpointing = true
attn_implementation = "flash_attention_2"
student_dtype = "bfloat16"
teacher_dtype = "bfloat16"
kl_chunk_size = 256   # presumably logits are chunked along sequence for the KL — confirm
new_layer_lr_mul = 1.0

[eval]
every_steps = 1
samples = 16
seed = 4242
cache_path = "./cache/sn97_eval_long_16.pt"

[log]
wandb = false  # wandb_* keys below are inert while this is false
wandb_project = "distil-subnet97"
wandb_run = "sn97_eval_long"
log_every = 1
output_dir = "./out/sn97_eval_long"
experiment_log = "./out/experiments.jsonl"

[init]
# Empty: no layers are zero-initialized in this run.
zero_layers = []
target_num_layers = 32