# Source: distill-m-6a3lnzvb-code / sn97_smoke.toml
# Hugging Face page header, not part of the config: "Delta-Vector's picture",
# "Upload folder using huggingface_hub", commit 2d38ae8 (verified).
# Checkpoint locations and GPU placement for the teacher and student models.
# NOTE(review): key semantics are inferred from their names; confirm against the loader.
[model]
# The teacher and the tokenizer are loaded from the same directory.
teacher = "../models/Qwen3.5-35B-A3B"
tokenizer = "../models/Qwen3.5-35B-A3B"
student = "../models/sn97-xxxn"

# The teacher lists device indices 0-5 and the student names "cuda:7";
# index 6 is not referenced anywhere in this file.
teacher_devices = [0, 1, 2, 3, 4, 5]
teacher_max_memory_gb = 70  # presumably a per-device cap in GB - TODO confirm
student_device = "cuda:7"

# Training text source, shuffling and sequence shaping.
[data]
dataset = "karpathy/climbmix-400b-shuffle"
text_field = "text"

# Same seed value as [train].seed in this file.
seed = 1234
shuffle_buffer = 10_000

# NOTE(review): min_chars presumably counts characters and max_seq_len tokens,
# even though both are 512 here - confirm the units against the data loader.
min_chars = 512
max_seq_len = 512
kl_start_pos = 64  # presumably the first position included in the KL loss - TODO confirm

# Optimizer, schedule and runtime settings for the training loop.
[train]
# Run length and batching: two steps, one sample per step.
seed = 1234
max_steps = 2
samples_per_step = 1
micro_batch_size = 1

# Optimizer hyperparameters.
lr = 1e-7
betas = [0.9, 0.95]
eps = 1e-8
weight_decay = 0.0
grad_clip = 1.0
new_layer_lr_mul = 1.0  # presumably an LR multiplier for newly added layers - TODO confirm

# Constant schedule with zero warmup steps.
schedule = "constant"
warmup_steps = 0

# Precision, attention backend and memory-related options.
student_dtype = "bfloat16"
teacher_dtype = "bfloat16"
attn_implementation = "flash_attention_2"
grad_checkpointing = true
kl_chunk_size = 128  # NOTE(review): unit (positions per KL chunk?) is not visible here - confirm

# Evaluation cadence and sample set.
[eval]
# Uses its own seed (4242), distinct from the 1234 used by [data] and [train].
seed = 4242
samples = 4
every_steps = 1
# NOTE(review): presumably a cache file for the evaluation set - confirm what is stored.
cache_path = "./cache/sn97_smoke_eval.pt"

# Logging cadence and output locations.
[log]
log_every = 1
output_dir = "./out/sn97_smoke"
# This path sits directly under ./out, outside the run's own output_dir.
experiment_log = "./out/experiments.jsonl"

# wandb is switched off here; the project and run names presumably
# only take effect when it is enabled - TODO confirm.
wandb = false
wandb_project = "distil-subnet97"
wandb_run = "sn97_smoke"

# Student initialisation options.
# NOTE(review): presumably the student is built or resized to target_num_layers
# and zero_layers names layers to zero-initialise - confirm against the init code.
[init]
target_num_layers = 32
zero_layers = []  # empty: no layers are listed