---
# Pretraining configuration for a small GPT-style language model.
# NOTE(review): original file had all keys fused onto one line, which is not
# parseable YAML (inline ": " inside a plain scalar); reflowed to block style.

# Model architecture
vocab_size: 16000
block_size: 256       # context length in tokens
n_layer: 6
n_head: 6
n_embed: 384          # must be divisible by n_head (384 / 6 = 64 per head)

# Optimization
batch_size: 32
micro_batches: 4      # gradient-accumulation steps per optimizer step
lr: 3.0e-4            # peak learning rate
min_lr: 3.0e-5        # floor after decay
warmup_steps: 200
max_steps: 1000
weight_decay: 0.01
grad_clip: 1.0

# Runtime
dtype: "float32"
device: "auto"        # presumably resolved to cuda/cpu by the trainer — confirm

# Paths
save_dir: "out/pretrain"
tokenizer_path: "out/tokenizer.json"
train_txt: "data/corpus_raw.txt"
sft_jsonl: "data/sft_train.jsonl"