---
# Pretraining configuration for a small GPT-style language model.
# NOTE(review): original file had all keys fused onto one line, which is not
# parseable YAML (inline ": " inside a plain scalar); reflowed to block style.

# Model architecture
vocab_size: 16000
block_size: 256       # context length in tokens
n_layer: 6
n_head: 6
n_embed: 384          # must be divisible by n_head (384 / 6 = 64 per head)

# Optimization
batch_size: 32
micro_batches: 4      # gradient-accumulation steps per optimizer step
lr: 3.0e-4            # peak learning rate
min_lr: 3.0e-5        # floor after decay
warmup_steps: 200
max_steps: 1000
weight_decay: 0.01
grad_clip: 1.0

# Runtime
dtype: "float32"
device: "auto"        # presumably resolved to cuda/cpu by the trainer — confirm

# Paths
save_dir: "out/pretrain"
tokenizer_path: "out/tokenizer.json"
train_txt: "data/corpus_raw.txt"
sft_jsonl: "data/sft_train.jsonl"