---
# MTP Mini Configuration
# Defect fixed: the file had been collapsed onto a single line beginning with
# '#', so a YAML parser treated the entire config as one comment and loaded
# an empty document. Reconstructed as block-style YAML with 2-space indent.

# Transformer model hyperparameters.
model:
  vocab_size: 8000      # tokenizer vocabulary size
  d_model: 256          # embedding / hidden dimension
  n_layers: 4           # number of transformer layers
  n_heads: 4            # attention heads per layer
  d_ff: 1024            # feed-forward inner dimension
  max_seq_len: 128      # maximum sequence length in tokens
  dropout: 0.1          # dropout probability

# Optimizer / training-loop settings.
training:
  batch_size: 4
  epochs: 20
  learning_rate: 0.0003
  weight_decay: 0.01
  max_grad_norm: 1.0    # gradient-clipping threshold
  num_threads: 4
  save_every: 5         # checkpoint interval, in epochs

# Input data locations.
data:
  corpus_path: corpus/mtp_mini_corpus.jsonl