{ "depth": 8, "vocab_size": 186, "max_seq_len": 4096, "mlp_type": "relu2", "logit_cap": 15.0, "use_residual_scalars": true, "learning_rate": 0.001, "warmup_frac": 0.02, "weight_decay": 0.1, "grad_clip": 1.0, "num_epochs": 20 }