{
  "vocab_size": 256,
  "max_seq_len": 64,
  "n_layers": 2,
  "n_heads": 4,
  "n_kv_heads": 2,
  "n_embd": 64,
  "n_inner": 128,
  "dropout": 0.0,
  "bias": false,
  "rope_theta": 10000.0,
  "rms_norm_eps": 1e-06,
  "tie_embeddings": true,
  "batch_size": 2,
  "learning_rate": 0.001,
  "weight_decay": 0.1,
  "max_steps": 3,
  "warmup_steps": 0,
  "eval_interval": 250,
  "eval_steps": 20,
  "log_interval": 50,
  "save_interval": 1000,
  "gradient_accumulation_steps": 4,
  "grad_clip": 1.0,
  "dataset_name": "roneneldan/TinyStories",
  "output_dir": "checkpoints"
}