{
  "model_type": "ministu",
  "_name_or_path": "STU_500M",
  "architectures": ["MiniSTU"],
  "n_embd": 896,
  "n_heads": 8,
  "n_layers": 12,
  "seq_len": 8192,
  "weight_tying": true,
  "window_size": 1024,
  "vocab_size": 200064,
  "mlp_scale": 12,
  "bias": false,
  "dropout": 0.0,
  "num_eigh": 24,
  "use_hankel_L": false,
  "num_epochs": 1,
  "global_bsz": 524288,
  "bsz": 2,
  "warmup_steps": 1907,
  "eval_period": 50,
  "save_period": 500,
  "max_lr": 3.0e-3,
  "min_lr": 3.0e-5,
  "max_norm": 1.0,
  "dilation": 2,
  "fsdp": true,
  "ddp": false,
  "mixed_precision": true,
  "torch_dtype": "bfloat16",
  "use_cpu_offload": false,
  "sharding_strategy": "full_shard",
  "state_dict_type": "full",
  "auto_wrap_policy": "partial",
  "backward_prefetch": "backward_pre",
  "forward_prefetch": false,
  "sync_module_states": true,
  "use_orig_params": true,
  "device_id": null,
  "precision": {
    "param": "bfloat16",
    "reduce": "bfloat16",
    "buffer": "bfloat16"
  },
  "fsdp_modules": [
    "STU",
    "Attention",
    "MLP"
  ],
  "use_activation_checkpointing": true,
  "use_flash_fft": true,
  "use_approx": true,
  "use_attn": true,
  "softcap": 50.0,
  "theta": 10000.0,
  "use_alibi": false,
  "torch_compile": false
}