| attn_name: torch_attn | |
| cast_output_to_float32: false | |
| ffn_type: swiglu | |
| freeze: false | |
| hidden_dim: 2048 | |
| is_causal: true | |
| max_seq_len: 2048 | |
| n_heads: 16 | |
| n_layers: 24 | |
| norm_eps: 1.0e-05 | |
| norm_type: lp_layer_norm | |
| positional_embedding_type: rotary | |
| post_embed_norm: false | |
| qk_norm: true | |
| resume_from_checkpoint: null | |
| resume_weights_only: false | |
| type: transformer | |
| vocab_size: 49280 | |
| weight_tying: false | |