# Model configuration: a flat mapping of settings, one key per line.
# `type: transformer` selects the model family; the remaining keys are
# presumably read by that model's constructor (confirm against the loader).
# Keys are kept in their original (alphabetical) order for stable diffs.
attn_name: torch_attn
cast_output_to_float32: false
ffn_type: swiglu
freeze: false
hidden_dim: 2048
is_causal: true
max_seq_len: 2048
n_heads: 16
n_layers: 24
# Written with a decimal point and a signed exponent so that YAML 1.1
# loaders (e.g. PyYAML) resolve it as a float rather than a string.
norm_eps: 1.0e-05
norm_type: lp_layer_norm
positional_embedding_type: rotary
post_embed_norm: false
qk_norm: true
# Explicit null: no checkpoint is configured here.
resume_from_checkpoint: null
resume_weights_only: false
type: transformer
vocab_size: 49280
weight_tying: false