{ "name": "eqlm-small-140m", "hf_config": { "name": "eqlm-small-140m" }, "block_size": 2048, "n_embd": 1024, "intermediate_size": 4096, "num_attention_heads": 8, "num_key_value_heads": 8, "vocab_size": 32768, "padding_multiple": 64, "padded_vocab_size": 32768, "rope_settings": { "use_rope": true, "rope_condense_ratio": 1, "rope_base": 50000 }, "use_abacus": false, "randomize_positions_from": null, "block_class_name": "TransformerPreNormBlock", "norm_class_name": "RMSNorm", "attn_impl": "flash", "norm_eps": 1e-05, "mlp_class_name": "BaseMLP", "nonlin_name": "ReLU2", "bias": false, "qk_bias": false, "init_strategy": "scaled-zero", "init_orthogonal": true, "skip_initialization": false, "mup_model_scaling_factor": 1, "use_fused_head": "pytorch", "debias_attention": false, "center_attention": false, "clip_qkv": null, "qk_norm": true, "logit_softcap": null, "causal": true, "activation_checkpoint_impl": "per-block", "simple_ops": false, "strategy": "ddp", "n_backbone_layers": 7, "n_fp_blocks": 1, "tie_embeddings": true, "solver": "anderson", "max_iter": 64, "min_iter": 6, "tol": 0.0003, "anderson_m": 5, "anderson_beta": 1.0, "backward_type": "onestep", "backward_max_iter": 64, "backward_min_iter": 6, "backward_tol": 0.0003, "adjoint_grad_clip": null, "layer_scale_init": 0.75, "gamma_max": 0.75, "fp_lr_scale": 0.5, "fp_wd": 0.1, "recurrent_embedding_dimension": 1024, "model_class_name": "EQLM", "_class_name": "EQLMConfig" }