{ "AP_embeddings": true, "architectures": [ "NeoBERT" ], "attention_activation": "softmax", "attention_ativation": "softmax", "attention_probs_dropout_prob": 0.1, "base_scale": 0.03227486121839514, "classifier_init_range": 0.02, "decoder_init_range": 0.02, "dim_head": 128, "dropout_prob": 0, "embedding_init_range": 0.02, "entropy_regularization_lambda": 0.01, "flash_attention": false, "hidden_act": "swiglu", "hidden_size": 768, "intermediate_size": 3072, "kwargs": { "attention_ativation": "softmax", "classifier_init_range": 0.02, "entropy_regularization_lambda": 0.01 }, "max_length": 512, "mix_attentions": "sum", "mixed_feed_forward": true, "model_type": "neobert", "ngpt": false, "norm_eps": 1e-05, "num_attention_heads": 6, "num_hidden_layers": 6, "pad_token_id": 0, "pos_dropout_prob": 0.1, "pos_intermediate_size": 1536, "pos_size": 384, "positional_embed_init": "random", "posneobert": false, "random_offset": false, "relative_pos_bias": false, "rms_norm": true, "rope": false, "scale_QK_dim": true, "share_pos_embeds_in_heads": false, "shared_pos_keys": false, "torch_dtype": "float32", "transformers_version": "4.46.3", "untie_cls": false, "use_only_sem_for_decoding": false, "vocab_size": 30522 }