from transformers import PretrainedConfig


class BertEnergyConfig(PretrainedConfig):
    """Configuration for the BERT-Energy model (registered as model_type="bert_energy")."""

    model_type = "bert_energy"

    def __init__(
        self,
        path: str | None = None,
        alpha: float = 1.0,
        beta: float | None = None,
        vocab_size: int = 30000,
        hidden_size: int = 768,
        embedding_dim: int | None = None,
        num_hidden_layers: int = 12,
        num_attention_heads: int = 12,
        intermediate_size: int | None = None,
        activation: str = "relu",
        positional: bool = True,
        share_layers: bool = False,
        layer_norm_eps: float = 1e-12,
        initializer_range: float = 0.02,
        initializer_hopfield_range: float = 0.002,
        hidden_dropout_prob: float = 0.1,
        attention_probs_dropout_prob: float = 0.1,
        max_position_embeddings: int = 512,
        tie_word_embeddings: bool = True,
        bias: bool = True,
        compile: bool = False,
        pad_token_id: int | None = None,
        problem_type: str | None = None,
        **kwargs,
    ):
        super().__init__(pad_token_id=pad_token_id, **kwargs)
        self.path = path

        # Energy-specific parameters
        self.alpha = alpha
        self.beta = beta

        # Vocabulary / dimensions
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.embedding_dim = embedding_dim if embedding_dim is not None else hidden_size

        # Transformer architecture
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = (
            intermediate_size if intermediate_size is not None else hidden_size * 4
        )
        self.activation = activation
        self.positional = positional
        self.share_layers = share_layers
        self.tie_word_embeddings = tie_word_embeddings
        self.bias = bias

        # Regularization / initialization
        self.layer_norm_eps = layer_norm_eps
        self.initializer_range = initializer_range
        self.initializer_hopfield_range = initializer_hopfield_range
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob

        # Sequence length
        self.max_position_embeddings = max_position_embeddings

        # Misc
        self.compile = compile
        self.problem_type = problem_type

        # ---- Validation ----
        # Positivity checks run first so that a zero num_attention_heads raises a
        # ValueError here rather than a ZeroDivisionError in the divisibility check.
        if self.hidden_size <= 0:
            raise ValueError("hidden_size must be > 0")
        if self.embedding_dim <= 0:
            raise ValueError("embedding_dim must be > 0")
        if self.num_hidden_layers <= 0:
            raise ValueError("num_hidden_layers must be > 0")
        if self.num_attention_heads <= 0:
            raise ValueError("num_attention_heads must be > 0")
        if self.max_position_embeddings <= 0:
            raise ValueError("max_position_embeddings must be > 0")
        if self.embedding_dim % self.num_attention_heads != 0:
            raise ValueError("embedding_dim must be divisible by num_attention_heads")
        if self.activation not in ["relu", "gelu", "softmax"]:
            raise ValueError("activation must be one of: relu, gelu, softmax")
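
# A minimal usage sketch (illustrative, not part of the original module): the defaults
# above are internally consistent (embedding_dim falls back to hidden_size = 768, which
# is divisible by num_attention_heads = 12), and the validation block rejects
# configurations where the per-head dimension would not divide evenly.
if __name__ == "__main__":
    config = BertEnergyConfig()
    assert config.embedding_dim == config.hidden_size == 768
    assert config.intermediate_size == 4 * config.hidden_size  # default 4x expansion

    # embedding_dim may differ from hidden_size as long as it stays divisible
    # by num_attention_heads.
    small = BertEnergyConfig(hidden_size=768, embedding_dim=384, num_attention_heads=12)
    assert small.embedding_dim == 384

    # An indivisible embedding_dim / num_attention_heads pair raises at construction.
    try:
        BertEnergyConfig(embedding_dim=100, num_attention_heads=12)
    except ValueError as err:
        print(f"rejected invalid config: {err}")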