Upload model
- README.md +199 -0
- config.json +35 -0
- hf_configuration.py +93 -0
- mlm.py +486 -0
- model.safetensors +3 -0
README.md
ADDED
@@ -0,0 +1,199 @@
---
library_name: transformers
tags: []
---

# Model Card for Model ID

<!-- Provide a quick summary of what the model is/does. -->

## Model Details

### Model Description

<!-- Provide a longer summary of what this model is. -->

This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated.

- **Developed by:** [More Information Needed]
- **Funded by [optional]:** [More Information Needed]
- **Shared by [optional]:** [More Information Needed]
- **Model type:** [More Information Needed]
- **Language(s) (NLP):** [More Information Needed]
- **License:** [More Information Needed]
- **Finetuned from model [optional]:** [More Information Needed]

### Model Sources [optional]

<!-- Provide the basic links for the model. -->

- **Repository:** [More Information Needed]
- **Paper [optional]:** [More Information Needed]
- **Demo [optional]:** [More Information Needed]

## Uses

<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->

### Direct Use

<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->

[More Information Needed]

### Downstream Use [optional]

<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->

[More Information Needed]

### Out-of-Scope Use

<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->

[More Information Needed]

## Bias, Risks, and Limitations

<!-- This section is meant to convey both technical and sociotechnical limitations. -->

[More Information Needed]

### Recommendations

<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->

Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.

## How to Get Started with the Model

Use the code below to get started with the model.

[More Information Needed]

## Training Details

### Training Data

<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->

[More Information Needed]

### Training Procedure

<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->

#### Preprocessing [optional]

[More Information Needed]

#### Training Hyperparameters

- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->

#### Speeds, Sizes, Times [optional]

<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->

[More Information Needed]

## Evaluation

<!-- This section describes the evaluation protocols and provides the results. -->

### Testing Data, Factors & Metrics

#### Testing Data

<!-- This should link to a Dataset Card if possible. -->

[More Information Needed]

#### Factors

<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->

[More Information Needed]

#### Metrics

<!-- These are the evaluation metrics being used, ideally with a description of why. -->

[More Information Needed]

### Results

[More Information Needed]

#### Summary

## Model Examination [optional]

<!-- Relevant interpretability work for the model goes here -->

[More Information Needed]

## Environmental Impact

<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->

Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).

- **Hardware Type:** [More Information Needed]
- **Hours used:** [More Information Needed]
- **Cloud Provider:** [More Information Needed]
- **Compute Region:** [More Information Needed]
- **Carbon Emitted:** [More Information Needed]

## Technical Specifications [optional]

### Model Architecture and Objective

[More Information Needed]

### Compute Infrastructure

[More Information Needed]

#### Hardware

[More Information Needed]

#### Software

[More Information Needed]

## Citation [optional]

<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->

**BibTeX:**

[More Information Needed]

**APA:**

[More Information Needed]

## Glossary [optional]

<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->

[More Information Needed]

## More Information [optional]

[More Information Needed]

## Model Card Authors [optional]

[More Information Needed]

## Model Card Contact

[More Information Needed]
config.json
ADDED
@@ -0,0 +1,35 @@
{
  "_name_or_path": "FOR_JOSEPH/NRJ-BASE-125K",
  "activation": "relu",
  "alpha": 1.0,
  "architectures": [
    "BertEnergyModelForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.0,
  "auto_map": {
    "AutoConfig": "hf_configuration.BertEnergyConfig",
    "AutoModelForMaskedLM": "mlm.BertEnergyModelForMaskedLM"
  },
  "beta": null,
  "bias": true,
  "compile": true,
  "embedding_dim": 768,
  "hidden_dropout_prob": 0.1,
  "hidden_size": 3072,
  "initializer_hopfield_range": 0.002,
  "initializer_range": 0.02,
  "intermediate_size": 12288,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert_energy",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 3,
  "path": null,
  "positional": true,
  "share_layers": true,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.49.0",
  "vocab_size": 30000
}
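
Given the `auto_map` entries above, the checkpoint can be loaded through the Auto classes with `trust_remote_code`. A minimal sketch follows; the repo id is inferred from `"_name_or_path"` and is an assumption, and no tokenizer is included in this upload, so one has to be supplied separately.

from transformers import AutoConfig, AutoModelForMaskedLM

repo = "FOR_JOSEPH/NRJ-BASE-125K"  # assumed from "_name_or_path"; adjust to the actual repo id
config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(repo, trust_remote_code=True)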
hf_configuration.py
ADDED
@@ -0,0 +1,93 @@
from transformers import PretrainedConfig


class BertEnergyConfig(PretrainedConfig):
    model_type = "bert_energy"

    def __init__(
        self,
        path: str | None = None,
        alpha: float = 1.0,
        beta: float | None = None,
        vocab_size: int = 30000,
        hidden_size: int = 768,
        embedding_dim: int | None = None,
        num_hidden_layers: int = 12,
        num_attention_heads: int = 12,
        intermediate_size: int | None = None,
        activation: str = "relu",
        positional: bool = True,
        share_layers: bool = False,
        layer_norm_eps: float = 1e-12,
        initializer_range: float = 0.02,
        initializer_hopfield_range: float = 0.002,
        hidden_dropout_prob: float = 0.1,
        attention_probs_dropout_prob: float = 0.1,
        max_position_embeddings: int = 512,
        tie_word_embeddings: bool = True,
        bias: bool = True,
        compile: bool = False,
        pad_token_id: int | None = None,
        problem_type: str | None = None,
        **kwargs,
    ):
        super().__init__(pad_token_id=pad_token_id, **kwargs)

        self.path = path

        # Energy-specific parameters
        self.alpha = alpha
        self.beta = beta

        # Vocabulary / dimensions
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.embedding_dim = embedding_dim if embedding_dim is not None else hidden_size

        # Transformer architecture
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = (
            intermediate_size if intermediate_size is not None else hidden_size * 4
        )
        self.activation = activation
        self.positional = positional
        self.share_layers = share_layers
        self.tie_word_embeddings = tie_word_embeddings
        self.bias = bias

        # Regularization / initialization
        self.layer_norm_eps = layer_norm_eps
        self.initializer_range = initializer_range
        self.initializer_hopfield_range = initializer_hopfield_range
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob

        # Sequence length
        self.max_position_embeddings = max_position_embeddings

        # Misc
        self.compile = compile
        self.problem_type = problem_type

        # ---- Validation ----
        if self.embedding_dim % self.num_attention_heads != 0:
            raise ValueError("embedding_dim must be divisible by num_attention_heads")

        if self.hidden_size <= 0:
            raise ValueError("hidden_size must be > 0")

        if self.embedding_dim <= 0:
            raise ValueError("embedding_dim must be > 0")

        if self.num_hidden_layers <= 0:
            raise ValueError("num_hidden_layers must be > 0")

        if self.num_attention_heads <= 0:
            raise ValueError("num_attention_heads must be > 0")

        if self.max_position_embeddings <= 0:
            raise ValueError("max_position_embeddings must be > 0")

        if self.activation not in ["relu", "gelu", "softmax"]:
            raise ValueError("activation must be one of: relu, gelu, softmax")
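
For illustration, a short sketch of the derived defaults and the validation implemented above; it assumes the file is importable as `hf_configuration` and uses only the class just defined.

from hf_configuration import BertEnergyConfig

cfg = BertEnergyConfig(hidden_size=768, num_attention_heads=12)
assert cfg.embedding_dim == 768          # embedding_dim falls back to hidden_size
assert cfg.intermediate_size == 4 * 768  # intermediate_size falls back to 4 * hidden_size

# 100 is not divisible by 12 heads, so validation rejects it.
try:
    BertEnergyConfig(embedding_dim=100, num_attention_heads=12)
except ValueError as err:
    print(err)  # embedding_dim must be divisible by num_attention_heads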
mlm.py
ADDED
@@ -0,0 +1,486 @@
import torch
import torch.nn as nn
from torch.nn.functional import gelu
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers import PreTrainedModel
from transformers.modeling_outputs import (
    BaseModelOutput,
    MaskedLMOutput,
    SequenceClassifierOutput,
)

from hopfield import HopfieldLayer
from hf_configuration import BertEnergyConfig
from positional import PositionalEncoding


class EnergyLMHead(nn.Module):
    """
    MLM head for the energy backbone.

    Architecture:
        hidden -> dense -> gelu -> layer_norm -> dropout -> decoder(vocab)
    """

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.embedding_dim, config.embedding_dim)
        self.layer_norm = nn.LayerNorm(
            config.embedding_dim,
            eps=config.layer_norm_eps,
        )
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        self.decoder = nn.Linear(config.embedding_dim, config.vocab_size, bias=True)

    @property
    def bias(self):
        return self.decoder.bias

    def forward(self, hidden_states):
        x = self.dense(hidden_states)
        x = gelu(x)
        x = self.layer_norm(x)
        x = self.dropout(x)
        x = self.decoder(x)
        return x

    def _tie_weights(self):
        pass


class AlbertMLMHead(nn.Module):
    """
    ALBERT-style MLM head:
        hidden (H) -> embedding (E) -> LN -> vocab (V)
    """

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.embedding_dim)
        self.layer_norm = nn.LayerNorm(config.embedding_dim, eps=config.layer_norm_eps)
        self.decoder = nn.Linear(config.embedding_dim, config.vocab_size, bias=True)

    def forward(self, hidden_states):
        x = self.dense(hidden_states)
        x = gelu(x)
        x = self.layer_norm(x)
        return self.decoder(x)


class MLMHead(nn.Module):
    """
    Standard BERT/RoBERTa-style MLM head.
    """

    def __init__(self, input_dim, hidden_dim, config):
        super().__init__()
        self.dense = nn.Linear(input_dim, hidden_dim)
        self.layer_norm = nn.LayerNorm(hidden_dim, eps=config.layer_norm_eps)
        self.decoder = nn.Linear(hidden_dim, config.vocab_size, bias=True)

    @property
    def bias(self):
        return self.decoder.bias

    def forward(self, features, **kwargs):
        x = self.dense(features)
        x = gelu(x)
        x = self.layer_norm(x)
        x = self.decoder(x)
        return x

    def _tie_weights(self):
        pass


class BertPreTrainedModel(PreTrainedModel):
    """
    Common pretrained model base.
    """
    config_class = BertEnergyConfig

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()

        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


class BertModel(BertPreTrainedModel):
    """
    Standard transformer backbone.
    Outputs: last hidden state, optional hidden state history.
    """

    config_class = BertEnergyConfig

    def __init__(self, config, add_pooling_layer=True, pad_idx=None, **kwargs):
        super().__init__(config)

        self.Emb_in = nn.Embedding(config.vocab_size, config.embedding_dim, padding_idx=pad_idx)
        self.posn = (
            PositionalEncoding(
                config.embedding_dim,
                max_len=config.max_position_embeddings,
            )
            if config.positional
            else None
        )
        self.embed_norm = nn.LayerNorm(config.embedding_dim, eps=config.layer_norm_eps)
        self.embed_dropout = nn.Dropout(config.hidden_dropout_prob)

        self.num_layers = config.num_hidden_layers
        self.share_layers = config.share_layers

        if self.share_layers:
            self.embedding_hidden_in = nn.Linear(config.embedding_dim, config.hidden_size)

            layer = nn.TransformerEncoderLayer(
                d_model=config.hidden_size,
                nhead=config.num_attention_heads,
                activation=config.activation,
                dim_feedforward=config.hidden_size,
                dropout=config.hidden_dropout_prob,
                layer_norm_eps=config.layer_norm_eps,
                batch_first=True,
                norm_first=True,
            )
            self.layers = nn.ModuleList([layer])
            self.output_dim = config.hidden_size
        else:
            self.embedding_hidden_in = None
            self.layers = nn.ModuleList(
                [
                    nn.TransformerEncoderLayer(
                        d_model=config.embedding_dim,
                        nhead=config.num_attention_heads,
                        dim_feedforward=config.intermediate_size,
                        dropout=config.hidden_dropout_prob,
                        layer_norm_eps=config.layer_norm_eps,
                        batch_first=True,
                        norm_first=True,
                    )
                    for _ in range(config.num_hidden_layers)
                ]
            )
            self.output_dim = config.embedding_dim

        self.post_init()

    def get_input_embeddings(self):
        return self.Emb_in

    def set_input_embeddings(self, new_embeddings):
        self.Emb_in = new_embeddings

    def forward(self, input_ids, attention_mask=None, **kwargs):
        x = self.Emb_in(input_ids)

        if self.posn is not None:
            x = x + self.posn(x)

        x = self.embed_norm(x)
        x = self.embed_dropout(x)

        if self.share_layers:
            x = self.embedding_hidden_in(x)

        history = None if self.training else [x]

        pad_mask = None
        if attention_mask is not None:
            pad_mask = ~attention_mask.to(torch.bool)

        for i in range(self.num_layers):
            layer = self.layers[0] if self.share_layers else self.layers[i]
            x = layer(x, src_key_padding_mask=pad_mask)

            if not self.training:
                history.append(x)

        return BaseModelOutput(
            last_hidden_state=x,
            hidden_states=history,
            attentions=None,
        )


class BertModelForMaskedLM(BertPreTrainedModel):
    """
    Standard transformer model for MLM.
    """

    config_class = BertEnergyConfig
    ignore_index = -100
    _tied_weights_keys = ["lm_head.decoder.weight"]

    def __init__(self, config, add_pooling_layer=True, pad_idx=None):
        super().__init__(config)
        self.config = config

        self.model = BertModel(config, pad_idx=pad_idx)

        if config.share_layers:
            self.lm_head = AlbertMLMHead(config)
        else:
            self.lm_head = MLMHead(config.embedding_dim, config.embedding_dim, config)

        self.post_init()

        if self.config.tie_word_embeddings:
            self.tie_weights()

    def get_input_embeddings(self):
        return self.model.Emb_in

    def set_input_embeddings(self, new_embeddings):
        self.model.set_input_embeddings(new_embeddings)

    def get_output_embeddings(self):
        return self.lm_head.decoder

    def set_output_embeddings(self, new_embeddings):
        self.lm_head.decoder = new_embeddings

    def forward(self, input_ids, attention_mask=None, labels=None, **kwargs):
        outputs = self.model(input_ids, attention_mask=attention_mask, **kwargs)
        logits = self.lm_head(outputs.last_hidden_state)

        loss = None
        if labels is not None:
            if attention_mask is not None:
                labels = labels.masked_fill(attention_mask == 0, self.ignore_index)

            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))

        return MaskedLMOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class BertModelForSequenceClassification(BertPreTrainedModel):
    """
    Standard transformer model for sequence classification.
    """

    config_class = BertEnergyConfig

    def __init__(
        self,
        config,
        add_pooling_layer=True,
        pad_idx=None,
        num_labels=2,
        classifier_dropout=None,
        return_dict=True,
    ):
        super().__init__(config)
        self.config = config
        self.num_labels = num_labels
        self.return_dict = return_dict

        self.model = BertModel(config, pad_idx=pad_idx)

        output_dim = self.model.output_dim
        dropout = classifier_dropout if classifier_dropout is not None else config.hidden_dropout_prob

        self.dropout = nn.Dropout(dropout)
        self.norm = nn.LayerNorm(output_dim, eps=config.layer_norm_eps)
        self.classifier = nn.Linear(output_dim, num_labels)

        self.post_init()

    def forward(self, input_ids, labels=None, return_dict=None, **kwargs):
        if return_dict is None:
            return_dict = self.return_dict

        outputs = self.model(input_ids, **kwargs)
        last_hidden_state = self.norm(outputs.last_hidden_state)

        x = last_hidden_state[:, 0, :]
        x = self.dropout(x)
        logits = self.classifier(x)

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)

            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and labels.dtype in (torch.long, torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                loss = loss_fct(logits.squeeze(), labels.squeeze()) if self.num_labels == 1 else loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            else:
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits, outputs.hidden_states, outputs.attentions)
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class BertEnergyModel(BertPreTrainedModel):
    """
    Energy-based backbone.

    Update rule:
        g = LayerNorm(X)
        X <- X - alpha * layer(g)
    """

    config_class = BertEnergyConfig

    def __init__(self, config, add_pooling_layer=True, pad_idx=None, **kwargs):
        super().__init__(config)

        self.config = config
        self.num_layers = config.num_hidden_layers
        self.alpha = config.alpha

        self.Emb_in = nn.Embedding(
            config.vocab_size,
            config.embedding_dim,
            padding_idx=pad_idx,
        )

        self.posn = (
            PositionalEncoding(
                config.embedding_dim,
                max_len=config.max_position_embeddings,
            )
            if config.positional
            else None
        )

        self.embed_dropout = nn.Dropout(config.hidden_dropout_prob)

        # External normalization, as in the original ET implementation
        self.norm = nn.LayerNorm(config.embedding_dim, eps=config.layer_norm_eps)

        self.layer = HopfieldLayer(
            embedding_dim=config.embedding_dim,
            nheads=config.num_attention_heads,
            forward_memories=config.hidden_size,
            forward_activation=config.activation,
            bias=config.bias,
            beta=config.beta,
            device=None,
            dropout=0.0,
            initializer_range=config.initializer_hopfield_range,
        )

        self.post_init()

    def set_input_embeddings(self, new_embeddings):
        self.Emb_in = new_embeddings

    def forward(self, input_ids, attention_mask=None, **kwargs):
        x = self.Emb_in(input_ids)

        if self.posn is not None:
            x = x + self.posn(x)

        x = self.embed_dropout(x)

        keep_mask = attention_mask.to(torch.bool) if attention_mask is not None else None
        history = None if self.training else [x]

        for _ in range(self.num_layers):
            g = self.norm(x)
            update = self.layer(
                g,
                attention_mask=keep_mask,
            )
            x = x - self.alpha * update

            if not self.training:
                history.append(x)

        return BaseModelOutput(
            last_hidden_state=x,
            hidden_states=history,
            attentions=None,
        )
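
# Interpretation (editorial gloss, not asserted by the code): with a single
# shared HopfieldLayer, the loop in BertEnergyModel.forward performs
# num_hidden_layers update steps x <- x - alpha * layer(LayerNorm(x)), so
# alpha acts as a step size, matching the gradient-descent-on-energy reading
# of the Energy Transformer that the "original ET implementation" comment
# above references.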

class BertEnergyModelForMaskedLM(BertPreTrainedModel):
    """
    Energy-based model for MLM.
    """

    config_class = BertEnergyConfig
    ignore_index = -100
    _tied_weights_keys = ["lm_head.decoder.weight"]

    def __init__(self, config, add_pooling_layer=True, pad_idx=None):
        super().__init__(config)
        self.config = config

        self.model = BertEnergyModel(config, pad_idx=pad_idx)
        self.lm_head = EnergyLMHead(config)

        self.post_init()

        if self.config.tie_word_embeddings:
            self.tie_weights()

    def get_input_embeddings(self):
        return self.model.Emb_in

    def set_input_embeddings(self, new_embeddings):
        self.model.set_input_embeddings(new_embeddings)

    def get_output_embeddings(self):
        return self.lm_head.decoder

    def set_output_embeddings(self, new_embeddings):
        self.lm_head.decoder = new_embeddings

    def forward(self, input_ids, attention_mask=None, labels=None, **kwargs):
        outputs = self.model(input_ids, attention_mask=attention_mask, **kwargs)
        logits = self.lm_head(outputs.last_hidden_state)

        loss = None
        if labels is not None:
            if attention_mask is not None:
                labels = labels.masked_fill(attention_mask == 0, self.ignore_index)

            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))

        return MaskedLMOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
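
A minimal smoke test of the MLM forward pass, as a sketch: it assumes the `hopfield` and `positional` modules imported at the top of mlm.py are available alongside it (they are not part of this upload), and uses a deliberately tiny config just for a shape check; the released checkpoint uses the values in config.json instead.

import torch
from hf_configuration import BertEnergyConfig
from mlm import BertEnergyModelForMaskedLM

cfg = BertEnergyConfig(
    vocab_size=30000,
    hidden_size=256,       # Hopfield forward memories in this code path
    embedding_dim=128,     # must be divisible by num_attention_heads
    num_hidden_layers=2,   # fewer refinement steps than the released model's 12
    num_attention_heads=4,
    pad_token_id=3,
)
model = BertEnergyModelForMaskedLM(cfg, pad_idx=cfg.pad_token_id).eval()

input_ids = torch.randint(0, cfg.vocab_size, (2, 16))
attention_mask = torch.ones(2, 16, dtype=torch.long)
with torch.no_grad():
    out = model(input_ids, attention_mask=attention_mask)
print(out.logits.shape)  # expected: torch.Size([2, 16, 30000])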
model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:03d5ca8d5fb8ad089ef941dce630aceb768e8c556cce1279165640d0ce2b3278
size 200983936
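
At the float32 `torch_dtype` recorded in config.json, the 200,983,936-byte payload works out to roughly 200983936 / 4 ≈ 50.2M parameters; this is approximate, since the safetensors header also counts toward the file size.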