PeteBleackley committed on
Commit
0218456
·
verified ·
1 Parent(s): 2d64cfc

End of training

Browse files
DisamBertSingleSense.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections.abc import Generator, Iterable
2
+ from dataclasses import dataclass
3
+ from enum import StrEnum
4
+
5
+ import pprint
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ from transformers import (
10
+ AutoConfig,
11
+ AutoModel,
12
+ BatchEncoding,
13
+ ModernBertModel,
14
+ PreTrainedConfig,
15
+ PreTrainedModel,
16
+ PreTrainedTokenizer,
17
+ )
18
+ from transformers.modeling_outputs import TokenClassifierOutput
19
+
20
+ BATCH_SIZE = 16
21
+
22
+
23
+ class ModelURI(StrEnum):
24
+ BASE = "answerdotai/ModernBERT-base"
25
+ LARGE = "answerdotai/ModernBERT-large"
26
+
27
+
28
+ @dataclass(slots=True, frozen=True)
29
+ class LexicalExample:
30
+ concept: str
31
+ definition: str
32
+
33
+
34
+ @dataclass(slots=True, frozen=True)
35
+ class PaddedBatch:
36
+ input_ids: torch.Tensor
37
+ attention_mask: torch.Tensor
38
+
39
+
40
class DisamBertSingleSense(PreTrainedModel):
    """Word-sense disambiguation cross-encoder on a ModernBERT backbone.

    Each input row packs a text with one marked entity span plus the glosses
    of the candidate senses; the model scores the pooled entity vector
    against the hidden state at each gloss-marker token, yielding one logit
    per candidate sense.
    """

    def __init__(self, config: PreTrainedConfig):
        """Build the model.

        When ``config.init_basemodel`` is true (first construction, via
        :meth:`from_base`), the pretrained ModernBERT weights are downloaded
        and the embedding matrix is grown by 3 rows for the extra marker
        tokens; otherwise a freshly initialised ``ModernBertModel`` is
        created so ``from_pretrained`` can load saved weights into it.
        """
        super().__init__(config)
        if config.init_basemodel:
            self.BaseModel = AutoModel.from_pretrained(config.name_or_path,
                                                       attn_implementation="flash_attention_2",
                                                       dtype=torch.bfloat16,
                                                       device_map="auto")
            # Three extra embedding rows for the start/end/gloss marker tokens.
            self.config.vocab_size += 3
            self.BaseModel.resize_token_embeddings(self.config.vocab_size)
        else:
            self.BaseModel = ModernBertModel(config)
        # Ensure a re-saved config does not trigger a fresh download on reload.
        config.init_basemodel = False

        self.loss = nn.CrossEntropyLoss()
        self.post_init()

    @classmethod
    def from_base(cls, base_id: ModelURI):
        """Alternate constructor: initialise from a pretrained ModernBERT checkpoint."""
        config = AutoConfig.from_pretrained(base_id)
        config.init_basemodel = True
        return cls(config)

    def add_special_tokens(self, start: int, end: int, gloss: int):
        """Record the ids of the span-start, span-end and gloss marker tokens on the config."""
        self.config.start_token = start
        self.config.end_token = end
        self.config.gloss_token = gloss

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        labels: Iterable[int] | None = None,
        output_hidden_states: bool = False,
        output_attentions: bool = False,
    ) -> TokenClassifierOutput:
        """Score every candidate sense for the marked entity in each row.

        Args:
            input_ids: token ids, one row per example. Assumes each row
                contains exactly one start/end marker pair and one gloss
                marker per candidate — TODO confirm against the data pipeline.
            attention_mask: attention mask matching ``input_ids``.
            labels: gold candidate index per row, or ``None`` at inference.
            output_hidden_states: forward the backbone's hidden states.
            output_attentions: forward the backbone's attention maps.

        Returns:
            ``TokenClassifierOutput`` whose ``logits`` hold one score per
            candidate sense per row, with cross-entropy ``loss`` when
            ``labels`` are supplied.
        """
        base_model_output = self.BaseModel(
            input_ids,
            attention_mask,
            output_hidden_states=output_hidden_states,
            output_attentions=output_attentions,
        )
        token_vectors = base_model_output.last_hidden_state
        # 0/1 mask selecting the tokens of each row's marked entity span.
        selection = torch.zeros_like(input_ids, dtype=token_vectors.dtype)
        starts = (input_ids == self.config.start_token).nonzero()
        ends = (input_ids == self.config.end_token).nonzero()
        # strict=True raises if start and end markers are unbalanced.
        for startpos, endpos in zip(starts, ends, strict=True):
            selection[startpos[0], startpos[1] : endpos[1] + 1] = 1.0
        # Sum-pool the span tokens into a single entity vector per row.
        entity_vectors = torch.einsum("ijk,ij->ik", token_vectors, selection)
        gloss_vectors = self.gloss_vectors(
            token_vectors,
            input_ids,
        )
        # Dot product of the entity vector with each candidate's gloss vector.
        logits = torch.einsum("ij,ikj->ik", entity_vectors, gloss_vectors)

        return TokenClassifierOutput(
            logits=logits,
            loss=self.loss(logits, labels) if labels is not None else None,
            hidden_states=base_model_output.hidden_states if output_hidden_states else None,
            attentions=base_model_output.attentions if output_attentions else None,
        )

    def gloss_vectors(self, token_vectors: torch.Tensor, input_ids: torch.Tensor) -> torch.Tensor:
        """Gather the hidden state at every gloss-marker token, padded per row.

        Returns a (batch, max_candidates, hidden) tensor; rows with fewer
        candidates than the batch maximum are zero-padded at the end.
        """
        with self.device:
            selection = (input_ids == self.config.gloss_token)
            candidates_per_row = selection.sum(axis=1)
            max_candidates = candidates_per_row.max()
            # Flatten batch and sequence so one boolean mask indexes all rows.
            indices = torch.flatten(selection)
            vectors = torch.reshape(token_vectors,
                                    (token_vectors.shape[0] * token_vectors.shape[1],
                                     token_vectors.shape[2]))
            gloss_vectors = vectors[indices]
            # Split back into per-row chunks and right-pad each to max_candidates.
            # NOTE(review): pad dtype is hard-coded bfloat16 — assumes the
            # backbone runs in bfloat16 (see __init__); confirm if dtypes change.
            return torch.stack([torch.cat([chunk, torch.zeros((max_candidates - chunk.shape[0],
                                                               chunk.shape[1]),
                                                              dtype=torch.bfloat16)])
                                for chunk in torch.split(gloss_vectors,
                                                         tuple(candidates_per_row.tolist()))])
117
+
118
+
119
+
120
class CandidateLabeller:
    """Collator turning raw batches into model-ready tensors.

    Tokenizes each text together with the newline-joined glosses of its
    candidate senses and, when gold labels are present, converts each label
    into the index of the matching candidate within its candidate list.
    """

    def __init__(self, tokenizer: PreTrainedTokenizer,
                 ontology: Iterable[LexicalExample],
                 device: torch.device,
                 retain_candidates: bool = False):
        """Store the tokenizer and build the concept -> gloss lookup.

        Args:
            tokenizer: tokenizer used to encode text/gloss pairs.
            ontology: lexical examples providing a gloss per concept.
                (Annotated ``Iterable`` rather than ``Generator``: the body
                only iterates it, and one-argument ``Generator[...]`` is a
                TypeError before Python 3.13.)
            device: device the produced tensors are created on.
            retain_candidates: if True, pass the raw candidate lists
                through in the collated batch.
        """
        self.tokenizer = tokenizer
        self.device = device
        # Later entries win if a concept appears twice in the ontology.
        self.glosses = {
            example.concept: example.definition
            for example in ontology
        }
        self.retain_candidates = retain_candidates

    def __call__(self, batch: dict[str, list]) -> dict:
        """Collate one batch.

        Args:
            batch: columnar batch — a dict of parallel lists with keys
                ``"text"``, ``"candidates"`` and optionally ``"label"``.
                (The previous ``list[dict]`` annotation was wrong: the body
                indexes ``batch`` by string keys.)

        Returns:
            Dict with ``input_ids`` and ``attention_mask`` tensors, plus
            ``labels`` (candidate indices) when gold labels are present and
            ``candidates`` when ``retain_candidates`` is set.

        Raises:
            KeyError: if a candidate concept is missing from the ontology.
            ValueError: if a gold label is not among its row's candidates,
                or if ``candidates`` and ``label`` differ in length.
        """
        with self.device:
            # One gloss block per example: candidate glosses joined by newlines.
            glosses = ["\n".join(self.glosses[candidate]
                                 for candidate in example)
                       for example in batch['candidates']]
            tokens = self.tokenizer(batch["text"], glosses, padding=True, return_tensors="pt")
            result = {"input_ids": tokens.input_ids,
                      "attention_mask": tokens.attention_mask}
            if "label" in batch:
                # Gold label becomes its index within the row's candidate list.
                result["labels"] = torch.tensor(
                    [candidates.index(label)
                     for (candidates, label) in zip(batch['candidates'],
                                                    batch['label'],
                                                    strict=True)]
                )
            if self.retain_candidates:
                result['candidates'] = batch['candidates']
            return result
README.md ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ language:
4
+ - en
5
+ license: apache-2.0
6
+ base_model: answerdotai/ModernBERT-base
7
+ tags:
8
+ - generated_from_trainer
9
+ metrics:
10
+ - precision
11
+ - recall
12
+ - f1
13
+ model-index:
14
+ - name: DisamBertCrossEncoder-base
15
+ results: []
16
+ ---
17
+
18
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
19
+ should probably proofread and complete it, then remove this comment. -->
20
+
21
+ # DisamBertCrossEncoder-base
22
+
23
+ This model is a fine-tuned version of [answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base) on the semcor dataset.
24
+ It achieves the following results on the evaluation set:
25
+ - Loss: 13.9274
26
+ - Precision: 0.6274
27
+ - Recall: 0.6398
28
+ - F1: 0.6335
29
+ - Matthews: 0.6392
30
+
31
+ ## Model description
32
+
33
+ More information needed
34
+
35
+ ## Intended uses & limitations
36
+
37
+ More information needed
38
+
39
+ ## Training and evaluation data
40
+
41
+ More information needed
42
+
43
+ ## Training procedure
44
+
45
+ ### Training hyperparameters
46
+
47
+ The following hyperparameters were used during training:
48
+ - learning_rate: 0.0001
49
+ - train_batch_size: 64
50
+ - eval_batch_size: 64
51
+ - seed: 42
52
+ - optimizer: Use OptimizerNames.ADAMW_TORCH_FUSED with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
53
+ - lr_scheduler_type: inverse_sqrt
54
+ - lr_scheduler_warmup_steps: 1000
55
+ - num_epochs: 10
56
+
57
+ ### Training results
58
+
59
+ | Training Loss | Epoch | Step | Validation Loss | Precision | Recall | F1 | Matthews |
60
+ |:-------------:|:-----:|:-----:|:---------------:|:---------:|:------:|:------:|:--------:|
61
+ | No log | 0 | 0 | 427.8458 | 0.5014 | 0.4790 | 0.4899 | 0.4781 |
62
+ | 9.0689 | 1.0 | 3504 | 15.5744 | 0.6010 | 0.6196 | 0.6102 | 0.6190 |
63
+ | 8.5420 | 2.0 | 7008 | 15.4129 | 0.6088 | 0.6253 | 0.6170 | 0.6247 |
64
+ | 7.8106 | 3.0 | 10512 | 14.3562 | 0.6138 | 0.6328 | 0.6232 | 0.6322 |
65
+ | 7.6303 | 4.0 | 14016 | 13.9741 | 0.6157 | 0.6372 | 0.6262 | 0.6366 |
66
+ | 7.6930 | 5.0 | 17520 | 13.8324 | 0.6262 | 0.6402 | 0.6331 | 0.6397 |
67
+ | 7.4897 | 6.0 | 21024 | 13.9649 | 0.6144 | 0.6323 | 0.6232 | 0.6318 |
68
+ | 7.3819 | 7.0 | 24528 | 13.4877 | 0.6273 | 0.6407 | 0.6339 | 0.6401 |
69
+ | 7.4083 | 8.0 | 28032 | 13.7249 | 0.6321 | 0.6402 | 0.6362 | 0.6397 |
70
+ | 7.0140 | 9.0 | 31536 | 13.5219 | 0.6168 | 0.6389 | 0.6277 | 0.6383 |
71
+ | 7.7287 | 10.0 | 35040 | 13.9274 | 0.6274 | 0.6398 | 0.6335 | 0.6392 |
72
+
73
+
74
+ ### Framework versions
75
+
76
+ - Transformers 5.2.0
77
+ - Pytorch 2.10.0+cu128
78
+ - Datasets 4.5.0
79
+ - Tokenizers 0.22.2
config.json ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "DisamBertSingleSense"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "auto_map": {
8
+ "AutoModel": "DisamBertSingleSense.DisamBertSingleSense"
9
+ },
10
+ "bos_token_id": null,
11
+ "classifier_activation": "gelu",
12
+ "classifier_bias": false,
13
+ "classifier_dropout": 0.0,
14
+ "classifier_pooling": "mean",
15
+ "cls_token_id": 50281,
16
+ "decoder_bias": true,
17
+ "deterministic_flash_attn": false,
18
+ "dtype": "bfloat16",
19
+ "embedding_dropout": 0.0,
20
+ "end_token": 50369,
21
+ "eos_token_id": null,
22
+ "global_attn_every_n_layers": 3,
23
+ "gloss_token": 50370,
24
+ "gradient_checkpointing": false,
25
+ "hidden_activation": "gelu",
26
+ "hidden_size": 768,
27
+ "init_basemodel": false,
28
+ "initializer_cutoff_factor": 2.0,
29
+ "initializer_range": 0.02,
30
+ "intermediate_size": 1152,
31
+ "layer_norm_eps": 1e-05,
32
+ "layer_types": [
33
+ "full_attention",
34
+ "sliding_attention",
35
+ "sliding_attention",
36
+ "full_attention",
37
+ "sliding_attention",
38
+ "sliding_attention",
39
+ "full_attention",
40
+ "sliding_attention",
41
+ "sliding_attention",
42
+ "full_attention",
43
+ "sliding_attention",
44
+ "sliding_attention",
45
+ "full_attention",
46
+ "sliding_attention",
47
+ "sliding_attention",
48
+ "full_attention",
49
+ "sliding_attention",
50
+ "sliding_attention",
51
+ "full_attention",
52
+ "sliding_attention",
53
+ "sliding_attention",
54
+ "full_attention"
55
+ ],
56
+ "local_attention": 128,
57
+ "max_position_embeddings": 8192,
58
+ "mlp_bias": false,
59
+ "mlp_dropout": 0.0,
60
+ "model_type": "modernbert",
61
+ "norm_bias": false,
62
+ "norm_eps": 1e-05,
63
+ "num_attention_heads": 12,
64
+ "num_hidden_layers": 22,
65
+ "pad_token_id": 50283,
66
+ "position_embedding_type": "absolute",
67
+ "rope_parameters": {
68
+ "full_attention": {
69
+ "rope_theta": 160000.0,
70
+ "rope_type": "default"
71
+ },
72
+ "sliding_attention": {
73
+ "rope_theta": 10000.0,
74
+ "rope_type": "default"
75
+ }
76
+ },
77
+ "sep_token_id": 50282,
78
+ "sparse_pred_ignore_index": -100,
79
+ "sparse_prediction": false,
80
+ "start_token": 50368,
81
+ "tie_word_embeddings": true,
82
+ "transformers_version": "5.2.0",
83
+ "use_cache": false,
84
+ "vocab_size": 50371
85
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16fd64c635d8aa42aea9369b6196dc02bae7771edbb9dc4db1231ab774844f91
3
+ size 298047648
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "clean_up_tokenization_spaces": true,
4
+ "cls_token": "[CLS]",
5
+ "extra_special_tokens": [
6
+ "[START]",
7
+ "[END]",
8
+ "[GLOSS]"
9
+ ],
10
+ "is_local": false,
11
+ "mask_token": "[MASK]",
12
+ "model_input_names": [
13
+ "input_ids",
14
+ "attention_mask"
15
+ ],
16
+ "model_max_length": 8192,
17
+ "pad_token": "[PAD]",
18
+ "sep_token": "[SEP]",
19
+ "tokenizer_class": "TokenizersBackend",
20
+ "unk_token": "[UNK]"
21
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d495bea733a9a1d58c23c5a0a1a14cb54e0ab61bd6b81fc5d20360188143cd1
3
+ size 5265