PeteBleackley committed on
Commit
0218456
·
verified ·
1 Parent(s): 2d64cfc

End of training

Browse files
DisamBertSingleSense.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections.abc import Generator, Iterable
2
+ from dataclasses import dataclass
3
+ from enum import StrEnum
4
+
5
+ import pprint
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ from transformers import (
10
+ AutoConfig,
11
+ AutoModel,
12
+ BatchEncoding,
13
+ ModernBertModel,
14
+ PreTrainedConfig,
15
+ PreTrainedModel,
16
+ PreTrainedTokenizer,
17
+ )
18
+ from transformers.modeling_outputs import TokenClassifierOutput
19
+
20
+ BATCH_SIZE = 16
21
+
22
+
23
+ class ModelURI(StrEnum):
24
+ BASE = "answerdotai/ModernBERT-base"
25
+ LARGE = "answerdotai/ModernBERT-large"
26
+
27
+
28
+ @dataclass(slots=True, frozen=True)
29
+ class LexicalExample:
30
+ concept: str
31
+ definition: str
32
+
33
+
34
+ @dataclass(slots=True, frozen=True)
35
+ class PaddedBatch:
36
+ input_ids: torch.Tensor
37
+ attention_mask: torch.Tensor
38
+
39
+
40
class DisamBertSingleSense(PreTrainedModel):
    """Word-sense disambiguation cross-encoder on a ModernBERT backbone.

    Each input row packs a text with one marked entity span plus the glosses
    of the candidate senses; the model scores the pooled entity vector
    against the hidden state at each gloss-marker token, yielding one logit
    per candidate sense.
    """

    def __init__(self, config: PreTrainedConfig):
        """Build the model.

        When ``config.init_basemodel`` is true (first construction, via
        :meth:`from_base`), the pretrained ModernBERT weights are downloaded
        and the embedding matrix is grown by 3 rows for the extra marker
        tokens; otherwise a freshly initialised ``ModernBertModel`` is
        created so ``from_pretrained`` can load saved weights into it.
        """
        super().__init__(config)
        if config.init_basemodel:
            self.BaseModel = AutoModel.from_pretrained(config.name_or_path,
                                                       attn_implementation="flash_attention_2",
                                                       dtype=torch.bfloat16,
                                                       device_map="auto")
            # Three extra embedding rows for the start/end/gloss marker tokens.
            self.config.vocab_size += 3
            self.BaseModel.resize_token_embeddings(self.config.vocab_size)
        else:
            self.BaseModel = ModernBertModel(config)
        # Ensure a re-saved config does not trigger a fresh download on reload.
        config.init_basemodel = False

        self.loss = nn.CrossEntropyLoss()
        self.post_init()

    @classmethod
    def from_base(cls, base_id: ModelURI):
        """Alternate constructor: initialise from a pretrained ModernBERT checkpoint."""
        config = AutoConfig.from_pretrained(base_id)
        config.init_basemodel = True
        return cls(config)

    def add_special_tokens(self, start: int, end: int, gloss: int):
        """Record the ids of the span-start, span-end and gloss marker tokens on the config."""
        self.config.start_token = start
        self.config.end_token = end
        self.config.gloss_token = gloss

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        labels: Iterable[int] | None = None,
        output_hidden_states: bool = False,
        output_attentions: bool = False,
    ) -> TokenClassifierOutput:
        """Score every candidate sense for the marked entity in each row.

        Args:
            input_ids: token ids, one row per example. Assumes each row
                contains exactly one start/end marker pair and one gloss
                marker per candidate — TODO confirm against the data pipeline.
            attention_mask: attention mask matching ``input_ids``.
            labels: gold candidate index per row, or ``None`` at inference.
            output_hidden_states: forward the backbone's hidden states.
            output_attentions: forward the backbone's attention maps.

        Returns:
            ``TokenClassifierOutput`` whose ``logits`` hold one score per
            candidate sense per row, with cross-entropy ``loss`` when
            ``labels`` are supplied.
        """
        base_model_output = self.BaseModel(
            input_ids,
            attention_mask,
            output_hidden_states=output_hidden_states,
            output_attentions=output_attentions,
        )
        token_vectors = base_model_output.last_hidden_state
        # 0/1 mask selecting the tokens of each row's marked entity span.
        selection = torch.zeros_like(input_ids, dtype=token_vectors.dtype)
        starts = (input_ids == self.config.start_token).nonzero()
        ends = (input_ids == self.config.end_token).nonzero()
        # strict=True raises if start and end markers are unbalanced.
        for startpos, endpos in zip(starts, ends, strict=True):
            selection[startpos[0], startpos[1] : endpos[1] + 1] = 1.0
        # Sum-pool the span tokens into a single entity vector per row.
        entity_vectors = torch.einsum("ijk,ij->ik", token_vectors, selection)
        gloss_vectors = self.gloss_vectors(
            token_vectors,
            input_ids,
        )
        # Dot product of the entity vector with each candidate's gloss vector.
        logits = torch.einsum("ij,ikj->ik", entity_vectors, gloss_vectors)

        return TokenClassifierOutput(
            logits=logits,
            loss=self.loss(logits, labels) if labels is not None else None,
            hidden_states=base_model_output.hidden_states if output_hidden_states else None,
            attentions=base_model_output.attentions if output_attentions else None,
        )

    def gloss_vectors(self, token_vectors: torch.Tensor, input_ids: torch.Tensor) -> torch.Tensor:
        """Gather the hidden state at every gloss-marker token, padded per row.

        Returns a (batch, max_candidates, hidden) tensor; rows with fewer
        candidates than the batch maximum are zero-padded at the end.
        """
        with self.device:
            selection = (input_ids == self.config.gloss_token)
            candidates_per_row = selection.sum(axis=1)
            max_candidates = candidates_per_row.max()
            # Flatten batch and sequence so one boolean mask indexes all rows.
            indices = torch.flatten(selection)
            vectors = torch.reshape(token_vectors,
                                    (token_vectors.shape[0] * token_vectors.shape[1],
                                     token_vectors.shape[2]))
            gloss_vectors = vectors[indices]
            # Split back into per-row chunks and right-pad each to max_candidates.
            # NOTE(review): pad dtype is hard-coded bfloat16 — assumes the
            # backbone runs in bfloat16 (see __init__); confirm if dtypes change.
            return torch.stack([torch.cat([chunk, torch.zeros((max_candidates - chunk.shape[0],
                                                               chunk.shape[1]),
                                                              dtype=torch.bfloat16)])
                                for chunk in torch.split(gloss_vectors,
                                                         tuple(candidates_per_row.tolist()))])
117
+
118
+
119
+
120
class CandidateLabeller:
    """Collator turning raw batches into model-ready tensors.

    Tokenizes each text together with the newline-joined glosses of its
    candidate senses and, when gold labels are present, converts each label
    into the index of the matching candidate within its candidate list.
    """

    def __init__(self, tokenizer: PreTrainedTokenizer,
                 ontology: Iterable[LexicalExample],
                 device: torch.device,
                 retain_candidates: bool = False):
        """Store the tokenizer and build the concept -> gloss lookup.

        Args:
            tokenizer: tokenizer used to encode text/gloss pairs.
            ontology: lexical examples providing a gloss per concept.
                (Annotated ``Iterable`` rather than ``Generator``: the body
                only iterates it, and one-argument ``Generator[...]`` is a
                TypeError before Python 3.13.)
            device: device the produced tensors are created on.
            retain_candidates: if True, pass the raw candidate lists
                through in the collated batch.
        """
        self.tokenizer = tokenizer
        self.device = device
        # Later entries win if a concept appears twice in the ontology.
        self.glosses = {
            example.concept: example.definition
            for example in ontology
        }
        self.retain_candidates = retain_candidates

    def __call__(self, batch: dict[str, list]) -> dict:
        """Collate one batch.

        Args:
            batch: columnar batch — a dict of parallel lists with keys
                ``"text"``, ``"candidates"`` and optionally ``"label"``.
                (The previous ``list[dict]`` annotation was wrong: the body
                indexes ``batch`` by string keys.)

        Returns:
            Dict with ``input_ids`` and ``attention_mask`` tensors, plus
            ``labels`` (candidate indices) when gold labels are present and
            ``candidates`` when ``retain_candidates`` is set.

        Raises:
            KeyError: if a candidate concept is missing from the ontology.
            ValueError: if a gold label is not among its row's candidates,
                or if ``candidates`` and ``label`` differ in length.
        """
        with self.device:
            # One gloss block per example: candidate glosses joined by newlines.
            glosses = ["\n".join(self.glosses[candidate]
                                 for candidate in example)
                       for example in batch['candidates']]
            tokens = self.tokenizer(batch["text"], glosses, padding=True, return_tensors="pt")
            result = {"input_ids": tokens.input_ids,
                      "attention_mask": tokens.attention_mask}
            if "label" in batch:
                # Gold label becomes its index within the row's candidate list.
                result["labels"] = torch.tensor(
                    [candidates.index(label)
                     for (candidates, label) in zip(batch['candidates'],
                                                    batch['label'],
                                                    strict=True)]
                )
            if self.retain_candidates:
                result['candidates'] = batch['candidates']
            return result
README.md ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ language:
4
+ - en
5
+ license: apache-2.0
6
+ base_model: answerdotai/ModernBERT-base
7
+ tags:
8
+ - generated_from_trainer
9
+ metrics:
10
+ - precision
11
+ - recall
12
+ - f1
13
+ model-index:
14
+ - name: DisamBertCrossEncoder-base
15
+ results: []
16
+ ---
17
+
18
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
19
+ should probably proofread and complete it, then remove this comment. -->
20
+
21
+ # DisamBertCrossEncoder-base
22
+
23
+ This model is a fine-tuned version of [answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base) on the semcor dataset.
24
+ It achieves the following results on the evaluation set:
25
+ - Loss: 13.9274
26
+ - Precision: 0.6274
27
+ - Recall: 0.6398
28
+ - F1: 0.6335
29
+ - Matthews: 0.6392
30
+
31
+ ## Model description
32
+
33
+ More information needed
34
+
35
+ ## Intended uses & limitations
36
+
37
+ More information needed
38
+
39
+ ## Training and evaluation data
40
+
41
+ More information needed
42
+
43
+ ## Training procedure
44
+
45
+ ### Training hyperparameters
46
+
47
+ The following hyperparameters were used during training:
48
+ - learning_rate: 0.0001
49
+ - train_batch_size: 64
50
+ - eval_batch_size: 64
51
+ - seed: 42
52
+ - optimizer: Use OptimizerNames.ADAMW_TORCH_FUSED with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
53
+ - lr_scheduler_type: inverse_sqrt
54
+ - lr_scheduler_warmup_steps: 1000
55
+ - num_epochs: 10
56
+
57
+ ### Training results
58
+
59
+ | Training Loss | Epoch | Step | Validation Loss | Precision | Recall | F1 | Matthews |
60
+ |:-------------:|:-----:|:-----:|:---------------:|:---------:|:------:|:------:|:--------:|
61
+ | No log | 0 | 0 | 427.8458 | 0.5014 | 0.4790 | 0.4899 | 0.4781 |
62
+ | 9.0689 | 1.0 | 3504 | 15.5744 | 0.6010 | 0.6196 | 0.6102 | 0.6190 |
63
+ | 8.5420 | 2.0 | 7008 | 15.4129 | 0.6088 | 0.6253 | 0.6170 | 0.6247 |
64
+ | 7.8106 | 3.0 | 10512 | 14.3562 | 0.6138 | 0.6328 | 0.6232 | 0.6322 |
65
+ | 7.6303 | 4.0 | 14016 | 13.9741 | 0.6157 | 0.6372 | 0.6262 | 0.6366 |
66
+ | 7.6930 | 5.0 | 17520 | 13.8324 | 0.6262 | 0.6402 | 0.6331 | 0.6397 |
67
+ | 7.4897 | 6.0 | 21024 | 13.9649 | 0.6144 | 0.6323 | 0.6232 | 0.6318 |
68
+ | 7.3819 | 7.0 | 24528 | 13.4877 | 0.6273 | 0.6407 | 0.6339 | 0.6401 |
69
+ | 7.4083 | 8.0 | 28032 | 13.7249 | 0.6321 | 0.6402 | 0.6362 | 0.6397 |
70
+ | 7.0140 | 9.0 | 31536 | 13.5219 | 0.6168 | 0.6389 | 0.6277 | 0.6383 |
71
+ | 7.7287 | 10.0 | 35040 | 13.9274 | 0.6274 | 0.6398 | 0.6335 | 0.6392 |
72
+
73
+
74
+ ### Framework versions
75
+
76
+ - Transformers 5.2.0
77
+ - Pytorch 2.10.0+cu128
78
+ - Datasets 4.5.0
79
+ - Tokenizers 0.22.2
config.json ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "DisamBertSingleSense"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "auto_map": {
8
+ "AutoModel": "DisamBertSingleSense.DisamBertSingleSense"
9
+ },
10
+ "bos_token_id": null,
11
+ "classifier_activation": "gelu",
12
+ "classifier_bias": false,
13
+ "classifier_dropout": 0.0,
14
+ "classifier_pooling": "mean",
15
+ "cls_token_id": 50281,
16
+ "decoder_bias": true,
17
+ "deterministic_flash_attn": false,
18
+ "dtype": "bfloat16",
19
+ "embedding_dropout": 0.0,
20
+ "end_token": 50369,
21
+ "eos_token_id": null,
22
+ "global_attn_every_n_layers": 3,
23
+ "gloss_token": 50370,
24
+ "gradient_checkpointing": false,
25
+ "hidden_activation": "gelu",
26
+ "hidden_size": 768,
27
+ "init_basemodel": false,
28
+ "initializer_cutoff_factor": 2.0,
29
+ "initializer_range": 0.02,
30
+ "intermediate_size": 1152,
31
+ "layer_norm_eps": 1e-05,
32
+ "layer_types": [
33
+ "full_attention",
34
+ "sliding_attention",
35
+ "sliding_attention",
36
+ "full_attention",
37
+ "sliding_attention",
38
+ "sliding_attention",
39
+ "full_attention",
40
+ "sliding_attention",
41
+ "sliding_attention",
42
+ "full_attention",
43
+ "sliding_attention",
44
+ "sliding_attention",
45
+ "full_attention",
46
+ "sliding_attention",
47
+ "sliding_attention",
48
+ "full_attention",
49
+ "sliding_attention",
50
+ "sliding_attention",
51
+ "full_attention",
52
+ "sliding_attention",
53
+ "sliding_attention",
54
+ "full_attention"
55
+ ],
56
+ "local_attention": 128,
57
+ "max_position_embeddings": 8192,
58
+ "mlp_bias": false,
59
+ "mlp_dropout": 0.0,
60
+ "model_type": "modernbert",
61
+ "norm_bias": false,
62
+ "norm_eps": 1e-05,
63
+ "num_attention_heads": 12,
64
+ "num_hidden_layers": 22,
65
+ "pad_token_id": 50283,
66
+ "position_embedding_type": "absolute",
67
+ "rope_parameters": {
68
+ "full_attention": {
69
+ "rope_theta": 160000.0,
70
+ "rope_type": "default"
71
+ },
72
+ "sliding_attention": {
73
+ "rope_theta": 10000.0,
74
+ "rope_type": "default"
75
+ }
76
+ },
77
+ "sep_token_id": 50282,
78
+ "sparse_pred_ignore_index": -100,
79
+ "sparse_prediction": false,
80
+ "start_token": 50368,
81
+ "tie_word_embeddings": true,
82
+ "transformers_version": "5.2.0",
83
+ "use_cache": false,
84
+ "vocab_size": 50371
85
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16fd64c635d8aa42aea9369b6196dc02bae7771edbb9dc4db1231ab774844f91
3
+ size 298047648
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "clean_up_tokenization_spaces": true,
4
+ "cls_token": "[CLS]",
5
+ "extra_special_tokens": [
6
+ "[START]",
7
+ "[END]",
8
+ "[GLOSS]"
9
+ ],
10
+ "is_local": false,
11
+ "mask_token": "[MASK]",
12
+ "model_input_names": [
13
+ "input_ids",
14
+ "attention_mask"
15
+ ],
16
+ "model_max_length": 8192,
17
+ "pad_token": "[PAD]",
18
+ "sep_token": "[SEP]",
19
+ "tokenizer_class": "TokenizersBackend",
20
+ "unk_token": "[UNK]"
21
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d495bea733a9a1d58c23c5a0a1a14cb54e0ab61bd6b81fc5d20360188143cd1
3
+ size 5265