annnli committed on
Commit
d44be9c
·
verified ·
1 Parent(s): 0e0f5e9

Upload folder using huggingface_hub

Browse files
Files changed (6) hide show
  1. config.json +41 -0
  2. merges.txt +0 -0
  3. model.safetensors +3 -0
  4. modeling_roberta_cl.py +395 -0
  5. tokenizer.json +0 -0
  6. vocab.json +0 -0
config.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "cardiffnlp/twitter-roberta-base-sentiment",
3
+ "architectures": [
4
+ "RobertaForCL"
5
+ ],
6
+ "auto_map": {
7
+ "AutoModel": "modeling_roberta_cl.RobertaForCL"
8
+ },
9
+ "attention_probs_dropout_prob": 0.1,
10
+ "bos_token_id": 0,
11
+ "classifier_dropout": null,
12
+ "eos_token_id": 2,
13
+ "gradient_checkpointing": false,
14
+ "hidden_act": "gelu",
15
+ "hidden_dropout_prob": 0.1,
16
+ "hidden_size": 768,
17
+ "id2label": {
18
+ "0": "LABEL_0",
19
+ "1": "LABEL_1",
20
+ "2": "LABEL_2"
21
+ },
22
+ "initializer_range": 0.02,
23
+ "intermediate_size": 3072,
24
+ "label2id": {
25
+ "LABEL_0": 0,
26
+ "LABEL_1": 1,
27
+ "LABEL_2": 2
28
+ },
29
+ "layer_norm_eps": 1e-05,
30
+ "max_position_embeddings": 514,
31
+ "model_type": "roberta",
32
+ "num_attention_heads": 12,
33
+ "num_hidden_layers": 12,
34
+ "pad_token_id": 1,
35
+ "position_embedding_type": "absolute",
36
+ "torch_dtype": "float32",
37
+ "transformers_version": "4.48.1",
38
+ "type_vocab_size": 1,
39
+ "use_cache": true,
40
+ "vocab_size": 50265
41
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06626dbe31b7e4b4ebb273081631608f988c5c8d7345b90aff0190d04f2c4de5
3
+ size 503080724
modeling_roberta_cl.py ADDED
@@ -0,0 +1,395 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import torch.distributed as dist
5
+ from torch import Tensor
6
+
7
+ import transformers
8
+ from transformers import RobertaTokenizer
9
+ from transformers.models.roberta.modeling_roberta import RobertaForSequenceClassification, RobertaClassificationHead, RobertaLMHead
10
+ from transformers.activations import gelu
11
+ from transformers.file_utils import (
12
+ add_code_sample_docstrings,
13
+ add_start_docstrings,
14
+ add_start_docstrings_to_model_forward,
15
+ replace_return_docstrings,
16
+ )
17
+ from transformers.modeling_outputs import SequenceClassifierOutput, BaseModelOutputWithPoolingAndCrossAttentions
18
+
19
class MLPLayer(nn.Module):
    """Projection head applied to RoBERTa/BERT's CLS representation.

    A single Linear(hidden, hidden) followed by Tanh; used to obtain
    sentence-level representations.
    """

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, features, **kwargs):
        # Linear projection, then squash into (-1, 1).
        return self.activation(self.dense(features))
34
+
35
class ResidualBlock(nn.Module):
    """Dimension-preserving residual unit: out = ReLU(Linear(x)) + x."""

    def __init__(self, dim):
        super(ResidualBlock, self).__init__()
        self.fc = nn.Linear(dim, dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        out = self.fc(x)
        out = self.relu(out)
        out = out + x  # skip connection
        return out

class SemanticModel(nn.Module):
    """Semantic mapping MLP.

    Architecture: Linear(input_dim -> hidden_dim), then `num_layers`
    ResidualBlock(hidden_dim) units, then Linear(hidden_dim -> output_dim).

    Args:
        num_layers: number of residual blocks between the two projections.
        input_dim: input feature size (default 768, RoBERTa-base hidden size).
        hidden_dim: residual-block width.
        output_dim: output embedding size.
    """

    def __init__(self, num_layers=2, input_dim=768, hidden_dim=512, output_dim=384):
        super(SemanticModel, self).__init__()

        self.layers = nn.ModuleList()
        self.layers.append(nn.Linear(input_dim, hidden_dim))
        for _ in range(num_layers):
            self.layers.append(ResidualBlock(hidden_dim))
        self.layers.append(nn.Linear(hidden_dim, output_dim))

    def forward(self, x):
        # Iterate submodules directly (was: `for i in range(len(self.layers))`,
        # a non-idiomatic index loop with identical behavior).
        for layer in self.layers:
            x = layer(x)
        return x
65
+
66
class Similarity(nn.Module):
    """Cosine similarity along the last dimension, scaled by a temperature.

    forward(x, y) returns cos(x, y) / temp.
    """

    def __init__(self, temp):
        super().__init__()
        self.temp = temp
        self.cos = nn.CosineSimilarity(dim=-1)

    def forward(self, x, y):
        similarity = self.cos(x, y)
        return similarity / self.temp
78
+
79
+
80
class RobertaClassificationHeadForEmbedding(RobertaClassificationHead):
    """Classification head repurposed to emit embeddings.

    Identical in structure to the parent RobertaClassificationHead, but
    forward() stops after the dense projection and returns the transformed
    <s> (CLS) vector instead of class logits.
    """

    def __init__(self, config):
        super().__init__(config)
        # NOTE(review): the parent __init__ already builds dense/dropout/out_proj;
        # they are deliberately re-created here, shadowing the parent's modules
        # with freshly initialized ones.
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        # out_proj is kept for state-dict compatibility but is unused in forward().
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        # features: (bs, seq_len, hidden); returns (bs, hidden).
        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        x = self.dropout(x)
        x = self.dense(x)
        # The parent's tanh / dropout / out_proj steps are intentionally skipped
        # so the head yields an embedding rather than logits:
        # x = torch.tanh(x)
        # x = self.dropout(x)
        # x = self.out_proj(x)
        return x
100
+
101
def cl_init(cls, config):
    """
    Contrastive learning class init function.

    Installs the temperature-scaled cosine similarity used by `cl_forward`
    and re-initializes the model weights. `config` is currently unused.
    Requires `cls.model_args` to be set (accesses `.temp`).
    """
    cls.sim = Similarity(temp=cls.model_args.temp)
    cls.init_weights()
107
+
108
def remove_diagonal_elements(input_tensor):
    """Drop the main diagonal of a square (bs, bs) matrix.

    Returns a new tensor of shape (bs, bs-1) containing, per row, every
    element except the diagonal one. Raises ValueError for non-square input.
    """
    n_rows, n_cols = input_tensor.size(0), input_tensor.size(1)
    if n_rows != n_cols:
        raise ValueError("Input tensor must be square (bs, bs).")

    # Boolean mask that is False on the diagonal, True elsewhere.
    off_diagonal = ~torch.eye(n_rows, dtype=torch.bool, device=input_tensor.device)
    return input_tensor[off_diagonal].view(n_rows, n_rows - 1)
120
+
121
def cl_forward(cls,
               input_ids=None,
               attention_mask=None,
               token_type_ids=None,
               position_ids=None,
               head_mask=None,
               inputs_embeds=None,
               labels=None,
               output_attentions=None,
               output_hidden_states=None,
               return_dict=None,
               mlm_input_ids=None,
               mlm_labels=None,
               latter_sentiment_spoof_mask=None,
               ):
    """
    Contrastive-training forward pass.

    Expects `input_ids` of shape (bs, num_sent, len), where dim 1 stacks
    [original, num_paraphrased paraphrases, num_negative spoofed negatives]
    per instance. Computes:
      - loss_gr: "sign balance" regularizer on the original embeddings,
      - loss_tl: triplet margin loss (original vs. paraphrase vs. negative),
      - similarity statistics (sim_paraphrase, sim_other, sim_<spoof-name>)
        for logging only.
    The training loss is loss_gr + loss_tl. Returns a plain dict (the
    `return_dict=False` path is not implemented).

    `latter_sentiment_spoof_mask` (bs,) — presumably 1 where the
    'latter_sentiment_spoof_0' negative is valid for the instance; masked
    instances are skipped for that negative. TODO confirm against caller.
    """
    return_dict = return_dict if return_dict is not None else cls.config.use_return_dict
    batch_size = input_ids.size(0)
    # Number of sentences in one instance
    # original + cls.model_args.num_paraphrased + cls.model_args.num_negative
    num_sent = input_ids.size(1)

    mlm_outputs = None
    # Flatten input for encoding
    input_ids = input_ids.view((-1, input_ids.size(-1)))  # (bs * num_sent, len)
    attention_mask = attention_mask.view((-1, attention_mask.size(-1)))  # (bs * num_sent, len)
    if token_type_ids is not None:
        token_type_ids = token_type_ids.view((-1, token_type_ids.size(-1)))  # (bs * num_sent, len)

    # Get raw embeddings
    outputs = cls.roberta(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=False,
        return_dict=True,
    )

    # MLM auxiliary objective
    if mlm_input_ids is not None:
        mlm_input_ids = mlm_input_ids.view((-1, mlm_input_ids.size(-1)))
        mlm_outputs = cls.roberta(
            mlm_input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=False,
            return_dict=True,
        )

    # Pooling
    sequence_output = outputs[0]  # (bs*num_sent, seq_len, hidden)
    pooler_output = cls.classifier(sequence_output)  # (bs*num_sent, hidden)
    pooler_output = pooler_output.view((batch_size, num_sent, pooler_output.size(-1)))  # (bs, num_sent, hidden)

    # Mapping through the semantic MLP (cls.map: SemanticModel)
    pooler_output = cls.map(pooler_output)  # (bs, num_sent, hidden_states)

    # Separate representations: slot 0 is the original sentence, then the
    # paraphrases, then the spoofed negatives.
    original = pooler_output[:, 0]
    paraphrase_list = [pooler_output[:, i] for i in range(1, cls.model_args.num_paraphrased + 1)]
    if cls.model_args.num_negative == 0:
        negative_list = []
    else:
        negative_list = [pooler_output[:, i] for i in range(cls.model_args.num_paraphrased + 1, cls.model_args.num_paraphrased + cls.model_args.num_negative + 1)]

    # Gather all embeddings if using distributed training
    if dist.is_initialized() and cls.training:
        raise NotImplementedError

    # Get sign value before calculating similarity: tanh(x * 1000) is a
    # smooth (differentiable) approximation of sign(x).
    original = torch.tanh(original * 1000)
    paraphrase_list = [torch.tanh(p * 1000) for p in paraphrase_list]
    negative_list = [torch.tanh(n * 1000) for n in negative_list]
    # Pair each negative column with its spoof-strategy name (order must match
    # how the dataset stacks negatives — presumably guaranteed by the caller).
    spoofing_cnames = cls.model_args.spoofing_cnames
    negative_dict = {}
    for cname, n in zip(spoofing_cnames, negative_list):
        negative_dict[cname] = n

    # Calculate triplet loss: hinge on (sim(ori, neg) - sim(ori, pos) + margin).
    # Multiplying by cls.model_args.temp undoes the 1/temp scaling inside cls.sim,
    # so the margin is applied on raw cosine similarities.
    loss_triplet = 0
    for i in range(batch_size):
        for j in range(cls.model_args.num_paraphrased):
            for cname in spoofing_cnames:
                if cname == 'latter_sentiment_spoof_0' and latter_sentiment_spoof_mask[i] == 0:
                    continue
                ori = original[i]
                pos = paraphrase_list[j][i]
                neg = negative_dict[cname][i]
                loss_triplet += F.relu(cls.sim(ori, neg) * cls.model_args.temp - cls.sim(ori, pos) * cls.model_args.temp + cls.model_args.margin)
    # NOTE(review): the denominator counts every (i, j, cname) combination,
    # including the ones skipped by the mask above — masked instances slightly
    # deflate the average. Confirm whether this is intentional.
    loss_triplet /= (batch_size * cls.model_args.num_paraphrased * len(spoofing_cnames))

    # Calculate loss for MLM
    if mlm_outputs is not None and mlm_labels is not None:
        raise NotImplementedError
        # mlm_labels = mlm_labels.view(-1, mlm_labels.size(-1))
        # prediction_scores = cls.lm_head(mlm_outputs.last_hidden_state)
        # masked_lm_loss = loss_fct(prediction_scores.view(-1, cls.config.vocab_size), mlm_labels.view(-1))
        # loss_cl = loss_cl + cls.model_args.mlm_weight * masked_lm_loss

    # Calculate loss for uniform perturbation and unbiased token preference:
    # pushes the per-row and per-column means of the (approximate) sign matrix
    # toward zero, i.e. balanced +/- signs.
    def sign_loss(x):
        row = torch.abs(torch.mean(torch.mean(x, dim=0)))
        col = torch.abs(torch.mean(torch.mean(x, dim=1)))
        return (row + col)/2

    loss_gr = sign_loss(original)

    # Calculate loss_3: similarity between original and paraphrased text
    # (logging metric only — not part of the returned training loss).
    loss_3_list = [cls.sim(original, p).unsqueeze(1) for p in paraphrase_list]  # [(bs, 1)] * num_paraphrased
    loss_3_tensor = torch.cat(loss_3_list, dim=1)  # (bs, num_paraphrased)
    loss_3 = loss_3_tensor.mean() * cls.model_args.temp

    # Calculate loss_sent: similarity between original and sentiment-spoofed
    # text, per spoof strategy (logging metric only).
    negative_sample_loss = {}
    for cname in spoofing_cnames:
        negatives = negative_dict[cname]
        originals = original.clone()
        if cname == 'latter_sentiment_spoof_0':
            # Only instances where the mask marks this negative as valid.
            negatives = negatives[latter_sentiment_spoof_mask == 1]
            originals = originals[latter_sentiment_spoof_mask == 1]
        one_negative_loss = cls.sim(originals, negatives).mean() * cls.model_args.temp
        negative_sample_loss[cname] = one_negative_loss

    # Calculate loss_5: similarity between each original and the other
    # originals in the batch (logging metric only).
    ori_ori_cos = cls.sim(original.unsqueeze(1), original.unsqueeze(0))  # (bs, bs)
    ori_ori_cos_removed = remove_diagonal_elements(ori_ori_cos)  # (bs, bs-1)
    loss_5 = ori_ori_cos_removed.mean() * cls.model_args.temp

    # Final training objective: sign-balance regularizer + triplet loss.
    loss = loss_gr + loss_triplet

    result = {
        'loss': loss,
        'loss_gr': loss_gr,
        'sim_paraphrase': loss_3,
        'sim_other': loss_5,
        # Both None here: the encoder was called with output_hidden_states=False
        # and output_attentions as passed in (typically None).
        'hidden_states': outputs.hidden_states,
        'attentions': outputs.attentions,
    }

    # e.g. 'latter_sentiment_spoof_0' -> key 'sim_latter_sentiment'
    for cname, l in negative_sample_loss.items():
        key = f"sim_{cname.replace('_spoof_0', '')}"
        result[key] = l

    result['loss_tl'] = loss_triplet

    if not return_dict:
        raise NotImplementedError
        # output = (cos_sim,) + outputs[2:]
        # return ((loss,) + output) if loss is not None else output
    return result
278
+
279
+
280
def sentemb_forward(
    cls,
    input_ids=None,
    attention_mask=None,
    token_type_ids=None,
    position_ids=None,
    head_mask=None,
    inputs_embeds=None,
    labels=None,
    output_attentions=None,
    output_hidden_states=None,
    return_dict=None,
):
    """
    Inference-time forward pass: encode a batch of sentences and return the
    mapped CLS embedding as `pooler_output`.

    `labels` and `output_hidden_states` are accepted for signature parity but
    unused; the encoder is always called with output_hidden_states=False.
    """
    if return_dict is None:
        return_dict = cls.config.use_return_dict

    encoder_outputs = cls.roberta(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=False,
        return_dict=True,
    )
    # CLS-style pooling via the embedding head, then the semantic mapping MLP.
    cls_embedding = cls.classifier(encoder_outputs[0])
    mapped = cls.map(cls_embedding)

    if not return_dict:
        return (encoder_outputs[0], mapped) + encoder_outputs[2:]

    return BaseModelOutputWithPoolingAndCrossAttentions(
        pooler_output=mapped,
        last_hidden_state=encoder_outputs.last_hidden_state,
        hidden_states=encoder_outputs.hidden_states,
    )
323
+
324
+
325
class RobertaForCL(RobertaForSequenceClassification):
    """RoBERTa encoder fine-tuned with a contrastive objective for embeddings.

    forward(sent_emb=True) returns mapped CLS embeddings via `sentemb_forward`
    (inference); otherwise the contrastive training objective `cl_forward`
    runs. The classification head is replaced by an embedding head, and its
    output is passed through a SemanticModel mapping MLP (`self.map`).
    """

    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def __init__(self, config, *model_args, **model_kargs):
        super().__init__(config)
        # Training-time hyperparameters (temp, margin, num_paraphrased, ...).
        # None when the model is loaded purely for inference (e.g. AutoModel).
        self.model_args = model_kargs.get("model_args", None)

        self.classifier = RobertaClassificationHeadForEmbedding(config)

        if self.model_args and getattr(self.model_args, "do_mlm", False):
            self.lm_head = RobertaLMHead(config)
        # FIX: cl_init dereferences model_args.temp; calling it unconditionally
        # crashes inference-only loads where model_args is None. Guarding is
        # backward-compatible: training construction (model_args set) is
        # unchanged, and post_init() below still initializes weights.
        if self.model_args is not None:
            cl_init(self, config)

        # hidden_size of roberta-base; feeds the embedding head output into
        # the semantic mapping network.
        self.map = SemanticModel(input_dim=768)

        # Initialize weights and apply final processing
        self.post_init()

    def initialize_mlp_weights(self, pretrained_model_state_dict):
        """
        Initialize the embedding head's dense layer from a pretrained model's
        classifier weights.

        Despite its name, `pretrained_model_state_dict` is accessed as a model
        object (attribute access), not as a state dict.
        """
        # FIX: the original assigned to `self.mlp.dense`, but no `self.mlp`
        # attribute is ever created on this class (AttributeError at runtime).
        # The dense layer to seed is the one on `self.classifier`
        # (RobertaClassificationHeadForEmbedding) — TODO confirm intent.
        self.classifier.dense.weight.data = pretrained_model_state_dict.classifier.dense.weight.data.clone()
        self.classifier.dense.bias.data = pretrained_model_state_dict.classifier.dense.bias.data.clone()

    def forward(self,
                input_ids=None,
                attention_mask=None,
                token_type_ids=None,
                position_ids=None,
                head_mask=None,
                inputs_embeds=None,
                labels=None,
                output_attentions=None,
                output_hidden_states=None,
                return_dict=None,
                sent_emb=False,
                mlm_input_ids=None,
                mlm_labels=None,
                latter_sentiment_spoof_mask=None,
                ):
        """Dispatch to sentence-embedding inference or contrastive training.

        Args:
            sent_emb: when True, return embeddings (sentemb_forward);
                otherwise compute the contrastive losses (cl_forward).
            mlm_input_ids / mlm_labels: optional MLM auxiliary inputs
                (training path only; currently unimplemented downstream).
            latter_sentiment_spoof_mask: per-instance validity mask for the
                'latter_sentiment_spoof_0' negative (training path only).
        """
        if sent_emb:
            return sentemb_forward(self,
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                labels=labels,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        else:
            return cl_forward(self,
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                labels=labels,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                mlm_input_ids=mlm_input_ids,
                mlm_labels=mlm_labels,
                latter_sentiment_spoof_mask=latter_sentiment_spoof_mask,
            )
395
+
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
vocab.json ADDED
The diff for this file is too large to render. See raw diff