Martijn van Beers committed on
Commit f59e918
Parent: 64ac833

Remove files that shouldn't have been committed

Files changed (2)
  1. lib/BERTalt.py +0 -551
  2. lib/roberta2.py.rej +0 -63
lib/BERTalt.py DELETED
@@ -1,551 +0,0 @@
-from __future__ import absolute_import
-
-import torch
-from torch import nn
-import torch.nn.functional as F
-import math
-from BERT_explainability.modules.layers_ours import *
-
-import transformers
-
-from transformers import BertConfig
-from transformers.modeling_outputs import BaseModelOutputWithPooling, BaseModelOutput
-from transformers import (
-    BertPreTrainedModel,
-    PreTrainedModel,
-)
-
-
-ACT2FN = {
-    "relu": ReLU,
-    "tanh": Tanh,
-    "gelu": GELU,
-}
-
-
-def get_activation(activation_string):
-    if activation_string in ACT2FN:
-        return ACT2FN[activation_string]
-    else:
-        raise KeyError("function {} not found in ACT2FN mapping {}".format(activation_string, list(ACT2FN.keys())))
-
-def compute_rollout_attention(all_layer_matrices, start_layer=0):
-    # adding residual consideration
-    num_tokens = all_layer_matrices[0].shape[1]
-    batch_size = all_layer_matrices[0].shape[0]
-    eye = torch.eye(num_tokens).expand(batch_size, num_tokens, num_tokens).to(all_layer_matrices[0].device)
-    all_layer_matrices = [all_layer_matrices[i] + eye for i in range(len(all_layer_matrices))]
-    all_layer_matrices = [all_layer_matrices[i] / all_layer_matrices[i].sum(dim=-1, keepdim=True)
-                          for i in range(len(all_layer_matrices))]
-    joint_attention = all_layer_matrices[start_layer]
-    for i in range(start_layer+1, len(all_layer_matrices)):
-        joint_attention = all_layer_matrices[i].bmm(joint_attention)
-    return joint_attention
-
-class RPBertEmbeddings(BertEmbeddings):
-    def __init__(self, config):
-        super().__init__()
-
-        self.add1 = Add()
-        self.add2 = Add()
-
-    def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
-        if input_ids is not None:
-            input_shape = input_ids.size()
-        else:
-            input_shape = inputs_embeds.size()[:-1]
-
-        seq_length = input_shape[1]
-
-        if position_ids is None:
-            position_ids = self.position_ids[:, :seq_length]
-
-        if token_type_ids is None:
-            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
-
-        if inputs_embeds is None:
-            inputs_embeds = self.word_embeddings(input_ids)
-        position_embeddings = self.position_embeddings(position_ids)
-        token_type_embeddings = self.token_type_embeddings(token_type_ids)
-
-        # embeddings = inputs_embeds + position_embeddings + token_type_embeddings
-        embeddings = self.add1([token_type_embeddings, position_embeddings])
-        embeddings = self.add2([embeddings, inputs_embeds])
-        embeddings = self.LayerNorm(embeddings)
-        embeddings = self.dropout(embeddings)
-        return embeddings
-
-    def relprop(self, cam, **kwargs):
-        cam = self.dropout.relprop(cam, **kwargs)
-        cam = self.LayerNorm.relprop(cam, **kwargs)
-
-        # [inputs_embeds, position_embeddings, token_type_embeddings]
-        (cam) = self.add2.relprop(cam, **kwargs)
-
-        return cam
-
-class RPBertEncoder(transformers.modeling_bert.BertEncoder):
-    def __init__(self, config):
-        super().__init__()
-        self.config = config
-        self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
-
-    def relprop(self, cam, **kwargs):
-        # assuming output_hidden_states is False
-        for layer_module in reversed(self.layer):
-            cam = layer_module.relprop(cam, **kwargs)
-        return cam
-
-
-# not adding relprop since this is only pooling at the end of the network, does not impact tokens importance
-class RPBertPooler(transformers.modeling_bert.BertPooler):
-    def __init__(self, config):
-        super().__init__()
-        self.pool = IndexSelect()
-
-    def forward(self, hidden_states):
-        # We "pool" the model by simply taking the hidden state corresponding
-        # to the first token.
-        self._seq_size = hidden_states.shape[1]
-
-        # first_token_tensor = hidden_states[:, 0]
-        first_token_tensor = self.pool(hidden_states, 1, torch.tensor(0, device=hidden_states.device))
-        first_token_tensor = first_token_tensor.squeeze(1)
-        pooled_output = self.dense(first_token_tensor)
-        pooled_output = self.activation(pooled_output)
-        return pooled_output
-
-    def relprop(self, cam, **kwargs):
-        cam = self.activation.relprop(cam, **kwargs)
-        #print(cam.sum())
-        cam = self.dense.relprop(cam, **kwargs)
-        #print(cam.sum())
-        cam = cam.unsqueeze(1)
-        cam = self.pool.relprop(cam, **kwargs)
-        #print(cam.sum())
-
-        return cam
-
-class BertAttention(transformers.modeling_bert.BertAttention):
-    def __init__(self, config):
-        super().__init__()
-        self.clone = Clone()
-
-    def forward(
-        self,
-        hidden_states,
-        attention_mask=None,
-        head_mask=None,
-        encoder_hidden_states=None,
-        encoder_attention_mask=None,
-        output_attentions=False,
-    ):
-        h1, h2 = self.clone(hidden_states, 2)
-        self_outputs = self.self(
-            h1,
-            attention_mask,
-            head_mask,
-            encoder_hidden_states,
-            encoder_attention_mask,
-            output_attentions,
-        )
-        attention_output = self.output(self_outputs[0], h2)
-        outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
-        return outputs
-
-    def relprop(self, cam, **kwargs):
-        # assuming that we don't ouput the attentions (outputs = (attention_output,)), self_outputs=(context_layer,)
-        (cam1, cam2) = self.output.relprop(cam, **kwargs)
-        #print(cam1.sum(), cam2.sum(), (cam1 + cam2).sum())
-        cam1 = self.self.relprop(cam1, **kwargs)
-        #print(cam1.sum(), cam2.sum(), (cam1 + cam2).sum())
-
-        return self.clone.relprop((cam1, cam2), **kwargs)
-
-class BertSelfAttention(transformers.modeling_bert.BertSelfAttention):
-    def __init__(self, config):
-        super().__init__()
-
-        self.matmul1 = MatMul()
-        self.matmul2 = MatMul()
-        self.softmax = Softmax(dim=-1)
-        self.add = Add()
-        self.mul = Mul()
-        self.head_mask = None
-        self.attention_mask = None
-        self.clone = Clone()
-
-        self.attn_cam = None
-        self.attn = None
-        self.attn_gradients = None
-
-    def get_attn(self):
-        return self.attn
-
-    def save_attn(self, attn):
-        self.attn = attn
-
-    def save_attn_cam(self, cam):
-        self.attn_cam = cam
-
-    def get_attn_cam(self):
-        return self.attn_cam
-
-    def save_attn_gradients(self, attn_gradients):
-        self.attn_gradients = attn_gradients
-
-    def get_attn_gradients(self):
-        return self.attn_gradients
-
-    def transpose_for_scores_relprop(self, x):
-        return x.permute(0, 2, 1, 3).flatten(2)
-
-    def forward(
-        self,
-        hidden_states,
-        attention_mask=None,
-        head_mask=None,
-        encoder_hidden_states=None,
-        encoder_attention_mask=None,
-        output_attentions=False,
-    ):
-        self.head_mask = head_mask
-        self.attention_mask = attention_mask
-
-        h1, h2, h3 = self.clone(hidden_states, 3)
-        mixed_query_layer = self.query(h1)
-
-        # If this is instantiated as a cross-attention module, the keys
-        # and values come from an encoder; the attention mask needs to be
-        # such that the encoder's padding tokens are not attended to.
-        if encoder_hidden_states is not None:
-            mixed_key_layer = self.key(encoder_hidden_states)
-            mixed_value_layer = self.value(encoder_hidden_states)
-            attention_mask = encoder_attention_mask
-        else:
-            mixed_key_layer = self.key(h2)
-            mixed_value_layer = self.value(h3)
-
-        query_layer = self.transpose_for_scores(mixed_query_layer)
-        key_layer = self.transpose_for_scores(mixed_key_layer)
-        value_layer = self.transpose_for_scores(mixed_value_layer)
-
-        # Take the dot product between "query" and "key" to get the raw attention scores.
-        attention_scores = self.matmul1([query_layer, key_layer.transpose(-1, -2)])
-        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
-        if attention_mask is not None:
-            # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
-            attention_scores = self.add([attention_scores, attention_mask])
-
-        # Normalize the attention scores to probabilities.
-        attention_probs = self.softmax(attention_scores)
-
-        self.save_attn(attention_probs)
-        attention_probs.register_hook(self.save_attn_gradients)
-
-        # This is actually dropping out entire tokens to attend to, which might
-        # seem a bit unusual, but is taken from the original Transformer paper.
-        attention_probs = self.dropout(attention_probs)
-
-        # Mask heads if we want to
-        if head_mask is not None:
-            attention_probs = attention_probs * head_mask
-
-        context_layer = self.matmul2([attention_probs, value_layer])
-
-        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
-        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
-        context_layer = context_layer.view(*new_context_layer_shape)
-
-        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
-        return outputs
-
-    def relprop(self, cam, **kwargs):
-        # Assume output_attentions == False
-        cam = self.transpose_for_scores(cam)
-
-        # [attention_probs, value_layer]
-        (cam1, cam2) = self.matmul2.relprop(cam, **kwargs)
-        cam1 /= 2
-        cam2 /= 2
-        if self.head_mask is not None:
-            # [attention_probs, head_mask]
-            (cam1, _)= self.mul.relprop(cam1, **kwargs)
-
-
-        self.save_attn_cam(cam1)
-
-        cam1 = self.dropout.relprop(cam1, **kwargs)
-
-        cam1 = self.softmax.relprop(cam1, **kwargs)
-
-        if self.attention_mask is not None:
-            # [attention_scores, attention_mask]
-            (cam1, _) = self.add.relprop(cam1, **kwargs)
-
-        # [query_layer, key_layer.transpose(-1, -2)]
-        (cam1_1, cam1_2) = self.matmul1.relprop(cam1, **kwargs)
-        cam1_1 /= 2
-        cam1_2 /= 2
-
-        # query
-        cam1_1 = self.transpose_for_scores_relprop(cam1_1)
-        cam1_1 = self.query.relprop(cam1_1, **kwargs)
-
-        # key
-        cam1_2 = self.transpose_for_scores_relprop(cam1_2.transpose(-1, -2))
-        cam1_2 = self.key.relprop(cam1_2, **kwargs)
-
-        # value
-        cam2 = self.transpose_for_scores_relprop(cam2)
-        cam2 = self.value.relprop(cam2, **kwargs)
-
-        cam = self.clone.relprop((cam1_1, cam1_2, cam2), **kwargs)
-
-        return cam
-
-
-class BertSelfOutput(transformers.modeling_bert.BertSelfOutput):
-    def __init__(self, config):
-        super().__init__()
-        self.add = Add()
-
-    def forward(self, hidden_states, input_tensor):
-        hidden_states = self.dense(hidden_states)
-        hidden_states = self.dropout(hidden_states)
-        add = self.add([hidden_states, input_tensor])
-        hidden_states = self.LayerNorm(add)
-        return hidden_states
-
-    def relprop(self, cam, **kwargs):
-        cam = self.LayerNorm.relprop(cam, **kwargs)
-        # [hidden_states, input_tensor]
-        (cam1, cam2) = self.add.relprop(cam, **kwargs)
-        cam1 = self.dropout.relprop(cam1, **kwargs)
-        cam1 = self.dense.relprop(cam1, **kwargs)
-
-        return (cam1, cam2)
-
-
-class BertIntermediate(transformers.modeling_bert.BertIntermediate):
-    def relprop(self, cam, **kwargs):
-        cam = self.intermediate_act_fn.relprop(cam, **kwargs) # FIXME only ReLU
-        #print(cam.sum())
-        cam = self.dense.relprop(cam, **kwargs)
-        #print(cam.sum())
-        return cam
-
-
-class BertOutput(transformers.modeling_bert.BertOutput):
-    def __init__(self, config):
-        super().__init__()
-        self.add = Add()
-
-    def forward(self, hidden_states, input_tensor):
-        hidden_states = self.dense(hidden_states)
-        hidden_states = self.dropout(hidden_states)
-        add = self.add([hidden_states, input_tensor])
-        hidden_states = self.LayerNorm(add)
-        return hidden_states
-
-    def relprop(self, cam, **kwargs):
-        # print("in", cam.sum())
-        cam = self.LayerNorm.relprop(cam, **kwargs)
-        #print(cam.sum())
-        # [hidden_states, input_tensor]
-        (cam1, cam2)= self.add.relprop(cam, **kwargs)
-        # print("add", cam1.sum(), cam2.sum(), cam1.sum() + cam2.sum())
-        cam1 = self.dropout.relprop(cam1, **kwargs)
-        #print(cam1.sum())
-        cam1 = self.dense.relprop(cam1, **kwargs)
-        # print("dense", cam1.sum())
-
-        # print("out", cam1.sum() + cam2.sum(), cam1.sum(), cam2.sum())
-        return (cam1, cam2)
-
-
-class RPBertLayer(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.attention = BertAttention(config)
-        self.intermediate = BertIntermediate(config)
-        self.output = BertOutput(config)
-        self.clone = Clone()
-
-    def forward(
-        self,
-        hidden_states,
-        attention_mask=None,
-        head_mask=None,
-        output_attentions=False,
-    ):
-        self_attention_outputs = self.attention(
-            hidden_states,
-            attention_mask,
-            head_mask,
-            output_attentions=output_attentions,
-        )
-        attention_output = self_attention_outputs[0]
-        outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
-
-        ao1, ao2 = self.clone(attention_output, 2)
-        intermediate_output = self.intermediate(ao1)
-        layer_output = self.output(intermediate_output, ao2)
-
-        outputs = (layer_output,) + outputs
-        return outputs
-
-    def relprop(self, cam, **kwargs):
-        (cam1, cam2) = self.output.relprop(cam, **kwargs)
-        # print("output", cam1.sum(), cam2.sum(), cam1.sum() + cam2.sum())
-        cam1 = self.intermediate.relprop(cam1, **kwargs)
-        # print("intermediate", cam1.sum())
-        cam = self.clone.relprop((cam1, cam2), **kwargs)
-        # print("clone", cam.sum())
-        cam = self.attention.relprop(cam, **kwargs)
-        # print("attention", cam.sum())
-        return cam
-
-
-class BertModel(BertPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-        self.config = config
-
-        self.embeddings = BertEmbeddings(config)
-        self.encoder = BertEncoder(config)
-        self.pooler = BertPooler(config)
-
-        self.init_weights()
-
-    def get_input_embeddings(self):
-        return self.embeddings.word_embeddings
-
-    def set_input_embeddings(self, value):
-        self.embeddings.word_embeddings = value
-
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        encoder_hidden_states=None,
-        encoder_attention_mask=None,
-        output_attentions=None,
-        output_hidden_states=None,
-        return_dict=None,
-    ):
-        r"""
-        encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
-            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
-            if the model is configured as a decoder.
-        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Mask to avoid performing attention on the padding token indices of the encoder input. This mask
-            is used in the cross-attention if the model is configured as a decoder.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-        """
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-        elif input_ids is not None:
-            input_shape = input_ids.size()
-        elif inputs_embeds is not None:
-            input_shape = inputs_embeds.size()[:-1]
-        else:
-            raise ValueError("You have to specify either input_ids or inputs_embeds")
-
-        device = input_ids.device if input_ids is not None else inputs_embeds.device
-
-        if attention_mask is None:
-            attention_mask = torch.ones(input_shape, device=device)
-        if token_type_ids is None:
-            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
-
-        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
-        # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
-
-        # If a 2D or 3D attention mask is provided for the cross-attention
-        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
-        if self.config.is_decoder and encoder_hidden_states is not None:
-            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
-            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
-            if encoder_attention_mask is None:
-                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
-            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
-        else:
-            encoder_extended_attention_mask = None
-
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
-        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
-        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
-
-        embedding_output = self.embeddings(
-            input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
-        )
-
-        encoder_outputs = self.encoder(
-            embedding_output,
-            attention_mask=extended_attention_mask,
-            head_mask=head_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_extended_attention_mask,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-        sequence_output = encoder_outputs[0]
-        pooled_output = self.pooler(sequence_output)
-
-        if not return_dict:
-            return (sequence_output, pooled_output) + encoder_outputs[1:]
-
-        return BaseModelOutputWithPooling(
-            last_hidden_state=sequence_output,
-            pooler_output=pooled_output,
-            hidden_states=encoder_outputs.hidden_states,
-            attentions=encoder_outputs.attentions,
-        )
-
-    def relprop(self, cam, **kwargs):
-        cam = self.pooler.relprop(cam, **kwargs)
-        # print("111111111111",cam.sum())
-        cam = self.encoder.relprop(cam, **kwargs)
-        # print("222222222222222", cam.sum())
-        # print("conservation: ", cam.sum())
-        return cam
-
-
-transformers.modeling_bert.BertEmbeddings = RPBertEmbeddings
-transformers.modeling_bert.BertEncoder = RPBertEncoder
-
-if __name__ == '__main__':
-    class Config:
-        def __init__(self, hidden_size, num_attention_heads, attention_probs_dropout_prob):
-            self.hidden_size = hidden_size
-            self.num_attention_heads = num_attention_heads
-            self.attention_probs_dropout_prob = attention_probs_dropout_prob
-
-    model = BertSelfAttention(Config(1024, 4, 0.1))
-    x = torch.rand(2, 20, 1024)
-    x.requires_grad_()
-
-    model.eval()
-
-    y = model.forward(x)
-
-    relprop = model.relprop(torch.rand(2, 20, 1024), (torch.rand(2, 20, 1024),))
-
-    print(relprop[1][0].shape)
 
lib/roberta2.py.rej DELETED
@@ -1,63 +0,0 @@
---- modeling_roberta.py 2022-06-28 11:59:19.974278244 +0200
-+++ roberta2.py 2022-06-28 14:13:05.765050058 +0200
-@@ -23,14 +23,14 @@
- from torch import nn
- from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
-
--from ...activations import ACT2FN, gelu
--from ...file_utils import (
-+from transformers.activations import ACT2FN, gelu
-+from transformers.file_utils import (
-     add_code_sample_docstrings,
-     add_start_docstrings,
-     add_start_docstrings_to_model_forward,
-     replace_return_docstrings,
- )
--from ...modeling_outputs import (
-+from transformers.modeling_outputs import (
-     BaseModelOutputWithPastAndCrossAttentions,
-     BaseModelOutputWithPoolingAndCrossAttentions,
-     CausalLMOutputWithCrossAttentions,
-@@ -40,14 +40,14 @@
-     SequenceClassifierOutput,
-     TokenClassifierOutput,
- )
--from ...modeling_utils import (
-+from transformers.modeling_utils import (
-     PreTrainedModel,
-     apply_chunking_to_forward,
-     find_pruneable_heads_and_indices,
-     prune_linear_layer,
- )
--from ...utils import logging
--from .configuration_roberta import RobertaConfig
-+from transformers.utils import logging
-+from transformers.models.roberta.configuration_roberta import RobertaConfig
-
-
- logger = logging.get_logger(__name__)
-@@ -183,6 +183,24 @@
-
-         self.is_decoder = config.is_decoder
-
-+    def get_attn(self):
-+        return self.attn
-+
-+    def save_attn(self, attn):
-+        self.attn = attn
-+
-+    def save_attn_cam(self, cam):
-+        self.attn_cam = cam
-+
-+    def get_attn_cam(self):
-+        return self.attn_cam
-+
-+    def save_attn_gradients(self, attn_gradients):
-+        self.attn_gradients = attn_gradients
-+
-+    def get_attn_gradients(self):
-+        return self.attn_gradients
-+
-     def transpose_for_scores(self, x):
-         new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
-         x = x.view(*new_x_shape)