howard-hou committed on
Commit
438b415
1 Parent(s): 9e05377

Upload RankingPrompterForPreTraining

config.json CHANGED
@@ -1,7 +1,7 @@
 {
   "_name_or_path": "D://huggingface_model/RankingPrompterForPreTraining-small",
   "architectures": [
-    "UMT5Model"
+    "RankingPrompterForPreTraining"
   ],
   "auto_map": {
     "AutoConfig": "configuration_rankingprompter.RankingPrompterConfig",
configuration_rankingprompter.py ADDED
@@ -0,0 +1,82 @@
+from transformers import PretrainedConfig
+
+class RankingPrompterConfig(PretrainedConfig):
+    model_type = "umt5"
+
+    def __init__(
+        self,
+        vocab_size=250112,
+        d_model=512,
+        d_kv=64,
+        d_ff=1024,
+        num_layers=8,
+        num_decoder_layers=None,
+        num_heads=6,
+        relative_attention_num_buckets=32,
+        relative_attention_max_distance=128,
+        dropout_rate=0.1,
+        layer_norm_epsilon=1e-6,
+        initializer_factor=1.0,
+        feed_forward_proj="gated-gelu",
+        is_encoder_decoder=True,
+        use_cache=True,
+        tokenizer_class="T5Tokenizer",
+        tie_word_embeddings=True,
+        pad_token_id=0,
+        eos_token_id=1,
+        decoder_start_token_id=0,
+        classifier_dropout=0.0,
+        **kwargs,
+    ):
+        super().__init__(
+            is_encoder_decoder=is_encoder_decoder,
+            tokenizer_class=tokenizer_class,
+            tie_word_embeddings=tie_word_embeddings,
+            pad_token_id=pad_token_id,
+            eos_token_id=eos_token_id,
+            decoder_start_token_id=decoder_start_token_id,
+            **kwargs,
+        )
+        self.vocab_size = vocab_size
+        self.d_model = d_model
+        self.d_kv = d_kv
+        self.d_ff = d_ff
+        self.num_layers = num_layers
+        self.num_decoder_layers = (
+            num_decoder_layers if num_decoder_layers is not None else self.num_layers
+        )  # default = symmetry
+        self.num_heads = num_heads
+        self.relative_attention_num_buckets = relative_attention_num_buckets
+        self.relative_attention_max_distance = relative_attention_max_distance
+        self.dropout_rate = dropout_rate
+        self.classifier_dropout = classifier_dropout
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_factor = initializer_factor
+        self.feed_forward_proj = feed_forward_proj
+        self.use_cache = use_cache
+
+        act_info = self.feed_forward_proj.split("-")
+        self.dense_act_fn = act_info[-1]
+        self.is_gated_act = act_info[0] == "gated"
+
+        if len(act_info) > 1 and act_info[0] != "gated" or len(act_info) > 2:
+            raise ValueError(
+                f"`feed_forward_proj`: {feed_forward_proj} is not a valid activation function of the dense layer. "
+                "Please make sure `feed_forward_proj` is of the format `gated-{ACT_FN}` or `{ACT_FN}`, e.g. "
+                "'gated-gelu' or 'relu'"
+            )
+
+        if feed_forward_proj == "gated-gelu":
+            self.dense_act_fn = "gelu_new"
+
+    @property
+    def hidden_size(self):
+        return self.d_model
+
+    @property
+    def num_attention_heads(self):
+        return self.num_heads
+
+    @property
+    def num_hidden_layers(self):
+        return self.num_layers
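A minimal sketch of exercising the config on its own, assuming the file above is importable as a top-level module from the working directory (inside the repo it is imported relatively); it only demonstrates the defaults and the fields derived in `__init__`:

```python
from configuration_rankingprompter import RankingPrompterConfig

config = RankingPrompterConfig()                 # defaults: d_model=512, num_layers=8, "gated-gelu"
print(config.hidden_size)                        # 512, alias for d_model
print(config.is_gated_act, config.dense_act_fn)  # True gelu_new

# overriding feed_forward_proj goes through the same validation path
relu_config = RankingPrompterConfig(feed_forward_proj="relu", d_model=256)
print(relu_config.is_gated_act, relu_config.dense_act_fn)  # False relu
```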
modeling_rankingprompter.py ADDED
@@ -0,0 +1,140 @@
+from contextlib import nullcontext
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import torch
+from torch import nn
+from torch.nn import CrossEntropyLoss
+from transformers import UMT5Model
+from .configuration_rankingprompter import RankingPrompterConfig
+
+
+@dataclass
+class RankingPrompterForPreTrainingOutput:
+    loss: torch.FloatTensor = None
+    logits: torch.FloatTensor = None
+
+
+class RankingPrompterForPreTraining(UMT5Model):
+    config_class = RankingPrompterConfig
+
+    _tied_weights_keys = [
+        "encoder.embed_tokens.weight",
+        "decoder.embed_tokens.weight",
+    ]
+
+    def __init__(self, config):
+        # encoder, decoder and the shared embedding come from UMT5Model
+        super().__init__(config)
+
+        # add the ranking head that scores each document
+        self.ranking_head = nn.Linear(config.d_model, 1)
+
+        # initialize weights and apply final processing
+        self.post_init()
+
+        # autocast context for mixed-precision training (no-op by default)
+        self.ctx = nullcontext()
+
+    def enable_amp_ctx(self, device_type="cuda", dtype=torch.bfloat16):
+        self.ctx = torch.amp.autocast(device_type=device_type, dtype=dtype)
+
+    def disable_amp_ctx(self):
+        self.ctx = nullcontext()
+
+    def forward(
+        self,
+        document_input_ids: Optional[torch.LongTensor] = None,
+        document_attention_mask: Optional[torch.FloatTensor] = None,
+        question_input_ids: Optional[torch.LongTensor] = None,
+        question_attention_mask: Optional[torch.BoolTensor] = None,
+        encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple[torch.FloatTensor], RankingPrompterForPreTrainingOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the document ranking loss. Each label is the index of the
+            relevant document for its question and should be in `[0, ..., num_doc - 1]`;
+            labels set to `-100` are ignored (masked).
+        """
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+        # document_input_ids: [batch_size, num_doc, doc_seq_len]
+        batch_size, num_doc, doc_seq_len = document_input_ids.shape
+        # flatten the documents to [batch_size * num_doc, doc_seq_len]
+        document_input_ids = document_input_ids.view(-1, doc_seq_len)
+        document_attention_mask = document_attention_mask.view(-1, doc_seq_len)
+
+        # encode all documents in a single pass
+        with self.ctx:
+            encoder_outputs = self.encoder(
+                input_ids=document_input_ids,
+                attention_mask=document_attention_mask,
+                return_dict=return_dict,
+            )
+
+        document_embeds = encoder_outputs[0]
+
+        # repeat the question inputs for each document
+        # question_input_ids: [batch_size, question_seq_len]
+        question_seq_len = question_input_ids.shape[1]
+        question_input_ids = (
+            question_input_ids.unsqueeze(1)
+            .expand(-1, num_doc, -1)
+            .reshape(-1, question_seq_len)
+        )  # [batch_size * num_doc, question_seq_len]
+        question_attention_mask = (
+            question_attention_mask.unsqueeze(1)
+            .expand(-1, num_doc, -1)
+            .reshape(-1, question_seq_len)
+        )  # [batch_size * num_doc, question_seq_len]
+
+        # decode the question against each document's encoder states
+        with self.ctx:
+            decoder_outputs = self.decoder(
+                input_ids=question_input_ids,
+                attention_mask=question_attention_mask,
+                past_key_values=past_key_values,
+                encoder_hidden_states=document_embeds,
+                encoder_attention_mask=document_attention_mask,
+                use_cache=use_cache,
+                return_dict=return_dict,
+            )
+        # [batch_size * num_doc, question_seq_len, hidden_size]
+        sequence_output = decoder_outputs[0]
+        question_seq_len = sequence_output.size(1)
+        # reshape to [batch_size, num_doc, question_seq_len, hidden_size]
+        soft_prompt_output = sequence_output.view(
+            batch_size, num_doc, question_seq_len, -1
+        )
+
+        # mean-pool over the question tokens, then score each document:
+        # [batch_size, num_doc, hidden_size] -> [batch_size, num_doc, 1]
+        ranking_logits = self.ranking_head(soft_prompt_output.mean(dim=2))
+
+        # ranking loss
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss(ignore_index=-100)
+            ranking_logits = ranking_logits.view(batch_size, num_doc)
+            loss = loss_fct(ranking_logits, labels)
+
+        if not return_dict:
+            output = (ranking_logits,) + decoder_outputs[1:] + encoder_outputs
+            return ((loss,) + output) if loss is not None else output
+
+        return RankingPrompterForPreTrainingOutput(
+            loss=loss,
+            logits=ranking_logits,
+        )
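A minimal forward-pass sketch for the class above, assuming `model` is an already-loaded RankingPrompterForPreTraining (e.g. via the trust_remote_code snippet earlier) and that the ids would normally come from a compatible T5 tokenizer; the batch sizes, sequence lengths, and random ids here are purely illustrative:

```python
import torch

batch_size, num_doc, doc_seq_len, q_seq_len = 2, 4, 32, 16
document_input_ids = torch.randint(0, 250112, (batch_size, num_doc, doc_seq_len))
document_attention_mask = torch.ones(batch_size, num_doc, doc_seq_len, dtype=torch.long)
question_input_ids = torch.randint(0, 250112, (batch_size, q_seq_len))
question_attention_mask = torch.ones(batch_size, q_seq_len, dtype=torch.long)
labels = torch.randint(0, num_doc, (batch_size,))  # index of the relevant document per question

outputs = model(
    document_input_ids=document_input_ids,
    document_attention_mask=document_attention_mask,
    question_input_ids=question_input_ids,
    question_attention_mask=question_attention_mask,
    labels=labels,
)
print(outputs.logits.shape)  # torch.Size([2, 4]) -- one score per document
print(outputs.loss)          # cross-entropy over the num_doc scores
```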
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:982317cd72cc9fc369542b1145e9141fc19168206fee0dbdcf91fc3d9ff0c2e0
-size 701400953
+oid sha256:b90ef8ceeeffc7b033e65dfc28f3adf8d82cbdad204df0677ae0c0f45f4f0c24
+size 701403585