Upload folder using huggingface_hub

#1
.gitattributes CHANGED
@@ -7,3 +7,5 @@
7
  *.ot filter=lfs diff=lfs merge=lfs -text
8
  *.onnx filter=lfs diff=lfs merge=lfs -text
9
  *.msgpack filter=lfs diff=lfs merge=lfs -text
 
 
 
7
  *.ot filter=lfs diff=lfs merge=lfs -text
8
  *.onnx filter=lfs diff=lfs merge=lfs -text
9
  *.msgpack filter=lfs diff=lfs merge=lfs -text
10
+ mesh_tensorflow_checkpoints/model.ckpt-1461673.data-00001-of-00002 filter=lfs diff=lfs merge=lfs -text
11
+ mesh_tensorflow_checkpoints/model.ckpt-1461673.meta filter=lfs diff=lfs merge=lfs -text
mesh_tensorflow_checkpoints/model.ckpt-1461673.data-00000-of-00002 ADDED
Binary file (8 Bytes). View file
 
mesh_tensorflow_checkpoints/model.ckpt-1461673.data-00001-of-00002 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6fb0eb2104329a6ec986b9b607163ec27697aa810f118c12fecfdd04e6c299d
3
+ size 1480297984
mesh_tensorflow_checkpoints/model.ckpt-1461673.index ADDED
Binary file (20.9 kB). View file
 
mesh_tensorflow_checkpoints/model.ckpt-1461673.meta ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1681f7905ba2b006f6ccce00e1c7309f93a345970c40036dbb9e7cfa28f925a2
3
+ size 41809252
mesh_tensorflow_checkpoints/operative_config.gin ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import mesh_tensorflow.optimize
2
+ import mesh_tensorflow.transformer.dataset
3
+ import mesh_tensorflow.transformer.learning_rate_schedules
4
+ import mesh_tensorflow.transformer.t2t_vocabulary
5
+ import mesh_tensorflow.transformer.transformer_layers
6
+ import mesh_tensorflow.transformer.utils
7
+ import t5.data.sentencepiece_vocabulary
8
+ import t5.models.mesh_transformer
9
+
10
+ # Macros:
11
+ # ==============================================================================
12
+ d_ff = 4096
13
+ d_kv = 64
14
+ d_model = 1024
15
+ dropout_rate = 0.1
16
+ num_heads = 16
17
+ num_layers = 24
18
+
19
+ # Parameters for AdafactorOptimizer:
20
+ # ==============================================================================
21
+ AdafactorOptimizer.beta1 = 0.0
22
+ AdafactorOptimizer.clipping_threshold = 1.0
23
+ AdafactorOptimizer.decay_rate = None
24
+ AdafactorOptimizer.epsilon1 = 1e-30
25
+ AdafactorOptimizer.epsilon2 = 0.001
26
+ AdafactorOptimizer.factored = True
27
+ AdafactorOptimizer.min_dim_size_to_factor = 128
28
+ AdafactorOptimizer.multiply_by_parameter_scale = True
29
+
30
+ # Parameters for Bitransformer:
31
+ # ==============================================================================
32
+ Bitransformer.shared_embedding = True
33
+
34
+ # Parameters for denoise:
35
+ # ==============================================================================
36
+ # None.
37
+
38
+ # Parameters for decoder/DenseReluDense:
39
+ # ==============================================================================
40
+ decoder/DenseReluDense.activation = 'relu'
41
+ decoder/DenseReluDense.dropout_rate = %dropout_rate
42
+ decoder/DenseReluDense.hidden_size = %d_ff
43
+
44
+ # Parameters for encoder/DenseReluDense:
45
+ # ==============================================================================
46
+ encoder/DenseReluDense.activation = 'relu'
47
+ encoder/DenseReluDense.dropout_rate = %dropout_rate
48
+ encoder/DenseReluDense.hidden_size = %d_ff
49
+
50
+ # Parameters for decoder/EncDecAttention:
51
+ # ==============================================================================
52
+ # None.
53
+
54
+ # Parameters for get_variable_dtype:
55
+ # ==============================================================================
56
+ get_variable_dtype.activation_dtype = 'bfloat16'
57
+
58
+ # Parameters for get_vocab_embedding_cls:
59
+ # ==============================================================================
60
+ # None.
61
+
62
+ # Parameters for get_vocabulary:
63
+ # ==============================================================================
64
+ # None.
65
+
66
+ # Parameters for iid_noise_mask:
67
+ # ==============================================================================
68
+ # None.
69
+
70
+ # Parameters for decoder/LayerStack:
71
+ # ==============================================================================
72
+ decoder/LayerStack.dropout_rate = %dropout_rate
73
+ decoder/LayerStack.norm_epsilon = 1e-06
74
+ decoder/LayerStack.recompute_grads = False
75
+
76
+ # Parameters for encoder/LayerStack:
77
+ # ==============================================================================
78
+ encoder/LayerStack.dropout_rate = %dropout_rate
79
+ encoder/LayerStack.norm_epsilon = 1e-06
80
+ encoder/LayerStack.recompute_grads = False
81
+
82
+ # Parameters for make_bitransformer:
83
+ # ==============================================================================
84
+ make_bitransformer.decoder_name = 'decoder'
85
+ make_bitransformer.encoder_name = 'encoder'
86
+
87
+ # Parameters for decoder/make_layer_stack:
88
+ # ==============================================================================
89
+ decoder/make_layer_stack.block_scope = True
90
+ decoder/make_layer_stack.layers = \
91
+ [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
92
+ @mesh_tensorflow.transformer.transformer_layers.EncDecAttention,
93
+ @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
94
+ decoder/make_layer_stack.num_layers = %num_layers
95
+
96
+ # Parameters for encoder/make_layer_stack:
97
+ # ==============================================================================
98
+ encoder/make_layer_stack.block_scope = True
99
+ encoder/make_layer_stack.layers = \
100
+ [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
101
+ @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
102
+ encoder/make_layer_stack.num_layers = %num_layers
103
+
104
+ # Parameters for maybe_print_dataset:
105
+ # ==============================================================================
106
+ maybe_print_dataset.should_print = False
107
+
108
+ # Parameters for mesh_train_dataset_fn:
109
+ # ==============================================================================
110
+ mesh_train_dataset_fn.use_cached = False
111
+
112
+ # Parameters for MtfModel:
113
+ # ==============================================================================
114
+ MtfModel.autostack = True
115
+ MtfModel.ensemble_inputs = None
116
+ MtfModel.gcp_project = None
117
+ MtfModel.layout_rules = \
118
+ 'ensemble:ensemble,batch:batch,d_ff:model,heads:model,vocab:model,experts:batch'
119
+ MtfModel.mesh_devices = None
120
+ MtfModel.mesh_shape = None
121
+ MtfModel.model_type = 'bitransformer'
122
+ MtfModel.optimizer = None
123
+ MtfModel.predict_fn = None
124
+ MtfModel.tpu_job_name = None
125
+ MtfModel.tpu_zone = None
126
+ MtfModel.variable_filter = None
127
+
128
+ # Parameters for noise_token_to_sentinel:
129
+ # ==============================================================================
130
+ # None.
131
+
132
+ # Parameters for num_parallel_calls:
133
+ # ==============================================================================
134
+ num_parallel_calls.deterministic = False
135
+
136
+ # Parameters for pack_dataset:
137
+ # ==============================================================================
138
+ pack_dataset.use_custom_ops = False
139
+
140
+ # Parameters for pack_or_pad:
141
+ # ==============================================================================
142
+ # None.
143
+
144
+ # Parameters for decoder/SelfAttention:
145
+ # ==============================================================================
146
+ decoder/SelfAttention.attention_func = None
147
+ decoder/SelfAttention.attention_kwargs = None
148
+ decoder/SelfAttention.combine_dims = True
149
+ decoder/SelfAttention.dropout_rate = %dropout_rate
150
+ decoder/SelfAttention.keep_query_heads_dims = False
151
+ decoder/SelfAttention.key_value_size = %d_kv
152
+ decoder/SelfAttention.num_heads = %num_heads
153
+ decoder/SelfAttention.num_memory_heads = 0
154
+ decoder/SelfAttention.relative_attention_num_buckets = 32
155
+ decoder/SelfAttention.relative_attention_type = 'bias_shared'
156
+ decoder/SelfAttention.shared_kv = False
157
+
158
+ # Parameters for encoder/SelfAttention:
159
+ # ==============================================================================
160
+ encoder/SelfAttention.attention_func = None
161
+ encoder/SelfAttention.attention_kwargs = None
162
+ encoder/SelfAttention.combine_dims = True
163
+ encoder/SelfAttention.dropout_rate = %dropout_rate
164
+ encoder/SelfAttention.keep_query_heads_dims = False
165
+ encoder/SelfAttention.key_value_size = %d_kv
166
+ encoder/SelfAttention.num_heads = %num_heads
167
+ encoder/SelfAttention.num_memory_heads = 0
168
+ encoder/SelfAttention.relative_attention_num_buckets = 32
169
+ encoder/SelfAttention.relative_attention_type = 'bias_shared'
170
+ encoder/SelfAttention.shared_kv = False
171
+
172
+ # Parameters for SentencePieceVocabulary:
173
+ # ==============================================================================
174
+ # None.
175
+
176
+ # Parameters for sentinel_id:
177
+ # ==============================================================================
178
+ sentinel_id.return_value = None
179
+
180
+ # Parameters for serialize_num_microbatches:
181
+ # ==============================================================================
182
+ serialize_num_microbatches.tokens_per_microbatch_per_replica = 8192
183
+
184
+ # Parameters for shift_targets:
185
+ # ==============================================================================
186
+ shift_targets.bos_id = 0
187
+ shift_targets.eos_id = 1
188
+
189
+ # Parameters for tpu_estimator_model_fn:
190
+ # ==============================================================================
191
+ tpu_estimator_model_fn.model_info_file = None
192
+ tpu_estimator_model_fn.outer_batch_size = 1
193
+ tpu_estimator_model_fn.tpu_summaries = False
194
+
195
+ # Parameters for tpu_mesh_shape:
196
+ # ==============================================================================
197
+ tpu_mesh_shape.ensemble_parallelism = None
198
+
199
+ # Parameters for decoder/Unitransformer:
200
+ # ==============================================================================
201
+ decoder/Unitransformer.d_model = %d_model
202
+ decoder/Unitransformer.ensemble = None
203
+ decoder/Unitransformer.input_full_attention = False
204
+ decoder/Unitransformer.label_smoothing = 0.0
205
+ decoder/Unitransformer.loss_denominator = 233472
206
+ decoder/Unitransformer.loss_fn = None
207
+ decoder/Unitransformer.loss_on_targets_only = False
208
+ decoder/Unitransformer.max_length = 512
209
+ decoder/Unitransformer.positional_embedding = False
210
+ decoder/Unitransformer.shared_embedding_and_softmax_weights = True
211
+ decoder/Unitransformer.sinusoid_positional_embedding = False
212
+ decoder/Unitransformer.token_dropout_rate = 0.0
213
+ decoder/Unitransformer.vocab_divisor = 128
214
+ decoder/Unitransformer.z_loss = 0.0001
215
+
216
+ # Parameters for encoder/Unitransformer:
217
+ # ==============================================================================
218
+ encoder/Unitransformer.d_model = %d_model
219
+ encoder/Unitransformer.ensemble = None
220
+ encoder/Unitransformer.input_full_attention = False
221
+ encoder/Unitransformer.label_smoothing = 0.0
222
+ encoder/Unitransformer.loss_denominator = None
223
+ encoder/Unitransformer.loss_fn = None
224
+ encoder/Unitransformer.loss_on_targets_only = False
225
+ encoder/Unitransformer.max_length = 512
226
+ encoder/Unitransformer.positional_embedding = False
227
+ encoder/Unitransformer.shared_embedding_and_softmax_weights = True
228
+ encoder/Unitransformer.sinusoid_positional_embedding = False
229
+ encoder/Unitransformer.token_dropout_rate = 0.0
230
+ encoder/Unitransformer.vocab_divisor = 128
231
+ encoder/Unitransformer.z_loss = 0.0001
232
+
233
+ # Parameters for VarianceScalingInitializer:
234
+ # ==============================================================================
235
+ VarianceScalingInitializer.distribution = 'normal'
236
+ VarianceScalingInitializer.mode = 'fan_in'
237
+ VarianceScalingInitializer.scale = 1.0
238
+
239
+ # Parameters for VocabEmbedding:
240
+ # ==============================================================================
241
+ # None.
242
+
243
+ # Parameters for Vocabulary:
244
+ # ==============================================================================
245
+ # None.