marcospiau committed on
Commit
f65d013
1 Parent(s): cf47201

Upload folder using huggingface_hub

Browse files
Files changed (33) hide show
  1. .gitattributes +12 -0
  2. mesh_tensorflow_checkpoints/ptt5-base-portuguese-vocab/model.ckpt-1229942.data-00000-of-00002 +0 -0
  3. mesh_tensorflow_checkpoints/ptt5-base-portuguese-vocab/model.ckpt-1229942.data-00001-of-00002 +3 -0
  4. mesh_tensorflow_checkpoints/ptt5-base-portuguese-vocab/model.ckpt-1229942.index +0 -0
  5. mesh_tensorflow_checkpoints/ptt5-base-portuguese-vocab/model.ckpt-1229942.meta +3 -0
  6. mesh_tensorflow_checkpoints/ptt5-base-portuguese-vocab/operative_config.gin +248 -0
  7. mesh_tensorflow_checkpoints/ptt5-base-t5-vocab/model.ckpt-1229941.data-00000-of-00002 +0 -0
  8. mesh_tensorflow_checkpoints/ptt5-base-t5-vocab/model.ckpt-1229941.data-00001-of-00002 +3 -0
  9. mesh_tensorflow_checkpoints/ptt5-base-t5-vocab/model.ckpt-1229941.index +0 -0
  10. mesh_tensorflow_checkpoints/ptt5-base-t5-vocab/model.ckpt-1229941.meta +3 -0
  11. mesh_tensorflow_checkpoints/ptt5-base-t5-vocab/operative_config.gin +245 -0
  12. mesh_tensorflow_checkpoints/ptt5-large-portuguese-vocab/model.ckpt-1460784.data-00000-of-00002 +0 -0
  13. mesh_tensorflow_checkpoints/ptt5-large-portuguese-vocab/model.ckpt-1460784.data-00001-of-00002 +3 -0
  14. mesh_tensorflow_checkpoints/ptt5-large-portuguese-vocab/model.ckpt-1460784.index +0 -0
  15. mesh_tensorflow_checkpoints/ptt5-large-portuguese-vocab/model.ckpt-1460784.meta +3 -0
  16. mesh_tensorflow_checkpoints/ptt5-large-portuguese-vocab/operative_config.gin +245 -0
  17. mesh_tensorflow_checkpoints/ptt5-large-t5-vocab/model.ckpt-1461673.data-00000-of-00002 +0 -0
  18. mesh_tensorflow_checkpoints/ptt5-large-t5-vocab/model.ckpt-1461673.data-00001-of-00002 +3 -0
  19. mesh_tensorflow_checkpoints/ptt5-large-t5-vocab/model.ckpt-1461673.index +0 -0
  20. mesh_tensorflow_checkpoints/ptt5-large-t5-vocab/model.ckpt-1461673.meta +3 -0
  21. mesh_tensorflow_checkpoints/ptt5-large-t5-vocab/operative_config.gin +245 -0
  22. mesh_tensorflow_checkpoints/ptt5-small-portuguese-vocab/model.ckpt-1115021.data-00000-of-00002 +0 -0
  23. mesh_tensorflow_checkpoints/ptt5-small-portuguese-vocab/model.ckpt-1115021.data-00001-of-00002 +3 -0
  24. mesh_tensorflow_checkpoints/ptt5-small-portuguese-vocab/model.ckpt-1115021.index +0 -0
  25. mesh_tensorflow_checkpoints/ptt5-small-portuguese-vocab/model.ckpt-1115021.meta +3 -0
  26. mesh_tensorflow_checkpoints/ptt5-small-portuguese-vocab/operative_config.gin +248 -0
  27. mesh_tensorflow_checkpoints/ptt5-small-t5-vocab/model.ckpt-1115020.data-00000-of-00002 +0 -0
  28. mesh_tensorflow_checkpoints/ptt5-small-t5-vocab/model.ckpt-1115020.data-00001-of-00002 +3 -0
  29. mesh_tensorflow_checkpoints/ptt5-small-t5-vocab/model.ckpt-1115020.index +0 -0
  30. mesh_tensorflow_checkpoints/ptt5-small-t5-vocab/model.ckpt-1115020.meta +3 -0
  31. mesh_tensorflow_checkpoints/ptt5-small-t5-vocab/operative_config.gin +245 -0
  32. vocabs/spm_32000_unigram/spm_32000_pt.model +0 -0
  33. vocabs/spm_32000_unigram/spm_32000_pt.vocab +0 -0
.gitattributes CHANGED
@@ -7,3 +7,15 @@
7
  *.ot filter=lfs diff=lfs merge=lfs -text
8
  *.onnx filter=lfs diff=lfs merge=lfs -text
9
  model.safetensors filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  *.ot filter=lfs diff=lfs merge=lfs -text
8
  *.onnx filter=lfs diff=lfs merge=lfs -text
9
  model.safetensors filter=lfs diff=lfs merge=lfs -text
10
+ mesh_tensorflow_checkpoints/ptt5-base-portuguese-vocab/model.ckpt-1229942.data-00001-of-00002 filter=lfs diff=lfs merge=lfs -text
11
+ mesh_tensorflow_checkpoints/ptt5-base-portuguese-vocab/model.ckpt-1229942.meta filter=lfs diff=lfs merge=lfs -text
12
+ mesh_tensorflow_checkpoints/ptt5-base-t5-vocab/model.ckpt-1229941.data-00001-of-00002 filter=lfs diff=lfs merge=lfs -text
13
+ mesh_tensorflow_checkpoints/ptt5-base-t5-vocab/model.ckpt-1229941.meta filter=lfs diff=lfs merge=lfs -text
14
+ mesh_tensorflow_checkpoints/ptt5-large-portuguese-vocab/model.ckpt-1460784.data-00001-of-00002 filter=lfs diff=lfs merge=lfs -text
15
+ mesh_tensorflow_checkpoints/ptt5-large-portuguese-vocab/model.ckpt-1460784.meta filter=lfs diff=lfs merge=lfs -text
16
+ mesh_tensorflow_checkpoints/ptt5-large-t5-vocab/model.ckpt-1461673.data-00001-of-00002 filter=lfs diff=lfs merge=lfs -text
17
+ mesh_tensorflow_checkpoints/ptt5-large-t5-vocab/model.ckpt-1461673.meta filter=lfs diff=lfs merge=lfs -text
18
+ mesh_tensorflow_checkpoints/ptt5-small-portuguese-vocab/model.ckpt-1115021.data-00001-of-00002 filter=lfs diff=lfs merge=lfs -text
19
+ mesh_tensorflow_checkpoints/ptt5-small-portuguese-vocab/model.ckpt-1115021.meta filter=lfs diff=lfs merge=lfs -text
20
+ mesh_tensorflow_checkpoints/ptt5-small-t5-vocab/model.ckpt-1115020.data-00001-of-00002 filter=lfs diff=lfs merge=lfs -text
21
+ mesh_tensorflow_checkpoints/ptt5-small-t5-vocab/model.ckpt-1115020.meta filter=lfs diff=lfs merge=lfs -text
mesh_tensorflow_checkpoints/ptt5-base-portuguese-vocab/model.ckpt-1229942.data-00000-of-00002 ADDED
Binary file (8 Bytes). View file
 
mesh_tensorflow_checkpoints/ptt5-base-portuguese-vocab/model.ckpt-1229942.data-00001-of-00002 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30a04543eb97388e5aac909a6c8c61da095865406e559e1b6a9549d149f765e7
3
+ size 447754240
mesh_tensorflow_checkpoints/ptt5-base-portuguese-vocab/model.ckpt-1229942.index ADDED
Binary file (10.9 kB). View file
 
mesh_tensorflow_checkpoints/ptt5-base-portuguese-vocab/model.ckpt-1229942.meta ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b5a5aa8921b5eb418d2f1fcd0eccd618b75a99d86da56da7b605a8b97ecc304
3
+ size 20836297
mesh_tensorflow_checkpoints/ptt5-base-portuguese-vocab/operative_config.gin ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import mesh_tensorflow.optimize
2
+ import mesh_tensorflow.transformer.dataset
3
+ import mesh_tensorflow.transformer.learning_rate_schedules
4
+ import mesh_tensorflow.transformer.t2t_vocabulary
5
+ import mesh_tensorflow.transformer.transformer_layers
6
+ import mesh_tensorflow.transformer.utils
7
+ import t5.data.sentencepiece_vocabulary
8
+ import t5.models.mesh_transformer
9
+
10
+ # Macros:
11
+ # ==============================================================================
12
+ d_ff = 3072
13
+ d_kv = 64
14
+ d_model = 768
15
+ dropout_rate = 0.1
16
+ init_checkpoint = 'gs://t5-data/pretrained_models/base/model.ckpt-999900'
17
+ MIXTURE_NAME = 'all_mix'
18
+ noise_density = 0.15
19
+ num_heads = 12
20
+ num_layers = 12
21
+
22
+ # Parameters for AdafactorOptimizer:
23
+ # ==============================================================================
24
+ AdafactorOptimizer.beta1 = 0.0
25
+ AdafactorOptimizer.clipping_threshold = 1.0
26
+ AdafactorOptimizer.decay_rate = None
27
+ AdafactorOptimizer.epsilon1 = 1e-30
28
+ AdafactorOptimizer.epsilon2 = 0.001
29
+ AdafactorOptimizer.factored = True
30
+ AdafactorOptimizer.min_dim_size_to_factor = 128
31
+ AdafactorOptimizer.multiply_by_parameter_scale = True
32
+
33
+ # Parameters for Bitransformer:
34
+ # ==============================================================================
35
+ Bitransformer.shared_embedding = True
36
+
37
+ # Parameters for denoise:
38
+ # ==============================================================================
39
+ # None.
40
+
41
+ # Parameters for decoder/DenseReluDense:
42
+ # ==============================================================================
43
+ decoder/DenseReluDense.activation = 'relu'
44
+ decoder/DenseReluDense.dropout_rate = %dropout_rate
45
+ decoder/DenseReluDense.hidden_size = %d_ff
46
+
47
+ # Parameters for encoder/DenseReluDense:
48
+ # ==============================================================================
49
+ encoder/DenseReluDense.activation = 'relu'
50
+ encoder/DenseReluDense.dropout_rate = %dropout_rate
51
+ encoder/DenseReluDense.hidden_size = %d_ff
52
+
53
+ # Parameters for decoder/EncDecAttention:
54
+ # ==============================================================================
55
+ # None.
56
+
57
+ # Parameters for get_variable_dtype:
58
+ # ==============================================================================
59
+ get_variable_dtype.activation_dtype = 'bfloat16'
60
+
61
+ # Parameters for get_vocab_embedding_cls:
62
+ # ==============================================================================
63
+ # None.
64
+
65
+ # Parameters for get_vocabulary:
66
+ # ==============================================================================
67
+ # None.
68
+
69
+ # Parameters for iid_noise_mask:
70
+ # ==============================================================================
71
+ # None.
72
+
73
+ # Parameters for decoder/LayerStack:
74
+ # ==============================================================================
75
+ decoder/LayerStack.dropout_rate = %dropout_rate
76
+ decoder/LayerStack.norm_epsilon = 1e-06
77
+ decoder/LayerStack.recompute_grads = False
78
+
79
+ # Parameters for encoder/LayerStack:
80
+ # ==============================================================================
81
+ encoder/LayerStack.dropout_rate = %dropout_rate
82
+ encoder/LayerStack.norm_epsilon = 1e-06
83
+ encoder/LayerStack.recompute_grads = False
84
+
85
+ # Parameters for make_bitransformer:
86
+ # ==============================================================================
87
+ make_bitransformer.decoder_name = 'decoder'
88
+ make_bitransformer.encoder_name = 'encoder'
89
+
90
+ # Parameters for decoder/make_layer_stack:
91
+ # ==============================================================================
92
+ decoder/make_layer_stack.block_scope = True
93
+ decoder/make_layer_stack.layers = \
94
+ [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
95
+ @mesh_tensorflow.transformer.transformer_layers.EncDecAttention,
96
+ @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
97
+ decoder/make_layer_stack.num_layers = %num_layers
98
+
99
+ # Parameters for encoder/make_layer_stack:
100
+ # ==============================================================================
101
+ encoder/make_layer_stack.block_scope = True
102
+ encoder/make_layer_stack.layers = \
103
+ [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
104
+ @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
105
+ encoder/make_layer_stack.num_layers = %num_layers
106
+
107
+ # Parameters for maybe_print_dataset:
108
+ # ==============================================================================
109
+ maybe_print_dataset.should_print = False
110
+
111
+ # Parameters for mesh_train_dataset_fn:
112
+ # ==============================================================================
113
+ mesh_train_dataset_fn.use_cached = False
114
+
115
+ # Parameters for MtfModel:
116
+ # ==============================================================================
117
+ MtfModel.autostack = True
118
+ MtfModel.ensemble_inputs = None
119
+ MtfModel.gcp_project = None
120
+ MtfModel.layout_rules = \
121
+ 'ensemble:ensemble,batch:batch,d_ff:model,heads:model,vocab:model,experts:batch'
122
+ MtfModel.mesh_devices = None
123
+ MtfModel.mesh_shape = None
124
+ MtfModel.model_type = 'bitransformer'
125
+ MtfModel.optimizer = None
126
+ MtfModel.predict_fn = None
127
+ MtfModel.tpu_job_name = None
128
+ MtfModel.tpu_zone = None
129
+ MtfModel.variable_filter = None
130
+
131
+ # Parameters for noise_token_to_sentinel:
132
+ # ==============================================================================
133
+ # None.
134
+
135
+ # Parameters for num_parallel_calls:
136
+ # ==============================================================================
137
+ num_parallel_calls.deterministic = False
138
+
139
+ # Parameters for pack_dataset:
140
+ # ==============================================================================
141
+ pack_dataset.use_custom_ops = False
142
+
143
+ # Parameters for pack_or_pad:
144
+ # ==============================================================================
145
+ # None.
146
+
147
+ # Parameters for decoder/SelfAttention:
148
+ # ==============================================================================
149
+ decoder/SelfAttention.attention_func = None
150
+ decoder/SelfAttention.attention_kwargs = None
151
+ decoder/SelfAttention.combine_dims = True
152
+ decoder/SelfAttention.dropout_rate = %dropout_rate
153
+ decoder/SelfAttention.keep_query_heads_dims = False
154
+ decoder/SelfAttention.key_value_size = %d_kv
155
+ decoder/SelfAttention.num_heads = %num_heads
156
+ decoder/SelfAttention.num_memory_heads = 0
157
+ decoder/SelfAttention.relative_attention_num_buckets = 32
158
+ decoder/SelfAttention.relative_attention_type = 'bias_shared'
159
+ decoder/SelfAttention.shared_kv = False
160
+
161
+ # Parameters for encoder/SelfAttention:
162
+ # ==============================================================================
163
+ encoder/SelfAttention.attention_func = None
164
+ encoder/SelfAttention.attention_kwargs = None
165
+ encoder/SelfAttention.combine_dims = True
166
+ encoder/SelfAttention.dropout_rate = %dropout_rate
167
+ encoder/SelfAttention.keep_query_heads_dims = False
168
+ encoder/SelfAttention.key_value_size = %d_kv
169
+ encoder/SelfAttention.num_heads = %num_heads
170
+ encoder/SelfAttention.num_memory_heads = 0
171
+ encoder/SelfAttention.relative_attention_num_buckets = 32
172
+ encoder/SelfAttention.relative_attention_type = 'bias_shared'
173
+ encoder/SelfAttention.shared_kv = False
174
+
175
+ # Parameters for SentencePieceVocabulary:
176
+ # ==============================================================================
177
+ # None.
178
+
179
+ # Parameters for sentinel_id:
180
+ # ==============================================================================
181
+ sentinel_id.return_value = None
182
+
183
+ # Parameters for serialize_num_microbatches:
184
+ # ==============================================================================
185
+ serialize_num_microbatches.tokens_per_microbatch_per_replica = 8192
186
+
187
+ # Parameters for shift_targets:
188
+ # ==============================================================================
189
+ shift_targets.bos_id = 0
190
+ shift_targets.eos_id = 1
191
+
192
+ # Parameters for tpu_estimator_model_fn:
193
+ # ==============================================================================
194
+ tpu_estimator_model_fn.model_info_file = None
195
+ tpu_estimator_model_fn.outer_batch_size = 1
196
+ tpu_estimator_model_fn.tpu_summaries = False
197
+
198
+ # Parameters for tpu_mesh_shape:
199
+ # ==============================================================================
200
+ tpu_mesh_shape.ensemble_parallelism = None
201
+
202
+ # Parameters for decoder/Unitransformer:
203
+ # ==============================================================================
204
+ decoder/Unitransformer.d_model = %d_model
205
+ decoder/Unitransformer.ensemble = None
206
+ decoder/Unitransformer.input_full_attention = False
207
+ decoder/Unitransformer.label_smoothing = 0.0
208
+ decoder/Unitransformer.loss_denominator = 233472
209
+ decoder/Unitransformer.loss_fn = None
210
+ decoder/Unitransformer.loss_on_targets_only = False
211
+ decoder/Unitransformer.max_length = 512
212
+ decoder/Unitransformer.positional_embedding = False
213
+ decoder/Unitransformer.shared_embedding_and_softmax_weights = True
214
+ decoder/Unitransformer.sinusoid_positional_embedding = False
215
+ decoder/Unitransformer.token_dropout_rate = 0.0
216
+ decoder/Unitransformer.vocab_divisor = 128
217
+ decoder/Unitransformer.z_loss = 0.0001
218
+
219
+ # Parameters for encoder/Unitransformer:
220
+ # ==============================================================================
221
+ encoder/Unitransformer.d_model = %d_model
222
+ encoder/Unitransformer.ensemble = None
223
+ encoder/Unitransformer.input_full_attention = False
224
+ encoder/Unitransformer.label_smoothing = 0.0
225
+ encoder/Unitransformer.loss_denominator = None
226
+ encoder/Unitransformer.loss_fn = None
227
+ encoder/Unitransformer.loss_on_targets_only = False
228
+ encoder/Unitransformer.max_length = 512
229
+ encoder/Unitransformer.positional_embedding = False
230
+ encoder/Unitransformer.shared_embedding_and_softmax_weights = True
231
+ encoder/Unitransformer.sinusoid_positional_embedding = False
232
+ encoder/Unitransformer.token_dropout_rate = 0.0
233
+ encoder/Unitransformer.vocab_divisor = 128
234
+ encoder/Unitransformer.z_loss = 0.0001
235
+
236
+ # Parameters for VarianceScalingInitializer:
237
+ # ==============================================================================
238
+ VarianceScalingInitializer.distribution = 'normal'
239
+ VarianceScalingInitializer.mode = 'fan_in'
240
+ VarianceScalingInitializer.scale = 1.0
241
+
242
+ # Parameters for VocabEmbedding:
243
+ # ==============================================================================
244
+ # None.
245
+
246
+ # Parameters for Vocabulary:
247
+ # ==============================================================================
248
+ # None.
mesh_tensorflow_checkpoints/ptt5-base-t5-vocab/model.ckpt-1229941.data-00000-of-00002 ADDED
Binary file (8 Bytes). View file
 
mesh_tensorflow_checkpoints/ptt5-base-t5-vocab/model.ckpt-1229941.data-00001-of-00002 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:729e0fdca015f74fe06afee7c73f6d4376d10ee63a09035e389f1ab46512341a
3
+ size 447754240
mesh_tensorflow_checkpoints/ptt5-base-t5-vocab/model.ckpt-1229941.index ADDED
Binary file (10.9 kB). View file
 
mesh_tensorflow_checkpoints/ptt5-base-t5-vocab/model.ckpt-1229941.meta ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bfffa5a090374ff8c00b448a3bb252c333c03eabc87bc162b9ca00c9ca72be20
3
+ size 20914843
mesh_tensorflow_checkpoints/ptt5-base-t5-vocab/operative_config.gin ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import mesh_tensorflow.optimize
2
+ import mesh_tensorflow.transformer.dataset
3
+ import mesh_tensorflow.transformer.learning_rate_schedules
4
+ import mesh_tensorflow.transformer.t2t_vocabulary
5
+ import mesh_tensorflow.transformer.transformer_layers
6
+ import mesh_tensorflow.transformer.utils
7
+ import t5.data.sentencepiece_vocabulary
8
+ import t5.models.mesh_transformer
9
+
10
+ # Macros:
11
+ # ==============================================================================
12
+ d_ff = 3072
13
+ d_kv = 64
14
+ d_model = 768
15
+ dropout_rate = 0.1
16
+ num_heads = 12
17
+ num_layers = 12
18
+
19
+ # Parameters for AdafactorOptimizer:
20
+ # ==============================================================================
21
+ AdafactorOptimizer.beta1 = 0.0
22
+ AdafactorOptimizer.clipping_threshold = 1.0
23
+ AdafactorOptimizer.decay_rate = None
24
+ AdafactorOptimizer.epsilon1 = 1e-30
25
+ AdafactorOptimizer.epsilon2 = 0.001
26
+ AdafactorOptimizer.factored = True
27
+ AdafactorOptimizer.min_dim_size_to_factor = 128
28
+ AdafactorOptimizer.multiply_by_parameter_scale = True
29
+
30
+ # Parameters for Bitransformer:
31
+ # ==============================================================================
32
+ Bitransformer.shared_embedding = True
33
+
34
+ # Parameters for denoise:
35
+ # ==============================================================================
36
+ # None.
37
+
38
+ # Parameters for decoder/DenseReluDense:
39
+ # ==============================================================================
40
+ decoder/DenseReluDense.activation = 'relu'
41
+ decoder/DenseReluDense.dropout_rate = %dropout_rate
42
+ decoder/DenseReluDense.hidden_size = %d_ff
43
+
44
+ # Parameters for encoder/DenseReluDense:
45
+ # ==============================================================================
46
+ encoder/DenseReluDense.activation = 'relu'
47
+ encoder/DenseReluDense.dropout_rate = %dropout_rate
48
+ encoder/DenseReluDense.hidden_size = %d_ff
49
+
50
+ # Parameters for decoder/EncDecAttention:
51
+ # ==============================================================================
52
+ # None.
53
+
54
+ # Parameters for get_variable_dtype:
55
+ # ==============================================================================
56
+ get_variable_dtype.activation_dtype = 'bfloat16'
57
+
58
+ # Parameters for get_vocab_embedding_cls:
59
+ # ==============================================================================
60
+ # None.
61
+
62
+ # Parameters for get_vocabulary:
63
+ # ==============================================================================
64
+ # None.
65
+
66
+ # Parameters for iid_noise_mask:
67
+ # ==============================================================================
68
+ # None.
69
+
70
+ # Parameters for decoder/LayerStack:
71
+ # ==============================================================================
72
+ decoder/LayerStack.dropout_rate = %dropout_rate
73
+ decoder/LayerStack.norm_epsilon = 1e-06
74
+ decoder/LayerStack.recompute_grads = False
75
+
76
+ # Parameters for encoder/LayerStack:
77
+ # ==============================================================================
78
+ encoder/LayerStack.dropout_rate = %dropout_rate
79
+ encoder/LayerStack.norm_epsilon = 1e-06
80
+ encoder/LayerStack.recompute_grads = False
81
+
82
+ # Parameters for make_bitransformer:
83
+ # ==============================================================================
84
+ make_bitransformer.decoder_name = 'decoder'
85
+ make_bitransformer.encoder_name = 'encoder'
86
+
87
+ # Parameters for decoder/make_layer_stack:
88
+ # ==============================================================================
89
+ decoder/make_layer_stack.block_scope = True
90
+ decoder/make_layer_stack.layers = \
91
+ [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
92
+ @mesh_tensorflow.transformer.transformer_layers.EncDecAttention,
93
+ @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
94
+ decoder/make_layer_stack.num_layers = %num_layers
95
+
96
+ # Parameters for encoder/make_layer_stack:
97
+ # ==============================================================================
98
+ encoder/make_layer_stack.block_scope = True
99
+ encoder/make_layer_stack.layers = \
100
+ [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
101
+ @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
102
+ encoder/make_layer_stack.num_layers = %num_layers
103
+
104
+ # Parameters for maybe_print_dataset:
105
+ # ==============================================================================
106
+ maybe_print_dataset.should_print = False
107
+
108
+ # Parameters for mesh_train_dataset_fn:
109
+ # ==============================================================================
110
+ mesh_train_dataset_fn.use_cached = False
111
+
112
+ # Parameters for MtfModel:
113
+ # ==============================================================================
114
+ MtfModel.autostack = True
115
+ MtfModel.ensemble_inputs = None
116
+ MtfModel.gcp_project = None
117
+ MtfModel.layout_rules = \
118
+ 'ensemble:ensemble,batch:batch,d_ff:model,heads:model,vocab:model,experts:batch'
119
+ MtfModel.mesh_devices = None
120
+ MtfModel.mesh_shape = None
121
+ MtfModel.model_type = 'bitransformer'
122
+ MtfModel.optimizer = None
123
+ MtfModel.predict_fn = None
124
+ MtfModel.tpu_job_name = None
125
+ MtfModel.tpu_zone = None
126
+ MtfModel.variable_filter = None
127
+
128
+ # Parameters for noise_token_to_sentinel:
129
+ # ==============================================================================
130
+ # None.
131
+
132
+ # Parameters for num_parallel_calls:
133
+ # ==============================================================================
134
+ num_parallel_calls.deterministic = False
135
+
136
+ # Parameters for pack_dataset:
137
+ # ==============================================================================
138
+ pack_dataset.use_custom_ops = False
139
+
140
+ # Parameters for pack_or_pad:
141
+ # ==============================================================================
142
+ # None.
143
+
144
+ # Parameters for decoder/SelfAttention:
145
+ # ==============================================================================
146
+ decoder/SelfAttention.attention_func = None
147
+ decoder/SelfAttention.attention_kwargs = None
148
+ decoder/SelfAttention.combine_dims = True
149
+ decoder/SelfAttention.dropout_rate = %dropout_rate
150
+ decoder/SelfAttention.keep_query_heads_dims = False
151
+ decoder/SelfAttention.key_value_size = %d_kv
152
+ decoder/SelfAttention.num_heads = %num_heads
153
+ decoder/SelfAttention.num_memory_heads = 0
154
+ decoder/SelfAttention.relative_attention_num_buckets = 32
155
+ decoder/SelfAttention.relative_attention_type = 'bias_shared'
156
+ decoder/SelfAttention.shared_kv = False
157
+
158
+ # Parameters for encoder/SelfAttention:
159
+ # ==============================================================================
160
+ encoder/SelfAttention.attention_func = None
161
+ encoder/SelfAttention.attention_kwargs = None
162
+ encoder/SelfAttention.combine_dims = True
163
+ encoder/SelfAttention.dropout_rate = %dropout_rate
164
+ encoder/SelfAttention.keep_query_heads_dims = False
165
+ encoder/SelfAttention.key_value_size = %d_kv
166
+ encoder/SelfAttention.num_heads = %num_heads
167
+ encoder/SelfAttention.num_memory_heads = 0
168
+ encoder/SelfAttention.relative_attention_num_buckets = 32
169
+ encoder/SelfAttention.relative_attention_type = 'bias_shared'
170
+ encoder/SelfAttention.shared_kv = False
171
+
172
+ # Parameters for SentencePieceVocabulary:
173
+ # ==============================================================================
174
+ # None.
175
+
176
+ # Parameters for sentinel_id:
177
+ # ==============================================================================
178
+ sentinel_id.return_value = None
179
+
180
+ # Parameters for serialize_num_microbatches:
181
+ # ==============================================================================
182
+ serialize_num_microbatches.tokens_per_microbatch_per_replica = 8192
183
+
184
+ # Parameters for shift_targets:
185
+ # ==============================================================================
186
+ shift_targets.bos_id = 0
187
+ shift_targets.eos_id = 1
188
+
189
+ # Parameters for tpu_estimator_model_fn:
190
+ # ==============================================================================
191
+ tpu_estimator_model_fn.model_info_file = None
192
+ tpu_estimator_model_fn.outer_batch_size = 1
193
+ tpu_estimator_model_fn.tpu_summaries = False
194
+
195
+ # Parameters for tpu_mesh_shape:
196
+ # ==============================================================================
197
+ tpu_mesh_shape.ensemble_parallelism = None
198
+
199
+ # Parameters for decoder/Unitransformer:
200
+ # ==============================================================================
201
+ decoder/Unitransformer.d_model = %d_model
202
+ decoder/Unitransformer.ensemble = None
203
+ decoder/Unitransformer.input_full_attention = False
204
+ decoder/Unitransformer.label_smoothing = 0.0
205
+ decoder/Unitransformer.loss_denominator = 233472
206
+ decoder/Unitransformer.loss_fn = None
207
+ decoder/Unitransformer.loss_on_targets_only = False
208
+ decoder/Unitransformer.max_length = 512
209
+ decoder/Unitransformer.positional_embedding = False
210
+ decoder/Unitransformer.shared_embedding_and_softmax_weights = True
211
+ decoder/Unitransformer.sinusoid_positional_embedding = False
212
+ decoder/Unitransformer.token_dropout_rate = 0.0
213
+ decoder/Unitransformer.vocab_divisor = 128
214
+ decoder/Unitransformer.z_loss = 0.0001
215
+
216
+ # Parameters for encoder/Unitransformer:
217
+ # ==============================================================================
218
+ encoder/Unitransformer.d_model = %d_model
219
+ encoder/Unitransformer.ensemble = None
220
+ encoder/Unitransformer.input_full_attention = False
221
+ encoder/Unitransformer.label_smoothing = 0.0
222
+ encoder/Unitransformer.loss_denominator = None
223
+ encoder/Unitransformer.loss_fn = None
224
+ encoder/Unitransformer.loss_on_targets_only = False
225
+ encoder/Unitransformer.max_length = 512
226
+ encoder/Unitransformer.positional_embedding = False
227
+ encoder/Unitransformer.shared_embedding_and_softmax_weights = True
228
+ encoder/Unitransformer.sinusoid_positional_embedding = False
229
+ encoder/Unitransformer.token_dropout_rate = 0.0
230
+ encoder/Unitransformer.vocab_divisor = 128
231
+ encoder/Unitransformer.z_loss = 0.0001
232
+
233
+ # Parameters for VarianceScalingInitializer:
234
+ # ==============================================================================
235
+ VarianceScalingInitializer.distribution = 'normal'
236
+ VarianceScalingInitializer.mode = 'fan_in'
237
+ VarianceScalingInitializer.scale = 1.0
238
+
239
+ # Parameters for VocabEmbedding:
240
+ # ==============================================================================
241
+ # None.
242
+
243
+ # Parameters for Vocabulary:
244
+ # ==============================================================================
245
+ # None.
mesh_tensorflow_checkpoints/ptt5-large-portuguese-vocab/model.ckpt-1460784.data-00000-of-00002 ADDED
Binary file (8 Bytes). View file
 
mesh_tensorflow_checkpoints/ptt5-large-portuguese-vocab/model.ckpt-1460784.data-00001-of-00002 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e45f3c6d3d0ef5a2678b59ec0da6bccc45dd203a431aa9278eb601104da26cd
3
+ size 1480297984
mesh_tensorflow_checkpoints/ptt5-large-portuguese-vocab/model.ckpt-1460784.index ADDED
Binary file (21 kB). View file
 
mesh_tensorflow_checkpoints/ptt5-large-portuguese-vocab/model.ckpt-1460784.meta ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63fbae2b0454bef2790070e657d0431193083bd93865736a4ffd8e1c4df29b36
3
+ size 41753926
mesh_tensorflow_checkpoints/ptt5-large-portuguese-vocab/operative_config.gin ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import mesh_tensorflow.optimize
2
+ import mesh_tensorflow.transformer.dataset
3
+ import mesh_tensorflow.transformer.learning_rate_schedules
4
+ import mesh_tensorflow.transformer.t2t_vocabulary
5
+ import mesh_tensorflow.transformer.transformer_layers
6
+ import mesh_tensorflow.transformer.utils
7
+ import t5.data.sentencepiece_vocabulary
8
+ import t5.models.mesh_transformer
9
+
10
+ # Macros:
11
+ # ==============================================================================
12
+ d_ff = 4096
13
+ d_kv = 64
14
+ d_model = 1024
15
+ dropout_rate = 0.1
16
+ num_heads = 16
17
+ num_layers = 24
18
+
19
+ # Parameters for AdafactorOptimizer:
20
+ # ==============================================================================
21
+ AdafactorOptimizer.beta1 = 0.0
22
+ AdafactorOptimizer.clipping_threshold = 1.0
23
+ AdafactorOptimizer.decay_rate = None
24
+ AdafactorOptimizer.epsilon1 = 1e-30
25
+ AdafactorOptimizer.epsilon2 = 0.001
26
+ AdafactorOptimizer.factored = True
27
+ AdafactorOptimizer.min_dim_size_to_factor = 128
28
+ AdafactorOptimizer.multiply_by_parameter_scale = True
29
+
30
+ # Parameters for Bitransformer:
31
+ # ==============================================================================
32
+ Bitransformer.shared_embedding = True
33
+
34
+ # Parameters for denoise:
35
+ # ==============================================================================
36
+ # None.
37
+
38
+ # Parameters for decoder/DenseReluDense:
39
+ # ==============================================================================
40
+ decoder/DenseReluDense.activation = 'relu'
41
+ decoder/DenseReluDense.dropout_rate = %dropout_rate
42
+ decoder/DenseReluDense.hidden_size = %d_ff
43
+
44
+ # Parameters for encoder/DenseReluDense:
45
+ # ==============================================================================
46
+ encoder/DenseReluDense.activation = 'relu'
47
+ encoder/DenseReluDense.dropout_rate = %dropout_rate
48
+ encoder/DenseReluDense.hidden_size = %d_ff
49
+
50
+ # Parameters for decoder/EncDecAttention:
51
+ # ==============================================================================
52
+ # None.
53
+
54
+ # Parameters for get_variable_dtype:
55
+ # ==============================================================================
56
+ get_variable_dtype.activation_dtype = 'bfloat16'
57
+
58
+ # Parameters for get_vocab_embedding_cls:
59
+ # ==============================================================================
60
+ # None.
61
+
62
+ # Parameters for get_vocabulary:
63
+ # ==============================================================================
64
+ # None.
65
+
66
+ # Parameters for iid_noise_mask:
67
+ # ==============================================================================
68
+ # None.
69
+
70
+ # Parameters for decoder/LayerStack:
71
+ # ==============================================================================
72
+ decoder/LayerStack.dropout_rate = %dropout_rate
73
+ decoder/LayerStack.norm_epsilon = 1e-06
74
+ decoder/LayerStack.recompute_grads = False
75
+
76
+ # Parameters for encoder/LayerStack:
77
+ # ==============================================================================
78
+ encoder/LayerStack.dropout_rate = %dropout_rate
79
+ encoder/LayerStack.norm_epsilon = 1e-06
80
+ encoder/LayerStack.recompute_grads = False
81
+
82
+ # Parameters for make_bitransformer:
83
+ # ==============================================================================
84
+ make_bitransformer.decoder_name = 'decoder'
85
+ make_bitransformer.encoder_name = 'encoder'
86
+
87
+ # Parameters for decoder/make_layer_stack:
88
+ # ==============================================================================
89
+ decoder/make_layer_stack.block_scope = True
90
+ decoder/make_layer_stack.layers = \
91
+ [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
92
+ @mesh_tensorflow.transformer.transformer_layers.EncDecAttention,
93
+ @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
94
+ decoder/make_layer_stack.num_layers = %num_layers
95
+
96
+ # Parameters for encoder/make_layer_stack:
97
+ # ==============================================================================
98
+ encoder/make_layer_stack.block_scope = True
99
+ encoder/make_layer_stack.layers = \
100
+ [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
101
+ @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
102
+ encoder/make_layer_stack.num_layers = %num_layers
103
+
104
+ # Parameters for maybe_print_dataset:
105
+ # ==============================================================================
106
+ maybe_print_dataset.should_print = False
107
+
108
+ # Parameters for mesh_train_dataset_fn:
109
+ # ==============================================================================
110
+ mesh_train_dataset_fn.use_cached = False
111
+
112
+ # Parameters for MtfModel:
113
+ # ==============================================================================
114
+ MtfModel.autostack = True
115
+ MtfModel.ensemble_inputs = None
116
+ MtfModel.gcp_project = None
117
+ MtfModel.layout_rules = \
118
+ 'ensemble:ensemble,batch:batch,d_ff:model,heads:model,vocab:model,experts:batch'
119
+ MtfModel.mesh_devices = None
120
+ MtfModel.mesh_shape = None
121
+ MtfModel.model_type = 'bitransformer'
122
+ MtfModel.optimizer = None
123
+ MtfModel.predict_fn = None
124
+ MtfModel.tpu_job_name = None
125
+ MtfModel.tpu_zone = None
126
+ MtfModel.variable_filter = None
127
+
128
+ # Parameters for noise_token_to_sentinel:
129
+ # ==============================================================================
130
+ # None.
131
+
132
+ # Parameters for num_parallel_calls:
133
+ # ==============================================================================
134
+ num_parallel_calls.deterministic = False
135
+
136
+ # Parameters for pack_dataset:
137
+ # ==============================================================================
138
+ pack_dataset.use_custom_ops = False
139
+
140
+ # Parameters for pack_or_pad:
141
+ # ==============================================================================
142
+ # None.
143
+
144
+ # Parameters for decoder/SelfAttention:
145
+ # ==============================================================================
146
+ decoder/SelfAttention.attention_func = None
147
+ decoder/SelfAttention.attention_kwargs = None
148
+ decoder/SelfAttention.combine_dims = True
149
+ decoder/SelfAttention.dropout_rate = %dropout_rate
150
+ decoder/SelfAttention.keep_query_heads_dims = False
151
+ decoder/SelfAttention.key_value_size = %d_kv
152
+ decoder/SelfAttention.num_heads = %num_heads
153
+ decoder/SelfAttention.num_memory_heads = 0
154
+ decoder/SelfAttention.relative_attention_num_buckets = 32
155
+ decoder/SelfAttention.relative_attention_type = 'bias_shared'
156
+ decoder/SelfAttention.shared_kv = False
157
+
158
+ # Parameters for encoder/SelfAttention:
159
+ # ==============================================================================
160
+ encoder/SelfAttention.attention_func = None
161
+ encoder/SelfAttention.attention_kwargs = None
162
+ encoder/SelfAttention.combine_dims = True
163
+ encoder/SelfAttention.dropout_rate = %dropout_rate
164
+ encoder/SelfAttention.keep_query_heads_dims = False
165
+ encoder/SelfAttention.key_value_size = %d_kv
166
+ encoder/SelfAttention.num_heads = %num_heads
167
+ encoder/SelfAttention.num_memory_heads = 0
168
+ encoder/SelfAttention.relative_attention_num_buckets = 32
169
+ encoder/SelfAttention.relative_attention_type = 'bias_shared'
170
+ encoder/SelfAttention.shared_kv = False
171
+
172
+ # Parameters for SentencePieceVocabulary:
173
+ # ==============================================================================
174
+ # None.
175
+
176
+ # Parameters for sentinel_id:
177
+ # ==============================================================================
178
+ sentinel_id.return_value = None
179
+
180
+ # Parameters for serialize_num_microbatches:
181
+ # ==============================================================================
182
+ serialize_num_microbatches.tokens_per_microbatch_per_replica = 8192
183
+
184
+ # Parameters for shift_targets:
185
+ # ==============================================================================
186
+ shift_targets.bos_id = 0
187
+ shift_targets.eos_id = 1
188
+
189
+ # Parameters for tpu_estimator_model_fn:
190
+ # ==============================================================================
191
+ tpu_estimator_model_fn.model_info_file = None
192
+ tpu_estimator_model_fn.outer_batch_size = 1
193
+ tpu_estimator_model_fn.tpu_summaries = False
194
+
195
+ # Parameters for tpu_mesh_shape:
196
+ # ==============================================================================
197
+ tpu_mesh_shape.ensemble_parallelism = None
198
+
199
+ # Parameters for decoder/Unitransformer:
200
+ # ==============================================================================
201
+ decoder/Unitransformer.d_model = %d_model
202
+ decoder/Unitransformer.ensemble = None
203
+ decoder/Unitransformer.input_full_attention = False
204
+ decoder/Unitransformer.label_smoothing = 0.0
205
+ decoder/Unitransformer.loss_denominator = 233472
206
+ decoder/Unitransformer.loss_fn = None
207
+ decoder/Unitransformer.loss_on_targets_only = False
208
+ decoder/Unitransformer.max_length = 512
209
+ decoder/Unitransformer.positional_embedding = False
210
+ decoder/Unitransformer.shared_embedding_and_softmax_weights = True
211
+ decoder/Unitransformer.sinusoid_positional_embedding = False
212
+ decoder/Unitransformer.token_dropout_rate = 0.0
213
+ decoder/Unitransformer.vocab_divisor = 128
214
+ decoder/Unitransformer.z_loss = 0.0001
215
+
216
+ # Parameters for encoder/Unitransformer:
217
+ # ==============================================================================
218
+ encoder/Unitransformer.d_model = %d_model
219
+ encoder/Unitransformer.ensemble = None
220
+ encoder/Unitransformer.input_full_attention = False
221
+ encoder/Unitransformer.label_smoothing = 0.0
222
+ encoder/Unitransformer.loss_denominator = None
223
+ encoder/Unitransformer.loss_fn = None
224
+ encoder/Unitransformer.loss_on_targets_only = False
225
+ encoder/Unitransformer.max_length = 512
226
+ encoder/Unitransformer.positional_embedding = False
227
+ encoder/Unitransformer.shared_embedding_and_softmax_weights = True
228
+ encoder/Unitransformer.sinusoid_positional_embedding = False
229
+ encoder/Unitransformer.token_dropout_rate = 0.0
230
+ encoder/Unitransformer.vocab_divisor = 128
231
+ encoder/Unitransformer.z_loss = 0.0001
232
+
233
+ # Parameters for VarianceScalingInitializer:
234
+ # ==============================================================================
235
+ VarianceScalingInitializer.distribution = 'normal'
236
+ VarianceScalingInitializer.mode = 'fan_in'
237
+ VarianceScalingInitializer.scale = 1.0
238
+
239
+ # Parameters for VocabEmbedding:
240
+ # ==============================================================================
241
+ # None.
242
+
243
+ # Parameters for Vocabulary:
244
+ # ==============================================================================
245
+ # None.
mesh_tensorflow_checkpoints/ptt5-large-t5-vocab/model.ckpt-1461673.data-00000-of-00002 ADDED
Binary file (8 Bytes). View file
 
mesh_tensorflow_checkpoints/ptt5-large-t5-vocab/model.ckpt-1461673.data-00001-of-00002 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6fb0eb2104329a6ec986b9b607163ec27697aa810f118c12fecfdd04e6c299d
3
+ size 1480297984
mesh_tensorflow_checkpoints/ptt5-large-t5-vocab/model.ckpt-1461673.index ADDED
Binary file (20.9 kB). View file
 
mesh_tensorflow_checkpoints/ptt5-large-t5-vocab/model.ckpt-1461673.meta ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1681f7905ba2b006f6ccce00e1c7309f93a345970c40036dbb9e7cfa28f925a2
3
+ size 41809252
mesh_tensorflow_checkpoints/ptt5-large-t5-vocab/operative_config.gin ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import mesh_tensorflow.optimize
2
+ import mesh_tensorflow.transformer.dataset
3
+ import mesh_tensorflow.transformer.learning_rate_schedules
4
+ import mesh_tensorflow.transformer.t2t_vocabulary
5
+ import mesh_tensorflow.transformer.transformer_layers
6
+ import mesh_tensorflow.transformer.utils
7
+ import t5.data.sentencepiece_vocabulary
8
+ import t5.models.mesh_transformer
9
+
10
+ # Macros:
11
+ # ==============================================================================
12
+ d_ff = 4096
13
+ d_kv = 64
14
+ d_model = 1024
15
+ dropout_rate = 0.1
16
+ num_heads = 16
17
+ num_layers = 24
18
+
19
+ # Parameters for AdafactorOptimizer:
20
+ # ==============================================================================
21
+ AdafactorOptimizer.beta1 = 0.0
22
+ AdafactorOptimizer.clipping_threshold = 1.0
23
+ AdafactorOptimizer.decay_rate = None
24
+ AdafactorOptimizer.epsilon1 = 1e-30
25
+ AdafactorOptimizer.epsilon2 = 0.001
26
+ AdafactorOptimizer.factored = True
27
+ AdafactorOptimizer.min_dim_size_to_factor = 128
28
+ AdafactorOptimizer.multiply_by_parameter_scale = True
29
+
30
+ # Parameters for Bitransformer:
31
+ # ==============================================================================
32
+ Bitransformer.shared_embedding = True
33
+
34
+ # Parameters for denoise:
35
+ # ==============================================================================
36
+ # None.
37
+
38
+ # Parameters for decoder/DenseReluDense:
39
+ # ==============================================================================
40
+ decoder/DenseReluDense.activation = 'relu'
41
+ decoder/DenseReluDense.dropout_rate = %dropout_rate
42
+ decoder/DenseReluDense.hidden_size = %d_ff
43
+
44
+ # Parameters for encoder/DenseReluDense:
45
+ # ==============================================================================
46
+ encoder/DenseReluDense.activation = 'relu'
47
+ encoder/DenseReluDense.dropout_rate = %dropout_rate
48
+ encoder/DenseReluDense.hidden_size = %d_ff
49
+
50
+ # Parameters for decoder/EncDecAttention:
51
+ # ==============================================================================
52
+ # None.
53
+
54
+ # Parameters for get_variable_dtype:
55
+ # ==============================================================================
56
+ get_variable_dtype.activation_dtype = 'bfloat16'
57
+
58
+ # Parameters for get_vocab_embedding_cls:
59
+ # ==============================================================================
60
+ # None.
61
+
62
+ # Parameters for get_vocabulary:
63
+ # ==============================================================================
64
+ # None.
65
+
66
+ # Parameters for iid_noise_mask:
67
+ # ==============================================================================
68
+ # None.
69
+
70
+ # Parameters for decoder/LayerStack:
71
+ # ==============================================================================
72
+ decoder/LayerStack.dropout_rate = %dropout_rate
73
+ decoder/LayerStack.norm_epsilon = 1e-06
74
+ decoder/LayerStack.recompute_grads = False
75
+
76
+ # Parameters for encoder/LayerStack:
77
+ # ==============================================================================
78
+ encoder/LayerStack.dropout_rate = %dropout_rate
79
+ encoder/LayerStack.norm_epsilon = 1e-06
80
+ encoder/LayerStack.recompute_grads = False
81
+
82
+ # Parameters for make_bitransformer:
83
+ # ==============================================================================
84
+ make_bitransformer.decoder_name = 'decoder'
85
+ make_bitransformer.encoder_name = 'encoder'
86
+
87
+ # Parameters for decoder/make_layer_stack:
88
+ # ==============================================================================
89
+ decoder/make_layer_stack.block_scope = True
90
+ decoder/make_layer_stack.layers = \
91
+ [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
92
+ @mesh_tensorflow.transformer.transformer_layers.EncDecAttention,
93
+ @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
94
+ decoder/make_layer_stack.num_layers = %num_layers
95
+
96
+ # Parameters for encoder/make_layer_stack:
97
+ # ==============================================================================
98
+ encoder/make_layer_stack.block_scope = True
99
+ encoder/make_layer_stack.layers = \
100
+ [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
101
+ @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
102
+ encoder/make_layer_stack.num_layers = %num_layers
103
+
104
+ # Parameters for maybe_print_dataset:
105
+ # ==============================================================================
106
+ maybe_print_dataset.should_print = False
107
+
108
+ # Parameters for mesh_train_dataset_fn:
109
+ # ==============================================================================
110
+ mesh_train_dataset_fn.use_cached = False
111
+
112
+ # Parameters for MtfModel:
113
+ # ==============================================================================
114
+ MtfModel.autostack = True
115
+ MtfModel.ensemble_inputs = None
116
+ MtfModel.gcp_project = None
117
+ MtfModel.layout_rules = \
118
+ 'ensemble:ensemble,batch:batch,d_ff:model,heads:model,vocab:model,experts:batch'
119
+ MtfModel.mesh_devices = None
120
+ MtfModel.mesh_shape = None
121
+ MtfModel.model_type = 'bitransformer'
122
+ MtfModel.optimizer = None
123
+ MtfModel.predict_fn = None
124
+ MtfModel.tpu_job_name = None
125
+ MtfModel.tpu_zone = None
126
+ MtfModel.variable_filter = None
127
+
128
+ # Parameters for noise_token_to_sentinel:
129
+ # ==============================================================================
130
+ # None.
131
+
132
+ # Parameters for num_parallel_calls:
133
+ # ==============================================================================
134
+ num_parallel_calls.deterministic = False
135
+
136
+ # Parameters for pack_dataset:
137
+ # ==============================================================================
138
+ pack_dataset.use_custom_ops = False
139
+
140
+ # Parameters for pack_or_pad:
141
+ # ==============================================================================
142
+ # None.
143
+
144
+ # Parameters for decoder/SelfAttention:
145
+ # ==============================================================================
146
+ decoder/SelfAttention.attention_func = None
147
+ decoder/SelfAttention.attention_kwargs = None
148
+ decoder/SelfAttention.combine_dims = True
149
+ decoder/SelfAttention.dropout_rate = %dropout_rate
150
+ decoder/SelfAttention.keep_query_heads_dims = False
151
+ decoder/SelfAttention.key_value_size = %d_kv
152
+ decoder/SelfAttention.num_heads = %num_heads
153
+ decoder/SelfAttention.num_memory_heads = 0
154
+ decoder/SelfAttention.relative_attention_num_buckets = 32
155
+ decoder/SelfAttention.relative_attention_type = 'bias_shared'
156
+ decoder/SelfAttention.shared_kv = False
157
+
158
+ # Parameters for encoder/SelfAttention:
159
+ # ==============================================================================
160
+ encoder/SelfAttention.attention_func = None
161
+ encoder/SelfAttention.attention_kwargs = None
162
+ encoder/SelfAttention.combine_dims = True
163
+ encoder/SelfAttention.dropout_rate = %dropout_rate
164
+ encoder/SelfAttention.keep_query_heads_dims = False
165
+ encoder/SelfAttention.key_value_size = %d_kv
166
+ encoder/SelfAttention.num_heads = %num_heads
167
+ encoder/SelfAttention.num_memory_heads = 0
168
+ encoder/SelfAttention.relative_attention_num_buckets = 32
169
+ encoder/SelfAttention.relative_attention_type = 'bias_shared'
170
+ encoder/SelfAttention.shared_kv = False
171
+
172
+ # Parameters for SentencePieceVocabulary:
173
+ # ==============================================================================
174
+ # None.
175
+
176
+ # Parameters for sentinel_id:
177
+ # ==============================================================================
178
+ sentinel_id.return_value = None
179
+
180
+ # Parameters for serialize_num_microbatches:
181
+ # ==============================================================================
182
+ serialize_num_microbatches.tokens_per_microbatch_per_replica = 8192
183
+
184
+ # Parameters for shift_targets:
185
+ # ==============================================================================
186
+ shift_targets.bos_id = 0
187
+ shift_targets.eos_id = 1
188
+
189
+ # Parameters for tpu_estimator_model_fn:
190
+ # ==============================================================================
191
+ tpu_estimator_model_fn.model_info_file = None
192
+ tpu_estimator_model_fn.outer_batch_size = 1
193
+ tpu_estimator_model_fn.tpu_summaries = False
194
+
195
+ # Parameters for tpu_mesh_shape:
196
+ # ==============================================================================
197
+ tpu_mesh_shape.ensemble_parallelism = None
198
+
199
+ # Parameters for decoder/Unitransformer:
200
+ # ==============================================================================
201
+ decoder/Unitransformer.d_model = %d_model
202
+ decoder/Unitransformer.ensemble = None
203
+ decoder/Unitransformer.input_full_attention = False
204
+ decoder/Unitransformer.label_smoothing = 0.0
205
+ decoder/Unitransformer.loss_denominator = 233472
206
+ decoder/Unitransformer.loss_fn = None
207
+ decoder/Unitransformer.loss_on_targets_only = False
208
+ decoder/Unitransformer.max_length = 512
209
+ decoder/Unitransformer.positional_embedding = False
210
+ decoder/Unitransformer.shared_embedding_and_softmax_weights = True
211
+ decoder/Unitransformer.sinusoid_positional_embedding = False
212
+ decoder/Unitransformer.token_dropout_rate = 0.0
213
+ decoder/Unitransformer.vocab_divisor = 128
214
+ decoder/Unitransformer.z_loss = 0.0001
215
+
216
+ # Parameters for encoder/Unitransformer:
217
+ # ==============================================================================
218
+ encoder/Unitransformer.d_model = %d_model
219
+ encoder/Unitransformer.ensemble = None
220
+ encoder/Unitransformer.input_full_attention = False
221
+ encoder/Unitransformer.label_smoothing = 0.0
222
+ encoder/Unitransformer.loss_denominator = None
223
+ encoder/Unitransformer.loss_fn = None
224
+ encoder/Unitransformer.loss_on_targets_only = False
225
+ encoder/Unitransformer.max_length = 512
226
+ encoder/Unitransformer.positional_embedding = False
227
+ encoder/Unitransformer.shared_embedding_and_softmax_weights = True
228
+ encoder/Unitransformer.sinusoid_positional_embedding = False
229
+ encoder/Unitransformer.token_dropout_rate = 0.0
230
+ encoder/Unitransformer.vocab_divisor = 128
231
+ encoder/Unitransformer.z_loss = 0.0001
232
+
233
+ # Parameters for VarianceScalingInitializer:
234
+ # ==============================================================================
235
+ VarianceScalingInitializer.distribution = 'normal'
236
+ VarianceScalingInitializer.mode = 'fan_in'
237
+ VarianceScalingInitializer.scale = 1.0
238
+
239
+ # Parameters for VocabEmbedding:
240
+ # ==============================================================================
241
+ # None.
242
+
243
+ # Parameters for Vocabulary:
244
+ # ==============================================================================
245
+ # None.
mesh_tensorflow_checkpoints/ptt5-small-portuguese-vocab/model.ckpt-1115021.data-00000-of-00002 ADDED
Binary file (8 Bytes). View file
 
mesh_tensorflow_checkpoints/ptt5-small-portuguese-vocab/model.ckpt-1115021.data-00001-of-00002 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d3ed67257bdba129b2d9b6f94f1b801f7244c5dfa7a11433f7701ff0775f803
3
+ size 121752064
mesh_tensorflow_checkpoints/ptt5-small-portuguese-vocab/model.ckpt-1115021.index ADDED
Binary file (5.65 kB). View file
 
mesh_tensorflow_checkpoints/ptt5-small-portuguese-vocab/model.ckpt-1115021.meta ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7abb5d6797e9f6e246c6c4d014814b1d05c037c0d91ea628936e61a667b97602
3
+ size 10897208
mesh_tensorflow_checkpoints/ptt5-small-portuguese-vocab/operative_config.gin ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import mesh_tensorflow.optimize
2
+ import mesh_tensorflow.transformer.dataset
3
+ import mesh_tensorflow.transformer.learning_rate_schedules
4
+ import mesh_tensorflow.transformer.t2t_vocabulary
5
+ import mesh_tensorflow.transformer.transformer_layers
6
+ import mesh_tensorflow.transformer.utils
7
+ import t5.data.sentencepiece_vocabulary
8
+ import t5.models.mesh_transformer
9
+
10
+ # Macros:
11
+ # ==============================================================================
12
+ d_ff = 2048
13
+ d_kv = 64
14
+ d_model = 512
15
+ dropout_rate = 0.1
16
+ init_checkpoint = 'gs://t5-data/pretrained_models/small/model.ckpt-1000000'
17
+ MIXTURE_NAME = 'all_mix'
18
+ noise_density = 0.15
19
+ num_heads = 8
20
+ num_layers = 6
21
+
22
+ # Parameters for AdafactorOptimizer:
23
+ # ==============================================================================
24
+ AdafactorOptimizer.beta1 = 0.0
25
+ AdafactorOptimizer.clipping_threshold = 1.0
26
+ AdafactorOptimizer.decay_rate = None
27
+ AdafactorOptimizer.epsilon1 = 1e-30
28
+ AdafactorOptimizer.epsilon2 = 0.001
29
+ AdafactorOptimizer.factored = True
30
+ AdafactorOptimizer.min_dim_size_to_factor = 128
31
+ AdafactorOptimizer.multiply_by_parameter_scale = True
32
+
33
+ # Parameters for Bitransformer:
34
+ # ==============================================================================
35
+ Bitransformer.shared_embedding = True
36
+
37
+ # Parameters for denoise:
38
+ # ==============================================================================
39
+ # None.
40
+
41
+ # Parameters for decoder/DenseReluDense:
42
+ # ==============================================================================
43
+ decoder/DenseReluDense.activation = 'relu'
44
+ decoder/DenseReluDense.dropout_rate = %dropout_rate
45
+ decoder/DenseReluDense.hidden_size = %d_ff
46
+
47
+ # Parameters for encoder/DenseReluDense:
48
+ # ==============================================================================
49
+ encoder/DenseReluDense.activation = 'relu'
50
+ encoder/DenseReluDense.dropout_rate = %dropout_rate
51
+ encoder/DenseReluDense.hidden_size = %d_ff
52
+
53
+ # Parameters for decoder/EncDecAttention:
54
+ # ==============================================================================
55
+ # None.
56
+
57
+ # Parameters for get_variable_dtype:
58
+ # ==============================================================================
59
+ get_variable_dtype.activation_dtype = 'bfloat16'
60
+
61
+ # Parameters for get_vocab_embedding_cls:
62
+ # ==============================================================================
63
+ # None.
64
+
65
+ # Parameters for get_vocabulary:
66
+ # ==============================================================================
67
+ # None.
68
+
69
+ # Parameters for iid_noise_mask:
70
+ # ==============================================================================
71
+ # None.
72
+
73
+ # Parameters for decoder/LayerStack:
74
+ # ==============================================================================
75
+ decoder/LayerStack.dropout_rate = %dropout_rate
76
+ decoder/LayerStack.norm_epsilon = 1e-06
77
+ decoder/LayerStack.recompute_grads = False
78
+
79
+ # Parameters for encoder/LayerStack:
80
+ # ==============================================================================
81
+ encoder/LayerStack.dropout_rate = %dropout_rate
82
+ encoder/LayerStack.norm_epsilon = 1e-06
83
+ encoder/LayerStack.recompute_grads = False
84
+
85
+ # Parameters for make_bitransformer:
86
+ # ==============================================================================
87
+ make_bitransformer.decoder_name = 'decoder'
88
+ make_bitransformer.encoder_name = 'encoder'
89
+
90
+ # Parameters for decoder/make_layer_stack:
91
+ # ==============================================================================
92
+ decoder/make_layer_stack.block_scope = True
93
+ decoder/make_layer_stack.layers = \
94
+ [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
95
+ @mesh_tensorflow.transformer.transformer_layers.EncDecAttention,
96
+ @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
97
+ decoder/make_layer_stack.num_layers = %num_layers
98
+
99
+ # Parameters for encoder/make_layer_stack:
100
+ # ==============================================================================
101
+ encoder/make_layer_stack.block_scope = True
102
+ encoder/make_layer_stack.layers = \
103
+ [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
104
+ @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
105
+ encoder/make_layer_stack.num_layers = %num_layers
106
+
107
+ # Parameters for maybe_print_dataset:
108
+ # ==============================================================================
109
+ maybe_print_dataset.should_print = False
110
+
111
+ # Parameters for mesh_train_dataset_fn:
112
+ # ==============================================================================
113
+ mesh_train_dataset_fn.use_cached = False
114
+
115
+ # Parameters for MtfModel:
116
+ # ==============================================================================
117
+ MtfModel.autostack = True
118
+ MtfModel.ensemble_inputs = None
119
+ MtfModel.gcp_project = None
120
+ MtfModel.layout_rules = \
121
+ 'ensemble:ensemble,batch:batch,d_ff:model,heads:model,vocab:model,experts:batch'
122
+ MtfModel.mesh_devices = None
123
+ MtfModel.mesh_shape = None
124
+ MtfModel.model_type = 'bitransformer'
125
+ MtfModel.optimizer = None
126
+ MtfModel.predict_fn = None
127
+ MtfModel.tpu_job_name = None
128
+ MtfModel.tpu_zone = None
129
+ MtfModel.variable_filter = None
130
+
131
+ # Parameters for noise_token_to_sentinel:
132
+ # ==============================================================================
133
+ # None.
134
+
135
+ # Parameters for num_parallel_calls:
136
+ # ==============================================================================
137
+ num_parallel_calls.deterministic = False
138
+
139
+ # Parameters for pack_dataset:
140
+ # ==============================================================================
141
+ pack_dataset.use_custom_ops = False
142
+
143
+ # Parameters for pack_or_pad:
144
+ # ==============================================================================
145
+ # None.
146
+
147
+ # Parameters for decoder/SelfAttention:
148
+ # ==============================================================================
149
+ decoder/SelfAttention.attention_func = None
150
+ decoder/SelfAttention.attention_kwargs = None
151
+ decoder/SelfAttention.combine_dims = True
152
+ decoder/SelfAttention.dropout_rate = %dropout_rate
153
+ decoder/SelfAttention.keep_query_heads_dims = False
154
+ decoder/SelfAttention.key_value_size = %d_kv
155
+ decoder/SelfAttention.num_heads = %num_heads
156
+ decoder/SelfAttention.num_memory_heads = 0
157
+ decoder/SelfAttention.relative_attention_num_buckets = 32
158
+ decoder/SelfAttention.relative_attention_type = 'bias_shared'
159
+ decoder/SelfAttention.shared_kv = False
160
+
161
+ # Parameters for encoder/SelfAttention:
162
+ # ==============================================================================
163
+ encoder/SelfAttention.attention_func = None
164
+ encoder/SelfAttention.attention_kwargs = None
165
+ encoder/SelfAttention.combine_dims = True
166
+ encoder/SelfAttention.dropout_rate = %dropout_rate
167
+ encoder/SelfAttention.keep_query_heads_dims = False
168
+ encoder/SelfAttention.key_value_size = %d_kv
169
+ encoder/SelfAttention.num_heads = %num_heads
170
+ encoder/SelfAttention.num_memory_heads = 0
171
+ encoder/SelfAttention.relative_attention_num_buckets = 32
172
+ encoder/SelfAttention.relative_attention_type = 'bias_shared'
173
+ encoder/SelfAttention.shared_kv = False
174
+
175
+ # Parameters for SentencePieceVocabulary:
176
+ # ==============================================================================
177
+ # None.
178
+
179
+ # Parameters for sentinel_id:
180
+ # ==============================================================================
181
+ sentinel_id.return_value = None
182
+
183
+ # Parameters for serialize_num_microbatches:
184
+ # ==============================================================================
185
+ serialize_num_microbatches.tokens_per_microbatch_per_replica = 8192
186
+
187
+ # Parameters for shift_targets:
188
+ # ==============================================================================
189
+ shift_targets.bos_id = 0
190
+ shift_targets.eos_id = 1
191
+
192
+ # Parameters for tpu_estimator_model_fn:
193
+ # ==============================================================================
194
+ tpu_estimator_model_fn.model_info_file = None
195
+ tpu_estimator_model_fn.outer_batch_size = 1
196
+ tpu_estimator_model_fn.tpu_summaries = False
197
+
198
+ # Parameters for tpu_mesh_shape:
199
+ # ==============================================================================
200
+ tpu_mesh_shape.ensemble_parallelism = None
201
+
202
+ # Parameters for decoder/Unitransformer:
203
+ # ==============================================================================
204
+ decoder/Unitransformer.d_model = %d_model
205
+ decoder/Unitransformer.ensemble = None
206
+ decoder/Unitransformer.input_full_attention = False
207
+ decoder/Unitransformer.label_smoothing = 0.0
208
+ decoder/Unitransformer.loss_denominator = 233472
209
+ decoder/Unitransformer.loss_fn = None
210
+ decoder/Unitransformer.loss_on_targets_only = False
211
+ decoder/Unitransformer.max_length = 512
212
+ decoder/Unitransformer.positional_embedding = False
213
+ decoder/Unitransformer.shared_embedding_and_softmax_weights = True
214
+ decoder/Unitransformer.sinusoid_positional_embedding = False
215
+ decoder/Unitransformer.token_dropout_rate = 0.0
216
+ decoder/Unitransformer.vocab_divisor = 128
217
+ decoder/Unitransformer.z_loss = 0.0001
218
+
219
+ # Parameters for encoder/Unitransformer:
220
+ # ==============================================================================
221
+ encoder/Unitransformer.d_model = %d_model
222
+ encoder/Unitransformer.ensemble = None
223
+ encoder/Unitransformer.input_full_attention = False
224
+ encoder/Unitransformer.label_smoothing = 0.0
225
+ encoder/Unitransformer.loss_denominator = None
226
+ encoder/Unitransformer.loss_fn = None
227
+ encoder/Unitransformer.loss_on_targets_only = False
228
+ encoder/Unitransformer.max_length = 512
229
+ encoder/Unitransformer.positional_embedding = False
230
+ encoder/Unitransformer.shared_embedding_and_softmax_weights = True
231
+ encoder/Unitransformer.sinusoid_positional_embedding = False
232
+ encoder/Unitransformer.token_dropout_rate = 0.0
233
+ encoder/Unitransformer.vocab_divisor = 128
234
+ encoder/Unitransformer.z_loss = 0.0001
235
+
236
+ # Parameters for VarianceScalingInitializer:
237
+ # ==============================================================================
238
+ VarianceScalingInitializer.distribution = 'normal'
239
+ VarianceScalingInitializer.mode = 'fan_in'
240
+ VarianceScalingInitializer.scale = 1.0
241
+
242
+ # Parameters for VocabEmbedding:
243
+ # ==============================================================================
244
+ # None.
245
+
246
+ # Parameters for Vocabulary:
247
+ # ==============================================================================
248
+ # None.
mesh_tensorflow_checkpoints/ptt5-small-t5-vocab/model.ckpt-1115020.data-00000-of-00002 ADDED
Binary file (8 Bytes). View file
 
mesh_tensorflow_checkpoints/ptt5-small-t5-vocab/model.ckpt-1115020.data-00001-of-00002 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:75a7fe8c3261574846c913f53c8be2f85077933b407f559b88aa98dd9b1c34d2
3
+ size 121752064
mesh_tensorflow_checkpoints/ptt5-small-t5-vocab/model.ckpt-1115020.index ADDED
Binary file (5.67 kB). View file
 
mesh_tensorflow_checkpoints/ptt5-small-t5-vocab/model.ckpt-1115020.meta ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44fab654a4b89b88e8a3dd33b0530b48f95e676e8d99722a31a4d148302f039c
3
+ size 10972622
mesh_tensorflow_checkpoints/ptt5-small-t5-vocab/operative_config.gin ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import mesh_tensorflow.optimize
2
+ import mesh_tensorflow.transformer.dataset
3
+ import mesh_tensorflow.transformer.learning_rate_schedules
4
+ import mesh_tensorflow.transformer.t2t_vocabulary
5
+ import mesh_tensorflow.transformer.transformer_layers
6
+ import mesh_tensorflow.transformer.utils
7
+ import t5.data.sentencepiece_vocabulary
8
+ import t5.models.mesh_transformer
9
+
10
+ # Macros:
11
+ # ==============================================================================
12
+ d_ff = 2048
13
+ d_kv = 64
14
+ d_model = 512
15
+ dropout_rate = 0.1
16
+ num_heads = 8
17
+ num_layers = 6
18
+
19
+ # Parameters for AdafactorOptimizer:
20
+ # ==============================================================================
21
+ AdafactorOptimizer.beta1 = 0.0
22
+ AdafactorOptimizer.clipping_threshold = 1.0
23
+ AdafactorOptimizer.decay_rate = None
24
+ AdafactorOptimizer.epsilon1 = 1e-30
25
+ AdafactorOptimizer.epsilon2 = 0.001
26
+ AdafactorOptimizer.factored = True
27
+ AdafactorOptimizer.min_dim_size_to_factor = 128
28
+ AdafactorOptimizer.multiply_by_parameter_scale = True
29
+
30
+ # Parameters for Bitransformer:
31
+ # ==============================================================================
32
+ Bitransformer.shared_embedding = True
33
+
34
+ # Parameters for denoise:
35
+ # ==============================================================================
36
+ # None.
37
+
38
+ # Parameters for decoder/DenseReluDense:
39
+ # ==============================================================================
40
+ decoder/DenseReluDense.activation = 'relu'
41
+ decoder/DenseReluDense.dropout_rate = %dropout_rate
42
+ decoder/DenseReluDense.hidden_size = %d_ff
43
+
44
+ # Parameters for encoder/DenseReluDense:
45
+ # ==============================================================================
46
+ encoder/DenseReluDense.activation = 'relu'
47
+ encoder/DenseReluDense.dropout_rate = %dropout_rate
48
+ encoder/DenseReluDense.hidden_size = %d_ff
49
+
50
+ # Parameters for decoder/EncDecAttention:
51
+ # ==============================================================================
52
+ # None.
53
+
54
+ # Parameters for get_variable_dtype:
55
+ # ==============================================================================
56
+ get_variable_dtype.activation_dtype = 'bfloat16'
57
+
58
+ # Parameters for get_vocab_embedding_cls:
59
+ # ==============================================================================
60
+ # None.
61
+
62
+ # Parameters for get_vocabulary:
63
+ # ==============================================================================
64
+ # None.
65
+
66
+ # Parameters for iid_noise_mask:
67
+ # ==============================================================================
68
+ # None.
69
+
70
+ # Parameters for decoder/LayerStack:
71
+ # ==============================================================================
72
+ decoder/LayerStack.dropout_rate = %dropout_rate
73
+ decoder/LayerStack.norm_epsilon = 1e-06
74
+ decoder/LayerStack.recompute_grads = False
75
+
76
+ # Parameters for encoder/LayerStack:
77
+ # ==============================================================================
78
+ encoder/LayerStack.dropout_rate = %dropout_rate
79
+ encoder/LayerStack.norm_epsilon = 1e-06
80
+ encoder/LayerStack.recompute_grads = False
81
+
82
+ # Parameters for make_bitransformer:
83
+ # ==============================================================================
84
+ make_bitransformer.decoder_name = 'decoder'
85
+ make_bitransformer.encoder_name = 'encoder'
86
+
87
+ # Parameters for decoder/make_layer_stack:
88
+ # ==============================================================================
89
+ decoder/make_layer_stack.block_scope = True
90
+ decoder/make_layer_stack.layers = \
91
+ [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
92
+ @mesh_tensorflow.transformer.transformer_layers.EncDecAttention,
93
+ @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
94
+ decoder/make_layer_stack.num_layers = %num_layers
95
+
96
+ # Parameters for encoder/make_layer_stack:
97
+ # ==============================================================================
98
+ encoder/make_layer_stack.block_scope = True
99
+ encoder/make_layer_stack.layers = \
100
+ [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
101
+ @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
102
+ encoder/make_layer_stack.num_layers = %num_layers
103
+
104
+ # Parameters for maybe_print_dataset:
105
+ # ==============================================================================
106
+ maybe_print_dataset.should_print = False
107
+
108
+ # Parameters for mesh_train_dataset_fn:
109
+ # ==============================================================================
110
+ mesh_train_dataset_fn.use_cached = False
111
+
112
+ # Parameters for MtfModel:
113
+ # ==============================================================================
114
+ MtfModel.autostack = True
115
+ MtfModel.ensemble_inputs = None
116
+ MtfModel.gcp_project = None
117
+ MtfModel.layout_rules = \
118
+ 'ensemble:ensemble,batch:batch,d_ff:model,heads:model,vocab:model,experts:batch'
119
+ MtfModel.mesh_devices = None
120
+ MtfModel.mesh_shape = None
121
+ MtfModel.model_type = 'bitransformer'
122
+ MtfModel.optimizer = None
123
+ MtfModel.predict_fn = None
124
+ MtfModel.tpu_job_name = None
125
+ MtfModel.tpu_zone = None
126
+ MtfModel.variable_filter = None
127
+
128
+ # Parameters for noise_token_to_sentinel:
129
+ # ==============================================================================
130
+ # None.
131
+
132
+ # Parameters for num_parallel_calls:
133
+ # ==============================================================================
134
+ num_parallel_calls.deterministic = False
135
+
136
+ # Parameters for pack_dataset:
137
+ # ==============================================================================
138
+ pack_dataset.use_custom_ops = False
139
+
140
+ # Parameters for pack_or_pad:
141
+ # ==============================================================================
142
+ # None.
143
+
144
+ # Parameters for decoder/SelfAttention:
145
+ # ==============================================================================
146
+ decoder/SelfAttention.attention_func = None
147
+ decoder/SelfAttention.attention_kwargs = None
148
+ decoder/SelfAttention.combine_dims = True
149
+ decoder/SelfAttention.dropout_rate = %dropout_rate
150
+ decoder/SelfAttention.keep_query_heads_dims = False
151
+ decoder/SelfAttention.key_value_size = %d_kv
152
+ decoder/SelfAttention.num_heads = %num_heads
153
+ decoder/SelfAttention.num_memory_heads = 0
154
+ decoder/SelfAttention.relative_attention_num_buckets = 32
155
+ decoder/SelfAttention.relative_attention_type = 'bias_shared'
156
+ decoder/SelfAttention.shared_kv = False
157
+
158
+ # Parameters for encoder/SelfAttention:
159
+ # ==============================================================================
160
+ encoder/SelfAttention.attention_func = None
161
+ encoder/SelfAttention.attention_kwargs = None
162
+ encoder/SelfAttention.combine_dims = True
163
+ encoder/SelfAttention.dropout_rate = %dropout_rate
164
+ encoder/SelfAttention.keep_query_heads_dims = False
165
+ encoder/SelfAttention.key_value_size = %d_kv
166
+ encoder/SelfAttention.num_heads = %num_heads
167
+ encoder/SelfAttention.num_memory_heads = 0
168
+ encoder/SelfAttention.relative_attention_num_buckets = 32
169
+ encoder/SelfAttention.relative_attention_type = 'bias_shared'
170
+ encoder/SelfAttention.shared_kv = False
171
+
172
+ # Parameters for SentencePieceVocabulary:
173
+ # ==============================================================================
174
+ # None.
175
+
176
+ # Parameters for sentinel_id:
177
+ # ==============================================================================
178
+ sentinel_id.return_value = None
179
+
180
+ # Parameters for serialize_num_microbatches:
181
+ # ==============================================================================
182
+ serialize_num_microbatches.tokens_per_microbatch_per_replica = 8192
183
+
184
+ # Parameters for shift_targets:
185
+ # ==============================================================================
186
+ shift_targets.bos_id = 0
187
+ shift_targets.eos_id = 1
188
+
189
+ # Parameters for tpu_estimator_model_fn:
190
+ # ==============================================================================
191
+ tpu_estimator_model_fn.model_info_file = None
192
+ tpu_estimator_model_fn.outer_batch_size = 1
193
+ tpu_estimator_model_fn.tpu_summaries = False
194
+
195
+ # Parameters for tpu_mesh_shape:
196
+ # ==============================================================================
197
+ tpu_mesh_shape.ensemble_parallelism = None
198
+
199
+ # Parameters for decoder/Unitransformer:
200
+ # ==============================================================================
201
+ decoder/Unitransformer.d_model = %d_model
202
+ decoder/Unitransformer.ensemble = None
203
+ decoder/Unitransformer.input_full_attention = False
204
+ decoder/Unitransformer.label_smoothing = 0.0
205
+ decoder/Unitransformer.loss_denominator = 233472
206
+ decoder/Unitransformer.loss_fn = None
207
+ decoder/Unitransformer.loss_on_targets_only = False
208
+ decoder/Unitransformer.max_length = 512
209
+ decoder/Unitransformer.positional_embedding = False
210
+ decoder/Unitransformer.shared_embedding_and_softmax_weights = True
211
+ decoder/Unitransformer.sinusoid_positional_embedding = False
212
+ decoder/Unitransformer.token_dropout_rate = 0.0
213
+ decoder/Unitransformer.vocab_divisor = 128
214
+ decoder/Unitransformer.z_loss = 0.0001
215
+
216
+ # Parameters for encoder/Unitransformer:
217
+ # ==============================================================================
218
+ encoder/Unitransformer.d_model = %d_model
219
+ encoder/Unitransformer.ensemble = None
220
+ encoder/Unitransformer.input_full_attention = False
221
+ encoder/Unitransformer.label_smoothing = 0.0
222
+ encoder/Unitransformer.loss_denominator = None
223
+ encoder/Unitransformer.loss_fn = None
224
+ encoder/Unitransformer.loss_on_targets_only = False
225
+ encoder/Unitransformer.max_length = 512
226
+ encoder/Unitransformer.positional_embedding = False
227
+ encoder/Unitransformer.shared_embedding_and_softmax_weights = True
228
+ encoder/Unitransformer.sinusoid_positional_embedding = False
229
+ encoder/Unitransformer.token_dropout_rate = 0.0
230
+ encoder/Unitransformer.vocab_divisor = 128
231
+ encoder/Unitransformer.z_loss = 0.0001
232
+
233
+ # Parameters for VarianceScalingInitializer:
234
+ # ==============================================================================
235
+ VarianceScalingInitializer.distribution = 'normal'
236
+ VarianceScalingInitializer.mode = 'fan_in'
237
+ VarianceScalingInitializer.scale = 1.0
238
+
239
+ # Parameters for VocabEmbedding:
240
+ # ==============================================================================
241
+ # None.
242
+
243
+ # Parameters for Vocabulary:
244
+ # ==============================================================================
245
+ # None.
vocabs/spm_32000_unigram/spm_32000_pt.model ADDED
Binary file (756 kB). View file
 
vocabs/spm_32000_unigram/spm_32000_pt.vocab ADDED
The diff for this file is too large to render. See raw diff