guillermoruiz committed
Commit 09a0bd6
1 Parent(s): 536e8bd

Upload TFBilma

Files changed (4)
  1. config.json +20 -0
  2. configuration_bilma.py +41 -0
  3. modeling_bilma.py +380 -0
  4. tf_model.h5 +3 -0
config.json ADDED
@@ -0,0 +1,20 @@
+ {
+   "_name_or_path": "bilma_MX",
+   "architectures": [
+     "Bilma"
+   ],
+   "auto_map": {
+     "AutoConfig": "configuration_bilma.BilmaConfig",
+     "TFAutoModel": "modeling_bilma.TFBilma"
+   },
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 512,
+   "include_top": true,
+   "model_type": "bilma",
+   "num_attention_heads": 4,
+   "num_hidden_layers": 2,
+   "seq_max_length": 280,
+   "transformers_version": "4.30.2",
+   "vocab_size": 29025,
+   "weights": "MX"
+ }
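
The auto_map above routes the generic Auto classes to the custom code uploaded in this commit, so the checkpoint is meant to be loaded with trust_remote_code=True. A minimal loading sketch, not part of the commit; the repo id "guillermoruiz/bilma_MX" is an assumption inferred from "_name_or_path" and the committer, so substitute the actual repo id:

# Loading sketch (assumed repo id).
from transformers import AutoConfig, TFAutoModel

config = AutoConfig.from_pretrained("guillermoruiz/bilma_MX", trust_remote_code=True)
model = TFAutoModel.from_pretrained("guillermoruiz/bilma_MX", trust_remote_code=True)
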
configuration_bilma.py ADDED
@@ -0,0 +1,41 @@
+ from transformers import PretrainedConfig
+
+ class BilmaConfig(PretrainedConfig):
+     model_type = "bilma"
+
+     def __init__(
+         self,
+         weights="MX",
+         include_top=True,
+         num_attention_heads: int = 4,
+         num_hidden_layers: int = 2,
+         seq_max_length: int = 280,
+         hidden_size: int = 512,
+         vocab_size: int = 29025,
+         hidden_dropout_prob: float = 0.1,
+         **kwargs,
+     ):
+         countries = ["MX"]
+         if weights not in countries:
+             raise ValueError(f"`weights` must be one of {countries}, got {weights}.")
+         if weights is not None:
+             self.weights = weights
+             self.include_top = include_top
+             self.num_attention_heads = 4
+             self.num_hidden_layers = 2
+             self.seq_max_length = 280
+             self.hidden_size = 512
+             self.vocab_size = 29025
+             self.hidden_dropout_prob = 0.1
+             super().__init__(**kwargs)
+             return
+
+         self.weights = weights
+         self.include_top = include_top
+         self.num_attention_heads = num_attention_heads
+         self.num_hidden_layers = num_hidden_layers
+         self.seq_max_length = seq_max_length
+         self.hidden_size = hidden_size
+         self.vocab_size = vocab_size
+         self.hidden_dropout_prob = hidden_dropout_prob
+         super().__init__(**kwargs)
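
BilmaConfig exposes architecture keywords, but when weights="MX" (the only accepted value) it pins the hyperparameters to the published checkpoint: 2 hidden layers, 4 attention heads, hidden size 512, maximum sequence length 280, and a 29025-entry vocabulary. A small inspection sketch, assuming configuration_bilma.py is importable from the working directory:

# Inspection sketch, not part of this commit.
from configuration_bilma import BilmaConfig

config = BilmaConfig(weights="MX", include_top=True)
print(config.num_hidden_layers, config.num_attention_heads, config.hidden_size)
# 2 4 512 -- fixed for the "MX" weights regardless of the keyword arguments passed
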
modeling_bilma.py ADDED
@@ -0,0 +1,380 @@
+ from transformers import TFPreTrainedModel, PreTrainedTokenizer
+ from tensorflow.keras.models import Model, load_model, Sequential
+ from tensorflow.keras.layers import Layer, Dense, concatenate, Input, add, Dropout, LayerNormalization, MultiHeadAttention, Embedding
+ import tensorflow as tf
+ import numpy as np
+
+ from typing import Dict
+
+ import re
+ import unicodedata
+
+ from configuration_bilma import BilmaConfig
+
+ # copied from preprocessing.py
+ BLANK = ' '
+
+ RE_OPS = re.I | re.M | re.S
+ RE_USR = re.compile(r"""@\S+""", RE_OPS)
+ RE_TAG = re.compile(r"""#\S+""", RE_OPS)
+ RE_URL = re.compile(r"""(http|ftp|https)://\S+""", RE_OPS)
+ RE_NUM = re.compile(r"""[-+]?\d+\.?\d*""", RE_OPS)
+
+ SYMBOLS_ = "()[]¿?¡!{}~<>|"
+ SYMBOLS = set(";:,.@\\-\"/" + SYMBOLS_)
+
+
+
+ # ------------------
+ # Class declaration
+ # ------------------
+
+
+ class TFBilma(TFPreTrainedModel):
+     config_class = BilmaConfig
+     main_input_name = "input_ids"
+     #base_model_prefix = "bilma"
+
+     def __init__(self, config):
+         self.seq_max_length = config.seq_max_length
+         self.include_top = config.include_top
+         super().__init__(config)
+
+         self.model = bilma(num_enc=config.num_hidden_layers,
+                            embed_dim=config.hidden_size,
+                            max_length=config.seq_max_length,
+                            num_heads=config.num_attention_heads,
+                            ff_dim=config.hidden_size,
+                            vocab_size=config.vocab_size,
+                            rate=config.hidden_dropout_prob,
+                            include_top=config.include_top)
+
+     @property
+     def dummy_inputs(self) -> Dict[str, tf.Tensor]:
+
+         dummies = {}
+         for key, spec in self.input_signature.items():
+             dummy_shape = [dim if dim is not None else 2 for dim in spec.shape]
+             if spec.shape[0] is None:
+                 dummy_shape[0] = 1
+             dummies[key] = tf.ones(shape=dummy_shape, dtype=spec.dtype)
+
+
+         return dummies
+
+     @property
+     def input_signature(self) -> Dict[str, tf.TensorSpec]:
+         sig = {}
+         sig["input_ids"] = tf.TensorSpec([None, self.seq_max_length], tf.int32, name="input_ids")
+         return sig
+
+
+     def call(self, inputs):
+         ins = tf.cast(inputs["input_ids"], tf.float32)
+         if self.include_top:
+             output = {"logits": self.model(ins)}
+         else:
+             output = {"last_hidden_state": self.model(ins)}
+         return output
+
+ # copied from bilma_model.py
+ # --------------------------
+
+ def loss_function(ignore_id=0):
+     loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
+     def loss(real, pred):
+         mask = tf.math.logical_not(tf.math.equal(real, ignore_id))
+         loss_ = loss_object(real, pred)
+         mask = tf.cast(mask, dtype=loss_.dtype)
+         loss_ *= mask
+         sum_ = tf.reduce_sum(mask, axis=1)
+
+         loss_ = tf.math.divide_no_nan(tf.reduce_sum(loss_, axis=1), sum_)
+         return loss_
+     return loss
+
+ def accuracy_function(ignore_id=0):
+     def acc_mlm(real, pred):
+         accuracies = tf.equal(tf.cast(real, tf.int64), tf.argmax(pred, axis=2))
+
+         mask = tf.math.logical_not(tf.math.equal(real, ignore_id))
+         accuracies = tf.math.logical_and(mask, accuracies)
+
+         accuracies = tf.cast(accuracies, dtype=tf.float32)
+         mask = tf.cast(mask, dtype=tf.float32)
+         return tf.math.divide_no_nan(tf.reduce_sum(accuracies), tf.reduce_sum(mask))
+     return acc_mlm
+
+ def bilma(num_enc=6, embed_dim=300, max_length=50, num_heads=6, ff_dim=512, vocab_size=9739, rate=0.1, include_top=True):
+     capt_inputs_ids = Input(shape=(max_length, ), name='input_ids')
+     capt_embedding = Embedding(vocab_size, embed_dim, mask_zero=False, name="bilma/embedding")
+     capt_inputs = capt_embedding(capt_inputs_ids)
+
+     enc = Encoder(num_enc, embed_dim, max_length, num_heads, ff_dim, rate=rate, name="bilma/encoder")
+     enc_output = enc(capt_inputs)
+     if include_top:
+         fin_output = Dense(vocab_size, use_bias=True, name="bilma/dense_final")(enc_output)
+     else:
+         fin_output = enc_output
+
+     caption_model = Model(inputs=capt_inputs_ids, outputs=[fin_output], name="bilma_model")
+     return caption_model
+
+ def load(model_file):
+     custom_objects = {"EncoderBlock": EncoderBlock,
+                       "Encoder": Encoder,
+                       "loss": loss_function(),
+                       "acc_mlm": accuracy_function(),
+                       }
+     return load_model(model_file, custom_objects=custom_objects)
+
+
+ #
+ # Copied from transformer_text.py
+ # -------------------------------
+
+ class EncoderBlock(Layer):
+     def __init__(self, layer_num, patch_dim, num_heads, ff_dim, rate=0.1, **kwargs):
+         super(EncoderBlock, self).__init__(**kwargs)
+         self.ln = layer_num
+         self.p_d = patch_dim
+         self.n_h = num_heads
+         self.f_d = ff_dim
+         self.rate = rate
+
+         self.att = MultiHeadAttention(num_heads=num_heads, key_dim=patch_dim, name=f"bilma/MHA_{layer_num}")
+         self.ffn = Sequential(
+             #[Conv1D(ff_dim, kernel_size=1, activation=tf.nn.gelu),
+             # Conv1D(patch_dim, kernel_size=1),]
+             [Dense(ff_dim, activation=tf.nn.gelu, name=f"bilma/dense1_{layer_num}"),
+              Dense(patch_dim, name=f"bilma/dense2_{layer_num}")]
+         )
+         #self.layernorm0 = LayerNormalization(epsilon=1e-6)
+         self.layernorm1 = LayerNormalization(epsilon=1e-6, name=f"ln1_{layer_num}")
+         self.layernorm2 = LayerNormalization(epsilon=1e-6, name=f"ln2_{layer_num}")
+         self.dropout1 = Dropout(rate)
+         self.dropout2 = Dropout(rate)
+
+     def get_config(self):
+         config = super(EncoderBlock, self).get_config()
+         config.update({"layer_num": self.ln, "patch_dim": self.p_d, "num_heads": self.n_h, "ff_dim": self.f_d, "rate": self.rate})
+         return config
+
+     def call(self, inputs, training=False):
+         #inputs = self.layernorm0(inputs)
+         attn_output = self.att(inputs, inputs)
+         attn_output = self.dropout1(attn_output, training=training)
+         out1 = self.layernorm1(add([inputs, attn_output]))
+         ffn_output = self.ffn(out1)
+         ffn_output = self.dropout2(ffn_output, training=training)
+         return self.layernorm2(add([out1, ffn_output]))
+
+
+ class DecoderBlock(Layer):
+     def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
+         super(DecoderBlock, self).__init__(**kwargs)
+         self.e_d = embed_dim
+         self.n_h = num_heads
+         self.f_d = ff_dim
+         self.rate = rate
+
+         self.att1 = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
+         self.att2 = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
+         self.ffn = Sequential(
+             #[Conv1D(ff_dim, kernel_size=1, activation=tf.nn.gelu),
+             # Conv1D(embed_dim, kernel_size=1),]
+             [Dense(ff_dim, activation=tf.nn.gelu),
+              Dense(embed_dim),]
+         )
+         self.layernorm1 = LayerNormalization(epsilon=1e-6)
+         self.layernorm2 = LayerNormalization(epsilon=1e-6)
+         self.dropout1 = Dropout(rate)
+         self.dropout2 = Dropout(rate)
+         self.dropout3 = Dropout(rate)
+
+     def get_config(self):
+         config = super(DecoderBlock, self).get_config()
+         config.update({"embed_dim": self.e_d, "num_heads": self.n_h, "ff_dim": self.f_d, "rate": self.rate})
+         return config
+
+     def call(self, inputs, encoder_output, look_ahead_mask, padding_mask, training=None):
+         y, attn_output1 = self.att1(inputs, inputs, attention_mask=look_ahead_mask, return_attention_scores=True)
+         y = self.dropout1(y, training=training)
+         y = add([inputs, y])
+         out1 = self.layernorm1(y)
+
+         y, attn_encoder = self.att2(out1, encoder_output, attention_mask=padding_mask, return_attention_scores=True)
+         y = self.dropout2(y, training=training)
+         y = add([out1, y])
+         out2 = self.layernorm1(y)
+
+         ffn_output = self.ffn(out2)
+         ffn_output = self.dropout3(ffn_output, training=training)
+         final_output = self.layernorm2(out2 + ffn_output)
+
+         return final_output, attn_output1, attn_encoder
+
+
+ class Encoder(Layer):
+     def __init__(self, n, embed_dim, max_length, num_heads, ff_dim, rate=0.1, **kwargs):
+         super(Encoder, self).__init__(**kwargs)
+         self.n = n
+         self.embed_dim = embed_dim
+         self.max_length = max_length
+         self.n_h = num_heads
+         self.f_d = ff_dim
+         self.rate = rate
+         self._layers = [EncoderBlock(i, embed_dim, num_heads, ff_dim, rate=0.1, name=f"enc_block_{i}") for i in range(n)]
+         self.pe = positional_encoding(self.max_length, self.embed_dim)
+
+     def get_config(self):
+         config = super(Encoder, self).get_config()
+         config.update({"n": self.n, "embed_dim": self.embed_dim, "max_length": self.max_length, "num_heads": self.n_h, "ff_dim": self.f_d, "rate": self.rate})
+         return config
+
+     def call(self, x, training=False):
+         x *= tf.math.sqrt(tf.cast(self.embed_dim, tf.float32))
+         x = x + self.pe[:, :tf.shape(x)[1], :]
+         for layer in self._layers:
+             x = layer(x, training)
+         return x
+
+
+ class Decoder(Layer):
+     def __init__(self, n, embed_dim, max_length, num_heads, ff_dim, rate=0.1, **kwargs):
+         super(Decoder, self).__init__(**kwargs)
+         self.n = n
+         self.embed_dim = embed_dim
+         self.max_length = max_length
+         self.n_h = num_heads
+         self.f_d = ff_dim
+         self.rate = rate
+         self._layers = [DecoderBlock(embed_dim, num_heads, ff_dim, rate=0.1) for _ in range(n)]
+         self.pe = positional_encoding(self.max_length, self.embed_dim)
+
+     def get_config(self):
+         config = super(Decoder, self).get_config()
+         config.update({"n": self.n, "embed_dim": self.embed_dim, "max_length": self.max_length, "num_heads": self.n_h, "ff_dim": self.f_d, "rate": self.rate})
+         return config
+
+     def call(self, x, encoder_output, look_ahead_mask, padding_mask, training):
+         x *= tf.math.sqrt(tf.cast(self.embed_dim, tf.float32))
+         x = x + self.pe[:, :tf.shape(x)[1], :]
+
+         for layer in self._layers:
+             x, self_att, enc_att = layer(x, encoder_output, look_ahead_mask, padding_mask, training)
+
+         return x
+
+
+
+
+ # =========================================
+ # M A S K S
+ # =========================================
+ def create_padding_mask(seq):
+     """
+     For self-attention
+     seq shape (bs, max_length, emb_dim)
+     output shape (bs, max_length, max_length)
+     """
+     mask = tf.cast(tf.not_equal(seq, 0), tf.bool)
+     mask = tf.reduce_any(mask, 2)
+     mask = tf.repeat(mask, seq.shape[1], 0)
+     mask = tf.reshape(mask, (-1, seq.shape[1], seq.shape[1]))
+     return tf.cast(mask, tf.float32)
+
+
+ def create_cross_padding_mask(seq, target_seq):
+     """
+     For cross-attention
+     seq shape (bs, k, image_features)
+     target_seq shape (bs, max_length, emb_dim)
+     output shape (bs, max_length, k)
+     """
+     mask = tf.cast(tf.not_equal(target_seq, 0), tf.bool)
+     mask = tf.reduce_any(mask, 2)
+     mask = tf.repeat(mask, seq.shape[1], 0)
+     mask = tf.reshape(mask, (-1, tf.shape(seq)[1], tf.shape(target_seq)[1]))
+     mask = tf.transpose(mask, [0, 2, 1])
+     return mask
+
+
+ def create_look_ahead_mask(seq):
+     """
+     seq shape (bs, max_length, emb_dim)
+     output shape (bs, max_length, max_length) with ones on the diagonal and below.
+     """
+     size = seq.shape[1]
+     mask = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
+     mask = tf.expand_dims(mask, 0)
+     mask = tf.repeat(mask, tf.shape(seq)[0], 0)
+     return mask
+
+
+ def create_masks(seq, target_seq):
+     decoder_mask = create_padding_mask(target_seq)
+     decoder_mask *= create_look_ahead_mask(target_seq)
+     cross_att_mask = create_cross_padding_mask(seq, target_seq)
+     return decoder_mask, cross_att_mask
+
+
+ def create_masks_looking_ahead(seq, target_seq):
+     decoder_mask = create_padding_mask(target_seq)
+     cross_att_mask = create_cross_padding_mask(seq, target_seq)
+     return decoder_mask, cross_att_mask
+
+ # =========================================
+ # P O S I T I O N A L   E N C O D I N G
+ # =========================================
+ def get_angles(pos, i, d_model):
+     angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model))
+     return pos * angle_rates
+
+ @tf.autograph.experimental.do_not_convert
+ def positional_encoding(position, d_model):
+     angle_rads = get_angles(np.arange(position)[:, np.newaxis],
+                             np.arange(d_model)[np.newaxis, :],
+                             d_model)
+
+     # apply sin to even indices in the array; 2i
+     angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
+
+     # apply cos to odd indices in the array; 2i+1
+     angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
+
+     pos_encoding = angle_rads[np.newaxis, ...]
+
+     return tf.cast(pos_encoding, dtype=tf.float32)
+
+ class PatchEncoder(Layer):
+     def __init__(self, num_patches, projection_dim, **kwargs):
+         super(PatchEncoder, self).__init__(**kwargs)
+         self.num_patches = num_patches
+         self.projection_dim = projection_dim
+         self.projection = Dense(units=projection_dim)
+         self.position_embedding = Embedding(
+             input_dim=num_patches, output_dim=projection_dim
+         )
+
+     def get_config(self):
+         config = super(PatchEncoder, self).get_config()
+         config.update({"num_patches": self.num_patches, "projection_dim": self.projection_dim})
+         return config
+
+     def call(self, patch):
+         positions = tf.range(start=0, limit=self.num_patches, delta=1)
+         encoded = self.projection(patch) + self.position_embedding(positions)
+         return encoded
+
+
+
+
+
+
+
+
+
+
+
+
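
TFBilma wraps the plain Keras graph built by bilma(): token ids are embedded, passed through the stacked EncoderBlock layers with sinusoidal positional encodings, and, when include_top is true, projected back onto the vocabulary. A standalone, exploratory sketch of that builder (not part of the commit), using the same hyperparameters as config.json; it assumes modeling_bilma.py and configuration_bilma.py sit in the working directory and a TensorFlow version compatible with transformers 4.30:

# Exploratory sketch, not part of this commit.
import tensorflow as tf
from modeling_bilma import bilma

net = bilma(num_enc=2, embed_dim=512, max_length=280, num_heads=4,
            ff_dim=512, vocab_size=29025, rate=0.1, include_top=True)
dummy_ids = tf.zeros((1, 280))   # TFBilma.call likewise casts input_ids to float32 before the forward pass
print(net(dummy_ids).shape)      # (1, 280, 29025): per-position vocabulary logits
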
tf_model.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bbfa589e471d9015d5ca64d2d212afa28da612a2ff8f2d93560fca1b03167afa
+ size 156875820