guillermoruiz committed
Commit: 9e5d21e
Parent(s): f70213f

Upload TFBilma

Files changed:
- config.json +2 -1
- configuration_bilma.py +14 -1
- modeling_bilma.py +70 -9
- tf_model.h5 +1 -1
config.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "
+  "add_head": null,
   "architectures": [
     "Bilma"
   ],
@@ -13,6 +13,7 @@
   "model_type": "bilma",
   "num_attention_heads": 4,
   "num_hidden_layers": 2,
+  "pooling": null,
   "seq_max_length": 280,
   "transformers_version": "4.30.2",
   "vocab_size": 29025,
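The two new null entries mirror the defaults added to BilmaConfig in the next file. A minimal sketch of how they surface, assuming configuration_bilma.py from this repo is importable:

    from configuration_bilma import BilmaConfig

    cfg = BilmaConfig(weights="US")   # defaults: include_top=True, add_head=None, pooling=None
    print(cfg.add_head, cfg.pooling)  # None None -> serialized as "add_head": null and "pooling": null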
configuration_bilma.py CHANGED
@@ -6,7 +6,9 @@ class BilmaConfig(PretrainedConfig):
     def __init__(
         self,
         weights="US",
-        include_top=True,
+        include_top = True,
+        add_head = None,
+        pooling = None,
         num_attention_heads: int = 4,
         num_hidden_layers: int = 2,
         seq_max_length: int = 280,
@@ -16,11 +18,20 @@ class BilmaConfig(PretrainedConfig):
         **kwargs,
     ):
         countries = ["US"]
+        poolings = ["mean", "cls", "max"]
         if weights not in countries:
             raise ValueError(f"`weights` must be one of {countries}, got {weights}.")
+        if add_head is not None and include_top == True:
+            raise ValueError(f"To add a head, 'include_top' must be False")
+        if pooling is not None and include_top == True:
+            raise ValueError(f"To specify a pooling, 'include_top' must be False")
+        if pooling is not None and pooling not in poolings:
+            raise ValueError(f"`pooling` must be one of {poolings}, got {pooling}.")
         if weights is not None:
             self.weights = weights
             self.include_top = include_top
+            self.add_head = add_head
+            self.pooling = pooling
             self.num_attention_heads = 4
             self.num_hidden_layers = 2
             self.seq_max_length = 280
@@ -32,6 +43,8 @@ class BilmaConfig(PretrainedConfig):
 
         self.weights = weights
         self.include_top = include_top
+        self.add_head = add_head
+        self.pooling = pooling
         self.num_attention_heads = num_attention_heads
         self.num_hidden_layers = num_hidden_layers
         self.seq_max_length = seq_max_length
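To illustrate the new options (not part of the commit itself), a minimal sketch of a configuration for a classification head on top of the pooled encoder output; reading add_head as a list of Dense layer sizes follows the bilma() changes in modeling_bilma.py below:

    from configuration_bilma import BilmaConfig

    # include_top must be False before add_head or pooling may be set
    cfg = BilmaConfig(weights="US",
                      include_top=False,
                      pooling="cls",      # one of "mean", "cls", "max"
                      add_head=[512, 3])  # one 512-unit ReLU layer, then a softmax over 3 classes

    # Invalid combinations now raise ValueError, e.g.:
    # BilmaConfig(weights="US", include_top=True, pooling="max")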
modeling_bilma.py CHANGED
@@ -1,4 +1,5 @@
-from transformers import TFPreTrainedModel, PreTrainedTokenizer
+from transformers import TFPreTrainedModel, PreTrainedTokenizer, BatchEncoding
+
 from tensorflow.keras.models import Model, load_model, Sequential
 from tensorflow.keras.layers import Layer, Dense, concatenate, Input, add, Dropout, LayerNormalization, MultiHeadAttention, Embedding
 import tensorflow as tf
@@ -38,6 +39,7 @@ class TFBilma(TFPreTrainedModel):
     def __init__(self, config):
         self.seq_max_length = config.seq_max_length
         self.include_top = config.include_top
+        self.add_head = config.add_head
         super().__init__(config)
 
         self.model = bilma(num_enc=config.num_hidden_layers,
@@ -47,7 +49,9 @@ class TFBilma(TFPreTrainedModel):
                            ff_dim=config.hidden_size,
                            vocab_size=config.vocab_size,
                            rate=config.hidden_dropout_prob,
-                           include_top = config.include_top)
+                           include_top = config.include_top,
+                           add_head = config.add_head,
+                           pooling = config.pooling)
 
     @property
     def dummy_inputs(self) -> Dict[str, tf.Tensor]:
@@ -70,13 +74,26 @@ class TFBilma(TFPreTrainedModel):
 
 
     def call(self, inputs):
-
+        if isinstance(inputs, Dict) or isinstance(inputs, BatchEncoding):
+            ins = tf.cast(inputs["input_ids"], tf.float32)
+        else:
+            ins = inputs
         if self.include_top:
             output = {"logits":self.model(ins)}
         else:
-
+            if self.add_head is None:
+                output = {"last_hidden_state":self.model(ins)}
+            else:
+                output = {"label":self.model(ins)}
         return output
 
+    def get_loss_function():
+        return loss_funtion()
+
+    def get_acc_function():
+        return accuracy_function()
+
+
 # copied from bilma_model.py
 # --------------------------
 
@@ -105,7 +122,40 @@ def accuracy_function(ignore_id=0):
         return tf.math.divide_no_nan(tf.reduce_sum(accuracies), tf.reduce_sum(mask))
     return acc_mlm
 
-def bilma(num_enc=6, embed_dim=300, max_length=50, num_heads=6, ff_dim=512, vocab_size=9739, rate=0.1, include_top=True):
+def mean_vectors(inputs, enc_vectors, max_length):
+    p = tf.where(inputs == 3)
+    pos = tf.transpose(p)[1]
+    C = tf.sequence_mask(pos, maxlen=max_length, dtype=tf.float32)
+    C = tf.reshape(C, (-1, max_length, 1))
+    S = tf.reduce_sum(enc_vectors * C, 1)
+    x = S / tf.expand_dims(tf.cast(pos, tf.float32), (1))
+    return x
+
+def mean_diff_vectors(inputs, enc_vectors, max_length):
+    p = tf.where(inputs == 3)
+    pos = tf.transpose(p)[1]
+    C = tf.sequence_mask(pos, maxlen=max_length, dtype=tf.float32)
+    C = tf.reshape(C, (-1, max_length, 1))
+    vecs = enc_vectors * C
+    S = tf.reduce_sum(vecs, 1)
+    mu = S / tf.expand_dims(tf.cast(pos, tf.float32), (1))
+    x = tf.reduce_sum(mu - vecs, 1) / tf.expand_dims(tf.cast(pos, tf.float32), (1))
+    return x
+
+def max_vectors(inputs, enc_vectors, max_length):
+    p = tf.where(inputs == 3)
+    pos = tf.transpose(p)[1]
+    C = tf.sequence_mask(pos, maxlen=max_length, dtype=tf.float32)
+    C = tf.reshape(C, (-1, max_length, 1))
+    x = tf.reduce_max(enc_vectors * C, 1)
+    return x
+
+def cls_vectors(inputs, enc_vectors, max_length):
+    x = tf.squeeze(enc_vectors[:, 0:1, :], axis=1)
+    return x
+
+
+def bilma(num_enc=6, embed_dim=300, max_length=50, num_heads=6, ff_dim=512, vocab_size=9739, rate=0.1, include_top=True, add_head=None, pooling=None):
     capt_inputs_ids = Input(shape=(max_length, ), name='input_ids')
     capt_embedding = Embedding(vocab_size, embed_dim, mask_zero=False, name="bilma/embedding")
     capt_inputs = capt_embedding(capt_inputs_ids)
@@ -115,9 +165,22 @@ def bilma(num_enc=6, embed_dim=300, max_length=50, num_heads=6, ff_dim=512, vocab_size=9739, rate=0.1, include_top=True):
     if include_top:
         fin_output = Dense(vocab_size, use_bias=True, name="bilma/dense_final")(enc_output)
     else:
-
+        x = enc_output
+        if pooling == "mean":
+            x = mean_vectors(capt_inputs_ids, x, max_length)
+        elif pooling == "cls":
+            x = cls_vectors(capt_inputs_ids, x, max_length)
+        elif pooling == "max":
+            x = max_vectors(capt_inputs_ids, x, max_length)
+
+        if add_head is None:
+            fin_output = x
+        else:
+            for i, m in enumerate(add_head[:-1]):
+                x = Dense(m, use_bias=True, activation="relu", name=f"bilma/dense_ex_{i}")(x)
+            fin_output = Dense(add_head[-1], use_bias=True, activation="softmax", name=f"bilma/dense_ex_final")(x)
 
-    caption_model = Model(inputs=capt_inputs_ids, outputs=
+    caption_model = Model(inputs=capt_inputs_ids, outputs=fin_output, name="bilma_model")
     return caption_model
 
 def load(model_file):
@@ -132,7 +195,6 @@ def load(model_file):
 #
 # Copied from transformer_text.py
 # -------------------------------
-
 class EncoderBlock(Layer):
     def __init__(self, layer_num, patch_dim, num_heads, ff_dim, rate=0.1, **kwargs):
         super(EncoderBlock, self).__init__(**kwargs)
@@ -214,7 +276,6 @@ class DecoderBlock(Layer):
 
         return final_output, attn_output1, attn_encoder
 
-
 class Encoder(Layer):
     def __init__(self, n, embed_dim, max_length, num_heads, ff_dim, rate=0.1, **kwargs):
         super(Encoder, self).__init__(**kwargs)
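The three pooling helpers all locate token id 3 in input_ids and reduce the encoder vectors at the positions before it; treating id 3 as the end-of-sequence marker of the BILMA vocabulary is an assumption, not something stated in the diff. A self-contained numeric check of the "mean" variant, reproducing the masking logic of mean_vectors above:

    import tensorflow as tf

    max_length = 6
    input_ids = tf.constant([[5, 9, 7, 3, 0, 0]])                        # id 3 sits at position 3
    enc_vectors = tf.reshape(tf.range(12, dtype=tf.float32), (1, 6, 2))  # toy encoder output

    pos = tf.transpose(tf.where(input_ids == 3))[1]                      # -> [3]
    C = tf.reshape(tf.sequence_mask(pos, maxlen=max_length, dtype=tf.float32),
                   (-1, max_length, 1))                                  # 1s before id 3, 0s from it onward
    mean_vec = tf.reduce_sum(enc_vectors * C, 1) / tf.expand_dims(tf.cast(pos, tf.float32), 1)
    print(mean_vec.numpy())                                              # [[2. 3.]], the mean of the first 3 vectors

When add_head is set, bilma() stacks ReLU Dense layers of the listed sizes on the pooled vector and ends with a softmax layer of size add_head[-1], and TFBilma.call then returns the result under the "label" key instead of "last_hidden_state".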
tf_model.h5 CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:c6fe3ecec3d1d988d47959917f0eae89ea06daa01a44c1a04d327d7af2a23034
 size 156875820