File size: 16,282 Bytes

from transformers import TFPreTrainedModel, PreTrainedTokenizer, BatchEncoding

from tensorflow.keras.models import Model, load_model, Sequential
from tensorflow.keras.layers import Layer, Dense, concatenate, Input, add, Dropout, LayerNormalization, MultiHeadAttention, Embedding
import tensorflow as tf
import numpy as np

from typing import Dict

import re
import unicodedata

from .configuration_bilma import BilmaConfig

# copied from preprocessing.py
BLANK = ' '

RE_OPS = re.I | re.M | re.S
RE_USR = re.compile(r"""@\S+""", RE_OPS)
RE_TAG = re.compile(r"""#\S+""", RE_OPS)
RE_URL = re.compile(r"""(http|ftp|https)://\S+""", RE_OPS)
RE_NUM = re.compile(r"""[-+]?\d+\.?\d*""", RE_OPS)

SYMBOLS_ = "()[]¿?¡!{}~<>|"
SYMBOLS = set(";:,.@\\-\"/" + SYMBOLS_)



# ------------------
# Class declaration
# ------------------


class TFBilma(TFPreTrainedModel):
    config_class = BilmaConfig
    main_input_name = "input_ids"
    #base_model_prefix = "bilma"

    def __init__(self, config):
        self.seq_max_length = config.seq_max_length
        self.include_top = config.include_top
        self.add_head = config.add_head
        super().__init__(config)

        self.model = bilma(num_enc=config.num_hidden_layers,
                           embed_dim=config.hidden_size, 
                           max_length=config.seq_max_length,
                           num_heads=config.num_attention_heads,
                           ff_dim=config.hidden_size,
                           vocab_size=config.vocab_size,
                           rate=config.hidden_dropout_prob,
                           include_top = config.include_top,
                           add_head = config.add_head,
                           pooling = config.pooling)
            
    @property
    def dummy_inputs(self) -> Dict[str, tf.Tensor]:
    
        dummies = {}
        for key, spec in self.input_signature.items():
            dummy_shape = [dim if dim is not None else 2 for dim in spec.shape]
            if spec.shape[0] is None:
                dummy_shape[0] = 1
            dummies[key] = tf.ones(shape=dummy_shape, dtype=spec.dtype)
        
        
        return dummies
    
    @property
    def input_signature(self) -> Dict[str, tf.TensorSpec]:
        sig = {}
        sig["input_ids"] = tf.TensorSpec([None, self.seq_max_length], tf.int32, name="input_ids")
        return sig
    
    
    def call(self, inputs):        
        if isinstance(inputs, Dict) or isinstance(inputs, BatchEncoding):
            ins = tf.cast(inputs["input_ids"], tf.float32)
        else:
            ins = inputs
        if self.include_top:
            output = {"logits":self.model(ins)}
        else:
            if self.add_head is None:
                output = {"last_hidden_state":self.model(ins)}
            else:
                output = {"label":self.model(ins)}
        return output
    
    def get_loss_function():
        return loss_funtion()
    
    def get_acc_function():
        return accuracy_function()
    
    
# copied from bilma_model.py
# --------------------------

def loss_function(ignore_id=0):
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
    def loss(real, pred):
        mask = tf.math.logical_not(tf.math.equal(real, ignore_id))
        loss_ = loss_object(real, pred)
        mask = tf.cast(mask, dtype=loss_.dtype)
        loss_ *= mask
        sum_ = tf.reduce_sum(mask,axis=1)
        
        loss_ = tf.math.divide_no_nan(tf.reduce_sum(loss_, axis=1), sum_)
        return loss_
    return loss

def accuracy_function(ignore_id=0):
    def acc_mlm(real, pred):
        accuracies = tf.equal(tf.cast(real, tf.int64), tf.argmax(pred, axis=2))

        mask = tf.math.logical_not(tf.math.equal(real, ignore_id))
        accuracies = tf.math.logical_and(mask, accuracies)

        accuracies = tf.cast(accuracies, dtype=tf.float32)
        mask = tf.cast(mask, dtype=tf.float32)
        return tf.math.divide_no_nan(tf.reduce_sum(accuracies), tf.reduce_sum(mask))
    return acc_mlm

def mean_vectors(inputs, enc_vectors, max_length):
    p = tf.where(inputs == 3)
    pos = tf.transpose(p)[1]
    C = tf.sequence_mask(pos, maxlen=max_length, dtype=tf.float32)
    C = tf.reshape(C, (-1, max_length, 1))
    S = tf.reduce_sum(enc_vectors * C, 1)
    x = S / tf.expand_dims(tf.cast(pos, tf.float32), (1))
    return x

def mean_diff_vectors(inputs, enc_vectors, max_length):
    p = tf.where(inputs == 3)
    pos = tf.transpose(p)[1]
    C = tf.sequence_mask(pos, maxlen=max_length, dtype=tf.float32)
    C = tf.reshape(C, (-1, max_length, 1))
    vecs = enc_vectors * C
    S = tf.reduce_sum(vecs, 1)
    mu = S / tf.expand_dims(tf.cast(pos, tf.float32), (1))
    x = tf.reduce_sum(mu - vecs, 1) / tf.expand_dims(tf.cast(pos, tf.float32), (1))
    return x

def max_vectors(inputs, enc_vectors, max_length):
    p = tf.where(inputs == 3)
    pos = tf.transpose(p)[1]
    C = tf.sequence_mask(pos, maxlen=max_length, dtype=tf.float32)
    C = tf.reshape(C, (-1, max_length, 1))
    x = tf.reduce_max(enc_vectors * C, 1)
    return x

def cls_vectors(inputs, enc_vectors, max_length):
    x = tf.squeeze(enc_vectors[:, 0:1, :], axis=1)
    return x


def bilma(num_enc=6, embed_dim=300, max_length=50, num_heads=6, ff_dim=512, vocab_size=9739, rate=0.1, include_top=True, add_head=None, pooling=None):
    capt_inputs_ids = Input(shape=(max_length, ), name='input_ids')
    capt_embedding = Embedding(vocab_size, embed_dim, mask_zero=False, name="bilma/embedding")
    capt_inputs = capt_embedding(capt_inputs_ids)
    
    enc = Encoder(num_enc, embed_dim, max_length, num_heads, ff_dim, rate=rate, name="bilma/encoder")
    enc_output = enc(capt_inputs)
    if include_top:
        fin_output = Dense(vocab_size, use_bias=True, name="bilma/dense_final")(enc_output)
    else:
        x = enc_output
        if pooling == "mean":
            x = mean_vectors(capt_inputs_ids, x, max_length)
        elif pooling == "cls":                
            x = cls_vectors(capt_inputs_ids, x, max_length)
        elif pooling == "max":
            x = max_vectors(capt_inputs_ids, x, max_length)
        
        if add_head is None:
            fin_output = x
        else:            
            for i, m in enumerate(add_head[:-1]):
                x = Dense(m, use_bias=True, activation="relu", name=f"bilma/dense_ex_{i}")(x)
            fin_output = Dense(add_head[-1], use_bias=True, activation="softmax", name=f"bilma/dense_ex_final")(x)
    
    caption_model = Model(inputs=capt_inputs_ids, outputs=fin_output, name="bilma_model")
    return caption_model

def load(model_file):
    custom_objects={"EncoderBlock": EncoderBlock, 
                    "Encoder": Encoder,
                    "loss": loss_function(),
                    "acc_mlm":accuracy_function(),
                   }
    return load_model(model_file, custom_objects=custom_objects)


# 
# Copied from transformer_text.py   
# -------------------------------
class EncoderBlock(Layer):
    def __init__(self, layer_num, patch_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super(EncoderBlock, self).__init__(**kwargs)
        self.ln = layer_num
        self.p_d = patch_dim
        self.n_h = num_heads
        self.f_d = ff_dim
        self.rate = rate
        
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=patch_dim, name=f"bilma/MHA_{layer_num}")
        self.ffn = Sequential(
            #[Conv1D(ff_dim, kernel_size=1, activation=tf.nn.gelu), 
            # Conv1D(patch_dim, kernel_size=1),]
            [Dense(ff_dim, activation=tf.nn.gelu, name=f"bilma/dense1_{layer_num}"), 
             Dense(patch_dim, name=f"bilma/dense2_{layer_num}")] 
        )
        #self.layernorm0 = LayerNormalization(epsilon=1e-6)
        self.layernorm1 = LayerNormalization(epsilon=1e-6, name=f"ln1_{layer_num}")
        self.layernorm2 = LayerNormalization(epsilon=1e-6, name=f"ln2_{layer_num}")
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)
        
    def get_config(self):
        config = super(EncoderBlock, self).get_config()
        config.update({"layer_num":self.ln, "patch_dim":self.p_d, "num_heads":self.n_h, "ff_dim":self.f_d, "rate":self.rate})
        return config

    def call(self, inputs, training=False):
        #inputs = self.layernorm0(inputs)
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(add([inputs, attn_output]))
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(add([out1, ffn_output]))
    

class DecoderBlock(Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super(DecoderBlock, self).__init__(**kwargs)
        self.e_d = embed_dim
        self.n_h = num_heads
        self.f_d = ff_dim
        self.rate = rate
        
        self.att1 = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.att2 = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = Sequential(
            #[Conv1D(ff_dim, kernel_size=1, activation=tf.nn.gelu), 
            # Conv1D(embed_dim, kernel_size=1),]
            [Dense(ff_dim, activation=tf.nn.gelu), 
             Dense(embed_dim),]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)
        self.dropout3 = Dropout(rate)
        
    def get_config(self):
        config = super(DecoderBlock, self).get_config()
        config.update({"embed_dim":self.e_d, "num_heads":self.n_h, "ff_dim":self.f_d, "rate":self.rate})
        return config

    def call(self, inputs, encoder_output, look_ahead_mask, padding_mask, training=None):
        y, attn_output1 = self.att1(inputs, inputs, attention_mask=look_ahead_mask, return_attention_scores=True)
        y = self.dropout1(y, training=training)
        y = add([inputs, y])                
        out1 = self.layernorm1(y)
        
        y, attn_encoder = self.att2(out1, encoder_output, attention_mask=padding_mask, return_attention_scores=True)
        y = self.dropout2(y, training=training)
        y = add([out1, y])                
        out2 = self.layernorm1(y)
        
        ffn_output = self.ffn(out2)
        ffn_output = self.dropout3(ffn_output, training=training)
        final_output =  self.layernorm2(out2 + ffn_output)
        
        return final_output, attn_output1, attn_encoder

class Encoder(Layer):
    def __init__(self, n, embed_dim, max_length, num_heads, ff_dim, rate=0.1, **kwargs):
        super(Encoder, self).__init__(**kwargs)
        self.n = n        
        self.embed_dim = embed_dim
        self.max_length = max_length
        self.n_h = num_heads
        self.f_d = ff_dim
        self.rate = rate
        self._layers = [EncoderBlock(i, embed_dim, num_heads, ff_dim, rate=0.1, name=f"enc_block_{i}") for i in range(n)]
        self.pe = positional_encoding(self.max_length, self.embed_dim)
        
    def get_config(self):
        config = super(Encoder, self).get_config()
        config.update({"n": self.n, "embed_dim":self.embed_dim, "max_length": self.max_length, "num_heads":self.n_h, "ff_dim":self.f_d, "rate":self.rate})
        return config
    
    def call(self, x, training=False):
        x *= tf.math.sqrt(tf.cast(self.embed_dim, tf.float32))
        x = x + self.pe[:, :tf.shape(x)[1], :]
        for layer in self._layers:
            x = layer(x, training)
        return x

    
class Decoder(Layer):
    def __init__(self, n, embed_dim, max_length, num_heads, ff_dim, rate=0.1, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.n = n
        self.embed_dim = embed_dim
        self.max_length = max_length
        self.n_h = num_heads
        self.f_d = ff_dim
        self.rate = rate
        self._layers = [DecoderBlock(embed_dim, num_heads, ff_dim, rate=0.1) for _ in range(n)]
        self.pe = positional_encoding(self.max_length, self.embed_dim)
    
    def get_config(self):
        config = super(Decoder, self).get_config()
        config.update({"n": self.n, "embed_dim":self.embed_dim, "max_length": self.max_length, "num_heads":self.n_h, "ff_dim":self.f_d, "rate":self.rate})
        return config
    
    def call(self, x, encoder_output, look_ahead_mask, padding_mask, training):      
        x *= tf.math.sqrt(tf.cast(self.embed_dim, tf.float32))
        x = x + self.pe[:, :tf.shape(x)[1], :]
        
        for layer in self._layers:
            x, self_att, enc_att = layer(x, encoder_output, look_ahead_mask, padding_mask, training)

        return x




# =========================================
#   M A S K S 
# =========================================
def create_padding_mask(seq):
    """
    For self-attention
    seq shape(bs, max_length, emb_dim)
    output shape (bs, max_length, max_length)
    """
    mask = tf.cast(tf.not_equal(seq, 0), tf.bool)
    mask = tf.reduce_any(mask, 2)
    mask = tf.repeat(mask, seq.shape[1], 0)
    mask = tf.reshape(mask, (-1,seq.shape[1], seq.shape[1]))
    return tf.cast(mask, tf.float32)


def create_cross_padding_mask(seq, target_seq):
    """
    For cross-attention
    seq shape(bs, k, image_features)
    target_seq(bs, max_length, emb_dim)
    output shape (bs, max_length, k)
    """
    mask = tf.cast(tf.not_equal(target_seq, 0), tf.bool)
    mask = tf.reduce_any(mask, 2)
    mask = tf.repeat(mask, seq.shape[1], 0)
    mask = tf.reshape(mask, (-1, tf.shape(seq)[1], tf.shape(target_seq)[1]))
    mask = tf.transpose(mask, [0, 2, 1])
    return mask


def create_look_ahead_mask(seq):
    """
    seq shape(bs, max_length, emb_dim)
    output 2D matrix of shape (bs, max_length, max_length) with ones on the diagonal and below.
    """
    size = seq.shape[1]
    mask = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    mask = tf.expand_dims(mask, 0)
    mask = tf.repeat(mask, tf.shape(seq)[0], 0)
    return mask


def create_masks(seq, target_seq):
    decoder_mask = create_padding_mask(target_seq)
    decoder_mask *= create_look_ahead_mask(target_seq)
    cross_att_mask = create_cross_padding_mask(seq, target_seq)
    return decoder_mask, cross_att_mask
        
    
def create_masks_looking_ahead(seq, target_seq):
    decoder_mask = create_padding_mask(target_seq)
    cross_att_mask = create_cross_padding_mask(seq, target_seq)
    return decoder_mask, cross_att_mask
    
# =========================================
#   P O S I T I O N A L   E N C O D I N G
# =========================================
def get_angles(pos, i, d_model):
    angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model))
    return pos * angle_rates

@tf.autograph.experimental.do_not_convert
def positional_encoding(position, d_model):
    angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                          np.arange(d_model)[np.newaxis, :],
                          d_model)

    # apply sin to even indices in the array; 2i
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])

    # apply cos to odd indices in the array; 2i+1
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

    pos_encoding = angle_rads[np.newaxis, ...]

    return tf.cast(pos_encoding, dtype=tf.float32)

class PatchEncoder(Layer):
    def __init__(self, num_patches, projection_dim, **kwargs):
        super(PatchEncoder, self).__init__(**kwargs)
        self.num_patches = num_patches
        self.projection_dim = projection_dim
        self.projection = Dense(units=projection_dim)
        self.position_embedding = Embedding(
            input_dim=num_patches, output_dim=projection_dim
        )
    
    def get_config(self):
        config = super(PatchEncoder, self).get_config()
        config.update({"num_patches": self.num_patches, "projection_dim":self.projection_dim})
        return config

    def call(self, patch):
        positions = tf.range(start=0, limit=self.num_patches, delta=1)
        encoded = self.projection(patch) + self.position_embedding(positions)
        return encoded