|
|
|
|
|
import pandas as pd |
|
import glob |
|
from nltk import tokenize |
|
from transformers import BertTokenizer, TFBertModel, BertConfig |
|
from transformers.utils.dummy_tf_objects import TFBertMainLayer |
|
from tensorflow.keras.preprocessing.sequence import pad_sequences |
|
from tensorflow import convert_to_tensor |
|
from tensorflow.keras.layers import Input, Dense |
|
from tensorflow.keras.initializers import TruncatedNormal |
|
from tensorflow.keras.models import load_model, Model |
|
from tensorflow.keras.optimizers import Adam |
|
from tensorflow.keras.metrics import BinaryAccuracy, Precision, Recall |
|
|
|
|
|
|
|
|
|
# Input abstracts to classify — placeholder; set before running.
# NOTE(review): DATA is sliced in batches of 100 below, so it is presumably a
# list/sequence of abstract strings — confirm before use.
DATA="..."

# Directory holding the trained per-target .h5 models.  The trailing slash is
# required: models_predict() globs f"{MODELS}*.h5".
MODELS=".../"

# Output directory for the predictions spreadsheet — placeholder.
SAVE_PREDICTIONS_TO="..."
|
|
|
|
|
|
|
|
|
def tokenize_abstracts(abstracts):
    """Wrap each abstract with BERT special tokens.

    Each abstract is split into sentences (nltk sent_tokenize); the result
    starts with "[CLS] " and every sentence is followed by " [SEP] ".

    :param abstracts: iterable of abstract strings
    :return: list of marked-up abstract strings
    """
    wrapped=[]
    for text in abstracts:
        sentences=tokenize.sent_tokenize(text)
        wrapped.append("[CLS] " + "".join(s + " [SEP] " for s in sentences))
    return wrapped
|
|
|
|
|
# Shared multilingual-BERT tokenizer used by the tokenization helpers below.
tokenizer=BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
|
|
|
|
|
def b_tokenize_abstracts(t_abstracts, max_len=512):
    """Tokenize marked-up abstracts with the module-level
    'bert-base-multilingual-uncased' tokenizer, truncating each
    token list to at most `max_len` tokens.

    :param t_abstracts: iterable of "[CLS] ... [SEP]" strings
    :param max_len: maximum number of wordpiece tokens kept per text
    :return: list of token lists
    """
    tokenized=[]
    for text in t_abstracts:
        tokens=tokenizer.tokenize(text)
        tokenized.append(tokens[:max_len])
    return tokenized
|
|
|
|
|
def convert_to_ids(b_t_abstracts):
    """Map wordpiece tokens to their IDs in the BERT vocabulary.

    :param b_t_abstracts: list of token lists (from b_tokenize_abstracts)
    :return: list of ID lists
    """
    return list(map(tokenizer.convert_tokens_to_ids, b_t_abstracts))
|
|
|
|
|
def abstracts_to_ids(abstracts):
    """Full text-to-IDs pipeline: add [CLS]/[SEP] markers, wordpiece-tokenize,
    then convert tokens to BERT vocabulary IDs.

    :param abstracts: iterable of abstract strings
    :return: list of ID lists
    """
    return convert_to_ids(b_tokenize_abstracts(tokenize_abstracts(abstracts)))
|
|
|
|
|
def pad_ids(input_ids, max_len=512):
    """Pad (or truncate) every ID sequence to exactly `max_len` entries.

    Padding and truncation both happen at the end of each sequence
    ("post"), matching BERT's left-aligned convention.

    :param input_ids: list of ID lists
    :param max_len: target sequence length
    :return: 2-D array of shape (n_texts, max_len)
    """
    return pad_sequences(
        input_ids,
        maxlen=max_len,
        dtype="long",
        truncating="post",
        padding="post",
    )
|
|
|
|
|
def create_attention_masks(inputs):
    """Build attention masks for padded ID sequences.

    Positions holding a real token ID (> 0) become 1.0; padding
    positions (ID 0) become 0.0.

    :param inputs: iterable of ID sequences
    :return: list of mask lists (floats)
    """
    return [
        [1.0 if token_id > 0 else 0.0 for token_id in sequence]
        for sequence in inputs
    ]
|
|
|
|
|
|
|
|
|
def float_to_percent(value, decimal=3):
    """Convert a fraction in [0, 1] to a percentage string with exactly
    `decimal` decimal places, e.g. 0.5 -> "50.000%".

    Bug fix: the original sliced str(value*100) to (decimal+3) characters,
    which produced a *variable* number of decimals — "50.0%" for 0.5, and
    four decimals for values whose float repr is long (0.05*100 ->
    "5.000000000000001").  A format spec rounds and always emits the
    requested precision.  (The parameter was also renamed from `float`,
    which shadowed the builtin.)

    :param value: probability-like float
    :param decimal: number of decimal places to keep
    :return: formatted percentage string
    """
    return f"{value * 100:.{decimal}f}%"
|
|
|
|
|
def models_predict(directory, inputs, attention_masks, as_percent=False):
    """Run every .h5 model found in `directory` over the given inputs.

    Bug fix: the flag parameter was originally named `float_to_percent`,
    shadowing the module-level function of the same name — so enabling it
    called a bool and raised TypeError ('bool' object is not callable).
    It is renamed to `as_percent`; all in-file callers omit it, so the
    rename is backward-compatible in practice.

    :param directory: path (trailing separator included) globbed for "*.h5"
    :param inputs: tensor of token IDs (BERT vocab), one row per text
    :param attention_masks: attention-mask tensor matching `inputs`
    :param as_percent: if True, format each probability via float_to_percent()
    :return: {'model/target N': [probability per text, ...], ...}
    """
    model_paths=glob.glob(f"{directory}*.h5")
    predictions_dict={}
    for path in model_paths:
        model=load_model(path)
        print(f"Model {path} is loaded.")
        # predict_step runs a single forward pass on the whole batch at
        # once — presumably callers feed small batches; TODO confirm.
        predictions=model.predict_step([inputs, attention_masks])
        print(f"Predictions from the model {path} are finished.")
        predictions=[float(p) for p in predictions]
        if as_percent:
            predictions=[float_to_percent(p) for p in predictions]
        predictions_dict[model.name]=predictions
        print(f"Predictions from the model {path} are saved.")
        # Drop references promptly to keep peak memory low across models.
        del predictions, model
    return predictions_dict
|
|
|
|
|
def predictions_dict_to_df(predictions_dictionary):
    """Turn a predictions mapping into a dataframe.

    Input format:  {'model/target N': [probability per text, ...], ...}
    Output format: | text N | probability of text N per target | ... |

    Column names are cleaned: the "model_" prefix is stripped and the
    remaining underscores become dots (e.g. "model_1_2" -> "1.2").

    :param predictions_dictionary: mapping of model name to probabilities
    :return: pandas DataFrame with a leading 0-based "text" index column
    """
    frame=pd.DataFrame(predictions_dictionary)
    cleaned=[]
    for column in frame.columns:
        cleaned.append(column.replace("model_", "").replace("_", "."))
    frame.columns=cleaned
    frame.insert(0, column="text", value=list(range(len(frame))))
    return frame
|
|
|
|
|
def predictions_above_treshold(predictions_dataframe, treshold=0.95):
    """Collect, per text, the targets whose probability exceeds `treshold`.

    Input:  dataframe of format | text N | probability per target | ... |
            (first column is the text index and is skipped).
    Output: {text N: [column names with probability > treshold], ...}

    :param predictions_dataframe: dataframe from predictions_dict_to_df
    :param treshold: exclusive lower bound on the probability
    :return: dict mapping row position to a list of passing column names
    """
    scores=predictions_dataframe.iloc[:, 1:]
    hits=scores.apply(lambda row: row[row > treshold].index, axis=1)
    return {position: list(hits[position]) for position in range(len(hits))}
|
|
|
|
|
|
|
|
|
# Number of complete 100-abstract batches; DATA is processed in batches
# to keep memory bounded.
marks=list(range(len(DATA) // 100))

output=pd.DataFrame()

for _ in marks:
    # The original special-cased _ == 0, but DATA[0:100] is identical to
    # DATA[0*100:(0+1)*100], so a single slice covers every batch.
    abstracts=DATA[_*100: (_+1)*100]
    ids=abstracts_to_ids(abstracts)
    padded_ids=pad_ids(ids)
    masks=create_attention_masks(padded_ids)
    masks=convert_to_tensor(masks)
    inputs=convert_to_tensor(padded_ids)
    predictions=models_predict(MODELS, inputs, masks)
    predictions_df=predictions_dict_to_df(predictions)
    # Bug fix: DataFrame.append was removed in pandas 2.0; pd.concat is
    # the supported equivalent and behaves identically here.
    output=pd.concat([output, predictions_df])
    # Free batch data before the next iteration to limit peak memory.
    del abstracts, predictions, predictions_df
|
|
|
# Handle the trailing partial batch (fewer than 100 abstracts), if any.
# len(marks)*100 equals (marks[-1]+1)*100 when there is at least one full
# batch, and — unlike the original marks[-1] expression — does not raise
# IndexError when len(DATA) < 100 (marks empty).
processed=len(marks)*100
if len(DATA)!=processed:
    abstracts=DATA[processed:]
    ids=abstracts_to_ids(abstracts)
    padded_ids=pad_ids(ids)
    masks=create_attention_masks(padded_ids)
    masks=convert_to_tensor(masks)
    inputs=convert_to_tensor(padded_ids)
    predictions=models_predict(MODELS, inputs, masks)
    predictions_df=predictions_dict_to_df(predictions)
    # Bug fix: DataFrame.append was removed in pandas 2.0; pd.concat is
    # the supported equivalent and behaves identically here.
    output=pd.concat([output, predictions_df])
    del abstracts, predictions, predictions_df
|
|
|
|
|
# Persist all batch predictions to a single spreadsheet.
# Bug fix: the original passed the literal string
# "SAVE_PREDICTIONS_TO/predictions.xlsx" instead of interpolating the
# SAVE_PREDICTIONS_TO constant, so the configured directory was ignored.
output.to_excel(f"{SAVE_PREDICTIONS_TO}/predictions.xlsx", index=False)
|
|