MauriceV2021 committed
Commit 6f70897 • 1 Parent(s): 00f161b
Upload 5 files
- predict.py +168 -0
- predict_in_batches.py +186 -0
- train_multiclass_model.py +193 -0
- train_multilabel_model.py +193 -0
- train_multilabel_models.py +221 -0
predict.py
ADDED
@@ -0,0 +1,168 @@
# IMPORTS

import glob
import pandas as pd
import nltk
nltk.download("punkt")  # sentence tokenizer data needed by tokenize.sent_tokenize below
from nltk import tokenize
from transformers import BertTokenizer, TFBertModel, BertConfig
from transformers.utils.dummy_tf_objects import TFBertMainLayer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import convert_to_tensor
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.models import load_model, Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import BinaryAccuracy, Precision, Recall


# SET PARAMETERS

DATA="..."  # DATA needs to be a list of texts

MODELS=".../"  # directory holding the trained .h5 models

SAVE_PREDICTIONS_TO="..."


# PREPROCESS TEXTS

def tokenize_abstracts(abstracts):
    """For given texts, adds '[CLS]' and '[SEP]' tokens
    at the beginning and the end of each sentence, respectively.
    """
    t_abstracts=[]
    for abstract in abstracts:
        t_abstract="[CLS] "
        for sentence in tokenize.sent_tokenize(abstract):
            t_abstract=t_abstract + sentence + " [SEP] "
        t_abstracts.append(t_abstract)
    return t_abstracts


tokenizer=BertTokenizer.from_pretrained('bert-base-multilingual-uncased')


def b_tokenize_abstracts(t_abstracts, max_len=512):
    """Tokenizes sentences with the help
    of a 'bert-base-multilingual-uncased' tokenizer.
    """
    b_t_abstracts=[tokenizer.tokenize(_)[:max_len] for _ in t_abstracts]
    return b_t_abstracts


def convert_to_ids(b_t_abstracts):
    """Converts tokens to their specific
    IDs in the BERT vocabulary.
    """
    input_ids=[tokenizer.convert_tokens_to_ids(_) for _ in b_t_abstracts]
    return input_ids


def abstracts_to_ids(abstracts):
    """Tokenizes abstracts and converts
    tokens to their specific IDs
    in the BERT vocabulary.
    """
    tokenized_abstracts=tokenize_abstracts(abstracts)
    b_tokenized_abstracts=b_tokenize_abstracts(tokenized_abstracts)
    ids=convert_to_ids(b_tokenized_abstracts)
    return ids


def pad_ids(input_ids, max_len=512):
    """Pads sequences of given IDs.
    """
    p_input_ids=pad_sequences(input_ids,
                              maxlen=max_len,
                              dtype="long",
                              truncating="post",
                              padding="post")
    return p_input_ids


def create_attention_masks(inputs):
    """Creates attention masks
    for given sequences.
    """
    masks=[]
    for sequence in inputs:
        sequence_mask=[float(_>0) for _ in sequence]
        masks.append(sequence_mask)
    return masks


# PREDICT

def float_to_percent(value, decimal=3):
    """Takes a float in the range 0. to 0.9... as input
    and converts it to a percentage with the specified number of decimal places.
    """
    return str(value*100)[:(decimal+3)]+"%"


def models_predict(directory, inputs, attention_masks, as_percent=False):
    """Loads separate .h5 models from a given directory.
    For predictions, inputs are expected to be:
    tensors of token IDs (BERT vocab) and tensors of attention masks.
    Output is of format:
    {'model/target N': [probability that text N deals with target N, ...], ...}
    """
    models=glob.glob(f"{directory}*.h5")
    predictions_dict={}
    for _ in models:
        model=load_model(_)
        predictions=model.predict_step([inputs, attention_masks])  # one forward pass over all inputs at once
        predictions=[float(_) for _ in predictions]
        if as_percent:  # as_percent avoids shadowing the float_to_percent helper above
            predictions=[float_to_percent(_) for _ in predictions]
        predictions_dict[model.name]=predictions
        del predictions, model
    return predictions_dict


def predictions_dict_to_df(predictions_dictionary):
    """Converts models' predictions of format:
    {'model/target N': [probability that text N deals with target N, ...], ...}
    to a dataframe of format:
    | text N | probability that text N deals with target N | ... |
    """
    predictions_df=pd.DataFrame(predictions_dictionary)
    predictions_df.columns=[_.replace("model_", "").replace("_", ".") for _ in predictions_df.columns]
    predictions_df.insert(0, column="text", value=[_ for _ in range(len(predictions_df))])
    return predictions_df


def predictions_above_threshold(predictions_dataframe, threshold=0.95):
    """Filters predictions above the specified threshold.
    Input is expected to be a dataframe of format:
    | text N | probability that text N deals with target N | ... |
    Output is of format:
    {text N: [targets predicted for text N with probability > threshold, ...], ...}
    """
    above_threshold_dict={}
    above_threshold=predictions_dataframe.iloc[:,1:].apply(lambda row: row[row > threshold].index, axis=1)
    for _ in range(len(above_threshold)):
        above_threshold_dict[_]=list(above_threshold[_])
    return above_threshold_dict


# RUN

abstracts=DATA

ids=abstracts_to_ids(abstracts)

padded_ids=pad_ids(ids)

masks=create_attention_masks(padded_ids)

masks=convert_to_tensor(masks)

inputs=convert_to_tensor(padded_ids)

predictions=models_predict(MODELS, inputs, masks)

predictions_df=predictions_dict_to_df(predictions)

predictions_df.to_excel(SAVE_PREDICTIONS_TO+"predictions.xlsx", index=False)
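A note on the placeholders: DATA, MODELS and SAVE_PREDICTIONS_TO are committed as literal "..." strings. A minimal sketch of how they might be filled in (the file name and the "Abstract" column below are assumptions, not part of this commit):

# Hypothetical setup for predict.py; adjust paths and the column name to the actual data.
import pandas as pd

DATA=pd.read_csv("abstracts.csv")["Abstract"].tolist()  # a plain Python list of texts
MODELS="trained_models/"                                # trailing slash matters: models_predict globs f"{directory}*.h5"
SAVE_PREDICTIONS_TO="output/"                           # predictions.xlsx is written into this directory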
predict_in_batches.py
ADDED
@@ -0,0 +1,186 @@
# IMPORTS

import glob
import pandas as pd
import nltk
nltk.download("punkt")  # sentence tokenizer data needed by tokenize.sent_tokenize below
from nltk import tokenize
from transformers import BertTokenizer, TFBertModel, BertConfig
from transformers.utils.dummy_tf_objects import TFBertMainLayer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import convert_to_tensor
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.models import load_model, Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import BinaryAccuracy, Precision, Recall


# SET PARAMETERS

DATA="..."  # DATA needs to be a list of texts

MODELS=".../"  # directory holding the trained .h5 models

SAVE_PREDICTIONS_TO="..."


# PREPROCESS TEXTS

def tokenize_abstracts(abstracts):
    """For given texts, adds '[CLS]' and '[SEP]' tokens
    at the beginning and the end of each sentence, respectively.
    """
    t_abstracts=[]
    for abstract in abstracts:
        t_abstract="[CLS] "
        for sentence in tokenize.sent_tokenize(abstract):
            t_abstract=t_abstract + sentence + " [SEP] "
        t_abstracts.append(t_abstract)
    return t_abstracts


tokenizer=BertTokenizer.from_pretrained('bert-base-multilingual-uncased')


def b_tokenize_abstracts(t_abstracts, max_len=512):
    """Tokenizes sentences with the help
    of a 'bert-base-multilingual-uncased' tokenizer.
    """
    b_t_abstracts=[tokenizer.tokenize(_)[:max_len] for _ in t_abstracts]
    return b_t_abstracts


def convert_to_ids(b_t_abstracts):
    """Converts tokens to their specific
    IDs in the BERT vocabulary.
    """
    input_ids=[tokenizer.convert_tokens_to_ids(_) for _ in b_t_abstracts]
    return input_ids


def abstracts_to_ids(abstracts):
    """Tokenizes abstracts and converts
    tokens to their specific IDs
    in the BERT vocabulary.
    """
    tokenized_abstracts=tokenize_abstracts(abstracts)
    b_tokenized_abstracts=b_tokenize_abstracts(tokenized_abstracts)
    ids=convert_to_ids(b_tokenized_abstracts)
    return ids


def pad_ids(input_ids, max_len=512):
    """Pads sequences of given IDs.
    """
    p_input_ids=pad_sequences(input_ids,
                              maxlen=max_len,
                              dtype="long",
                              truncating="post",
                              padding="post")
    return p_input_ids


def create_attention_masks(inputs):
    """Creates attention masks
    for given sequences.
    """
    masks=[]
    for sequence in inputs:
        sequence_mask=[float(_>0) for _ in sequence]
        masks.append(sequence_mask)
    return masks


# PREDICT

def float_to_percent(value, decimal=3):
    """Takes a float in the range 0. to 0.9... as input
    and converts it to a percentage with the specified number of decimal places.
    """
    return str(value*100)[:(decimal+3)]+"%"


def models_predict(directory, inputs, attention_masks, as_percent=False):
    """Loads separate .h5 models from a given directory.
    For predictions, inputs are expected to be:
    tensors of token IDs (BERT vocab) and tensors of attention masks.
    Output is of format:
    {'model/target N': [probability that text N deals with target N, ...], ...}
    """
    models=glob.glob(f"{directory}*.h5")
    predictions_dict={}
    for _ in models:
        model=load_model(_)
        print(f"Model {_} is loaded.")
        predictions=model.predict_step([inputs, attention_masks])  # one forward pass over all inputs at once
        print(f"Predictions from the model {_} are finished.")
        predictions=[float(_) for _ in predictions]
        if as_percent:  # as_percent avoids shadowing the float_to_percent helper above
            predictions=[float_to_percent(_) for _ in predictions]
        predictions_dict[model.name]=predictions
        print(f"Predictions from the model {_} are saved.")
        del predictions, model
    return predictions_dict


def predictions_dict_to_df(predictions_dictionary):
    """Converts models' predictions of format:
    {'model/target N': [probability that text N deals with target N, ...], ...}
    to a dataframe of format:
    | text N | probability that text N deals with target N | ... |
    """
    predictions_df=pd.DataFrame(predictions_dictionary)
    predictions_df.columns=[_.replace("model_", "").replace("_", ".") for _ in predictions_df.columns]
    predictions_df.insert(0, column="text", value=[_ for _ in range(len(predictions_df))])
    return predictions_df


def predictions_above_threshold(predictions_dataframe, threshold=0.95):
    """Filters predictions above the specified threshold.
    Input is expected to be a dataframe of format:
    | text N | probability that text N deals with target N | ... |
    Output is of format:
    {text N: [targets predicted for text N with probability > threshold, ...], ...}
    """
    above_threshold_dict={}
    above_threshold=predictions_dataframe.iloc[:,1:].apply(lambda row: row[row > threshold].index, axis=1)
    for _ in range(len(above_threshold)):
        above_threshold_dict[_]=list(above_threshold[_])
    return above_threshold_dict


# RUN

# Process DATA in chunks of 100 abstracts to keep memory usage bounded.
marks=[_ for _ in range(len(DATA)//100)]

output=pd.DataFrame()

for _ in marks:
    abstracts=DATA[_*100:(_+1)*100]
    ids=abstracts_to_ids(abstracts)
    padded_ids=pad_ids(ids)
    masks=create_attention_masks(padded_ids)
    masks=convert_to_tensor(masks)
    inputs=convert_to_tensor(padded_ids)
    predictions=models_predict(MODELS, inputs, masks)
    predictions_df=predictions_dict_to_df(predictions)
    output=pd.concat([output, predictions_df])  # DataFrame.append was removed in pandas 2.0; concat gives the same result
    del abstracts, predictions, predictions_df

# Pick up the trailing abstracts that do not fill a complete chunk of 100
# (this also covers the case where DATA holds fewer than 100 texts).
rest_idx=len(marks)*100
if len(DATA)!=rest_idx:
    abstracts=DATA[rest_idx:]
    ids=abstracts_to_ids(abstracts)
    padded_ids=pad_ids(ids)
    masks=create_attention_masks(padded_ids)
    masks=convert_to_tensor(masks)
    inputs=convert_to_tensor(padded_ids)
    predictions=models_predict(MODELS, inputs, masks)
    predictions_df=predictions_dict_to_df(predictions)
    output=pd.concat([output, predictions_df])
    del abstracts, predictions, predictions_df


output.to_excel(SAVE_PREDICTIONS_TO+"predictions.xlsx", index=False)
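As a quick sanity check of the chunking arithmetic above (an illustration only, not part of the script): with a hypothetical 250 input texts the loop covers two full chunks and the trailing block picks up the remaining 50.

# Illustration of the chunking used above for a hypothetical len(DATA) of 250.
n=250
marks=list(range(n//100))                                  # [0, 1] -> slices [0:100] and [100:200]
rest_idx=len(marks)*100                                    # 200
print([(m*100, (m+1)*100) for m in marks], (rest_idx, n))  # [(0, 100), (100, 200)] (200, 250)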
train_multiclass_model.py
ADDED
@@ -0,0 +1,193 @@
# IMPORTS

import pandas as pd
import nltk
nltk.download("punkt")
from nltk import tokenize
import time
from sklearn.model_selection import train_test_split
from transformers import BertConfig, BertTokenizer, TFBertModel
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import convert_to_tensor
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import CategoricalAccuracy, Precision, Recall


# SET PARAMETERS

DATA_PATH="..."

SAVE_MODEL_TO=".../"


# READ DATA

tab=pd.read_hdf(DATA_PATH)


# PREPARE DATA FOR BERT

def data_to_values(dataframe):
    """Converts dataframe columns to value arrays.
    """
    abstracts=dataframe.Abstract.values
    labels=dataframe.Label.values
    return abstracts, labels


def tokenize_abstracts(abstracts):
    """For given texts, adds '[CLS]' and '[SEP]' tokens
    at the beginning and the end of each sentence, respectively.
    """
    t_abstracts=[]
    for abstract in abstracts:
        t_abstract="[CLS] "
        for sentence in tokenize.sent_tokenize(abstract):
            t_abstract=t_abstract + sentence + " [SEP] "
        t_abstracts.append(t_abstract)
    return t_abstracts


tokenizer=BertTokenizer.from_pretrained('bert-base-multilingual-uncased')


def b_tokenize_abstracts(t_abstracts, max_len=512):
    """Tokenizes sentences with the help
    of a 'bert-base-multilingual-uncased' tokenizer.
    """
    b_t_abstracts=[tokenizer.tokenize(_)[:max_len] for _ in t_abstracts]
    return b_t_abstracts


def convert_to_ids(b_t_abstracts):
    """Converts tokens to their specific
    IDs in the BERT vocabulary.
    """
    input_ids=[tokenizer.convert_tokens_to_ids(_) for _ in b_t_abstracts]
    return input_ids


def abstracts_to_ids(abstracts):
    """Tokenizes abstracts and converts
    tokens to their specific IDs
    in the BERT vocabulary.
    """
    tokenized_abstracts=tokenize_abstracts(abstracts)
    b_tokenized_abstracts=b_tokenize_abstracts(tokenized_abstracts)
    ids=convert_to_ids(b_tokenized_abstracts)
    return ids


def pad_ids(input_ids, max_len=512):
    """Pads sequences of given IDs.
    """
    p_input_ids=pad_sequences(input_ids,
                              maxlen=max_len,
                              dtype="long",
                              truncating="post",
                              padding="post")
    return p_input_ids


def create_attention_masks(inputs):
    """Creates attention masks
    for given sequences.
    """
    masks=[]
    for sequence in inputs:
        sequence_mask=[float(_>0) for _ in sequence]
        masks.append(sequence_mask)
    return masks


# CREATE MODEL

def create_model():
    config=BertConfig.from_pretrained(
        "bert-base-multilingual-uncased",
        num_labels=16,
        hidden_dropout_prob=0.2,
        attention_probs_dropout_prob=0.2)
    bert=TFBertModel.from_pretrained(
        "bert-base-multilingual-uncased",
        config=config)
    bert_layer=bert.layers[0]
    input_ids_layer=Input(
        shape=(512,),
        name="input_ids",
        dtype="int32")
    input_attention_masks_layer=Input(
        shape=(512,),
        name="attention_masks",
        dtype="int32")
    bert_model=bert_layer(
        input_ids_layer,
        input_attention_masks_layer)
    # 16-way softmax head on top of BERT's pooled ([CLS]) output.
    target_layer=Dense(
        units=16,
        kernel_initializer=TruncatedNormal(stddev=config.initializer_range),
        name="target_layer",
        activation="softmax")(bert_model[1])
    model=Model(
        inputs=[input_ids_layer, input_attention_masks_layer],
        outputs=target_layer,
        name="mbert_multiclass_16")
    optimizer=Adam(
        learning_rate=5e-05,
        epsilon=1e-08,
        decay=0.01,
        clipnorm=1.0)
    model.compile(
        optimizer=optimizer,
        loss="categorical_crossentropy",
        metrics=[CategoricalAccuracy(), Precision(), Recall()])
    return model


abstracts, labels=data_to_values(tab)
ids=abstracts_to_ids(abstracts)
print("Abstracts tokenized, tokens converted to ids.")

padded_ids=pad_ids(ids)
print("Sequences padded.")

train_inputs, temp_inputs, train_labels, temp_labels=train_test_split(padded_ids, labels, random_state=1993, test_size=0.3)
validation_inputs, test_inputs, validation_labels, test_labels=train_test_split(temp_inputs, temp_labels, random_state=1993, test_size=0.5)
print("Data split into train, validation, test sets.")

train_masks, validation_masks, test_masks=[create_attention_masks(_) for _ in [train_inputs, validation_inputs, test_inputs]]
print("Attention masks created.")
train_inputs, validation_inputs, test_inputs=[convert_to_tensor(_) for _ in [train_inputs, validation_inputs, test_inputs]]
print("Inputs converted to tensors.")
train_labels, validation_labels, test_labels=[convert_to_tensor(_) for _ in [train_labels, validation_labels, test_labels]]
print("Labels converted to tensors.")
train_masks, validation_masks, test_masks=[convert_to_tensor(_) for _ in [train_masks, validation_masks, test_masks]]
print("Masks converted to tensors.")


model=create_model()
print("Model initialized.")


history=model.fit([train_inputs, train_masks], train_labels,
                  batch_size=8,
                  epochs=2,
                  validation_data=([validation_inputs, validation_masks], validation_labels))


model.save(SAVE_MODEL_TO+"mbert_multiclass_16.h5")
print("Model saved.")

test_score=model.evaluate([test_inputs, test_masks], test_labels,
                          batch_size=8)

print("Model tested.")


# model.evaluate returns [loss, accuracy, precision, recall]; wrap it in a list to get one row.
stats=pd.DataFrame([test_score], columns=["loss", "accuracy", "precision", "recall"])
stats.to_excel(SAVE_MODEL_TO+"mbert_multiclass_16_stats.xlsx", index=False)

print("Stats saved.")
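One assumption baked into this script: it compiles with categorical_crossentropy and a 16-unit softmax head, so the Label values handed to model.fit have to be 16-dimensional one-hot vectors. The data file itself is not part of this commit; if labels were stored as integer class ids instead, a conversion along these lines (a sketch only, using tf.keras.utils.to_categorical) would be needed before the train/test split:

# Sketch only (assumes Label holds integer class ids, which this commit does not show).
from tensorflow.keras.utils import to_categorical

labels=to_categorical(labels, num_classes=16)  # one-hot vectors matching the 16-unit softmax head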
train_multilabel_model.py
ADDED
@@ -0,0 +1,193 @@
# IMPORTS

import pandas as pd
import nltk
nltk.download("punkt")
from nltk import tokenize
import time
from sklearn.model_selection import train_test_split
from transformers import BertConfig, BertTokenizer, TFBertModel
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import convert_to_tensor
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import CategoricalAccuracy, Precision, Recall


# SET PARAMETERS

DATA_PATH="..."

SAVE_MODELS_TO=".../"


# READ DATA

tab=pd.read_hdf(DATA_PATH)


# PREPARE DATA FOR BERT

def data_to_values(dataframe):
    """Converts dataframe columns to value arrays.
    """
    abstracts=dataframe.Abstract.values
    labels=dataframe.Label.values
    return abstracts, labels


def tokenize_abstracts(abstracts):
    """For given texts, adds '[CLS]' and '[SEP]' tokens
    at the beginning and the end of each sentence, respectively.
    """
    t_abstracts=[]
    for abstract in abstracts:
        t_abstract="[CLS] "
        for sentence in tokenize.sent_tokenize(abstract):
            t_abstract=t_abstract + sentence + " [SEP] "
        t_abstracts.append(t_abstract)
    return t_abstracts


tokenizer=BertTokenizer.from_pretrained('bert-base-multilingual-uncased')


def b_tokenize_abstracts(t_abstracts, max_len=512):
    """Tokenizes sentences with the help
    of a 'bert-base-multilingual-uncased' tokenizer.
    """
    b_t_abstracts=[tokenizer.tokenize(_)[:max_len] for _ in t_abstracts]
    return b_t_abstracts


def convert_to_ids(b_t_abstracts):
    """Converts tokens to their specific
    IDs in the BERT vocabulary.
    """
    input_ids=[tokenizer.convert_tokens_to_ids(_) for _ in b_t_abstracts]
    return input_ids


def abstracts_to_ids(abstracts):
    """Tokenizes abstracts and converts
    tokens to their specific IDs
    in the BERT vocabulary.
    """
    tokenized_abstracts=tokenize_abstracts(abstracts)
    b_tokenized_abstracts=b_tokenize_abstracts(tokenized_abstracts)
    ids=convert_to_ids(b_tokenized_abstracts)
    return ids


def pad_ids(input_ids, max_len=512):
    """Pads sequences of given IDs.
    """
    p_input_ids=pad_sequences(input_ids,
                              maxlen=max_len,
                              dtype="long",
                              truncating="post",
                              padding="post")
    return p_input_ids


def create_attention_masks(inputs):
    """Creates attention masks
    for given sequences.
    """
    masks=[]
    for sequence in inputs:
        sequence_mask=[float(_>0) for _ in sequence]
        masks.append(sequence_mask)
    return masks


# CREATE MODEL

def create_model():
    config=BertConfig.from_pretrained(
        "bert-base-multilingual-uncased",
        num_labels=17,
        hidden_dropout_prob=0.2,
        attention_probs_dropout_prob=0.2)
    bert=TFBertModel.from_pretrained(
        "bert-base-multilingual-uncased",
        config=config)
    bert_layer=bert.layers[0]
    input_ids_layer=Input(
        shape=(512,),
        name="input_ids",
        dtype="int32")
    input_attention_masks_layer=Input(
        shape=(512,),
        name="attention_masks",
        dtype="int32")
    bert_model=bert_layer(
        input_ids_layer,
        input_attention_masks_layer)
    # 17 sigmoid outputs on top of BERT's pooled ([CLS]) output: one independent probability per label.
    target_layer=Dense(
        units=17,
        kernel_initializer=TruncatedNormal(stddev=config.initializer_range),
        name="target_layer",
        activation="sigmoid")(bert_model[1])
    model=Model(
        inputs=[input_ids_layer, input_attention_masks_layer],
        outputs=target_layer,
        name="aurora_sdg_mbert_multilabel")
    optimizer=Adam(
        learning_rate=5e-05,
        epsilon=1e-08,
        decay=0.01,
        clipnorm=1.0)
    model.compile(
        optimizer=optimizer,
        loss="binary_crossentropy",
        metrics=[Precision(), Recall()])
    return model


abstracts, labels=data_to_values(tab)
ids=abstracts_to_ids(abstracts)
print("Abstracts tokenized, tokens converted to ids.")

padded_ids=pad_ids(ids)
print("Sequences padded.")

train_inputs, temp_inputs, train_labels, temp_labels=train_test_split(padded_ids, labels, random_state=1993, test_size=0.3)
validation_inputs, test_inputs, validation_labels, test_labels=train_test_split(temp_inputs, temp_labels, random_state=1993, test_size=0.5)
print("Data split into train, validation, test sets.")

train_masks, validation_masks, test_masks=[create_attention_masks(_) for _ in [train_inputs, validation_inputs, test_inputs]]
print("Attention masks created.")
train_inputs, validation_inputs, test_inputs=[convert_to_tensor(_) for _ in [train_inputs, validation_inputs, test_inputs]]
print("Inputs converted to tensors.")
train_labels, validation_labels, test_labels=[convert_to_tensor(_) for _ in [train_labels, validation_labels, test_labels]]
print("Labels converted to tensors.")
train_masks, validation_masks, test_masks=[convert_to_tensor(_) for _ in [train_masks, validation_masks, test_masks]]
print("Masks converted to tensors.")


model=create_model()
print("Model initialized.")


history=model.fit([train_inputs, train_masks], train_labels,
                  batch_size=16,
                  epochs=4,
                  validation_data=([validation_inputs, validation_masks], validation_labels))


model.save(SAVE_MODELS_TO+"mbert_multilabel.h5")
print("Model saved.")

test_score=model.evaluate([test_inputs, test_masks], test_labels,
                          batch_size=8)

print("Model tested.")


# model.evaluate returns [loss, precision, recall] here; wrap it in a list to get one row.
stats=pd.DataFrame([test_score], columns=["loss", "precision", "recall"])
stats.to_excel(SAVE_MODELS_TO+"mbert_multilabel_stats.xlsx", index=False)

print("Stats saved.")
train_multilabel_models.py
ADDED
@@ -0,0 +1,221 @@
# IMPORTS

import pandas as pd
import nltk
nltk.download("punkt")  # sentence tokenizer data needed by tokenize.sent_tokenize below
from nltk import tokenize
import time
from sklearn.model_selection import train_test_split
from transformers import BertConfig, BertTokenizer, TFBertModel
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import convert_to_tensor
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import BinaryAccuracy, Precision, Recall


# SET PARAMETERS

DATA_PATH="..."

SAVE_MODELS_TO=".../"


# READ DATA

tab=pd.read_hdf(DATA_PATH)


# SLICE DATA

def slice_data(dataframe, label):
    """Slices a dataframe of the structure:
    | text/abstract | label |
    Prepares data for binary classification
    training. For a given label, creates a new
    dataset in which the number of items belonging
    to the given label equals the number of randomly
    sampled items from all the other labels.
    """
    label_data=dataframe[dataframe[label]==1]
    label_data_len=len(label_data)
    temp_data=dataframe.copy()[dataframe[label]!=1].sample(n=label_data_len)
    label_data=label_data[["Abstract", label]]
    # pd.concat appends the negative sample (DataFrame.append was removed in pandas 2.0).
    label_data=pd.concat([label_data, temp_data[["Abstract", label]]])
    label_data.columns=["Abstract", "Label"]
    return label_data


# PREPARE DATA FOR BERT

def data_to_values(dataframe):
    """Converts dataframe columns to value arrays.
    """
    abstracts=dataframe.Abstract.values
    labels=dataframe.Label.values
    return abstracts, labels


def tokenize_abstracts(abstracts):
    """For given texts, adds '[CLS]' and '[SEP]' tokens
    at the beginning and the end of each sentence, respectively.
    """
    t_abstracts=[]
    for abstract in abstracts:
        t_abstract="[CLS] "
        for sentence in tokenize.sent_tokenize(abstract):
            t_abstract=t_abstract + sentence + " [SEP] "
        t_abstracts.append(t_abstract)
    return t_abstracts


tokenizer=BertTokenizer.from_pretrained('bert-base-multilingual-uncased')


def b_tokenize_abstracts(t_abstracts, max_len=512):
    """Tokenizes sentences with the help
    of a 'bert-base-multilingual-uncased' tokenizer.
    """
    b_t_abstracts=[tokenizer.tokenize(_)[:max_len] for _ in t_abstracts]
    return b_t_abstracts


def convert_to_ids(b_t_abstracts):
    """Converts tokens to their specific
    IDs in the BERT vocabulary.
    """
    input_ids=[tokenizer.convert_tokens_to_ids(_) for _ in b_t_abstracts]
    return input_ids


def abstracts_to_ids(abstracts):
    """Tokenizes abstracts and converts
    tokens to their specific IDs
    in the BERT vocabulary.
    """
    tokenized_abstracts=tokenize_abstracts(abstracts)
    b_tokenized_abstracts=b_tokenize_abstracts(tokenized_abstracts)
    ids=convert_to_ids(b_tokenized_abstracts)
    return ids


def pad_ids(input_ids, max_len=512):
    """Pads sequences of given IDs.
    """
    p_input_ids=pad_sequences(input_ids,
                              maxlen=max_len,
                              dtype="long",
                              truncating="post",
                              padding="post")
    return p_input_ids


def create_attention_masks(inputs):
    """Creates attention masks
    for given sequences.
    """
    masks=[]
    for sequence in inputs:
        sequence_mask=[float(_>0) for _ in sequence]
        masks.append(sequence_mask)
    return masks


# CREATE MODEL

def create_model(label):
    config=BertConfig.from_pretrained(
        "bert-base-multilingual-uncased",
        num_labels=2,
        hidden_dropout_prob=0.2,
        attention_probs_dropout_prob=0.2)
    bert=TFBertModel.from_pretrained(
        "bert-base-multilingual-uncased",
        config=config)
    bert_layer=bert.layers[0]
    input_ids_layer=Input(
        shape=(512,),
        name="input_ids",
        dtype="int32")
    input_attention_masks_layer=Input(
        shape=(512,),
        name="attention_masks",
        dtype="int32")
    bert_model=bert_layer(
        input_ids_layer,
        input_attention_masks_layer)
    # Single sigmoid unit on top of BERT's pooled ([CLS]) output: one binary model per label.
    target_layer=Dense(
        units=1,
        kernel_initializer=TruncatedNormal(stddev=config.initializer_range),
        name="target_layer",
        activation="sigmoid")(bert_model[1])
    model=Model(
        inputs=[input_ids_layer, input_attention_masks_layer],
        outputs=target_layer,
        name="model_"+label.replace(".", "_"))
    optimizer=Adam(
        learning_rate=5e-05,
        epsilon=1e-08,
        decay=0.01,
        clipnorm=1.0)
    model.compile(
        optimizer=optimizer,
        loss="binary_crossentropy",
        metrics=[BinaryAccuracy(), Precision(), Recall()])
    return model


# THE LOOP

test_scores=[]
elapsed_times=[]
histories=[]

for _ in tab.columns[4:]:  # here you have to specify the index where the label columns start
    print(f"PROCESSING TARGET {_}...")
    start_time=time.process_time()
    data=slice_data(tab, _)
    print("Data sliced.")
    abstracts, labels=data_to_values(data)
    ids=abstracts_to_ids(abstracts)
    print("Abstracts tokenized, tokens converted to ids.")
    padded_ids=pad_ids(ids)
    print("Sequences padded.")
    train_inputs, temp_inputs, train_labels, temp_labels=train_test_split(padded_ids, labels, random_state=1993, test_size=0.3)
    validation_inputs, test_inputs, validation_labels, test_labels=train_test_split(temp_inputs, temp_labels, random_state=1993, test_size=0.5)
    print("Data split into train, validation, test sets.")
    train_masks, validation_masks, test_masks=[create_attention_masks(_) for _ in [train_inputs, validation_inputs, test_inputs]]
    print("Attention masks created.")
    train_inputs, validation_inputs, test_inputs=[convert_to_tensor(_) for _ in [train_inputs, validation_inputs, test_inputs]]
    print("Inputs converted to tensors.")
    train_labels, validation_labels, test_labels=[convert_to_tensor(_) for _ in [train_labels, validation_labels, test_labels]]
    print("Labels converted to tensors.")
    train_masks, validation_masks, test_masks=[convert_to_tensor(_) for _ in [train_masks, validation_masks, test_masks]]
    print("Masks converted to tensors.")
    model=create_model(_)
    print("Model initialized.")
    history=model.fit([train_inputs, train_masks], train_labels,
                      batch_size=3,
                      epochs=3,
                      validation_data=([validation_inputs, validation_masks], validation_labels))
    histories.append(history)
    print(f"Model for {_} target trained.")
    model.save(SAVE_MODELS_TO+_.replace(".", "_")+".h5")
    print(f"Model for target {_} saved.")
    test_score=model.evaluate([test_inputs, test_masks], test_labels,
                              batch_size=3)
    elapsed_times.append(time.process_time()-start_time)
    test_scores.append(test_score)
    print(f"""Model for target {_} tested.
    .
    .
    .""")


# SAVE STATISTICS

stats=pd.DataFrame(test_scores, columns=["loss", "accuracy", "precision", "recall"])
stats.insert(loc=0, column="target", value=tab.columns[4:])
stats.insert(loc=5, column="elapsed_time", value=elapsed_times)
stats.to_excel(SAVE_MODELS_TO+"_stats.xlsx", index=False)
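The per-target models saved by this loop are what predict.py later consumes: it globs *.h5 from its MODELS directory and keys each output column by the Keras model name set in create_model. A small illustration of that naming round trip (the label "SDG.13" and the directory name are hypothetical):

# Illustration only: how a label column maps to the saved file, the model name,
# and the column that predictions_dict_to_df reconstructs in predict.py.
label="SDG.13"                                              # hypothetical label column
file_name="trained_models/"+label.replace(".", "_")+".h5"   # -> trained_models/SDG_13.h5
model_name="model_"+label.replace(".", "_")                 # -> model_SDG_13
column=model_name.replace("model_", "").replace("_", ".")   # -> SDG.13
print(file_name, model_name, column)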