File size: 11,403 Bytes
7f9da02
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6a42807
7f9da02
 
 
 
 
 
 
 
 
 
 
092cded
7f9da02
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
092cded
7f9da02
 
 
 
 
 
 
 
 
 
092cded
7f9da02
 
 
 
 
 
 
 
 
 
092cded
7f9da02
 
 
 
 
 
 
 
 
 
092cded
7f9da02
 
 
 
 
 
 
 
 
 
 
 
 
 
092cded
7f9da02
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
# Delete all non-underscore objects from the module namespace so a rerun of
# this script starts clean (stale models from a previous run are released).
# Fix: the original bound two new public names ('keys', 'o') while cleaning,
# which the cleanup itself then missed; use underscore-prefixed temporaries
# so the sweep leaves nothing new behind.
for _name in [_k for _k in globals() if not _k.startswith('_')]:
    print(_name)  # log each name being released
    del globals()[_name]

# Import models and preprocessing helpers from local modules (Bert_medium.py etc.)

from Bert_medium import MediumBert
from Offensive_Bert import BertClassifier
from data_cleaning import cleaning_content
from Dialect_Bert import Dialect_Detection

import torch
device = torch.device("cpu")


from transformers import BertTokenizer, AutoTokenizer, BertTokenizerFast
import streamlit as st

# file path
import os

path_file = os.path.dirname(os.path.abspath(__file__))
parent_path = os.path.dirname(path_file)

##########################FUNCTIONS########################

def predict_off(review_text,model,device,tokenizer):
        """Classify *review_text* with the offensiveness model.

        Args:
            review_text: cleaned input string.
            model: classifier called as model(input_ids, attention_mask),
                returning per-class logits of shape (1, n_classes).
            device: torch.device the tensors are moved to.
            tokenizer: HuggingFace tokenizer exposing encode_plus.

        Returns:
            Integer index of the highest-scoring class (a key of the
            offensiveness label dictionary).
        """
        encoded_review = tokenizer.encode_plus(
        review_text,
        max_length=256,
        add_special_tokens=True,
        return_token_type_ids=False,
        padding='longest',
        # Without truncation=True, max_length is ignored by the tokenizer and
        # inputs longer than the model's position-embedding limit would crash.
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
        )

        input_ids = encoded_review['input_ids'].to(device)
        attention_mask = encoded_review['attention_mask'].to(device)
        output = model(input_ids, attention_mask)
        # argmax over the logits gives the predicted label index
        return output.cpu().data.numpy().argmax()
#########################################
def predict_other(review_text,model,device,tokenizer):
        """Classify *review_text* with one of the bert-medium sub-models
        (racism / misogyny / verbal abuse / religion hate).

        Args:
            review_text: cleaned input string.
            model: classifier called as model(input_ids, attention_mask),
                returning per-class logits of shape (1, n_classes).
            device: torch.device the tensors are moved to.
            tokenizer: HuggingFace tokenizer exposing encode_plus.

        Returns:
            Integer index of the highest-scoring class (a key of the
            corresponding label dictionary).
        """
        encoded_review = tokenizer.encode_plus(
        review_text,
        max_length=217,
        add_special_tokens=True,
        return_token_type_ids=False,
        padding='longest',
        # Without truncation=True, max_length is ignored by the tokenizer and
        # over-long inputs would overflow the model's position embeddings.
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
        )

        input_ids = encoded_review['input_ids'].to(device)
        attention_mask = encoded_review['attention_mask'].to(device)
        output = model(input_ids, attention_mask)
        # argmax over the logits gives the predicted label index
        return output.cpu().data.numpy().argmax()
###########################################

def predict_dialect(review_text,model,device,tokenizer):
        """Classify *review_text* with the dialect-detection model.

        Args:
            review_text: cleaned input string.
            model: classifier called as model(input_ids, attention_mask),
                returning per-class logits of shape (1, n_classes).
            device: torch.device the tensors are moved to.
            tokenizer: HuggingFace tokenizer exposing encode_plus.

        Returns:
            Integer index of the highest-scoring class (a key of the
            dialect label dictionary).
        """
        encoded_review = tokenizer.encode_plus(
        review_text,
        max_length=123,
        add_special_tokens=True,
        return_token_type_ids=False,
        padding='longest',
        # Without truncation=True, max_length is ignored by the tokenizer and
        # over-long inputs would overflow the model's position embeddings.
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
        )

        input_ids = encoded_review['input_ids'].to(device)
        attention_mask = encoded_review['attention_mask'].to(device)
        output = model(input_ids, attention_mask)
        # argmax over the logits gives the predicted label index
        return output.cpu().data.numpy().argmax()


# Main prediction function

def predict(text,device,offensive_model,offensive_tokenizer,racism_model,misogyny_model,verbalabuse_model,dialect_model,religionhate_model,tokenizer_dialect,other_tokenizer,off_dictionary,racism_dict,misogyny_dict,verbalabuse_dict,dialect_dict,religionhate_dict):
        """Run the full hate-speech pipeline on raw *text*.

        The offensiveness model gates the fine-grained classifiers: racism,
        verbal abuse and religion hate are only scored when the text is
        offensive; misogyny and dialect are always predicted.

        Returns a dict with keys "Offensiveness", "Dialect", "Misogyny",
        "Racism", "Verbal Abuse" and "Religion Hate".
        """
        # Normalize the raw input before any model sees it.
        cleaned = cleaning_content(text)

        # Predictions made for every input, offensive or not.
        off_pred = off_dictionary[predict_off(cleaned,offensive_model,device,offensive_tokenizer)]
        misog_pred = misogyny_dict[predict_other(cleaned,misogyny_model,device,other_tokenizer)]
        dialect_pred = dialect_dict[predict_dialect(cleaned,dialect_model,device,tokenizer_dialect)]

        # Start from the non-offensive defaults and overwrite below if needed.
        result = {
            "Offensiveness": off_pred,
            "Dialect": dialect_pred,
            "Misogyny": misog_pred,
            "Racism": "Not_Racism",
            "Verbal Abuse": "Not Verbal Abuse",
            "Religion Hate": "Not Religion Hate",
        }

        if off_pred == 'offensive':
            # Offensive text: run the remaining fine-grained classifiers.
            result["Racism"] = racism_dict[predict_other(cleaned,racism_model,device,other_tokenizer)]
            result["Verbal Abuse"] = verbalabuse_dict[predict_other(cleaned,verbalabuse_model,device,other_tokenizer)]
            result["Religion Hate"] = religionhate_dict[predict_other(cleaned,religionhate_model,device,other_tokenizer)]

        return result
###############################################

from geopy.geocoders import Nominatim
import numpy as np
import pandas as pd

geolocator = Nominatim(user_agent="NLP")

def geolocate(country):
    """Return (latitude, longitude) for *country*, or np.nan on failure.

    Uses the module-level Nominatim geolocator. `geocode` returns None for
    unknown names (previously surfacing as an AttributeError swallowed by a
    bare `except:`); network/service errors are kept best-effort.
    """
    try:
        # Geolocate the center of the country
        loc = geolocator.geocode(country)
    except Exception:  # narrowed from bare except; lookup stays best-effort
        return np.nan
    if loc is None:
        # Name not found — report the same missing value as an error
        return np.nan
    # And return latitude and longitude
    return (loc.latitude, loc.longitude)

# ---- Streamlit app: header and session-state initialisation ----

st.title("Arabic Hate Speech Detection")

st.write("This app detects hate speech in Arabic dialect text")

st.write("Please enter your text below")


# 'Loaded' survives Streamlit reruns, so the heavy model loading below runs
# only once per session.
if 'Loaded' not in st.session_state:
    st.markdown('### Loading models ...')
    st.session_state['Loaded'] = False
else:
    # Fix: do NOT force the flag to True here. If a previous run crashed
    # while loading, 'Loaded' is still False and the loading block below
    # must run again; forcing True would skip it and KeyError later.
    print('Model already loaded')
    

# Load every model/tokenizer from disk once and cache them in
# st.session_state so widget-triggered reruns skip this work.
if st.session_state['Loaded'] == False:

    # Offensiveness detection model (binary AraBERT classifier)

    offensive_model = BertClassifier()
    # map_location forces CPU deserialization of (possibly GPU-trained) weights
    offensive_model.load_state_dict(torch.load(os.path.join(parent_path,'models/modelv3.pt'), map_location=torch.device('cpu')))
    offensive_tokenizer = BertTokenizer.from_pretrained('aubmindlab/bert-base-arabertv02', do_lower_case=True)

    # send model to device (CPU)

    offensive_model = offensive_model.to(device)
    st.session_state['Offensive_model'] = offensive_model
    st.session_state['Offensive_tokenizer'] = offensive_tokenizer
    print('Offensive model loaded')
    off_dictionary = {1: 'offensive', 0: 'non_offensive'}
    st.session_state['Offensive_dictionary'] = off_dictionary

    ##############################################################################################################################

    # Other four models (racism / religion hate / verbal abuse / misogyny)
    # all share one bert-medium-arabic tokenizer.

    other_tokenizer =  AutoTokenizer.from_pretrained("asafaya/bert-medium-arabic")
    st.session_state['Other_tokenizer'] = other_tokenizer

    racism_model,religionhate_model,verbalabuse_model,misogyny_model = MediumBert(),MediumBert(),MediumBert(),MediumBert()
    ################################################################

    racism_model.load_state_dict(torch.load(os.path.join(parent_path,'models/racism/racism_arabert.pt'), map_location=torch.device('cpu')))
    racism_dict = {0: 'non_racist', 1: 'racist'}

    racism_model = racism_model.to(device)

    st.session_state['Racism_model'] = racism_model
    st.session_state['Racism_dictionary'] = racism_dict

    print('Racism model loaded')
    ################################################################

    religionhate_model.load_state_dict(torch.load(os.path.join(parent_path,'models/religion_hate/religion_hate_params.pt'), map_location=torch.device('cpu')))
    # NOTE(review): here index 0 maps to the *positive* class ('Religion Hate')
    # while racism_dict uses 1 for its positive class — verify this matches the
    # label encoding used at training time.
    religionhate_dict = {0: 'Religion Hate', 1: 'Not Religion Hate'}

    religionhate_model = religionhate_model.to(device)

    st.session_state['Religion_hate_model'] = religionhate_model
    st.session_state['Religion_hate_dictionary'] = religionhate_dict

    print('Religion Hate model loaded')
    ################################################################

    verbalabuse_model.load_state_dict(torch.load(os.path.join(parent_path,'models/verbal_abuse/verbal_abuse_arabert.pt'), map_location=torch.device('cpu')))
    # NOTE(review): 0 = positive class here as well — confirm against training labels.
    verbalabuse_dict = {0: 'Verbal Abuse', 1: 'Not Verbal Abuse'}

    verbalabuse_model=verbalabuse_model.to(device)

    st.session_state['Verbal_abuse_model'] = verbalabuse_model
    st.session_state['Verbal_abuse_dictionary'] = verbalabuse_dict

    print('Verbal Abuse model loaded')
    ################################################################

    misogyny_model.load_state_dict(torch.load(os.path.join(parent_path,'models/misogyny/misogyny.pt'), map_location=torch.device('cpu')))
    # NOTE(review): 0 = positive class here too — confirm against training labels.
    misogyny_dict = {0: 'misogyny', 1: 'non_misogyny'}

    misogyny_model=misogyny_model.to(device)

    st.session_state['Misogyny_model'] = misogyny_model
    st.session_state['Misogyny_dictionary'] = misogyny_dict


    print('Misogyny model loaded')
    ################################################################

    # Dialect detection model (DziriBERT, 10 output classes)

    dialect_model = Dialect_Detection(10)
    dialect_model.load_state_dict(torch.load(os.path.join(parent_path,'models/dialect_classifier.pt'), map_location=torch.device('cpu')))

    dialect_model = dialect_model.to(device)

    st.session_state['Dialect_model'] = dialect_model

    print('Dialect model loaded')

    tokenizer_dialect = BertTokenizerFast.from_pretrained('alger-ia/dziribert')

    st.session_state['Dialect_tokenizer'] = tokenizer_dialect

    # Class-index -> country mapping for the dialect model's 10 outputs
    dialect_dict = {0: 'lebanon', 1: 'egypt', 2: 'morocco', 3: 'tunisia', 4: 'algeria', 5: 'qatar', 6: 'iraq', 7: 'saudi arabia', 8: 'libya', 9: 'jordan'}

    st.session_state['Dialect_dictionary'] = dialect_dict

    # Mark loading complete so subsequent reruns skip this whole block
    st.session_state['Loaded'] = True

# ---- Prediction UI ----
text = st.text_area("Enter Text")

if st.button("Predict") and text != '':
    result = predict(text = text, device = device,
                    offensive_model= st.session_state['Offensive_model'],
                    offensive_tokenizer= st.session_state['Offensive_tokenizer'],
                    racism_model= st.session_state['Racism_model'],
                    misogyny_model=st.session_state['Misogyny_model'],
                    verbalabuse_model= st.session_state['Verbal_abuse_model'],
                    dialect_model=st.session_state['Dialect_model'],
                    religionhate_model=st.session_state['Religion_hate_model'],
                    tokenizer_dialect=st.session_state['Dialect_tokenizer'],
                    other_tokenizer=st.session_state['Other_tokenizer'],
                    off_dictionary=st.session_state['Offensive_dictionary'],
                    racism_dict=st.session_state['Racism_dictionary'],
                    misogyny_dict=st.session_state['Misogyny_dictionary'],
                    verbalabuse_dict=st.session_state['Verbal_abuse_dictionary'],
                    dialect_dict=st.session_state['Dialect_dictionary'],
                    religionhate_dict=st.session_state['Religion_hate_dictionary'])

    st.write(result)

    coords = geolocate(result['Dialect'])

    # Map with the predicted dialect's country highlighted.
    # Fix: geolocate returns np.nan on lookup failure, and subscripting a
    # float nan raised TypeError and crashed the app — guard before mapping.
    if isinstance(coords, tuple):
        location = pd.DataFrame({'lat': [coords[0]], 'lon': [coords[1]]})
        st.map(data= location , zoom=5)
    else:
        st.write('Could not locate the predicted dialect on the map')

elif text == '':
    st.write('Please enter text to predict')