|
|
|
|
|
# Drop every global not starting with an underscore so each Streamlit
# rerun starts from a clean namespace before the imports below run.
keys = list(globals().keys())

for o in keys:
    if not o.startswith('_'):
        print(o)
        del globals()[o]
|
|
|
|
|
|
|
import os

import numpy as np
import pandas as pd
import torch
import streamlit as st
from transformers import BertTokenizer, AutoTokenizer, BertTokenizerFast
from geopy.geocoders import Nominatim

from Bert_medium import MediumBert
from Offensive_Bert import BertClassifier
from data_cleaning import cleaning_content
from Dialect_Bert import Dialect_Detection

# All inference runs on CPU.
device = torch.device("cpu")

# Resolve the repository root relative to this file so the model
# checkpoints load regardless of the current working directory.
path_file = os.path.dirname(os.path.abspath(__file__))
parent_path = os.path.dirname(path_file)
|
|
|
|
|
|
|
def predict_class(review_text, model, device, tokenizer, max_length):
    """Tokenize a single text and return the model's predicted class index."""
    encoded_review = tokenizer.encode_plus(
        review_text,
        max_length=max_length,
        truncation=True,
        add_special_tokens=True,
        return_token_type_ids=False,
        padding='longest',
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoded_review['input_ids'].to(device)
    attention_mask = encoded_review['attention_mask'].to(device)

    # Inference only: disable dropout and gradient tracking.
    model.eval()
    with torch.no_grad():
        output = model(input_ids, attention_mask)

    # The index of the highest logit is the predicted class.
    _, prediction = torch.max(output, dim=1)
    return prediction.item()


def predict_off(review_text, model, device, tokenizer):
    # Offensiveness model: max sequence length 256.
    return predict_class(review_text, model, device, tokenizer, max_length=256)


def predict_other(review_text, model, device, tokenizer):
    # Shared by the racism, misogyny, verbal-abuse and religion-hate models.
    return predict_class(review_text, model, device, tokenizer, max_length=217)


def predict_dialect(review_text, model, device, tokenizer):
    # Dialect model: max sequence length 123.
    return predict_class(review_text, model, device, tokenizer, max_length=123)
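
# Usage sketch (the model, tokenizer and label-dictionary objects are
# created in the Streamlit loading section below):
#   idx = predict_off(text, offensive_model, device, offensive_tokenizer)
#   label = off_dictionary[idx]   # 'offensive' or 'non_offensive'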
|
|
|
|
|
|
|
|
|
def predict(text, device,
            offensive_model, offensive_tokenizer,
            racism_model, misogyny_model, verbalabuse_model,
            dialect_model, religionhate_model,
            tokenizer_dialect, other_tokenizer,
            off_dictionary, racism_dict, misogyny_dict,
            verbalabuse_dict, dialect_dict, religionhate_dict):
    """Clean the text, gate on offensiveness, then run the fine-grained classifiers."""
    text = cleaning_content(text)

    off_pred = off_dictionary[predict_off(text, offensive_model, device, offensive_tokenizer)]

    if off_pred == 'offensive':
        # Offensive text: run every fine-grained classifier.
        rac_pred = racism_dict[predict_other(text, racism_model, device, other_tokenizer)]
        misog_pred = misogyny_dict[predict_other(text, misogyny_model, device, other_tokenizer)]
        ver_pred = verbalabuse_dict[predict_other(text, verbalabuse_model, device, other_tokenizer)]
        dialect_pred = dialect_dict[predict_dialect(text, dialect_model, device, tokenizer_dialect)]
        religion_hate_pred = religionhate_dict[predict_other(text, religionhate_model, device, other_tokenizer)]

        return {"Offensiveness": off_pred, "Dialect": dialect_pred,
                "Misogyny": misog_pred, "Racism": rac_pred,
                "Verbal Abuse": ver_pred, "Religion Hate": religion_hate_pred}

    # Non-offensive text: dialect and misogyny are still predicted, while
    # the remaining labels default to their negative class.
    misog_pred = misogyny_dict[predict_other(text, misogyny_model, device, other_tokenizer)]
    dialect_pred = dialect_dict[predict_dialect(text, dialect_model, device, tokenizer_dialect)]

    return {"Offensiveness": off_pred, "Dialect": dialect_pred,
            "Misogyny": misog_pred, "Racism": "non_racist",
            "Verbal Abuse": "Not Verbal Abuse", "Religion Hate": "Not Religion Hate"}
|
|
|
|
|
geolocator = Nominatim(user_agent="NLP")


def geolocate(country):
    """Return (latitude, longitude) for a country name, or np.nan on failure."""
    try:
        loc = geolocator.geocode(country)
        return (loc.latitude, loc.longitude)
    except Exception:
        # Covers failed lookups (loc is None) and network/timeout errors.
        return np.nan
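
# Usage: geolocate("egypt") returns a (latitude, longitude) tuple on
# success, or np.nan if Nominatim cannot resolve the name.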
|
|
|
|
|
|
|
st.title("Arabic Hate Speech Detection")

st.write("This app detects hate speech in dialectal Arabic text.")

st.write("Please enter your text below.")
|
|
|
|
|
|
|
if 'Loaded' not in st.session_state:
    # First visit: flag that the models still need to be loaded.
    st.markdown('### Loading models ...')
    st.session_state['Loaded'] = False
else:
    print('Models already loaded')
    st.session_state['Loaded'] = True
|
|
|
|
|
if not st.session_state['Loaded']:
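
    # Load every model and tokenizer once and cache it in st.session_state
    # so later Streamlit reruns skip the expensive reloads.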
|
|
|
|
|
|
|
offensive_model = BertClassifier() |
|
offensive_model.load_state_dict(torch.load(os.path.join(parent_path,'models/modelv3.pt'), map_location=torch.device('cpu'))) |
|
offensive_tokenizer = BertTokenizer.from_pretrained('aubmindlab/bert-base-arabertv02', do_lower_case=True) |
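    # Note: do_lower_case has no effect on Arabic script.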
|
|
|
|
|
|
|
offensive_model = offensive_model.to(device) |
|
st.session_state['Offensive_model'] = offensive_model |
|
st.session_state['Offensive_tokenizer'] = offensive_tokenizer |
|
print('Offensive model loaded') |
|
off_dictionary = {1: 'offensive', 0: 'non_offensive'} |
|
st.session_state['Offensive_dictionary'] = off_dictionary |
|
|
|
|
|
|
|
|
|
|
|
other_tokenizer = AutoTokenizer.from_pretrained("asafaya/bert-medium-arabic") |
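    # All four fine-grained classifiers share this bert-medium-arabic tokenizer.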
|
st.session_state['Other_tokenizer'] = other_tokenizer |
|
|
|
    # The four fine-grained classifiers share the MediumBert architecture.
    racism_model, religionhate_model, verbalabuse_model, misogyny_model = MediumBert(), MediumBert(), MediumBert(), MediumBert()
|
|
|
|
|
racism_model.load_state_dict(torch.load(os.path.join(parent_path,'models/racism/racism_arabert.pt'), map_location=torch.device('cpu'))) |
|
racism_dict = {0: 'non_racist', 1: 'racist'} |
|
|
|
racism_model = racism_model.to(device) |
|
|
|
st.session_state['Racism_model'] = racism_model |
|
st.session_state['Racism_dictionary'] = racism_dict |
|
|
|
print('Racism model loaded') |
|
|
|
|
|
religionhate_model.load_state_dict(torch.load(os.path.join(parent_path,'models/religion_hate/religion_hate_params.pt'), map_location=torch.device('cpu'))) |
|
religionhate_dict = {0: 'Religion Hate', 1: 'Not Religion Hate'} |
|
|
|
religionhate_model = religionhate_model.to(device) |
|
|
|
st.session_state['Religion_hate_model'] = religionhate_model |
|
st.session_state['Religion_hate_dictionary'] = religionhate_dict |
|
|
|
print('Religion Hate model loaded') |
|
|
|
|
|
verbalabuse_model.load_state_dict(torch.load(os.path.join(parent_path,'models/verbal_abuse/verbal_abuse_arabert.pt'), map_location=torch.device('cpu'))) |
|
verbalabuse_dict = {0: 'Verbal Abuse', 1: 'Not Verbal Abuse'} |
|
|
|
    verbalabuse_model = verbalabuse_model.to(device)
|
|
|
st.session_state['Verbal_abuse_model'] = verbalabuse_model |
|
st.session_state['Verbal_abuse_dictionary'] = verbalabuse_dict |
|
|
|
print('Verbal Abuse model loaded') |
|
|
|
|
|
misogyny_model.load_state_dict(torch.load(os.path.join(parent_path,'models/misogyny/misogyny.pt'), map_location=torch.device('cpu'))) |
|
misogyny_dict = {0: 'misogyny', 1: 'non_misogyny'} |
|
|
|
    misogyny_model = misogyny_model.to(device)
|
|
|
st.session_state['Misogyny_model'] = misogyny_model |
|
st.session_state['Misogyny_dictionary'] = misogyny_dict |
|
|
|
|
|
print('Misogyny model loaded') |
|
|
|
|
|
|
|
|
|
dialect_model = Dialect_Detection(10) |
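    # The constructor argument (10) is the number of dialect classes; it
    # must match the dialect_dict mapping defined below.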
|
dialect_model.load_state_dict(torch.load(os.path.join(parent_path,'models/dialect_classifier.pt'), map_location=torch.device('cpu'))) |
|
|
|
dialect_model = dialect_model.to(device) |
|
|
|
st.session_state['Dialect_model'] = dialect_model |
|
|
|
print('Dialect model loaded') |
|
|
|
tokenizer_dialect = BertTokenizerFast.from_pretrained('alger-ia/dziribert') |
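    # The dialect classifier expects inputs tokenized with DziriBERT's tokenizer.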
|
|
|
st.session_state['Dialect_tokenizer'] = tokenizer_dialect |
|
|
|
|
|
dialect_dict = {0: 'lebanon', 1: 'egypt', 2: 'morocco', 3: 'tunisia', 4: 'algeria', 5: 'qatar', 6: 'iraq', 7: 'saudi arabia', 8: 'libya', 9: 'jordan'} |
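    # Index -> country mapping; the order must match the label encoding
    # used when the dialect classifier was trained.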
|
|
|
st.session_state['Dialect_dictionary'] = dialect_dict |
|
|
|
st.session_state['Loaded'] = True |
|
|
|
text = st.text_area("Enter Text") |
|
|
|
if st.button("Predict") and text != '':
    result = predict(text=text, device=device,
                     offensive_model=st.session_state['Offensive_model'],
                     offensive_tokenizer=st.session_state['Offensive_tokenizer'],
                     racism_model=st.session_state['Racism_model'],
                     misogyny_model=st.session_state['Misogyny_model'],
                     verbalabuse_model=st.session_state['Verbal_abuse_model'],
                     dialect_model=st.session_state['Dialect_model'],
                     religionhate_model=st.session_state['Religion_hate_model'],
                     tokenizer_dialect=st.session_state['Dialect_tokenizer'],
                     other_tokenizer=st.session_state['Other_tokenizer'],
                     off_dictionary=st.session_state['Offensive_dictionary'],
                     racism_dict=st.session_state['Racism_dictionary'],
                     misogyny_dict=st.session_state['Misogyny_dictionary'],
                     verbalabuse_dict=st.session_state['Verbal_abuse_dictionary'],
                     dialect_dict=st.session_state['Dialect_dictionary'],
                     religionhate_dict=st.session_state['Religion_hate_dictionary'])

    st.write(result)

    # Map the predicted dialect's country. geolocate returns np.nan on
    # failure, so only plot when coordinates actually came back.
    location = geolocate(result['Dialect'])

    if isinstance(location, tuple):
        location = pd.DataFrame({'lat': [location[0]], 'lon': [location[1]]})
        st.map(data=location, zoom=5)

elif text == '':
    st.write('Please enter text to predict')
|
|