# Source: Deployment/app.py, commit 6a42807 ("Update Deployment/app.py"), author RealKintaro
# Clear every public (non-underscore) name that is present in the module
# namespace at this point; underscore-prefixed names are left untouched.
keys = list(globals())
for o in keys:
    if o.startswith('_'):
        continue
    # Echo the name being dropped, then remove it from the globals
    print(o)
    del globals()[o]
# Import the project-local model classes and the text-cleaning helper
# (MediumBert comes from the file Bert_medium.py)
from Bert_medium import MediumBert
from Offensive_Bert import BertClassifier
from data_cleaning import cleaning_content
from Dialect_Bert import Dialect_Detection
import torch
# Device object that every model and input tensor in this script is moved to (CPU)
device = torch.device("cpu")
from transformers import BertTokenizer, AutoTokenizer, BertTokenizerFast
import streamlit as st
# File paths: path_file is the directory containing this script and parent_path
# is the directory above it; the model weight files are resolved against parent_path
import os
path_file = os.path.dirname(os.path.abspath(__file__))
parent_path = os.path.dirname(path_file)
##########################FUNCTIONS########################
def _predict_label(review_text, model, device, tokenizer, max_length):
    """Tokenize ``review_text``, run ``model`` on it and return the arg-max index.

    Shared implementation behind ``predict_off``, ``predict_other`` and
    ``predict_dialect``, which differ only in the token limit they apply.

    Args:
        review_text: Text to classify.
        model: Callable taking ``(input_ids, attention_mask)`` and returning a
            tensor of scores.
        device: Torch device the input tensors are moved to before the call.
        tokenizer: Tokenizer exposing ``encode_plus``.
        max_length: Maximum number of tokens kept from ``review_text``.

    Returns:
        The index of the largest value in the model output, as a numpy integer
        (the arg-max is taken over the flattened output).
    """
    encoded_review = tokenizer.encode_plus(
        review_text,
        max_length=max_length,
        # max_length only takes effect when truncation is enabled; with
        # padding='longest' alone the limit is ignored and over-long inputs
        # reach the model untruncated.
        truncation=True,
        add_special_tokens=True,
        return_token_type_ids=False,
        padding='longest',
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = encoded_review['input_ids'].to(device)
    attention_mask = encoded_review['attention_mask'].to(device)
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        output = model(input_ids, attention_mask)
    return output.detach().cpu().numpy().argmax()


def predict_off(review_text, model, device, tokenizer):
    """Return the predicted class index of the offensiveness model.

    Inputs are truncated to 256 tokens. See ``_predict_label`` for the
    parameter and return contract.
    """
    return _predict_label(review_text, model, device, tokenizer, max_length=256)


def predict_other(review_text, model, device, tokenizer):
    """Return the predicted class index of one of the category models.

    Used for the racism, misogyny, verbal-abuse and religion-hate models.
    Inputs are truncated to 217 tokens. See ``_predict_label`` for the
    parameter and return contract.
    """
    return _predict_label(review_text, model, device, tokenizer, max_length=217)


def predict_dialect(review_text, model, device, tokenizer):
    """Return the predicted class index of the dialect model.

    Inputs are truncated to 123 tokens. See ``_predict_label`` for the
    parameter and return contract.
    """
    return _predict_label(review_text, model, device, tokenizer, max_length=123)
# Main prediction function
def predict(text,device,offensive_model,offensive_tokenizer,racism_model,misogyny_model,verbalabuse_model,dialect_model,religionhate_model,tokenizer_dialect,other_tokenizer,off_dictionary,racism_dict,misogyny_dict,verbalabuse_dict,dialect_dict,religionhate_dict):
    """Run the full classification pipeline on one piece of text.

    The text is cleaned with ``cleaning_content`` and scored by the offensive
    model first. Misogyny and dialect are always predicted. The racism,
    verbal-abuse and religion-hate models are only run when the text is
    labelled 'offensive'; otherwise those three fields get fixed negative
    labels.

    Each ``*_dict`` argument maps a predicted class index to its label.

    Returns:
        A dict with the keys "Offensiveness", "Dialect", "Misogyny", "Racism",
        "Verbal Abuse" and "Religion Hate".
    """
    cleaned = cleaning_content(text)

    def category_label(model, labels):
        # The four category models share one tokenizer.
        return labels[predict_other(cleaned, model, device, other_tokenizer)]

    def dialect_label():
        return dialect_dict[predict_dialect(cleaned, dialect_model, device, tokenizer_dialect)]

    offensiveness = off_dictionary[predict_off(cleaned, offensive_model, device, offensive_tokenizer)]

    if offensiveness != 'offensive':
        # Non-offensive text: only misogyny and dialect are predicted.
        misogyny = category_label(misogyny_model, misogyny_dict)
        dialect = dialect_label()
        return {
            "Offensiveness": offensiveness,
            "Dialect": dialect,
            "Misogyny": misogyny,
            "Racism": "Not_Racism",
            "Verbal Abuse": "Not Verbal Abuse",
            "Religion Hate": "Not Religion Hate",
        }

    # Offensive text: query every model, in the same order as before.
    racism = category_label(racism_model, racism_dict)
    misogyny = category_label(misogyny_model, misogyny_dict)
    verbal_abuse = category_label(verbalabuse_model, verbalabuse_dict)
    dialect = dialect_label()
    religion_hate = category_label(religionhate_model, religionhate_dict)
    return {
        "Offensiveness": offensiveness,
        "Dialect": dialect,
        "Misogyny": misogyny,
        "Racism": racism,
        "Verbal Abuse": verbal_abuse,
        "Religion Hate": religion_hate,
    }
###############################################
from geopy.geocoders import Nominatim
import numpy as np
import pandas as pd
# Shared Nominatim geocoder client, created once with the user agent "NLP";
# geolocate() below sends its lookups through it.
geolocator = Nominatim(user_agent="NLP")
def geolocate(country):
    """Return ``(latitude, longitude)`` for ``country``, or ``np.nan`` on failure.

    The lookup goes through the module-level ``geolocator``. This is a
    best-effort helper: a failed lookup never raises, it yields the missing
    value ``np.nan`` instead, so callers must check the result before
    indexing it.

    Args:
        country: Query string passed to ``geolocator.geocode``.

    Returns:
        A ``(latitude, longitude)`` tuple, or ``np.nan`` when the geocoder
        raises an error or returns no match.
    """
    try:
        # Geolocate the center of the country
        loc = geolocator.geocode(country)
    except Exception:
        # Lookup errors become a missing value; unlike a bare ``except:``,
        # KeyboardInterrupt and SystemExit still propagate.
        return np.nan
    if loc is None:
        # The geocoder found no match for the query.
        return np.nan
    return (loc.latitude, loc.longitude)
# Streamlit app
st.title("Arabic Hate Speech Detection")
st.write("This app detects hate speech in Arabic dialect text")
st.write("Please enter your text below")


def _load_weights(model, relative_path):
    """Load a checkpoint into ``model`` and prepare it for inference.

    Args:
        model: Freshly constructed model instance to receive the weights.
        relative_path: Checkpoint path relative to ``parent_path``.

    Returns:
        The model, moved to ``device`` and switched to evaluation mode.
    """
    state_dict = torch.load(os.path.join(parent_path, relative_path), map_location=device)
    model.load_state_dict(state_dict)
    model = model.to(device)
    # Modules start in training mode; switch to eval so layers such as dropout
    # behave deterministically at inference time.
    model.eval()
    return model


# Session state: models, tokenizers and label dictionaries are loaded once and
# kept in st.session_state. The 'Loaded' flag is only set to True after every
# item has been stored, so a load that failed midway is retried on the next run
# instead of being treated as complete.
models_ready = 'Loaded' in st.session_state and st.session_state['Loaded']
if models_ready:
    print('Model already loaded')
else:
    st.markdown('### Loading models ...')
    st.session_state['Loaded'] = False

    # NOTE(review): the label dictionaries below do not share one index
    # orientation (e.g. racism uses 1 for the positive class, religion hate
    # uses 0) - presumably this mirrors how each model was trained; confirm.

    # Offensiveness detection model
    offensive_model = _load_weights(BertClassifier(), 'models/modelv3.pt')
    offensive_tokenizer = BertTokenizer.from_pretrained('aubmindlab/bert-base-arabertv02', do_lower_case=True)
    st.session_state['Offensive_model'] = offensive_model
    st.session_state['Offensive_tokenizer'] = offensive_tokenizer
    print('Offensive model loaded')
    off_dictionary = {1: 'offensive', 0: 'non_offensive'}
    st.session_state['Offensive_dictionary'] = off_dictionary

    # Other four models share one tokenizer
    other_tokenizer = AutoTokenizer.from_pretrained("asafaya/bert-medium-arabic")
    st.session_state['Other_tokenizer'] = other_tokenizer

    racism_model = _load_weights(MediumBert(), 'models/racism/racism_arabert.pt')
    racism_dict = {0: 'non_racist', 1: 'racist'}
    st.session_state['Racism_model'] = racism_model
    st.session_state['Racism_dictionary'] = racism_dict
    print('Racism model loaded')

    religionhate_model = _load_weights(MediumBert(), 'models/religion_hate/religion_hate_params.pt')
    religionhate_dict = {0: 'Religion Hate', 1: 'Not Religion Hate'}
    st.session_state['Religion_hate_model'] = religionhate_model
    st.session_state['Religion_hate_dictionary'] = religionhate_dict
    print('Religion Hate model loaded')

    verbalabuse_model = _load_weights(MediumBert(), 'models/verbal_abuse/verbal_abuse_arabert.pt')
    verbalabuse_dict = {0: 'Verbal Abuse', 1: 'Not Verbal Abuse'}
    st.session_state['Verbal_abuse_model'] = verbalabuse_model
    st.session_state['Verbal_abuse_dictionary'] = verbalabuse_dict
    print('Verbal Abuse model loaded')

    misogyny_model = _load_weights(MediumBert(), 'models/misogyny/misogyny.pt')
    misogyny_dict = {0: 'misogyny', 1: 'non_misogyny'}
    st.session_state['Misogyny_model'] = misogyny_model
    st.session_state['Misogyny_dictionary'] = misogyny_dict
    print('Misogyny model loaded')

    # Dialect detection model
    dialect_model = _load_weights(Dialect_Detection(10), 'models/dialect_classifier.pt')
    st.session_state['Dialect_model'] = dialect_model
    print('Dialect model loaded')
    tokenizer_dialect = BertTokenizerFast.from_pretrained('alger-ia/dziribert')
    st.session_state['Dialect_tokenizer'] = tokenizer_dialect
    dialect_dict = {0: 'lebanon', 1: 'egypt', 2: 'morocco', 3: 'tunisia', 4: 'algeria', 5: 'qatar', 6: 'iraq', 7: 'saudi arabia', 8: 'libya', 9: 'jordan'}
    st.session_state['Dialect_dictionary'] = dialect_dict

    # Everything is stored: mark the session as ready.
    st.session_state['Loaded'] = True

text = st.text_area("Enter Text")
if st.button("Predict") and text != '':
    result = predict(text = text, device = device,
                     offensive_model= st.session_state['Offensive_model'],
                     offensive_tokenizer= st.session_state['Offensive_tokenizer'],
                     racism_model= st.session_state['Racism_model'],
                     misogyny_model=st.session_state['Misogyny_model'],
                     verbalabuse_model= st.session_state['Verbal_abuse_model'],
                     dialect_model=st.session_state['Dialect_model'],
                     religionhate_model=st.session_state['Religion_hate_model'],
                     tokenizer_dialect=st.session_state['Dialect_tokenizer'],
                     other_tokenizer=st.session_state['Other_tokenizer'],
                     off_dictionary=st.session_state['Offensive_dictionary'],
                     racism_dict=st.session_state['Racism_dictionary'],
                     misogyny_dict=st.session_state['Misogyny_dictionary'],
                     verbalabuse_dict=st.session_state['Verbal_abuse_dictionary'],
                     dialect_dict=st.session_state['Dialect_dictionary'],
                     religionhate_dict=st.session_state['Religion_hate_dictionary'])
    st.write(result)
    location = geolocate(result['Dialect'])
    # geolocate returns np.nan instead of a (lat, lon) tuple when the lookup
    # fails; only draw the map when real coordinates came back.
    if isinstance(location, tuple):
        # Map centred on the country of the predicted dialect
        location = pd.DataFrame({'lat': [location[0]], 'lon': [location[1]]})
        st.map(data= location , zoom=5)
    else:
        st.write('Could not locate the predicted dialect on the map')
elif text == '':
    st.write('Please enter text to predict')