Create app.py
app.py
ADDED
@@ -0,0 +1,482 @@
import torch
import torch.nn as nn
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
from tqdm import tqdm  # tqdm.notebook only renders inside Jupyter; the plain tqdm works in a script
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from rouge_score import rouge_scorer
from rouge import Rouge
import streamlit as st
from ydata_profiling import ProfileReport
from streamlit_pandas_profiling import st_profile_report

import time
import io
import os
import pprint
from IPython.display import HTML
import traceback
import logging
import random
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf

st.set_page_config(page_title="Review Summary App", page_icon=None, layout="centered", initial_sidebar_state="auto", menu_items=None)
# st.set_page_config(layout="wide")
st.title("Review Summarizer App")
st.write("This app summarises all the reviews of a product")

@st.cache_resource  # (allow_output_mutation=True) belonged to the older st.cache API
def load_roberta_model_and_tokenizer(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    return tokenizer, model

@st.cache_resource
def load_pegasus_model_and_tokenizer(model_name):
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name)
    return tokenizer, model

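# Both loaders are cached with st.cache_resource, so the large checkpoints are
# downloaded and instantiated once per process and reused across Streamlit reruns.
# A minimal usage sketch (the device handling shown here is an assumption; the
# real call sites appear in Step 3 below):
#   tokenizer, model = load_roberta_model_and_tokenizer("siebert/sentiment-roberta-large-english")
#   model.to("cuda" if torch.cuda.is_available() else "cpu")
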
# =========================================================
# Define a function to assign labels based on star rating
# =========================================================
def assign_star_label(row):
    return 'positive' if row['star_rating'] > 3 else 'negative'

def showEda(df):
    pr = ProfileReport(df, explorative=True)
    st.header('**Pandas Profiling Report**')
    st_profile_report(pr)


def dataset_load():
    with st.spinner("Importing modules................"):
        time.sleep(2)
    st.success("Imported Modules")


# ===================================================================================================================
# ================================================= UTILITY FUNCTIONS ===============================================
# ===================================================================================================================
with st.spinner("Initialising methods ............"):
    # ==================
    # Load & Clean Data
    # ==================
    @st.cache_data
    def data_load_clean_df():
        df = pd.read_csv('./amazon_reviews_us_Mobile_Electronics_v1_00.csv', on_bad_lines='skip')
        # df = df.loc[df['product_id'].isin(['B00J46XO9U'])]
        df = df[['customer_id', 'product_title', 'star_rating', 'review_body', 'product_id']]
        df = df[~df.duplicated(subset='review_body')]  # Remove duplicates (the filtered frame must be assigned back)
        df = df[df['star_rating'].isin(['1', '2', '3', '4', '5'])]  # Drop rows where date fields leaked into star_rating
        df['star_rating'] = df['star_rating'].astype('int64')  # Convert data type for star_rating
        df['star_rating_label'] = df.apply(assign_star_label, axis=1)  # Create the ground-truth label column
        df['review_body'] = df['review_body'].apply(lambda x: str(x))  # Convert text inputs to string
        df['review_body'] = df['review_body'].apply(lambda x: x[:512])  # Limit length of string
        return df.reset_index(drop=True)

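    # @st.cache_data memoises the cleaned frame, so the CSV is parsed once and
    # later reruns of the script reuse the cached result. With assign_star_label
    # above, 4- and 5-star reviews become 'positive' while 1-3 stars count as
    # 'negative'.
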
    # ======================
    # Assign Polarity Score
    # ======================
    def polarity_scores_roberta(review):
        encoded_text = roberta_tokenizer(review, return_tensors='pt').to(device)
        with torch.no_grad():
            output = roberta_model(**encoded_text)
        # scores = output[0][0].detach().numpy() # FOR CPU
        scores = output.logits.detach().cpu().numpy()  # Move logits from GPU to CPU
        scores = softmax(scores[0])  # Apply softmax once; applying it twice flattens the probabilities
        scores_dict = {
            'roberta_negative': scores[0],
            'roberta_positive': scores[1]
        }
        return scores_dict

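    # Assumption: index 0 of the logits is 'negative' and index 1 is 'positive'
    # for this checkpoint; roberta_model.config.id2label can confirm the ordering
    # before the dictionary keys above are trusted.
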
    # ==================
    # Summarising Text
    # ==================
    def text_summarizer(review):
        batch = pegasus_tokenizer(review, truncation=True, padding="longest", max_length=1024, return_tensors="pt").to(device)
        with torch.no_grad():
            translated = pegasus_model.generate(**batch)
            # translated = pegasus_model.module.generate(**batch)  # When wrapped in DataParallel
        tgt_text = pegasus_tokenizer.batch_decode(translated, skip_special_tokens=True)
        summary_dict = {"summary": tgt_text[0]}
        return summary_dict

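    # pegasus_model.generate(**batch) runs with the checkpoint's default decoding
    # settings (typically beam search with lengths taken from its generation
    # config); num_beams or max_length could be passed explicitly if the
    # summaries need tuning.
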
    # =================
    # Rouge Score Check
    # =================
    def rouge_score_viewer(original_text, generated_summary):
        # Create a Rouge object
        rouge = Rouge()

        # Calculate ROUGE scores of the summary against the original review
        scores = rouge.get_scores(generated_summary, original_text)

        # Return ROUGE scores
        return {"Rouge-1": scores[0]['rouge-1'], "Rouge-2": scores[0]['rouge-2'], "Rouge-L": scores[0]['rouge-l']}

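    # The rouge package returns a list with one dict per input pair; each metric
    # entry holds recall, precision and F1 under the keys 'r', 'p' and 'f', which
    # is why the analysis loop further below reads rouge_scores['Rouge-1']['f'].
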
    # =======================================================
    # Define a function to assign labels based on RoBERTa scores
    # =======================================================
    def assign_label(row):
        if row['roberta_positive'] > row['roberta_negative']:
            return 'positive'
        else:
            return 'negative'

    # =======================================================
    # Summarise a batch of summaries together
    # =======================================================
    @st.cache_data
    def data_summarizer(df, marker, summary_count):
        summaries = []
        marker = 'positive' if marker == 1 else 'negative'
        df_new = df[(df['star_rating_label'] == marker) & (df['roberta_rating_label'] == marker)]
        df_new = df_new[~df_new.duplicated(subset=["review_body", "summary"])]
        # Compare against the string marker: after the reassignment above, marker
        # is 'positive'/'negative', so a `marker == 1` check would always be False
        # and both branches would fall through to the negative sort.
        sort_col = 'roberta_positive' if marker == 'positive' else 'roberta_negative'
        sentence = df_new.sort_values([sort_col, 'Rouge_1', 'Rouge_2', 'Rouge_L'], ascending=[False, False, False, False])['summary'].reset_index(drop=True)
        print(sentence)
        print(f"Sentence len :{len(sentence)}")
        count = 0
        for i in range(0, len(sentence), 10):
            if count == summary_count:
                break
            chunk = sentence[i:i + 10]
            joined_sentence = ' '.join(chunk)
            print(f"JOINED SENTENCE :{joined_sentence}\n\n\n")
            summaries.append(text_summarizer(joined_sentence[:512])["summary"])
            count += 1
        print(f"SUMMARY IS:{summaries}\n")
        return summaries

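    # Worked example: with 35 matching summaries and summary_count=10, the loop
    # visits i = 0, 10, 20, 30, joins up to ten summaries per chunk, truncates
    # each joined chunk to 512 characters, and returns four Pegasus meta-summaries.
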
    # ==========================================================
    # Convert the array to a markdown string with bullet points
    # ==========================================================
    def bullet_markdown(array):
        return "\n".join(f"- {item}" for item in array)

    # ==========================================================
    # Get rows with same rating labels
    # ==========================================================
    def getMatchCols(df, value):
        marker = "positive" if value == 1 else "negative"
        df_new = df[(df['star_rating_label'] == marker) & (df['roberta_rating_label'] == marker)]
        if df_new.shape[0] > 0:
            # Sort by the confidence column that matches the requested sentiment
            sort_col = 'roberta_positive' if marker == 'positive' else 'roberta_negative'
            return df_new.sort_values([sort_col, 'Rouge_1', 'Rouge_2', 'Rouge_L'], ascending=[False, False, False, False])['review_body'].values
        else:
            return [f"No {marker} reviews available"]

# =========================================================================================================================
# ================================================= LOADING OF THE DATA ===================================================
# =========================================================================================================================

## Load & Clean Data
with st.spinner("Loading the data ............"):
    # st.header("Loaded Dataframe")
    df = data_load_clean_df()

loaded_df = df.copy()
# Controls whether the sidebar offers EDA for the full loaded DF only, or also for the selected product
ProductDataframeCheck = False

# TODO : Limit for demonstration only. Fewer rows to be analysed later
# df = df.groupby('product_id').filter(lambda x: (len(x) <= 5)).reset_index(drop=True)
st.header("The Dataframe loaded is shown below :")
with st.spinner("Loading the data ............"):
    st.dataframe(df)
# =========================================================================================================================
# ================================================= LIST OF ALL PRODUCTS ==================================================
# =========================================================================================================================
with st.spinner("Loading list of products ............"):
    time.sleep(2)
    prod_ids = df['product_id'].unique()

# =========================================================================================================================
# ================================================= CHOOSE A PRODUCT ======================================================
# =========================================================================================================================

# Slider to pick how many reviews the chosen product should have
st.markdown("---")
st.subheader("Step 0 : Choose a product")

# Group the dataframe by product_id and count the number of rows for each product_id
grouped_df = df.groupby("product_id").size().reset_index(name="count")

# st.dataframe(grouped_df)

# Find the maximum number of reviews any single product has
max_rows = grouped_df["count"].max()

# Create a slider whose options are the review counts that actually occur
# slider_value = st.slider("Select the number of rows", min_value=1, max_value=max_rows)
slider_value = st.select_slider("Select the number of rows", options=sorted(grouped_df['count'].unique()), value=max_rows)

# Filter the grouped dataframe by the slider value and get the product_id column as a list
filtered_df = grouped_df[grouped_df["count"] == slider_value]["product_id"].tolist()

# Create a select box with the filtered list of product_id
st.write(f"There are {len(filtered_df)} products with {slider_value} rows")
selected_product_id = st.selectbox("Select the product_id", filtered_df)


preview_df = df.loc[df['product_id'] == selected_product_id].reset_index(drop=True)

if not preview_df.empty:
    prod_name = preview_df['product_title'][0]

    # Display the selected product's details
    st.markdown("---")
    st.subheader("Step 1 : Product Details :")
    st.write(f'Product Name : {prod_name}')
    st.write(f'Product ID : {selected_product_id} ')
    st.write(f'Total Rows : {preview_df.shape[0]}')

# ================================================================
# Use the condition to control the display of the radio buttons
# ================================================================
if not preview_df.empty:
    ProductDataframeCheck = True

if not ProductDataframeCheck:
    option = st.sidebar.radio("Select an option", ["None", "Show EDA"])
else:
    option = st.sidebar.radio("Select an option", ["None", "Show EDA", "Product EDA"])

if option == "Show EDA":
    showEda(loaded_df)
elif option == "Product EDA":
    showEda(preview_df)


if st.button('Confirm Product'):
    df = df.loc[df['product_id'] == selected_product_id].reset_index(drop=True)
    st.markdown("---")
    st.subheader("Step 2 : Dataframe with chosen product :")
    st.dataframe(df)
    # st.success(f"Dataframe loaded with product_id:{selected_product_id}")
    # st.write(f"Selected product is {selected_product_id}, named as \"{df['product_title']}\" with dataframe having {df.shape[0]} rows")

    df_rows = df.shape[0]

    # =========================================================================================================================
    # ================================================ PRE-TRAINED MODEL ======================================================
    # =========================================================================================================================
    st.markdown("---")
    st.subheader("Step 3 : Initialising the models & running operation")
    with st.spinner("Selecting compute device ............"):
        device = "cuda" if torch.cuda.is_available() else "cpu"
        st.write(f"Selected device for processing is (CPU/GPU) : {device.upper()}")

    # ROBERTA Model
    with st.spinner("Initializing RoBERTa Model ............"):
        # roberta_model_name = "siebert/sentiment-roberta-large-english"
        # roberta_tokenizer = AutoTokenizer.from_pretrained(roberta_model_name)
        # roberta_model = AutoModelForSequenceClassification.from_pretrained(roberta_model_name).to(device)

        roberta_model_name = "siebert/sentiment-roberta-large-english"
        roberta_tokenizer, roberta_model = load_roberta_model_and_tokenizer(roberta_model_name)
        roberta_model.to(device)

    # PEGASUS Model
    with st.spinner("Initializing Pegasus Model ............"):
        # pegasus_model_name = "google/pegasus-large"
        # pegasus_tokenizer = PegasusTokenizer.from_pretrained(pegasus_model_name)
        # pegasus_model = PegasusForConditionalGeneration.from_pretrained(pegasus_model_name).to(device)
        pegasus_model_name = "google/pegasus-large"
        pegasus_tokenizer, pegasus_model = load_pegasus_model_and_tokenizer(pegasus_model_name)
        pegasus_model.to(device)

    st.success("Models successfully loaded")
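    # Because the loaders are cached, .to(device) mutates the single cached
    # instance in place; on subsequent reruns the models come back already on
    # the chosen device, so the move is effectively a one-time cost.
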
    # =========================================================================================================================
    # ================================================ RUN MODEL ON DATA ======================================================
    # =========================================================================================================================

    # Sentiment Analysis & Text Summarisation

    res = {}
    summaries = {}
    rouge_1 = {}
    rouge_2 = {}
    rouge_L = {}
    broken_ids = []

    with st.spinner("Operation in progress ............"):

        progress_bar_analysis = st.progress(0, text="Please wait......... 0%")

        progress_percent = 0
        progress_text = f"Please wait......... {float(progress_percent):.2f}%"

        for i, row in tqdm(df.iterrows(), total=len(df)):

            progress_percent = (i / len(df)) * 100
            progress_text = f"Please wait......... {progress_percent:.2f}%"
            progress_bar_analysis.progress(min(int(progress_percent + 1), 100), text=progress_text)

            # Run sentiment analysis
            text = row['review_body']
            myid = row['customer_id']

            roberta_result = polarity_scores_roberta(text)
            res[myid] = {**roberta_result}

            # Generate the per-review summary
            summary_result = text_summarizer(text)
            summaries[myid] = {**summary_result}

            # ROUGE score of the summary against the review
            original_text = row['review_body']
            generated_summary = summary_result['summary']
            rouge_scores = rouge_score_viewer(original_text, generated_summary)
            rouge_1[myid] = {"rouge-1": rouge_scores['Rouge-1']['f']}
            rouge_2[myid] = {"rouge-2": rouge_scores['Rouge-2']['f']}
            rouge_L[myid] = {"rouge-L": rouge_scores['Rouge-L']['f']}
        progress_bar_analysis.progress(100, text="Completed......... 100%")
    st.success("Operation Completed")

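    # df.iterrows() yields the frame's index as i; df was reset_index()-ed after
    # the product filter, so i runs 0..len(df)-1 and the percentage arithmetic
    # above stays in range.
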
    with st.spinner("Merging in progress ............"):

        # Merge the per-review results into one dataframe
        results_df = pd.DataFrame(res).T
        results_df['summary'] = (pd.DataFrame(summaries).T)['summary'].values  # Add summary column
        results_df['Rouge_1'] = pd.DataFrame(rouge_1).T.values
        results_df['Rouge_2'] = pd.DataFrame(rouge_2).T.values
        results_df['Rouge_L'] = pd.DataFrame(rouge_L).T.values
        results_df = results_df.reset_index().rename(columns={'index': 'customer_id'})
        results_df = results_df.merge(df, how='left')

        results_df['roberta_rating_label'] = results_df.apply(assign_label, axis=1)  # Create the predicted-label column
        st.markdown("---")
        st.subheader("Step 4 : Dataframe after operation")
        # st.dataframe(results_df)
        # st.success("Merge Completed")

    with st.spinner("Matching Columns in progress ............"):
        # prod_a = results_df.loc[results_df['product_id']=='B00J46XO9U']
        prod_a = results_df.copy()
        prod_a = prod_a[prod_a['star_rating_label'] == prod_a['roberta_rating_label']]
        prod_a = prod_a.reset_index(drop=True)  # reset_index returns a new frame, so assign it back

        # st.success("Matching columns Completed")

        # st.header("Dataframe with matching labels")
        st.dataframe(prod_a)

    # =========================================================================================================================
    # ============================================= HISTOGRAM CHECK ======================================================
    # =========================================================================================================================

    # # Create a histogram using matplotlib
    # plt.figure(figsize=(8, 6))
    # plt.hist(prod_a['Rouge_1'], bins=30, alpha=0.7, color='blue')  # Adjust bins and color as needed
    # plt.title('Histogram of Random Data')
    # plt.xlabel('Values')
    # plt.ylabel('Frequency')
    # plt.grid(True)
    # plt.show()

    # # Create a histogram using matplotlib
    # plt.figure(figsize=(8, 6))
    # plt.hist(prod_a['Rouge_2'], bins=30, alpha=0.7, color='blue')  # Adjust bins and color as needed
    # plt.title('Histogram of Random Data')
    # plt.xlabel('Values')
    # plt.ylabel('Frequency')
    # plt.grid(True)
    # plt.show()

    # # Create a histogram using matplotlib
    # plt.figure(figsize=(8, 6))
    # plt.hist(prod_a['Rouge_L'], bins=30, alpha=0.7, color='blue')  # Adjust bins and color as needed
    # plt.title('Histogram of Random Data')
    # plt.xlabel('Values')
    # plt.ylabel('Frequency')
    # plt.grid(True)
    # plt.show()

    # =========================================================================================================================
    # ============================================= CHECKING THE METRICS ======================================================
    # =========================================================================================================================

    # RUN only if NUMBER OF ROWS > 4
    if df_rows > 4:
        with st.spinner("Creating confusion matrix ............"):
            st.markdown("---")
            st.subheader("Step 5. - Confusion Matrix")
            conf_df = results_df.copy()
            actual_labels = conf_df['star_rating_label']
            predicted_labels = conf_df['roberta_rating_label']

            # Create the confusion matrix; pinning the label order keeps the
            # ravel() unpacking below valid
            cm_a = confusion_matrix(actual_labels, predicted_labels, labels=['negative', 'positive'])

            # Display the confusion matrix using seaborn; passing an explicit
            # figure avoids the deprecated global-pyplot behaviour
            fig, ax = plt.subplots()
            sns.heatmap(cm_a, annot=True, fmt='d', ax=ax)
            st.pyplot(fig)

            # Extract true negatives, false positives, false negatives, true positives
            tn, fp, fn, tp = cm_a.ravel()

            # Calculate accuracy
            accuracy = accuracy_score(actual_labels, predicted_labels)

            # Calculate precision, recall, and F1 score
            precision = tp / (tp + fp)
            recall = tp / (tp + fn)
            f1 = 2 * (precision * recall) / (precision + recall)

            st.write(f"Accuracy :{accuracy*100:.2f} | Precision :{precision:.2f} | Recall:{recall:.2f} | F1-Score:{f1:.2f}")

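    # Sanity check for the unpacking above: with labels=['negative', 'positive'],
    # cm_a[0, 0] counts negatives predicted negative (tn) and cm_a[1, 1] positives
    # predicted positive (tp); e.g. tp=8, fp=2 gives precision 8 / 10 = 0.80.
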
    # =========================================================================================================================
    # ============================================= SUMMARY OF PRODUCT ========================================================
    # =========================================================================================================================
    st.markdown("---")
    st.subheader("Step 6 : Summary of product")
    choice = 10  # st.number_input("Choose number of summaries", 0, 10)

    # POSITIVE SUMMARIES
    st.header("Positive Reviews Summary")
    if df_rows <= 10:
        st.markdown(bullet_markdown(getMatchCols(prod_a, 1)))
    else:
        with st.spinner("Generating Positive Summaries ............"):
            sum_list_pos = data_summarizer(prod_a, 1, choice)
            st.markdown(bullet_markdown(sum_list_pos))

    # NEGATIVE SUMMARIES
    st.header("Negative Reviews Summary")
    if df_rows <= 10:
        st.markdown(bullet_markdown(getMatchCols(prod_a, 0)))
    else:
        with st.spinner("Generating Negative Summaries ............"):
            sum_list_neg = data_summarizer(prod_a, 0, choice)
            st.markdown(bullet_markdown(sum_list_neg))


dataset_load()
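# To try the app locally (assuming the review CSV sits next to this file):
#   streamlit run app.py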