rkf2778 committed on
Commit
0d5bda6
1 Parent(s): c2aaf35

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +482 -0
app.py ADDED
@@ -0,0 +1,482 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from transformers import PegasusForConditionalGeneration, PegasusTokenizer, AutoTokenizer,AutoModelForSequenceClassification
4
+ from scipy.special import softmax
5
+ from tqdm.notebook import tqdm
6
+ from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
7
+ from rouge_score import rouge_scorer
8
+ from rouge import Rouge
9
+ import streamlit as st
10
+ from ydata_profiling import ProfileReport
11
+ from streamlit_pandas_profiling import st_profile_report
12
+
13
+
14
+
15
+ import time
16
+ import io
17
+ import os
18
+ import pprint
19
+ from IPython.display import HTML
20
+ import traceback
21
+ import logging
22
+ import random
23
+ import pandas as pd
24
+ import numpy as np
25
+ import seaborn as sns
26
+ import matplotlib.pyplot as plt
27
+ import tensorflow as tf
28
+
29
# Page-level configuration and header. This is the first Streamlit command
# issued by the script.
st.set_page_config(
    page_title="Review Summary App",
    page_icon=None,
    layout="centered",  # alternative tried earlier: layout="wide"
    initial_sidebar_state="auto",
    menu_items=None,
)
st.title("Review Summarizer App")
st.write("This app summarises all the reviews of a product")
33
+
34
+
35
@st.cache_resource
def load_roberta_model_and_tokenizer(model_name):
    """Load the tokenizer and sequence-classification model for *model_name*.

    Decorated with ``st.cache_resource``, so the pair is cached per
    ``model_name`` by Streamlit.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    # The tokenizer is loaded first, then the model, as in the original order.
    return (
        AutoTokenizer.from_pretrained(model_name),
        AutoModelForSequenceClassification.from_pretrained(model_name),
    )
40
+
41
@st.cache_resource
def load_pegasus_model_and_tokenizer(model_name):
    """Load the Pegasus tokenizer and generation model for *model_name*.

    Decorated with ``st.cache_resource``, so the pair is cached per
    ``model_name`` by Streamlit.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    # The tokenizer is loaded first, then the model, as in the original order.
    return (
        PegasusTokenizer.from_pretrained(model_name),
        PegasusForConditionalGeneration.from_pretrained(model_name),
    )
46
+
47
+ # =========================================================
48
+ # Define a function to assign labels based on star rating
49
+ # =========================================================
50
def assign_star_label(row, threshold=3):
    """Label a review row from its star rating.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing
            ``row['star_rating']``.
        threshold: Ratings strictly greater than this value count as
            positive. Defaults to 3, the value that was previously hard-coded.

    Returns:
        ``'positive'`` when the rating exceeds *threshold*, otherwise
        ``'negative'``.
    """
    return 'positive' if row['star_rating'] > threshold else 'negative'
52
+
53
def showEda(df):
    """Render an explorative profiling report for *df* in the Streamlit page.

    The report is built first, then the header is written, then the report
    is embedded via ``st_profile_report``.
    """
    profile = ProfileReport(df, explorative=True)
    st.header('**Pandas Profiling Report**')
    st_profile_report(profile)
57
+
58
+
59
def dataset_load():
    """Show an "Importing modules" spinner for two seconds, then a success note.

    NOTE(review): indentation was lost in this copy of the file, so it is not
    possible to tell from here whether the statements that follow this
    function were originally part of its body - confirm against the real file.
    """
    importing_spinner = st.spinner("Importing modules................")
    with importing_spinner:
        time.sleep(2)
    st.success("Imported Modules")
64
+
65
+
66
+
67
+ # ===================================================================================================================
68
+ # ================================================= UTILITY FUNCTIONS ===============================================
69
+ # ===================================================================================================================
70
# ==================
# Load & Clean Data
# ==================
@st.cache_data
def data_load_clean_df():
    """Read the Amazon Mobile Electronics review CSV and return a cleaned frame.

    Cleaning steps:
      * keep only the five columns the app uses;
      * drop rows whose ``review_body`` duplicates an earlier row;
      * keep only rows whose ``star_rating`` is one of 1..5 (malformed rows
        can carry other values, e.g. dates, in that column);
      * cast ``star_rating`` to ``int64`` and derive ``star_rating_label``;
      * coerce ``review_body`` to ``str`` and cap it at 512 characters.

    Returns:
        The cleaned DataFrame with a fresh 0..n-1 index.
    """
    df = pd.read_csv('./amazon_reviews_us_Mobile_Electronics_v1_00.csv', on_bad_lines='skip')
    # df = df.loc[df['product_id'].isin(['B00J46XO9U'])]
    df = df[['customer_id', 'product_title', 'star_rating', 'review_body', 'product_id']]

    # The de-duplicated frame must be assigned back; previously the result was
    # computed and thrown away, so no duplicates were actually removed.
    df = df.drop_duplicates(subset='review_body')

    # Compare numerically so the filter works whether the column was parsed
    # as strings ('5') or as numbers (5); anything unparseable becomes NaN
    # and is dropped by the isin() mask.
    ratings = pd.to_numeric(df['star_rating'], errors='coerce')
    valid = ratings.isin([1, 2, 3, 4, 5])
    df = df[valid].copy()  # copy: the column assignments below must not target a view
    df['star_rating'] = ratings[valid].astype('int64')

    df['star_rating_label'] = df.apply(assign_star_label, axis=1)
    df['review_body'] = df['review_body'].astype(str).str[:512]
    return df.reset_index(drop=True)
86
+
87
+
88
+ # ======================
89
+ # Assign Polarity Score
90
+ # ======================
91
def polarity_scores_roberta(review):
    """Score one review with the RoBERTa sentiment classifier.

    Relies on ``roberta_tokenizer``, ``roberta_model`` and ``device`` being
    defined by the surrounding script before this is called.

    Args:
        review: Review text.

    Returns:
        ``{'roberta_negative': p0, 'roberta_positive': p1}`` where the two
        values are the softmax probabilities of logit 0 and logit 1.
    """
    # truncation guards against inputs longer than the model's position limit.
    encoded_text = roberta_tokenizer(
        review, truncation=True, max_length=512, return_tensors='pt'
    ).to(device)
    with torch.no_grad():
        output = roberta_model(**encoded_text)

    # Apply softmax exactly once, over the logits of the single input row.
    # The previous code applied it twice (first across the whole 2-D array,
    # then again on row 0), which squashed both scores towards 0.5.
    scores = softmax(output.logits[0].detach().cpu().numpy())
    return {
        'roberta_negative': float(scores[0]),
        'roberta_positive': float(scores[1]),
    }
103
+
104
+ # ==================
105
+ # Summarising Text
106
+ # ==================
107
def text_summarizer(review):
    """Summarise *review* with the Pegasus model.

    Relies on ``pegasus_tokenizer``, ``pegasus_model`` and ``device`` being
    defined by the surrounding script before this is called.

    Returns:
        ``{"summary": <first decoded sequence>}``.
    """
    tokenizer_options = dict(
        truncation=True,
        padding="longest",
        max_length=1024,
        return_tensors="pt",
    )
    batch = pegasus_tokenizer(review, **tokenizer_options).to(device)
    with torch.no_grad():
        generated_ids = pegasus_model.generate(**batch)
        # generated_ids = pegasus_model.module.generate(**batch)  # when using DataParallel
    decoded = pegasus_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return {"summary": decoded[0]}
115
+
116
+ # =================
117
+ # Rouge Score Check
118
+ # =================
119
def rouge_score_viewer(original_text, generated_summary):
    """Compute ROUGE-1/2/L of *generated_summary* against *original_text*.

    Args:
        original_text: Reference text (the full review).
        generated_summary: Hypothesis text (the model summary).

    Returns:
        ``{"Rouge-1": ..., "Rouge-2": ..., "Rouge-L": ...}`` where each value
        is the per-metric dict produced by ``Rouge.get_scores`` (the caller
        reads its ``'f'`` entry). When the scorer rejects the pair, each
        metric is a dict of zeros instead.
    """
    try:
        # get_scores takes the hypothesis first, then the reference.
        scores = Rouge().get_scores(generated_summary, original_text)[0]
    except ValueError:
        # The rouge package raises ValueError for an empty hypothesis or
        # reference (e.g. a summary that is blank or only punctuation).
        # One such row should not abort the whole analysis run.
        return {
            name: {'r': 0.0, 'p': 0.0, 'f': 0.0}
            for name in ("Rouge-1", "Rouge-2", "Rouge-L")
        }

    return {
        "Rouge-1": scores['rouge-1'],
        "Rouge-2": scores['rouge-2'],
        "Rouge-L": scores['rouge-l'],
    }
128
+
129
# ==========================================================================
# Define a function to assign labels based on the RoBERTa polarity scores
# ==========================================================================
def assign_label(row):
    """Label a row from its RoBERTa scores.

    Args:
        row: Mapping-like object exposing ``row['roberta_positive']`` and
            ``row['roberta_negative']``.

    Returns:
        ``'positive'`` when the positive score is strictly greater than the
        negative score, otherwise ``'negative'`` (ties are negative).
    """
    return 'positive' if row['roberta_positive'] > row['roberta_negative'] else 'negative'
137
+
138
+ # =======================================================
139
+ # Summarise bunch of summaries together
140
+ # =======================================================
141
@st.cache_data
def data_summarizer(df, marker, summary_count):
    """Summarise per-review summaries in chunks of ten.

    Rows are kept only when the star-rating label and the RoBERTa label both
    equal the requested sentiment. Their ``summary`` values are ranked by the
    matching RoBERTa score, then by the ROUGE columns, all descending; every
    run of ten is joined, cut to 512 characters and summarised again.

    Args:
        df: Frame with ``star_rating_label``, ``roberta_rating_label``,
            ``review_body``, ``summary``, ``roberta_positive``,
            ``roberta_negative``, ``Rouge_1``, ``Rouge_2`` and ``Rouge_L``.
        marker: ``1`` for positive reviews; any other value means negative.
        summary_count: Maximum number of chunk summaries to produce.

    Returns:
        A list of at most *summary_count* summary strings.
    """
    # Keep the numeric flag separate from the label string. The old code
    # overwrote `marker` with the label and then tested `marker == 1`, which
    # was always False, so positive summaries were ranked by roberta_negative.
    is_positive = marker == 1
    label = 'positive' if is_positive else 'negative'
    score_column = 'roberta_positive' if is_positive else 'roberta_negative'

    matching = df[(df['star_rating_label'] == label) & (df['roberta_rating_label'] == label)]
    matching = matching[~matching.duplicated(subset=["review_body", "summary"])]
    ranked = matching.sort_values(
        [score_column, 'Rouge_1', 'Rouge_2', 'Rouge_L'],
        ascending=[False, False, False, False],
    )['summary'].reset_index(drop=True)

    summaries = []
    for start in range(0, len(ranked), 10):
        if len(summaries) == summary_count:
            break
        joined_sentence = ' '.join(ranked.iloc[start:start + 10])
        summaries.append(text_summarizer(joined_sentence[:512])["summary"])
    return summaries
162
+
163
+
164
+ # ==========================================================
165
+ # Convert the array to a markdown string with bullet points
166
+ # ==========================================================
167
def bullet_markdown(array):
    """Render *array* as a Markdown bullet list, one ``- item`` line per entry.

    Whitespace inside each item (including newlines) is collapsed to single
    spaces so that a multi-line item stays inside its own bullet instead of
    breaking the list.

    Returns:
        The joined Markdown string; an empty string for an empty *array*.
    """
    return "\n".join(f"- {' '.join(str(item).split())}" for item in array)
169
+
170
+ # ==========================================================
171
+ # Get rows with same rating labels
172
+ # ==========================================================
173
+
174
def getMatchCols(df, value):
    """Return the review bodies whose star label and RoBERTa label both match.

    Args:
        df: Frame with ``star_rating_label``, ``roberta_rating_label``,
            ``roberta_positive``, ``roberta_negative``, ``Rouge_1``,
            ``Rouge_2``, ``Rouge_L`` and ``review_body``.
        value: ``1`` selects positive reviews; any other value selects
            negative reviews.

    Returns:
        An array of ``review_body`` values ranked by the RoBERTa score of the
        requested sentiment and then by the ROUGE columns (all descending),
        or a one-element list with a "No ... reviews available" message when
        nothing matches.
    """
    is_positive = value == 1
    marker = "positive" if is_positive else "negative"
    # Rank by the score of the requested sentiment; the negative list used to
    # be ranked by roberta_positive, i.e. least-negative reviews first.
    score_column = 'roberta_positive' if is_positive else 'roberta_negative'

    df_new = df[(df['star_rating_label'] == marker) & (df['roberta_rating_label'] == marker)]
    if df_new.empty:
        return [f"No {marker} reviews available"]
    return df_new.sort_values(
        [score_column, 'Rouge_1', 'Rouge_2', 'Rouge_L'],
        ascending=[False, False, False, False],
    )['review_body'].values
181
+
182
+ # =========================================================================================================================
183
+ # ================================================= LOADING OF THE DATA ===================================================
184
+ # =========================================================================================================================
185
+
186
+ ## Load & Clean Data
187
# =========================================================================
# LOADING OF THE DATA
# =========================================================================
with st.spinner("Loading the data ............"):
    df = data_load_clean_df()

# Untouched copy used by the whole-dataset EDA report; `df` itself is
# narrowed to a single product once the user confirms a selection.
loaded_df = df.copy()

# TODO : Limit for demonstration only. Less rows to be analysed later
# df = df.groupby('product_id').filter(lambda x: (len(x) <= 5)).reset_index(drop=True)
st.header("The Dataframe loaded is shown below :")
with st.spinner("Loading the data ............"):
    st.dataframe(df)

# =========================================================================
# LIST OF ALL PRODUCTS
# =========================================================================
with st.spinner("Loading list of products ............"):
    # Two-second pause kept from the original flow; the unused `prod_ids`
    # lookup that used to sit next to it has been removed.
    time.sleep(2)

# =========================================================================
# CHOOSE A PRODUCT
# =========================================================================
st.markdown("---")
st.subheader("Step 0 : Choose a product")

# Number of review rows per product, and the largest such count.
grouped_df = df.groupby("product_id").size().reset_index(name="count")
max_rows = grouped_df["count"].max()

# The slider offers only review counts that actually occur, defaulting to
# the largest one.
slider_value = st.select_slider(
    "Select the number of rows",
    options=sorted(grouped_df['count'].unique()),
    value=max_rows,
)

# Products that have exactly the chosen number of rows.
filtered_df = grouped_df[grouped_df["count"] == slider_value]["product_id"].tolist()
st.write(f"There are {len(filtered_df)} products with {slider_value} rows")
selected_product_id = st.selectbox("Select the product_id", filtered_df)

preview_df = df.loc[df['product_id'] == selected_product_id].reset_index(drop=True)

# True when a product with at least one row is selected; controls both the
# details panel and whether the "Product EDA" sidebar option is offered.
ProductDataframeCheck = not preview_df.empty

if ProductDataframeCheck:
    # Positional access: the first row of the preview, whatever its label.
    prod_name = preview_df['product_title'].iloc[0]

    st.markdown("---")
    st.subheader("Step 1 : Product Details :")
    st.write(f'Product Name : {prod_name}')
    st.write(f'Product ID : {selected_product_id} ')
    st.write(f'Total Rows : {preview_df.shape[0]}')

# Sidebar: whole-dataset EDA is always offered, per-product EDA only when a
# product preview exists.
eda_choices = ["None", "Show EDA"]
if ProductDataframeCheck:
    eda_choices.append("Product EDA")
option = st.sidebar.radio("Select an option", eda_choices)

if option == "Show EDA":
    showEda(loaded_df)
elif option == "Product EDA":
    showEda(preview_df)
264
+
265
+
266
+
267
+
268
# NOTE(review): indentation was lost in this copy of the file; everything
# from Step 2 to Step 6 is assumed to run only after the button is pressed,
# since it all depends on the narrowed `df` - confirm against the real file.
if st.button('Confirm Product'):
    # Narrow the working frame to the selected product.
    df = df.loc[df['product_id'] == selected_product_id].reset_index(drop=True)
    st.markdown("---")
    st.subheader("Step 2 : Dataframe with chosen product :")
    st.dataframe(df)

    df_rows = df.shape[0]

    # =====================================================================
    # PRE-TRAINED MODELS
    # =====================================================================
    st.markdown("---")
    st.subheader("Step 3 : Initialising the models & running operation")
    with st.spinner("Initializing RoBERTa Model ............"):
        device = "cuda" if torch.cuda.is_available() else "cpu"
        st.write(f"Selected device for processing is (CPU/GPU) : {device.upper()}")

    # RoBERTa sentiment classifier (loader is cached by Streamlit).
    with st.spinner("Initializing RoBERTa Model ............"):
        roberta_model_name = "siebert/sentiment-roberta-large-english"
        roberta_tokenizer, roberta_model = load_roberta_model_and_tokenizer(roberta_model_name)
        roberta_model.to(device)

    # Pegasus summariser (loader is cached by Streamlit).
    with st.spinner("Initializing Pegasus Model ............"):
        pegasus_model_name = "google/pegasus-large"
        pegasus_tokenizer, pegasus_model = load_pegasus_model_and_tokenizer(pegasus_model_name)
        pegasus_model.to(device)

    st.success("Models successfully loaded")

    # =====================================================================
    # RUN MODELS ON DATA
    # =====================================================================
    # One record per review row, in row order. The previous implementation
    # keyed its result dicts by customer_id, so a customer with several
    # reviews of the product silently overwrote their earlier rows.
    records = []
    total_rows = len(df)

    with st.spinner("Operation in progress ............"):
        progress_bar_analysis = st.progress(0, text="Please wait......... 0%")

        # The tqdm.notebook wrapper was dropped: progress is already reported
        # through st.progress and this script does not run in a notebook.
        for position, (_, row) in enumerate(df.iterrows()):
            progress_percent = (position / total_rows) * 100
            progress_bar_analysis.progress(
                min(int(progress_percent + 1), 100),  # st.progress accepts ints 0..100
                text=f"Please wait......... {progress_percent:.2f}%",
            )

            text = row['review_body']

            # Sentiment, summary, and ROUGE of the summary against the review.
            roberta_result = polarity_scores_roberta(text)
            summary_result = text_summarizer(text)
            rouge_scores = rouge_score_viewer(text, summary_result['summary'])

            records.append({
                'customer_id': row['customer_id'],
                **roberta_result,
                'summary': summary_result['summary'],
                'Rouge_1': rouge_scores['Rouge-1']['f'],
                'Rouge_2': rouge_scores['Rouge-2']['f'],
                'Rouge_L': rouge_scores['Rouge-L']['f'],
            })

        progress_bar_analysis.progress(100, text="Completed......... 100%")
    st.success("Operation Completed")

    with st.spinner("Merging in progress ............"):
        score_columns = [
            'customer_id', 'roberta_negative', 'roberta_positive',
            'summary', 'Rouge_1', 'Rouge_2', 'Rouge_L',
        ]
        results_df = pd.DataFrame(records, columns=score_columns)
        # Both frames carry the same 0..n-1 index (df was reset above and the
        # records were appended in row order), so an index join lines each
        # score row up with its source review.
        results_df = results_df.join(df.drop(columns='customer_id'))
        results_df['roberta_rating_label'] = results_df.apply(assign_label, axis=1)

    st.markdown("---")
    st.subheader("Step 4 : Dataframe after operation")

    with st.spinner("Matching Columns in progress ............"):
        # Keep rows where the star-rating label agrees with the RoBERTa label.
        # The reset index is now actually kept (its result used to be dropped).
        labels_agree = results_df['star_rating_label'] == results_df['roberta_rating_label']
        prod_a = results_df[labels_agree].reset_index(drop=True)

    st.dataframe(prod_a)

    # =====================================================================
    # CHECKING THE METRICS
    # =====================================================================
    # Run only if the product has more than 4 rows.
    if df_rows > 4:
        with st.spinner("Creating confusion matrix ............"):
            st.markdown("---")
            st.subheader("Step 5. - Confusion Matrix")
            actual_labels = results_df['star_rating_label']
            predicted_labels = results_df['roberta_rating_label']

            # Fix the label order so the matrix is always 2x2, even when only
            # one class occurs; otherwise the four-way unpack below fails.
            cm_a = confusion_matrix(
                actual_labels, predicted_labels, labels=['negative', 'positive']
            )

            # Draw on an explicit figure and hand it to Streamlit instead of
            # relying on the global pyplot figure.
            fig, ax = plt.subplots()
            sns.heatmap(cm_a, annot=True, fmt='d', ax=ax)
            st.pyplot(fig)
            plt.close(fig)

            # 'positive' is the positive class: rows are actual, columns predicted.
            tn, fp, fn, tp = cm_a.ravel()

            accuracy = accuracy_score(actual_labels, predicted_labels)

            # Guard every ratio against an empty denominator.
            precision = tp / (tp + fp) if (tp + fp) else 0.0
            recall = tp / (tp + fn) if (tp + fn) else 0.0
            f1 = (
                2 * (precision * recall) / (precision + recall)
                if (precision + recall)
                else 0.0
            )

            st.write(f"Accuracy :{accuracy*100:.2f} | Precision :{precision:.2f} | Recall:{recall:.2f} | F1-Score:{f1:.2f}")

    # =====================================================================
    # SUMMARY OF PRODUCT
    # =====================================================================
    st.markdown("---")
    st.subheader("Step 6 : Summary of product")
    choice = 10  # st.number_input("Choose number of summaries", 0, 10)

    # With ten rows or fewer the matching reviews are listed verbatim;
    # otherwise they are condensed by data_summarizer.
    st.header("Positive Reviews Summary")
    if df_rows <= 10:
        st.markdown(bullet_markdown(getMatchCols(prod_a, 1)))
    else:
        with st.spinner("Generating Positive Summaries ............"):
            sum_list_pos = data_summarizer(prod_a, 1, choice)
            st.markdown(bullet_markdown(sum_list_pos))

    st.header("Negative Reviews Summary")
    if df_rows <= 10:
        st.markdown(bullet_markdown(getMatchCols(prod_a, 0)))
    else:
        with st.spinner("Generating Negative Summaries ............"):
            sum_list_neg = data_summarizer(prod_a, 0, choice)
            st.markdown(bullet_markdown(sum_list_neg))
480
+
481
+
482
# Script entry: run dataset_load(). The stray leading "+" (a diff marker)
# would otherwise apply unary plus to the call's result.
dataset_load()