That1BrainCell committed on
Commit
c3c7d51
1 Parent(s): 05fdf5e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +194 -95
app.py CHANGED
@@ -6,27 +6,69 @@ import numpy as np
6
  from io import StringIO
7
  import sys
8
  import time
 
9
  from pymongo import MongoClient
 
 
 
 
 
 
 
 
10
 
11
  # File Imports
12
- from embedding import get_embeddings # Ensure this file/module is available
13
  from preprocess import filtering # Ensure this file/module is available
14
  from search import *
15
 
16
 
17
- # Mongo Connections
18
- srv_connection_uri = "mongodb+srv://adityasm1410:uOh6i11AYFeKp4wd@patseer.5xilhld.mongodb.net/?retryWrites=true&w=majority&appName=Patseer"
 
19
 
20
- client = MongoClient(srv_connection_uri)
21
- db = client['embeddings']
22
- collection = db['data']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
  # Cosine Similarity Function
25
  def cosine_similarity(vec1, vec2):
26
  vec1 = np.array(vec1)
27
  vec2 = np.array(vec2)
28
 
29
- dot_product = np.dot(vec1, vec2)
30
  magnitude_vec1 = np.linalg.norm(vec1)
31
  magnitude_vec2 = np.linalg.norm(vec2)
32
 
@@ -36,6 +78,29 @@ def cosine_similarity(vec1, vec2):
36
  cosine_sim = dot_product / (magnitude_vec1 * magnitude_vec2)
37
  return cosine_sim
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  # Logger class to capture output
40
  class StreamCapture:
41
  def __init__(self):
@@ -52,12 +117,11 @@ class StreamCapture:
52
  # Main Function
53
  def score(main_product, main_url, product_count, link_count, search, logger, log_area):
54
 
55
- existing_products_urls = set(collection.distinct('url'))
56
 
57
  data = {}
58
  similar_products = extract_similar_products(main_product)[:product_count]
59
 
60
-
61
  # Normal Filtering + Embedding -----------------------------------------------
62
  if search == 'All':
63
 
@@ -107,94 +171,69 @@ def score(main_product, main_url, product_count, link_count, search, logger, log
107
 
108
 
109
  # Filtered Link -----------------------------------------
110
- logger.write("\n\nFiltered Links ------------------>\n")
111
- logger.write(str(data) + "\n")
112
  log_area.text(logger.getvalue())
113
 
114
 
115
-
116
  # Main product Embeddings ---------------------------------
117
- logger.write("\n\nCreating Main product Embeddings ---------->\n")
118
-
119
- # Check main product in MongoDB
120
- if main_url in existing_products_urls:
121
- saved_data = collection.find_one({'url': main_url})
122
 
123
- if tag_option not in saved_data:
124
- main_result , main_embedding = get_embeddings(main_url,tag_option)
125
- else:
126
- main_embedding = saved_data[tag_option]
127
- else:
128
- main_result , main_embedding = get_embeddings(main_url,tag_option)
129
 
130
- log_area.text(logger.getvalue())
131
- print("main",main_embedding)
132
 
133
- update_doc = {
134
- '$set': {
135
- 'product_name': main_product,
136
- 'url': main_url,
137
- tag_option: main_embedding
138
- }
139
- }
140
 
141
- collection.update_one(
142
- {'url': main_url},
143
- update_doc,
144
- upsert=True
145
- )
146
 
 
 
 
147
 
148
- #Similar Products Check
149
- cosine_sim_scores = []
150
 
151
- logger.write("\n\nCreating Similar product Embeddings ---------->\n")
152
- log_area.text(logger.getvalue())
153
 
 
 
 
 
 
154
 
155
- for product in data:
 
 
156
 
157
- if len(data[product])==0:
158
- logger.write("\n\nNo Product links Found Increase No of Links or Change Search Source\n")
159
- log_area.text(logger.getvalue())
160
-
161
- cosine_sim_scores.append((product,'No Product links Found Increase Number of Links or Change Search Source',None,None))
162
-
163
- else:
164
- for link,present in data[product][:link_count]:
165
-
166
- saved_data = collection.find_one({'url': link})
167
 
168
- if present and (tag_option in saved_data):
169
- similar_embedding = saved_data[tag_option]
170
- else:
171
- similar_result, similar_embedding = get_embeddings(link,tag_option)
172
 
173
- log_area.text(logger.getvalue())
174
 
175
- print(similar_embedding)
176
- for i in range(len(main_embedding)):
177
- score = cosine_similarity(main_embedding[i], similar_embedding[i])
178
- cosine_sim_scores.append((product, link, i, score))
179
- log_area.text(logger.getvalue())
180
-
181
- update_doc = {
182
- '$set': {
183
- 'product_name': product,
184
- 'url': link,
185
- tag_option: similar_embedding
186
- }
187
- }
188
-
189
- collection.update_one(
190
- {'url': link},
191
- update_doc,
192
- upsert=True
193
  )
 
 
 
 
 
194
 
195
  logger.write("--------------- DONE -----------------\n")
196
  log_area.text(logger.getvalue())
197
- return cosine_sim_scores
 
 
 
 
 
198
 
199
  # Streamlit Interface
200
  st.title("Check Infringement")
@@ -205,35 +244,95 @@ main_product = st.text_input('Enter Main Product Name', 'Philips led 7w bulb')
205
  main_url = st.text_input('Enter Main Product Manual URL', 'https://www.assets.signify.com/is/content/PhilipsConsumer/PDFDownloads/Colombia/technical-sheets/ODLI20180227_001-UPD-es_CO-Ficha_Tecnica_LED_MR16_Master_7W_Dim_12V_CRI90.pdf')
206
  search_method = st.selectbox('Choose Search Engine', ['All','duckduckgo', 'google', 'archive', 'github', 'wikipedia'])
207
 
208
- col1, col2 = st.columns(2)
209
  with col1:
210
  product_count = st.number_input("Number of Simliar Products",min_value=1, step=1, format="%i")
211
  with col2:
212
  link_count = st.number_input("Number of Links per product",min_value=1, step=1, format="%i")
 
 
213
 
214
-
215
- tag_option = st.selectbox('Choose Similarity Method', ["Complete Document Similarity","Field Wise Document Similarity"])
216
 
217
 
218
  if st.button('Check for Infringement'):
219
- log_output = st.empty() # Placeholder for log output
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
 
221
- with st.spinner('Processing...'):
222
- with StreamCapture() as logger:
223
- cosine_sim_scores = score(main_product, main_url,product_count, link_count, search_method, logger, log_output)
224
 
225
- st.success('Processing complete!')
 
226
 
227
- st.subheader("Cosine Similarity Scores")
228
 
229
- # = score(main_product, main_url, search, logger, log_output)
230
- if tag_option == 'Complete Document Similarity':
231
- tags = ['Details']
232
- else:
233
- tags = ['Introduction', 'Specifications', 'Product Overview', 'Safety Information', 'Installation Instructions', 'Setup and Configuration', 'Operation Instructions', 'Maintenance and Care', 'Troubleshooting', 'Warranty Information', 'Legal Information']
234
 
235
- for product, link, index, value in cosine_sim_scores:
236
- if not index:
237
- st.write(f"Product: {product}, Link: {link}")
238
- if value!=None:
239
- st.write(f"{tags[index]:<20} - Similarity: {value:.2f}")
 
6
  from io import StringIO
7
  import sys
8
  import time
9
+ import pandas as pd
10
  from pymongo import MongoClient
11
+ import plotly.express as px
12
+ from pinecone import Pinecone, ServerlessSpec
13
+ import chromadb
14
+ import requests
15
+ from io import BytesIO
16
+ from PyPDF2 import PdfReader
17
+ import hashlib
18
+ import os
19
 
20
  # File Imports
21
+ from embedding import get_embeddings,get_image_embeddings,get_embed_chroma,imporve_text # Ensure this file/module is available
22
  from preprocess import filtering # Ensure this file/module is available
23
  from search import *
24
 
25
 
26
+ # Chroma Connections
27
+ client = chromadb.PersistentClient(path = "embeddings")
28
+ collection = client.get_or_create_collection(name="data",metadata={"hnsw:space": "l2"})
29
 
30
+
31
def generate_hash(content):
    """Return the SHA-256 hex digest of *content*.

    Args:
        content: Text to hash. A ``str`` is encoded as UTF-8 first;
            bytes-like input is hashed as-is.

    Returns:
        The 64-character lowercase hexadecimal digest.
    """
    # Only str needs encoding; bytes/bytearray can be fed to sha256 directly.
    if isinstance(content, str):
        content = content.encode('utf-8')
    return hashlib.sha256(content).hexdigest()
33
+
34
def get_key(link):
    """Return a SHA-256 key identifying the PDF served at *link*.

    The key is the hash of the text extracted from the first page followed
    by the text extracted from the last page of the PDF (for a one-page PDF
    the same page contributes twice, which keeps keys stable with the
    values already stored).

    If the download fails, the response is not a readable PDF, or no text
    can be extracted, the URL itself is hashed instead. Without this
    fallback every failing link would hash the empty string and share one
    key, so distinct documents would be treated as the same one.

    Args:
        link: URL of the PDF document.

    Returns:
        A 64-character hexadecimal SHA-256 digest.
    """
    text = ''
    try:
        # A timeout prevents a stalled server from hanging the whole run.
        response = requests.get(link, timeout=30)
        response.raise_for_status()  # Raise an error for bad status codes

        # Parse the PDF straight from memory.
        reader = PdfReader(BytesIO(response.content))

        first_page_text = reader.pages[0].extract_text()
        if first_page_text:
            text += first_page_text

        last_page_text = reader.pages[-1].extract_text()
        if last_page_text:
            text += last_page_text

    except requests.exceptions.HTTPError as e:
        print(f'HTTP error occurred: {e}')
    except Exception as e:
        # Best effort: network errors, timeouts and malformed/empty PDFs
        # all end up here and fall through to the URL-based key below.
        print(f'An error occurred: {e}')

    if not text:
        # Nothing extracted: key on the URL so failures stay distinguishable.
        text = link

    return generate_hash(text)
65
 
66
  # Cosine Similarity Function
67
  def cosine_similarity(vec1, vec2):
68
  vec1 = np.array(vec1)
69
  vec2 = np.array(vec2)
70
 
71
+ dot_product = np.dot(vec1, vec2.T)
72
  magnitude_vec1 = np.linalg.norm(vec1)
73
  magnitude_vec2 = np.linalg.norm(vec2)
74
 
 
78
  cosine_sim = dot_product / (magnitude_vec1 * magnitude_vec2)
79
  return cosine_sim
80
 
81
def update_chroma(product_name, url, key, text, vector, log_area):
    """Upsert one document's chunks and embeddings into the Chroma collection.

    Each chunk is stored under the id ``key + str(index)`` with metadata
    holding the document key, product name, source URL and chunk text.

    NOTE(review): this function reads the module-level ``collection`` and
    ``logger`` names; ``logger`` is only bound once the Streamlit script has
    entered ``with StreamCapture() as logger`` — confirm before calling it
    from anywhere else.

    Args:
        product_name: Name of the product the document belongs to.
        url: Source URL of the document.
        key: Document key (see ``get_key``) used as the id prefix.
        text: Sequence of text chunks.
        vector: Sequence of embeddings, presumably one per entry of *text*
            (TODO confirm against ``get_embed_chroma``).
        log_area: Streamlit placeholder that displays the captured log.
    """
    if len(text) == 0:
        # Chroma is expected to reject an upsert with an empty id list, so
        # skip documents that yielded no chunks instead of aborting the run.
        logger.write(f"\nNo text chunks to store - {url}\n\n")
        log_area.text(logger.getvalue())
        return

    id_list = [key + str(i) for i in range(len(text))]

    metadata_list = [
        {
            'key': key,
            'product_name': product_name,
            'url': url,
            'text': item,
        }
        for item in text
    ]

    collection.upsert(
        ids=id_list,
        embeddings=vector,
        metadatas=metadata_list,
    )

    logger.write(f"\n\u2713 Updated DB - {url}\n\n")
    log_area.text(logger.getvalue())
102
+
103
+
104
  # Logger class to capture output
105
  class StreamCapture:
106
  def __init__(self):
 
117
  # Main Function
118
  def score(main_product, main_url, product_count, link_count, search, logger, log_area):
119
 
 
120
 
121
  data = {}
122
  similar_products = extract_similar_products(main_product)[:product_count]
123
 
124
+ print("--> Fetching Manual Links")
125
  # Normal Filtering + Embedding -----------------------------------------------
126
  if search == 'All':
127
 
 
171
 
172
 
173
  # Filtered Link -----------------------------------------
174
+ logger.write("\n\n\u2713 Filtered Links\n")
 
175
  log_area.text(logger.getvalue())
176
 
177
 
 
178
  # Main product Embeddings ---------------------------------
179
+ logger.write("\n\n--> Creating Main product Embeddings\n")
 
 
 
 
180
 
181
+ main_key = get_key(main_url)
182
+ main_text,main_vector = get_embed_chroma(main_url)
 
 
 
 
183
 
184
+ update_chroma(main_product,main_url,main_key,main_text,main_vector,log_area)
 
185
 
186
+ # log_area.text(logger.getvalue())
187
+ print("\n\n\u2713 Main Product embeddings Created")
 
 
 
 
 
188
 
 
 
 
 
 
189
 
190
+ logger.write("\n\n--> Creating Similar product Embeddings\n")
191
+ log_area.text(logger.getvalue())
192
+ test_embedding = [0]*768
193
 
194
+ for product in data:
195
+ for link in data[product]:
196
 
197
+ url, _ = link
198
+ similar_key = get_key(url)
199
 
200
+ res = collection.query(
201
+ query_embeddings = [test_embedding],
202
+ n_results=1,
203
+ where={"key": similar_key},
204
+ )
205
 
206
+ if not res['distances'][0]:
207
+ similar_text,similar_vector = get_embed_chroma(url)
208
+ update_chroma(product,url,similar_key,similar_text,similar_vector,log_area)
209
 
 
 
 
 
 
 
 
 
 
 
210
 
211
+ logger.write("\n\n\u2713 Similar Product embeddings Created\n")
212
+ log_area.text(logger.getvalue())
 
 
213
 
214
+ top_similar = []
215
 
216
+ for idx,chunk in enumerate(main_vector):
217
+ res = collection.query(
218
+ query_embeddings = [chunk],
219
+ n_results=1,
220
+ where={"key": {'$ne':main_key}},
221
+ include=['metadatas','embeddings','distances']
 
 
 
 
 
 
 
 
 
 
 
 
222
  )
223
+
224
+ top_similar.append((main_text[idx],chunk,res,res['distances'][0]))
225
+
226
+ most_similar_items = sorted(top_similar,key = lambda x:x[3])[:top_similar_count]
227
+
228
 
229
  logger.write("--------------- DONE -----------------\n")
230
  log_area.text(logger.getvalue())
231
+
232
+ return most_similar_items
233
+
234
+
235
+
236
+
237
 
238
  # Streamlit Interface
239
  st.title("Check Infringement")
 
244
  main_url = st.text_input('Enter Main Product Manual URL', 'https://www.assets.signify.com/is/content/PhilipsConsumer/PDFDownloads/Colombia/technical-sheets/ODLI20180227_001-UPD-es_CO-Ficha_Tecnica_LED_MR16_Master_7W_Dim_12V_CRI90.pdf')
245
  search_method = st.selectbox('Choose Search Engine', ['All','duckduckgo', 'google', 'archive', 'github', 'wikipedia'])
246
 
247
+ col1, col2, col3= st.columns(3)
248
  with col1:
249
  product_count = st.number_input("Number of Simliar Products",min_value=1, step=1, format="%i")
250
  with col2:
251
  link_count = st.number_input("Number of Links per product",min_value=1, step=1, format="%i")
252
+ with col3:
253
+ need_image = st.selectbox("Process Images", ['True','False'])
254
 
255
+ top_similar_count = st.number_input("Top Similarities to be displayed",value=3,min_value=1, step=1, format="%i")
256
+ tag_option = "Complete Document Similarity"
257
 
258
 
259
  if st.button('Check for Infringement'):
260
+ global log_output # Placeholder for log output
261
+
262
+ tab1, tab2 = st.tabs(["Output", "Console"])
263
+
264
+ with tab2:
265
+ log_output = st.empty()
266
+
267
+ with tab1:
268
+ with st.spinner('Processing...'):
269
+ with StreamCapture() as logger:
270
+ top_similar_values = score(main_product, main_url, product_count, link_count, search_method, logger, log_output)
271
+
272
+ st.success('Processing complete!')
273
+
274
+ st.subheader("Cosine Similarity Scores")
275
+
276
+ for main_text, main_vector, response, _ in top_similar_values:
277
+ product_name = response['metadatas'][0][0]['product_name']
278
+ link = response['metadatas'][0][0]['url']
279
+ similar_text = response['metadatas'][0][0]['text']
280
+
281
+ cosine_score = cosine_similarity([main_vector], response['embeddings'][0])[0][0]
282
+
283
+ # Display the product information
284
+ with st.container():
285
+ st.markdown(f"### [Product: {product_name}]({link})")
286
+ st.markdown(f"#### Cosine Score: {cosine_score:.4f}")
287
+ col1, col2 = st.columns(2)
288
+ with col1:
289
+ st.markdown(f"**Main Text:** {imporve_text(main_text)}")
290
+ with col2:
291
+ st.markdown(f"**Similar Text:** {imporve_text(similar_text)}")
292
+
293
+ st.markdown("---")
294
+
295
+ if need_image == 'True':
296
+ with st.spinner('Processing Images...'):
297
+ emb_main = get_image_embeddings(main_product)
298
+ similar_prod = extract_similar_products(main_product)[0]
299
+ emb_similar = get_image_embeddings(similar_prod)
300
+
301
+ similarity_matrix = np.zeros((5, 5))
302
+ for i in range(5):
303
+ for j in range(5):
304
+ similarity_matrix[i][j] = cosine_similarity([emb_main[i]], [emb_similar[j]])[0][0]
305
+
306
+ st.subheader("Image Similarity")
307
+ # Create an interactive heatmap
308
+ fig = px.imshow(similarity_matrix,
309
+ labels=dict(x=f"{similar_prod} Images", y=f"{main_product} Images", color="Similarity"),
310
+ x=[f"Image {i+1}" for i in range(5)],
311
+ y=[f"Image {i+1}" for i in range(5)],
312
+ color_continuous_scale="Viridis")
313
+
314
+ # Add title to the heatmap
315
+ fig.update_layout(title="Image Similarity Heatmap")
316
+
317
+ # Display the interactive heatmap
318
+ st.plotly_chart(fig)
319
+
320
+
321
+
322
+
323
+ # main_product = 'Philips led 7w bulb'
324
+ # main_url = 'https://www.assets.signify.com/is/content/PhilipsConsumer/PDFDownloads/Colombia/technical-sheets/ODLI20180227_001-UPD-es_CO-Ficha_Tecnica_LED_MR16_Master_7W_Dim_12V_CRI90.pdf'
325
+ # search_method = 'duckduckgo'
326
+
327
+ # product_count = 1
328
+ # link_count = 1
329
+ # need_image = False
330
+
331
 
332
+ # tag_option = "Field Wise Document Similarity"
 
 
333
 
334
+ # logger = StreamCapture()
335
+ # score(main_product, main_url,product_count, link_count, search_method, logger, st.empty())
336
 
 
337
 
 
 
 
 
 
338