Shivangsinha committed
Commit 31ef0bb • Parent(s): 041f935
initial commit
Files changed:
- EngaigeQuery.py +61 -0
- Engaigemodelling.py +162 -0
- Procfile +1 -0
- app.py +246 -0
- metadata1.json +0 -0
- requirements.txt +9 -0
- static/css/styles.css +209 -0
- static/js/scripts.js +0 -0
- templates/index.html +40 -0
- uploads/employee_handbook_print_1.pdf +0 -0
- vercel.json +9 -0
EngaigeQuery.py
ADDED
@@ -0,0 +1,61 @@
from sentence_transformers import SentenceTransformer
import faiss
import json
import numpy as np

# Initialize the sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

index_path = 'vector_indexNLP.faiss'
metadata_path = 'metadataNLP.json'

# Load FAISS index and metadata
index = faiss.read_index(index_path)
with open(metadata_path, 'r') as f:
    metadata = json.load(f)

def convert_distance_to_similarity(distance):
    # Assuming the distances are non-negative, we can use a simple conversion:
    return 1 / (1 + distance) * 100

def query_index(query, model, index, metadata, top_k=5):
    query_embedding = model.encode(query).reshape(1, -1).astype('float32')
    D, I = index.search(query_embedding, top_k)

    results = []
    for i in range(top_k):
        doc_metadata = metadata[I[0, i]]
        similarity_score = convert_distance_to_similarity(D[0, i])
        result = {
            "filename": doc_metadata["filename"],
            "page_num": doc_metadata["page_num"],
            "standardized_text": doc_metadata["standardized_text"],
            "question_text": doc_metadata["question_text"],
            "answerable_text": doc_metadata["answerable_text"],
            "score": similarity_score
        }
        results.append(result)

    return results

query = "what is Rule-Based Machine Translation?"
results = query_index(query, model, index, metadata)


def create_answer_to_show(query, results):
    answer = f"Based on your query '{query}', the following relevant information was found:\n\n"
    for result in results:
        answer += "\n------------------------------------------------------------------------------------------------------------------\n"
        answer += f"Filename: {result['filename']}\n"
        answer += f"Page number: {result['page_num']}\n"
        answer += f"Related keywords: {result['question_text'][:100]}...\n"
        if result['answerable_text'] != "":
            answer += f"Answer: {result['answerable_text'][:500]}\n"
        answer += f"Relevancy Score: {result['score']}\n"
    answer += "\nFor more detailed information, please refer to the respective original texts.\n\n\n"
    return answer

answer = create_answer_to_show(query, results)

print(answer)
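A quick sanity check on the distance-to-score mapping used above; this is a minimal sketch with made-up distances (not real FAISS output), showing that smaller L2 distances map to scores closer to 100:

# Minimal sketch of the 1 / (1 + distance) * 100 mapping used in convert_distance_to_similarity.
# The distances below are illustrative values, not output from the real index.
def distance_to_score(distance):
    return 1 / (1 + distance) * 100

for d in [0.0, 0.5, 1.0, 4.0]:
    print(f"distance={d:.1f} -> score={distance_to_score(d):.1f}")
# distance=0.0 -> score=100.0
# distance=0.5 -> score=66.7
# distance=1.0 -> score=50.0
# distance=4.0 -> score=20.0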
Engaigemodelling.py
ADDED
@@ -0,0 +1,162 @@
import os
import fitz  # PyMuPDF
import tensorflow as tf
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import json
import re

# This folder should contain all the PDF files we need to work on. The path below is just an example.
pdf_folder = '/Users/shivangsinha/Downloads/personalProject'
pdf_text_data = {}
embeddings = []
metadata = []

# Initialize the sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')
# model = SentenceTransformer('paraphrase-MiniLM-L6-v2')  # Also tried this model, but the current one seems to work better.

# Convert a tensor to a string so it can be stored in JSON format.
def tensor_to_string(tensor):
    return tensor.numpy().decode("utf-8")  # Assuming utf-8 encoding

# Extract text page by page so results are more relevant for search.
def extract_text_from_pdf_with_page_numbers(pdf_path):
    doc = fitz.open(pdf_path)
    text_pages = []

    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text = page.get_text()
        text_pages.append((page_num + 1, text))  # Page numbers are 1-based in fitz

    return text_pages

# Make sure the input data does not come from the table of contents, and preprocess away text that is irrelevant for the search.
def custom_standardization(input_data):

    # If the text matches the index pattern it is likely part of the table of contents, so simply ignore it.
    index_pattern = re.compile(r'\.{3,}')
    if bool(index_pattern.search(input_data.numpy().decode('utf-8'))):
        return ""

    # Remove URLs
    stripped_urls = tf.strings.regex_replace(input_data, r"https?://\S+|www\.\S+", "")

    # Remove email addresses
    stripped_emails = tf.strings.regex_replace(stripped_urls, r"\S+@\S+", "")

    # Remove text in angular brackets (usually HTML tags)
    stripped_brackets = tf.strings.regex_replace(stripped_emails, r"<.*?>", "")

    # Remove any square brackets and leave the text within square brackets
    stripped_square_brackets = tf.strings.regex_replace(stripped_brackets, r"\[|\]", "")

    # Remove alphanumeric characters with digits
    stripped_digits = tf.strings.regex_replace(stripped_square_brackets, r"\w*\d\w*", "")

    # Remove non-alphabet characters
    stripped_non_alpha = tf.strings.regex_replace(stripped_digits, r"[^a-zA-Z\s]", "")

    # Replace multiple whitespaces with a single whitespace
    standardized_text = tf.strings.regex_replace(stripped_non_alpha, r"\s+", " ")

    return standardized_text.numpy().decode('utf-8')


# For the time being I am using the question-and-answer pattern: the text is split into paragraphs that end with a question mark.
def split_into_paragraphs(text):
    pattern = r'(?<=\n)(?=\d+\.)'

    # Split text using the pattern
    paragraphs = re.split(pattern, text)

    # Remove leading/trailing whitespace from each paragraph and filter out empty paragraphs
    paragraphs = [paragraph.strip() for paragraph in paragraphs if paragraph.strip()]

    return paragraphs


# This part stores the vector of a paragraph in the required format.
def text_to_vectors(paragraphs):
    vectors = model.encode(paragraphs)
    return vectors

# This split is used to answer the query or simply show the relevant text from the book.
def split_into_qa(text):
    # Pattern used to detect index-like (table of contents) entries
    index_pattern = re.compile(r'\.{3,}')
    # Split the text at each question mark followed by a newline or space
    match = re.search(r'(.*\?.*?)\n', text, re.DOTALL)

    # If a match is found, split the text accordingly
    if match:
        question = match.group(1).strip()  # The part before the last question mark
        answer = text[match.end():].strip()  # The part after the last question mark

        # Filter out index-like entries in both question and answer
        if index_pattern.search(question):
            question = ""  # Ignore this as it looks like an index entry
        if index_pattern.search(answer):
            answer = ""  # Ignore this as it looks like an index entry
    else:
        question = text.strip()  # No question mark found, consider the entire text as the question
        answer = ""  # No answer part

    return question, answer

# Store vectors for later use while querying.
def store_vectors(paragraphs, vectors, metadata, filename, page_num):
    for i, (paragraph, vector) in enumerate(zip(paragraphs, vectors)):
        original_text = paragraph
        question, answer = split_into_qa(original_text)
        original_text = paragraph[:500]  # Store the first 500 characters of the original text
        standardized_text = custom_standardization(tf.constant(paragraph))
        vector = model.encode(standardized_text).tolist()  # Recompute vector for standardized text
        metadata.append({
            "index": f'paragraph-{i}',
            "filename": filename,
            "page_num": page_num,
            "standardized_text": standardized_text,
            "question_text": question,
            "answerable_text": answer
        })
        embeddings.append(vector)

for filename in os.listdir(pdf_folder):
    if filename.endswith('.pdf'):
        pdf_path = os.path.join(pdf_folder, filename)
        text_pages = extract_text_from_pdf_with_page_numbers(pdf_path)
        for page_num, text in text_pages:
            paragraphs = split_into_paragraphs(text)
            vectors = text_to_vectors(paragraphs)
            store_vectors(paragraphs, vectors, metadata, filename, page_num)
        pdf_text_data[filename] = text_pages

# Save FAISS index and metadata to JSON
index_path = 'vector_indexNLP.faiss'
metadata_path = 'metadataNLP.json'

# Convert embeddings to numpy array for FAISS
embeddings_array = np.array(embeddings, dtype='float32')

# Initialize FAISS index
dimension = embeddings_array.shape[1]  # Dimension of the embeddings
index = faiss.IndexFlatL2(dimension)

# Add embeddings in batches to avoid memory issues; I faced some issues when adding the whole index at once.
batch_size = 1000  # Adjust batch size based on available memory
for i in range(0, len(embeddings), batch_size):
    batch_embeddings = embeddings_array[i:i+batch_size]
    index.add(batch_embeddings)

# Save the FAISS index
faiss.write_index(index, index_path)

# Save metadata
with open(metadata_path, 'w') as f:
    json.dump(metadata, f)

print(f"FAISS index saved to: {index_path}")
print(f"Metadata saved to: {metadata_path}")
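Once the script has run, the saved index and metadata can be cross-checked; a minimal sketch, assuming vector_indexNLP.faiss and metadataNLP.json were written by the script above and sit in the working directory:

# Minimal consistency check between the saved FAISS index and its metadata.
# Assumes the two files produced by the script above exist in the working directory.
import json
import faiss

index = faiss.read_index('vector_indexNLP.faiss')
with open('metadataNLP.json') as f:
    metadata = json.load(f)

# One metadata entry per stored embedding; all-MiniLM-L6-v2 produces 384-dimensional vectors.
print(index.ntotal, len(metadata), index.d)
assert index.ntotal == len(metadata)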
Procfile
ADDED
@@ -0,0 +1 @@
web: gunicorn app:app
app.py
ADDED
@@ -0,0 +1,246 @@
from flask import Flask, request, jsonify, render_template
from flask_cors import CORS
import requests
from sentence_transformers import SentenceTransformer
import faiss
import json
import numpy as np
import os
from flask import Flask, request, jsonify
from flask_cors import CORS
from werkzeug.utils import secure_filename
import fitz  # PyMuPDF
import tensorflow as tf
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import json
import re
import shutil

app = Flask(__name__)
CORS(app)  # Enable CORS for all routes

@app.route('/')
def index():
    return render_template('index.html')

UPLOAD_FOLDER = 'uploads'
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER

model = SentenceTransformer('all-MiniLM-L6-v2')

index_path = 'vector_index1.faiss'
metadata_path = 'metadata1.json'

embeddings = []
metadata = []

def tensor_to_string(tensor):
    return tensor.numpy().decode("utf-8")

def extract_text_from_pdf_with_page_numbers(pdf_path):
    doc = fitz.open(pdf_path)
    text_pages = []
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text = page.get_text()
        text_pages.append((page_num + 1, text))
    return text_pages

def custom_standardization(input_data):
    index_pattern = re.compile(r'\.{3,}')
    if bool(index_pattern.search(input_data.numpy().decode('utf-8'))):
        return ""
    stripped_urls = tf.strings.regex_replace(input_data, r"https?://\S+|www\.\S+", "")
    stripped_emails = tf.strings.regex_replace(stripped_urls, r"\S+@\S+", "")
    stripped_brackets = tf.strings.regex_replace(stripped_emails, r"<.*?>", "")
    stripped_square_brackets = tf.strings.regex_replace(stripped_brackets, r"\[|\]", "")
    stripped_digits = tf.strings.regex_replace(stripped_square_brackets, r"\w*\d\w*", "")
    stripped_non_alpha = tf.strings.regex_replace(stripped_digits, r"[^a-zA-Z\s]", "")
    standardized_text = tf.strings.regex_replace(stripped_non_alpha, r"\s+", " ")
    return standardized_text.numpy().decode('utf-8')

def split_into_paragraphs(text):
    # pattern = r'(?<=\n)(?=\d+)'
    paragraphs = re.split(r'(?<=\n)(?=\d+|(?=\n\s*\n))', text)
    paragraphs = [paragraph.strip() for paragraph in paragraphs if paragraph.strip()]
    return paragraphs

def text_to_vectors(paragraphs):
    vectors = model.encode(paragraphs)
    return vectors

def split_into_qa(text):
    # Pattern used to detect index-like (table of contents) entries
    index_pattern = re.compile(r'\.{3,}')
    # Split the text at each question mark followed by a newline or space
    match = re.search(r'(.*\?.*?)\n', text, re.DOTALL)

    # If a match is found, split the text accordingly
    if match:
        question = match.group(1).strip()  # The part before the last question mark
        answer = text[match.end():].strip()  # The part after the last question mark

        # Filter out index-like entries in both question and answer
        if index_pattern.search(question):
            question = ""  # Ignore this as it looks like an index entry
        if index_pattern.search(answer):
            answer = ""  # Ignore this as it looks like an index entry
    else:
        question = text.strip()  # No question mark found, consider the entire text as the question
        answer = ""  # No answer part

    return question, answer

def store_vectors(paragraphs, vectors, metadata, filename, page_num):
    for i, (paragraph, vector) in enumerate(zip(paragraphs, vectors)):
        original_text = paragraph
        question, answer = split_into_qa(original_text)
        original_text = paragraph[:500]
        standardized_text = custom_standardization(tf.constant(paragraph))
        vector = model.encode(standardized_text).tolist()
        metadata.append({
            "index": f'paragraph-{i}',
            "filename": filename,
            "page_num": page_num,
            "standardized_text": standardized_text,
            "question_text": question,
            "answerable_text": answer
        })
        embeddings.append(vector)

@app.route('/upload', methods=['POST'])
def upload_pdf():
    if 'file' not in request.files:
        return jsonify({'error': 'No file part'}), 400
    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No selected file'}), 400
    if file:
        # filename = secure_filename(file.filename)
        # file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        # file.save(file_path)

        filename = secure_filename(file.filename)

        # Delete the uploads folder and its contents
        if os.path.exists(app.config['UPLOAD_FOLDER']):
            shutil.rmtree(app.config['UPLOAD_FOLDER'])

        # Recreate the uploads folder
        os.makedirs(app.config['UPLOAD_FOLDER'])

        file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        file.save(file_path)
        try:
            os.remove('metadata1.json')
            os.remove('vector_index1.faiss')
        except OSError as e:
            print(f"Error: {e.strerror}")
        process_pdf(file_path, filename)
        print(file_path + filename)
        return jsonify({'success': 'File uploaded and processed successfully'})

def process_pdf(file_path, filename):
    text_pages = extract_text_from_pdf_with_page_numbers(file_path)
    for page_num, text in text_pages:
        paragraphs = split_into_paragraphs(text)
        vectors = text_to_vectors(paragraphs)
        store_vectors(paragraphs, vectors, metadata, filename, page_num)
    save_index_and_metadata()

def save_index_and_metadata():
    embeddings_array = np.array(embeddings, dtype='float32')
    dimension = embeddings_array.shape[1]
    index = faiss.IndexFlatL2(dimension)
    batch_size = 1000
    for i in range(0, len(embeddings), batch_size):
        batch_embeddings = embeddings_array[i:i+batch_size]
        index.add(batch_embeddings)
    faiss.write_index(index, index_path)
    with open(metadata_path, 'w') as f:
        json.dump(metadata, f)


# Load FAISS index and metadata

def convert_distance_to_similarity(distance):
    # Assuming the distances are non-negative, we can use a simple conversion:
    return 1 / (1 + distance) * 100

def query_index(query, model, index, metadata, top_k=5):
    query_embedding = model.encode(query).reshape(1, -1).astype('float32')
    D, I = index.search(query_embedding, top_k)

    results = []
    for i in range(top_k):
        doc_metadata = metadata[I[0, i]]
        similarity_score = convert_distance_to_similarity(D[0, i])
        result = {
            "filename": doc_metadata["filename"],
            "page_num": doc_metadata["page_num"],
            "standardized_text": doc_metadata["standardized_text"],
            "question_text": doc_metadata["question_text"],
            "answerable_text": doc_metadata["answerable_text"],
            "score": similarity_score
        }
        results.append(result)

    return results

def fetch_answer_from_external_api(question, result):

    data = {
        "messages": [
            {
                "content": "Question=" + question + ",answer to look from Uploaded pdf file and dont include the field name from the json file in answer section = " + str(result) + "answer=Based on your PDF provided , ",
                "role": "user"
            }
        ],
        "model": "mixtral:8x7b-instruct-v0.1-q6_K"
    }
    print("data=" + str(data))
    response = requests.post('https://inf.cl.uni-trier.de/chat/', json=data, headers={'accept': 'application/json', 'Content-Type': 'application/json'})
    response_data = response.json()

    return response_data.get('response', '')

def create_answer_to_show(query, results):
    answer = f"Based on your query '{query}', the following relevant information was found:\n\n"
    for result in results:
        answer += "\n------------------------------------------------------------------------------------------------------------------\n"
        answer += f"Filename: {result['filename']}\n"
        answer += f"Page number: {result['page_num']}\n"
        answer += f"Related keywords: {result['question_text']}...\n"
        if result['answerable_text'] != "":
            answer += f"Answer: {result['answerable_text'][:500]}\n"
        answer += f"Relevancy Score: {result['score']}\n"
    answer += "\nFor more detailed information, please refer to the respective original texts.\n\n\n"
    return answer

@app.route('/api/query', methods=['POST'])
def query_endpoint():
    data = request.json
    query = data.get('query', '')

    top_k = data.get('top_k', 5)
    index = faiss.read_index(index_path)
    with open(metadata_path, 'r') as f:
        metadata = json.load(f)
    results = query_index(query, model, index, metadata, top_k)
    formatted_answer = create_answer_to_show(query, results)
    answer2 = fetch_answer_from_external_api(query, results[0])
    print("=>" + answer2)

    return jsonify({'answer': answer2 + "\n\n" + formatted_answer})


if __name__ == '__main__':
    app.run(debug=True)
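For manual testing, both endpoints can be exercised from a small client script; a sketch assuming the app is running locally on Flask's default port 5000 and that sample.pdf is any local PDF (both assumptions, not part of this commit):

# Hedged example client for the /upload and /api/query endpoints.
# Assumes the Flask dev server is running at http://127.0.0.1:5000
# and that 'sample.pdf' is any local PDF to index.
import requests

BASE_URL = 'http://127.0.0.1:5000'

# Upload a PDF; the server clears the uploads folder and rebuilds the index.
with open('sample.pdf', 'rb') as f:
    r = requests.post(f'{BASE_URL}/upload', files={'file': f})
print(r.json())

# Ask a question against the freshly built index.
r = requests.post(f'{BASE_URL}/api/query', json={'query': 'What is the leave policy?', 'top_k': 3})
print(r.json()['answer'])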
metadata1.json
ADDED
The diff for this file is too large to render.
requirements.txt
ADDED
@@ -0,0 +1,9 @@
faiss_cpu==1.8.0
sentence_transformers==3.0.1
tensorflow==2.16.1
Flask==3.0.3
Flask-Cors==4.0.1
numpy
tf-keras
PyMuPDF==1.24.5
gunicorn
static/css/styles.css
ADDED
@@ -0,0 +1,209 @@
@import url('https://fonts.googleapis.com/css?family=Exo:400,700');

* {
    margin: 0px;
    padding: 0px;
    box-sizing: border-box;
}

body {
    font-family: 'Exo', sans-serif;
}


.context {
    width: 100%;
    position: absolute;
    top: 30vh;
}

.context h1 {
    text-align: center;
    color: #fefefe;
    font-size: 150px;
}

.context h3 {
    text-align: center;
    color: #e4b714;
    font-size: 30px;
}


.area {
    background: #000428;
    background: -webkit-linear-gradient(to right, #000428, #004e92);
    background: linear-gradient(to right, #000428, #004e92);
    width: 100%;
    height: 100vh;
}

.circles {
    position: absolute;
    top: 0;
    left: 0;
    width: 100%;
    height: 100%;
    overflow: hidden;
}

.circles li {
    position: absolute;
    display: block;
    list-style: none;
    width: 20px;
    height: 20px;
    background: rgba(255, 255, 255, 0.2);
    animation: animate 25s linear infinite;
    bottom: -150px;
}

.circles li:nth-child(1) {
    left: 25%;
    width: 80px;
    height: 80px;
    animation-delay: 0s;
}

.circles li:nth-child(2) {
    left: 10%;
    width: 20px;
    height: 20px;
    animation-delay: 2s;
    animation-duration: 12s;
}

.circles li:nth-child(3) {
    left: 70%;
    width: 20px;
    height: 20px;
    animation-delay: 4s;
}

.circles li:nth-child(4) {
    left: 40%;
    width: 60px;
    height: 60px;
    animation-delay: 0s;
    animation-duration: 18s;
}

.circles li:nth-child(5) {
    left: 65%;
    width: 20px;
    height: 20px;
    animation-delay: 0s;
}

.circles li:nth-child(6) {
    left: 75%;
    width: 110px;
    height: 110px;
    animation-delay: 3s;
}

.circles li:nth-child(7) {
    left: 35%;
    width: 150px;
    height: 150px;
    animation-delay: 7s;
}

.circles li:nth-child(8) {
    left: 50%;
    width: 25px;
    height: 25px;
    animation-delay: 15s;
    animation-duration: 45s;
}

.circles li:nth-child(9) {
    left: 20%;
    width: 15px;
    height: 15px;
    animation-delay: 2s;
    animation-duration: 35s;
}

.circles li:nth-child(10) {
    left: 85%;
    width: 150px;
    height: 150px;
    animation-delay: 0s;
    animation-duration: 11s;
}


@keyframes animate {

    0% {
        transform: translateY(0) rotate(0deg);
        opacity: 1;
        border-radius: 0;
    }

    100% {
        transform: translateY(-1000px) rotate(720deg);
        opacity: 0;
        border-radius: 50%;
    }

}

.context {
    text-align: center;
    color: #fff;
}

.button-container {
    margin-top: 50px;
}

.register-button {
    display: inline-block;
    padding: 10px 20px;
    background-color: transparent;
    border: 2px solid #fff;
    color: #fff;
    text-decoration: none;
    font-size: 18px;
    border-radius: 15px;
    transition: background-color 0.3s, color 0.3s;
}

.register-button:hover {
    transform: scale(1.09);
}


/* Responsive Design */

/* For Mobile Devices */
@media (max-width: 767px) {
    .context h1 {
        font-size: 80px;
        /* Adjust the font size for smaller screens */
    }

    .context h3 {
        font-size: 20px;
        /* Adjust the font size for smaller screens */
    }
}

/* For Tablet Devices */
@media (min-width: 768px) and (max-width: 1023px) {
    .context h1 {
        font-size: 120px;
        /* Adjust the font size for tablet screens */
    }

    .context h3 {
        font-size: 25px;
        /* Adjust the font size for tablet screens */
    }
}
static/js/scripts.js
ADDED
File without changes
templates/index.html
ADDED
@@ -0,0 +1,40 @@
<!DOCTYPE html>
<html lang="en">

<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Shivang - Flask api</title>
    <link rel="icon" href="/static/images/logo.ico" type="image/x-icon">
    <link rel="stylesheet" href="/static/css/styles.css">
    <link href="https://fonts.googleapis.com/css?family=Exo:400,700" rel="stylesheet">
</head>

<body>
    <div class="area">
        <div class="circles">
            <ul>
                <li></li>
                <li></li>
                <li></li>
                <li></li>
                <li></li>
                <li></li>
                <li></li>
                <li></li>
                <li></li>
                <li></li>
            </ul>
        </div>
    </div>
    <div class="context">
        <h1>Flask Api calls</h1>
        <h4>By Shivang sinha</h4>
        <ul>
            <li>/api/query</li>
            <li>/upload</li>
        </ul>
    </div>
</body>

</html>
uploads/employee_handbook_print_1.pdf
ADDED
Binary file (649 kB).
vercel.json
ADDED
@@ -0,0 +1,9 @@
{
    "version": 2,
    "builds": [
        {"src": "app.py", "use": "@vercel/python"}
    ],
    "routes": [
        {"src": "/(.*)", "dest": "app.py"}
    ]
}