import os
import fitz  # PyMuPDF
import tensorflow as tf
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import json
import re

# This folder should contain all the PDF files to be processed. The path below is just an example.
pdf_folder = '/Users/shivangsinha/Downloads/personalProject'
pdf_text_data = {}
embeddings = []
metadata = []

# Initialize the sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')
# model = SentenceTransformer('paraphrase-MiniLM-L6-v2')  # Also tried this model, but the current one seems to work better.

# Convert a TensorFlow string tensor to a Python string so that it can be stored in JSON.
def tensor_to_string(tensor):
    return tensor.numpy().decode("utf-8")  # Assuming utf-8 encoding

# Extract text page by page so that search results can be traced back to a page number.
def extract_text_from_pdf_with_page_numbers(pdf_path):
    doc = fitz.open(pdf_path)
    text_pages = []

    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text = page.get_text()
        text_pages.append((page_num + 1, text))  # fitz pages are 0-based, so store 1-based page numbers

    return text_pages

# Skip text that looks like a table-of-contents entry and strip out parts of the text that are irrelevant for the search.
def custom_standardization(input_data):
    # If the text contains a long run of dots (typical of a table-of-contents line), ignore it entirely.
    index_pattern = re.compile(r'\.{3,}')
    if index_pattern.search(input_data.numpy().decode('utf-8')):
        return ""

    # Remove URLs
    stripped_urls = tf.strings.regex_replace(input_data, r"https?://\S+|www\.\S+", "")

    # Remove email addresses
    stripped_emails = tf.strings.regex_replace(stripped_urls, r"\S+@\S+", "")

    # Remove text in angular brackets (usually HTML tags)
    stripped_brackets = tf.strings.regex_replace(stripped_emails, r"<.*?>", "")

    # Remove the square brackets but keep the text that was inside them
    stripped_square_brackets = tf.strings.regex_replace(stripped_brackets, r"\[|\]", "")

    # Remove any word that contains a digit
    stripped_digits = tf.strings.regex_replace(stripped_square_brackets, r"\w*\d\w*", "")

    # Remove non-alphabet characters
    stripped_non_alpha = tf.strings.regex_replace(stripped_digits, r"[^a-zA-Z\s]", "")

    # Replace multiple whitespaces with a single whitespace
    standardized_text = tf.strings.regex_replace(stripped_non_alpha, r"\s+", " ")

    return standardized_text.numpy().decode('utf-8')
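
# Illustrative example (hypothetical input): passing
#   tf.constant("See https://example.com for details [RFC 2616], section 4.2")
# through custom_standardization strips the URL, the square brackets, and the
# digit-bearing tokens, leaving roughly "See for details RFC section ".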


# For now the PDFs are assumed to follow a numbered question-and-answer layout, so the text is split into paragraphs at lines that start with a number followed by a period (e.g. '1.', '2.').
def split_into_paragraphs(text):
    pattern = r'(?<=\n)(?=\d+\.)'
    
    # Split text using the pattern
    paragraphs = re.split(pattern, text)
    
    # Remove leading/trailing whitespace from each paragraph and filter out empty paragraphs
    paragraphs = [paragraph.strip() for paragraph in paragraphs if paragraph.strip()]
    
    return paragraphs
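
# Illustrative example (hypothetical input): text such as
#   "1. What is NLP?\nNLP studies language.\n2. What is FAISS?\nFAISS is a similarity search library."
# is split before the "2." that follows a newline, yielding two numbered
# question-and-answer paragraphs.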


# Encode a list of paragraphs into embedding vectors.
def text_to_vectors(paragraphs):
    vectors = model.encode(paragraphs)
    return vectors

# Split a paragraph into a question part and an answer part, used to answer a query or simply show the relevant text from the book.
def split_into_qa(text):
    # Pattern for index-like entries (runs of three or more dots), used below to filter them out
    index_pattern = re.compile(r'\.{3,}')
    # Split the text at the last question mark that is followed by a newline
    match = re.search(r'(.*\?.*?)\n', text, re.DOTALL)
    
    # If a match is found, split the text accordingly
    if match:
        question = match.group(1).strip()  # Everything up to and including the last question mark
        answer = text[match.end():].strip()  # Everything after the newline that follows it
        
        # Filter out index-like entries in both question and answer
        if index_pattern.search(question):
            question = ""  # Ignore this as it looks like an index entry
        if index_pattern.search(answer):
            answer = ""  # Ignore this as it looks like an index entry
    else:
        question = text.strip()  # No question mark found, consider the entire text as the question
        answer = ""  # No answer part
    
    return question, answer
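
# Illustrative example (hypothetical input): for the paragraph
#   "12. What is tokenization?\nTokenization is splitting text into units."
# split_into_qa returns the question "12. What is tokenization?" and the text
# after the newline as the answer.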

# Store each paragraph's embedding vector and metadata so they can be used later when querying.
def store_vectors(paragraphs, vectors, metadata, filename, page_num):
    for i, (paragraph, vector) in enumerate(zip(paragraphs, vectors)):
        original_text = paragraph
        question, answer = split_into_qa(original_text)
        original_text = paragraph[:500]  # Keep only the first 500 characters of the original text
        standardized_text = custom_standardization(tf.constant(paragraph))
        vector = model.encode(standardized_text).tolist()  # Recompute the vector on the standardized text; the pre-computed vector for the raw paragraph is discarded
        metadata.append({
            "index": f'paragraph-{i}',
            "filename": filename,
            "page_num": page_num,
            "original_text": original_text,
            "standardized_text": standardized_text,
            "question_text": question,
            "answerable_text": answer
        })
        embeddings.append(vector)

for filename in os.listdir(pdf_folder):
    if filename.endswith('.pdf'):
        pdf_path = os.path.join(pdf_folder, filename)
        text_pages = extract_text_from_pdf_with_page_numbers(pdf_path)
        for page_num, text in text_pages:
            paragraphs = split_into_paragraphs(text)
            vectors = text_to_vectors(paragraphs)
            store_vectors(paragraphs, vectors, metadata, filename, page_num)
        pdf_text_data[filename] = text_pages

# Output paths for the FAISS index and the JSON metadata
index_path = 'vector_indexNLP.faiss'
metadata_path = 'metadataNLP.json'

# Convert embeddings to numpy array for FAISS
embeddings_array = np.array(embeddings, dtype='float32')

# Initialize FAISS index
dimension = embeddings_array.shape[1]  # Dimension of the embeddings
index = faiss.IndexFlatL2(dimension)

# Add embeddings in batches to avoid memory issues (adding everything at once caused problems when building the index)
batch_size = 1000  # Adjust batch size based on available memory
for i in range(0, len(embeddings), batch_size):
    batch_embeddings = embeddings_array[i:i+batch_size]
    index.add(batch_embeddings)

# Save the FAISS index
faiss.write_index(index, index_path)

# Save metadata
with open(metadata_path, 'w') as f:
    json.dump(metadata, f)

print(f"FAISS index saved to: {index_path}")
print(f"Metadata saved to: {metadata_path}")