# Required libraries
import os
import json
import uuid  # Used to generate unique IDs for Qdrant points

import fitz  # PyMuPDF
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, PointStruct, VectorParams
# Load environment variables from .env file
load_dotenv()
# Initialize Qdrant client
qdrant_api_key = os.getenv("QDRANT_API_KEY")  # Qdrant API key from environment variables
qdrant_client = QdrantClient(
    url="https://9266da83-dbfe-48d6-b2d8-cdf101299284.europe-west3-0.gcp.cloud.qdrant.io",
    api_key=qdrant_api_key,
    timeout=300  # 5-minute timeout for large uploads
)
# Load the set of already-processed document filenames
def load_processed_docs():
    try:
        with open('processed_docs.json', 'r') as f:
            return set(json.load(f))
    except FileNotFoundError:
        return set()
# Persist the set of processed document filenames
def save_processed_docs(processed_docs):
    with open('processed_docs.json', 'w') as f:
        json.dump(list(processed_docs), f)
# Create the Qdrant collection if it doesn't already exist
def create_qdrant_collection(collection_name, vector_size):
    try:
        # Check if the collection already exists
        qdrant_client.get_collection(collection_name)
        print(f"Collection '{collection_name}' already exists.")
    except Exception:
        # If the collection doesn't exist, create it
        qdrant_client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE)
        )
        print(f"Collection '{collection_name}' created successfully.")
# Store embeddings in Qdrant, uploading in batches
def store_embeddings_in_qdrant(embedded_chunks, collection_name):
    points = []
    for theme, embeddings in embedded_chunks.items():
        for embedding in embeddings:
            points.append(PointStruct(
                id=str(uuid.uuid4()),  # Unique ID for each point
                vector=embedding,
                payload={"theme": theme}
            ))

    # Upload points in batches to keep individual requests small
    batch_size = 100
    for i in range(0, len(points), batch_size):
        batch = points[i:i + batch_size]
        try:
            qdrant_client.upsert(
                collection_name=collection_name,
                points=batch
            )
            print(f"Uploaded batch {i // batch_size + 1} to collection '{collection_name}'.")
        except Exception as e:
            print(f"Error uploading batch {i // batch_size + 1}: {e}")

    print(f"Finished uploading {len(points)} points to collection '{collection_name}'.")
# Step 1: Extract text from PDFs
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    doc.close()
    return text
# Step 2: Define themes used to bucket chunks
themes = [
    "Safe and Effective Systems",
    "Algorithmic Discrimination Protections",
    "Data Privacy",
    "Notice and Explanation",
    "Human Alternatives",
    "Risk Management",
    "Governance",
    "Trustworthiness",
    "Unclassified"
]
# Step 3: Chunk the text and assign each chunk to a theme by simple keyword matching
def chunk_text(text, themes):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = text_splitter.split_text(text)
    thematic_chunks = {theme: [] for theme in themes}
    thematic_chunks.setdefault("Unclassified", [])  # Fallback bucket for chunks matching no theme

    for chunk in chunks:
        theme_found = False
        for theme in themes:
            if theme.lower() in chunk.lower():
                thematic_chunks[theme].append(chunk)
                theme_found = True
                break
        if not theme_found:
            thematic_chunks["Unclassified"].append(chunk)

    return thematic_chunks
# Step 4: Embed the chunks with OpenAI embeddings
def embed_chunks(thematic_chunks):
    openai_api_key = os.getenv("OPENAI_API_KEY")  # OpenAI API key from environment variables
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key=openai_api_key)
    # Guard against empty theme buckets to avoid an embeddings call with no input
    embedded_chunks = {
        theme: embeddings.embed_documents(chunks) if chunks else []
        for theme, chunks in thematic_chunks.items()
    }
    return embedded_chunks
# Main execution block: process only PDFs that haven't been ingested yet
def main():
    resources_folder = "resources"
    processed_docs = load_processed_docs()
    new_docs_processed = False
    collection_name = "ai_info_collection"

    for filename in os.listdir(resources_folder):
        if filename.endswith(".pdf") and filename not in processed_docs:
            pdf_path = os.path.join(resources_folder, filename)
            text = extract_text_from_pdf(pdf_path)
            thematic_chunks = chunk_text(text, themes)
            embedded_chunks = embed_chunks(thematic_chunks)

            # Ensure the collection exists before the first upload
            if not new_docs_processed:
                # Infer the vector size from the first non-empty embedding list
                vector_size = len(next(v[0] for v in embedded_chunks.values() if v))
                create_qdrant_collection(collection_name, vector_size)

            # Store embeddings for this document
            store_embeddings_in_qdrant(embedded_chunks, collection_name)
            processed_docs.add(filename)
            new_docs_processed = True
            print(f"Processed and added embeddings for {filename}")

    if new_docs_processed:
        save_processed_docs(processed_docs)
        print("New documents processed and added to the collection.")
    else:
        print("No new documents to process.")
if __name__ == "__main__":
    main()