"""Ingest PDFs from the resources folder, embed their text chunks by theme
with OpenAI embeddings, and upsert the vectors into a Qdrant collection."""

import json
import os
import uuid

import fitz  # PyMuPDF
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, PointStruct, VectorParams

load_dotenv()

# Qdrant Cloud client; the API key is read from the environment.
qdrant_api_key = os.getenv("QDRANT_API_KEY")
qdrant_client = QdrantClient(
    url="https://9266da83-dbfe-48d6-b2d8-cdf101299284.europe-west3-0.gcp.cloud.qdrant.io",
    api_key=qdrant_api_key,
    timeout=300,
)
|
|
def load_processed_docs():
    """Return the set of already-processed filenames (empty if none yet)."""
    try:
        with open("processed_docs.json", "r") as f:
            return set(json.load(f))
    except FileNotFoundError:
        return set()
|
|
def save_processed_docs(processed_docs):
    """Persist the set of processed filenames as a JSON list."""
    with open("processed_docs.json", "w") as f:
        json.dump(list(processed_docs), f)
|
|
def create_qdrant_collection(collection_name, vector_size):
    """Create the collection if it does not already exist."""
    try:
        qdrant_client.get_collection(collection_name)
        print(f"Collection '{collection_name}' already exists.")
    except Exception:
        # get_collection raises when the collection is missing, so create it.
        qdrant_client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
        )
        print(f"Collection '{collection_name}' created successfully.")
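# Note: newer qdrant-client releases also expose a boolean probe, which is a
# cleaner alternative to catching a generic exception (a sketch, assuming
# qdrant-client >= 1.7):
#
#     if not qdrant_client.collection_exists(collection_name):
#         qdrant_client.create_collection(...)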
|
|
def store_embeddings_in_qdrant(embedded_chunks, collection_name):
    """Upsert the theme-labelled vectors into Qdrant in batches."""
    points = []
    for theme, embeddings in embedded_chunks.items():
        for embedding in embeddings:
            # Each point gets a random UUID; the payload records only the
            # theme, not the chunk text itself.
            points.append(
                PointStruct(
                    id=str(uuid.uuid4()),
                    vector=embedding,
                    payload={"theme": theme},
                )
            )

    # Upload in batches so a single oversized request cannot fail the run.
    batch_size = 100
    for i in range(0, len(points), batch_size):
        batch = points[i:i + batch_size]
        try:
            qdrant_client.upsert(collection_name=collection_name, points=batch)
            print(f"Uploaded batch {i // batch_size + 1} to collection '{collection_name}'.")
        except Exception as e:
            print(f"Error uploading batch {i // batch_size + 1}: {e}")

    print(f"Finished uploading {len(points)} points to collection '{collection_name}'.")
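# A quick sanity check after ingestion, assuming the defaults above: the
# client's count API reports how many points the collection now holds, e.g.
#
#     print(qdrant_client.count(collection_name="ai_info_collection").count)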
|
|
def extract_text_from_pdf(pdf_path):
    """Concatenate the plain text of every page in the PDF."""
    with fitz.open(pdf_path) as doc:  # context manager closes the file
        return "".join(page.get_text() for page in doc)
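# Note: get_text() with no arguments returns each page's plain text; the
# reading order is only as faithful as the PDF's internal structure.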
|
|
themes = [
    "Safe and Effective Systems",
    "Algorithmic Discrimination Protections",
    "Data Privacy",
    "Notice and Explanation",
    "Human Alternatives",
    "Risk Management",
    "Governance",
    "Trustworthiness",
    "Unclassified",
]
|
|
def chunk_text(text, themes):
    """Split text into overlapping chunks and bucket them by theme.

    A chunk is assigned to the first theme whose name appears verbatim
    (case-insensitively) in its text; everything else goes to "Unclassified".
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = text_splitter.split_text(text)
    # "Unclassified" is already in themes, so every bucket exists up front.
    thematic_chunks = {theme: [] for theme in themes}

    for chunk in chunks:
        theme_found = False
        for theme in themes:
            if theme.lower() in chunk.lower():
                thematic_chunks[theme].append(chunk)
                theme_found = True
                break
        if not theme_found:
            thematic_chunks["Unclassified"].append(chunk)
    return thematic_chunks
|
|
def embed_chunks(thematic_chunks):
    """Embed every chunk with OpenAI's text-embedding-3-small model."""
    openai_api_key = os.getenv("OPENAI_API_KEY")
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key=openai_api_key)
    # Skip empty themes so no embedding request is issued for zero texts.
    return {
        theme: embeddings.embed_documents(chunks) if chunks else []
        for theme, chunks in thematic_chunks.items()
    }
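# text-embedding-3-small vectors are 1536-dimensional by default; main()
# below derives the size from the first embedding rather than hardcoding it.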
|
|
def main():
    resources_folder = "resources"
    processed_docs = load_processed_docs()
    new_docs_processed = False

    collection_name = "ai_info_collection"

    for filename in os.listdir(resources_folder):
        if filename.endswith(".pdf") and filename not in processed_docs:
            pdf_path = os.path.join(resources_folder, filename)
            text = extract_text_from_pdf(pdf_path)
            thematic_chunks = chunk_text(text, themes)
            embedded_chunks = embed_chunks(thematic_chunks)

            if not any(embedded_chunks.values()):
                print(f"No text extracted from {filename}; skipping.")
                continue

            if not new_docs_processed:
                # Derive the vector size from the first non-empty theme;
                # indexing the first theme unconditionally would raise an
                # IndexError whenever it happened to have no chunks.
                vector_size = len(next(v for v in embedded_chunks.values() if v)[0])
                create_qdrant_collection(collection_name, vector_size)

            store_embeddings_in_qdrant(embedded_chunks, collection_name)

            processed_docs.add(filename)
            new_docs_processed = True
            print(f"Processed and added embeddings for {filename}")

    if new_docs_processed:
        save_processed_docs(processed_docs)
        print("New documents processed and added to the collection.")
    else:
        print("No new documents to process.")


if __name__ == "__main__":
    main()
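# Expected setup, as implied by the code above: a .env file (or environment)
# providing QDRANT_API_KEY and OPENAI_API_KEY, and a resources/ folder of
# PDFs next to this script. Processed filenames accumulate in
# processed_docs.json so reruns skip documents already ingested.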