# qa_generate_ans.py
import os
import json
import asyncio
from dotenv import load_dotenv
import pandas as pd
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
import openai
from tenacity import retry, stop_after_attempt, wait_random_exponential
from qa_system import generate_answer # Import the QA system
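
# Retry with randomized exponential backoff (1-60 s) for up to 5 attempts so
# transient OpenAI rate-limit or connection errors don't abort generation.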
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(5))
def generate_testset(generator, documents, test_size, distributions):
    return generator.generate_with_langchain_docs(
        documents,
        test_size=test_size,
        distributions=distributions
    )
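
# The returned testset exposes .to_pandas(); in ragas 0.1.x the resulting
# frame typically carries 'question', 'contexts', 'ground_truth', and
# 'evolution_type' columns. Column names are version-dependent, so treat
# them as an assumption rather than a guarantee.
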
async def main():
    # Load environment variables from .env file
    load_dotenv()
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
    if not OPENAI_API_KEY:
        raise ValueError("OpenAI API key not found in environment variables.")

    # Propagate the key to both the environment and the openai module,
    # since different libraries read it from different places
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
    openai.api_key = OPENAI_API_KEY

    # Reuse a previously generated testset if one exists on disk
    if os.path.exists("testset.json"):
        print("Loading existing testset from testset.json")
        with open("testset.json", "r") as f:
            testset_dict = json.load(f)
        df = pd.DataFrame(testset_dict)
    else:
        print("Generating new testset")

        # Load PDF documents
        documents = []
        pdf_dir = "resources"
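        # Tag each page with its source PDF's basename: ragas' testset
        # generator expects a 'filename' key in document metadata (per the
        # ragas 0.1 docs; treat this as an assumption for other versions).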
        for filename in os.listdir(pdf_dir):
            if filename.lower().endswith(".pdf"):
                loader = PyPDFLoader(os.path.join(pdf_dir, filename))
                docs = loader.load()
                for doc in docs:
                    doc.metadata['filename'] = filename
                documents.extend(docs)

        # Initialize OpenAI models (using gpt-3.5-turbo for both to reduce costs)
        generator_llm = ChatOpenAI(api_key=OPENAI_API_KEY, model="gpt-3.5-turbo")
        critic_llm = ChatOpenAI(api_key=OPENAI_API_KEY, model="gpt-3.5-turbo")
        embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY)
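
        # In ragas, the generator LLM drafts candidate questions while the
        # critic LLM scores and filters them; the embeddings model supports
        # context selection during question evolution (per the ragas 0.1 API).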
        # Initialize the Testset Generator
        generator = TestsetGenerator.from_langchain(
            generator_llm,
            critic_llm,
            embeddings
        )

        try:
            # Generate testset with retry logic
            testset = generate_testset(
                generator,
                documents,
                test_size=3,
                distributions={simple: 0.6, reasoning: 0.2, multi_context: 0.2}
            )

            # Convert testset to pandas DataFrame
            df = testset.to_pandas()

            # Save testset to JSON file
            testset_dict = df.to_dict(orient='records')
            with open("testset.json", "w") as f:
                json.dump(testset_dict, f, indent=2)
            print("New testset saved to testset.json")
        except Exception as e:
            print(f"An unexpected error occurred while generating testset: {e}")
            return

    try:
        # Generate new answers for each question using the existing QA system
        print("Generating new answers for all questions")
        df['answer'] = df['question'].apply(generate_answer)
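
        # .apply() calls generate_answer once per row, serially; it is
        # assumed here to be a synchronous function that takes a question
        # string and returns an answer string.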

        # Update JSON file with new answers
        testset_dict = df.to_dict(orient='records')
        with open("testset_with_answers.json", "w") as f:
            json.dump(testset_dict, f, indent=2)
        print("Testset with new answers saved to testset_with_answers.json")

        # Save testset with new answers to CSV file
        df.to_csv("testset_with_answers.csv", index=False)
        print("Testset with new answers saved to testset_with_answers.csv")
    except openai.APIConnectionError as e:
        print(f"Failed to connect to OpenAI API after multiple attempts: {e}")
    except RuntimeError as e:
        print(f"Runtime error occurred: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

if __name__ == "__main__":
    asyncio.run(main())
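
# Expected artifacts after a successful run: testset.json (generated
# questions and contexts) plus testset_with_answers.json and
# testset_with_answers.csv (the same rows with the QA system's answers).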