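# sbaiiinfo / qa_generate_ans.py
# Author: Sujal Bhat
# Commit: "Initial commit without large files" (35d7369)
# File size: 4.3 kB
# (The "raw" and "history blame" entries were navigation links from the web file viewer.)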
import os
import json
import asyncio
from dotenv import load_dotenv
import pandas as pd
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
import openai
from tenacity import retry, stop_after_attempt, wait_random_exponential
from qa_system import generate_answer # Import the QA system
@retry(stop=stop_after_attempt(5), wait=wait_random_exponential(min=1, max=60))
def generate_testset(generator, documents, test_size, distributions):
    """Generate a test set from LangChain documents, retrying on failure.

    The call is wrapped by tenacity's ``retry``: any exception triggers
    another attempt, with ``wait_random_exponential(min=1, max=60)`` pauses
    between attempts, and retrying stops after 5 attempts.

    Args:
        generator: Object exposing ``generate_with_langchain_docs``.
        documents: Documents forwarded unchanged to the generator.
        test_size: Forwarded as the ``test_size`` keyword argument.
        distributions: Forwarded as the ``distributions`` keyword argument.

    Returns:
        Whatever ``generator.generate_with_langchain_docs`` returns.
    """
    generation_options = {
        "test_size": test_size,
        "distributions": distributions,
    }
    return generator.generate_with_langchain_docs(documents, **generation_options)
async def main():
    """Build (or reuse) a test set and answer every question in it.

    Workflow:
      1. Load ``.env`` and read ``OPENAI_API_KEY`` from the environment.
      2. If ``testset.json`` exists, load it; otherwise generate a new test
         set from the PDF files in ``resources`` and save it to
         ``testset.json``.
      3. Run ``generate_answer`` on every question and write the result to
         ``testset_with_answers.json`` and ``testset_with_answers.csv``.

    The function is declared ``async`` because the script entry point runs
    it through ``asyncio.run``; the body itself performs no ``await``.

    Raises:
        ValueError: If ``OPENAI_API_KEY`` is not set in the environment.
    """
    # Load environment variables from .env file
    load_dotenv()
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("OpenAI API key not found in environment variables.")

    # Make the key visible both to libraries that read the environment and
    # to the openai module directly.
    os.environ["OPENAI_API_KEY"] = api_key
    openai.api_key = api_key

    if os.path.exists("testset.json"):
        print("Loading existing testset from testset.json")
        with open("testset.json", "r", encoding="utf-8") as f:
            testset_dict = json.load(f)
        df = pd.DataFrame(testset_dict)
    else:
        print("Generating new testset")

        # Load PDF documents. os.listdir returns names in arbitrary order,
        # so sort them to keep the document order reproducible.
        documents = []
        pdf_dir = "resources"
        for filename in sorted(os.listdir(pdf_dir)):
            if not filename.lower().endswith(".pdf"):
                continue
            docs = PyPDFLoader(os.path.join(pdf_dir, filename)).load()
            for doc in docs:
                # 'filename' mirrors the loader-provided 'source' value.
                doc.metadata['filename'] = doc.metadata['source']
            documents.extend(docs)

        if not documents:
            # Bail out early instead of sending an empty corpus through the
            # retry-wrapped generator.
            print(f"No PDF documents found in '{pdf_dir}'; cannot generate a testset.")
            return

        # Initialize OpenAI models (using gpt-3.5-turbo for both to reduce costs)
        generator_llm = ChatOpenAI(api_key=api_key, model="gpt-3.5-turbo")
        critic_llm = ChatOpenAI(api_key=api_key, model="gpt-3.5-turbo")
        embeddings = OpenAIEmbeddings(api_key=api_key)

        # Initialize the Testset Generator
        generator = TestsetGenerator.from_langchain(
            generator_llm,
            critic_llm,
            embeddings,
        )

        try:
            # Generate testset with retry logic
            testset = generate_testset(
                generator,
                documents,
                test_size=3,
                distributions={simple: 0.6, reasoning: 0.2, multi_context: 0.2},
            )
            # Convert testset to pandas DataFrame
            df = testset.to_pandas()

            # Save testset to JSON file so later runs can reuse it
            testset_dict = df.to_dict(orient='records')
            with open("testset.json", "w", encoding="utf-8") as f:
                json.dump(testset_dict, f, indent=2)
            print("New testset saved to testset.json")
        except Exception as e:
            print(f"An unexpected error occurred while generating testset: {e}")
            return

    if "question" not in df.columns:
        # An empty or malformed testset has no questions to answer.
        print("Testset has no 'question' column; nothing to answer.")
        return

    try:
        # Generate new answers for each question using the existing QA system
        print("Generating new answers for all questions")
        df['answer'] = df['question'].apply(generate_answer)

        # Update JSON file with new answers
        testset_dict = df.to_dict(orient='records')
        with open("testset_with_answers.json", "w", encoding="utf-8") as f:
            json.dump(testset_dict, f, indent=2)
        print("Testset with new answers saved to testset_with_answers.json")

        # Save testset with new answers to CSV file
        df.to_csv("testset_with_answers.csv", index=False)
        print("Testset with new answers saved to testset_with_answers.csv")
    except openai.APIConnectionError as e:
        print(f"Failed to connect to OpenAI API after multiple attempts: {e}")
    except RuntimeError as e:
        print(f"Runtime error occurred: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
if __name__ == "__main__":
    # Run the workflow only when executed as a script, not when imported.
    entry_coroutine = main()
    asyncio.run(entry_coroutine)