import json
import os
from typing import Any

from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    Document,
    StorageContext,
    load_index_from_storage,
    PromptTemplate,
)
from llama_index.core.node_parser import SentenceSplitter, SemanticSplitterNodeParser
from llama_index.embeddings.openai import OpenAIEmbedding
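
# NOTE: OpenAIEmbedding and the default query-time LLM both read the
# OPENAI_API_KEY environment variable, so it must be set before the
# pipeline is instantiated.
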
class RAGPipeline:
    def __init__(
        self, metadata_file: str, pdf_dir: str, use_semantic_splitter: bool = False
    ):
        self.metadata_file = metadata_file
        self.pdf_dir = pdf_dir
        self.use_semantic_splitter = use_semantic_splitter
        self.index = None
        self.load_documents()
        self.build_index()

    def load_documents(self):
        """Build one Document per metadata entry, appending the PDF full text."""
        with open(self.metadata_file, "r") as f:
            self.metadata = json.load(f)
        self.documents = []
        for item_key, item_data in self.metadata.items():
            metadata = item_data["metadata"]
            pdf_path = item_data.get("pdf_path")
            if pdf_path:
                full_pdf_path = os.path.join(self.pdf_dir, os.path.basename(pdf_path))
                if os.path.exists(full_pdf_path):
                    # SimpleDirectoryReader returns one Document per PDF page,
                    # so join all pages instead of keeping only the first.
                    pdf_content = "\n".join(
                        doc.text
                        for doc in SimpleDirectoryReader(
                            input_files=[full_pdf_path]
                        ).load_data()
                    )
                else:
                    pdf_content = "PDF file not found"
            else:
                pdf_content = "PDF path not available in metadata"
            doc_content = (
                f"Title: {metadata['title']}\n"
                f"Abstract: {metadata['abstract']}\n"
                f"Authors: {metadata['authors']}\n"
                f"Year: {metadata['year']}\n"
                f"DOI: {metadata['doi']}\n"
                f"Full Text: {pdf_content}"
            )
            self.documents.append(
                Document(text=doc_content, id_=item_key, metadata=metadata)
            )
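
    # Expected shape of the metadata file, inferred from the keys accessed in
    # load_documents above; the example values are illustrative only:
    # {
    #     "ITEM_KEY": {
    #         "metadata": {
    #             "title": "...",
    #             "abstract": "...",
    #             "authors": "...",
    #             "year": "...",
    #             "doi": "..."
    #         },
    #         "pdf_path": "path/to/paper.pdf"
    #     }
    # }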

    def build_index(self):
        """Chunk the documents and build an in-memory vector index."""
        if self.use_semantic_splitter:
            # Semantic chunking: split where the embedding similarity between
            # adjacent sentence groups drops past the 95th-percentile breakpoint.
            embed_model = OpenAIEmbedding()
            splitter = SemanticSplitterNodeParser(
                buffer_size=1,
                breakpoint_percentile_threshold=95,
                embed_model=embed_model,
            )
        else:
            # Fixed-size chunking with a small overlap between adjacent chunks.
            splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=20)
        nodes = splitter.get_nodes_from_documents(self.documents)
        self.index = VectorStoreIndex(nodes)
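
    # StorageContext and load_index_from_storage are imported above but never
    # used. A minimal persistence sketch that would put them to work; the
    # method names and the persist_dir default are ours, not the original's:
    def persist_index(self, persist_dir: str = "./index_storage"):
        """Write the built index to disk so it can be reloaded later."""
        self.index.storage_context.persist(persist_dir=persist_dir)

    def load_persisted_index(self, persist_dir: str = "./index_storage"):
        """Reload a previously persisted index instead of rebuilding it."""
        storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
        self.index = load_index_from_storage(storage_context)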

    def query(self, question: str, prompt_type: str = "default") -> Any:
        # Returns a llama_index Response object (not a plain dict): it carries
        # the answer text plus the retrieved source nodes.
        prompt_template = self._get_prompt_template(prompt_type)
        query_engine = self.index.as_query_engine(
            text_qa_template=prompt_template, similarity_top_k=5
        )
        response = query_engine.query(question)
        return response

    def _get_prompt_template(self, prompt_type: str) -> PromptTemplate:
        if prompt_type == "highlight":
            return PromptTemplate(
                "Context information is below.\n"
                "---------------------\n"
                "{context_str}\n"
                "---------------------\n"
                "Given this information, please answer the question: {query_str}\n"
                "Include all relevant information from the provided context. "
                "Highlight key information by enclosing it in **asterisks**. "
                "When quoting specific information, please use square brackets to indicate the source, e.g. [1], [2], etc."
            )
        elif prompt_type == "evidence_based":
            return PromptTemplate(
                "Context information is below.\n"
                "---------------------\n"
                "{context_str}\n"
                "---------------------\n"
                "Given this information, please answer the question: {query_str}\n"
                "Provide an answer to the question using evidence from the context above. "
                "Cite sources using square brackets."
            )
        else:
            return PromptTemplate(
                "Context information is below.\n"
                "---------------------\n"
                "{context_str}\n"
                "---------------------\n"
                "Given this information, please answer the question: {query_str}\n"
                "Include all relevant information from the provided context. "
                "If information comes from multiple sources, please mention all of them. "
                "If the information is not available in the context, please state that clearly. "
                "When quoting specific information, please use square brackets to indicate the source, e.g. [1], [2], etc."
            )
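

if __name__ == "__main__":
    # Usage sketch: the file names and the question below are illustrative,
    # not taken from the original code.
    pipeline = RAGPipeline(metadata_file="metadata.json", pdf_dir="pdfs")
    response = pipeline.query(
        "What evaluation methods do these papers use?",
        prompt_type="evidence_based",
    )
    print(response)  # printing a Response renders its answer text
    for node in response.source_nodes:
        # Each retrieved chunk carries its similarity score and metadata.
        print(node.score, node.node.metadata.get("title"))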