Spaces:

Sunbird
/

acres

Running

App Files Files Community

acres / rag /rag_pipeline.py

ak3ra

custom prompts

6a076b8 2 months ago

raw

history blame

3.4 kB

	# rag/rag_pipeline.py

	import json
	from typing import Dict, Any
	from llama_index.core import Document, VectorStoreIndex
	from llama_index.core.node_parser import SentenceWindowNodeParser, SentenceSplitter
	from llama_index.core import PromptTemplate
	from typing import List


	class RAGPipeline:
	    """Question answering over a JSON file of study records.

	    The pipeline is lazy: records are read from ``study_json`` and indexed
	    the first time they are needed, and the resulting documents and index
	    are cached on the instance for later calls.

	    The JSON file is expected to hold a list of objects; the fields read
	    from each object are ``title``, ``authors``, ``full_text``,
	    ``abstract``, ``year`` and ``doi``.

	    NOTE(review): ``use_semantic_splitter`` is stored but never read inside
	    this class -- confirm whether a caller relies on it or it can go.
	    """

	    # Chunking / retrieval settings; subclasses may override them.
	    CHUNK_SIZE = 128
	    CHUNK_OVERLAP = 13
	    WINDOW_SIZE = 3
	    SIMILARITY_TOP_K = 5

	    # Text of the question-answering prompt used when the caller supplies none.
	    DEFAULT_QA_PROMPT = (
	        "Context information is below.\n"
	        "---------------------\n"
	        "{context_str}\n"
	        "---------------------\n"
	        "Given this information, please answer the question: {query_str}\n"
	        "Include all relevant information from the provided context. "
	        "If information comes from multiple sources, please mention all of them. "
	        "If the information is not available in the context, please state that clearly. "
	        "When quoting specific information, please use square brackets to indicate the source, e.g. [1], [2], etc."
	    )

	    def __init__(self, study_json, use_semantic_splitter=False):
	        """Remember the data location; nothing is read or indexed yet.

	        Args:
	            study_json: Path of the JSON file holding the study records.
	            use_semantic_splitter: Stored on the instance as given.
	        """
	        self.study_json = study_json
	        self.use_semantic_splitter = use_semantic_splitter
	        self.data = None  # parsed JSON, set by load_documents()
	        self.documents = None  # list of Document, set by load_documents()
	        self.index = None  # VectorStoreIndex, set by build_index()

	    @staticmethod
	    def _format_authors(authors) -> str:
	        """Return the authors as one comma-separated string.

	        A missing/empty value yields ``""``; a plain string is returned
	        unchanged instead of being joined character by character.
	        """
	        if not authors:
	            return ""
	        if isinstance(authors, str):
	            return authors
	        return ", ".join(str(author) for author in authors)

	    @staticmethod
	    def _build_document_text(doc_data) -> str:
	        """Render the text that gets indexed for one study record.

	        Missing or null fields are rendered as empty strings, mirroring the
	        tolerant ``.get()`` lookups used for the metadata.
	        """
	        title = doc_data.get("title") or ""
	        authors = RAGPipeline._format_authors(doc_data.get("authors"))
	        full_text = doc_data.get("full_text") or ""
	        return (
	            f"Title: {title}\n"
	            f"Authors: {authors}\n"
	            f"Full Text: {full_text}"
	        )

	    def load_documents(self):
	        """Read ``study_json`` and build one ``Document`` per record.

	        Does nothing when the documents were already loaded. The parsed
	        JSON is kept in ``self.data`` and the documents in
	        ``self.documents``.

	        Raises:
	            OSError: If the file cannot be opened.
	            json.JSONDecodeError: If the file is not valid JSON.
	        """
	        if self.documents is not None:
	            return

	        # Decode explicitly as UTF-8 so the result does not depend on the
	        # platform's default locale encoding.
	        with open(self.study_json, "r", encoding="utf-8") as f:
	            self.data = json.load(f)

	        # Build the complete list before publishing it, so a failure part
	        # way through does not leave a partial list that later calls would
	        # mistake for a finished load.
	        self.documents = [
	            Document(
	                text=self._build_document_text(doc_data),
	                id_=f"doc_{index}",
	                metadata={
	                    "title": doc_data.get("title"),
	                    "abstract": doc_data.get("abstract"),
	                    "authors": doc_data.get("authors", []),
	                    "year": doc_data.get("year"),
	                    "doi": doc_data.get("doi"),
	                },
	            )
	            for index, doc_data in enumerate(self.data)
	        ]

	    def build_index(self):
	        """Build the vector index over the loaded documents, once.

	        Loads the documents if necessary, parses them into nodes with a
	        ``SentenceWindowNodeParser`` and stores the resulting
	        ``VectorStoreIndex`` in ``self.index``. Does nothing when an index
	        already exists.
	        """
	        if self.index is not None:
	            return

	        self.load_documents()
	        sentence_splitter = SentenceSplitter(
	            chunk_size=self.CHUNK_SIZE, chunk_overlap=self.CHUNK_OVERLAP
	        )

	        def _split(text: str) -> List[str]:
	            return sentence_splitter.split_text(text)

	        node_parser = SentenceWindowNodeParser.from_defaults(
	            sentence_splitter=_split,
	            window_size=self.WINDOW_SIZE,
	            window_metadata_key="window",
	            original_text_metadata_key="original_text",
	        )

	        nodes = node_parser.get_nodes_from_documents(self.documents)
	        self.index = VectorStoreIndex(nodes)

	    def query(
	        self, question: str, prompt_template: "PromptTemplate | None" = None
	    ) -> Dict[str, Any]:
	        """Answer ``question`` using the indexed documents.

	        Args:
	            question: The question to ask.
	            prompt_template: Question-answering prompt to use; when ``None``
	                a template built from ``DEFAULT_QA_PROMPT`` is used.

	        Returns:
	            A dict with the keys ``"question"`` (the input), ``"answer"``
	            (the response text) and ``"sources"`` (the metadata of every
	            source node of the response).
	        """
	        self.build_index()  # Only builds the index if it hasn't been built yet

	        if prompt_template is None:
	            prompt_template = PromptTemplate(self.DEFAULT_QA_PROMPT)

	        # NOTE(review): build_index() stores a "window" entry in node metadata,
	        # but no node postprocessor is configured here, so it looks like that
	        # window is not consumed at query time -- confirm this is intended.
	        query_engine = self.index.as_query_engine(
	            text_qa_template=prompt_template,
	            similarity_top_k=self.SIMILARITY_TOP_K,
	        )
	        response = query_engine.query(question)

	        return {
	            "question": question,
	            "answer": response.response,
	            "sources": [node.metadata for node in response.source_nodes],
	        }