Spaces:
Running
Running
File size: 4,763 Bytes
8121eff bc5a5b2 8121eff bc5a5b2 8121eff bc5a5b2 8121eff bc5a5b2 8121eff bc5a5b2 8121eff bc5a5b2 8121eff bc5a5b2 8121eff bc5a5b2 8121eff bc5a5b2 8121eff bc5a5b2 8121eff bc5a5b2 8121eff bc5a5b2 8121eff |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
import json
import os
from typing import Dict, Any
from llama_index.core import (
SimpleDirectoryReader,
VectorStoreIndex,
Document,
StorageContext,
load_index_from_storage,
)
from llama_index.core.node_parser import SentenceSplitter, SemanticSplitterNodeParser
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import PromptTemplate
class RAGPipeline:
    """Question answering over a JSON metadata file plus a directory of PDFs.

    On construction the pipeline loads every entry of ``metadata_file``,
    attaches the text of the matching PDF from ``pdf_dir`` (when present),
    splits the resulting documents into nodes and builds an in-memory
    ``VectorStoreIndex`` that ``query`` searches.

    The metadata file is read as a JSON object mapping an item key to a
    dict with a required ``"metadata"`` dict (keys ``title``, ``abstract``,
    ``authors``, ``year``, ``doi``) and an optional ``"pdf_path"``.
    """

    # Shared opening of every QA prompt: the retrieved context followed by
    # the user's question.
    _PROMPT_PREAMBLE = (
        "Context information is below.\n"
        "---------------------\n"
        "{context_str}\n"
        "---------------------\n"
        "Given this information, please answer the question: {query_str}\n"
    )

    # Prompt bodies keyed by ``prompt_type``; any other value uses "default".
    _PROMPT_TEXTS = {
        "highlight": (
            _PROMPT_PREAMBLE
            + "Include all relevant information from the provided context. "
            "Highlight key information by enclosing it in **asterisks**. "
            "When quoting specific information, please use square brackets to indicate the source, e.g. [1], [2], etc."
        ),
        "evidence_based": (
            _PROMPT_PREAMBLE
            + "Provide an answer to the question using evidence from the context above. "
            "Cite sources using square brackets."
        ),
        "default": (
            _PROMPT_PREAMBLE
            + "Include all relevant information from the provided context. "
            "If information comes from multiple sources, please mention all of them. "
            "If the information is not available in the context, please state that clearly. "
            "When quoting specific information, please use square brackets to indicate the source, e.g. [1], [2], etc."
        ),
    }

    def __init__(
        self, metadata_file: str, pdf_dir: str, use_semantic_splitter: bool = False
    ):
        """Load the documents and build the index immediately.

        Args:
            metadata_file: Path to the JSON file describing the items.
            pdf_dir: Directory that is searched for each item's PDF.
            use_semantic_splitter: If true, chunk with
                ``SemanticSplitterNodeParser`` (backed by ``OpenAIEmbedding``)
                instead of the fixed-size ``SentenceSplitter``.
        """
        self.metadata_file = metadata_file
        self.pdf_dir = pdf_dir
        self.use_semantic_splitter = use_semantic_splitter
        self.index = None
        self.load_documents()
        self.build_index()

    def load_documents(self):
        """Read the metadata file and populate ``self.documents``.

        One ``Document`` is created per metadata entry. Its text combines the
        bibliographic fields with the PDF text (or a placeholder message when
        the PDF is unavailable); its id is the entry's key.

        Raises:
            KeyError: If an entry lacks ``"metadata"`` or one of the
                bibliographic fields used to build the text.
        """
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(self.metadata_file, "r", encoding="utf-8") as f:
            self.metadata = json.load(f)

        self.documents = []
        for item_key, item_data in self.metadata.items():
            metadata = item_data["metadata"]
            pdf_content = self._load_pdf_content(item_data.get("pdf_path"))
            self.documents.append(
                Document(
                    text=self._format_document_text(metadata, pdf_content),
                    id_=item_key,
                    metadata=metadata,
                )
            )

    def _load_pdf_content(self, pdf_path) -> str:
        """Return the PDF's text, or a placeholder message if it is unavailable."""
        if not pdf_path:
            return "PDF path not available in metadata"

        # Only the file name from the metadata is used; the PDF is always
        # looked up inside ``self.pdf_dir``.
        full_pdf_path = os.path.join(self.pdf_dir, os.path.basename(pdf_path))
        if not os.path.exists(full_pdf_path):
            return "PDF file not found"

        pages = SimpleDirectoryReader(input_files=[full_pdf_path]).load_data()
        return self._join_page_texts(pages)

    @staticmethod
    def _join_page_texts(pages) -> str:
        """Concatenate the ``text`` of every loaded page/document.

        The reader may return several documents for one file (the default PDF
        reader yields one per page), so taking only the first element would
        drop everything after page one. An empty result yields ``""``.
        """
        return "\n".join(page.text for page in pages)

    @staticmethod
    def _format_document_text(metadata: Dict[str, Any], pdf_content: str) -> str:
        """Build the indexed text from the bibliographic fields and PDF text."""
        return (
            f"Title: {metadata['title']}\n"
            f"Abstract: {metadata['abstract']}\n"
            f"Authors: {metadata['authors']}\n"
            f"Year: {metadata['year']}\n"
            f"DOI: {metadata['doi']}\n"
            f"Full Text: {pdf_content}"
        )

    def build_index(self):
        """Split ``self.documents`` into nodes and build ``self.index``."""
        if self.use_semantic_splitter:
            embed_model = OpenAIEmbedding()
            splitter = SemanticSplitterNodeParser(
                buffer_size=1,
                breakpoint_percentile_threshold=95,
                embed_model=embed_model,
            )
        else:
            splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=20)

        nodes = splitter.get_nodes_from_documents(self.documents)
        self.index = VectorStoreIndex(nodes)

    def query(self, question: str, prompt_type: str = "default") -> Any:
        """Answer ``question`` using the five most similar indexed nodes.

        Args:
            question: The question passed to the query engine.
            prompt_type: ``"highlight"``, ``"evidence_based"`` or anything
                else for the default prompt.

        Returns:
            The object returned by the query engine's ``query`` call,
            unchanged (it is not converted to a plain dict).
        """
        prompt_template = self._get_prompt_template(prompt_type)
        query_engine = self.index.as_query_engine(
            text_qa_template=prompt_template, similarity_top_k=5
        )
        return query_engine.query(question)

    def _get_prompt_template(self, prompt_type: str) -> "PromptTemplate":
        """Return the QA prompt for ``prompt_type`` (default prompt if unknown)."""
        prompt_text = self._PROMPT_TEXTS.get(
            prompt_type, self._PROMPT_TEXTS["default"]
        )
        return PromptTemplate(prompt_text)
|