File size: 3,752 Bytes
b87527c
8121eff
 
 
 
 
 
 
 
 
 
 
 
b87527c
8121eff
 
 
b87527c
8121eff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import json
import os
from typing import Dict, Any

import gradio as gr
import openai
from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    Document,
    Response,
    PromptTemplate
)
from llama_index.core.node_parser import SemanticSplitterNodeParser, SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding

# Make sure to set your OpenAI API key in the Hugging Face Spaces secrets
openai.api_key = os.environ.get('OPENAI_API_KEY')



class RAGPipeline:
    """Retrieval-augmented generation pipeline over a JSON metadata file plus PDFs.

    On construction it loads every record from ``metadata_file``, pulls the
    full text of each record's PDF from ``pdf_dir`` (when available), builds
    one ``Document`` per record, and indexes them into a ``VectorStoreIndex``.
    ``query()`` then answers natural-language questions against that index.
    """

    def __init__(self, metadata_file, pdf_dir, use_semantic_splitter=False):
        """Load documents and build the vector index eagerly.

        Args:
            metadata_file: Path to a JSON file mapping item keys to records
                of the shape ``{"metadata": {...}, "pdf_path": ...}``.
            pdf_dir: Directory containing the PDF files; only the basename of
                each record's ``pdf_path`` is looked up here.
            use_semantic_splitter: When True, chunk with the embedding-based
                ``SemanticSplitterNodeParser`` instead of ``SentenceSplitter``.
        """
        self.metadata_file = metadata_file
        self.pdf_dir = pdf_dir
        self.index = None
        self.use_semantic_splitter = use_semantic_splitter
        self.load_documents()
        self.build_index()

    def _read_pdf_text(self, pdf_path):
        """Return the extracted text for *pdf_path*, or a placeholder string.

        Placeholders (rather than exceptions) keep one bad record from
        aborting ingestion of the whole corpus.
        """
        if not pdf_path:
            return "PDF path not available in metadata"
        full_pdf_path = os.path.join(self.pdf_dir, os.path.basename(pdf_path))
        if not os.path.exists(full_pdf_path):
            return "PDF file not found"
        docs = SimpleDirectoryReader(input_files=[full_pdf_path]).load_data()
        # Guard against an empty extraction result: the original code indexed
        # [0] unconditionally and raised IndexError on unparsable PDFs.
        return docs[0].text if docs else "PDF file could not be parsed"

    def load_documents(self):
        """Parse the metadata JSON and build ``self.documents``.

        Each document's text concatenates the bibliographic fields with the
        PDF full text; the same fields are also attached as node metadata so
        the query engine can cite sources.
        """
        # Explicit encoding: JSON is UTF-8 by spec; don't depend on the
        # platform default.
        with open(self.metadata_file, 'r', encoding='utf-8') as f:
            self.metadata = json.load(f)

        self.documents = []
        for item_key, item_data in self.metadata.items():
            metadata = item_data['metadata']
            pdf_content = self._read_pdf_text(item_data.get('pdf_path'))

            # .get() with defaults so a record missing one bibliographic
            # field doesn't raise KeyError and kill the whole load.
            title = metadata.get('title', '')
            abstract = metadata.get('abstract', '')
            authors = metadata.get('authors', '')
            year = metadata.get('year', '')
            doi = metadata.get('doi', '')

            doc_content = (
                f"Title: {title}\n"
                f"Abstract: {abstract}\n"
                f"Authors: {authors}\n"
                f"Year: {year}\n"
                f"DOI: {doi}\n"
                f"Full Text: {pdf_content}"
            )

            self.documents.append(Document(
                text=doc_content,
                id_=item_key,
                metadata={
                    "title": title,
                    "abstract": abstract,
                    "authors": authors,
                    "year": year,
                    "doi": doi
                }
            ))

    def build_index(self):
        """Chunk the loaded documents and build the vector index.

        Chunking strategy is chosen by ``self.use_semantic_splitter``; the
        semantic splitter needs an embedding model to find breakpoints.
        """
        if self.use_semantic_splitter:
            embed_model = OpenAIEmbedding()
            splitter = SemanticSplitterNodeParser(
                buffer_size=1,
                breakpoint_percentile_threshold=95,
                embed_model=embed_model
            )
        else:
            splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=20)

        nodes = splitter.get_nodes_from_documents(self.documents)
        self.index = VectorStoreIndex(nodes)

    def query(self, question, prompt_template=None):
        """Answer *question* against the index.

        Args:
            question: Natural-language query string.
            prompt_template: Optional ``PromptTemplate`` overriding the
                default citation-aware QA prompt.

        Returns:
            The query engine's ``Response`` object.
        """
        if prompt_template is None:
            prompt_template = PromptTemplate(
                "Context information is below.\n"
                "---------------------\n"
                "{context_str}\n"
                "---------------------\n"
                "Given this information, please answer the question: {query_str}\n"
                "Include all relevant information from the provided context. "
                "If information comes from multiple sources, please mention all of them. "
                "If the information is not available in the context, please state that clearly. "
                "When quoting specific information, please use square brackets to indicate the source, e.g. [1], [2], etc."
            )

        # top_k=5: retrieve a handful of chunks so multi-source answers are
        # possible without flooding the context window.
        query_engine = self.index.as_query_engine(
            text_qa_template=prompt_template,
            similarity_top_k=5
        )
        response = query_engine.query(question)

        return response