ak3ra committed on
Commit
8121eff
1 Parent(s): 466e2e4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +102 -4
app.py CHANGED
@@ -1,7 +1,105 @@
1
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
 
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ import json
3
+ import os
4
+ from typing import Dict, Any
5
+ from llama_index.core import (
6
+ SimpleDirectoryReader,
7
+ VectorStoreIndex,
8
+ Document,
9
+ Response,
10
+ PromptTemplate
11
+ )
12
+ from llama_index.core.node_parser import SentenceSplitter
13
+ from llama_index.embeddings.openai import OpenAIEmbedding
14
 
15
+ # Make sure to set your OpenAI API key in the Hugging Face Spaces secrets
16
+ import openai
17
+ openai.api_key = os.environ.get('OPENAI_API_KEY')
18
 
19
+
20
+
21
class RAGPipeline:
    """Retrieval-augmented question answering over a metadata file and PDFs.

    Constructing the pipeline immediately loads every entry of the JSON
    metadata file into a ``Document`` (metadata header plus the text of the
    referenced PDF) and builds a ``VectorStoreIndex`` over the resulting
    chunks, so a fully constructed instance is ready for :meth:`query`.
    """

    # Fields read from each entry's ``"metadata"`` mapping; a missing field
    # raises ``KeyError`` while loading.
    _METADATA_FIELDS = ("title", "abstract", "authors", "year", "doi")

    # Default question-answering prompt used when the caller supplies none.
    _DEFAULT_QA_TEMPLATE = (
        "Context information is below.\n"
        "---------------------\n"
        "{context_str}\n"
        "---------------------\n"
        "Given this information, please answer the question: {query_str}\n"
        "Include all relevant information from the provided context. "
        "If information comes from multiple sources, please mention all of them. "
        "If the information is not available in the context, please state that clearly. "
        "When quoting specific information, please use square brackets to indicate the source, e.g. [1], [2], etc."
    )

    def __init__(self, metadata_file: str, pdf_dir: str,
                 use_semantic_splitter: bool = False) -> None:
        """Load the documents and build the index.

        Args:
            metadata_file: Path to a JSON file mapping an item key to a dict
                with a ``"metadata"`` mapping and an optional ``"pdf_path"``.
            pdf_dir: Directory in which the PDF files are looked up.
            use_semantic_splitter: When true, chunk documents with the
                embedding-based semantic splitter instead of the fixed-size
                sentence splitter.
        """
        self.metadata_file = metadata_file
        self.pdf_dir = pdf_dir
        self.index = None
        self.use_semantic_splitter = use_semantic_splitter
        self.metadata: Dict[str, Any] = {}
        self.documents: list = []
        self.load_documents()
        self.build_index()

    def _read_pdf_text(self, pdf_path) -> str:
        """Return the full text of the PDF named by *pdf_path*.

        Only the file name of *pdf_path* is used; the file is looked up inside
        ``self.pdf_dir``. A placeholder sentence is returned when the path is
        empty/missing or the file does not exist, so that the entry is still
        indexed by its metadata.
        """
        if not pdf_path:
            return "PDF path not available in metadata"

        full_pdf_path = os.path.join(self.pdf_dir, os.path.basename(pdf_path))
        if not os.path.exists(full_pdf_path):
            return "PDF file not found"

        # The reader can return several Document objects for a single file
        # (for PDFs typically one per page); join them all instead of keeping
        # only the first one, which silently dropped the rest of the paper.
        pages = SimpleDirectoryReader(input_files=[full_pdf_path]).load_data()
        return "\n".join(page.text for page in pages)

    def load_documents(self) -> None:
        """Read the metadata file and populate ``self.documents``.

        Raises:
            OSError: If the metadata file cannot be opened.
            json.JSONDecodeError: If the metadata file is not valid JSON.
            KeyError: If an entry lacks ``"metadata"`` or one of the fields in
                ``_METADATA_FIELDS``.
        """
        with open(self.metadata_file, "r", encoding="utf-8") as f:
            self.metadata = json.load(f)

        self.documents = []
        for item_key, item_data in self.metadata.items():
            metadata = item_data["metadata"]
            pdf_content = self._read_pdf_text(item_data.get("pdf_path"))

            doc_content = (
                f"Title: {metadata['title']}\n"
                f"Abstract: {metadata['abstract']}\n"
                f"Authors: {metadata['authors']}\n"
                f"Year: {metadata['year']}\n"
                f"DOI: {metadata['doi']}\n"
                f"Full Text: {pdf_content}"
            )

            self.documents.append(Document(
                text=doc_content,
                id_=item_key,
                metadata={
                    field: metadata[field] for field in self._METADATA_FIELDS
                },
            ))

    def build_index(self) -> None:
        """Split the loaded documents into nodes and index them."""
        if self.use_semantic_splitter:
            # Imported here because the module-level imports only bring in
            # SentenceSplitter; without this the semantic branch raised
            # NameError.
            from llama_index.core.node_parser import SemanticSplitterNodeParser

            embed_model = OpenAIEmbedding()
            splitter = SemanticSplitterNodeParser(
                buffer_size=1,
                breakpoint_percentile_threshold=95,
                embed_model=embed_model,
            )
        else:
            splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=20)

        nodes = splitter.get_nodes_from_documents(self.documents)
        self.index = VectorStoreIndex(nodes)

    def query(self, question, prompt_template=None):
        """Answer *question* using the five most similar indexed chunks.

        Args:
            question: The question passed to the query engine.
            prompt_template: Optional ``PromptTemplate`` used as the
                text-QA template; defaults to ``_DEFAULT_QA_TEMPLATE``.

        Returns:
            The response object produced by the query engine.
        """
        if prompt_template is None:
            prompt_template = PromptTemplate(self._DEFAULT_QA_TEMPLATE)

        query_engine = self.index.as_query_engine(
            text_qa_template=prompt_template,
            similarity_top_k=5,
        )
        return query_engine.query(question)