Spaces:

Sunbird
/

acres

Running

App Files Community

ak3ra committed on Sep 6

Commit

d762ede

•

1 Parent(s): 70691a9

Bugfix: pin fastapi to work around an incompatibility with bleeding-edge gradio (see https://github.com/gradio-app/gradio/issues/9275)

Browse files

Files changed (8) hide show

__pycache__/config.cpython-311.pyc +0 -0
app.py +3 -19
rag/__pycache__/__init__.cpython-311.pyc +0 -0
rag/__pycache__/rag_pipeline.cpython-311.pyc +0 -0
rag/rag_pipeline.py +13 -37
requirements.txt +1 -0
utils/__pycache__/__init__.cpython-311.pyc +0 -0
utils/__pycache__/prompts.cpython-311.pyc +0 -0

__pycache__/config.cpython-311.pyc ADDED Viewed

Binary file (455 Bytes). View file

app.py CHANGED Viewed

@@ -5,6 +5,7 @@ from utils.prompts import highlight_prompt, evidence_based_prompt
 from utils.prompts import (
     sample_questions,
 )
 from config import STUDY_FILES
 # Cache for RAG pipelines
@@ -24,9 +25,6 @@ def get_rag_pipeline(study_name):
 def query_rag(study_name: str, question: str, prompt_type: str) -> str:
     rag = get_rag_pipeline(study_name)
-    # Extract study information using RAG
-    study_info = rag.extract_study_info()
     if prompt_type == "Highlight":
         prompt = highlight_prompt
     elif prompt_type == "Evidence-based":
@@ -34,17 +32,8 @@ def query_rag(study_name: str, question: str, prompt_type: str) -> str:
     else:
         prompt = None
-    # Prepare the context with study info
-    context = "Study Information:\n"
-    for key, value in study_info.items():
-        context += f"{key}: {value}\n"
-    context += "\n"
-    # Add the question to the context
-    context += f"Question: {question}\n"
     # Use the prepared context in the query
-    response = rag.query(context, prompt_template=prompt)
     # Format the response as Markdown
     formatted_response = f"## Question\n\n{question}\n\n## Answer\n\n{response['answer']}\n\n## Sources\n\n"
@@ -53,11 +42,6 @@ def query_rag(study_name: str, question: str, prompt_type: str) -> str:
             f"- {source['title']} ({source.get('year', 'Year not specified')})\n"
         )
-    # Add extracted study information to the response
-    formatted_response += "\n## Extracted Study Information\n\n"
-    for key, value in study_info.items():
-        formatted_response += f"- **{key.replace('_', ' ').title()}**: {value}\n"
     return formatted_response
@@ -122,4 +106,4 @@ with gr.Blocks() as demo:
     )
 if __name__ == "__main__":
-    demo.launch(share=True)

 from utils.prompts import (
     sample_questions,
 )
 from config import STUDY_FILES
 # Cache for RAG pipelines
 def query_rag(study_name: str, question: str, prompt_type: str) -> str:
     rag = get_rag_pipeline(study_name)
     if prompt_type == "Highlight":
         prompt = highlight_prompt
     elif prompt_type == "Evidence-based":
     else:
         prompt = None
     # Use the prepared context in the query
+    response = rag.query(question, prompt_template=prompt)
     # Format the response as Markdown
     formatted_response = f"## Question\n\n{question}\n\n## Answer\n\n{response['answer']}\n\n## Sources\n\n"
             f"- {source['title']} ({source.get('year', 'Year not specified')})\n"
         )
     return formatted_response
     )
 if __name__ == "__main__":
+    demo.launch(share=True, debug=True)

rag/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (159 Bytes). View file

rag/__pycache__/rag_pipeline.cpython-311.pyc ADDED Viewed

Binary file (5.58 kB). View file

rag/rag_pipeline.py CHANGED Viewed

@@ -4,6 +4,8 @@ from llama_index.core import Document, VectorStoreIndex
 from llama_index.core.node_parser import SentenceWindowNodeParser, SentenceSplitter
 from llama_index.core import PromptTemplate
 from typing import List
 class RAGPipeline:
@@ -43,53 +45,22 @@ class RAGPipeline:
     def build_index(self):
         if self.index is None:
-            sentence_splitter = SentenceSplitter(chunk_size=128, chunk_overlap=13)
             def _split(text: str) -> List[str]:
                 return sentence_splitter.split_text(text)
             node_parser = SentenceWindowNodeParser.from_defaults(
                 sentence_splitter=_split,
-                window_size=3,
                 window_metadata_key="window",
                 original_text_metadata_key="original_text",
             )
             nodes = node_parser.get_nodes_from_documents(self.documents)
-            self.index = VectorStoreIndex(nodes)
-    def extract_study_info(self) -> Dict[str, Any]:
-        extraction_prompt = PromptTemplate(
-            "Based on the given context, please extract the following information about the study:\n"
-            "1. Study ID\n"
-            "2. Author(s)\n"
-            "3. Year\n"
-            "4. Title\n"
-            "5. Study design\n"
-            "6. Study area/region\n"
-            "7. Study population\n"
-            "8. Disease under study\n"
-            "9. Duration of study\n"
-            "If the information is not available, please respond with 'Not found' for that field.\n"
-            "Context: {context_str}\n"
-            "Extracted information:"
-        )
-        query_engine = self.index.as_query_engine(
-            text_qa_template=extraction_prompt, similarity_top_k=5
-        )
-        response = query_engine.query("Extract study information")
-        # Parse the response to extract key-value pairs
-        lines = response.response.split("\n")
-        extracted_info = {}
-        for line in lines:
-            if ":" in line:
-                key, value = line.split(":", 1)
-                extracted_info[key.strip().lower().replace(" ", "_")] = value.strip()
-        return extracted_info
     def query(
         self, context: str, prompt_template: PromptTemplate = None
@@ -107,8 +78,13 @@ class RAGPipeline:
                 "When quoting specific information, please use square brackets to indicate the source, e.g. [1], [2], etc."
             )
         query_engine = self.index.as_query_engine(
-            text_qa_template=prompt_template, similarity_top_k=5
         )
         response = query_engine.query(context)

 from llama_index.core.node_parser import SentenceWindowNodeParser, SentenceSplitter
 from llama_index.core import PromptTemplate
 from typing import List
+from llama_index.embeddings.openai import OpenAIEmbedding
+from llama_index.llms.openai import OpenAI
 class RAGPipeline:
     def build_index(self):
         if self.index is None:
+            sentence_splitter = SentenceSplitter(chunk_size=2048, chunk_overlap=20)
             def _split(text: str) -> List[str]:
                 return sentence_splitter.split_text(text)
             node_parser = SentenceWindowNodeParser.from_defaults(
                 sentence_splitter=_split,
+                window_size=5,
                 window_metadata_key="window",
                 original_text_metadata_key="original_text",
             )
             nodes = node_parser.get_nodes_from_documents(self.documents)
+            self.index = VectorStoreIndex(
+                nodes, embed_model=OpenAIEmbedding(model_name="text-embedding-3-large")
+            )
     def query(
         self, context: str, prompt_template: PromptTemplate = None
                 "When quoting specific information, please use square brackets to indicate the source, e.g. [1], [2], etc."
             )
+        # HACK: set top-k to the docstore size so every stored entry is retrieved for the query
+        n_documents = len(self.index.docstore.docs)
         query_engine = self.index.as_query_engine(
+            text_qa_template=prompt_template,
+            similarity_top_k=n_documents,
+            response_mode="tree_summarize",
+            llm=OpenAI(model="gpt-4o-mini"),
         )
         response = query_engine.query(context)

requirements.txt CHANGED Viewed

@@ -1,3 +1,4 @@
 gradio
 llama-index
 openai

+fastapi==0.112.2
 gradio
 llama-index
 openai

utils/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (161 Bytes). View file

utils/__pycache__/prompts.cpython-311.pyc ADDED Viewed

Binary file (5.68 kB). View file