Spaces:

LVKinyanjui
/

QueryYourDocs

Sleeping

LVKinyanjui committed on Aug 30

Commit

38f846a

•

1 Parent(s): cd41c7b

Got dependency errors when working with chromadb and numpy

Files changed (5) hide show

app.py CHANGED Viewed

@@ -1,22 +1,33 @@
 import streamlit as st
 import pymupdf
-from io import StringIO
 st.write("## Local RAG \n Get Insights from your documents")
 file = st.file_uploader("Upload your Document Here", type=['pdf'])
 if file is not None:
-    # doc = pymupdf.open(file)
-    # texts = [page.get_text().encode("utf-8") for page in doc]
-    # texts
-    # To read file as bytes:
     bytes_data = file.getvalue()
     with open("data/uploaded_file.pdf", "wb") as fp:
         fp.write(bytes_data)
         doc = pymupdf.open(fp)
     texts = [page.get_text().encode("utf-8") for page in doc]
-    texts

 import streamlit as st
 import pymupdf
+import chromadb
+from uuid import uuid4
+@st.cache_resource
+def initdb():
+    return chromadb.Client()
 st.write("## Local RAG \n Get Insights from your documents")
 file = st.file_uploader("Upload your Document Here", type=['pdf'])
 if file is not None:
+    # Read file as bytes and save it.
+    # PyMuPDF open can only load from file path
     bytes_data = file.getvalue()
     with open("data/uploaded_file.pdf", "wb") as fp:
         fp.write(bytes_data)
         doc = pymupdf.open(fp)
     texts = [page.get_text().encode("utf-8") for page in doc]
+    # VECTOR STORE
+    chroma_client = initdb()
+    collection = chroma_client.create_collection(name="rag_collection")
+    text_ids = [str(uuid4()) for text in texts]
+    collection.add(documents=texts, ids=text_ids)
+    st.write("Succesfully uploaded document to database.")

examples/chromadb_.py ADDED Viewed

+import chromadb
+from uuid import uuid4
+# @st.cache_resource
+chroma_client = chromadb.Client()
+collection = chroma_client.create_collection(name="example_collection")
+# Data
+documents = [
+        "This is a document about pineapple",
+        "This is a document about oranges"
+    ]
+ids = [str(uuid4()) for doc in documents]
+# Upserting
+collection.add(
+    documents= documents,
+    ids=ids
+)
+# Querying
+results = collection.query(
+    query_texts=["This is a query document about hawaii"], # Chroma will embed this for you
+    n_results=2 # how many results to return
+)
+print(results)
+print("Done")

{data → examples/data}/State Machines.pdf RENAMED Viewed

File without changes

document_loader.py → examples/pymupdf_loader.py RENAMED Viewed

File without changes

requirements.txt CHANGED Viewed

@@ -1,50 +1,3 @@
-altair==5.4.1
-anyio==4.4.0
-attrs==24.2.0
-blinker==1.8.2
-cachetools==5.5.0
-certifi==2024.7.4
-charset-normalizer==3.3.2
-click==8.1.7
-exceptiongroup==1.2.2
-gitdb==4.0.11
-GitPython==3.1.43
-h11==0.14.0
-httpcore==1.0.5
-httpx==0.27.2
-idna==3.8
-Jinja2==3.1.4
-jsonschema==4.23.0
-jsonschema-specifications==2023.12.1
-markdown-it-py==3.0.0
-MarkupSafe==2.1.5
-mdurl==0.1.2
-narwhals==1.5.5
-numpy==2.1.0
-ollama==0.3.2
-packaging==24.1
-pandas==2.2.2
-pillow==10.4.0
-protobuf==5.27.4
-pyarrow==17.0.0
-pydeck==0.9.1
-Pygments==2.18.0
-PyMuPDF==1.24.9
-PyMuPDFb==1.24.9
-python-dateutil==2.9.0.post0
-pytz==2024.1
-referencing==0.35.1
-requests==2.32.3
-rich==13.8.0
-rpds-py==0.20.0
-six==1.16.0
-smmap==5.0.1
-sniffio==1.3.1
 streamlit==1.38.0
-tenacity==8.5.0
-toml==0.10.2
-tornado==6.4.1
-typing_extensions==4.12.2
-tzdata==2024.1
-urllib3==2.2.2
-watchdog==4.0.2

+chromadb==0.5.5
+pymupdf==1.24.9
 streamlit==1.38.0