Spaces:
Sleeping
Sleeping
LVKinyanjui
committed on
Commit
•
38f846a
1
Parent(s):
cd41c7b
Got dependency errors when working with chromadb and numpy
Browse files- app.py +18 -7
- examples/chromadb_.py +28 -0
- {data → examples/data}/State Machines.pdf +0 -0
- document_loader.py → examples/pymupdf_loader.py +0 -0
- requirements.txt +2 -49
app.py
CHANGED
@@ -1,22 +1,33 @@
|
|
1 |
import streamlit as st
|
2 |
import pymupdf
|
3 |
-
|
|
|
|
|
|
|
|
|
|
|
4 |
|
5 |
st.write("## Local RAG \n Get Insights from your documents")
|
6 |
|
7 |
file = st.file_uploader("Upload your Document Here", type=['pdf'])
|
8 |
|
9 |
if file is not None:
|
10 |
-
#
|
11 |
-
#
|
12 |
-
# texts
|
13 |
-
|
14 |
-
# To read file as bytes:
|
15 |
bytes_data = file.getvalue()
|
16 |
with open("data/uploaded_file.pdf", "wb") as fp:
|
17 |
fp.write(bytes_data)
|
18 |
doc = pymupdf.open(fp)
|
19 |
|
20 |
texts = [page.get_text().encode("utf-8") for page in doc]
|
21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
|
|
1 |
import streamlit as st
|
2 |
import pymupdf
|
3 |
+
import chromadb
|
4 |
+
from uuid import uuid4
|
5 |
+
|
6 |
+
@st.cache_resource
|
7 |
+
def initdb():
|
8 |
+
return chromadb.Client()
|
9 |
|
10 |
st.write("## Local RAG \n Get Insights from your documents")
|
11 |
|
12 |
file = st.file_uploader("Upload your Document Here", type=['pdf'])
|
13 |
|
14 |
if file is not None:
|
15 |
+
# Read file as bytes and save it.
|
16 |
+
# PyMuPDF open can only load from file path
|
|
|
|
|
|
|
17 |
bytes_data = file.getvalue()
|
18 |
with open("data/uploaded_file.pdf", "wb") as fp:
|
19 |
fp.write(bytes_data)
|
20 |
doc = pymupdf.open(fp)
|
21 |
|
22 |
texts = [page.get_text().encode("utf-8") for page in doc]
|
23 |
+
|
24 |
+
# VECTOR STORE
|
25 |
+
chroma_client = initdb()
|
26 |
+
collection = chroma_client.create_collection(name="rag_collection")
|
27 |
+
|
28 |
+
text_ids = [str(uuid4()) for text in texts]
|
29 |
+
collection.add(documents=texts, ids=text_ids)
|
30 |
+
st.write("Succesfully uploaded document to database.")
|
31 |
+
|
32 |
+
|
33 |
|
examples/chromadb_.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import chromadb
|
2 |
+
from uuid import uuid4
|
3 |
+
|
4 |
+
# @st.cache_resource
|
5 |
+
chroma_client = chromadb.Client()
|
6 |
+
collection = chroma_client.create_collection(name="example_collection")
|
7 |
+
|
8 |
+
# Data
|
9 |
+
documents = [
|
10 |
+
"This is a document about pineapple",
|
11 |
+
"This is a document about oranges"
|
12 |
+
]
|
13 |
+
ids = [str(uuid4()) for doc in documents]
|
14 |
+
|
15 |
+
# Upserting
|
16 |
+
collection.add(
|
17 |
+
documents= documents,
|
18 |
+
ids=ids
|
19 |
+
)
|
20 |
+
|
21 |
+
# Querying
|
22 |
+
results = collection.query(
|
23 |
+
query_texts=["This is a query document about hawaii"], # Chroma will embed this for you
|
24 |
+
n_results=2 # how many results to return
|
25 |
+
)
|
26 |
+
print(results)
|
27 |
+
|
28 |
+
print("Done")
|
{data → examples/data}/State Machines.pdf
RENAMED
File without changes
|
document_loader.py → examples/pymupdf_loader.py
RENAMED
File without changes
|
requirements.txt
CHANGED
@@ -1,50 +1,3 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
attrs==24.2.0
|
4 |
-
blinker==1.8.2
|
5 |
-
cachetools==5.5.0
|
6 |
-
certifi==2024.7.4
|
7 |
-
charset-normalizer==3.3.2
|
8 |
-
click==8.1.7
|
9 |
-
exceptiongroup==1.2.2
|
10 |
-
gitdb==4.0.11
|
11 |
-
GitPython==3.1.43
|
12 |
-
h11==0.14.0
|
13 |
-
httpcore==1.0.5
|
14 |
-
httpx==0.27.2
|
15 |
-
idna==3.8
|
16 |
-
Jinja2==3.1.4
|
17 |
-
jsonschema==4.23.0
|
18 |
-
jsonschema-specifications==2023.12.1
|
19 |
-
markdown-it-py==3.0.0
|
20 |
-
MarkupSafe==2.1.5
|
21 |
-
mdurl==0.1.2
|
22 |
-
narwhals==1.5.5
|
23 |
-
numpy==2.1.0
|
24 |
-
ollama==0.3.2
|
25 |
-
packaging==24.1
|
26 |
-
pandas==2.2.2
|
27 |
-
pillow==10.4.0
|
28 |
-
protobuf==5.27.4
|
29 |
-
pyarrow==17.0.0
|
30 |
-
pydeck==0.9.1
|
31 |
-
Pygments==2.18.0
|
32 |
-
PyMuPDF==1.24.9
|
33 |
-
PyMuPDFb==1.24.9
|
34 |
-
python-dateutil==2.9.0.post0
|
35 |
-
pytz==2024.1
|
36 |
-
referencing==0.35.1
|
37 |
-
requests==2.32.3
|
38 |
-
rich==13.8.0
|
39 |
-
rpds-py==0.20.0
|
40 |
-
six==1.16.0
|
41 |
-
smmap==5.0.1
|
42 |
-
sniffio==1.3.1
|
43 |
streamlit==1.38.0
|
44 |
-
tenacity==8.5.0
|
45 |
-
toml==0.10.2
|
46 |
-
tornado==6.4.1
|
47 |
-
typing_extensions==4.12.2
|
48 |
-
tzdata==2024.1
|
49 |
-
urllib3==2.2.2
|
50 |
-
watchdog==4.0.2
|
|
|
1 |
+
chromadb==0.5.5
|
2 |
+
pymupdf==1.24.9
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
streamlit==1.38.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|