LVKinyanjui committed on
Commit
38f846a
•
1 Parent(s): cd41c7b

Got dependency errors when working with chromadb and numpy

Browse files
app.py CHANGED
@@ -1,22 +1,33 @@
1
  import streamlit as st
2
  import pymupdf
3
- from io import StringIO
 
 
 
 
 
4
 
5
  st.write("## Local RAG \n Get Insights from your documents")
6
 
7
  file = st.file_uploader("Upload your Document Here", type=['pdf'])
8
 
9
  if file is not None:
10
- # doc = pymupdf.open(file)
11
- # texts = [page.get_text().encode("utf-8") for page in doc]
12
- # texts
13
-
14
- # To read file as bytes:
15
  bytes_data = file.getvalue()
16
  with open("data/uploaded_file.pdf", "wb") as fp:
17
  fp.write(bytes_data)
18
  doc = pymupdf.open(fp)
19
 
20
  texts = [page.get_text().encode("utf-8") for page in doc]
21
- texts
 
 
 
 
 
 
 
 
 
22
 
 
1
  import streamlit as st
2
  import pymupdf
3
+ import chromadb
4
+ from uuid import uuid4
5
+
6
+ @st.cache_resource
7
+ def initdb():
8
+ return chromadb.Client()
9
 
10
  st.write("## Local RAG \n Get Insights from your documents")
11
 
12
  file = st.file_uploader("Upload your Document Here", type=['pdf'])
13
 
14
  if file is not None:
15
+ # Read file as bytes and save it.
16
+ # PyMuPDF open can only load from file path
 
 
 
17
  bytes_data = file.getvalue()
18
  with open("data/uploaded_file.pdf", "wb") as fp:
19
  fp.write(bytes_data)
20
  doc = pymupdf.open(fp)
21
 
22
  texts = [page.get_text().encode("utf-8") for page in doc]
23
+
24
+ # VECTOR STORE
25
+ chroma_client = initdb()
26
+ collection = chroma_client.create_collection(name="rag_collection")
27
+
28
+ text_ids = [str(uuid4()) for text in texts]
29
+ collection.add(documents=texts, ids=text_ids)
30
+ st.write("Succesfully uploaded document to database.")
31
+
32
+
33
 
examples/chromadb_.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import chromadb
2
+ from uuid import uuid4
3
+
4
+ # @st.cache_resource
5
+ chroma_client = chromadb.Client()
6
+ collection = chroma_client.create_collection(name="example_collection")
7
+
8
+ # Data
9
+ documents = [
10
+ "This is a document about pineapple",
11
+ "This is a document about oranges"
12
+ ]
13
+ ids = [str(uuid4()) for doc in documents]
14
+
15
+ # Upserting
16
+ collection.add(
17
+ documents= documents,
18
+ ids=ids
19
+ )
20
+
21
+ # Querying
22
+ results = collection.query(
23
+ query_texts=["This is a query document about hawaii"], # Chroma will embed this for you
24
+ n_results=2 # how many results to return
25
+ )
26
+ print(results)
27
+
28
+ print("Done")
{data → examples/data}/State Machines.pdf RENAMED
File without changes
document_loader.py → examples/pymupdf_loader.py RENAMED
File without changes
requirements.txt CHANGED
@@ -1,50 +1,3 @@
1
- altair==5.4.1
2
- anyio==4.4.0
3
- attrs==24.2.0
4
- blinker==1.8.2
5
- cachetools==5.5.0
6
- certifi==2024.7.4
7
- charset-normalizer==3.3.2
8
- click==8.1.7
9
- exceptiongroup==1.2.2
10
- gitdb==4.0.11
11
- GitPython==3.1.43
12
- h11==0.14.0
13
- httpcore==1.0.5
14
- httpx==0.27.2
15
- idna==3.8
16
- Jinja2==3.1.4
17
- jsonschema==4.23.0
18
- jsonschema-specifications==2023.12.1
19
- markdown-it-py==3.0.0
20
- MarkupSafe==2.1.5
21
- mdurl==0.1.2
22
- narwhals==1.5.5
23
- numpy==2.1.0
24
- ollama==0.3.2
25
- packaging==24.1
26
- pandas==2.2.2
27
- pillow==10.4.0
28
- protobuf==5.27.4
29
- pyarrow==17.0.0
30
- pydeck==0.9.1
31
- Pygments==2.18.0
32
- PyMuPDF==1.24.9
33
- PyMuPDFb==1.24.9
34
- python-dateutil==2.9.0.post0
35
- pytz==2024.1
36
- referencing==0.35.1
37
- requests==2.32.3
38
- rich==13.8.0
39
- rpds-py==0.20.0
40
- six==1.16.0
41
- smmap==5.0.1
42
- sniffio==1.3.1
43
  streamlit==1.38.0
44
- tenacity==8.5.0
45
- toml==0.10.2
46
- tornado==6.4.1
47
- typing_extensions==4.12.2
48
- tzdata==2024.1
49
- urllib3==2.2.2
50
- watchdog==4.0.2
 
1
+ chromadb==0.5.5
2
+ pymupdf==1.24.9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  streamlit==1.38.0