Spaces:
Sleeping
Sleeping
LVKinyanjui
committed on
Commit
•
cd41c7b
1
Parent(s):
0b5e429
Spun up app and added pdf uploading functionality with pymupdf
Browse files- Dockerfile +26 -19
- app.py +22 -0
- compose.yaml +2 -0
- data/State Machines.pdf +0 -0
- data/uploaded_file.pdf +0 -0
- document_loader.py +9 -0
- requirements.txt +50 -0
- uploaded_file.pdf +0 -0
Dockerfile
CHANGED
@@ -18,28 +18,35 @@ ENV PYTHONUNBUFFERED=1
|
|
18 |
|
19 |
WORKDIR /app
|
20 |
|
21 |
-
# Create a non-privileged user that the app will run under.
|
22 |
-
# See https://docs.docker.com/go/dockerfile-user-best-practices/
|
23 |
-
ARG UID=10001
|
24 |
-
RUN adduser \
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
# Download dependencies as a separate step to take advantage of Docker's caching.
|
34 |
-
# Leverage a cache mount to /root/.cache/pip to speed up subsequent builds.
|
35 |
-
# Leverage a bind mount to requirements.txt to avoid having to copy them into
|
36 |
-
# this layer.
|
|
|
37 |
RUN --mount=type=cache,target=/root/.cache/pip \
|
38 |
--mount=type=bind,source=requirements.txt,target=requirements.txt \
|
39 |
python -m pip install -r requirements.txt
|
40 |
|
41 |
-
#
|
42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
|
44 |
# Copy the source code into the container.
|
45 |
COPY . .
|
@@ -48,4 +55,4 @@ COPY . .
|
|
48 |
EXPOSE 8000
|
49 |
|
50 |
# Run the application.
|
51 |
-
CMD streamlit run app.py --server.port
|
|
|
18 |
|
19 |
WORKDIR /app
|
20 |
|
21 |
+
# # Create a non-privileged user that the app will run under.
|
22 |
+
# # See https://docs.docker.com/go/dockerfile-user-best-practices/
|
23 |
+
# ARG UID=10001
|
24 |
+
# RUN adduser \
|
25 |
+
# --disabled-password \
|
26 |
+
# --gecos "" \
|
27 |
+
# --home "/nonexistent" \
|
28 |
+
# --shell "/sbin/nologin" \
|
29 |
+
# --no-create-home \
|
30 |
+
# --uid "${UID}" \
|
31 |
+
# appuser
|
32 |
+
|
33 |
+
# # Download dependencies as a separate step to take advantage of Docker's caching.
|
34 |
+
# # Leverage a cache mount to /root/.cache/pip to speed up subsequent builds.
|
35 |
+
# # Leverage a bind mount to requirements.txt to avoid having to copy them into
|
36 |
+
# # this layer.
|
37 |
+
|
38 |
RUN --mount=type=cache,target=/root/.cache/pip \
|
39 |
--mount=type=bind,source=requirements.txt,target=requirements.txt \
|
40 |
python -m pip install -r requirements.txt
|
41 |
|
42 |
+
# COPY requirements.txt .
|
43 |
+
# RUN python -m pip install --no-cache-dir -r requirements.txt
|
44 |
+
|
45 |
+
# Install ollama
|
46 |
+
RUN curl -fsSL https://ollama.com/install.sh | sh
|
47 |
+
|
48 |
+
# # Switch to the non-privileged user to run the application.
|
49 |
+
# USER appuser
|
50 |
|
51 |
# Copy the source code into the container.
|
52 |
COPY . .
|
|
|
55 |
EXPOSE 8000
|
56 |
|
57 |
# Run the application.
|
58 |
+
# Exec form starts Streamlit directly instead of wrapping it in `/bin/sh -c`,
# so the process receives the stop signal sent by `docker stop`.
CMD ["streamlit", "run", "app.py", "--server.port", "8000"]
|
app.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Streamlit front end: upload a PDF, keep a copy on disk, and show the
# extracted text of every page.
from io import StringIO
from pathlib import Path

import pymupdf
import streamlit as st

# The most recent upload is persisted here; each new upload overwrites it.
UPLOAD_PATH = Path("data/uploaded_file.pdf")

st.write("## Local RAG \n Get Insights from your documents")

file = st.file_uploader("Upload your Document Here", type=['pdf'])

if file is not None:
    # Raw bytes of the uploaded PDF.
    bytes_data = file.getvalue()

    # Keep a copy on disk. Create the folder first so the write does not
    # fail when the data/ directory is missing.
    UPLOAD_PATH.parent.mkdir(parents=True, exist_ok=True)
    UPLOAD_PATH.write_bytes(bytes_data)

    # Parse the in-memory bytes rather than passing a write-mode file handle
    # to pymupdf.open(): parsing then does not depend on the on-disk copy,
    # which each new upload overwrites. The context manager closes the
    # document once the text has been extracted.
    with pymupdf.open(stream=bytes_data, filetype="pdf") as doc:
        # One UTF-8 encoded bytes object per page.
        texts = [page.get_text().encode("utf-8") for page in doc]

    # Explicit st.write instead of a bare `texts` expression, so the output
    # does not depend on Streamlit's "magic" rendering being enabled.
    st.write(texts)
|
22 |
+
|
compose.yaml
CHANGED
@@ -11,6 +11,8 @@ services:
|
|
11 |
server:
|
12 |
build:
|
13 |
context: .
|
|
|
|
|
14 |
ports:
|
15 |
- 8000:8000
|
16 |
|
|
|
11 |
server:
|
12 |
build:
|
13 |
context: .
|
14 |
+
volumes:
|
15 |
+
- .:/app
|
16 |
ports:
|
17 |
- 8000:8000
|
18 |
|
data/State Machines.pdf
ADDED
Binary file (532 kB). View file
|
|
data/uploaded_file.pdf
ADDED
Binary file (532 kB). View file
|
|
document_loader.py
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Smoke-test script: extract the text of every page of a sample PDF with
# PyMuPDF and report completion.
import pymupdf

# The context manager closes the document once the pages have been read.
with pymupdf.open("data/State Machines.pdf") as doc:
    # One UTF-8 encoded bytes object per page.
    texts = [page.get_text().encode("utf-8") for page in doc]

print("Done")
|
requirements.txt
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
altair==5.4.1
|
2 |
+
anyio==4.4.0
|
3 |
+
attrs==24.2.0
|
4 |
+
blinker==1.8.2
|
5 |
+
cachetools==5.5.0
|
6 |
+
certifi==2024.7.4
|
7 |
+
charset-normalizer==3.3.2
|
8 |
+
click==8.1.7
|
9 |
+
exceptiongroup==1.2.2
|
10 |
+
gitdb==4.0.11
|
11 |
+
GitPython==3.1.43
|
12 |
+
h11==0.14.0
|
13 |
+
httpcore==1.0.5
|
14 |
+
httpx==0.27.2
|
15 |
+
idna==3.8
|
16 |
+
Jinja2==3.1.4
|
17 |
+
jsonschema==4.23.0
|
18 |
+
jsonschema-specifications==2023.12.1
|
19 |
+
markdown-it-py==3.0.0
|
20 |
+
MarkupSafe==2.1.5
|
21 |
+
mdurl==0.1.2
|
22 |
+
narwhals==1.5.5
|
23 |
+
numpy==2.1.0
|
24 |
+
ollama==0.3.2
|
25 |
+
packaging==24.1
|
26 |
+
pandas==2.2.2
|
27 |
+
pillow==10.4.0
|
28 |
+
protobuf==5.27.4
|
29 |
+
pyarrow==17.0.0
|
30 |
+
pydeck==0.9.1
|
31 |
+
Pygments==2.18.0
|
32 |
+
PyMuPDF==1.24.9
|
33 |
+
PyMuPDFb==1.24.9
|
34 |
+
python-dateutil==2.9.0.post0
|
35 |
+
pytz==2024.1
|
36 |
+
referencing==0.35.1
|
37 |
+
requests==2.32.3
|
38 |
+
rich==13.8.0
|
39 |
+
rpds-py==0.20.0
|
40 |
+
six==1.16.0
|
41 |
+
smmap==5.0.1
|
42 |
+
sniffio==1.3.1
|
43 |
+
streamlit==1.38.0
|
44 |
+
tenacity==8.5.0
|
45 |
+
toml==0.10.2
|
46 |
+
tornado==6.4.1
|
47 |
+
typing_extensions==4.12.2
|
48 |
+
tzdata==2024.1
|
49 |
+
urllib3==2.2.2
|
50 |
+
watchdog==4.0.2
|
uploaded_file.pdf
ADDED
Binary file (532 kB). View file
|
|