LVKinyanjui committed on
Commit
cd41c7b
1 Parent(s): 0b5e429

Spun up app and added pdf uploading functionality with pymupdf

Browse files
Dockerfile CHANGED
@@ -18,28 +18,35 @@ ENV PYTHONUNBUFFERED=1
18
 
19
  WORKDIR /app
20
 
21
- # Create a non-privileged user that the app will run under.
22
- # See https://docs.docker.com/go/dockerfile-user-best-practices/
23
- ARG UID=10001
24
- RUN adduser \
25
- --disabled-password \
26
- --gecos "" \
27
- --home "/nonexistent" \
28
- --shell "/sbin/nologin" \
29
- --no-create-home \
30
- --uid "${UID}" \
31
- appuser
32
-
33
- # Download dependencies as a separate step to take advantage of Docker's caching.
34
- # Leverage a cache mount to /root/.cache/pip to speed up subsequent builds.
35
- # Leverage a bind mount to requirements.txt to avoid having to copy them into
36
- # into this layer.
 
37
  RUN --mount=type=cache,target=/root/.cache/pip \
38
  --mount=type=bind,source=requirements.txt,target=requirements.txt \
39
  python -m pip install -r requirements.txt
40
 
41
- # Switch to the non-privileged user to run the application.
42
- USER appuser
 
 
 
 
 
 
43
 
44
  # Copy the source code into the container.
45
  COPY . .
@@ -48,4 +55,4 @@ COPY . .
48
  EXPOSE 8000
49
 
50
  # Run the application.
51
- CMD streamlit run app.py --server.port 8080
 
18
 
19
  WORKDIR /app
20
 
21
+ # # Create a non-privileged user that the app will run under.
22
+ # # See https://docs.docker.com/go/dockerfile-user-best-practices/
23
+ # ARG UID=10001
24
+ # RUN adduser \
25
+ # --disabled-password \
26
+ # --gecos "" \
27
+ # --home "/nonexistent" \
28
+ # --shell "/sbin/nologin" \
29
+ # --no-create-home \
30
+ # --uid "${UID}" \
31
+ # appuser
32
+
33
+ # # Download dependencies as a separate step to take advantage of Docker's caching.
34
+ # # Leverage a cache mount to /root/.cache/pip to speed up subsequent builds.
35
+ # # Leverage a bind mount to requirements.txt to avoid having to copy it into
36
+ # # this layer.
37
+
38
  RUN --mount=type=cache,target=/root/.cache/pip \
39
  --mount=type=bind,source=requirements.txt,target=requirements.txt \
40
  python -m pip install -r requirements.txt
41
 
42
+ # COPY requirements.txt .
43
+ # RUN python -m pip install --no-cache-dir -r requirements.txt
44
+
45
+ # Install ollama
46
+ RUN curl -fsSL https://ollama.com/install.sh | sh
47
+
48
+ # # Switch to the non-privileged user to run the application.
49
+ # USER appuser
50
 
51
  # Copy the source code into the container.
52
  COPY . .
 
55
  EXPOSE 8000
56
 
57
  # Run the application.
58
+ CMD streamlit run app.py --server.port 8000
app.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pymupdf
3
+ from io import StringIO
4
+
5
+ st.write("## Local RAG \n Get Insights from your documents")
6
+
7
+ file = st.file_uploader("Upload your Document Here", type=['pdf'])
8
+
9
+ if file is not None:
10
+ # doc = pymupdf.open(file)
11
+ # texts = [page.get_text().encode("utf-8") for page in doc]
12
+ # texts
13
+
14
+ # To read file as bytes:
15
+ bytes_data = file.getvalue()
16
+ with open("data/uploaded_file.pdf", "wb") as fp:
17
+ fp.write(bytes_data)
18
+ doc = pymupdf.open(fp)
19
+
20
+ texts = [page.get_text().encode("utf-8") for page in doc]
21
+ texts
22
+
compose.yaml CHANGED
@@ -11,6 +11,8 @@ services:
11
  server:
12
  build:
13
  context: .
 
 
14
  ports:
15
  - 8000:8000
16
 
 
11
  server:
12
  build:
13
  context: .
14
+ volumes:
15
+ - .:/app
16
  ports:
17
  - 8000:8000
18
 
data/State Machines.pdf ADDED
Binary file (532 kB). View file
 
data/uploaded_file.pdf ADDED
Binary file (532 kB). View file
 
document_loader.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import pymupdf
2
+
3
+ doc = pymupdf.open("data/State Machines.pdf")
4
+
5
+ texts = [page.get_text().encode("utf-8") for page in doc]
6
+
7
+ print("Done")
8
+
9
+ # with open("data/State Machines.pdf", "wb", encoding="utf-8") as out:
requirements.txt ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ altair==5.4.1
2
+ anyio==4.4.0
3
+ attrs==24.2.0
4
+ blinker==1.8.2
5
+ cachetools==5.5.0
6
+ certifi==2024.7.4
7
+ charset-normalizer==3.3.2
8
+ click==8.1.7
9
+ exceptiongroup==1.2.2
10
+ gitdb==4.0.11
11
+ GitPython==3.1.43
12
+ h11==0.14.0
13
+ httpcore==1.0.5
14
+ httpx==0.27.2
15
+ idna==3.8
16
+ Jinja2==3.1.4
17
+ jsonschema==4.23.0
18
+ jsonschema-specifications==2023.12.1
19
+ markdown-it-py==3.0.0
20
+ MarkupSafe==2.1.5
21
+ mdurl==0.1.2
22
+ narwhals==1.5.5
23
+ numpy==2.1.0
24
+ ollama==0.3.2
25
+ packaging==24.1
26
+ pandas==2.2.2
27
+ pillow==10.4.0
28
+ protobuf==5.27.4
29
+ pyarrow==17.0.0
30
+ pydeck==0.9.1
31
+ Pygments==2.18.0
32
+ PyMuPDF==1.24.9
33
+ PyMuPDFb==1.24.9
34
+ python-dateutil==2.9.0.post0
35
+ pytz==2024.1
36
+ referencing==0.35.1
37
+ requests==2.32.3
38
+ rich==13.8.0
39
+ rpds-py==0.20.0
40
+ six==1.16.0
41
+ smmap==5.0.1
42
+ sniffio==1.3.1
43
+ streamlit==1.38.0
44
+ tenacity==8.5.0
45
+ toml==0.10.2
46
+ tornado==6.4.1
47
+ typing_extensions==4.12.2
48
+ tzdata==2024.1
49
+ urllib3==2.2.2
50
+ watchdog==4.0.2
uploaded_file.pdf ADDED
Binary file (532 kB). View file