Spaces:
Sleeping
Sleeping
File size: 1,389 Bytes
31d9dba |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
import pymupdf
from PIL import Image
import io
import gradio as gr
import base64
import pandas as pd
import pymupdf
def image_to_bytes(image):
    """Encode an image as PNG and return the result as base64 text.

    Args:
        image: Object exposing ``save(fp, format=...)`` (presumably a PIL
            image, given this file's imports -- confirm against callers).

    Returns:
        str: The PNG byte stream, base64-encoded and decoded as UTF-8.
    """
    png_buffer = io.BytesIO()
    image.save(png_buffer, format="PNG")
    raw_png = png_buffer.getvalue()
    encoded = base64.b64encode(raw_png)
    return encoded.decode("utf-8")
def extract_pdfs(docs, doc_collection):
    """Replace the stored document list with the newly supplied paths.

    Args:
        docs: Iterable of document path strings, or a falsy value (``None``
            or an empty sequence) when nothing was supplied.
        doc_collection: Previously stored documents; returned unchanged when
            ``docs`` is falsy.

    Returns:
        tuple: ``(documents, gr.Tabs(selected=1), table)`` where ``table`` is
        a one-column ``Filename`` DataFrame holding the last ``/``-separated
        component of each path in ``docs``.
    """
    # Normalise once: a falsy ``docs`` (e.g. None) must produce an empty
    # table rather than a TypeError when the filenames are built below.
    paths = list(docs) if docs else []
    if paths:
        # Rebind to a fresh list so the caller's list object is never mutated.
        doc_collection = paths

    filenames = [path.split("/")[-1] for path in paths]
    return (
        doc_collection,
        gr.Tabs(selected=1),
        pd.DataFrame(filenames, columns=["Filename"]),
    )
def extract_images(docs):
    """Collect every embedded image from the given documents.

    Args:
        docs: Iterable of document paths accepted by ``pymupdf.open``.

    Returns:
        list: One ``PIL.Image`` per image reference found, in document, page
        and per-page listing order. Empty when ``docs`` is empty.
    """
    images = []
    for doc_path in docs:
        # The context manager closes the document even if decoding an image
        # raises, so file handles are not leaked across many uploads.
        with pymupdf.open(doc_path) as doc:
            for page in doc:
                for img in page.get_images():
                    xref = img[0]  # first entry of each image record is its xref
                    pix = pymupdf.Pixmap(doc, xref)
                    # More than three colour components (e.g. CMYK): convert
                    # to RGB before encoding.
                    if pix.n - pix.alpha > 3:
                        pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
                    jpeg_bytes = pix.pil_tobytes("JPEG")
                    images.append(Image.open(io.BytesIO(jpeg_bytes)))
    return images
def clean_text(text):
    """Flatten text onto one line with single spaces between words.

    Newlines and tabs become spaces, every run of spaces is collapsed to a
    single space, and surrounding whitespace is removed.

    Args:
        text: The string to clean.

    Returns:
        str: The cleaned string; ``""`` when ``text`` is empty or
        whitespace-only.
    """
    flattened = text.strip().replace("\n", " ").replace("\t", " ")
    # A single ``replace`` pass cannot shrink longer runs (and the previous
    # space-for-space replace was a no-op), so drop the empty pieces that
    # consecutive spaces produce and re-join with exactly one space.
    return " ".join(piece for piece in flattened.split(" ") if piece)
|