# Spaces: Running on Zero
# NOTE(review): `spaces` is kept as the first import in case import order
# matters for the ZeroGPU runtime — confirm before reordering.
import spaces

from pathlib import Path

import gradio as gr
import ocrmypdf
from pypdf import PdfReader
def extract_text_from_pdf(reader):
    """Concatenate the text of every non-empty page of a PDF.

    Args:
        reader: Object exposing a ``pages`` iterable whose items provide an
            ``extract_text()`` method (``convert`` passes a
            ``pypdf.PdfReader``).

    Returns:
        The text of each page that yielded any, each preceded by a
        ``---- Page N ----`` header (``N`` is the zero-based page index) and
        followed by a blank line, with leading/trailing whitespace stripped
        from the combined result. An empty string when no page yields text.
    """
    sections = []
    for idx, page in enumerate(reader.pages):
        # Extract once per page and reuse the result: extraction is the
        # expensive step, so it must not be repeated for the output string.
        text = page.extract_text()
        if text:
            sections.append(f"---- Page {idx} ----\n{text}\n\n")
    return "".join(sections).strip()
def convert(pdf_file):
    """Extract the text and document metadata from a PDF file.

    When the document contains images but fewer than 1000 characters of
    extractable text, it is run through OCR and the text is re-extracted
    from the OCR output.

    Args:
        pdf_file: Filesystem path of the PDF to read.

    Returns:
        A ``(full_text, metadata)`` tuple. ``full_text`` is the string built
        by ``extract_text_from_pdf``; ``metadata`` is a dict with the keys
        ``author``, ``creator``, ``producer``, ``subject`` and ``title``
        (each ``None`` when the document does not provide it).
    """
    reader = PdfReader(pdf_file)

    # Extract metadata. `reader.metadata` can be None for PDFs without a
    # document-information dictionary, so read the fields defensively.
    info = reader.metadata
    metadata = {
        field: getattr(info, field, None)
        for field in ("author", "creator", "producer", "subject", "title")
    }

    # Extract text
    full_text = extract_text_from_pdf(reader)

    # Check if there are any images (stops at the first page that has one)
    has_images = any(len(page.images) > 0 for page in reader.pages)

    # If there are images and not much content, perform OCR on the document
    if has_images and len(full_text) < 1000:
        # Derive the output name from the path's final component only, so
        # ".pdf" elsewhere in the path is untouched and an upper-case ".PDF"
        # suffix still yields a file distinct from the input.
        source = Path(pdf_file)
        out_pdf_file = str(source.with_name(f"{source.stem}_ocr.pdf"))
        ocrmypdf.ocr(pdf_file, out_pdf_file, force_ocr=True)

        # Re-extract text from the OCR output, not from the original input
        reader = PdfReader(out_pdf_file)
        full_text = extract_text_from_pdf(reader)

    return full_text, metadata
# Web UI: a single PDF upload goes in; the extracted text and the metadata
# dict returned by `convert` come out. The app is started immediately.
gr.Interface(
    fn=convert,
    inputs=[gr.File(type="filepath", label="Upload PDF")],
    outputs=[gr.Text(label="Markdown"), gr.JSON(label="Metadata")],
).launch()