from pathlib import Path

import gradio as gr
import torch
from PIL import Image
from pdf2image import convert_from_path
from transformers import NougatProcessor, VisionEncoderDecoderModel
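# Note: pdf2image renders PDF pages through the poppler utilities (pdftoppm),
# which must be installed on the system (e.g. `apt-get install poppler-utils`).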
# Load the model and processor, then move the model to the GPU if one is available
processor = NougatProcessor.from_pretrained("MohamedRashad/arabic-small-nougat")
model = VisionEncoderDecoderModel.from_pretrained("MohamedRashad/arabic-small-nougat")

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
print(f"Using {device} device")

# Upper bound on the number of tokens generated per page
context_length = 2048

def extract_text_from_image(image):
    """
    Extract text from a PIL image.

    Args:
        image (PIL.Image): Input image

    Returns:
        str: Extracted text from the image
    """
    # Prepare the image for the model
    pixel_values = processor(image, return_tensors="pt").pixel_values

    # Generate the transcription
    outputs = model.generate(
        pixel_values.to(device),
        min_length=1,
        max_new_tokens=context_length,
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
    )
    page_sequence = processor.batch_decode(outputs, skip_special_tokens=True)[0]
    page_sequence = processor.post_process_generation(page_sequence, fix_markdown=False)
    return page_sequence
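
# Standalone usage sketch (hypothetical file name, not shipped with this demo):
#     page = Image.open("scanned_page.jpg")
#     print(extract_text_from_image(page))
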
def extract_text_from_pdf(pdf_path, progress=gr.Progress()):
    """
    Extract text from a PDF, page by page.

    Args:
        pdf_path (str): Path to the PDF file
        progress (gr.Progress): Gradio progress bar

    Returns:
        str: Extracted text from the PDF
    """
    progress(0, desc="Starting...")

    # Render every PDF page to an image, then run OCR on each one
    images = convert_from_path(pdf_path)
    texts = []
    for image in progress.tqdm(images):
        extracted_text = extract_text_from_image(image)
        texts.append(extracted_text)
    return "\n".join(texts)
model_description = """
This is a demo for the Arabic Small Nougat model. It is an end-to-end OCR model that can extract text from images and PDFs.
- The model is trained on the [Khatt dataset](https://huggingface.co/datasets/Fakhraddin/khatt) and custom made dataset.
- The model is a finetune of [facebook/nougat-small](https://huggingface.co/facebook/nougat-small) model.
**Note**: The model is a prototype in my book and may not work well on all types of images and PDFs. **Check the output carefully before using it for any serious work.**
"""
example_images = [Image.open(Path(__file__).parent / "book_page.jpeg")]
with gr.Blocks(title="Arabic Small Nougat") as demo:
    gr.HTML("<h1 style='text-align: center'>Arabic End-to-End Structured OCR for textbooks</h1>")
    gr.Markdown(model_description)

    with gr.Tab("Extract Text from Image"):
        with gr.Row():
            with gr.Column():
                input_image = gr.Image(label="Input Image", type="pil")
                image_submit_button = gr.Button(value="Submit", variant="primary")
            output = gr.Markdown(label="Output Markdown", rtl=True)
        image_submit_button.click(extract_text_from_image, inputs=[input_image], outputs=output)
        gr.Examples(example_images, [input_image], output, extract_text_from_image, cache_examples=True)

    with gr.Tab("Extract Text from PDF"):
        with gr.Row():
            with gr.Column():
                pdf = gr.File(label="Input PDF", type="filepath")
                pdf_submit_button = gr.Button(value="Submit", variant="primary")
            output = gr.Markdown(label="Output Markdown", rtl=True)
        pdf_submit_button.click(extract_text_from_pdf, inputs=[pdf], outputs=output)

demo.queue().launch(share=False)