import gradio as gr import requests from pypdf import PdfReader import pypdfium2 as pdfium import easyocr ocr_id = { "Afrikaans": "af", "Albanian": "sq", "Arabic": "ar", "Azerbaijani": "az", "Belarusian": "be", "Bulgarian": "bg", "Bengali": "bn", "Bosnian": "bs", "Chinese (simplified)": "ch_sim", "Chinese (traditional)": "ch_tra", "Croatian": "hr", "Czech": "cs", "Danish": "da", "Dutch": "nl", "English": "en", "Estonian": "et", "French": "fr", "German": "de", "Irish": "ga", "Hindi": "hi", "Hungarian": "hu", "Indonesian": "id", "Icelandic": "is", "Italian": "it", "Japanese": "ja", "Kannada": "kn", "Korean": "ko", "Lithuanian": "lt", "Latvian": "lv", "Mongolian": "mn", "Marathi": "mr", "Malay": "ms", "Nepali": "ne", "Norwegian": "no", "Occitan": "oc", "Polish": "pl", "Portuguese": "pt", "Romanian": "ro", "Russian": "ru", "Serbian (cyrillic)": "rs_cyrillic", "Serbian (latin)": "rs_latin", "Slovak": "sk", "Slovenian": "sl", "Spanish": "es", "Swedish": "sv", "Swahili": "sw", "Tamil": "ta", "Thai": "th", "Tagalog": "tl", "Turkish": "tr", "Ukrainian": "uk", "Urdu": "ur", "Uzbek": "uz", "Vietnamese": "vi", "Welsh": "cy", "Zulu": "zu", } def pdf_pil(file_path,page_num,up_scale): pdf = pdfium.PdfDocument("data.pdf") page = pdf.get_page(int(page_num)-1) bitmap = page.render( scale = int(up_scale), # 72dpi resolution rotation = 0, # no additional rotation # ... further rendering options ) pil_image = bitmap.to_pil() pil_image.save(f"image_{page_num}.png") return (f"image_{page_num}.png") def ocrpdf(file_path,pdf_lang,page_num,sent_wid,contrast_det,up_scale): img1 = pdf_pil(file_path,page_num,up_scale) lang=[f"{ocr_id[pdf_lang]}"] reader = easyocr.Reader(lang) bounds = reader.readtext(img1,width_ths=sent_wid,contrast_ths=contrast_det) this = "" for bound in bounds: this = (f'{this} \n{bound[1]}') return this def scrape(instring): html_src=(f'''