import gradio as gr
import requests
from pypdf import PdfReader
import pypdfium2 as pdfium
import easyocr

# Display name of each selectable OCR language -> language code handed to
# easyocr.Reader. The keys populate the "PDF Language" dropdown.
ocr_id = {
    "Afrikaans": "af",
    "Albanian": "sq",
    "Arabic": "ar",
    "Azerbaijani": "az",
    "Belarusian": "be",
    "Bulgarian": "bg",
    "Bengali": "bn",
    "Bosnian": "bs",
    "Chinese (simplified)": "ch_sim",
    "Chinese (traditional)": "ch_tra",
    "Croatian": "hr",
    "Czech": "cs",
    "Danish": "da",
    "Dutch": "nl",
    "English": "en",
    "Estonian": "et",
    "French": "fr",
    "German": "de",
    "Irish": "ga",
    "Hindi": "hi",
    "Hungarian": "hu",
    "Indonesian": "id",
    "Icelandic": "is",
    "Italian": "it",
    "Japanese": "ja",
    "Kannada": "kn",
    "Korean": "ko",
    "Lithuanian": "lt",
    "Latvian": "lv",
    "Mongolian": "mn",
    "Marathi": "mr",
    "Malay": "ms",
    "Nepali": "ne",
    "Norwegian": "no",
    "Occitan": "oc",
    "Polish": "pl",
    "Portuguese": "pt",
    "Romanian": "ro",
    "Russian": "ru",
    "Serbian (cyrillic)": "rs_cyrillic",
    "Serbian (latin)": "rs_latin",
    "Slovak": "sk",
    "Slovenian": "sl",
    "Spanish": "es",
    "Swedish": "sv",
    "Swahili": "sw",
    "Tamil": "ta",
    "Thai": "th",
    "Tagalog": "tl",
    "Turkish": "tr",
    "Ukrainian": "uk",
    "Urdu": "ur",
    "Uzbek": "uz",
    "Vietnamese": "vi",
    "Welsh": "cy",
    "Zulu": "zu",
}


def pdf_pil(file_path, page_num, up_scale):
    """Render one page of a PDF to a PNG file and return the PNG's path.

    Args:
        file_path: Path of the PDF document to open.
        page_num: 1-based page number; converted with ``int`` before use.
        up_scale: Render scale factor forwarded to pypdfium2 (a scale of 1
            is 72 dpi).

    Returns:
        The relative path ``image_{page_num}.png`` that the rendered page was
        saved to in the current working directory.
    """
    # Open the document the caller asked for instead of a hard-coded name.
    pdf = pdfium.PdfDocument(file_path)
    page = pdf.get_page(int(page_num) - 1)
    bitmap = page.render(
        # Keep the fractional part: the UI slider moves in 0.1 steps and
        # truncating to int turned every value below 1 into a scale of 0.
        scale=float(up_scale),  # 1 == 72dpi resolution
        rotation=0,  # no additional rotation
    )
    image_path = f"image_{page_num}.png"
    bitmap.to_pil().save(image_path)
    return image_path


def ocrpdf(file_path, pdf_lang, page_num, sent_wid, contrast_det, up_scale):
    """Run OCR over a single PDF page and return the recognised text.

    Args:
        file_path: Path of the PDF document.
        pdf_lang: Display name of the language; must be a key of ``ocr_id``.
        page_num: 1-based page number to render and read.
        sent_wid: Passed to easyocr ``readtext`` as ``width_ths``.
        contrast_det: Passed to easyocr ``readtext`` as ``contrast_ths``.
        up_scale: Render scale factor forwarded to ``pdf_pil``.

    Returns:
        Every recognised text fragment, each prefixed with ``" \\n"``, joined
        into one string (empty when nothing was detected).

    Raises:
        KeyError: If ``pdf_lang`` is not a key of ``ocr_id``.
    """
    image_path = pdf_pil(file_path, page_num, up_scale)
    reader = easyocr.Reader([ocr_id[pdf_lang]])
    bounds = reader.readtext(
        image_path,
        width_ths=sent_wid,
        contrast_ths=contrast_det,
    )
    # Index 1 of each detection holds the recognised text.
    return "".join(f" \n{bound[1]}" for bound in bounds)


def scrape(instring):
    """Return the HTML update shown in the "View PDF" tab.

    Args:
        instring: Value of the "PDF URL" textbox.

    Returns:
        A ``gr.HTML`` update carrying the viewer markup.
    """
    # NOTE(review): the markup below does not reference ``instring``; the
    # literal looks like it lost its HTML tags somewhere upstream — confirm
    # against the deployed version before relying on it.
    html_src = "\n\nPDF Viewer\n\n"
    return gr.HTML.update(html_src)


def scrape00(instring, page_num, pdf_lang, sent_wid, contrast_det, up_scale):
    """Download a PDF, extract the text of one page, and summarise it.

    The page's embedded text is summarised first. If summarising raises, the
    page is rendered and read with OCR and the OCR text is summarised
    instead.

    Args:
        instring: URL of the PDF to download.
        page_num: 1-based page number to process.
        pdf_lang: Display name of the OCR language (key of ``ocr_id``).
        sent_wid: OCR ``width_ths`` setting.
        contrast_det: OCR ``contrast_ths`` setting.
        up_scale: PDF-to-image render scale used for OCR.

    Returns:
        A 3-tuple ``(text, summary, status_update)`` matching the
        ``[text_out, sum_out, mes]`` outputs of the "Summarize" button. On a
        failed download or an out-of-range page, ``text`` is empty and
        ``summary`` carries the error description.
    """
    # A timeout keeps a dead server from blocking a queue worker forever.
    response = requests.get(instring, stream=True, timeout=60)
    if response.status_code != 200:
        # Stop here: continuing would parse a stale or missing data.pdf.
        print(response.status_code)
        return (
            "",
            f"Error: download failed (HTTP {response.status_code})",
            gr.Markdown.update("\n\nDownload failed"),
        )

    # NOTE(review): "data.pdf" is a fixed path shared by every request while
    # the queue runs with concurrency_count=10 — concurrent requests may
    # overwrite each other's download.
    with open("data.pdf", "wb") as f:
        f.write(response.content)

    reader = PdfReader("data.pdf")
    number_of_pages = len(reader.pages)
    page_index = int(page_num) - 1
    if not 0 <= page_index < number_of_pages:
        # Without this check page 0 silently wrapped to the last page and
        # too-large numbers raised IndexError.
        return (
            "",
            f"Error: page {page_num} is out of range (1-{number_of_pages})",
            gr.Markdown.update("\n\nInvalid page number"),
        )

    text = reader.pages[page_index].extract_text()
    print(text)

    # The summarizer interface is loaded on every call.
    summarizer = gr.Interface.load("huggingface/facebook/bart-large-cnn")
    try:
        sum_out = summarizer(text)
    except Exception as exc:
        print(f"Summarizing extracted text failed, trying OCR: {exc}")
        try:
            text = ocrpdf(
                "data.pdf", pdf_lang, page_num, sent_wid, contrast_det, up_scale
            )
            sum_out = summarizer(text)
        except Exception as ocr_exc:
            print(f"OCR fallback failed: {ocr_exc}")
            sum_out = "Error"
    return text, sum_out, gr.Markdown.update("\n\nComplete")


# NOTE(review): the container nesting below was reconstructed from a
# whitespace-collapsed source — confirm the layout against the running app.
with gr.Blocks() as app:
    gr.Markdown("\n\nPDF Viewer")
    with gr.Row():
        inp = gr.Textbox(label="PDF URL", scale=3)
        pg_num = gr.Number(label="Page Number", value=1, precision=0, scale=1)
    with gr.Tab("View PDF"):
        go_btn = gr.Button("Load PDF")
        outp = gr.HTML()
    with gr.Tab("Summarize"):
        mes = gr.Markdown("\n\nSummarize Text in PDF")
        with gr.Row():
            with gr.Box():
                with gr.Column():
                    sent_wid = gr.Slider(
                        0.1, 3, step=0.1, value=1, label="Horizontal Word Space"
                    )
                    contrast_det = gr.Slider(
                        0.1, 1, step=0.1, value=0.1, label="Contrast Threshold"
                    )
                with gr.Column():
                    up_scale = gr.Slider(
                        0.1, 5, step=0.1, value=1, label="PDF to Image Scale"
                    )
                with gr.Column():
                    target_lang = gr.Dropdown(
                        label="PDF Language",
                        choices=list(ocr_id.keys()),
                        value="English",
                    )
        sum_btn = gr.Button("Summarize")
        with gr.Row():
            text_out = gr.Textbox()
            sum_out = gr.Textbox()
    go_btn.click(scrape, inp, outp)
    sum_btn.click(
        scrape00,
        [inp, pg_num, target_lang, sent_wid, contrast_det, up_scale],
        [text_out, sum_out, mes],
    )
app.queue(concurrency_count=10).launch()