PDF Viewer''') with gr.Row(): inp=gr.Textbox(label="PDF URL",scale=3) pg_num=gr.Number(label="Page Number",value=1,precision=0,scale=1) with gr.Tab("View PDF"): go_btn = gr.Button("Load PDF") outp = gr.HTML() with gr.Tab("Summarize"): mes = gr.Markdown("""

import gradio as gr
import requests
from pypdf import PdfReader
import pypdfium2 as pdfium
import easyocr

# Lookup table from a human-readable language name to a short language code.
# The keys populate the "PDF Language" dropdown in the UI; the value for the
# selected key is what ocrpdf() hands to easyocr.Reader.
ocr_id = {
    "Afrikaans": "af",
    "Albanian": "sq",
    "Arabic": "ar",
    "Azerbaijani": "az",
    "Belarusian": "be",
    "Bulgarian": "bg",
    "Bengali": "bn",
    "Bosnian": "bs",
    "Chinese (simplified)": "ch_sim",
    "Chinese (traditional)": "ch_tra",
    "Croatian": "hr",
    "Czech": "cs",
    "Danish": "da",
    "Dutch": "nl",
    "English": "en",
    "Estonian": "et",
    "French": "fr",
    "German": "de",
    "Irish": "ga",
    "Hindi": "hi",
    "Hungarian": "hu",
    "Indonesian": "id",
    "Icelandic": "is",
    "Italian": "it",
    "Japanese": "ja",
    "Kannada": "kn",
    "Korean": "ko",
    "Lithuanian": "lt",
    "Latvian": "lv",
    "Mongolian": "mn",
    "Marathi": "mr",
    "Malay": "ms",
    "Nepali": "ne",
    "Norwegian": "no",
    "Occitan": "oc",
    "Polish": "pl",
    "Portuguese": "pt",
    "Romanian": "ro",
    "Russian": "ru",
    "Serbian (cyrillic)": "rs_cyrillic",
    "Serbian (latin)": "rs_latin",
    "Slovak": "sk",
    "Slovenian": "sl",
    "Spanish": "es",
    "Swedish": "sv",
    "Swahili": "sw",
    "Tamil": "ta",
    "Thai": "th",
    "Tagalog": "tl",
    "Turkish": "tr",
    "Ukrainian": "uk",
    "Urdu": "ur",
    "Uzbek": "uz",
    "Vietnamese": "vi",
    "Welsh": "cy",
    "Zulu": "zu",
}

def pdf_pil(file_path,page_num,up_scale):
    """Render one page of a PDF to a PNG file and return the PNG's path.

    Args:
        file_path: Path of the PDF document to open.
        page_num: 1-based page number; converted with ``int`` before use.
        up_scale: Render scale factor handed to pypdfium2 (1 corresponds to
            72 dpi). Fractional values are honoured.

    Returns:
        The relative path ``image_<page_num>.png`` of the saved image.
    """
    # Open the document the caller asked for rather than a fixed file name.
    pdf = pdfium.PdfDocument(file_path)
    try:
        page = pdf.get_page(int(page_num) - 1)
        try:
            bitmap = page.render(
                # Keep the scale as a float: truncating to int turned every
                # slider value below 1 into 0 and dropped fractions like 1.5.
                scale=float(up_scale),
                rotation=0,  # no additional rotation
            )
            out_path = f"image_{page_num}.png"
            # Save while the bitmap is still alive; the PIL image is built
            # from the bitmap's pixel buffer.
            bitmap.to_pil().save(out_path)
        finally:
            # Release the page before its parent document.
            page.close()
    finally:
        pdf.close()

    return out_path

def ocrpdf(file_path,pdf_lang,page_num,sent_wid,contrast_det,up_scale):
    """Run OCR on one rendered PDF page and return the recognised text.

    The page is first rendered to an image by ``pdf_pil``; an easyocr reader
    for the language named by ``pdf_lang`` (a key of ``ocr_id``) then reads
    that image.

    Args:
        file_path: Path of the PDF, forwarded to ``pdf_pil``.
        pdf_lang: Language name; must be a key of ``ocr_id``.
        page_num: Page number, forwarded to ``pdf_pil``.
        sent_wid: Passed to ``readtext`` as ``width_ths``.
        contrast_det: Passed to ``readtext`` as ``contrast_ths``.
        up_scale: Render scale, forwarded to ``pdf_pil``.

    Returns:
        A string in which every recognised text fragment is preceded by
        a space and a newline; empty when nothing was recognised.
    """
    page_image = pdf_pil(file_path, page_num, up_scale)

    ocr_reader = easyocr.Reader([f"{ocr_id[pdf_lang]}"])
    detections = ocr_reader.readtext(
        page_image,
        width_ths=sent_wid,
        contrast_ths=contrast_det,
    )

    # Index 1 of each detection is the recognised text.
    return "".join(f" \n{detection[1]}" for detection in detections)
    

def scrape(instring):
    """Build the embedded Google Docs viewer markup for a PDF URL.

    Args:
        instring: URL of the PDF as typed by the user. ``None`` is treated
            as an empty string and surrounding whitespace is stripped.

    Returns:
        A ``gr.HTML`` update whose value is an iframe pointing the Google
        Docs viewer at the given URL.
    """
    from urllib.parse import quote

    # The target URL travels as the value of the viewer's ``url`` query
    # parameter inside an HTML attribute. Percent-encode every reserved
    # character so that '&', '?', '#' or '"' in the user's URL can neither
    # split the viewer's own query string nor break out of the attribute.
    encoded_url = quote((instring or "").strip(), safe="")

    html_src=(f'''
    <div style="text-align:center">
    <h4>PDF Viewer</h4>
    <iframe src="https://docs.google.com/viewer?url={encoded_url}&embedded=true" frameborder="0" height="1200px" width="100%"></iframe>
    </div>''')
    return gr.HTML.update(html_src)

def scrape00(instring, page_num,pdf_lang,sent_wid,contrast_det,up_scale):
    """Download a PDF, extract one page's text and summarize it.

    The text layer of the page is tried first. If summarizing it raises,
    the page is OCR'd via ``ocrpdf`` and the OCR text is summarized instead.

    Args:
        instring: URL of the PDF to download.
        page_num: 1-based page number to process.
        pdf_lang: Language name (a key of ``ocr_id``) used by the OCR fallback.
        sent_wid: OCR ``width_ths`` setting, forwarded to ``ocrpdf``.
        contrast_det: OCR ``contrast_ths`` setting, forwarded to ``ocrpdf``.
        up_scale: PDF-to-image render scale, forwarded to ``ocrpdf``.

    Returns:
        A 3-tuple ``(text, summary, markdown_update)`` matching the three
        output components wired to this handler. When the PDF cannot be
        downloaded or read, or the page number is invalid, ``text`` is empty,
        ``summary`` carries an error message and the markdown shows "Error".
    """
    def _failure(message):
        # Uniform shape for every early exit so the UI never reports
        # "Complete" for a request that did not run.
        return "", message, gr.Markdown.update("""<h3> Error""")

    try:
        # A timeout keeps an unresponsive host from blocking a queue worker
        # indefinitely.
        response = requests.get(instring, timeout=30)
    except requests.RequestException as err:
        return _failure(f"Error: could not download PDF ({err})")

    if response.status_code != 200:
        # Stop here: carrying on would parse whatever "data.pdf" a previous
        # request left behind (or fail because the file does not exist).
        return _failure(f"Error: download failed with HTTP status {response.status_code}")

    # NOTE(review): the fixed file name is shared by every request, so
    # concurrent requests can overwrite each other's download.
    with open("data.pdf", "wb") as f:
        f.write(response.content)

    try:
        reader = PdfReader("data.pdf")
        number_of_pages = len(reader.pages)
    except Exception as err:
        # UI boundary: the URL may well have returned something that is not
        # a PDF at all; report it instead of crashing the handler.
        return _failure(f"Error: could not read PDF ({err})")

    try:
        page_index = int(page_num) - 1
    except (TypeError, ValueError):
        return _failure("Error: page number must be a whole number")
    if not 0 <= page_index < number_of_pages:
        return _failure(f"Error: page {page_num} is out of range (1-{number_of_pages})")

    text = reader.pages[page_index].extract_text()
    print (text)
    summarizer = gr.Interface.load("huggingface/facebook/bart-large-cnn")
    try:
        sum_out = summarizer(text)
    except Exception:
        # Summarizing the extracted text failed; fall back to OCR of the
        # rendered page and summarize that instead.
        try:
            text = ocrpdf("data.pdf",pdf_lang,page_num,sent_wid,contrast_det,up_scale)
            sum_out = summarizer(text)
        except Exception:
            sum_out = "Error"

    return text, sum_out,gr.Markdown.update("""<h3> Complete""")

# UI definition. Components are laid out in the order and nesting in which
# they are created inside the context managers below.
with gr.Blocks() as app:
    gr.Markdown('''<h1>PDF Viewer''')
    # Inputs placed above the tabs: the URL feeds both handlers, the page
    # number only the Summarize handler.
    with gr.Row():
        inp=gr.Textbox(label="PDF URL",scale=3)
        pg_num=gr.Number(label="Page Number",value=1,precision=0,scale=1)
    # Tab 1: show the PDF in an embedded viewer.
    with gr.Tab("View PDF"):
        go_btn = gr.Button("Load PDF")
        outp = gr.HTML()

    # Tab 2: extract (or OCR) the selected page and summarize it.
    with gr.Tab("Summarize"):
        # Heading that scrape00 replaces with a status message when it returns.
        mes = gr.Markdown("""<h3> Summarize Text in PDF""")
        with gr.Row():
            with gr.Box():
                with gr.Column():
                    # Forwarded to easyocr's readtext as width_ths.
                    sent_wid=gr.Slider(0.1, 3, step=0.1,value=1,label="Horizontal Word Space")
                    # Forwarded to easyocr's readtext as contrast_ths.
                    contrast_det=gr.Slider(0.1, 1, step=0.1,value=0.1,label="Contrast Threshold")
                with gr.Column():
                    # Forwarded to pdf_pil as the page render scale.
                    up_scale=gr.Slider(0.1, 5, step=0.1,value=1,label="PDF to Image Scale")
            with gr.Column():
                # Choices are the keys of ocr_id; the selection picks the OCR language.
                target_lang = gr.Dropdown(label="PDF Language", choices=list(ocr_id.keys()),value="English")
                sum_btn = gr.Button("Summarize")
        with gr.Row():
            # Receive scrape00's first and second return values respectively.
            text_out = gr.Textbox()
            sum_out = gr.Textbox()
    # Event wiring: inputs are passed positionally to the handler, and the
    # handler's return values fill the listed outputs in order.
    go_btn.click(scrape,inp,outp)
    sum_btn.click(scrape00,[inp,pg_num,target_lang,sent_wid,contrast_det,up_scale],[text_out,sum_out,mes])
# Enable request queuing, then start the server.
app.queue(concurrency_count=10).launch()