Spaces:
Sleeping
Sleeping
add all other all
Browse files- .gitattributes +1 -0
- .gitattributes copy +1 -0
- .gitignore +15 -0
- Dockerfile +41 -0
- README copy.md +14 -0
- app.py +133 -0
- core/init_nltk.py +3 -0
- core/init_sbt.py +14 -0
- core/read_pdf.py +130 -0
- core/tei.py +145 -0
- requirements.txt +46 -0
- shiny_example_dockerfile +13 -0
- start_service.sh +2 -0
.gitattributes
CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
35 |
+
example.pdf filter=lfs diff=lfs merge=lfs -text
|
.gitattributes copy
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
example.pdf filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Byte-compiled / optimized / DLL files
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
|
6 |
+
*.ipynb_checkpoints
|
7 |
+
axcell_data/papers/
|
8 |
+
|
9 |
+
# experiments output
|
10 |
+
experiments/
|
11 |
+
|
12 |
+
# output file
|
13 |
+
*.out
|
14 |
+
temp/
|
15 |
+
*.ipynb
|
Dockerfile
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Base image ships GROBID 0.7.3 plus its JVM; we layer LaTeX, sbt and the app on top.
FROM grobid/grobid:0.7.3
RUN apt-get update && \
    apt-get install wget unzip texlive-full nano git apt-transport-https curl gnupg -yqq
# -- installing grobid, python (torch and tensorflow), java and latex finished -- #

# now install grobid python client
WORKDIR /opt
RUN git clone https://github.com/kermitt2/grobid_client_python && \
    cd grobid_client_python && \
    python3 setup.py install

# install sbt and pdffigures2 (figure extraction runs through a Scala build)
RUN echo "deb https://repo.scala-sbt.org/scalasbt/debian all main" | tee /etc/apt/sources.list.d/sbt.list && \
    echo "deb https://repo.scala-sbt.org/scalasbt/debian /" | tee /etc/apt/sources.list.d/sbt_old.list && \
    curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x2EE0EA64E40A89B84B2DF73499E82A75642AC823" | gpg --no-default-keyring --keyring gnupg-ring:/etc/apt/trusted.gpg.d/scalasbt-release.gpg --import && \
    chmod 644 /etc/apt/trusted.gpg.d/scalasbt-release.gpg && \
    apt-get update && \
    apt-get install sbt -yqq
RUN git clone https://github.com/allenai/pdffigures2.git

# install python dependencies first so code-only changes reuse this layer
WORKDIR /project
ADD ./requirements.txt /project/requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt
# add code
ADD ./core/ /project/core/
# download the Punkt sentence tokenizer for nltk at build time
RUN python core/init_nltk.py
ADD ./example.pdf /project/example/example.pdf
# init sbt: warm up the pdffigures2 build on the bundled example PDF
RUN python core/init_sbt.py
# add app
ADD ./app.py /project/app.py
# 7860: shiny UI; 8070/8071: grobid service and its admin port
EXPOSE 7860
EXPOSE 8070
EXPOSE 8071
# add service starting script
ADD ./start_service.sh /project/start_service.sh

WORKDIR /project
CMD ["bash", "start_service.sh"]
|
README copy.md
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# docker integration of existing code
|
2 |
+
build docker
|
3 |
+
```sh
|
4 |
+
docker build --tag doc2slide -f Dockerfile .
|
5 |
+
```
|
6 |
+
run in interactive mode and use gpu
|
7 |
+
```sh
|
8 |
+
docker run --name test --gpus all -p 7080:8070 -p 7081:8071 -p 7860:7860 --rm -it doc2slide bash
|
9 |
+
```
|
10 |
+
current version of grobid 0.7.3 have python 3.8.10 and openjdk 17. To run as a service
|
11 |
+
```sh
|
12 |
+
docker run -d --name test --gpus all -p 7080:8070 -p 7081:8071 -p 7860:7860 --rm doc2slide
|
13 |
+
```
|
14 |
+
https://shinylive.io/py/examples gives a lot of examples
|
app.py
ADDED
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import hashlib
|
3 |
+
from subprocess import call
|
4 |
+
|
5 |
+
from shiny import App, reactive, render, ui
|
6 |
+
|
7 |
+
from core.read_pdf import process_pdf, temp_dir
|
8 |
+
|
9 |
+
last_pdf_md5_preprocess_stage = None
|
10 |
+
|
11 |
+
|
12 |
+
def compute_hash(file_pth):
    """Return the hex MD5 digest of the file at *file_pth*.

    Reads the file in fixed-size chunks so an arbitrarily large upload
    does not have to fit in memory at once (the original implementation
    read the whole file into a single bytes object).
    """
    md5 = hashlib.md5()
    with open(file_pth, 'rb') as file_to_check:
        # 1 MiB chunks keep peak memory bounded for big PDFs.
        for chunk in iter(lambda: file_to_check.read(1 << 20), b''):
            md5.update(chunk)
    return md5.hexdigest()
|
19 |
+
|
20 |
+
|
21 |
+
def ui_card(title, *args):
    """Wrap *args* in a Bootstrap card with *title* as its header.

    Returns a one-element tuple, matching how callers splice the card
    into the page layout.
    """
    header = ui.div(title, class_="card-header")
    body = ui.div({"class": "card-body"}, *args)
    card = ui.div({"class": "card mb-4"}, header, body)
    return (card, )
|
27 |
+
|
28 |
+
|
29 |
+
# Top-level page layout for the Document2Slide demo: one card to upload a
# PDF, one to trigger preprocessing, and three download cards for the
# Markdown bullet points, the beamer .tex source, and the rendered slides.
app_ui = ui.page_fluid(
    ui.h1("Document2Slide Demo"),
    ui_card(
        "Upload PDF",
        ui.input_file("input_pdf", "Choose a .pdf file to upload:", multiple=True),
        ui.output_text("upload_file_status", ),
    ),
    ui_card(
        "Preprocess",
        ui.p(
            ui.input_action_button("preprocess_action", "Preprocess file", class_="btn-primary"),
            ui.output_text("preprocess_result", ),
        ),
        ui.output_text("preprocess_status", ),
        ui.download_button("download_preprocessed", "Download preprocessed file"),
    ),
    ui_card(
        "Download the bullet points in Markdown format.",
        ui.download_button("download_bullet_point", "Download bullet point"),
    ),
    ui_card(
        "Download the beamer source code `.tex` of the slide",
        ui.download_button("download_beamer", "Download beamer source code"),
    ),
    ui_card(
        "Download the PDF of slide.",
        ui.download_button("download_slide", "Download slide generated"),
    ),
)
|
58 |
+
|
59 |
+
|
60 |
+
def server(input, output, session):
    """Wire up the Shiny reactive handlers for the demo app.

    State: the module-level ``last_pdf_md5_preprocess_stage`` remembers the
    MD5 of the last successfully preprocessed upload so the same file is
    not processed twice.
    """

    @output
    @render.text
    def upload_file_status():
        """Report whether a PDF has been uploaded."""
        file_infos = input.input_pdf()
        # file_infos looks like:
        # [{'name': 'Poster.pdf', 'size': ..., 'type': 'application/pdf', 'datapath': '/tmp/.../0.pdf'}]
        if not file_infos:
            return "There is no file provided currently."
        elif file_infos[0]['type'] != 'application/pdf':
            return "the file you provide is not in PDF format, upload another one!"
        else:
            return "PDF file successfully uploaded!"

    @output
    @render.text
    def preprocess_status():
        """Tell the user whether the current upload still needs preprocessing."""
        global last_pdf_md5_preprocess_stage
        file_infos = input.input_pdf()

        file_md5 = compute_hash(file_infos[0]['datapath']) if file_infos else None

        if (file_infos is not None) and file_infos[0]['type'] == 'application/pdf' and (file_md5 != last_pdf_md5_preprocess_stage):
            return "Ready to preprocess the PDF!"
        elif file_md5 == last_pdf_md5_preprocess_stage:
            return "PDF already preprocessed! You can continue!"
        else:
            return "No PDF ready currently, please upload a PDF!"

    @output
    @render.text
    @reactive.event(input.preprocess_action)  # Take a dependency on the button
    async def preprocess_result():
        """Run the preprocessing pipeline on the uploaded PDF (once per file)."""
        global last_pdf_md5_preprocess_stage

        file_infos = input.input_pdf()
        if (file_infos is not None) and file_infos[0]['type'] == 'application/pdf':
            # Rename Shiny's anonymous upload (0.pdf) back to its original
            # name so downstream tools produce sensibly named artifacts.
            file_name = file_infos[0]['name']
            original_pdf_pth = file_infos[0]['datapath']
            dir_name = os.path.dirname(original_pdf_pth)
            new_pdf_pth = os.path.join(dir_name, file_name)
            os.rename(original_pdf_pth, new_pdf_pth)
            file_infos[0]['datapath'] = new_pdf_pth

            file_md5 = compute_hash(file_infos[0]['datapath'])

            try:
                if file_md5 != last_pdf_md5_preprocess_stage:
                    process_pdf(pdf_pth=new_pdf_pth, file_name=file_name)
                    last_pdf_md5_preprocess_stage = file_md5
                    return "Process successfully!"
                else:
                    return "Already processed!!!"
            except Exception:
                # Was a bare ``except:``, which also swallowed SystemExit /
                # KeyboardInterrupt.  The UI only needs a friendly message.
                return "Something wrong happen, please switch to another file!"
        else:
            return "No PDF provided!"

    @session.download()
    def download_preprocessed():
        """Zip the preprocessed output directory and hand the path to Shiny."""
        file_infos = input.input_pdf()
        if not file_infos:
            # Nothing uploaded yet: the original subscripted None and crashed.
            return None
        file_name = file_infos[0]['name'][:-4]
        preprocessed_file_dir = os.path.join(temp_dir, file_name)
        if os.path.exists(preprocessed_file_dir):  # this dir exists
            args = ['zip', '-r', file_name + '.zip', './' + file_name]
            call(args, cwd=temp_dir)
            return str(os.path.join(temp_dir, file_name + '.zip'))
|
131 |
+
|
132 |
+
|
133 |
+
# Bind the UI tree and server callbacks into the ASGI app that `shiny run` serves.
app = App(app_ui, server)
|
core/init_nltk.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
import nltk

# Pre-fetch the Punkt sentence tokenizer at Docker build time (see
# Dockerfile) so the app never downloads it while serving a request.
nltk.download('punkt')
|
core/init_sbt.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
import shutil
import sys
from subprocess import call

if __name__ == '__main__':
    # Warm up sbt by running pdffigures2 once over the bundled example PDF.
    # This forces sbt to resolve dependencies and compile during
    # `docker build`, so the first real request does not pay that cost.
    example_pdf_pth = '/project/example/example.pdf'
    pdffigures2_home = '/opt/pdffigures2'
    args = [
        'sbt',
        '-J-Xmx4G',
        'runMain org.allenai.pdffigures2.FigureExtractorBatchCli -e -q /project/example/ -m ./example/image/ -d ./example/json/ -s ./example/stat.json',
    ]
    # NOTE(review): example_pdf_pth, shutil and sys are unused here —
    # presumably leftovers from an earlier version; confirm before removing.
    call(args, cwd=pdffigures2_home)
|
core/read_pdf.py
ADDED
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# add module
|
2 |
+
import os
|
3 |
+
import shutil
|
4 |
+
import sys
|
5 |
+
from subprocess import call
|
6 |
+
|
7 |
+
from grobid_client.grobid_client import GrobidClient
|
8 |
+
|
9 |
+
module_path = os.path.abspath(os.path.join('/project'))
|
10 |
+
if module_path not in sys.path:
|
11 |
+
sys.path.append(module_path)
|
12 |
+
|
13 |
+
from core.tei import single_entry
|
14 |
+
|
15 |
+
# Deployment paths inside the Docker image: the Dockerfile in this repo
# installs the grobid python client and pdffigures2 under /opt and the app
# under /project, so those paths must be the active ones in the container.
# The previous revision left the author's local-dev paths
# (/home/quanta/...) active, which breaks the deployed image.
temp_dir = '/project/temp'
pdffigures2_home = '/opt/pdffigures2'
grobid_home = '/opt/grobid'
grobid_python_config_pth = '/opt/grobid_client_python/config.json'
# Local development overrides (uncomment when running outside the image):
# temp_dir = '/home/quanta/Projects/doc2slide-summarizer/temp'
# pdffigures2_home = '/home/quanta/Library/pdffigures2'
# grobid_home = '/home/quanta/Library/grobid/grobid-0.6.2'
# grobid_python_config_pth = '/home/quanta/Library/grobid_client_python/config.json'
|
23 |
+
|
24 |
+
|
25 |
+
def remove_temp_directory():
    """Delete the scratch directory tree if it currently exists."""
    if not os.path.exists(temp_dir):
        return
    shutil.rmtree(temp_dir)
|
28 |
+
|
29 |
+
|
30 |
+
def grobid_clident():
    """Build a fresh GrobidClient from the configured config file.

    NOTE(review): the name is a typo for "grobid_client"; kept as-is so
    existing callers in this module keep working.
    """
    return GrobidClient(config_path=grobid_python_config_pth)
|
32 |
+
|
33 |
+
|
34 |
+
def process_pdf(pdf_pth: str, file_name: str = None):
    """Preprocess one PDF end to end, leaving all artifacts under ``temp_dir``.

    Pipeline: convert the PDF to TEI XML with GROBID, extract figures with
    pdffigures2 (via sbt), merge title/abstract/text/headers/figures into a
    single JSON, then write ``<name>.preprocessed_text.json`` containing one
    plain-text blob per section.

    Args:
        pdf_pth: path to the source PDF.
        file_name: base name (including ``.pdf``) used for output naming.
            Defaults to ``os.path.basename(pdf_pth)`` — a backward-compatible
            generalization that also fixes the one-argument ``__main__`` call.
    """
    import json

    if file_name is None:
        file_name = os.path.basename(pdf_pth)

    client = grobid_clident()
    # Start from a clean scratch area so stale artifacts never leak in.
    remove_temp_directory()

    # splitext instead of file_name[:-4]: robust to names without an
    # extension (the old slice silently mangled them).
    name = os.path.splitext(file_name)[0]

    temp_pdf_dir = os.path.join(temp_dir, name, 'pdf')
    temp_xml_dir = os.path.join(temp_dir, name, 'xml')
    os.makedirs(temp_pdf_dir, exist_ok=True)
    os.makedirs(temp_xml_dir, exist_ok=True)

    # copy pdf to temp dir
    shutil.copy(pdf_pth, temp_pdf_dir)

    # process to xml
    client.process(
        'processFulltextDocument',
        temp_pdf_dir,
        tei_coordinates=True,
        force=True,
        verbose=True,
        output=temp_xml_dir,
    )

    xml_pth = os.path.join(temp_xml_dir, name + '.tei.xml')

    # Now scan figures.  pdffigures2 writes relative to its own checkout,
    # so create the output dirs there and move them back afterwards.
    fig_dir_profix = 'figure'
    img_dir_profix = 'figure/image'
    json_dir_profix = 'figure/json'

    tmp_fig_dir = os.path.join(pdffigures2_home, fig_dir_profix)
    os.makedirs(tmp_fig_dir, exist_ok=True)
    os.makedirs(os.path.join(pdffigures2_home, img_dir_profix), exist_ok=True)
    os.makedirs(os.path.join(pdffigures2_home, json_dir_profix), exist_ok=True)

    args = [
        'sbt',
        '-J-Xmx4G',
        'runMain org.allenai.pdffigures2.FigureExtractorBatchCli -e -q '
        + os.path.abspath(temp_pdf_dir) + '/'
        + ' -m ' + './' + img_dir_profix + '/'
        + ' -d ' + './' + json_dir_profix + '/'
        + ' -s ' + './' + fig_dir_profix + '/stat.json',
    ]
    call(args, cwd=pdffigures2_home)

    shutil.move(tmp_fig_dir, os.path.join(temp_dir, name))

    figure_json_pth = os.path.join(temp_dir, name, 'figure/json', name + '.json')

    # merge to single json
    _, title, abstract, text, headers, figures = single_entry('', xml_pth=xml_pth, fig_json_pth=figure_json_pth)

    temp_json_dir = os.path.join(temp_dir, name, 'json')
    os.makedirs(temp_json_dir, exist_ok=True)

    json_data = {
        'title': title,
        'abstract': abstract,
        'text': text,
        'headers': headers,
        'figures': figures,
    }

    json_pth = os.path.join(temp_json_dir, name + '.json')
    with open(json_pth, 'w') as f:
        json.dump(json_data, f, indent=4)

    # Build the per-section plain text directly from json_data instead of
    # re-reading the file we just wrote (same content, one less round trip).
    paper_length = len(json_data['text'])
    sections = [{
        'idx': i,
        'title': head['section'],
        'n': head['n'],
        'text': ' '.join(json_data['text'][idx]['string']
                         for idx in range(head['start'], min(head['end'] + 1, paper_length))),
        'matched_slides': [],
    } for i, head in enumerate(json_data['headers'])]

    with open(os.path.join(temp_dir, name, name + '.preprocessed_text.json'), 'w') as f:
        json.dump([sec['text'] for sec in sections], f, indent=4)
|
127 |
+
|
128 |
+
|
129 |
+
if __name__ == '__main__':
    # Smoke test inside the container.  BUG FIX: the original passed only
    # the path, but process_pdf's signature also requires the file name,
    # so this block always raised TypeError.
    process_pdf('/project/example/example.pdf', 'example.pdf')
|
core/tei.py
ADDED
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# code modified from https://github.com/IBM/document2slides/blob/main/sciduet-build/extract_papers.py
|
2 |
+
import os
|
3 |
+
from os import path
|
4 |
+
import json
|
5 |
+
from dataclasses import dataclass
|
6 |
+
from multiprocessing.pool import Pool
|
7 |
+
import pandas as pd
|
8 |
+
from pathlib import Path
|
9 |
+
from bs4 import BeautifulSoup
|
10 |
+
import nltk
|
11 |
+
from dataclasses import dataclass
|
12 |
+
|
13 |
+
|
14 |
+
@dataclass
class Person:
    """Author name parsed from a TEI <persName> element."""
    firstname: str   # <forename type="first">
    middlename: str  # <forename type="middle">, '' when absent
    surname: str     # <surname>
|
19 |
+
|
20 |
+
|
21 |
+
def read_tei(tei_file):
    """Parse a TEI XML file into a BeautifulSoup tree (lxml xml parser)."""
    with open(tei_file, 'r', encoding='utf-8') as handle:
        return BeautifulSoup(handle, 'xml')
|
25 |
+
|
26 |
+
|
27 |
+
def elem_to_text(elem, default=''):
    """Return the text content of a soup element, or *default* when falsy."""
    return elem.getText() if elem else default
|
32 |
+
|
33 |
+
|
34 |
+
class TEIFile(object):
    """Lazy accessor over a GROBID TEI XML file plus a pdffigures2 JSON.

    Properties cache their results in ``_``-prefixed attributes; ``text``
    also populates ``_headers`` (section index) as a side effect.
    """

    def __init__(self, xml_pth: str = None, fig_json_pth: str = ''):
        self.xml_pth = os.path.abspath(xml_pth)
        self.fig_json_pth = os.path.abspath(fig_json_pth)
        self.soup = read_tei(self.xml_pth)
        self._text = None
        self._title = ''
        self._abstract = ''
        self._headers = None
        self._figures = None

    @property
    def doi(self):
        """DOI string from the TEI header, or '' when absent."""
        idno_elem = self.soup.find('idno', type='DOI')
        if not idno_elem:
            return ''
        else:
            return idno_elem.getText()

    @property
    def title(self):
        """Paper title (cached)."""
        if not self._title:
            self._title = self.soup.title.getText()
        return self._title

    @property
    def abstract(self):
        """Abstract text with whitespace normalized (cached)."""
        if not self._abstract:
            abstract = self.soup.abstract.getText(separator=' ', strip=True)
            self._abstract = abstract
        return self._abstract

    @property
    def authors(self):
        """List of Person records for the authors in the TEI header."""
        authors_in_header = self.soup.analytic.find_all('author')

        result = []
        for author in authors_in_header:
            # Tag name casing differs between parsers; try both spellings.
            persname = author.persName
            if not persname:
                persname = author.persname
            if not persname:
                continue
            firstname = elem_to_text(persname.find("forename", type="first"))
            middlename = elem_to_text(persname.find("forename", type="middle"))
            surname = elem_to_text(persname.surname)
            person = Person(firstname, middlename, surname)
            result.append(person)

        return result

    @property
    def text(self):
        """Sentence-tokenized body as [{'id': i, 'string': s}, ...] (cached).

        Side effect: fills ``self._headers`` with one dict per section
        giving its title, numbering, and start/end sentence indices.
        """
        if not self._text:
            self._headers = []
            headerlist = self.soup.body.find_all("head")
            sections = []
            print(headerlist)
            for head in headerlist:
                if head.parent.name == 'div':
                    txt = head.parent.get_text(separator=' ', strip=True)
                    # the following is only valid for arabic numerals...
                    if head.get("n"):
                        sections.append([head.text, head.get('n'), txt])
                    else:
                        # Un-numbered heads get folded into the previous
                        # section.  NOTE(review): if the very first head has
                        # no "n", sections[-1] raises IndexError right after
                        # the error message — behavior preserved; confirm
                        # whether skipping would be preferable.
                        if len(sections) == 0:
                            print("Grobid processing error.")
                        sections[-1][2] += txt
            start = 0
            for i in sections:
                sent = nltk.tokenize.sent_tokenize(i[2])
                sec_dic = {'section': i[0], 'n': i[1], 'start': start, 'end': start + len(sent) - 1}
                self._headers.append(sec_dic)
                start += len(sent)
            plain_text = " ".join([i[2] for i in sections])
            self._text = [{'id': i, 'string': s} for i, s in enumerate(nltk.tokenize.sent_tokenize(plain_text))]
        return self._text

    @property
    def headers(self):
        """Section index built by ``text`` (computed on demand)."""
        if not self._headers:
            # BUG FIX: was ``self.text()`` — ``text`` is a property, so the
            # old code called the returned *list* and raised TypeError.
            # Merely accessing the attribute populates _headers.
            _ = self.text
        return self._headers

    @property
    def figures(self):
        """Figure metadata loaded from the pdffigures2 JSON (cached)."""
        if not self._figures:
            self._figures = []
            fn = self.fig_json_pth
            if not path.isfile(fn):
                # No figure JSON produced for this paper.
                return []
            with open(fn) as f:
                data = json.load(f)
            for i in data:
                elem = {'filename': i['renderURL'], 'caption': i['caption'], 'page': i['page'], 'bbox': i['regionBoundary']}
                self._figures.append(elem)
        return self._figures
|
141 |
+
|
142 |
+
|
143 |
+
def single_entry(uuid: str, xml_pth: str, fig_json_pth: str):
    """Parse one paper and return (uuid, title, abstract, text, headers, figures)."""
    parsed = TEIFile(xml_pth=xml_pth, fig_json_pth=fig_json_pth)
    return (
        uuid,
        parsed.title,
        parsed.abstract,
        parsed.text,
        parsed.headers,
        parsed.figures,
    )
|
requirements.txt
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# for processing pdf
|
2 |
+
nltk
|
3 |
+
pandas
|
4 |
+
tqdm
|
5 |
+
bs4
|
6 |
+
lxml
|
7 |
+
# for interaction with openai
|
8 |
+
openai
|
9 |
+
# for shiny
|
10 |
+
anyio==3.6.2
|
11 |
+
appdirs==1.4.4
|
12 |
+
asgiref==3.6.0
|
13 |
+
click==8.1.3
|
14 |
+
contextvars==2.4
|
15 |
+
contourpy==1.0.7
|
16 |
+
cycler==0.11.0
|
17 |
+
fonttools==4.39.3
|
18 |
+
h11==0.14.0
|
19 |
+
htmltools==0.2.1
|
20 |
+
idna==3.4
|
21 |
+
immutables==0.19
|
22 |
+
kiwisolver==1.4.4
|
23 |
+
linkify-it-py==2.0.0
|
24 |
+
markdown-it-py==2.2.0
|
25 |
+
matplotlib==3.7.1
|
26 |
+
mdit-py-plugins==0.3.5
|
27 |
+
mdurl==0.1.2
|
28 |
+
numpy==1.24.2
|
29 |
+
packaging==23.1
|
30 |
+
Pillow==9.5.0
|
31 |
+
pyparsing==3.0.9
|
32 |
+
python-dateutil==2.8.2
|
33 |
+
python-multipart==0.0.6
|
34 |
+
pytz==2023.3
|
35 |
+
seaborn==0.12.2
|
36 |
+
shiny==0.3.3
|
37 |
+
shinyswatch==0.2.3
|
38 |
+
six==1.16.0
|
39 |
+
sniffio==1.3.0
|
40 |
+
starlette==0.26.1
|
41 |
+
typing_extensions==4.5.0
|
42 |
+
tzdata==2023.3
|
43 |
+
uc-micro-py==1.0.1
|
44 |
+
uvicorn==0.21.1
|
45 |
+
websockets==11.0.2
|
46 |
+
XStatic-bootswatch==3.3.7.0
|
shiny_example_dockerfile
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Reference Dockerfile from the Shiny docs; kept as an example only —
# the real image is built from ./Dockerfile.
FROM python:3.9

WORKDIR /code

COPY ./requirements.txt /code/requirements.txt

RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

COPY . .

EXPOSE 7860

CMD ["shiny", "run", "app.py", "--host", "0.0.0.0", "--port", "7860"]
|
start_service.sh
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
# Launch the GROBID service in the background, then run the Shiny app in
# the foreground so it becomes the container's main process.
cd /opt/grobid && ./grobid-service/bin/grobid-service &
cd /project && shiny run /project/app.py --host 0.0.0.0 --port 7860
|