Spaces:
Sleeping
Sleeping
add all other all
Browse files- .gitattributes +1 -0
- .gitattributes copy +1 -0
- .gitignore +15 -0
- Dockerfile +41 -0
- README copy.md +14 -0
- app.py +133 -0
- core/init_nltk.py +3 -0
- core/init_sbt.py +14 -0
- core/read_pdf.py +130 -0
- core/tei.py +145 -0
- requirements.txt +46 -0
- shiny_example_dockerfile +13 -0
- start_service.sh +2 -0
.gitattributes
CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
35 |
+
example.pdf filter=lfs diff=lfs merge=lfs -text
|
.gitattributes copy
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
example.pdf filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Byte-compiled / optimized / DLL files
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
|
6 |
+
*.ipynb_checkpoints
|
7 |
+
axcell_data/papers/
|
8 |
+
|
9 |
+
# experiments output
|
10 |
+
experiments/
|
11 |
+
|
12 |
+
# output file
|
13 |
+
*.out
|
14 |
+
temp/
|
15 |
+
*.ipynb
|
Dockerfile
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Base image ships GROBID 0.7.3 plus its JVM; we layer LaTeX, sbt and the app on top.
FROM grobid/grobid:0.7.3
RUN apt-get update && \
    apt-get install wget unzip texlive-full nano git apt-transport-https curl gnupg -yqq
# -- installing grobid, python (torch and tensorflow), java and latex finished -- #

# now install grobid python client
WORKDIR /opt
RUN git clone https://github.com/kermitt2/grobid_client_python && \
    cd grobid_client_python && \
    python3 setup.py install

# install sbt and pdffigures2 (figure extraction runs through a Scala build)
RUN echo "deb https://repo.scala-sbt.org/scalasbt/debian all main" | tee /etc/apt/sources.list.d/sbt.list && \
    echo "deb https://repo.scala-sbt.org/scalasbt/debian /" | tee /etc/apt/sources.list.d/sbt_old.list && \
    curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x2EE0EA64E40A89B84B2DF73499E82A75642AC823" | gpg --no-default-keyring --keyring gnupg-ring:/etc/apt/trusted.gpg.d/scalasbt-release.gpg --import && \
    chmod 644 /etc/apt/trusted.gpg.d/scalasbt-release.gpg && \
    apt-get update && \
    apt-get install sbt -yqq
RUN git clone https://github.com/allenai/pdffigures2.git

# install python dependencies first so code-only changes reuse this layer
WORKDIR /project
ADD ./requirements.txt /project/requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt
# add code
ADD ./core/ /project/core/
# download the Punkt sentence tokenizer for nltk at build time
RUN python core/init_nltk.py
ADD ./example.pdf /project/example/example.pdf
# init sbt: warm up the pdffigures2 build on the bundled example PDF
RUN python core/init_sbt.py
# add app
ADD ./app.py /project/app.py
# 7860: shiny UI; 8070/8071: grobid service and its admin port
EXPOSE 7860
EXPOSE 8070
EXPOSE 8071
# add service starting script
ADD ./start_service.sh /project/start_service.sh

WORKDIR /project
CMD ["bash", "start_service.sh"]
|
README copy.md
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# docker integration of existing code
|
2 |
+
build docker
|
3 |
+
```sh
|
4 |
+
docker build --tag doc2slide -f Dockerfile .
|
5 |
+
```
|
6 |
+
run in interactive mode and use gpu
|
7 |
+
```sh
|
8 |
+
docker run --name test --gpus all -p 7080:8070 -p 7081:8071 -p 7860:7860 --rm -it doc2slide bash
|
9 |
+
```
|
10 |
+
current version of grobid 0.7.3 have python 3.8.10 and openjdk 17. To run as a service
|
11 |
+
```sh
|
12 |
+
docker run -d --name test --gpus all -p 7080:8070 -p 7081:8071 -p 7860:7860 --rm doc2slide
|
13 |
+
```
|
14 |
+
https://shinylive.io/py/examples gives a lot of examples
|
app.py
ADDED
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import hashlib
|
3 |
+
from subprocess import call
|
4 |
+
|
5 |
+
from shiny import App, reactive, render, ui
|
6 |
+
|
7 |
+
from core.read_pdf import process_pdf, temp_dir
|
8 |
+
|
9 |
+
last_pdf_md5_preprocess_stage = None
|
10 |
+
|
11 |
+
|
12 |
+
def compute_hash(file_pth):
    """Return the hex MD5 digest of the file at *file_pth*.

    Reads the file in fixed-size chunks so an arbitrarily large upload
    does not have to fit in memory at once (the original implementation
    read the whole file into a single bytes object).
    """
    md5 = hashlib.md5()
    with open(file_pth, 'rb') as file_to_check:
        # 1 MiB chunks keep peak memory bounded for big PDFs.
        for chunk in iter(lambda: file_to_check.read(1 << 20), b''):
            md5.update(chunk)
    return md5.hexdigest()
|
19 |
+
|
20 |
+
|
21 |
+
def ui_card(title, *args):
    """Wrap *args* in a Bootstrap card with *title* as its header.

    Returns a one-element tuple, matching how callers splice the card
    into the page layout.
    """
    header = ui.div(title, class_="card-header")
    body = ui.div({"class": "card-body"}, *args)
    card = ui.div({"class": "card mb-4"}, header, body)
    return (card, )
|
27 |
+
|
28 |
+
|
29 |
+
# Top-level page layout for the Document2Slide demo: one card to upload a
# PDF, one to trigger preprocessing, and three download cards for the
# Markdown bullet points, the beamer .tex source, and the rendered slides.
app_ui = ui.page_fluid(
    ui.h1("Document2Slide Demo"),
    ui_card(
        "Upload PDF",
        ui.input_file("input_pdf", "Choose a .pdf file to upload:", multiple=True),
        ui.output_text("upload_file_status", ),
    ),
    ui_card(
        "Preprocess",
        ui.p(
            ui.input_action_button("preprocess_action", "Preprocess file", class_="btn-primary"),
            ui.output_text("preprocess_result", ),
        ),
        ui.output_text("preprocess_status", ),
        ui.download_button("download_preprocessed", "Download preprocessed file"),
    ),
    ui_card(
        "Download the bullet points in Markdown format.",
        ui.download_button("download_bullet_point", "Download bullet point"),
    ),
    ui_card(
        "Download the beamer source code `.tex` of the slide",
        ui.download_button("download_beamer", "Download beamer source code"),
    ),
    ui_card(
        "Download the PDF of slide.",
        ui.download_button("download_slide", "Download slide generated"),
    ),
)
|
58 |
+
|
59 |
+
|
60 |
+
def server(input, output, session):
    """Wire up the Shiny reactive handlers for the demo app.

    State: the module-level ``last_pdf_md5_preprocess_stage`` remembers the
    MD5 of the last successfully preprocessed upload so the same file is
    not processed twice.
    """

    @output
    @render.text
    def upload_file_status():
        """Report whether a PDF has been uploaded."""
        file_infos = input.input_pdf()
        # file_infos looks like:
        # [{'name': 'Poster.pdf', 'size': ..., 'type': 'application/pdf', 'datapath': '/tmp/.../0.pdf'}]
        if not file_infos:
            return "There is no file provided currently."
        elif file_infos[0]['type'] != 'application/pdf':
            return "the file you provide is not in PDF format, upload another one!"
        else:
            return "PDF file successfully uploaded!"

    @output
    @render.text
    def preprocess_status():
        """Tell the user whether the current upload still needs preprocessing."""
        global last_pdf_md5_preprocess_stage
        file_infos = input.input_pdf()

        file_md5 = compute_hash(file_infos[0]['datapath']) if file_infos else None

        if (file_infos is not None) and file_infos[0]['type'] == 'application/pdf' and (file_md5 != last_pdf_md5_preprocess_stage):
            return "Ready to preprocess the PDF!"
        elif file_md5 == last_pdf_md5_preprocess_stage:
            return "PDF already preprocessed! You can continue!"
        else:
            return "No PDF ready currently, please upload a PDF!"

    @output
    @render.text
    @reactive.event(input.preprocess_action)  # Take a dependency on the button
    async def preprocess_result():
        """Run the preprocessing pipeline on the uploaded PDF (once per file)."""
        global last_pdf_md5_preprocess_stage

        file_infos = input.input_pdf()
        if (file_infos is not None) and file_infos[0]['type'] == 'application/pdf':
            # Rename Shiny's anonymous upload (0.pdf) back to its original
            # name so downstream tools produce sensibly named artifacts.
            file_name = file_infos[0]['name']
            original_pdf_pth = file_infos[0]['datapath']
            dir_name = os.path.dirname(original_pdf_pth)
            new_pdf_pth = os.path.join(dir_name, file_name)
            os.rename(original_pdf_pth, new_pdf_pth)
            file_infos[0]['datapath'] = new_pdf_pth

            file_md5 = compute_hash(file_infos[0]['datapath'])

            try:
                if file_md5 != last_pdf_md5_preprocess_stage:
                    process_pdf(pdf_pth=new_pdf_pth, file_name=file_name)
                    last_pdf_md5_preprocess_stage = file_md5
                    return "Process successfully!"
                else:
                    return "Already processed!!!"
            except Exception:
                # Was a bare ``except:``, which also swallowed SystemExit /
                # KeyboardInterrupt.  The UI only needs a friendly message.
                return "Something wrong happen, please switch to another file!"
        else:
            return "No PDF provided!"

    @session.download()
    def download_preprocessed():
        """Zip the preprocessed output directory and hand the path to Shiny."""
        file_infos = input.input_pdf()
        if not file_infos:
            # Nothing uploaded yet: the original subscripted None and crashed.
            return None
        file_name = file_infos[0]['name'][:-4]
        preprocessed_file_dir = os.path.join(temp_dir, file_name)
        if os.path.exists(preprocessed_file_dir):  # this dir exists
            args = ['zip', '-r', file_name + '.zip', './' + file_name]
            call(args, cwd=temp_dir)
            return str(os.path.join(temp_dir, file_name + '.zip'))
|
131 |
+
|
132 |
+
|
133 |
+
# Bind the UI tree and server callbacks into the ASGI app that `shiny run` serves.
app = App(app_ui, server)
|
core/init_nltk.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
import nltk

# Pre-fetch the Punkt sentence tokenizer at Docker build time (see
# Dockerfile) so the app never downloads it while serving a request.
nltk.download('punkt')
|
core/init_sbt.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
import shutil
import sys
from subprocess import call

if __name__ == '__main__':
    # Warm up sbt by running pdffigures2 once over the bundled example PDF.
    # This forces sbt to resolve dependencies and compile during
    # `docker build`, so the first real request does not pay that cost.
    example_pdf_pth = '/project/example/example.pdf'
    pdffigures2_home = '/opt/pdffigures2'
    args = [
        'sbt',
        '-J-Xmx4G',
        'runMain org.allenai.pdffigures2.FigureExtractorBatchCli -e -q /project/example/ -m ./example/image/ -d ./example/json/ -s ./example/stat.json',
    ]
    # NOTE(review): example_pdf_pth, shutil and sys are unused here —
    # presumably leftovers from an earlier version; confirm before removing.
    call(args, cwd=pdffigures2_home)
|
core/read_pdf.py
ADDED
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# add module
|
2 |
+
import os
|
3 |
+
import shutil
|
4 |
+
import sys
|
5 |
+
from subprocess import call
|
6 |
+
|
7 |
+
from grobid_client.grobid_client import GrobidClient
|
8 |
+
|
9 |
+
module_path = os.path.abspath(os.path.join('/project'))
|
10 |
+
if module_path not in sys.path:
|
11 |
+
sys.path.append(module_path)
|
12 |
+
|
13 |
+
from core.tei import single_entry
|
14 |
+
|
15 |
+
# Deployment paths inside the Docker image: the Dockerfile in this repo
# installs the grobid python client and pdffigures2 under /opt and the app
# under /project, so those paths must be the active ones in the container.
# The previous revision left the author's local-dev paths
# (/home/quanta/...) active, which breaks the deployed image.
temp_dir = '/project/temp'
pdffigures2_home = '/opt/pdffigures2'
grobid_home = '/opt/grobid'
grobid_python_config_pth = '/opt/grobid_client_python/config.json'
# Local development overrides (uncomment when running outside the image):
# temp_dir = '/home/quanta/Projects/doc2slide-summarizer/temp'
# pdffigures2_home = '/home/quanta/Library/pdffigures2'
# grobid_home = '/home/quanta/Library/grobid/grobid-0.6.2'
# grobid_python_config_pth = '/home/quanta/Library/grobid_client_python/config.json'
|
23 |
+
|
24 |
+
|
25 |
+
def remove_temp_directory():
    """Delete the scratch directory tree if it currently exists."""
    if not os.path.exists(temp_dir):
        return
    shutil.rmtree(temp_dir)
|
28 |
+
|
29 |
+
|
30 |
+
def grobid_clident():
    """Build a fresh GrobidClient from the configured config file.

    NOTE(review): the name is a typo for "grobid_client"; kept as-is so
    existing callers in this module keep working.
    """
    return GrobidClient(config_path=grobid_python_config_pth)
|
32 |
+
|
33 |
+
|
34 |
+
def process_pdf(pdf_pth: str, file_name: str = None):
    """Preprocess one PDF end to end, leaving all artifacts under ``temp_dir``.

    Pipeline: convert the PDF to TEI XML with GROBID, extract figures with
    pdffigures2 (via sbt), merge title/abstract/text/headers/figures into a
    single JSON, then write ``<name>.preprocessed_text.json`` containing one
    plain-text blob per section.

    Args:
        pdf_pth: path to the source PDF.
        file_name: base name (including ``.pdf``) used for output naming.
            Defaults to ``os.path.basename(pdf_pth)`` — a backward-compatible
            generalization that also fixes the one-argument ``__main__`` call.
    """
    import json

    if file_name is None:
        file_name = os.path.basename(pdf_pth)

    client = grobid_clident()
    # Start from a clean scratch area so stale artifacts never leak in.
    remove_temp_directory()

    # splitext instead of file_name[:-4]: robust to names without an
    # extension (the old slice silently mangled them).
    name = os.path.splitext(file_name)[0]

    temp_pdf_dir = os.path.join(temp_dir, name, 'pdf')
    temp_xml_dir = os.path.join(temp_dir, name, 'xml')
    os.makedirs(temp_pdf_dir, exist_ok=True)
    os.makedirs(temp_xml_dir, exist_ok=True)

    # copy pdf to temp dir
    shutil.copy(pdf_pth, temp_pdf_dir)

    # process to xml
    client.process(
        'processFulltextDocument',
        temp_pdf_dir,
        tei_coordinates=True,
        force=True,
        verbose=True,
        output=temp_xml_dir,
    )

    xml_pth = os.path.join(temp_xml_dir, name + '.tei.xml')

    # Now scan figures.  pdffigures2 writes relative to its own checkout,
    # so create the output dirs there and move them back afterwards.
    fig_dir_profix = 'figure'
    img_dir_profix = 'figure/image'
    json_dir_profix = 'figure/json'

    tmp_fig_dir = os.path.join(pdffigures2_home, fig_dir_profix)
    os.makedirs(tmp_fig_dir, exist_ok=True)
    os.makedirs(os.path.join(pdffigures2_home, img_dir_profix), exist_ok=True)
    os.makedirs(os.path.join(pdffigures2_home, json_dir_profix), exist_ok=True)

    args = [
        'sbt',
        '-J-Xmx4G',
        'runMain org.allenai.pdffigures2.FigureExtractorBatchCli -e -q '
        + os.path.abspath(temp_pdf_dir) + '/'
        + ' -m ' + './' + img_dir_profix + '/'
        + ' -d ' + './' + json_dir_profix + '/'
        + ' -s ' + './' + fig_dir_profix + '/stat.json',
    ]
    call(args, cwd=pdffigures2_home)

    shutil.move(tmp_fig_dir, os.path.join(temp_dir, name))

    figure_json_pth = os.path.join(temp_dir, name, 'figure/json', name + '.json')

    # merge to single json
    _, title, abstract, text, headers, figures = single_entry('', xml_pth=xml_pth, fig_json_pth=figure_json_pth)

    temp_json_dir = os.path.join(temp_dir, name, 'json')
    os.makedirs(temp_json_dir, exist_ok=True)

    json_data = {
        'title': title,
        'abstract': abstract,
        'text': text,
        'headers': headers,
        'figures': figures,
    }

    json_pth = os.path.join(temp_json_dir, name + '.json')
    with open(json_pth, 'w') as f:
        json.dump(json_data, f, indent=4)

    # Build the per-section plain text directly from json_data instead of
    # re-reading the file we just wrote (same content, one less round trip).
    paper_length = len(json_data['text'])
    sections = [{
        'idx': i,
        'title': head['section'],
        'n': head['n'],
        'text': ' '.join(json_data['text'][idx]['string']
                         for idx in range(head['start'], min(head['end'] + 1, paper_length))),
        'matched_slides': [],
    } for i, head in enumerate(json_data['headers'])]

    with open(os.path.join(temp_dir, name, name + '.preprocessed_text.json'), 'w') as f:
        json.dump([sec['text'] for sec in sections], f, indent=4)
|
127 |
+
|
128 |
+
|
129 |
+
if __name__ == '__main__':
    # Smoke test inside the container.  BUG FIX: the original passed only
    # the path, but process_pdf's signature also requires the file name,
    # so this block always raised TypeError.
    process_pdf('/project/example/example.pdf', 'example.pdf')
|
core/tei.py
ADDED
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# code modified from https://github.com/IBM/document2slides/blob/main/sciduet-build/extract_papers.py
|
2 |
+
import os
|
3 |
+
from os import path
|
4 |
+
import json
|
5 |
+
from dataclasses import dataclass
|
6 |
+
from multiprocessing.pool import Pool
|
7 |
+
import pandas as pd
|
8 |
+
from pathlib import Path
|
9 |
+
from bs4 import BeautifulSoup
|
10 |
+
import nltk
|
11 |
+
from dataclasses import dataclass
|
12 |
+
|
13 |
+
|
14 |
+
@dataclass
class Person:
    """Author name parsed from a TEI <persName> element."""
    firstname: str   # <forename type="first">
    middlename: str  # <forename type="middle">, '' when absent
    surname: str     # <surname>
|
19 |
+
|
20 |
+
|
21 |
+
def read_tei(tei_file):
    """Parse a TEI XML file into a BeautifulSoup tree (lxml xml parser)."""
    with open(tei_file, 'r', encoding='utf-8') as handle:
        return BeautifulSoup(handle, 'xml')
|
25 |
+
|
26 |
+
|
27 |
+
def elem_to_text(elem, default=''):
    """Return the text content of a soup element, or *default* when falsy."""
    return elem.getText() if elem else default
|
32 |
+
|
33 |
+
|
34 |
+
class TEIFile(object):
    """Lazy accessor over a GROBID TEI XML file plus a pdffigures2 JSON.

    Properties cache their results in ``_``-prefixed attributes; ``text``
    also populates ``_headers`` (section index) as a side effect.
    """

    def __init__(self, xml_pth: str = None, fig_json_pth: str = ''):
        self.xml_pth = os.path.abspath(xml_pth)
        self.fig_json_pth = os.path.abspath(fig_json_pth)
        self.soup = read_tei(self.xml_pth)
        self._text = None
        self._title = ''
        self._abstract = ''
        self._headers = None
        self._figures = None

    @property
    def doi(self):
        """DOI string from the TEI header, or '' when absent."""
        idno_elem = self.soup.find('idno', type='DOI')
        if not idno_elem:
            return ''
        else:
            return idno_elem.getText()

    @property
    def title(self):
        """Paper title (cached)."""
        if not self._title:
            self._title = self.soup.title.getText()
        return self._title

    @property
    def abstract(self):
        """Abstract text with whitespace normalized (cached)."""
        if not self._abstract:
            abstract = self.soup.abstract.getText(separator=' ', strip=True)
            self._abstract = abstract
        return self._abstract

    @property
    def authors(self):
        """List of Person records for the authors in the TEI header."""
        authors_in_header = self.soup.analytic.find_all('author')

        result = []
        for author in authors_in_header:
            # Tag name casing differs between parsers; try both spellings.
            persname = author.persName
            if not persname:
                persname = author.persname
            if not persname:
                continue
            firstname = elem_to_text(persname.find("forename", type="first"))
            middlename = elem_to_text(persname.find("forename", type="middle"))
            surname = elem_to_text(persname.surname)
            person = Person(firstname, middlename, surname)
            result.append(person)

        return result

    @property
    def text(self):
        """Sentence-tokenized body as [{'id': i, 'string': s}, ...] (cached).

        Side effect: fills ``self._headers`` with one dict per section
        giving its title, numbering, and start/end sentence indices.
        """
        if not self._text:
            self._headers = []
            headerlist = self.soup.body.find_all("head")
            sections = []
            print(headerlist)
            for head in headerlist:
                if head.parent.name == 'div':
                    txt = head.parent.get_text(separator=' ', strip=True)
                    # the following is only valid for arabic numerals...
                    if head.get("n"):
                        sections.append([head.text, head.get('n'), txt])
                    else:
                        # Un-numbered heads get folded into the previous
                        # section.  NOTE(review): if the very first head has
                        # no "n", sections[-1] raises IndexError right after
                        # the error message — behavior preserved; confirm
                        # whether skipping would be preferable.
                        if len(sections) == 0:
                            print("Grobid processing error.")
                        sections[-1][2] += txt
            start = 0
            for i in sections:
                sent = nltk.tokenize.sent_tokenize(i[2])
                sec_dic = {'section': i[0], 'n': i[1], 'start': start, 'end': start + len(sent) - 1}
                self._headers.append(sec_dic)
                start += len(sent)
            plain_text = " ".join([i[2] for i in sections])
            self._text = [{'id': i, 'string': s} for i, s in enumerate(nltk.tokenize.sent_tokenize(plain_text))]
        return self._text

    @property
    def headers(self):
        """Section index built by ``text`` (computed on demand)."""
        if not self._headers:
            # BUG FIX: was ``self.text()`` — ``text`` is a property, so the
            # old code called the returned *list* and raised TypeError.
            # Merely accessing the attribute populates _headers.
            _ = self.text
        return self._headers

    @property
    def figures(self):
        """Figure metadata loaded from the pdffigures2 JSON (cached)."""
        if not self._figures:
            self._figures = []
            fn = self.fig_json_pth
            if not path.isfile(fn):
                # No figure JSON produced for this paper.
                return []
            with open(fn) as f:
                data = json.load(f)
            for i in data:
                elem = {'filename': i['renderURL'], 'caption': i['caption'], 'page': i['page'], 'bbox': i['regionBoundary']}
                self._figures.append(elem)
        return self._figures
|
141 |
+
|
142 |
+
|
143 |
+
def single_entry(uuid: str, xml_pth: str, fig_json_pth: str):
    """Parse one paper and return (uuid, title, abstract, text, headers, figures)."""
    parsed = TEIFile(xml_pth=xml_pth, fig_json_pth=fig_json_pth)
    return (
        uuid,
        parsed.title,
        parsed.abstract,
        parsed.text,
        parsed.headers,
        parsed.figures,
    )
|
requirements.txt
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# for processing pdf
|
2 |
+
nltk
|
3 |
+
pandas
|
4 |
+
tqdm
|
5 |
+
bs4
|
6 |
+
lxml
|
7 |
+
# for interaction with openai
|
8 |
+
openai
|
9 |
+
# for shiny
|
10 |
+
anyio==3.6.2
|
11 |
+
appdirs==1.4.4
|
12 |
+
asgiref==3.6.0
|
13 |
+
click==8.1.3
|
14 |
+
contextvars==2.4
|
15 |
+
contourpy==1.0.7
|
16 |
+
cycler==0.11.0
|
17 |
+
fonttools==4.39.3
|
18 |
+
h11==0.14.0
|
19 |
+
htmltools==0.2.1
|
20 |
+
idna==3.4
|
21 |
+
immutables==0.19
|
22 |
+
kiwisolver==1.4.4
|
23 |
+
linkify-it-py==2.0.0
|
24 |
+
markdown-it-py==2.2.0
|
25 |
+
matplotlib==3.7.1
|
26 |
+
mdit-py-plugins==0.3.5
|
27 |
+
mdurl==0.1.2
|
28 |
+
numpy==1.24.2
|
29 |
+
packaging==23.1
|
30 |
+
Pillow==9.5.0
|
31 |
+
pyparsing==3.0.9
|
32 |
+
python-dateutil==2.8.2
|
33 |
+
python-multipart==0.0.6
|
34 |
+
pytz==2023.3
|
35 |
+
seaborn==0.12.2
|
36 |
+
shiny==0.3.3
|
37 |
+
shinyswatch==0.2.3
|
38 |
+
six==1.16.0
|
39 |
+
sniffio==1.3.0
|
40 |
+
starlette==0.26.1
|
41 |
+
typing_extensions==4.5.0
|
42 |
+
tzdata==2023.3
|
43 |
+
uc-micro-py==1.0.1
|
44 |
+
uvicorn==0.21.1
|
45 |
+
websockets==11.0.2
|
46 |
+
XStatic-bootswatch==3.3.7.0
|
shiny_example_dockerfile
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Reference Dockerfile from the Shiny docs; kept as an example only —
# the real image is built from ./Dockerfile.
FROM python:3.9

WORKDIR /code

COPY ./requirements.txt /code/requirements.txt

RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

COPY . .

EXPOSE 7860

CMD ["shiny", "run", "app.py", "--host", "0.0.0.0", "--port", "7860"]
|
start_service.sh
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
# Launch the GROBID service in the background, then run the Shiny app in
# the foreground so it becomes the container's main process.
cd /opt/grobid && ./grobid-service/bin/grobid-service &
cd /project && shiny run /project/app.py --host 0.0.0.0 --port 7860
|