qqqwt committed on
Commit
e806188
1 Parent(s): d8cd291

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +634 -0
  2. requirements.txt +9 -0
app.py ADDED
@@ -0,0 +1,634 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import os
3
+ import re
4
+ import datetime
5
+ import arxiv
6
+ import openai, tenacity
7
+ import base64, requests
8
+ import argparse
9
+ import configparser
10
+ import fitz, io, os
11
+ from PIL import Image
12
+ import gradio
13
+ import markdown
14
+
15
class Paper:
    """A PDF paper together with the section texts that could be located in it.

    When ``title`` is empty the PDF at ``path`` is opened immediately: the
    title is guessed from the largest fonts and the document is parsed.
    When a title is supplied nothing is read until ``parse_pdf`` is called.
    """

    def __init__(self, path, title='', url='', abs='', authers=None, sl=None):
        """Store the metadata and, if no title is given, parse the PDF.

        :param path: location of the PDF, passed unchanged to ``fitz.open``.
        :param title: known title; an empty string triggers title detection
            and parsing.
        :param url: link to the paper.
        :param abs: abstract text.
        :param authers: list of author names (a fresh list when omitted).
        :param sl: section names to look for (a fresh list when omitted).
        """
        self.url = url  # link to the paper
        self.path = path  # path of the pdf
        # ``None`` defaults avoid sharing one list object between instances.
        self.sl = [] if sl is None else sl
        self.authers = [] if authers is None else authers
        self.abs = abs
        self.section_names = []  # section titles
        self.section_texts = {}  # section contents
        self.roman_num = ["I", "II", 'III', "IV", "V", "VI", "VII", "VIII", "IIX", "IX", "X"]
        self.digit_num = [str(d+1) for d in range(10)]
        self.first_image = ''
        if title == '':
            # Open only long enough to read the title; parse_pdf() opens its
            # own handle, so this one must be closed to avoid leaking it.
            self.pdf = fitz.open(self.path)
            try:
                self.title = self.get_title()
            finally:
                self.pdf.close()
            self.parse_pdf()
        else:
            self.title = title

    def parse_pdf(self):
        """Read every page and build the section -> page and section -> text maps.

        Sets ``text_list``, ``all_text``, ``section_page_dict`` and
        ``section_text_dict`` (which also receives a ``"title"`` entry).
        The document is closed again even if parsing fails.
        """
        self.pdf = fitz.open(self.path)
        try:
            self.text_list = [page.get_text() for page in self.pdf]
            self.all_text = ' '.join(self.text_list)
            # section name -> page index
            self.section_page_dict = self._get_all_page_index()
            print("section_page_dict", self.section_page_dict)
            # section name -> section text
            self.section_text_dict = self._get_all_page()
            self.section_text_dict.update({"title": self.title})
        finally:
            self.pdf.close()

    def get_image_path(self, image_path=''):
        """Save the image with the most pixels in the PDF, scaled to 480 px.

        The image is resized so that its longer side is 480 pixels and is
        written as ``image.<ext>`` inside ``image_path``.

        :param image_path: directory in which the image file is written.
        :return: ``(path_of_saved_image, extension)``, or ``(None, None)``
            when the PDF contains no image.
        """
        max_size = 0
        largest_image = None
        largest_ext = None
        with fitz.Document(self.path) as my_pdf_file:
            for page in my_pdf_file:
                for image_info in page.get_images():
                    # The first item of each entry is the image xref.
                    base_image = my_pdf_file.extract_image(image_info[0])
                    candidate = Image.open(io.BytesIO(base_image["image"]))
                    image_size = candidate.size[0] * candidate.size[1]
                    if image_size > max_size:
                        max_size = image_size
                        largest_image = candidate
                        # Remember the extension together with the image so the
                        # file name matches the image that is actually saved.
                        largest_ext = base_image["ext"]

        if largest_image is None:
            return None, None

        im_path = os.path.join(image_path, f"image.{largest_ext}")
        print("im_path:", im_path)

        max_pix = 480
        width, height = largest_image.size
        if width > height:
            newsize = (max_pix, max(1, int(height * (max_pix/width))))
        else:
            newsize = (max(1, int(width * (max_pix/height))), max_pix)
        # Saving by path lets PIL open and close the file itself.
        largest_image.resize(newsize).save(im_path)
        return im_path, largest_ext

    def get_chapter_names(self,):
        """Return the lines of the PDF that look like numbered chapter headings.

        A line qualifies when it contains a dot, splits into 2-4 parts on
        spaces and on dots, and the text before the first dot is one of
        ``self.roman_num`` or ``self.digit_num``.
        """
        with fitz.open(self.path) as doc:
            all_text = ''.join(page.get_text() for page in doc)

        chapter_names = []
        for line in all_text.split('\n'):
            if '.' not in line:
                continue
            point_split_list = line.split('.')
            space_split_list = line.split(' ')
            if not (1 < len(space_split_list) < 5 and 1 < len(point_split_list) < 5):
                continue
            if point_split_list[0] in self.roman_num or point_split_list[0] in self.digit_num:
                print("line:", line)
                chapter_names.append(line)
        return chapter_names

    def get_title(self):
        """Guess the title from the text set in the two largest font sizes.

        Only the first span of the first line of every text block of
        ``self.pdf`` is considered. Strings of at most 4 characters and
        strings containing ``"arXiv"`` are ignored; the rest are joined with
        single spaces in document order.
        """
        doc = self.pdf
        leading_spans = []  # (font size, text) of each text block's first span
        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                if block["type"] != 0:  # 0 marks a text block
                    continue
                lines = block.get("lines") or []
                spans = lines[0].get("spans") if lines else None
                if not spans:
                    # Nothing to measure in an empty block.
                    continue
                leading_spans.append((spans[0]["size"], spans[0]["text"]))

        font_sizes = sorted([0] + [size for size, _ in leading_spans])
        print("max_font_sizes", font_sizes[-10:])
        largest = font_sizes[-1]
        runner_up = font_sizes[-2] if len(font_sizes) > 1 else largest

        title_parts = []
        for font_size, cur_string in leading_spans:
            near_top = abs(font_size - largest) < 0.3 or abs(font_size - runner_up) < 0.3
            if near_top and len(cur_string) > 4 and "arXiv" not in cur_string:
                title_parts.append(cur_string)
        return ' '.join(title_parts).replace('\n', ' ')

    def _get_all_page_index(self):
        """Map every section name of ``self.sl`` found in ``self.pdf`` to a page index.

        ``"Abstract"`` matches anywhere in the page text; every name matches
        when it (or its upper-case form) is directly followed by a newline.
        If a name matches on several pages the last page wins, while the
        key keeps the position of its first match.
        """
        section_page_dict = {}
        for page_index, page in enumerate(self.pdf):
            cur_text = page.get_text()
            for section_name in self.sl:
                found = (
                    ("Abstract" == section_name and section_name in cur_text)
                    or section_name + '\n' in cur_text
                    or section_name.upper() + '\n' in cur_text
                )
                if found:
                    section_page_dict[section_name] = page_index
        return section_page_dict

    @staticmethod
    def _find_heading(page_text, name, start=0):
        """Return the offset of section heading ``name`` in ``page_text``, or -1.

        Forms followed by a newline are tried first because that is how
        headings are detected; the bare name is the fallback.
        """
        for candidate in (name + '\n', name.upper() + '\n', name, name.upper()):
            position = page_text.find(candidate, start)
            if position != -1:
                return position
        return -1

    def _get_all_page(self):
        """Collect the text of every located section except the first one.

        Sections are taken in the order of ``self.section_page_dict``. The
        first entry is skipped; each later section runs from its heading up
        to the heading of the next located section (or to the end of the
        document for the last one).

        Returns:
            section_dict (dict): section name -> section text, with
            hyphenated line breaks joined and newlines turned into spaces.
        """
        text_list = [page.get_text() for page in self.pdf]
        section_names = list(self.section_page_dict)
        section_dict = {}

        for sec_index, sec_name in enumerate(section_names):
            print(sec_index, sec_name, self.section_page_dict[sec_name])
            if sec_index <= 0:
                continue

            start_page = self.section_page_dict[sec_name]
            if sec_index < len(section_names) - 1:
                next_sec = section_names[sec_index + 1]
                end_page = self.section_page_dict[next_sec]
            else:
                next_sec = None
                end_page = len(text_list)
            print("start_page, end_page:", start_page, end_page)

            pieces = []
            # The page holding the next heading is included so the text in
            # front of that heading is not lost.
            last_page = min(end_page, len(text_list) - 1)
            for page_i in range(start_page, last_page + 1):
                page_text = text_list[page_i]
                begin, stop = 0, len(page_text)
                if page_i == start_page:
                    found = self._find_heading(page_text, sec_name)
                    if found != -1:
                        begin = found
                if next_sec is not None and page_i == end_page:
                    # On a shared page, look for the next heading only after
                    # the current one.
                    search_from = begin + 1 if page_i == start_page else 0
                    found = self._find_heading(page_text, next_sec, search_from)
                    if found != -1:
                        stop = found
                pieces.append(page_text[begin:stop])

            section_dict[sec_name] = ''.join(pieces).replace('-\n', '').replace('\n', ' ')
        return section_dict
263
+
264
+ # 定义Reader类
265
class Reader:
    """Searches arXiv, downloads papers and summarises them with the OpenAI chat API."""

    def __init__(self, key_word='', query='', filter_keys='',
                 root_path='./',
                 gitee_key='',
                 sort=arxiv.SortCriterion.SubmittedDate, user_name='defualt', language='cn', key=''):
        """Store the reader settings.

        :param key_word: research field inserted into the system prompts.
        :param query: arXiv search query.
        :param filter_keys: space separated words that must all occur in an abstract.
        :param root_path: prefix of the directory that receives downloaded PDFs.
        :param gitee_key: accepted for compatibility; not used by this constructor.
        :param sort: arXiv sort criterion.
        :param user_name: name of the reader.
        :param language: language chosen by the reader.
        :param key: OpenAI API key.
        """
        self.key = str(key)  # OpenAI key
        self.user_name = user_name  # reader name
        self.key_word = key_word  # keywords the reader is interested in
        self.query = query  # search query typed by the reader
        self.sort = sort  # sort order chosen by the reader
        self.language = language  # language chosen by the reader
        self.filter_keys = filter_keys  # keywords used to filter abstracts
        self.root_path = root_path
        self.file_format = 'md'  # or 'txt'; must be 'md' when images are embedded
        self.save_image = False
        # NOTE(review): self.config is never assigned in this class, so this
        # branch and upload_gitee() would fail if image saving were enabled.
        if self.save_image:
            self.gitee_key = self.config.get('Gitee', 'api')
        else:
            self.gitee_key = ''

    def get_arxiv(self, max_results=30):
        """Build the arXiv search for ``self.query``, sorted descending by ``self.sort``."""
        search = arxiv.Search(query=self.query,
                              max_results=max_results,
                              sort_by=self.sort,
                              sort_order=arxiv.SortOrder.Descending,
                              )
        return search

    def filter_arxiv(self, max_results=30):
        """Return the search results whose abstract contains every filter keyword.

        Matching is case-insensitive and keywords are separated by single
        spaces in ``self.filter_keys``.
        """
        search = self.get_arxiv(max_results=max_results)
        print("all search:")
        # Iterate the results once and keep them; a second call to
        # search.results() would repeat the query.
        all_results = []
        for index, result in enumerate(search.results()):
            print(index, result.title, result.updated)
            all_results.append(result)

        print("filter_keys:", self.filter_keys)
        wanted = [f_key.lower() for f_key in self.filter_keys.split(" ")]
        filter_results = []
        # A paper is kept only if every keyword is found in its abstract.
        for result in all_results:
            abs_text = result.summary.replace('-\n', '-').replace('\n', ' ').lower()
            if all(f_key in abs_text for f_key in wanted):
                filter_results.append(result)

        print("filter_results:", len(filter_results))
        print("filter_papers:")
        for index, result in enumerate(filter_results):
            print(index, result.title, result.updated)
        return filter_results

    def validateTitle(self, title):
        """Replace characters that are awkward in file names with underscores."""
        rstr = r"[\/\\\:\*\?\"\<\>\|]"  # '/ \ : * ? " < > |'
        new_title = re.sub(rstr, "_", title)
        return new_title

    def download_pdf(self, filter_results):
        """Download and parse every result, returning the papers that succeeded.

        Files go to ``<root_path>pdf_files/<query>-<date>``. A paper that
        fails to download or parse is reported and skipped.
        """
        date_str = str(datetime.datetime.now())[:13].replace(' ', '-')
        path = self.root_path + 'pdf_files/' + self.query.replace('au: ', '').replace('title: ', '').replace('ti: ', '').replace(':', ' ')[:25] + '-' + date_str
        try:
            os.makedirs(path, exist_ok=True)
        except OSError as e:
            # Best effort: report it; the per-paper handling below deals with
            # a directory that could not be created.
            print("makedirs_error:", e)
        print("All_paper:", len(filter_results))

        paper_list = []
        for r_index, result in enumerate(filter_results):
            try:
                title_str = self.validateTitle(result.title)
                pdf_name = title_str+'.pdf'
                self.try_download_pdf(result, path, pdf_name)
                paper_path = os.path.join(path, pdf_name)
                print("paper_path:", paper_path)
                paper = Paper(path=paper_path,
                              url=result.entry_id,
                              title=result.title,
                              abs=result.summary.replace('-\n', '-').replace('\n', ' '),
                              authers=[str(aut) for aut in result.authors],
                              )
                # Download finished, now parse the file.
                paper.parse_pdf()
                paper_list.append(paper)
            except Exception as e:
                print("download_error:", e)
        return paper_list

    @tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
                    stop=tenacity.stop_after_attempt(5),
                    reraise=True)
    def try_download_pdf(self, result, path, pdf_name):
        """Download one result into ``path``; retried up to 5 times."""
        result.download_pdf(path, filename=pdf_name)

    @tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
                    stop=tenacity.stop_after_attempt(5),
                    reraise=True)
    def upload_gitee(self, image_path, image_name='', ext='png'):
        """Upload an image file to Gitee and return its URL; retried up to 5 times.

        :param image_path: local file to upload.
        :param image_name: prefix of the remote file name.
        :param ext: extension appended to the remote file name.
        :return: the ``download_url`` reported by Gitee, or the request URL
            when the response has no ``content`` entry.
        """
        with open(image_path, 'rb') as f:
            base64_content = base64.b64encode(f.read()).decode()

        date_str = str(datetime.datetime.now())[:19].replace(':', '-').replace(' ', '-') + '.' + ext
        path = image_name+ '-' +date_str

        # NOTE(review): relies on self.config, which this class never sets.
        owner = self.config.get('Gitee', 'owner')
        repo = self.config.get('Gitee', 'repo')
        folder = self.config.get('Gitee', 'path')

        payload = {
            "access_token": self.gitee_key,
            "owner": owner,
            "repo": repo,
            "path": folder,
            "content": base64_content,
            "message": "upload image"
        }
        url = 'https://gitee.com/api/v5/repos/'+owner+'/'+repo+'/contents/'+folder+'/'+path
        # A timeout keeps a stalled connection from blocking the retry loop forever.
        rep = requests.post(url, json=payload, timeout=30).json()
        print("rep:", rep)
        if 'content' in rep.keys():
            image_url = rep['content']['download_url']
        else:
            image_url = url

        return image_url

    @staticmethod
    def _find_section_key(section_text_dict, fragments):
        """Return the first key containing any of ``fragments`` (case-insensitive), else ''."""
        for parse_key in section_text_dict.keys():
            lowered = parse_key.lower()
            if any(fragment in lowered for fragment in fragments):
                return parse_key
        return ''

    @staticmethod
    def _join_choices(response):
        """Concatenate the message content of every choice in a chat response."""
        return ''.join(choice.message.content for choice in response.choices)

    def summary_with_chat(self, paper_list, key):
        """Summarise every paper in three chat steps and return the result as HTML.

        Step 1 summarises title, URL, abstract and the first parsed section;
        step 2 summarises the method section when one is found; step 3 asks
        for a conclusion. Each prompt input is cut to 10000 characters.

        :param paper_list: parsed ``Paper`` objects.
        :param key: OpenAI API key used for the requests.
        :return: HTML rendered from the collected markdown.
        """
        max_token = 2500 * 4
        htmls = []
        for paper_index, paper in enumerate(paper_list):
            # Step 1: summarise from title, abstract and introduction.
            text = 'Title:' + paper.title
            text += 'Url:' + paper.url
            text += 'Abstrat:' + paper.abs
            text += list(paper.section_text_dict.values())[0]
            text = text[:max_token]
            chat_summary_text = self.chat_summary(text=text, key=str(key))
            htmls.append(chat_summary_text)

            # The largest image is only needed when it can be uploaded, so the
            # extraction (and the file it writes) is skipped without a key.
            if self.gitee_key != '':
                first_image, ext = paper.get_image_path()
                if first_image is not None:
                    image_title = self.validateTitle(paper.title)
                    image_url = self.upload_gitee(image_path=first_image, image_name=image_title, ext=ext)
                    htmls.append("\n")
                    htmls.append("![Fig]("+image_url+")")
                    htmls.append("\n")

            # Step 2: summarise the method.
            # TODO: method sections are sometimes named after the algorithm,
            # so keyword matching can miss them.
            method_key = self._find_section_key(paper.section_text_dict, ('method', 'approach'))
            if method_key != '':
                text = "<summary>" + chat_summary_text + "\n <Methods>:\n" + paper.section_text_dict[method_key]
                text = text[:max_token]
                chat_method_text = self.chat_method(text=text, key=str(key))
                htmls.append(chat_method_text)
            else:
                chat_method_text = ''
            htmls.append("\n")

            # Step 3: summarise the whole paper.
            conclusion_key = self._find_section_key(paper.section_text_dict, ('conclu',))
            summary_text = "<summary>" + chat_summary_text + "\n <Method summary>:\n" + chat_method_text
            if conclusion_key != '':
                text = summary_text + "\n <Conclusion>:\n" + paper.section_text_dict[conclusion_key]
            else:
                text = summary_text
            text = text[:max_token]
            chat_conclusion_text = self.chat_conclusion(text=text, key=str(key))
            htmls.append(chat_conclusion_text)
            htmls.append("\n")
        md_text = "\n".join(htmls)

        return markdown.markdown(md_text)

    # The prompt bodies below are written flush-left so that the text inside
    # the triple-quoted strings carries no code indentation.
    @tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
                    stop=tenacity.stop_after_attempt(5),
                    reraise=True)
    def chat_conclusion(self, text, key):
        """Ask gpt-3.5-turbo for the conclusion part (item 8); retried up to 5 times."""
        openai.api_key = key
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "你是一个["+self.key_word+"]领域的审稿人,你需要严格评审这篇文章"},  # role of the model
                {"role": "assistant", "content": "这是一篇英文文献的<summary>和<conclusion>部分内容,其中<summary>你已经总结好了,但是<conclusion>部分,我需要你帮忙归纳下面问题:"+text},  # background
                {"role": "user", "content": """
8. 做出如下总结:
- (1):这篇工作的意义如何?
- (2):从创新点、性能、工作量这三个维度,总结这篇文章的优点和缺点。
.......
按照后面的格式输出:
8. Conclusion:
- (1):xxx;
- (2):创新点: xxx; 性能: xxx; 工作量: xxx;

务必使用中文回答(专有名词需要用英文标注),语句尽量简洁且学术,不要和之前的<summary>内容重复,数值使用原文数字, 务必严格按照格式,将对应内容输出到xxx中,.......代表按照实际需求填写,如果没有可以不用写.
"""},
            ]
        )
        result = self._join_choices(response)
        print("conclusion_result:\n", result)
        return result

    @tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
                    stop=tenacity.stop_after_attempt(5),
                    reraise=True)
    def chat_method(self, text, key):
        """Ask gpt-3.5-turbo for the method part (item 7); retried up to 5 times."""
        openai.api_key = key
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "你是一个["+self.key_word+"]领域的科研人员,善于使用精炼的语句总结论文"},  # role of the model
                {"role": "assistant", "content": "这是一篇英文文献的<summary>和<Method>部分内容,其中<summary>你已经总结好了,但是<Methods>部分,我需要你帮忙阅读并归纳下面问题:"+text},  # background
                {"role": "user", "content": """
7. 详细描述这篇文章的方法思路。比如说它的步骤是:
- (1):...
- (2):...
- (3):...
- .......
按照后面的格式输出:
7. Methods:
- (1):xxx;
- (2):xxx;
- (3):xxx;
.......

务必使用中文回答(专有名词需要用英文标注),语句尽量简洁且学术,不要和之前的<summary>内容重复,数值使用原文数字, 务必严格按照格式,将对应内容输出到xxx中,按照\n换行,.......代表按照实际需求填写,如果没有可以不用写.
"""},
            ]
        )
        result = self._join_choices(response)
        print("method_result:\n", result)
        return result

    @tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
                    stop=tenacity.stop_after_attempt(5),
                    reraise=True)
    def chat_summary(self, text, key):
        """Ask gpt-3.5-turbo for the overview (items 1-6); retried up to 5 times."""
        openai.api_key = key
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "你是一个["+self.key_word+"]领域的科研人员,善于使用精炼的语句总结论文"},  # role of the model
                {"role": "assistant", "content": "这是一篇英文文献的标题,作者,链接,Abstract和Introduction部分内容,我需要你帮忙阅读并归纳下面问题:"+text},  # background
                {"role": "user", "content": """
1. 标记出这篇文献的标题(加上中文翻译)
2. 列举所有的作者姓名 (使用英文)
3. 标记第一作者的单位(只输出中文翻译)
4. 标记出这篇文章的关键词(使用英文)
5. 论文链接,Github代码链接(如果有的话,没有的话请填写Github:None)
6. 按照下面四个点进行总结:
- (1):这篇文章的研究背景是什么?
- (2):过去的方法有哪些?它们存在什么问题?本文和过去的研究有哪些本质的区别?Is the approach well motivated?
- (3):本文提出的研究方法是什么?
- (4):本文方法在什么任务上,取得了什么性能?性能能否支持他们的目标?
按照后面的格式输出:
1. Title: xxx
2. Authors: xxx
3. Affiliation: xxx
4. Keywords: xxx
5. Urls: xxx or xxx , xxx
6. Summary:
- (1):xxx;
- (2):xxx;
- (3):xxx;
- (4):xxx.

务必使用中文回答(专有名词需要用英文标注),语句尽量简洁且学术,不要有太多重复的信息,数值使用原文数字, 务必严格按照格式,将对应内容输出到xxx中,按照\n换行.
"""},
            ]
        )
        result = self._join_choices(response)
        print("summary_result:\n", result)
        return result

    def export_to_markdown(self, text, file_name, mode='w'):
        """Write ``text`` unchanged to ``file_name`` as UTF-8 using ``mode``."""
        with open(file_name, mode, encoding="utf-8") as f:
            f.write(text)

    def show_info(self):
        """Print the keyword, query and sort order of this reader."""
        print(f"Key word: {self.key_word}")
        print(f"Query: {self.query}")
        print(f"Sort: {self.sort}")
597
+
598
def upload_pdf(key, text, file):
    """Gradio callback: summarise an uploaded PDF and return the result as HTML.

    :param key: OpenAI API key typed by the user.
    :param text: comma separated section names to look for in the paper.
    :param file: uploaded file object; its ``name`` must end in ``.pdf``.
    :return: an error message when an input is missing or the file is not a
        PDF, otherwise the HTML produced by ``Reader.summary_with_chat``.
    """
    # All three inputs are required.
    if not key or not text or not file:
        return "两个输入都不能为空,请输入字符并上传 PDF 文件!"
    # Only PDF files are accepted.
    if file.name.split(".")[-1].lower() != "pdf":
        return '请勿上传非 PDF 文件!'

    # Section names must match the headings exactly, so surrounding spaces
    # and quote characters left over from typing or pasting are removed and
    # empty entries are dropped.
    section_list = []
    for raw_name in text.split(','):
        section_name = raw_name.strip().strip("'\"").strip()
        if section_name:
            section_list.append(section_name)

    paper_list = [Paper(path=file, sl=section_list)]
    reader = Reader()
    sum_info = reader.summary_with_chat(paper_list=paper_list, key=key)
    return sum_info
612
+
613
# Title shown at the top of the page.
title = "ChatPaper"
# Description shown below the title.
description = '''<div align='center'>

Use ChatGPT to summary the papers.

Star our Github [ChatPaper](https://github.com/kaixindelele/ChatPaper)

</div>
'''
# Inputs of the Gradio interface: API key, section names, PDF file.
# The default section list is plain comma separated text: quote characters
# inside the value would become part of the first and last section names and
# prevent them from matching any heading.
ip = [
    gradio.inputs.Textbox(label="请输入你的API-key(必填)", default=""),
    gradio.inputs.Textbox(label="请输入论文大标题索引(用英文逗号隔开,必填)", default="Abstract,Introduction,Related Work,Background,Preliminary,Problem Formulation,Methods,Methodology,Method,Approach,Approaches,Materials and Methods,Experiment Settings,Experiment,Experimental Results,Evaluation,Experiments,Results,Findings,Data Analysis,Discussion,Results and Discussion,Conclusion,References"),
    gradio.inputs.File(label="请上传论文PDF(必填)")
]

interface = gradio.Interface(fn=upload_pdf, inputs=ip, outputs="html", title=title, description=description)

# Start the Gradio application.
interface.launch()
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ arxiv==1.4.3
2
+ PyMuPDF==1.21.1
3
+ requests==2.26.0
4
+ tiktoken==0.2.0
5
+ tenacity==8.2.2
6
+ pybase64==1.2.3
7
+ Pillow==9.4.0
8
+ openai==0.27.0
9
+ markdown