Spaces:
Sleeping
Sleeping
Update 2-column pdf; Update new config type; Update new proxy method (#479)
Browse files* Update 2-column pdf; Update new config type; Update new proxy method
* 更新requirements
---------
Co-authored-by: Tuchuanhuhuhu <gzblog@hdu.edu.cn>
- .gitignore +1 -1
- ChuanhuChatbot.py +1 -47
- config_example.json +11 -0
- modules/chat_func.py +9 -9
- modules/config.py +113 -0
- modules/llama_func.py +10 -5
- modules/openai_func.py +7 -8
- modules/pdf_func.py +180 -0
- modules/utils.py +3 -25
- requirements.txt +1 -0
.gitignore
CHANGED
@@ -134,6 +134,6 @@ dmypy.json
|
|
134 |
**/.DS_Store
|
135 |
|
136 |
api_key.txt
|
137 |
-
|
138 |
auth.json
|
139 |
.idea
|
|
|
134 |
**/.DS_Store
|
135 |
|
136 |
api_key.txt
|
137 |
+
config.json
|
138 |
auth.json
|
139 |
.idea
|
ChuanhuChatbot.py
CHANGED
@@ -5,59 +5,13 @@ import sys
|
|
5 |
|
6 |
import gradio as gr
|
7 |
|
|
|
8 |
from modules.utils import *
|
9 |
from modules.presets import *
|
10 |
from modules.overwrites import *
|
11 |
from modules.chat_func import *
|
12 |
from modules.openai_func import get_usage
|
13 |
|
14 |
-
logging.basicConfig(
|
15 |
-
level=logging.DEBUG,
|
16 |
-
format="%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s",
|
17 |
-
)
|
18 |
-
|
19 |
-
my_api_key = "" # 在这里输入你的 API 密钥
|
20 |
-
|
21 |
-
# if we are running in Docker
|
22 |
-
if os.environ.get("dockerrun") == "yes":
|
23 |
-
dockerflag = True
|
24 |
-
else:
|
25 |
-
dockerflag = False
|
26 |
-
|
27 |
-
authflag = False
|
28 |
-
auth_list = []
|
29 |
-
|
30 |
-
if not my_api_key:
|
31 |
-
my_api_key = os.environ.get("my_api_key")
|
32 |
-
if dockerflag:
|
33 |
-
if my_api_key == "empty":
|
34 |
-
logging.error("Please give a api key!")
|
35 |
-
sys.exit(1)
|
36 |
-
# auth
|
37 |
-
username = os.environ.get("USERNAME")
|
38 |
-
password = os.environ.get("PASSWORD")
|
39 |
-
if not (isinstance(username, type(None)) or isinstance(password, type(None))):
|
40 |
-
auth_list.append((os.environ.get("USERNAME"), os.environ.get("PASSWORD")))
|
41 |
-
authflag = True
|
42 |
-
else:
|
43 |
-
if (
|
44 |
-
not my_api_key
|
45 |
-
and os.path.exists("api_key.txt")
|
46 |
-
and os.path.getsize("api_key.txt")
|
47 |
-
):
|
48 |
-
with open("api_key.txt", "r") as f:
|
49 |
-
my_api_key = f.read().strip()
|
50 |
-
if os.path.exists("auth.json"):
|
51 |
-
authflag = True
|
52 |
-
with open("auth.json", "r", encoding='utf-8') as f:
|
53 |
-
auth = json.load(f)
|
54 |
-
for _ in auth:
|
55 |
-
if auth[_]["username"] and auth[_]["password"]:
|
56 |
-
auth_list.append((auth[_]["username"], auth[_]["password"]))
|
57 |
-
else:
|
58 |
-
logging.error("请检查auth.json文件中的用户名和密码!")
|
59 |
-
sys.exit(1)
|
60 |
-
|
61 |
gr.Chatbot.postprocess = postprocess
|
62 |
PromptHelper.compact_text_chunks = compact_text_chunks
|
63 |
|
|
|
5 |
|
6 |
import gradio as gr
|
7 |
|
8 |
+
from modules.config import *
|
9 |
from modules.utils import *
|
10 |
from modules.presets import *
|
11 |
from modules.overwrites import *
|
12 |
from modules.chat_func import *
|
13 |
from modules.openai_func import get_usage
|
14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
gr.Chatbot.postprocess = postprocess
|
16 |
PromptHelper.compact_text_chunks = compact_text_chunks
|
17 |
|
config_example.json
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"openai_api_key": "sk-xxxxxxxxxxxxxxxxxxxxxxxxx",
|
3 |
+
"https_proxy": "http://127.0.0.1:1079",
|
4 |
+
"http_proxy": "http://127.0.0.1:1079",
|
5 |
+
"advanced_pdf_kwargs": {
|
6 |
+
"two_column": true
|
7 |
+
},
|
8 |
+
"users": [
|
9 |
+
["root", "root"]
|
10 |
+
]
|
11 |
+
}
|
modules/chat_func.py
CHANGED
@@ -21,6 +21,7 @@ from modules.presets import *
|
|
21 |
from modules.llama_func import *
|
22 |
from modules.utils import *
|
23 |
import modules.shared as shared
|
|
|
24 |
|
25 |
# logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s")
|
26 |
|
@@ -61,20 +62,19 @@ def get_response(
|
|
61 |
else:
|
62 |
timeout = timeout_all
|
63 |
|
64 |
-
proxies = get_proxies()
|
65 |
|
66 |
# 如果有自定义的api-url,使用自定义url发送请求,否则使用默认设置发送请求
|
67 |
if shared.state.api_url != API_URL:
|
68 |
logging.info(f"使用自定义API URL: {shared.state.api_url}")
|
69 |
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
|
79 |
return response
|
80 |
|
|
|
21 |
from modules.llama_func import *
|
22 |
from modules.utils import *
|
23 |
import modules.shared as shared
|
24 |
+
from modules.config import retrieve_proxy
|
25 |
|
26 |
# logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s")
|
27 |
|
|
|
62 |
else:
|
63 |
timeout = timeout_all
|
64 |
|
|
|
65 |
|
66 |
# 如果有自定义的api-url,使用自定义url发送请求,否则使用默认设置发送请求
|
67 |
if shared.state.api_url != API_URL:
|
68 |
logging.info(f"使用自定义API URL: {shared.state.api_url}")
|
69 |
|
70 |
+
with retrieve_proxy():
|
71 |
+
response = requests.post(
|
72 |
+
shared.state.api_url,
|
73 |
+
headers=headers,
|
74 |
+
json=payload,
|
75 |
+
stream=True,
|
76 |
+
timeout=timeout,
|
77 |
+
)
|
78 |
|
79 |
return response
|
80 |
|
modules/config.py
ADDED
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from contextlib import contextmanager
|
2 |
+
import os
|
3 |
+
import logging
|
4 |
+
import sys
|
5 |
+
import json
|
6 |
+
|
7 |
+
__all__ = [
|
8 |
+
"my_api_key",
|
9 |
+
"authflag",
|
10 |
+
"auth_list",
|
11 |
+
"dockerflag",
|
12 |
+
"retrieve_proxy",
|
13 |
+
"log_level",
|
14 |
+
]
|
15 |
+
|
16 |
+
# 添加一个统一的config文件,避免文件过多造成的疑惑(优先级最低)
|
17 |
+
# 同时,也可以为后续支持自定义功能提供config的帮助
|
18 |
+
if os.path.exists("config.json"):
|
19 |
+
with open("config.json", "r", encoding='utf-8') as f:
|
20 |
+
config = json.load(f)
|
21 |
+
else:
|
22 |
+
config = {}
|
23 |
+
|
24 |
+
## 处理docker if we are running in Docker
|
25 |
+
dockerflag = config.get("dockerflag", False)
|
26 |
+
if os.environ.get("dockerrun") == "yes":
|
27 |
+
dockerflag = True
|
28 |
+
|
29 |
+
## 处理 api-key 以及 允许的用户列表
|
30 |
+
my_api_key = config.get("openai_api_key", "") # 在这里输入你的 API 密钥
|
31 |
+
authflag = "users" in config
|
32 |
+
auth_list = config.get("users", []) # 实际上是使用者的列表
|
33 |
+
my_api_key = os.environ.get("my_api_key", my_api_key)
|
34 |
+
if dockerflag:
|
35 |
+
if my_api_key == "empty":
|
36 |
+
logging.error("Please give a api key!")
|
37 |
+
sys.exit(1)
|
38 |
+
# auth
|
39 |
+
username = os.environ.get("USERNAME")
|
40 |
+
password = os.environ.get("PASSWORD")
|
41 |
+
if not (isinstance(username, type(None)) or isinstance(password, type(None))):
|
42 |
+
auth_list.append((os.environ.get("USERNAME"), os.environ.get("PASSWORD")))
|
43 |
+
authflag = True
|
44 |
+
else:
|
45 |
+
if (
|
46 |
+
not my_api_key
|
47 |
+
and os.path.exists("api_key.txt")
|
48 |
+
and os.path.getsize("api_key.txt")
|
49 |
+
):
|
50 |
+
with open("api_key.txt", "r") as f:
|
51 |
+
my_api_key = f.read().strip()
|
52 |
+
if os.path.exists("auth.json"):
|
53 |
+
authflag = True
|
54 |
+
with open("auth.json", "r", encoding='utf-8') as f:
|
55 |
+
auth = json.load(f)
|
56 |
+
for _ in auth:
|
57 |
+
if auth[_]["username"] and auth[_]["password"]:
|
58 |
+
auth_list.append((auth[_]["username"], auth[_]["password"]))
|
59 |
+
else:
|
60 |
+
logging.error("请检查auth.json文件中的用户名和密码!")
|
61 |
+
sys.exit(1)
|
62 |
+
|
63 |
+
@contextmanager
def retrieve_openai_api(api_key = None):
    """Temporarily export an OpenAI API key through ``OPENAI_API_KEY``.

    Args:
        api_key: key to export for the duration of the ``with`` block. When
            ``None``, the module-level ``my_api_key`` is used instead.

    Yields:
        The key that was exported.

    On exit the environment variable is set back to the value it had on
    entry (or to an empty string if it was not set), even when the body of
    the ``with`` block raises.
    """
    old_api_key = os.environ.get("OPENAI_API_KEY", "")
    # my_api_key is only looked up when no explicit key was supplied.
    new_api_key = my_api_key if api_key is None else api_key
    os.environ["OPENAI_API_KEY"] = new_api_key
    try:
        yield new_api_key
    finally:
        # Without the finally, an exception in the caller's block would leave
        # the temporary key in the process environment.
        os.environ["OPENAI_API_KEY"] = old_api_key
|
73 |
+
|
74 |
+
## 处理log
|
75 |
+
log_level = config.get("log_level", "INFO")
|
76 |
+
logging.basicConfig(
|
77 |
+
level=log_level,
|
78 |
+
format="%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s",
|
79 |
+
)
|
80 |
+
|
81 |
+
## 处理代理:
|
82 |
+
http_proxy = config.get("http_proxy", "")
|
83 |
+
https_proxy = config.get("https_proxy", "")
|
84 |
+
http_proxy = os.environ.get("HTTP_PROXY", http_proxy)
|
85 |
+
https_proxy = os.environ.get("HTTPS_PROXY", https_proxy)
|
86 |
+
|
87 |
+
# 重置系统变量,在不需要设置的时候不设置环境变量,以免引起全局代理报错
|
88 |
+
os.environ["HTTP_PROXY"] = ""
|
89 |
+
os.environ["HTTPS_PROXY"] = ""
|
90 |
+
|
91 |
+
@contextmanager
def retrieve_proxy(proxy=None):
    """Read or update the proxy configuration as a context manager.

    1. If ``proxy`` is None, the currently configured proxies are exported
       to the ``HTTP_PROXY`` / ``HTTPS_PROXY`` environment variables for the
       duration of the ``with`` block and yielded as
       ``(http_proxy, https_proxy)``. The previous environment values are
       put back on exit, even when the body raises.
    2. If ``proxy`` is not None, it is stored as the module-level http and
       https proxy and the new pair is yielded; the environment variables
       are not touched.

    Because this is a generator-based context manager, nothing happens
    until the context is entered: it must be used in a ``with`` statement.
    """
    global http_proxy, https_proxy
    if proxy is not None:
        http_proxy = proxy
        https_proxy = proxy
        yield http_proxy, https_proxy
        return

    # Fall back to "" (the value this module resets the variables to) so a
    # variable removed elsewhere does not raise KeyError here.
    old_var = os.environ.get("HTTP_PROXY", ""), os.environ.get("HTTPS_PROXY", "")
    os.environ["HTTP_PROXY"] = http_proxy
    os.environ["HTTPS_PROXY"] = https_proxy
    try:
        yield http_proxy, https_proxy  # hand the active proxies to the caller
    finally:
        # Always restore the previous values; otherwise a failed request
        # would leave the proxy exported for the whole process.
        os.environ["HTTP_PROXY"], os.environ["HTTPS_PROXY"] = old_var
|
110 |
+
|
111 |
+
|
112 |
+
## 处理advance pdf
|
113 |
+
# config_example.json documents these options under "advanced_pdf_kwargs";
# the older "advance_pdf" key is still honoured as a fallback.
advance_pdf = config.get("advanced_pdf_kwargs", config.get("advance_pdf", {}))
|
modules/llama_func.py
CHANGED
@@ -46,11 +46,16 @@ def get_documents(file_src):
|
|
46 |
logging.info(f"loading file: {file.name}")
|
47 |
if os.path.splitext(file.name)[1] == ".pdf":
|
48 |
logging.debug("Loading PDF...")
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
|
|
|
|
|
|
|
|
|
|
54 |
text_raw = pdftext
|
55 |
elif os.path.splitext(file.name)[1] == ".docx":
|
56 |
logging.debug("Loading DOCX...")
|
|
|
46 |
logging.info(f"loading file: {file.name}")
|
47 |
if os.path.splitext(file.name)[1] == ".pdf":
|
48 |
logging.debug("Loading PDF...")
|
49 |
+
try:
|
50 |
+
from modules.pdf_func import parse_pdf
|
51 |
+
from modules.config import advance_pdf
|
52 |
+
text = parse_pdf(file.name, advance_pdf.get("two_column", False)).text
|
53 |
+
except:
|
54 |
+
pdftext = ""
|
55 |
+
with open(file.name, 'rb') as pdfFileObj:
|
56 |
+
pdfReader = PyPDF2.PdfReader(pdfFileObj)
|
57 |
+
for page in tqdm(pdfReader.pages):
|
58 |
+
pdftext += page.extract_text()
|
59 |
text_raw = pdftext
|
60 |
elif os.path.splitext(file.name)[1] == ".docx":
|
61 |
logging.debug("Loading DOCX...")
|
modules/openai_func.py
CHANGED
@@ -11,7 +11,7 @@ from modules.presets import (
|
|
11 |
)
|
12 |
|
13 |
from modules import shared
|
14 |
-
from modules.
|
15 |
import os, datetime
|
16 |
|
17 |
def get_billing_data(openai_api_key, billing_url):
|
@@ -21,13 +21,12 @@ def get_billing_data(openai_api_key, billing_url):
|
|
21 |
}
|
22 |
|
23 |
timeout = timeout_all
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
)
|
31 |
|
32 |
if response.status_code == 200:
|
33 |
data = response.json()
|
|
|
11 |
)
|
12 |
|
13 |
from modules import shared
|
14 |
+
from modules.config import retrieve_proxy
|
15 |
import os, datetime
|
16 |
|
17 |
def get_billing_data(openai_api_key, billing_url):
|
|
|
21 |
}
|
22 |
|
23 |
timeout = timeout_all
|
24 |
+
with retrieve_proxy():
|
25 |
+
response = requests.get(
|
26 |
+
billing_url,
|
27 |
+
headers=headers,
|
28 |
+
timeout=timeout,
|
29 |
+
)
|
|
|
30 |
|
31 |
if response.status_code == 200:
|
32 |
data = response.json()
|
modules/pdf_func.py
ADDED
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from types import SimpleNamespace
|
2 |
+
import pdfplumber
|
3 |
+
import logging
|
4 |
+
from llama_index import Document
|
5 |
+
|
6 |
+
def prepare_table_config(crop_page):
|
7 |
+
"""Prepare table查找边界, 要求page为原始page
|
8 |
+
|
9 |
+
From https://github.com/jsvine/pdfplumber/issues/242
|
10 |
+
"""
|
11 |
+
page = crop_page.root_page # root/parent
|
12 |
+
cs = page.curves + page.edges
|
13 |
+
def curves_to_edges():
|
14 |
+
"""See https://github.com/jsvine/pdfplumber/issues/127"""
|
15 |
+
edges = []
|
16 |
+
for c in cs:
|
17 |
+
edges += pdfplumber.utils.rect_to_edges(c)
|
18 |
+
return edges
|
19 |
+
edges = curves_to_edges()
|
20 |
+
return {
|
21 |
+
"vertical_strategy": "explicit",
|
22 |
+
"horizontal_strategy": "explicit",
|
23 |
+
"explicit_vertical_lines": edges,
|
24 |
+
"explicit_horizontal_lines": edges,
|
25 |
+
"intersection_y_tolerance": 10,
|
26 |
+
}
|
27 |
+
|
28 |
+
def get_text_outside_table(crop_page):
|
29 |
+
ts = prepare_table_config(crop_page)
|
30 |
+
if len(ts["explicit_vertical_lines"]) == 0 or len(ts["explicit_horizontal_lines"]) == 0:
|
31 |
+
return crop_page
|
32 |
+
|
33 |
+
### Get the bounding boxes of the tables on the page.
|
34 |
+
bboxes = [table.bbox for table in crop_page.root_page.find_tables(table_settings=ts)]
|
35 |
+
def not_within_bboxes(obj):
|
36 |
+
"""Check if the object is in any of the table's bbox."""
|
37 |
+
def obj_in_bbox(_bbox):
|
38 |
+
"""See https://github.com/jsvine/pdfplumber/blob/stable/pdfplumber/table.py#L404"""
|
39 |
+
v_mid = (obj["top"] + obj["bottom"]) / 2
|
40 |
+
h_mid = (obj["x0"] + obj["x1"]) / 2
|
41 |
+
x0, top, x1, bottom = _bbox
|
42 |
+
return (h_mid >= x0) and (h_mid < x1) and (v_mid >= top) and (v_mid < bottom)
|
43 |
+
return not any(obj_in_bbox(__bbox) for __bbox in bboxes)
|
44 |
+
|
45 |
+
return crop_page.filter(not_within_bboxes)
|
46 |
+
# 请使用 LaTeX 表达公式,行内公式以 $ 包裹,行间公式以 $$ 包裹
|
47 |
+
|
48 |
+
extract_words = lambda page: page.extract_words(keep_blank_chars=True, y_tolerance=0, x_tolerance=1, extra_attrs=["fontname", "size", "object_type"])
|
49 |
+
# dict_keys(['text', 'x0', 'x1', 'top', 'doctop', 'bottom', 'upright', 'direction', 'fontname', 'size'])
|
50 |
+
|
51 |
+
def get_title_with_cropped_page(first_page):
    """Extract the title and author block from the first page of a paper.

    Words with a font size of at least 14 are collected as the title. The
    word ``"Abstract"`` marks where the body starts. Whatever lies between
    the bottom of the title and the top of "Abstract" is returned as the
    author / affiliation information.

    Args:
        first_page: the first pdfplumber page of the document.

    Returns:
        A tuple ``(title, user_info, cropped_page)``: the list of title
        words, the list of words found between title and abstract (empty
        when that region cannot be determined), and ``first_page``
        restricted to the area from the abstract (or the page top when no
        "Abstract" word is found) down to the page bottom.
    """
    title = []  # title words, in reading order
    x0, top, x1, bottom = first_page.bbox  # page bounding box
    title_bottom = None  # stays None when no title-sized word is found

    for word in extract_words(first_page):
        word = SimpleNamespace(**word)

        if word.size >= 14:
            title.append(word.text)
            title_bottom = word.bottom
        elif word.text == "Abstract":  # the body starts at the abstract
            top = word.top

    # Only look for author info when there is a real region between the
    # title and the abstract. Previously a missing title raised
    # UnboundLocalError, and a title located below ``top`` produced an
    # inverted bounding box.
    if title_bottom is not None and title_bottom < top:
        info_area = first_page.within_bbox((x0, title_bottom, x1, top))
        user_info = [w["text"] for w in extract_words(info_area)]
    else:
        user_info = []

    # Drop the upper part. within_bbox keeps fully included objects only,
    # whereas crop would also keep partially included ones.
    return title, user_info, first_page.within_bbox((x0, top, x1, bottom))
|
67 |
+
|
68 |
+
def get_column_cropped_pages(pages, two_column=True):
|
69 |
+
new_pages = []
|
70 |
+
for page in pages:
|
71 |
+
if two_column:
|
72 |
+
left = page.within_bbox((0, 0, page.width/2, page.height),relative=True)
|
73 |
+
right = page.within_bbox((page.width/2, 0, page.width, page.height), relative=True)
|
74 |
+
new_pages.append(left)
|
75 |
+
new_pages.append(right)
|
76 |
+
else:
|
77 |
+
new_pages.append(page)
|
78 |
+
|
79 |
+
return new_pages
|
80 |
+
|
81 |
+
def parse_pdf(filename, two_column = True):
    """Parse a PDF into a single ``Document`` whose text is split by chapter.

    The first page is scanned for the title and author information, the
    pages are passed through ``get_column_cropped_pages`` (which splits
    them into left/right halves when ``two_column`` is true), text inside
    detected tables is dropped, and the remaining words are grouped into
    chapters: a word with font size >= 11 is treated as part of a chapter
    heading, anything smaller as chapter body text.

    Args:
        filename: path of the PDF, passed to ``pdfplumber.open``.
        two_column: forwarded to ``get_column_cropped_pages``.

    Returns:
        A ``Document`` whose text is a header line
        (``Article Title: ..., Information:...``) followed by one line per
        chapter, with ``extra_info={"title": title}``.
    """
    root_logger = logging.getLogger()
    level = root_logger.level
    if level == logging.DEBUG:
        # Raise the root level while parsing -- presumably to mute the PDF
        # parser's own debug output; restored in the finally below.
        root_logger.setLevel("INFO")

    def _new_chapter(page_start, name_top, name_bottom, recording_name=True):
        """Create an empty chapter record anchored at a heading position."""
        return SimpleNamespace(
            name=[],
            name_top=name_top,
            name_bottom=name_bottom,
            record_chapter_name=recording_name,
            page_start=page_start,
            page_stop=None,
            text=[],
        )

    chapters = []
    try:
        with pdfplumber.open(filename) as pdf:
            title, user_info, first_page = get_title_with_cropped_page(pdf.pages[0])
            new_pages = get_column_cropped_pages([first_page] + pdf.pages[1:], two_column)

            cur_chapter = None
            last_page_number = None

            # Walk the document page by page.
            for page in new_pages:
                page = get_text_outside_table(page)
                last_page_number = page.page_number

                # Walk the extracted words of the page.
                for word in extract_words(page):
                    word = SimpleNamespace(**word)

                    if word.size >= 11:  # heading-sized text: chapter name
                        if cur_chapter is None:
                            cur_chapter = _new_chapter(page.page_number, word.top, word.bottom)
                        elif not cur_chapter.record_chapter_name or (
                            cur_chapter.name_bottom != word.bottom
                            and cur_chapter.name_top != word.top
                        ):
                            # The current chapter already has body text, or
                            # this heading word sits on a different line
                            # than the current chapter name: close the
                            # chapter and start a new one. (The original
                            # compared the chapter's position with itself,
                            # so the second test could never be true.)
                            cur_chapter.page_stop = page.page_number
                            chapters.append(cur_chapter)
                            cur_chapter = _new_chapter(page.page_number, word.top, word.bottom)

                        cur_chapter.name.append(word.text)
                    else:
                        if cur_chapter is None:
                            # Body text before any heading: collect it in an
                            # unnamed chapter instead of failing on None.
                            cur_chapter = _new_chapter(
                                page.page_number, word.top, word.bottom, recording_name=False
                            )
                        cur_chapter.record_chapter_name = False  # chapter name is complete
                        cur_chapter.text.append(word.text)

            # Close the last open chapter, if any text was found at all.
            if cur_chapter is not None:
                cur_chapter.page_stop = last_page_number
                chapters.append(cur_chapter)

            for chapter in chapters:
                logging.info(f"section: {chapter.name} pages:{chapter.page_start, chapter.page_stop} word-count:{len(chapter.text)}")
                logging.debug(" ".join(chapter.text))
    finally:
        # Restore the caller's log level even when parsing fails.
        root_logger.setLevel(level)

    title = " ".join(title)
    user_info = " ".join(user_info)
    parts = [f"Article Title: {title}, Information:{user_info}\n"]
    for idx, chapter in enumerate(chapters):
        chapter.name = " ".join(chapter.name)
        parts.append(f"The {idx}th Chapter {chapter.name}: " + " ".join(chapter.text) + "\n")

    return Document(text="".join(parts), extra_info={"title": title})
|
147 |
+
|
148 |
+
BASE_POINTS = """
|
149 |
+
1. Who are the authors?
|
150 |
+
2. What is the process of the proposed method?
|
151 |
+
3. What is the performance of the proposed method? Please note down its performance metrics.
|
152 |
+
4. What are the baseline models and their performances? Please note down these baseline methods.
|
153 |
+
5. What dataset did this paper use?
|
154 |
+
"""
|
155 |
+
|
156 |
+
READING_PROMPT = """
|
157 |
+
You are a researcher helper bot. You can help the user with research paper reading and summarizing. \n
|
158 |
+
Now I am going to send you a paper. You need to read it and summarize it for me part by part. \n
|
159 |
+
When you are reading, You need to focus on these key points:{}
|
160 |
+
"""
|
161 |
+
|
162 |
+
READING_PROMT_V2 = """
|
163 |
+
You are a researcher helper bot. You can help the user with research paper reading and summarizing. \n
|
164 |
+
Now I am going to send you a paper. You need to read it and summarize it for me part by part. \n
|
165 |
+
When you are reading, You need to focus on these key points:{},
|
166 |
+
|
167 |
+
And You need to generate a brief but informative title for this part.
|
168 |
+
Your return format:
|
169 |
+
- title: '...'
|
170 |
+
- summary: '...'
|
171 |
+
"""
|
172 |
+
|
173 |
+
SUMMARY_PROMPT = "You are a researcher helper bot. Now you need to read the summaries of a research paper."
|
174 |
+
|
175 |
+
|
176 |
+
if __name__ == '__main__':
    # Test code
    z = parse_pdf("./build/test.pdf")
    # parse_pdf returns a Document, not a dict with "user_info"/"title"
    # keys. The title and author information form the first line of its
    # text ("Article Title: ..., Information:...").
    print(z.text.splitlines()[0])
|
modules/utils.py
CHANGED
@@ -24,11 +24,7 @@ from pygments.formatters import HtmlFormatter
|
|
24 |
|
25 |
from modules.presets import *
|
26 |
import modules.shared as shared
|
27 |
-
|
28 |
-
logging.basicConfig(
|
29 |
-
level=logging.INFO,
|
30 |
-
format="%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s",
|
31 |
-
)
|
32 |
|
33 |
if TYPE_CHECKING:
|
34 |
from typing import TypedDict
|
@@ -333,8 +329,7 @@ def reset_textbox():
|
|
333 |
|
334 |
def reset_default():
|
335 |
newurl = shared.state.reset_api_url()
|
336 |
-
|
337 |
-
os.environ.pop("https_proxy", None)
|
338 |
return gr.update(value=newurl), gr.update(value=""), "API URL 和代理已重置"
|
339 |
|
340 |
|
@@ -346,6 +341,7 @@ def change_api_url(url):
|
|
346 |
|
347 |
|
348 |
def change_proxy(proxy):
|
|
|
349 |
os.environ["HTTPS_PROXY"] = proxy
|
350 |
msg = f"代理更改为了{proxy}"
|
351 |
logging.info(msg)
|
@@ -443,24 +439,6 @@ def transfer_input(inputs):
|
|
443 |
)
|
444 |
|
445 |
|
446 |
-
def get_proxies():
|
447 |
-
# 获取环境变量中的代理设置
|
448 |
-
http_proxy = os.environ.get("HTTP_PROXY") or os.environ.get("http_proxy")
|
449 |
-
https_proxy = os.environ.get("HTTPS_PROXY") or os.environ.get("https_proxy")
|
450 |
-
|
451 |
-
# 如果存在代理设置,使用它们
|
452 |
-
proxies = {}
|
453 |
-
if http_proxy:
|
454 |
-
logging.info(f"使用 HTTP 代理: {http_proxy}")
|
455 |
-
proxies["http"] = http_proxy
|
456 |
-
if https_proxy:
|
457 |
-
logging.info(f"使用 HTTPS 代理: {https_proxy}")
|
458 |
-
proxies["https"] = https_proxy
|
459 |
-
|
460 |
-
if proxies == {}:
|
461 |
-
proxies = None
|
462 |
-
|
463 |
-
return proxies
|
464 |
|
465 |
def run(command, desc=None, errdesc=None, custom_env=None, live=False):
|
466 |
if desc is not None:
|
|
|
24 |
|
25 |
from modules.presets import *
|
26 |
import modules.shared as shared
|
27 |
+
from modules.config import retrieve_proxy
|
|
|
|
|
|
|
|
|
28 |
|
29 |
if TYPE_CHECKING:
|
30 |
from typing import TypedDict
|
|
|
329 |
|
330 |
def reset_default():
    """Reset the API URL and the proxy, returning the UI updates.

    Returns:
        A tuple of two ``gr.update`` objects (the restored API URL and an
        empty proxy field) and a status message.
    """
    newurl = shared.state.reset_api_url()
    # retrieve_proxy is a generator-based context manager: a bare call only
    # creates the manager and runs none of its body. Entering it with an
    # explicit empty proxy is what actually clears the stored proxy.
    with retrieve_proxy(""):
        pass
    return gr.update(value=newurl), gr.update(value=""), "API URL 和代理已重置"
|
334 |
|
335 |
|
|
|
341 |
|
342 |
|
343 |
def change_proxy(proxy):
|
344 |
+
retrieve_proxy(proxy)
|
345 |
os.environ["HTTPS_PROXY"] = proxy
|
346 |
msg = f"代理更改为了{proxy}"
|
347 |
logging.info(msg)
|
|
|
439 |
)
|
440 |
|
441 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
442 |
|
443 |
def run(command, desc=None, errdesc=None, custom_env=None, live=False):
|
444 |
if desc is not None:
|
requirements.txt
CHANGED
@@ -11,3 +11,4 @@ llama_index
|
|
11 |
langchain
|
12 |
markdown
|
13 |
PyPDF2
|
|
|
|
11 |
langchain
|
12 |
markdown
|
13 |
PyPDF2
|
14 |
+
pdfplumber
|