Spaces:

markqiu
/

prinvest_mate

Sleeping

App Files Files Community

insight Tuchuanhuhuhu commited on Mar 31, 2023

Commit

1bda668

•

1 Parent(s): 7edba86

Update 2-column pdf; Update new config type; Update new proxy method (#479)

Browse files

* Update 2-column pdf; Update new config type; Update new proxy method

* 更新requirements

---------

Co-authored-by: Tuchuanhuhuhu <gzblog@hdu.edu.cn>

Files changed (10) hide show

.gitignore +1 -1
ChuanhuChatbot.py +1 -47
config_example.json +11 -0
modules/chat_func.py +9 -9
modules/config.py +113 -0
modules/llama_func.py +10 -5
modules/openai_func.py +7 -8
modules/pdf_func.py +180 -0
modules/utils.py +3 -25
requirements.txt +1 -0

.gitignore CHANGED Viewed

@@ -134,6 +134,6 @@ dmypy.json
 **/.DS_Store
 api_key.txt
 auth.json
 .idea

 **/.DS_Store
 api_key.txt
+config.json
 auth.json
 .idea

ChuanhuChatbot.py CHANGED Viewed

@@ -5,59 +5,13 @@ import sys
 import gradio as gr
 from modules.utils import *
 from modules.presets import *
 from modules.overwrites import *
 from modules.chat_func import *
 from modules.openai_func import get_usage
-logging.basicConfig(
-    level=logging.DEBUG,
-    format="%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s",
-)
-my_api_key = ""  # 在这里输入你的 API 密钥
-# if we are running in Docker
-if os.environ.get("dockerrun") == "yes":
-    dockerflag = True
-else:
-    dockerflag = False
-authflag = False
-auth_list = []
-if not my_api_key:
-    my_api_key = os.environ.get("my_api_key")
-if dockerflag:
-    if my_api_key == "empty":
-        logging.error("Please give a api key!")
-        sys.exit(1)
-    # auth
-    username = os.environ.get("USERNAME")
-    password = os.environ.get("PASSWORD")
-    if not (isinstance(username, type(None)) or isinstance(password, type(None))):
-        auth_list.append((os.environ.get("USERNAME"), os.environ.get("PASSWORD")))
-        authflag = True
-else:
-    if (
-        not my_api_key
-        and os.path.exists("api_key.txt")
-        and os.path.getsize("api_key.txt")
-    ):
-        with open("api_key.txt", "r") as f:
-            my_api_key = f.read().strip()
-    if os.path.exists("auth.json"):
-        authflag = True
-        with open("auth.json", "r", encoding='utf-8') as f:
-            auth = json.load(f)
-            for _ in auth:
-                if auth[_]["username"] and auth[_]["password"]:
-                    auth_list.append((auth[_]["username"], auth[_]["password"]))
-                else:
-                    logging.error("请检查auth.json文件中的用户名和密码！")
-                    sys.exit(1)
 gr.Chatbot.postprocess = postprocess
 PromptHelper.compact_text_chunks = compact_text_chunks

 import gradio as gr
+from modules.config import *
 from modules.utils import *
 from modules.presets import *
 from modules.overwrites import *
 from modules.chat_func import *
 from modules.openai_func import get_usage
 gr.Chatbot.postprocess = postprocess
 PromptHelper.compact_text_chunks = compact_text_chunks

config_example.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+    "openai_api_key": "sk-xxxxxxxxxxxxxxxxxxxxxxxxx",
+    "https_proxy": "http://127.0.0.1:1079",
+    "http_proxy": "http://127.0.0.1:1079",
+    "advanced_pdf_kwargs": {
+        "two_column": true
+    },
+    "users": [
+        ["root", "root"]
+    ]
+}

modules/chat_func.py CHANGED Viewed

@@ -21,6 +21,7 @@ from modules.presets import *
 from modules.llama_func import *
 from modules.utils import *
 import modules.shared as shared
 # logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s")
@@ -61,20 +62,19 @@ def get_response(
     else:
         timeout = timeout_all
-    proxies = get_proxies()
     # 如果有自定义的api-url，使用自定义url发送请求，否则使用默认设置发送请求
     if shared.state.api_url != API_URL:
         logging.info(f"使用自定义API URL: {shared.state.api_url}")
-    response = requests.post(
-        shared.state.api_url,
-        headers=headers,
-        json=payload,
-        stream=True,
-        timeout=timeout,
-        proxies=proxies,
-    )
     return response

 from modules.llama_func import *
 from modules.utils import *
 import modules.shared as shared
+from modules.config import retrieve_proxy
 # logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s")
     else:
         timeout = timeout_all
     # 如果有自定义的api-url，使用自定义url发送请求，否则使用默认设置发送请求
     if shared.state.api_url != API_URL:
         logging.info(f"使用自定义API URL: {shared.state.api_url}")
+    with retrieve_proxy():
+        response = requests.post(
+            shared.state.api_url,
+            headers=headers,
+            json=payload,
+            stream=True,
+            timeout=timeout,
+        )
     return response

modules/config.py ADDED Viewed

	@@ -0,0 +1,113 @@

+from contextlib import contextmanager
+import os
+import logging
+import sys
+import json
+# Names exported by ``from modules.config import *``.
+__all__ = [
+    "my_api_key",
+    "authflag",
+    "auth_list",
+    "dockerflag",
+    "retrieve_proxy",
+    "log_level",
+]
+# A single unified config file, so that a growing number of separate files does not confuse users (lowest priority).
+# It also gives later custom features one place to read their settings from.
+if os.path.exists("config.json"):
+    with open("config.json", "r", encoding='utf-8') as f:
+        config = json.load(f)
+else:
+    config = {}
+## Docker handling: are we running in Docker?
+dockerflag = config.get("dockerflag", False)
+if os.environ.get("dockerrun") == "yes":
+    dockerflag = True
+## API key and the list of allowed users
+my_api_key = config.get("openai_api_key", "") # enter your API key here
+authflag = "users" in config
+auth_list = config.get("users", []) # effectively the list of users
+# The ``my_api_key`` environment variable, when set, overrides the value from config.json.
+my_api_key = os.environ.get("my_api_key", my_api_key)
+if dockerflag:
+    if my_api_key == "empty":
+        logging.error("Please give a api key!")
+        sys.exit(1)
+    # auth
+    username = os.environ.get("USERNAME")
+    password = os.environ.get("PASSWORD")
+    # Only enable auth when both USERNAME and PASSWORD are present in the environment.
+    if not (isinstance(username, type(None)) or isinstance(password, type(None))):
+        auth_list.append((os.environ.get("USERNAME"), os.environ.get("PASSWORD")))
+        authflag = True
+else:
+    # Outside Docker, fall back to api_key.txt when no key was found and the file is non-empty.
+    if (
+        not my_api_key
+        and os.path.exists("api_key.txt")
+        and os.path.getsize("api_key.txt")
+    ):
+        with open("api_key.txt", "r") as f:
+            my_api_key = f.read().strip()
+    # auth.json, when present, turns auth on and adds its username/password pairs to auth_list.
+    if os.path.exists("auth.json"):
+        authflag = True
+        with open("auth.json", "r", encoding='utf-8') as f:
+            auth = json.load(f)
+            for _ in auth:
+                if auth[_]["username"] and auth[_]["password"]:
+                    auth_list.append((auth[_]["username"], auth[_]["password"]))
+                else:
+                    # An entry with an empty username or password aborts start-up.
+                    logging.error("请检查auth.json文件中的用户名和密码！")
+                    sys.exit(1)
+@contextmanager
+def retrieve_openai_api(api_key = None):
+    old_api_key = os.environ.get("OPENAI_API_KEY", "")
+    if api_key is None:
+        os.environ["OPENAI_API_KEY"] = my_api_key
+        yield my_api_key
+    else:
+        os.environ["OPENAI_API_KEY"] = api_key
+        yield api_key
+    os.environ["OPENAI_API_KEY"] = old_api_key
+## Logging setup
+log_level = config.get("log_level", "INFO")
+logging.basicConfig(
+    level=log_level,
+    format="%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s",
+)
+## Proxy setup:
+http_proxy = config.get("http_proxy", "")
+https_proxy = config.get("https_proxy", "")
+# Upper-case environment variables, when set, override the values from config.json.
+http_proxy = os.environ.get("HTTP_PROXY", http_proxy)
+https_proxy = os.environ.get("HTTPS_PROXY", https_proxy)
+# Blank the process-wide variables: no proxy is exported while none is needed, so a global proxy cannot cause errors.
+os.environ["HTTP_PROXY"] = ""
+os.environ["HTTPS_PROXY"] = ""
+@contextmanager
+def retrieve_proxy(proxy=None):
+    """
+    1, 如果proxy = NONE，设置环境变量，并返回最新设置的代理
+    2，如果proxy ！= NONE，更新当前的代理配置，但是不更新环境变量
+    """
+    global http_proxy, https_proxy
+    if proxy is not None:
+        http_proxy = proxy
+        https_proxy = proxy
+        yield http_proxy, https_proxy
+    else:
+        old_var = os.environ["HTTP_PROXY"], os.environ["HTTPS_PROXY"]
+        os.environ["HTTP_PROXY"] = http_proxy
+        os.environ["HTTPS_PROXY"] = https_proxy
+        yield http_proxy, https_proxy # return new proxy
+        # return old proxy
+        os.environ["HTTP_PROXY"], os.environ["HTTPS_PROXY"] = old_var
+## 处理advance pdf
+advance_pdf = config.get("advance_pdf", {})

modules/llama_func.py CHANGED Viewed

@@ -46,11 +46,16 @@ def get_documents(file_src):
         logging.info(f"loading file: {file.name}")
         if os.path.splitext(file.name)[1] == ".pdf":
             logging.debug("Loading PDF...")
-            pdftext = ""
-            with open(file.name, 'rb') as pdfFileObj:
-                pdfReader = PyPDF2.PdfReader(pdfFileObj)
-                for page in tqdm(pdfReader.pages):
-                    pdftext += page.extract_text()
             text_raw = pdftext
         elif os.path.splitext(file.name)[1] == ".docx":
             logging.debug("Loading DOCX...")

         logging.info(f"loading file: {file.name}")
         if os.path.splitext(file.name)[1] == ".pdf":
             logging.debug("Loading PDF...")
+            try:
+                from modules.pdf_func import parse_pdf
+                from modules.config import advance_pdf
+                text = parse_pdf(file.name, advance_pdf.get("two_column", False)).text
+            except:
+                pdftext = ""
+                with open(file.name, 'rb') as pdfFileObj:
+                    pdfReader = PyPDF2.PdfReader(pdfFileObj)
+                    for page in tqdm(pdfReader.pages):
+                        pdftext += page.extract_text()
             text_raw = pdftext
         elif os.path.splitext(file.name)[1] == ".docx":
             logging.debug("Loading DOCX...")

modules/openai_func.py CHANGED Viewed

@@ -11,7 +11,7 @@ from modules.presets import (
 )
 from modules import shared
-from modules.utils import get_proxies
 import os, datetime
 def get_billing_data(openai_api_key, billing_url):
@@ -21,13 +21,12 @@ def get_billing_data(openai_api_key, billing_url):
     }
     timeout = timeout_all
-    proxies = get_proxies()
-    response = requests.get(
-        billing_url,
-        headers=headers,
-        timeout=timeout,
-        proxies=proxies,
-    )
     if response.status_code == 200:
         data = response.json()

 )
 from modules import shared
+from modules.config import retrieve_proxy
 import os, datetime
 def get_billing_data(openai_api_key, billing_url):
     }
     timeout = timeout_all
+    with retrieve_proxy():
+        response = requests.get(
+            billing_url,
+            headers=headers,
+            timeout=timeout,
+        )
     if response.status_code == 200:
         data = response.json()

modules/pdf_func.py ADDED Viewed

	@@ -0,0 +1,180 @@

+from types import SimpleNamespace
+import pdfplumber
+import logging
+from llama_index import Document
+def prepare_table_config(crop_page):
+    """Prepare table查找边界, 要求page为原始page
+    From https://github.com/jsvine/pdfplumber/issues/242
+    """
+    page = crop_page.root_page # root/parent
+    cs = page.curves + page.edges
+    def curves_to_edges():
+        """See https://github.com/jsvine/pdfplumber/issues/127"""
+        edges = []
+        for c in cs:
+            edges += pdfplumber.utils.rect_to_edges(c)
+        return edges
+    edges = curves_to_edges()
+    return {
+        "vertical_strategy": "explicit",
+        "horizontal_strategy": "explicit",
+        "explicit_vertical_lines": edges,
+        "explicit_horizontal_lines": edges,
+        "intersection_y_tolerance": 10,
+    }
+def get_text_outside_table(crop_page):
+    ts = prepare_table_config(crop_page)
+    if len(ts["explicit_vertical_lines"]) == 0 or len(ts["explicit_horizontal_lines"]) == 0:
+        return crop_page
+    ### Get the bounding boxes of the tables on the page.
+    bboxes = [table.bbox for table in crop_page.root_page.find_tables(table_settings=ts)]
+    def not_within_bboxes(obj):
+        """Check if the object is in any of the table's bbox."""
+        def obj_in_bbox(_bbox):
+            """See https://github.com/jsvine/pdfplumber/blob/stable/pdfplumber/table.py#L404"""
+            v_mid = (obj["top"] + obj["bottom"]) / 2
+            h_mid = (obj["x0"] + obj["x1"]) / 2
+            x0, top, x1, bottom = _bbox
+            return (h_mid >= x0) and (h_mid < x1) and (v_mid >= top) and (v_mid < bottom)
+        return not any(obj_in_bbox(__bbox) for __bbox in bboxes)
+    return crop_page.filter(not_within_bboxes)
+# 请使用 LaTeX 表达公式，行内公式以 $ 包裹，行间公式以 $$ 包裹
+extract_words = lambda page: page.extract_words(keep_blank_chars=True, y_tolerance=0, x_tolerance=1, extra_attrs=["fontname", "size", "object_type"])
+# dict_keys(['text', 'x0', 'x1', 'top', 'doctop', 'bottom', 'upright', 'direction', 'fontname', 'size'])
+def get_title_with_cropped_page(first_page):
+    title = [] # 处理标题
+    x0,top,x1,bottom = first_page.bbox # 获取页面边框
+    for word in extract_words(first_page):
+        word = SimpleNamespace(**word)
+        if word.size >= 14:
+            title.append(word.text)
+            title_bottom = word.bottom
+        elif word.text == "Abstract": # 获取页面abstract
+            top = word.top
+    user_info = [i["text"] for i in extract_words(first_page.within_bbox((x0,title_bottom,x1,top)))]
+    # 裁剪掉上半部分, within_bbox: full_included; crop: partial_included
+    return title, user_info, first_page.within_bbox((x0,top,x1,bottom))
+def get_column_cropped_pages(pages, two_column=True):
+    new_pages = []
+    for page in pages:
+        if two_column:
+            left = page.within_bbox((0, 0, page.width/2, page.height),relative=True)
+            right = page.within_bbox((page.width/2, 0, page.width, page.height), relative=True)
+            new_pages.append(left)
+            new_pages.append(right)
+        else:
+            new_pages.append(page)
+    return new_pages
+def parse_pdf(filename, two_column = True):
+    level = logging.getLogger().level
+    if level == logging.getLevelName("DEBUG"):
+        logging.getLogger().setLevel("INFO")
+    with pdfplumber.open(filename) as pdf:
+        title, user_info, first_page = get_title_with_cropped_page(pdf.pages[0])
+        new_pages = get_column_cropped_pages([first_page] + pdf.pages[1:], two_column)
+        chapters = []
+        # tuple (chapter_name, [pageid] (start,stop), chapter_text)
+        create_chapter = lambda page_start,name_top,name_bottom: SimpleNamespace(
+            name=[],
+            name_top=name_top,
+            name_bottom=name_bottom,
+            record_chapter_name = True,
+            page_start=page_start,
+            page_stop=None,
+            text=[],
+        )
+        cur_chapter = None
+        # 按页遍历PDF文档
+        for idx, page in enumerate(new_pages):
+            page = get_text_outside_table(page)
+            # 按行遍历页面文本
+            for word in extract_words(page):
+                word = SimpleNamespace(**word)
+                # 检查行文本是否以12号字体打印，如果是，则将其作为新章节开始
+                if word.size >= 11: # 出现chapter name
+                    if cur_chapter is None:
+                        cur_chapter = create_chapter(page.page_number, word.top, word.bottom)
+                    elif not cur_chapter.record_chapter_name or (cur_chapter.name_bottom != cur_chapter.name_bottom and cur_chapter.name_top != cur_chapter.name_top):
+                        # 不再继续写chapter name
+                        cur_chapter.page_stop = page.page_number # stop id
+                        chapters.append(cur_chapter)
+                        # 重置当前chapter信息
+                        cur_chapter = create_chapter(page.page_number, word.top, word.bottom)
+                    # print(word.size, word.top, word.bottom, word.text)
+                    cur_chapter.name.append(word.text)
+                else:
+                    cur_chapter.record_chapter_name = False # chapter name 结束
+                    cur_chapter.text.append(word.text)
+        else:
+            # 处理最后一个章节
+            cur_chapter.page_stop = page.page_number # stop id
+            chapters.append(cur_chapter)
+        for i in chapters:
+            logging.info(f"section: {i.name} pages:{i.page_start, i.page_stop} word-count:{len(i.text)}")
+            logging.debug(" ".join(i.text))
+    title = " ".join(title)
+    user_info = " ".join(user_info)
+    text = f"Article Title: {title}, Information:{user_info}\n"
+    for idx, chapter in enumerate(chapters):
+        chapter.name = " ".join(chapter.name)
+        text += f"The {idx}th Chapter {chapter.name}: " + " ".join(chapter.text) + "\n"
+    logging.getLogger().setLevel(level)
+    return Document(text=text, extra_info={"title": title})
+# Default list of key points a paper summary should cover.
+BASE_POINTS = """
+1. Who are the authors?
+2. What is the process of the proposed method?
+3. What is the performance of the proposed method? Please note down its performance metrics.
+4. What are the baseline models and their performances? Please note down these baseline methods.
+5. What dataset did this paper use?
+"""
+# Prompt template for part-by-part reading; the "{}" placeholder is
+# presumably filled with the key points (e.g. BASE_POINTS) -- verify against caller.
+READING_PROMPT = """
+You are a researcher helper bot. You can help the user with research paper reading and summarizing. \n
+Now I am going to send you a paper. You need to read it and summarize it for me part by part. \n
+When you are reading, You need to focus on these key points:{}
+"""
+# Variant of the reading prompt that also asks for a title and fixes the return format.
+# NOTE(review): the name is spelled "PROMT"; kept as-is because it is a public name.
+READING_PROMT_V2 = """
+You are a researcher helper bot. You can help the user with research paper reading and summarizing. \n
+Now I am going to send you a paper. You need to read it and summarize it for me part by part. \n
+When you are reading, You need to focus on these key points:{},
+And You need to generate a brief but informative title for this part.
+Your return format:
+- title: '...'
+- summary: '...'
+"""
+# Prompt used when the bot reads the collected summaries of a paper.
+SUMMARY_PROMPT = "You are a researcher helper bot. Now you need to read the summaries of a research paper."
+if __name__ == '__main__':
+    # Test code
+    z = parse_pdf("./build/test.pdf")
+    print(z["user_info"])
+    print(z["title"])

modules/utils.py CHANGED Viewed

@@ -24,11 +24,7 @@ from pygments.formatters import HtmlFormatter
 from modules.presets import *
 import modules.shared as shared
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s",
-)
 if TYPE_CHECKING:
     from typing import TypedDict
@@ -333,8 +329,7 @@ def reset_textbox():
 def reset_default():
     newurl = shared.state.reset_api_url()
-    os.environ.pop("HTTPS_PROXY", None)
-    os.environ.pop("https_proxy", None)
     return gr.update(value=newurl), gr.update(value=""), "API URL 和代理已重置"
@@ -346,6 +341,7 @@ def change_api_url(url):
 def change_proxy(proxy):
     os.environ["HTTPS_PROXY"] = proxy
     msg = f"代理更改为了{proxy}"
     logging.info(msg)
@@ -443,24 +439,6 @@ def transfer_input(inputs):
     )
-def get_proxies():
-    # 获取环境变量中的代理设置
-    http_proxy = os.environ.get("HTTP_PROXY") or os.environ.get("http_proxy")
-    https_proxy = os.environ.get("HTTPS_PROXY") or os.environ.get("https_proxy")
-    # 如果存在代理设置，使用它们
-    proxies = {}
-    if http_proxy:
-        logging.info(f"使用 HTTP 代理: {http_proxy}")
-        proxies["http"] = http_proxy
-    if https_proxy:
-        logging.info(f"使用 HTTPS 代理: {https_proxy}")
-        proxies["https"] = https_proxy
-    if proxies == {}:
-        proxies = None
-    return proxies
 def run(command, desc=None, errdesc=None, custom_env=None, live=False):
     if desc is not None:

 from modules.presets import *
 import modules.shared as shared
+from modules.config import retrieve_proxy
 if TYPE_CHECKING:
     from typing import TypedDict
 def reset_default():
     newurl = shared.state.reset_api_url()
+    retrieve_proxy("")
     return gr.update(value=newurl), gr.update(value=""), "API URL 和代理已重置"
 def change_proxy(proxy):
+    retrieve_proxy(proxy)
     os.environ["HTTPS_PROXY"] = proxy
     msg = f"代理更改为了{proxy}"
     logging.info(msg)
     )
 def run(command, desc=None, errdesc=None, custom_env=None, live=False):
     if desc is not None:

requirements.txt CHANGED Viewed

@@ -11,3 +11,4 @@ llama_index
 langchain
 markdown
 PyPDF2

 langchain
 markdown
 PyPDF2
+pdfplumber