LLMChat

Running

App Files Files Community

JohnSmith9982 committed on Mar 28, 2023

Commit

44846b2

•

1 Parent(s): 8ad9e26

Upload 7 files

Browse files

Files changed (2) hide show

modules/chat_func.py +2 -2
modules/llama_func.py +42 -38

modules/chat_func.py CHANGED Viewed

@@ -155,7 +155,7 @@ def stream_predict(
     yield get_return_value()
     error_json_str = ""
-    for chunk in response.iter_lines():
         if counter == 0:
             counter += 1
             continue
@@ -272,7 +272,7 @@ def predict(
     if reply_language == "跟随问题语言（不稳定）":
         reply_language = "the same language as the question, such as English, 中文, 日本語, Español, Français, or Deutsch."
     if files:
-        msg = "构建索引中……（这可能需要比较久的时间）"
         logging.info(msg)
         yield chatbot+[(inputs, "")], history, msg, all_token_counts
         index = construct_index(openai_api_key, file_src=files)

     yield get_return_value()
     error_json_str = ""
+    for chunk in tqdm(response.iter_lines()):
         if counter == 0:
             counter += 1
             continue
     if reply_language == "跟随问题语言（不稳定）":
         reply_language = "the same language as the question, such as English, 中文, 日本語, Español, Français, or Deutsch."
     if files:
+        msg = "加载索引中……（这可能需要几分钟）"
         logging.info(msg)
         yield chatbot+[(inputs, "")], history, msg, all_token_counts
         index = construct_index(openai_api_key, file_src=files)

modules/llama_func.py CHANGED Viewed

@@ -13,54 +13,57 @@ from llama_index import (
 from langchain.llms import OpenAI
 import colorama
 from modules.presets import *
 from modules.utils import *
 def get_documents(file_src):
     documents = []
-    index_name = ""
     logging.debug("Loading documents...")
     logging.debug(f"file_src: {file_src}")
     for file in file_src:
-        logging.debug(f"file: {file.name}")
-        index_name += file.name
         if os.path.splitext(file.name)[1] == ".pdf":
             logging.debug("Loading PDF...")
             CJKPDFReader = download_loader("CJKPDFReader")
             loader = CJKPDFReader()
-            documents += loader.load_data(file=file.name)
         elif os.path.splitext(file.name)[1] == ".docx":
             logging.debug("Loading DOCX...")
             DocxReader = download_loader("DocxReader")
             loader = DocxReader()
-            documents += loader.load_data(file=file.name)
         elif os.path.splitext(file.name)[1] == ".epub":
             logging.debug("Loading EPUB...")
             EpubReader = download_loader("EpubReader")
             loader = EpubReader()
-            documents += loader.load_data(file=file.name)
         else:
             logging.debug("Loading text file...")
             with open(file.name, "r", encoding="utf-8") as f:
-                text = add_space(f.read())
-                documents += [Document(text)]
-    index_name = sha1sum(index_name)
-    return documents, index_name
 def construct_index(
-    api_key,
-    file_src,
-    max_input_size=4096,
-    num_outputs=1,
-    max_chunk_overlap=20,
-    chunk_size_limit=600,
-    embedding_limit=None,
-    separator=" ",
-    num_children=10,
-    max_keywords_per_chunk=10,
 ):
     os.environ["OPENAI_API_KEY"] = api_key
     chunk_size_limit = None if chunk_size_limit == 0 else chunk_size_limit
@@ -78,12 +81,13 @@ def construct_index(
         chunk_size_limit,
         separator=separator,
     )
-    documents, index_name = get_documents(file_src)
     if os.path.exists(f"./index/{index_name}.json"):
         logging.info("找到了缓存的索引文件，加载中……")
         return GPTSimpleVectorIndex.load_from_disk(f"./index/{index_name}.json")
     else:
         try:
             logging.debug("构建索引中……")
             index = GPTSimpleVectorIndex(
                 documents, llm_predictor=llm_predictor, prompt_helper=prompt_helper
@@ -97,12 +101,12 @@ def construct_index(
 def chat_ai(
-    api_key,
-    index,
-    question,
-    context,
-    chatbot,
-    reply_language,
 ):
     os.environ["OPENAI_API_KEY"] = api_key
@@ -133,15 +137,15 @@ def chat_ai(
 def ask_ai(
-    api_key,
-    index,
-    question,
-    prompt_tmpl,
-    refine_tmpl,
-    sim_k=1,
-    temprature=0,
-    prefix_messages=[],
-    reply_language="中文",
 ):
     os.environ["OPENAI_API_KEY"] = api_key
@@ -174,7 +178,7 @@ def ask_ai(
         for index, node in enumerate(response.source_nodes):
             brief = node.source_text[:25].replace("\n", "")
             nodes.append(
-                f"<details><summary>[{index+1}]\t{brief}...</summary><p>{node.source_text}</p></details>"
             )
         new_response = ret_text + "\n----------\n" + "\n\n".join(nodes)
         logging.info(

 from langchain.llms import OpenAI
 import colorama
 from modules.presets import *
 from modules.utils import *
+def get_index_name(file_src):
+    index_name = ""
+    for file in file_src:
+        index_name += os.path.basename(file.name)
+    index_name = sha1sum(index_name)
+    return index_name
 def get_documents(file_src):
     documents = []
     logging.debug("Loading documents...")
     logging.debug(f"file_src: {file_src}")
     for file in file_src:
+        logging.info(f"loading file: {file.name}")
         if os.path.splitext(file.name)[1] == ".pdf":
             logging.debug("Loading PDF...")
             CJKPDFReader = download_loader("CJKPDFReader")
             loader = CJKPDFReader()
+            text_raw = loader.load_data(file=file.name)[0].text
         elif os.path.splitext(file.name)[1] == ".docx":
             logging.debug("Loading DOCX...")
             DocxReader = download_loader("DocxReader")
             loader = DocxReader()
+            text_raw = loader.load_data(file=file.name)[0].text
         elif os.path.splitext(file.name)[1] == ".epub":
             logging.debug("Loading EPUB...")
             EpubReader = download_loader("EpubReader")
             loader = EpubReader()
+            text_raw = loader.load_data(file=file.name)[0].text
         else:
             logging.debug("Loading text file...")
             with open(file.name, "r", encoding="utf-8") as f:
+                text_raw = f.read()
+        text = add_space(text_raw)
+        documents += [Document(text)]
+    return documents
 def construct_index(
+        api_key,
+        file_src,
+        max_input_size=4096,
+        num_outputs=1,
+        max_chunk_overlap=20,
+        chunk_size_limit=600,
+        embedding_limit=None,
+        separator=" ",
+        num_children=10,
+        max_keywords_per_chunk=10,
 ):
     os.environ["OPENAI_API_KEY"] = api_key
     chunk_size_limit = None if chunk_size_limit == 0 else chunk_size_limit
         chunk_size_limit,
         separator=separator,
     )
+    index_name = get_index_name(file_src)
     if os.path.exists(f"./index/{index_name}.json"):
         logging.info("找到了缓存的索引文件，加载中……")
         return GPTSimpleVectorIndex.load_from_disk(f"./index/{index_name}.json")
     else:
         try:
+            documents = get_documents(file_src)
             logging.debug("构建索引中……")
             index = GPTSimpleVectorIndex(
                 documents, llm_predictor=llm_predictor, prompt_helper=prompt_helper
 def chat_ai(
+        api_key,
+        index,
+        question,
+        context,
+        chatbot,
+        reply_language,
 ):
     os.environ["OPENAI_API_KEY"] = api_key
 def ask_ai(
+        api_key,
+        index,
+        question,
+        prompt_tmpl,
+        refine_tmpl,
+        sim_k=1,
+        temprature=0,
+        prefix_messages=[],
+        reply_language="中文",
 ):
     os.environ["OPENAI_API_KEY"] = api_key
         for index, node in enumerate(response.source_nodes):
             brief = node.source_text[:25].replace("\n", "")
             nodes.append(
+                f"<details><summary>[{index + 1}]\t{brief}...</summary><p>{node.source_text}</p></details>"
             )
         new_response = ret_text + "\n----------\n" + "\n\n".join(nodes)
         logging.info(