Tuchuanhuhuhu committed · Commit 6a2dc28 · 1 parent: d708c00
ChatGLM可以用了 (ChatGLM is now working)

Files changed:
- ChuanhuChatbot.py +2 -2
- modules/base_model.py +18 -6
- modules/models.py +136 -21
- modules/presets.py +3 -0
- requirements.txt +4 -0
ChuanhuChatbot.py
CHANGED
@@ -22,7 +22,7 @@ with gr.Blocks(css=customCSS, theme=small_and_beautiful_theme) as demo:
     user_name = gr.State("")
     promptTemplates = gr.State(load_template(get_template_names(plain=True)[0], mode=2))
     user_question = gr.State("")
-    current_model = gr.State(get_model(MODELS[
+    current_model = gr.State(get_model(MODELS[DEFAULT_MODEL], my_api_key)[0])

     topic = gr.State("未命名对话历史记录")

@@ -78,7 +78,7 @@ with gr.Blocks(css=customCSS, theme=small_and_beautiful_theme) as demo:
             else:
                 usageTxt = gr.Markdown("**发送消息** 或 **提交key** 以显示额度", elem_id="usage_display")
             model_select_dropdown = gr.Dropdown(
-                label="选择模型", choices=MODELS, multiselect=False, value=MODELS[
+                label="选择模型", choices=MODELS, multiselect=False, value=MODELS[DEFAULT_MODEL], interactive=True
             )
             use_streaming_checkbox = gr.Checkbox(
                 label="实时传输回答", value=True, visible=ENABLE_STREAMING_OPTION
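For orientation, the new DEFAULT_MODEL index (defined in modules/presets.py below) now drives both the initial current_model state and the dropdown's default value. A minimal, self-contained sketch of that wiring, with MODELS shortened and the get_model call left out since it needs an API key and downloaded weights:

import gradio as gr

MODELS = ["gpt-3.5-turbo", "chatglm-6b", "chatglm-6b-int4"]   # shortened for illustration
DEFAULT_MODEL = 0   # index into MODELS, counted from 0

with gr.Blocks() as demo:
    # in the app the state holds the model object: get_model(MODELS[DEFAULT_MODEL], my_api_key)[0]
    current_model = gr.State(MODELS[DEFAULT_MODEL])
    model_select_dropdown = gr.Dropdown(
        label="选择模型", choices=MODELS, multiselect=False,
        value=MODELS[DEFAULT_MODEL], interactive=True,
    )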
modules/base_model.py
CHANGED
@@ -33,7 +33,7 @@ class ModelType(Enum):
         model_type = None
         if "gpt" in model_name.lower():
             model_type = ModelType.OpenAI
-        elif "chatglm" in model_name.
+        elif "chatglm" in model_name.lower():
             model_type = ModelType.ChatGLM
         else:
             model_type = ModelType.LLaMA
@@ -59,7 +59,10 @@ class BaseLLMModel:
         self.all_token_counts = []
         self.model_name = model_name
         self.model_type = ModelType.get_type(model_name)
-
+        try:
+            self.token_upper_limit = MODEL_TOKEN_LIMIT[model_name]
+        except KeyError:
+            self.token_upper_limit = DEFAULT_TOKEN_LIMIT
         self.interrupted = False
         self.system_prompt = system_prompt
         self.api_key = None
@@ -79,7 +82,9 @@ class BaseLLMModel:
         conversations are stored in self.history, with the most recent question, in OpenAI format
         should return a generator, each time give the next word (str) in the answer
         """
-
+        logging.warning("stream predict not implemented, using at once predict instead")
+        response, _ = self.get_answer_at_once()
+        yield response

     def get_answer_at_once(self):
         """predict at once, need to be implemented
@@ -88,15 +93,22 @@ class BaseLLMModel:
         the answer (str)
         total token count (int)
         """
-
+        logging.warning("at once predict not implemented, using stream predict instead")
+        response_iter = self.get_answer_stream_iter()
+        count = 0
+        for response in response_iter:
+            count += 1
+        return response, sum(self.all_token_counts) + count

     def billing_info(self):
         """get billing infomation, inplement if needed"""
+        logging.warning("billing info not implemented, using default")
         return BILLING_NOT_APPLICABLE_MSG

     def count_token(self, user_input):
         """get token count from input, implement if needed"""
-
+        logging.warning("token count not implemented, using default")
+        return len(user_input)

     def stream_next_chatbot(self, inputs, chatbot, fake_input=None, display_append=""):
         def get_return_value():
@@ -234,7 +246,7 @@ class BaseLLMModel:
         else:
             display_reference = ""

-        if len(self.api_key) == 0 and not shared.state.multi_api_key:
+        if self.api_key is not None and len(self.api_key) == 0 and not shared.state.multi_api_key:
             status_text = STANDARD_ERROR_MSG + NO_APIKEY_MSG
             logging.info(status_text)
             chatbot.append((inputs, ""))
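The interesting addition here is the pair of reciprocal defaults: a subclass only has to implement one of get_answer_at_once / get_answer_stream_iter, and the base class synthesizes the other (streaming collapses into a single chunk, and the at-once path drains the stream). A minimal sketch of the pattern, with a toy EchoModel subclass that is purely illustrative:

import logging

class BaseLLMModel:
    def __init__(self, model_name):
        self.model_name = model_name
        self.all_token_counts = []

    def get_answer_stream_iter(self):
        # fallback: emulate streaming with one chunk from the at-once API
        logging.warning("stream predict not implemented, using at once predict instead")
        response, _ = self.get_answer_at_once()
        yield response

    def get_answer_at_once(self):
        # fallback: drain the stream, keep the last chunk and count the iterations
        logging.warning("at once predict not implemented, using stream predict instead")
        count = 0
        for response in self.get_answer_stream_iter():
            count += 1
        return response, sum(self.all_token_counts) + count

class EchoModel(BaseLLMModel):   # illustrative subclass only
    def get_answer_at_once(self):
        return f"echo from {self.model_name}", 1

print(list(EchoModel("toy").get_answer_stream_iter()))   # ['echo from toy']

A subclass must still override at least one of the two methods, otherwise the pair would recurse into each other indefinitely.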
modules/models.py
CHANGED
@@ -10,6 +10,14 @@ import requests
 import urllib3
 import platform

+from dataclasses import dataclass, field
+from transformers import HfArgumentParser
+
+from lmflow.datasets.dataset import Dataset
+from lmflow.pipeline.auto_pipeline import AutoPipeline
+from lmflow.models.auto_model import AutoModel
+from lmflow.args import ModelArguments, DatasetArguments, AutoArguments
+
 from tqdm import tqdm
 import colorama
 from duckduckgo_search import ddg
@@ -213,27 +221,39 @@ class ChatGLM_Client(BaseLLMModel):
         else:
             model_source = f"THUDM/{model_name}"
         self.tokenizer = AutoTokenizer.from_pretrained(model_source, trust_remote_code=True)
+        quantified = False
+        if "int4" in model_name:
+            quantified = True
+        if quantified:
+            model = AutoModel.from_pretrained(model_source, trust_remote_code=True).float()
+        else:
+            model = AutoModel.from_pretrained(model_source, trust_remote_code=True).half()
         if torch.cuda.is_available():
             # run on CUDA
-
-
-
-
+            logging.info("CUDA is available, using CUDA")
+            model = model.cuda()
+        # mps加速还存在一些问题,暂时不使用
+        # elif system_name == "Darwin" and model_path is not None:
+        #     logging.info("Running on macOS, using MPS")
+        #     # running on macOS and model already downloaded
+        #     model = model.to('mps')
         else:
-
-            model = AutoModel.from_pretrained(model_source, trust_remote_code=True).float()
+            logging.info("GPU is not available, using CPU")
         model = model.eval()
         self.model = model

     def _get_glm_style_input(self):
         history = [x["content"] for x in self.history]
         query = history.pop()
+        logging.info(colorama.Fore.YELLOW + f"{history}" + colorama.Fore.RESET)
+        assert len(history) % 2 == 0
+        history = [[history[i], history[i+1]] for i in range(0, len(history), 2)]
         return history, query

     def get_answer_at_once(self):
         history, query = self._get_glm_style_input()
         response, _ = self.model.chat(self.tokenizer, query, history=history)
-        return response
+        return response, len(response)

     def get_answer_stream_iter(self):
         history, query = self._get_glm_style_input()
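A note on the _get_glm_style_input change above: ChatGLM's chat and stream_chat APIs take history as a list of [user, assistant] pairs, while BaseLLMModel keeps OpenAI-style message dicts, so the method now pops the newest question and folds the remaining messages into pairs. A standalone sketch of that conversion, using a made-up sample history:

# OpenAI-style history: alternating user/assistant messages, newest question last
history = [
    {"role": "user", "content": "巴黎是中国的首都吗?"},
    {"role": "assistant", "content": "不是,巴黎是法国的首都。"},
    {"role": "user", "content": "我刚刚问了你什么问题?"},
]

contents = [x["content"] for x in history]
query = contents.pop()                     # newest user question, sent separately
assert len(contents) % 2 == 0              # remaining messages must pair up user/assistant
pairs = [[contents[i], contents[i + 1]] for i in range(0, len(contents), 2)]

print(query)   # 我刚刚问了你什么问题?
print(pairs)   # [['巴黎是中国的首都吗?', '不是,巴黎是法国的首都。']]
# response, _ = model.chat(tokenizer, query, history=pairs)   # how ChatGLM consumes it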
@@ -241,6 +261,100 @@ class ChatGLM_Client(BaseLLMModel):
                                                temperature=self.temperature):
             yield response

+@dataclass
+class ChatbotArguments:
+    pass
+
+class LLaMA_Client(BaseLLMModel):
+    def __init__(
+        self,
+        model_name,
+        lora_path = None,
+    ) -> None:
+        super().__init__(
+            model_name=model_name
+        )
+        self.max_generation_token = 1000
+        pipeline_name = "inferencer"
+        PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name)
+
+        parser = HfArgumentParser((
+            ModelArguments,
+            PipelineArguments,
+            ChatbotArguments,
+        ))
+        model_args, pipeline_args, chatbot_args = (
+            parser.parse_args_into_dataclasses()
+        )
+
+        with open (pipeline_args.deepspeed, "r") as f:
+            ds_config = json.load(f)
+
+        self.model = AutoModel.get_model(
+            model_args,
+            tune_strategy='none',
+            ds_config=ds_config,
+        )
+
+        # We don't need input data, we will read interactively from stdin
+        data_args = DatasetArguments(dataset_path=None)
+        self.dataset = Dataset(data_args)
+
+        self.inferencer = AutoPipeline.get_pipeline(
+            pipeline_name=pipeline_name,
+            model_args=model_args,
+            data_args=data_args,
+            pipeline_args=pipeline_args,
+        )
+
+        # Chats
+        model_name = model_args.model_name_or_path
+        if model_args.lora_model_path is not None:
+            model_name += f" + {model_args.lora_model_path}"
+
+        # context = (
+        #     "You are a helpful assistant who follows the given instructions"
+        #     " unconditionally."
+        # )
+        self.end_string = "\n\n"
+
+    def _get_llama_style_input(self):
+        history = [x["content"] for x in self.history]
+        context = "\n".join(history)
+        return context
+
+
+    def get_answer_at_once(self):
+        context = self._get_llama_style_input()
+
+        input_dataset = self.dataset.from_dict({
+            "type": "text_only",
+            "instances": [ { "text": context } ]
+        })
+
+        output_dataset = self.inferencer.inference(
+            model=self.model,
+            dataset=input_dataset,
+            max_new_tokens=self.max_generation_token,
+            temperature=self.temperature,
+        )
+
+        response = output_dataset.to_dict()["instances"][0]["text"]
+
+        try:
+            index = response.index(self.end_string)
+        except ValueError:
+            response += self.end_string
+            index = response.index(self.end_string)
+
+        response = response[:index + 1]
+        return response, len(response)
+
+    def get_answer_stream_iter(self):
+        response, _ = self.get_answer_at_once()
+        yield response
+
+

 def get_model(
     model_name, access_key=None, temperature=None, top_p=None, system_prompt=None
@@ -248,7 +362,7 @@ def get_model(
     msg = f"模型设置为了: {model_name}"
     logging.info(msg)
     model_type = ModelType.get_type(model_name)
-
+    print(model_type.name)
     if model_type == ModelType.OpenAI:
         model = OpenAIClient(
             model_name=model_name,
@@ -265,29 +379,30 @@ def get_model(
 if __name__ == "__main__":
     with open("config.json", "r") as f:
         openai_api_key = cjson.load(f)["openai_api_key"]
-    client =
+    # client, _ = get_model("gpt-3.5-turbo", openai_api_key)
+    client, _ = get_model("chatglm-6b-int4")
     chatbot = []
-    stream =
+    stream = True
     # 测试账单功能
-
-
+    logging.info(colorama.Back.GREEN + "测试账单功能" + colorama.Back.RESET)
+    logging.info(client.billing_info())
     # 测试问答
-
+    logging.info(colorama.Back.GREEN + "测试问答" + colorama.Back.RESET)
     question = "巴黎是中国的首都吗?"
     for i in client.predict(inputs=question, chatbot=chatbot, stream=stream):
-
-
+        logging.info(i)
+    logging.info(f"测试问答后history : {client.history}")
     # 测试记忆力
-
+    logging.info(colorama.Back.GREEN + "测试记忆力" + colorama.Back.RESET)
     question = "我刚刚问了你什么问题?"
     for i in client.predict(inputs=question, chatbot=chatbot, stream=stream):
-
-
+        logging.info(i)
+    logging.info(f"测试记忆力后history : {client.history}")
     # 测试重试功能
-
+    logging.info(colorama.Back.GREEN + "测试重试功能" + colorama.Back.RESET)
     for i in client.retry(chatbot=chatbot, stream=stream):
-
-
+        logging.info(i)
+    logging.info(f"重试后history : {client.history}")
     # # 测试总结功能
     # print(colorama.Back.GREEN + "测试总结功能" + colorama.Back.RESET)
     # chatbot, msg = client.reduce_token_size(chatbot=chatbot)
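A note on LLaMA_Client.get_answer_at_once above: LMFlow's inferencer returns the raw completion, so the method cuts it at the first end_string ("\n\n") to keep only the first generated turn, appending the marker first when it is absent so the slice always succeeds. A self-contained sketch of that post-processing, with made-up sample completions:

end_string = "\n\n"

def trim_response(response: str) -> str:
    # cut at the first blank line, keeping the newline that precedes it
    try:
        index = response.index(end_string)
    except ValueError:
        response += end_string
        index = response.index(end_string)
    return response[:index + 1]

print(repr(trim_response("巴黎是法国的首都。\n\nUser: ...")))   # '巴黎是法国的首都。\n'
print(repr(trim_response("没有换行的回答")))                     # '没有换行的回答\n'

Note also that the constructor reads ModelArguments and PipelineArguments from the process command line via HfArgumentParser.parse_args_into_dataclasses, and opens pipeline_args.deepspeed, so running the LLaMA path requires launching with LMFlow-style CLI flags including a DeepSpeed config path.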
modules/presets.py
CHANGED
@@ -62,6 +62,8 @@ MODELS = [
     "chatglm-6b-int4-qe"
 ] # 可选的模型

+DEFAULT_MODEL = 0 # 默认的模型在MODELS中的序号,从0开始数
+
 MODEL_TOKEN_LIMIT = {
     "gpt-3.5-turbo": 4096,
     "gpt-3.5-turbo-0301": 4096,
@@ -72,6 +74,7 @@ MODEL_TOKEN_LIMIT = {
 }

 TOKEN_OFFSET = 1000 # 模型的token上限减去这个值,得到软上限。到达软上限之后,自动尝试减少token占用。
+DEFAULT_TOKEN_LIMIT = 3000 # 默认的token上限
 REDUCE_TOKEN_FACTOR = 0.5 # 与模型token上限想乘,得到目标token数。减少token占用时,将token占用减少到目标token数以下。

 REPLY_LANGUAGES = [
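DEFAULT_MODEL selects the start-up entry of MODELS by index, and DEFAULT_TOKEN_LIMIT is the hard-limit fallback that BaseLLMModel now uses for models missing from MODEL_TOKEN_LIMIT, such as the ChatGLM checkpoints. A minimal sketch of the lookup, with the MODELS list and token table shortened for illustration:

MODELS = ["gpt-3.5-turbo", "chatglm-6b", "chatglm-6b-int4"]   # shortened for illustration
DEFAULT_MODEL = 0            # index of the default model in MODELS, counted from 0
MODEL_TOKEN_LIMIT = {"gpt-3.5-turbo": 4096}
DEFAULT_TOKEN_LIMIT = 3000   # fallback hard limit for unlisted models

def token_upper_limit(model_name: str) -> int:
    # mirrors the try/except added to BaseLLMModel.__init__
    return MODEL_TOKEN_LIMIT.get(model_name, DEFAULT_TOKEN_LIMIT)

print(MODELS[DEFAULT_MODEL])              # gpt-3.5-turbo
print(token_upper_limit("chatglm-6b"))    # 3000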
requirements.txt
CHANGED
@@ -15,3 +15,7 @@ pdfplumber
 pandas
 transformers
 torch
+mpi4py
+icetk
+git+https://github.com/OptimalScale/LMFlow.git
+cpm-kernels