xu song committed
Commit
f0929ee
1 Parent(s): 12a161c
Files changed (3)
  1. app.py +45 -38
  2. app_util.py +5 -3
  3. models/cpp_qwen2.py +10 -0
app.py CHANGED
@@ -12,7 +12,6 @@ system_list = [
     "你是一个心理咨询师。",
 ]
 
-
 doc = """\
 There are mainly two types of user simulator:
 - prompt-based user-simulator (role-play)
@@ -24,48 +23,55 @@ with gr.Blocks() as demo:
     gr.HTML("""<h1 align="center">Distilling the Knowledge through Self Chatting</h1>""")
     gr.Markdown(doc, visible=False)
     with gr.Row():
-        gr.Dropdown(
-            ["Qwen2-0.5B-Instruct", "llama3.1", "gemini"],
-            value="Qwen2-0.5B-Instruct",
-            label="model",
-            interactive=True,
-            scale=1,
-            visible=False
-        )
-        system = gr.Dropdown(
-            choices=system_list,
-            value=system_list[0],
-            allow_custom_value=True,
-            interactive=True,
-            label="System message",
-            scale=5,
-        )
-
-    chatbot = gr.Chatbot(avatar_images=("assets/man.png", "assets/bot.png"))
-    with gr.Row():
-        with gr.Column():
-            slider_max_tokens = gr.Slider(minimum=1, maximum=config.MAX_SEQUENCE_LENGTH,
-                                          value=config.DEFAULT_MAX_TOKENS, step=1, label="Max tokens")
-            slider_temperature = gr.Slider(minimum=0.1, maximum=10.0,
-                                           value=config.DEFAULT_TEMPERATURE, step=0.1, label="Temperature",
-                                           info="Larger temperature increases the randomness")
-            slider_top_p = gr.Slider(
-                minimum=0.1,
-                maximum=1.0,
-                value=config.DEFAULT_TOP_P,
-                step=0.05,
-                label="Top-p (nucleus sampling)",
+        with gr.Column(scale=5):
+            system = gr.Dropdown(
+                choices=system_list,
+                value=system_list[0],
+                allow_custom_value=True,
+                interactive=True,
+                label="System message",
+                scale=5,
             )
 
-        with gr.Column(scale=4):
+            chatbot = gr.Chatbot(avatar_images=("assets/man.png", "assets/bot.png"))
+
             generated_text = gr.Textbox(show_label=False, placeholder="...", lines=10, visible=False)
             generate_btn = gr.Button("🤔️ Generate", variant="primary")
             with gr.Row():
-                retry_btn = gr.Button("🔄 Regenerate")
-                undo_btn = gr.Button("↩️ Undo")
-                clear_btn = gr.Button("🗑️ Clear")  # 🧹 Clear History (清除历史)
-                # stop_btn = gr.Button("停止生成", variant="primary", visible=False)
+                retry_btn = gr.Button("🔄 Regenerate", variant="secondary", size="sm")
+                undo_btn = gr.Button("↩️ Undo", variant="secondary", size="sm")
+                clear_btn = gr.Button("🗑️ Clear", variant="secondary", size="sm")  # 🧹 Clear History (清除历史)
+                # stop_btn = gr.Button("停止生成", variant="stop", visible=False)
 
+        with gr.Column(variant="compact"):
+        # with gr.Column():
+            model = gr.Dropdown(
+                ["Qwen2-0.5B-Instruct", "llama3.1", "gemini"],
+                value="Qwen2-0.5B-Instruct",
+                label="Model",
+                interactive=True,
+                # visible=False
+            )
+            with gr.Accordion(label="Parameters", open=True):
+                slider_max_tokens = gr.Slider(minimum=1, maximum=config.MAX_SEQUENCE_LENGTH,
+                                              value=config.DEFAULT_MAX_TOKENS, step=1, label="Max tokens")
+                slider_temperature = gr.Slider(minimum=0.1, maximum=10.0,
+                                               value=config.DEFAULT_TEMPERATURE, step=0.1, label="Temperature",
+                                               info="Larger temperature increases the randomness")
+                slider_top_p = gr.Slider(
+                    minimum=0.1,
+                    maximum=1.0,
+                    value=config.DEFAULT_TOP_P,
+                    step=0.05,
+                    label="Top-p (nucleus sampling)",
+                )
+                slider_top_k = gr.Slider(
+                    minimum=1,
+                    maximum=200,
+                    value=config.DEFAULT_TOP_K,
+                    step=1,
+                    label="Top-k",
+                )
 
 
     ########
@@ -81,8 +87,9 @@ with gr.Blocks() as demo:
     undo_btn.click(undo_generate, [chatbot, history], outputs=[generated_text, chatbot, history])
 
     slider_max_tokens.change(set_max_tokens, inputs=[slider_max_tokens])
-    slider_top_p.change(set_top_p, inputs=[slider_top_p])
    slider_temperature.change(set_temperature, inputs=[slider_temperature])
+    slider_top_p.change(set_top_p, inputs=[slider_top_p])
+    slider_top_k.change(set_top_k, inputs=[slider_top_k])
 
 # demo.queue().launch(share=False, server_name="0.0.0.0")
 # demo.queue().launch(concurrency_count=1, max_size=5)
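The reworked layout wires each sampling slider to a small setter from app_util.py through Gradio's `.change` event. A minimal, self-contained sketch of that pattern, assuming a module-level `generation_kwargs` dict standing in for `bot.generation_kwargs` (values are illustrative):

```python
import gradio as gr

# Stands in for bot.generation_kwargs from app_util.py.
generation_kwargs = {"max_tokens": 256, "temperature": 0.8, "top_p": 0.9, "top_k": 40}

def set_top_k(top_k):
    # Mutate the shared dict; the next generation call picks up the new value.
    generation_kwargs["top_k"] = top_k

with gr.Blocks() as demo:
    slider_top_k = gr.Slider(minimum=1, maximum=200, value=40, step=1, label="Top-k")
    # .change fires whenever the slider value changes; no outputs are needed
    # because the setter only updates server-side state.
    slider_top_k.change(set_top_k, inputs=[slider_top_k])

# demo.launch()
```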
app_util.py CHANGED
@@ -98,10 +98,12 @@ def reset_state(system):
 def set_max_tokens(max_tokens):
     bot.generation_kwargs["max_tokens"] = max_tokens
 
-
+def set_temperature(temperature):
+    bot.generation_kwargs["temperature"] = temperature
 def set_top_p(top_p):
     bot.generation_kwargs["top_p"] = top_p
 
-def set_temperature(temperature):
-    bot.generation_kwargs["temperature"] = temperature
+def set_top_k(top_k):
+    bot.generation_kwargs["top_k"] = top_k
+
 
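These setters only mutate the shared `bot.generation_kwargs` dict; the call that consumes it is not part of this diff. A hedged sketch of the likely consumer: llama-cpp-python's `create_chat_completion` accepts `max_tokens`, `temperature`, `top_p`, and `top_k`, so the slider-maintained dict can be splatted straight in (the `llm` construction and messages below are illustrative, not the repo's actual call site):

```python
from llama_cpp import Llama

llm = Llama.from_pretrained(  # same model as in models/cpp_qwen2.py
    repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
    filename="*fp16.gguf",
    verbose=False,
)
generation_kwargs = {"max_tokens": 256, "temperature": 0.8, "top_p": 0.9, "top_k": 40}

output = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "你是一个心理咨询师。"},  # "You are a psychological counselor."
        {"role": "user", "content": "..."},
    ],
    **generation_kwargs,  # forwarded sampling parameters
)
print(output["choices"][0]["message"]["content"])
```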
models/cpp_qwen2.py CHANGED
@@ -13,6 +13,13 @@ python convert_hf_to_gguf.py /workspace/xusong/huggingface/models/Qwen1.5-0.5B-C
 
 ./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -p "You are a helpful assistant" -cnv
 
+
+## reference
+
+- https://github.com/langchain-ai/langchain/blob/master/libs/community/langchain_community/llms/llamacpp.py
+- https://github.com/abetlen/llama-cpp-python/blob/main/examples/gradio_chat/server.py
+- https://github.com/abetlen/llama-cpp-python/blob/main/llama_cpp/server/app.py
+
 """
 
 import json
@@ -37,6 +44,7 @@ class Qwen2Simulator(Simulator):
                 tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer(self.hf_tokenizer),
                 n_ctx=config.MAX_SEQUENCE_LENGTH,  #
                 # n_threads=None,  # by default, n_threads is set from the CPU count
+                use_mlock=True,
                 verbose=False,
             )
         else:
@@ -45,6 +53,8 @@ class Qwen2Simulator(Simulator):
                 repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
                 filename="*fp16.gguf",
                 tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer(self.hf_tokenizer),
+                n_ctx=config.MAX_SEQUENCE_LENGTH,
+                use_mlock=True,
                 verbose=False,
             )
         logger.info(f"llm has been initialized: {self.llm}, n_threads={self.llm.n_threads}, n_ctx={self.llm.n_ctx}")
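Both hunks add `use_mlock=True`, which asks the OS to lock the model's weights in RAM via mlock(2) so they are not swapped out; the second hunk also passes `n_ctx` explicitly so the hub-downloaded branch does not fall back to the library's small default context. A minimal sketch of the equivalent constructor call, assuming the same flags (the model path is hypothetical and `2048` stands in for `config.MAX_SEQUENCE_LENGTH`):

```python
from llama_cpp import Llama

llm = Llama(
    model_path="Qwen1.5-0.5B-Chat-F16.gguf",  # hypothetical local GGUF path
    n_ctx=2048,      # context window; stands in for config.MAX_SEQUENCE_LENGTH
    use_mlock=True,  # lock weights in RAM so the OS cannot swap them out
    verbose=False,
)
```

On memory-constrained hosts, locking more memory than the system allows can fail; `use_mlock` trades startup flexibility for steadier latency once the model is resident.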