xu song committed
Commit • b70508d • 1 Parent(s): 73119ac
update

- app.py  +3 -3
- models/cpp_qwen2.py  +14 -11
app.py CHANGED

@@ -48,7 +48,7 @@ with gr.Blocks() as demo:
         show_share_button=True,
         avatar_images=("assets/man.png", "assets/bot.png"))

-    gr.Textbox("For faster inference, you can build locally with ")
+    # gr.Textbox("For faster inference, you can build locally with ")
     # ss
     with gradio.Tab("Self Chat"):
         input_text_1 = gr.Textbox(show_label=False, placeholder="...", lines=10, visible=False)
@@ -64,7 +64,7 @@ with gr.Blocks() as demo:
             visible=True)

     # also called chat-assistant,
-    with gradio.Tab("Response Generator"):
+    with gradio.Tab("Response Generator", visible=False):
         with gr.Row():
             input_text_2 = gr.Textbox(show_label=False, placeholder="Please type your input", scale=7)
             generate_btn_2 = gr.Button("Send", variant="primary")
@@ -75,7 +75,7 @@ with gr.Blocks() as demo:
     gr.Markdown("Response simulator is the most commonly used chatbot.")

     #
-    with gradio.Tab("User Simulator"):
+    with gradio.Tab("User Simulator", visible=False):
         with gr.Row():
             input_text_3 = gr.Textbox(show_label=False, placeholder="Please type your response", scale=7)
             generate_btn_3 = gr.Button("Send", variant="primary")
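The app.py change comments out the unfinished build hint and hides the "Response Generator" and "User Simulator" tabs rather than deleting them, so their components and event wiring stay intact. A minimal sketch of the hidden-tab pattern, assuming a Gradio release where gr.Tab accepts visible= (the diff itself relies on this):

```python
import gradio as gr

# Minimal sketch: tabs built with visible=False remain in the layout
# but are not shown in the UI, so they can be re-enabled later without
# rebuilding the Blocks graph.
with gr.Blocks() as demo:
    with gr.Tab("Self Chat"):  # stays visible
        gr.Textbox(show_label=False, placeholder="...")
    with gr.Tab("Response Generator", visible=False):  # hidden
        gr.Textbox(show_label=False, placeholder="Please type your input")
    with gr.Tab("User Simulator", visible=False):  # hidden
        gr.Textbox(show_label=False, placeholder="Please type your response")

demo.launch()
```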
models/cpp_qwen2.py CHANGED

@@ -188,22 +188,18 @@ class Qwen2Simulator(Simulator):
         logger.info(
             f'finish_reason {stream["choices"][0]["finish_reason"]} with text: {stream["choices"][0]["text"]}')

-        #
-
-        # <|im_end|>\n
-        logger.info(f"before warmup: n_tokens = {self.llm.n_tokens}")
-        self.llm.eval([151645, 198] + suffix_tokens)  # increases n_tokens
-        logger.info(f"after warmup: n_tokens = {self.llm.n_tokens}")
+        #
+        self.post_cache(suffix_tokens)

     def pre_cache_system(self, system_list):
-        """
+        """ warmup for system prompt
         :param system_list:
         :return:
         """
         logger.info(f"cache size {self.llm.cache.cache_size}")
         for system_prompt in system_list:
-            logger.info(f"pre caching {system_prompt}")
-            input_ids = self.tokenize(f"<|im_start|>system{system_prompt}<|im_end|>\n<|im_start|>user\n")
+            logger.info(f"pre caching '{system_prompt}'")
+            input_ids = self.tokenize(f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n")
             output = self.llm.create_completion(
                 input_ids,
                 stream=False,
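Besides routing the finished generation into the new post_cache call, this hunk fixes a real template bug: the ChatML prefix was missing the newline after <|im_start|>system, so cached prefixes would never match properly formatted requests. The surrounding context shows the caching strategy: run one throwaway completion per system prompt so llama-cpp-python's LlamaRAMCache captures the prefix's KV state, then freeze the cache. A self-contained sketch of that strategy, with a hypothetical model path:

```python
import llama_cpp
from llama_cpp import Llama

# Sketch of the system-prompt pre-caching strategy; the model path is
# hypothetical, the ChatML template matches the diff above.
llm = Llama(model_path="qwen2-instruct.gguf", n_ctx=2048, verbose=False)
llm.set_cache(llama_cpp.LlamaRAMCache(capacity_bytes=2 << 30))

for system_prompt in ["You are a helpful assistant."]:
    prefix = f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n"
    # One throwaway completion stores the prefix's KV state in the RAM
    # cache; later requests that share the prefix reuse it.
    input_ids = llm.tokenize(prefix.encode("utf-8"), special=True)
    llm.create_completion(input_ids, max_tokens=1)

# Freeze the cache so ordinary traffic cannot evict the pre-cached
# system prompts: all further writes become no-ops, reads still work.
llama_cpp.LlamaRAMCache.__setitem__ = lambda *args: None
```

Patching __setitem__ to a no-op is blunt but effective: lookups still hit the pre-cached entries while new completions can no longer push them out.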
@@ -215,8 +211,15 @@ class Qwen2Simulator(Simulator):
         # disable cache after
         llama_cpp.LlamaRAMCache.__setitem__ = lambda *args: None

-    def
-
+    def post_cache(self, suffix_tokens):
+        """ warmup for next turn generation
+        :param suffix_tokens:
+        :return:
+        """
+        if suffix_tokens:
+            logger.info(f"before warmup: n_tokens = {self.llm.n_tokens}")
+            self.llm.eval([151645, 198] + suffix_tokens)  # <|im_end|>\n
+            logger.info(f"after warmup: n_tokens = {self.llm.n_tokens}")


 bot = Qwen2Simulator()
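The new post_cache completes the dangling def left by the parent commit and moves the inline warmup block behind an if suffix_tokens guard: as soon as a generation finishes, the end-of-turn marker plus the next turn's role prefix are evaluated eagerly, so the next request finds those tokens already in the context. A sketch under the same assumptions (hypothetical model path; per the diff, 151645 is <|im_end|> and 198 is "\n" in the Qwen2 vocabulary):

```python
from llama_cpp import Llama

# Hypothetical model path; token ids are Qwen2-specific (see diff).
llm = Llama(model_path="qwen2-instruct.gguf", n_ctx=2048, verbose=False)

IM_END, NEWLINE = 151645, 198  # <|im_end|> and "\n" in the Qwen2 vocab

def post_cache(llm: Llama, suffix_tokens: list[int]) -> None:
    """Eagerly evaluate the end-of-turn marker plus the next turn's
    prefix so the follow-up request starts from a longer llm.n_tokens."""
    if suffix_tokens:
        llm.eval([IM_END, NEWLINE] + suffix_tokens)

# e.g. pre-evaluate "<|im_start|>user\n" right after the assistant's turn:
user_prefix = llm.tokenize(b"<|im_start|>user\n", add_bos=False, special=True)
post_cache(llm, user_prefix)
```

llm.eval advances the context without sampling, so the cost of the fixed chat-template suffix is paid between turns instead of adding latency at the start of the next one.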