xu song committed on
Commit
b70508d
1 Parent(s): 73119ac
Files changed (2)
  1. app.py +3 -3
  2. models/cpp_qwen2.py +14 -11
app.py CHANGED
@@ -48,7 +48,7 @@ with gr.Blocks() as demo:
         show_share_button=True,
         avatar_images=("assets/man.png", "assets/bot.png"))
 
-    gr.Textbox("For faster inference, you can build locally with ")
+    # gr.Textbox("For faster inference, you can build locally with ")
     # ss
     with gradio.Tab("Self Chat"):
         input_text_1 = gr.Textbox(show_label=False, placeholder="...", lines=10, visible=False)
@@ -64,7 +64,7 @@ with gr.Blocks() as demo:
         visible=True)
 
     # also called chat-assistant,
-    with gradio.Tab("Response Generator"):
+    with gradio.Tab("Response Generator", visible=False):
         with gr.Row():
             input_text_2 = gr.Textbox(show_label=False, placeholder="Please type your input", scale=7)
             generate_btn_2 = gr.Button("Send", variant="primary")
@@ -75,7 +75,7 @@ with gr.Blocks() as demo:
     gr.Markdown("Response simulator is the most commonly used chatbot.")
 
     #
-    with gradio.Tab("User Simulator"):
+    with gradio.Tab("User Simulator", visible=False):
         with gr.Row():
             input_text_3 = gr.Textbox(show_label=False, placeholder="Please type your response", scale=7)
             generate_btn_3 = gr.Button("Send", variant="primary")
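The net effect of the app.py change is that the "Response Generator" and "User Simulator" tabs are hidden rather than removed: their layout code stays in place but is not rendered. A minimal sketch of the pattern (assuming Gradio 4.x; components reduced to placeholders, not the full app):

import gradio as gr

with gr.Blocks() as demo:
    with gr.Tab("Self Chat"):
        gr.Chatbot(show_share_button=True,
                   avatar_images=("assets/man.png", "assets/bot.png"))
    # hidden by this commit: kept in the layout, not shown until visible=True
    with gr.Tab("Response Generator", visible=False):
        gr.Textbox(show_label=False, placeholder="Please type your input")
    with gr.Tab("User Simulator", visible=False):
        gr.Textbox(show_label=False, placeholder="Please type your response")

demo.launch()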
models/cpp_qwen2.py CHANGED
@@ -188,22 +188,18 @@ class Qwen2Simulator(Simulator):
         logger.info(
             f'finish_reason {stream["choices"][0]["finish_reason"]} with text: {stream["choices"][0]["text"]}')
 
-        # warmup for next turn (speeds up next-turn decoding)
-        if suffix_tokens:
-            # <|im_end|>\n
-            logger.info(f"before warmup: n_tokens = {self.llm.n_tokens}")
-            self.llm.eval([151645, 198] + suffix_tokens)  # increases n_tokens
-            logger.info(f"after warmup: n_tokens = {self.llm.n_tokens}")
+        #
+        self.post_cache(suffix_tokens)
 
     def pre_cache_system(self, system_list):
-        """
+        """ warmup for system prompt
         :param system_list:
         :return:
         """
         logger.info(f"cache size {self.llm.cache.cache_size}")
         for system_prompt in system_list:
-            logger.info(f"pre caching {system_prompt}")
-            input_ids = self.tokenize(f"<|im_start|>system{system_prompt}<|im_end|>\n<|im_start|>user\n")
+            logger.info(f"pre caching '{system_prompt}'")
+            input_ids = self.tokenize(f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n")
             output = self.llm.create_completion(
                 input_ids,
                 stream=False,
@@ -215,8 +211,15 @@
         # disable cache after
         llama_cpp.LlamaRAMCache.__setitem__ = lambda *args: None
 
-    def complete(self):
-        pass
+    def post_cache(self, suffix_tokens):
+        """ warmup for next turn generation
+        :param suffix_tokens:
+        :return:
+        """
+        if suffix_tokens:
+            logger.info(f"before warmup: n_tokens = {self.llm.n_tokens}")
+            self.llm.eval([151645, 198] + suffix_tokens)  # <|im_end|>\n
+            logger.info(f"after warmup: n_tokens = {self.llm.n_tokens}")
 
 
 bot = Qwen2Simulator()
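Taken together, the models/cpp_qwen2.py changes factor the cache handling into two warmup paths: pre_cache_system runs one throwaway completion per system prompt so its KV state lands in the RAM cache, then freezes the cache by no-op'ing __setitem__; the new post_cache method pre-evaluates the fixed "<|im_end|>\n" suffix (token ids 151645 and 198 in the Qwen2 vocabulary) so the next turn starts with those tokens already decoded. A standalone sketch of both tricks, assuming llama-cpp-python's Llama API (tokenize, eval, n_tokens, set_cache, LlamaRAMCache); the model path and example prompt are hypothetical:

import llama_cpp
from llama_cpp import Llama, LlamaRAMCache

llm = Llama(model_path="qwen2-instruct.gguf", n_ctx=4096)  # hypothetical path
llm.set_cache(LlamaRAMCache(capacity_bytes=2 << 30))

IM_END, NEWLINE = 151645, 198  # Qwen2 ids for "<|im_end|>" and "\n"

def pre_cache_system(system_list):
    """One throwaway completion per system prompt stores its KV state
    in the RAM cache, so identical prefixes are reused on later calls."""
    for system_prompt in system_list:
        prompt = f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n"
        # special=True so the <|im_start|>/<|im_end|> markers tokenize as special tokens
        input_ids = llm.tokenize(prompt.encode("utf-8"), add_bos=False, special=True)
        llm.create_completion(input_ids, max_tokens=1, stream=False)
    # freeze the cache: later completions can read it but no longer overwrite it
    llama_cpp.LlamaRAMCache.__setitem__ = lambda *args: None

def post_cache(suffix_tokens):
    """Warm up for the next turn: evaluating the end-of-turn marker plus the
    next prompt's fixed suffix advances llm.n_tokens ahead of the user input."""
    if suffix_tokens:
        llm.eval([IM_END, NEWLINE] + suffix_tokens)

pre_cache_system(["You are a helpful assistant."])  # example prompt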