mobinln committed on
Commit
1cd873c
1 Parent(s): 637f425

add stream

Browse files
Files changed (1) hide show
  1. app.py +20 -7
app.py CHANGED
@@ -5,12 +5,12 @@ model = "Qwen/Qwen2-7B-Instruct-GGUF"
5
  llm = Llama.from_pretrained(
6
  repo_id=model,
7
  filename="qwen2-7b-instruct-q4_k_m.gguf",
8
- verbose=False,
9
  use_mmap=False,
10
  use_mlock=True,
11
  n_threads=2,
12
  n_threads_batch=2,
13
- n_ctx=40000,
14
  )
15
 
16
 
@@ -32,13 +32,26 @@ def respond(
32
 
33
  messages.append({"role": "user", "content": message})
34
 
35
- response = llm.create_chat_completion(
36
- messages=messages,
 
 
 
 
 
 
 
 
 
37
  max_tokens=max_tokens,
 
38
  temperature=temperature,
39
  top_p=top_p,
40
- )
41
- return response["choices"][0]["message"]["content"]
 
 
 
42
 
43
 
44
  demo = gr.ChatInterface(
@@ -48,7 +61,7 @@ demo = gr.ChatInterface(
48
  value="You are a helpful assistant.",
49
  label="System message",
50
  ),
51
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
52
  gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
53
  gr.Slider(
54
  minimum=0.1,
 
5
  llm = Llama.from_pretrained(
6
  repo_id=model,
7
  filename="qwen2-7b-instruct-q4_k_m.gguf",
8
+ verbose=True,
9
  use_mmap=False,
10
  use_mlock=True,
11
  n_threads=2,
12
  n_threads_batch=2,
13
+ n_ctx=8000,
14
  )
15
 
16
 
 
32
 
33
  messages.append({"role": "user", "content": message})
34
 
35
+ # response = llm.create_chat_completion(
36
+ # messages=messages,
37
+ # max_tokens=max_tokens,
38
+ # temperature=temperature,
39
+ # top_p=top_p,
40
+ # )
41
+ # return response["choices"][0]["message"]["content"]
42
+ response = ""
43
+
44
+ for message in llm.create_chat_completion(
45
+ messages,
46
  max_tokens=max_tokens,
47
+ stream=True,
48
  temperature=temperature,
49
  top_p=top_p,
50
+ ):
51
+ token = message.choices[0].delta.content
52
+
53
+ response += token
54
+ yield response
55
 
56
 
57
  demo = gr.ChatInterface(
 
61
  value="You are a helpful assistant.",
62
  label="System message",
63
  ),
64
+ gr.Slider(minimum=1, maximum=2048, value=1024, step=1, label="Max new tokens"),
65
  gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
66
  gr.Slider(
67
  minimum=0.1,