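# Minimal Gradio chat app that streams responses from a quantized
# Llama 3.2 1B Instruct GGUF model loaded locally with llama-cpp-python.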
import gradio as gr
from llama_cpp import Llama
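# GGUF quantizations of Llama 3.2 1B Instruct published by bartowski on the Hugging Face Hub.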
model = "bartowski/Llama-3.2-1B-Instruct-GGUF"
llm = Llama.from_pretrained(
    repo_id=model,
    filename="Llama-3.2-1B-Instruct-Q8_0.gguf",
    verbose=True,
    use_mmap=True,   # memory-map the model file instead of reading it all into RAM
    use_mlock=True,  # lock the mapped pages in memory to avoid swapping
    n_threads=2,
    n_threads_batch=2,
    n_ctx=2000,      # context window size, in tokens
)
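# Chat handler for gr.ChatInterface: rebuilds the (user, assistant) history
# as OpenAI-style role/content messages and streams the model's reply.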
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # Reassemble the full conversation: system prompt, prior turns, new user message.
    messages = [{"role": "system", "content": system_message}]
    for user_turn, assistant_turn in history:
        if user_turn:
            messages.append({"role": "user", "content": user_turn})
        if assistant_turn:
            messages.append({"role": "assistant", "content": assistant_turn})
    messages.append({"role": "user", "content": message})

    response = ""
    completion = llm.create_chat_completion(
        messages=messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    )
    # Accumulate streamed deltas and yield the growing reply so Gradio
    # can update the chat window incrementally.
    for chunk in completion:
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            response += delta["content"]
            yield response
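# Wire the handler into a chat UI; the additional inputs below are passed to
# respond() after the message and history, in the order listed.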
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(
            value="You are a helpful assistant.",
            label="System message",
        ),
        gr.Slider(minimum=1, maximum=2048, value=1024, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
    description=model,
)
if __name__ == "__main__":
    demo.launch()
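# demo.launch(share=True) would additionally expose a temporary public URL
# (a standard Gradio option, not used in this app).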