from ctransformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer
from transformers import TextIteratorStreamer
import torch
import gradio as gr
from threading import Thread
hub_name = "StarkWizard/Mistral-7b-instruct-cairo-instruct-GGUF"
model_file = "Mistral-7b-instruct-cairo-instruct.Q4_k.gguf"
# Inference is forced onto the CPU (gpu_layers=0) in both branches; hw only records whether CUDA is available.
DEVICE, hw, layers = ("cpu", True, 0) if torch.cuda.is_available() else ("cpu", False, 0)
print("loading LLM")
# Load model
config = AutoConfig.from_pretrained("TheBloke/Mistral-7B-v0.1-GGUF")
config.max_seq_len = 4096
config.max_answer_len = 1024
model = AutoModelForCausalLM.from_pretrained(
    hub_name,
    model_file=model_file,
    model_type="mistral",
    gpu_layers=layers,
    config=config,
    compress_pos_emb=2,
    top_k=4000,
    top_p=0.99,
    temperature=0.0001,
    do_sample=True,
)
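# Note (assumption): the sampling values above act as load-time defaults; run_generation() below
# passes the slider values (top_p, top_k, temperature, max_new_tokens) on each call, which should
# take precedence for that generation.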
def fmt_history(history) -> str:
    # Flatten the chat history into one "User/Assistant" line per turn (not currently wired into the UI).
    return "\n".join(
        'User: "{usr_query}", Assistant: "{your_resp}"'.format(
            usr_query=usr_query.replace("\n", ""), your_resp=your_resp.replace("\n", "")
        )
        for usr_query, your_resp in history
    )
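# Illustrative example (assumed input shape: list of [user, assistant] pairs):
# fmt_history([["How do I declare a variable?", "Use `let x: felt252 = 0;`"]])
# -> 'User: "How do I declare a variable?", Assistant: "Use `let x: felt252 = 0;`"'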
def run_generation(user_text, top_p, temperature, top_k, max_new_tokens):
    # Wrap the question in the Mistral [INST] / <<SYS>> prompt format.
    prompt = f"""
[INST]
<<SYS>>
A student asks you a question about Cairo 1. Provide a concise answer to the student's question. Do not expand the subject of the question, and do not introduce any new topics or questions not raised by the student.
Make sure the explanation is never longer than 300 words. Don't justify your answers. Don't give information not mentioned in the CONTEXT INFORMATION. Provide only one solution.
<</SYS>>
Question: I'm working in Cairo 1: {user_text}
[/INST]
"""
    # Stream tokens from the model, yielding the accumulated output after each new token.
    model_output = ""
    for token in model(prompt, stream=True, max_new_tokens=max_new_tokens, top_p=top_p, top_k=top_k, temperature=temperature):
        model_output += token
        yield model_output
    return model_output
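# Illustrative usage (assumed values), outside the Gradio UI:
# for partial in run_generation("How do I declare an array?", top_p=0.99, temperature=0.01, top_k=40, max_new_tokens=256):
#     print(partial)
# Each yielded string is the full response generated so far, not just the newest token.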
def reset_textbox():
    # Helper to clear a textbox (not wired up to any event below).
    return gr.update(value='')
with gr.Blocks() as demo:
    duplicate_link = "https://huggingface.co/spaces/joaogante/transformers_streaming?duplicate=true"
    gr.Markdown(
        "# 🔥 Mistral Cairo 🔥\n"
        f"[{hub_name}](https://huggingface.co/{hub_name})\n\n"
    )
    with gr.Row():
        with gr.Column(scale=4):
            # user_text = gr.Textbox(
            #     placeholder="Write an email about an alpaca that likes flan",
            #     label="User input"
            # )
            # model_output = gr.Markdown(label="Model output", lines=10, interactive=False)
            # button_submit = gr.Button(value="Submit")
            chatbot = gr.Chatbot()
            msg = gr.Textbox()
            clear = gr.Button("Clear")
            def user(user_message, history):
                # Append the new user turn (with an empty bot slot) and clear the textbox.
                return "", history + [[user_message, None]]

            def respond(history):
                # The sliders referenced here are created in the right-hand column below;
                # they exist at module scope by the time this handler runs.
                message = history[-1][0]
                print(f"User: {message}")
                print(f"top_p {top_p.value}, temperature {temperature.value}, top_k {top_k.value}, max_new_tokens {max_new_tokens.value}")
                # run_generation yields the accumulated answer so far; stream it into the last chat turn.
                stream = run_generation(message, top_p.value, temperature.value, top_k.value, max_new_tokens.value)
                for partial in stream:
                    history[-1][1] = partial
                    yield history

            # On submit: record the user turn, then stream the model's reply into the chatbot.
            msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(respond, chatbot, chatbot)
            clear.click(lambda: None, None, chatbot, queue=False)
        with gr.Column(scale=1):
            max_new_tokens = gr.Slider(
                minimum=1, maximum=2000, value=2000, step=1, interactive=True, label="Max New Tokens",
            )
            top_p = gr.Slider(
                minimum=0.05, maximum=1.0, value=0.99, step=0.05, interactive=True, label="Top-p (nucleus sampling)",
            )
            top_k = gr.Slider(
                minimum=40, maximum=5000, value=4000, step=10, interactive=True, label="Top-k",
            )
            temperature = gr.Slider(
                minimum=0.0001, maximum=0.4, value=0.0001, step=0.1, interactive=True, label="Temperature",
            )
# user_text.submit(run_generation, [user_text, top_p, temperature, top_k, max_new_tokens], model_output)
# button_submit.click(run_generation, [user_text, top_p, temperature, top_k, max_new_tokens], model_output)
#demo.queue(max_size=32).launch(enable_queue=True)
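# Gradio requires queuing to be enabled for generator-based (streaming) event handlers like respond().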
demo.queue()
demo.launch()