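# Gradio demo: a streaming chat UI for the StarkWizard Mistral-7B Cairo-instruct
# GGUF model, loaded locally through ctransformers.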
from ctransformers import AutoModelForCausalLM, AutoConfig
import torch
import gradio as gr


hub_name = "StarkWizard/Mistral-7b-instruct-cairo-instruct-GGUF"
model_file = "Mistral-7b-instruct-cairo-instruct.Q4_k.gguf"
# Offload layers to the GPU when CUDA is available. The layer count is an
# assumption; adjust it to fit the available VRAM (0 keeps everything on CPU).
DEVICE, hw, layers = ("cuda", True, 50) if torch.cuda.is_available() else ("cpu", False, 0)





print("loading LLM")
# Load model 
config = AutoConfig.from_pretrained("TheBloke/Mistral-7B-v0.1-GGUF")

config.max_seq_len = 4096
config.max_answer_len= 1024

model = AutoModelForCausalLM.from_pretrained(hub_name, model_file=model_file, model_type="mistral", gpu_layers=layers,
        config=config,
            compress_pos_emb=2,
            top_k=4000,
            top_p=0.99,
            temperature=0.0001,
            do_sample=True,
           

)


def fmt_history(history) -> str:
    """Flatten the chat history into a single string, one turn per line."""
    return "\n".join(
        'User: "{usr_query}", Assistant: "{your_resp}"'.format(
            usr_query=usr_query.replace("\n", ""),
            your_resp=your_resp.replace("\n", ""),
        )
        for usr_query, your_resp in history
    )

    

def run_generation(user_text, top_p, temperature, top_k, max_new_tokens):
    # Mistral/Llama-style instruction prompt with a system block.
    prompt = f"""
    [INST]
        <<SYS>>
        A student asks you a question about Cairo 1. Provide a concise answer to the student's question. Do not expand on the subject of the question, and do not introduce any new topics or questions not raised by the student.
        Keep explanations under 300 words. Do not justify your answers. Do not give information that is not mentioned in the CONTEXT INFORMATION. Provide only one solution.
        <</SYS>>

        Question: I'm working in Cairo 1: {user_text}
        [/INST]
    """

    # Stream tokens and yield the cumulative output so the UI can update incrementally.
    model_output = ""
    for chunk in model(prompt, stream=True, max_new_tokens=max_new_tokens,
                       top_p=top_p, top_k=top_k, temperature=temperature):
        model_output += chunk
        yield model_output



def reset_textbox():
    """Clear a textbox component (currently unused helper)."""
    return gr.update(value='')

with gr.Blocks() as demo:
    gr.Markdown(
        "# 🔥 Mistral Cairo 🔥\n"
        f"[{hub_name}](https://huggingface.co/{hub_name})\n\n"

    )
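    # Two-column layout: the chat interface on the left, generation-parameter sliders on the right.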

    with gr.Row():
        with gr.Column(scale=4):
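            # Chat history view, user input box, and a button to clear the conversation.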
            chatbot = gr.Chatbot()
            msg = gr.Textbox()
            clear = gr.Button("Clear")
            
            def user(user_message, history):
                # Append the user's message as a new turn and clear the textbox.
                return "", history + [[user_message, None]]

            def respond(history, top_p, temperature, top_k, max_new_tokens):
                message = history[-1][0]
                print(f"User: {message}")
                print(f"top_p {top_p}, temperature {temperature}, top_k {top_k}, max_new_tokens {max_new_tokens}")
                # Stream the cumulative answer into the last chat turn.
                for partial in run_generation(message, top_p, temperature, top_k, max_new_tokens):
                    history[-1][1] = partial
                    yield history

            clear.click(lambda: None, None, chatbot, queue=False)

        with gr.Column(scale=1):
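            # Generation settings; their live values are passed to respond() via the msg.submit wiring below.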
            max_new_tokens = gr.Slider(
                minimum=1, maximum=2000, value=2000, step=1, interactive=True, label="Max New Tokens",
            )
            top_p = gr.Slider(
                minimum=0.05, maximum=1.0, value=0.99, step=0.05, interactive=True, label="Top-p (nucleus sampling)",
            )
            top_k = gr.Slider(
                minimum=40, maximum=5000, value=4000, step=10, interactive=True, label="Top-k",
            )
            temperature = gr.Slider(
                minimum=0.01, maximum=0.4, value=0.01, step=0.01, interactive=True, label="Temperature",
            )

    # Wire the chat flow after all components exist so the sliders' live values
    # (not just their defaults) are passed to the generation callback.
    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        respond, [chatbot, top_p, temperature, top_k, max_new_tokens], chatbot
    )
    
# The queue is required for streaming generator outputs.
demo.queue()
demo.launch()