thobuiq committed
Commit d15855c
1 Parent(s): 0610b0d

Update app.py

Files changed (1): app.py (+32 -63)
app.py CHANGED
@@ -1,66 +1,35 @@
- import gradio as gr
- import torch
- from transformers import AutoModelForCausalLM, AutoTokenizer
- from transformers import StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer
- from threading import Thread
-
- # Loading the tokenizer and model from Hugging Face's model hub.
- tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
- model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
-
- # using CUDA for an optimal experience
- device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
- model = model.to(device)
-
-
- # Defining a custom stopping criteria class for the model's text generation.
- class StopOnTokens(StoppingCriteria):
-     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
-         stop_ids = [2]  # IDs of tokens where the generation should stop.
-         for stop_id in stop_ids:
-             if input_ids[0][-1] == stop_id:  # Checking if the last generated token is a stop token.
-                 return True
-         return False
-
-
- # Function to generate model predictions.
- def predict(message, history):
-     history_transformer_format = history + [[message, ""]]
-     stop = StopOnTokens()
-
-     # Formatting the input for the model.
-     messages = "</s>".join(["</s>".join(["\n<|user|>:" + item[0], "\n<|assistant|>:" + item[1]])
-                             for item in history_transformer_format])
-     model_inputs = tokenizer([messages], return_tensors="pt").to(device)
-     streamer = TextIteratorStreamer(tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)
-     generate_kwargs = dict(
-         model_inputs,
-         streamer=streamer,
-         max_new_tokens=1024,
-         do_sample=True,
-         top_p=0.95,
-         top_k=50,
-         temperature=0.7,
-         num_beams=1,
-         stopping_criteria=StoppingCriteriaList([stop])
      )
-     t = Thread(target=model.generate, kwargs=generate_kwargs)
-     t.start()  # Starting the generation in a separate thread.
-     partial_message = ""
-     for new_token in streamer:
-         partial_message += new_token
-         if '</s>' in partial_message:  # Breaking the loop if the stop token is generated.
-             break
-         yield partial_message
-
-
- # Setting up the Gradio chat interface.
- gr.ChatInterface(predict,
-                  title="Tinyllama_chatBot",
-                  description="Ask Tiny llama any questions",
-                  examples=['How to cook a fish?', 'Who is the president of US now?']
-                  ).launch()  # Launching the web interface.
 
+ import os
+ import chainlit as cl
+ from ctransformers import AutoModelForCausalLM
+
+ # Runs when the chat starts
+ @cl.on_chat_start
+ def main():
+     # Create the llm
+     llm = AutoModelForCausalLM.from_pretrained("TheBloke/Mistral-7B-Instruct-v0.1-GGUF",
+                                                model_file="mistral-7b-instruct-v0.1.Q4_K_M.gguf",
+                                                model_type="mistral",
+                                                temperature=0.7,
+                                                gpu_layers=0,
+                                                stream=True,
+                                                threads=int(os.cpu_count() / 2),
+                                                max_new_tokens=10000)
+
+     # Store the llm in the user session
+     cl.user_session.set("llm", llm)
+
+ # Runs when a message is sent
+ @cl.on_message
+ async def main(message: cl.Message):
+     # Retrieve the llm from the user session
+     llm = cl.user_session.get("llm")
+
+     msg = cl.Message(
+         content="",
      )
+
+     prompt = f"[INST]{message.content}[/INST]"
+     for text in llm(prompt=prompt):
+         await msg.stream_token(text)
+
+     await msg.send()
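For context on the new handler's streaming loop, here is a minimal standalone sketch of the ctransformers pattern that app.py now relies on. It assumes the same GGUF repo and model file as the diff above; the prompt string is illustrative only. With stream=True in the model config, calling the model returns a generator of tokens rather than a single string, which is what lets the Chainlit handler stream them into the message.

import os
from ctransformers import AutoModelForCausalLM

# Load the quantized Mistral model on CPU (gpu_layers=0), mirroring the settings in the diff above.
llm = AutoModelForCausalLM.from_pretrained(
    "TheBloke/Mistral-7B-Instruct-v0.1-GGUF",
    model_file="mistral-7b-instruct-v0.1.Q4_K_M.gguf",
    model_type="mistral",
    gpu_layers=0,
    stream=True,
    threads=int(os.cpu_count() / 2),
)

# stream=True makes the call yield tokens one at a time; the prompt below is a placeholder.
for token in llm("[INST]Say hello in one sentence.[/INST]"):
    print(token, end="", flush=True)

The app itself is served through Chainlit's CLI, e.g. chainlit run app.py.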