Update app.py
app.py
CHANGED
@@ -1,23 +1,12 @@
 import gradio as gr
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer
 from transformers import StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer
 from threading import Thread
 
-
-bnb_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_quant_type="nf4",
-    bnb_4bit_use_double_quant=True,
-)
 # Loading the tokenizer and model from Hugging Face's model hub.
-tokenizer = AutoTokenizer.from_pretrained("
-model = AutoModelForCausalLM.from_pretrained("
-    load_in_4bit=True,
-    quantization_config=bnb_config,
-    torch_dtype=torch.bfloat16,
-    device_map="cpu",
-    trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
 
 # using CUDA for an optimal experience
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
@@ -34,11 +23,13 @@ class StopOnTokens(StoppingCriteria):
         return False
 
 
+
 # Function to generate model predictions.
 def predict(message, history):
     history_transformer_format = history + [[message, ""]]
     stop = StopOnTokens()
 
+
     # Formatting the input for the model.
     messages = "</s>".join(["</s>".join(["\n<|user|>:" + item[0], "\n<|assistant|>:" + item[1]])
                 for item in history_transformer_format])
@@ -65,6 +56,8 @@ def predict(message, history):
         yield partial_message
 
 
+
+
 # Setting up the Gradio chat interface.
 gr.ChatInterface(predict,
                  title="Tinyllama_chatBot",
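The hunks above only show the start and end of predict(). For context, a minimal sketch of how the streaming middle of such a function is typically written with TextIteratorStreamer, a background Thread, and the StopOnTokens criterion, assuming the tokenizer, model, device, and StopOnTokens defined earlier in app.py; the generation parameters (max_new_tokens, temperature, top_p, top_k) are illustrative assumptions, not values taken from this commit.

# Sketch only: assumes tokenizer, model, device, and StopOnTokens from app.py above.
def predict(message, history):
    history_transformer_format = history + [[message, ""]]
    stop = StopOnTokens()

    # Formatting the input for the model (as in the diff context).
    messages = "</s>".join(["</s>".join(["\n<|user|>:" + item[0], "\n<|assistant|>:" + item[1]])
                            for item in history_transformer_format])

    model_inputs = tokenizer([messages], return_tensors="pt").to(device)
    # The streamer yields decoded text chunks while generate() is still running.
    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        model_inputs,
        streamer=streamer,
        max_new_tokens=1024,   # assumed value
        do_sample=True,
        top_p=0.95,            # assumed value
        top_k=50,              # assumed value
        temperature=0.7,       # assumed value
        stopping_criteria=StoppingCriteriaList([stop]),
    )
    # Run generation in a background thread so tokens can be streamed as they arrive.
    Thread(target=model.generate, kwargs=generate_kwargs).start()

    # Accumulate streamed chunks and yield the growing reply, as gr.ChatInterface expects
    # from a streaming generator.
    partial_message = ""
    for new_token in streamer:
        partial_message += new_token
        yield partial_message

The version in this Space may differ in parameter choices; the structure (streamer plus background thread plus yielded partial messages) is the standard streaming pattern for gr.ChatInterface.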