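"""Gradio chat demo that serves the Alpaca LoRA adapter (tloen/alpaca-lora-7b)
on top of the base LLaMA-7B weights (decapoda-research/llama-7b-hf), running on CPU."""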
import logging
import time

import gradio as gr
import torch
from peft import PeftModel
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig

logging.basicConfig(level=logging.INFO)
# Dump logs to a file as well as the console
logging.getLogger().addHandler(logging.FileHandler("app_chat.log"))
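# Base model weights and the Alpaca LoRA adapter, both pulled from the Hugging Face Hub.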
MODEL = "decapoda-research/llama-7b-hf"
LORA_WEIGHTS = "tloen/alpaca-lora-7b"

device = "cpu"
print(f"Model device = {device}", flush=True)
def load_model():
    logging.info("Loading model...")
    tokenizer = LlamaTokenizer.from_pretrained(MODEL)
    # Load the base model onto the CPU, then attach the Alpaca LoRA adapter weights.
    model = LlamaForCausalLM.from_pretrained(
        MODEL, device_map={"": device}, low_cpu_mem_usage=True
    )
    model = PeftModel.from_pretrained(
        model, LORA_WEIGHTS, device_map={"": device}, torch_dtype=torch.float16
    )
    model.eval()
    logging.info("Model loaded.")
    return model, tokenizer
def generate_prompt(user_input):
    return f"""Below is a dialog where the User interacts with you - the AI.
### Instruction: AI is helpful, kind, obedient, honest, and knows its own limits.
### User: {user_input}
### Response:
"""
def eval_prompt(
    model,
    tokenizer,
    user_input: str,
    temperature=0.7,
    top_p=0.75,
    top_k=40,
    num_beams=1,
    max_new_tokens=128,
    **kwargs,
):
    # Build the Alpaca-style prompt and tokenize it.
    prompt = generate_prompt(user_input)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"]
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        repetition_penalty=1.17,
        **kwargs,
    )
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )
    # Decode the full sequence and keep only the text after the "### Response:" marker.
    s = generation_output.sequences[0]
    response = tokenizer.decode(s)
    bot_response = response.split("### Response:")[-1].strip()
    print(f"Bot response: {bot_response}")
    return bot_response
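# A minimal smoke test without the Gradio UI (hypothetical usage, run manually):
#   model, tokenizer = load_model()
#   print(eval_prompt(model, tokenizer, "What is the capital of France?"))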
def run_app(model, tokenizer):
    logging.info("Starting chat app...")
    with gr.Blocks(theme=gr.themes.Soft(), analytics_enabled=True) as chat:
        chatbot = gr.Chatbot(label="Alpaca Demo")
        msg = gr.Textbox(show_label=False, placeholder="Enter your text here")
        clear = gr.Button("Clear")

        def user(user_msg, history):
            # Append the user message to the history and clear the textbox.
            logging.info("User input received.")
            return "", history + [[user_msg, None]]

        def bot(history):
            # Generate the Alpaca response for the most recent user message.
            logging.info("Processing user input for Alpaca response...")
            last_input = history[-1][0]
            logging.info(f"User input = {last_input}")
            tick = time.time()
            bot_response = eval_prompt(model, tokenizer, last_input)
            logging.info(f"Inference time = {time.time() - tick:.2f} seconds")
            history[-1][1] = bot_response
            logging.info("Response generated and added to history.\n")
            return history

        msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
            bot, chatbot, chatbot
        )
        clear.click(lambda: None, None, chatbot, queue=False)

    chat.queue()
    chat.launch(share=True)
if __name__ == "__main__":
    model, tokenizer = load_model()
    # Run the actual Gradio app
    run_app(model, tokenizer)