# app_chat.py: script for the llm-chat Gradio chat demo
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig
from peft import PeftModel
import torch
import transformers
import gradio as gr
import time
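
# Base LLaMA-7B checkpoint and the Alpaca LoRA adapter that is applied on top of it
# in load_model().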
MODEL = "decapoda-research/llama-7b-hf"
LORA_WEIGHTS = "tloen/alpaca-lora-7b"
device = "cpu"
print(f"Model device = {device}", flush=True)

def load_model():
    # Load the tokenizer and base LLaMA weights, then attach the Alpaca LoRA adapter.
    tokenizer = LlamaTokenizer.from_pretrained(MODEL)
    model = LlamaForCausalLM.from_pretrained(MODEL, device_map={"": device}, low_cpu_mem_usage=True)
    model = PeftModel.from_pretrained(model, LORA_WEIGHTS, device_map={"": device}, torch_dtype=torch.float16)
    model.eval()
    return model, tokenizer

def generate_prompt(user_input):
    # Wrap the user's message in the dialog-style prompt expected by the model.
    return f"""Below is a dialog, where User interacts with you - the AI.
### Instruction: AI is helpful, kind, obedient, honest, and knows its own limits.
### User: {user_input}
### Response:
"""

def eval_prompt(
    model,
    tokenizer,
    user_input: str,
    temperature=0.7,
    top_p=0.75,
    top_k=40,
    num_beams=1,
    max_new_tokens=128,
    **kwargs,
):
    # Build the dialog prompt and tokenize it.
    prompt = generate_prompt(user_input)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"]
    # NOTE: unless do_sample=True is passed through kwargs, generation is greedy and
    # temperature/top_p/top_k have no effect.
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        repetition_penalty=1.17,
        **kwargs,
    )
    # with torch.inference_mode():
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )
    s = generation_output.sequences[0]
    response = tokenizer.decode(s)
    # Keep only the text after the final "### Response:" marker.
    bot_response = response.split("### Response:")[-1].strip()
    print(f"Bot response: {bot_response}")
    return bot_response
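
# Illustrative sketch of calling eval_prompt directly, outside the Gradio UI
# (load_model() downloads several GB of weights and is slow on CPU):
#
#   model, tokenizer = load_model()
#   print(eval_prompt(model, tokenizer, "What is the capital of France?"))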

def run_app(model, tokenizer):
    with gr.Blocks(theme=gr.themes.Soft(), analytics_enabled=True) as chat:
        chatbot = gr.Chatbot(label="Alpaca Demo")
        msg = gr.Textbox(show_label=False, placeholder="Enter your text here")
        clear = gr.Button("Clear")
        # This slider is rendered but not currently passed to eval_prompt, so it has
        # no effect on generation.
        temperature = gr.Slider(minimum=0, maximum=1, value=0.8, label="Temperature")

        def user(user_msg, history):
            # Clear the textbox and append the user turn with an empty bot slot.
            return "", history + [[user_msg, None]]

        def bot(history):
            print("Processing user input for Alpaca response...")
            last_input = history[-1][0]
            print(f"User input = {last_input}")
            tick = time.time()
            bot_response = eval_prompt(model, tokenizer, last_input)
            print(f"Inference time = {time.time() - tick:.2f} seconds")
            history[-1][1] = bot_response
            print("Response generated and added to history.\n")
            return history

        msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
            bot, chatbot, chatbot
        )
        clear.click(lambda: None, None, chatbot, queue=False)

    chat.queue()
    chat.launch(share=True)

if __name__ == "__main__":
    model, tokenizer = load_model()
    # Run the actual Gradio app.
    run_app(model, tokenizer)