# llm-chat / app_chat.py
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig
from peft import PeftModel
import torch
import gradio as gr
import time
import logging
logging.basicConfig(level=logging.INFO)
# Dump logs to a file
logging.getLogger().addHandler(logging.FileHandler("app_chat.log"))
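
# Base LLaMA-7B weights and the Alpaca LoRA adapter applied on top; inference runs on the CPU.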
MODEL = "decapoda-research/llama-7b-hf"
LORA_WEIGHTS = "tloen/alpaca-lora-7b"
device = "cpu"
print(f"Model device = {device}", flush=True)
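
# Load the base model, attach the LoRA adapter weights, and put the model in eval mode.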
def load_model():
    logging.info("Loading model...")
    tokenizer = LlamaTokenizer.from_pretrained(MODEL)
    model = LlamaForCausalLM.from_pretrained(MODEL, device_map={"": device}, low_cpu_mem_usage=True)
    model = PeftModel.from_pretrained(model, LORA_WEIGHTS, device_map={"": device}, torch_dtype=torch.float16)
    model.eval()
    logging.info("Model loaded.")
    return model, tokenizer
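
# Wrap the user's message in an Alpaca-style instruction/response prompt template.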
def generate_prompt(input):
    return f"""Below is a dialog, where User interacts with you - the AI.
### Instruction: AI is helpful, kind, obedient, honest, and knows its own limits.
### User: {input}
### Response:
"""

# Build the Alpaca prompt, run generation with the supplied decoding parameters,
# and return only the text after the "### Response:" marker.
def eval_prompt(
    model,
    tokenizer,
    input: str,
    temperature=0.7,
    top_p=0.75,
    top_k=40,
    num_beams=1,
    max_new_tokens=128,
    **kwargs,
):
    prompt = generate_prompt(input)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"]
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        repetition_penalty=1.17,
        **kwargs,
    )
    # with torch.inference_mode():
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )
    s = generation_output.sequences[0]
    response = tokenizer.decode(s)
    bot_response = response.split("### Response:")[-1].strip()
    print(f"Bot response: {bot_response}")
    return bot_response
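
# Gradio Blocks UI: a Chatbot pane, a free-text input box, and a Clear button.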
def run_app(model, tokenizer):
    logging.info("Starting chat app...")
    with gr.Blocks(theme=gr.themes.Soft(), analytics_enabled=True) as chat:
        chatbot = gr.Chatbot(label="Alpaca Demo")
        msg = gr.Textbox(show_label=False, placeholder="Enter your text here")
        clear = gr.Button("Clear")

        def user(user_msg, history):
            logging.info("User input received.")
            return "", history + [[user_msg, None]]

        def bot(history):
            logging.info("Processing user input for Alpaca response...")
            last_input = history[-1][0]
            logging.info(f"User input = {last_input}")
            tick = time.time()
            bot_response = eval_prompt(model, tokenizer, last_input)
            logging.info(f"Inference time = {time.time() - tick} seconds")
            history[-1][1] = bot_response
            logging.info("Response generated and added to history.\n")
            return history
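
        # Wire the events: submitting text first appends the user turn to the
        # history, then runs the model and fills in the bot's reply.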
        msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
            bot, chatbot, chatbot
        )
        clear.click(lambda: None, None, chatbot, queue=False)

    chat.queue()
    chat.launch(share=True)

if __name__ == "__main__":
    model, tokenizer = load_model()
    # Run the actual gradio app
    run_app(model, tokenizer)