from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig
from peft import PeftModel
import torch
import gradio as gr
import time

# Base LLaMA-7B checkpoint and the Alpaca LoRA adapter applied on top of it.
MODEL = "decapoda-research/llama-7b-hf"
LORA_WEIGHTS = "tloen/alpaca-lora-7b"

# Run inference on CPU.
device = "cpu"
print(f"Model device = {device}", flush=True)
def load_model():
    # Load the base LLaMA tokenizer and weights, then apply the Alpaca LoRA adapter.
    tokenizer = LlamaTokenizer.from_pretrained(MODEL)
    model = LlamaForCausalLM.from_pretrained(
        MODEL, device_map={"": device}, low_cpu_mem_usage=True
    )
    model = PeftModel.from_pretrained(
        model, LORA_WEIGHTS, device_map={"": device}, torch_dtype=torch.float16
    )
    model.eval()
    return model, tokenizer
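# If a CUDA GPU is available, the same loader can run in half precision on the
# GPU instead (a sketch, assuming a CUDA-capable machine; not used in this Space):
#
#   model = LlamaForCausalLM.from_pretrained(
#       MODEL, device_map={"": "cuda"}, torch_dtype=torch.float16
#   )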
def generate_prompt(input):
    return f"""Below is a dialog, where User interacts with you - the AI.
### Instruction: AI is helpful, kind, obedient, honest, and knows its own limits.
### User: {input}
### Response:
"""
def eval_prompt(
    model,
    tokenizer,
    input: str,
    temperature=0.7,
    top_p=0.75,
    top_k=40,
    num_beams=1,
    max_new_tokens=128,
    **kwargs,
):
    # Wrap the raw user input in the dialog template and tokenize it.
    prompt = generate_prompt(input)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"]
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        repetition_penalty=1.17,
        **kwargs,
    )
    # Generate with gradients disabled for inference.
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )
    s = generation_output.sequences[0]
    response = tokenizer.decode(s)
    # Everything after the final "### Response:" marker is the bot's reply.
    bot_response = response.split("### Response:")[-1].strip()
    print(f"Bot response: {bot_response}")
    return bot_response
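# Minimal smoke test (hypothetical usage, not part of the Gradio flow):
#
#   model, tokenizer = load_model()
#   print(eval_prompt(model, tokenizer, "Tell me a joke."))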
def run_app(model, tokenizer):
    with gr.Blocks(theme=gr.themes.Soft(), analytics_enabled=True) as chat:
        chatbot = gr.Chatbot(label="Alpaca Demo")
        msg = gr.Textbox(show_label=False, placeholder="Enter your text here")
        clear = gr.Button("Clear")
        # Note: this slider is rendered but not yet wired into eval_prompt.
        temperature = gr.Slider(minimum=0, maximum=1, value=0.8, label="Temperature")

        def user(user_msg, history):
            # Echo the user's message into the chat history; the reply slot
            # is filled in by bot() below.
            return "", history + [[user_msg, None]]

        def bot(history):
            print("Processing user input for Alpaca response...")
            last_input = history[-1][0]
            print(f"User input = {last_input}")
            tick = time.time()
            bot_response = eval_prompt(model, tokenizer, last_input)
            print(f"Inference time = {time.time() - tick:.2f} seconds")
            history[-1][1] = bot_response
            print("Response generated and added to history.\n")
            return history

        msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
            bot, chatbot, chatbot
        )
        clear.click(lambda: None, None, chatbot, queue=False)

    chat.queue()
    # share=True asks Gradio for a temporary public link in addition to the local URL.
    chat.launch(share=True)
if __name__ == "__main__":
model, tokenizer = load_model()
# Run the actual gradio app
run_app(model, tokenizer)
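# To run this script locally (assuming it is saved as app.py, the usual
# Hugging Face Spaces entry point):
#
#   pip install torch transformers peft gradio
#   python app.py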