Spaces:

darpan-jain
/

llm-chat

Runtime error

App Files Files Community

Darpan committed on Apr 13, 2023

Commit

632b9b5

•

1 Parent(s): b5160f5

Add script for Chat demo

Browse files

Files changed (1) hide show

app_chat.py +106 -0

app_chat.py ADDED Viewed

	@@ -0,0 +1,106 @@

+from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig
+from peft import PeftModel
+import torch
+import transformers
+import gradio as gr
+import time
+# Identifier of the base LLaMA checkpoint; passed to both the tokenizer and
+# the causal-LM `from_pretrained` calls in load_model().
+MODEL = "decapoda-research/llama-7b-hf"
+# Identifier of the LoRA adapter that load_model() applies on top of MODEL.
+LORA_WEIGHTS = "tloen/alpaca-lora-7b"
+# Device string used in the `device_map` of both the base model and the adapter.
+device = "cpu"
+# flush=True so the line is written out immediately rather than buffered.
+print(f"Model device = {device}", flush=True)
+def load_model():
+    tokenizer = LlamaTokenizer.from_pretrained(MODEL)
+    model = LlamaForCausalLM.from_pretrained(MODEL, device_map={"": device}, low_cpu_mem_usage=True)
+    model = PeftModel.from_pretrained(model, LORA_WEIGHTS, device_map={"": device}, torch_dtype=torch.float16)
+    model.eval()
+    return model, tokenizer
+def generate_prompt(input):
+        return f""" Below A dialog, where User interacts with you - the AI.
+        ### Instruction: AI is helpful, kind, obedient, honest, and knows its own limits.
+        ### User: {input}
+        ### Response:
+        """
+def eval_prompt(
+        model,
+        tokenizer,
+        input: str,
+        temparature = 0.7,
+        top_p = 0.75,
+        top_k = 40,
+        num_beams = 1,
+        max_new_tokens = 128,
+        **kwargs):
+        prompt = generate_prompt(input)
+        inputs = tokenizer(prompt, return_tensors = "pt")
+        input_ids = inputs["input_ids"]
+        generation_config = GenerationConfig(
+            temparatue = temparature,
+            top_p = top_p,
+            top_k = top_k,
+            num_beams = num_beams,
+            repetition_penalty = 1.17,
+            ** kwargs,)
+        # with torch.inference_mode():
+        with torch.no_grad():
+            generation_output = model.generate(
+                input_ids = input_ids,
+                generation_config = generation_config,
+                return_dict_in_generate = True,
+                output_scores = True,
+                max_new_tokens = max_new_tokens,
+            )
+            s = generation_output.sequences[0]
+            response = tokenizer.decode(s)
+            print(f"Bot response: {response.split('### Response:')[-1].strip()}")
+            bot_response = response.split("### Response:")[-1].strip()
+            return bot_response
+def run_app(model, tokenizer):
+    with gr.Blocks(theme=gr.themes.Soft(), analytics_enabled=True) as chat:
+        chatbot = gr.Chatbot(label = "Alpaca Demo")
+        msg = gr.Textbox(show_label = False, placeholder = "Enter your text here")
+        clear = gr.Button("Clear")
+        temparature = gr.Slider(minimum=0, maximum=1, value=0.8, label="Temparature")
+        def user(user_msg, history):
+            return "", history + [[user_msg, None]]
+        def bot(history):
+            print("Processing user input for Alpaca response...")
+            last_input = history[-1][0]
+            print(f"User input = {last_input}")
+            tick = time.time()
+            bot_response = eval_prompt(model, tokenizer, last_input)
+            print(f"Inference time = {time.time() - tick} seconds")
+            history[-1][1] = bot_response
+            print("Response generated and added to history.\n")
+            return history
+        msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
+            bot, chatbot, chatbot
+        )
+        clear.click(lambda: None, None, chatbot, queue=False)
+    chat.queue()
+    chat.launch(share=True)
+if __name__ == "__main__":
+    # Load the model and tokenizer once, before the UI starts; run_app passes
+    # the same objects to every chat request.
+    model, tokenizer = load_model()
+    # Run the actual gradio app
+    run_app(model, tokenizer)