from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import pipeline
import gradio as gr
import time

REPO_ID = "sayanbanerjee32/ms-phi2-qlora-oasst1"

model = AutoModelForCausalLM.from_pretrained(REPO_ID)
tokenizer = AutoTokenizer.from_pretrained(REPO_ID)


def generate_text(prompt, chat_history, num_new_tokens=100):
    # Prompts follow the OpenAssistant template, e.g.
    # "<|prompter|>What is 2 + 2?<|endoftext|><|assistant|>"
    # Include the most recent exchange from the chat history, if any,
    # so the model sees one turn of conversational context.
    input_prompt = ''
    if len(chat_history) > 0:
        input_prompt += ("<|prompter|>" + chat_history[-1][0] + "<|endoftext|>"
                         + "<|assistant|>" + chat_history[-1][1] + "<|endoftext|>")
    input_prompt += "<|prompter|>" + prompt + "<|endoftext|><|assistant|>"

    # Count the number of tokens in the prompt
    num_prompt_tokens = len(tokenizer(input_prompt)['input_ids'])
    # Calculate the maximum length for the generation
    max_length = num_prompt_tokens + num_new_tokens

    gen = pipeline('text-generation', model=model, tokenizer=tokenizer,
                   max_length=max_length)

    # Generate from the full templated prompt (not just the raw user prompt),
    # then strip the prompt from the output so only the new reply remains.
    result = gen(input_prompt)
    return result[0]['generated_text'].replace(input_prompt, '')


with gr.Blocks() as demo:
    gr.HTML("
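
# For illustration only: a minimal smoke test of generate_text outside the
# Gradio UI, assuming the model and tokenizer above loaded successfully.
#
#   reply = generate_text("What is 2 + 2?", chat_history=[])
#   print(reply)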