"""Gradio demo with two tabs: a Phi-3.5-mini chatbot and a Phi-3.5-vision Q&A."""

# NOTE(review): `spaces` is kept as the very first import, as in the original
# file; presumably it must be loaded before torch touches CUDA -- confirm
# before reordering.
import spaces
import os
import time
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TextIteratorStreamer,
    BitsAndBytesConfig,
    AutoProcessor,
)
import gradio as gr
from threading import Thread
from PIL import Image
import subprocess
import sys

# Install flash-attn if not already installed.
# The current environment is merged in (instead of being replaced) so that
# PATH, HOME and any virtualenv settings survive, and pip is invoked through
# the running interpreter without a shell.  Failure is tolerated
# (check=False), matching the original best-effort behaviour.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"],
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    check=False,
)

# Define placeholder and footer
PLACEHOLDER = "Send a message..."

footer = """

Powered by Phi-3.5 Models

"""

# Model and tokenizer for the chatbot
MODEL_ID1 = "microsoft/Phi-3.5-mini-instruct"
MODEL_LIST1 = ["microsoft/Phi-3.5-mini-instruct"]
HF_TOKEN = os.environ.get("HF_TOKEN", None)

# "cuda" for GPU usage or "cpu" for CPU usage / But you need GPU :)
device = "cuda" if torch.cuda.is_available() else "cpu"

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID1)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID1,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=quantization_config,
)


# Chatbot tab function
@spaces.GPU()
def stream_chat(
    message: str,
    history: list,
    system_prompt: str,
    temperature: float = 0.8,
    max_new_tokens: int = 1024,
    top_p: float = 1.0,
    top_k: int = 20,
    penalty: float = 1.2,
):
    """Stream the chatbot's reply to *message*.

    Args:
        message: The latest user message.
        history: Previous turns, iterated as ``(user_text, assistant_text)``
            pairs.
        system_prompt: Content of the leading system message.
        temperature: Sampling temperature; a value of 0 (or less) selects
            greedy decoding.
        max_new_tokens: Upper bound on the number of generated tokens.
        top_p: Nucleus-sampling threshold (used only when sampling).
        top_k: Top-k cutoff (used only when sampling).
        penalty: Repetition penalty forwarded to ``generate``; non-positive
            values are ignored.

    Yields:
        The reply accumulated so far, once per chunk emitted by the streamer.
    """
    print(f'message: {message}')
    print(f'history: {history}')

    conversation = [{"role": "system", "content": system_prompt}]
    for prompt, answer in history:
        conversation.extend([
            {"role": "user", "content": prompt},
            {"role": "assistant", "content": answer},
        ])
    conversation.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(
        conversation, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)

    streamer = TextIteratorStreamer(
        tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True
    )

    # No explicit eos_token_id is passed: the previous hard-coded ids
    # (128001, 128008, 128009) are not Phi-3.5 tokens, so generation is left
    # to stop on the end-of-sequence ids from the model's generation config.
    generate_kwargs = dict(
        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
        streamer=streamer,
    )
    if temperature > 0:
        generate_kwargs.update(
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
        )
    else:
        # Greedy decoding: sampling knobs are irrelevant and are left out.
        generate_kwargs["do_sample"] = False

    # The UI slider allows 0.0, which is not a valid repetition penalty, so
    # only strictly positive values are forwarded.
    if penalty > 0:
        generate_kwargs["repetition_penalty"] = penalty

    # Generation runs in a worker thread so tokens can be consumed here while
    # they are produced.  A torch.no_grad() context opened in this thread
    # would not apply to the worker, so none is used.
    thread = Thread(target=model.generate, kwargs=generate_kwargs)
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer

    thread.join()


# Vision model setup
models = {
    "microsoft/Phi-3.5-vision-instruct": AutoModelForCausalLM.from_pretrained(
        "microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,
        torch_dtype="auto",
        _attn_implementation="flash_attention_2",
    ).cuda().eval()
}

processors = {
    "microsoft/Phi-3.5-vision-instruct": AutoProcessor.from_pretrained(
        "microsoft/Phi-3.5-vision-instruct", trust_remote_code=True
    )
}

user_prompt = '\n'
assistant_prompt = '\n'
prompt_suffix = "\n"


# Vision model tab function
@spaces.GPU()
def stream_vision(image, text_input=None, model_id="microsoft/Phi-3.5-vision-instruct"):
    """Answer an optional question about *image* with the selected vision model.

    Args:
        image: The uploaded picture as delivered by ``gr.Image``; it is
            passed to ``Image.fromarray``.  ``None`` means nothing was
            uploaded.
        text_input: Optional question appended after the image tag.
        model_id: Key into the module-level ``models`` / ``processors`` dicts.

    Returns:
        The decoded model response as a string.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        raise gr.Error("Please upload an image before submitting.")

    vision_model = models[model_id]
    processor = processors[model_id]

    # Prepare the image list and corresponding tags
    images = [Image.fromarray(image).convert("RGB")]
    placeholder = "<|image_1|>\n"  # Using the image tag as per the example

    # Construct the prompt with the image tag and the user's text input
    prompt_content = placeholder + text_input if text_input else placeholder
    messages = [
        {"role": "user", "content": prompt_content},
    ]

    # Apply the chat template to the messages
    prompt = processor.tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Process the inputs and move them to wherever the model lives
    inputs = processor(prompt, images, return_tensors="pt").to(vision_model.device)

    # Greedy decoding; no temperature is passed because it has no effect
    # when do_sample is False.
    generation_args = {
        "max_new_tokens": 1000,
        "do_sample": False,
    }

    # Generate the response
    generate_ids = vision_model.generate(
        **inputs,
        eos_token_id=processor.tokenizer.eos_token_id,
        **generation_args,
    )

    # Remove input tokens from the generated response
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]

    # Decode the generated output
    response = processor.batch_decode(
        generate_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    return response


css = """ footer { visibility: hidden; } """

# Gradio app with two tabs
with gr.Blocks(theme="Yntec/HaleyCH_Theme_Orange", css=css) as demo:
    with gr.Tab("Chatbot"):
        chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)
        gr.ChatInterface(
            fn=stream_chat,
            chatbot=chatbot,
            fill_height=True,
            additional_inputs_accordion=gr.Accordion(
                label="⚙️ Parameters", open=False, render=False
            ),
            additional_inputs=[
                gr.Textbox(
                    value="You are a helpful assistant",
                    label="System Prompt",
                    render=False,
                ),
                gr.Slider(
                    minimum=0,
                    maximum=1,
                    step=0.1,
                    value=0.8,
                    label="Temperature",
                    render=False,
                ),
                gr.Slider(
                    minimum=128,
                    maximum=8192,
                    step=1,
                    value=1024,
                    label="Max new tokens",
                    render=False,
                ),
                gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    step=0.1,
                    value=1.0,
                    label="top_p",
                    render=False,
                ),
                gr.Slider(
                    minimum=1,
                    maximum=20,
                    step=1,
                    value=20,
                    label="top_k",
                    render=False,
                ),
                gr.Slider(
                    minimum=0.0,
                    maximum=2.0,
                    step=0.1,
                    value=1.2,
                    label="Repetition penalty",
                    render=False,
                ),
            ],
            examples=[
                ["How to make a self-driving car?"],
                ["Give me a creative idea to establish a startup"],
                ["How can I improve my programming skills?"],
                ["Show me a code snippet of a website's sticky header in CSS and JavaScript."],
            ],
            cache_examples=False,
        )

    with gr.Tab("Vision"):
        with gr.Row():
            input_img = gr.Image(label="Input Picture")
        with gr.Row():
            model_selector = gr.Dropdown(
                choices=list(models.keys()),
                label="Model",
                value="microsoft/Phi-3.5-vision-instruct",
            )
        with gr.Row():
            text_input = gr.Textbox(label="Question")
        with gr.Row():
            submit_btn = gr.Button(value="Submit")
        with gr.Row():
            output_text = gr.Textbox(label="Output Text")

        submit_btn.click(stream_vision, [input_img, text_input, model_selector], [output_text])

    gr.HTML(footer)

# Launch the combined app
demo.launch(debug=True)