import spaces
import os
import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig, AutoProcessor
import gradio as gr
from threading import Thread
from PIL import Image
import subprocess
# Install flash-attn if not already installed
subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
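# FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE skips compiling the CUDA extensions at install time (assumes a prebuilt wheel is available)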
# Model and tokenizer for the chatbot
MODEL_ID1 = "microsoft/Phi-3.5-mini-instruct"
MODEL_LIST1 = ["microsoft/Phi-3.5-mini-instruct"]
HF_TOKEN = os.environ.get("HF_TOKEN", None)
device = "cuda" if torch.cuda.is_available() else "cpu"  # falls back to CPU, but a GPU is effectively required for these models
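# Load the chat model in 4-bit NF4 (double quantization, bfloat16 compute) to reduce GPU memory use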
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID1)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID1,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=quantization_config,
)
# Chatbot tab function
@spaces.GPU()
def stream_chat(
    message: str,
    history: list,
    system_prompt: str,
    temperature: float = 0.8,
    max_new_tokens: int = 1024,
    top_p: float = 1.0,
    top_k: int = 20,
    penalty: float = 1.2,
):
    print(f'message: {message}')
    print(f'history: {history}')
    conversation = [
        {"role": "system", "content": system_prompt}
    ]
    for prompt, answer in history:
        conversation.extend([
            {"role": "user", "content": prompt},
            {"role": "assistant", "content": answer},
        ])
    conversation.append({"role": "user", "content": message})
    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
        do_sample=False if temperature == 0 else True,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        repetition_penalty=penalty,
        streamer=streamer,
    )
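    # Run generate() in a background thread so partial output can be streamed from the TextIteratorStreamer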
    with torch.no_grad():
        thread = Thread(target=model.generate, kwargs=generate_kwargs)
        thread.start()
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer
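# Illustrative only: gr.ChatInterface drives stream_chat roughly like
#   for partial in stream_chat("Hi", [("Hello", "Hi there!")], "You are a helpful assistant"):
#       ...  # each yielded value is the accumulated reply so far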
# Vision model setup
models = {
    "microsoft/Phi-3.5-vision-instruct": AutoModelForCausalLM.from_pretrained("microsoft/Phi-3.5-vision-instruct", trust_remote_code=True, torch_dtype="auto", _attn_implementation="flash_attention_2").cuda().eval()
}
processors = {
    "microsoft/Phi-3.5-vision-instruct": AutoProcessor.from_pretrained("microsoft/Phi-3.5-vision-instruct", trust_remote_code=True)
}
user_prompt = '\n'
assistant_prompt = '\n'
prompt_suffix = "\n"
# Vision model tab function
@spaces.GPU()
def stream_vision(image, text_input=None, model_id="microsoft/Phi-3.5-vision-instruct"):
    model = models[model_id]
    processor = processors[model_id]
    # Prepare the image list and corresponding tags
    images = [Image.fromarray(image).convert("RGB")]
    placeholder = "<|image_1|>\n"  # Using the image tag as per the example
    # Construct the prompt with the image tag and the user's text input
    if text_input:
        prompt_content = placeholder + text_input
    else:
        prompt_content = placeholder
    messages = [
        {"role": "user", "content": prompt_content},
    ]
    # Apply the chat template to the messages
    prompt = processor.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    # Process the inputs with the processor
    inputs = processor(prompt, images, return_tensors="pt").to("cuda:0")
    # Generation parameters
    generation_args = {
        "max_new_tokens": 1000,
        "temperature": 0.0,
        "do_sample": False,
    }
    # Generate the response
    generate_ids = model.generate(
        **inputs,
        eos_token_id=processor.tokenizer.eos_token_id,
        **generation_args
    )
    # Remove input tokens from the generated response
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    # Decode the generated output
    response = processor.batch_decode(
        generate_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0]
    return response
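# Illustrative only: outside Gradio, stream_vision expects a numpy image array, e.g.
#   import numpy as np
#   answer = stream_vision(np.array(Image.open("photo.jpg")), "Describe this image")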
css = """
footer {
visibility: hidden;
}
"""
# Gradio app with two tabs
with gr.Blocks(theme="Yntec/HaleyCH_Theme_Orange", css=css) as demo:
    with gr.Tab("Chatbot"):
        chatbot = gr.Chatbot(height=600)
        gr.ChatInterface(
            fn=stream_chat,
            chatbot=chatbot,
            fill_height=True,
            additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
            additional_inputs=[
                gr.Textbox(
                    value="You are a helpful assistant",
                    label="System Prompt",
                    render=False,
                ),
                gr.Slider(
                    minimum=0,
                    maximum=1,
                    step=0.1,
                    value=0.8,
                    label="Temperature",
                    render=False,
                ),
                gr.Slider(
                    minimum=128,
                    maximum=8192,
                    step=1,
                    value=1024,
                    label="Max new tokens",
                    render=False,
                ),
                gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    step=0.1,
                    value=1.0,
                    label="top_p",
                    render=False,
                ),
                gr.Slider(
                    minimum=1,
                    maximum=20,
                    step=1,
                    value=20,
                    label="top_k",
                    render=False,
                ),
                gr.Slider(
                    minimum=0.0,
                    maximum=2.0,
                    step=0.1,
                    value=1.2,
                    label="Repetition penalty",
                    render=False,
                ),
            ],
            examples=[
                ["How to make a self-driving car?"],
                ["Give me a creative idea to establish a startup"],
                ["How can I improve my programming skills?"],
                ["Show me a code snippet of a website's sticky header in CSS and JavaScript."],
            ],
            cache_examples=False,
        )
    with gr.Tab("Vision"):
        with gr.Row():
            input_img = gr.Image(label="Input Picture")
        with gr.Row():
            model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="microsoft/Phi-3.5-vision-instruct")
        with gr.Row():
            text_input = gr.Textbox(label="Question")
        with gr.Row():
            submit_btn = gr.Button(value="Submit")
        with gr.Row():
            output_text = gr.Textbox(label="Output Text")
        submit_btn.click(stream_vision, [input_img, text_input, model_selector], [output_text])
# Launch the combined app
demo.launch(debug=True)