Spaces: Running on Zero
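The example below is a SmolVLM visual question answering demo written for ZeroGPU Spaces: the model is moved to CUDA in global scope, and the GPU-bound inference function is marked with the @spaces.GPU decorator so a GPU is attached only while it runs.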
import gradio as gr
from transformers import AutoProcessor, AutoModelForVision2Seq
from PIL import Image
import torch
import spaces
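# On ZeroGPU Spaces no GPU is attached at startup; the spaces package makes
# moving the model to CUDA in global scope safe, with the device actually
# bound when a @spaces.GPU-decorated function is called.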
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
model = AutoModelForVision2Seq.from_pretrained(
    "HuggingFaceTB/SmolVLM-Instruct",
    torch_dtype=torch.bfloat16,
).to("cuda")
DEFAULT_ASSISTANT_PREFIX = "Let's think step by step:"
DEFAULT_DECODING_STRATEGY = "Top P Sampling"
DEFAULT_TEMPERATURE = 0.4
DEFAULT_MAX_NEW_TOKENS = 512
DEFAULT_REPETITION_PENALTY = 1.2
DEFAULT_TOP_P = 0.8
@spaces.GPU  # ZeroGPU: attach a GPU for the duration of this call
def model_inference(images, text):
    # Validate the text and image queries coming from the user
    if text == "" and not images:
        return "Please input a query and optionally image(s)."
    if text == "" and images:
        return "Please input a text query along with the image(s)."
    if isinstance(images, Image.Image):
        images = [images]
    text = f"{DEFAULT_ASSISTANT_PREFIX} {text}"
    # Build a chat message with one image placeholder per input image
    resulting_messages = [
        {
            "role": "user",
            "content": [{"type": "image"} for _ in images] + [{"type": "text", "text": text}],
        }
    ]
    prompt = processor.apply_chat_template(resulting_messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=images, return_tensors="pt")
    inputs = {k: v.to("cuda") for k, v in inputs.items()}
    # Nucleus (top-p) sampling by default; greedy decoding otherwise
    generation_args = {
        "max_new_tokens": DEFAULT_MAX_NEW_TOKENS,
        "repetition_penalty": DEFAULT_REPETITION_PENALTY,
        "temperature": DEFAULT_TEMPERATURE,
        "do_sample": DEFAULT_DECODING_STRATEGY == "Top P Sampling",
        "top_p": DEFAULT_TOP_P if DEFAULT_DECODING_STRATEGY == "Top P Sampling" else None,
    }
    generation_args.update(inputs)
    generated_ids = model.generate(**generation_args)
    # Decode only the newly generated tokens, slicing off the prompt
    generated_texts = processor.batch_decode(
        generated_ids[:, generation_args["input_ids"].size(1):],
        skip_special_tokens=True,
    )
    return generated_texts[0]
examples = [
    ["image1.jpeg", "What does this painting tell us? Explain in detail."],
    ["image2.jpg", "What does this painting tell us? Explain in detail."],
    ["image3.jpg", "Describe the scene in this picture."],
]
with gr.Blocks() as demo:
    gr.Markdown("## SmolVLM Vision Instruct Demo with Example Inputs")
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(label="Input Picture", type="pil", interactive=True)
            query_input = gr.Textbox(label="Question", interactive=True)
            submit_btn = gr.Button("Submit")
        with gr.Column():
            output_text = gr.Textbox(label="Output Text", interactive=False)
    gr.Examples(
        examples=examples,
        inputs=[image_input, query_input],
        outputs=output_text,
        fn=model_inference,
    )
    submit_btn.click(
        model_inference,
        inputs=[image_input, query_input],
        outputs=output_text,
    )

demo.launch(debug=True)
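ZeroGPU grants the GPU for a bounded time slice per decorated call. When a single generation might exceed the default window, spaces.GPU accepts a duration argument. A minimal sketch; the 120-second value and the function name are illustrative, not part of the original demo:

import spaces

@spaces.GPU(duration=120)  # hypothetical: ask for up to 120 seconds of GPU time per call
def model_inference_long(images, text):
    # identical generation logic to model_inference above
    ...

Requesting only the time you need helps the Space schedule GPU slices fairly across users.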