# Source: app.py (2.97 kB) from a Hugging Face Space by gizemsarsinlar.
# Commit 45352fb (verified): "Update app.py"
import gradio as gr
from transformers import AutoProcessor, AutoModelForVision2Seq
from PIL import Image
import torch
import spaces
# Load the processor and the model once, at import time, from the same
# checkpoint so their tokenizer / image preprocessing stay in sync.
_CHECKPOINT = "HuggingFaceTB/SmolVLM-Instruct"

processor = AutoProcessor.from_pretrained(_CHECKPOINT)

# Weights are loaded in bfloat16 and then moved to the GPU.
model = AutoModelForVision2Seq.from_pretrained(
    _CHECKPOINT,
    torch_dtype=torch.bfloat16,
)
model = model.to("cuda")
# Fixed generation settings used by model_inference.

# Text placed in front of the user's question before the prompt is built.
DEFAULT_ASSISTANT_PREFIX = "Let's think step by step:"
# Sampling is enabled (and top_p applied) only when this equals "Top P Sampling".
DEFAULT_DECODING_STRATEGY = "Top P Sampling"
# Passed to generate() as `temperature`.
DEFAULT_TEMPERATURE = 0.4
# Passed to generate() as `max_new_tokens`.
DEFAULT_MAX_NEW_TOKENS = 512
# Passed to generate() as `repetition_penalty`.
DEFAULT_REPETITION_PENALTY = 1.2
# Passed to generate() as `top_p` when the decoding strategy is "Top P Sampling".
DEFAULT_TOP_P = 0.8
@spaces.GPU
def model_inference(images, text):
    """Answer a text query, optionally about one or more images.

    Args:
        images: A single image, a list/tuple of images, or ``None`` when the
            user supplied no image.
        text: The user's question.

    Returns:
        The decoded model output with the prompt tokens stripped, or a short
        instruction message when ``text`` is missing or only whitespace.
    """
    # Normalise the image input to a list so the rest of the function has a
    # single code path. Previously ``None`` (no image uploaded) reached the
    # message-building loop and raised ``TypeError``, even though images are
    # advertised as optional.
    if images is None:
        images = []
    elif isinstance(images, (list, tuple)):
        images = list(images)
    else:
        images = [images]

    # Reject missing, empty and whitespace-only queries before touching the GPU.
    if not text or not text.strip():
        if images:
            return "Please input a text query along with the image(s)."
        return "Please input a query and optionally image(s)."

    # NOTE(review): despite its name, the prefix is prepended to the *user*
    # text rather than seeded into the assistant turn - confirm this is intended.
    text = f"{DEFAULT_ASSISTANT_PREFIX} {text}"

    # One image placeholder per supplied image, followed by the text query.
    content = [{"type": "image"} for _ in images]
    content.append({"type": "text", "text": text})
    resulting_messages = [{"role": "user", "content": content}]

    prompt = processor.apply_chat_template(resulting_messages, add_generation_prompt=True)
    # Pass ``None`` instead of an empty list so a text-only query is treated
    # by the processor as having no images at all.
    inputs = processor(text=prompt, images=images or None, return_tensors="pt")
    inputs = {k: v.to("cuda") for k, v in inputs.items()}

    use_sampling = DEFAULT_DECODING_STRATEGY == "Top P Sampling"
    generation_args = {
        "max_new_tokens": DEFAULT_MAX_NEW_TOKENS,
        "repetition_penalty": DEFAULT_REPETITION_PENALTY,
        "temperature": DEFAULT_TEMPERATURE,
        "do_sample": use_sampling,
        "top_p": DEFAULT_TOP_P if use_sampling else None,
    }

    generated_ids = model.generate(**inputs, **generation_args)

    # generate() returns prompt + continuation; keep only the new tokens.
    prompt_length = inputs["input_ids"].size(1)
    generated_texts = processor.batch_decode(
        generated_ids[:, prompt_length:], skip_special_tokens=True
    )
    return generated_texts[0]
# Example rows for the demo: [image file, question]. Both paintings share
# the same prompt, so it is spelled out once.
_PAINTING_PROMPT = "What does this painting tell us explain in detail?"

examples = [
    ["image1.jpeg", _PAINTING_PROMPT],
    ["image2.jpg", _PAINTING_PROMPT],
    ["image3.jpg", "Describe the scene in this picture."],
]
# Gradio UI: inputs in the left column, the model's answer in the right.
# NOTE(review): the original indentation was lost; the nesting below is the
# conventional reading of the statements - confirm against the deployed app.
with gr.Blocks() as demo:
    gr.Markdown("## SmolVLM Vision Instruct Demo with Example Inputs")

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(
                label="Input Picture",
                type="pil",
                interactive=True,
            )
            query_input = gr.Textbox(label="Question", interactive=True)
            submit_btn = gr.Button("Submit")

        with gr.Column():
            output_text = gr.Textbox(label="Output Text", interactive=False)

    # Example rows fill the image and question inputs.
    gr.Examples(
        examples=examples,
        inputs=[image_input, query_input],
        outputs=output_text,
        fn=model_inference,
    )

    # Run inference on the current inputs when the button is pressed.
    submit_btn.click(
        fn=model_inference,
        inputs=[image_input, query_input],
        outputs=output_text,
    )

demo.launch(debug=True)