# MolmoVision / app.py
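#
# Assumed dependencies (inferred from the imports and the Molmo model card,
# not pinned in this file): gradio, spaces, transformers, torch, pillow,
# requests, accelerate (for device_map='auto'), and einops/torchvision
# (required by Molmo's trust_remote_code implementation).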

import spaces
import gradio as gr
import requests
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig

# Load the processor and model
processor = AutoProcessor.from_pretrained(
    'allenai/Molmo-7B-D-0924',
    trust_remote_code=True,
    torch_dtype='auto',
    device_map='auto'
)
model = AutoModelForCausalLM.from_pretrained(
    'allenai/Molmo-7B-D-0924',
    trust_remote_code=True,
    torch_dtype='auto',
    device_map='auto'
)
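
# Hedged sanity check (an addition, not in the original app): log where
# Accelerate placed the weights, since device_map='auto' decides at load time.
print(f"Molmo loaded on {model.device} (dtype: {model.dtype})")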


@spaces.GPU
def describe_image(image, prompt):
    # Guard against the button being clicked before an image is uploaded
    if image is None:
        return "Please upload an image first."

    # Process the image together with the user-provided text prompt
    inputs = processor.process(images=[image], text=prompt)

    # Move inputs to the model's device and make a batch of size 1
    inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}

    # Generate at most 200 new tokens, stopping at the end-of-text marker
    output = model.generate_from_batch(
        inputs,
        GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
        tokenizer=processor.tokenizer
    )

    # Decode only the newly generated tokens, skipping the echoed prompt
    generated_tokens = output[0, inputs['input_ids'].size(1):]
    return processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
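

# Optional local smoke test: a hedged sketch, not part of the original app.
# It is gated behind a hypothetical MOLMO_SMOKE_TEST env var so it never runs
# on Spaces; the placeholder image URL is an assumption for illustration only.
import os

if os.environ.get("MOLMO_SMOKE_TEST"):
    test_image = Image.open(
        requests.get("https://picsum.photos/536/354", stream=True).raw
    )
    print(describe_image(test_image, "Describe this image."))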


# Gradio interface built with the Blocks API
with gr.Blocks() as demo:
    gr.Markdown("# Visual Language Model - Molmo")

    with gr.Row():
        image_input = gr.Image(type="pil", label="Upload an image")
        text_input = gr.Textbox(label="Enter a prompt", placeholder="Describe this image...")

    output_text = gr.Textbox(label="Generated Description")
    submit_button = gr.Button("Generate Description")

    # Wire the image and prompt inputs to describe_image and its text output
    submit_button.click(fn=describe_image, inputs=[image_input, text_input], outputs=output_text)
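
# Hedged addition, not in the original app: enable Gradio's request queue so
# concurrent users wait their turn for the GPU worker instead of timing out.
demo.queue()
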
# Launch the app
demo.launch()