"""Gradio demo for the Qwen QVQ-72B-Preview vision-language model.

Takes an image (and an optional text question), runs the model, and
returns the generated text response in a simple Blocks UI.
"""

import spaces
import os        # NOTE(review): unused, kept to avoid breaking unseen callers
import time      # NOTE(review): unused, kept to avoid breaking unseen callers
import torch
import gradio as gr
from threading import Thread  # NOTE(review): unused, kept for safety
from PIL import Image

# Install required packages at runtime (Hugging Face Spaces workaround).
# List-form argv with the default shell=False avoids shell-string injection.
import subprocess
subprocess.run(['pip', 'install', '--upgrade', 'transformers'])
subprocess.run(['pip', 'install', 'accelerate'])

# BUG FIX: ``AutoModelForVisionText2Text`` does not exist in transformers
# (the original import raised ImportError). The auto class that covers
# image+text -> text models such as the Qwen2-VL family is
# ``AutoModelForVision2Seq``.
from transformers import AutoProcessor, AutoModelForVision2Seq

MODEL_ID = "Qwen/QVQ-72B-Preview"

# Processor handles both image preprocessing and text tokenization.
processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    trust_remote_code=True
)

# torch_dtype="auto" loads the checkpoint in its native precision (bf16)
# instead of fp32 — a 72B model in fp32 would need roughly twice the memory.
# device_map="auto" shards the weights across available accelerators.
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="auto"
).eval()

# Footer
footer = """

Powered by QVQ-72B Model

"""


# Vision model function
@spaces.GPU()
def process_image(image, text_input=None):
    """Run the vision model on ``image`` with an optional ``text_input`` prompt.

    Args:
        image: numpy array as delivered by ``gr.Image`` (HxWxC uint8).
        text_input: optional question/prompt string; when falsy, the
            processor is called with the image alone.

    Returns:
        The decoded model response, or an ``"Error processing image: …"``
        string if anything raises. The broad ``except`` is deliberate:
        this is a UI boundary and a crash here would take down the demo.
    """
    try:
        # Convert the raw array from Gradio into an RGB PIL image.
        image = Image.fromarray(image).convert("RGB")

        # Prepare inputs — with or without an accompanying text prompt.
        if text_input:
            inputs = processor(text=text_input, images=image, return_tensors="pt")
        else:
            inputs = processor(images=image, return_tensors="pt")

        # Move all input tensors onto the same device as the model.
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        # Generate output
        outputs = model.generate(**inputs, max_new_tokens=1000)

        # Decode the full sequence (prompt + completion) to text.
        # NOTE(review): this echoes the prompt tokens back in the response;
        # slicing generated ids past the input length would return only the
        # completion — left as-is to preserve existing output format.
        response = processor.batch_decode(outputs, skip_special_tokens=True)[0]
        return response
    except Exception as e:
        return f"Error processing image: {str(e)}"


# CSS styling — hides Gradio's default footer.
css = """
footer {
    visibility: hidden;
}
"""

# Gradio interface
with gr.Blocks(theme="Yntec/HaleyCH_Theme_Orange", css=css) as demo:
    with gr.Row():
        input_img = gr.Image(label="Input Image")
    with gr.Row():
        text_input = gr.Textbox(label="Question (Optional)")
    with gr.Row():
        submit_btn = gr.Button(value="Submit")
    with gr.Row():
        output_text = gr.Textbox(label="Response")

    submit_btn.click(process_image, [input_img, text_input], [output_text])
    gr.HTML(footer)

# Launch the app
demo.launch(debug=True)