import spaces
import subprocess

# Install required packages at startup (a common Hugging Face Spaces workaround);
# qwen-vl-utils supplies process_vision_info, which QVQ's chat format relies on.
subprocess.run('pip install --upgrade transformers', shell=True)
subprocess.run('pip install accelerate qwen-vl-utils', shell=True)

import torch
import gradio as gr
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

# Model and processor initialization. QVQ-72B-Preview is a vision-language
# model, so it is loaded with Qwen2VLForConditionalGeneration and an
# AutoProcessor (which handles both images and text) rather than
# AutoModelForCausalLM with a plain tokenizer.
model_name = "Qwen/QVQ-72B-Preview"
processor = AutoProcessor.from_pretrained(model_name)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
)
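# Sizing note (an estimate, not stated in the original code): 72B parameters in
# float16 come to roughly 144 GB of weights alone, so device_map="auto" is
# expected to shard the model across multiple large GPUs.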

# Footer HTML rendered at the bottom of the app
footer = """
<div style="text-align: center;">
    Powered by QVQ-72B Model
</div>
"""

# Vision model function. QVQ-72B-Preview has no model.chat() helper; the
# processor + generate flow below follows the model card's usage.
@spaces.GPU()
def process_image(image, text_input=None):
    try:
        # Gradio passes the image as a numpy array; convert to PIL RGB
        image = Image.fromarray(image).convert("RGB")

        # Prepare the prompt
        if text_input:
            prompt = f"Please describe this image and answer: {text_input}"
        else:
            prompt = "Please describe this image in detail."

        # Build a chat message carrying both the image and the text prompt
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        # Render the chat template, then preprocess text and image together
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        ).to(model.device)

        # Generate (QVQ tends to produce long step-by-step reasoning, so cap
        # the response length), then decode only the newly generated tokens
        generated_ids = model.generate(**inputs, max_new_tokens=2048)
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        return processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]
    except Exception as e:
        return f"Error processing image: {str(e)}"

# CSS styling: hide Gradio's built-in footer so only the custom one is shown
css = """
footer {
    visibility: hidden;
}
"""

# Gradio interface
with gr.Blocks(theme="Yntec/HaleyCH_Theme_Orange", css=css) as demo:
    with gr.Row():
        input_img = gr.Image(label="Input Image")
    with gr.Row():
        text_input = gr.Textbox(label="Question (Optional)")
    with gr.Row():
        submit_btn = gr.Button(value="Submit")
    with gr.Row():
        output_text = gr.Textbox(label="Response")

    submit_btn.click(process_image, [input_img, text_input], [output_text])
    gr.HTML(footer)

# Launch the app
demo.launch(debug=True)