"""Gradio demo that answers questions about an image with the QVQ-72B model."""

import logging
import os
import time
from threading import Thread
from typing import Optional

# NOTE: `spaces` is kept ahead of `torch`, as in the original import order.
import spaces
import torch
import gradio as gr
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

logger = logging.getLogger(__name__)

# Single source of truth for the checkpoint used by both model and processor.
MODEL_ID = "Qwen/QVQ-72B-Preview"
# Prompt used when the user submits an image without a question.
DEFAULT_PROMPT = "Please describe this image in detail."
SYSTEM_PROMPT = (
    "You are a helpful and harmless assistant. You are Qwen developed by "
    "Alibaba. You should think step-by-step."
)
# Upper bound on the number of tokens generated per request.
MAX_NEW_TOKENS = 8192

# Model and processor initialization
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(MODEL_ID)

# Footer
footer = """

Powered by QVQ-72B Model

"""


def _build_messages(image, text_input):
    """Return the system + user chat messages for one image/question pair."""
    return [
        {
            "role": "system",
            "content": [{"type": "text", "text": SYSTEM_PROMPT}],
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": text_input},
            ],
        },
    ]


# Vision model function
@spaces.GPU()
def process_image(image, text_input: Optional[str] = None) -> str:
    """Run the vision-language model on an image and return its answer.

    Args:
        image: A ``PIL.Image.Image`` or an array accepted by
            ``Image.fromarray``. ``None`` (nothing uploaded) is rejected
            with an error string.
        text_input: The user's question. When missing, empty, or only
            whitespace, a generic "describe this image" prompt is used.

    Returns:
        The decoded model output, or a string starting with
        ``"Error processing image: "`` if anything went wrong. Errors are
        returned rather than raised so the UI can display them in the
        response textbox.
    """
    if image is None:
        # Fail fast with a readable message instead of the opaque error
        # Image.fromarray(None) would produce.
        return "Error processing image: no image was provided."

    try:
        # Normalize every input to an RGB PIL image, including PIL inputs
        # that arrive in another mode (e.g. RGBA or grayscale).
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)
        image = image.convert("RGB")

        # Treat a whitespace-only question the same as no question.
        if not text_input or not text_input.strip():
            text_input = DEFAULT_PROMPT

        messages = _build_messages(image, text_input)

        # Process inputs
        prompt = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[prompt],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        # Follow the model's own device instead of hard-coding "cuda":
        # the model is placed by device_map="auto".
        inputs = inputs.to(model.device)

        # Generate response
        generated_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)
        # generate() returns prompt + completion; drop the prompt tokens so
        # only the newly generated answer is decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        return processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]
    except Exception as e:
        # UI boundary: keep the traceback in the logs, show a short message
        # to the user.
        logger.exception("Image processing failed")
        return f"Error processing image: {str(e)}"


# CSS styling
css = """
footer {
    visibility: hidden;
}
"""

# Gradio interface
with gr.Blocks(theme="Yntec/HaleyCH_Theme_Orange", css=css) as demo:
    with gr.Row():
        input_img = gr.Image(label="Input Image")
    with gr.Row():
        text_input = gr.Textbox(label="Question (Optional)")
    with gr.Row():
        submit_btn = gr.Button(value="Submit")
    with gr.Row():
        output_text = gr.Textbox(label="Response")

    submit_btn.click(process_image, [input_img, text_input], [output_text])
    gr.HTML(footer)

# Launch the app
if __name__ == "__main__":
    demo.launch(debug=True)