import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer
import gradio as gr

# Use the GPU when one is available; fall back to the CPU otherwise.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the model and tokenizer. trust_remote_code is required because
# MiniCPM-V ships its own modeling code; bfloat16 keeps GPU memory usage down.
model = AutoModel.from_pretrained(
    'openbmb/MiniCPM-V',
    trust_remote_code=True,
    torch_dtype=torch.bfloat16 if device == 'cuda' else torch.float32
)
model = model.eval().to(device)
tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V', trust_remote_code=True)

# Answer a question about the uploaded image.
def predict(image, question):
    if image is None:
        return "Please upload an image first."
    image = image.convert('RGB')
    msgs = [{'role': 'user', 'content': [image, question]}]
    res = model.chat(image=None, msgs=msgs, tokenizer=tokenizer)
    # model.chat returns the answer as a string (or as a stream of text
    # chunks when streaming is enabled); joining handles both cases.
    return "".join(res)

# Set up the Gradio interface.
iface = gr.Interface(
    fn=predict,
    inputs=[
        gr.Image(type="pil", label="Upload an Image"),
        gr.Textbox(label="Ask a Question")
    ],
    outputs="text",
    title="Image Question Answering",
    description="Upload an image and ask a question about its content."
)

# Launch the Gradio app.
iface.launch()
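
# Optional smoke test: exercise predict() directly, without the web UI.
# A minimal sketch; 'example.jpg' is a placeholder for any local image file.
#
#   img = Image.open('example.jpg')
#   print(predict(img, 'What is shown in this image?'))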