import torch
import gradio as gr
from transformers import AutoModel, AutoTokenizer

# Load the model and tokenizer (bfloat16 weights on the GPU; requires CUDA)
model = AutoModel.from_pretrained(
    'openbmb/MiniCPM-V-2_6',
    trust_remote_code=True,
    attn_implementation='sdpa',
    torch_dtype=torch.bfloat16,
)
model = model.eval().cuda()
tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2_6', trust_remote_code=True)

# Answer a free-form question about an uploaded image
def predict(image, question):
    image = image.convert('RGB')  # gr.Image(type="pil") hands us a PIL image
    msgs = [{'role': 'user', 'content': [image, question]}]
    # The image travels inside msgs, so the image argument stays None.
    # Without stream=True, model.chat returns the full answer as a string,
    # so there is no chunk iterator to accumulate; return it directly.
    res = model.chat(image=None, msgs=msgs, tokenizer=tokenizer)
    return res

# Set up the Gradio interface
iface = gr.Interface(
    fn=predict,
    inputs=[
        gr.Image(type="pil", label="Upload an Image"),
        gr.Textbox(label="Ask a Question"),
    ],
    outputs="text",
    title="Image Question Answering",
    description="Upload an image and ask a question about its content.",
)

# Launch the Gradio app
iface.launch()
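
# Optional: stream the answer into the textbox instead of returning it all at
# once. Gradio treats a generator function as a streaming endpoint, and the
# MiniCPM-V 2.6 model card documents sampling=True, stream=True for chunked
# output from model.chat. A minimal sketch, reusing the model and tokenizer
# loaded above; predict_stream is a name chosen here, not part of any API.
# To use it, define it in place of predict above and pass fn=predict_stream
# to gr.Interface.
def predict_stream(image, question):
    image = image.convert('RGB')
    msgs = [{'role': 'user', 'content': [image, question]}]
    # With stream=True, model.chat returns an iterator of text chunks
    res = model.chat(image=None, msgs=msgs, tokenizer=tokenizer,
                     sampling=True, stream=True)
    generated_text = ""
    for new_text in res:
        generated_text += new_text
        yield generated_text  # each yield replaces the textbox contents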