"""Gradio demo that returns the DINOv2 embedding of an uploaded image."""

import logging

import gradio as gr
import numpy as np
import torch
import torchvision.transforms as T
from PIL import Image

logger = logging.getLogger(__name__)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# DINOv2
# Select checkpoint: change the index to pick a different checkpoint name.
dinov2_ckpt = [
    'dinov2_vits14',
    'dinov2_vitb14',
    'dinov2_vitl14',
    'dinov2_vitg14',
][1]

# Fetch the selected checkpoint through torch.hub, move it to the chosen
# device, and switch to inference mode so any train-only layers are disabled.
dinov2 = torch.hub.load('facebookresearch/dinov2', dinov2_ckpt)
dinov2.to(device)
dinov2.eval()

# Preprocessing: fixed 224x224 resize, conversion to a float tensor, then
# per-channel normalization (these are the standard ImageNet statistics).
transform_image = T.Compose([
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])


def predict(image: Image.Image) -> dict:
    """Compute the DINOv2 embedding of a single image.

    Args:
        image: A PIL image in any mode; it is converted to RGB before
            preprocessing.

    Returns:
        A dict of the form ``{"embedding": [...]}`` whose value is the
        model output for this image as a flat list of Python floats.

    Raises:
        gr.Error: If no image was supplied (``image`` is ``None``).
    """
    if image is None:
        # Gradio passes None when the form is submitted without an upload.
        raise gr.Error("Please provide an image.")

    # The normalization step expects exactly three channels, so RGBA,
    # grayscale and palette images must be converted *before* the transform.
    rgb_image = image.convert("RGB")
    batch = transform_image(rgb_image).unsqueeze(0).to(device)

    # Get the embedding of the image (no gradients needed for inference).
    with torch.no_grad():
        embedding = dinov2(batch)
    logger.debug("embedding shape: %s", tuple(embedding.shape))

    # Drop the batch dimension and convert to plain Python floats for JSON.
    return {"embedding": embedding[0].cpu().tolist()}


# Create a Gradio interface.
interface = gr.Interface(
    fn=predict,
    inputs=[gr.Image(type='pil')],
    outputs=[gr.JSON()],
    title="DINOv2 Embedding",
    description=dinov2_ckpt,
)

# Start the Gradio server only when executed as a script, so importing this
# module (e.g. to reuse `predict`) does not block on a running web server.
if __name__ == "__main__":
    interface.launch()