Spaces:

akhaliq
/

omnivore

Runtime error

File size: 2,873 Bytes

81a2b37
37f7a9f
 
 
710b138
37f7a9f
 
 
 
 
 
c84dd6b
 
37f7a9f
 
 
c33a1bf
37f7a9f
c12ec0f
c33a1bf
92da7c8
 
 
f489d84
 
 
92da7c8
 
37f7a9f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fd97194
37f7a9f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ed6d020
37f7a9f
 
 
 
a755c75
37f7a9f
f4eea19
37f7a9f
 
ed6d020
c12ec0f

import os
import json
from typing import List


import torch
import torch.nn.functional as F
import torchvision.transforms as T
from PIL import Image
from torchvision.transforms._transforms_video import NormalizeVideo

import gradio as gr

# Device on which to run the model.
# Set to "cuda" to load on GPU.
device = "cpu"

# Pick a pretrained model.
model_name = "omnivore_swinB"

# Where the weights come from and where they are read from. The absolute path
# is the one this script has always loaded from; the download is directed to
# the same directory so the two cannot disagree when the working directory
# differs.
checkpoint_url = "https://huggingface.co/akhaliq/Omnivore/resolve/main/swinB_checkpoint.torch"
checkpoint_path = '/home/user/app/swinB_checkpoint.torch'

# Only fetch the weights when they are not already on disk, so a restart does
# not download the checkpoint again.
if not os.path.exists(checkpoint_path):
    os.system("wget " + checkpoint_url + " -P " + os.path.dirname(checkpoint_path))

# Build the architecture without the hub's own weights; they are replaced by
# the checkpoint loaded below.
model = torch.hub.load('facebookresearch/omnivore:main', model_name, pretrained=False)

# Deserialize the checkpoint once. map_location keeps the tensors on the CPU
# regardless of the device they were saved from; the model is moved to
# `device` afterwards.
checkpoint = torch.load(checkpoint_path, map_location="cpu")

# The checkpoint stores the "trunk" and "heads" weights in separate dicts;
# prefix each key with its section name to form the state dict that is loaded
# into the model.
new_dict = {}
for section in ("trunk", "heads"):
    for key, value in checkpoint[section].items():
        new_dict[section + "." + key] = value

model.load_state_dict(new_dict)

# Move to the desired device and switch to eval mode.
model = model.to(device).eval()

# Fetch the ImageNet class index only when it is not already present, so a
# restart does not download it again.
if not os.path.exists("imagenet_class_index.json"):
    os.system("wget https://s3.amazonaws.com/deep-learning-models/image-models/imagenet_class_index.json")

# Read the file with an explicit encoding so the result does not depend on the
# platform's default.
with open("imagenet_class_index.json", "r", encoding="utf-8") as f:
    imagenet_classnames = json.load(f)

# Create an id to label name mapping: keys are kept as they appear in the JSON
# file, and the element at index 1 of each entry is used as the label name.
imagenet_id_to_classname = {k: v[1] for k, v in imagenet_classnames.items()}
    
# Example picture offered in the demo; wget writes it to the current working
# directory as library.jpg.
example_image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/c/c5/13-11-02-olb-by-RalfR-03.jpg/800px-13-11-02-olb-by-RalfR-03.jpg"
os.system("wget " + example_image_url + " -O library.jpg")

def inference(img):
    """Classify one image with the Omnivore model and report the top-5 labels.

    Args:
        img: A PIL image (the Gradio input is declared with ``type='pil'``),
            or ``None`` when no image was supplied.

    Returns:
        A string of the form ``"Top 5 predicted labels: a, b, c, d, e"``, or a
        short prompt asking for an image when ``img`` is ``None``.
    """
    # Guard against an empty submission instead of failing inside the transform.
    if img is None:
        return "Please provide an image."

    image_transform = T.Compose(
        [
            T.Resize(224),
            T.CenterCrop(224),
            T.ToTensor(),
            T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ]
    )
    # Normalize is configured with three channel statistics, so force the image
    # to RGB first; grayscale or RGBA inputs would otherwise fail.
    image = image_transform(img.convert("RGB"))

    # The model expects inputs of shape: B x C x T x H x W
    # Move the batch to the same device the model was placed on.
    image = image[None, :, None, ...].to(device)

    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        prediction = model(image, input_type="image")
        prediction = F.softmax(prediction, dim=1)
    pred_classes = prediction.topk(k=5).indices

    # The label mapping is keyed by the class index as a string.
    pred_class_names = [imagenet_id_to_classname[str(i.item())] for i in pred_classes[0]]
    return "Top 5 predicted labels: %s" % ", ".join(pred_class_names)
    
# Gradio components: a PIL image goes in, a labelled text box comes out.
# NOTE(review): `gr.inputs` / `gr.outputs` and `launch(cache_examples=...)`
# look like the Gradio 2.x API - confirm the Gradio version this app is
# pinned to before upgrading.
inputs = gr.inputs.Image(type='pil')
outputs = gr.outputs.Textbox(label="Output")

# Text shown on the demo page.
title = "Omnivore"
description = (
    "Gradio demo for Omnivore: A Single Model for Many Visual Modalities. "
    "To use it, simply upload your image, or click one of the examples to load them. "
    "Read more at the links below."
)
article = (
    "<p style='text-align: center'>"
    "<a href='https://arxiv.org/abs/2201.08377' target='_blank'>"
    "Omnivore: A Single Model for Many Visual Modalities</a>"
    " | "
    "<a href='https://github.com/facebookresearch/omnivore' target='_blank'>Github Repo</a>"
    "</p>"
)

# Assemble the interface around `inference`, with library.jpg as the single
# example, then start serving it.
demo = gr.Interface(
    inference,
    inputs,
    outputs,
    title=title,
    description=description,
    article=article,
    examples=[['library.jpg']],
)
demo.launch(enable_queue=True, cache_examples=True)