# See: https://huggingface.co/timm/eva02_tiny_patch14_336.mim_in22k_ft_in1k
from urllib.request import urlopen
import einops
import numpy as np
import onnxruntime as ort
from PIL import Image
def softmax(x, axis=0):
    """Return the softmax of *x* along *axis*.

    The maximum is subtracted along the same axis that is normalised, so
    every slice is shifted by its own peak. Shifting by the global maximum
    instead lets a slice that sits far below it underflow to all zeros,
    which then divides 0 by 0 and yields NaN.

    Args:
        x: Array-like of logits.
        axis: Axis to normalise over. Defaults to 0, which matches the
            previous behaviour for 1-D and 2-D input.

    Returns:
        A NumPy array with the same shape as *x* whose entries sum to 1
        along *axis*.
    """
    x = np.asarray(x)
    # keepdims=True keeps the reduced axis so the result broadcasts against x.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    y = np.exp(shifted)
    return y / y.sum(axis=axis, keepdims=True)
IMG_URL = 'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png'
IN1K_CLASSES_URL = 'https://storage.googleapis.com/bit_models/ilsvrc2012_wordnet_lemmas.txt'

session = ort.InferenceSession('eva02_tiny_patch14_336.mim_in22k_ft_in1k.ort')
# Alternative: load the .onnx file instead of the .ort one.
# session = ort.InferenceSession('eva02_tiny_patch14_336.mim_in22k_ft_in1k.onnx')

# Use the public metadata accessor rather than the private `_sess` handle.
model_input = session.get_inputs()[0]
# The input is laid out as (batch, channel, height, width) -- see the
# rearrange pattern below -- while PIL's resize() expects (width, height).
height, width = model_input.shape[2:]

# Close the HTTP responses deterministically instead of leaking them.
with urlopen(IN1K_CLASSES_URL) as resp:
    labels = resp.read().decode().splitlines()

with urlopen(IMG_URL) as resp:
    img = np.array(
        Image.open(resp)
        # Force 3 channels: a PNG may carry alpha or a palette, which would
        # not broadcast against the 3-element mean/sd below.
        .convert('RGB')
        .resize((width, height))
    )

# e.g. in1k norm stats
mean = .485, .456, .406
sd = .229, .224, .225
img = (img / 255. - mean) / sd

# to clearly illustrate format ort expects
img = einops.rearrange(img, 'h w c -> 1 c h w').astype(np.float32)

out = session.run(None, {model_input.name: img})
# First output, first (only) batch element -> class probabilities.
out = softmax(out[0][0])

# Indices of the five highest-probability classes, best first.
topk = np.argsort(out)[::-1][:5]
for i in topk:
    print(f'{out[i]:.2f}', labels[i])