# Based on https://github.com/openai/CLIP/blob/main/clip/model.py
import onnxruntime
import numpy as np
from typing import List, Union
from PIL import Image, ImageOps
from clip.simple_tokenizer import SimpleTokenizer


def onnx_node_type_np_type(onnx_type):
    """Map an ONNX element type string to the corresponding numpy dtype."""
    if onnx_type == "tensor(float)":
        return np.float32
    if onnx_type == "tensor(float16)":
        return np.float16
    if onnx_type == "tensor(int32)":
        return np.int32
    if onnx_type == "tensor(int64)":
        return np.int64
    raise NotImplementedError(f"Unsupported onnx type: {onnx_type}")


def ensure_input_type(array, onnx_type):
    """Cast `array` to the dtype expected by the ONNX input, if necessary."""
    np_type = onnx_node_type_np_type(onnx_type)
    if array.dtype == np_type:
        return array
    return array.astype(dtype=np_type)


class VisualModel:
    def __init__(self, path, providers=None):
        self.path = path
        print(f"Loading visual model: {path}")
        self.sess = onnxruntime.InferenceSession(path, providers=providers)
        self.input = self.sess.get_inputs()[0]
        self.output = self.sess.get_outputs()[0]
        # Expect an NCHW input with a square spatial size.
        if len(self.input.shape) != 4 or self.input.shape[2] != self.input.shape[3]:
            raise ValueError(f"unexpected shape {self.input.shape}")
        self.input_size = self.input.shape[2]
        print(f"Visual inference ready, input size {self.input_size}, type {self.input.type}")

    def encode(self, image_input):
        image_input = ensure_input_type(image_input, self.input.type)
        return self.sess.run([self.output.name], {self.input.name: image_input})[0]

    def fitted(self, size, w, h):
        # Scale so the shorter side equals `size`, preserving the aspect ratio.
        short, long = (w, h) if w <= h else (h, w)
        new_short, new_long = size, int(size * long / short)
        new_w, new_h = (new_short, new_long) if w <= h else (new_long, new_short)
        return (new_w, new_h)

    def resize_to(self, img, size):
        new_size = self.fitted(size, img.width, img.height)
        return img.resize(size=new_size, resample=Image.Resampling.BICUBIC)

    def center_crop(self, img, size):
        image_height = img.height
        image_width = img.width
        if size > image_width or size > image_height:
            # Pad (left, top, right, bottom) with black so the crop window fits.
            padding_ltrb = (
                (size - image_width) // 2 if size > image_width else 0,
                (size - image_height) // 2 if size > image_height else 0,
                (size - image_width + 1) // 2 if size > image_width else 0,
                (size - image_height + 1) // 2 if size > image_height else 0,
            )
            img = ImageOps.expand(img, border=padding_ltrb, fill=0)
            image_width = img.width
            image_height = img.height
        if size == image_width and size == image_height:
            return img
        top = int(round((image_height - size) / 2.0))
        left = int(round((image_width - size) / 2.0))
        return img.crop((left, top, left + size, top + size))

    def to_numpy(self, pic):
        # Convert a PIL image to a CHW float32 array scaled to [0, 1].
        mode_to_nptype = {"I": np.int32, "I;16": np.int16, "F": np.float32}
        img = np.array(pic, mode_to_nptype.get(pic.mode, np.uint8), copy=True)
        if pic.mode == "1":
            img = 255 * img
        img = np.transpose(img, (2, 0, 1))
        img = img.astype(np.float32)
        img = np.divide(img, 255)
        return img

    def normalize(self, img):
        # CLIP's published per-channel RGB mean and std.
        mean = np.array([0.48145466, 0.4578275, 0.40821073]).reshape((-1, 1, 1))
        std = np.array([0.26862954, 0.26130258, 0.27577711]).reshape((-1, 1, 1))
        return np.divide(np.subtract(img, mean), std)

    def preprocess(self, img):
        img = self.resize_to(img, self.input_size)
        img = self.center_crop(img, self.input_size)
        img = img.convert("RGB")
        img_np = self.to_numpy(img)
        img_np = self.normalize(img_np)
        return img_np

    def preprocess_images(self, images):
        # Accepts PIL images or file paths; returns a batched NCHW array.
        preprocessed = []
        for img in images:
            if isinstance(img, str):
                img = Image.open(img)
            preprocessed.append(self.preprocess(img))
        return np.stack(preprocessed)


class TextualModel:
    def __init__(self, path, providers=None):
        self.path = path
        print(f"Loading textual model: {path}")
        self.sess = onnxruntime.InferenceSession(path, providers=providers)
        self.input = self.sess.get_inputs()[0]
        self.output = self.sess.get_outputs()[0]
        self.tokenizer = SimpleTokenizer()
        # Expect [batch, context_length] token ids; CLIP uses a context length of 77.
        if len(self.input.shape) != 2 or self.input.shape[1] != 77:
            raise ValueError(f"unexpected shape {self.input.shape}")
        self.input_size = self.input.shape[1]
        print(f"Textual inference ready, input size {self.input_size}, type {self.input.type}")

    def encode(self, texts):
        return self.sess.run([self.output.name], {self.input.name: texts})[0]

    def tokenize(self, texts: Union[str, List[str]], context_length: int = 77, truncate: bool = False) -> np.ndarray:
        """
        Returns the tokenized representation of the given input string(s).

        Parameters
        ----------
        texts : Union[str, List[str]]
            An input string or a list of input strings to tokenize

        context_length : int
            The context length to use; all CLIP models use 77 as the context length

        truncate : bool
            Whether to truncate the text if its encoding is longer than the context length

        Returns
        -------
        A two-dimensional numpy array containing the resulting tokens,
        shape = [number of input strings, context_length], with the dtype
        expected by the textual model's input.
        """
        if isinstance(texts, str):
            texts = [texts]

        sot_token = self.tokenizer.encoder["<|startoftext|>"]
        eot_token = self.tokenizer.encoder["<|endoftext|>"]
        all_tokens = [[sot_token] + self.tokenizer.encode(text) + [eot_token] for text in texts]
        input_type = onnx_node_type_np_type(self.input.type)
        result = np.zeros(shape=(len(all_tokens), context_length), dtype=input_type)

        for i, tokens in enumerate(all_tokens):
            if len(tokens) > context_length:
                if truncate:
                    tokens = tokens[:context_length]
                    tokens[-1] = eot_token
                else:
                    raise RuntimeError(f"Input {texts[i]} is too long for context length {context_length}")
            result[i, : len(tokens)] = np.array(tokens)

        return result
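

# Usage sketch (illustrative, not part of the module above): the ONNX paths and
# the image file below are assumptions; point them at your exported CLIP visual
# and textual models. Features are L2-normalized so that the matrix product
# yields cosine similarities between images and texts.
if __name__ == "__main__":
    visual = VisualModel("clip_visual.onnx")      # assumed export of the image tower
    textual = TextualModel("clip_textual.onnx")   # assumed export of the text tower

    image_features = visual.encode(visual.preprocess_images(["example.jpg"]))
    text_features = textual.encode(textual.tokenize(["a photo of a cat", "a photo of a dog"]))

    image_features /= np.linalg.norm(image_features, axis=-1, keepdims=True)
    text_features /= np.linalg.norm(text_features, axis=-1, keepdims=True)

    # similarity[i, j]: cosine similarity between image i and text j.
    similarity = image_features @ text_features.T
    print(similarity)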