mobileclip

how can I convert a .pt model to an .onnx model?

#1
by sulmz - opened

I want to convert it into two ONNX models: an image ONNX model and a text ONNX model. Converting clip-vit-base-patch32 is very simple, just the code below:
from PIL import Image
import torch
import torch.nn as nn
from transformers import CLIPProcessor, CLIPModel
import os, sys

if __name__ == '__main__':
    # Add the top-level package path so the same code runs both on the Mac dev machine and on the Linux server
    top_dir_name = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
    sys.path.append(top_dir_name)
    os.chdir(top_dir_name)
else:
    pass
from config import ASSETS_ROOT_PATH
model_name = 'clip-vit-base-patch32'

model_dir = os.path.join(ASSETS_ROOT_PATH,'vector_model','cdn',model_name)
img_onnx_path = os.path.join(model_dir,model_name+'_img'+'.onnx')
txt_onnx_path = os.path.join(model_dir,model_name+'_txt'+'.onnx')

model = CLIPModel.from_pretrained(model_dir)
processor = CLIPProcessor.from_pretrained(model_dir)

url = os.path.join(ASSETS_ROOT_PATH,'cat.jpg')
image = Image.open(url)

inputs = processor(text=["a photo of a cat "], images=image, return_tensors="pt", padding=True)

print("inputs:", inputs)

outputs = model(**inputs)
logits_per_image = outputs.logits_per_image # this is the image-text similarity score

probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities

class ImgModelWrapper(nn.Module):
    def __init__(self, model):
        super(ImgModelWrapper, self).__init__()
        self.model = model

    def forward(self, pixel_values):
        image_features = self.model.get_image_features(pixel_values=pixel_values)
        return image_features


class TxtModelWrapper(nn.Module):
    def __init__(self, model):
        super(TxtModelWrapper, self).__init__()
        self.model = model

    def forward(self, input_ids, attention_mask):
        text_features = self.model.get_text_features(input_ids=input_ids, attention_mask=attention_mask)
        return text_features

img_model = ImgModelWrapper(model)
txt_model = TxtModelWrapper(model)

torch.onnx.export(img_model,               # model being run
                  (inputs.pixel_values,),  # model input (or a tuple for multiple inputs)
                  img_onnx_path,           # where to save the model (can be a file or file-like object)
                  export_params=True,      # store the trained parameter weights inside the model file
                  opset_version=15,        # the ONNX version to export the model to
                  do_constant_folding=False,  # whether to execute constant folding for optimization
                  input_names=['pixel_values'],  # the model's input names
                  # output_names=['output'],     # the model's output names
                  # dynamic_axes={'pixel_values': {0: 'batch', 2: 'height', 3: 'width'}},
                  )

text_vector = (inputs.input_ids, inputs.attention_mask)

print(txt_model)

torch.onnx.export(txt_model,             # model being run
                  text_vector,           # model input (or a tuple for multiple inputs)
                  txt_onnx_path,         # where to save the model (can be a file or file-like object)
                  export_params=True,    # store the trained parameter weights inside the model file
                  opset_version=15,      # the ONNX version to export the model to
                  do_constant_folding=False,  # whether to execute constant folding for optimization
                  input_names=['input_ids', 'attention_mask'],  # the model's input names
                  # output_names=['output'],                    # the model's output names
                  dynamic_axes={'input_ids': {0: 'batch', 1: 'seq'},
                                'attention_mask': {0: 'batch', 1: 'seq'}},
                  )
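
To sanity-check the exported files, a minimal comparison with ONNX Runtime could look like the snippet below (assuming onnxruntime and numpy are installed; the input names are the same ones passed to torch.onnx.export above):

import numpy as np
import onnxruntime as ort

# Run the exported image model and compare against the PyTorch wrapper
img_session = ort.InferenceSession(img_onnx_path)
onnx_img = img_session.run(None, {'pixel_values': inputs.pixel_values.numpy()})[0]
with torch.no_grad():
    torch_img = img_model(inputs.pixel_values).numpy()
print('image features close:', np.allclose(onnx_img, torch_img, atol=1e-4))

# Run the exported text model and compare against the PyTorch wrapper
txt_session = ort.InferenceSession(txt_onnx_path)
onnx_txt = txt_session.run(None, {'input_ids': inputs.input_ids.numpy(),
                                  'attention_mask': inputs.attention_mask.numpy()})[0]
with torch.no_grad():
    torch_txt = txt_model(inputs.input_ids, inputs.attention_mask).numpy()
print('text features close:', np.allclose(onnx_txt, torch_txt, atol=1e-4))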

But I cannot convert the MobileCLIP series models this way. What a pity, who can help me?
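
For context, the kind of export I am aiming for looks roughly like the untested sketch below, based on the usage example in the apple/ml-mobileclip README (mobileclip.create_model_and_transforms, mobileclip.get_tokenizer, and the encode_image / encode_text methods). The variant name, checkpoint path, image path, and output filenames are placeholders, and I have not gotten this export to go through:

from PIL import Image
import torch
import torch.nn as nn
import mobileclip  # the mobileclip package from the apple/ml-mobileclip repo

variant = 'mobileclip_s0'              # placeholder variant name
ckpt = 'checkpoints/mobileclip_s0.pt'  # placeholder checkpoint path

model, _, preprocess = mobileclip.create_model_and_transforms(variant, pretrained=ckpt)
tokenizer = mobileclip.get_tokenizer(variant)
model.eval()

class MobileClipImgWrapper(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, pixel_values):
        return self.model.encode_image(pixel_values)

class MobileClipTxtWrapper(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, input_ids):
        return self.model.encode_text(input_ids)

image = preprocess(Image.open('cat.jpg').convert('RGB')).unsqueeze(0)  # placeholder image
text = tokenizer(['a photo of a cat'])

torch.onnx.export(MobileClipImgWrapper(model), (image,), 'mobileclip_img.onnx',
                  export_params=True, opset_version=15,
                  input_names=['pixel_values'])

torch.onnx.export(MobileClipTxtWrapper(model), (text,), 'mobileclip_txt.onnx',
                  export_params=True, opset_version=15,
                  input_names=['input_ids'])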
