|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import argparse |
|
|
|
import torch |
|
from PIL import Image |
|
from transformers import AutoModel, AutoConfig |
|
from transformers import CLIPImageProcessor, pipeline, CLIPTokenizer |
|
from configuration_evaclip import EvaCLIPConfig |
|
from modeling_evaclip import EvaCLIPModel |
|
|
|
|
|
KEYS_TO_MODIFY_MAPPING = { |
|
"cls_token":"embeddings.class_embedding", |
|
"pos_embed":"embeddings.position_embedding.weight", |
|
"patch_embed.proj":"embeddings.patch_embedding", |
|
".positional_embedding":".embeddings.position_embedding.weight", |
|
".token_embedding":".embeddings.token_embedding", |
|
|
|
"mlp.c_fc":"mlp.fc1", |
|
"mlp.c_proj":"mlp.fc2", |
|
"mlp.w1":"mlp.fc1", |
|
"mlp.w2":"mlp.fc2", |
|
"mlp.w3":"mlp.fc3", |
|
".proj.":".out_proj.", |
|
|
|
|
|
"out.":"out_proj.", |
|
"norm1":"layer_norm1", |
|
"norm2":"layer_norm2", |
|
"ln_1":"layer_norm1", |
|
"ln_2":"layer_norm2", |
|
".attn":".self_attn", |
|
"norm.":"post_layernorm.", |
|
"ln_final":"final_layer_norm", |
|
"visual.blocks":"vision_model.encoder.layers", |
|
|
|
"visual.head":"visual_projection", |
|
"visual.":"vision_model.", |
|
|
|
|
|
} |
|
|
|
def rename_state_dict(state_dict): |
|
model_state_dict = {} |
|
|
|
for key, value in state_dict.items(): |
|
|
|
for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): |
|
if key_to_modify in key: |
|
key = key.replace(key_to_modify, new_key) |
|
if "text_projection" in key: |
|
model_state_dict[key] = value.T |
|
elif "attn.qkv" in key: |
|
|
|
mixed_qkv = value |
|
qkv_dim = mixed_qkv.size(0) // 3 |
|
|
|
query_layer = mixed_qkv[:qkv_dim] |
|
key_layer = mixed_qkv[qkv_dim : qkv_dim * 2] |
|
value_layer = mixed_qkv[qkv_dim * 2 :] |
|
|
|
model_state_dict[key.replace("qkv", "q_proj")] = query_layer |
|
model_state_dict[key.replace("qkv", "k_proj")] = key_layer |
|
model_state_dict[key.replace("qkv", "v_proj")] = value_layer |
|
|
|
elif "attn.in_proj" in key: |
|
|
|
mixed_qkv = value |
|
qkv_dim = mixed_qkv.size(0) // 3 |
|
|
|
query_layer = mixed_qkv[:qkv_dim] |
|
key_layer = mixed_qkv[qkv_dim : qkv_dim * 2] |
|
value_layer = mixed_qkv[qkv_dim * 2 :] |
|
|
|
model_state_dict[key.replace("in_proj_", "q_proj.")] = query_layer |
|
model_state_dict[key.replace("in_proj_", "k_proj.")] = key_layer |
|
model_state_dict[key.replace("in_proj_", "v_proj.")] = value_layer |
|
|
|
elif "class_embedding" in key: |
|
model_state_dict[key] = value[0,0,:] |
|
elif "vision_model.embeddings.position_embedding" in key: |
|
model_state_dict[key] = value[0,:,:] |
|
|
|
else: |
|
model_state_dict[key] = value |
|
|
|
return model_state_dict |
|
|
|
|
|
|
|
def getevaclip(checkpoint_path, input_pixels, captions): |
|
from eva_clip import create_model_and_transforms, get_tokenizer |
|
model_name = "EVA02-CLIP-bigE-14-plus" |
|
model, _, _ = create_model_and_transforms(model_name, checkpoint_path, force_custom_clip=True) |
|
tokenizer = get_tokenizer(model_name) |
|
text = tokenizer(captions) |
|
|
|
with torch.no_grad(): |
|
text_features = model.encode_text(text) |
|
image_features = model.encode_image(input_pixels) |
|
image_features_normed = image_features / image_features.norm(dim=-1, keepdim=True) |
|
text_features_normed = text_features / text_features.norm(dim=-1, keepdim=True) |
|
|
|
label_probs = (100.0 * image_features_normed @ text_features_normed.T).softmax(dim=-1) |
|
|
|
return label_probs |
|
|
|
def save_model_and_config(pytorch_dump_folder_path, hf_model, transformers_config): |
|
hf_model.save_pretrained(pytorch_dump_folder_path, safe_serialization=False) |
|
transformers_config.save_pretrained(pytorch_dump_folder_path) |
|
|
|
def check_loaded_model(pytorch_dump_folder_path, processor, image): |
|
|
|
|
|
hf_model = AutoModel.from_pretrained(pytorch_dump_folder_path, trust_remote_code=True) |
|
|
|
processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14-336") |
|
image_path = 'LLM2CLIP-EVA02-L-14-336/CLIP.png' |
|
image = Image.open(image_path) |
|
input_pixels = processor(images=image, return_tensors="pt").pixel_values |
|
with torch.no_grad(): |
|
image_features = hf_model.get_image_features(input_pixels) |
|
print(image_features.shape) |
|
|
|
|
|
|
|
|
|
|
|
|
|
def convert_evaclip_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path, image_path, save=False): |
|
processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14-336") |
|
image = Image.open(image_path) |
|
input_pixels = processor( images=image, return_tensors="pt", padding=True).pixel_values |
|
|
|
|
|
|
|
|
|
|
|
transformers_config = EvaCLIPConfig.from_pretrained(config_path) |
|
hf_model = EvaCLIPModel(transformers_config) |
|
pt_model_state_dict = torch.load(checkpoint_path)['module'] |
|
state_dict = rename_state_dict(pt_model_state_dict) |
|
|
|
hf_model.load_state_dict(state_dict, strict=False) |
|
|
|
with torch.no_grad(): |
|
image_features = hf_model.get_image_features(input_pixels) |
|
|
|
image_features /= image_features.norm(dim=-1, keepdim=True) |
|
|
|
|
|
print(image_features.shape) |
|
|
|
|
|
|
|
if save: |
|
save_model_and_config(pytorch_dump_folder_path, hf_model, transformers_config) |
|
|
|
check_loaded_model(pytorch_dump_folder_path, processor, image) |
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
parser = argparse.ArgumentParser() |
|
parser.add_argument("--pytorch_dump_folder_path", default="LLM2CLIP-EVA02-L-14-336" ,type=str, help="Path to the output PyTorch model.") |
|
parser.add_argument("--checkpoint_path", default="model_states.pt", type=str, help="Path to checkpoint" ) |
|
parser.add_argument("--config_path", default='LLM2CLIP-EVA02-L-14-336', type=str, help="Path to hf config.json of model to convert") |
|
parser.add_argument("--image_path", default='LLM2CLIP-EVA02-L-14-336/CLIP.png', type=str, help="Path to image") |
|
parser.add_argument("--save", default=False, type=str, help="Path to image") |
|
|
|
args = parser.parse_args() |
|
|
|
convert_evaclip_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.image_path, args.save) |
|
|