LLM2CLIP-EVA02-L-14-336 / convert_evaclip_pytorch_to_hf.py

Gengzigang

init

3e16209 7 days ago

8.54 kB

	# coding=utf-8
	# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	# Part of the code was taken from:
	# https://github.com/huggingface/transformers/blob/main/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py

	import argparse

	import torch
	from PIL import Image
	from transformers import AutoModel, AutoConfig
	from transformers import CLIPImageProcessor, pipeline, CLIPTokenizer
	from configuration_evaclip import EvaCLIPConfig
	from modeling_evaclip import EvaCLIPModel


	KEYS_TO_MODIFY_MAPPING = {
	"cls_token":"embeddings.class_embedding",
	"pos_embed":"embeddings.position_embedding.weight",
	"patch_embed.proj":"embeddings.patch_embedding",
	".positional_embedding":".embeddings.position_embedding.weight",
	".token_embedding":".embeddings.token_embedding",
	# "text.text_projection":"text_projection.weight",
	"mlp.c_fc":"mlp.fc1",
	"mlp.c_proj":"mlp.fc2",
	"mlp.w1":"mlp.fc1",
	"mlp.w2":"mlp.fc2",
	"mlp.w3":"mlp.fc3",
	".proj.":".out_proj.",
	# "q_bias":"q_proj.bias",
	# "v_bias":"v_proj.bias",
	"out.":"out_proj.",
	"norm1":"layer_norm1",
	"norm2":"layer_norm2",
	"ln_1":"layer_norm1",
	"ln_2":"layer_norm2",
	".attn":".self_attn",
	"norm.":"post_layernorm.",
	"ln_final":"final_layer_norm",
	"visual.blocks":"vision_model.encoder.layers",
	# "text.transformer.resblocks":"text_model.encoder.layers",
	"visual.head":"visual_projection",
	"visual.":"vision_model.",
	# "text.":"text_model.",

	}

	def rename_state_dict(state_dict):
	model_state_dict = {}

	for key, value in state_dict.items():
	# check if any key needs to be modified
	for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items():
	if key_to_modify in key:
	key = key.replace(key_to_modify, new_key)
	if "text_projection" in key:
	model_state_dict[key] = value.T
	elif "attn.qkv" in key:
	# split qkv into query key and value
	mixed_qkv = value
	qkv_dim = mixed_qkv.size(0) // 3

	query_layer = mixed_qkv[:qkv_dim]
	key_layer = mixed_qkv[qkv_dim : qkv_dim * 2]
	value_layer = mixed_qkv[qkv_dim * 2 :]

	model_state_dict[key.replace("qkv", "q_proj")] = query_layer
	model_state_dict[key.replace("qkv", "k_proj")] = key_layer
	model_state_dict[key.replace("qkv", "v_proj")] = value_layer

	elif "attn.in_proj" in key:
	# split qkv into query key and value
	mixed_qkv = value
	qkv_dim = mixed_qkv.size(0) // 3

	query_layer = mixed_qkv[:qkv_dim]
	key_layer = mixed_qkv[qkv_dim : qkv_dim * 2]
	value_layer = mixed_qkv[qkv_dim * 2 :]

	model_state_dict[key.replace("in_proj_", "q_proj.")] = query_layer
	model_state_dict[key.replace("in_proj_", "k_proj.")] = key_layer
	model_state_dict[key.replace("in_proj_", "v_proj.")] = value_layer

	elif "class_embedding" in key:
	model_state_dict[key] = value[0,0,:]
	elif "vision_model.embeddings.position_embedding" in key:
	model_state_dict[key] = value[0,:,:]

	else:
	model_state_dict[key] = value

	return model_state_dict

	# This requires having a clone of https://github.com/baaivision/EVA/tree/master/EVA-CLIP as well as the right conda env
	# Part of the code is copied from https://github.com/baaivision/EVA/blob/master/EVA-CLIP/README.md "Usage" section
	def getevaclip(checkpoint_path, input_pixels, captions):
	from eva_clip import create_model_and_transforms, get_tokenizer
	model_name = "EVA02-CLIP-bigE-14-plus"
	model, _, _ = create_model_and_transforms(model_name, checkpoint_path, force_custom_clip=True)
	tokenizer = get_tokenizer(model_name)
	text = tokenizer(captions)

	with torch.no_grad():
	text_features = model.encode_text(text)
	image_features = model.encode_image(input_pixels)
	image_features_normed = image_features / image_features.norm(dim=-1, keepdim=True)
	text_features_normed = text_features / text_features.norm(dim=-1, keepdim=True)

	label_probs = (100.0 * image_features_normed @ text_features_normed.T).softmax(dim=-1)

	return label_probs

	def save_model_and_config(pytorch_dump_folder_path, hf_model, transformers_config):
	hf_model.save_pretrained(pytorch_dump_folder_path, safe_serialization=False)
	transformers_config.save_pretrained(pytorch_dump_folder_path)

	def check_loaded_model(pytorch_dump_folder_path, processor, image):
	# hf_config = AutoConfig.from_pretrained(pytorch_dump_folder_path, trust_remote_code=True)
	# hf_model = AutoModel.from_pretrained(pytorch_dump_folder_path, config=hf_config, trust_remote_code=True)
	hf_model = AutoModel.from_pretrained(pytorch_dump_folder_path, trust_remote_code=True)

	processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14-336")
	image_path = 'LLM2CLIP-EVA02-L-14-336/CLIP.png'
	image = Image.open(image_path)
	input_pixels = processor(images=image, return_tensors="pt").pixel_values
	with torch.no_grad():
	image_features = hf_model.get_image_features(input_pixels)
	print(image_features.shape)


	# detector = pipeline(model=hf_model, task="zero-shot-image-classification", tokenizer = tokenizer, image_processor=processor)
	# detector_probs = detector(image, candidate_labels=captions)
	# print(f"text_probs loaded hf_model using pipeline: {detector_probs}")

	def convert_evaclip_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path, image_path, save=False):
	processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14-336")
	image = Image.open(image_path)
	input_pixels = processor( images=image, return_tensors="pt", padding=True).pixel_values

	# This requires having a clone of https://github.com/baaivision/EVA/tree/master/EVA-CLIP as well as the right conda env
	# original_evaclip_probs = getevaclip(checkpoint_path, input_pixels, captions)
	# print(f"original_evaclip label probs: {original_evaclip_probs}")

	transformers_config = EvaCLIPConfig.from_pretrained(config_path)
	hf_model = EvaCLIPModel(transformers_config)
	pt_model_state_dict = torch.load(checkpoint_path)['module']
	state_dict = rename_state_dict(pt_model_state_dict)

	hf_model.load_state_dict(state_dict, strict=False)

	with torch.no_grad():
	image_features = hf_model.get_image_features(input_pixels)
	# text_features = hf_model.get_text_features(input_ids)
	image_features /= image_features.norm(dim=-1, keepdim=True)
	# text_features /= text_features.norm(dim=-1, keepdim=True)

	print(image_features.shape)
	# label_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
	# print(f"hf_model label probs: {label_probs}")

	if save:
	save_model_and_config(pytorch_dump_folder_path, hf_model, transformers_config)

	check_loaded_model(pytorch_dump_folder_path, processor, image)

	# hf_model.push_to_hub("ORGANIZATION_NAME/EVA02_CLIP_E_psz14_plus_s9B")




	if __name__ == "__main__":
	parser = argparse.ArgumentParser()
	parser.add_argument("--pytorch_dump_folder_path", default="LLM2CLIP-EVA02-L-14-336" ,type=str, help="Path to the output PyTorch model.")
	parser.add_argument("--checkpoint_path", default="model_states.pt", type=str, help="Path to checkpoint" )
	parser.add_argument("--config_path", default='LLM2CLIP-EVA02-L-14-336', type=str, help="Path to hf config.json of model to convert")
	parser.add_argument("--image_path", default='LLM2CLIP-EVA02-L-14-336/CLIP.png', type=str, help="Path to image")
	parser.add_argument("--save", default=False, type=str, help="Path to image")

	args = parser.parse_args()

	convert_evaclip_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.image_path, args.save)