PyTorch
clip
custom_code
Gengzigang committed on
Commit
dc88ad3
•
1 Parent(s): 3e16209
Files changed (3)
  1. README.md +5 -5
  2. convert_evaclip_pytorch_to_hf.py +0 -193
  3. teaser.png +0 -0
README.md CHANGED
@@ -3,12 +3,12 @@ license: apache-2.0
 ---
 <div align="center">
 
-<h2><a href="https://arxiv.org/abs/*****">LLM2CLIP: Extending the Capability Boundaries of CLIP through Large Language Models</a></h2>
+<h2><a href="">LLM2CLIP: Extending the Capability Boundaries of CLIP through Large Language Models</a></h2>
 Weiquan Huang<sup>1*</sup>, Aoqi Wu<sup>1*</sup>, Yifan Yang<sup>2†</sup>, Xufang Luo<sup>2</sup>, Yuqing Yang<sup>2</sup>, Liang Hu<sup>1</sup>, Qi Dai<sup>2</sup>, Xiyang Dai<sup>2</sup>, Dongdong Chen<sup>2</sup>, Chong Luo<sup>2</sup>, Lili Qiu<sup>2</sup>
 
 <sup>1</sup>Tongji University, <sup>2</sup>Microsoft Corporation <br><sup>*</sup>Equal contribution <br><sup>†</sup> Corresponding to: yifanyang@microsoft.com
 
-<p><a rel="nofollow" href="">[📂 GitHub]</a> <a rel="nofollow" href="">[🆕 Blog]</a> <a rel="nofollow" href="https://arxiv.org/abs/2312.14238">[📜 LLM2CLIP]</a>
+<p><a rel="nofollow" href="https://github.com/microsoft/LLM2CLIP">[📂 GitHub]</a> <a rel="nofollow" href="https://microsoft.github.io/LLM2CLIP/">[🆕 Blog]</a> <a rel="nofollow" href="">[📜 LLM2CLIP]</a>
 </div>
 
 
@@ -17,7 +17,7 @@ In this paper, we propose LLM2CLIP, a novel approach that embraces the power of
 ## LLM2CLIP performance
 
 <div align="center">
-<img src="teaser.png" alt="summary_tab" width="75%">
+<img src="teaser.png" alt="summary_tab" width="85%">
 </div>
 **It's important to note that all results presented in the paper are evaluated using PyTorch weights. There may be differences in performance when using Hugging Face (hf) models.**
 
@@ -39,7 +39,7 @@ image_path = "CLIP.png"
 model_name_or_path = "LLM2CLIP-EVA02-L-14-336" # or /path/to/local/LLM2CLIP-EVA02-L-14-336
 image_size = 336
 
-processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14-336")
+processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14")
 model = AutoModel.from_pretrained(
     model_name_or_path,
     torch_dtype=torch.float16,
@@ -52,4 +52,4 @@ with torch.no_grad(), torch.cuda.amp.autocast():
     outputs = model.get_image_features(input_pixels)
 ```
 
-## BibTeX & Citation
+## BibTeX & Citation
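
For reference, a minimal sketch of how the changed processor line fits into the README's usage snippet after this commit. The CUDA device placement, `trust_remote_code=True`, and the `.eval()` call are assumptions carried over from the custom modeling code in this repo, not shown in the diff hunks above:

```python
import torch
from PIL import Image
from transformers import AutoModel, CLIPImageProcessor

image_path = "CLIP.png"  # placeholder image from the README
model_name_or_path = "LLM2CLIP-EVA02-L-14-336"  # or /path/to/local/LLM2CLIP-EVA02-L-14-336

# This commit points the image processor at the base patch14 checkpoint
# instead of the -336 variant used before.
processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14")
model = AutoModel.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.float16,
    trust_remote_code=True,  # assumed: the repo ships custom EvaCLIP modeling code
).to("cuda").eval()

image = Image.open(image_path)
input_pixels = processor(images=image, return_tensors="pt").pixel_values.to("cuda")

with torch.no_grad(), torch.cuda.amp.autocast():
    outputs = model.get_image_features(input_pixels)
print(outputs.shape)
```
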
convert_evaclip_pytorch_to_hf.py DELETED
@@ -1,193 +0,0 @@
-# coding=utf-8
-# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Part of the code was taken from:
-# https://github.com/huggingface/transformers/blob/main/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py
-
-import argparse
-
-import torch
-from PIL import Image
-from transformers import AutoModel, AutoConfig
-from transformers import CLIPImageProcessor, pipeline, CLIPTokenizer
-from configuration_evaclip import EvaCLIPConfig
-from modeling_evaclip import EvaCLIPModel
-
-
-KEYS_TO_MODIFY_MAPPING = {
-    "cls_token": "embeddings.class_embedding",
-    "pos_embed": "embeddings.position_embedding.weight",
-    "patch_embed.proj": "embeddings.patch_embedding",
-    ".positional_embedding": ".embeddings.position_embedding.weight",
-    ".token_embedding": ".embeddings.token_embedding",
-    # "text.text_projection": "text_projection.weight",
-    "mlp.c_fc": "mlp.fc1",
-    "mlp.c_proj": "mlp.fc2",
-    "mlp.w1": "mlp.fc1",
-    "mlp.w2": "mlp.fc2",
-    "mlp.w3": "mlp.fc3",
-    ".proj.": ".out_proj.",
-    # "q_bias": "q_proj.bias",
-    # "v_bias": "v_proj.bias",
-    "out.": "out_proj.",
-    "norm1": "layer_norm1",
-    "norm2": "layer_norm2",
-    "ln_1": "layer_norm1",
-    "ln_2": "layer_norm2",
-    ".attn": ".self_attn",
-    "norm.": "post_layernorm.",
-    "ln_final": "final_layer_norm",
-    "visual.blocks": "vision_model.encoder.layers",
-    # "text.transformer.resblocks": "text_model.encoder.layers",
-    "visual.head": "visual_projection",
-    "visual.": "vision_model.",
-    # "text.": "text_model.",
-}
-
-
-def rename_state_dict(state_dict):
-    model_state_dict = {}
-
-    for key, value in state_dict.items():
-        # check if any key needs to be modified
-        for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items():
-            if key_to_modify in key:
-                key = key.replace(key_to_modify, new_key)
-        if "text_projection" in key:
-            model_state_dict[key] = value.T
-        elif "attn.qkv" in key:
-            # split qkv into query, key and value
-            mixed_qkv = value
-            qkv_dim = mixed_qkv.size(0) // 3
-
-            query_layer = mixed_qkv[:qkv_dim]
-            key_layer = mixed_qkv[qkv_dim : qkv_dim * 2]
-            value_layer = mixed_qkv[qkv_dim * 2 :]
-
-            model_state_dict[key.replace("qkv", "q_proj")] = query_layer
-            model_state_dict[key.replace("qkv", "k_proj")] = key_layer
-            model_state_dict[key.replace("qkv", "v_proj")] = value_layer
-
-        elif "attn.in_proj" in key:
-            # split qkv into query, key and value
-            mixed_qkv = value
-            qkv_dim = mixed_qkv.size(0) // 3
-
-            query_layer = mixed_qkv[:qkv_dim]
-            key_layer = mixed_qkv[qkv_dim : qkv_dim * 2]
-            value_layer = mixed_qkv[qkv_dim * 2 :]
-
-            model_state_dict[key.replace("in_proj_", "q_proj.")] = query_layer
-            model_state_dict[key.replace("in_proj_", "k_proj.")] = key_layer
-            model_state_dict[key.replace("in_proj_", "v_proj.")] = value_layer
-
-        elif "class_embedding" in key:
-            model_state_dict[key] = value[0, 0, :]
-        elif "vision_model.embeddings.position_embedding" in key:
-            model_state_dict[key] = value[0, :, :]
-        else:
-            model_state_dict[key] = value
-
-    return model_state_dict
-
-
-# This requires having a clone of https://github.com/baaivision/EVA/tree/master/EVA-CLIP as well as the right conda env
-# Part of the code is copied from https://github.com/baaivision/EVA/blob/master/EVA-CLIP/README.md "Usage" section
-def getevaclip(checkpoint_path, input_pixels, captions):
-    from eva_clip import create_model_and_transforms, get_tokenizer
-
-    model_name = "EVA02-CLIP-bigE-14-plus"
-    model, _, _ = create_model_and_transforms(model_name, checkpoint_path, force_custom_clip=True)
-    tokenizer = get_tokenizer(model_name)
-    text = tokenizer(captions)
-
-    with torch.no_grad():
-        text_features = model.encode_text(text)
-        image_features = model.encode_image(input_pixels)
-        image_features_normed = image_features / image_features.norm(dim=-1, keepdim=True)
-        text_features_normed = text_features / text_features.norm(dim=-1, keepdim=True)
-
-        label_probs = (100.0 * image_features_normed @ text_features_normed.T).softmax(dim=-1)
-
-    return label_probs
-
-
-def save_model_and_config(pytorch_dump_folder_path, hf_model, transformers_config):
-    hf_model.save_pretrained(pytorch_dump_folder_path, safe_serialization=False)
-    transformers_config.save_pretrained(pytorch_dump_folder_path)
-
-
-def check_loaded_model(pytorch_dump_folder_path, processor, image):
-    # hf_config = AutoConfig.from_pretrained(pytorch_dump_folder_path, trust_remote_code=True)
-    # hf_model = AutoModel.from_pretrained(pytorch_dump_folder_path, config=hf_config, trust_remote_code=True)
-    hf_model = AutoModel.from_pretrained(pytorch_dump_folder_path, trust_remote_code=True)
-
-    processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14-336")
-    image_path = 'LLM2CLIP-EVA02-L-14-336/CLIP.png'
-    image = Image.open(image_path)
-    input_pixels = processor(images=image, return_tensors="pt").pixel_values
-    with torch.no_grad():
-        image_features = hf_model.get_image_features(input_pixels)
-    print(image_features.shape)
-
-    # detector = pipeline(model=hf_model, task="zero-shot-image-classification", tokenizer=tokenizer, image_processor=processor)
-    # detector_probs = detector(image, candidate_labels=captions)
-    # print(f"text_probs loaded hf_model using pipeline: {detector_probs}")
-
-
-def convert_evaclip_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path, image_path, save=False):
-    processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14-336")
-    image = Image.open(image_path)
-    input_pixels = processor(images=image, return_tensors="pt", padding=True).pixel_values
-
-    # This requires having a clone of https://github.com/baaivision/EVA/tree/master/EVA-CLIP as well as the right conda env
-    # original_evaclip_probs = getevaclip(checkpoint_path, input_pixels, captions)
-    # print(f"original_evaclip label probs: {original_evaclip_probs}")
-
-    transformers_config = EvaCLIPConfig.from_pretrained(config_path)
-    hf_model = EvaCLIPModel(transformers_config)
-    pt_model_state_dict = torch.load(checkpoint_path)['module']
-    state_dict = rename_state_dict(pt_model_state_dict)
-
-    hf_model.load_state_dict(state_dict, strict=False)
-
-    with torch.no_grad():
-        image_features = hf_model.get_image_features(input_pixels)
-        # text_features = hf_model.get_text_features(input_ids)
-        image_features /= image_features.norm(dim=-1, keepdim=True)
-        # text_features /= text_features.norm(dim=-1, keepdim=True)
-
-    print(image_features.shape)
-    # label_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
-    # print(f"hf_model label probs: {label_probs}")
-
-    if save:
-        save_model_and_config(pytorch_dump_folder_path, hf_model, transformers_config)
-
-    check_loaded_model(pytorch_dump_folder_path, processor, image)
-
-    # hf_model.push_to_hub("ORGANIZATION_NAME/EVA02_CLIP_E_psz14_plus_s9B")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--pytorch_dump_folder_path", default="LLM2CLIP-EVA02-L-14-336", type=str, help="Path to the output PyTorch model.")
-    parser.add_argument("--checkpoint_path", default="model_states.pt", type=str, help="Path to checkpoint")
-    parser.add_argument("--config_path", default='LLM2CLIP-EVA02-L-14-336', type=str, help="Path to hf config.json of model to convert")
-    parser.add_argument("--image_path", default='LLM2CLIP-EVA02-L-14-336/CLIP.png', type=str, help="Path to image")
-    parser.add_argument("--save", default=False, type=str, help="Path to image")
-
-    args = parser.parse_args()
-
-    convert_evaclip_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.image_path, args.save)
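
The deleted script's `rename_state_dict` splits each fused `attn.qkv` tensor into separate query/key/value projections before loading the weights into the Hugging Face model. A minimal standalone sketch of that splitting step, using a hypothetical checkpoint key and toy dimensions (not taken from the real checkpoint):

```python
import torch

# Toy fused qkv weight: rows are [q; k; v] stacked along dim 0,
# mirroring how the EVA-CLIP checkpoint stores attention projections.
hidden_dim = 8  # illustrative only
fused_key = "visual.blocks.0.attn.qkv.weight"  # hypothetical key name
fused_qkv = torch.randn(3 * hidden_dim, hidden_dim)

qkv_dim = fused_qkv.size(0) // 3
split_weights = {
    fused_key.replace("qkv", "q_proj"): fused_qkv[:qkv_dim],
    fused_key.replace("qkv", "k_proj"): fused_qkv[qkv_dim : qkv_dim * 2],
    fused_key.replace("qkv", "v_proj"): fused_qkv[qkv_dim * 2 :],
}
for name, weight in split_weights.items():
    print(name, tuple(weight.shape))  # each projection is (hidden_dim, hidden_dim)
```
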
teaser.png CHANGED