[Init] upload model

Browse files

Files changed (6) hide show

README.md +69 -0
config.json +37 -0
model.safetensors +3 -0
modeling_config.py +21 -0
modeling_videomaev2.py +535 -0
preprocessor_config.json +18 -0

README.md ADDED Viewed

	@@ -0,0 +1,69 @@

+---
+license: cc-by-nc-4.0
+tags:
+- vision
+- video-classification
+pipeline_tag: video-classification
+---
+# VideoMAE-v2 (giant-sized model, Pretrained on UnlabeledHybrid-1M)
+VideoMAEv2-giant model pre-trained for 800 epochs in a self-supervised way on UnlabeldHybrid-1M dataset. It was introduced in the paper [[CVPR23]VideoMAE V2: Scaling Video Masked Autoencoders with Dual Masking](https://arxiv.org/abs/2203.12602) by Wang et al. and first released in [GitHub](https://github.com/OpenGVLab/VideoMAEv2).
+## Intended uses & limitations
+You can use the raw model for video feature extraction.
+### How to use
+Here is how to use this model to extract a video feature:
+```python
+from transformers import VideoMAEImageProcessor, AutoModel, AutoConfig
+import numpy as np
+import torch
+config = AutoConfig.from_pretrained("OpenGVLab/VideoMAEv2-giant", trust_remote_code=True)
+processor = VideoMAEImageProcessor.from_pretrained("OpenGVLab/VideoMAEv2-giant")
+model = AutoModel.from_pretrained('OpenGVLab/VideoMAEv2-giant', config=config, trust_remote_code=True)
+video = list(np.random.rand(16, 3, 224, 224))
+# B, T, C, H, W -> B, C, T, H, W
+inputs = processor(video, return_tensors="pt")
+inputs['pixel_values'] = inputs['pixel_values'].permute(0, 2, 1, 3, 4)
+with torch.no_grad():
+  outputs = model(**inputs)
+```
+### BibTeX entry and citation info
+```bibtex
+@InProceedings{wang2023videomaev2,
+    author    = {Wang, Limin and Huang, Bingkun and Zhao, Zhiyu and Tong, Zhan and He, Yinan and Wang, Yi and Wang, Yali and Qiao, Yu},
+    title     = {VideoMAE V2: Scaling Video Masked Autoencoders With Dual Masking},
+    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
+    month     = {June},
+    year      = {2023},
+    pages     = {14549-14560}
+}
+@misc{videomaev2,
+      title={VideoMAE V2: Scaling Video Masked Autoencoders with Dual Masking},
+      author={Limin Wang and Bingkun Huang and Zhiyu Zhao and Zhan Tong and Yinan He and Yi Wang and Yali Wang and Yu Qiao},
+      year={2023},
+      eprint={2303.16727},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV}
+}
+```

config.json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+    "_name_or_path": "./",
+    "model_type": "VideoMAEv2_Base",
+    "architectures": [
+      "VideoMAEv2_Base"
+    ],
+    "auto_map": {
+        "AutoModel": "modeling_videomaev2.VideoMAEv2",
+        "AutoConfig": "modeling_config.VideoMAEv2Config"
+    },
+    "model_config":{
+      "img_size": 224,
+      "patch_size": 14,
+      "in_chans": 3,
+      "num_classes": 0,
+      "embed_dim": 1408,
+      "depth": 40,
+      "num_heads": 16,
+      "mlp_ratio": 4.363636363636363,
+      "qkv_bias": true,
+      "qk_scale": null,
+      "drop_rate": 0.0,
+      "attn_drop_rate": 0.0,
+      "drop_path_rate": 0.0,
+      "norm_layer": "nn.LayerNorm",
+      "layer_norm_eps": 1e-6,
+      "init_values": 0.0,
+      "use_learnable_pos_emb": false,
+      "tubelet_size": 2,
+      "use_mean_pooling": false,
+      "with_cp": false,
+      "num_frames": 16,
+      "cos_attn": false
+    },
+    "transformers_version": "4.38.0",
+    "use_cache": true
+  }

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a231169cd13307aeb29bc56259a494ef273bfd9b7232f881ac4e8d7e39713add
+size 4105286120

modeling_config.py ADDED Viewed

	@@ -0,0 +1,21 @@

+import copy
+import re, ast
+from transformers import AutoConfig, LlamaConfig
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+from easydict import EasyDict as MyEasyDict
+from importlib import import_module
+import os.path as osp
+import argparse
+import json
+from copy import deepcopy
+import sys
+class VideoMAEv2Config(PretrainedConfig):
+    model_type = 'VideoMAEv2_Base'
+    def __init__(
+            self,
+            **kwargs):
+        super().__init__(**kwargs)

modeling_videomaev2.py ADDED Viewed

	@@ -0,0 +1,535 @@

+# --------------------------------------------------------
+# Based on BEiT, timm, DINO and DeiT code bases
+# https://github.com/microsoft/unilm/tree/master/beit
+# https://github.com/rwightman/pytorch-image-models/tree/master/timm
+# https://github.com/facebookresearch/deit
+# https://github.com/facebookresearch/dino
+# --------------------------------------------------------'
+from functools import partial
+import logging
+logger = logging.getLogger(__name__)
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as cp
+from transformers import AutoConfig, PreTrainedModel
+from timm.layers import drop_path, to_2tuple, trunc_normal_
+from .modeling_config import VideoMAEv2Config
+def _cfg(url='', **kwargs):
+    return {
+        'url': url,
+        'num_classes': 400,
+        'input_size': (3, 224, 224),
+        'pool_size': None,
+        'crop_pct': .9,
+        'interpolation': 'bicubic',
+        'mean': (0.5, 0.5, 0.5),
+        'std': (0.5, 0.5, 0.5),
+        **kwargs
+    }
+class DropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks).
+    """
+    def __init__(self, drop_prob=None):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob
+    def forward(self, x):
+        return drop_path(x, self.drop_prob, self.training)
+    def extra_repr(self) -> str:
+        return 'p={}'.format(self.drop_prob)
+class Mlp(nn.Module):
+    def __init__(self,
+                 in_features,
+                 hidden_features=None,
+                 out_features=None,
+                 act_layer=nn.GELU,
+                 drop=0.):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        # x = self.drop(x)
+        # commit this for the orignal BERT implement
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+class CosAttention(nn.Module):
+    def __init__(self,
+                 dim,
+                 num_heads=8,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 attn_drop=0.,
+                 proj_drop=0.,
+                 attn_head_dim=None):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        if attn_head_dim is not None:
+            head_dim = attn_head_dim
+        all_head_dim = head_dim * self.num_heads
+        # self.scale = qk_scale or head_dim**-0.5
+        # DO NOT RENAME [self.scale] (for no weight decay)
+        if qk_scale is None:
+            self.scale = nn.Parameter(
+                torch.log(10 * torch.ones((num_heads, 1, 1))),
+                requires_grad=True)
+        else:
+            self.scale = qk_scale
+        self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False)
+        if qkv_bias:
+            self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
+            self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
+        else:
+            self.q_bias = None
+            self.v_bias = None
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(all_head_dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+    def forward(self, x):
+        B, N, C = x.shape
+        qkv_bias = None
+        if self.q_bias is not None:
+            qkv_bias = torch.cat(
+                (self.q_bias,
+                 torch.zeros_like(self.v_bias,
+                                  requires_grad=False), self.v_bias))
+        qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
+        qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv[0], qkv[1], qkv[
+            2]  # make torchscript happy (cannot use tensor as tuple)
+        attn = (
+            F.normalize(q, dim=-1) @ F.normalize(k, dim=-1).transpose(-2, -1))
+        # torch.log(torch.tensor(1. / 0.01)) = 4.6052
+        logit_scale = torch.clamp(self.scale, max=4.6052).exp()
+        attn = attn * logit_scale
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+        x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+class Attention(nn.Module):
+    def __init__(self,
+                 dim,
+                 num_heads=8,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 attn_drop=0.,
+                 proj_drop=0.,
+                 attn_head_dim=None):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        if attn_head_dim is not None:
+            head_dim = attn_head_dim
+        all_head_dim = head_dim * self.num_heads
+        self.scale = qk_scale or head_dim**-0.5
+        self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False)
+        if qkv_bias:
+            self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
+            self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
+        else:
+            self.q_bias = None
+            self.v_bias = None
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(all_head_dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+    def forward(self, x):
+        B, N, C = x.shape
+        qkv_bias = None
+        if self.q_bias is not None:
+            qkv_bias = torch.cat(
+                (self.q_bias,
+                 torch.zeros_like(self.v_bias,
+                                  requires_grad=False), self.v_bias))
+        qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
+        qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv[0], qkv[1], qkv[
+            2]  # make torchscript happy (cannot use tensor as tuple)
+        q = q * self.scale
+        attn = (q @ k.transpose(-2, -1))
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+        x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+class Block(nn.Module):
+    def __init__(self,
+                 dim,
+                 num_heads,
+                 mlp_ratio=4.,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 drop=0.,
+                 attn_drop=0.,
+                 drop_path=0.,
+                 init_values=None,
+                 act_layer=nn.GELU,
+                 norm_layer=nn.LayerNorm,
+                 attn_head_dim=None,
+                 cos_attn=False):
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        if cos_attn:
+            self.attn = CosAttention(
+                dim,
+                num_heads=num_heads,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                attn_drop=attn_drop,
+                proj_drop=drop,
+                attn_head_dim=attn_head_dim)
+        else:
+            self.attn = Attention(
+                dim,
+                num_heads=num_heads,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                attn_drop=attn_drop,
+                proj_drop=drop,
+                attn_head_dim=attn_head_dim)
+        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
+        self.drop_path = DropPath(
+            drop_path) if drop_path > 0. else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(
+            in_features=dim,
+            hidden_features=mlp_hidden_dim,
+            act_layer=act_layer,
+            drop=drop)
+        if init_values > 0:
+            self.gamma_1 = nn.Parameter(
+                init_values * torch.ones((dim)), requires_grad=True)
+            self.gamma_2 = nn.Parameter(
+                init_values * torch.ones((dim)), requires_grad=True)
+        else:
+            self.gamma_1, self.gamma_2 = None, None
+    def forward(self, x):
+        if self.gamma_1 is None:
+            x = x + self.drop_path(self.attn(self.norm1(x)))
+            x = x + self.drop_path(self.mlp(self.norm2(x)))
+        else:
+            x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x)))
+            x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
+        return x
+class PatchEmbed(nn.Module):
+    """ Image to Patch Embedding
+    """
+    def __init__(self,
+                 img_size=224,
+                 patch_size=16,
+                 in_chans=3,
+                 embed_dim=768,
+                 num_frames=16,
+                 tubelet_size=2):
+        super().__init__()
+        img_size = to_2tuple(img_size)
+        patch_size = to_2tuple(patch_size)
+        num_spatial_patches = (img_size[0] // patch_size[0]) * (
+            img_size[1] // patch_size[1])
+        num_patches = num_spatial_patches * (num_frames // tubelet_size)
+        self.img_size = img_size
+        self.tubelet_size = tubelet_size
+        self.patch_size = patch_size
+        self.num_patches = num_patches
+        self.proj = nn.Conv3d(
+            in_channels=in_chans,
+            out_channels=embed_dim,
+            kernel_size=(self.tubelet_size, patch_size[0], patch_size[1]),
+            stride=(self.tubelet_size, patch_size[0], patch_size[1]))
+    def forward(self, x, **kwargs):
+        B, C, T, H, W = x.shape
+        assert H == self.img_size[0] and W == self.img_size[
+            1], f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
+        # b, c, l -> b, l, c
+        x = self.proj(x).flatten(2).transpose(1, 2)
+        return x
+# sin-cos position encoding
+# https://github.com/jadore801120/attention-is-all-you-need-pytorch/blob/master/transformer/Models.py#L31
+def get_sinusoid_encoding_table(n_position, d_hid):
+    ''' Sinusoid position encoding table '''
+    # TODO: make it with torch instead of numpy
+    def get_position_angle_vec(position):
+        return [
+            position / np.power(10000, 2 * (hid_j // 2) / d_hid)
+            for hid_j in range(d_hid)
+        ]
+    sinusoid_table = np.array(
+        [get_position_angle_vec(pos_i) for pos_i in range(n_position)])
+    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
+    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1
+    return torch.tensor(
+        sinusoid_table, dtype=torch.float, requires_grad=False).unsqueeze(0)
+class VisionTransformer(nn.Module):
+    """ Vision Transformer with support for patch or hybrid CNN input stage
+    """
+    def __init__(self,
+                 img_size=224,
+                 patch_size=16,
+                 in_chans=3,
+                 num_classes=1000,
+                 embed_dim=768,
+                 depth=12,
+                 num_heads=12,
+                 mlp_ratio=4.,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.,
+                 head_drop_rate=0.,
+                 norm_layer=nn.LayerNorm,
+                 layer_norm_eps=1e-12,
+                 init_values=0.,
+                 use_learnable_pos_emb=False,
+                 init_scale=0.,
+                 num_frames=16,
+                 tubelet_size=2,
+                 use_mean_pooling=True,
+                 with_cp=False,
+                 cos_attn=False):
+        super().__init__()
+        self.num_classes = num_classes
+        # num_features for consistency with other models
+        self.num_features = self.embed_dim = embed_dim
+        self.tubelet_size = tubelet_size
+        self.patch_embed = PatchEmbed(
+            img_size=img_size,
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+            num_frames=num_frames,
+            tubelet_size=tubelet_size)
+        num_patches = self.patch_embed.num_patches
+        self.with_cp = with_cp
+        norm_layer = partial(eval(norm_layer), eps=layer_norm_eps)
+        if use_learnable_pos_emb:
+            self.pos_embed = nn.Parameter(
+                torch.zeros(1, num_patches, embed_dim))
+        else:
+            # sine-cosine positional embeddings is on the way
+            self.pos_embed = get_sinusoid_encoding_table(
+                num_patches, embed_dim)
+        self.pos_drop = nn.Dropout(p=drop_rate)
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)
+               ]  # stochastic depth decay rule
+        self.blocks = nn.ModuleList([
+            Block(
+                dim=embed_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop_rate,
+                attn_drop=attn_drop_rate,
+                drop_path=dpr[i],
+                norm_layer=norm_layer,
+                init_values=init_values,
+                cos_attn=cos_attn) for i in range(depth)
+        ])
+        self.norm = nn.Identity() if use_mean_pooling else norm_layer(
+            embed_dim)
+        self.fc_norm = norm_layer(embed_dim) if use_mean_pooling else None
+        self.head_dropout = nn.Dropout(head_drop_rate)
+        self.head = nn.Linear(
+            embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+        if use_learnable_pos_emb:
+            trunc_normal_(self.pos_embed, std=.02)
+        self.apply(self._init_weights)
+        if num_classes > 0:
+            self.head.weight.data.mul_(init_scale)
+            self.head.bias.data.mul_(init_scale)
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+    def get_num_layers(self):
+        return len(self.blocks)
+    @torch.jit.ignore
+    def no_weight_decay(self):
+        return {'pos_embed', 'cls_token'}
+    def get_classifier(self):
+        return self.head
+    def reset_classifier(self, num_classes, global_pool=''):
+        self.num_classes = num_classes
+        self.head = nn.Linear(
+            self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+    def forward_features(self, x):
+        B = x.size(0)
+        x = self.patch_embed(x)
+        if self.pos_embed is not None:
+            x = x + self.pos_embed.expand(B, -1, -1).type_as(x).to(
+                x.device).clone().detach()
+        x = self.pos_drop(x)
+        for blk in self.blocks:
+            if self.with_cp:
+                x = cp.checkpoint(blk, x)
+            else:
+                x = blk(x)
+        if self.fc_norm is not None:
+            return self.fc_norm(x.mean(1))
+        else:
+            return self.norm(x[:, 0])
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = self.head_dropout(x)
+        x = self.head(x)
+        return x
+class VideoMAEv2(PreTrainedModel):
+    config_class = VideoMAEv2Config
+    def __init__(self, config=None):
+        super().__init__(config=config)
+        self.model_config = config.model_config
+        logger.info("Model config: {}".format(self.model_config))
+        self.model = VisionTransformer(**self.model_config)
+    def forward(self, pixel_values):
+        return self.model(pixel_values)
+    def extract_features(self, pixel_values):
+        return self.model.forward_features(pixel_values)
+def vit_small_patch16_224(pretrained=False, **kwargs):
+    model = VisionTransformer(
+        patch_size=16,
+        embed_dim=384,
+        depth=12,
+        num_heads=6,
+        mlp_ratio=4,
+        qkv_bias=True,
+        norm_layer=partial(nn.LayerNorm, eps=1e-6),
+        **kwargs)
+    model.default_cfg = _cfg()
+    return model
+def vit_base_patch16_224(pretrained=False, **kwargs):
+    model = VisionTransformer(
+        patch_size=16,
+        embed_dim=768,
+        depth=12,
+        num_heads=12,
+        mlp_ratio=4,
+        qkv_bias=True,
+        norm_layer=partial(nn.LayerNorm, eps=1e-6),
+        **kwargs)
+    model.default_cfg = _cfg()
+    return model
+# @register_model
+def vit_huge_patch16_224(pretrained=False, **kwargs):
+    model = VisionTransformer(
+        patch_size=16,
+        embed_dim=1280,
+        depth=32,
+        num_heads=16,
+        mlp_ratio=4,
+        qkv_bias=True,
+        norm_layer=partial(nn.LayerNorm, eps=1e-6),
+        **kwargs)
+    model.default_cfg = _cfg()
+    return model
+# @register_model
+def vit_giant_patch14_224(pretrained=False, **kwargs):
+    model = VisionTransformer(
+        patch_size=14,
+        embed_dim=1408,
+        depth=40,
+        num_heads=16,
+        mlp_ratio=48 / 11,
+        qkv_bias=True,
+        norm_layer=partial(nn.LayerNorm, eps=1e-6),
+        **kwargs)
+    model.default_cfg = _cfg()
+    return model

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,18 @@

+{
+    "do_center_crop": true,
+    "do_normalize": true,
+    "do_resize": true,
+    "feature_extractor_type": "VideoMAEFeatureExtractor",
+    "image_mean": [
+      0.485,
+      0.456,
+      0.406
+    ],
+    "image_std": [
+      0.229,
+      0.224,
+      0.225
+    ],
+    "resample": 2,
+    "size": 224
+  }