ynhe commited on
Commit
9981a9c
·
verified ·
1 Parent(s): 2f951ec

[Init] upload model

Browse files
README.md ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: cc-by-nc-4.0
3
+ tags:
4
+ - vision
5
+ - video-classification
6
+ pipeline_tag: video-classification
7
+ ---
8
+
9
+ # VideoMAE-v2 (Large-sized model, Pretrained on UnlabeledHybrid-1M)
10
+
11
+ VideoMAEv2-Large model pre-trained for 800 epochs in a self-supervised way on UnlabeldHybrid-1M dataset. It was introduced in the paper [[CVPR23]VideoMAE V2: Scaling Video Masked Autoencoders with Dual Masking](https://arxiv.org/abs/2203.12602) by Wang et al. and first released in [GitHub](https://github.com/OpenGVLab/VideoMAEv2).
12
+
13
+
14
+ ## Intended uses & limitations
15
+
16
+ You can use the raw model for video feature extraction.
17
+
18
+ ### How to use
19
+
20
+ Here is how to use this model to extract a video feature:
21
+
22
+ ```python
23
+ from transformers import VideoMAEImageProcessor, AutoModel, AutoConfig
24
+ import numpy as np
25
+ import torch
26
+
27
+
28
+ config = AutoConfig.from_pretrained("OpenGVLab/VideoMAEv2-Large", trust_remote_code=True)
29
+ processor = VideoMAEImageProcessor.from_pretrained("OpenGVLab/VideoMAEv2-Large")
30
+ model = AutoModel.from_pretrained('OpenGVLab/VideoMAEv2-Large', config=config, trust_remote_code=True)
31
+
32
+
33
+ video = list(np.random.rand(16, 3, 224, 224))
34
+
35
+
36
+
37
+
38
+ # B, T, C, H, W -> B, C, T, H, W
39
+ inputs = processor(video, return_tensors="pt")
40
+ inputs['pixel_values'] = inputs['pixel_values'].permute(0, 2, 1, 3, 4)
41
+
42
+ with torch.no_grad():
43
+ outputs = model(**inputs)
44
+ ```
45
+
46
+
47
+
48
+
49
+ ### BibTeX entry and citation info
50
+
51
+ ```bibtex
52
+ @InProceedings{wang2023videomaev2,
53
+ author = {Wang, Limin and Huang, Bingkun and Zhao, Zhiyu and Tong, Zhan and He, Yinan and Wang, Yi and Wang, Yali and Qiao, Yu},
54
+ title = {VideoMAE V2: Scaling Video Masked Autoencoders With Dual Masking},
55
+ booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
56
+ month = {June},
57
+ year = {2023},
58
+ pages = {14549-14560}
59
+ }
60
+
61
+ @misc{videomaev2,
62
+ title={VideoMAE V2: Scaling Video Masked Autoencoders with Dual Masking},
63
+ author={Limin Wang and Bingkun Huang and Zhiyu Zhao and Zhan Tong and Yinan He and Yi Wang and Yali Wang and Yu Qiao},
64
+ year={2023},
65
+ eprint={2303.16727},
66
+ archivePrefix={arXiv},
67
+ primaryClass={cs.CV}
68
+ }
69
+ ```
config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "./",
3
+ "model_type": "VideoMAEv2_Base",
4
+ "architectures": [
5
+ "VideoMAEv2_Base"
6
+ ],
7
+ "auto_map": {
8
+ "AutoModel": "modeling_videomaev2.VideoMAEv2",
9
+ "AutoConfig": "modeling_config.VideoMAEv2Config"
10
+ },
11
+ "model_config":{
12
+ "img_size": 224,
13
+ "patch_size": 16,
14
+ "in_chans": 3,
15
+ "num_classes": 0,
16
+ "embed_dim": 1024,
17
+ "depth": 24,
18
+ "num_heads": 16,
19
+ "mlp_ratio": 4,
20
+ "qkv_bias": true,
21
+ "qk_scale": null,
22
+ "drop_rate": 0.0,
23
+ "attn_drop_rate": 0.0,
24
+ "drop_path_rate": 0.0,
25
+ "norm_layer": "nn.LayerNorm",
26
+ "layer_norm_eps": 1e-6,
27
+ "init_values": 0.0,
28
+ "use_learnable_pos_emb": false,
29
+ "tubelet_size": 2,
30
+ "use_mean_pooling": true,
31
+ "with_cp": false,
32
+ "num_frames": 16,
33
+ "cos_attn": false
34
+ },
35
+ "transformers_version": "4.38.0",
36
+ "use_cache": true
37
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c27064402bfec2c7495e2226a5e0d8f46374b4df4b03026af1e10a57e11b2996
3
+ size 1215474840
modeling_config.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import re, ast
3
+ from transformers import AutoConfig, LlamaConfig
4
+ from transformers.configuration_utils import PretrainedConfig
5
+ from transformers.utils import logging
6
+
7
+ from easydict import EasyDict as MyEasyDict
8
+ from importlib import import_module
9
+ import os.path as osp
10
+ import argparse
11
+ import json
12
+ from copy import deepcopy
13
+ import sys
14
+
15
+
16
+ class VideoMAEv2Config(PretrainedConfig):
17
+ model_type = 'VideoMAEv2_Base'
18
+ def __init__(
19
+ self,
20
+ **kwargs):
21
+ super().__init__(**kwargs)
modeling_videomaev2.py ADDED
@@ -0,0 +1,535 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --------------------------------------------------------
2
+ # Based on BEiT, timm, DINO and DeiT code bases
3
+ # https://github.com/microsoft/unilm/tree/master/beit
4
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm
5
+ # https://github.com/facebookresearch/deit
6
+ # https://github.com/facebookresearch/dino
7
+ # --------------------------------------------------------'
8
+ from functools import partial
9
+ import logging
10
+ logger = logging.getLogger(__name__)
11
+
12
+ import numpy as np
13
+ import torch
14
+ import torch.nn as nn
15
+ import torch.nn.functional as F
16
+ import torch.utils.checkpoint as cp
17
+
18
+ from transformers import AutoConfig, PreTrainedModel
19
+
20
+ from timm.layers import drop_path, to_2tuple, trunc_normal_
21
+ from .modeling_config import VideoMAEv2Config
22
+
23
+ def _cfg(url='', **kwargs):
24
+ return {
25
+ 'url': url,
26
+ 'num_classes': 400,
27
+ 'input_size': (3, 224, 224),
28
+ 'pool_size': None,
29
+ 'crop_pct': .9,
30
+ 'interpolation': 'bicubic',
31
+ 'mean': (0.5, 0.5, 0.5),
32
+ 'std': (0.5, 0.5, 0.5),
33
+ **kwargs
34
+ }
35
+
36
+
37
+ class DropPath(nn.Module):
38
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
39
+ """
40
+
41
+ def __init__(self, drop_prob=None):
42
+ super(DropPath, self).__init__()
43
+ self.drop_prob = drop_prob
44
+
45
+ def forward(self, x):
46
+ return drop_path(x, self.drop_prob, self.training)
47
+
48
+ def extra_repr(self) -> str:
49
+ return 'p={}'.format(self.drop_prob)
50
+
51
+
52
+ class Mlp(nn.Module):
53
+
54
+ def __init__(self,
55
+ in_features,
56
+ hidden_features=None,
57
+ out_features=None,
58
+ act_layer=nn.GELU,
59
+ drop=0.):
60
+ super().__init__()
61
+ out_features = out_features or in_features
62
+ hidden_features = hidden_features or in_features
63
+ self.fc1 = nn.Linear(in_features, hidden_features)
64
+ self.act = act_layer()
65
+ self.fc2 = nn.Linear(hidden_features, out_features)
66
+ self.drop = nn.Dropout(drop)
67
+
68
+ def forward(self, x):
69
+ x = self.fc1(x)
70
+ x = self.act(x)
71
+ # x = self.drop(x)
72
+ # commit this for the orignal BERT implement
73
+ x = self.fc2(x)
74
+ x = self.drop(x)
75
+ return x
76
+
77
+
78
+ class CosAttention(nn.Module):
79
+
80
+ def __init__(self,
81
+ dim,
82
+ num_heads=8,
83
+ qkv_bias=False,
84
+ qk_scale=None,
85
+ attn_drop=0.,
86
+ proj_drop=0.,
87
+ attn_head_dim=None):
88
+ super().__init__()
89
+ self.num_heads = num_heads
90
+ head_dim = dim // num_heads
91
+ if attn_head_dim is not None:
92
+ head_dim = attn_head_dim
93
+ all_head_dim = head_dim * self.num_heads
94
+ # self.scale = qk_scale or head_dim**-0.5
95
+ # DO NOT RENAME [self.scale] (for no weight decay)
96
+ if qk_scale is None:
97
+ self.scale = nn.Parameter(
98
+ torch.log(10 * torch.ones((num_heads, 1, 1))),
99
+ requires_grad=True)
100
+ else:
101
+ self.scale = qk_scale
102
+
103
+ self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False)
104
+ if qkv_bias:
105
+ self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
106
+ self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
107
+ else:
108
+ self.q_bias = None
109
+ self.v_bias = None
110
+
111
+ self.attn_drop = nn.Dropout(attn_drop)
112
+ self.proj = nn.Linear(all_head_dim, dim)
113
+ self.proj_drop = nn.Dropout(proj_drop)
114
+
115
+ def forward(self, x):
116
+ B, N, C = x.shape
117
+ qkv_bias = None
118
+ if self.q_bias is not None:
119
+ qkv_bias = torch.cat(
120
+ (self.q_bias,
121
+ torch.zeros_like(self.v_bias,
122
+ requires_grad=False), self.v_bias))
123
+ qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
124
+ qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
125
+ q, k, v = qkv[0], qkv[1], qkv[
126
+ 2] # make torchscript happy (cannot use tensor as tuple)
127
+
128
+ attn = (
129
+ F.normalize(q, dim=-1) @ F.normalize(k, dim=-1).transpose(-2, -1))
130
+
131
+ # torch.log(torch.tensor(1. / 0.01)) = 4.6052
132
+ logit_scale = torch.clamp(self.scale, max=4.6052).exp()
133
+
134
+ attn = attn * logit_scale
135
+
136
+ attn = attn.softmax(dim=-1)
137
+ attn = self.attn_drop(attn)
138
+
139
+ x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
140
+
141
+ x = self.proj(x)
142
+ x = self.proj_drop(x)
143
+ return x
144
+
145
+
146
+ class Attention(nn.Module):
147
+
148
+ def __init__(self,
149
+ dim,
150
+ num_heads=8,
151
+ qkv_bias=False,
152
+ qk_scale=None,
153
+ attn_drop=0.,
154
+ proj_drop=0.,
155
+ attn_head_dim=None):
156
+ super().__init__()
157
+ self.num_heads = num_heads
158
+ head_dim = dim // num_heads
159
+ if attn_head_dim is not None:
160
+ head_dim = attn_head_dim
161
+ all_head_dim = head_dim * self.num_heads
162
+ self.scale = qk_scale or head_dim**-0.5
163
+
164
+ self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False)
165
+ if qkv_bias:
166
+ self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
167
+ self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
168
+ else:
169
+ self.q_bias = None
170
+ self.v_bias = None
171
+
172
+ self.attn_drop = nn.Dropout(attn_drop)
173
+ self.proj = nn.Linear(all_head_dim, dim)
174
+ self.proj_drop = nn.Dropout(proj_drop)
175
+
176
+ def forward(self, x):
177
+ B, N, C = x.shape
178
+ qkv_bias = None
179
+ if self.q_bias is not None:
180
+ qkv_bias = torch.cat(
181
+ (self.q_bias,
182
+ torch.zeros_like(self.v_bias,
183
+ requires_grad=False), self.v_bias))
184
+ qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
185
+ qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
186
+ q, k, v = qkv[0], qkv[1], qkv[
187
+ 2] # make torchscript happy (cannot use tensor as tuple)
188
+
189
+ q = q * self.scale
190
+ attn = (q @ k.transpose(-2, -1))
191
+
192
+ attn = attn.softmax(dim=-1)
193
+ attn = self.attn_drop(attn)
194
+
195
+ x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
196
+
197
+ x = self.proj(x)
198
+ x = self.proj_drop(x)
199
+ return x
200
+
201
+
202
+ class Block(nn.Module):
203
+
204
+ def __init__(self,
205
+ dim,
206
+ num_heads,
207
+ mlp_ratio=4.,
208
+ qkv_bias=False,
209
+ qk_scale=None,
210
+ drop=0.,
211
+ attn_drop=0.,
212
+ drop_path=0.,
213
+ init_values=None,
214
+ act_layer=nn.GELU,
215
+ norm_layer=nn.LayerNorm,
216
+ attn_head_dim=None,
217
+ cos_attn=False):
218
+ super().__init__()
219
+ self.norm1 = norm_layer(dim)
220
+ if cos_attn:
221
+ self.attn = CosAttention(
222
+ dim,
223
+ num_heads=num_heads,
224
+ qkv_bias=qkv_bias,
225
+ qk_scale=qk_scale,
226
+ attn_drop=attn_drop,
227
+ proj_drop=drop,
228
+ attn_head_dim=attn_head_dim)
229
+ else:
230
+ self.attn = Attention(
231
+ dim,
232
+ num_heads=num_heads,
233
+ qkv_bias=qkv_bias,
234
+ qk_scale=qk_scale,
235
+ attn_drop=attn_drop,
236
+ proj_drop=drop,
237
+ attn_head_dim=attn_head_dim)
238
+ # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
239
+ self.drop_path = DropPath(
240
+ drop_path) if drop_path > 0. else nn.Identity()
241
+ self.norm2 = norm_layer(dim)
242
+ mlp_hidden_dim = int(dim * mlp_ratio)
243
+ self.mlp = Mlp(
244
+ in_features=dim,
245
+ hidden_features=mlp_hidden_dim,
246
+ act_layer=act_layer,
247
+ drop=drop)
248
+
249
+ if init_values > 0:
250
+ self.gamma_1 = nn.Parameter(
251
+ init_values * torch.ones((dim)), requires_grad=True)
252
+ self.gamma_2 = nn.Parameter(
253
+ init_values * torch.ones((dim)), requires_grad=True)
254
+ else:
255
+ self.gamma_1, self.gamma_2 = None, None
256
+
257
+ def forward(self, x):
258
+ if self.gamma_1 is None:
259
+ x = x + self.drop_path(self.attn(self.norm1(x)))
260
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
261
+ else:
262
+ x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x)))
263
+ x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
264
+ return x
265
+
266
+
267
+ class PatchEmbed(nn.Module):
268
+ """ Image to Patch Embedding
269
+ """
270
+
271
+ def __init__(self,
272
+ img_size=224,
273
+ patch_size=16,
274
+ in_chans=3,
275
+ embed_dim=768,
276
+ num_frames=16,
277
+ tubelet_size=2):
278
+ super().__init__()
279
+ img_size = to_2tuple(img_size)
280
+ patch_size = to_2tuple(patch_size)
281
+ num_spatial_patches = (img_size[0] // patch_size[0]) * (
282
+ img_size[1] // patch_size[1])
283
+ num_patches = num_spatial_patches * (num_frames // tubelet_size)
284
+
285
+ self.img_size = img_size
286
+ self.tubelet_size = tubelet_size
287
+ self.patch_size = patch_size
288
+ self.num_patches = num_patches
289
+ self.proj = nn.Conv3d(
290
+ in_channels=in_chans,
291
+ out_channels=embed_dim,
292
+ kernel_size=(self.tubelet_size, patch_size[0], patch_size[1]),
293
+ stride=(self.tubelet_size, patch_size[0], patch_size[1]))
294
+
295
+ def forward(self, x, **kwargs):
296
+ B, C, T, H, W = x.shape
297
+ assert H == self.img_size[0] and W == self.img_size[
298
+ 1], f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
299
+ # b, c, l -> b, l, c
300
+ x = self.proj(x).flatten(2).transpose(1, 2)
301
+ return x
302
+
303
+
304
+ # sin-cos position encoding
305
+ # https://github.com/jadore801120/attention-is-all-you-need-pytorch/blob/master/transformer/Models.py#L31
306
+ def get_sinusoid_encoding_table(n_position, d_hid):
307
+ ''' Sinusoid position encoding table '''
308
+
309
+ # TODO: make it with torch instead of numpy
310
+ def get_position_angle_vec(position):
311
+ return [
312
+ position / np.power(10000, 2 * (hid_j // 2) / d_hid)
313
+ for hid_j in range(d_hid)
314
+ ]
315
+
316
+ sinusoid_table = np.array(
317
+ [get_position_angle_vec(pos_i) for pos_i in range(n_position)])
318
+ sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
319
+ sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1
320
+
321
+ return torch.tensor(
322
+ sinusoid_table, dtype=torch.float, requires_grad=False).unsqueeze(0)
323
+
324
+
325
+ class VisionTransformer(nn.Module):
326
+ """ Vision Transformer with support for patch or hybrid CNN input stage
327
+ """
328
+
329
+ def __init__(self,
330
+ img_size=224,
331
+ patch_size=16,
332
+ in_chans=3,
333
+ num_classes=1000,
334
+ embed_dim=768,
335
+ depth=12,
336
+ num_heads=12,
337
+ mlp_ratio=4.,
338
+ qkv_bias=False,
339
+ qk_scale=None,
340
+ drop_rate=0.,
341
+ attn_drop_rate=0.,
342
+ drop_path_rate=0.,
343
+ head_drop_rate=0.,
344
+ norm_layer=nn.LayerNorm,
345
+ layer_norm_eps=1e-12,
346
+ init_values=0.,
347
+ use_learnable_pos_emb=False,
348
+ init_scale=0.,
349
+ num_frames=16,
350
+ tubelet_size=2,
351
+ use_mean_pooling=True,
352
+ with_cp=False,
353
+ cos_attn=False):
354
+ super().__init__()
355
+ self.num_classes = num_classes
356
+ # num_features for consistency with other models
357
+ self.num_features = self.embed_dim = embed_dim
358
+ self.tubelet_size = tubelet_size
359
+ self.patch_embed = PatchEmbed(
360
+ img_size=img_size,
361
+ patch_size=patch_size,
362
+ in_chans=in_chans,
363
+ embed_dim=embed_dim,
364
+ num_frames=num_frames,
365
+ tubelet_size=tubelet_size)
366
+ num_patches = self.patch_embed.num_patches
367
+ self.with_cp = with_cp
368
+
369
+ norm_layer = partial(eval(norm_layer), eps=layer_norm_eps)
370
+
371
+ if use_learnable_pos_emb:
372
+ self.pos_embed = nn.Parameter(
373
+ torch.zeros(1, num_patches, embed_dim))
374
+ else:
375
+ # sine-cosine positional embeddings is on the way
376
+ self.pos_embed = get_sinusoid_encoding_table(
377
+ num_patches, embed_dim)
378
+
379
+ self.pos_drop = nn.Dropout(p=drop_rate)
380
+
381
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)
382
+ ] # stochastic depth decay rule
383
+ self.blocks = nn.ModuleList([
384
+ Block(
385
+ dim=embed_dim,
386
+ num_heads=num_heads,
387
+ mlp_ratio=mlp_ratio,
388
+ qkv_bias=qkv_bias,
389
+ qk_scale=qk_scale,
390
+ drop=drop_rate,
391
+ attn_drop=attn_drop_rate,
392
+ drop_path=dpr[i],
393
+ norm_layer=norm_layer,
394
+ init_values=init_values,
395
+ cos_attn=cos_attn) for i in range(depth)
396
+ ])
397
+ self.norm = nn.Identity() if use_mean_pooling else norm_layer(
398
+ embed_dim)
399
+ self.fc_norm = norm_layer(embed_dim) if use_mean_pooling else None
400
+ self.head_dropout = nn.Dropout(head_drop_rate)
401
+ self.head = nn.Linear(
402
+ embed_dim, num_classes) if num_classes > 0 else nn.Identity()
403
+
404
+ if use_learnable_pos_emb:
405
+ trunc_normal_(self.pos_embed, std=.02)
406
+
407
+ self.apply(self._init_weights)
408
+ if num_classes > 0:
409
+ self.head.weight.data.mul_(init_scale)
410
+ self.head.bias.data.mul_(init_scale)
411
+
412
+ def _init_weights(self, m):
413
+ if isinstance(m, nn.Linear):
414
+ trunc_normal_(m.weight, std=.02)
415
+ if isinstance(m, nn.Linear) and m.bias is not None:
416
+ nn.init.constant_(m.bias, 0)
417
+ elif isinstance(m, nn.LayerNorm):
418
+ nn.init.constant_(m.bias, 0)
419
+ nn.init.constant_(m.weight, 1.0)
420
+
421
+ def get_num_layers(self):
422
+ return len(self.blocks)
423
+
424
+ @torch.jit.ignore
425
+ def no_weight_decay(self):
426
+ return {'pos_embed', 'cls_token'}
427
+
428
+ def get_classifier(self):
429
+ return self.head
430
+
431
+ def reset_classifier(self, num_classes, global_pool=''):
432
+ self.num_classes = num_classes
433
+ self.head = nn.Linear(
434
+ self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
435
+
436
+ def forward_features(self, x):
437
+ B = x.size(0)
438
+
439
+ x = self.patch_embed(x)
440
+
441
+ if self.pos_embed is not None:
442
+ x = x + self.pos_embed.expand(B, -1, -1).type_as(x).to(
443
+ x.device).clone().detach()
444
+ x = self.pos_drop(x)
445
+
446
+ for blk in self.blocks:
447
+ if self.with_cp:
448
+ x = cp.checkpoint(blk, x)
449
+ else:
450
+ x = blk(x)
451
+
452
+ if self.fc_norm is not None:
453
+ return self.fc_norm(x.mean(1))
454
+ else:
455
+ return self.norm(x[:, 0])
456
+
457
+ def forward(self, x):
458
+ x = self.forward_features(x)
459
+ x = self.head_dropout(x)
460
+ x = self.head(x)
461
+ return x
462
+
463
+
464
+
465
+
466
+ class VideoMAEv2(PreTrainedModel):
467
+ config_class = VideoMAEv2Config
468
+ def __init__(self, config=None):
469
+ super().__init__(config=config)
470
+ self.model_config = config.model_config
471
+ logger.info("Model config: {}".format(self.model_config))
472
+ self.model = VisionTransformer(**self.model_config)
473
+
474
+ def forward(self, pixel_values):
475
+ return self.model(pixel_values)
476
+
477
+ def extract_features(self, pixel_values):
478
+ return self.model.forward_features(pixel_values)
479
+ def vit_small_patch16_224(pretrained=False, **kwargs):
480
+ model = VisionTransformer(
481
+ patch_size=16,
482
+ embed_dim=384,
483
+ depth=12,
484
+ num_heads=6,
485
+ mlp_ratio=4,
486
+ qkv_bias=True,
487
+ norm_layer=partial(nn.LayerNorm, eps=1e-6),
488
+ **kwargs)
489
+ model.default_cfg = _cfg()
490
+ return model
491
+
492
+
493
+
494
+ def vit_base_patch16_224(pretrained=False, **kwargs):
495
+ model = VisionTransformer(
496
+ patch_size=16,
497
+ embed_dim=768,
498
+ depth=12,
499
+ num_heads=12,
500
+ mlp_ratio=4,
501
+ qkv_bias=True,
502
+ norm_layer=partial(nn.LayerNorm, eps=1e-6),
503
+ **kwargs)
504
+ model.default_cfg = _cfg()
505
+ return model
506
+
507
+
508
+ # @register_model
509
+ def vit_huge_patch16_224(pretrained=False, **kwargs):
510
+ model = VisionTransformer(
511
+ patch_size=16,
512
+ embed_dim=1280,
513
+ depth=32,
514
+ num_heads=16,
515
+ mlp_ratio=4,
516
+ qkv_bias=True,
517
+ norm_layer=partial(nn.LayerNorm, eps=1e-6),
518
+ **kwargs)
519
+ model.default_cfg = _cfg()
520
+ return model
521
+
522
+
523
+ # @register_model
524
+ def vit_giant_patch14_224(pretrained=False, **kwargs):
525
+ model = VisionTransformer(
526
+ patch_size=14,
527
+ embed_dim=1408,
528
+ depth=40,
529
+ num_heads=16,
530
+ mlp_ratio=48 / 11,
531
+ qkv_bias=True,
532
+ norm_layer=partial(nn.LayerNorm, eps=1e-6),
533
+ **kwargs)
534
+ model.default_cfg = _cfg()
535
+ return model
preprocessor_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_center_crop": true,
3
+ "do_normalize": true,
4
+ "do_resize": true,
5
+ "feature_extractor_type": "VideoMAEFeatureExtractor",
6
+ "image_mean": [
7
+ 0.485,
8
+ 0.456,
9
+ 0.406
10
+ ],
11
+ "image_std": [
12
+ 0.229,
13
+ 0.224,
14
+ 0.225
15
+ ],
16
+ "resample": 2,
17
+ "size": 224
18
+ }