Upload files

- .gitattributes +1 -0
- README.md +80 -3
- aimv2_overview_light.png +3 -0
- config.json +25 -0
- configuration_aimv2.py +62 -0
- flax_model.msgpack +3 -0
- mlx_model.safetensors +3 -0
- model.safetensors +3 -0
- modeling_aimv2.py +220 -0
- modeling_flax_aimv2.py +339 -0
- preprocessor_config.json +27 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
```diff
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+aimv2_overview_light.png filter=lfs diff=lfs merge=lfs -text
```
README.md
CHANGED
@@ -1,3 +1,80 @@
---
library_name: transformers
license: apple-ascl
metrics:
- accuracy
pipeline_tag: image-feature-extraction
tags:
- vision
- image-feature-extraction
- mlx
- pytorch
---
# Introduction
[[`AIMv2 Paper`](#)] [[`BibTeX`](#citation)]

We introduce the AIMv2 family of vision models pre-trained with a multimodal autoregressive objective.
AIMv2 pre-training is simple to implement, and it trains and scales effectively. Some AIMv2 highlights include:

1. Outperforms OAI CLIP and SigLIP on the majority of multimodal understanding benchmarks.
2. Outperforms DINOv2 on open-vocabulary object detection and referring expression comprehension.
3. Exhibits strong recognition performance, with AIMv2-3B achieving *89.5% on ImageNet using a frozen trunk*.

<img src="aimv2_overview_light.png" alt="AIMv2 Overview"/>

## Usage

### PyTorch
```python
import requests
from PIL import Image
from transformers import AutoImageProcessor, AutoModel

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

processor = AutoImageProcessor.from_pretrained(
    "apple/aimv2-large-patch14-native",
)
model = AutoModel.from_pretrained(
    "apple/aimv2-large-patch14-native",
    trust_remote_code=True,
)

inputs = processor(images=image, return_tensors="pt")
outputs = model(**inputs)
```

### JAX
```python
import requests
from PIL import Image
from transformers import AutoImageProcessor, FlaxAutoModel

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

processor = AutoImageProcessor.from_pretrained(
    "apple/aimv2-large-patch14-native",
)
model = FlaxAutoModel.from_pretrained(
    "apple/aimv2-large-patch14-native",
    trust_remote_code=True,
)

inputs = processor(images=image, return_tensors="jax")
outputs = model(**inputs)
```

## Citation
If you find our work useful, please consider citing us as:
```bibtex
@misc{fini2024multimodal,
    title = {Multimodal Autoregressive Pre-training of Large Vision Encoders},
    author = {Enrico Fini and Mustafa Shukor and Xiujun Li and Philipp Dufter and Michal Klein and David Haldimann and Sai Aitharaju and Victor Guilherme Turrisi da Costa and Louis Béthune and Zhe Gan and Alexander T Toshev and Marcin Eichner and Moin Nabi and Yinfei Yang and Joshua M. Susskind and Alaaeldin El-Nouby},
    year = {2024},
    archivePrefix = {arXiv},
    primaryClass = {cs.CV},
}
```
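As a quick sanity check, the trunk features returned by the PyTorch snippet can be inspected directly. The shapes below assume the default preprocessing bundled with this repo (224x224 crop) and the 14-pixel patch size; this is a small continuation of the example, not part of the card itself:

```python
# Continuing the PyTorch snippet above: the processor resizes and center-crops to
# 224x224, so the trunk sees a 16x16 grid of 14x14 patches.
print(inputs["pixel_values"].shape)     # torch.Size([1, 3, 224, 224])
print(outputs.last_hidden_state.shape)  # torch.Size([1, 256, 1024])
```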
aimv2_overview_light.png
ADDED
(image stored with Git LFS)
config.json
ADDED
@@ -0,0 +1,25 @@
```json
{
  "architectures": [
    "AIMv2Model"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_aimv2.AIMv2Config",
    "AutoModel": "modeling_aimv2.AIMv2Model",
    "FlaxAutoModel": "modeling_flax_aimv2.FlaxAIMv2Model"
  },
  "hidden_size": 1024,
  "intermediate_size": 2816,
  "model_type": "aimv2",
  "num_attention_heads": 8,
  "num_channels": 3,
  "num_hidden_layers": 24,
  "num_queries": 256,
  "patch_size": 14,
  "projection_dropout": 0.0,
  "qkv_bias": false,
  "rms_norm_eps": 1e-05,
  "torch_dtype": "float32",
  "transformers_version": "4.46.3",
  "use_bias": false
}
```
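The `auto_map` entry is what lets the `Auto*` factory classes resolve the custom Python classes shipped in this repository when `trust_remote_code=True` is passed. For example, the configuration can be inspected without instantiating the model; a minimal sketch:

```python
from transformers import AutoConfig

# trust_remote_code=True allows transformers to import configuration_aimv2.AIMv2Config
# from this repository, as declared in the auto_map field of config.json.
config = AutoConfig.from_pretrained(
    "apple/aimv2-large-patch14-native",
    trust_remote_code=True,
)
print(config.model_type)         # "aimv2"
print(config.hidden_size)        # 1024
print(config.num_hidden_layers)  # 24
```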
configuration_aimv2.py
ADDED
@@ -0,0 +1,62 @@
```python
from typing import Any

from transformers.configuration_utils import PretrainedConfig

__all__ = ["AIMv2Config"]


class AIMv2Config(PretrainedConfig):
    """This is the configuration class to store the configuration of an [`AIMv2Model`].

    Instantiating a configuration with the defaults will yield a similar configuration
    to that of the [apple/aimv2-large-patch14-native](https://huggingface.co/apple/aimv2-large-patch14-native)

    Args:
        hidden_size: Dimension of the hidden representations.
        intermediate_size: Dimension of the SwiGLU representations.
        num_hidden_layers: Number of hidden layers in the Transformer.
        num_attention_heads: Number of attention heads for each attention layer
            in the Transformer.
        num_channels: Number of input channels.
        num_queries: Number of learnable queries in the head.
        patch_size: Patch size.
        rms_norm_eps: Epsilon value used for the RMS normalization layer.
        attention_dropout: Dropout ratio for attention probabilities.
        projection_dropout: Dropout ratio for the projection layer after the attention.
        qkv_bias: Whether to add a bias to the queries, keys and values.
        use_bias: Whether to add a bias in the feed-forward and projection layers.
        kwargs: Keyword arguments for the [`PretrainedConfig`].
    """

    model_type: str = "aimv2"

    def __init__(
        self,
        hidden_size: int = 1024,
        intermediate_size: int = 2816,
        num_hidden_layers: int = 24,
        num_attention_heads: int = 8,
        num_channels: int = 3,
        num_queries: int = 256,
        patch_size: int = 14,
        rms_norm_eps: float = 1e-5,
        attention_dropout: float = 0.0,
        projection_dropout: float = 0.0,
        qkv_bias: bool = False,
        use_bias: bool = False,
        **kwargs: Any,
    ):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.num_queries = num_queries
        self.patch_size = patch_size
        self.attention_dropout = attention_dropout
        self.rms_norm_eps = rms_norm_eps

        self.projection_dropout = projection_dropout
        self.qkv_bias = qkv_bias
        self.use_bias = use_bias
```
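Since the constructor defaults above mirror config.json, building the config directly reproduces the checkpoint's architecture hyperparameters. A small sketch, assuming the repository has been cloned so that configuration_aimv2.py is importable from the working directory:

```python
# Assumes a local clone of this repo on sys.path; otherwise use AutoConfig as shown earlier.
from configuration_aimv2 import AIMv2Config

config = AIMv2Config()
assert config.hidden_size == 1024
assert config.intermediate_size == 2816
assert config.num_hidden_layers == 24
assert config.num_attention_heads == 8
assert config.patch_size == 14
```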
flax_model.msgpack
ADDED
@@ -0,0 +1,3 @@
```
version https://git-lfs.github.com/spec/v1
oid sha256:d52bb8cda1854e55348e9e6046046cc8a8b6218167d8b8580a340f6f4b172ca4
size 1235749750
```
mlx_model.safetensors
ADDED
@@ -0,0 +1,3 @@
```
version https://git-lfs.github.com/spec/v1
oid sha256:a2de3d578c9360aaadb2174946f1c72a93b3861f0687dc1b66ad254ddf80f2da
size 1235760719
```
model.safetensors
ADDED
@@ -0,0 +1,3 @@
```
version https://git-lfs.github.com/spec/v1
oid sha256:6cdc4c4ea6f2a477edebb482cc36ba021409a313eabdf3e6be62eb722771e7d1
size 1235760720
```
modeling_aimv2.py
ADDED
@@ -0,0 +1,220 @@
```python
from typing import Optional, Tuple, Union

import torch
from .configuration_aimv2 import AIMv2Config
from torch import nn
from torch.nn import functional as F
from transformers.modeling_outputs import BaseModelOutputWithNoAttention
from transformers.modeling_utils import PreTrainedModel

__all__ = ["AIMv2Model"]


def _get_1d_sincos_pos_embed_from_grid(
    embed_dim: int, pos: torch.Tensor
) -> torch.Tensor:
    omega = torch.arange(embed_dim // 2).float()
    omega /= embed_dim / 2.0
    omega = 1.0 / 10000**omega  # (D / 2,)
    pos = pos.reshape(-1)  # (M,)
    out = pos[:, None] * omega[None, :]  # (M, D / 2), outer product
    emb_sin, emb_cos = torch.sin(out), torch.cos(out)  # (M, D / 2)
    emb = torch.concatenate([emb_sin, emb_cos], dim=1)  # (M, D)
    return emb


def get_sincos_pos_embed(h: int, w: int, embed_dim: int) -> torch.Tensor:
    assert embed_dim % 2 == 0, embed_dim
    grid_h = torch.arange(h).float()
    grid_w = torch.arange(w).float()
    grid = torch.meshgrid(grid_w, grid_h, indexing="xy")
    grid = torch.stack(grid, dim=0)
    grid = grid.reshape([2, 1, h, w])
    emb_h = _get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])
    emb_w = _get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])
    pos_embed = torch.concatenate([emb_h, emb_w], dim=1)  # (H * W, D)
    return pos_embed


class RMSNorm(nn.Module):
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(dim))
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        output = self._norm(x.float()).type_as(x)
        return output * self.weight

    def extra_repr(self) -> str:
        return f"{tuple(self.weight.shape)}, eps={self.eps}"

    def _norm(self, x: torch.Tensor) -> torch.Tensor:
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)


class AIMv2SwiGLUFFN(nn.Module):
    def __init__(self, config: AIMv2Config):
        super().__init__()
        hidden_features = config.intermediate_size
        in_features = config.hidden_size
        bias = config.use_bias

        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
        self.fc2 = nn.Linear(hidden_features, in_features, bias=bias)
        self.fc3 = nn.Linear(in_features, hidden_features, bias=bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = F.silu(self.fc1(x)) * self.fc3(x)
        x = self.fc2(x)
        return x


class AIMv2PatchEmbed(nn.Module):
    def __init__(self, config: AIMv2Config):
        super().__init__()
        self.proj = nn.Conv2d(
            config.num_channels,
            config.hidden_size,
            kernel_size=(config.patch_size, config.patch_size),
            stride=(config.patch_size, config.patch_size),
        )
        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.proj(x).flatten(2).transpose(1, 2)
        x = self.norm(x)
        return x


class AIMv2ViTPreprocessor(nn.Module):
    def __init__(self, config: AIMv2Config):
        super().__init__()
        self.patch_h = config.patch_size
        self.patch_w = config.patch_size
        self.embed_dim = config.hidden_size

        self.patchifier = AIMv2PatchEmbed(config)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        _, _, H, W = x.shape
        tokens = self.patchifier(x)
        pos_embed = get_sincos_pos_embed(
            H // self.patch_h, W // self.patch_w, embed_dim=self.embed_dim
        )
        tokens = tokens + pos_embed
        return tokens


class AIMv2Attention(nn.Module):
    def __init__(self, config: AIMv2Config):
        super().__init__()
        dim = config.hidden_size

        self.num_heads = config.num_attention_heads
        self.qkv = nn.Linear(dim, dim * 3, bias=config.qkv_bias)
        self.attn_drop = nn.Dropout(config.attention_dropout)
        self.proj = nn.Linear(dim, dim, bias=config.use_bias)
        self.proj_drop = nn.Dropout(config.projection_dropout)

    def forward(
        self, x: torch.Tensor, mask: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        B, N, C = x.shape
        qkv = (
            self.qkv(x)
            .reshape(B, N, 3, self.num_heads, C // self.num_heads)
            .permute(2, 0, 3, 1, 4)
        )
        q, k, v = qkv.unbind(0)

        x = F.scaled_dot_product_attention(q, k, v, attn_mask=mask)
        x = x.transpose(1, 2).contiguous().reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class AIMv2Block(nn.Module):
    def __init__(self, config: AIMv2Config):
        super().__init__()
        self.attn = AIMv2Attention(config)
        self.norm_1 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.mlp = AIMv2SwiGLUFFN(config)
        self.norm_2 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self, x: torch.Tensor, mask: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        x = x + self.attn(self.norm_1(x), mask)
        x = x + self.mlp(self.norm_2(x))
        return x


class AIMv2Transformer(nn.Module):
    def __init__(self, config: AIMv2Config):
        super().__init__()
        self.blocks = nn.ModuleList(
            [AIMv2Block(config) for _ in range(config.num_hidden_layers)]
        )
        self.post_trunk_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        tokens: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
        output_hidden_states: bool = False,
    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, ...]]]:
        hidden_states = () if output_hidden_states else None
        for block in self.blocks:
            tokens = block(tokens, mask)
            if output_hidden_states:
                hidden_states += (tokens,)
        tokens = self.post_trunk_norm(tokens)
        return tokens, hidden_states


class AIMv2PretrainedModel(PreTrainedModel):
    config_class = AIMv2Config
    base_model_prefix = "aimv2"
    main_input_name = "pixel_values"
    _supports_sdpa = True


class AIMv2Model(AIMv2PretrainedModel):
    def __init__(self, config: AIMv2Config):
        super().__init__(config)
        self.preprocessor = AIMv2ViTPreprocessor(config)
        self.trunk = AIMv2Transformer(config)

    def forward(
        self,
        pixel_values: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[
        Tuple[torch.Tensor],
        Tuple[torch.Tensor, Tuple[torch.Tensor, ...]],
        BaseModelOutputWithNoAttention,
    ]:
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        x = self.preprocessor(pixel_values)
        x, hidden_states = self.trunk(
            x, mask, output_hidden_states=output_hidden_states
        )

        if not return_dict:
            res = (x,)
            res += (hidden_states,) if output_hidden_states else ()
            return res

        return BaseModelOutputWithNoAttention(
            last_hidden_state=x,
            hidden_states=hidden_states,
        )
```
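Because `AIMv2ViTPreprocessor` recomputes the sin-cos position embedding from the runtime height and width, the trunk consumes `H // 14` by `W // 14` patch tokens for whatever resolution it is given. A minimal sketch of a non-square forward pass, mirroring the README's PyTorch usage, with a random tensor standing in for a preprocessed image:

```python
import torch
from transformers import AutoModel

model = AutoModel.from_pretrained(
    "apple/aimv2-large-patch14-native",
    trust_remote_code=True,
).eval()

# A 224x448 input is split into (224 // 14) * (448 // 14) = 16 * 32 = 512 patches.
pixel_values = torch.randn(1, 3, 224, 448)
with torch.no_grad():
    outputs = model(pixel_values)
print(outputs.last_hidden_state.shape)  # torch.Size([1, 512, 1024])
```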
modeling_flax_aimv2.py
ADDED
@@ -0,0 +1,339 @@
```python
from typing import Any, Optional, Tuple, Union

import flax.linen as nn
import jax
import jax.numpy as jnp
from .configuration_aimv2 import AIMv2Config
from flax.core import frozen_dict
from transformers import FlaxPreTrainedModel
from transformers.modeling_flax_outputs import FlaxBaseModelOutput

__all__ = ["FlaxAIMv2Model"]


def _get_1d_sincos_pos_embed_from_grid(embed_dim: int, pos: jax.Array) -> jax.Array:
    omega = jnp.arange(embed_dim // 2, dtype=pos.dtype)
    omega /= embed_dim / 2.0
    omega = 1.0 / 10000**omega  # (D / 2,)
    pos = pos.reshape(-1)  # (M,)
    out = pos[:, None] * omega[None, :]  # (M, D / 2), outer product
    emb_sin, emb_cos = jnp.sin(out), jnp.cos(out)  # (M, D / 2)
    emb = jnp.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
    return emb


def get_sincos_pos_embed(
    h: int,
    w: int,
    embed_dim: int,
    dtype: jnp.dtype = jnp.float32,
) -> jax.Array:
    assert embed_dim % 2 == 0, embed_dim
    grid_h = jnp.arange(h, dtype=dtype)
    grid_w = jnp.arange(w, dtype=dtype)
    grid = jnp.meshgrid(grid_w, grid_h, indexing="xy")
    grid = jnp.stack(grid, axis=0)
    grid = grid.reshape([2, 1, h, w])
    emb_h = _get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])
    emb_w = _get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])
    pos_embed = jnp.concatenate([emb_h, emb_w], axis=1)  # (H * W, D)
    return pos_embed


class FlaxRMSNorm(nn.Module):
    eps: float = 1e-6

    @nn.compact
    def __call__(self, x: jax.Array) -> jax.Array:
        dim = x.shape[-1]
        scale = self.param("scale", nn.initializers.ones_init(), (dim,))
        output = self._norm(x.astype(jnp.float32)).astype(x.dtype)
        output = output * scale.astype(x.dtype)
        return output

    def _norm(self, x: jax.Array) -> jax.Array:
        return x * jax.lax.rsqrt(jnp.power(x, 2).mean(-1, keepdims=True) + self.eps)


class FlaxAIMv2SwiGLUFFN(nn.Module):
    config: AIMv2Config
    dtype: jnp.dtype = jnp.float32

    @nn.compact
    def __call__(self, x: jax.Array) -> jax.Array:
        hidden_features = self.config.intermediate_size
        in_features = self.config.hidden_size
        bias = self.config.use_bias

        x1 = nn.Dense(hidden_features, use_bias=bias, dtype=self.dtype, name="fc1")(x)
        x2 = nn.Dense(hidden_features, use_bias=bias, dtype=self.dtype, name="fc3")(x)
        x = nn.silu(x1) * x2
        x = nn.Dense(in_features, use_bias=bias, dtype=self.dtype, name="fc2")(x)
        return x


class FlaxAIMv2PatchEmbed(nn.Module):
    config: AIMv2Config
    dtype: jnp.dtype = jnp.float32

    @nn.compact
    def __call__(self, x: jax.Array) -> jax.Array:
        patch_size = (self.config.patch_size, self.config.patch_size)
        x = x.transpose(0, 2, 3, 1)  # (N C H W) -> (N H W C)
        x = nn.Conv(
            self.config.hidden_size,
            kernel_size=patch_size,
            strides=patch_size,
            padding=(0, 0),
            dtype=self.dtype,
            name="proj",
        )(x)
        x = jax.lax.collapse(x, 1, 3)  # (N, H * W, F)
        x = FlaxRMSNorm(self.config.rms_norm_eps, name="norm")(x)
        return x


class FlaxAIMv2ViTPreprocessor(nn.Module):
    config: AIMv2Config
    dtype: jnp.dtype = jnp.float32

    @nn.compact
    def __call__(self, x: jax.Array) -> jax.Array:
        _, _, H, W = x.shape
        patch_h = self.config.patch_size
        patch_w = self.config.patch_size

        tokens = FlaxAIMv2PatchEmbed(self.config, dtype=self.dtype, name="patchifier")(
            x
        )
        pos_embed = get_sincos_pos_embed(
            H // patch_h,
            W // patch_w,
            embed_dim=self.config.hidden_size,
            dtype=self.dtype,
        )
        tokens = tokens + pos_embed
        return tokens


class FlaxAIMv2Attention(nn.Module):
    config: AIMv2Config
    dtype: jnp.dtype = jnp.float32

    @nn.compact
    def __call__(
        self,
        x: jax.Array,
        mask: Optional[jax.Array] = None,
        deterministic: bool = True,
        output_attentions: bool = False,
    ) -> Tuple[jax.Array, Optional[jax.Array]]:
        B, N, C = x.shape
        dim, num_heads = self.config.hidden_size, self.config.num_attention_heads

        qkv = nn.Dense(
            dim * 3, use_bias=self.config.qkv_bias, dtype=self.dtype, name="qkv"
        )(x)
        qkv = qkv.reshape(B, N, 3, num_heads, C // num_heads).transpose(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]

        attn_weights = nn.dot_product_attention_weights(
            q.swapaxes(-3, -2),  # [B, N, H, C]
            k.swapaxes(-3, -2),
            mask=mask,
            deterministic=deterministic,
            dtype=self.dtype,
        )
        attn_weights = nn.Dropout(
            self.config.attention_dropout, deterministic=deterministic, name="attn_drop"
        )(attn_weights)

        x = (attn_weights @ v).swapaxes(1, 2).reshape(B, N, C)
        x = nn.Dense(dim, use_bias=self.config.use_bias, dtype=self.dtype, name="proj")(
            x
        )
        x = nn.Dropout(
            self.config.projection_dropout,
            deterministic=deterministic,
            name="proj_drop",
        )(x)
        return (x, attn_weights) if output_attentions else (x, None)


class FlaxAIMv2Block(nn.Module):
    config: AIMv2Config
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        self.attn = FlaxAIMv2Attention(self.config, dtype=self.dtype, name="attn")
        self.norm_1 = FlaxRMSNorm(self.config.rms_norm_eps, name="norm_1")
        self.mlp = FlaxAIMv2SwiGLUFFN(self.config, dtype=self.dtype, name="mlp")
        self.norm_2 = FlaxRMSNorm(self.config.rms_norm_eps, name="norm_2")

    def __call__(
        self,
        x: jax.Array,
        mask: Optional[jax.Array] = None,
        deterministic: bool = True,
        output_attentions: bool = False,
    ) -> Tuple[jax.Array, Optional[jax.Array]]:
        features, attention = self.attn(
            self.norm_1(x),
            mask,
            deterministic=deterministic,
            output_attentions=output_attentions,
        )
        x = x + features
        x = x + self.mlp(self.norm_2(x))
        return x, attention


class FlaxAIMv2Transformer(nn.Module):
    config: AIMv2Config
    dtype: jnp.dtype = jnp.float32

    @nn.compact
    def __call__(
        self,
        tokens: jax.Array,
        mask: Optional[jax.Array] = None,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
    ) -> Tuple[
        jax.Array, Optional[Tuple[jax.Array, ...]], Optional[Tuple[jax.Array, ...]]
    ]:
        hidden_states = () if output_hidden_states else None
        attentions = () if output_attentions else None
        for blk_id, block in enumerate(range(self.config.num_hidden_layers)):
            tokens, attention = FlaxAIMv2Block(
                self.config, dtype=self.dtype, name=f"layers_{blk_id}"
            )(
                tokens,
                mask,
                deterministic=deterministic,
                output_attentions=output_attentions,
            )
            if output_hidden_states:
                hidden_states += (tokens,)
            if output_attentions:
                attentions += (attention,)
        tokens = FlaxRMSNorm(self.config.rms_norm_eps, name="post_trunk_norm")(tokens)
        return tokens, hidden_states, attentions


class FlaxAIMv2Module(nn.Module):
    config: AIMv2Config
    dtype: jnp.dtype = jnp.float32

    @nn.compact
    def __call__(
        self,
        x: jax.Array,
        mask: Optional[jax.Array] = None,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
    ) -> Tuple[
        jax.Array, Optional[Tuple[jax.Array, ...]], Optional[Tuple[jax.Array, ...]]
    ]:
        x = FlaxAIMv2ViTPreprocessor(
            self.config, dtype=self.dtype, name="preprocessor"
        )(x)
        x, hidden_states, attentions = FlaxAIMv2Transformer(
            self.config, dtype=self.dtype, name="trunk"
        )(
            x,
            mask,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        return x, hidden_states, attentions


class FlaxAIMv2PretrainedModel(FlaxPreTrainedModel):
    config_class = AIMv2Config
    base_model_prefix = "aimv2"
    main_input_name = "pixel_values"

    def __init__(
        self,
        config: AIMv2Config,
        input_shape: Optional[Tuple[int, int, int, int]] = None,  # [B, C, H, W]
        dtype: jnp.dtype = jnp.float32,
        **kwargs: Any,
    ):
        if input_shape is None:
            # no effect on the param shape
            input_shape = (1, 3, 224, 224)
        super().__init__(
            config,
            module=FlaxAIMv2Module(config, dtype=dtype),
            input_shape=input_shape,
            dtype=dtype,
            **kwargs,
        )

    def init_weights(
        self,
        rng: jax.Array,
        input_shape: Tuple[int, ...],
        params: Optional[frozen_dict.FrozenDict] = None,
    ) -> frozen_dict.FrozenDict:
        del params
        input_pixels = jnp.empty(input_shape)
        params = self.module.init(rng, input_pixels, deterministic=True)
        return params["params"]


class FlaxAIMv2Model(FlaxAIMv2PretrainedModel):
    def __call__(
        self,
        pixel_values: jax.Array,
        params: Optional[frozen_dict.FrozenDict] = None,
        mask: Optional[jax.Array] = None,
        dropout_rng: Optional[jax.Array] = None,
        deterministic: bool = True,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[
        Tuple[jax.Array],
        Tuple[jax.Array, Tuple[jax.Array, ...]],
        Tuple[jax.Array, Tuple[jax.Array, ...], Tuple[jax.Array, ...]],
        FlaxBaseModelOutput,
    ]:
        if params is None:
            params = self.params
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        rngs = None if deterministic else {"dropout": dropout_rng}

        x, hidden_states, attentions = self.module.apply(
            {"params": params},
            pixel_values,
            mask,
            rngs=rngs,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        if not return_dict:
            res = (x,)
            res += (hidden_states,) if output_hidden_states else ()
            res += (attentions,) if output_attentions else ()
            return res

        return FlaxBaseModelOutput(
            last_hidden_state=x,
            hidden_states=hidden_states,
            attentions=attentions,
        )
```
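The Flax model exposes the same interface and, like the PyTorch version, builds its position embedding from the runtime input shape; it additionally supports the `output_hidden_states` and `output_attentions` flags handled by `FlaxAIMv2Transformer`. A short sketch mirroring the README's JAX usage, with a zero tensor standing in for a preprocessed image:

```python
import jax.numpy as jnp
from transformers import FlaxAutoModel

model = FlaxAutoModel.from_pretrained(
    "apple/aimv2-large-patch14-native",
    trust_remote_code=True,
)

# Default 224x224 preprocessing -> 16 * 16 = 256 patch tokens of width 1024.
pixel_values = jnp.zeros((1, 3, 224, 224), dtype=jnp.float32)
outputs = model(pixel_values, output_hidden_states=True)

print(outputs.last_hidden_state.shape)  # (1, 256, 1024)
print(len(outputs.hidden_states))       # 24, one entry per AIMv2 block
```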
preprocessor_config.json
ADDED
@@ -0,0 +1,27 @@
```json
{
  "crop_size": {
    "height": 224,
    "width": 224
  },
  "do_center_crop": true,
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "CLIPImageProcessor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "shortest_edge": 224
  }
}
```
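This preprocessor reuses `CLIPImageProcessor` with the standard CLIP normalization statistics: resize the shortest edge to 224 (bicubic, `resample: 3`), center-crop to 224x224, rescale by 1/255, then normalize. A small sketch constructing the equivalent processor directly from the values above rather than downloading it from the Hub (the placeholder image is only for illustration):

```python
from PIL import Image
from transformers import CLIPImageProcessor

processor = CLIPImageProcessor(
    do_resize=True,
    size={"shortest_edge": 224},
    resample=3,  # PIL bicubic
    do_center_crop=True,
    crop_size={"height": 224, "width": 224},
    do_rescale=True,
    rescale_factor=1 / 255,
    do_normalize=True,
    image_mean=[0.48145466, 0.4578275, 0.40821073],
    image_std=[0.26862954, 0.26130258, 0.27577711],
    do_convert_rgb=True,
)

image = Image.new("RGB", (640, 480))  # placeholder image
inputs = processor(images=image, return_tensors="pt")
print(inputs["pixel_values"].shape)   # torch.Size([1, 3, 224, 224])
```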