from typing import Any, Dict, Optional, Type, Union
from transformers.configuration_utils import PretrainedConfig
__all__ = ["AIMv2VisionConfig", "AIMv2TextConfig", "AIMv2Config"]
class AIMv2VisionConfig(PretrainedConfig):
"""This is the configuration class to store the configuration of an [`AIMv2VisionModel`].
Instantiating a configuration with the defaults will yield a similar configuration
to that of the [apple/aimv2-large-patch14-224-lit](https://huggingface.co/apple/aimv2-large-patch14-224-lit) checkpoint.
Args:
hidden_size: Dimension of the hidden representations.
intermediate_size: Dimension of the SwiGLU representations.
num_hidden_layers: Number of hidden layers in the Transformer.
num_attention_heads: Number of attention heads for each attention layer
in the Transformer.
num_queries: Number of learnable queries for the attention-pooling head.
num_channels: Number of input channels.
image_size: Image size.
patch_size: Patch size.
rms_norm_eps: Epsilon value used for the RMS normalization layer.
attention_dropout: Dropout ratio for attention probabilities.
projection_dropout: Dropout ratio for the projection layer after the attention.
qkv_bias: Whether to add a bias to the queries, keys and values.
use_bias: Whether to add a bias in the feed-forward and projection layers.
kwargs: Keyword arguments for the [`PretrainedConfig`].
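Example (a minimal construction sketch; it uses only the fields defined
by this class, so nothing beyond this module is assumed):
    >>> config = AIMv2VisionConfig(image_size=336)  # override one default
    >>> config.image_size
    336
    >>> config.num_hidden_layers  # unspecified fields keep their defaults
    24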
"""
model_type: str = "aimv2"
base_config_key: str = "vision_config"
def __init__(
self,
hidden_size: int = 1024,
intermediate_size: int = 2816,
num_hidden_layers: int = 24,
num_attention_heads: int = 8,
num_queries: int = 1,
num_channels: int = 3,
image_size: int = 224,
patch_size: int = 14,
rms_norm_eps: float = 1e-5,
attention_dropout: float = 0.0,
projection_dropout: float = 0.0,
qkv_bias: bool = False,
use_bias: bool = False,
**kwargs: Any,
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_queries = num_queries
self.num_channels = num_channels
self.patch_size = patch_size
self.image_size = image_size
self.attention_dropout = attention_dropout
self.rms_norm_eps = rms_norm_eps
self.projection_dropout = projection_dropout
self.qkv_bias = qkv_bias
self.use_bias = use_bias
self.is_causal = False
class AIMv2TextConfig(PretrainedConfig):
"""This is the configuration class to store the configuration of an [`AIMv2TextModel`].
Instantiating a configuration with the defaults will yield a similar configuration
to that of the [apple/aimv2-large-patch14-224-lit](https://huggingface.co/apple/aimv2-large-patch14-224-lit) checkpoint.
Args:
vocab_size: Size of the vocabulary.
hidden_size: Dimension of the hidden representations.
intermediate_size: Dimension of the SwiGLU representations.
num_hidden_layers: Number of hidden layers in the Transformer.
num_attention_heads: Number of attention heads for each attention layer
in the Transformer.
rms_norm_eps: Epsilon value used for the RMS normalization layer.
attention_dropout: Dropout ratio for attention probabilities.
projection_dropout: Dropout ratio for the projection layer after the attention.
qkv_bias: Whether to add a bias to the queries, keys and values.
use_bias: Whether to add a bias in the feed-forward and projection layers.
eos_token_id: End-of-sequence token id.
max_context_length: Maximum number of tokens for the context.
kwargs: Keyword arguments for the [`PretrainedConfig`].
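Example (a minimal construction sketch using only the fields defined here):
    >>> config = AIMv2TextConfig(max_context_length=128)
    >>> config.max_context_length
    128
    >>> config.vocab_size  # unspecified fields keep their defaults
    49408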
"""
model_type: str = "aimv2"
base_config_key: str = "text_config"
def __init__(
self,
vocab_size: int = 49408,
hidden_size: int = 768,
intermediate_size: int = 2048,
num_hidden_layers: int = 12,
num_attention_heads: int = 6,
rms_norm_eps: float = 1e-5,
attention_dropout: float = 0.0,
projection_dropout: float = 0.0,
qkv_bias: bool = False,
use_bias: bool = False,
eos_token_id: int = 49407,
max_context_length: int = 77,
**kwargs: Any,
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.attention_dropout = attention_dropout
self.rms_norm_eps = rms_norm_eps
self.projection_dropout = projection_dropout
self.qkv_bias = qkv_bias
self.use_bias = use_bias
self.vocab_size = vocab_size
self.max_context_length = max_context_length
self.eos_token_id = eos_token_id
self.is_causal = True
class AIMv2Config(PretrainedConfig):
"""This is the configuration class to store the configuration of an [`AIMv2Model`].
Instantiating a configuration with the defaults will yield a similar configuration
to that of the [apple/aimv2-large-patch14-224-lit](https://huggingface.co/apple/aimv2-large-patch14-224-lit) checkpoint.
Args:
vision_config: Configuration of the vision encoder, given as an
[`AIMv2VisionConfig`] or as a dict of its keyword arguments.
Defaults to `AIMv2VisionConfig()` when omitted.
text_config: Configuration of the text encoder, given as an
[`AIMv2TextConfig`] or as a dict of its keyword arguments.
Defaults to `AIMv2TextConfig()` when omitted.
projection_dim: Dimension of the shared image and text projection layers.
init_temperature: Initial temperature of the contrastive logit scale.
max_logit_scale: Maximum value allowed for the logit scale.
kwargs: Keyword arguments for the [`PretrainedConfig`].
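Example (a minimal composition sketch; sub-configs may be passed either
as instances or as plain dicts, mirroring the handling in `__init__`):
    >>> config = AIMv2Config(vision_config={"image_size": 336})
    >>> config.vision_config.image_size
    336
    >>> config.text_config.vocab_size  # text_config defaults to AIMv2TextConfig()
    49408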
"""
model_type: str = "aimv2"
is_composition: bool = True
sub_configs: Dict[str, Type[PretrainedConfig]] = {
"vision_config": AIMv2VisionConfig,
"text_config": AIMv2TextConfig,
}
def __init__(
self,
vision_config: Optional[Union[AIMv2VisionConfig, Dict[str, Any]]] = None,
text_config: Optional[Union[AIMv2TextConfig, Dict[str, Any]]] = None,
projection_dim: int = 768,
init_temperature: float = 0.07,
max_logit_scale: float = 100.0,
**kwargs: Any,
):
super().__init__(**kwargs)
if vision_config is None:
vision_config = AIMv2VisionConfig()
elif isinstance(vision_config, dict):
vision_config = AIMv2VisionConfig(**vision_config)
if text_config is None:
text_config = AIMv2TextConfig()
elif isinstance(text_config, dict):
text_config = AIMv2TextConfig(**text_config)
self.vision_config = vision_config
self.text_config = text_config
self.projection_dim = projection_dim
self.init_temperature = init_temperature
self.max_logit_scale = max_logit_scale