Currently not using the OG (original) trained model, so that loading stays easy and fast, ...

LLM used: Qwen2 0.5B

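As a rough sketch of what these notes imply, the snippet below builds the ImageBind-Huge encoder without fetching the original checkpoint (so it constructs quickly, with random weights) and loads Qwen2 0.5B from Hugging Face. This is illustrative only, not the project's actual loading code: the `imagebind` import path and the use of `transformers` here are assumptions.

```python
# Hypothetical loading sketch (not the project's actual code):
# build the ImageBind-Huge encoder architecture only, and pair it with Qwen2 0.5B.
import torch
from imagebind.models import imagebind_model  # import path may differ between ImageBind versions
from transformers import AutoModelForCausalLM, AutoTokenizer

# pretrained=False -> no checkpoint download, fast to construct for development
imagebind_encoder = imagebind_model.imagebind_huge(pretrained=False)
imagebind_encoder.eval()

# Qwen2 0.5B as the language model backbone
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B")
llm = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B", torch_dtype=torch.bfloat16)

print(imagebind_encoder)  # prints the ImageBindModel subtree shown in the module tree below
```

The module tree printed for the assembled model is shown below.
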
JOSIE(
  (imagebind_encoder): ImageBindModel(
    (modality_preprocessors): ModuleDict(
      (vision): RGBDTPreprocessor(
        (cls_token): tensor((1, 1, 1280), requires_grad=False)
        (rgbt_stem): PatchEmbedGeneric(
          (proj): Sequential(
            (0): PadIm2Video()
            (1): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
          )
        )
        (pos_embedding_helper): SpatioTemporalPosEmbeddingHelper(
          (pos_embed): tensor((1, 257, 1280), requires_grad=False)
        )
      )
      (text): TextPreprocessor(
        (pos_embed): tensor((1, 77, 1024), requires_grad=False)
        (mask): tensor((77, 77), requires_grad=False)
        (token_embedding): Embedding(49408, 1024)
      )
      (audio): AudioPreprocessor(
        (cls_token): tensor((1, 1, 768), requires_grad=False)
        (rgbt_stem): PatchEmbedGeneric(
          (proj): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10), bias=False)
          (norm_layer): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (pos_embedding_helper): SpatioTemporalPosEmbeddingHelper(
          (pos_embed): tensor((1, 229, 768), requires_grad=False)
        )
      )
      (depth): RGBDTPreprocessor(
        (cls_token): tensor((1, 1, 384), requires_grad=False)
        (depth_stem): PatchEmbedGeneric(
          (proj): Conv2d(1, 384, kernel_size=(16, 16), stride=(16, 16), bias=False)
          (norm_layer): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
        )
        (pos_embedding_helper): SpatioTemporalPosEmbeddingHelper(
          (pos_embed): tensor((1, 197, 384), requires_grad=False)
        )
      )
      (thermal): ThermalPreprocessor(
        (cls_token): tensor((1, 1, 768), requires_grad=False)
        (rgbt_stem): PatchEmbedGeneric(
          (proj): Conv2d(1, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
          (norm_layer): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (pos_embedding_helper): SpatioTemporalPosEmbeddingHelper(
          (pos_embed): tensor((1, 197, 768), requires_grad=False)
        )
      )
      (imu): IMUPreprocessor(
        (pos_embed): tensor((1, 251, 512), requires_grad=False)
        (cls_token): tensor((1, 1, 512), requires_grad=False)
        (imu_stem): PatchEmbedGeneric(
          (proj): Linear(in_features=48, out_features=512, bias=False)
          (norm_layer): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
(modality_trunks): ModuleDict( |
|
(vision): SimpleTransformer( |
|
(pre_transformer_layer): Sequential( |
|
(0): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
(1): EinOpsRearrange() |
|
) |
|
(blocks): Sequential( |
|
(0): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1280, out_features=5120, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=5120, out_features=1280, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(1): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1280, out_features=5120, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=5120, out_features=1280, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(2): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1280, out_features=5120, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=5120, out_features=1280, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(3): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1280, out_features=5120, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=5120, out_features=1280, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(4): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1280, out_features=5120, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=5120, out_features=1280, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(5): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1280, out_features=5120, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=5120, out_features=1280, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(6): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1280, out_features=5120, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=5120, out_features=1280, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(7): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1280, out_features=5120, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=5120, out_features=1280, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(8): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1280, out_features=5120, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=5120, out_features=1280, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(9): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1280, out_features=5120, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=5120, out_features=1280, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(10): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1280, out_features=5120, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=5120, out_features=1280, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(11): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1280, out_features=5120, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=5120, out_features=1280, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(12): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1280, out_features=5120, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=5120, out_features=1280, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(13): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1280, out_features=5120, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=5120, out_features=1280, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(14): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1280, out_features=5120, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=5120, out_features=1280, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(15): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1280, out_features=5120, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=5120, out_features=1280, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(16): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1280, out_features=5120, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=5120, out_features=1280, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(17): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1280, out_features=5120, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=5120, out_features=1280, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(18): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1280, out_features=5120, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=5120, out_features=1280, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(19): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1280, out_features=5120, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=5120, out_features=1280, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(20): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1280, out_features=5120, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=5120, out_features=1280, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(21): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1280, out_features=5120, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=5120, out_features=1280, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(22): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1280, out_features=5120, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=5120, out_features=1280, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(23): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1280, out_features=5120, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=5120, out_features=1280, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(24): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1280, out_features=5120, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=5120, out_features=1280, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(25): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1280, out_features=5120, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=5120, out_features=1280, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(26): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1280, out_features=5120, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=5120, out_features=1280, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(27): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1280, out_features=5120, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=5120, out_features=1280, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(28): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1280, out_features=5120, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=5120, out_features=1280, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(29): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1280, out_features=5120, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=5120, out_features=1280, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(30): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1280, out_features=5120, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=5120, out_features=1280, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(31): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1280, out_features=5120, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=5120, out_features=1280, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
) |
|
) |
|
(post_transformer_layer): EinOpsRearrange() |
|
) |
|
(text): SimpleTransformer( |
|
(pre_transformer_layer): Sequential( |
|
(0): Identity() |
|
(1): EinOpsRearrange() |
|
) |
|
(blocks): Sequential( |
|
(0): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(1): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(2): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(3): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(4): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(5): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(6): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(7): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(8): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(9): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(10): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(11): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(12): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(13): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(14): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(15): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(16): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(17): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(18): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(19): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(20): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(21): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(22): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(23): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
) |
|
) |
|
(post_transformer_layer): EinOpsRearrange() |
|
) |
|
(audio): SimpleTransformer( |
|
(pre_transformer_layer): Sequential( |
|
(0): Identity() |
|
(1): EinOpsRearrange() |
|
) |
|
(blocks): Sequential( |
|
(0): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=768, out_features=3072, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=3072, out_features=768, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(1): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) |
|
) |
|
(drop_path): DropPath(drop_prob=0.009) |
|
(norm_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=768, out_features=3072, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=3072, out_features=768, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(2): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) |
|
) |
|
(drop_path): DropPath(drop_prob=0.018) |
|
(norm_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=768, out_features=3072, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=3072, out_features=768, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(3): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) |
|
) |
|
(drop_path): DropPath(drop_prob=0.027) |
|
(norm_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=768, out_features=3072, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=3072, out_features=768, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(4): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) |
|
) |
|
(drop_path): DropPath(drop_prob=0.036) |
|
(norm_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=768, out_features=3072, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=3072, out_features=768, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(5): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) |
|
) |
|
(drop_path): DropPath(drop_prob=0.045) |
|
(norm_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=768, out_features=3072, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=3072, out_features=768, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(6): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) |
|
) |
|
(drop_path): DropPath(drop_prob=0.055) |
|
(norm_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=768, out_features=3072, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=3072, out_features=768, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(7): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) |
|
) |
|
(drop_path): DropPath(drop_prob=0.064) |
|
(norm_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=768, out_features=3072, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=3072, out_features=768, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(8): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) |
|
) |
|
(drop_path): DropPath(drop_prob=0.073) |
|
(norm_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=768, out_features=3072, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=3072, out_features=768, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(9): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) |
|
) |
|
(drop_path): DropPath(drop_prob=0.082) |
|
(norm_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=768, out_features=3072, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=3072, out_features=768, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(10): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) |
|
) |
|
(drop_path): DropPath(drop_prob=0.091) |
|
(norm_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=768, out_features=3072, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=3072, out_features=768, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(11): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) |
|
) |
|
(drop_path): DropPath(drop_prob=0.100) |
|
(norm_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=768, out_features=3072, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=3072, out_features=768, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
) |
|
) |
|
(post_transformer_layer): EinOpsRearrange() |
|
) |
|
(depth): SimpleTransformer( |
|
(pre_transformer_layer): Sequential( |
|
(0): Identity() |
|
(1): EinOpsRearrange() |
|
) |
|
(blocks): Sequential( |
|
(0): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=384, out_features=384, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((384,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=384, out_features=1536, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=1536, out_features=384, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((384,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(1): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=384, out_features=384, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((384,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=384, out_features=1536, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=1536, out_features=384, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((384,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(2): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=384, out_features=384, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((384,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=384, out_features=1536, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=1536, out_features=384, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((384,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(3): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=384, out_features=384, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((384,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=384, out_features=1536, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=1536, out_features=384, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((384,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(4): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=384, out_features=384, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((384,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=384, out_features=1536, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=1536, out_features=384, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((384,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(5): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=384, out_features=384, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((384,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=384, out_features=1536, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=1536, out_features=384, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((384,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(6): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=384, out_features=384, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((384,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=384, out_features=1536, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=1536, out_features=384, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((384,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(7): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=384, out_features=384, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((384,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=384, out_features=1536, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=1536, out_features=384, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((384,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(8): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=384, out_features=384, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((384,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=384, out_features=1536, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=1536, out_features=384, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((384,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(9): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=384, out_features=384, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((384,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=384, out_features=1536, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=1536, out_features=384, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((384,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(10): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=384, out_features=384, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((384,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=384, out_features=1536, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=1536, out_features=384, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((384,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(11): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=384, out_features=384, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((384,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=384, out_features=1536, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=1536, out_features=384, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((384,), eps=1e-06, elementwise_affine=True) |
|
) |
|
) |
|
(post_transformer_layer): EinOpsRearrange() |
|
) |
|
(thermal): SimpleTransformer( |
|
(pre_transformer_layer): Sequential( |
|
(0): Identity() |
|
(1): EinOpsRearrange() |
|
) |
|
(blocks): Sequential( |
|
(0): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=768, out_features=3072, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=3072, out_features=768, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(1): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=768, out_features=3072, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=3072, out_features=768, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(2): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=768, out_features=3072, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=3072, out_features=768, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(3): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=768, out_features=3072, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=3072, out_features=768, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(4): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=768, out_features=3072, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=3072, out_features=768, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(5): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=768, out_features=3072, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=3072, out_features=768, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(6): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=768, out_features=3072, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=3072, out_features=768, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(7): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=768, out_features=3072, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=3072, out_features=768, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(8): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=768, out_features=3072, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=3072, out_features=768, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(9): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=768, out_features=3072, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=3072, out_features=768, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(10): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=768, out_features=3072, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=3072, out_features=768, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(11): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=768, out_features=3072, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=3072, out_features=768, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
) |
|
) |
|
(post_transformer_layer): EinOpsRearrange() |
|
) |
|
(imu): SimpleTransformer( |
|
(pre_transformer_layer): Sequential( |
|
(0): Identity() |
|
(1): EinOpsRearrange() |
|
) |
|
(blocks): Sequential( |
|
(0): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) |
|
) |
|
(drop_path): Identity() |
|
(norm_1): LayerNorm((512,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=512, out_features=2048, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=2048, out_features=512, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((512,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(1): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) |
|
) |
|
(drop_path): DropPath(drop_prob=0.140) |
|
(norm_1): LayerNorm((512,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=512, out_features=2048, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=2048, out_features=512, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((512,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(2): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) |
|
) |
|
(drop_path): DropPath(drop_prob=0.280) |
|
(norm_1): LayerNorm((512,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=512, out_features=2048, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=2048, out_features=512, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((512,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(3): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) |
|
) |
|
(drop_path): DropPath(drop_prob=0.420) |
|
(norm_1): LayerNorm((512,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=512, out_features=2048, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=2048, out_features=512, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((512,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(4): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) |
|
) |
|
(drop_path): DropPath(drop_prob=0.560) |
|
(norm_1): LayerNorm((512,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=512, out_features=2048, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=2048, out_features=512, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((512,), eps=1e-06, elementwise_affine=True) |
|
) |
|
(5): BlockWithMasking( |
|
(attn): MultiheadAttention( |
|
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) |
|
) |
|
(drop_path): DropPath(drop_prob=0.700) |
|
(norm_1): LayerNorm((512,), eps=1e-06, elementwise_affine=True) |
|
(mlp): Mlp( |
|
(fc1): Linear(in_features=512, out_features=2048, bias=True) |
|
(act): GELU(approximate='none') |
|
(fc2): Linear(in_features=2048, out_features=512, bias=True) |
|
(drop): Dropout(p=0.0, inplace=False) |
|
) |
|
(norm_2): LayerNorm((512,), eps=1e-06, elementwise_affine=True) |
|
) |
|
) |
|
(post_transformer_layer): EinOpsRearrange() |
|
) |
|
) |
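One detail worth calling out from the trunks above: the imu trunk uses stochastic depth, with the DropPath probability growing linearly from 0.0 in block 0 to 0.700 in block 5. A minimal sketch of that schedule (the function name is illustrative, not from the JOSIE code):

```python
# Minimal sketch of the linearly increasing stochastic-depth schedule seen in
# the imu trunk above (DropPath drop_prob 0.0 -> 0.700 across 6 blocks).
def drop_path_schedule(num_blocks: int = 6, max_drop: float = 0.7) -> list[float]:
    # block i drops its residual branch with probability max_drop * i / (num_blocks - 1)
    return [max_drop * i / (num_blocks - 1) for i in range(num_blocks)]

print([round(p, 2) for p in drop_path_schedule()])  # [0.0, 0.14, 0.28, 0.42, 0.56, 0.7]
```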
|
(modality_heads): ModuleDict( |
|
(vision): Sequential( |
|
(0): LayerNorm((1280,), eps=1e-06, elementwise_affine=True) |
|
(1): SelectElement() |
|
(2): Linear(in_features=1280, out_features=1024, bias=False) |
|
) |
|
(text): SelectEOSAndProject( |
|
(proj): Sequential( |
|
(0): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) |
|
(1): Linear(in_features=1024, out_features=1024, bias=False) |
|
) |
|
) |
|
(audio): Sequential( |
|
(0): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
(1): SelectElement() |
|
(2): Linear(in_features=768, out_features=1024, bias=False) |
|
) |
|
(depth): Sequential( |
|
(0): LayerNorm((384,), eps=1e-06, elementwise_affine=True) |
|
(1): SelectElement() |
|
(2): Linear(in_features=384, out_features=1024, bias=False) |
|
) |
|
(thermal): Sequential( |
|
(0): LayerNorm((768,), eps=1e-06, elementwise_affine=True) |
|
(1): SelectElement() |
|
(2): Linear(in_features=768, out_features=1024, bias=False) |
|
) |
|
(imu): Sequential( |
|
(0): LayerNorm((512,), eps=1e-06, elementwise_affine=True) |
|
(1): SelectElement() |
|
(2): Dropout(p=0.5, inplace=False) |
|
(3): Linear(in_features=512, out_features=1024, bias=False) |
|
) |
|
) |
|
(modality_postprocessors): ModuleDict( |
|
(vision): Normalize() |
|
(text): Sequential( |
|
(0): Normalize() |
|
(1): LearnableLogitScaling(logit_scale_init=14.285714285714285,learnable=True, max_logit_scale=100) |
|
) |
|
(audio): Sequential( |
|
(0): Normalize() |
|
(1): LearnableLogitScaling(logit_scale_init=20.0,learnable=False, max_logit_scale=100) |
|
) |
|
(depth): Sequential( |
|
(0): Normalize() |
|
(1): LearnableLogitScaling(logit_scale_init=5.0,learnable=False, max_logit_scale=100) |
|
) |
|
(thermal): Sequential( |
|
(0): Normalize() |
|
(1): LearnableLogitScaling(logit_scale_init=10.0,learnable=False, max_logit_scale=100) |
|
) |
|
(imu): Sequential( |
|
(0): Normalize() |
|
(1): LearnableLogitScaling(logit_scale_init=5.0,learnable=False, max_logit_scale=100) |
|
) |
|
) |
|
) |
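That closes the ImageBind side: every modality head maps its trunk output into the same 1024-dimensional embedding space (vision 1280 -> 1024, text 1024 -> 1024, audio and thermal 768 -> 1024, depth 384 -> 1024, imu 512 -> 1024), and each postprocessor L2-normalizes the result and applies a per-modality logit scale. A hedged sketch of what one head + postprocessor pair does, using the audio branch as an example (the function name and argument layout are illustrative, not the actual JOSIE code):

```python
import torch
import torch.nn.functional as F

# Hedged sketch: collapse the audio trunk output into the shared 1024-d space,
# mirroring the audio modality_head and modality_postprocessor printed above.
def embed_audio(trunk_tokens: torch.Tensor,
                norm: torch.nn.LayerNorm,      # LayerNorm((768,))
                proj: torch.nn.Linear,         # Linear(768, 1024, bias=False)
                logit_scale: float = 20.0) -> torch.Tensor:
    cls = norm(trunk_tokens)[:, 0]             # SelectElement(): keep the first (CLS) token
    emb = proj(cls)                            # project 768 -> 1024
    emb = F.normalize(emb, dim=-1)             # Normalize(): unit-length embedding
    return logit_scale * emb                   # LearnableLogitScaling (frozen for audio)
```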
|
(reasoner): Qwen2ForCausalLM( |
|
(model): Qwen2Model( |
|
(embed_tokens): Embedding(151936, 896) |
|
(layers): ModuleList( |
|
(0): Qwen2DecoderLayer( |
|
(self_attn): Qwen2Attention( |
|
(q_proj): Linear(in_features=896, out_features=896, bias=True) |
|
(k_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(v_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(o_proj): Linear(in_features=896, out_features=896, bias=False) |
|
(rotary_emb): Qwen2RotaryEmbedding() |
|
) |
|
(mlp): Qwen2MLP( |
|
(gate_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(up_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(down_proj): Linear(in_features=4864, out_features=896, bias=False) |
|
(act_fn): SiLU() |
|
) |
|
(input_layernorm): Qwen2RMSNorm() |
|
(post_attention_layernorm): Qwen2RMSNorm() |
|
) |
|
(1): Qwen2DecoderLayer( |
|
(self_attn): Qwen2Attention( |
|
(q_proj): Linear(in_features=896, out_features=896, bias=True) |
|
(k_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(v_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(o_proj): Linear(in_features=896, out_features=896, bias=False) |
|
(rotary_emb): Qwen2RotaryEmbedding() |
|
) |
|
(mlp): Qwen2MLP( |
|
(gate_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(up_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(down_proj): Linear(in_features=4864, out_features=896, bias=False) |
|
(act_fn): SiLU() |
|
) |
|
(input_layernorm): Qwen2RMSNorm() |
|
(post_attention_layernorm): Qwen2RMSNorm() |
|
) |
|
(2): Qwen2DecoderLayer( |
|
(self_attn): Qwen2Attention( |
|
(q_proj): Linear(in_features=896, out_features=896, bias=True) |
|
(k_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(v_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(o_proj): Linear(in_features=896, out_features=896, bias=False) |
|
(rotary_emb): Qwen2RotaryEmbedding() |
|
) |
|
(mlp): Qwen2MLP( |
|
(gate_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(up_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(down_proj): Linear(in_features=4864, out_features=896, bias=False) |
|
(act_fn): SiLU() |
|
) |
|
(input_layernorm): Qwen2RMSNorm() |
|
(post_attention_layernorm): Qwen2RMSNorm() |
|
) |
|
(3): Qwen2DecoderLayer( |
|
(self_attn): Qwen2Attention( |
|
(q_proj): Linear(in_features=896, out_features=896, bias=True) |
|
(k_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(v_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(o_proj): Linear(in_features=896, out_features=896, bias=False) |
|
(rotary_emb): Qwen2RotaryEmbedding() |
|
) |
|
(mlp): Qwen2MLP( |
|
(gate_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(up_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(down_proj): Linear(in_features=4864, out_features=896, bias=False) |
|
(act_fn): SiLU() |
|
) |
|
(input_layernorm): Qwen2RMSNorm() |
|
(post_attention_layernorm): Qwen2RMSNorm() |
|
) |
|
(4): Qwen2DecoderLayer( |
|
(self_attn): Qwen2Attention( |
|
(q_proj): Linear(in_features=896, out_features=896, bias=True) |
|
(k_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(v_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(o_proj): Linear(in_features=896, out_features=896, bias=False) |
|
(rotary_emb): Qwen2RotaryEmbedding() |
|
) |
|
(mlp): Qwen2MLP( |
|
(gate_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(up_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(down_proj): Linear(in_features=4864, out_features=896, bias=False) |
|
(act_fn): SiLU() |
|
) |
|
(input_layernorm): Qwen2RMSNorm() |
|
(post_attention_layernorm): Qwen2RMSNorm() |
|
) |
|
(5): Qwen2DecoderLayer( |
|
(self_attn): Qwen2Attention( |
|
(q_proj): Linear(in_features=896, out_features=896, bias=True) |
|
(k_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(v_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(o_proj): Linear(in_features=896, out_features=896, bias=False) |
|
(rotary_emb): Qwen2RotaryEmbedding() |
|
) |
|
(mlp): Qwen2MLP( |
|
(gate_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(up_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(down_proj): Linear(in_features=4864, out_features=896, bias=False) |
|
(act_fn): SiLU() |
|
) |
|
(input_layernorm): Qwen2RMSNorm() |
|
(post_attention_layernorm): Qwen2RMSNorm() |
|
) |
|
(6): Qwen2DecoderLayer( |
|
(self_attn): Qwen2Attention( |
|
(q_proj): Linear(in_features=896, out_features=896, bias=True) |
|
(k_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(v_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(o_proj): Linear(in_features=896, out_features=896, bias=False) |
|
(rotary_emb): Qwen2RotaryEmbedding() |
|
) |
|
(mlp): Qwen2MLP( |
|
(gate_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(up_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(down_proj): Linear(in_features=4864, out_features=896, bias=False) |
|
(act_fn): SiLU() |
|
) |
|
(input_layernorm): Qwen2RMSNorm() |
|
(post_attention_layernorm): Qwen2RMSNorm() |
|
) |
|
(7): Qwen2DecoderLayer( |
|
(self_attn): Qwen2Attention( |
|
(q_proj): Linear(in_features=896, out_features=896, bias=True) |
|
(k_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(v_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(o_proj): Linear(in_features=896, out_features=896, bias=False) |
|
(rotary_emb): Qwen2RotaryEmbedding() |
|
) |
|
(mlp): Qwen2MLP( |
|
(gate_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(up_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(down_proj): Linear(in_features=4864, out_features=896, bias=False) |
|
(act_fn): SiLU() |
|
) |
|
(input_layernorm): Qwen2RMSNorm() |
|
(post_attention_layernorm): Qwen2RMSNorm() |
|
) |
|
(8): Qwen2DecoderLayer( |
|
(self_attn): Qwen2Attention( |
|
(q_proj): Linear(in_features=896, out_features=896, bias=True) |
|
(k_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(v_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(o_proj): Linear(in_features=896, out_features=896, bias=False) |
|
(rotary_emb): Qwen2RotaryEmbedding() |
|
) |
|
(mlp): Qwen2MLP( |
|
(gate_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(up_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(down_proj): Linear(in_features=4864, out_features=896, bias=False) |
|
(act_fn): SiLU() |
|
) |
|
(input_layernorm): Qwen2RMSNorm() |
|
(post_attention_layernorm): Qwen2RMSNorm() |
|
) |
|
(9): Qwen2DecoderLayer( |
|
(self_attn): Qwen2Attention( |
|
(q_proj): Linear(in_features=896, out_features=896, bias=True) |
|
(k_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(v_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(o_proj): Linear(in_features=896, out_features=896, bias=False) |
|
(rotary_emb): Qwen2RotaryEmbedding() |
|
) |
|
(mlp): Qwen2MLP( |
|
(gate_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(up_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(down_proj): Linear(in_features=4864, out_features=896, bias=False) |
|
(act_fn): SiLU() |
|
) |
|
(input_layernorm): Qwen2RMSNorm() |
|
(post_attention_layernorm): Qwen2RMSNorm() |
|
) |
|
(10): Qwen2DecoderLayer( |
|
(self_attn): Qwen2Attention( |
|
(q_proj): Linear(in_features=896, out_features=896, bias=True) |
|
(k_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(v_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(o_proj): Linear(in_features=896, out_features=896, bias=False) |
|
(rotary_emb): Qwen2RotaryEmbedding() |
|
) |
|
(mlp): Qwen2MLP( |
|
(gate_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(up_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(down_proj): Linear(in_features=4864, out_features=896, bias=False) |
|
(act_fn): SiLU() |
|
) |
|
(input_layernorm): Qwen2RMSNorm() |
|
(post_attention_layernorm): Qwen2RMSNorm() |
|
) |
|
(11): Qwen2DecoderLayer( |
|
(self_attn): Qwen2Attention( |
|
(q_proj): Linear(in_features=896, out_features=896, bias=True) |
|
(k_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(v_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(o_proj): Linear(in_features=896, out_features=896, bias=False) |
|
(rotary_emb): Qwen2RotaryEmbedding() |
|
) |
|
(mlp): Qwen2MLP( |
|
(gate_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(up_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(down_proj): Linear(in_features=4864, out_features=896, bias=False) |
|
(act_fn): SiLU() |
|
) |
|
(input_layernorm): Qwen2RMSNorm() |
|
(post_attention_layernorm): Qwen2RMSNorm() |
|
) |
|
(12): Qwen2DecoderLayer( |
|
(self_attn): Qwen2Attention( |
|
(q_proj): Linear(in_features=896, out_features=896, bias=True) |
|
(k_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(v_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(o_proj): Linear(in_features=896, out_features=896, bias=False) |
|
(rotary_emb): Qwen2RotaryEmbedding() |
|
) |
|
(mlp): Qwen2MLP( |
|
(gate_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(up_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(down_proj): Linear(in_features=4864, out_features=896, bias=False) |
|
(act_fn): SiLU() |
|
) |
|
(input_layernorm): Qwen2RMSNorm() |
|
(post_attention_layernorm): Qwen2RMSNorm() |
|
) |
|
(13): Qwen2DecoderLayer( |
|
(self_attn): Qwen2Attention( |
|
(q_proj): Linear(in_features=896, out_features=896, bias=True) |
|
(k_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(v_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(o_proj): Linear(in_features=896, out_features=896, bias=False) |
|
(rotary_emb): Qwen2RotaryEmbedding() |
|
) |
|
(mlp): Qwen2MLP( |
|
(gate_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(up_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(down_proj): Linear(in_features=4864, out_features=896, bias=False) |
|
(act_fn): SiLU() |
|
) |
|
(input_layernorm): Qwen2RMSNorm() |
|
(post_attention_layernorm): Qwen2RMSNorm() |
|
) |
|
(14): Qwen2DecoderLayer( |
|
(self_attn): Qwen2Attention( |
|
(q_proj): Linear(in_features=896, out_features=896, bias=True) |
|
(k_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(v_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(o_proj): Linear(in_features=896, out_features=896, bias=False) |
|
(rotary_emb): Qwen2RotaryEmbedding() |
|
) |
|
(mlp): Qwen2MLP( |
|
(gate_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(up_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(down_proj): Linear(in_features=4864, out_features=896, bias=False) |
|
(act_fn): SiLU() |
|
) |
|
(input_layernorm): Qwen2RMSNorm() |
|
(post_attention_layernorm): Qwen2RMSNorm() |
|
) |
|
(15): Qwen2DecoderLayer( |
|
(self_attn): Qwen2Attention( |
|
(q_proj): Linear(in_features=896, out_features=896, bias=True) |
|
(k_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(v_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(o_proj): Linear(in_features=896, out_features=896, bias=False) |
|
(rotary_emb): Qwen2RotaryEmbedding() |
|
) |
|
(mlp): Qwen2MLP( |
|
(gate_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(up_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(down_proj): Linear(in_features=4864, out_features=896, bias=False) |
|
(act_fn): SiLU() |
|
) |
|
(input_layernorm): Qwen2RMSNorm() |
|
(post_attention_layernorm): Qwen2RMSNorm() |
|
) |
|
(16): Qwen2DecoderLayer( |
|
(self_attn): Qwen2Attention( |
|
(q_proj): Linear(in_features=896, out_features=896, bias=True) |
|
(k_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(v_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(o_proj): Linear(in_features=896, out_features=896, bias=False) |
|
(rotary_emb): Qwen2RotaryEmbedding() |
|
) |
|
(mlp): Qwen2MLP( |
|
(gate_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(up_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(down_proj): Linear(in_features=4864, out_features=896, bias=False) |
|
(act_fn): SiLU() |
|
) |
|
(input_layernorm): Qwen2RMSNorm() |
|
(post_attention_layernorm): Qwen2RMSNorm() |
|
) |
|
(17): Qwen2DecoderLayer( |
|
(self_attn): Qwen2Attention( |
|
(q_proj): Linear(in_features=896, out_features=896, bias=True) |
|
(k_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(v_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(o_proj): Linear(in_features=896, out_features=896, bias=False) |
|
(rotary_emb): Qwen2RotaryEmbedding() |
|
) |
|
(mlp): Qwen2MLP( |
|
(gate_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(up_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(down_proj): Linear(in_features=4864, out_features=896, bias=False) |
|
(act_fn): SiLU() |
|
) |
|
(input_layernorm): Qwen2RMSNorm() |
|
(post_attention_layernorm): Qwen2RMSNorm() |
|
) |
|
(18): Qwen2DecoderLayer( |
|
(self_attn): Qwen2Attention( |
|
(q_proj): Linear(in_features=896, out_features=896, bias=True) |
|
(k_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(v_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(o_proj): Linear(in_features=896, out_features=896, bias=False) |
|
(rotary_emb): Qwen2RotaryEmbedding() |
|
) |
|
(mlp): Qwen2MLP( |
|
(gate_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(up_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(down_proj): Linear(in_features=4864, out_features=896, bias=False) |
|
(act_fn): SiLU() |
|
) |
|
(input_layernorm): Qwen2RMSNorm() |
|
(post_attention_layernorm): Qwen2RMSNorm() |
|
) |
|
(19): Qwen2DecoderLayer( |
|
(self_attn): Qwen2Attention( |
|
(q_proj): Linear(in_features=896, out_features=896, bias=True) |
|
(k_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(v_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(o_proj): Linear(in_features=896, out_features=896, bias=False) |
|
(rotary_emb): Qwen2RotaryEmbedding() |
|
) |
|
(mlp): Qwen2MLP( |
|
(gate_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(up_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(down_proj): Linear(in_features=4864, out_features=896, bias=False) |
|
(act_fn): SiLU() |
|
) |
|
(input_layernorm): Qwen2RMSNorm() |
|
(post_attention_layernorm): Qwen2RMSNorm() |
|
) |
|
(20): Qwen2DecoderLayer( |
|
(self_attn): Qwen2Attention( |
|
(q_proj): Linear(in_features=896, out_features=896, bias=True) |
|
(k_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(v_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(o_proj): Linear(in_features=896, out_features=896, bias=False) |
|
(rotary_emb): Qwen2RotaryEmbedding() |
|
) |
|
(mlp): Qwen2MLP( |
|
(gate_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(up_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(down_proj): Linear(in_features=4864, out_features=896, bias=False) |
|
(act_fn): SiLU() |
|
) |
|
(input_layernorm): Qwen2RMSNorm() |
|
(post_attention_layernorm): Qwen2RMSNorm() |
|
) |
|
(21): Qwen2DecoderLayer( |
|
(self_attn): Qwen2Attention( |
|
(q_proj): Linear(in_features=896, out_features=896, bias=True) |
|
(k_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(v_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(o_proj): Linear(in_features=896, out_features=896, bias=False) |
|
(rotary_emb): Qwen2RotaryEmbedding() |
|
) |
|
(mlp): Qwen2MLP( |
|
(gate_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(up_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(down_proj): Linear(in_features=4864, out_features=896, bias=False) |
|
(act_fn): SiLU() |
|
) |
|
(input_layernorm): Qwen2RMSNorm() |
|
(post_attention_layernorm): Qwen2RMSNorm() |
|
) |
|
(22): Qwen2DecoderLayer( |
|
(self_attn): Qwen2Attention( |
|
(q_proj): Linear(in_features=896, out_features=896, bias=True) |
|
(k_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(v_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(o_proj): Linear(in_features=896, out_features=896, bias=False) |
|
(rotary_emb): Qwen2RotaryEmbedding() |
|
) |
|
(mlp): Qwen2MLP( |
|
(gate_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(up_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(down_proj): Linear(in_features=4864, out_features=896, bias=False) |
|
(act_fn): SiLU() |
|
) |
|
(input_layernorm): Qwen2RMSNorm() |
|
(post_attention_layernorm): Qwen2RMSNorm() |
|
) |
|
(23): Qwen2DecoderLayer( |
|
(self_attn): Qwen2Attention( |
|
(q_proj): Linear(in_features=896, out_features=896, bias=True) |
|
(k_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(v_proj): Linear(in_features=896, out_features=128, bias=True) |
|
(o_proj): Linear(in_features=896, out_features=896, bias=False) |
|
(rotary_emb): Qwen2RotaryEmbedding() |
|
) |
|
(mlp): Qwen2MLP( |
|
(gate_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(up_proj): Linear(in_features=896, out_features=4864, bias=False) |
|
(down_proj): Linear(in_features=4864, out_features=896, bias=False) |
|
(act_fn): SiLU() |
|
) |
|
(input_layernorm): Qwen2RMSNorm() |
|
(post_attention_layernorm): Qwen2RMSNorm() |
|
) |
|
) |
|
(norm): Qwen2RMSNorm() |
|
) |
|
(lm_head): Linear(in_features=896, out_features=151936, bias=False) |
|
) |
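The reasoner is the stock Qwen2-0.5B stack: hidden size 896, 24 decoder layers, grouped-query attention (q_proj keeps 896 = 14 heads of 64, while k_proj/v_proj output 128 = 2 KV heads of 64), a SwiGLU-style MLP with intermediate size 4864, RMSNorm, and a 151936-token vocabulary. To sanity-check those shapes against the standalone backbone, a minimal sketch with transformers (the exact checkpoint id is an assumption):

```python
from transformers import AutoConfig

# Minimal sketch: confirm the Qwen2-0.5B shapes seen in the module tree above.
cfg = AutoConfig.from_pretrained("Qwen/Qwen2-0.5B")  # assumed checkpoint id
print(cfg.hidden_size,           # 896
      cfg.num_hidden_layers,     # 24
      cfg.num_attention_heads,   # 14
      cfg.num_key_value_heads,   # 2
      cfg.intermediate_size,     # 4864
      cfg.vocab_size)            # 151936
```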
|
(input_projetor): Linear(in_features=1024, out_features=896, bias=True) |
|
) |
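The bridge between the two halves is the final input_projetor, a single Linear(1024, 896) that maps ImageBind's shared embedding space into the Qwen2 hidden size. Below is a hedged sketch of the end-to-end flow this module tree implies; the calling convention (one projected token per modality, prepended to the text embeddings) is an assumption, and multimodal_forward is an illustrative helper, not the actual JOSIE API:

```python
import torch

def multimodal_forward(josie, input_ids: torch.Tensor, modality_inputs: dict):
    # 1) ImageBind encoder -> one 1024-d embedding per provided modality
    mm_embeds = josie.imagebind_encoder(modality_inputs)            # {name: (B, 1024)}
    # 2) Project each embedding into the Qwen2 hidden size (1024 -> 896)
    mm_tokens = [josie.input_projetor(e).unsqueeze(1) for e in mm_embeds.values()]
    # 3) Embed the text prompt and prepend the projected modality tokens
    txt_tokens = josie.reasoner.model.embed_tokens(input_ids)       # (B, T, 896)
    inputs_embeds = torch.cat(mm_tokens + [txt_tokens], dim=1)
    # 4) Let the Qwen2 reasoner attend over the fused sequence
    return josie.reasoner(inputs_embeds=inputs_embeds)
```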