JOSIE(
  (encoder): Encoder(
    (modality_preprocessors): ModuleDict(
      (vision): RGBDTPreprocessor(
        (cls_token): tensor((1, 1, 768), requires_grad=False)
        (rgbt_stem): PatchEmbedGeneric(
          (proj): Sequential(
            (0): PadIm2Video()
            (1): Conv3d(3, 768, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
          )
        )
        (pos_embedding_helper): SpatioTemporalPosEmbeddingHelper(
          (pos_embed): tensor((1, 7681, 768), requires_grad=False)
        )
      )
      (audio): AudioPreprocessor(
        (cls_token): tensor((1, 1, 768), requires_grad=False)
        (rgbt_stem): PatchEmbedGeneric(
          (proj): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10), bias=False)
          (norm_layer): RMSNorm()
        )
        (pos_embedding_helper): SpatioTemporalPosEmbeddingHelper(
          (pos_embed): tensor((1, 229, 768), requires_grad=False)
        )
      )
      (depth): RGBDTPreprocessor(
        (cls_token): tensor((1, 1, 384), requires_grad=False)
        (depth_stem): PatchEmbedGeneric(
          (proj): Conv2d(1, 384, kernel_size=(16, 16), stride=(16, 16), bias=False)
          (norm_layer): RMSNorm()
        )
        (pos_embedding_helper): SpatioTemporalPosEmbeddingHelper(
          (pos_embed): tensor((1, 197, 384), requires_grad=False)
        )
      )
      (thermal): ThermalPreprocessor(
        (cls_token): tensor((1, 1, 768), requires_grad=False)
        (rgbt_stem): PatchEmbedGeneric(
          (proj): Conv2d(1, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
          (norm_layer): RMSNorm()
        )
        (pos_embedding_helper): SpatioTemporalPosEmbeddingHelper(
          (pos_embed): tensor((1, 197, 768), requires_grad=False)
        )
      )
    )
    (modality_transformers): ModuleDict(
      (vision): EncoderTransformer(
        (pre_transformer_layer): Sequential(
          (0): RMSNorm()
          (1): EinOpsRearrange()
        )
        (post_transformer_layer): EinOpsRearrange()
        (blocks): ModuleList(
          (0-11): 12 x EncoderTransformerBlock(
            (attn): MultiheadAttention(
              (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
            )
            (drop_path): Identity()
            (norm1): RMSNorm()
            (norm2): RMSNorm()
            (mlp): MLP(
              (w1): Linear(in_features=768, out_features=512, bias=False)
              (w2): Linear(in_features=512, out_features=768, bias=False)
              (w3): Linear(in_features=768, out_features=512, bias=False)
            )
          )
        )
      )
      (audio): EncoderTransformer(
        (pre_transformer_layer): Sequential(
          (0): RMSNorm()
          (1): EinOpsRearrange()
        )
        (post_transformer_layer): EinOpsRearrange()
        (blocks): ModuleList(
          (0): EncoderTransformerBlock(
            (attn): MultiheadAttention(
              (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
            )
            (drop_path): Identity()
            (norm1): RMSNorm()
            (norm2): RMSNorm()
            (mlp): MLP(
              (w1): Linear(in_features=768, out_features=512, bias=False)
              (w2): Linear(in_features=512, out_features=768, bias=False)
              (w3): Linear(in_features=768, out_features=512, bias=False)
            )
          )
          (1): EncoderTransformerBlock(
            (attn): MultiheadAttention(
              (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
            )
            (drop_path): DropPath(drop_prob=0.009)
            (norm1): RMSNorm()
            (norm2): RMSNorm()
            (mlp): MLP(
              (w1): Linear(in_features=768, out_features=512, bias=False)
              (w2): Linear(in_features=512, out_features=768, bias=False)
              (w3): Linear(in_features=768, out_features=512, bias=False)
            )
          )
          (2): EncoderTransformerBlock(
            (attn): MultiheadAttention(
              (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
            )
            (drop_path): DropPath(drop_prob=0.018)
            (norm1): RMSNorm()
            (norm2): RMSNorm()
            (mlp): MLP(
              (w1): Linear(in_features=768, out_features=512, bias=False)
              (w2): Linear(in_features=512, out_features=768, bias=False)
              (w3): Linear(in_features=768, out_features=512, bias=False)
            )
          )
          (3): EncoderTransformerBlock(
            (attn): MultiheadAttention(
              (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
            )
            (drop_path): DropPath(drop_prob=0.027)
            (norm1): RMSNorm()
            (norm2): RMSNorm()
            (mlp): MLP(
              (w1): Linear(in_features=768, out_features=512, bias=False)
              (w2): Linear(in_features=512, out_features=768, bias=False)
              (w3): Linear(in_features=768, out_features=512, bias=False)
            )
          )
          (4): EncoderTransformerBlock(
            (attn): MultiheadAttention(
              (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
            )
            (drop_path): DropPath(drop_prob=0.036)
            (norm1): RMSNorm()
            (norm2): RMSNorm()
            (mlp): MLP(
              (w1): Linear(in_features=768, out_features=512, bias=False)
              (w2): Linear(in_features=512, out_features=768, bias=False)
              (w3): Linear(in_features=768, out_features=512, bias=False)
            )
          )
          (5): EncoderTransformerBlock(
            (attn): MultiheadAttention(
              (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
            )
            (drop_path): DropPath(drop_prob=0.045)
            (norm1): RMSNorm()
            (norm2): RMSNorm()
            (mlp): MLP(
              (w1): Linear(in_features=768, out_features=512, bias=False)
              (w2): Linear(in_features=512, out_features=768, bias=False)
              (w3): Linear(in_features=768, out_features=512, bias=False)
            )
          )
          (6): EncoderTransformerBlock(
            (attn): MultiheadAttention(
              (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
            )
            (drop_path): DropPath(drop_prob=0.055)
            (norm1): RMSNorm()
            (norm2): RMSNorm()
            (mlp): MLP(
              (w1): Linear(in_features=768, out_features=512, bias=False)
              (w2): Linear(in_features=512, out_features=768, bias=False)
              (w3): Linear(in_features=768, out_features=512, bias=False)
            )
          )
          (7): EncoderTransformerBlock(
            (attn): MultiheadAttention(
              (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
            )
            (drop_path): DropPath(drop_prob=0.064)
            (norm1): RMSNorm()
            (norm2): RMSNorm()
            (mlp): MLP(
              (w1): Linear(in_features=768, out_features=512, bias=False)
              (w2): Linear(in_features=512, out_features=768, bias=False)
              (w3): Linear(in_features=768, out_features=512, bias=False)
            )
          )
          (8): EncoderTransformerBlock(
            (attn): MultiheadAttention(
              (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
            )
            (drop_path): DropPath(drop_prob=0.073)
            (norm1): RMSNorm()
            (norm2): RMSNorm()
            (mlp): MLP(
              (w1): Linear(in_features=768, out_features=512, bias=False)
              (w2): Linear(in_features=512, out_features=768, bias=False)
              (w3): Linear(in_features=768, out_features=512, bias=False)
            )
          )
          (9): EncoderTransformerBlock(
            (attn): MultiheadAttention(
              (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
            )
            (drop_path): DropPath(drop_prob=0.082)
            (norm1): RMSNorm()
            (norm2): RMSNorm()
            (mlp): MLP(
              (w1): Linear(in_features=768, out_features=512, bias=False)
              (w2): Linear(in_features=512, out_features=768, bias=False)
              (w3): Linear(in_features=768, out_features=512, bias=False)
            )
          )
          (10): EncoderTransformerBlock(
            (attn): MultiheadAttention(
              (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
            )
            (drop_path): DropPath(drop_prob=0.091)
            (norm1): RMSNorm()
            (norm2): RMSNorm()
            (mlp): MLP(
              (w1): Linear(in_features=768, out_features=512, bias=False)
              (w2): Linear(in_features=512, out_features=768, bias=False)
              (w3): Linear(in_features=768, out_features=512, bias=False)
            )
          )
          (11): EncoderTransformerBlock(
            (attn): MultiheadAttention(
              (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
            )
            (drop_path): DropPath(drop_prob=0.100)
            (norm1): RMSNorm()
            (norm2): RMSNorm()
            (mlp): MLP(
              (w1): Linear(in_features=768, out_features=512, bias=False)
              (w2): Linear(in_features=512, out_features=768, bias=False)
              (w3): Linear(in_features=768, out_features=512, bias=False)
            )
          )
        )
      )
      (depth): EncoderTransformer(
        (pre_transformer_layer): Sequential(
          (0): RMSNorm()
          (1): EinOpsRearrange()
        )
        (post_transformer_layer): EinOpsRearrange()
        (blocks): ModuleList(
          (0-5): 6 x EncoderTransformerBlock(
            (attn): MultiheadAttention(
              (out_proj): NonDynamicallyQuantizableLinear(in_features=384, out_features=384, bias=True)
            )
            (drop_path): Identity()
            (norm1): RMSNorm()
            (norm2): RMSNorm()
            (mlp): MLP(
              (w1): Linear(in_features=384, out_features=256, bias=False)
              (w2): Linear(in_features=256, out_features=384, bias=False)
              (w3): Linear(in_features=384, out_features=256, bias=False)
            )
          )
        )
      )
      (thermal): EncoderTransformer(
        (pre_transformer_layer): Sequential(
          (0): RMSNorm()
          (1): EinOpsRearrange()
        )
        (post_transformer_layer): EinOpsRearrange()
        (blocks): ModuleList(
          (0-5): 6 x EncoderTransformerBlock(
            (attn): MultiheadAttention(
              (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
            )
            (drop_path): Identity()
            (norm1): RMSNorm()
            (norm2): RMSNorm()
            (mlp): MLP(
              (w1): Linear(in_features=768, out_features=512, bias=False)
              (w2): Linear(in_features=512, out_features=768, bias=False)
              (w3): Linear(in_features=768, out_features=512, bias=False)
            )
          )
        )
      )
    )
    (modality_heads): ModuleDict(
      (vision): Sequential(
        (0): RMSNorm()
        (1): SelectElement()
        (2): Linear(in_features=768, out_features=1024, bias=False)
      )
      (audio): Sequential(
        (0): RMSNorm()
        (1): SelectElement()
        (2): Linear(in_features=768, out_features=1024, bias=False)
      )
      (depth): Sequential(
        (0): RMSNorm()
        (1): SelectElement()
        (2): Linear(in_features=384, out_features=1024, bias=False)
      )
      (thermal): Sequential(
        (0): RMSNorm()
        (1): SelectElement()
        (2): Linear(in_features=768, out_features=1024, bias=False)
      )
    )
  )
  (reasoner): Qwen2ForCausalLM(
    (model): Qwen2Model(
      (embed_tokens): Embedding(151936, 896)
      (layers): ModuleList(
        (0-23): 24 x Qwen2DecoderLayer(
          (self_attn): Qwen2Attention(
            (q_proj): Linear(in_features=896, out_features=896, bias=True)
            (k_proj): Linear(in_features=896, out_features=128, bias=True)
            (v_proj): Linear(in_features=896, out_features=128, bias=True)
            (o_proj): Linear(in_features=896, out_features=896, bias=False)
            (rotary_emb): Qwen2RotaryEmbedding()
          )
          (mlp): Qwen2MLP(
            (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
            (up_proj): Linear(in_features=896, out_features=4864, bias=False)
            (down_proj): Linear(in_features=4864, out_features=896, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Qwen2RMSNorm()
          (post_attention_layernorm): Qwen2RMSNorm()
        )
      )
      (norm): Qwen2RMSNorm()
    )
    (lm_head): Linear(in_features=896, out_features=151936, bias=False)
  )
  (input_projetor): Linear(in_features=1024, out_features=896, bias=True)
)
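Reading off the dump: every modality head ends in a Linear that maps its selected class-token feature into a shared 1024-dimensional embedding space, the `input_projetor` module carries that 1024-dimensional embedding into the reasoner's 896-dimensional hidden size, and the Qwen2 decoder (24 layers, hidden size 896, vocabulary 151936) consumes the result next to ordinary text embeddings. The snippet below is a minimal sketch of that dimension plumbing only; the stand-in head, the example sequence lengths, and the prefix-token wiring into the reasoner are assumptions made for illustration, not the project's actual forward pass.

import torch
import torch.nn as nn

# Dimensions taken from the printed architecture.
EMBED_DIM = 1024      # output width of every modality head
REASONER_DIM = 896    # Qwen2 hidden size
VOCAB_SIZE = 151936   # Qwen2 vocabulary size

class DummyModalityHead(nn.Module):
    """Hypothetical stand-in for one modality branch (the real head is RMSNorm -> SelectElement -> Linear)."""
    def __init__(self, width: int):
        super().__init__()
        self.proj = nn.Linear(width, EMBED_DIM, bias=False)

    def forward(self, tokens: torch.Tensor) -> torch.Tensor:
        # tokens: (batch, seq, width); keep only the class token at index 0, as SelectElement does.
        return self.proj(tokens[:, 0])

# Assumed wiring: modality embedding -> input_projetor -> prepended to the text embeddings.
input_projetor = nn.Linear(EMBED_DIM, REASONER_DIM, bias=True)
text_embed = nn.Embedding(VOCAB_SIZE, REASONER_DIM)

vision_head = DummyModalityHead(768)
audio_head = DummyModalityHead(768)

batch = 2
vision_tokens = torch.randn(batch, 7681, 768)   # length matches the vision pos_embed
audio_tokens = torch.randn(batch, 229, 768)     # length matches the audio pos_embed
text_ids = torch.randint(0, VOCAB_SIZE, (batch, 16))

modality_embeds = torch.stack(
    [vision_head(vision_tokens), audio_head(audio_tokens)], dim=1
)                                                # (batch, 2, 1024)
prefix = input_projetor(modality_embeds)         # (batch, 2, 896)
inputs_embeds = torch.cat([prefix, text_embed(text_ids)], dim=1)
print(inputs_embeds.shape)                       # torch.Size([2, 18, 896])

# In practice the combined sequence would be handed to the Qwen2 reasoner, e.g. via
# reasoner(inputs_embeds=inputs_embeds, attention_mask=...) in Hugging Face Transformers.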