Josiev47(
  (encoder): EncoderModel(
    (modality_preprocessors): ModuleDict(
      (vision): RGBDTPreprocessor(
        (rgbt_stem): PatchEmbedGeneric(
          (proj): Sequential(
            (0): make_image_to_video()
            (1): Conv3d(3, {llm_in_embedding}, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
          )
          (norm_layer): RMSNorm()
        )
        (pos_embedding_helper): SpatioTemporalPosEmbeddingHelper()
      )
      (audio): AudioPreprocessor(
        (rgbt_stem): PatchEmbedGeneric(
          (proj): Conv2d(1, {llm_in_embedding}, kernel_size=(16, 16), stride=(10, 10), bias=False)
          (norm_layer): RMSNorm()
        )
        (pos_embedding_helper): SpatioTemporalPosEmbeddingHelper()
      )
    )
    (modality_transformers): ModuleDict(
      (vision): EncoderTransformer(
        (pre_transformer_layer_norm): RMSNorm()
        (layers): ModuleList(
          (0 - n): EncoderTransformerLayer(
            (attn): EncoderTransformerAttention(
              (qkv): Linear(in_features={llm_in_embedding}, out_features=3072, bias=True)
              (attn_drop): Dropout(p=0.0, inplace=False)
              (proj): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=True)
              (proj_drop): Dropout(p=0.0, inplace=False)
            )
            (drop_path): Identity()
            (norm_1): RMSNorm()
            (mlp): EncoderMLP(
              (fc1): Linear(in_features={llm_in_embedding}, out_features=4096, bias=True)
              (fc2): Linear(in_features=4096, out_features={llm_in_embedding}, bias=True)
              (drop): Dropout(p=0.0, inplace=False)
              (act): SiLU()
            )
            (norm_2): RMSNorm()
          )
        )
        (head): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=False)
        (post_transformer_layer_norm): RMSNorm()
      )
      (audio): EncoderTransformer(
        (pre_transformer_layer_norm): RMSNorm()
        (layers): ModuleList(
          (0 - n): EncoderTransformerLayer(
            (attn): EncoderTransformerAttention(
              (qkv): Linear(in_features={llm_in_embedding}, out_features=3072, bias=True)
              (attn_drop): Dropout(p=0.0, inplace=False)
              (proj): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=True)
              (proj_drop): Dropout(p=0.0, inplace=False)
            )
            (drop_path): DropPath()
            (norm_1): RMSNorm()
            (mlp): EncoderMLP(
              (fc1): Linear(in_features={llm_in_embedding}, out_features=4096, bias=True)
              (fc2): Linear(in_features=4096, out_features={llm_in_embedding}, bias=True)
              (drop): Dropout(p=0.0, inplace=False)
              (act): SiLU()
            )
            (norm_2): RMSNorm()
          )
        )
        (head): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=False)
        (post_transformer_layer_norm): RMSNorm()
      )
    )
  )
  (llm): LlamaForCausalLM(
    (model): LlamaModel(
      (embed_tokens): Embedding({vocab_size}, {llm_in_embedding}, {padding_idx})
      (layers): ModuleList(
        (0 - n): LlamaDecoderLayer(
          (self_attn): LlamaAttention(
            (q_proj): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=False)
            (k_proj): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=False)
            (v_proj): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=False)
            (o_proj): Linear(in_features={llm_in_embedding}, out_features={llm_in_embedding}, bias=False)
            (rotary_emb): LlamaRotaryEmbedding()
          )
          (mlp): LlamaMLP(
            (gate_proj): Linear(in_features={llm_in_embedding}, out_features=4096, bias=False)
            (down_proj): Linear(in_features=4096, out_features={llm_in_embedding}, bias=False)
            (up_proj): Linear(in_features={llm_in_embedding}, out_features=4096, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): RMSNorm()
          (post_attention_layernorm): RMSNorm()
        )
      )
      (norm): RMSNorm()
    )
    (lm_head): Linear(in_features={llm_in_embedding}, out_features={vocab_size}, bias=False)
  )
  (input_embeddings): Embedding({vocab_size}, {llm_in_embedding}, {padding_idx})
)
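
Read as a template, the tree implies a late-fusion wiring: each modality branch ends in a head projecting into {llm_in_embedding}, so its outputs live in the same space as the LLM's token embeddings and can be concatenated with them before the decoder. Below is a minimal, hypothetical sketch of that wiring using Hugging Face transformers. It is not the repo's code: the tiny config values, the stand-in vision branch, and the fusion order (vision tokens prepended to text) are all illustrative assumptions.

# Hypothetical sketch of the encoder -> LLM fusion implied by the tree above.
import torch
from torch import nn
from transformers import LlamaConfig, LlamaForCausalLM

llm_in_embedding = 64   # placeholder for the real {llm_in_embedding}
vocab_size = 1000       # placeholder for the real {vocab_size}

# Tiny randomly initialised LLaMA standing in for the real checkpoint.
llm = LlamaForCausalLM(LlamaConfig(
    hidden_size=llm_in_embedding,
    intermediate_size=4 * llm_in_embedding,
    num_hidden_layers=2,
    num_attention_heads=4,
    vocab_size=vocab_size,
))

# Stand-in for one modality branch (preprocessor + transformer + head);
# the real branch is the EncoderTransformer shown above, but any module
# mapping patch features to the LLM embedding width fits this slot.
vision_encoder = nn.Sequential(
    nn.Linear(llm_in_embedding, llm_in_embedding),
    nn.SiLU(),
    nn.Linear(llm_in_embedding, llm_in_embedding),
)

text_ids = torch.randint(0, vocab_size, (1, 8))          # fake token ids
vision_tokens = torch.randn(1, 16, llm_in_embedding)     # fake patch features

# Embed text, project vision, concatenate, and run the decoder on
# inputs_embeds instead of input_ids.
text_embeds = llm.get_input_embeddings()(text_ids)       # (1, 8, 64)
vision_embeds = vision_encoder(vision_tokens)            # (1, 16, 64)
inputs_embeds = torch.cat([vision_embeds, text_embeds], dim=1)
logits = llm(inputs_embeds=inputs_embeds).logits         # (1, 24, vocab_size)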