Isaak Carter Augustus committed
Commit 22daade
1 Parent(s): a8c7fe9

Delete model_architecture.txt

Files changed (1)
  1. model_architecture.txt +0 -1977
model_architecture.txt DELETED
@@ -1,1977 +0,0 @@
Currently not using the OG trained model, for easy and fast loading, ...
Used LLM: Qwen2 0.5B

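For reference, a minimal sketch of loading the Qwen2 0.5B LLM with Hugging Face transformers; the stock "Qwen/Qwen2-0.5B" checkpoint is an assumption, and the project may load a fine-tuned or local variant instead:

from transformers import AutoModelForCausalLM, AutoTokenizer

# Assumption: the stock Qwen2-0.5B checkpoint; JOSIE may swap in a
# fine-tuned or locally saved variant instead.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B")
llm = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B")

print(sum(p.numel() for p in llm.parameters()))  # roughly 0.5B parameters
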
JOSIE(
  (imagebind_encoder): ImageBindModel(
    (modality_preprocessors): ModuleDict(
      (vision): RGBDTPreprocessor(
        (cls_token): tensor((1, 1, 1280), requires_grad=False)
        (rgbt_stem): PatchEmbedGeneric(
          (proj): Sequential(
            (0): PadIm2Video()
            (1): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
          )
        )
        (pos_embedding_helper): SpatioTemporalPosEmbeddingHelper(
          (pos_embed): tensor((1, 257, 1280), requires_grad=False)
        )
      )
      (text): TextPreprocessor(
        (pos_embed): tensor((1, 77, 1024), requires_grad=False)
        (mask): tensor((77, 77), requires_grad=False)
        (token_embedding): Embedding(49408, 1024)
      )
      (audio): AudioPreprocessor(
        (cls_token): tensor((1, 1, 768), requires_grad=False)
        (rgbt_stem): PatchEmbedGeneric(
          (proj): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10), bias=False)
          (norm_layer): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (pos_embedding_helper): SpatioTemporalPosEmbeddingHelper(
          (pos_embed): tensor((1, 229, 768), requires_grad=False)
        )
      )
      (depth): RGBDTPreprocessor(
        (cls_token): tensor((1, 1, 384), requires_grad=False)
        (depth_stem): PatchEmbedGeneric(
          (proj): Conv2d(1, 384, kernel_size=(16, 16), stride=(16, 16), bias=False)
          (norm_layer): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
        )
        (pos_embedding_helper): SpatioTemporalPosEmbeddingHelper(
          (pos_embed): tensor((1, 197, 384), requires_grad=False)
        )
      )
      (thermal): ThermalPreprocessor(
        (cls_token): tensor((1, 1, 768), requires_grad=False)
        (rgbt_stem): PatchEmbedGeneric(
          (proj): Conv2d(1, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
          (norm_layer): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (pos_embedding_helper): SpatioTemporalPosEmbeddingHelper(
          (pos_embed): tensor((1, 197, 768), requires_grad=False)
        )
      )
      (imu): IMUPreprocessor(
        (pos_embed): tensor((1, 251, 512), requires_grad=False)
        (cls_token): tensor((1, 1, 512), requires_grad=False)
        (imu_stem): PatchEmbedGeneric(
          (proj): Linear(in_features=48, out_features=512, bias=False)
          (norm_layer): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
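Note: the vision stem's token count can be sanity-checked. A minimal sketch in PyTorch, assuming the standard 224x224 ImageBind input and PadIm2Video duplicating a single image to 2 frames:

import torch
import torch.nn as nn

# Sketch: reproduce the vision-stem token count from the dump above.
# Assumes the standard ImageBind setup: a 224x224 RGB image replicated
# to 2 frames (PadIm2Video), then the Conv3d patch projection.
stem = nn.Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)

video = torch.randn(1, 3, 2, 224, 224)       # (B, C, T, H, W) after PadIm2Video
patches = stem(video)                        # -> (1, 1280, 1, 16, 16)
tokens = patches.flatten(2).transpose(1, 2)  # -> (1, 256, 1280)

# 256 patch tokens + 1 cls_token = 257, matching pos_embed (1, 257, 1280).
print(patches.shape, tokens.shape)
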
    (modality_trunks): ModuleDict(
      (vision): SimpleTransformer(
        (pre_transformer_layer): Sequential(
          (0): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
          (1): EinOpsRearrange()
        )
        (blocks): Sequential(
          (0): BlockWithMasking(
            (attn): MultiheadAttention(
              (out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True)
            )
            (drop_path): Identity()
            (norm_1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
            (mlp): Mlp(
              (fc1): Linear(in_features=1280, out_features=5120, bias=True)
              (act): GELU(approximate='none')
              (fc2): Linear(in_features=5120, out_features=1280, bias=True)
              (drop): Dropout(p=0.0, inplace=False)
            )
            (norm_2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
          )
          (1)-(31): 31 more BlockWithMasking blocks, identical to (0)
        )
        (post_transformer_layer): EinOpsRearrange()
      )
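Note: every trunk repeats one pre-norm block. A minimal sketch of a BlockWithMasking-style block at the vision width (module names follow the dump; 16 heads and the pre-norm residual wiring are assumptions based on the standard ViT-H block, not this repo's code):

import torch
import torch.nn as nn

# Sketch of one BlockWithMasking-style block (vision width shown).
# The module names mirror the dump; 16 heads and the pre-norm residual
# wiring are assumptions from the standard ViT block, not the repo.
class Block(nn.Module):
    def __init__(self, dim=1280, heads=16, mlp_ratio=4):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.norm_1 = nn.LayerNorm(dim, eps=1e-6)
        self.mlp = nn.Sequential(
            nn.Linear(dim, dim * mlp_ratio),   # fc1: 1280 -> 5120
            nn.GELU(),
            nn.Linear(dim * mlp_ratio, dim),   # fc2: 5120 -> 1280
        )
        self.norm_2 = nn.LayerNorm(dim, eps=1e-6)

    def forward(self, x, attn_mask=None):
        h = self.norm_1(x)
        x = x + self.attn(h, h, h, attn_mask=attn_mask)[0]  # residual attention
        return x + self.mlp(self.norm_2(x))                 # residual MLP

x = torch.randn(1, 257, 1280)  # 257 vision tokens from the stem above
print(Block()(x).shape)        # torch.Size([1, 257, 1280])
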
      (text): SimpleTransformer(
        (pre_transformer_layer): Sequential(
          (0): Identity()
          (1): EinOpsRearrange()
        )
        (blocks): Sequential(
          (0): BlockWithMasking(
            (attn): MultiheadAttention(
              (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
            )
            (drop_path): Identity()
            (norm_1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): Mlp(
              (fc1): Linear(in_features=1024, out_features=4096, bias=True)
              (act): GELU(approximate='none')
              (fc2): Linear(in_features=4096, out_features=1024, bias=True)
              (drop): Dropout(p=0.0, inplace=False)
            )
            (norm_2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
          )
          (1)-(23): 23 more BlockWithMasking blocks, identical to (0)
        )
        (post_transformer_layer): EinOpsRearrange()
      )
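Note: the text trunk consumes the 77-token sequences from the TextPreprocessor above, whose (77, 77) mask is presumably the usual CLIP-style causal attention mask; a minimal sketch under that assumption:

import torch

# Sketch: the (77, 77) additive mask the TextPreprocessor holds.
# Assumption: ImageBind follows CLIP here (77-token context, -inf on
# the strict upper triangle so each token attends only to its past).
seq_len = 77
mask = torch.full((seq_len, seq_len), float("-inf")).triu(diagonal=1)

print(mask.shape)    # torch.Size([77, 77])
print(mask[:3, :3])  # -inf strictly above the diagonal, 0 elsewhere
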
      (audio): SimpleTransformer(
        (pre_transformer_layer): Sequential(
          (0): Identity()
          (1): EinOpsRearrange()
        )
        (blocks): Sequential(
          (0): BlockWithMasking(
            (attn): MultiheadAttention(
              (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
            )
            (drop_path): Identity()
            (norm_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
            (mlp): Mlp(
              (fc1): Linear(in_features=768, out_features=3072, bias=True)
              (act): GELU(approximate='none')
              (fc2): Linear(in_features=3072, out_features=768, bias=True)
              (drop): Dropout(p=0.0, inplace=False)
            )
            (norm_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          )
          (1)-(11): 11 more BlockWithMasking blocks, identical to (0) except for
                    (drop_path): DropPath(drop_prob=0.009, 0.018, 0.027, 0.036, 0.045,
                                          0.055, 0.064, 0.073, 0.082, 0.091, 0.100)
        )
        (post_transformer_layer): EinOpsRearrange()
      )
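Note: the audio trunk's drop_prob values follow a linear stochastic-depth ramp over depth; assuming a peak rate of 0.1 (read off the printed values, not confirmed by the repo), the schedule reproduces them exactly:

# Sketch: the linearly increasing stochastic-depth (DropPath) schedule
# behind the audio trunk's drop_prob values. 12 blocks, peak rate 0.1
# (the peak is an assumption inferred from the printed values).
num_blocks, peak = 12, 0.1
drop_probs = [peak * i / (num_blocks - 1) for i in range(num_blocks)]

print([round(p, 3) for p in drop_probs])
# [0.0, 0.009, 0.018, 0.027, 0.036, 0.045, 0.055, 0.064, 0.073, 0.082, 0.091, 0.1]
# Block (0) has rate 0.0, which is why it prints as Identity() above.
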
      (depth): SimpleTransformer(
        (pre_transformer_layer): Sequential(
          (0): Identity()
          (1): EinOpsRearrange()
        )
        (blocks): Sequential(
          (0): BlockWithMasking(
            (attn): MultiheadAttention(
              (out_proj): NonDynamicallyQuantizableLinear(in_features=384, out_features=384, bias=True)
            )
            (drop_path): Identity()
            (norm_1): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
            (mlp): Mlp(
              (fc1): Linear(in_features=384, out_features=1536, bias=True)
              (act): GELU(approximate='none')
              (fc2): Linear(in_features=1536, out_features=384, bias=True)
              (drop): Dropout(p=0.0, inplace=False)
            )
            (norm_2): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
          )
          (1)-(11): 11 more BlockWithMasking blocks, identical to (0)
        )
        (post_transformer_layer): EinOpsRearrange()
      )
      (thermal): SimpleTransformer(
        (pre_transformer_layer): Sequential(
          (0): Identity()
          (1): EinOpsRearrange()
        )
        (blocks): Sequential(
          (0): BlockWithMasking(
            (attn): MultiheadAttention(
              (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
            )
            (drop_path): Identity()
            (norm_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
            (mlp): Mlp(
              (fc1): Linear(in_features=768, out_features=3072, bias=True)
              (act): GELU(approximate='none')
              (fc2): Linear(in_features=3072, out_features=768, bias=True)
              (drop): Dropout(p=0.0, inplace=False)
            )
            (norm_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          )
          (1)-(11): 11 more BlockWithMasking blocks, identical to (0)
        )
        (post_transformer_layer): EinOpsRearrange()
      )
1407
- (imu): SimpleTransformer(
1408
- (pre_transformer_layer): Sequential(
1409
- (0): Identity()
1410
- (1): EinOpsRearrange()
1411
- )
1412
- (blocks): Sequential(
1413
- (0): BlockWithMasking(
1414
- (attn): MultiheadAttention(
1415
- (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
1416
- )
1417
- (drop_path): Identity()
1418
- (norm_1): LayerNorm((512,), eps=1e-06, elementwise_affine=True)
1419
- (mlp): Mlp(
1420
- (fc1): Linear(in_features=512, out_features=2048, bias=True)
1421
- (act): GELU(approximate='none')
1422
- (fc2): Linear(in_features=2048, out_features=512, bias=True)
1423
- (drop): Dropout(p=0.0, inplace=False)
1424
- )
1425
- (norm_2): LayerNorm((512,), eps=1e-06, elementwise_affine=True)
1426
- )
1427
- (1): BlockWithMasking(
1428
- (attn): MultiheadAttention(
1429
- (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
1430
- )
1431
- (drop_path): DropPath(drop_prob=0.140)
1432
- (norm_1): LayerNorm((512,), eps=1e-06, elementwise_affine=True)
1433
- (mlp): Mlp(
1434
- (fc1): Linear(in_features=512, out_features=2048, bias=True)
1435
- (act): GELU(approximate='none')
1436
- (fc2): Linear(in_features=2048, out_features=512, bias=True)
1437
- (drop): Dropout(p=0.0, inplace=False)
1438
- )
1439
- (norm_2): LayerNorm((512,), eps=1e-06, elementwise_affine=True)
1440
- )
1441
- (2): BlockWithMasking(
1442
- (attn): MultiheadAttention(
1443
- (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
1444
- )
1445
- (drop_path): DropPath(drop_prob=0.280)
1446
- (norm_1): LayerNorm((512,), eps=1e-06, elementwise_affine=True)
1447
- (mlp): Mlp(
1448
- (fc1): Linear(in_features=512, out_features=2048, bias=True)
1449
- (act): GELU(approximate='none')
1450
- (fc2): Linear(in_features=2048, out_features=512, bias=True)
1451
- (drop): Dropout(p=0.0, inplace=False)
1452
- )
1453
- (norm_2): LayerNorm((512,), eps=1e-06, elementwise_affine=True)
1454
- )
1455
- (3): BlockWithMasking(
1456
- (attn): MultiheadAttention(
1457
- (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
1458
- )
1459
- (drop_path): DropPath(drop_prob=0.420)
1460
- (norm_1): LayerNorm((512,), eps=1e-06, elementwise_affine=True)
1461
- (mlp): Mlp(
1462
- (fc1): Linear(in_features=512, out_features=2048, bias=True)
1463
- (act): GELU(approximate='none')
1464
- (fc2): Linear(in_features=2048, out_features=512, bias=True)
1465
- (drop): Dropout(p=0.0, inplace=False)
1466
- )
1467
- (norm_2): LayerNorm((512,), eps=1e-06, elementwise_affine=True)
1468
- )
1469
- (4): BlockWithMasking(
1470
- (attn): MultiheadAttention(
1471
- (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
1472
- )
1473
- (drop_path): DropPath(drop_prob=0.560)
1474
- (norm_1): LayerNorm((512,), eps=1e-06, elementwise_affine=True)
1475
- (mlp): Mlp(
1476
- (fc1): Linear(in_features=512, out_features=2048, bias=True)
1477
- (act): GELU(approximate='none')
1478
- (fc2): Linear(in_features=2048, out_features=512, bias=True)
1479
- (drop): Dropout(p=0.0, inplace=False)
1480
- )
1481
- (norm_2): LayerNorm((512,), eps=1e-06, elementwise_affine=True)
1482
- )
1483
- (5): BlockWithMasking(
1484
- (attn): MultiheadAttention(
1485
- (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
1486
- )
1487
- (drop_path): DropPath(drop_prob=0.700)
1488
- (norm_1): LayerNorm((512,), eps=1e-06, elementwise_affine=True)
1489
- (mlp): Mlp(
1490
- (fc1): Linear(in_features=512, out_features=2048, bias=True)
1491
- (act): GELU(approximate='none')
1492
- (fc2): Linear(in_features=2048, out_features=512, bias=True)
1493
- (drop): Dropout(p=0.0, inplace=False)
1494
- )
1495
- (norm_2): LayerNorm((512,), eps=1e-06, elementwise_affine=True)
1496
- )
1497
- )
1498
- (post_transformer_layer): EinOpsRearrange()
1499
- )
1500
- )
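Note on the trunk blocks: judging by the layout above, BlockWithMasking is a pre-norm residual block, and the IMU trunk is the only one using stochastic depth, with the drop rate ramping linearly from 0.0 to 0.7 across its six blocks. A minimal timm-style DropPath sketch, assuming the usual wiring x = x + drop_path(attn(norm_1(x))); x = x + drop_path(mlp(norm_2(x))) (the exact ImageBind code may differ):

import torch
import torch.nn as nn

class DropPath(nn.Module):
    # Stochastic depth: randomly drop the whole residual branch per sample.
    def __init__(self, drop_prob: float = 0.0):
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.drop_prob == 0.0 or not self.training:
            return x
        keep_prob = 1.0 - self.drop_prob
        # One Bernoulli draw per sample, broadcast over the remaining dims.
        mask = x.new_empty((x.shape[0],) + (1,) * (x.dim() - 1)).bernoulli_(keep_prob)
        return x * mask / keep_prob  # rescale to keep the expectation unchanged

# The six IMU blocks follow a linear ramp:
# torch.linspace(0, 0.7, 6) -> 0.000, 0.140, 0.280, 0.420, 0.560, 0.700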
- (modality_heads): ModuleDict(
- (vision): Sequential(
- (0): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
- (1): SelectElement()
- (2): Linear(in_features=1280, out_features=1024, bias=False)
- )
- (text): SelectEOSAndProject(
- (proj): Sequential(
- (0): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
- (1): Linear(in_features=1024, out_features=1024, bias=False)
- )
- )
- (audio): Sequential(
- (0): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
- (1): SelectElement()
- (2): Linear(in_features=768, out_features=1024, bias=False)
- )
- (depth): Sequential(
- (0): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
- (1): SelectElement()
- (2): Linear(in_features=384, out_features=1024, bias=False)
- )
- (thermal): Sequential(
- (0): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
- (1): SelectElement()
- (2): Linear(in_features=768, out_features=1024, bias=False)
- )
- (imu): Sequential(
- (0): LayerNorm((512,), eps=1e-06, elementwise_affine=True)
- (1): SelectElement()
- (2): Dropout(p=0.5, inplace=False)
- (3): Linear(in_features=512, out_features=1024, bias=False)
- )
- )
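All six heads land in the same 1024-dimensional joint embedding space: LayerNorm at the trunk's native width (1280/1024/768/384/512), select a single token (CLS, or EOS for text), then a bias-free projection; only the IMU head adds Dropout(0.5) before its projection. A sketch of the shared pattern (make_head and this SelectElement stand-in are illustrative, not the repo's code):

import torch
import torch.nn as nn

class SelectElement(nn.Module):
    # Stand-in for the SelectElement() printed above: pick one token.
    def __init__(self, index: int = 0):
        super().__init__()
        self.index = index

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x[:, self.index, :]

def make_head(trunk_dim: int, embed_dim: int = 1024) -> nn.Sequential:
    # Mirrors the vision/audio/depth/thermal heads: input widths differ,
    # but every modality lands in the shared 1024-dim space.
    return nn.Sequential(
        nn.LayerNorm(trunk_dim, eps=1e-6),
        SelectElement(0),
        nn.Linear(trunk_dim, embed_dim, bias=False),
    )

imu_tokens = torch.randn(2, 9, 512)
print(make_head(512)(imu_tokens).shape)  # torch.Size([2, 1024])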
- (modality_postprocessors): ModuleDict(
- (vision): Normalize()
- (text): Sequential(
- (0): Normalize()
- (1): LearnableLogitScaling(logit_scale_init=14.285714285714285,learnable=True, max_logit_scale=100)
- )
- (audio): Sequential(
- (0): Normalize()
- (1): LearnableLogitScaling(logit_scale_init=20.0,learnable=False, max_logit_scale=100)
- )
- (depth): Sequential(
- (0): Normalize()
- (1): LearnableLogitScaling(logit_scale_init=5.0,learnable=False, max_logit_scale=100)
- )
- (thermal): Sequential(
- (0): Normalize()
- (1): LearnableLogitScaling(logit_scale_init=10.0,learnable=False, max_logit_scale=100)
- )
- (imu): Sequential(
- (0): Normalize()
- (1): LearnableLogitScaling(logit_scale_init=5.0,learnable=False, max_logit_scale=100)
- )
- )
- )
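After the heads, every modality is L2-normalized and given a per-modality temperature: vision has none, text learns its scale from an init of 100/7 ≈ 14.286, while audio (20), thermal (10), depth and IMU (5) use fixed scales. A sketch consistent with the printed reprs (keeping the scale in log space is an assumption):

import torch
import torch.nn as nn

class LearnableLogitScaling(nn.Module):
    def __init__(self, logit_scale_init: float = 1.0,
                 learnable: bool = True, max_logit_scale: float = 100.0):
        super().__init__()
        self.max_logit_scale = max_logit_scale
        log_scale = torch.log(torch.tensor(float(logit_scale_init)))
        if learnable:
            self.log_logit_scale = nn.Parameter(log_scale)
        else:
            self.register_buffer("log_logit_scale", log_scale)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # clamp keeps the effective temperature below max_logit_scale
        return torch.clamp(self.log_logit_scale.exp(), max=self.max_logit_scale) * x

emb = torch.nn.functional.normalize(torch.randn(4, 1024), dim=-1)  # Normalize()
emb = LearnableLogitScaling(logit_scale_init=20.0, learnable=False)(emb)  # audio branch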
- (reasoner): Qwen2ForCausalLM(
- (model): Qwen2Model(
- (embed_tokens): Embedding(151936, 896)
- (layers): ModuleList(
- (0-23): 24 x Qwen2DecoderLayer(
- (self_attn): Qwen2Attention(
- (q_proj): Linear(in_features=896, out_features=896, bias=True)
- (k_proj): Linear(in_features=896, out_features=128, bias=True)
- (v_proj): Linear(in_features=896, out_features=128, bias=True)
- (o_proj): Linear(in_features=896, out_features=896, bias=False)
- (rotary_emb): Qwen2RotaryEmbedding()
- )
- (mlp): Qwen2MLP(
- (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
- (up_proj): Linear(in_features=896, out_features=4864, bias=False)
- (down_proj): Linear(in_features=4864, out_features=896, bias=False)
- (act_fn): SiLU()
- )
- (input_layernorm): Qwen2RMSNorm()
- (post_attention_layernorm): Qwen2RMSNorm()
- )
- )
- (norm): Qwen2RMSNorm()
- )
- (lm_head): Linear(in_features=896, out_features=151936, bias=False)
- )
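The projection shapes above pin down the attention layout: assuming the stock head_dim of 64, q_proj's 896 outputs are 14 query heads and k_proj/v_proj's 128 outputs are 2 KV heads, i.e. grouped-query attention with 7 query heads per KV head, consistent with Qwen2-0.5B. Qwen2MLP is the SwiGLU pattern. A shape check under those assumptions:

import torch
import torch.nn as nn

hidden, head_dim = 896, 64
n_q, n_kv = hidden // head_dim, 128 // head_dim   # 14 query heads, 2 KV heads
x = torch.randn(1, 10, hidden)                    # (batch, seq, hidden)

q = nn.Linear(hidden, n_q * head_dim, bias=True)(x).view(1, 10, n_q, head_dim)
k = nn.Linear(hidden, n_kv * head_dim, bias=True)(x).view(1, 10, n_kv, head_dim)
v = nn.Linear(hidden, n_kv * head_dim, bias=True)(x).view(1, 10, n_kv, head_dim)
# each KV head serves 14 // 2 = 7 query heads
k = k.repeat_interleave(n_q // n_kv, dim=2)
v = v.repeat_interleave(n_q // n_kv, dim=2)
print(q.shape, k.shape, v.shape)  # all torch.Size([1, 10, 14, 64])

# SwiGLU MLP: down(SiLU(gate(x)) * up(x)), 896 -> 4864 -> 896
gate = nn.Linear(hidden, 4864, bias=False)
up = nn.Linear(hidden, 4864, bias=False)
down = nn.Linear(4864, hidden, bias=False)
y = down(nn.functional.silu(gate(x)) * up(x))
print(y.shape)  # torch.Size([1, 10, 896])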
- (input_projetor): Linear(in_features=1024, out_features=896, bias=True)
- )
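The input_projetor (sic, name as printed) is the glue between the two halves: ImageBind outputs live in the 1024-dim joint space, while Qwen2-0.5B's hidden size is 896. The dump shows only the modules, not JOSIE's forward pass, so the wiring below is a hypothetical sketch of the usual approach: project each modality embedding and splice it into the token sequence.

import torch
import torch.nn as nn

input_projetor = nn.Linear(1024, 896, bias=True)  # name as printed above

modality_emb = torch.randn(1, 1, 1024)   # one ImageBind embedding, as a token
text_embeds = torch.randn(1, 12, 896)    # e.g. reasoner.model.embed_tokens(input_ids)
inputs_embeds = torch.cat([input_projetor(modality_emb), text_embeds], dim=1)
# inputs_embeds (1, 13, 896) could then be fed to the reasoner via
# reasoner(inputs_embeds=inputs_embeds) for multimodal conditioning.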