File size: 1,825 Bytes
b20af9f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# Component selection for the model.
# NOTE(review): the values look like lookup keys into the `*_conf` sections of
# this file (`ViT16` is a key under `clip_conf`, `FGA512` is a key under
# `fga_conf`) - confirm against the code that loads this config.
model:
  clip: ViT16
  # null here; presumably means no entry of `vision_backbone_conf` is used -
  # TODO confirm with the loader.
  vision_backbone: null
  # NOTE(review): no BEATs settings section is visible in this file; the name
  # is presumably resolved by the loading code - confirm.
  audio_backbone: BEATs
  audio_proj: FGA512

# Per-component checkpoint settings, using three of the component names that
# also appear under `model` (there is no `clip` entry here).
# Only `audio_backbone` carries a path; the other two are null.
# NOTE(review): null presumably means "load no checkpoint for this component" -
# confirm with the loader.
pretrain:
  vision_backbone: null
  # Relative path. NOTE(review): what it is resolved against (working directory
  # vs. this file's location) is not visible here - confirm.
  audio_backbone: ./pretrain/BEATs_iter3_plus_AS2M_finetuned_on_AS2M_cpt2.pt
  audio_proj: null

# Named FGA variants. Both use input_size 768; they differ only in output_size
# (768 for `FGA`, 512 for `FGA512`).
# NOTE(review): `FGA512.output_size` (512) equals `clip_conf.ViT16.embedding_dim`
# (512), the pair selected under `model` - presumably so the projected feature
# matches the CLIP embedding size; confirm.
fga_conf:
  FGA:
    input_size: 768
    output_size: 768

  FGA512:
    input_size: 768
    output_size: 512

# CLIP variants keyed by a short alias (RN50, ViT16, ViT14). Every entry has
# the same shape: `name`, a `vision` sub-mapping, a `text` sub-mapping and
# `embedding_dim`. `vocab_size` (49408), `context_length` (77) and
# `transformer_layers` (12) are identical across all three entries.
clip_conf:
  # The only entry whose `vision_layers` is a 4-element list and whose
  # `vision_patch_size` is null; the two ViT entries use a single integer
  # layer count and an integer patch size.
  RN50:
    name: RN50
    vision:
      image_resolution: 224
      vision_layers: [3, 4, 6, 3]
      vision_width: 64
      # NOTE(review): the reference OpenAI CLIP code derives the ResNet
      # attention-pool head count as vision_width * 32 // 64 (= 32 for
      # width 64), not 8. Check whether this key is actually consumed for RN50
      # before relying on it.
      heads: 8
      vision_patch_size: null
    text:
      transformer_layers: 12
      transformer_width: 512
      transformer_heads: 8
      vocab_size: 49408
      context_length: 77
    embedding_dim: 1024

  # This is the alias referenced by `model.clip` in this file.
  # Text settings are identical to RN50's; `embedding_dim` is 512.
  ViT16:
    name: ViT-B/16
    vision:
      image_resolution: 224
      vision_layers: 12
      vision_width: 768
      heads: 12
      vision_patch_size: 16
    text:
      transformer_layers: 12
      transformer_width: 512
      transformer_heads: 8
      vocab_size: 49408
      context_length: 77
    embedding_dim: 512

  # Largest vision settings of the three (24 layers, width 1024, 16 heads,
  # patch size 14) and the only entry with a wider text transformer
  # (width 768, 12 heads); `embedding_dim` is 768.
  ViT14:
    name: ViT-L/14
    vision:
      image_resolution: 224
      vision_layers: 24
      vision_width: 1024
      heads: 16
      vision_patch_size: 14
    text:
      transformer_layers: 12
      transformer_width: 768
      transformer_heads: 12
      vocab_size: 49408
      context_length: 77
    embedding_dim: 768

# Optional vision-backbone variants, keyed by name. Each entry repeats its own
# key in `name`. Both entries share image_resolution 512, vision_width 2048 and
# the same `aspp` dilations / in_channels; they differ in the third element of
# `vision_layers` (6 vs 23) and in `aspp.channels` (512 vs 1024).
# In both entries `aspp.in_channels` equals `vision_width` (2048).
# NOTE(review): `model.vision_backbone` is null in this file, so neither entry
# appears to be selected by this config - confirm with the loader.
vision_backbone_conf:
  maskclip_plus_rn50_512:
    name: maskclip_plus_rn50_512
    image_resolution: 512
    vision_layers: [3, 4, 6, 3]
    vision_width: 2048
    aspp:
      dilations: [6, 12, 18, 24]
      in_channels: 2048
      channels: 512

  maskclip_plus_rn101_512:
    name: maskclip_plus_rn101_512
    image_resolution: 512
    vision_layers: [3, 4, 23, 3]
    vision_width: 2048
    aspp:
      dilations: [6, 12, 18, 24]
      in_channels: 2048
      channels: 1024