Add configs
- prismer/configs/caption.yaml +48 -0
- prismer/configs/classification.yaml +21 -0
- prismer/configs/experts.yaml +2 -0
- prismer/configs/pretrain.yaml +21 -0
- prismer/configs/prismer.json +74 -0
- prismer/configs/roberta.json +42 -0
- prismer/configs/vqa.yaml +18 -0
prismer/configs/caption.yaml
ADDED
@@ -0,0 +1,48 @@
+coco:
+  dataset: 'coco'
+  data_path: '/workspace_dataset/dataset_vqa'
+  label_path: '/workspace_dataset/dataset_experts'
+  experts: ['depth', 'normal', 'seg_coco', 'edge', 'obj_detection', 'ocr_detection'] # 'none' for PrismerZ
+  image_resolution: 480
+  prismer_model: 'prismer_base' # 'prismer_large' for Prismer(Z)-Large
+  freeze: 'freeze_vision'
+
+  batch_size_train: 4 # for 8 * 8 nodes [effective batch-size: 256]
+  batch_size_test: 8
+  init_lr: 5e-5
+  weight_decay: 0.05
+  min_lr: 0
+  max_epoch: 3
+
+  prefix: 'A picture of' # use prefix for fine-tuning or no prefix '' for zero-shot experiments
+
+nocaps:
+  dataset: 'nocaps'
+  data_path: '/workspace_dataset/dataset_vqa'
+  label_path: '/workspace_dataset/dataset_experts'
+  experts: ['depth', 'normal', 'seg_coco', 'edge', 'obj_detection', 'ocr_detection'] # 'none' for PrismerZ
+
+  image_resolution: 480
+  prismer_model: 'prismer_base' # 'prismer_large' for Prismer(Z)-Large
+  freeze: 'freeze_vision'
+
+  batch_size_train: 4 # for 8 * 8 nodes [effective batch-size: 256]
+  batch_size_test: 8
+  init_lr: 5e-5
+  weight_decay: 0.05
+  min_lr: 0
+  max_epoch: 3
+
+  prefix: 'A picture of' # use prefix for fine-tuning or no prefix '' for zero-shot experiments
+
+demo:
+  dataset: 'demo'
+  data_path: 'helpers'
+  label_path: 'helpers/labels'
+  experts: ['depth', 'normal', 'seg_coco', 'edge', 'obj_detection', 'ocr_detection'] # 'none' for PrismerZ
+
+  image_resolution: 480
+  prismer_model: 'prismer_base' # 'prismer_large' for Prismer(Z)-Large
+  freeze: 'freeze_vision'
+
+  prefix: 'A picture of'
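For context, configs like this are typically read with PyYAML and indexed by the task name. A minimal sketch of that pattern, assuming PyYAML is installed; the helper name `load_config` and the choice of the 'demo' entry are illustrative, not the repository's actual loader:

```python
# Minimal sketch: load caption.yaml and pick one of its sub-configs.
# Assumes PyYAML; `load_config` is a hypothetical helper, not repo code.
import yaml

def load_config(path: str, key: str) -> dict:
    with open(path) as f:
        config = yaml.safe_load(f)  # top-level keys here: coco, nocaps, demo
    return config[key]

cfg = load_config('prismer/configs/caption.yaml', 'demo')
print(cfg['prismer_model'], cfg['image_resolution'])  # prismer_base 480
```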
prismer/configs/classification.yaml
ADDED
@@ -0,0 +1,21 @@
+data_path: '/workspace_dataset/dataset_zero'
+label_path: '/workspace_dataset/dataset_experts'
+
+experts: ['depth', 'normal', 'seg_coco', 'edge', 'obj_detection', 'ocr_detection'] # 'none' for PrismerZ
+
+freeze: 'freeze_vision'
+dataset: 'imagenet'
+shots: 1
+
+image_resolution: 384
+prismer_model: 'prismer_base' # 'prismer_large' for Prismer(Z)-Large
+
+batch_size_train: 2 # for 4 * 8 nodes [effective batch-size: 64]
+batch_size_test: 8
+init_lr: 5e-5
+weight_decay: 0.05
+min_lr: 0
+max_epoch: 20
+
+k_test: 32
+prefix: 'A photo of a'
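The bracketed effective batch sizes in these comments are the per-GPU batch size multiplied by the total GPU count; here, 2 per GPU on 4 nodes of 8 GPUs gives 64. A quick sanity check (the node and GPU counts are read off the comment, not detected at runtime):

```python
# Effective batch size = per-GPU batch * nodes * GPUs per node.
# The 4-node / 8-GPU figures come from the config comment above.
batch_size_train = 2
nodes, gpus_per_node = 4, 8
assert batch_size_train * nodes * gpus_per_node == 64  # "[effective batch-size: 64]"
```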
prismer/configs/experts.yaml
ADDED
@@ -0,0 +1,2 @@
+data_path: 'helpers'
+save_path: 'helpers/labels'
prismer/configs/pretrain.yaml
ADDED
@@ -0,0 +1,21 @@
+datasets: ['cc12m', 'cc3m_sgu', 'coco', 'vg']
+
+cc12m_data_path: '/workspace_dataset/cc12m'
+cc3m_data_path: '/home/datasets/cc3m'
+coco_data_path: '/workspace_dataset/dataset_vqa'
+vg_data_path: '/home/datasets/vqa'
+label_path: '/workspace_dataset/dataset_experts'
+
+experts: ['depth', 'normal', 'seg_coco', 'edge', 'obj_detection', 'ocr_detection'] # 'none' for PrismerZ
+
+image_resolution: 224
+prismer_model: 'prismer_base' # 'prismer_large' for Prismer(Z)-Large
+freeze: 'freeze_lang_vision'
+batch_size_train: 32 # for 4 * 8 nodes [effective batch-size: 1024]
+
+max_epoch: 20
+weight_decay: 0.05
+init_lr: 3e-4 # 1e-4 for prismer_large
+min_lr: 1e-6
+warmup_lr: 1e-6
+warmup_steps: 2000
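The four learning-rate keys suggest a linear warmup from warmup_lr to init_lr over warmup_steps, followed by a decay toward min_lr. The sketch below shows one common shape (cosine decay); the actual schedule used by the training code may differ, e.g. per-epoch linear decay:

```python
# One plausible reading of the LR keys in pretrain.yaml: linear warmup to
# init_lr, then cosine decay to min_lr. The schedule shape is an assumption.
import math

def lr_at(step: int, total_steps: int, warmup_lr: float = 1e-6,
          warmup_steps: int = 2000, init_lr: float = 3e-4,
          min_lr: float = 1e-6) -> float:
    if step < warmup_steps:
        return warmup_lr + (init_lr - warmup_lr) * step / warmup_steps
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return min_lr + 0.5 * (init_lr - min_lr) * (1.0 + math.cos(math.pi * progress))
```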
prismer/configs/prismer.json
ADDED
@@ -0,0 +1,74 @@
+{
+  "prismer_base": {
+    "roberta_model": {
+      "attention_probs_dropout_prob": 0.1,
+      "bos_token_id": 0,
+      "eos_token_id": 2,
+      "hidden_act": "gelu",
+      "hidden_dropout_prob": 0.1,
+      "hidden_size": 768,
+      "vision_hidden_size": 768,
+      "initializer_range": 0.02,
+      "intermediate_size": 3072,
+      "layer_norm_eps": 1e-05,
+      "max_position_embeddings": 514,
+      "model_name": "roberta-base",
+      "num_attention_heads": 12,
+      "num_hidden_layers": 12,
+      "pad_token_id": 1,
+      "type_vocab_size": 1,
+      "vocab_size": 50265,
+      "num_decoder_layers": 4,
+      "is_decoder": true
+    },
+    "vit_model": "ViT-B/16"
+  },
+  "prismer_large": {
+    "roberta_model": {
+      "attention_probs_dropout_prob": 0.1,
+      "bos_token_id": 0,
+      "eos_token_id": 2,
+      "hidden_act": "gelu",
+      "hidden_dropout_prob": 0.1,
+      "hidden_size": 1024,
+      "vision_hidden_size": 1024,
+      "initializer_range": 0.02,
+      "intermediate_size": 4096,
+      "layer_norm_eps": 1e-05,
+      "max_position_embeddings": 514,
+      "model_name": "roberta-large",
+      "num_attention_heads": 16,
+      "num_hidden_layers": 24,
+      "pad_token_id": 1,
+      "type_vocab_size": 1,
+      "vocab_size": 50265,
+      "num_decoder_layers": 4,
+      "is_decoder": true
+    },
+    "vit_model": "ViT-L/14@336px"
+  },
+  "prismer_huge": {
+    "roberta_model": {
+      "attention_probs_dropout_prob": 0.1,
+      "bos_token_id": 0,
+      "eos_token_id": 2,
+      "hidden_act": "gelu",
+      "hidden_dropout_prob": 0.1,
+      "hidden_size": 1024,
+      "vision_hidden_size": 1280,
+      "initializer_range": 0.02,
+      "intermediate_size": 4096,
+      "layer_norm_eps": 1e-05,
+      "max_position_embeddings": 514,
+      "model_name": "roberta-large",
+      "num_attention_heads": 16,
+      "num_hidden_layers": 24,
+      "pad_token_id": 1,
+      "type_vocab_size": 1,
+      "vocab_size": 50265,
+      "num_decoder_layers": 4,
+      "is_decoder": true
+    },
+    "vit_model": "ViT-H/14"
+  }
+}
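The top-level keys here match the prismer_model values used in the YAML configs, so the model spec is presumably resolved by a simple dictionary lookup. A hedged sketch of that lookup; the surrounding model-building code is not part of this diff:

```python
# Sketch: resolve a YAML prismer_model string against prismer.json.
import json

with open('prismer/configs/prismer.json') as f:
    model_zoo = json.load(f)

spec = model_zoo['prismer_base']            # value of prismer_model in the YAMLs
print(spec['vit_model'])                    # ViT-B/16
print(spec['roberta_model']['model_name'])  # roberta-base
```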
prismer/configs/roberta.json
ADDED
@@ -0,0 +1,42 @@
+{
+  "roberta-base": {
+    "attention_probs_dropout_prob": 0.1,
+    "bos_token_id": 0,
+    "eos_token_id": 2,
+    "hidden_act": "gelu",
+    "hidden_dropout_prob": 0.1,
+    "hidden_size": 768,
+    "initializer_range": 0.02,
+    "intermediate_size": 3072,
+    "layer_norm_eps": 1e-05,
+    "max_position_embeddings": 514,
+    "model_type": "roberta",
+    "num_attention_heads": 12,
+    "num_hidden_layers": 12,
+    "pad_token_id": 1,
+    "type_vocab_size": 1,
+    "vocab_size": 50265,
+    "num_decoder_layers": 4,
+    "is_decoder": true
+  },
+  "roberta-large": {
+    "attention_probs_dropout_prob": 0.1,
+    "bos_token_id": 0,
+    "eos_token_id": 2,
+    "hidden_act": "gelu",
+    "hidden_dropout_prob": 0.1,
+    "hidden_size": 1024,
+    "initializer_range": 0.02,
+    "intermediate_size": 4096,
+    "layer_norm_eps": 1e-05,
+    "max_position_embeddings": 514,
+    "model_type": "roberta",
+    "num_attention_heads": 16,
+    "num_hidden_layers": 24,
+    "pad_token_id": 1,
+    "type_vocab_size": 1,
+    "vocab_size": 50265,
+    "num_decoder_layers": 4,
+    "is_decoder": true
+  }
+}
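These entries mirror Hugging Face RoBERTa hyperparameters, extended with the non-standard num_decoder_layers and is_decoder fields. One plausible use, sketched below under the assumption that the transformers library is the consumer, is building a RobertaConfig directly; PretrainedConfig keeps unknown keys as extra attributes:

```python
# Sketch: instantiate a transformers RobertaConfig from roberta.json.
# Assumes the `transformers` package; extra keys such as num_decoder_layers
# are stored as custom attributes rather than rejected.
import json
from transformers import RobertaConfig

with open('prismer/configs/roberta.json') as f:
    roberta_cfgs = json.load(f)

config = RobertaConfig(**roberta_cfgs['roberta-base'])
print(config.hidden_size, config.num_decoder_layers)  # 768 4
```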
prismer/configs/vqa.yaml
ADDED
@@ -0,0 +1,18 @@
+datasets: ['vqav2', 'vg']
+data_path: '/workspace_dataset/dataset_vqa'
+label_path: '/workspace_dataset/dataset_experts'
+experts: ['depth', 'normal', 'seg_coco', 'edge', 'obj_detection', 'ocr_detection'] # 'none' for PrismerZ
+
+image_resolution: 480
+prismer_model: 'prismer_base' # 'prismer_large' for Prismer(Z)-Large
+freeze: 'freeze_vision'
+
+batch_size_train: 8 # for 8 * 8 nodes [effective batch-size: 512]
+batch_size_test: 32
+init_lr: 5e-5
+weight_decay: 0.05
+min_lr: 0
+max_epoch: 10
+
+k_test: 16
+inference: 'rank'
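Here `inference: 'rank'` paired with `k_test: 16` is consistent with answer-ranking VQA inference in the style of BLIP: rather than generating free-form text, the model scores a shortlist of k_test candidate answers and returns the highest-scoring one. A schematic sketch; `score_answer` is a hypothetical stand-in for the model's answer likelihood, not a function from this repository:

```python
# Schematic 'rank' inference: score k_test candidate answers, return the best.
# `score_answer` is hypothetical; in practice it would be the decoder's
# log-likelihood of each answer given the question and image features.
from typing import Callable

def rank_inference(candidates: list[str],
                   score_answer: Callable[[str], float],
                   k_test: int = 16) -> str:
    shortlist = candidates[:k_test]  # e.g. top-k answers from a first pass
    return max(shortlist, key=score_answer)
```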