# Run-level settings. Their exact meaning is defined by the script that loads
# this file, which is not visible from here.
name: COVER
# Epoch counts / warm-up length (presumably consumed by a training loop — TODO confirm).
num_epochs: 0
l_num_epochs: 10
warmup_epochs: 2.5
ema: true
save_model: true
# Data-loading settings.
batch_size: 8
num_workers: 6
split_seed: 42
# Experiment-tracking options.
wandb:
    project_name: COVER
# Dataset entries. Each `val-*` entry uses ViewDecompositionDataset with
# `phase: test` and the same three sample types (semantic / technical /
# aesthetic); only `weight`, `anno_file` and `data_prefix` differ between them.
# `weight` is a per-dataset scalar whose use is not visible in this file.
data:
    val-livevqc:
        type: ViewDecompositionDataset
        args:
            weight: 0.598
            phase: test
            anno_file: ./examplar_data_labels/LIVE_VQC/labels.txt
            data_prefix: ./datasets/LIVE_VQC/ # revert before submit
            sample_types:
                semantic:
                    size_h: 512
                    size_w: 512
                    clip_len: 1
                    frame_interval: 2
                    t_frag: 1
                    num_clips: 1
                technical:
                    fragments_h: 7
                    fragments_w: 7
                    fsize_h: 32
                    fsize_w: 32
                    aligned: 2
                    clip_len: 2
                    t_frag: 1
                    frame_interval: 2
                    num_clips: 1
                aesthetic:
                    size_h: 224
                    size_w: 224
                    clip_len: 2
                    frame_interval: 2
                    t_frag: 1
                    num_clips: 1
    val-kv1k:
        type: ViewDecompositionDataset
        args:
            weight: 0.540
            phase: test
            anno_file: ./examplar_data_labels/KoNViD/labels.txt
            data_prefix: ./datasets/KoNViD/ # revert before submit
            sample_types:
                semantic:
                    size_h: 512
                    size_w: 512
                    clip_len: 1
                    frame_interval: 2
                    t_frag: 1
                    num_clips: 1
                technical:
                    fragments_h: 7
                    fragments_w: 7
                    fsize_h: 32
                    fsize_w: 32
                    aligned: 2
                    clip_len: 2
                    t_frag: 1
                    frame_interval: 2
                    num_clips: 1
                aesthetic:
                    size_h: 224
                    size_w: 224
                    clip_len: 2
                    frame_interval: 2
                    t_frag: 1
                    num_clips: 1
    val-ltest:
        type: ViewDecompositionDataset
        args:
            weight: 0.603
            phase: test
            anno_file: ./examplar_data_labels/LSVQ/labels_test.txt
            data_prefix: ./datasets/LSVQ/ # revert before submit
            sample_types:
                semantic:
                    size_h: 512
                    size_w: 512
                    clip_len: 1
                    frame_interval: 2
                    t_frag: 1
                    num_clips: 1
                technical:
                    fragments_h: 7
                    fragments_w: 7
                    fsize_h: 32
                    fsize_w: 32
                    aligned: 2
                    clip_len: 2
                    t_frag: 1
                    frame_interval: 2
                    num_clips: 1
                aesthetic:
                    size_h: 224
                    size_w: 224
                    clip_len: 2
                    frame_interval: 2
                    t_frag: 1
                    num_clips: 1
    val-l1080p:
        type: ViewDecompositionDataset
        args:
            weight: 0.620
            phase: test
            anno_file: ./examplar_data_labels/LSVQ/labels_1080p.txt
            data_prefix: ./datasets/LSVQ/ # revert before submit
            sample_types:
                semantic:
                    size_h: 512
                    size_w: 512
                    clip_len: 1
                    frame_interval: 2
                    t_frag: 1
                    num_clips: 1
                technical:
                    fragments_h: 7
                    fragments_w: 7
                    fsize_h: 32
                    fsize_w: 32
                    aligned: 2
                    clip_len: 2
                    t_frag: 1
                    frame_interval: 2
                    num_clips: 1
                aesthetic:
                    size_h: 224
                    size_w: 224
                    clip_len: 2
                    frame_interval: 2
                    t_frag: 1
                    num_clips: 1
    val-cvd2014:
        type: ViewDecompositionDataset
        args:
            weight: 0.576
            phase: test
            anno_file: ./examplar_data_labels/CVD2014/labels.txt
            data_prefix: ./datasets/CVD2014/ # revert before submit
            sample_types:
                semantic:
                    size_h: 512
                    size_w: 512
                    clip_len: 1
                    frame_interval: 2
                    t_frag: 1
                    num_clips: 1
                technical:
                    fragments_h: 7
                    fragments_w: 7
                    fsize_h: 32
                    fsize_w: 32
                    aligned: 2
                    clip_len: 2
                    t_frag: 1
                    frame_interval: 2
                    num_clips: 1
                aesthetic:
                    size_h: 224
                    size_w: 224
                    clip_len: 2
                    frame_interval: 2
                    t_frag: 1
                    num_clips: 1
    val-ytugc:
        type: ViewDecompositionDataset
        args:
            weight: 0.443
            phase: test
            anno_file: ./examplar_data_labels/YouTubeUGC/labels.txt
            # NOTE(review): this was ./dataset/ (singular); aligned with the
            # ./datasets/ root used by every other entry — confirm local layout.
            data_prefix: ./datasets/YouTubeUGC/ # revert before submit
            sample_types:
                semantic:
                    size_h: 512
                    size_w: 512
                    clip_len: 1
                    frame_interval: 2
                    t_frag: 1
                    num_clips: 1
                technical:
                    fragments_h: 7
                    fragments_w: 7
                    fsize_h: 32
                    fsize_w: 32
                    aligned: 2
                    clip_len: 2
                    t_frag: 1
                    frame_interval: 2
                    num_clips: 1
                aesthetic:
                    size_h: 224
                    size_w: 224
                    clip_len: 2
                    frame_interval: 2
                    t_frag: 1
                    num_clips: 1
# Model definition: a COVER model with one backbone per branch
# (technical / aesthetic / semantic) and a shared head configuration.
model:
    type: COVER
    args:
        backbone:
            technical:
                type: swin_tiny_grpb
                checkpoint: true
                # Left empty on purpose: an empty value loads as YAML null.
                pretrained:
            aesthetic:
                type: conv_tiny
            semantic:
                type: clip_iqa+
        # Comma-separated branch names; matches the three backbone keys above.
        backbone_preserve_keys: technical,aesthetic,semantic
        divide_head: true
        vqa_head:
            in_channels: 768
            hidden_channels: 64
# Optimizer hyper-parameters. The explicit `!!float` tag forces a float:
# some YAML 1.1 loaders would otherwise read `1e-3` (no decimal point) as a string.
optimizer:
    lr: !!float 1e-3
    backbone_lr_mult: !!float 1e-1
    wd: 0.05
# Path to the weights file used for testing.
test_load_path: ./COVER.pth # revert before submit