# -----------------------------------------------------------------------------
# Base config: VirTex pretraining for our "base" bicaptioning model:
# ResNet-50 + (L = 1, H = 1024) transformer trained for 500K iterations.
# -----------------------------------------------------------------------------
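#
# Typical launch (a sketch; the script name and flags follow the upstream
# VirTex README and may differ in your checkout):
#
#   python scripts/pretrain_virtex.py \
#       --config configs/_base_bicaptioning_R_50_L1_H1024.yaml \
#       --num-gpus-per-machine 8 --cpu-workers 4 \
#       --serialization-dir /tmp/virtex_base
# -----------------------------------------------------------------------------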
RANDOM_SEED: 0
AMP: true
CUDNN_BENCHMARK: true
CUDNN_DETERMINISTIC: false
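# COCO Captions pipeline: captions are tokenized with a SentencePiece model
# using a 10K subword vocab; the special-token indices below must match the
# order used when that vocab was built.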
DATA:
  ROOT: "datasets/coco"
  TOKENIZER_MODEL: "datasets/vocab/coco_10k.model"
  VOCAB_SIZE: 10000
  UNK_INDEX: 0
  SOS_INDEX: 1
  EOS_INDEX: 2
  MASK_INDEX: 3

  IMAGE_CROP_SIZE: 224
  MAX_CAPTION_LENGTH: 30
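  # Augmentations applied on-the-fly; each name maps to a transform
  # implemented in the VirTex data pipeline.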
  IMAGE_TRANSFORM_TRAIN:
    - "random_resized_crop"
    - "horizontal_flip"
    - "color_jitter"
    - "normalize"

  IMAGE_TRANSFORM_VAL:
    - "smallest_resize"
    - "center_crop"
    - "normalize"
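  # Use the full dataset; with USE_SINGLE_CAPTION false, training samples
  # among all reference captions per image (COCO provides five).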
  USE_PERCENTAGE: 100.0
  USE_SINGLE_CAPTION: false
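# Bicaptioning model: a ResNet-50 visual backbone trained from scratch
# (PRETRAINED: false, no ImageNet init) feeding forward and backward
# captioning transformers.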
MODEL:
  NAME: "virtex"
  VISUAL:
    NAME: "torchvision::resnet50"
    PRETRAINED: false
    FROZEN: false
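  # The textual head name encodes the architecture: a post-norm transformer
  # decoder with L=1 layer, H=1024 hidden size, A=16 attention heads, and
  # F=4096 feedforward size.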
  TEXTUAL:
    NAME: "transdec_postnorm::L1_H1024_A16_F4096"
    DROPOUT: 0.1
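# Optimization: SGD with momentum, wrapped in Lookahead, with separate
# learning rates for the visual and textual streams (see below).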
OPTIM:
  OPTIMIZER_NAME: "sgd"
  SGD_MOMENTUM: 0.9
  WEIGHT_DECAY: 0.0001
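  # Lookahead (Zhang et al., 2019) wraps the inner SGD: after every STEPS
  # fast updates, the slow weights step toward the fast weights by factor
  # ALPHA.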
  LOOKAHEAD:
    USE: true
    ALPHA: 0.5
    STEPS: 5
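  # Two learning rates: CNN_LR for the visual backbone and LR for the
  # textual head. Both warm up linearly for WARMUP_STEPS, then decay on a
  # cosine schedule over NUM_ITERATIONS.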
  BATCH_SIZE: 256
  CNN_LR: 0.2
  LR: 0.001
  NUM_ITERATIONS: 500000

  WARMUP_STEPS: 10000
  LR_DECAY_NAME: "cosine"
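  # Regex over parameter names exempted from weight decay: norm weights and
  # biases inside the textual embedding/transformer.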
  NO_DECAY: ".*textual.(embedding|transformer).*(norm.*|bias)"
  CLIP_GRAD_NORM: 10.0