# Model configuration (META_ARCHITECTURE "GLEE", backbone "D2SwinTransformer").
#
# NOTE(review): this file previously had no indentation and carried stray "|"
# table-border characters on every line, which made it unusable as a YAML
# mapping. The nesting below was reconstructed from the section-header keys
# (keys with no value) -- confirm it against the defaults declared by the
# code that loads this config before relying on it.
MODEL:
  META_ARCHITECTURE: "GLEE"
  MASK_ON: True

  BACKBONE:
    NAME: "D2SwinTransformer"

  # Hyperparameters for the Swin backbone named above.
  SWIN:
    EMBED_DIM: 192
    DEPTHS: [2, 2, 18, 2]
    NUM_HEADS: [6, 12, 24, 48]
    WINDOW_SIZE: 12
    APE: False
    DROP_PATH_RATE: 0.3
    PATCH_NORM: True
    PRETRAIN_IMG_SIZE: 384

  # Per-channel input normalization constants.
  # NOTE(review): presumably on a 0-255 pixel scale; channel order is not
  # visible from this file -- confirm against the data loader.
  PIXEL_MEAN: [123.675, 116.280, 103.530]
  PIXEL_STD: [58.395, 57.120, 57.375]

  # NOTE(review): BACKBONE.NAME selects a Swin backbone, so some of these
  # ResNet settings may be unused; OUT_FEATURES supplies the feature names
  # ("res2".."res5") that SEM_SEG_HEAD refers to below.
  RESNETS:
    DEPTH: 50
    STEM_TYPE: "basic"
    STEM_OUT_CHANNELS: 64
    STRIDE_IN_1X1: False
    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
    RES5_MULTI_GRID: [1, 1, 1]

  SEM_SEG_HEAD:
    NAME: "MaskDINOHead"
    IGNORE_VALUE: 255
    NUM_CLASSES: 80
    LOSS_WEIGHT: 1.0
    CONVS_DIM: 256
    MASK_DIM: 256
    NORM: "GN"

    # Pixel decoder settings.
    PIXEL_DECODER_NAME: "MaskDINOEncoder"
    DIM_FEEDFORWARD: 2048
    NUM_FEATURE_LEVELS: 3
    TOTAL_NUM_FEATURE_LEVELS: 4
    IN_FEATURES: ["res2", "res3", "res4", "res5"]
    DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
    COMMON_STRIDE: 4
    TRANSFORMER_ENC_LAYERS: 6
    FEATURE_ORDER: "low2high"

  MaskDINO:
    TRANSFORMER_DECODER_NAME: "MaskDINODecoder"
    DEEP_SUPERVISION: True

    # Loss term weights.
    NO_OBJECT_WEIGHT: 0.1
    CLASS_WEIGHT: 4.0
    MASK_WEIGHT: 5.0
    DICE_WEIGHT: 5.0
    BOX_WEIGHT: 5.0
    GIOU_WEIGHT: 2.0

    # Transformer decoder shape.
    HIDDEN_DIM: 256
    NUM_OBJECT_QUERIES: 300
    NHEADS: 8
    DROPOUT: 0.0
    DIM_FEEDFORWARD: 2048
    ENC_LAYERS: 0
    PRE_NORM: False
    ENFORCE_INPUT_PROJ: False
    SIZE_DIVISIBILITY: 32
    DEC_LAYERS: 9

    # Point sampling used during training.
    TRAIN_NUM_POINTS: 12544
    OVERSAMPLE_RATIO: 3.0
    IMPORTANCE_SAMPLE_RATIO: 0.75

    INITIAL_PRED: True
    TWO_STAGE: True
    DN: "standard"
    DN_NUM: 100
    INITIALIZE_BOX_TYPE: "no"

    # Inference-time switches: only instance output is enabled here.
    # NOTE(review): TEST is placed under MaskDINO by reconstruction; confirm.
    TEST:
      SEMANTIC_ON: False
      INSTANCE_ON: True
      PANOPTIC_ON: False
      OVERLAP_THRESHOLD: 0.8
      OBJECT_MASK_THRESHOLD: 0.25

  # NOTE(review): TEXT and LANGUAGE_BACKBONE are placed under MODEL by
  # reconstruction; confirm against the config schema.
  TEXT:
    ARCH: "clip_teacher"

  LANGUAGE_BACKBONE:
    LANG_DIM: 512
|