Spaces:
Runtime error
Runtime error
Ahsen Khaliq
committed on
Commit
•
16aee22
1
Parent(s):
92b1fb6
add files
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- cog.yaml +28 -0
- configs/ade20k/instance-segmentation/Base-ADE20K-InstanceSegmentation.yaml +61 -0
- configs/ade20k/instance-segmentation/maskformer2_R50_bs16_160k.yaml +44 -0
- configs/ade20k/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k.yaml +18 -0
- configs/ade20k/panoptic-segmentation/Base-ADE20K-PanopticSegmentation.yaml +61 -0
- configs/ade20k/panoptic-segmentation/maskformer2_R50_bs16_160k.yaml +44 -0
- configs/ade20k/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k.yaml +18 -0
- configs/ade20k/semantic-segmentation/Base-ADE20K-SemanticSegmentation.yaml +61 -0
- configs/ade20k/semantic-segmentation/maskformer2_R101_bs16_90k.yaml +11 -0
- configs/ade20k/semantic-segmentation/maskformer2_R50_bs16_160k.yaml +44 -0
- configs/ade20k/semantic-segmentation/swin/maskformer2_swin_base_384_bs16_160k_res640.yaml +37 -0
- configs/ade20k/semantic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_160k_res640.yaml +37 -0
- configs/ade20k/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k_res640.yaml +37 -0
- configs/ade20k/semantic-segmentation/swin/maskformer2_swin_small_bs16_160k.yaml +15 -0
- configs/ade20k/semantic-segmentation/swin/maskformer2_swin_tiny_bs16_160k.yaml +15 -0
- configs/cityscapes/instance-segmentation/Base-Cityscapes-InstanceSegmentation.yaml +61 -0
- configs/cityscapes/instance-segmentation/maskformer2_R101_bs16_90k.yaml +11 -0
- configs/cityscapes/instance-segmentation/maskformer2_R50_bs16_90k.yaml +44 -0
- configs/cityscapes/instance-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml +16 -0
- configs/cityscapes/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml +18 -0
- configs/cityscapes/instance-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml +15 -0
- configs/cityscapes/instance-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml +15 -0
- configs/cityscapes/panoptic-segmentation/Base-Cityscapes-PanopticSegmentation.yaml +61 -0
- configs/cityscapes/panoptic-segmentation/maskformer2_R101_bs16_90k.yaml +11 -0
- configs/cityscapes/panoptic-segmentation/maskformer2_R50_bs16_90k.yaml +44 -0
- configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml +16 -0
- configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml +18 -0
- configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml +15 -0
- configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml +15 -0
- configs/cityscapes/semantic-segmentation/Base-Cityscapes-SemanticSegmentation.yaml +61 -0
- configs/cityscapes/semantic-segmentation/maskformer2_R101_bs16_90k.yaml +11 -0
- configs/cityscapes/semantic-segmentation/maskformer2_R50_bs16_90k.yaml +44 -0
- configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml +16 -0
- configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml +18 -0
- configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml +15 -0
- configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml +15 -0
- configs/coco/instance-segmentation/Base-COCO-InstanceSegmentation.yaml +47 -0
- configs/coco/instance-segmentation/maskformer2_R101_bs16_50ep.yaml +11 -0
- configs/coco/instance-segmentation/maskformer2_R50_bs16_50ep.yaml +44 -0
- configs/coco/instance-segmentation/swin/maskformer2_swin_base_384_bs16_50ep.yaml +16 -0
- configs/coco/instance-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_50ep.yaml +16 -0
- configs/coco/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml +21 -0
- configs/coco/instance-segmentation/swin/maskformer2_swin_small_bs16_50ep.yaml +15 -0
- configs/coco/instance-segmentation/swin/maskformer2_swin_tiny_bs16_50ep.yaml +15 -0
- configs/coco/panoptic-segmentation/Base-COCO-PanopticSegmentation.yaml +47 -0
- configs/coco/panoptic-segmentation/maskformer2_R101_bs16_50ep.yaml +11 -0
- configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml +45 -0
- configs/coco/panoptic-segmentation/swin/maskformer2_swin_base_384_bs16_50ep.yaml +16 -0
- configs/coco/panoptic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_50ep.yaml +16 -0
- configs/coco/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml +21 -0
cog.yaml
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
build:
|
2 |
+
gpu: true
|
3 |
+
cuda: "10.1"
|
4 |
+
python_version: "3.8"
|
5 |
+
system_packages:
|
6 |
+
- "libgl1-mesa-glx"
|
7 |
+
- "libglib2.0-0"
|
8 |
+
python_packages:
|
9 |
+
- "ipython==7.30.1"
|
10 |
+
- "numpy==1.21.4"
|
11 |
+
- "torch==1.8.1"
|
12 |
+
- "torchvision==0.9.1"
|
13 |
+
- "opencv-python==4.5.5.62"
|
14 |
+
- "Shapely==1.8.0"
|
15 |
+
- "h5py==3.6.0"
|
16 |
+
- "scipy==1.7.3"
|
17 |
+
- "submitit==1.4.1"
|
18 |
+
- "scikit-image==0.19.1"
|
19 |
+
- "Cython==0.29.27"
|
20 |
+
- "timm==0.4.12"
|
21 |
+
run:
|
22 |
+
- pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.8/index.html
|
23 |
+
- pip install git+https://github.com/cocodataset/panopticapi.git
|
24 |
+
- pip install git+https://github.com/mcordts/cityscapesScripts.git
|
25 |
+
- git clone https://github.com/facebookresearch/Mask2Former
|
26 |
+
- TORCH_CUDA_ARCH_LIST='7.5' FORCE_CUDA=1 python Mask2Former/mask2former/modeling/pixel_decoder/ops/setup.py build install
|
27 |
+
|
28 |
+
predict: "predict.py:Predictor"
|
configs/ade20k/instance-segmentation/Base-ADE20K-InstanceSegmentation.yaml
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MODEL:
|
2 |
+
BACKBONE:
|
3 |
+
FREEZE_AT: 0
|
4 |
+
NAME: "build_resnet_backbone"
|
5 |
+
WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
|
6 |
+
PIXEL_MEAN: [123.675, 116.280, 103.530]
|
7 |
+
PIXEL_STD: [58.395, 57.120, 57.375]
|
8 |
+
RESNETS:
|
9 |
+
DEPTH: 50
|
10 |
+
STEM_TYPE: "basic" # not used
|
11 |
+
STEM_OUT_CHANNELS: 64
|
12 |
+
STRIDE_IN_1X1: False
|
13 |
+
OUT_FEATURES: ["res2", "res3", "res4", "res5"]
|
14 |
+
# NORM: "SyncBN"
|
15 |
+
RES5_MULTI_GRID: [1, 1, 1] # not used
|
16 |
+
DATASETS:
|
17 |
+
TRAIN: ("ade20k_instance_train",)
|
18 |
+
TEST: ("ade20k_instance_val",)
|
19 |
+
SOLVER:
|
20 |
+
IMS_PER_BATCH: 16
|
21 |
+
BASE_LR: 0.0001
|
22 |
+
MAX_ITER: 160000
|
23 |
+
WARMUP_FACTOR: 1.0
|
24 |
+
WARMUP_ITERS: 0
|
25 |
+
WEIGHT_DECAY: 0.05
|
26 |
+
OPTIMIZER: "ADAMW"
|
27 |
+
LR_SCHEDULER_NAME: "WarmupPolyLR"
|
28 |
+
BACKBONE_MULTIPLIER: 0.1
|
29 |
+
CLIP_GRADIENTS:
|
30 |
+
ENABLED: True
|
31 |
+
CLIP_TYPE: "full_model"
|
32 |
+
CLIP_VALUE: 0.01
|
33 |
+
NORM_TYPE: 2.0
|
34 |
+
AMP:
|
35 |
+
ENABLED: True
|
36 |
+
INPUT:
|
37 |
+
MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"]
|
38 |
+
MIN_SIZE_TRAIN_SAMPLING: "choice"
|
39 |
+
MIN_SIZE_TEST: 640
|
40 |
+
MAX_SIZE_TRAIN: 2560
|
41 |
+
MAX_SIZE_TEST: 2560
|
42 |
+
CROP:
|
43 |
+
ENABLED: True
|
44 |
+
TYPE: "absolute"
|
45 |
+
SIZE: (640, 640)
|
46 |
+
SINGLE_CATEGORY_MAX_AREA: 1.0
|
47 |
+
COLOR_AUG_SSD: True
|
48 |
+
SIZE_DIVISIBILITY: 640 # used in dataset mapper
|
49 |
+
FORMAT: "RGB"
|
50 |
+
DATASET_MAPPER_NAME: "mask_former_instance"
|
51 |
+
TEST:
|
52 |
+
EVAL_PERIOD: 5000
|
53 |
+
AUG:
|
54 |
+
ENABLED: False
|
55 |
+
MIN_SIZES: [320, 480, 640, 800, 960, 1120]
|
56 |
+
MAX_SIZE: 4480
|
57 |
+
FLIP: True
|
58 |
+
DATALOADER:
|
59 |
+
FILTER_EMPTY_ANNOTATIONS: True
|
60 |
+
NUM_WORKERS: 4
|
61 |
+
VERSION: 2
|
configs/ade20k/instance-segmentation/maskformer2_R50_bs16_160k.yaml
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: Base-ADE20K-InstanceSegmentation.yaml
|
2 |
+
MODEL:
|
3 |
+
META_ARCHITECTURE: "MaskFormer"
|
4 |
+
SEM_SEG_HEAD:
|
5 |
+
NAME: "MaskFormerHead"
|
6 |
+
IGNORE_VALUE: 255
|
7 |
+
NUM_CLASSES: 100
|
8 |
+
LOSS_WEIGHT: 1.0
|
9 |
+
CONVS_DIM: 256
|
10 |
+
MASK_DIM: 256
|
11 |
+
NORM: "GN"
|
12 |
+
# pixel decoder
|
13 |
+
PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
|
14 |
+
IN_FEATURES: ["res2", "res3", "res4", "res5"]
|
15 |
+
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
|
16 |
+
COMMON_STRIDE: 4
|
17 |
+
TRANSFORMER_ENC_LAYERS: 6
|
18 |
+
MASK_FORMER:
|
19 |
+
TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
|
20 |
+
TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
|
21 |
+
DEEP_SUPERVISION: True
|
22 |
+
NO_OBJECT_WEIGHT: 0.1
|
23 |
+
CLASS_WEIGHT: 2.0
|
24 |
+
MASK_WEIGHT: 5.0
|
25 |
+
DICE_WEIGHT: 5.0
|
26 |
+
HIDDEN_DIM: 256
|
27 |
+
NUM_OBJECT_QUERIES: 100
|
28 |
+
NHEADS: 8
|
29 |
+
DROPOUT: 0.0
|
30 |
+
DIM_FEEDFORWARD: 2048
|
31 |
+
ENC_LAYERS: 0
|
32 |
+
PRE_NORM: False
|
33 |
+
ENFORCE_INPUT_PROJ: False
|
34 |
+
SIZE_DIVISIBILITY: 32
|
35 |
+
DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
|
36 |
+
TRAIN_NUM_POINTS: 12544
|
37 |
+
OVERSAMPLE_RATIO: 3.0
|
38 |
+
IMPORTANCE_SAMPLE_RATIO: 0.75
|
39 |
+
TEST:
|
40 |
+
SEMANTIC_ON: True
|
41 |
+
INSTANCE_ON: True
|
42 |
+
PANOPTIC_ON: True
|
43 |
+
OVERLAP_THRESHOLD: 0.8
|
44 |
+
OBJECT_MASK_THRESHOLD: 0.8
|
configs/ade20k/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k.yaml
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: ../maskformer2_R50_bs16_160k.yaml
|
2 |
+
MODEL:
|
3 |
+
BACKBONE:
|
4 |
+
NAME: "D2SwinTransformer"
|
5 |
+
SWIN:
|
6 |
+
EMBED_DIM: 192
|
7 |
+
DEPTHS: [2, 2, 18, 2]
|
8 |
+
NUM_HEADS: [6, 12, 24, 48]
|
9 |
+
WINDOW_SIZE: 12
|
10 |
+
APE: False
|
11 |
+
DROP_PATH_RATE: 0.3
|
12 |
+
PATCH_NORM: True
|
13 |
+
PRETRAIN_IMG_SIZE: 384
|
14 |
+
WEIGHTS: "swin_large_patch4_window12_384_22k.pkl"
|
15 |
+
PIXEL_MEAN: [123.675, 116.280, 103.530]
|
16 |
+
PIXEL_STD: [58.395, 57.120, 57.375]
|
17 |
+
MASK_FORMER:
|
18 |
+
NUM_OBJECT_QUERIES: 200
|
configs/ade20k/panoptic-segmentation/Base-ADE20K-PanopticSegmentation.yaml
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MODEL:
|
2 |
+
BACKBONE:
|
3 |
+
FREEZE_AT: 0
|
4 |
+
NAME: "build_resnet_backbone"
|
5 |
+
WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
|
6 |
+
PIXEL_MEAN: [123.675, 116.280, 103.530]
|
7 |
+
PIXEL_STD: [58.395, 57.120, 57.375]
|
8 |
+
RESNETS:
|
9 |
+
DEPTH: 50
|
10 |
+
STEM_TYPE: "basic" # not used
|
11 |
+
STEM_OUT_CHANNELS: 64
|
12 |
+
STRIDE_IN_1X1: False
|
13 |
+
OUT_FEATURES: ["res2", "res3", "res4", "res5"]
|
14 |
+
# NORM: "SyncBN"
|
15 |
+
RES5_MULTI_GRID: [1, 1, 1] # not used
|
16 |
+
DATASETS:
|
17 |
+
TRAIN: ("ade20k_panoptic_train",)
|
18 |
+
TEST: ("ade20k_panoptic_val",)
|
19 |
+
SOLVER:
|
20 |
+
IMS_PER_BATCH: 16
|
21 |
+
BASE_LR: 0.0001
|
22 |
+
MAX_ITER: 160000
|
23 |
+
WARMUP_FACTOR: 1.0
|
24 |
+
WARMUP_ITERS: 0
|
25 |
+
WEIGHT_DECAY: 0.05
|
26 |
+
OPTIMIZER: "ADAMW"
|
27 |
+
LR_SCHEDULER_NAME: "WarmupPolyLR"
|
28 |
+
BACKBONE_MULTIPLIER: 0.1
|
29 |
+
CLIP_GRADIENTS:
|
30 |
+
ENABLED: True
|
31 |
+
CLIP_TYPE: "full_model"
|
32 |
+
CLIP_VALUE: 0.01
|
33 |
+
NORM_TYPE: 2.0
|
34 |
+
AMP:
|
35 |
+
ENABLED: True
|
36 |
+
INPUT:
|
37 |
+
MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"]
|
38 |
+
MIN_SIZE_TRAIN_SAMPLING: "choice"
|
39 |
+
MIN_SIZE_TEST: 640
|
40 |
+
MAX_SIZE_TRAIN: 2560
|
41 |
+
MAX_SIZE_TEST: 2560
|
42 |
+
CROP:
|
43 |
+
ENABLED: True
|
44 |
+
TYPE: "absolute"
|
45 |
+
SIZE: (640, 640)
|
46 |
+
SINGLE_CATEGORY_MAX_AREA: 1.0
|
47 |
+
COLOR_AUG_SSD: True
|
48 |
+
SIZE_DIVISIBILITY: 640 # used in dataset mapper
|
49 |
+
FORMAT: "RGB"
|
50 |
+
DATASET_MAPPER_NAME: "mask_former_panoptic"
|
51 |
+
TEST:
|
52 |
+
EVAL_PERIOD: 5000
|
53 |
+
AUG:
|
54 |
+
ENABLED: False
|
55 |
+
MIN_SIZES: [320, 480, 640, 800, 960, 1120]
|
56 |
+
MAX_SIZE: 4480
|
57 |
+
FLIP: True
|
58 |
+
DATALOADER:
|
59 |
+
FILTER_EMPTY_ANNOTATIONS: True
|
60 |
+
NUM_WORKERS: 4
|
61 |
+
VERSION: 2
|
configs/ade20k/panoptic-segmentation/maskformer2_R50_bs16_160k.yaml
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: Base-ADE20K-PanopticSegmentation.yaml
|
2 |
+
MODEL:
|
3 |
+
META_ARCHITECTURE: "MaskFormer"
|
4 |
+
SEM_SEG_HEAD:
|
5 |
+
NAME: "MaskFormerHead"
|
6 |
+
IGNORE_VALUE: 255
|
7 |
+
NUM_CLASSES: 150
|
8 |
+
LOSS_WEIGHT: 1.0
|
9 |
+
CONVS_DIM: 256
|
10 |
+
MASK_DIM: 256
|
11 |
+
NORM: "GN"
|
12 |
+
# pixel decoder
|
13 |
+
PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
|
14 |
+
IN_FEATURES: ["res2", "res3", "res4", "res5"]
|
15 |
+
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
|
16 |
+
COMMON_STRIDE: 4
|
17 |
+
TRANSFORMER_ENC_LAYERS: 6
|
18 |
+
MASK_FORMER:
|
19 |
+
TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
|
20 |
+
TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
|
21 |
+
DEEP_SUPERVISION: True
|
22 |
+
NO_OBJECT_WEIGHT: 0.1
|
23 |
+
CLASS_WEIGHT: 2.0
|
24 |
+
MASK_WEIGHT: 5.0
|
25 |
+
DICE_WEIGHT: 5.0
|
26 |
+
HIDDEN_DIM: 256
|
27 |
+
NUM_OBJECT_QUERIES: 100
|
28 |
+
NHEADS: 8
|
29 |
+
DROPOUT: 0.0
|
30 |
+
DIM_FEEDFORWARD: 2048
|
31 |
+
ENC_LAYERS: 0
|
32 |
+
PRE_NORM: False
|
33 |
+
ENFORCE_INPUT_PROJ: False
|
34 |
+
SIZE_DIVISIBILITY: 32
|
35 |
+
DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
|
36 |
+
TRAIN_NUM_POINTS: 12544
|
37 |
+
OVERSAMPLE_RATIO: 3.0
|
38 |
+
IMPORTANCE_SAMPLE_RATIO: 0.75
|
39 |
+
TEST:
|
40 |
+
SEMANTIC_ON: True
|
41 |
+
INSTANCE_ON: True
|
42 |
+
PANOPTIC_ON: True
|
43 |
+
OVERLAP_THRESHOLD: 0.8
|
44 |
+
OBJECT_MASK_THRESHOLD: 0.8
|
configs/ade20k/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k.yaml
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: ../maskformer2_R50_bs16_160k.yaml
|
2 |
+
MODEL:
|
3 |
+
BACKBONE:
|
4 |
+
NAME: "D2SwinTransformer"
|
5 |
+
SWIN:
|
6 |
+
EMBED_DIM: 192
|
7 |
+
DEPTHS: [2, 2, 18, 2]
|
8 |
+
NUM_HEADS: [6, 12, 24, 48]
|
9 |
+
WINDOW_SIZE: 12
|
10 |
+
APE: False
|
11 |
+
DROP_PATH_RATE: 0.3
|
12 |
+
PATCH_NORM: True
|
13 |
+
PRETRAIN_IMG_SIZE: 384
|
14 |
+
WEIGHTS: "swin_large_patch4_window12_384_22k.pkl"
|
15 |
+
PIXEL_MEAN: [123.675, 116.280, 103.530]
|
16 |
+
PIXEL_STD: [58.395, 57.120, 57.375]
|
17 |
+
MASK_FORMER:
|
18 |
+
NUM_OBJECT_QUERIES: 200
|
configs/ade20k/semantic-segmentation/Base-ADE20K-SemanticSegmentation.yaml
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MODEL:
|
2 |
+
BACKBONE:
|
3 |
+
FREEZE_AT: 0
|
4 |
+
NAME: "build_resnet_backbone"
|
5 |
+
WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
|
6 |
+
PIXEL_MEAN: [123.675, 116.280, 103.530]
|
7 |
+
PIXEL_STD: [58.395, 57.120, 57.375]
|
8 |
+
RESNETS:
|
9 |
+
DEPTH: 50
|
10 |
+
STEM_TYPE: "basic" # not used
|
11 |
+
STEM_OUT_CHANNELS: 64
|
12 |
+
STRIDE_IN_1X1: False
|
13 |
+
OUT_FEATURES: ["res2", "res3", "res4", "res5"]
|
14 |
+
# NORM: "SyncBN"
|
15 |
+
RES5_MULTI_GRID: [1, 1, 1] # not used
|
16 |
+
DATASETS:
|
17 |
+
TRAIN: ("ade20k_sem_seg_train",)
|
18 |
+
TEST: ("ade20k_sem_seg_val",)
|
19 |
+
SOLVER:
|
20 |
+
IMS_PER_BATCH: 16
|
21 |
+
BASE_LR: 0.0001
|
22 |
+
MAX_ITER: 160000
|
23 |
+
WARMUP_FACTOR: 1.0
|
24 |
+
WARMUP_ITERS: 0
|
25 |
+
WEIGHT_DECAY: 0.05
|
26 |
+
OPTIMIZER: "ADAMW"
|
27 |
+
LR_SCHEDULER_NAME: "WarmupPolyLR"
|
28 |
+
BACKBONE_MULTIPLIER: 0.1
|
29 |
+
CLIP_GRADIENTS:
|
30 |
+
ENABLED: True
|
31 |
+
CLIP_TYPE: "full_model"
|
32 |
+
CLIP_VALUE: 0.01
|
33 |
+
NORM_TYPE: 2.0
|
34 |
+
AMP:
|
35 |
+
ENABLED: True
|
36 |
+
INPUT:
|
37 |
+
MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 512) for x in range(5, 21)]"]
|
38 |
+
MIN_SIZE_TRAIN_SAMPLING: "choice"
|
39 |
+
MIN_SIZE_TEST: 512
|
40 |
+
MAX_SIZE_TRAIN: 2048
|
41 |
+
MAX_SIZE_TEST: 2048
|
42 |
+
CROP:
|
43 |
+
ENABLED: True
|
44 |
+
TYPE: "absolute"
|
45 |
+
SIZE: (512, 512)
|
46 |
+
SINGLE_CATEGORY_MAX_AREA: 1.0
|
47 |
+
COLOR_AUG_SSD: True
|
48 |
+
SIZE_DIVISIBILITY: 512 # used in dataset mapper
|
49 |
+
FORMAT: "RGB"
|
50 |
+
DATASET_MAPPER_NAME: "mask_former_semantic"
|
51 |
+
TEST:
|
52 |
+
EVAL_PERIOD: 5000
|
53 |
+
AUG:
|
54 |
+
ENABLED: False
|
55 |
+
MIN_SIZES: [256, 384, 512, 640, 768, 896]
|
56 |
+
MAX_SIZE: 3584
|
57 |
+
FLIP: True
|
58 |
+
DATALOADER:
|
59 |
+
FILTER_EMPTY_ANNOTATIONS: True
|
60 |
+
NUM_WORKERS: 4
|
61 |
+
VERSION: 2
|
configs/ade20k/semantic-segmentation/maskformer2_R101_bs16_90k.yaml
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: maskformer2_R50_bs16_160k.yaml
|
2 |
+
MODEL:
|
3 |
+
WEIGHTS: "R-101.pkl"
|
4 |
+
RESNETS:
|
5 |
+
DEPTH: 101
|
6 |
+
STEM_TYPE: "basic" # not used
|
7 |
+
STEM_OUT_CHANNELS: 64
|
8 |
+
STRIDE_IN_1X1: False
|
9 |
+
OUT_FEATURES: ["res2", "res3", "res4", "res5"]
|
10 |
+
NORM: "SyncBN"
|
11 |
+
RES5_MULTI_GRID: [1, 1, 1] # not used
|
configs/ade20k/semantic-segmentation/maskformer2_R50_bs16_160k.yaml
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: Base-ADE20K-SemanticSegmentation.yaml
|
2 |
+
MODEL:
|
3 |
+
META_ARCHITECTURE: "MaskFormer"
|
4 |
+
SEM_SEG_HEAD:
|
5 |
+
NAME: "MaskFormerHead"
|
6 |
+
IGNORE_VALUE: 255
|
7 |
+
NUM_CLASSES: 150
|
8 |
+
LOSS_WEIGHT: 1.0
|
9 |
+
CONVS_DIM: 256
|
10 |
+
MASK_DIM: 256
|
11 |
+
NORM: "GN"
|
12 |
+
# pixel decoder
|
13 |
+
PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
|
14 |
+
IN_FEATURES: ["res2", "res3", "res4", "res5"]
|
15 |
+
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
|
16 |
+
COMMON_STRIDE: 4
|
17 |
+
TRANSFORMER_ENC_LAYERS: 6
|
18 |
+
MASK_FORMER:
|
19 |
+
TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
|
20 |
+
TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
|
21 |
+
DEEP_SUPERVISION: True
|
22 |
+
NO_OBJECT_WEIGHT: 0.1
|
23 |
+
CLASS_WEIGHT: 2.0
|
24 |
+
MASK_WEIGHT: 5.0
|
25 |
+
DICE_WEIGHT: 5.0
|
26 |
+
HIDDEN_DIM: 256
|
27 |
+
NUM_OBJECT_QUERIES: 100
|
28 |
+
NHEADS: 8
|
29 |
+
DROPOUT: 0.0
|
30 |
+
DIM_FEEDFORWARD: 2048
|
31 |
+
ENC_LAYERS: 0
|
32 |
+
PRE_NORM: False
|
33 |
+
ENFORCE_INPUT_PROJ: False
|
34 |
+
SIZE_DIVISIBILITY: 32
|
35 |
+
DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
|
36 |
+
TRAIN_NUM_POINTS: 12544
|
37 |
+
OVERSAMPLE_RATIO: 3.0
|
38 |
+
IMPORTANCE_SAMPLE_RATIO: 0.75
|
39 |
+
TEST:
|
40 |
+
SEMANTIC_ON: True
|
41 |
+
INSTANCE_ON: False
|
42 |
+
PANOPTIC_ON: False
|
43 |
+
OVERLAP_THRESHOLD: 0.8
|
44 |
+
OBJECT_MASK_THRESHOLD: 0.8
|
configs/ade20k/semantic-segmentation/swin/maskformer2_swin_base_384_bs16_160k_res640.yaml
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: ../maskformer2_R50_bs16_160k.yaml
|
2 |
+
MODEL:
|
3 |
+
BACKBONE:
|
4 |
+
NAME: "D2SwinTransformer"
|
5 |
+
SWIN:
|
6 |
+
EMBED_DIM: 128
|
7 |
+
DEPTHS: [2, 2, 18, 2]
|
8 |
+
NUM_HEADS: [4, 8, 16, 32]
|
9 |
+
WINDOW_SIZE: 12
|
10 |
+
APE: False
|
11 |
+
DROP_PATH_RATE: 0.3
|
12 |
+
PATCH_NORM: True
|
13 |
+
PRETRAIN_IMG_SIZE: 384
|
14 |
+
WEIGHTS: "swin_base_patch4_window12_384.pkl"
|
15 |
+
PIXEL_MEAN: [123.675, 116.280, 103.530]
|
16 |
+
PIXEL_STD: [58.395, 57.120, 57.375]
|
17 |
+
INPUT:
|
18 |
+
MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"]
|
19 |
+
MIN_SIZE_TRAIN_SAMPLING: "choice"
|
20 |
+
MIN_SIZE_TEST: 640
|
21 |
+
MAX_SIZE_TRAIN: 2560
|
22 |
+
MAX_SIZE_TEST: 2560
|
23 |
+
CROP:
|
24 |
+
ENABLED: True
|
25 |
+
TYPE: "absolute"
|
26 |
+
SIZE: (640, 640)
|
27 |
+
SINGLE_CATEGORY_MAX_AREA: 1.0
|
28 |
+
COLOR_AUG_SSD: True
|
29 |
+
SIZE_DIVISIBILITY: 640 # used in dataset mapper
|
30 |
+
FORMAT: "RGB"
|
31 |
+
TEST:
|
32 |
+
EVAL_PERIOD: 5000
|
33 |
+
AUG:
|
34 |
+
ENABLED: False
|
35 |
+
MIN_SIZES: [320, 480, 640, 800, 960, 1120]
|
36 |
+
MAX_SIZE: 4480
|
37 |
+
FLIP: True
|
configs/ade20k/semantic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_160k_res640.yaml
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: ../maskformer2_R50_bs16_160k.yaml
|
2 |
+
MODEL:
|
3 |
+
BACKBONE:
|
4 |
+
NAME: "D2SwinTransformer"
|
5 |
+
SWIN:
|
6 |
+
EMBED_DIM: 128
|
7 |
+
DEPTHS: [2, 2, 18, 2]
|
8 |
+
NUM_HEADS: [4, 8, 16, 32]
|
9 |
+
WINDOW_SIZE: 12
|
10 |
+
APE: False
|
11 |
+
DROP_PATH_RATE: 0.3
|
12 |
+
PATCH_NORM: True
|
13 |
+
PRETRAIN_IMG_SIZE: 384
|
14 |
+
WEIGHTS: "swin_base_patch4_window12_384_22k.pkl"
|
15 |
+
PIXEL_MEAN: [123.675, 116.280, 103.530]
|
16 |
+
PIXEL_STD: [58.395, 57.120, 57.375]
|
17 |
+
INPUT:
|
18 |
+
MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"]
|
19 |
+
MIN_SIZE_TRAIN_SAMPLING: "choice"
|
20 |
+
MIN_SIZE_TEST: 640
|
21 |
+
MAX_SIZE_TRAIN: 2560
|
22 |
+
MAX_SIZE_TEST: 2560
|
23 |
+
CROP:
|
24 |
+
ENABLED: True
|
25 |
+
TYPE: "absolute"
|
26 |
+
SIZE: (640, 640)
|
27 |
+
SINGLE_CATEGORY_MAX_AREA: 1.0
|
28 |
+
COLOR_AUG_SSD: True
|
29 |
+
SIZE_DIVISIBILITY: 640 # used in dataset mapper
|
30 |
+
FORMAT: "RGB"
|
31 |
+
TEST:
|
32 |
+
EVAL_PERIOD: 5000
|
33 |
+
AUG:
|
34 |
+
ENABLED: False
|
35 |
+
MIN_SIZES: [320, 480, 640, 800, 960, 1120]
|
36 |
+
MAX_SIZE: 4480
|
37 |
+
FLIP: True
|
configs/ade20k/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k_res640.yaml
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: ../maskformer2_R50_bs16_160k.yaml
|
2 |
+
MODEL:
|
3 |
+
BACKBONE:
|
4 |
+
NAME: "D2SwinTransformer"
|
5 |
+
SWIN:
|
6 |
+
EMBED_DIM: 192
|
7 |
+
DEPTHS: [2, 2, 18, 2]
|
8 |
+
NUM_HEADS: [6, 12, 24, 48]
|
9 |
+
WINDOW_SIZE: 12
|
10 |
+
APE: False
|
11 |
+
DROP_PATH_RATE: 0.3
|
12 |
+
PATCH_NORM: True
|
13 |
+
PRETRAIN_IMG_SIZE: 384
|
14 |
+
WEIGHTS: "swin_large_patch4_window12_384_22k.pkl"
|
15 |
+
PIXEL_MEAN: [123.675, 116.280, 103.530]
|
16 |
+
PIXEL_STD: [58.395, 57.120, 57.375]
|
17 |
+
INPUT:
|
18 |
+
MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"]
|
19 |
+
MIN_SIZE_TRAIN_SAMPLING: "choice"
|
20 |
+
MIN_SIZE_TEST: 640
|
21 |
+
MAX_SIZE_TRAIN: 2560
|
22 |
+
MAX_SIZE_TEST: 2560
|
23 |
+
CROP:
|
24 |
+
ENABLED: True
|
25 |
+
TYPE: "absolute"
|
26 |
+
SIZE: (640, 640)
|
27 |
+
SINGLE_CATEGORY_MAX_AREA: 1.0
|
28 |
+
COLOR_AUG_SSD: True
|
29 |
+
SIZE_DIVISIBILITY: 640 # used in dataset mapper
|
30 |
+
FORMAT: "RGB"
|
31 |
+
TEST:
|
32 |
+
EVAL_PERIOD: 5000
|
33 |
+
AUG:
|
34 |
+
ENABLED: False
|
35 |
+
MIN_SIZES: [320, 480, 640, 800, 960, 1120]
|
36 |
+
MAX_SIZE: 4480
|
37 |
+
FLIP: True
|
configs/ade20k/semantic-segmentation/swin/maskformer2_swin_small_bs16_160k.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: ../maskformer2_R50_bs16_160k.yaml
|
2 |
+
MODEL:
|
3 |
+
BACKBONE:
|
4 |
+
NAME: "D2SwinTransformer"
|
5 |
+
SWIN:
|
6 |
+
EMBED_DIM: 96
|
7 |
+
DEPTHS: [2, 2, 18, 2]
|
8 |
+
NUM_HEADS: [3, 6, 12, 24]
|
9 |
+
WINDOW_SIZE: 7
|
10 |
+
APE: False
|
11 |
+
DROP_PATH_RATE: 0.3
|
12 |
+
PATCH_NORM: True
|
13 |
+
WEIGHTS: "swin_small_patch4_window7_224.pkl"
|
14 |
+
PIXEL_MEAN: [123.675, 116.280, 103.530]
|
15 |
+
PIXEL_STD: [58.395, 57.120, 57.375]
|
configs/ade20k/semantic-segmentation/swin/maskformer2_swin_tiny_bs16_160k.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: ../maskformer2_R50_bs16_160k.yaml
|
2 |
+
MODEL:
|
3 |
+
BACKBONE:
|
4 |
+
NAME: "D2SwinTransformer"
|
5 |
+
SWIN:
|
6 |
+
EMBED_DIM: 96
|
7 |
+
DEPTHS: [2, 2, 6, 2]
|
8 |
+
NUM_HEADS: [3, 6, 12, 24]
|
9 |
+
WINDOW_SIZE: 7
|
10 |
+
APE: False
|
11 |
+
DROP_PATH_RATE: 0.3
|
12 |
+
PATCH_NORM: True
|
13 |
+
WEIGHTS: "swin_tiny_patch4_window7_224.pkl"
|
14 |
+
PIXEL_MEAN: [123.675, 116.280, 103.530]
|
15 |
+
PIXEL_STD: [58.395, 57.120, 57.375]
|
configs/cityscapes/instance-segmentation/Base-Cityscapes-InstanceSegmentation.yaml
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MODEL:
|
2 |
+
BACKBONE:
|
3 |
+
FREEZE_AT: 0
|
4 |
+
NAME: "build_resnet_backbone"
|
5 |
+
WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
|
6 |
+
PIXEL_MEAN: [123.675, 116.280, 103.530]
|
7 |
+
PIXEL_STD: [58.395, 57.120, 57.375]
|
8 |
+
RESNETS:
|
9 |
+
DEPTH: 50
|
10 |
+
STEM_TYPE: "basic" # not used
|
11 |
+
STEM_OUT_CHANNELS: 64
|
12 |
+
STRIDE_IN_1X1: False
|
13 |
+
OUT_FEATURES: ["res2", "res3", "res4", "res5"]
|
14 |
+
NORM: "SyncBN" # use syncbn for cityscapes dataset
|
15 |
+
RES5_MULTI_GRID: [1, 1, 1] # not used
|
16 |
+
DATASETS:
|
17 |
+
TRAIN: ("cityscapes_fine_instance_seg_train",)
|
18 |
+
TEST: ("cityscapes_fine_instance_seg_val",)
|
19 |
+
SOLVER:
|
20 |
+
IMS_PER_BATCH: 16
|
21 |
+
BASE_LR: 0.0001
|
22 |
+
MAX_ITER: 90000
|
23 |
+
WARMUP_FACTOR: 1.0
|
24 |
+
WARMUP_ITERS: 0
|
25 |
+
WEIGHT_DECAY: 0.05
|
26 |
+
OPTIMIZER: "ADAMW"
|
27 |
+
LR_SCHEDULER_NAME: "WarmupPolyLR"
|
28 |
+
BACKBONE_MULTIPLIER: 0.1
|
29 |
+
CLIP_GRADIENTS:
|
30 |
+
ENABLED: True
|
31 |
+
CLIP_TYPE: "full_model"
|
32 |
+
CLIP_VALUE: 0.01
|
33 |
+
NORM_TYPE: 2.0
|
34 |
+
AMP:
|
35 |
+
ENABLED: True
|
36 |
+
INPUT:
|
37 |
+
MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"]
|
38 |
+
MIN_SIZE_TRAIN_SAMPLING: "choice"
|
39 |
+
MIN_SIZE_TEST: 1024
|
40 |
+
MAX_SIZE_TRAIN: 4096
|
41 |
+
MAX_SIZE_TEST: 2048
|
42 |
+
CROP:
|
43 |
+
ENABLED: True
|
44 |
+
TYPE: "absolute"
|
45 |
+
SIZE: (512, 1024)
|
46 |
+
SINGLE_CATEGORY_MAX_AREA: 1.0
|
47 |
+
COLOR_AUG_SSD: True
|
48 |
+
SIZE_DIVISIBILITY: -1
|
49 |
+
FORMAT: "RGB"
|
50 |
+
DATASET_MAPPER_NAME: "mask_former_instance"
|
51 |
+
TEST:
|
52 |
+
EVAL_PERIOD: 5000
|
53 |
+
AUG:
|
54 |
+
ENABLED: False
|
55 |
+
MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792]
|
56 |
+
MAX_SIZE: 4096
|
57 |
+
FLIP: True
|
58 |
+
DATALOADER:
|
59 |
+
FILTER_EMPTY_ANNOTATIONS: True
|
60 |
+
NUM_WORKERS: 4
|
61 |
+
VERSION: 2
|
configs/cityscapes/instance-segmentation/maskformer2_R101_bs16_90k.yaml
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: maskformer2_R50_bs16_90k.yaml
|
2 |
+
MODEL:
|
3 |
+
WEIGHTS: "R-101.pkl"
|
4 |
+
RESNETS:
|
5 |
+
DEPTH: 101
|
6 |
+
STEM_TYPE: "basic" # not used
|
7 |
+
STEM_OUT_CHANNELS: 64
|
8 |
+
STRIDE_IN_1X1: False
|
9 |
+
OUT_FEATURES: ["res2", "res3", "res4", "res5"]
|
10 |
+
NORM: "SyncBN"
|
11 |
+
RES5_MULTI_GRID: [1, 1, 1] # not used
|
configs/cityscapes/instance-segmentation/maskformer2_R50_bs16_90k.yaml
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: Base-Cityscapes-InstanceSegmentation.yaml
|
2 |
+
MODEL:
|
3 |
+
META_ARCHITECTURE: "MaskFormer"
|
4 |
+
SEM_SEG_HEAD:
|
5 |
+
NAME: "MaskFormerHead"
|
6 |
+
IGNORE_VALUE: 255
|
7 |
+
NUM_CLASSES: 8
|
8 |
+
LOSS_WEIGHT: 1.0
|
9 |
+
CONVS_DIM: 256
|
10 |
+
MASK_DIM: 256
|
11 |
+
NORM: "GN"
|
12 |
+
# pixel decoder
|
13 |
+
PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
|
14 |
+
IN_FEATURES: ["res2", "res3", "res4", "res5"]
|
15 |
+
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
|
16 |
+
COMMON_STRIDE: 4
|
17 |
+
TRANSFORMER_ENC_LAYERS: 6
|
18 |
+
MASK_FORMER:
|
19 |
+
TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
|
20 |
+
TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
|
21 |
+
DEEP_SUPERVISION: True
|
22 |
+
NO_OBJECT_WEIGHT: 0.1
|
23 |
+
CLASS_WEIGHT: 2.0
|
24 |
+
MASK_WEIGHT: 5.0
|
25 |
+
DICE_WEIGHT: 5.0
|
26 |
+
HIDDEN_DIM: 256
|
27 |
+
NUM_OBJECT_QUERIES: 100
|
28 |
+
NHEADS: 8
|
29 |
+
DROPOUT: 0.0
|
30 |
+
DIM_FEEDFORWARD: 2048
|
31 |
+
ENC_LAYERS: 0
|
32 |
+
PRE_NORM: False
|
33 |
+
ENFORCE_INPUT_PROJ: False
|
34 |
+
SIZE_DIVISIBILITY: 32
|
35 |
+
DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
|
36 |
+
TRAIN_NUM_POINTS: 12544
|
37 |
+
OVERSAMPLE_RATIO: 3.0
|
38 |
+
IMPORTANCE_SAMPLE_RATIO: 0.75
|
39 |
+
TEST:
|
40 |
+
SEMANTIC_ON: False
|
41 |
+
INSTANCE_ON: True
|
42 |
+
PANOPTIC_ON: False
|
43 |
+
OVERLAP_THRESHOLD: 0.8
|
44 |
+
OBJECT_MASK_THRESHOLD: 0.8
|
configs/cityscapes/instance-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: ../maskformer2_R50_bs16_90k.yaml
|
2 |
+
MODEL:
|
3 |
+
BACKBONE:
|
4 |
+
NAME: "D2SwinTransformer"
|
5 |
+
SWIN:
|
6 |
+
EMBED_DIM: 128
|
7 |
+
DEPTHS: [2, 2, 18, 2]
|
8 |
+
NUM_HEADS: [4, 8, 16, 32]
|
9 |
+
WINDOW_SIZE: 12
|
10 |
+
APE: False
|
11 |
+
DROP_PATH_RATE: 0.3
|
12 |
+
PATCH_NORM: True
|
13 |
+
PRETRAIN_IMG_SIZE: 384
|
14 |
+
WEIGHTS: "swin_base_patch4_window12_384_22k.pkl"
|
15 |
+
PIXEL_MEAN: [123.675, 116.280, 103.530]
|
16 |
+
PIXEL_STD: [58.395, 57.120, 57.375]
|
configs/cityscapes/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: ../maskformer2_R50_bs16_90k.yaml
|
2 |
+
MODEL:
|
3 |
+
BACKBONE:
|
4 |
+
NAME: "D2SwinTransformer"
|
5 |
+
SWIN:
|
6 |
+
EMBED_DIM: 192
|
7 |
+
DEPTHS: [2, 2, 18, 2]
|
8 |
+
NUM_HEADS: [6, 12, 24, 48]
|
9 |
+
WINDOW_SIZE: 12
|
10 |
+
APE: False
|
11 |
+
DROP_PATH_RATE: 0.3
|
12 |
+
PATCH_NORM: True
|
13 |
+
PRETRAIN_IMG_SIZE: 384
|
14 |
+
WEIGHTS: "swin_large_patch4_window12_384_22k.pkl"
|
15 |
+
PIXEL_MEAN: [123.675, 116.280, 103.530]
|
16 |
+
PIXEL_STD: [58.395, 57.120, 57.375]
|
17 |
+
MASK_FORMER:
|
18 |
+
NUM_OBJECT_QUERIES: 200
|
configs/cityscapes/instance-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: ../maskformer2_R50_bs16_90k.yaml
|
2 |
+
MODEL:
|
3 |
+
BACKBONE:
|
4 |
+
NAME: "D2SwinTransformer"
|
5 |
+
SWIN:
|
6 |
+
EMBED_DIM: 96
|
7 |
+
DEPTHS: [2, 2, 18, 2]
|
8 |
+
NUM_HEADS: [3, 6, 12, 24]
|
9 |
+
WINDOW_SIZE: 7
|
10 |
+
APE: False
|
11 |
+
DROP_PATH_RATE: 0.3
|
12 |
+
PATCH_NORM: True
|
13 |
+
WEIGHTS: "swin_small_patch4_window7_224.pkl"
|
14 |
+
PIXEL_MEAN: [123.675, 116.280, 103.530]
|
15 |
+
PIXEL_STD: [58.395, 57.120, 57.375]
|
configs/cityscapes/instance-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: ../maskformer2_R50_bs16_90k.yaml
|
2 |
+
MODEL:
|
3 |
+
BACKBONE:
|
4 |
+
NAME: "D2SwinTransformer"
|
5 |
+
SWIN:
|
6 |
+
EMBED_DIM: 96
|
7 |
+
DEPTHS: [2, 2, 6, 2]
|
8 |
+
NUM_HEADS: [3, 6, 12, 24]
|
9 |
+
WINDOW_SIZE: 7
|
10 |
+
APE: False
|
11 |
+
DROP_PATH_RATE: 0.3
|
12 |
+
PATCH_NORM: True
|
13 |
+
WEIGHTS: "swin_tiny_patch4_window7_224.pkl"
|
14 |
+
PIXEL_MEAN: [123.675, 116.280, 103.530]
|
15 |
+
PIXEL_STD: [58.395, 57.120, 57.375]
|
configs/cityscapes/panoptic-segmentation/Base-Cityscapes-PanopticSegmentation.yaml
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MODEL:
|
2 |
+
BACKBONE:
|
3 |
+
FREEZE_AT: 0
|
4 |
+
NAME: "build_resnet_backbone"
|
5 |
+
WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
|
6 |
+
PIXEL_MEAN: [123.675, 116.280, 103.530]
|
7 |
+
PIXEL_STD: [58.395, 57.120, 57.375]
|
8 |
+
RESNETS:
|
9 |
+
DEPTH: 50
|
10 |
+
STEM_TYPE: "basic" # not used
|
11 |
+
STEM_OUT_CHANNELS: 64
|
12 |
+
STRIDE_IN_1X1: False
|
13 |
+
OUT_FEATURES: ["res2", "res3", "res4", "res5"]
|
14 |
+
NORM: "SyncBN" # use syncbn for cityscapes dataset
|
15 |
+
RES5_MULTI_GRID: [1, 1, 1] # not used
|
16 |
+
DATASETS:
|
17 |
+
TRAIN: ("cityscapes_fine_panoptic_train",)
|
18 |
+
TEST: ("cityscapes_fine_panoptic_val",)
|
19 |
+
SOLVER:
|
20 |
+
IMS_PER_BATCH: 16
|
21 |
+
BASE_LR: 0.0001
|
22 |
+
MAX_ITER: 90000
|
23 |
+
WARMUP_FACTOR: 1.0
|
24 |
+
WARMUP_ITERS: 0
|
25 |
+
WEIGHT_DECAY: 0.05
|
26 |
+
OPTIMIZER: "ADAMW"
|
27 |
+
LR_SCHEDULER_NAME: "WarmupPolyLR"
|
28 |
+
BACKBONE_MULTIPLIER: 0.1
|
29 |
+
CLIP_GRADIENTS:
|
30 |
+
ENABLED: True
|
31 |
+
CLIP_TYPE: "full_model"
|
32 |
+
CLIP_VALUE: 0.01
|
33 |
+
NORM_TYPE: 2.0
|
34 |
+
AMP:
|
35 |
+
ENABLED: True
|
36 |
+
INPUT:
|
37 |
+
MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"]
|
38 |
+
MIN_SIZE_TRAIN_SAMPLING: "choice"
|
39 |
+
MIN_SIZE_TEST: 1024
|
40 |
+
MAX_SIZE_TRAIN: 4096
|
41 |
+
MAX_SIZE_TEST: 2048
|
42 |
+
CROP:
|
43 |
+
ENABLED: True
|
44 |
+
TYPE: "absolute"
|
45 |
+
SIZE: (512, 1024)
|
46 |
+
SINGLE_CATEGORY_MAX_AREA: 1.0
|
47 |
+
COLOR_AUG_SSD: True
|
48 |
+
SIZE_DIVISIBILITY: -1
|
49 |
+
FORMAT: "RGB"
|
50 |
+
DATASET_MAPPER_NAME: "mask_former_panoptic"
|
51 |
+
TEST:
|
52 |
+
EVAL_PERIOD: 5000
|
53 |
+
AUG:
|
54 |
+
ENABLED: False
|
55 |
+
MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792]
|
56 |
+
MAX_SIZE: 4096
|
57 |
+
FLIP: True
|
58 |
+
DATALOADER:
|
59 |
+
FILTER_EMPTY_ANNOTATIONS: True
|
60 |
+
NUM_WORKERS: 4
|
61 |
+
VERSION: 2
|
configs/cityscapes/panoptic-segmentation/maskformer2_R101_bs16_90k.yaml
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: maskformer2_R50_bs16_90k.yaml
|
2 |
+
MODEL:
|
3 |
+
WEIGHTS: "R-101.pkl"
|
4 |
+
RESNETS:
|
5 |
+
DEPTH: 101
|
6 |
+
STEM_TYPE: "basic" # not used
|
7 |
+
STEM_OUT_CHANNELS: 64
|
8 |
+
STRIDE_IN_1X1: False
|
9 |
+
OUT_FEATURES: ["res2", "res3", "res4", "res5"]
|
10 |
+
NORM: "SyncBN"
|
11 |
+
RES5_MULTI_GRID: [1, 1, 1] # not used
|
configs/cityscapes/panoptic-segmentation/maskformer2_R50_bs16_90k.yaml
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: Base-Cityscapes-PanopticSegmentation.yaml
|
2 |
+
MODEL:
|
3 |
+
META_ARCHITECTURE: "MaskFormer"
|
4 |
+
SEM_SEG_HEAD:
|
5 |
+
NAME: "MaskFormerHead"
|
6 |
+
IGNORE_VALUE: 255
|
7 |
+
NUM_CLASSES: 19
|
8 |
+
LOSS_WEIGHT: 1.0
|
9 |
+
CONVS_DIM: 256
|
10 |
+
MASK_DIM: 256
|
11 |
+
NORM: "GN"
|
12 |
+
# pixel decoder
|
13 |
+
PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
|
14 |
+
IN_FEATURES: ["res2", "res3", "res4", "res5"]
|
15 |
+
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
|
16 |
+
COMMON_STRIDE: 4
|
17 |
+
TRANSFORMER_ENC_LAYERS: 6
|
18 |
+
MASK_FORMER:
|
19 |
+
TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
|
20 |
+
TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
|
21 |
+
DEEP_SUPERVISION: True
|
22 |
+
NO_OBJECT_WEIGHT: 0.1
|
23 |
+
CLASS_WEIGHT: 2.0
|
24 |
+
MASK_WEIGHT: 5.0
|
25 |
+
DICE_WEIGHT: 5.0
|
26 |
+
HIDDEN_DIM: 256
|
27 |
+
NUM_OBJECT_QUERIES: 100
|
28 |
+
NHEADS: 8
|
29 |
+
DROPOUT: 0.0
|
30 |
+
DIM_FEEDFORWARD: 2048
|
31 |
+
ENC_LAYERS: 0
|
32 |
+
PRE_NORM: False
|
33 |
+
ENFORCE_INPUT_PROJ: False
|
34 |
+
SIZE_DIVISIBILITY: 32
|
35 |
+
DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
|
36 |
+
TRAIN_NUM_POINTS: 12544
|
37 |
+
OVERSAMPLE_RATIO: 3.0
|
38 |
+
IMPORTANCE_SAMPLE_RATIO: 0.75
|
39 |
+
TEST:
|
40 |
+
SEMANTIC_ON: True
|
41 |
+
INSTANCE_ON: True
|
42 |
+
PANOPTIC_ON: True
|
43 |
+
OVERLAP_THRESHOLD: 0.8
|
44 |
+
OBJECT_MASK_THRESHOLD: 0.8
|
configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: ../maskformer2_R50_bs16_90k.yaml
|
2 |
+
MODEL:
|
3 |
+
BACKBONE:
|
4 |
+
NAME: "D2SwinTransformer"
|
5 |
+
SWIN:
|
6 |
+
EMBED_DIM: 128
|
7 |
+
DEPTHS: [2, 2, 18, 2]
|
8 |
+
NUM_HEADS: [4, 8, 16, 32]
|
9 |
+
WINDOW_SIZE: 12
|
10 |
+
APE: False
|
11 |
+
DROP_PATH_RATE: 0.3
|
12 |
+
PATCH_NORM: True
|
13 |
+
PRETRAIN_IMG_SIZE: 384
|
14 |
+
WEIGHTS: "swin_base_patch4_window12_384_22k.pkl"
|
15 |
+
PIXEL_MEAN: [123.675, 116.280, 103.530]
|
16 |
+
PIXEL_STD: [58.395, 57.120, 57.375]
|
configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: ../maskformer2_R50_bs16_90k.yaml
|
2 |
+
MODEL:
|
3 |
+
BACKBONE:
|
4 |
+
NAME: "D2SwinTransformer"
|
5 |
+
SWIN:
|
6 |
+
EMBED_DIM: 192
|
7 |
+
DEPTHS: [2, 2, 18, 2]
|
8 |
+
NUM_HEADS: [6, 12, 24, 48]
|
9 |
+
WINDOW_SIZE: 12
|
10 |
+
APE: False
|
11 |
+
DROP_PATH_RATE: 0.3
|
12 |
+
PATCH_NORM: True
|
13 |
+
PRETRAIN_IMG_SIZE: 384
|
14 |
+
WEIGHTS: "swin_large_patch4_window12_384_22k.pkl"
|
15 |
+
PIXEL_MEAN: [123.675, 116.280, 103.530]
|
16 |
+
PIXEL_STD: [58.395, 57.120, 57.375]
|
17 |
+
MASK_FORMER:
|
18 |
+
NUM_OBJECT_QUERIES: 200
|
configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: ../maskformer2_R50_bs16_90k.yaml
|
2 |
+
MODEL:
|
3 |
+
BACKBONE:
|
4 |
+
NAME: "D2SwinTransformer"
|
5 |
+
SWIN:
|
6 |
+
EMBED_DIM: 96
|
7 |
+
DEPTHS: [2, 2, 18, 2]
|
8 |
+
NUM_HEADS: [3, 6, 12, 24]
|
9 |
+
WINDOW_SIZE: 7
|
10 |
+
APE: False
|
11 |
+
DROP_PATH_RATE: 0.3
|
12 |
+
PATCH_NORM: True
|
13 |
+
WEIGHTS: "swin_small_patch4_window7_224.pkl"
|
14 |
+
PIXEL_MEAN: [123.675, 116.280, 103.530]
|
15 |
+
PIXEL_STD: [58.395, 57.120, 57.375]
|
configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: ../maskformer2_R50_bs16_90k.yaml
|
2 |
+
MODEL:
|
3 |
+
BACKBONE:
|
4 |
+
NAME: "D2SwinTransformer"
|
5 |
+
SWIN:
|
6 |
+
EMBED_DIM: 96
|
7 |
+
DEPTHS: [2, 2, 6, 2]
|
8 |
+
NUM_HEADS: [3, 6, 12, 24]
|
9 |
+
WINDOW_SIZE: 7
|
10 |
+
APE: False
|
11 |
+
DROP_PATH_RATE: 0.3
|
12 |
+
PATCH_NORM: True
|
13 |
+
WEIGHTS: "swin_tiny_patch4_window7_224.pkl"
|
14 |
+
PIXEL_MEAN: [123.675, 116.280, 103.530]
|
15 |
+
PIXEL_STD: [58.395, 57.120, 57.375]
|
configs/cityscapes/semantic-segmentation/Base-Cityscapes-SemanticSegmentation.yaml
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MODEL:
|
2 |
+
BACKBONE:
|
3 |
+
FREEZE_AT: 0
|
4 |
+
NAME: "build_resnet_backbone"
|
5 |
+
WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
|
6 |
+
PIXEL_MEAN: [123.675, 116.280, 103.530]
|
7 |
+
PIXEL_STD: [58.395, 57.120, 57.375]
|
8 |
+
RESNETS:
|
9 |
+
DEPTH: 50
|
10 |
+
STEM_TYPE: "basic" # not used
|
11 |
+
STEM_OUT_CHANNELS: 64
|
12 |
+
STRIDE_IN_1X1: False
|
13 |
+
OUT_FEATURES: ["res2", "res3", "res4", "res5"]
|
14 |
+
NORM: "SyncBN" # use syncbn for cityscapes dataset
|
15 |
+
RES5_MULTI_GRID: [1, 1, 1] # not used
|
16 |
+
DATASETS:
|
17 |
+
TRAIN: ("cityscapes_fine_sem_seg_train",)
|
18 |
+
TEST: ("cityscapes_fine_sem_seg_val",)
|
19 |
+
SOLVER:
|
20 |
+
IMS_PER_BATCH: 16
|
21 |
+
BASE_LR: 0.0001
|
22 |
+
MAX_ITER: 90000
|
23 |
+
WARMUP_FACTOR: 1.0
|
24 |
+
WARMUP_ITERS: 0
|
25 |
+
WEIGHT_DECAY: 0.05
|
26 |
+
OPTIMIZER: "ADAMW"
|
27 |
+
LR_SCHEDULER_NAME: "WarmupPolyLR"
|
28 |
+
BACKBONE_MULTIPLIER: 0.1
|
29 |
+
CLIP_GRADIENTS:
|
30 |
+
ENABLED: True
|
31 |
+
CLIP_TYPE: "full_model"
|
32 |
+
CLIP_VALUE: 0.01
|
33 |
+
NORM_TYPE: 2.0
|
34 |
+
AMP:
|
35 |
+
ENABLED: True
|
36 |
+
INPUT:
|
37 |
+
MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"]
|
38 |
+
MIN_SIZE_TRAIN_SAMPLING: "choice"
|
39 |
+
MIN_SIZE_TEST: 1024
|
40 |
+
MAX_SIZE_TRAIN: 4096
|
41 |
+
MAX_SIZE_TEST: 2048
|
42 |
+
CROP:
|
43 |
+
ENABLED: True
|
44 |
+
TYPE: "absolute"
|
45 |
+
SIZE: (512, 1024)
|
46 |
+
SINGLE_CATEGORY_MAX_AREA: 1.0
|
47 |
+
COLOR_AUG_SSD: True
|
48 |
+
SIZE_DIVISIBILITY: -1
|
49 |
+
FORMAT: "RGB"
|
50 |
+
DATASET_MAPPER_NAME: "mask_former_semantic"
|
51 |
+
TEST:
|
52 |
+
EVAL_PERIOD: 5000
|
53 |
+
AUG:
|
54 |
+
ENABLED: False
|
55 |
+
MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792]
|
56 |
+
MAX_SIZE: 4096
|
57 |
+
FLIP: True
|
58 |
+
DATALOADER:
|
59 |
+
FILTER_EMPTY_ANNOTATIONS: True
|
60 |
+
NUM_WORKERS: 4
|
61 |
+
VERSION: 2
|
configs/cityscapes/semantic-segmentation/maskformer2_R101_bs16_90k.yaml
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: maskformer2_R50_bs16_90k.yaml
|
2 |
+
MODEL:
|
3 |
+
WEIGHTS: "R-101.pkl"
|
4 |
+
RESNETS:
|
5 |
+
DEPTH: 101
|
6 |
+
STEM_TYPE: "basic" # not used
|
7 |
+
STEM_OUT_CHANNELS: 64
|
8 |
+
STRIDE_IN_1X1: False
|
9 |
+
OUT_FEATURES: ["res2", "res3", "res4", "res5"]
|
10 |
+
NORM: "SyncBN"
|
11 |
+
RES5_MULTI_GRID: [1, 1, 1] # not used
|
configs/cityscapes/semantic-segmentation/maskformer2_R50_bs16_90k.yaml
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: Base-Cityscapes-SemanticSegmentation.yaml
|
2 |
+
MODEL:
|
3 |
+
META_ARCHITECTURE: "MaskFormer"
|
4 |
+
SEM_SEG_HEAD:
|
5 |
+
NAME: "MaskFormerHead"
|
6 |
+
IGNORE_VALUE: 255
|
7 |
+
NUM_CLASSES: 19
|
8 |
+
LOSS_WEIGHT: 1.0
|
9 |
+
CONVS_DIM: 256
|
10 |
+
MASK_DIM: 256
|
11 |
+
NORM: "GN"
|
12 |
+
# pixel decoder
|
13 |
+
PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
|
14 |
+
IN_FEATURES: ["res2", "res3", "res4", "res5"]
|
15 |
+
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
|
16 |
+
COMMON_STRIDE: 4
|
17 |
+
TRANSFORMER_ENC_LAYERS: 6
|
18 |
+
MASK_FORMER:
|
19 |
+
TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
|
20 |
+
TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
|
21 |
+
DEEP_SUPERVISION: True
|
22 |
+
NO_OBJECT_WEIGHT: 0.1
|
23 |
+
CLASS_WEIGHT: 2.0
|
24 |
+
MASK_WEIGHT: 5.0
|
25 |
+
DICE_WEIGHT: 5.0
|
26 |
+
HIDDEN_DIM: 256
|
27 |
+
NUM_OBJECT_QUERIES: 100
|
28 |
+
NHEADS: 8
|
29 |
+
DROPOUT: 0.0
|
30 |
+
DIM_FEEDFORWARD: 2048
|
31 |
+
ENC_LAYERS: 0
|
32 |
+
PRE_NORM: False
|
33 |
+
ENFORCE_INPUT_PROJ: False
|
34 |
+
SIZE_DIVISIBILITY: 32
|
35 |
+
DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
|
36 |
+
TRAIN_NUM_POINTS: 12544
|
37 |
+
OVERSAMPLE_RATIO: 3.0
|
38 |
+
IMPORTANCE_SAMPLE_RATIO: 0.75
|
39 |
+
TEST:
|
40 |
+
SEMANTIC_ON: True
|
41 |
+
INSTANCE_ON: False
|
42 |
+
PANOPTIC_ON: False
|
43 |
+
OVERLAP_THRESHOLD: 0.8
|
44 |
+
OBJECT_MASK_THRESHOLD: 0.8
|
configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: ../maskformer2_R50_bs16_90k.yaml
|
2 |
+
MODEL:
|
3 |
+
BACKBONE:
|
4 |
+
NAME: "D2SwinTransformer"
|
5 |
+
SWIN:
|
6 |
+
EMBED_DIM: 128
|
7 |
+
DEPTHS: [2, 2, 18, 2]
|
8 |
+
NUM_HEADS: [4, 8, 16, 32]
|
9 |
+
WINDOW_SIZE: 12
|
10 |
+
APE: False
|
11 |
+
DROP_PATH_RATE: 0.3
|
12 |
+
PATCH_NORM: True
|
13 |
+
PRETRAIN_IMG_SIZE: 384
|
14 |
+
WEIGHTS: "swin_base_patch4_window12_384_22k.pkl"
|
15 |
+
PIXEL_MEAN: [123.675, 116.280, 103.530]
|
16 |
+
PIXEL_STD: [58.395, 57.120, 57.375]
|
configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: ../maskformer2_R50_bs16_90k.yaml
|
2 |
+
MODEL:
|
3 |
+
BACKBONE:
|
4 |
+
NAME: "D2SwinTransformer"
|
5 |
+
SWIN:
|
6 |
+
EMBED_DIM: 192
|
7 |
+
DEPTHS: [2, 2, 18, 2]
|
8 |
+
NUM_HEADS: [6, 12, 24, 48]
|
9 |
+
WINDOW_SIZE: 12
|
10 |
+
APE: False
|
11 |
+
DROP_PATH_RATE: 0.3
|
12 |
+
PATCH_NORM: True
|
13 |
+
PRETRAIN_IMG_SIZE: 384
|
14 |
+
WEIGHTS: "swin_large_patch4_window12_384_22k.pkl"
|
15 |
+
PIXEL_MEAN: [123.675, 116.280, 103.530]
|
16 |
+
PIXEL_STD: [58.395, 57.120, 57.375]
|
17 |
+
MASK_FORMER:
|
18 |
+
NUM_OBJECT_QUERIES: 100
|
configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: ../maskformer2_R50_bs16_90k.yaml
|
2 |
+
MODEL:
|
3 |
+
BACKBONE:
|
4 |
+
NAME: "D2SwinTransformer"
|
5 |
+
SWIN:
|
6 |
+
EMBED_DIM: 96
|
7 |
+
DEPTHS: [2, 2, 18, 2]
|
8 |
+
NUM_HEADS: [3, 6, 12, 24]
|
9 |
+
WINDOW_SIZE: 7
|
10 |
+
APE: False
|
11 |
+
DROP_PATH_RATE: 0.3
|
12 |
+
PATCH_NORM: True
|
13 |
+
WEIGHTS: "swin_small_patch4_window7_224.pkl"
|
14 |
+
PIXEL_MEAN: [123.675, 116.280, 103.530]
|
15 |
+
PIXEL_STD: [58.395, 57.120, 57.375]
|
configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: ../maskformer2_R50_bs16_90k.yaml
|
2 |
+
MODEL:
|
3 |
+
BACKBONE:
|
4 |
+
NAME: "D2SwinTransformer"
|
5 |
+
SWIN:
|
6 |
+
EMBED_DIM: 96
|
7 |
+
DEPTHS: [2, 2, 6, 2]
|
8 |
+
NUM_HEADS: [3, 6, 12, 24]
|
9 |
+
WINDOW_SIZE: 7
|
10 |
+
APE: False
|
11 |
+
DROP_PATH_RATE: 0.3
|
12 |
+
PATCH_NORM: True
|
13 |
+
WEIGHTS: "swin_tiny_patch4_window7_224.pkl"
|
14 |
+
PIXEL_MEAN: [123.675, 116.280, 103.530]
|
15 |
+
PIXEL_STD: [58.395, 57.120, 57.375]
|
configs/coco/instance-segmentation/Base-COCO-InstanceSegmentation.yaml
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MODEL:
|
2 |
+
BACKBONE:
|
3 |
+
FREEZE_AT: 0
|
4 |
+
NAME: "build_resnet_backbone"
|
5 |
+
WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
|
6 |
+
PIXEL_MEAN: [123.675, 116.280, 103.530]
|
7 |
+
PIXEL_STD: [58.395, 57.120, 57.375]
|
8 |
+
RESNETS:
|
9 |
+
DEPTH: 50
|
10 |
+
STEM_TYPE: "basic" # not used
|
11 |
+
STEM_OUT_CHANNELS: 64
|
12 |
+
STRIDE_IN_1X1: False
|
13 |
+
OUT_FEATURES: ["res2", "res3", "res4", "res5"]
|
14 |
+
# NORM: "SyncBN"
|
15 |
+
RES5_MULTI_GRID: [1, 1, 1] # not used
|
16 |
+
DATASETS:
|
17 |
+
TRAIN: ("coco_2017_train",)
|
18 |
+
TEST: ("coco_2017_val",)
|
19 |
+
SOLVER:
|
20 |
+
IMS_PER_BATCH: 16
|
21 |
+
BASE_LR: 0.0001
|
22 |
+
STEPS: (327778, 355092)
|
23 |
+
MAX_ITER: 368750
|
24 |
+
WARMUP_FACTOR: 1.0
|
25 |
+
WARMUP_ITERS: 10
|
26 |
+
WEIGHT_DECAY: 0.05
|
27 |
+
OPTIMIZER: "ADAMW"
|
28 |
+
BACKBONE_MULTIPLIER: 0.1
|
29 |
+
CLIP_GRADIENTS:
|
30 |
+
ENABLED: True
|
31 |
+
CLIP_TYPE: "full_model"
|
32 |
+
CLIP_VALUE: 0.01
|
33 |
+
NORM_TYPE: 2.0
|
34 |
+
AMP:
|
35 |
+
ENABLED: True
|
36 |
+
INPUT:
|
37 |
+
IMAGE_SIZE: 1024
|
38 |
+
MIN_SCALE: 0.1
|
39 |
+
MAX_SCALE: 2.0
|
40 |
+
FORMAT: "RGB"
|
41 |
+
DATASET_MAPPER_NAME: "coco_instance_lsj"
|
42 |
+
TEST:
|
43 |
+
EVAL_PERIOD: 5000
|
44 |
+
DATALOADER:
|
45 |
+
FILTER_EMPTY_ANNOTATIONS: True
|
46 |
+
NUM_WORKERS: 4
|
47 |
+
VERSION: 2
|
configs/coco/instance-segmentation/maskformer2_R101_bs16_50ep.yaml
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: maskformer2_R50_bs16_50ep.yaml
|
2 |
+
MODEL:
|
3 |
+
WEIGHTS: "R-101.pkl"
|
4 |
+
RESNETS:
|
5 |
+
DEPTH: 101
|
6 |
+
STEM_TYPE: "basic" # not used
|
7 |
+
STEM_OUT_CHANNELS: 64
|
8 |
+
STRIDE_IN_1X1: False
|
9 |
+
OUT_FEATURES: ["res2", "res3", "res4", "res5"]
|
10 |
+
# NORM: "SyncBN"
|
11 |
+
RES5_MULTI_GRID: [1, 1, 1] # not used
|
configs/coco/instance-segmentation/maskformer2_R50_bs16_50ep.yaml
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: Base-COCO-InstanceSegmentation.yaml
|
2 |
+
MODEL:
|
3 |
+
META_ARCHITECTURE: "MaskFormer"
|
4 |
+
SEM_SEG_HEAD:
|
5 |
+
NAME: "MaskFormerHead"
|
6 |
+
IGNORE_VALUE: 255
|
7 |
+
NUM_CLASSES: 80
|
8 |
+
LOSS_WEIGHT: 1.0
|
9 |
+
CONVS_DIM: 256
|
10 |
+
MASK_DIM: 256
|
11 |
+
NORM: "GN"
|
12 |
+
# pixel decoder
|
13 |
+
PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
|
14 |
+
IN_FEATURES: ["res2", "res3", "res4", "res5"]
|
15 |
+
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
|
16 |
+
COMMON_STRIDE: 4
|
17 |
+
TRANSFORMER_ENC_LAYERS: 6
|
18 |
+
MASK_FORMER:
|
19 |
+
TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
|
20 |
+
TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
|
21 |
+
DEEP_SUPERVISION: True
|
22 |
+
NO_OBJECT_WEIGHT: 0.1
|
23 |
+
CLASS_WEIGHT: 2.0
|
24 |
+
MASK_WEIGHT: 5.0
|
25 |
+
DICE_WEIGHT: 5.0
|
26 |
+
HIDDEN_DIM: 256
|
27 |
+
NUM_OBJECT_QUERIES: 100
|
28 |
+
NHEADS: 8
|
29 |
+
DROPOUT: 0.0
|
30 |
+
DIM_FEEDFORWARD: 2048
|
31 |
+
ENC_LAYERS: 0
|
32 |
+
PRE_NORM: False
|
33 |
+
ENFORCE_INPUT_PROJ: False
|
34 |
+
SIZE_DIVISIBILITY: 32
|
35 |
+
DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
|
36 |
+
TRAIN_NUM_POINTS: 12544
|
37 |
+
OVERSAMPLE_RATIO: 3.0
|
38 |
+
IMPORTANCE_SAMPLE_RATIO: 0.75
|
39 |
+
TEST:
|
40 |
+
SEMANTIC_ON: False
|
41 |
+
INSTANCE_ON: True
|
42 |
+
PANOPTIC_ON: False
|
43 |
+
OVERLAP_THRESHOLD: 0.8
|
44 |
+
OBJECT_MASK_THRESHOLD: 0.8
|
configs/coco/instance-segmentation/swin/maskformer2_swin_base_384_bs16_50ep.yaml
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: ../maskformer2_R50_bs16_50ep.yaml
|
2 |
+
MODEL:
|
3 |
+
BACKBONE:
|
4 |
+
NAME: "D2SwinTransformer"
|
5 |
+
SWIN:
|
6 |
+
EMBED_DIM: 128
|
7 |
+
DEPTHS: [2, 2, 18, 2]
|
8 |
+
NUM_HEADS: [4, 8, 16, 32]
|
9 |
+
WINDOW_SIZE: 12
|
10 |
+
APE: False
|
11 |
+
DROP_PATH_RATE: 0.3
|
12 |
+
PATCH_NORM: True
|
13 |
+
PRETRAIN_IMG_SIZE: 384
|
14 |
+
WEIGHTS: "swin_base_patch4_window12_384.pkl"
|
15 |
+
PIXEL_MEAN: [123.675, 116.280, 103.530]
|
16 |
+
PIXEL_STD: [58.395, 57.120, 57.375]
|
configs/coco/instance-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_50ep.yaml
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: ../maskformer2_R50_bs16_50ep.yaml
|
2 |
+
MODEL:
|
3 |
+
BACKBONE:
|
4 |
+
NAME: "D2SwinTransformer"
|
5 |
+
SWIN:
|
6 |
+
EMBED_DIM: 128
|
7 |
+
DEPTHS: [2, 2, 18, 2]
|
8 |
+
NUM_HEADS: [4, 8, 16, 32]
|
9 |
+
WINDOW_SIZE: 12
|
10 |
+
APE: False
|
11 |
+
DROP_PATH_RATE: 0.3
|
12 |
+
PATCH_NORM: True
|
13 |
+
PRETRAIN_IMG_SIZE: 384
|
14 |
+
WEIGHTS: "swin_base_patch4_window12_384_22k.pkl"
|
15 |
+
PIXEL_MEAN: [123.675, 116.280, 103.530]
|
16 |
+
PIXEL_STD: [58.395, 57.120, 57.375]
|
configs/coco/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: ../maskformer2_R50_bs16_50ep.yaml
|
2 |
+
MODEL:
|
3 |
+
BACKBONE:
|
4 |
+
NAME: "D2SwinTransformer"
|
5 |
+
SWIN:
|
6 |
+
EMBED_DIM: 192
|
7 |
+
DEPTHS: [2, 2, 18, 2]
|
8 |
+
NUM_HEADS: [6, 12, 24, 48]
|
9 |
+
WINDOW_SIZE: 12
|
10 |
+
APE: False
|
11 |
+
DROP_PATH_RATE: 0.3
|
12 |
+
PATCH_NORM: True
|
13 |
+
PRETRAIN_IMG_SIZE: 384
|
14 |
+
WEIGHTS: "swin_large_patch4_window12_384_22k.pkl"
|
15 |
+
PIXEL_MEAN: [123.675, 116.280, 103.530]
|
16 |
+
PIXEL_STD: [58.395, 57.120, 57.375]
|
17 |
+
MASK_FORMER:
|
18 |
+
NUM_OBJECT_QUERIES: 200
|
19 |
+
SOLVER:
|
20 |
+
STEPS: (655556, 710184)
|
21 |
+
MAX_ITER: 737500
|
configs/coco/instance-segmentation/swin/maskformer2_swin_small_bs16_50ep.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: ../maskformer2_R50_bs16_50ep.yaml
|
2 |
+
MODEL:
|
3 |
+
BACKBONE:
|
4 |
+
NAME: "D2SwinTransformer"
|
5 |
+
SWIN:
|
6 |
+
EMBED_DIM: 96
|
7 |
+
DEPTHS: [2, 2, 18, 2]
|
8 |
+
NUM_HEADS: [3, 6, 12, 24]
|
9 |
+
WINDOW_SIZE: 7
|
10 |
+
APE: False
|
11 |
+
DROP_PATH_RATE: 0.3
|
12 |
+
PATCH_NORM: True
|
13 |
+
WEIGHTS: "swin_small_patch4_window7_224.pkl"
|
14 |
+
PIXEL_MEAN: [123.675, 116.280, 103.530]
|
15 |
+
PIXEL_STD: [58.395, 57.120, 57.375]
|
configs/coco/instance-segmentation/swin/maskformer2_swin_tiny_bs16_50ep.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: ../maskformer2_R50_bs16_50ep.yaml
|
2 |
+
MODEL:
|
3 |
+
BACKBONE:
|
4 |
+
NAME: "D2SwinTransformer"
|
5 |
+
SWIN:
|
6 |
+
EMBED_DIM: 96
|
7 |
+
DEPTHS: [2, 2, 6, 2]
|
8 |
+
NUM_HEADS: [3, 6, 12, 24]
|
9 |
+
WINDOW_SIZE: 7
|
10 |
+
APE: False
|
11 |
+
DROP_PATH_RATE: 0.3
|
12 |
+
PATCH_NORM: True
|
13 |
+
WEIGHTS: "swin_tiny_patch4_window7_224.pkl"
|
14 |
+
PIXEL_MEAN: [123.675, 116.280, 103.530]
|
15 |
+
PIXEL_STD: [58.395, 57.120, 57.375]
|
configs/coco/panoptic-segmentation/Base-COCO-PanopticSegmentation.yaml
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MODEL:
|
2 |
+
BACKBONE:
|
3 |
+
FREEZE_AT: 0
|
4 |
+
NAME: "build_resnet_backbone"
|
5 |
+
WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
|
6 |
+
PIXEL_MEAN: [123.675, 116.280, 103.530]
|
7 |
+
PIXEL_STD: [58.395, 57.120, 57.375]
|
8 |
+
RESNETS:
|
9 |
+
DEPTH: 50
|
10 |
+
STEM_TYPE: "basic" # not used
|
11 |
+
STEM_OUT_CHANNELS: 64
|
12 |
+
STRIDE_IN_1X1: False
|
13 |
+
OUT_FEATURES: ["res2", "res3", "res4", "res5"]
|
14 |
+
# NORM: "SyncBN"
|
15 |
+
RES5_MULTI_GRID: [1, 1, 1] # not used
|
16 |
+
DATASETS:
|
17 |
+
TRAIN: ("coco_2017_train_panoptic",)
|
18 |
+
TEST: ("coco_2017_val_panoptic_with_sem_seg",) # to evaluate instance and semantic performance as well
|
19 |
+
SOLVER:
|
20 |
+
IMS_PER_BATCH: 16
|
21 |
+
BASE_LR: 0.0001
|
22 |
+
STEPS: (327778, 355092)
|
23 |
+
MAX_ITER: 368750
|
24 |
+
WARMUP_FACTOR: 1.0
|
25 |
+
WARMUP_ITERS: 10
|
26 |
+
WEIGHT_DECAY: 0.05
|
27 |
+
OPTIMIZER: "ADAMW"
|
28 |
+
BACKBONE_MULTIPLIER: 0.1
|
29 |
+
CLIP_GRADIENTS:
|
30 |
+
ENABLED: True
|
31 |
+
CLIP_TYPE: "full_model"
|
32 |
+
CLIP_VALUE: 0.01
|
33 |
+
NORM_TYPE: 2.0
|
34 |
+
AMP:
|
35 |
+
ENABLED: True
|
36 |
+
INPUT:
|
37 |
+
IMAGE_SIZE: 1024
|
38 |
+
MIN_SCALE: 0.1
|
39 |
+
MAX_SCALE: 2.0
|
40 |
+
FORMAT: "RGB"
|
41 |
+
DATASET_MAPPER_NAME: "coco_panoptic_lsj"
|
42 |
+
TEST:
|
43 |
+
EVAL_PERIOD: 5000
|
44 |
+
DATALOADER:
|
45 |
+
FILTER_EMPTY_ANNOTATIONS: True
|
46 |
+
NUM_WORKERS: 4
|
47 |
+
VERSION: 2
|
configs/coco/panoptic-segmentation/maskformer2_R101_bs16_50ep.yaml
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: maskformer2_R50_bs16_50ep.yaml
|
2 |
+
MODEL:
|
3 |
+
WEIGHTS: "R-101.pkl"
|
4 |
+
RESNETS:
|
5 |
+
DEPTH: 101
|
6 |
+
STEM_TYPE: "basic" # not used
|
7 |
+
STEM_OUT_CHANNELS: 64
|
8 |
+
STRIDE_IN_1X1: False
|
9 |
+
OUT_FEATURES: ["res2", "res3", "res4", "res5"]
|
10 |
+
# NORM: "SyncBN"
|
11 |
+
RES5_MULTI_GRID: [1, 1, 1] # not used
|
configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: Base-COCO-PanopticSegmentation.yaml
|
2 |
+
MODEL:
|
3 |
+
META_ARCHITECTURE: "MaskFormer"
|
4 |
+
SEM_SEG_HEAD:
|
5 |
+
NAME: "MaskFormerHead"
|
6 |
+
IN_FEATURES: ["res2", "res3", "res4", "res5"]
|
7 |
+
IGNORE_VALUE: 255
|
8 |
+
NUM_CLASSES: 133
|
9 |
+
LOSS_WEIGHT: 1.0
|
10 |
+
CONVS_DIM: 256
|
11 |
+
MASK_DIM: 256
|
12 |
+
NORM: "GN"
|
13 |
+
# pixel decoder
|
14 |
+
PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
|
15 |
+
IN_FEATURES: ["res2", "res3", "res4", "res5"]
|
16 |
+
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
|
17 |
+
COMMON_STRIDE: 4
|
18 |
+
TRANSFORMER_ENC_LAYERS: 6
|
19 |
+
MASK_FORMER:
|
20 |
+
TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
|
21 |
+
TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
|
22 |
+
DEEP_SUPERVISION: True
|
23 |
+
NO_OBJECT_WEIGHT: 0.1
|
24 |
+
CLASS_WEIGHT: 2.0
|
25 |
+
MASK_WEIGHT: 5.0
|
26 |
+
DICE_WEIGHT: 5.0
|
27 |
+
HIDDEN_DIM: 256
|
28 |
+
NUM_OBJECT_QUERIES: 100
|
29 |
+
NHEADS: 8
|
30 |
+
DROPOUT: 0.0
|
31 |
+
DIM_FEEDFORWARD: 2048
|
32 |
+
ENC_LAYERS: 0
|
33 |
+
PRE_NORM: False
|
34 |
+
ENFORCE_INPUT_PROJ: False
|
35 |
+
SIZE_DIVISIBILITY: 32
|
36 |
+
DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
|
37 |
+
TRAIN_NUM_POINTS: 12544
|
38 |
+
OVERSAMPLE_RATIO: 3.0
|
39 |
+
IMPORTANCE_SAMPLE_RATIO: 0.75
|
40 |
+
TEST:
|
41 |
+
SEMANTIC_ON: True
|
42 |
+
INSTANCE_ON: True
|
43 |
+
PANOPTIC_ON: True
|
44 |
+
OVERLAP_THRESHOLD: 0.8
|
45 |
+
OBJECT_MASK_THRESHOLD: 0.8
|
configs/coco/panoptic-segmentation/swin/maskformer2_swin_base_384_bs16_50ep.yaml
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: ../maskformer2_R50_bs16_50ep.yaml
|
2 |
+
MODEL:
|
3 |
+
BACKBONE:
|
4 |
+
NAME: "D2SwinTransformer"
|
5 |
+
SWIN:
|
6 |
+
EMBED_DIM: 128
|
7 |
+
DEPTHS: [2, 2, 18, 2]
|
8 |
+
NUM_HEADS: [4, 8, 16, 32]
|
9 |
+
WINDOW_SIZE: 12
|
10 |
+
APE: False
|
11 |
+
DROP_PATH_RATE: 0.3
|
12 |
+
PATCH_NORM: True
|
13 |
+
PRETRAIN_IMG_SIZE: 384
|
14 |
+
WEIGHTS: "swin_base_patch4_window12_384.pkl"
|
15 |
+
PIXEL_MEAN: [123.675, 116.280, 103.530]
|
16 |
+
PIXEL_STD: [58.395, 57.120, 57.375]
|
configs/coco/panoptic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_50ep.yaml
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: ../maskformer2_R50_bs16_50ep.yaml
|
2 |
+
MODEL:
|
3 |
+
BACKBONE:
|
4 |
+
NAME: "D2SwinTransformer"
|
5 |
+
SWIN:
|
6 |
+
EMBED_DIM: 128
|
7 |
+
DEPTHS: [2, 2, 18, 2]
|
8 |
+
NUM_HEADS: [4, 8, 16, 32]
|
9 |
+
WINDOW_SIZE: 12
|
10 |
+
APE: False
|
11 |
+
DROP_PATH_RATE: 0.3
|
12 |
+
PATCH_NORM: True
|
13 |
+
PRETRAIN_IMG_SIZE: 384
|
14 |
+
WEIGHTS: "swin_base_patch4_window12_384_22k.pkl"
|
15 |
+
PIXEL_MEAN: [123.675, 116.280, 103.530]
|
16 |
+
PIXEL_STD: [58.395, 57.120, 57.375]
|
configs/coco/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: ../maskformer2_R50_bs16_50ep.yaml
|
2 |
+
MODEL:
|
3 |
+
BACKBONE:
|
4 |
+
NAME: "D2SwinTransformer"
|
5 |
+
SWIN:
|
6 |
+
EMBED_DIM: 192
|
7 |
+
DEPTHS: [2, 2, 18, 2]
|
8 |
+
NUM_HEADS: [6, 12, 24, 48]
|
9 |
+
WINDOW_SIZE: 12
|
10 |
+
APE: False
|
11 |
+
DROP_PATH_RATE: 0.3
|
12 |
+
PATCH_NORM: True
|
13 |
+
PRETRAIN_IMG_SIZE: 384
|
14 |
+
WEIGHTS: "swin_large_patch4_window12_384_22k.pkl"
|
15 |
+
PIXEL_MEAN: [123.675, 116.280, 103.530]
|
16 |
+
PIXEL_STD: [58.395, 57.120, 57.375]
|
17 |
+
MASK_FORMER:
|
18 |
+
NUM_OBJECT_QUERIES: 200
|
19 |
+
SOLVER:
|
20 |
+
STEPS: (655556, 710184)
|
21 |
+
MAX_ITER: 737500
|