Spaces:
Runtime error
Runtime error
File size: 14,349 Bytes
51f6859 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 |
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn.functional as F
from mmcv.runner import force_fp32
from ..builder import HEADS
from ..losses import smooth_l1_loss
from .ascend_anchor_head import AscendAnchorHead
from .ssd_head import SSDHead
@HEADS.register_module()
class AscendSSDHead(SSDHead, AscendAnchorHead):
    """Ascend SSD head used in https://arxiv.org/abs/1512.02325.

    Args:
        num_classes (int): Number of categories excluding the background
            category.
        in_channels (int): Number of channels in the input feature map.
        stacked_convs (int): Number of conv layers in cls and reg tower.
            Default: 0.
        feat_channels (int): Number of hidden channels when stacked_convs
            > 0. Default: 256.
        use_depthwise (bool): Whether to use DepthwiseSeparableConv.
            Default: False.
        conv_cfg (dict): Dictionary to construct and config conv layer.
            Default: None.
        norm_cfg (dict): Dictionary to construct and config norm layer.
            Default: None.
        act_cfg (dict): Dictionary to construct and config activation layer.
            Default: None.
        anchor_generator (dict): Config dict for anchor generator
        bbox_coder (dict): Config of bounding box coder.
        reg_decoded_bbox (bool): If true, the regression loss would be
            applied directly on decoded bounding boxes, converting both
            the predicted boxes and regression targets to absolute
            coordinates format. Default False. It should be `True` when
            using `IoULoss`, `GIoULoss`, or `DIoULoss` in the bbox head.
        train_cfg (dict): Training config of anchor head.
        test_cfg (dict): Testing config of anchor head.
        init_cfg (dict or list[dict], optional): Initialization config dict.
    """  # noqa: W605

    def __init__(self,
                 num_classes=80,
                 in_channels=(512, 1024, 512, 256, 256, 256),
                 stacked_convs=0,
                 feat_channels=256,
                 use_depthwise=False,
                 conv_cfg=None,
                 norm_cfg=None,
                 act_cfg=None,
                 anchor_generator=dict(
                     type='SSDAnchorGenerator',
                     scale_major=False,
                     input_size=300,
                     strides=[8, 16, 32, 64, 100, 300],
                     ratios=([2], [2, 3], [2, 3], [2, 3], [2], [2]),
                     basesize_ratio_range=(0.1, 0.9)),
                 bbox_coder=dict(
                     type='DeltaXYWHBBoxCoder',
                     clip_border=True,
                     target_means=[.0, .0, .0, .0],
                     target_stds=[1.0, 1.0, 1.0, 1.0],
                 ),
                 reg_decoded_bbox=False,
                 train_cfg=None,
                 test_cfg=None,
                 init_cfg=dict(
                     type='Xavier',
                     layer='Conv2d',
                     distribution='uniform',
                     bias=0)):
        super(AscendSSDHead, self).__init__(
            num_classes=num_classes,
            in_channels=in_channels,
            stacked_convs=stacked_convs,
            feat_channels=feat_channels,
            use_depthwise=use_depthwise,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg,
            anchor_generator=anchor_generator,
            bbox_coder=bbox_coder,
            reg_decoded_bbox=reg_decoded_bbox,
            train_cfg=train_cfg,
            test_cfg=test_cfg,
            init_cfg=init_cfg)
        # batch_loss() below only implements the encoded-delta regression
        # path, so decoded-box regression is explicitly rejected here.
        assert self.reg_decoded_bbox is False, \
            'reg_decoded_bbox only support False now.'

    def get_static_anchors(self, featmap_sizes, img_metas, device='cuda'):
        """Get static anchors according to feature map sizes.

        The anchors are computed once and cached on the instance; later
        calls return the cached tensors regardless of the arguments
        (presumably valid because input shapes are fixed on Ascend —
        TODO confirm against callers).

        Args:
            featmap_sizes (list[tuple]): Multi-level feature map sizes.
            img_metas (list[dict]): Image meta info.
            device (torch.device | str): Device for returned tensors

        Returns:
            tuple:
                anchor_list (list[Tensor]): Anchors of each image.
                valid_flag_list (list[Tensor]): Valid flags of each image.
        """
        if not hasattr(self, 'static_anchors') or \
                not hasattr(self, 'static_valid_flags'):
            static_anchors, static_valid_flags = self.get_anchors(
                featmap_sizes, img_metas, device)
            self.static_anchors = static_anchors
            self.static_valid_flags = static_valid_flags
        return self.static_anchors, self.static_valid_flags

    def get_targets(self,
                    anchor_list,
                    valid_flag_list,
                    gt_bboxes_list,
                    img_metas,
                    gt_bboxes_ignore_list=None,
                    gt_labels_list=None,
                    label_channels=1,
                    unmap_outputs=True,
                    return_sampling_results=False,
                    return_level=True):
        """Compute regression and classification targets for anchors in
        multiple images.

        Delegates to ``AscendAnchorHead.get_targets`` (explicitly, to
        bypass the ``SSDHead`` implementation in the MRO).

        Args:
            anchor_list (list[list[Tensor]]): Multi level anchors of each
                image. The outer list indicates images, and the inner list
                corresponds to feature levels of the image. Each element of
                the inner list is a tensor of shape (num_anchors, 4).
            valid_flag_list (list[list[Tensor]]): Multi level valid flags of
                each image. The outer list indicates images, and the inner list
                corresponds to feature levels of the image. Each element of
                the inner list is a tensor of shape (num_anchors, )
            gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image.
            img_metas (list[dict]): Meta info of each image.
            gt_bboxes_ignore_list (list[Tensor]): Ground truth bboxes to be
                ignored.
            gt_labels_list (list[Tensor]): Ground truth labels of each box.
            label_channels (int): Channel of label.
            unmap_outputs (bool): Whether to map outputs back to the original
                set of anchors.
            return_sampling_results (bool): Whether to return the result of
                sample.
            return_level (bool): Whether to map outputs back to the levels
                of feature map sizes.

        Returns:
            tuple: Usually returns a tuple containing learning targets.

                - labels_list (list[Tensor]): Labels of each level.
                - label_weights_list (list[Tensor]): Label weights of each
                  level.
                - bbox_targets_list (list[Tensor]): BBox targets of each level.
                - bbox_weights_list (list[Tensor]): BBox weights of each level.
                - num_total_pos (int): Number of positive samples in all
                  images.
                - num_total_neg (int): Number of negative samples in all
                  images.

            additional_returns: This function enables user-defined returns from
                `self._get_targets_single`. These returns are currently refined
                to properties at each feature map (i.e. having HxW dimension).
                The results will be concatenated after the end
        """
        return AscendAnchorHead.get_targets(
            self,
            anchor_list,
            valid_flag_list,
            gt_bboxes_list,
            img_metas,
            gt_bboxes_ignore_list,
            gt_labels_list,
            label_channels,
            unmap_outputs,
            return_sampling_results,
            return_level,
        )

    def batch_loss(self, batch_cls_score, batch_bbox_pred, batch_anchor,
                   batch_labels, batch_label_weights, batch_bbox_targets,
                   batch_bbox_weights, batch_pos_mask, batch_neg_mask,
                   num_total_samples):
        """Compute loss of all images.

        Args:
            batch_cls_score (Tensor): Box scores for all image
                Has shape (num_imgs, num_total_anchors, num_classes).
            batch_bbox_pred (Tensor): Box energies / deltas for all image
                level with shape (num_imgs, num_total_anchors, 4).
            batch_anchor (Tensor): Box reference for all image with shape
                (num_imgs, num_total_anchors, 4).
            batch_labels (Tensor): Labels of all anchors with shape
                (num_imgs, num_total_anchors,).
            batch_label_weights (Tensor): Label weights of all anchor with
                shape (num_imgs, num_total_anchors,)
            batch_bbox_targets (Tensor): BBox regression targets of all anchor
                weight shape (num_imgs, num_total_anchors, 4).
            batch_bbox_weights (Tensor): BBox regression loss weights of
                all anchor with shape (num_imgs, num_total_anchors, 4).
            batch_pos_mask (Tensor): Positive samples mask in all images.
            batch_neg_mask (Tensor): negative samples mask in all images.
            num_total_samples (int): If sampling, num total samples equal to
                the number of total anchors; Otherwise, it is the number of
                positive anchors.

        Returns:
            dict[str, Tensor]: A dictionary of loss components.
        """
        num_images, num_anchors, _ = batch_anchor.size()
        # Per-anchor classification loss, weighted per anchor.
        # FG cat_id: [0, num_classes - 1], BG cat_id: num_classes.
        batch_loss_cls_all = F.cross_entropy(
            batch_cls_score.view((-1, self.cls_out_channels)),
            batch_labels.view(-1),
            reduction='none').view(
                batch_label_weights.size()) * batch_label_weights
        # Hard negative mining: keep at most neg_pos_ratio negatives per
        # positive, capped by the number of available negatives.
        batch_num_pos_samples = torch.sum(batch_pos_mask, dim=1)
        batch_num_neg_samples = \
            self.train_cfg.neg_pos_ratio * batch_num_pos_samples
        batch_num_neg_samples_max = torch.sum(batch_neg_mask, dim=1)
        batch_num_neg_samples = torch.min(batch_num_neg_samples,
                                          batch_num_neg_samples_max)
        # topk with k=num_anchors sorts ALL (masked) negative losses, so
        # the output shape is data-independent; the per-image dynamic
        # top-k is then selected with a mask below. NOTE(review): this
        # static-shape formulation looks Ascend-specific — confirm.
        batch_topk_loss_cls_neg, _ = torch.topk(
            batch_loss_cls_all * batch_neg_mask, k=num_anchors, dim=1)
        batch_loss_cls_pos = torch.sum(
            batch_loss_cls_all * batch_pos_mask, dim=1)
        # Mask keeping, per image, the first batch_num_neg_samples entries
        # of the descending-sorted negative losses.
        anchor_index = torch.arange(
            end=num_anchors, dtype=torch.float,
            device=batch_anchor.device).view((1, -1))
        topk_loss_neg_mask = (anchor_index < batch_num_neg_samples.view(
            -1, 1)).float()
        batch_loss_cls_neg = torch.sum(
            batch_topk_loss_cls_neg * topk_loss_neg_mask, dim=1)
        loss_cls = \
            (batch_loss_cls_pos + batch_loss_cls_neg) / num_total_samples
        if self.reg_decoded_bbox:
            # TODO: support self.reg_decoded_bbox is True
            raise RuntimeError(
                'reg_decoded_bbox=True is not supported by AscendSSDHead.')
        loss_bbox_all = smooth_l1_loss(
            batch_bbox_pred,
            batch_bbox_targets,
            batch_bbox_weights,
            reduction='none',
            avg_factor=num_total_samples,
            beta=self.train_cfg.smoothl1_beta)
        eps = torch.finfo(torch.float32).eps
        # Reduce over every non-batch dimension to get one value per image.
        sum_dim = tuple(range(1, loss_bbox_all.dim()))
        loss_bbox = loss_bbox_all.sum(sum_dim) / (
            num_total_samples + eps)
        return loss_cls[None], loss_bbox

    @force_fp32(apply_to=('cls_scores', 'bbox_preds'))
    def loss(self,
             cls_scores,
             bbox_preds,
             gt_bboxes,
             gt_labels,
             img_metas,
             gt_bboxes_ignore=None):
        """Compute losses of the head.

        Args:
            cls_scores (list[Tensor]): Box scores for each scale level
                Has shape (N, num_anchors * num_classes, H, W)
            bbox_preds (list[Tensor]): Box energies / deltas for each scale
                level with shape (N, num_anchors * 4, H, W)
            gt_bboxes (list[Tensor]): each item are the truth boxes for each
                image in [tl_x, tl_y, br_x, br_y] format.
            gt_labels (list[Tensor]): class indices corresponding to each box
            img_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            gt_bboxes_ignore (None | list[Tensor]): specify which bounding
                boxes can be ignored when computing the loss.

        Returns:
            dict[str, Tensor]: A dictionary of loss components.
        """
        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
        assert len(featmap_sizes) == self.prior_generator.num_levels

        device = cls_scores[0].device

        anchor_list, valid_flag_list = self.get_anchors(
            featmap_sizes, img_metas, device=device)
        # return_level=False yields batched (whole-image) targets instead
        # of per-level lists, matching batch_loss()'s expected inputs.
        cls_reg_targets = self.get_targets(
            anchor_list,
            valid_flag_list,
            gt_bboxes,
            img_metas,
            gt_bboxes_ignore_list=gt_bboxes_ignore,
            gt_labels_list=gt_labels,
            label_channels=1,
            unmap_outputs=True,
            return_level=False)
        if cls_reg_targets is None:
            return None

        (batch_labels, batch_label_weights, batch_bbox_targets,
         batch_bbox_weights, batch_pos_mask, batch_neg_mask, sampling_result,
         num_total_pos, num_total_neg, batch_anchors) = cls_reg_targets

        num_imgs = len(img_metas)
        # Flatten multi-level predictions to (num_imgs, num_total_anchors, C).
        batch_cls_score = torch.cat([
            s.permute(0, 2, 3, 1).reshape(num_imgs, -1, self.cls_out_channels)
            for s in cls_scores
        ], 1)

        batch_bbox_pred = torch.cat([
            b.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) for b in bbox_preds
        ], -2)

        batch_losses_cls, batch_losses_bbox = self.batch_loss(
            batch_cls_score, batch_bbox_pred, batch_anchors, batch_labels,
            batch_label_weights, batch_bbox_targets, batch_bbox_weights,
            batch_pos_mask, batch_neg_mask, num_total_pos)
        # Unbatch to per-image losses, as expected by the loss dict format.
        losses_cls = [
            batch_losses_cls[:, index_imgs] for index_imgs in range(num_imgs)
        ]
        losses_bbox = list(batch_losses_bbox)
        return dict(loss_cls=losses_cls, loss_bbox=losses_bbox)
|