|
@@ -27,6 +27,8 @@ from .roi_extractor import (RoIAlign, FPNRoIAlign)
|
|
|
from .bbox_head import (BBoxHead, TwoFCHead)
|
|
from .bbox_head import (BBoxHead, TwoFCHead)
|
|
|
from .mask_head import MaskHead
|
|
from .mask_head import MaskHead
|
|
|
from ..resnet import ResNetC5
|
|
from ..resnet import ResNetC5
|
|
|
|
|
+from .loss.diou_loss import DiouLoss
|
|
|
|
|
+from .ops import BBoxAssigner, LibraBBoxAssigner
|
|
|
|
|
|
|
|
__all__ = ['MaskRCNN']
|
|
__all__ = ['MaskRCNN']
|
|
|
|
|
|
|
@@ -46,11 +48,11 @@ class MaskRCNN(object):
|
|
|
def __init__(
|
|
def __init__(
|
|
|
self,
|
|
self,
|
|
|
backbone,
|
|
backbone,
|
|
|
- num_classes=81,
|
|
|
|
|
|
|
+ input_channel=3,
|
|
|
mode='train',
|
|
mode='train',
|
|
|
|
|
+ num_classes=81,
|
|
|
with_fpn=False,
|
|
with_fpn=False,
|
|
|
fpn=None,
|
|
fpn=None,
|
|
|
- num_chan=256,
|
|
|
|
|
min_level=2,
|
|
min_level=2,
|
|
|
max_level=6,
|
|
max_level=6,
|
|
|
spatial_scale=[1. / 32., 1. / 16., 1. / 8., 1. / 4.],
|
|
spatial_scale=[1. / 32., 1. / 16., 1. / 8., 1. / 4.],
|
|
@@ -69,6 +71,9 @@ class MaskRCNN(object):
|
|
|
test_pre_nms_top_n=6000,
|
|
test_pre_nms_top_n=6000,
|
|
|
test_post_nms_top_n=1000,
|
|
test_post_nms_top_n=1000,
|
|
|
test_nms_thresh=0.7,
|
|
test_nms_thresh=0.7,
|
|
|
|
|
+ rpn_cls_loss='SigmoidCrossEntropy',
|
|
|
|
|
+ rpn_focal_loss_alpha=0.25,
|
|
|
|
|
+ rpn_focal_loss_gamma=2,
|
|
|
#roi_extractor
|
|
#roi_extractor
|
|
|
roi_extractor=None,
|
|
roi_extractor=None,
|
|
|
#bbox_head
|
|
#bbox_head
|
|
@@ -76,6 +81,10 @@ class MaskRCNN(object):
|
|
|
keep_top_k=100,
|
|
keep_top_k=100,
|
|
|
nms_threshold=0.5,
|
|
nms_threshold=0.5,
|
|
|
score_threshold=0.05,
|
|
score_threshold=0.05,
|
|
|
|
|
+ rcnn_nms='MultiClassNMS',
|
|
|
|
|
+ softnms_sigma=0.5,
|
|
|
|
|
+ post_threshold=.05,
|
|
|
|
|
+
|
|
|
#MaskHead
|
|
#MaskHead
|
|
|
mask_head=None,
|
|
mask_head=None,
|
|
|
num_convs=0,
|
|
num_convs=0,
|
|
@@ -88,7 +97,12 @@ class MaskRCNN(object):
|
|
|
bg_thresh_lo=0.,
|
|
bg_thresh_lo=0.,
|
|
|
bbox_reg_weights=[0.1, 0.1, 0.2, 0.2],
|
|
bbox_reg_weights=[0.1, 0.1, 0.2, 0.2],
|
|
|
fixed_input_shape=None,
|
|
fixed_input_shape=None,
|
|
|
- input_channel=3):
|
|
|
|
|
|
|
+ rcnn_bbox_loss='SmoothL1Loss',
|
|
|
|
|
+ diouloss_weight=10.0,
|
|
|
|
|
+ diouloss_is_cls_agnostic=False,
|
|
|
|
|
+ diouloss_use_complete_iou_loss=True,
|
|
|
|
|
+ bbox_assigner='BBoxAssigner',
|
|
|
|
|
+ fpn_num_channels=256):
|
|
|
super(MaskRCNN, self).__init__()
|
|
super(MaskRCNN, self).__init__()
|
|
|
self.backbone = backbone
|
|
self.backbone = backbone
|
|
|
self.mode = mode
|
|
self.mode = mode
|
|
@@ -98,11 +112,13 @@ class MaskRCNN(object):
|
|
|
fpn.min_level = 2
|
|
fpn.min_level = 2
|
|
|
fpn.max_level = 6
|
|
fpn.max_level = 6
|
|
|
else:
|
|
else:
|
|
|
- fpn = FPN(num_chan=num_chan,
|
|
|
|
|
|
|
+ fpn = FPN(num_chan=fpn_num_channels,
|
|
|
min_level=min_level,
|
|
min_level=min_level,
|
|
|
max_level=max_level,
|
|
max_level=max_level,
|
|
|
spatial_scale=spatial_scale)
|
|
spatial_scale=spatial_scale)
|
|
|
self.fpn = fpn
|
|
self.fpn = fpn
|
|
|
|
|
+ if self.fpn is not None:
|
|
|
|
|
+ self.fpn.num_chan = fpn_num_channels
|
|
|
self.num_classes = num_classes
|
|
self.num_classes = num_classes
|
|
|
if rpn_head is None:
|
|
if rpn_head is None:
|
|
|
if self.fpn is None:
|
|
if self.fpn is None:
|
|
@@ -118,7 +134,10 @@ class MaskRCNN(object):
|
|
|
train_nms_thresh=train_nms_thresh,
|
|
train_nms_thresh=train_nms_thresh,
|
|
|
test_pre_nms_top_n=test_pre_nms_top_n,
|
|
test_pre_nms_top_n=test_pre_nms_top_n,
|
|
|
test_post_nms_top_n=test_post_nms_top_n,
|
|
test_post_nms_top_n=test_post_nms_top_n,
|
|
|
- test_nms_thresh=test_nms_thresh)
|
|
|
|
|
|
|
+ test_nms_thresh=test_nms_thresh,
|
|
|
|
|
+ rpn_cls_loss=rpn_cls_loss,
|
|
|
|
|
+ rpn_focal_loss_alpha=rpn_focal_loss_alpha,
|
|
|
|
|
+ rpn_focal_loss_gamma=rpn_focal_loss_gamma)
|
|
|
else:
|
|
else:
|
|
|
rpn_head = FPNRPNHead(
|
|
rpn_head = FPNRPNHead(
|
|
|
anchor_start_size=anchor_sizes[0],
|
|
anchor_start_size=anchor_sizes[0],
|
|
@@ -135,7 +154,10 @@ class MaskRCNN(object):
|
|
|
train_nms_thresh=train_nms_thresh,
|
|
train_nms_thresh=train_nms_thresh,
|
|
|
test_pre_nms_top_n=test_pre_nms_top_n,
|
|
test_pre_nms_top_n=test_pre_nms_top_n,
|
|
|
test_post_nms_top_n=test_post_nms_top_n,
|
|
test_post_nms_top_n=test_post_nms_top_n,
|
|
|
- test_nms_thresh=test_nms_thresh)
|
|
|
|
|
|
|
+ test_nms_thresh=test_nms_thresh,
|
|
|
|
|
+ rpn_cls_loss=rpn_cls_loss,
|
|
|
|
|
+ rpn_focal_loss_alpha=rpn_focal_loss_alpha,
|
|
|
|
|
+ rpn_focal_loss_gamma=rpn_focal_loss_gamma)
|
|
|
self.rpn_head = rpn_head
|
|
self.rpn_head = rpn_head
|
|
|
if roi_extractor is None:
|
|
if roi_extractor is None:
|
|
|
if self.fpn is None:
|
|
if self.fpn is None:
|
|
@@ -150,7 +172,8 @@ class MaskRCNN(object):
|
|
|
head = ResNetC5(
|
|
head = ResNetC5(
|
|
|
layers=self.backbone.layers,
|
|
layers=self.backbone.layers,
|
|
|
norm_type=self.backbone.norm_type,
|
|
norm_type=self.backbone.norm_type,
|
|
|
- freeze_norm=self.backbone.freeze_norm)
|
|
|
|
|
|
|
+ freeze_norm=self.backbone.freeze_norm,
|
|
|
|
|
+ variant=self.backbone.variant)
|
|
|
else:
|
|
else:
|
|
|
head = TwoFCHead()
|
|
head = TwoFCHead()
|
|
|
bbox_head = BBoxHead(
|
|
bbox_head = BBoxHead(
|
|
@@ -158,7 +181,15 @@ class MaskRCNN(object):
|
|
|
keep_top_k=keep_top_k,
|
|
keep_top_k=keep_top_k,
|
|
|
nms_threshold=nms_threshold,
|
|
nms_threshold=nms_threshold,
|
|
|
score_threshold=score_threshold,
|
|
score_threshold=score_threshold,
|
|
|
- num_classes=num_classes)
|
|
|
|
|
|
|
+ rcnn_nms=rcnn_nms,
|
|
|
|
|
+ softnms_sigma=softnms_sigma,
|
|
|
|
|
+ post_threshold=post_threshold,
|
|
|
|
|
+ num_classes=num_classes,
|
|
|
|
|
+ rcnn_bbox_loss=rcnn_bbox_loss,
|
|
|
|
|
+ diouloss_weight=diouloss_weight,
|
|
|
|
|
+ diouloss_is_cls_agnostic=diouloss_is_cls_agnostic,
|
|
|
|
|
+ diouloss_use_complete_iou_loss=diouloss_use_complete_iou_loss)
|
|
|
|
|
+
|
|
|
self.bbox_head = bbox_head
|
|
self.bbox_head = bbox_head
|
|
|
if mask_head is None:
|
|
if mask_head is None:
|
|
|
mask_head = MaskHead(
|
|
mask_head = MaskHead(
|
|
@@ -174,14 +205,40 @@ class MaskRCNN(object):
|
|
|
self.bbox_reg_weights = bbox_reg_weights
|
|
self.bbox_reg_weights = bbox_reg_weights
|
|
|
self.rpn_only = rpn_only
|
|
self.rpn_only = rpn_only
|
|
|
self.fixed_input_shape = fixed_input_shape
|
|
self.fixed_input_shape = fixed_input_shape
|
|
|
|
|
+ if bbox_assigner == 'BBoxAssigner':
|
|
|
|
|
+ self.bbox_assigner = BBoxAssigner(
|
|
|
|
|
+ batch_size_per_im=batch_size_per_im,
|
|
|
|
|
+ fg_fraction=fg_fraction,
|
|
|
|
|
+ fg_thresh=fg_thresh,
|
|
|
|
|
+ bg_thresh_hi=bg_thresh_hi,
|
|
|
|
|
+ bg_thresh_lo=bg_thresh_lo,
|
|
|
|
|
+ bbox_reg_weights=bbox_reg_weights,
|
|
|
|
|
+ num_classes=num_classes,
|
|
|
|
|
+ shuffle_before_sample=self.rpn_head.use_random)
|
|
|
|
|
+ elif bbox_assigner == 'LibraBBoxAssigner':
|
|
|
|
|
+ self.bbox_assigner = LibraBBoxAssigner(
|
|
|
|
|
+ batch_size_per_im=batch_size_per_im,
|
|
|
|
|
+ fg_fraction=fg_fraction,
|
|
|
|
|
+ fg_thresh=fg_thresh,
|
|
|
|
|
+ bg_thresh_hi=bg_thresh_hi,
|
|
|
|
|
+ bg_thresh_lo=bg_thresh_lo,
|
|
|
|
|
+ bbox_reg_weights=bbox_reg_weights,
|
|
|
|
|
+ num_classes=num_classes,
|
|
|
|
|
+ shuffle_before_sample=self.rpn_head.use_random)
|
|
|
self.input_channel = input_channel
|
|
self.input_channel = input_channel
|
|
|
|
|
|
|
|
def build_net(self, inputs):
|
|
def build_net(self, inputs):
|
|
|
im = inputs['image']
|
|
im = inputs['image']
|
|
|
im_info = inputs['im_info']
|
|
im_info = inputs['im_info']
|
|
|
|
|
+ if self.mode == 'train':
|
|
|
|
|
+ gt_bbox = inputs['gt_box']
|
|
|
|
|
+ is_crowd = inputs['is_crowd']
|
|
|
|
|
+ else:
|
|
|
|
|
+ im_shape = inputs['im_shape']
|
|
|
|
|
|
|
|
# backbone
|
|
# backbone
|
|
|
body_feats = self.backbone(im)
|
|
body_feats = self.backbone(im)
|
|
|
|
|
+ body_feat_names = list(body_feats.keys())
|
|
|
|
|
|
|
|
# FPN
|
|
# FPN
|
|
|
spatial_scale = None
|
|
spatial_scale = None
|
|
@@ -192,28 +249,19 @@ class MaskRCNN(object):
|
|
|
rois = self.rpn_head.get_proposals(body_feats, im_info, mode=self.mode)
|
|
rois = self.rpn_head.get_proposals(body_feats, im_info, mode=self.mode)
|
|
|
|
|
|
|
|
if self.mode == 'train':
|
|
if self.mode == 'train':
|
|
|
- rpn_loss = self.rpn_head.get_loss(im_info, inputs['gt_box'],
|
|
|
|
|
- inputs['is_crowd'])
|
|
|
|
|
- outputs = fluid.layers.generate_proposal_labels(
|
|
|
|
|
|
|
+ rpn_loss = self.rpn_head.get_loss(im_info, gt_bbox, is_crowd)
|
|
|
|
|
+ outputs = self.bbox_assigner(
|
|
|
rpn_rois=rois,
|
|
rpn_rois=rois,
|
|
|
gt_classes=inputs['gt_label'],
|
|
gt_classes=inputs['gt_label'],
|
|
|
is_crowd=inputs['is_crowd'],
|
|
is_crowd=inputs['is_crowd'],
|
|
|
gt_boxes=inputs['gt_box'],
|
|
gt_boxes=inputs['gt_box'],
|
|
|
- im_info=inputs['im_info'],
|
|
|
|
|
- batch_size_per_im=self.batch_size_per_im,
|
|
|
|
|
- fg_fraction=self.fg_fraction,
|
|
|
|
|
- fg_thresh=self.fg_thresh,
|
|
|
|
|
- bg_thresh_hi=self.bg_thresh_hi,
|
|
|
|
|
- bg_thresh_lo=self.bg_thresh_lo,
|
|
|
|
|
- bbox_reg_weights=self.bbox_reg_weights,
|
|
|
|
|
- class_nums=self.num_classes,
|
|
|
|
|
- use_random=self.rpn_head.use_random)
|
|
|
|
|
|
|
+ im_info=inputs['im_info'])
|
|
|
|
|
|
|
|
rois = outputs[0]
|
|
rois = outputs[0]
|
|
|
labels_int32 = outputs[1]
|
|
labels_int32 = outputs[1]
|
|
|
|
|
|
|
|
if self.fpn is None:
|
|
if self.fpn is None:
|
|
|
- last_feat = body_feats[list(body_feats.keys())[-1]]
|
|
|
|
|
|
|
+ last_feat = body_feats[body_feat_names[-1]]
|
|
|
roi_feat = self.roi_extractor(last_feat, rois)
|
|
roi_feat = self.roi_extractor(last_feat, rois)
|
|
|
else:
|
|
else:
|
|
|
roi_feat = self.roi_extractor(body_feats, rois, spatial_scale)
|
|
roi_feat = self.roi_extractor(body_feats, rois, spatial_scale)
|
|
@@ -254,8 +302,7 @@ class MaskRCNN(object):
|
|
|
return {'proposal': rois}
|
|
return {'proposal': rois}
|
|
|
mask_name = 'mask_pred'
|
|
mask_name = 'mask_pred'
|
|
|
mask_pred, bbox_pred = self._eval(body_feats, mask_name, rois,
|
|
mask_pred, bbox_pred = self._eval(body_feats, mask_name, rois,
|
|
|
- im_info, inputs['im_shape'],
|
|
|
|
|
- spatial_scale)
|
|
|
|
|
|
|
+ im_info, im_shape, spatial_scale)
|
|
|
return OrderedDict(zip(['bbox', 'mask'], [bbox_pred, mask_pred]))
|
|
return OrderedDict(zip(['bbox', 'mask'], [bbox_pred, mask_pred]))
|
|
|
|
|
|
|
|
def _eval(self,
|
|
def _eval(self,
|