FlyingQianMM committed 5 years ago
commit 9b8fbb652a

+ 1 - 1
paddlex/cv/models/base.py

@@ -578,7 +578,7 @@ class BaseAPI:
                                     continue
                             log_writer.add_scalar(
                                 "Metrics/Eval(Epoch): {}".format(k), v, i + 1)
-                self.save_model(save_dir=current_save_dir)
+                #self.save_model(save_dir=current_save_dir)
                 if getattr(self, 'use_ema', False):
                     self.exe.run(self.ema.restore_program)
                 time_eval_one_epoch = time.time() - eval_epoch_start_time

+ 33 - 5
paddlex/cv/models/faster_rcnn.py

@@ -46,7 +46,11 @@ class FasterRCNN(BaseAPI):
                  backbone='ResNet50',
                  with_fpn=True,
                  aspect_ratios=[0.5, 1.0, 2.0],
-                 anchor_sizes=[32, 64, 128, 256, 512]):
+                 anchor_sizes=[32, 64, 128, 256, 512],
+                 with_dcn=False,
+                 rpn_cls_loss='SigmoidCrossEntropy',
+                 rpn_focal_loss_alpha=0.25,
+                 rpn_focal_loss_gamma=2):
         self.init_params = locals()
         super(FasterRCNN, self).__init__('detector')
         backbones = [
@@ -62,9 +66,17 @@ class FasterRCNN(BaseAPI):
         self.anchor_sizes = anchor_sizes
         self.labels = None
         self.fixed_input_shape = None
+        self.with_dcn = with_dcn
+        rpn_cls_losses = ['SigmoidFocalLoss', 'SigmoidCrossEntropy']
+        assert rpn_cls_loss in rpn_cls_losses, "rpn_cls_loss should be one of {}".format(
+            rpn_cls_losses)
+        self.rpn_cls_loss = rpn_cls_loss
+        self.rpn_focal_loss_alpha = rpn_focal_loss_alpha
+        self.rpn_focal_loss_gamma = rpn_focal_loss_gamma
 
     def _get_backbone(self, backbone_name):
         norm_type = None
+        lr_mult_list = [1.0, 1.0, 1.0, 1.0, 1.0]
         if backbone_name == 'ResNet18':
             layers = 18
             variant = 'b'
@@ -89,6 +101,11 @@ class FasterRCNN(BaseAPI):
             if self.with_fpn is False:
                 self.with_fpn = True
             return backbone
+        elif backbone_name == 'ResNet50_vd_ssld':
+            layers = 50
+            variant = 'd'
+            norm_type = 'bn'
+            lr_mult_list = [1.0, 0.05, 0.05, 0.1, 0.15]
         if self.with_fpn:
             backbone = paddlex.cv.nets.resnet.ResNet(
                 norm_type='bn' if norm_type is None else norm_type,
@@ -97,7 +114,9 @@ class FasterRCNN(BaseAPI):
                 freeze_norm=True,
                 norm_decay=0.,
                 feature_maps=[2, 3, 4, 5],
-                freeze_at=2)
+                freeze_at=2,
+                lr_mult_list=lr_mult_list,
+                dcn_v2_stages=[3, 4, 5] if self.with_dcn else [])
         else:
             backbone = paddlex.cv.nets.resnet.ResNet(
                 norm_type='affine_channel' if norm_type is None else norm_type,
@@ -106,7 +125,9 @@ class FasterRCNN(BaseAPI):
                 freeze_norm=True,
                 norm_decay=0.,
                 feature_maps=4,
-                freeze_at=2)
+                freeze_at=2,
+                lr_mult_list=lr_mult_list,
+                dcn_v2_stages=[3, 4, 5] if self.with_dcn else [])
         return backbone
 
     def build_net(self, mode='train'):
@@ -121,7 +142,10 @@ class FasterRCNN(BaseAPI):
             anchor_sizes=self.anchor_sizes,
             train_pre_nms_top_n=train_pre_nms_top_n,
             test_pre_nms_top_n=test_pre_nms_top_n,
-            fixed_input_shape=self.fixed_input_shape)
+            fixed_input_shape=self.fixed_input_shape,
+            rpn_cls_loss=self.rpn_cls_loss,
+            rpn_focal_loss_alpha=self.rpn_focal_loss_alpha,
+            rpn_focal_loss_gamma=self.rpn_focal_loss_gamma)
         inputs = model.generate_inputs()
         if mode == 'train':
             model_out = model.build_net(inputs)
@@ -376,7 +400,11 @@ class FasterRCNN(BaseAPI):
         return metrics
 
     @staticmethod
-    def _preprocess(images, transforms, model_type, class_name, thread_pool=None):
+    def _preprocess(images,
+                    transforms,
+                    model_type,
+                    class_name,
+                    thread_pool=None):
         arrange_transforms(
             model_type=model_type,
             class_name=class_name,
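For reference, a minimal sketch of how the new constructor arguments introduced in this file could be used (a hypothetical snippet, not part of the commit; num_classes and all values are placeholders, and 'ResNet50_vd_ssld' is assumed to be accepted by the backbone whitelist):

import paddlex as pdx

num_classes = 5  # hypothetical: number of target classes + 1 (background)
model = pdx.det.FasterRCNN(
    num_classes=num_classes,
    backbone='ResNet50_vd_ssld',      # new backbone branch with per-stage lr multipliers
    with_dcn=True,                    # enables DCNv2 in ResNet stages 3-5
    rpn_cls_loss='SigmoidFocalLoss',  # or the default 'SigmoidCrossEntropy'
    rpn_focal_loss_alpha=0.25,
    rpn_focal_loss_gamma=2)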

+ 11 - 2
paddlex/cv/nets/detection/faster_rcnn.py

@@ -63,6 +63,9 @@ class FasterRCNN(object):
             test_pre_nms_top_n=6000,
             test_post_nms_top_n=1000,
             test_nms_thresh=0.7,
+            rpn_cls_loss='SigmoidCrossEntropy',
+            rpn_focal_loss_alpha=0.25,
+            rpn_focal_loss_gamma=2,
             #roi_extractor
             roi_extractor=None,
             #bbox_head
@@ -104,7 +107,10 @@ class FasterRCNN(object):
                     train_nms_thresh=train_nms_thresh,
                     test_pre_nms_top_n=test_pre_nms_top_n,
                     test_post_nms_top_n=test_post_nms_top_n,
-                    test_nms_thresh=test_nms_thresh)
+                    test_nms_thresh=test_nms_thresh,
+                    rpn_cls_loss=rpn_cls_loss,
+                    rpn_focal_loss_alpha=rpn_focal_loss_alpha,
+                    rpn_focal_loss_gamma=rpn_focal_loss_gamma)
             else:
                 rpn_head = FPNRPNHead(
                     anchor_start_size=anchor_sizes[0],
@@ -121,7 +127,10 @@ class FasterRCNN(object):
                     train_nms_thresh=train_nms_thresh,
                     test_pre_nms_top_n=test_pre_nms_top_n,
                     test_post_nms_top_n=test_post_nms_top_n,
-                    test_nms_thresh=test_nms_thresh)
+                    test_nms_thresh=test_nms_thresh,
+                    rpn_cls_loss=rpn_cls_loss,
+                    rpn_focal_loss_alpha=rpn_focal_loss_alpha,
+                    rpn_focal_loss_gamma=rpn_focal_loss_gamma)
         self.rpn_head = rpn_head
         if roi_extractor is None:
             if self.fpn is None:

+ 110 - 39
paddlex/cv/nets/detection/rpn_head.py

@@ -16,10 +16,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
 from paddle import fluid
 from paddle.fluid.param_attr import ParamAttr
 from paddle.fluid.initializer import Normal
 from paddle.fluid.regularizer import L2Decay
+from paddle.fluid.initializer import Constant
 
 __all__ = ['RPNHead', 'FPNRPNHead']
 
@@ -39,6 +41,9 @@ class RPNHead(object):
             rpn_positive_overlap=0.7,
             rpn_negative_overlap=0.3,
             use_random=True,
+            rpn_cls_loss='SigmoidCrossEntropy',
+            rpn_focal_loss_gamma=2,
+            rpn_focal_loss_alpha=0.25,
             #train_proposal
             train_pre_nms_top_n=12000,
             train_post_nms_top_n=2000,
@@ -75,6 +80,9 @@ class RPNHead(object):
         self.test_min_size = test_min_size
         self.test_eta = test_eta
         self.num_classes = num_classes
+        self.rpn_cls_loss = rpn_cls_loss
+        self.rpn_focal_loss_gamma = rpn_focal_loss_gamma
+        self.rpn_focal_loss_alpha = rpn_focal_loss_alpha
 
     def _get_output(self, input):
         """
@@ -99,7 +107,8 @@ class RPNHead(object):
             act='relu',
             name='conv_rpn',
             param_attr=ParamAttr(
-                name="conv_rpn_w", initializer=Normal(loc=0., scale=0.01)),
+                name="conv_rpn_w", initializer=Normal(
+                    loc=0., scale=0.01)),
             bias_attr=ParamAttr(
                 name="conv_rpn_b", learning_rate=2., regularizer=L2Decay(0.)))
         # Generate anchors
@@ -111,6 +120,11 @@ class RPNHead(object):
             variance=self.variance)
         num_anchor = self.anchor.shape[2]
         # Proposal classification scores
+        if self.rpn_cls_loss == 'SigmoidCrossEntropy':
+            bias_init = None
+        elif self.rpn_cls_loss == 'SigmoidFocalLoss':
+            value = float(-np.log((1 - 0.01) / 0.01))
+            bias_init = Constant(value=value)
         self.rpn_cls_score = fluid.layers.conv2d(
             rpn_conv,
             num_filters=num_anchor * self.num_classes,
@@ -121,9 +135,11 @@ class RPNHead(object):
             name='rpn_cls_score',
             param_attr=ParamAttr(
                 name="rpn_cls_logits_w",
-                initializer=Normal(loc=0., scale=0.01)),
+                initializer=Normal(
+                    loc=0., scale=0.01)),
             bias_attr=ParamAttr(
                 name="rpn_cls_logits_b",
+                initializer=bias_init,
                 learning_rate=2.,
                 regularizer=L2Decay(0.)))
         # Proposal bbox regression deltas
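Note on the bias initialization above: in the 'SigmoidFocalLoss' branch the classification bias is set to -log((1 - pi) / pi) with prior probability pi = 0.01, i.e. about -4.595. This is the RetinaNet-style initialization that keeps the initial foreground probability near 0.01, so training is not destabilized by the overwhelming number of background anchors.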
@@ -136,8 +152,8 @@ class RPNHead(object):
             act=None,
             name='rpn_bbox_pred',
             param_attr=ParamAttr(
-                name="rpn_bbox_pred_w", initializer=Normal(loc=0.,
-                                                           scale=0.01)),
+                name="rpn_bbox_pred_w", initializer=Normal(
+                    loc=0., scale=0.01)),
             bias_attr=ParamAttr(
                 name="rpn_bbox_pred_b",
                 learning_rate=2.,
@@ -251,25 +267,58 @@ class RPNHead(object):
         """
         rpn_cls, rpn_bbox, anchor, anchor_var = self._get_loss_input()
         if self.num_classes == 1:
-            score_pred, loc_pred, score_tgt, loc_tgt, bbox_weight = \
-                fluid.layers.rpn_target_assign(
-                    bbox_pred=rpn_bbox,
-                    cls_logits=rpn_cls,
-                    anchor_box=anchor,
-                    anchor_var=anchor_var,
-                    gt_boxes=gt_box,
-                    is_crowd=is_crowd,
-                    im_info=im_info,
-                    rpn_batch_size_per_im=self.rpn_batch_size_per_im,
-                    rpn_straddle_thresh=self.rpn_straddle_thresh,
-                    rpn_fg_fraction=self.rpn_fg_fraction,
-                    rpn_positive_overlap=self.rpn_positive_overlap,
-                    rpn_negative_overlap=self.rpn_negative_overlap,
-                    use_random=self.use_random)
-            score_tgt = fluid.layers.cast(x=score_tgt, dtype='float32')
-            score_tgt.stop_gradient = True
-            rpn_cls_loss = fluid.layers.sigmoid_cross_entropy_with_logits(
-                x=score_pred, label=score_tgt)
+            if self.rpn_cls_loss == 'SigmoidCrossEntropy':
+                score_pred, loc_pred, score_tgt, loc_tgt, bbox_weight = \
+                    fluid.layers.rpn_target_assign(
+                        bbox_pred=rpn_bbox,
+                        cls_logits=rpn_cls,
+                        anchor_box=anchor,
+                        anchor_var=anchor_var,
+                        gt_boxes=gt_box,
+                        is_crowd=is_crowd,
+                        im_info=im_info,
+                        rpn_batch_size_per_im=self.rpn_batch_size_per_im,
+                        rpn_straddle_thresh=self.rpn_straddle_thresh,
+                        rpn_fg_fraction=self.rpn_fg_fraction,
+                        rpn_positive_overlap=self.rpn_positive_overlap,
+                        rpn_negative_overlap=self.rpn_negative_overlap,
+                        use_random=self.use_random)
+                score_tgt = fluid.layers.cast(x=score_tgt, dtype='float32')
+                score_tgt.stop_gradient = True
+                rpn_cls_loss = fluid.layers.sigmoid_cross_entropy_with_logits(
+                    x=score_pred, label=score_tgt)
+            elif self.rpn_cls_loss == 'SigmoidFocalLoss':
+                binary_gt_label = fluid.layers.full_like(
+                    gt_box, fill_value=1, dtype='int32')
+                binary_gt_label = fluid.layers.reduce_sum(
+                    binary_gt_label, dim=1, keep_dim=True)
+                data = fluid.layers.fill_constant(
+                    shape=[1], value=4, dtype='int32')
+                binary_gt_label = fluid.layers.greater_equal(binary_gt_label,
+                                                             data)
+                binary_gt_label = fluid.layers.cast(
+                    binary_gt_label, dtype='int32')
+                score_pred, loc_pred, score_tgt, loc_tgt, bbox_weight, fg_num = \
+                    fluid.layers.retinanet_target_assign(
+                        bbox_pred=rpn_bbox,
+                        cls_logits=rpn_cls,
+                        anchor_box=anchor,
+                        anchor_var=anchor_var,
+                        gt_boxes=gt_box,
+                        gt_labels=binary_gt_label,
+                        is_crowd=is_crowd,
+                        im_info=im_info,
+                        positive_overlap=self.rpn_positive_overlap,
+                        negative_overlap=self.rpn_negative_overlap,
+                        num_classes=1)
+                fg_num = fluid.layers.reduce_sum(fg_num, name='fg_num')
+                score_tgt = fluid.layers.cast(score_tgt, 'int32')
+                rpn_cls_loss = fluid.layers.sigmoid_focal_loss(
+                    x=score_pred,
+                    label=score_tgt,
+                    fg_num=fg_num,
+                    gamma=self.rpn_focal_loss_gamma,
+                    alpha=self.rpn_focal_loss_alpha)
         else:
             score_pred, loc_pred, score_tgt, loc_tgt, bbox_weight = \
                 fluid.layers.rpn_target_assign(
@@ -295,8 +344,12 @@ class RPNHead(object):
                 label=labels_int64,
                 numeric_stable_mode=True)
 
-        rpn_cls_loss = fluid.layers.reduce_mean(
-            rpn_cls_loss, name='loss_rpn_cls')
+        if self.rpn_cls_loss == 'SigmoidCrossEntropy':
+            rpn_cls_loss = fluid.layers.reduce_mean(
+                rpn_cls_loss, name='loss_rpn_cls')
+        elif self.rpn_cls_loss == 'SigmoidFocalLoss':
+            rpn_cls_loss = fluid.layers.reduce_sum(
+                rpn_cls_loss, name='loss_rpn_cls')
 
         loc_tgt = fluid.layers.cast(x=loc_tgt, dtype='float32')
         loc_tgt.stop_gradient = True
@@ -308,12 +361,15 @@ class RPNHead(object):
             outside_weight=bbox_weight)
         rpn_reg_loss = fluid.layers.reduce_sum(
             rpn_reg_loss, name='loss_rpn_bbox')
-        score_shape = fluid.layers.shape(score_tgt)
-        score_shape = fluid.layers.cast(x=score_shape, dtype='float32')
-        norm = fluid.layers.reduce_prod(score_shape)
-        norm.stop_gradient = True
-        rpn_reg_loss = rpn_reg_loss / norm
-
+        if self.rpn_cls_loss == 'SigmoidCrossEntropy':
+            score_shape = fluid.layers.shape(score_tgt)
+            score_shape = fluid.layers.cast(x=score_shape, dtype='float32')
+            norm = fluid.layers.reduce_prod(score_shape)
+            norm.stop_gradient = True
+            rpn_reg_loss = rpn_reg_loss / norm
+        elif self.rpn_cls_loss == 'SigmoidFocalLoss':
+            rpn_reg_loss = rpn_reg_loss / fluid.layers.cast(fg_num,
+                                                            rpn_reg_loss.dtype)
         return {'loss_rpn_cls': rpn_cls_loss, 'loss_rpn_bbox': rpn_reg_loss}
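Note on normalization: in the 'SigmoidCrossEntropy' branch the classification loss is averaged and the regression loss is divided by the number of score targets, as before; in the 'SigmoidFocalLoss' branch both losses are summed and the regression loss is divided by fg_num, the number of foreground anchors, following the RetinaNet convention.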
 
 
@@ -333,6 +389,9 @@ class FPNRPNHead(RPNHead):
             rpn_positive_overlap=0.7,
             rpn_negative_overlap=0.3,
             use_random=True,
+            rpn_cls_loss='SigmoidCrossEntropy',
+            rpn_focal_loss_gamma=2,
+            rpn_focal_loss_alpha=0.25,
             #train_proposal
             train_pre_nms_top_n=2000,
             train_post_nms_top_n=2000,
@@ -366,7 +425,10 @@ class FPNRPNHead(RPNHead):
             test_nms_thresh=test_nms_thresh,
             test_min_size=test_min_size,
             test_eta=test_eta,
-            num_classes=num_classes)
+            num_classes=num_classes,
+            rpn_cls_loss=rpn_cls_loss,
+            rpn_focal_loss_gamma=rpn_focal_loss_gamma,
+            rpn_focal_loss_alpha=rpn_focal_loss_alpha)
         self.anchor_start_size = anchor_start_size
         self.num_chan = num_chan
         self.min_level = min_level
@@ -410,7 +472,8 @@ class FPNRPNHead(RPNHead):
             name=conv_name,
             param_attr=ParamAttr(
                 name=conv_share_name + '_w',
-                initializer=Normal(loc=0., scale=0.01)),
+                initializer=Normal(
+                    loc=0., scale=0.01)),
             bias_attr=ParamAttr(
                 name=conv_share_name + '_b',
                 learning_rate=2.,
@@ -418,13 +481,18 @@ class FPNRPNHead(RPNHead):
 
         self.anchors, self.anchor_var = fluid.layers.anchor_generator(
             input=conv_rpn_fpn,
-            anchor_sizes=(self.anchor_start_size * 2.**
-                          (feat_lvl - self.min_level), ),
+            anchor_sizes=(self.anchor_start_size * 2.
+                          **(feat_lvl - self.min_level), ),
             stride=(2.**feat_lvl, 2.**feat_lvl),
             aspect_ratios=self.aspect_ratios,
             variance=self.variance)
 
         cls_num_filters = num_anchors * self.num_classes
+        if self.rpn_cls_loss == 'SigmoidCrossEntropy':
+            bias_init = None
+        elif self.rpn_cls_loss == 'SigmoidFocalLoss':
+            value = float(-np.log((1 - 0.01) / 0.01))
+            bias_init = Constant(value=value)
         self.rpn_cls_score = fluid.layers.conv2d(
             input=conv_rpn_fpn,
             num_filters=cls_num_filters,
@@ -433,9 +501,11 @@ class FPNRPNHead(RPNHead):
             name=cls_name,
             param_attr=ParamAttr(
                 name=cls_share_name + '_w',
-                initializer=Normal(loc=0., scale=0.01)),
+                initializer=Normal(
+                    loc=0., scale=0.01)),
             bias_attr=ParamAttr(
                 name=cls_share_name + '_b',
+                initializer=bias_init,
                 learning_rate=2.,
                 regularizer=L2Decay(0.)))
         self.rpn_bbox_pred = fluid.layers.conv2d(
@@ -446,7 +516,8 @@ class FPNRPNHead(RPNHead):
             name=bbox_name,
             param_attr=ParamAttr(
                 name=bbox_share_name + '_w',
-                initializer=Normal(loc=0., scale=0.01)),
+                initializer=Normal(
+                    loc=0., scale=0.01)),
             bias_attr=ParamAttr(
                 name=bbox_share_name + '_b',
                 learning_rate=2.,
@@ -471,8 +542,8 @@ class FPNRPNHead(RPNHead):
                 shape of (rois_num, 1).
         """
 
-        rpn_cls_score_fpn, rpn_bbox_pred_fpn = self._get_output(
-            body_feat, feat_lvl)
+        rpn_cls_score_fpn, rpn_bbox_pred_fpn = self._get_output(body_feat,
+                                                                feat_lvl)
 
         if self.num_classes == 1:
             rpn_cls_prob_fpn = fluid.layers.sigmoid(

+ 3 - 6
paddlex/cv/nets/resnet.py

@@ -151,10 +151,8 @@ class ResNet(object):
                    groups=1,
                    act=None,
                    name=None,
-                   dcn_v2=False,
-                   use_lr_mult_list=False):
-        lr_mult = self.lr_mult_list[
-            self.curr_stage] if use_lr_mult_list else 1.0
+                   dcn_v2=False):
+        lr_mult = self.lr_mult_list[self.curr_stage]
         _name = self.prefix_name + name if self.prefix_name != '' else name
         if not dcn_v2:
             conv = fluid.layers.conv2d(
@@ -269,8 +267,7 @@ class ResNet(object):
                     pool_padding=0,
                     ceil_mode=True,
                     pool_type='avg')
-                return self._conv_norm(
-                    input, ch_out, 1, 1, name=name, use_lr_mult_list=True)
+                return self._conv_norm(input, ch_out, 1, 1, name=name)
             return self._conv_norm(input, ch_out, 1, stride, name=name)
         else:
             return input
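After this change every convolution picks up its learning-rate multiplier from lr_mult_list[self.curr_stage] unconditionally, instead of only when use_lr_mult_list was set. The FasterRCNN changes above always pass lr_mult_list (defaulting to all ones), so behavior is unchanged unless a backbone overrides the list, as the ResNet50_vd_ssld branch does with [1.0, 0.05, 0.05, 0.1, 0.15].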

+ 5 - 2
paddlex/tools/base.py

@@ -18,6 +18,7 @@ import json
 import chardet
 import numpy as np
 
+
 class MyEncoder(json.JSONEncoder):
     def default(self, obj):
         if isinstance(obj, np.integer):
@@ -28,7 +29,8 @@ class MyEncoder(json.JSONEncoder):
             return obj.tolist()
         else:
             return super(MyEncoder, self).default(obj)
-        
+
+
 def is_pic(img_name):
     valid_suffix = ["JPEG", "jpeg", "JPG", "jpg", "BMP", "bmp", "PNG", "png"]
     suffix = img_name.split(".")[-1]
@@ -36,9 +38,10 @@ def is_pic(img_name):
         return False
     return True
 
+
 def get_encoding(path):
     f = open(path, 'rb')
     data = f.read()
     file_encoding = chardet.detect(data).get('encoding')
     f.close()
-    return file_encoding
+    return file_encoding

+ 6 - 1
paddlex/tools/convert.py

@@ -19,6 +19,7 @@ from .x2imagenet import JingLing2ImageNet
 from .x2coco import LabelMe2COCO
 from .x2coco import EasyData2COCO
 from .x2coco import JingLing2COCO
+from .x2coco import VOC2COCO
 from .x2voc import LabelMe2VOC
 from .x2voc import EasyData2VOC
 from .x2seg import JingLing2Seg
@@ -30,12 +31,14 @@ jingling2imagenet = JingLing2ImageNet().convert
 labelme2coco = LabelMe2COCO().convert
 easydata2coco = EasyData2COCO().convert
 jingling2coco = JingLing2COCO().convert
+voc2coco = VOC2COCO().convert
 labelme2voc = LabelMe2VOC().convert
 easydata2voc = EasyData2VOC().convert
 jingling2seg = JingLing2Seg().convert
 labelme2seg = LabelMe2Seg().convert
 easydata2seg = EasyData2Seg().convert
 
+
 def dataset_conversion(source, to, pics, anns, save_dir):
     if source == 'labelme' and to == 'PascalVOC':
         labelme2voc(pics, anns, save_dir)
@@ -47,6 +50,8 @@ def dataset_conversion(source, to, pics, anns, save_dir):
         jingling2imagenet(pics, anns, save_dir)
     elif source == 'jingling' and to == 'MSCOCO':
         jingling2coco(pics, anns, save_dir)
+    elif source == 'PascalVOC' and to == 'MSCOCO':
+        voc2coco(pics, anns, save_dir)
     elif source == 'jingling' and to == 'SEG':
         jingling2seg(pics, anns, save_dir)
     elif source == 'easydata' and to == 'ImageNet':
@@ -56,4 +61,4 @@ def dataset_conversion(source, to, pics, anns, save_dir):
     elif source == 'easydata' and to == 'MSCOCO':
         easydata2coco(pics, anns, save_dir)
     elif source == 'easydata' and to == 'SEG':
-        easydata2seg(pics, anns, save_dir)
+        easydata2seg(pics, anns, save_dir)
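A minimal sketch of driving the new conversion path (hypothetical directory names; the signature matches dataset_conversion above):

from paddlex.tools.convert import dataset_conversion

# Convert PascalVOC images + XML annotations into an MSCOCO-format dataset.
dataset_conversion(
    source='PascalVOC',
    to='MSCOCO',
    pics='dataset/JPEGImages',    # placeholder image directory
    anns='dataset/Annotations',   # placeholder VOC XML directory
    save_dir='dataset_coco')      # placeholder output directory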

+ 295 - 76
paddlex/tools/x2coco.py

@@ -19,26 +19,30 @@ import json
 import os
 import os.path as osp
 import shutil
+import re
 import numpy as np
 import PIL.ImageDraw
+import xml.etree.ElementTree as ET
 from .base import MyEncoder, is_pic, get_encoding
 from paddlex.utils import path_normalization
-        
-        
+import paddlex.utils.logging as logging
+
+
 class X2COCO(object):
     def __init__(self):
         self.images_list = []
         self.categories_list = []
         self.annotations_list = []
-    
+
     def generate_categories_field(self, label, labels_list):
         category = {}
         category["supercategory"] = "component"
         category["id"] = len(labels_list) + 1
         category["name"] = label
         return category
-    
-    def generate_rectangle_anns_field(self, points, label, image_id, object_id, label_to_num):
+
+    def generate_rectangle_anns_field(self, points, label, image_id, object_id,
+                                      label_to_num):
         annotation = {}
         seg_points = np.asarray(points).copy()
         seg_points[1, :] = np.asarray(points)[2, :]
@@ -48,14 +52,14 @@ class X2COCO(object):
         annotation["image_id"] = image_id + 1
         annotation["bbox"] = list(
             map(float, [
-                points[0][0], points[0][1], points[1][0] - points[0][0], points[1][
-                    1] - points[0][1]
+                points[0][0], points[0][1], points[1][0] - points[0][0],
+                points[1][1] - points[0][1]
             ]))
         annotation["area"] = annotation["bbox"][2] * annotation["bbox"][3]
         annotation["category_id"] = label_to_num[label]
         annotation["id"] = object_id + 1
         return annotation
-    
+
+    def convert(self, image_dir, json_dir, dataset_save_dir):
         """Convert.
         Args:
@@ -74,8 +78,8 @@ class X2COCO(object):
         for img_name in os.listdir(image_dir):
             if is_pic(img_name):
                 shutil.copyfile(
-                            osp.join(image_dir, img_name),
-                            osp.join(new_image_dir, img_name))
+                    osp.join(image_dir, img_name),
+                    osp.join(new_image_dir, img_name))
         # Convert the json files.
         self.parse_json(new_image_dir, json_dir)
         coco_data = {}
@@ -83,42 +87,40 @@ class X2COCO(object):
         coco_data["categories"] = self.categories_list
         coco_data["annotations"] = self.annotations_list
         json_path = osp.join(dataset_save_dir, "annotations.json")
-        json.dump(
-            coco_data,
-            open(json_path, "w"),
-            indent=4,
-            cls=MyEncoder)
-    
-    
+        json.dump(coco_data, open(json_path, "w"), indent=4, cls=MyEncoder)
+
+
 class LabelMe2COCO(X2COCO):
     """Convert a dataset annotated with LabelMe to the COCO format.
     """
+
     def __init__(self):
         super(LabelMe2COCO, self).__init__()
-        
+
     def generate_images_field(self, json_info, image_file, image_id):
         image = {}
         image["height"] = json_info["imageHeight"]
         image["width"] = json_info["imageWidth"]
         image["id"] = image_id + 1
         json_img_path = path_normalization(json_info["imagePath"])
-        json_info["imagePath"] = osp.join(osp.split(json_img_path)[0], image_file)
+        json_info["imagePath"] = osp.join(
+            osp.split(json_img_path)[0], image_file)
         image["file_name"] = osp.split(json_info["imagePath"])[-1]
         return image
-    
-    def generate_polygon_anns_field(self, height, width, 
-                                    points, label, image_id, 
-                                    object_id, label_to_num):
+
+    def generate_polygon_anns_field(self, height, width, points, label,
+                                    image_id, object_id, label_to_num):
         annotation = {}
         annotation["segmentation"] = [list(np.asarray(points).flatten())]
         annotation["iscrowd"] = 0
         annotation["image_id"] = image_id + 1
-        annotation["bbox"] = list(map(float, self.get_bbox(height, width, points)))
+        annotation["bbox"] = list(
+            map(float, self.get_bbox(height, width, points)))
         annotation["area"] = annotation["bbox"][2] * annotation["bbox"][3]
         annotation["category_id"] = label_to_num[label]
         annotation["id"] = object_id + 1
         return annotation
-    
+
     def get_bbox(self, height, width, points):
         polygons = points
         mask = np.zeros([height, width], dtype=np.uint8)
@@ -137,7 +139,7 @@ class LabelMe2COCO(X2COCO):
             left_top_c, left_top_r, right_bottom_c - left_top_c,
             right_bottom_r - left_top_r
         ]
-    
+
     def parse_json(self, img_dir, json_dir):
         image_id = -1
         object_id = -1
@@ -153,7 +155,8 @@ class LabelMe2COCO(X2COCO):
             with open(json_file, mode='r', \
                               encoding=get_encoding(json_file)) as j:
                 json_info = json.load(j)
-                img_info = self.generate_images_field(json_info, img_file, image_id)
+                img_info = self.generate_images_field(json_info, img_file,
+                                                      image_id)
                 self.images_list.append(img_info)
                 for shapes in json_info["shapes"]:
                     object_id = object_id + 1
@@ -167,23 +170,26 @@ class LabelMe2COCO(X2COCO):
                     p_type = shapes["shape_type"]
                     if p_type == "polygon":
                         self.annotations_list.append(
-                            self.generate_polygon_anns_field(json_info["imageHeight"], json_info[
-                                "imageWidth"], points, label, image_id,
-                                                object_id, label_to_num))
+                            self.generate_polygon_anns_field(
+                                json_info["imageHeight"], json_info[
+                                    "imageWidth"], points, label, image_id,
+                                object_id, label_to_num))
                     if p_type == "rectangle":
                         points.append([points[0][0], points[1][1]])
                         points.append([points[1][0], points[0][1]])
                         self.annotations_list.append(
-                            self.generate_rectangle_anns_field(points, label, image_id,
-                                                  object_id, label_to_num))
-                        
-    
+                            self.generate_rectangle_anns_field(
+                                points, label, image_id, object_id,
+                                label_to_num))
+
+
 class EasyData2COCO(X2COCO):
     """Convert a detection or segmentation dataset annotated with EasyData to the COCO format.
     """
+
     def __init__(self):
-        super(EasyData2COCO, self).__init__()        
-    
+        super(EasyData2COCO, self).__init__()
+
     def generate_images_field(self, img_path, image_id):
         image = {}
         img = cv2.imread(img_path)
@@ -193,23 +199,23 @@ class EasyData2COCO(X2COCO):
         img_path = path_normalization(img_path)
         image["file_name"] = osp.split(img_path)[-1]
         return image
-    
-    def generate_polygon_anns_field(self, points, segmentation, 
-                                    label, image_id, object_id,
-                                    label_to_num):
+
+    def generate_polygon_anns_field(self, points, segmentation, label,
+                                    image_id, object_id, label_to_num):
         annotation = {}
         annotation["segmentation"] = segmentation
         annotation["iscrowd"] = 1 if len(segmentation) > 1 else 0
         annotation["image_id"] = image_id + 1
-        annotation["bbox"] = list(map(float, [
-                points[0][0], points[0][1], points[1][0] - points[0][0], points[1][
-                    1] - points[0][1]
+        annotation["bbox"] = list(
+            map(float, [
+                points[0][0], points[0][1], points[1][0] - points[0][0],
+                points[1][1] - points[0][1]
             ]))
         annotation["area"] = annotation["bbox"][2] * annotation["bbox"][3]
         annotation["category_id"] = label_to_num[label]
         annotation["id"] = object_id + 1
         return annotation
-        
+
     def parse_json(self, img_dir, json_dir):
         from pycocotools.mask import decode
         image_id = -1
@@ -226,7 +232,8 @@ class EasyData2COCO(X2COCO):
             with open(json_file, mode='r', \
                               encoding=get_encoding(json_file)) as j:
                 json_info = json.load(j)
-                img_info = self.generate_images_field(osp.join(img_dir, img_file), image_id)
+                img_info = self.generate_images_field(
+                    osp.join(img_dir, img_file), image_id)
                 self.images_list.append(img_info)
                 for shapes in json_info["labels"]:
                     object_id = object_id + 1
@@ -242,31 +249,36 @@ class EasyData2COCO(X2COCO):
                         points.append([points[0][0], points[1][1]])
                         points.append([points[1][0], points[0][1]])
                         self.annotations_list.append(
-                            self.generate_rectangle_anns_field(points, label, image_id,
-                                                  object_id, label_to_num))
+                            self.generate_rectangle_anns_field(
+                                points, label, image_id, object_id,
+                                label_to_num))
                     else:
                         mask_dict = {}
-                        mask_dict['size'] = [img_info["height"], img_info["width"]]
+                        mask_dict[
+                            'size'] = [img_info["height"], img_info["width"]]
                         mask_dict['counts'] = shapes['mask'].encode()
                         mask = decode(mask_dict)
                         contours, hierarchy = cv2.findContours(
-                                (mask).astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
+                            (mask).astype(np.uint8), cv2.RETR_TREE,
+                            cv2.CHAIN_APPROX_SIMPLE)
                         segmentation = []
                         for contour in contours:
                             contour_list = contour.flatten().tolist()
                             if len(contour_list) > 4:
                                 segmentation.append(contour_list)
                         self.annotations_list.append(
-                            self.generate_polygon_anns_field(points, segmentation, label, image_id, object_id,
-                                                label_to_num))
-                        
+                            self.generate_polygon_anns_field(
+                                points, segmentation, label, image_id,
+                                object_id, label_to_num))
+
 
 class JingLing2COCO(X2COCO):
     """Convert a detection or segmentation dataset annotated with JingLing to the COCO format.
     """
+
     def __init__(self):
         super(JingLing2COCO, self).__init__()
-        
+
     def generate_images_field(self, json_info, image_id):
         image = {}
         image["height"] = json_info["size"]["height"]
@@ -275,20 +287,20 @@ class JingLing2COCO(X2COCO):
         json_info["path"] = path_normalization(json_info["path"])
         image["file_name"] = osp.split(json_info["path"])[-1]
         return image
-    
-    def generate_polygon_anns_field(self, height, width, 
-                                    points, label, image_id, 
-                                    object_id, label_to_num):
+
+    def generate_polygon_anns_field(self, height, width, points, label,
+                                    image_id, object_id, label_to_num):
         annotation = {}
         annotation["segmentation"] = [list(np.asarray(points).flatten())]
         annotation["iscrowd"] = 0
         annotation["image_id"] = image_id + 1
-        annotation["bbox"] = list(map(float, self.get_bbox(height, width, points)))
+        annotation["bbox"] = list(
+            map(float, self.get_bbox(height, width, points)))
         annotation["area"] = annotation["bbox"][2] * annotation["bbox"][3]
         annotation["category_id"] = label_to_num[label]
         annotation["id"] = object_id + 1
         return annotation
-    
+
     def get_bbox(self, height, width, points):
         polygons = points
         mask = np.zeros([height, width], dtype=np.uint8)
@@ -307,7 +319,7 @@ class JingLing2COCO(X2COCO):
             left_top_c, left_top_r, right_bottom_c - left_top_c,
             right_bottom_r - left_top_r
         ]
-        
+
     def parse_json(self, img_dir, json_dir):
         image_id = -1
         object_id = -1
@@ -329,7 +341,7 @@ class JingLing2COCO(X2COCO):
                 for i, obj in enumerate(json_info["outputs"]["object"]):
                     if i == 0:
                         if "polygon" in obj:
-                            anns_type = "polygon" 
+                            anns_type = "polygon"
                     else:
                         if anns_type not in obj:
                             continue
@@ -343,22 +355,229 @@ class JingLing2COCO(X2COCO):
                     if anns_type == "polygon":
                         points = []
                         for j in range(int(len(obj["polygon"]) / 2.0)):
-                            points.append([obj["polygon"]["x" + str(j + 1)], 
-                                           obj["polygon"]["y" + str(j + 1)]])
+                            points.append([
+                                obj["polygon"]["x" + str(j + 1)],
+                                obj["polygon"]["y" + str(j + 1)]
+                            ])
                         self.annotations_list.append(
-                            self.generate_polygon_anns_field(json_info["size"]["height"], 
-                                                             json_info["size"]["width"], 
-                                                             points, 
-                                                             label, 
-                                                             image_id,
-                                                             object_id, 
-                                                             label_to_num))
+                            self.generate_polygon_anns_field(
+                                json_info["size"]["height"], json_info["size"][
+                                    "width"], points, label, image_id,
+                                object_id, label_to_num))
                     if anns_type == "bndbox":
                         points = []
-                        points.append([obj["bndbox"]["xmin"], obj["bndbox"]["ymin"]])
-                        points.append([obj["bndbox"]["xmax"], obj["bndbox"]["ymax"]])
-                        points.append([obj["bndbox"]["xmin"], obj["bndbox"]["ymax"]])
-                        points.append([obj["bndbox"]["xmax"], obj["bndbox"]["ymin"]])
+                        points.append(
+                            [obj["bndbox"]["xmin"], obj["bndbox"]["ymin"]])
+                        points.append(
+                            [obj["bndbox"]["xmax"], obj["bndbox"]["ymax"]])
+                        points.append(
+                            [obj["bndbox"]["xmin"], obj["bndbox"]["ymax"]])
+                        points.append(
+                            [obj["bndbox"]["xmax"], obj["bndbox"]["ymin"]])
                         self.annotations_list.append(
-                            self.generate_rectangle_anns_field(points, label, image_id,
-                                                  object_id, label_to_num))
+                            self.generate_rectangle_anns_field(
+                                points, label, image_id, object_id,
+                                label_to_num))
+
+
+class VOC2COCO(X2COCO):
+    """Convert a dataset annotated in the PascalVOC format to the COCO format.
+    """
+
+    def __init__(self):
+        super(VOC2COCO, self).__init__()
+
+    def generate_categories_field(self, label, labels_list):
+        category = {}
+        category["supercategory"] = "component"
+        category["id"] = len(labels_list) + 1
+        category["name"] = label
+        return category
+
+    def generate_images_field(self, xml_info, image_file, image_id):
+        image = {}
+        image["height"] = xml_info["imageHeight"]
+        image["width"] = xml_info["imageWidth"]
+        image["id"] = image_id + 1
+        image["imagePath"] = image_file
+        image["file_name"] = osp.split(image_file)[-1]
+        return image
+
+    def generate_label_list(self, xml_dir):
+        xml_dir_dir = os.path.abspath(
+            os.path.join(os.path.dirname(xml_dir), os.path.pardir))
+        self.labels_list = []
+        self.label_to_num = {}
+        if osp.exists(osp.join(xml_dir_dir, 'labels.txt')):
+            with open(osp.join(xml_dir_dir, 'labels.txt'), 'r') as fr:
+                while True:
+                    label = fr.readline().strip()
+                    if not label:
+                        break
+                    if label not in self.labels_list:
+                        self.categories_list.append(\
+                            self.generate_categories_field(label, self.labels_list))
+                        self.labels_list.append(label)
+                        self.label_to_num[label] = len(self.labels_list)
+            return
+        logging.info(
+            'labels.txt is not found in the folder {}, so categories are ordered arbitrarily in annotations.json.'.
+            format(xml_dir_dir))
+        return
+
+    def parse_xml(self, xml_file):
+        xml_info = {'im_info': {}, 'annotations': []}
+        tree = ET.parse(xml_file)
+        pattern = re.compile('<object>', re.IGNORECASE)
+        obj_match = pattern.findall(str(ET.tostringlist(tree.getroot())))
+        obj_tag = obj_match[0][1:-1]
+        objs = tree.findall(obj_tag)
+        pattern = re.compile('<size>', re.IGNORECASE)
+        size_tag = pattern.findall(str(ET.tostringlist(tree.getroot())))[0][1:
+                                                                            -1]
+        size_element = tree.find(size_tag)
+        pattern = re.compile('<width>', re.IGNORECASE)
+        width_tag = pattern.findall(str(ET.tostringlist(size_element)))[0][1:
+                                                                           -1]
+        im_w = float(size_element.find(width_tag).text)
+        pattern = re.compile('<height>', re.IGNORECASE)
+        height_tag = pattern.findall(str(ET.tostringlist(size_element)))[0][1:
+                                                                            -1]
+        im_h = float(size_element.find(height_tag).text)
+        xml_info['im_info']['imageWidth'] = im_w
+        xml_info['im_info']['imageHeight'] = im_h
+        for i, obj in enumerate(objs):
+            pattern = re.compile('<name>', re.IGNORECASE)
+            name_tag = pattern.findall(str(ET.tostringlist(obj)))[0][1:-1]
+            cname = obj.find(name_tag).text.strip()
+            pattern = re.compile('<bndbox>', re.IGNORECASE)
+            box_tag = pattern.findall(str(ET.tostringlist(obj)))[0][1:-1]
+            box_element = obj.find(box_tag)
+            pattern = re.compile('<xmin>', re.IGNORECASE)
+            xmin_tag = pattern.findall(str(ET.tostringlist(box_element)))[0][
+                1:-1]
+            x1 = float(box_element.find(xmin_tag).text)
+            pattern = re.compile('<ymin>', re.IGNORECASE)
+            ymin_tag = pattern.findall(str(ET.tostringlist(box_element)))[0][
+                1:-1]
+            y1 = float(box_element.find(ymin_tag).text)
+            pattern = re.compile('<xmax>', re.IGNORECASE)
+            xmax_tag = pattern.findall(str(ET.tostringlist(box_element)))[0][
+                1:-1]
+            x2 = float(box_element.find(xmax_tag).text)
+            pattern = re.compile('<ymax>', re.IGNORECASE)
+            ymax_tag = pattern.findall(str(ET.tostringlist(box_element)))[0][
+                1:-1]
+            y2 = float(box_element.find(ymax_tag).text)
+            x1 = max(0, x1)
+            y1 = max(0, y1)
+            if im_w > 0.5 and im_h > 0.5:
+                x2 = min(im_w - 1, x2)
+                y2 = min(im_h - 1, y2)
+            xml_info['annotations'].append({
+                'bbox': [[x1, y1], [x2, y2], [x1, y2], [x2, y1]],
+                'category': cname,
+            })
+        return xml_info
+
+    def parse_json(self, img_dir, xml_dir, file_list=None):
+        image_id = -1
+        object_id = -1
+        self.generate_label_list(xml_dir)
+        for img_file in os.listdir(img_dir):
+            if file_list is not None and img_file not in file_list:
+                continue
+            img_name_part = osp.splitext(img_file)[0]
+            xml_file = osp.join(xml_dir, img_name_part + ".xml")
+            if not osp.exists(xml_file):
+                os.remove(osp.join(img_dir, img_file))
+                continue
+            image_id = image_id + 1
+            xml_info = self.parse_xml(xml_file)
+            img_info = self.generate_images_field(xml_info['im_info'],
+                                                  osp.join(img_dir, img_file),
+                                                  image_id)
+            self.images_list.append(img_info)
+            annos = xml_info['annotations']
+            for anno in annos:
+                object_id = object_id + 1
+                label = anno["category"]
+                if label not in self.labels_list:
+                    self.categories_list.append(\
+                        self.generate_categories_field(label, self.labels_list))
+                    self.labels_list.append(label)
+                    self.label_to_num[label] = len(self.labels_list)
+                self.annotations_list.append(
+                    self.generate_rectangle_anns_field(anno[
+                        'bbox'], label, image_id, object_id,
+                                                       self.label_to_num))
+
+    def convert(self, image_dir, json_dir, dataset_save_dir):
+        """Convert.
+        Args:
+            image_dir (str): Directory where the image files are stored.
+            json_dir (str): Directory where the annotation (XML) file for each image is stored.
+            dataset_save_dir (str): Directory where the converted dataset is saved.
+        """
+        assert osp.exists(image_dir), "The image folder does not exist!"
+        assert osp.exists(json_dir), "The json folder does not exist!"
+        assert osp.exists(dataset_save_dir), "The save folder does not exist!"
+        # Convert the image files.
+        new_image_dir = osp.join(dataset_save_dir, "JPEGImages")
+        if osp.exists(new_image_dir):
+            shutil.rmtree(new_image_dir)
+        os.makedirs(new_image_dir)
+        for img_name in os.listdir(image_dir):
+            if is_pic(img_name):
+                shutil.copyfile(
+                    osp.join(image_dir, img_name),
+                    osp.join(new_image_dir, img_name))
+        # Convert the json files.
+        xml_dir_dir = os.path.abspath(
+            os.path.join(os.path.dirname(json_dir), os.path.pardir))
+        for part in ['train', 'val', 'test']:
+            part_list_file = osp.join(xml_dir_dir, '{}_list.txt'.format(part))
+            if osp.exists(part_list_file):
+                file_list = list()
+                with open(part_list_file, 'r') as f:
+                    while True:
+                        line = f.readline()
+                        if not line:
+                            break
+                        if len(line.strip().split()) > 2:
+                            raise Exception(
+                                "A space is the field separator, but one appears inside the image or label name {}."
+                                .format(line))
+                        img_file = osp.join(
+                            image_dir, osp.split(line.strip().split()[0])[-1])
+                        xml_file = osp.join(
+                            json_dir, osp.split(line.strip().split()[1])[-1])
+                        img_file = path_normalization(img_file)
+                        xml_file = path_normalization(xml_file)
+                        if not is_pic(img_file):
+                            continue
+                        if not osp.isfile(xml_file):
+                            continue
+                        if not osp.exists(img_file):
+                            raise IOError('The image file {} does not exist!'.
+                                          format(img_file))
+                        file_list.append(osp.split(img_file)[-1])
+                self.parse_json(new_image_dir, json_dir, file_list)
+                coco_data = {}
+                coco_data["images"] = self.images_list
+                coco_data["categories"] = self.categories_list
+                coco_data["annotations"] = self.annotations_list
+                json_path = osp.join(dataset_save_dir, "{}.json".format(part))
+                json.dump(
+                    coco_data, open(json_path, "w"), indent=4, cls=MyEncoder)
+                logging.info("xml files in {} are converted to the MSCOCO format stored in {}".format(\
+                    osp.join(xml_dir_dir, '{}_list.txt'.format(part)), osp.join(dataset_save_dir, "{}.json".format(part))))
+                self.images_list = []
+                self.annotations_list = []
+        self.parse_json(new_image_dir, json_dir)
+        coco_data = {}
+        coco_data["images"] = self.images_list
+        coco_data["categories"] = self.categories_list
+        coco_data["annotations"] = self.annotations_list
+        json_path = osp.join(dataset_save_dir, "annotations.json")
+        json.dump(coco_data, open(json_path, "w"), indent=4, cls=MyEncoder)
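A minimal sketch of calling the new converter directly (hypothetical paths; per the asserts in convert, the save directory must already exist):

from paddlex.tools.x2coco import VOC2COCO

# If labels.txt and {train,val,test}_list.txt are found at the locations probed
# above, per-split {train,val,test}.json files are emitted as well; a full
# annotations.json is always written at the end.
VOC2COCO().convert(
    image_dir='dataset/JPEGImages',   # placeholder image directory
    json_dir='dataset/Annotations',   # placeholder VOC XML directory
    dataset_save_dir='dataset_coco')  # placeholder, must exist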

+ 87 - 0
tutorials/train/object_detection/guang_2.py

@@ -0,0 +1,87 @@
+# Environment variable configuration, used to control whether the GPU is used
+# Documentation: https://paddlex.readthedocs.io/zh_CN/develop/appendix/parameters.html#gpu
+import os
+os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'
+import json
+import numpy as np  # needed by the MyEncoder class below
+
+from paddlex.det import transforms
+import paddlex as pdx
+
+# API reference: https://paddlex.readthedocs.io/zh_CN/develop/apis/transforms/det_transforms.html
+train_transforms = transforms.Compose([
+    transforms.RandomHorizontalFlip(), transforms.Normalize(),
+    transforms.ResizeByShort(
+        short_size=800, max_size=1333), transforms.Padding(coarsest_stride=32)
+])
+
+eval_transforms = transforms.Compose([
+    transforms.Normalize(),
+    transforms.ResizeByShort(
+        short_size=800, max_size=1333),
+    transforms.Padding(coarsest_stride=32),
+])
+
+# Define the datasets used for training and validation
+# API reference: https://paddlex.readthedocs.io/zh_CN/develop/apis/datasets.html#paddlex-datasets-vocdetection
+#train_dataset = pdx.datasets.VOCDetection(
+#    data_dir='dataset',
+#    file_list='dataset/train_list.txt',
+#    label_list='dataset/labels.txt',
+#    transforms=train_transforms,
+#    num_workers=2,
+#    shuffle=True)
+eval_dataset = pdx.datasets.VOCDetection(
+    data_dir='dataset',
+    file_list='dataset/val_list.txt',
+    label_list='dataset/labels.txt',
+    num_workers=2,
+    transforms=eval_transforms)
+
+# Initialize the model and start training
+# Training metrics can be viewed with VisualDL; see https://paddlex.readthedocs.io/zh_CN/develop/train/visualdl.html
+# num_classes must be the number of classes including background, i.e. number of target classes + 1
+#num_classes = len(train_dataset.labels) + 1
+#
+## API reference: https://paddlex.readthedocs.io/zh_CN/develop/apis/models/detection.html#paddlex-det-fasterrcnn
+#model = pdx.det.FasterRCNN(num_classes=num_classes, backbone='ResNet50_vd')
+#
+## API reference: https://paddlex.readthedocs.io/zh_CN/develop/apis/models/detection.html#id1
+## Parameter descriptions and tuning guide: https://paddlex.readthedocs.io/zh_CN/develop/appendix/parameters.html
+#model.train(
+#    num_epochs=36,
+#    train_dataset=train_dataset,
+#    train_batch_size=8,
+#    eval_dataset=eval_dataset,
+#    learning_rate=0.01,
+#    lr_decay_epochs=[24, 33],
+#    warmup_steps=1000,
+#    pretrain_weights='ResNet50_vd_ssld_pretrained',
+#    save_dir='output/guan_2',
+#    use_vdl=False)
+
+
+#eval_dataset = pdx.datasets.CocoDetection(
+#    data_dir='dataset_coco/JPEGImages',
+#    ann_file='dataset_coco/val.json',
+#    num_workers=2,
+#    transforms=eval_transforms)
+#model = pdx.load_model('output/guan_4/best_model/')
+#eval_details = model.evaluate(eval_dataset, batch_size=8, return_details=True)
+class MyEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, np.integer):
+            return int(obj)
+        elif isinstance(obj, np.floating):
+            return float(obj)
+        elif isinstance(obj, np.ndarray):
+            return obj.tolist()
+        else:
+            return super(MyEncoder, self).default(obj)
+
+
+with open('output/guan_4/best_model/eval_details.json', 'r') as f:
+    eval_details = json.load(f)
+json_path = 'output/guan_4/best_model/gt.json'
+json.dump(eval_details['gt'], open(json_path, "w"), indent=4, cls=MyEncoder)
+json_path = 'output/guan_4/best_model/bbox.json'
+json.dump(eval_details['bbox'], open(json_path, "w"), indent=4, cls=MyEncoder)

+ 63 - 0
tutorials/train/object_detection/guang_6.py

@@ -0,0 +1,63 @@
+# Environment variable configuration, used to control whether the GPU is used
+# Documentation: https://paddlex.readthedocs.io/zh_CN/develop/appendix/parameters.html#gpu
+import os
+os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'
+
+from paddlex.det import transforms
+import paddlex as pdx
+
+# API reference: https://paddlex.readthedocs.io/zh_CN/develop/apis/transforms/det_transforms.html
+train_transforms = transforms.Compose([
+    transforms.RandomHorizontalFlip(), transforms.Normalize(),
+    transforms.ResizeByShort(
+        short_size=800, max_size=1333), transforms.Padding(coarsest_stride=32)
+])
+
+eval_transforms = transforms.Compose([
+    transforms.Normalize(),
+    transforms.ResizeByShort(
+        short_size=800, max_size=1333),
+    transforms.Padding(coarsest_stride=32),
+])
+
+# Define the datasets used for training and validation
+# API reference: https://paddlex.readthedocs.io/zh_CN/develop/apis/datasets.html#paddlex-datasets-vocdetection
+train_dataset = pdx.datasets.VOCDetection(
+    data_dir='dataset',
+    file_list='dataset/train_list.txt',
+    label_list='dataset/labels.txt',
+    transforms=train_transforms,
+    num_workers=8,
+    shuffle=True)
+eval_dataset = pdx.datasets.VOCDetection(
+    data_dir='dataset',
+    file_list='dataset/val_list.txt',
+    label_list='dataset/labels.txt',
+    num_workers=8,
+    transforms=eval_transforms)
+
+# Initialize the model and start training
+# Training metrics can be viewed with VisualDL; see https://paddlex.readthedocs.io/zh_CN/develop/train/visualdl.html
+# num_classes must be the number of classes including background, i.e. number of target classes + 1
+num_classes = len(train_dataset.labels) + 1
+
+# API reference: https://paddlex.readthedocs.io/zh_CN/develop/apis/models/detection.html#paddlex-det-fasterrcnn
+model = pdx.det.FasterRCNN(
+    num_classes=num_classes,
+    backbone='ResNet50_vd',
+    with_dcn=True,
+    rpn_cls_loss='SigmoidFocalLoss')
+
+# API reference: https://paddlex.readthedocs.io/zh_CN/develop/apis/models/detection.html#id1
+# Parameter descriptions and tuning guide: https://paddlex.readthedocs.io/zh_CN/develop/appendix/parameters.html
+model.train(
+    num_epochs=60,
+    train_dataset=train_dataset,
+    train_batch_size=8,
+    eval_dataset=eval_dataset,
+    learning_rate=0.01,
+    lr_decay_epochs=[48, 56],
+    warmup_steps=1000,
+    pretrain_weights='ResNet50_vd_ssld_pretrained',
+    save_dir='output/guan_6',
+    use_vdl=False)

+ 95 - 0
tutorials/train/object_detection/mv_train_img.py

@@ -0,0 +1,95 @@
+import os
+os.environ['CUDA_VISIBLE_DEVICES'] = '4'
+import os.path as osp
+import re
+import cv2
+import xml.etree.ElementTree as ET
+import paddlex as pdx
+
+file_list = 'dataset/train_list.txt'
+save_dir = './visualize/train'
+data_dir = 'dataset/'
+if not os.path.exists(save_dir):
+    os.makedirs(save_dir)
+
+with open(file_list, 'r') as fr:
+    while True:
+        line = fr.readline()
+        if not line:
+            break
+        img_file, xml_file = [osp.join(data_dir, x) \
+                for x in line.strip().split()[:2]]
+        if 'budaodian' not in img_file and 'cahua' not in img_file and 'loudi' not in img_file and 'zangdian' not in img_file:
+            continue
+        tree = ET.parse(xml_file)
+        pattern = re.compile('<object>', re.IGNORECASE)
+        obj_match = pattern.findall(str(ET.tostringlist(tree.getroot())))
+        if len(obj_match) == 0:
+            continue
+        obj_tag = obj_match[0][1:-1]
+        objs = tree.findall(obj_tag)
+        pattern = re.compile('<size>', re.IGNORECASE)
+        size_tag = pattern.findall(str(ET.tostringlist(tree.getroot())))[0][1:
+                                                                            -1]
+        size_element = tree.find(size_tag)
+        pattern = re.compile('<width>', re.IGNORECASE)
+        width_tag = pattern.findall(str(ET.tostringlist(size_element)))[0][1:
+                                                                           -1]
+        im_w = float(size_element.find(width_tag).text)
+        pattern = re.compile('<height>', re.IGNORECASE)
+        height_tag = pattern.findall(str(ET.tostringlist(size_element)))[0][1:
+                                                                            -1]
+        im_h = float(size_element.find(height_tag).text)
+        gt_bbox = []
+        gt_class = []
+        for i, obj in enumerate(objs):
+            pattern = re.compile('<name>', re.IGNORECASE)
+            name_tag = pattern.findall(str(ET.tostringlist(obj)))[0][1:-1]
+            cname = obj.find(name_tag).text.strip()
+            gt_class.append(cname)
+            pattern = re.compile('<difficult>', re.IGNORECASE)
+            diff_tag = pattern.findall(str(ET.tostringlist(obj)))[0][1:-1]
+            try:
+                _difficult = int(obj.find(diff_tag).text)
+            except Exception:
+                _difficult = 0
+            pattern = re.compile('<bndbox>', re.IGNORECASE)
+            box_tag = pattern.findall(str(ET.tostringlist(obj)))[0][1:-1]
+            box_element = obj.find(box_tag)
+            pattern = re.compile('<xmin>', re.IGNORECASE)
+            xmin_tag = pattern.findall(str(ET.tostringlist(box_element)))[0][
+                1:-1]
+            x1 = float(box_element.find(xmin_tag).text)
+            pattern = re.compile('<ymin>', re.IGNORECASE)
+            ymin_tag = pattern.findall(str(ET.tostringlist(box_element)))[0][
+                1:-1]
+            y1 = float(box_element.find(ymin_tag).text)
+            pattern = re.compile('<xmax>', re.IGNORECASE)
+            xmax_tag = pattern.findall(str(ET.tostringlist(box_element)))[0][
+                1:-1]
+            x2 = float(box_element.find(xmax_tag).text)
+            pattern = re.compile('<ymax>', re.IGNORECASE)
+            ymax_tag = pattern.findall(str(ET.tostringlist(box_element)))[0][
+                1:-1]
+            y2 = float(box_element.find(ymax_tag).text)
+            x1 = max(0, x1)
+            y1 = max(0, y1)
+            if im_w > 0.5 and im_h > 0.5:
+                x2 = min(im_w - 1, x2)
+                y2 = min(im_h - 1, y2)
+            gt_bbox.append([x1, y1, x2, y2])
+        gts = []
+        for bbox, name in zip(gt_bbox, gt_class):
+            x1, y1, x2, y2 = bbox
+            w = x2 - x1 + 1
+            h = y2 - y1 + 1
+            gt = {
+                'category_id': 0,
+                'category': name,
+                'bbox': [x1, y1, w, h],
+                'score': 1
+            }
+            gts.append(gt)
+        gt_vis = pdx.det.visualize(img_file, gts, threshold=0.1, save_dir=None)
+        cv2.imwrite(
+            os.path.join(save_dir, os.path.split(img_file)[-1]), gt_vis)

+ 99 - 0
tutorials/train/object_detection/predict.py

@@ -0,0 +1,99 @@
+import os
+os.environ['CUDA_VISIBLE_DEVICES'] = '4'
+import os.path as osp
+import cv2
+import re
+import xml.etree.ElementTree as ET
+import paddlex as pdx
+
+model_dir = 'output/guan_2/best_model/'
+file_list = 'dataset/val_list.txt'
+data_dir = 'dataset/'
+save_dir = './visualize/guan_2'
+if not os.path.exists(save_dir):
+    os.makedirs(save_dir)
+
+model = pdx.load_model(model_dir)
+with open(file_list, 'r') as fr:
+    while True:
+        line = fr.readline()
+        if not line:
+            break
+        img_file, xml_file = [osp.join(data_dir, x) \
+                for x in line.strip().split()[:2]]
+        res = model.predict(img_file)
+        det_vis = pdx.det.visualize(
+            img_file, res, threshold=0.1, save_dir=None)
+
+        tree = ET.parse(xml_file)
+        pattern = re.compile('<object>', re.IGNORECASE)
+        obj_match = pattern.findall(str(ET.tostringlist(tree.getroot())))
+        if len(obj_match) == 0:
+            continue
+        obj_tag = obj_match[0][1:-1]
+        objs = tree.findall(obj_tag)
+        pattern = re.compile('<size>', re.IGNORECASE)
+        size_tag = pattern.findall(str(ET.tostringlist(tree.getroot())))[0][1:
+                                                                            -1]
+        size_element = tree.find(size_tag)
+        pattern = re.compile('<width>', re.IGNORECASE)
+        width_tag = pattern.findall(str(ET.tostringlist(size_element)))[0][1:
+                                                                           -1]
+        im_w = float(size_element.find(width_tag).text)
+        pattern = re.compile('<height>', re.IGNORECASE)
+        height_tag = pattern.findall(str(ET.tostringlist(size_element)))[0][1:
+                                                                            -1]
+        im_h = float(size_element.find(height_tag).text)
+        gt_bbox = []
+        gt_class = []
+        for i, obj in enumerate(objs):
+            pattern = re.compile('<name>', re.IGNORECASE)
+            name_tag = pattern.findall(str(ET.tostringlist(obj)))[0][1:-1]
+            cname = obj.find(name_tag).text.strip()
+            gt_class.append(cname)
+            pattern = re.compile('<difficult>', re.IGNORECASE)
+            diff_tag = pattern.findall(str(ET.tostringlist(obj)))[0][1:-1]
+            try:
+                _difficult = int(obj.find(diff_tag).text)
+            except Exception:
+                _difficult = 0
+            pattern = re.compile('<bndbox>', re.IGNORECASE)
+            box_tag = pattern.findall(str(ET.tostringlist(obj)))[0][1:-1]
+            box_element = obj.find(box_tag)
+            pattern = re.compile('<xmin>', re.IGNORECASE)
+            xmin_tag = pattern.findall(str(ET.tostringlist(box_element)))[0][
+                1:-1]
+            x1 = float(box_element.find(xmin_tag).text)
+            pattern = re.compile('<ymin>', re.IGNORECASE)
+            ymin_tag = pattern.findall(str(ET.tostringlist(box_element)))[0][
+                1:-1]
+            y1 = float(box_element.find(ymin_tag).text)
+            pattern = re.compile('<xmax>', re.IGNORECASE)
+            xmax_tag = pattern.findall(str(ET.tostringlist(box_element)))[0][
+                1:-1]
+            x2 = float(box_element.find(xmax_tag).text)
+            pattern = re.compile('<ymax>', re.IGNORECASE)
+            ymax_tag = pattern.findall(str(ET.tostringlist(box_element)))[0][
+                1:-1]
+            y2 = float(box_element.find(ymax_tag).text)
+            x1 = max(0, x1)
+            y1 = max(0, y1)
+            if im_w > 0.5 and im_h > 0.5:
+                x2 = min(im_w - 1, x2)
+                y2 = min(im_h - 1, y2)
+            gt_bbox.append([x1, y1, x2, y2])
+        gts = []
+        for bbox, name in zip(gt_bbox, gt_class):
+            x1, y1, x2, y2 = bbox
+            w = x2 - x1 + 1
+            h = y2 - y1 + 1
+            gt = {
+                'category_id': 0,
+                'category': name,
+                'bbox': [x1, y1, w, h],
+                'score': 1
+            }
+            gts.append(gt)
+        gt_vis = pdx.det.visualize(img_file, gts, threshold=0.1, save_dir=None)
+        vis = cv2.hconcat([det_vis, gt_vis])
+        cv2.imwrite(os.path.join(save_dir, os.path.split(img_file)[-1]), vis)