
Merge pull request #400 from FlyingQianMM/develop_new

support quality testing case
Jason, 5 years ago
parent commit c4655cc742
91 changed files with 4100 additions and 357 deletions
  1. deploy/cpp/demo/classifier.cpp (+6, -2)
  2. deploy/cpp/demo/detector.cpp (+6, -2)
  3. deploy/cpp/demo/segmenter.cpp (+6, -2)
  4. deploy/cpp/demo/video_classifier.cpp (+4, -2)
  5. deploy/cpp/demo/video_detector.cpp (+4, -2)
  6. deploy/cpp/demo/video_segmenter.cpp (+4, -2)
  7. deploy/cpp/include/paddlex/transforms.h (+6, -0)
  8. deploy/cpp/src/paddlex.cpp (+4, -1)
  9. deploy/cpp/src/transforms.cpp (+51, -13)
  10. deploy/openvino/demo/classifier.cpp (+6, -2)
  11. deploy/openvino/demo/detector.cpp (+6, -2)
  12. deploy/openvino/demo/segmenter.cpp (+6, -2)
  13. deploy/openvino/include/paddlex/transforms.h (+6, -0)
  14. deploy/openvino/src/transforms.cpp (+50, -14)
  15. docs/apis/analysis.md (+0, -48)
  16. docs/apis/datasets.md (+18, -0)
  17. docs/apis/images/detection_analysis.jpg (BIN)
  18. docs/apis/images/insect_bbox-allclass-allarea.png (BIN)
  19. docs/apis/index.rst (+1, -1)
  20. docs/apis/models/detection.md (+27, -9)
  21. docs/apis/models/instance_segmentation.md (+3, -2)
  22. docs/apis/tools.md (+77, -0)
  23. docs/apis/transforms/det_transforms.md (+14, -3)
  24. docs/apis/visualize.md (+67, -0)
  25. docs/examples/index.rst (+1, -0)
  26. docs/examples/industrial_quality_inspection/README.md (+99, -0)
  27. docs/examples/industrial_quality_inspection/accuracy_improvement.md (+97, -0)
  28. docs/examples/industrial_quality_inspection/dataset.md (+14, -0)
  29. docs/examples/industrial_quality_inspection/gpu_solution.md (+116, -0)
  30. docs/examples/industrial_quality_inspection/tp_fp_list.md (+102, -0)
  31. docs/index.rst (+1, -0)
  32. examples/industrial_quality_inspection/README.md (+99, -0)
  33. examples/industrial_quality_inspection/accuracy_improvement.md (+93, -0)
  34. examples/industrial_quality_inspection/cal_tp_fp.py (+214, -0)
  35. examples/industrial_quality_inspection/compare.py (+160, -0)
  36. examples/industrial_quality_inspection/dataset.md (+14, -0)
  37. examples/industrial_quality_inspection/error_analysis.py (+26, -0)
  38. examples/industrial_quality_inspection/gpu_solution.md (+114, -0)
  39. examples/industrial_quality_inspection/image/after_clahe.png (BIN)
  40. examples/industrial_quality_inspection/image/allclasses_analysis_example.png (BIN)
  41. examples/industrial_quality_inspection/image/before_clahe.png (BIN)
  42. examples/industrial_quality_inspection/image/budaodian_analysis_example.png (BIN)
  43. examples/industrial_quality_inspection/image/cahua_analysis_example.png (BIN)
  44. examples/industrial_quality_inspection/image/compare_budaodian-116.jpg (BIN)
  45. examples/industrial_quality_inspection/image/image-level_tp_fp.png (BIN)
  46. examples/industrial_quality_inspection/image/jiaoweiloudi_analysis_example.png (BIN)
  47. examples/industrial_quality_inspection/image/jupi_analysis_example.png (BIN)
  48. examples/industrial_quality_inspection/image/loudi_analysis_example.png (BIN)
  49. examples/industrial_quality_inspection/image/penliu_analysis_example.png (BIN)
  50. examples/industrial_quality_inspection/image/qikeng_analysis_example.png (BIN)
  51. examples/industrial_quality_inspection/image/qipao_analysis_example.png (BIN)
  52. examples/industrial_quality_inspection/image/visualize_budaodian-116.jpg (BIN)
  53. examples/industrial_quality_inspection/image/zangdian_analysis_example.png (BIN)
  54. examples/industrial_quality_inspection/image/zase_analysis_example.png (BIN)
  55. examples/industrial_quality_inspection/params_analysis.py (+61, -0)
  56. examples/industrial_quality_inspection/predict.py (+35, -0)
  57. examples/industrial_quality_inspection/tp_fp_list.md (+102, -0)
  58. examples/industrial_quality_inspection/train_pruned_yolov3.py (+58, -0)
  59. examples/industrial_quality_inspection/train_rcnn.py (+74, -0)
  60. examples/industrial_quality_inspection/train_yolov3.py (+59, -0)
  61. paddlex/command.py (+14, -1)
  62. paddlex/cv/datasets/voc.py (+15, -4)
  63. paddlex/cv/models/base.py (+21, -12)
  64. paddlex/cv/models/deeplabv3p.py (+17, -5)
  65. paddlex/cv/models/faster_rcnn.py (+139, -16)
  66. paddlex/cv/models/mask_rcnn.py (+28, -9)
  67. paddlex/cv/models/ppyolo.py (+30, -9)
  68. paddlex/cv/models/slim/prune_config.py (+9, -0)
  69. paddlex/cv/models/utils/detection_eval.py (+239, -0)
  70. paddlex/cv/models/yolo_v3.py (+4, -1)
  71. paddlex/cv/nets/detection/bbox_head.py (+109, -59)
  72. paddlex/cv/nets/detection/faster_rcnn.py (+63, -16)
  73. paddlex/cv/nets/detection/loss/diou_loss.py (+121, -0)
  74. paddlex/cv/nets/detection/loss/giou_loss.py (+145, -0)
  75. paddlex/cv/nets/detection/mask_rcnn.py (+8, -3)
  76. paddlex/cv/nets/detection/ops.py (+696, -0)
  77. paddlex/cv/nets/detection/rpn_head.py (+100, -38)
  78. paddlex/cv/nets/detection/yolo_v3.py (+8, -3)
  79. paddlex/cv/nets/resnet.py (+3, -6)
  80. paddlex/cv/transforms/__init__.py (+6, -1)
  81. paddlex/cv/transforms/ops.py (+4, -0)
  82. paddlex/cv/transforms/det_transforms.py (+76, -9)
  83. paddlex/cv/transforms/seg_transforms.py (+30, -17)
  84. paddlex/deploy.py (+18, -13)
  85. paddlex/det.py (+3, -0)
  86. paddlex/tools/__init__.py (+1, -0)
  87. paddlex/tools/base.py (+5, -2)
  88. paddlex/tools/convert.py (+2, -1)
  89. paddlex/tools/dataset_generate/__init__.py (+15, -0)
  90. paddlex/tools/dataset_generate/det.py (+208, -0)
  91. paddlex/tools/x2voc.py (+52, -23)

+ 6 - 2
deploy/cpp/demo/classifier.cpp

@@ -92,7 +92,9 @@ int main(int argc, char** argv) {
       for (int j = i; j < im_vec_size; ++j) {
         im_vec[j - i] = std::move(cv::imread(image_paths[j], 1));
       }
-      model.predict(im_vec, &results, thread_num);
+      if (!model.predict(im_vec, &results, thread_num)) {
+        return -1;
+      }
       for (int j = i; j < im_vec_size; ++j) {
         std::cout << "Path:" << image_paths[j]
                   << ", predict label: " << results[j - i].category
@@ -103,7 +105,9 @@ int main(int argc, char** argv) {
   } else {
     PaddleX::ClsResult result;
     cv::Mat im = cv::imread(FLAGS_image, 1);
-    model.predict(im, &result);
+    if (!model.predict(im, &result)) {
+      return -1;
+    }
     std::cout << "Predict label: " << result.category
               << ", label_id:" << result.category_id
               << ", score: " << result.score << std::endl;

+ 6 - 2
deploy/cpp/demo/detector.cpp

@@ -95,7 +95,9 @@ int main(int argc, char** argv) {
       for (int j = i; j < im_vec_size; ++j) {
         im_vec[j - i] = std::move(cv::imread(image_paths[j], 1));
       }
-      model.predict(im_vec, &results, thread_num);
+      if (!model.predict(im_vec, &results, thread_num)) {
+        return -1;
+      }
       // Output predicted bounding boxes
       for (int j = 0; j < im_vec_size - i; ++j) {
         for (int k = 0; k < results[j].boxes.size(); ++k) {
@@ -123,7 +125,9 @@ int main(int argc, char** argv) {
   } else {
     PaddleX::DetResult result;
     cv::Mat im = cv::imread(FLAGS_image, 1);
-    model.predict(im, &result);
+    if (!model.predict(im, &result)) {
+      return -1;
+    }
     // Output predicted bounding boxes
     for (int i = 0; i < result.boxes.size(); ++i) {
       std::cout << "image file: " << FLAGS_image << std::endl;

+ 6 - 2
deploy/cpp/demo/segmenter.cpp

@@ -91,7 +91,9 @@ int main(int argc, char** argv) {
       for (int j = i; j < im_vec_size; ++j) {
         im_vec[j - i] = std::move(cv::imread(image_paths[j], 1));
       }
-      model.predict(im_vec, &results, thread_num);
+      if (!model.predict(im_vec, &results, thread_num)) {
+        return -1;
+      }
       // Visualize results
       for (int j = 0; j < im_vec_size - i; ++j) {
         cv::Mat vis_img =
@@ -105,7 +107,9 @@ int main(int argc, char** argv) {
   } else {
     PaddleX::SegResult result;
     cv::Mat im = cv::imread(FLAGS_image, 1);
-    model.predict(im, &result);
+    if (!model.predict(im, &result)) {
+      return -1;
+    }
     // Visualize results
     cv::Mat vis_img = PaddleX::Visualize(im, result, model.labels);
     std::string save_path =

+ 4 - 2
deploy/cpp/demo/video_classifier.cpp

@@ -103,7 +103,7 @@ int main(int argc, char** argv) {
     if (FLAGS_use_camera) {
       video_fourcc = 828601953;
     } else {
-      video_fourcc = static_cast<int>(capture.get(CV_CAP_PROP_FOURCC));
+      video_fourcc = CV_FOURCC('M', 'J', 'P', 'G');
     }
 
     if (FLAGS_use_camera) {
@@ -140,7 +140,9 @@ int main(int argc, char** argv) {
       break;
     }
     // Begin to predict
-    model.predict(frame, &result);
+    if (!model.predict(frame, &result)) {
+      return -1;
+    }
     // Visualize results
     cv::Mat vis_img = frame.clone();
     auto colormap = PaddleX::GenerateColorMap(model.labels.size());

+ 4 - 2
deploy/cpp/demo/video_detector.cpp

@@ -104,7 +104,7 @@ int main(int argc, char** argv) {
     if (FLAGS_use_camera) {
       video_fourcc = 828601953;
     } else {
-      video_fourcc = static_cast<int>(capture.get(CV_CAP_PROP_FOURCC));
+      video_fourcc = CV_FOURCC('M', 'J', 'P', 'G');
     }
 
     if (FLAGS_use_camera) {
@@ -141,7 +141,9 @@ int main(int argc, char** argv) {
       break;
     }
     // Begin to predict
-    model.predict(frame, &result);
+    if (!model.predict(frame, &result)) {
+      return -1;
+    }
     // Visualize results
     cv::Mat vis_img =
         PaddleX::Visualize(frame, result, model.labels, FLAGS_threshold);

+ 4 - 2
deploy/cpp/demo/video_segmenter.cpp

@@ -103,7 +103,7 @@ int main(int argc, char** argv) {
     if (FLAGS_use_camera) {
       video_fourcc = 828601953;
     } else {
-      video_fourcc = static_cast<int>(capture.get(CV_CAP_PROP_FOURCC));
+      video_fourcc = CV_FOURCC('M', 'J', 'P', 'G');
     }
 
     if (FLAGS_use_camera) {
@@ -140,7 +140,9 @@ int main(int argc, char** argv) {
       break;
     }
     // Begin to predict
-    model.predict(frame, &result);
+    if (!model.predict(frame, &result)) {
+      return -1;
+    }
     // Visualize results
     cv::Mat vis_img = PaddleX::Visualize(frame, result, model.labels);
     if (FLAGS_show_result || FLAGS_use_camera) {

+ 6 - 0
deploy/cpp/include/paddlex/transforms.h

@@ -234,6 +234,12 @@ class Padding : public Transform {
     }
   }
   virtual bool Run(cv::Mat* im, ImageBlob* data);
+  virtual void GeneralPadding(cv::Mat* im,
+                              const std::vector<float> &padding_val,
+                              int padding_w, int padding_h);
+  virtual void MultichannelPadding(cv::Mat* im,
+                                   const std::vector<float> &padding_val,
+                                   int padding_w, int padding_h);
 
  private:
   int coarsest_stride_ = -1;

+ 4 - 1
deploy/cpp/src/paddlex.cpp

@@ -171,7 +171,10 @@ bool Model::predict(const cv::Mat& im, ClsResult* result) {
   inputs_.clear();
   if (type == "detector") {
     std::cerr << "Loading model is a 'detector', DetResult should be passed to "
-                 "function predict()!"
+                 "function predict()!" << std::endl;
+    return false;
+  } else if (type == "segmenter") {
+    std::cerr << "Loading model is a 'segmenter', SegResult should be passed "
                  "to function predict()!" << std::endl;
     return false;
   }

+ 51 - 13
deploy/cpp/src/transforms.cpp

@@ -36,9 +36,11 @@ bool Normalize::Run(cv::Mat* im, ImageBlob* data) {
 
   std::vector<cv::Mat> split_im;
   cv::split(*im, split_im);
+  #pragma omp parallel for num_threads(im->channels())
   for (int c = 0; c < im->channels(); c++) {
+    float range_val = max_val_[c] - min_val_[c];
     cv::subtract(split_im[c], cv::Scalar(min_val_[c]), split_im[c]);
-    cv::divide(split_im[c], cv::Scalar(range_val[c]), split_im[c]);
+    cv::divide(split_im[c], cv::Scalar(range_val), split_im[c]);
     cv::subtract(split_im[c], cv::Scalar(mean_[c]), split_im[c]);
     cv::divide(split_im[c], cv::Scalar(std_[c]), split_im[c]);
   }
@@ -92,6 +94,50 @@ bool CenterCrop::Run(cv::Mat* im, ImageBlob* data) {
   return true;
 }
 
+void Padding::GeneralPadding(cv::Mat* im,
+                             const std::vector<float> &padding_val,
+                             int padding_w, int padding_h) {
+  cv::Scalar value;
+  if (im->channels() == 1) {
+    value = cv::Scalar(padding_val[0]);
+  } else if (im->channels() == 2) {
+    value = cv::Scalar(padding_val[0], padding_val[1]);
+  } else if (im->channels() == 3) {
+    value = cv::Scalar(padding_val[0], padding_val[1], padding_val[2]);
+  } else if (im->channels() == 4) {
+    value = cv::Scalar(padding_val[0], padding_val[1], padding_val[2],
+                                  padding_val[3]);
+  }
+  cv::copyMakeBorder(
+  *im,
+  *im,
+  0,
+  padding_h,
+  0,
+  padding_w,
+  cv::BORDER_CONSTANT,
+  value);
+}
+
+void Padding::MultichannelPadding(cv::Mat* im,
+                                  const std::vector<float> &padding_val,
+                                  int padding_w, int padding_h) {
+  std::vector<cv::Mat> padded_im_per_channel(im->channels());
+  #pragma omp parallel for num_threads(im->channels())
+  for (size_t i = 0; i < im->channels(); i++) {
+    const cv::Mat per_channel = cv::Mat(im->rows + padding_h,
+                                        im->cols + padding_w,
+                                        CV_32FC1,
+                                        cv::Scalar(padding_val[i]));
+    padded_im_per_channel[i] = per_channel;
+  }
+  cv::Mat padded_im;
+  cv::merge(padded_im_per_channel, padded_im);
+  cv::Rect im_roi = cv::Rect(0, 0, im->cols, im->rows);
+  im->copyTo(padded_im(im_roi));
+  *im = padded_im;
+}
+
 bool Padding::Run(cv::Mat* im, ImageBlob* data) {
   data->im_size_before_resize_.push_back({im->rows, im->cols});
   data->reshape_order_.push_back("padding");
@@ -116,19 +162,11 @@ bool Padding::Run(cv::Mat* im, ImageBlob* data) {
               << ", but they should be greater than 0." << std::endl;
     return false;
   }
-  std::vector<cv::Mat> padded_im_per_channel;
-  for (size_t i = 0; i < im->channels(); i++) {
-    const cv::Mat per_channel = cv::Mat(im->rows + padding_h,
-                                        im->cols + padding_w,
-                                        CV_32FC1,
-                                        cv::Scalar(im_value_[i]));
-    padded_im_per_channel.push_back(per_channel);
+  if (im->channels() < 5) {
+    Padding::GeneralPadding(im, im_value_, padding_w, padding_h);
+  } else {
+    Padding::MultichannelPadding(im, im_value_, padding_w, padding_h);
   }
-  cv::Mat padded_im;
-  cv::merge(padded_im_per_channel, padded_im);
-  cv::Rect im_roi = cv::Rect(0, 0, im->cols, im->rows);
-  im->copyTo(padded_im(im_roi));
-  *im = padded_im;
   data->new_im_size_[0] = im->rows;
   data->new_im_size_[1] = im->cols;
 

+ 6 - 2
deploy/openvino/demo/classifier.cpp

@@ -59,7 +59,9 @@ int main(int argc, char** argv) {
     while (getline(inf, image_path)) {
       PaddleX::ClsResult result;
       cv::Mat im = cv::imread(image_path, 1);
-      model.predict(im, &result);
+      if (!model.predict(im, &result)) {
+        return -1;
+      }
       std::cout << "Predict label: " << result.category
                 << ", label_id:" << result.category_id
                 << ", score: " << result.score << std::endl;
@@ -67,7 +69,9 @@ int main(int argc, char** argv) {
   } else {
     PaddleX::ClsResult result;
     cv::Mat im = cv::imread(FLAGS_image, 1);
-    model.predict(im, &result);
+    if (!model.predict(im, &result)) {
+      return -1;
+    }
     std::cout << "Predict label: " << result.category
               << ", label_id:" << result.category_id
               << ", score: " << result.score << std::endl;

+ 6 - 2
deploy/openvino/demo/detector.cpp

@@ -70,7 +70,9 @@ int main(int argc, char** argv) {
     while (getline(inf, image_path)) {
       PaddleX::DetResult result;
       cv::Mat im = cv::imread(image_path, 1);
-      model.predict(im, &result);
+      if (!model.predict(im, &result)) {
+        return -1;
+      }
       if (FLAGS_save_dir != "") {
         cv::Mat vis_img = PaddleX::Visualize(
           im, result, model.labels, colormap, FLAGS_threshold);
@@ -83,7 +85,9 @@ int main(int argc, char** argv) {
   } else {
   PaddleX::DetResult result;
   cv::Mat im = cv::imread(FLAGS_image, 1);
-  model.predict(im, &result);
+  if (!model.predict(im, &result)) {
+    return -1;
+  }
   for (int i = 0; i < result.boxes.size(); ++i) {
       std::cout << "image file: " << FLAGS_image << std::endl;
       std::cout << ", predict label: " << result.boxes[i].category

+ 6 - 2
deploy/openvino/demo/segmenter.cpp

@@ -64,7 +64,9 @@ int main(int argc, char** argv) {
     while (getline(inf, image_path)) {
       PaddleX::SegResult result;
       cv::Mat im = cv::imread(image_path, 1);
-      model.predict(im, &result);
+      if (!model.predict(im, &result)) {
+        return -1;
+      }
       if (FLAGS_save_dir != "") {
       cv::Mat vis_img = PaddleX::Visualize(im, result, model.labels, colormap);
         std::string save_path =
@@ -76,7 +78,9 @@ int main(int argc, char** argv) {
   } else {
     PaddleX::SegResult result;
     cv::Mat im = cv::imread(FLAGS_image, 1);
-    model.predict(im, &result);
+    if (!model.predict(im, &result)) {
+      return -1;
+    }
     if (FLAGS_save_dir != "") {
       cv::Mat vis_img = PaddleX::Visualize(im, result, model.labels, colormap);
       std::string save_path =

+ 6 - 0
deploy/openvino/include/paddlex/transforms.h

@@ -213,6 +213,12 @@ class Padding : public Transform {
   }
 
   virtual bool Run(cv::Mat* im, ImageBlob* data);
+  virtual void GeneralPadding(cv::Mat* im,
+                              const std::vector<float> &padding_val,
+                              int padding_w, int padding_h);
+  virtual void MultichannelPadding(cv::Mat* im,
+                                   const std::vector<float> &padding_val,
+                                   int padding_w, int padding_h);
 
  private:
   int coarsest_stride_ = -1;

+ 50 - 14
deploy/openvino/src/transforms.cpp

@@ -38,9 +38,11 @@ bool Normalize::Run(cv::Mat* im, ImageBlob* data) {
 
   std::vector<cv::Mat> split_im;
   cv::split(*im, split_im);
+  #pragma omp parallel for num_threads(im->channels())
   for (int c = 0; c < im->channels(); c++) {
+    float range_val = max_val_[c] - min_val_[c];
     cv::subtract(split_im[c], cv::Scalar(min_val_[c]), split_im[c]);
-    cv::divide(split_im[c], cv::Scalar(range_val[c]), split_im[c]);
+    cv::divide(split_im[c], cv::Scalar(range_val), split_im[c]);
     cv::subtract(split_im[c], cv::Scalar(mean_[c]), split_im[c]);
     cv::divide(split_im[c], cv::Scalar(std_[c]), split_im[c]);
   }
@@ -95,6 +97,49 @@ bool CenterCrop::Run(cv::Mat* im, ImageBlob* data) {
   return true;
 }
 
+void Padding::GeneralPadding(cv::Mat* im,
+                             const std::vector<float> &padding_val,
+                             int padding_w, int padding_h) {
+  cv::Scalar value;
+  if (im->channels() == 1) {
+    value = cv::Scalar(padding_val[0]);
+  } else if (im->channels() == 2) {
+    value = cv::Scalar(padding_val[0], padding_val[1]);
+  } else if (im->channels() == 3) {
+    value = cv::Scalar(padding_val[0], padding_val[1], padding_val[2]);
+  } else if (im->channels() == 4) {
+    value = cv::Scalar(padding_val[0], padding_val[1], padding_val[2],
+                                  padding_val[3]);
+  }
+  cv::copyMakeBorder(
+  *im,
+  *im,
+  0,
+  padding_h,
+  0,
+  padding_w,
+  cv::BORDER_CONSTANT,
+  value);
+}
+
+void Padding::MultichannelPadding(cv::Mat* im,
+                                  const std::vector<float> &padding_val,
+                                  int padding_w, int padding_h) {
+  std::vector<cv::Mat> padded_im_per_channel(im->channels());
+  #pragma omp parallel for num_threads(im->channels())
+  for (size_t i = 0; i < im->channels(); i++) {
+    const cv::Mat per_channel = cv::Mat(im->rows + padding_h,
+                                        im->cols + padding_w,
+                                        CV_32FC1,
+                                        cv::Scalar(padding_val[i]));
+    padded_im_per_channel[i] = per_channel;
+  }
+  cv::Mat padded_im;
+  cv::merge(padded_im_per_channel, padded_im);
+  cv::Rect im_roi = cv::Rect(0, 0, im->cols, im->rows);
+  im->copyTo(padded_im(im_roi));
+  *im = padded_im;
+}
 
 bool Padding::Run(cv::Mat* im, ImageBlob* data) {
   data->im_size_before_resize_.push_back({im->rows, im->cols});
@@ -120,19 +165,11 @@ bool Padding::Run(cv::Mat* im, ImageBlob* data) {
               << ", but they should be greater than 0." << std::endl;
     return false;
   }
-  std::vector<cv::Mat> padded_im_per_channel;
-  for (size_t i = 0; i < im->channels(); i++) {
-    const cv::Mat per_channel = cv::Mat(im->rows + padding_h,
-                                        im->cols + padding_w,
-                                        CV_32FC1,
-                                        cv::Scalar(im_value_[i]));
-    padded_im_per_channel.push_back(per_channel);
+  if (im->channels() < 5) {
+    Padding::GeneralPadding(im, im_value_, padding_w, padding_h);
+  } else {
+    Padding::MultichannelPadding(im, im_value_, padding_w, padding_h);
   }
-  cv::Mat padded_im;
-  cv::merge(padded_im_per_channel, padded_im);
-  cv::Rect im_roi = cv::Rect(0, 0, im->cols, im->rows);
-  im->copyTo(padded_im(im_roi));
-  *im = padded_im;
   data->new_im_size_[0] = im->rows;
   data->new_im_size_[1] = im->cols;
 
@@ -219,7 +256,6 @@ void Transforms::Init(
     if (name == "ArrangeYOLOv3") {
       continue;
     }
-    std::cout << "trans name: " << name << std::endl;
     std::shared_ptr<Transform> transform = CreateTransform(name);
     transform->Init(item.begin()->second);
     transforms_.push_back(transform);
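
The Normalize and Padding rewrites above (in both the `deploy/cpp` and `deploy/openvino` copies of `transforms.cpp`) boil down to simple per-channel arithmetic. A minimal NumPy sketch of the same math, for reference only; the `min_val_`/`max_val_`/`mean_`/`std_` members map to the function arguments here:

```python
import numpy as np

def normalize(im, min_val, max_val, mean, std):
    """What the fixed Normalize::Run computes: per-channel min-max scaling
    with range_val = max - min, followed by mean/std standardization."""
    im = im.astype(np.float32)
    for c in range(im.shape[2]):
        range_val = max_val[c] - min_val[c]
        im[..., c] = (im[..., c] - min_val[c]) / range_val
        im[..., c] = (im[..., c] - mean[c]) / std[c]
    return im

def pad_bottom_right(im, im_value, padding_w, padding_h):
    """What Padding::Run does for any channel count: pad the bottom/right
    border with per-channel constants and keep the original pixels at (0, 0)."""
    h, w, c = im.shape
    padded = np.empty((h + padding_h, w + padding_w, c), dtype=im.dtype)
    for i in range(c):
        padded[..., i] = im_value[i]
    padded[:h, :w, :] = im
    return padded
```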

+ 0 - 48
docs/apis/analysis.md

@@ -1,48 +0,0 @@
-# 数据集分析
-
-## paddlex.datasets.analysis.Seg
-```python
-paddlex.datasets.analysis.Seg(data_dir, file_list, label_list)
-```
-
-构建统计分析语义分类数据集的分析器。
-
-> **参数**
-> > * **data_dir** (str): 数据集所在的目录路径。  
-> > * **file_list** (str): 描述数据集图片文件和类别id的文件路径(文本内每行路径为相对`data_dir`的相对路径)。  
-> > * **label_list** (str): 描述数据集包含的类别信息文件路径。  
-
-### analysis
-```python
-analysis(self)
-```
-
-Seg分析器的分析接口,完成以下信息的分析统计:
-
-> * 图像数量
-> * 图像最大和最小的尺寸
-> * 图像通道数量
-> * 图像各通道的最小值和最大值
-> * 图像各通道的像素值分布
-> * 图像各通道归一化后的均值和方差
-> * 标注图中各类别的数量及比重
-
-[代码示例](https://github.com/PaddlePaddle/PaddleX/blob/develop/examples/multi-channel_remote_sensing/tools/analysis.py)
-
-[统计信息示例](../examples/multi-channel_remote_sensing/analysis.html#id2)
-
-### cal_clipped_mean_std
-```python
-cal_clipped_mean_std(self, clip_min_value, clip_max_value, data_info_file)
-```
-
-Seg分析器用于计算图像截断后的均值和方差的接口。
-
-> **参数**
-> > * **clip_min_value** (list):  截断的下限,小于min_val的数值均设为min_val。
-> > * **clip_max_value** (list): 截断的上限,大于max_val的数值均设为max_val。
-> > * **data_info_file** (str): 在analysis()接口中保存的分析结果文件(名为`train_information.pkl`)的路径。
-
-[代码示例](https://github.com/PaddlePaddle/PaddleX/blob/develop/examples/multi-channel_remote_sensing/tools/cal_clipped_mean_std.py)
-
-[计算结果示例](../../examples/multi-channel_remote_sensing/analysis.html#id4)

+ 18 - 0
docs/apis/datasets.md

@@ -41,6 +41,15 @@ paddlex.datasets.VOCDetection(data_dir, file_list, label_list, transforms=None,
 > > * **parallel_method** (str): 数据集中样本在预处理过程中并行处理的方式,支持'thread'线程和'process'进程两种方式。默认为'process'(Windows和Mac下会强制使用thread,该参数无效)。  
 > > * **shuffle** (bool): 是否需要对数据集中样本打乱顺序。默认为False。  
 
+### add_negative_samples(self, image_dir)
+
+> **将背景图片加入训练**
+
+> > * **image_dir** (str): 背景图片所在的文件夹目录。
+
+> 示例:[代码文件](https://github.com/PaddlePaddle/PaddleX/tree/develop/examples/industrial_quality_inspection/train_rcnn.py#L45)
+
+
 ## paddlex.datasets.CocoDetection
 > **用于实例分割/目标检测模型**  
 ```
@@ -61,6 +70,15 @@ paddlex.datasets.CocoDetection(data_dir, ann_file, transforms=None, num_workers=
 > > * **parallel_method** (str): 数据集中样本在预处理过程中并行处理的方式,支持'thread'线程和'process'进程两种方式。默认为'process'(Windows和Mac下会强制使用thread,该参数无效)。  
 > > * **shuffle** (bool): 是否需要对数据集中样本打乱顺序。默认为False。  
 
+### add_negative_samples(self, image_dir)
+
+> **将背景图片加入训练**
+
+> > * **image_dir** (str): 背景图片所在的文件夹目录。
+
+> 示例:[代码文件](https://github.com/PaddlePaddle/PaddleX/tree/develop/examples/industrial_quality_inspection/train_rcnn.py#L45)
+
+
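A minimal usage sketch for the `add_negative_samples` interface documented above for both `VOCDetection` and `CocoDetection`. The directory names and the transform pipeline are placeholders; only the calls themselves follow the documented API:

```python
import paddlex as pdx
from paddlex.det import transforms

train_transforms = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.Normalize(),
    transforms.ResizeByShort(short_size=800, max_size=1333),
    transforms.Padding(coarsest_stride=32),
])

train_dataset = pdx.datasets.VOCDetection(
    data_dir='aluminum_inspection',
    file_list='aluminum_inspection/train_list.txt',
    label_list='aluminum_inspection/labels.txt',
    transforms=train_transforms,
    shuffle=True)

# Mix pure-background images (no annotation files needed) into training
# to suppress false detections on defect-free parts.
train_dataset.add_negative_samples(image_dir='aluminum_inspection/background_images')
```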
 ## paddlex.datasets.SegDataset
 > **用于语义分割模型**  
 ```

BIN
docs/apis/images/detection_analysis.jpg


BIN
docs/apis/images/insect_bbox-allclass-allarea.png


+ 1 - 1
docs/apis/index.rst

@@ -6,7 +6,7 @@ API接口说明
 
    transforms/index.rst
    datasets.md
-   analysis.md
+   tools.md
    models/index.rst
    slim.md
    visualize.md

+ 27 - 9
docs/apis/models/detection.md

@@ -3,7 +3,7 @@
 ## paddlex.det.PPYOLO
 
 ```python
-paddlex.det.PPYOLO(num_classes=80, backbone='ResNet50_vd_ssld', with_dcn_v2=True, anchors=None, anchor_masks=None, use_coord_conv=True, use_iou_aware=True, use_spp=True, use_drop_block=True, scale_x_y=1.05, ignore_threshold=0.7, label_smooth=False, use_iou_loss=True, use_matrix_nms=True, nms_score_threshold=0.01, nms_topk=1000, nms_keep_topk=100, nms_iou_threshold=0.45, train_random_shapes=[320, 352, 384, 416, 448, 480, 512, 544, 576, 608])
+paddlex.det.PPYOLO(num_classes=80, backbone='ResNet50_vd_ssld', with_dcn_v2=True, anchors=None, anchor_masks=None, use_coord_conv=True, use_iou_aware=True, use_spp=True, use_drop_block=True, scale_x_y=1.05, ignore_threshold=0.7, label_smooth=False, use_iou_loss=True, use_matrix_nms=True, nms_score_threshold=0.01, nms_topk=1000, nms_keep_topk=100, nms_iou_threshold=0.45, train_random_shapes=[320, 352, 384, 416, 448, 480, 512, 544, 576, 608], input_channel=3)
 ```
 
 > 构建PPYOLO检测器。**注意在PPYOLO,num_classes不需要包含背景类,如目标包括human、dog两种,则num_classes设为2即可,这里与FasterRCNN/MaskRCNN有差别**
@@ -32,6 +32,7 @@ paddlex.det.PPYOLO(num_classes=80, backbone='ResNet50_vd_ssld', with_dcn_v2=True
 > > - **nms_iou_threshold** (float): 进行NMS时,用于剔除检测框IOU的阈值。默认为0.45。
 > > - **label_smooth** (bool): 是否使用label smooth。默认值为False。
 > > - **train_random_shapes** (list|tuple): 训练时从列表中随机选择图像大小。默认值为[320, 352, 384, 416, 448, 480, 512, 544, 576, 608]。
+> > - **input_channel** (int): 输入图像的通道数量。默认为3。
 
 ### train
 
@@ -85,7 +86,7 @@ evaluate(self, eval_dataset, batch_size=1, epoch_id=None, metric=None, return_de
 > >
 >  **返回值**
 >
-> > - **tuple** (metrics, eval_details) | **dict** (metrics): 当`return_details`为True时,返回(metrics, eval_details),当`return_details`为False时,返回metrics。metrics为dict,包含关键字:'bbox_mmap'或者’bbox_map‘,分别表示平均准确率平均值在各个阈值下的结果取平均值的结果(mmAP)、平均准确率平均值(mAP)。eval_details为dict,包含关键字:'bbox',对应元素预测结果列表,每个预测结果由图像id、预测框类别id、预测框坐标、预测框得分;’gt‘:真实标注框相关信息。
+> > - **tuple** (metrics, eval_details) | **dict** (metrics): 当`return_details`为True时,返回(metrics, eval_details),当`return_details`为False时,返回metrics。metrics为dict,包含关键字:'bbox_mmap'或者’bbox_map‘,分别表示平均准确率平均值在各个阈值下的结果取平均值的结果(mmAP)、平均准确率平均值(mAP)。eval_details为dict,包含bbox和gt两个关键字。其中关键字bbox的键值是一个列表,列表中每个元素代表一个预测结果,一个预测结果是一个由图像id,预测框类别id, 预测框坐标,预测框得分组成的列表。而关键字gt的键值是真实标注框的相关信息。
 
 ### predict
 
@@ -93,7 +94,7 @@ evaluate(self, eval_dataset, batch_size=1, epoch_id=None, metric=None, return_de
 predict(self, img_file, transforms=None)
 ```
 
-> PPYOLO模型预测接口。需要注意的是,只有在训练过程中定义了eval_dataset,模型在保存时才会将预测时的图像处理流程保存在`YOLOv3.test_transforms`和`YOLOv3.eval_transforms`中。如未在训练时定义eval_dataset,那在调用预测`predict`接口时,用户需要再重新定义`test_transforms`传入给`predict`接口
+> PPYOLO模型预测接口。需要注意的是,只有在训练过程中定义了eval_dataset,模型在保存时才会将预测时的图像处理流程保存在`PPYOLO.test_transforms`和`PPYOLO.eval_transforms`中。如未在训练时定义eval_dataset,那在调用预测`predict`接口时,用户需要再重新定义`test_transforms`传入给`predict`接口
 
 > **参数**
 >
@@ -111,7 +112,7 @@ predict(self, img_file, transforms=None)
 batch_predict(self, img_file_list, transforms=None)
 ```
 
-> PPYOLO模型批量预测接口。需要注意的是,只有在训练过程中定义了eval_dataset,模型在保存时才会将预测时的图像处理流程保存在`YOLOv3.test_transforms`和`YOLOv3.eval_transforms`中。如未在训练时定义eval_dataset,那在调用预测`batch_predict`接口时,用户需要再重新定义`test_transforms`传入给`batch_predict`接口
+> PPYOLO模型批量预测接口。需要注意的是,只有在训练过程中定义了eval_dataset,模型在保存时才会将预测时的图像处理流程保存在`PPYOLO.test_transforms`和`PPYOLO.eval_transforms`中。如未在训练时定义eval_dataset,那在调用预测`batch_predict`接口时,用户需要再重新定义`test_transforms`传入给`batch_predict`接口
 
 > **参数**
 >
@@ -126,7 +127,7 @@ batch_predict(self, img_file_list, transforms=None)
 ## paddlex.det.YOLOv3
 
 ```python
-paddlex.det.YOLOv3(num_classes=80, backbone='MobileNetV1', anchors=None, anchor_masks=None, ignore_threshold=0.7, nms_score_threshold=0.01, nms_topk=1000, nms_keep_topk=100, nms_iou_threshold=0.45, label_smooth=False, train_random_shapes=[320, 352, 384, 416, 448, 480, 512, 544, 576, 608])
+paddlex.det.YOLOv3(num_classes=80, backbone='MobileNetV1', anchors=None, anchor_masks=None, ignore_threshold=0.7, nms_score_threshold=0.01, nms_topk=1000, nms_keep_topk=100, nms_iou_threshold=0.45, label_smooth=False, train_random_shapes=[320, 352, 384, 416, 448, 480, 512, 544, 576, 608], input_channel=3)
 ```
 
 > 构建YOLOv3检测器。**注意在YOLOv3,num_classes不需要包含背景类,如目标包括human、dog两种,则num_classes设为2即可,这里与FasterRCNN/MaskRCNN有差别**
@@ -147,6 +148,7 @@ paddlex.det.YOLOv3(num_classes=80, backbone='MobileNetV1', anchors=None, anchor_
 > > - **nms_iou_threshold** (float): 进行NMS时,用于剔除检测框IoU的阈值。默认为0.45。
 > > - **label_smooth** (bool): 是否使用label smooth。默认值为False。
 > > - **train_random_shapes** (list|tuple): 训练时从列表中随机选择图像大小。默认值为[320, 352, 384, 416, 448, 480, 512, 544, 576, 608]。
+> > - **input_channel** (int): 输入图像的通道数量。默认为3。
 
 ### train
 
@@ -198,7 +200,7 @@ evaluate(self, eval_dataset, batch_size=1, epoch_id=None, metric=None, return_de
 > >
 >  **返回值**
 >
-> > - **tuple** (metrics, eval_details) | **dict** (metrics): 当`return_details`为True时,返回(metrics, eval_details),当`return_details`为False时,返回metrics。metrics为dict,包含关键字:'bbox_mmap'或者’bbox_map‘,分别表示平均准确率平均值在各个阈值下的结果取平均值的结果(mmAP)、平均准确率平均值(mAP)。eval_details为dict,包含关键字:'bbox',对应元素预测结果列表,每个预测结果由图像id、预测框类别id、预测框坐标、预测框得分;’gt‘:真实标注框相关信息。
+> > - **tuple** (metrics, eval_details) | **dict** (metrics): 当`return_details`为True时,返回(metrics, eval_details),当`return_details`为False时,返回metrics。metrics为dict,包含关键字:'bbox_mmap'或者’bbox_map‘,分别表示平均准确率平均值在各个阈值下的结果取平均值的结果(mmAP)、平均准确率平均值(mAP)。eval_details为dict,包含bbox和gt两个关键字。其中关键字bbox的键值是一个列表,列表中每个元素代表一个预测结果,一个预测结果是一个由图像id,预测框类别id, 预测框坐标,预测框得分组成的列表。而关键字gt的键值是真实标注框的相关信息。
 
 ### predict
 
@@ -240,8 +242,7 @@ batch_predict(self, img_file_list, transforms=None)
 ## paddlex.det.FasterRCNN
 
 ```python
-paddlex.det.FasterRCNN(num_classes=81, backbone='ResNet50', with_fpn=True, aspect_ratios=[0.5, 1.0, 2.0], anchor_sizes=[32, 64, 128, 256, 512])
-
+paddlex.det.FasterRCNN(num_classes=81, backbone='ResNet50', with_fpn=True, aspect_ratios=[0.5, 1.0, 2.0], anchor_sizes=[32, 64, 128, 256, 512], with_dcn=False, rpn_cls_loss='SigmoidCrossEntropy', rpn_focal_loss_alpha=0.25, rpn_focal_loss_gamma=2, rcnn_bbox_loss='SmoothL1Loss', rcnn_nms='MultiClassNMS', keep_top_k=100, nms_threshold=0.5, score_threshold=0.05, softnms_sigma=0.5, bbox_assigner='BBoxAssigner', fpn_num_channels=256, input_channel=3, rpn_batch_size_per_im=256, rpn_fg_fraction=0.5, test_pre_nms_top_n=None, test_post_nms_top_n=1000)
 ```
 
 > 构建FasterRCNN检测器。 **注意在FasterRCNN中,num_classes需要设置为类别数+背景类,如目标包括human、dog两种,则num_classes需设为3,多的一种为背景background类别**
@@ -253,6 +254,23 @@ paddlex.det.FasterRCNN(num_classes=81, backbone='ResNet50', with_fpn=True, aspec
 > > - **with_fpn** (bool): 是否使用FPN结构。默认为True。
 > > - **aspect_ratios** (list): 生成anchor高宽比的可选值。默认为[0.5, 1.0, 2.0]。
 > > - **anchor_sizes** (list): 生成anchor大小的可选值。默认为[32, 64, 128, 256, 512]。
+> > - **with_dcn** (bool): backbone网络中是否使用deformable convolution network v2。默认为False。
+> > - **rpn_cls_loss** (str): RPN部分的分类损失函数,取值范围为['SigmoidCrossEntropy', 'SigmoidFocalLoss']。当遇到模型误检了很多背景区域时,可以考虑使用'SigmoidFocalLoss',并调整适合的`rpn_focal_loss_alpha`和`rpn_focal_loss_gamma`。默认为'SigmoidCrossEntropy'。
+> > - **rpn_focal_loss_alpha** (float):当RPN的分类损失函数设置为'SigmoidFocalLoss'时,用于调整正样本和负样本的比例因子,默认为0.25。当RPN的分类损失函数设置为'SigmoidCrossEntropy'时,`rpn_focal_loss_alpha`的设置不生效。
+> > - **rpn_focal_loss_gamma** (float): 当RPN的分类损失函数设置为'SigmoidFocalLoss'时,用于调整易分样本和难分样本的比例因子,默认为2。当RPN的分类损失函数设置为'SigmoidCrossEntropy'时,`rpn_focal_loss_gamma`的设置不生效。
+> > - **rcnn_bbox_loss** (str): RCNN部分的位置回归损失函数,取值范围为['SmoothL1Loss', 'CIoULoss']。默认为'SmoothL1Loss'。
+> > - **rcnn_nms** (str): RCNN部分的非极大值抑制的计算方法,取值范围为['MultiClassNMS', 'MultiClassSoftNMS','MultiClassCiouNMS']。默认为'MultiClassNMS'。当选择'MultiClassNMS'时,可以将`keep_top_k`设置成100、`nms_threshold`设置成0.5、`score_threshold`设置成0.05。当选择'MultiClassSoftNMS'时,可以将`keep_top_k`设置为300、`score_threshold`设置为0.01、`softnms_sigma`设置为0.5。当选择'MultiClassCiouNMS'时,可以将`keep_top_k`设置为100、`score_threshold`设置成0.05、`nms_threshold`设置成0.5。
+> > - **keep_top_k** (int): RCNN部分在进行非极大值抑制计算后,每张图像保留最多保存`keep_top_k`个检测框。默认为100。
+> > - **nms_threshold** (float): RCNN部分在进行非极大值抑制时,用于剔除检测框所需的IoU阈值。当`rcnn_nms`设置为`MultiClassSoftNMS`时,`nms_threshold`的设置不生效。默认为0.5。
+> > - **score_threshold** (float): RCNN部分在进行非极大值抑制前,用于过滤掉低置信度边界框所需的置信度阈值。默认为0.05。
+> > - **softnms_sigma** (float): 当`rcnn_nms`设置为`MultiClassSoftNMS`时,用于调整被抑制的检测框的置信度,调整公式为`score = score * weights, weights = exp(-(iou * iou) / softnms_sigma)`。默认设为0.5。
+> > - **bbox_assigner** (str): 训练阶段,RCNN部分生成正负样本的采样方式。可选范围为['BBoxAssigner', 'LibraBBoxAssigner']。当目标物体的区域只占原始图像的一小部分时,可以考虑采用[LibraRCNN](https://arxiv.org/abs/1904.02701)中提出的IoU-balanced Sampling采样方式来获取更多的难分负样本,设置为'LibraBBoxAssigner'即可。默认为'BBoxAssigner'。
+> > - **fpn_num_channels** (int): FPN部分特征层的通道数量。默认为256。
+> > - **input_channel** (int): 输入图像的通道数量。默认为3。
+> > - **rpn_batch_size_per_im** (int): 训练阶段,RPN部分每张图片的正负样本的数量总和。默认为256。
+> > - **rpn_fg_fraction** (float): 训练阶段,RPN部分每张图片的正负样本数量总和中正样本的占比。默认为0.5。
+> > - **test_pre_nms_top_n** (int):预测阶段,RPN部分做非极大值抑制计算的候选框的数量。若设置为None, 有FPN结构的话,`test_pre_nms_top_n`会被设置成6000, 无FPN结构的话,`test_pre_nms_top_n`会被设置成1000。默认为None。
+> > - **test_post_nms_top_n** (int): 预测阶段,RPN部分做完非极大值抑制后保留的候选框的数量。默认为1000。
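
As a rough illustration of how several of the newly documented arguments combine, the following hedged sketch builds a FasterRCNN along the lines of this PR's industrial-inspection example; all values other than the documented defaults are placeholders, and the backbone string follows the case study:

```python
import paddlex as pdx

model = pdx.det.FasterRCNN(
    num_classes=11,                      # 10 defect classes + 1 background class
    backbone='ResNet50_vd_ssld',
    with_dcn=True,                       # deformable convolutions in the backbone
    rpn_cls_loss='SigmoidFocalLoss',     # focal loss to reduce background false positives
    rpn_focal_loss_alpha=0.25,
    rpn_focal_loss_gamma=2,
    rpn_batch_size_per_im=256,
    rpn_fg_fraction=0.5,
    bbox_assigner='LibraBBoxAssigner',   # IoU-balanced sampling for hard negatives
    fpn_num_channels=64,                 # slimmer FPN for faster inference
    test_pre_nms_top_n=500,              # fewer proposals at test time
    test_post_nms_top_n=300,
    input_channel=3)
```

Training, evaluation, and prediction calls are unchanged; these arguments only alter how the RPN/RCNN heads are built.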
 
 ### train
 
@@ -302,7 +320,7 @@ evaluate(self, eval_dataset, batch_size=1, epoch_id=None, metric=None, return_de
 > >
 > **返回值**
 >
-> > - **tuple** (metrics, eval_details) | **dict** (metrics): 当`return_details`为True时,返回(metrics, eval_details),当`return_details`为False时,返回metrics。metrics为dict,包含关键字:'bbox_mmap'或者’bbox_map‘,分别表示平均准确率平均值在各个IoU阈值下的结果取平均值的结果(mmAP)、平均准确率平均值(mAP)。eval_details为dict,包含关键字:'bbox',对应元素预测结果列表,每个预测结果由图像id、预测框类别id、预测框坐标、预测框得分;’gt‘:真实标注框相关信息。
+> > - **tuple** (metrics, eval_details) | **dict** (metrics): 当`return_details`为True时,返回(metrics, eval_details),当`return_details`为False时,返回metrics。metrics为dict,包含关键字:'bbox_mmap'或者’bbox_map‘,分别表示平均准确率平均值在各个IoU阈值下的结果取平均值的结果(mmAP)、平均准确率平均值(mAP)。eval_details为dict,包含bbox和gt两个关键字。其中关键字bbox的键值是一个列表,列表中每个元素代表一个预测结果,一个预测结果是一个由图像id,预测框类别id, 预测框坐标,预测框得分组成的列表。而关键字gt的键值是真实标注框的相关信息。
 
 ### predict
 

+ 3 - 2
docs/apis/models/instance_segmentation.md

@@ -3,7 +3,7 @@
 ## MaskRCNN
 
 ```python
-paddlex.det.MaskRCNN(num_classes=81, backbone='ResNet50', with_fpn=True, aspect_ratios=[0.5, 1.0, 2.0], anchor_sizes=[32, 64, 128, 256, 512])
+paddlex.det.MaskRCNN(num_classes=81, backbone='ResNet50', with_fpn=True, aspect_ratios=[0.5, 1.0, 2.0], anchor_sizes=[32, 64, 128, 256, 512], input_channel=3)
 
 ```
 
@@ -16,6 +16,7 @@ paddlex.det.MaskRCNN(num_classes=81, backbone='ResNet50', with_fpn=True, aspect_
 > > - **with_fpn** (bool): 是否使用FPN结构。默认为True。
 > > - **aspect_ratios** (list): 生成anchor高宽比的可选值。默认为[0.5, 1.0, 2.0]。
 > > - **anchor_sizes** (list): 生成anchor大小的可选值。默认为[32, 64, 128, 256, 512]。
+> > - **input_channel** (int): 输入图像的通道数量。默认为3。
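
For multi-channel imagery, the new `input_channel` argument is the only change needed at model-construction time. A brief hedged sketch; the channel count is a placeholder and the transform pipeline must produce arrays with a matching number of channels:

```python
import paddlex as pdx

# A 4-channel (e.g. RGB + near-infrared) instance segmentation model.
model = pdx.det.MaskRCNN(num_classes=81, backbone='ResNet50', with_fpn=True,
                         input_channel=4)
```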
 
 #### train
 
@@ -65,7 +66,7 @@ evaluate(self, eval_dataset, batch_size=1, epoch_id=None, metric=None, return_de
 > >
 > **返回值**
 >
-> > - **tuple** (metrics, eval_details) | **dict** (metrics): 当`return_details`为True时,返回(metrics, eval_details),当return_details为False时,返回metrics。metrics为dict,包含关键字:'bbox_mmap'和'segm_mmap'或者’bbox_map‘和'segm_map',分别表示预测框和分割区域平均准确率平均值在各个IoU阈值下的结果取平均值的结果(mmAP)、平均准确率平均值(mAP)。eval_details为dict,包含关键字:'bbox',对应元素预测框结果列表,每个预测结果由图像id、预测框类别id、预测框坐标、预测框得分;'mask',对应元素预测区域结果列表,每个预测结果由图像id、预测区域类别id、预测区域坐标、预测区域得分;’gt‘:真实标注框和标注区域相关信息。
+> > - **tuple** (metrics, eval_details) | **dict** (metrics): 当`return_details`为True时,返回(metrics, eval_details),当return_details为False时,返回metrics。metrics为dict,包含关键字:'bbox_mmap'和'segm_mmap'或者’bbox_map‘和'segm_map',分别表示预测框和分割区域平均准确率平均值在各个IoU阈值下的结果取平均值的结果(mmAP)、平均准确率平均值(mAP)。eval_details为dict,包含`bbox`、`mask`和`gt`三个关键字。其中关键字`bbox`的键值是一个列表,列表中每个元素代表一个预测结果,一个预测结果是一个由图像id,预测框类别id, 预测框坐标,预测框得分组成的列表。关键字`mask`的键值是一个列表,列表中每个元素代表各预测框内物体的分割结果,分割结果由图像id、预测框类别id、表示预测框内各像素点是否属于物体的二值图、预测框得分。而关键字gt的键值是真实标注框的相关信息。
 
 #### predict
 

+ 77 - 0
docs/apis/tools.md

@@ -0,0 +1,77 @@
+# 数据集工具
+
+## 数据集分析
+
+### paddlex.datasets.analysis.Seg
+```python
+paddlex.datasets.analysis.Seg(data_dir, file_list, label_list)
+```
+
+构建统计分析语义分割数据集的分析器。
+
+> **参数**
+> > * **data_dir** (str): 数据集所在的目录路径。  
+> > * **file_list** (str): 描述数据集图片文件和类别id的文件路径(文本内每行路径为相对`data_dir`的相对路径)。  
+> > * **label_list** (str): 描述数据集包含的类别信息文件路径。  
+
+#### analysis
+```python
+analysis(self)
+```
+
+Seg分析器的分析接口,完成以下信息的分析统计:
+
+> * 图像数量
+> * 图像最大和最小的尺寸
+> * 图像通道数量
+> * 图像各通道的最小值和最大值
+> * 图像各通道的像素值分布
+> * 图像各通道归一化后的均值和方差
+> * 标注图中各类别的数量及比重
+
+[代码示例](https://github.com/PaddlePaddle/PaddleX/blob/develop/examples/multi-channel_remote_sensing/tools/analysis.py)
+
+[统计信息示例](../../examples/multi-channel_remote_sensing/analysis.html#id2)
+
+#### cal_clipped_mean_std
+```python
+cal_clipped_mean_std(self, clip_min_value, clip_max_value, data_info_file)
+```
+
+Seg分析器用于计算图像截断后的均值和方差的接口。
+
+> **参数**
+> > * **clip_min_value** (list):  截断的下限,小于min_val的数值均设为min_val。
+> > * **clip_max_value** (list): 截断的上限,大于max_val的数值均设为max_val。
+> > * **data_info_file** (str): 在analysis()接口中保存的分析结果文件(名为`train_information.pkl`)的路径。
+
+[代码示例](https://github.com/PaddlePaddle/PaddleX/blob/develop/examples/multi-channel_remote_sensing/tools/cal_clipped_mean_std.py)
+
+[计算结果示例](../examples/multi-channel_remote_sensing/analysis.html#id4)
+
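A minimal end-to-end sketch of the Seg analyzer documented above. The dataset paths, clip values, and the location of the saved analysis file are placeholders; `train_information.pkl` is the file name stated in the `data_info_file` description:

```python
import paddlex as pdx

analyzer = pdx.datasets.analysis.Seg(
    data_dir='remote_sensing',
    file_list='remote_sensing/train_list.txt',
    label_list='remote_sensing/labels.txt')

# Gathers image count, sizes, per-channel min/max, value distributions,
# normalized mean/std, and per-class pixel ratios.
analyzer.analysis()

# Recompute mean/std after clipping each channel to [clip_min, clip_max],
# reusing the statistics saved by analysis().
analyzer.cal_clipped_mean_std(
    clip_min_value=[0, 0, 0, 0],
    clip_max_value=[2500, 2500, 2500, 2500],
    data_info_file='./output/train_information.pkl')
```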
+## 数据集生成
+
+### paddlex.det.paste_objects
+```python
+paddlex.det.paste_objects(templates, background, save_dir='dataset_clone')
+```
+
+将目标物体粘贴在背景图片上生成新的图片和标注文件
+
+> **参数**
+> > * **templates** (list|tuple):可以将多张图像上的目标物体同时粘贴在同一个背景图片上,因此templates是一个列表,其中每个元素是一个dict,表示一张图片的目标物体。一张图片的目标物体有`image`和`annos`两个关键字,`image`的键值是图像的路径,或者是解码后的排列格式为(H, W, C)且类型为uint8且为BGR格式的数组。图像上可以有多个目标物体,因此`annos`的键值是一个列表,列表中每个元素是一个dict,表示一个目标物体的信息。该dict包含`polygon`和`category`两个关键字,其中`polygon`表示目标物体的边缘坐标,例如[[0, 0], [0, 1], [1, 1], [1, 0]],`category`表示目标物体的类别,例如'dog'。
+> > * **background** (dict): 背景图片可以有真值,因此background是一个dict,包含`image`和`annos`两个关键字,`image`的键值是背景图像的路径,或者是解码后的排列格式为(H, W, C)且类型为uint8且为BGR格式的数组。若背景图片上没有真值,则`annos`的键值是空列表[],若有,则`annos`的键值是由多个dict组成的列表,每个dict表示一个物体的信息,包含`bbox`和`category`两个关键字,`bbox`的键值是物体框左上角和右下角的坐标,即[x1, y1, x2, y2],`category`表示目标物体的类别,例如'dog'。
+> > * **save_dir** (str):新图片及其标注文件的存储目录。默认值为`dataset_clone`。
+
+> **代码示例**
+
+```python
+import paddlex as pdx
+templates = [{'image': 'dataset/JPEGImages/budaodian-10.jpg',
+              'annos': [{'polygon': [[146, 169], [909, 169], [909, 489], [146, 489]],
+                        'category': 'lou_di'},
+                        {'polygon': [[146, 169], [909, 169], [909, 489], [146, 489]],
+                        'category': 'lou_di'}]}]
+background = {'image': 'dataset/JPEGImages/budaodian-12.jpg', 'annos': []}
+pdx.det.paste_objects(templates, background, save_dir='dataset_clone')
+```

+ 14 - 3
docs/apis/transforms/det_transforms.md

@@ -32,13 +32,13 @@ paddlex.det.transforms.ResizeByShort(short_size=800, max_size=1333)
 
 根据图像的短边调整图像大小(resize)。  
 1. 获取图像的长边和短边长度。  
-2. 根据短边与short_size的比例,计算长边的目标长度,此时高、宽的resize比例为short_size/原图短边长度。  
+2. 根据短边与short_size的比例,计算长边的目标长度,此时高、宽的resize比例为short_size/原图短边长度。若short_size为数组,则随机从该数组中挑选一个数值作为short_size。
 3. 如果max_size>0,调整resize比例:
    如果长边的目标长度>max_size,则高、宽的resize比例为max_size/原图长边长度。
 4. 根据调整大小的比例对图像进行resize。
 
 ### 参数
-* **short_size** (int): 短边目标长度。默认为800。
+* **short_size** (int|list): 短边目标长度。默认为800。当需要做多尺度训练时,可以将`short_size`设置成数组,例如[500, 600, 700, 800]。
 * **max_size** (int): 长边目标长度的最大限制。默认为1333。
 
 ## Padding
@@ -122,7 +122,7 @@ paddlex.det.transforms.MixupImage(alpha=1.5, beta=1.5, mixup_epoch=-1)
 * **beta** (float): 随机beta分布的上限。默认为1.5。
 * **mixup_epoch** (int): 在前mixup_epoch轮使用mixup增强操作;当该参数为-1时,该策略不会生效。默认为-1。
 
-## RandomExpand
+## RandomExpand
 ```python
 paddlex.det.transforms.RandomExpand(ratio=4., prob=0.5, fill_value=[123.675, 116.28, 103.53])
 ```
@@ -168,6 +168,17 @@ paddlex.det.transforms.RandomCrop(aspect_ratio=[.5, 2.], thresholds=[.0, .1, .3,
 * **allow_no_crop** (bool): 是否允许未进行裁剪。默认值为True。
 * **cover_all_box** (bool): 是否要求所有的真实标注框都必须在裁剪区域内。默认值为False。
 
+## CLAHE
+```
+paddlex.det.transforms.CLAHE(clip_limit=2., tile_grid_size=(8, 8))
+```
+对图像进行对比度增强。
+
+### 参数
+
+* **clip_limit** (int|float): 颜色对比度的阈值,默认值为2.。
+* **tile_grid_size** (list|tuple): 进行像素均衡化的网格大小。默认值为(8, 8)。
+
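A hedged sketch combining the two additions above, list-valued `ResizeByShort` for multi-scale training and `CLAHE` for contrast enhancement, into one pipeline; the transform order and the scale list are illustrative:

```python
from paddlex.det import transforms

train_transforms = transforms.Compose([
    # Contrast enhancement, mainly useful for low-contrast gray-scale imagery.
    transforms.CLAHE(clip_limit=2., tile_grid_size=(8, 8)),
    transforms.RandomHorizontalFlip(),
    transforms.Normalize(),
    # With a list-valued short_size, one value is randomly picked from the
    # list each time, enabling multi-scale training.
    transforms.ResizeByShort(short_size=[640, 672, 704, 736, 768, 800], max_size=1333),
    transforms.Padding(coarsest_stride=32),
])
```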
 <!--
 ## ComposedRCNNTransforms
 ```python

+ 67 - 0
docs/apis/visualize.md

@@ -139,3 +139,70 @@ paddlex.transforms.visualize(dataset,
 >* **dataset** (paddlex.datasets): 数据集读取器。
 >* **img_count** (int): 需要进行数据预处理/增强的图像数目。默认为3。
 >* **save_dir** (str): 日志保存的路径。默认为'vdl_output'。
+
+## paddlex.det.coco_error_analysis
+> **分析模型预测错误的原因**
+
+```
+paddlex.det.coco_error_analysis(eval_details_file=None, gt=None, pred_bbox=None, pred_mask=None, save_dir='./output')
+```
+逐个分析模型预测错误的原因,并将分析结果以图表的形式展示。分析结果图表示例如下:
+
+![](images/detection_analysis.jpg)
+
+左图显示的是`person`类的分析结果,右图显示的是所有类别整体的分析结果。
+
+分析图表展示了7条Precision-Recall(PR)曲线,每一条曲线表示的Average Precision (AP)比它左边那条高,原因是逐步放宽了评估要求。以`person`类为例,各条PR曲线的评估要求解释如下:
+
+* C75: 在IoU设置为0.75时的PR曲线, AP为0.510。
+* C50: 在IoU设置为0.5时的PR曲线,AP为0.724。C50与C75之间的白色区域面积代表将IoU从0.75放宽至0.5带来的AP增益。
+* Loc: 在IoU设置为0.1时的PR曲线,AP为0.832。Loc与C50之间的蓝色区域面积代表将IoU从0.5放宽至0.1带来的AP增益。蓝色区域面积越大,表示越多的检测框位置不够精准。
+* Sim: 在Loc的基础上,如果检测框与真值框的类别不相同,但两者同属于一个亚类,则不认为该检测框是错误的,在这种评估要求下的PR曲线, AP为0.832。Sim与Loc之间的红色区域面积越大,表示子类间的混淆程度越高。
+* Oth: 在Sim的基础上,如果检测框与真值框的亚类不相同,则不认为该检测框是错误的,在这种评估要求下的PR曲线,AP为0.841。Oth与Sim之间的绿色区域面积越大,表示亚类间的混淆程度越高。
+* BG: 在Oth的基础上,背景区域上的检测框不认为是错误的,在这种评估要求下的PR曲线,AP为0.911。BG与Oth之间的紫色区域面积越大,表示背景区域被误检的数量越多。
+* FN: 在BG的基础上,漏检的真值框不认为是错误的,在这种评估要求下的PR曲线,AP为1.00。FN与BG之间的橙色区域面积越大,表示漏检的真值框数量越多。
+
+更为详细的说明参考[COCODataset官网给出分析工具说明](https://cocodataset.org/#detection-eval)
+
+### 参数
+> * **eval_details_file** (str): 模型评估结果的保存路径,包含真值信息和预测结果。默认值为None。
+> * **gt** (list): 数据集的真值信息。默认值为None。
+> * **pred_bbox** (list): 模型在数据集上的预测框。默认值为None。
+> * **pred_mask** (list): 模型在数据集上的预测mask。默认值为None。
+> * **save_dir** (str): 可视化结果保存路径。默认值为'./output'。
+
+**注意:**`eval_details_file`的优先级更高,只要`eval_details_file`不为None,就会从`eval_details_file`提取真值信息和预测结果做分析。当`eval_details_file`为None时,则用`gt`、`pred_bbox`、`pred_mask`做分析。
+
+### 使用示例
+点击下载如下示例中的[模型](https://bj.bcebos.com/paddlex/models/insect_epoch_270.zip)和[数据集](https://bj.bcebos.com/paddlex/datasets/insect_det.tar.gz)
+
+> 方式一:分析训练过程中保存的模型文件夹中的评估结果文件`eval_details.json`,例如[模型](https://bj.bcebos.com/paddlex/models/insect_epoch_270.zip)中的`eval_details.json`。
+```
+import paddlex as pdx
+eval_details_file = 'insect_epoch_270/eval_details.json'
+pdx.det.coco_error_analysis(eval_details_file, save_dir='./insect')
+```
+> 方式二:分析模型评估函数返回的评估结果。
+
+```
+import os
+# 选择使用0号卡
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+
+from paddlex.det import transforms
+import paddlex as pdx
+
+model = pdx.load_model('insect_epoch_270')
+eval_dataset = pdx.datasets.VOCDetection(
+    data_dir='insect_det',
+    file_list='insect_det/val_list.txt',
+    label_list='insect_det/labels.txt',
+    transforms=model.eval_transforms)
+metrics, evaluate_details = model.evaluate(eval_dataset, batch_size=8, return_details=True)
+gt = evaluate_details['gt']
+bbox = evaluate_details['bbox']
+pdx.det.coco_error_analysis(gt=gt, pred_bbox=bbox, save_dir='./insect')
+```
+所有类别整体的分析结果示例如下:
+
+![](./images/insect_bbox-allclass-allarea.png)

+ 1 - 0
docs/examples/index.rst

@@ -15,3 +15,4 @@ PaddleX精选飞桨视觉开发套件在产业实践中的成熟模型结构,
    multi-channel_remote_sensing/README.md
    remote_sensing.md
    change_detection.md
+   industrial_quality_inspection/README.md

+ 99 - 0
docs/examples/industrial_quality_inspection/README.md

@@ -0,0 +1,99 @@
+# 工业质检
+
+本案例面向工业质检场景里的铝材表面缺陷检测,提供了针对GPU端和CPU端两种部署场景下基于PaddleX的解决方案,希望通过梳理优化模型精度和性能的思路能帮助用户更高效地解决实际质检应用中的问题。
+
+## 1. GPU端解决方案
+
+### 1.1 数据集介绍
+
+本案例使用[天池铝材表面缺陷检测初赛](https://tianchi.aliyun.com/competition/entrance/231682/introduction)数据集,共有3005张图片,分别检测擦花、杂色、漏底、不导电、桔皮、喷流、漆泡、起坑、脏点和角位漏底10种缺陷,这10种缺陷的定义和示例可点击文档[天池铝材表面缺陷检测初赛数据集示例](./dataset.md)查看。
+
+将这3005张图片按9:1随机切分成2713张图片的训练集和292张图片的验证集。
+
+### 1.2 精度优化
+
+本小节侧重展示在模型迭代过程中优化精度的思路,在本案例中,有些优化策略获得了精度收益,而有些没有。在其他质检场景中,可根据实际情况尝试这些优化策略。点击文档[精度优化](./accuracy_improvement.md)查看。
+
+### 1.3 性能优化
+
+在完成模型精度优化之后,从以下两个方面对模型进行加速:
+
+#### (1) 减少FPN部分的通道数量
+
+将FPN部分的通道数量由原本的256减少至64,使用方式在定义模型[FasterRCNN](https://paddlex.readthedocs.io/zh_CN/develop/apis/models/detection.html#paddlex-det-fasterrcnn)类时设置参数`fpn_num_channels`为64即可,需要重新对模型进行训练。
+
+#### (2) 减少测试阶段的候选框数量
+
+将测试阶段RPN部分做非极大值抑制计算的候选框数量由原本的6000减少至500,将RPN部分做完非极大值抑制后保留的候选框数量由原本的1000减少至300。使用方式在定义模型[FasterRCNN](https://paddlex.readthedocs.io/zh_CN/develop/apis/models/detection.html#paddlex-det-fasterrcnn)类时设置参数`test_pre_nms_top_n`为500,`test_post_nms_top_n`为300。
+
+采用Fluid C++预测引擎在Tesla P40上测试模型的推理时间(输入数据拷贝至GPU的时间、计算时间、数据拷贝至CPU的时间),输入大小设置为800x1333,加速前后推理时间如下表所示:
+
+| 模型 | 推理时间 (ms/image)| VOC mAP (%) |
+| -- | -- | -- |
+| baseline | 66.51 | 88.87 |
+| + fpn channel=64 + test proposal=pre/post topk 500/300 | 46.08 | 87.72 |
+
+### 1.4 最终方案
+
+本案例面向GPU端的最终方案是选择二阶段检测模型FasterRCNN,其骨干网络选择加入了可变形卷积(DCN)的ResNet50_vd,训练时使用SSLD蒸馏方案训练得到的ResNet50_vd预训练模型,FPN部分的通道数量设置为64。使用复核过的数据集,训练阶段数据增强策略采用RandomHorizontalFlip、RandomDistort、RandomCrop,并加入背景图片。测试阶段的RPN部分做非极大值抑制计算的候选框数量由原本的6000减少至500、做完非极大值抑制后保留的候选框数量由原本的1000减少至300。模型在验证集上的VOC mAP为87.72%。
+
+在Tesla P40的Linux系统下,对于输入大小是800 x 1333的模型,图像预处理时长为30ms/image,模型的推理时间为46.08ms/image,包括输入数据拷贝至GPU的时间、计算时间、数据拷贝至CPU的时间。
+
+| 模型 | VOC mAP (%) | 推理时间 (ms/image)
+| -- | -- | -- |
+| FasterRCNN-ResNet50_vd_ssld | 81.05 | 48.62 |
+| + dcn | 88.09 | 66.51 |
+| + RandomHorizontalFlip/RandomDistort/RandomCrop | 90.23| 66.51 |
+| + background images | 88.87 | 66.51 |
+| + fpn channel=64 | 87.79 | 48.65 |
+| + test proposal=pre/post topk 500/300 | 87.72 | 46.08 |
+
+具体的训练和部署流程点击文档[GPU端最终解决方案](./gpu_solution.md)进行查看。
+
+## 2. CPU端解决方案
+
+为了实现高效的模型推理,面向CPU端的模型选择精度和效率皆优的单阶段检测模型YOLOv3,骨干网络选择基于PaddleClas中SSLD蒸馏方案训练得到的MobileNetv3_large。训练完成后,对模型做剪裁操作,以提升模型的性能。模型在验证集上的VOC mAP为79.02%。
+
+部署阶段,借助OpenVINO预测引擎完成在Intel(R) Core(TM) i9-9820X CPU @ 3.30GHz Windows系统下的高效推理。对于输入大小是608 x 608的模型,图像预处理时长为38.69 ms/image,模型的推理时间为34.50 ms/image,具体如下表所示:
+
+| 模型 | VOC mAP (%) | Inference Speed (ms/image)
+| -- | -- | -- |
+| YOLOv3-MobileNetv3_ssld | 78.52 | 56.71 |
+| pruned YOLOv3-MobileNetv3_ssld | 79.02 | 34.50 |
+
+### 模型训练
+
+[环境前置依赖](./gpu_solution.md#%E5%89%8D%E7%BD%AE%E4%BE%9D%E8%B5%96)、[下载PaddleX源码](./gpu_solution.md#1-%E4%B8%8B%E8%BD%BDpaddlex%E6%BA%90%E7%A0%81)、[下载数据集](./gpu_solution.md#2-%E4%B8%8B%E8%BD%BD%E6%95%B0%E6%8D%AE%E9%9B%86)与GPU端是一样的,可点击文档[GPU端最终解决方案](./gpu_solution.md)查看,在此不做赘述。
+
+如果不想再次训练模型,可以直接下载已经训练好的模型完成后面的模型测试和部署推理:
+
+```
+wget https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/models/yolov3_mobilenetv3_large_pruned.tar.gz
+tar xvf yolov3_mobilenetv3_large_pruned.tar.gz
+```
+
+运行以下代码进行模型训练,代码会自动下载数据集,如若事先下载了数据集,需将下载和解压铝材缺陷检测数据集的相关行注释掉。代码中默认使用0,1,2,3,4,5,6,7号GPU训练,可根据实际情况设置卡号并调整`batch_size`和`learning_rate`。
+
+```
+python train_yolov3.py
+```
+
+### 模型剪裁
+
+运行以下代码,分析在不同的精度损失下模型各层的剪裁比例:
+
+```
+python params_analysis.py
+```
+
+设置可允许的精度损失为0.05,对模型进行剪裁,剪裁后需要重新训练模型:
+
+```
+python train_pruned_yolov3.py
+```
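+
+As a rough sketch of what `params_analysis.py` and `train_pruned_yolov3.py` presumably do, assuming PaddleX 1.x's sensitivity-based pruning interfaces (`paddlex.slim.cal_params_sensitivities` plus the `sensitivities_file`/`eval_metric_loss` training arguments); the scripts in this PR are the authoritative reference, and all paths and hyper-parameters below are placeholders:

```python
import paddlex as pdx

# Step 1 (params_analysis.py): measure each layer's sensitivity to pruning
# on the validation set, starting from the already-trained YOLOv3 model.
model = pdx.load_model('output/yolov3_mobilenetv3/best_model')
eval_dataset = pdx.datasets.VOCDetection(
    data_dir='aluminum_inspection',
    file_list='aluminum_inspection/val_list.txt',
    label_list='aluminum_inspection/labels.txt',
    transforms=model.eval_transforms)
pdx.slim.cal_params_sensitivities(
    model, save_file='./sensitivities.data', eval_dataset=eval_dataset, batch_size=8)

# Step 2 (train_pruned_yolov3.py): retrain with pruning enabled, allowing at most
# a 0.05 drop of the evaluation metric when choosing per-layer prune ratios, e.g.
# model.train(..., sensitivities_file='./sensitivities.data', eval_metric_loss=0.05)
```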
+
+[分析预测错误的原因](./gpu_solution.md#4-%E5%88%86%E6%9E%90%E9%A2%84%E6%B5%8B%E9%94%99%E8%AF%AF%E7%9A%84%E5%8E%9F%E5%9B%A0)、[统计图片级召回率和误检率](./gpu_solution.md#5-%E7%BB%9F%E8%AE%A1%E5%9B%BE%E7%89%87%E7%BA%A7%E5%8F%AC%E5%9B%9E%E7%8E%87%E5%92%8C%E8%AF%AF%E6%A3%80%E7%8E%87)、[模型测试](./gpu_solution.md#6-%E6%A8%A1%E5%9E%8B%E6%B5%8B%E8%AF%95)这些步骤与GPU端是一样的,可点击文档[GPU端最终解决方案](./gpu_solution.md)查看,在此不做赘述。
+
+### 推理部署
+
+本案例采用C++部署方式,通过OpenVINO将模型部署在Intel(R) Core(TM) i9-9820X CPU @ 3.30GHz的Windows系统下,具体的部署流程请参考文档[PaddleX模型多端安全部署/OpenVINO部署](https://paddlex.readthedocs.io/zh_CN/develop/deploy/openvino/index.html)。

+ 97 - 0
docs/examples/industrial_quality_inspection/accuracy_improvement.md

@@ -0,0 +1,97 @@
+# 精度优化
+
+本小节侧重展示在模型迭代过程中优化精度的思路,在本案例中,有些优化策略获得了精度收益,而有些没有。在其他质检场景中,可根据实际情况尝试这些优化策略。
+
+## (1) 基线模型选择
+
+相较于单阶段检测模型,二阶段检测模型的精度更高但是速度更慢。考虑到是部署到GPU端,本案例选择二阶段检测模型FasterRCNN作为基线模型,其骨干网络选择ResNet50_vd,并使用基于PaddleClas中SSLD蒸馏方案训练得到的ResNet50_vd预训练模型(ImageNet1k验证集上Top1 Acc为82.39%)。训练完成后,模型在验证集上的精度VOC mAP为73.36%。
+
+## (2) 模型效果分析
+
+使用PaddleX提供的[paddlex.det.coco_error_analysis](https://paddlex.readthedocs.io/zh_CN/develop/apis/visualize.html#paddlex-det-coco-error-analysis)接口对模型在验证集上预测错误的原因进行分析,分析结果以图表的形式展示如下:
+
+| all classes| 擦花 | 杂色 | 漏底 | 不导电 | 桔皮 | 喷流 | 漆泡 | 起坑 | 脏点 | 角位漏底 |
+| -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- |
+| ![](https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/datasets/allclasses_analysis_example.png) | ![](https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/datasets/cahua_analysis_example.png) | ![](https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/datasets/zase_analysis_example.png) | ![](https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/datasets/loudi_analysis_example.png) | ![](https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/datasets/budaodian_analysis_example.png) | ![](https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/datasets/jupi_analysis_example.png) | ![](https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/datasets/penliu_analysis_example.png) | ![](https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/datasets/qipao_analysis_example.png) | ![](https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/datasets/qikeng_analysis_example.png) | ![](https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/datasets/zangdian_analysis_example.png) | ![](https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/datasets/jiaoweiloudi_analysis_example.png) |
+
+分析图表展示了7条Precision-Recall(PR)曲线,每一条曲线表示的Average Precision (AP)比它左边那条高,原因是逐步放宽了评估要求。以擦花类为例,各条PR曲线的评估要求解释如下:
+
+* C75: 在IoU设置为0.75时的PR曲线, AP为0.001。
+* C50: 在IoU设置为0.5时的PR曲线,AP为0.622。C50与C75之间的白色区域面积代表将IoU从0.75放宽至0.5带来的AP增益。
+* Loc: 在IoU设置为0.1时的PR曲线,AP为0.740。Loc与C50之间的蓝色区域面积代表将IoU从0.5放宽至0.1带来的AP增益。蓝色区域面积越大,表示越多的检测框位置不够精准。
+* Sim: 在Loc的基础上,如果检测框与真值框的类别不相同,但两者同属于一个亚类,则不认为该检测框是错误的,在这种评估要求下的PR曲线, AP为0.742。Sim与Loc之间的红色区域面积越大,表示子类间的混淆程度越高。VOC格式的数据集所有的类别都属于同一个亚类。
+* Oth: 在Sim的基础上,如果检测框与真值框的亚类不相同,则不认为该检测框是错误的,在这种评估要求下的PR曲线,AP为0.742。Oth与Sim之间的绿色区域面积越大,表示亚类间的混淆程度越高。VOC格式的数据集中所有的类别都属于同一个亚类,故不存在亚类间的混淆。
+* BG: 在Oth的基础上,背景区域上的检测框不认为是错误的,在这种评估要求下的PR曲线,AP为0.921。BG与Oth之间的紫色区域面积越大,表示背景区域被误检的数量越多。
+* FN: 在BG的基础上,漏检的真值框不认为是错误的,在这种评估要求下的PR曲线,AP为1.00。FN与BG之间的橙色区域面积越大,表示漏检的真值框数量越多。
+
+从分析图表中可以看出,杂色、桔皮、起坑三类检测效果较好,角位漏底存在少许检测框没有达到IoU 0.5的情况,问题较多的是擦花、不导电、喷流、漆泡、脏点。擦花类最严重的问题是误检、位置不精准、漏检,不导电类最严重的问题是漏检、位置不精准,喷流类和漆泡类最严重的问题是位置不精准、误检,脏点类最严重的问题是误检、漏检。为进一步理解造成这些问题的原因,将验证集上的预测结果进行了可视化,然后发现数据集标注存在以下问题:
+
+* 轻微的缺陷不视为缺陷,但轻微的界定不明确,有些轻微的缺陷被标注了,造成误检较多
+* 不导电、漏底、角位漏底外观极其相似,肉眼难以区分,导致这三类极其容易混淆而使得评估时误检或漏检的产生
+* 有些轻微的擦花和脏点被标注了,有些明显的反而没有被标注,造成了这两类误检和漏检情况都较为严重
+* 喷流和漆泡多为连续性的缺陷,一处喷流,其水平线上还会有其他喷流,一个气泡,其水平线上还会有一排气泡。但有时候把这些连续性的缺陷标注成一个目标,有时候又单独地标注不同的部分。导致模型有时候检测单独部分,有时候又检测整体,造成这两类位置不精准、误检较多。
+
+## (3) 数据复核
+
+为了减少原始数据标注的诸多问题对模型优化的影响,需要对数据进行复核。复核准则示例如下:
+
+* 擦花:不明显的擦花不标注,面状擦花以同一个框表示,条状擦花一条擦花以一个框表示
+* 漏底、角位漏底、不导电:三者过于相似,归为一类
+* 桔皮:忽略不是大颗粒状的表面
+* 喷流:明显是一条喷流的就用一个框表示,不是的话用多个框表示
+* 漆泡:不要单独标一个点,一连串点标一个框
+* 脏点:忽略轻微脏点
+
+对数据集复核并重新标注后,将FasterRCNN-ResNet50_vd_ssld重新在训练集上进行训练,模型在验证集上的VOC mAP为81.05%。
+
+## (4) 可变形卷积加入
+
+由于喷流、漆泡的形态不规则,导致这两类的很多预测框位置不精准。为了解决该问题,选择在骨干网络ResNet50_vd中使用可变形卷积(DCN)。重新训练后,模型在验证集上的VOC mAP为88.09%,喷流的VOC AP由57.3%提升至78.7%,漆泡的VOC AP由74.7%提升至96.7%。
+
+## (5) 数据增强选择
+
+在(4)的基础上,选择加入一些数据增强策略来进一步提升模型的精度。本案例选择同时使用[RandomHorizontalFlip](https://paddlex.readthedocs.io/zh_CN/develop/apis/transforms/det_transforms.html#randomhorizontalflip)、[RandomDistort](https://paddlex.readthedocs.io/zh_CN/develop/apis/transforms/det_transforms.html#randomdistort)、[RandomCrop](https://paddlex.readthedocs.io/zh_CN/develop/apis/transforms/det_transforms.html#randomcrop)这三种数据增强方法,重新训练后的模型在验证集上的VOC mAP为90.23%。
+
+除此之外,还可以尝试的数据增强方式有[MultiScaleTraining](https://paddlex.readthedocs.io/zh_CN/develop/apis/transforms/det_transforms.html#resizebyshort)、[RandomExpand](https://paddlex.readthedocs.io/zh_CN/develop/apis/transforms/det_transforms.html#randomexpand)。本案例使用的铝材表面缺陷检测数据集中,同一个类别的尺度变化不大,使用MultiScaleTraining或者RandomExpand反而使得原始数据分布发生改变。对此,本案例也进行了实验验证,使用RandomHorizontalFlip + RandomDistort + RandomCrop + MultiScaleTraining数据增强方式训练得到的模型在验证集上的VOC mAP为87.15%,使用RandomHorizontalFlip + RandomDistort + RandomCrop + RandomExpand数据增强方式训练得到的模型在验证集上的VOC mAP为88.56%。
+
+## (6) 背景图片加入
+
+本案例将数据集中提供的背景图片按9:1切分成了1116张、135张两部分,并使用(5)中训练好的模型在135张背景图片上进行测试,发现图片级误检率高达21.5%。为了降低模型的误检率,使用[paddlex.datasets.VOCDetection.add_negative_samples](https://paddlex.readthedocs.io/zh_CN/develop/apis/datasets.html#add-negative-samples)接口将1116张背景图片加入到原本的训练集中,重新训练后图片级误检率降低至4%。为了不让训练被背景图片主导,本案例通过将`train_list.txt`中的文件路径多写了一遍,从而增加有目标图片的占比。
+
+| 模型 | VOC mAP (%) | 有缺陷图片级召回率 | 背景图片级误检率 |
+| -- | -- | -- | -- |
+| FasterRCNN-ResNet50_vd_ssld + DCN + RandomHorizontalFlip + RandomDistort + RandomCrop | 90.23 | 95.5 | 21.5 |
+| FasterRCNN-ResNet50_vd_ssld + DCN + RandomHorizontalFlip + RandomDistort + RandomCrop + 背景图片 | 88.87 | 95.2 | 4 |
+
+【名词解释】
+
+* 图片级别的召回率:只要在有目标的图片上检测出目标(不论框的个数),该图片被认为召回。批量有目标图片中被召回图片所占的比例,即为图片级别的召回率。
+* 图片级别的误检率:只要在无目标的图片上检测出目标(不论框的个数),该图片被认为误检。批量无目标图片中被误检图片所占的比例,即为图片级别的误检率。
+
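The two image-level metrics defined above reduce to simple counting over per-image predictions. A minimal, framework-independent sketch; the prediction format (a list of boxes with a `'score'` field per image) is an assumption:

```python
def image_level_rates(preds_on_defect_images, preds_on_background_images, score_thresh=0.5):
    """Image-level recall: fraction of defect images with at least one detection
    above the threshold. Image-level false-detection rate: fraction of background
    images with at least one such detection."""
    def detected(boxes):
        return any(b['score'] >= score_thresh for b in boxes)

    recall = sum(detected(p) for p in preds_on_defect_images) / len(preds_on_defect_images)
    false_rate = sum(detected(p) for p in preds_on_background_images) / len(preds_on_background_images)
    return recall, false_rate

# For example, 95.2% of defect images recalled and 4% of background images falsely
# flagged corresponds to the last row of the table above.
```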
+## (7) 分类损失函数选择
+
+降低误检率的措施除了(6)中提到的将背景图片加入训练,还可以将RPN部分的分类损失函数选择为`SigmoidFocalLoss`,将更多的anchors加入训练,增加难分样本在损失函数中的比重,进而降低误检率。在定义模型[FasterRCNN](https://paddlex.readthedocs.io/zh_CN/develop/apis/models/detection.html#paddlex-det-fasterrcnn)类时将参数`rpn_cls_loss`设置为'SigmoidFocalLoss',同时需要调整参数`rpn_focal_loss_alpha`、`rpn_focal_loss_gamma`、`rpn_batch_size_per_im`、`rpn_fg_fraction`的设置。
+
+## (8) 位置回归损失函数选择
+
+RCNN部分的位置回归损失函数除了'SmoothL1Loss'以外,还可以选择'CIoULoss',使用方式在定义模型[FasterRCNN](https://paddlex.readthedocs.io/zh_CN/develop/apis/models/detection.html#paddlex-det-fasterrcnn)类时设置参数`rcnn_bbox_loss`即可。在本案例中,选择'CIoULoss'并没有带来精度收益,故还是选择'SmoothL1Loss'。其他质检场景下,也可尝试使用'CIoULoss'。
+
+## (9) 正负样本采样方式选择
+
+当目标物体的区域只占图像的一小部分时,可以考虑采用[LibraRCNN](https://arxiv.org/abs/1904.02701)中提出的IoU-balanced Sampling采样方式来获取更多的难分负样本。使用方式在定义模型[FasterRCNN](https://paddlex.readthedocs.io/zh_CN/develop/apis/models/detection.html#paddlex-det-fasterrcnn)类时将参数`bbox_assigner`设置为'LibraBBoxAssigner'即可。
+
+## (10) 预处理对比度增强
+
+工业界常用灰度相机采集图片,会存在目标与周围背景对比度不明显而无法被检测出的情况。在这种情况下,可以在定义预处理的时候使用[paddlex.det.transforms.CLAHE](https://paddlex.readthedocs.io/zh_CN/develop/apis/transforms/det_transforms.html#clahe)对灰度图像的对比度进行增强。
+
+灰度图:
+
+![](../../../examples/industrial_quality_inspection/image/before_clahe.png)
+
+对比度增加后的灰度图:
+
+![](../../../examples/industrial_quality_inspection/image/after_clahe.png)
+
+## (11) 样本生成
+
+对于数量较少的类别或者小目标,可以通过把这些目标物体粘贴在背景图片上来生成新的图片和标注文件,并把这些新的样本加入到训练中从而提升模型精度。目前PaddleX提供了实现该功能的接口,详细见[paddlex.det.paste_objects](https://paddlex.readthedocs.io/zh_CN/develop/apis/tools.html#paddlex-det-paste-objects),需要注意的是,前景目标颜色与背景颜色差异较大时生成的新图片才会比较逼真。

+ 14 - 0
docs/examples/industrial_quality_inspection/dataset.md

@@ -0,0 +1,14 @@
+# 天池铝材表面缺陷检测初赛数据集示例
+
+| 序号 | 瑕疵名称 | 瑕疵成因 | 瑕疵图片示例 | 图片数量 |
+| -- | -- | -- | -- | -- |
+| 1 | 擦花(擦伤)| 表面处理(喷涂)后有轻微擦到其它的东西,造成痕迹 | ![](https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/datasets/ca_hua_example.png) | 128 |
+| 2 | 杂色 | 喷涂换颜料的时候,装颜料的容器未清洗干净,造成喷涂时有少量其它颜色掺入 | ![](https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/datasets/za_se_example.png) |365 |
+| 3 | 漏底 | 喷粉效果不好,铝材大量底色露出 | ![](https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/datasets/lou_di_example.png) | 538 |
+| 4 | 不导电 | 直接喷不到铝材表面上去 | ![](https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/datasets/bu_dao_dian_example.png) | 390 |
+|5 | 桔皮 | 表面处理后涂层表面粗糙,大颗粒 | ![](https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/datasets/ju_pi_example.png) | 173 |
+| 6 | 喷流| 喷涂时油漆稀从上流下来,有流动痕迹 | ![](https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/datasets/pen_liu_example.png) | 86 |
+| 7 |漆泡 | 喷涂后表面起泡,小而多| ![](https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/datasets/qi_pao_example.png) | 82 |
+| 8 | 起坑 | 型材模具问题,做出来的型材一整条都有一条凹下去的部分 | ![](https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/datasets/qi_keng_example.png.png) | 407 |
+| 9 | 脏点 | 表面处理时,有灰尘或一些脏东西未能擦掉,导致涂层有颗粒比较突出 | ![](https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/datasets/zang_dian_example.png) | 261 |
+| 10 | 角位漏底 | 在型材角落出现的露底 | ![](https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/datasets/jiao_wei_lou_di_example.png) | 346 |

+ 116 - 0
docs/examples/industrial_quality_inspection/gpu_solution.md

@@ -0,0 +1,116 @@
+# GPU端最终解决方案
+
+本案例面向GPU端的最终方案是选择二阶段检测模型FasterRCNN,其骨干网络选择加入了可变形卷积(DCN)的ResNet50_vd,训练时使用SSLD蒸馏方案训练得到的ResNet50_vd预训练模型,FPN部分的通道数量设置为64,训练阶段数据增强策略采用RandomHorizontalFlip、RandomDistort、RandomCrop,并加入背景图片,测试阶段的RPN部分做非极大值抑制计算的候选框数量由原本的6000减少至500、做完非极大值抑制后保留的候选框数量由原本的1000减少至300。
+
+在Tesla P40的Linux系统下,对于输入大小是800 x 1333的模型,图像预处理时长为30ms/image,模型的推理时间为46.08ms/image,包括输入数据拷贝至GPU的时间、计算时间、数据拷贝至CPU的时间。
+
+| 模型 | VOC mAP (%) | 推理时间 (ms/image) |
+| -- | -- | -- |
+| FasterRCNN-ResNet50_vd_ssld | 81.05 | 48.62 |
+| + dcn | 88.09 | 66.51 |
+| + RandomHorizontalFlip/RandomDistort/RandomCrop | 90.23| 66.51 |
+| + background images | 88.87 | 66.51 |
+| + fpn channel=64 | 87.79 | 48.65 |
+| + test proposal=pre/post topk 500/300 | 87.72 | 46.08 |
+
+## 前置依赖
+
+* PaddlePaddle >= 1.8.0
+* Python >= 3.5
+* PaddleX >= 1.3.0
+
+## 模型训练
+
+### (1) 下载PaddleX源码
+
+```
+git clone https://github.com/PaddlePaddle/PaddleX
+
+cd PaddleX/examples/industrial_quality_inspection
+```
+
+### (2) 下载数据集
+
+因数据集较大,可运行以下代码提前将数据集下载并解压。训练代码中也会自动下载数据集,所以这一步不是必须的。
+
+```
+wget https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/datasets/aluminum_inspection.tar.gz
+tar xvf aluminum_inspection.tar.gz
+```
+### (3) 下载预先训练好的模型
+
+如果不想再次训练模型,可以直接下载已经训练好的模型完成后面的模型测试和部署推理:
+
+```
+wget https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/models/faster_rcnn_r50_vd_dcn.tar.gz
+tar xvf faster_rcnn_r50_vd_dcn.tar.gz
+```
+### (4) 训练
+
+运行以下代码进行模型训练,代码会自动下载数据集,如若事先下载了数据集,需将下载和解压铝材缺陷检测数据集的相关行注释掉。代码中默认使用0,1,2,3,4号GPU训练,可根据实际情况设置卡号并调整`batch_size`和`learning_rate`。
+
+```
+python train_rcnn.py
+```
+
+### (5) 分析预测错误的原因
+
+在模型迭代过程中,运行以下代码可完成模型效果的分析并生成分析结果图表:
+
+```
+python error_analysis.py
+```
+
+可参考[精度优化部分的模型效果分析](./accuracy_improvement.md#2-%E6%A8%A1%E5%9E%8B%E6%95%88%E6%9E%9C%E5%88%86%E6%9E%90)来理解当前模型预测错误的原因。
+
+运行以下代码,生成可视化真值和预测结果的对比图以进一步理解模型效果,代码中的置信度阈值可根据实际情况进行调整。
+
+```
+python compare.py
+```
+
+![](../../../examples/industrial_quality_inspection/image/compare_budaodian-116.jpg)
+
+左边是可视化真值,右边是可视化预测结果。
+
+### (6) 统计图片级召回率和误检率
+
+模型迭代完成后,计算不同置信度阈值下[图片级召回率](./accuracy_improvement.md#6-%E8%83%8C%E6%99%AF%E5%9B%BE%E7%89%87%E5%8A%A0%E5%85%A5)和[图片级误检率](./accuracy_improvement.md#6-%E8%83%8C%E6%99%AF%E5%9B%BE%E7%89%87%E5%8A%A0%E5%85%A5),找到符合要求的召回率和误检率,对应的置信度阈值用于后续模型预测阶段。
+
+```
+python cal_tp_fp.py
+```
+
+执行后会生成图表`image-level_tp_fp.png`和文件`tp_fp_list.txt`,示意如下:
+
+图表`image-level_tp_fp.png`:
+
+![](../../../examples/industrial_quality_inspection/image/image-level_tp_fp.png)
+
+文件[tp_fp_list.txt](tp_fp_list.md)
+
+图表`image-level_tp_fp.png`中左边子图,横坐标表示不同置信度阈值下计算得到的图片级召回率,纵坐标表示各图片级召回率对应的图片级误检率。右边子图横坐标表示图片级召回率,纵坐标表示各图片级召回率对应的置信度阈值。从图表中可较为直观地看出当前模型的图片级召回率和误检率的量级,从文件`tp_fp_list.txt`可以查到具体数值,例如在图片级召回率/图片级误检率为[0.9589,0.0074]这一组符合要求,就将对应的置信度阈值0.90选取为后续预测推理的阈值。
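+
+根据事先设定的召回率/误检率要求,从`tp_fp_list.txt`中自动挑选置信度阈值的过程可以用如下脚本示意(其中指标要求与文件路径均为假设值):
+
+```
+# 假设要求:图片级召回率不低于0.95,图片级误检率不高于0.01
+min_recall, max_fp = 0.95, 0.01
+
+chosen = None
+with open('visualize/faster_rcnn_r50_vd_dcn/tp_fp_list.txt') as f:
+    for line in f.readlines()[2:]:  # 跳过表头两行
+        score, recall, fp = [float(x) for x in line.strip().strip('|').split('|')]
+        if recall >= min_recall and fp <= max_fp:
+            chosen = (score, recall, fp)
+            break
+
+# 以本例的数值为例,会选出阈值0.90(召回率0.9589,误检率0.0074)
+if chosen is not None:
+    print('score threshold: {}, recall: {}, false-positive rate: {}'.format(*chosen))
+```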
+
+### (7) 模型测试
+
+测试集因没有标注文件,这里单独下载测试集图片:
+
+```
+wget https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/datasets/aluminum_inspection_test.tar.gz
+tar xvf aluminum_inspection_test.tar.gz
+```
+
+加载训练好的模型,使用(6)中选取的置信度阈值0.90对验证集图片或者测试集图片进行预测:
+
+```
+python predict.py
+```
+可视化预测结果示例如下:
+
+![](../../../examples/industrial_quality_inspection/image/visualize_budaodian-116.jpg)
+
+## 推理部署
+
+本案例采用C++部署方式将模型部署在Tesla P40的Linux系统下,具体的C++部署流程请参考文档[PaddleX模型多端安全部署/C++部署](https://paddlex.readthedocs.io/zh_CN/develop/deploy/server/cpp/index.html)。
+
+对于输入大小是800 x 1333的模型,图像预处理时长为30ms/image。值得一提的是预处理中的Normalize操作比较耗时,因此在设置预处理操作时,可以先进行Resize操作再做Normalize。模型的推理时间为46.08ms/image,包括输入数据拷贝至GPU的时间、计算时间、数据拷贝至CPU的时间。
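+
+例如,验证/部署所用的预处理可以按如下顺序定义(与本案例`train_rcnn.py`中的`eval_transforms`一致,仅为示意):
+
+```
+from paddlex.det import transforms
+
+# 先Resize再Normalize,Normalize需要处理的像素更少,预处理耗时更低
+eval_transforms = transforms.Compose([
+    transforms.ResizeByShort(short_size=800, max_size=1333),
+    transforms.Normalize(),
+    transforms.Padding(coarsest_stride=32)
+])
+```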

+ 102 - 0
docs/examples/industrial_quality_inspection/tp_fp_list.md

@@ -0,0 +1,102 @@
+| score | recall rate | false-positive rate |
+| -- | -- | -- |
+| 0.000000 | 0.982877 | 0.022222 |
+| 0.010000 | 0.982877 | 0.022222 |
+| 0.020000 | 0.982877 | 0.022222 |
+| 0.030000 | 0.982877 | 0.022222 |
+| 0.040000 | 0.982877 | 0.022222 |
+| 0.050000 | 0.982877 | 0.022222 |
+| 0.060000 | 0.982877 | 0.022222 |
+| 0.070000 | 0.979452 | 0.022222 |
+| 0.080000 | 0.979452 | 0.022222 |
+| 0.090000 | 0.979452 | 0.022222 |
+| 0.100000 | 0.979452 | 0.022222 |
+| 0.110000 | 0.979452 | 0.022222 |
+| 0.120000 | 0.979452 | 0.022222 |
+| 0.130000 | 0.979452 | 0.022222 |
+| 0.140000 | 0.976027 | 0.022222 |
+| 0.150000 | 0.976027 | 0.022222 |
+| 0.160000 | 0.976027 | 0.022222 |
+| 0.170000 | 0.976027 | 0.022222 |
+| 0.180000 | 0.976027 | 0.022222 |
+| 0.190000 | 0.976027 | 0.022222 |
+| 0.200000 | 0.976027 | 0.022222 |
+| 0.210000 | 0.976027 | 0.022222 |
+| 0.220000 | 0.976027 | 0.022222 |
+| 0.230000 | 0.976027 | 0.022222 |
+| 0.240000 | 0.976027 | 0.022222 |
+| 0.250000 | 0.976027 | 0.022222 |
+| 0.260000 | 0.976027 | 0.014815 |
+| 0.270000 | 0.976027 | 0.014815 |
+| 0.280000 | 0.976027 | 0.014815 |
+| 0.290000 | 0.976027 | 0.014815 |
+| 0.300000 | 0.976027 | 0.014815 |
+| 0.310000 | 0.976027 | 0.014815 |
+| 0.320000 | 0.976027 | 0.014815 |
+| 0.330000 | 0.976027 | 0.014815 |
+| 0.340000 | 0.976027 | 0.014815 |
+| 0.350000 | 0.976027 | 0.014815 |
+| 0.360000 | 0.976027 | 0.014815 |
+| 0.370000 | 0.976027 | 0.014815 |
+| 0.380000 | 0.976027 | 0.014815 |
+| 0.390000 | 0.976027 | 0.014815 |
+| 0.400000 | 0.976027 | 0.014815 |
+| 0.410000 | 0.976027 | 0.014815 |
+| 0.420000 | 0.976027 | 0.014815 |
+| 0.430000 | 0.976027 | 0.014815 |
+| 0.440000 | 0.972603 | 0.014815 |
+| 0.450000 | 0.972603 | 0.014815 |
+| 0.460000 | 0.972603 | 0.014815 |
+| 0.470000 | 0.972603 | 0.014815 |
+| 0.480000 | 0.972603 | 0.014815 |
+| 0.490000 | 0.972603 | 0.014815 |
+| 0.500000 | 0.972603 | 0.014815 |
+| 0.510000 | 0.972603 | 0.014815 |
+| 0.520000 | 0.972603 | 0.014815 |
+| 0.530000 | 0.972603 | 0.014815 |
+| 0.540000 | 0.972603 | 0.014815 |
+| 0.550000 | 0.972603 | 0.014815 |
+| 0.560000 | 0.969178 | 0.014815 |
+| 0.570000 | 0.969178 | 0.014815 |
+| 0.580000 | 0.969178 | 0.014815 |
+| 0.590000 | 0.969178 | 0.014815 |
+| 0.600000 | 0.969178 | 0.014815 |
+| 0.610000 | 0.969178 | 0.014815 |
+| 0.620000 | 0.969178 | 0.014815 |
+| 0.630000 | 0.969178 | 0.014815 |
+| 0.640000 | 0.969178 | 0.014815 |
+| 0.650000 | 0.969178 | 0.014815 |
+| 0.660000 | 0.969178 | 0.014815 |
+| 0.670000 | 0.969178 | 0.014815 |
+| 0.680000 | 0.969178 | 0.014815 |
+| 0.690000 | 0.969178 | 0.014815 |
+| 0.700000 | 0.969178 | 0.014815 |
+| 0.710000 | 0.969178 | 0.014815 |
+| 0.720000 | 0.969178 | 0.014815 |
+| 0.730000 | 0.969178 | 0.014815 |
+| 0.740000 | 0.969178 | 0.014815 |
+| 0.750000 | 0.969178 | 0.014815 |
+| 0.760000 | 0.969178 | 0.014815 |
+| 0.770000 | 0.965753 | 0.014815 |
+| 0.780000 | 0.965753 | 0.014815 |
+| 0.790000 | 0.965753 | 0.014815 |
+| 0.800000 | 0.962329 | 0.014815 |
+| 0.810000 | 0.962329 | 0.014815 |
+| 0.820000 | 0.962329 | 0.014815 |
+| 0.830000 | 0.962329 | 0.014815 |
+| 0.840000 | 0.962329 | 0.014815 |
+| 0.850000 | 0.958904 | 0.014815 |
+| 0.860000 | 0.958904 | 0.014815 |
+| 0.870000 | 0.958904 | 0.014815 |
+| 0.880000 | 0.958904 | 0.014815 |
+| 0.890000 | 0.958904 | 0.014815 |
+| 0.900000 | 0.958904 | 0.007407 |
+| 0.910000 | 0.958904 | 0.007407 |
+| 0.920000 | 0.958904 | 0.007407 |
+| 0.930000 | 0.955479 | 0.007407 |
+| 0.940000 | 0.955479 | 0.007407 |
+| 0.950000 | 0.955479 | 0.007407 |
+| 0.960000 | 0.955479 | 0.007407 |
+| 0.970000 | 0.955479 | 0.007407 |
+| 0.980000 | 0.941781 | 0.000000 |
+| 0.990000 | 0.893836 | 0.000000 |

+ 1 - 0
docs/index.rst

@@ -64,6 +64,7 @@ PaddleX是基于飞桨核心框架、开发套件和工具组件的深度学习
    examples/remote_sensing.md
    examples/multi-channel_remote_sensing/README.md
    examples/change_detection.md
+   examples/industrial_quality_inspection/README.md
 
 .. toctree::
    :maxdepth: 1

+ 99 - 0
examples/industrial_quality_inspection/README.md

@@ -0,0 +1,99 @@
+# 工业质检
+
+本案例面向工业质检场景里的铝材表面缺陷检测,提供了针对GPU端和CPU端两种部署场景下基于PaddleX的解决方案,希望通过梳理优化模型精度和性能的思路能帮助用户更高效地解决实际质检应用中的问题。
+
+## 1. GPU端解决方案
+
+### 1.1 数据集介绍
+
+本案例使用[天池铝材表面缺陷检测初赛](https://tianchi.aliyun.com/competition/entrance/231682/introduction)数据集,共有3005张图片,分别检测擦花、杂色、漏底、不导电、桔皮、喷流、漆泡、起坑、脏点和角位漏底10种缺陷,这10种缺陷的定义和示例可点击文档[天池铝材表面缺陷检测初赛数据集示例](./dataset.md)查看。
+
+将这3005张图片按9:1随机切分成2713张图片的训练集和292张图片的验证集。
+
+### 1.2 精度优化
+
+本小节侧重展示在模型迭代过程中优化精度的思路,在本案例中,有些优化策略获得了精度收益,而有些没有。在其他质检场景中,可根据实际情况尝试这些优化策略。点击文档[精度优化](./accuracy_improvement.md)查看。
+
+### 1.3 性能优化
+
+在完成模型精度优化之后,从以下两个方面对模型进行加速:
+
+#### (1) 减少FPN部分的通道数量
+
+将FPN部分的通道数量由原本的256减少至64,使用方式在定义模型[FasterRCNN](https://paddlex.readthedocs.io/zh_CN/develop/apis/models/detection.html#paddlex-det-fasterrcnn)类时设置参数`fpn_num_channels`为64即可,需要重新对模型进行训练。
+
+#### (2) 减少测试阶段的候选框数量
+
+将测试阶段RPN部分做非极大值抑制计算的候选框数量由原本的6000减少至500,将RPN部分做完非极大值抑制后保留的候选框数量由原本的1000减少至300。使用方式在定义模型[FasterRCNN](https://paddlex.readthedocs.io/zh_CN/develop/apis/models/detection.html#paddlex-det-fasterrcnn)类时设置参数`test_pre_nms_top_n`为500,`test_post_nms_top_n`为300。
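+
+这两项加速设置对应的模型定义方式示意如下(与本案例`train_rcnn.py`中的定义一致,`num_classes`取值仅为示例):
+
+```
+import paddlex as pdx
+
+num_classes = 11  # 目标类别数 + 1类背景,仅为示例取值
+model = pdx.det.FasterRCNN(
+    num_classes=num_classes,
+    backbone='ResNet50_vd_ssld',
+    with_dcn=True,
+    with_fpn=True,
+    fpn_num_channels=64,          # (1) 减少FPN通道数量
+    test_pre_nms_top_n=500,       # (2) 减少测试阶段NMS前的候选框数量
+    test_post_nms_top_n=300)      # (2) 减少测试阶段NMS后保留的候选框数量
+```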
+
+采用Fluid C++预测引擎在Tesla P40上测试模型的推理时间(输入数据拷贝至GPU的时间、计算时间、数据拷贝至CPU的时间),输入大小设置为800x1333,加速前后推理时间如下表所示:
+
+| 模型 | 推理时间 (ms/image)| VOC mAP (%) |
+| -- | -- | -- |
+| baseline | 66.51 | 88.87 |
+| + fpn channel=64 + test proposal=pre/post topk 500/300 | 46.08 | 87.72 |
+
+### 1.4 最终方案
+
+本案例面向GPU端的最终方案是选择二阶段检测模型FasterRCNN,其骨干网络选择加入了可变形卷积(DCN)的ResNet50_vd,训练时使用SSLD蒸馏方案训练得到的ResNet50_vd预训练模型,FPN部分的通道数量设置为64。使用复核过的数据集,训练阶段数据增强策略采用RandomHorizontalFlip、RandomDistort、RandomCrop,并加入背景图片。测试阶段的RPN部分做非极大值抑制计算的候选框数量由原本的6000减少至500、做完非极大值抑制后保留的候选框数量由原本的1000减少至300。模型在验证集上的VOC mAP为87.72%。
+
+在Tesla P40的Linux系统下,对于输入大小是800 x 1333的模型,图像预处理时长为30ms/image,模型的推理时间为46.08ms/image,包括输入数据拷贝至GPU的时间、计算时间、数据拷贝至CPU的时间。
+
+| 模型 | VOC mAP (%) | 推理时间 (ms/image) |
+| -- | -- | -- |
+| FasterRCNN-ResNet50_vd_ssld | 81.05 | 48.62 |
+| + dcn | 88.09 | 66.51 |
+| + RandomHorizontalFlip/RandomDistort/RandomCrop | 90.23| 66.51 |
+| + background images | 88.87 | 66.51 |
+| + fpn channel=64 | 87.79 | 48.65 |
+| + test proposal=pre/post topk 500/300 | 87.72 | 46.08 |
+
+具体的训练和部署流程点击文档[GPU端最终解决方案](./gpu_solution.md)进行查看。
+
+## 2. CPU端解决方案
+
+为了实现高效的模型推理,面向CPU端的模型选择精度和效率皆优的单阶段检测模型YOLOv3,骨干网络选择基于PaddleClas中SSLD蒸馏方案训练得到的MobileNetv3_large。训练完成后,对模型做剪裁操作,以提升模型的性能。模型在验证集上的VOC mAP为79.02%。
+
+部署阶段,借助OpenVINO预测引擎完成在Intel(R) Core(TM) i9-9820X CPU @ 3.30GHz Windows系统下高效推理。对于输入大小是608 x 608的模型,图像预处理时长为38.69 ms/image,模型的推理时间为34.50ms/image。
+
+| 模型 | VOC mAP (%) | Inference Speed (ms/image) |
+| -- | -- | -- |
+| YOLOv3-MobileNetv3_ssld | 78.52 | 56.71 |
+| pruned YOLOv3-MobileNetv3_ssld | 79.02 | 34.50 |
+
+### 模型训练
+
+[环境前置依赖](./gpu_solution.md#%E5%89%8D%E7%BD%AE%E4%BE%9D%E8%B5%96)、[下载PaddleX源码](./gpu_solution.md#1-%E4%B8%8B%E8%BD%BDpaddlex%E6%BA%90%E7%A0%81)、[下载数据集](./gpu_solution.md#2-%E4%B8%8B%E8%BD%BD%E6%95%B0%E6%8D%AE%E9%9B%86)与GPU端是一样的,可点击文档[GPU端最终解决方案](./gpu_solution.md)查看,在此不做赘述。
+
+如果不想再次训练模型,可以直接下载已经训练好的模型完成后面的模型测试和部署推理:
+
+```
+wget https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/models/yolov3_mobilenetv3_large_pruned.tar.gz
+tar xvf yolov3_mobilenetv3_large_pruned.tar.gz
+```
+
+运行以下代码进行模型训练,代码会自动下载数据集,如若事先下载了数据集,需将下载和解压铝材缺陷检测数据集的相关行注释掉。代码中默认使用0,1,2,3,4,5,6,7号GPU训练,可根据实际情况设置卡号并调整`batch_size`和`learning_rate`。
+
+```
+python train_yolov3.py
+```
+
+### 模型剪裁
+
+运行以下代码,分析在不同的精度损失下模型各层的剪裁比例:
+
+```
+python params_analysis.py
+```
+
+设置可允许的精度损失为0.05,对模型进行剪裁,剪裁后需要重新训练模型:
+
+```
+python train_pruned_yolov3.py
+```
+
+[分析预测错误的原因](./gpu_solution.md#4-%E5%88%86%E6%9E%90%E9%A2%84%E6%B5%8B%E9%94%99%E8%AF%AF%E7%9A%84%E5%8E%9F%E5%9B%A0)、[统计图片级召回率和误检率](./gpu_solution.md#5-%E7%BB%9F%E8%AE%A1%E5%9B%BE%E7%89%87%E7%BA%A7%E5%8F%AC%E5%9B%9E%E7%8E%87%E5%92%8C%E8%AF%AF%E6%A3%80%E7%8E%87)、[模型测试](./gpu_solution.md#6-%E6%A8%A1%E5%9E%8B%E6%B5%8B%E8%AF%95)这些步骤与GPU端是一样的,可点击文档[GPU端最终解决方案](./gpu_solution.md)查看,在此不做赘述。
+
+### 推理部署
+
+本案例采用C++部署方式,通过OpenVINO将模型部署在Intel(R) Core(TM) i9-9820X CPU @ 3.30GHz的Windows系统下,具体的部署流程请参考文档[PaddleX模型多端安全部署/OpenVINO部署](https://paddlex.readthedocs.io/zh_CN/develop/deploy/openvino/index.html)。

+ 93 - 0
examples/industrial_quality_inspection/accuracy_improvement.md

@@ -0,0 +1,93 @@
+# 精度优化
+
+本小节侧重展示在模型迭代过程中优化精度的思路,在本案例中,有些优化策略获得了精度收益,而有些没有。在其他质检场景中,可根据实际情况尝试这些优化策略。
+
+## (1) 基线模型选择
+
+相较于单阶段检测模型,二阶段检测模型的精度更高但是速度更慢。考虑到是部署到GPU端,本案例选择二阶段检测模型FasterRCNN作为基线模型,其骨干网络选择ResNet50_vd,并使用基于PaddleClas中SSLD蒸馏方案训练得到的ResNet50_vd预训练模型(ImageNet1k验证集上Top1 Acc为82.39%)。训练完成后,模型在验证集上的精度VOC mAP为73.36%。
+
+## (2) 模型效果分析
+
+使用PaddleX提供的[paddlex.det.coco_error_analysis](https://paddlex.readthedocs.io/zh_CN/develop/apis/visualize.html#paddlex-det-coco-error-analysis)接口对模型在验证集上预测错误的原因进行分析,分析结果以图表的形式展示如下:
+
+| all classes| 擦花 | 杂色 | 漏底 | 不导电 | 桔皮 | 喷流 | 漆泡 | 起坑 | 脏点 | 角位漏底 |
+| -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- |
+| ![](image/allclasses_analysis_example.png) | ![](image/cahua_analysis_example.png) | ![](image/zase_analysis_example.png) | ![](image/loudi_analysis_example.png) | ![](image/budaodian_analysis_example.png) | ![](image/jupi_analysis_example.png) | ![](image/penliu_analysis_example.png) | ![](image/qipao_analysis_example.png) | ![](image/qikeng_analysis_example.png) | ![](image/zangdian_analysis_example.png) | ![](image/jiaoweiloudi_analysis_example.png) |
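+
+上述分析图表基于训练过程中保存的`eval_details.json`生成,调用方式示意如下(与本案例的`error_analysis.py`脚本一致):
+
+```
+import paddlex as pdx
+
+# eval_details.json由训练时对验证集的评估结果保存而来
+eval_details_file = 'output/faster_rcnn_r50_vd_dcn/best_model/eval_details.json'
+pdx.det.coco_error_analysis(
+    eval_details_file, save_dir='visualize/faster_rcnn_r50_vd_dcn')
+```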
+
+分析图表展示了7条Precision-Recall(PR)曲线,每一条曲线表示的Average Precision (AP)比它左边那条高,原因是逐步放宽了评估要求。以擦花类为例,各条PR曲线的评估要求解释如下:
+
+* C75: 在IoU设置为0.75时的PR曲线, AP为0.001。
+* C50: 在IoU设置为0.5时的PR曲线,AP为0.622。C50与C75之间的白色区域面积代表将IoU从0.75放宽至0.5带来的AP增益。
+* Loc: 在IoU设置为0.1时的PR曲线,AP为0.740。Loc与C50之间的蓝色区域面积代表将IoU从0.5放宽至0.1带来的AP增益。蓝色区域面积越大,表示越多的检测框位置不够精准。
+* Sim: 在Loc的基础上,如果检测框与真值框的类别不相同,但两者同属于一个亚类,则不认为该检测框是错误的,在这种评估要求下的PR曲线, AP为0.742。Sim与Loc之间的红色区域面积越大,表示子类间的混淆程度越高。VOC格式的数据集所有的类别都属于同一个亚类。
+* Oth: 在Sim的基础上,如果检测框与真值框的亚类不相同,则不认为该检测框是错误的,在这种评估要求下的PR曲线,AP为0.742。Oth与Sim之间的绿色区域面积越大,表示亚类间的混淆程度越高。VOC格式的数据集中所有的类别都属于同一个亚类,故不存在亚类间的混淆。
+* BG: 在Oth的基础上,背景区域上的检测框不认为是错误的,在这种评估要求下的PR曲线,AP为0.921。BG与Oth之间的紫色区域面积越大,表示背景区域被误检的数量越多。
+* FN: 在BG的基础上,漏检的真值框不认为是错误的,在这种评估要求下的PR曲线,AP为1.00。FN与BG之间的橙色区域面积越大,表示漏检的真值框数量越多。
+
+从分析图表中可以看出,杂色、桔皮、起坑三类检测效果较好,角位漏底存在少许检测框没有达到IoU 0.5的情况,问题较多的是擦花、不导电、喷流、漆泡、脏点。擦花类最严重的问题是误检、位置不精准、漏检,不导电类最严重的问题是漏检、位置不精准,喷流类和漆泡类最严重的问题是位置不精准、误检,脏点类最严重的问题是误检、漏检。为进一步理解造成这些问题的原因,将验证集上的预测结果进行了可视化,然后发现数据集标注存在以下问题:
+
+* 轻微的缺陷不视为缺陷,但轻微的界定不明确,有些轻微的缺陷被标注了,造成误检较多
+* 不导电、漏底、角位漏底外观极其相似,肉眼难以区分,导致这三类极其容易混淆而使得评估时误检或漏检的产生
+* 有些轻微的擦花和脏点被标注了,有些明显的反而没有被标注,造成了这两类误检和漏检情况都较为严重
+* 喷流和漆泡多为连续性的缺陷,一处喷流,其水平线上还会有其他喷流,一个气泡,其水平线上还会有一排气泡。但有时候把这些连续性的缺陷标注成一个目标,有时候又单独地标注不同的部分。导致模型有时候检测单独部分,有时候又检测整体,造成这两类位置不精准、误检较多。
+
+## (3) 数据复核
+
+为了减少原始数据标注的诸多问题对模型优化的影响,需要对数据进行复核。复核准则示例如下:
+
+* 擦花:不明显的擦花不标注,面状擦花以同一个框表示,条状擦花一条擦花以一个框表示
+* 漏底、角位漏底、不导电:三者过于相似,归为一类
+* 桔皮:忽略不是大颗粒状的表面
+* 喷流:明显是一条喷流的就用一个框表示,不是的话用多个框表示
+* 漆泡:不要单独标一个点,一连串点标一个框
+* 脏点:忽略轻微脏点
+
+对数据集复核并重新标注后,将FasterRCNN-ResNet50_vd_ssld重新在训练集上进行训练,模型在验证集上的VOC mAP为81.05%。
+
+## (4) 可变形卷积加入
+
+由于喷流、漆泡的形态不规则,导致这两类的很多预测框位置不精准。为了解决该问题,选择在骨干网络ResNet50_vd中使用可变形卷积(DCN)。重新训练后,模型在验证集上的VOC mAP为88.09%,喷流的VOC AP由57.3%提升至78.7%,漆泡的VOC AP由74.7%提升至96.7%。
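+
+在本案例中,可变形卷积通过FasterRCNN的`with_dcn`参数开启(与`train_rcnn.py`一致,`num_classes`取值仅为示例):
+
+```
+import paddlex as pdx
+
+num_classes = 11  # 目标类别数 + 1类背景,仅为示例取值
+model = pdx.det.FasterRCNN(
+    num_classes=num_classes,
+    backbone='ResNet50_vd_ssld',
+    with_fpn=True,
+    with_dcn=True)  # 在骨干网络中使用可变形卷积
+```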
+
+## (5) 数据增强选择
+
+在(4)的基础上,选择加入一些数据增强策略来进一步提升模型的精度。本案例选择同时使用[RandomHorizontalFlip](https://paddlex.readthedocs.io/zh_CN/develop/apis/transforms/det_transforms.html#randomhorizontalflip)、[RandomDistort](https://paddlex.readthedocs.io/zh_CN/develop/apis/transforms/det_transforms.html#randomdistort)、[RandomCrop](https://paddlex.readthedocs.io/zh_CN/develop/apis/transforms/det_transforms.html#randomcrop)这三种数据增强方法,重新训练后的模型在验证集上的VOC mAP为90.23%。
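+
+对应的训练预处理定义示意如下(与本案例`train_rcnn.py`中的`train_transforms`一致):
+
+```
+from paddlex.det import transforms
+
+# 在训练预处理中加入三种数据增强
+train_transforms = transforms.Compose([
+    transforms.RandomDistort(),
+    transforms.RandomCrop(),
+    transforms.RandomHorizontalFlip(),
+    transforms.ResizeByShort(short_size=[800], max_size=1333),
+    transforms.Normalize(mean=[0.5], std=[0.5]),
+    transforms.Padding(coarsest_stride=32)
+])
+```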
+
+除此之外,还可以尝试的数据增强方式有[MultiScaleTraining](https://paddlex.readthedocs.io/zh_CN/develop/apis/transforms/det_transforms.html#resizebyshort)、[RandomExpand](https://paddlex.readthedocs.io/zh_CN/develop/apis/transforms/det_transforms.html#randomexpand)。本案例使用的铝材表面缺陷检测数据集中,同一个类别的尺度变化不大,使用MultiScaleTraining或者RandomExpand反而使得原始数据分布发生改变。对此,本案例也进行了实验验证,使用RandomHorizontalFlip + RandomDistort + RandomCrop + MultiScaleTraining数据增强方式训练得到的模型在验证集上的VOC mAP为87.15%,使用RandomHorizontalFlip + RandomDistort + RandomCrop + RandomExpand数据增强方式训练得到的模型在验证集上的VOC mAP为88.56%。
+
+## (6) 背景图片加入
+
+本案例将数据集中提供的背景图片按9:1切分成了1116张、135张两部分,并使用(5)中训练好的模型在135张背景图片上进行测试,发现图片级误检率高达21.5%。为了降低模型的误检率,使用[paddlex.datasets.VOCDetection.add_negative_samples](https://paddlex.readthedocs.io/zh_CN/develop/apis/datasets.html#add-negative-samples)接口将1116张背景图片加入到原本的训练集中,重新训练后图片级误检率降低至4%。为了不让训练被背景图片主导,本案例通过将`train_list.txt`中的文件路径多写了一遍,从而增加有目标图片的占比。
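+
+背景图片的加入方式示意如下(与本案例`train_rcnn.py`一致,`train_transforms`为上文(5)中定义的训练预处理):
+
+```
+import paddlex as pdx
+
+# train_list.txt中有目标图片的路径可重复书写,以提高有目标图片在训练中的占比
+train_dataset = pdx.datasets.VOCDetection(
+    data_dir='aluminum_inspection',
+    file_list='aluminum_inspection/train_list.txt',
+    label_list='aluminum_inspection/labels.txt',
+    transforms=train_transforms,
+    shuffle=True)
+
+# 将1116张背景图片加入训练集
+train_dataset.add_negative_samples(
+    image_dir='./aluminum_inspection/train_wu_xia_ci')
+```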
+
+| 模型 | VOC mAP (%) | 有缺陷图片级召回率 | 背景图片级误检率 |
+| -- | -- | -- | -- |
+| FasterRCNN-ResNet50_vd_ssld + DCN + RandomHorizontalFlip + RandomDistort + RandomCrop | 90.23 | 95.5 | 21.5 |
+| FasterRCNN-ResNet50_vd_ssld + DCN + RandomHorizontalFlip + RandomDistort + RandomCrop + 背景图片 | 88.87 | 95.2 | 4 |
+
+【名词解释】
+
+* 图片级别的召回率:只要在有目标的图片上检测出目标(不论框的个数),该图片被认为召回。批量有目标图片中被召回图片所占的比例,即为图片级别的召回率。
+* 图片级别的误检率:只要在无目标的图片上检测出目标(不论框的个数),该图片被认为误检。批量无目标图片中被误检图片所占的比例,即为图片级别的误检率。
+
+## (7) 分类损失函数选择
+
+除了(6)中提到的将背景图片加入训练以外,还可以将RPN部分的分类损失函数设置为`SigmoidFocalLoss`,从而将更多的anchors加入训练,并增加难分样本在损失函数中的比重,进而降低误检率。具体做法是在定义模型[FasterRCNN](https://paddlex.readthedocs.io/zh_CN/develop/apis/models/detection.html#paddlex-det-fasterrcnn)类时将参数`rpn_cls_loss`设置为'SigmoidFocalLoss',同时根据实际情况调整参数`rpn_focal_loss_alpha`、`rpn_focal_loss_gamma`、`rpn_batch_size_per_im`、`rpn_fg_fraction`的取值。
+
+## (8) 位置回归损失函数选择
+
+RCNN部分的位置回归损失函数除了'SmoothL1Loss'以外,还可以选择'CIoULoss',使用方式在定义模型[FasterRCNN](https://paddlex.readthedocs.io/zh_CN/develop/apis/models/detection.html#paddlex-det-fasterrcnn)类时设置参数`rcnn_bbox_loss`即可。在本案例中,选择'CIoULoss'并没有带来精度收益,故还是选择'SmoothL1Loss'。其他质检场景下,也可尝试使用'CIoULoss'。
+
+## (9) 正负样本采样方式选择
+
+当目标物体的区域只占图像的一小部分时,可以考虑采用[LibraRCNN](https://arxiv.org/abs/1904.02701)中提出的IoU-balanced Sampling采样方式来获取更多的难分负样本。使用方式在定义模型[FasterRCNN](https://paddlex.readthedocs.io/zh_CN/develop/apis/models/detection.html#paddlex-det-fasterrcnn)类时将参数`bbox_assigner`设置为'LibraBBoxAssigner'即可。
+
+## (10) 预处理对比度增强
+
+工业界常用灰度相机采集图片,会存在目标与周围背景对比度不明显而无法被检测出的情况。在这种情况下,可以在定义预处理的时候使用[paddlex.det.transforms.CLAHE](https://paddlex.readthedocs.io/zh_CN/develop/apis/transforms/det_transforms.html#clahe)对灰度图像的对比度进行增强。
+
+| 灰度图 | 对比度增加后的灰度图 |
+| -- | -- |
+| ![](./image/before_clahe.png) | ![](./image/after_clahe.png) |
+
+## (11) 样本生成
+
+对于数量较少的类别或者小目标,可以通过把这些目标物体粘贴在背景图片上来生成新的图片和标注文件,并把这些新的样本加入到训练中从而提升模型精度。目前PaddleX提供了实现该功能的接口,详细见[paddlex.det.paste_objects](https://paddlex.readthedocs.io/zh_CN/develop/apis/tools.html#paddlex-det-paste-objects),需要注意的是,前景目标颜色与背景颜色差异较大时生成的新图片才会比较逼真。

+ 214 - 0
examples/industrial_quality_inspection/cal_tp_fp.py

@@ -0,0 +1,214 @@
+# coding: utf8
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# 环境变量配置,用于控制是否使用GPU
+# 说明文档:https://paddlex.readthedocs.io/zh_CN/develop/appendix/parameters.html#gpu
+import argparse
+import os
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+
+import os.path as osp
+import numpy as np
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+import paddlex as pdx
+
+
+def cal_image_level_recall_rate(model, dataset_dir):
+    """计算置信度(Score)在[0, 1]内以间隔0.01递增取值时,模型在有目标的图像上的图片级召回率。
+
+    图片级召回率:只要在有目标的图片上检测出目标(不论框的个数),该图片被认为召回,
+       批量有目标图片中被召回的图片所占的比例,即为图片级别的召回率。
+
+    Args:
+        model (PaddleX model object): 已加载的PaddleX模型。
+        dataset_dir (str):数据集路径。
+
+    Returns:
+        numpy.array: 形状为101x1的数组,对应置信度从0到1按0.01递增取值时,计算所得图片级别的召回率。
+    """
+
+    print(
+        "Begin to calculate image-level recall rate of positive images. Please wait for a moment..."
+    )
+    file_list = osp.join(dataset_dir, 'val_list.txt')
+    tp = np.zeros((101, 1))
+    positive_num = 0
+    with open(file_list, 'r') as fr:
+        while True:
+            line = fr.readline()
+            if not line:
+                break
+            img_file, xml_file = [osp.join(dataset_dir, x) \
+                    for x in line.strip().split()[:2]]
+            if not osp.exists(img_file):
+                continue
+            if not osp.exists(xml_file):
+                continue
+
+            positive_num += 1
+            results = model.predict(img_file)
+            scores = list()
+            for res in results:
+                scores.append(res['score'])
+            if len(scores) > 0:
+                tp[0:int(np.round(max(scores) / 0.01)), 0] += 1
+    tp = tp / positive_num
+    return tp
+
+
+def cal_image_level_false_positive_rate(model, negative_dir):
+    """计算置信度(Score)在[0, 1]内以间隔0.01递增取值时,模型在无目标的图像上的图片级误检率。
+
+    图片级误检率:只要在无目标的图片上检测出目标(不论框的个数),该图片被认为误检,
+       批量无目标图片中被误检的图片所占的比例,即为图片级别的误检率。
+
+    Args:
+        model (PaddleX model object): 已加载的PaddleX模型。
+        negative_dir (str):无目标图片的文件夹路径。
+
+    Returns:
+        numpy.array: 形状为101x1的数组,对应置信度从0到1按0.01递增取值时,计算所得图片级别的误检率。
+    """
+
+    print(
+        "Begin to calculate image-level false positive rate of negative(background) images. Please wait for a moment..."
+    )
+    fp = np.zeros((101, 1))
+    negative_num = 0
+    for file in os.listdir(negative_dir):
+        file = osp.join(negative_dir, file)
+        results = model.predict(file)
+        negative_num += 1
+        scores = list()
+        for res in results:
+            scores.append(res['score'])
+        if len(scores) > 0:
+            fp[0:int(np.round(max(scores) / 0.01)), 0] += 1
+    fp = fp / negative_num
+    return fp
+
+
+def result2textfile(tp_list, fp_list, save_dir):
+    """将不同置信度阈值下的图片级召回率和图片级误检率保存为文本文件。
+
+    文本文件中内容按照| 置信度阈值 | 图片级召回率 | 图片级误检率 |的格式保存。
+
+    Args:
+        tp_list (numpy.array): 形状为101x1的数组,对应置信度从0到1按0.01递增取值时,计算所得图片级别的召回率。
+        fp_list (numpy.array): 形状为101x1的数组,对应置信度从0到1按0.01递增取值时,计算所得图片级别的误检率。
+        save_dir (str): 文本文件的保存路径。
+
+    """
+
+    tp_fp_list_file = osp.join(save_dir, 'tp_fp_list.txt')
+    with open(tp_fp_list_file, 'w') as f:
+        f.write("| score | recall rate | false-positive rate |\n")
+        f.write("| -- | -- | -- |\n")
+        for i in range(100):
+            f.write("| {:2f} | {:2f} | {:2f} |\n".format(0.01 * i, tp_list[
+                i, 0], fp_list[i, 0]))
+    print("The numerical score-recall_rate-false_positive_rate is saved as {}".
+          format(tp_fp_list_file))
+
+
+def result2imagefile(tp_list, fp_list, save_dir):
+    """将不同置信度阈值下的图片级召回率和图片级误检率保存为图片。
+
+    图片中左子图横坐标表示不同置信度阈值下计算得到的图片级召回率,纵坐标表示各图片级召回率对应的图片级误检率。
+        右边子图横坐标表示图片级召回率,纵坐标表示各图片级召回率对应的置信度阈值。
+
+    Args:
+        tp_list (numpy.array): 形状为101x1的数组,对应置信度从0到1按0.01递增取值时,计算所得图片级别的召回率。
+        fp_list (numpy.array): 形状为101x1的数组,对应置信度从0到1按0.01递增取值时,计算所得图片级别的误检率。
+        save_dir (str): 图表图片的保存路径。
+
+    """
+
+    plt.subplot(1, 2, 1)
+    plt.title("image-level false_positive-recall")
+    plt.xlabel("recall")
+    plt.ylabel("false_positive")
+    plt.xlim(0, 1)
+    plt.ylim(0, 1)
+    plt.grid(linestyle='--', linewidth=1)
+    plt.plot([0, 1], [0, 1], 'r--', linewidth=1)
+    my_x_ticks = np.arange(0, 1, 0.1)
+    my_y_ticks = np.arange(0, 1, 0.1)
+    plt.xticks(my_x_ticks, fontsize=5)
+    plt.yticks(my_y_ticks, fontsize=5)
+    plt.plot(tp_list, fp_list, color='b', label="image level", linewidth=1)
+    plt.legend(loc="lower left", fontsize=5)
+
+    plt.subplot(1, 2, 2)
+    plt.title("score-recall")
+    plt.xlabel('recall')
+    plt.ylabel('score')
+    plt.xlim(0, 1)
+    plt.ylim(0, 1)
+    plt.grid(linestyle='--', linewidth=1)
+    plt.xticks(my_x_ticks, fontsize=5)
+    plt.yticks(my_y_ticks, fontsize=5)
+    plt.plot(
+        tp_list,
+        np.arange(0, 1.01, 0.01),
+        color='b',
+        label="image level",
+        linewidth=1)
+    plt.legend(loc="lower left", fontsize=5)
+    tp_fp_chart_file = os.path.join(save_dir, "image-level_tp_fp.png")
+    plt.savefig(tp_fp_chart_file, dpi=800)
+    plt.close()
+    print(
+        "The diagrammatic score-recall_rate-false_positive_rate is saved as {}".
+        format(tp_fp_chart_file))
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--model_dir",
+        default="./output/faster_rcnn_r50_vd_dcn/best_model/",
+        type=str,
+        help="The model directory path.")
+    parser.add_argument(
+        "--dataset_dir",
+        default="./aluminum_inspection",
+        type=str,
+        help="The VOC-format dataset directory path.")
+    parser.add_argument(
+        "--background_image_dir",
+        default="./aluminum_inspection/val_wu_xia_ci",
+        type=str,
+        help="The directory path of background images.")
+    parser.add_argument(
+        "--save_dir",
+        default="./visualize/faster_rcnn_r50_vd_dcn",
+        type=str,
+        help="The directory path of result.")
+
+    args = parser.parse_args()
+
+    if not osp.exists(args.save_dir):
+        os.makedirs(args.save_dir)
+
+    model = pdx.load_model(args.model_dir)
+
+    tp_list = cal_image_level_recall_rate(model, args.dataset_dir)
+    fp_list = cal_image_level_false_positive_rate(model,
+                                                  args.background_image_dir)
+    result2textfile(tp_list, fp_list, args.save_dir)
+    result2imagefile(tp_list, fp_list, args.save_dir)

+ 160 - 0
examples/industrial_quality_inspection/compare.py

@@ -0,0 +1,160 @@
+# coding: utf8
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# 环境变量配置,用于控制是否使用GPU
+# 说明文档:https://paddlex.readthedocs.io/zh_CN/develop/appendix/parameters.html#gpu
+import argparse
+import os
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+
+import os.path as osp
+import cv2
+import re
+import xml.etree.ElementTree as ET
+import paddlex as pdx
+
+
+def parse_xml_file(xml_file):
+    tree = ET.parse(xml_file)
+    pattern = re.compile('<object>', re.IGNORECASE)
+    obj_match = pattern.findall(str(ET.tostringlist(tree.getroot())))
+    if len(obj_match) == 0:
+        return False
+    obj_tag = obj_match[0][1:-1]
+    objs = tree.findall(obj_tag)
+    pattern = re.compile('<size>', re.IGNORECASE)
+    size_tag = pattern.findall(str(ET.tostringlist(tree.getroot())))[0][1:-1]
+    size_element = tree.find(size_tag)
+    pattern = re.compile('<width>', re.IGNORECASE)
+    width_tag = pattern.findall(str(ET.tostringlist(size_element)))[0][1:-1]
+    im_w = float(size_element.find(width_tag).text)
+    pattern = re.compile('<height>', re.IGNORECASE)
+    height_tag = pattern.findall(str(ET.tostringlist(size_element)))[0][1:-1]
+    im_h = float(size_element.find(height_tag).text)
+    gt_bbox = []
+    gt_class = []
+    for i, obj in enumerate(objs):
+        pattern = re.compile('<name>', re.IGNORECASE)
+        name_tag = pattern.findall(str(ET.tostringlist(obj)))[0][1:-1]
+        cname = obj.find(name_tag).text.strip()
+        gt_class.append(cname)
+        pattern = re.compile('<difficult>', re.IGNORECASE)
+        diff_tag = pattern.findall(str(ET.tostringlist(obj)))[0][1:-1]
+        try:
+            _difficult = int(obj.find(diff_tag).text)
+        except Exception:
+            _difficult = 0
+        pattern = re.compile('<bndbox>', re.IGNORECASE)
+        box_tag = pattern.findall(str(ET.tostringlist(obj)))[0][1:-1]
+        box_element = obj.find(box_tag)
+        pattern = re.compile('<xmin>', re.IGNORECASE)
+        xmin_tag = pattern.findall(str(ET.tostringlist(box_element)))[0][1:-1]
+        x1 = float(box_element.find(xmin_tag).text)
+        pattern = re.compile('<ymin>', re.IGNORECASE)
+        ymin_tag = pattern.findall(str(ET.tostringlist(box_element)))[0][1:-1]
+        y1 = float(box_element.find(ymin_tag).text)
+        pattern = re.compile('<xmax>', re.IGNORECASE)
+        xmax_tag = pattern.findall(str(ET.tostringlist(box_element)))[0][1:-1]
+        x2 = float(box_element.find(xmax_tag).text)
+        pattern = re.compile('<ymax>', re.IGNORECASE)
+        ymax_tag = pattern.findall(str(ET.tostringlist(box_element)))[0][1:-1]
+        y2 = float(box_element.find(ymax_tag).text)
+        x1 = max(0, x1)
+        y1 = max(0, y1)
+        if im_w > 0.5 and im_h > 0.5:
+            x2 = min(im_w - 1, x2)
+            y2 = min(im_h - 1, y2)
+        gt_bbox.append([x1, y1, x2, y2])
+    gts = []
+    for bbox, name in zip(gt_bbox, gt_class):
+        x1, y1, x2, y2 = bbox
+        w = x2 - x1 + 1
+        h = y2 - y1 + 1
+        gt = {
+            'category_id': 0,
+            'category': name,
+            'bbox': [x1, y1, w, h],
+            'score': 1
+        }
+        gts.append(gt)
+
+    return gts
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--model_dir",
+        default="./output/faster_rcnn_r50_vd_dcn/best_model/",
+        type=str,
+        help="The model directory path.")
+    parser.add_argument(
+        "--dataset_dir",
+        default="./aluminum_inspection",
+        type=str,
+        help="The VOC-format dataset directory path.")
+    parser.add_argument(
+        "--save_dir",
+        default="./visualize/compare",
+        type=str,
+        help="The directory path of result.")
+    parser.add_argument(
+        "--score_threshold",
+        default=0.1,
+        type=float,
+        help="The predicted bbox whose score is lower than score_threshold is filtered."
+    )
+
+    args = parser.parse_args()
+
+    if not os.path.exists(args.save_dir):
+        os.makedirs(args.save_dir)
+    file_list = osp.join(args.dataset_dir, 'val_list.txt')
+
+    model = pdx.load_model(args.model_dir)
+
+    with open(file_list, 'r') as fr:
+        while True:
+            line = fr.readline()
+            if not line:
+                break
+            img_file, xml_file = [osp.join(args.dataset_dir, x) \
+                    for x in line.strip().split()[:2]]
+            if not osp.exists(img_file):
+                continue
+            if not osp.exists(xml_file):
+                continue
+
+            res = model.predict(img_file)
+            gts = parse_xml_file(xml_file)
+
+            det_vis = pdx.det.visualize(
+                img_file, res, threshold=args.score_threshold, save_dir=None)
+            if gts is False:
+                # 该图片没有标注目标时,左侧直接使用原图作为对照
+                gt_vis = cv2.imread(img_file)
+            else:
+                gt_vis = pdx.det.visualize(
+                    img_file,
+                    gts,
+                    threshold=args.score_threshold,
+                    save_dir=None)
+            vis = cv2.hconcat([gt_vis, det_vis])
+            cv2.imwrite(
+                os.path.join(args.save_dir, os.path.split(img_file)[-1]), vis)
+            print('The comparison has been made for {}'.format(img_file))
+
+    print(
+        "The visualized ground-truths and predictions are saved in {}. Ground-truth is on the left, prediction is on the right".
+        format(args.save_dir))

+ 14 - 0
examples/industrial_quality_inspection/dataset.md

@@ -0,0 +1,14 @@
+# 天池铝材表面缺陷检测初赛数据集示例
+
+| 序号 | 瑕疵名称 | 瑕疵成因 | 瑕疵图片示例 | 图片数量 |
+| -- | -- | -- | -- | -- |
+| 1 | 擦花(擦伤)| 表面处理(喷涂)后有轻微擦到其它的东西,造成痕迹 | ![](https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/datasets/ca_hua_example.png) | 128 |
+| 2 | 杂色 | 喷涂换颜料的时候,装颜料的容器未清洗干净,造成喷涂时有少量其它颜色掺入 | ![](https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/datasets/za_se_example.png) |365 |
+| 3 | 漏底 | 喷粉效果不好,铝材大量底色露出 | ![](https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/datasets/lou_di_example.png) | 538 |
+| 4 | 不导电 | 直接喷不到铝材表面上去 | ![](https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/datasets/bu_dao_dian_example.png) | 390 |
+|5 | 桔皮 | 表面处理后涂层表面粗糙,大颗粒 | ![](https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/datasets/ju_pi_example.png) | 173 |
+| 6 | 喷流| 喷涂时油漆稀从上流下来,有流动痕迹 | ![](https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/datasets/pen_liu_example.png) | 86 |
+| 7 |漆泡 | 喷涂后表面起泡,小而多| ![](https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/datasets/qi_pao_example.png) | 82 |
+| 8 | 起坑 | 型材模具问题,做出来的型材一整条都有一条凹下去的部分 | ![](https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/datasets/qi_keng_example.png.png) | 407 |
+| 9 | 脏点 | 表面处理时,有灰尘或一些脏东西未能擦掉,导致涂层有颗粒比较突出 | ![](https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/datasets/zang_dian_example.png) | 261 |
+| 10 | 角位漏底 | 在型材角落出现的露底 | ![](https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/datasets/jiao_wei_lou_di_example.png) | 346 |

+ 26 - 0
examples/industrial_quality_inspection/error_analysis.py

@@ -0,0 +1,26 @@
+# coding: utf8
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import os.path as osp
+import paddlex as pdx
+
+model_dir = 'output/faster_rcnn_r50_vd_dcn/best_model/'
+save_dir = 'visualize/faster_rcnn_r50_vd_dcn'
+if not osp.exists(save_dir):
+    os.makedirs(save_dir)
+
+eval_details_file = osp.join(model_dir, 'eval_details.json')
+pdx.det.coco_error_analysis(eval_details_file, save_dir=save_dir)

+ 114 - 0
examples/industrial_quality_inspection/gpu_solution.md

@@ -0,0 +1,114 @@
+# GPU端最终解决方案
+
+本案例面向GPU端的最终方案是选择二阶段检测模型FasterRCNN,其骨干网络选择加入了可变形卷积(DCN)的ResNet50_vd,训练时使用SSLD蒸馏方案训练得到的ResNet50_vd预训练模型,FPN部分的通道数量设置为64,训练阶段数据增强策略采用RandomHorizontalFlip、RandomDistort、RandomCrop,并加入背景图片,测试阶段的RPN部分做非极大值抑制计算的候选框数量由原本的6000减少至500、做完非极大值抑制后保留的候选框数量由原本的1000减少至300。
+
+在Tesla P40的Linux系统下,对于输入大小是800 x 1333的模型,图像预处理时长为30ms/image,模型的推理时间为46.08ms/image,包括输入数据拷贝至GPU的时间、计算时间、数据拷贝至CPU的时间。
+
+| 模型 | VOC mAP (%) | 推理时间 (ms/image) |
+| -- | -- | -- |
+| FasterRCNN-ResNet50_vd_ssld | 81.05 | 48.62 |
+| + dcn | 88.09 | 66.51 |
+| + RandomHorizontalFlip/RandomDistort/RandomCrop | 90.23| 66.51 |
+| + background images | 88.87 | 66.51 |
+| + fpn channel=64 | 87.79 | 48.65 |
+| + test proposal=pre/post topk 500/300 | 87.72 | 46.08 |
+
+## 前置依赖
+
+* PaddlePaddle >= 1.8.0
+* Python >= 3.5
+* PaddleX >= 1.3.0
+
+## 模型训练
+
+### (1) 下载PaddleX源码
+
+```
+git clone https://github.com/PaddlePaddle/PaddleX
+
+cd PaddleX/examples/industrial_quality_inspection
+```
+
+### (2) 下载数据集
+
+因数据集较大,可运行以下代码提前将数据集下载并解压。训练代码中也会自动下载数据集,所以这一步不是必须的。
+
+```
+wget https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/datasets/aluminum_inspection.tar.gz
+tar xvf aluminum_inspection.tar.gz
+```
+### (3) 下载预先训练好的模型
+
+如果不想再次训练模型,可以直接下载已经训练好的模型完成后面的模型测试和部署推理:
+
+```
+wget https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/models/faster_rcnn_r50_vd_dcn.tar.gz
+tar xvf faster_rcnn_r50_vd_dcn.tar.gz
+```
+### (4) 训练
+
+运行以下代码进行模型训练,代码会自动下载数据集,如若事先下载了数据集,需将下载和解压铝材缺陷检测数据集的相关行注释掉。代码中默认使用0,1,2,3,4号GPU训练,可根据实际情况设置卡号并调整`batch_size`和`learning_rate`。
+
+```
+python train_rcnn.py
+```
+
+### (5) 分析预测错误的原因
+
+在模型迭代过程中,运行以下代码可完成模型效果的分析并生成分析结果图表:
+
+```
+python error_analysis.py
+```
+
+可参考[精度优化部分的模型效果分析](./accuracy_improvement.md#2-%E6%A8%A1%E5%9E%8B%E6%95%88%E6%9E%9C%E5%88%86%E6%9E%90)来理解当前模型预测错误的原因。
+
+运行以下代码,生成可视化真值和预测结果的对比图以进一步理解模型效果,代码中的置信度阈值可根据实际情况进行调整。
+
+```
+python compare.py
+```
+
+![](image/compare_budaodian-116.jpg)
+
+左边是可视化真值,右边是可视化预测结果。
+
+### (6) 统计图片级召回率和误检率
+
+模型迭代完成后,计算不同置信度阈值下[图片级召回率](./accuracy_improvement.md#6-%E8%83%8C%E6%99%AF%E5%9B%BE%E7%89%87%E5%8A%A0%E5%85%A5)和[图片级误检率](./accuracy_improvement.md#6-%E8%83%8C%E6%99%AF%E5%9B%BE%E7%89%87%E5%8A%A0%E5%85%A5),找到符合要求的召回率和误检率,对应的置信度阈值用于后续模型预测阶段。
+
+```
+python cal_tp_fp.py
+```
+
+执行后会生成图表`image-level_tp_fp.png`和文件`tp_fp_list.txt`,示意如下:
+
+| 图表`image-level_tp_fp.png` | 文件`tp_fp_list.txt` |
+| -- | -- |
+| ![](./image/image-level_tp_fp.png) | [tp_fp_list.txt](tp_fp_list.md) |
+
+图表`image-level_tp_fp.png`中左边子图,横坐标表示不同置信度阈值下计算得到的图片级召回率,纵坐标表示各图片级召回率对应的图片级误检率。右边子图横坐标表示图片级召回率,纵坐标表示各图片级召回率对应的置信度阈值。从图表中可较为直观地看出当前模型的图片级召回率和误检率的量级,从文件`tp_fp_list.txt`可以查到具体数值,例如在图片级召回率/图片级误检率为[0.9589,0.0074]这一组符合要求,就将对应的置信度阈值0.90选取为后续预测推理的阈值。
+
+### (7) 模型测试
+
+测试集因没有标注文件,这里单独下载测试集图片:
+
+```
+wget https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/datasets/aluminum_inspection_test.tar.gz
+tar xvf aluminum_inspection_test.tar.gz
+```
+
+加载训练好的模型,使用(6)中选取的置信度阈值0.90对验证集图片或者测试集图片进行预测:
+
+```
+python predict.py
+```
+可视化预测结果示例如下:
+
+![](image/visualize_budaodian-116.jpg)
+
+## 推理部署
+
+本案例采用C++部署方式将模型部署在Tesla P40的Linux系统下,具体的C++部署流程请参考文档[PaddleX模型多端安全部署/C++部署](https://paddlex.readthedocs.io/zh_CN/develop/deploy/server/cpp/index.html)。
+
+对于输入大小是800 x 1333的模型,图像预处理时长为30ms/image。值得一提的是预处理中的Normalize操作比较耗时,因此在设置预处理操作时,可以先进行Resize操作再做Normalize。模型的推理时间为46.08ms/image,包括输入数据拷贝至GPU的时间、计算时间、数据拷贝至CPU的时间。

BIN
examples/industrial_quality_inspection/image/after_clahe.png


BIN
examples/industrial_quality_inspection/image/allclasses_analysis_example.png


BIN
examples/industrial_quality_inspection/image/before_clahe.png


BIN
examples/industrial_quality_inspection/image/budaodian_analysis_example.png


BIN
examples/industrial_quality_inspection/image/cahua_analysis_example.png


BIN
examples/industrial_quality_inspection/image/compare_budaodian-116.jpg


BIN
examples/industrial_quality_inspection/image/image-level_tp_fp.png


BIN
examples/industrial_quality_inspection/image/jiaoweiloudi_analysis_example.png


BIN
examples/industrial_quality_inspection/image/jupi_analysis_example.png


BIN
examples/industrial_quality_inspection/image/loudi_analysis_example.png


BIN
examples/industrial_quality_inspection/image/penliu_analysis_example.png


BIN
examples/industrial_quality_inspection/image/qikeng_analysis_example.png


BIN
examples/industrial_quality_inspection/image/qipao_analysis_example.png


BIN
examples/industrial_quality_inspection/image/visualize_budaodian-116.jpg


BIN
examples/industrial_quality_inspection/image/zangdian_analysis_example.png


BIN
examples/industrial_quality_inspection/image/zase_analysis_example.png


+ 61 - 0
examples/industrial_quality_inspection/params_analysis.py

@@ -0,0 +1,61 @@
+#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+import argparse
+import os
+# 选择使用0号卡
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+import paddlex as pdx
+
+
+def params_analysis(model_dir, dataset, batch_size, save_file):
+    # 加载模型
+    model = pdx.load_model(model_dir)
+
+    # 定义验证所用的数据集
+    eval_dataset = pdx.datasets.VOCDetection(
+        data_dir=dataset,
+        file_list=os.path.join(dataset, 'val_list.txt'),
+        label_list=os.path.join(dataset, 'labels.txt'),
+        transforms=model.eval_transforms)
+
+    pdx.slim.prune.analysis(
+        model,
+        dataset=eval_dataset,
+        batch_size=batch_size,
+        save_file=save_file)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--model_dir",
+        default="./output/yolov3_mobilenetv3/best_model",
+        type=str,
+        help="The model path.")
+    parser.add_argument(
+        "--dataset",
+        default="./aluminum_inspection",
+        type=str,
+        help="The model path.")
+    parser.add_argument("--batch_size", default=8, type=int, help="Batch size")
+    parser.add_argument(
+        "--save_file",
+        default="./sensitivities.data",
+        type=str,
+        help="The sensitivities file path.")
+
+    args = parser.parse_args()
+    params_analysis(args.model_dir, args.dataset, args.batch_size,
+                    args.save_file)

+ 35 - 0
examples/industrial_quality_inspection/predict.py

@@ -0,0 +1,35 @@
+# coding: utf8
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# 环境变量配置,用于控制是否使用GPU
+# 说明文档:https://paddlex.readthedocs.io/zh_CN/develop/appendix/parameters.html#gpu
+import os
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+
+import os.path as osp
+import paddlex as pdx
+
+img_file = 'aluminum_inspection/JPEGImages/budaodian-116.jpg'
+model_dir = 'output/faster_rcnn_r50_vd_dcn/best_model/'
+save_dir = './visualize/predict'
+# 设置置信度阈值
+score_threshold = 0.9
+
+if not os.path.exists(save_dir):
+    os.makedirs(save_dir)
+
+model = pdx.load_model(model_dir)
+res = model.predict(img_file)
+pdx.det.visualize(img_file, res, threshold=score_threshold, save_dir=save_dir)

+ 102 - 0
examples/industrial_quality_inspection/tp_fp_list.md

@@ -0,0 +1,102 @@
+| score | recall rate | false-positive rate |
+| -- | -- | -- |
+| 0.000000 | 0.982877 | 0.022222 |
+| 0.010000 | 0.982877 | 0.022222 |
+| 0.020000 | 0.982877 | 0.022222 |
+| 0.030000 | 0.982877 | 0.022222 |
+| 0.040000 | 0.982877 | 0.022222 |
+| 0.050000 | 0.982877 | 0.022222 |
+| 0.060000 | 0.982877 | 0.022222 |
+| 0.070000 | 0.979452 | 0.022222 |
+| 0.080000 | 0.979452 | 0.022222 |
+| 0.090000 | 0.979452 | 0.022222 |
+| 0.100000 | 0.979452 | 0.022222 |
+| 0.110000 | 0.979452 | 0.022222 |
+| 0.120000 | 0.979452 | 0.022222 |
+| 0.130000 | 0.979452 | 0.022222 |
+| 0.140000 | 0.976027 | 0.022222 |
+| 0.150000 | 0.976027 | 0.022222 |
+| 0.160000 | 0.976027 | 0.022222 |
+| 0.170000 | 0.976027 | 0.022222 |
+| 0.180000 | 0.976027 | 0.022222 |
+| 0.190000 | 0.976027 | 0.022222 |
+| 0.200000 | 0.976027 | 0.022222 |
+| 0.210000 | 0.976027 | 0.022222 |
+| 0.220000 | 0.976027 | 0.022222 |
+| 0.230000 | 0.976027 | 0.022222 |
+| 0.240000 | 0.976027 | 0.022222 |
+| 0.250000 | 0.976027 | 0.022222 |
+| 0.260000 | 0.976027 | 0.014815 |
+| 0.270000 | 0.976027 | 0.014815 |
+| 0.280000 | 0.976027 | 0.014815 |
+| 0.290000 | 0.976027 | 0.014815 |
+| 0.300000 | 0.976027 | 0.014815 |
+| 0.310000 | 0.976027 | 0.014815 |
+| 0.320000 | 0.976027 | 0.014815 |
+| 0.330000 | 0.976027 | 0.014815 |
+| 0.340000 | 0.976027 | 0.014815 |
+| 0.350000 | 0.976027 | 0.014815 |
+| 0.360000 | 0.976027 | 0.014815 |
+| 0.370000 | 0.976027 | 0.014815 |
+| 0.380000 | 0.976027 | 0.014815 |
+| 0.390000 | 0.976027 | 0.014815 |
+| 0.400000 | 0.976027 | 0.014815 |
+| 0.410000 | 0.976027 | 0.014815 |
+| 0.420000 | 0.976027 | 0.014815 |
+| 0.430000 | 0.976027 | 0.014815 |
+| 0.440000 | 0.972603 | 0.014815 |
+| 0.450000 | 0.972603 | 0.014815 |
+| 0.460000 | 0.972603 | 0.014815 |
+| 0.470000 | 0.972603 | 0.014815 |
+| 0.480000 | 0.972603 | 0.014815 |
+| 0.490000 | 0.972603 | 0.014815 |
+| 0.500000 | 0.972603 | 0.014815 |
+| 0.510000 | 0.972603 | 0.014815 |
+| 0.520000 | 0.972603 | 0.014815 |
+| 0.530000 | 0.972603 | 0.014815 |
+| 0.540000 | 0.972603 | 0.014815 |
+| 0.550000 | 0.972603 | 0.014815 |
+| 0.560000 | 0.969178 | 0.014815 |
+| 0.570000 | 0.969178 | 0.014815 |
+| 0.580000 | 0.969178 | 0.014815 |
+| 0.590000 | 0.969178 | 0.014815 |
+| 0.600000 | 0.969178 | 0.014815 |
+| 0.610000 | 0.969178 | 0.014815 |
+| 0.620000 | 0.969178 | 0.014815 |
+| 0.630000 | 0.969178 | 0.014815 |
+| 0.640000 | 0.969178 | 0.014815 |
+| 0.650000 | 0.969178 | 0.014815 |
+| 0.660000 | 0.969178 | 0.014815 |
+| 0.670000 | 0.969178 | 0.014815 |
+| 0.680000 | 0.969178 | 0.014815 |
+| 0.690000 | 0.969178 | 0.014815 |
+| 0.700000 | 0.969178 | 0.014815 |
+| 0.710000 | 0.969178 | 0.014815 |
+| 0.720000 | 0.969178 | 0.014815 |
+| 0.730000 | 0.969178 | 0.014815 |
+| 0.740000 | 0.969178 | 0.014815 |
+| 0.750000 | 0.969178 | 0.014815 |
+| 0.760000 | 0.969178 | 0.014815 |
+| 0.770000 | 0.965753 | 0.014815 |
+| 0.780000 | 0.965753 | 0.014815 |
+| 0.790000 | 0.965753 | 0.014815 |
+| 0.800000 | 0.962329 | 0.014815 |
+| 0.810000 | 0.962329 | 0.014815 |
+| 0.820000 | 0.962329 | 0.014815 |
+| 0.830000 | 0.962329 | 0.014815 |
+| 0.840000 | 0.962329 | 0.014815 |
+| 0.850000 | 0.958904 | 0.014815 |
+| 0.860000 | 0.958904 | 0.014815 |
+| 0.870000 | 0.958904 | 0.014815 |
+| 0.880000 | 0.958904 | 0.014815 |
+| 0.890000 | 0.958904 | 0.014815 |
+| 0.900000 | 0.958904 | 0.007407 |
+| 0.910000 | 0.958904 | 0.007407 |
+| 0.920000 | 0.958904 | 0.007407 |
+| 0.930000 | 0.955479 | 0.007407 |
+| 0.940000 | 0.955479 | 0.007407 |
+| 0.950000 | 0.955479 | 0.007407 |
+| 0.960000 | 0.955479 | 0.007407 |
+| 0.970000 | 0.955479 | 0.007407 |
+| 0.980000 | 0.941781 | 0.000000 |
+| 0.990000 | 0.893836 | 0.000000 |

+ 58 - 0
examples/industrial_quality_inspection/train_pruned_yolov3.py

@@ -0,0 +1,58 @@
+# 环境变量配置,用于控制是否使用GPU
+# 说明文档:https://paddlex.readthedocs.io/zh_CN/develop/appendix/parameters.html#gpu
+import os
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+
+from paddlex.det import transforms
+import paddlex as pdx
+
+# 定义训练和验证时的transforms
+# API说明 https://paddlex.readthedocs.io/zh_CN/develop/apis/transforms/det_transforms.html
+train_transforms = transforms.Compose([
+    transforms.MixupImage(mixup_epoch=250), transforms.RandomDistort(),
+    transforms.RandomExpand(), transforms.RandomCrop(), transforms.Resize(
+        target_size=608, interp='RANDOM'), transforms.RandomHorizontalFlip(),
+    transforms.Normalize()
+])
+
+eval_transforms = transforms.Compose([
+    transforms.Resize(
+        target_size=608, interp='CUBIC'), transforms.Normalize()
+])
+
+# 定义训练和验证所用的数据集
+# API说明:https://paddlex.readthedocs.io/zh_CN/develop/apis/datasets.html#paddlex-datasets-vocdetection
+train_dataset = pdx.datasets.VOCDetection(
+    data_dir='aluminum_inspection',
+    file_list='aluminum_inspection/train_list.txt',
+    label_list='aluminum_inspection/labels.txt',
+    transforms=train_transforms,
+    shuffle=True)
+eval_dataset = pdx.datasets.VOCDetection(
+    data_dir='aluminum_inspection',
+    file_list='aluminum_inspection/val_list.txt',
+    label_list='aluminum_inspection/labels.txt',
+    transforms=eval_transforms)
+
+# 初始化模型,并进行训练
+# 可使用VisualDL查看训练指标,参考https://paddlex.readthedocs.io/zh_CN/develop/train/visualdl.html
+num_classes = len(train_dataset.labels)
+
+# API说明: https://paddlex.readthedocs.io/zh_CN/develop/apis/models/detection.html#paddlex-det-yolov3
+model = pdx.det.YOLOv3(num_classes=num_classes, backbone='MobileNetV3_large')
+
+# API说明: https://paddlex.readthedocs.io/zh_CN/develop/apis/models/detection.html#train
+# 各参数介绍与调整说明:https://paddlex.readthedocs.io/zh_CN/develop/appendix/parameters.html
+model.train(
+    num_epochs=400,
+    train_dataset=train_dataset,
+    train_batch_size=8,
+    eval_dataset=eval_dataset,
+    warmup_steps=4000,
+    learning_rate=0.000125,
+    lr_decay_epochs=[240, 320],
+    pretrain_weights='output/yolov3_mobilenetv3/best_model',
+    save_dir='output/yolov3_mobilenetv3_prune',
+    use_vdl=True,
+    sensitivities_file='./sensitivities.data',
+    eval_metric_loss=0.05)

+ 74 - 0
examples/industrial_quality_inspection/train_rcnn.py

@@ -0,0 +1,74 @@
+# 环境变量配置,用于控制是否使用GPU
+# 说明文档:https://paddlex.readthedocs.io/zh_CN/develop/appendix/parameters.html#gpu
+import os
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+
+from paddlex.det import transforms
+import paddlex as pdx
+
+# 下载和解压铝材缺陷检测数据集
+aluminum_dataset = 'https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/datasets/aluminum_inspection.tar.gz'
+pdx.utils.download_and_decompress(aluminum_dataset, path='./')
+
+# API说明 https://paddlex.readthedocs.io/zh_CN/develop/apis/transforms/det_transforms.html
+train_transforms = transforms.Compose([
+    transforms.RandomDistort(), transforms.RandomCrop(),
+    transforms.RandomHorizontalFlip(), transforms.ResizeByShort(
+        short_size=[800], max_size=1333), transforms.Normalize(
+            mean=[0.5], std=[0.5]), transforms.Padding(coarsest_stride=32)
+])
+
+eval_transforms = transforms.Compose([
+    transforms.ResizeByShort(
+        short_size=800, max_size=1333),
+    transforms.Normalize(),
+    transforms.Padding(coarsest_stride=32),
+])
+
+# 定义训练和验证所用的数据集
+# API说明:https://paddlex.readthedocs.io/zh_CN/develop/apis/datasets.html#paddlex-datasets-vocdetection
+train_dataset = pdx.datasets.VOCDetection(
+    data_dir='aluminum_inspection',
+    file_list='aluminum_inspection/train_list.txt',
+    label_list='aluminum_inspection/labels.txt',
+    transforms=train_transforms,
+    num_workers=8,
+    shuffle=True)
+eval_dataset = pdx.datasets.VOCDetection(
+    data_dir='aluminum_inspection',
+    file_list='aluminum_inspection/val_list.txt',
+    label_list='aluminum_inspection/labels.txt',
+    num_workers=8,
+    transforms=eval_transforms)
+
+# 把背景图片加入训练集中
+train_dataset.add_negative_samples(
+    image_dir='./aluminum_inspection/train_wu_xia_ci')
+
+# 初始化模型,并进行训练
+# 可使用VisualDL查看训练指标,参考https://paddlex.readthedocs.io/zh_CN/develop/train/visualdl.html
+# num_classes 需要设置为包含背景类的类别数,即: 目标类别数量 + 1
+num_classes = len(train_dataset.labels) + 1
+
+# API说明: https://paddlex.readthedocs.io/zh_CN/develop/apis/models/detection.html#paddlex-det-fasterrcnn
+model = pdx.det.FasterRCNN(
+    num_classes=num_classes,
+    backbone='ResNet50_vd_ssld',
+    with_dcn=True,
+    fpn_num_channels=64,
+    with_fpn=True,
+    test_pre_nms_top_n=500,
+    test_post_nms_top_n=300)
+
+# API说明: https://paddlex.readthedocs.io/zh_CN/develop/apis/models/detection.html#id1
+# 各参数介绍与调整说明:https://paddlex.readthedocs.io/zh_CN/develop/appendix/parameters.html
+model.train(
+    num_epochs=80,
+    train_dataset=train_dataset,
+    train_batch_size=2,
+    eval_dataset=eval_dataset,
+    learning_rate=0.0025,
+    lr_decay_epochs=[60, 70],
+    warmup_steps=5000,
+    save_dir='output/faster_rcnn_r50_vd_dcn',
+    use_vdl=True)

+ 59 - 0
examples/industrial_quality_inspection/train_yolov3.py

@@ -0,0 +1,59 @@
+# 环境变量配置,用于控制是否使用GPU
+# 说明文档:https://paddlex.readthedocs.io/zh_CN/develop/appendix/parameters.html#gpu
+import os
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+
+from paddlex.det import transforms
+import paddlex as pdx
+
+# 下载和解压铝材缺陷检测数据集
+aluminum_dataset = 'https://bj.bcebos.com/paddlex/examples/industrial_quality_inspection/datasets/aluminum_inspection.tar.gz'
+pdx.utils.download_and_decompress(aluminum_dataset, path='./')
+
+# 定义训练和验证时的transforms
+# API说明 https://paddlex.readthedocs.io/zh_CN/develop/apis/transforms/det_transforms.html
+train_transforms = transforms.Compose([
+    transforms.MixupImage(mixup_epoch=250), transforms.RandomDistort(),
+    transforms.RandomExpand(), transforms.RandomCrop(), transforms.Resize(
+        target_size=608, interp='RANDOM'), transforms.RandomHorizontalFlip(),
+    transforms.Normalize()
+])
+
+eval_transforms = transforms.Compose([
+    transforms.Resize(
+        target_size=608, interp='CUBIC'), transforms.Normalize()
+])
+
+# 定义训练和验证所用的数据集
+# API说明:https://paddlex.readthedocs.io/zh_CN/develop/apis/datasets.html#paddlex-datasets-vocdetection
+train_dataset = pdx.datasets.VOCDetection(
+    data_dir='aluminum_inspection',
+    file_list='aluminum_inspection/train_list.txt',
+    label_list='aluminum_inspection/labels.txt',
+    transforms=train_transforms,
+    shuffle=True)
+eval_dataset = pdx.datasets.VOCDetection(
+    data_dir='aluminum_inspection',
+    file_list='aluminum_inspection/val_list.txt',
+    label_list='aluminum_inspection/labels.txt',
+    transforms=eval_transforms)
+
+# 初始化模型,并进行训练
+# 可使用VisualDL查看训练指标,参考https://paddlex.readthedocs.io/zh_CN/develop/train/visualdl.html
+num_classes = len(train_dataset.labels)
+
+# API说明: https://paddlex.readthedocs.io/zh_CN/develop/apis/models/detection.html#paddlex-det-yolov3
+model = pdx.det.YOLOv3(num_classes=num_classes, backbone='MobileNetV3_large')
+
+# API说明: https://paddlex.readthedocs.io/zh_CN/develop/apis/models/detection.html#train
+# 各参数介绍与调整说明:https://paddlex.readthedocs.io/zh_CN/develop/appendix/parameters.html
+model.train(
+    num_epochs=400,
+    train_dataset=train_dataset,
+    train_batch_size=8,
+    eval_dataset=eval_dataset,
+    warmup_steps=4000,
+    learning_rate=0.000125,
+    lr_decay_epochs=[240, 320],
+    save_dir='output/yolov3_mobilenetv3',
+    use_vdl=True)

+ 14 - 1
paddlex/command.py

@@ -15,6 +15,7 @@
 from six import text_type as _text_type
 import argparse
 import sys
+import os
 import os.path as osp
 import paddlex.utils.logging as logging
 
@@ -169,7 +170,7 @@ def main():
             logging.error(
                 "paddlex --export_inference --model_dir model_path --save_dir infer_model"
             )
-        save_file = os.path.join(args.save_dir, 'paddle2onnx_model.onnx') 
+        save_file = os.path.join(args.save_dir, 'paddle2onnx_model.onnx')
         pdx.converter.export_onnx_model(model, save_file, args.onnx_opset)
 
     if args.data_conversion:
@@ -178,6 +179,16 @@ def main():
         assert args.pics is not None, "--pics should be defined to confirm the pictures path"
         assert args.annotations is not None, "--annotations should be defined to confirm the annotations path"
         assert args.save_dir is not None, "--save_dir should be defined to store taregt dataset"
+        if args.source not in ['labelme', 'jingling', 'easydata']:
+            logging.error(
+                "The source format {} is not one of labelme/jingling/easydata".
+                format(args.source),
+                exit=False)
+        if args.to not in ['PascalVOC', 'MSCOCO', 'SEG', 'ImageNet']:
+            logging.error(
+                "The to format {} is not one of PascalVOC/MSCOCO/SEG/ImageNet".
+                format(args.to),
+                exit=False)
         if args.source == 'labelme' and args.to == 'ImageNet':
             logging.error(
                 "The labelme dataset can not convert to the ImageNet dataset.",
@@ -186,6 +197,8 @@ def main():
             logging.error(
                 "The jingling dataset can not convert to the PascalVOC dataset.",
                 exit=False)
+        if not osp.exists(args.save_dir):
+            os.makedirs(args.save_dir)
         pdx.tools.convert.dataset_conversion(args.source, args.to, args.pics,
                                              args.annotations, args.save_dir)
 

+ 15 - 4
paddlex/cv/datasets/voc.py

@@ -104,8 +104,13 @@ class VOCDetection(Dataset):
                 if not osp.isfile(xml_file):
                     continue
                 if not osp.exists(img_file):
-                    raise IOError('The image file {} is not exist!'.format(
+                    logging.warning('The image file {} does not exist!'.format(
                         img_file))
+                    continue
+                if not osp.exists(xml_file):
+                    logging.warning('The annotation file {} does not exist!'.
+                                    format(xml_file))
+                    continue
                 tree = ET.parse(xml_file)
                 if tree.find('id') is None:
                     im_id = np.array([ct])
@@ -138,8 +143,8 @@ class VOCDetection(Dataset):
                 difficult = np.zeros((len(objs), 1), dtype=np.int32)
                 for i, obj in enumerate(objs):
                     pattern = re.compile('<name>', re.IGNORECASE)
-                    name_tag = pattern.findall(str(ET.tostringlist(obj)))[0][1:
-                                                                             -1]
+                    name_tag = pattern.findall(str(ET.tostringlist(obj)))[0][
+                        1:-1]
                     cname = obj.find(name_tag).text.strip()
                     gt_class[i][0] = cname2cid[cname]
                     pattern = re.compile('<difficult>', re.IGNORECASE)
@@ -229,6 +234,12 @@ class VOCDetection(Dataset):
         self.coco_gt.createIndex()
 
     def add_negative_samples(self, image_dir):
+        """Add background images (pure negative samples) to the training set.
+
+        Args:
+            image_dir (str): Directory that contains the background images.
+
+        """
         import cv2
         if not osp.exists(image_dir):
             raise Exception("{} background images directory does not exist.".
@@ -248,7 +259,7 @@ class VOCDetection(Dataset):
 
             max_img_id += 1
             im_fname = osp.join(image_dir, image)
-            img_data = cv2.imread(im_fname)
+            img_data = cv2.imread(im_fname, cv2.IMREAD_UNCHANGED)
             im_h, im_w, im_c = img_data.shape
             im_info = {
                 'im_id': np.array([max_img_id]).astype('int32'),
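With the new docstring and the `cv2.IMREAD_UNCHANGED` read, the method is used as below; the dataset paths and the transform pipeline are placeholders, not values from this PR:

```python
import paddlex as pdx
from paddlex.det import transforms

train_transforms = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.Normalize(),
])

train_dataset = pdx.datasets.VOCDetection(
    data_dir='dataset',
    file_list='dataset/train_list.txt',
    label_list='dataset/labels.txt',
    transforms=train_transforms)

# Mix pure-background images (no annotation files needed) into training
# as negative samples.
train_dataset.add_negative_samples(image_dir='dataset/background_images')
```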

+ 21 - 12
paddlex/cv/models/base.py

@@ -135,11 +135,13 @@ class BaseAPI:
                            batch_size=1,
                            batch_num=10,
                            cache_dir="./temp"):
+        input_channel = getattr(self, 'input_channel', 3)
         arrange_transforms(
             model_type=self.model_type,
             class_name=self.__class__.__name__,
             transforms=dataset.transforms,
-            mode='quant')
+            mode='quant',
+            input_channel=input_channel)
         dataset.num_samples = batch_size * batch_num
         try:
             from .slim.post_quantization import PaddleXPostTrainingQuantization
@@ -258,8 +260,8 @@ class BaseAPI:
             logging.info(
                 "Load pretrain weights from {}.".format(pretrain_weights),
                 use_color=True)
-            paddlex.utils.utils.load_pretrain_weights(self.exe, self.train_prog,
-                                                      pretrain_weights, fuse_bn)
+            paddlex.utils.utils.load_pretrain_weights(
+                self.exe, self.train_prog, pretrain_weights, fuse_bn)
         # 进行裁剪
         if sensitivities_file is not None:
             import paddleslim
@@ -365,7 +367,9 @@ class BaseAPI:
         logging.info("Model saved in {}.".format(save_dir))
 
     def export_inference_model(self, save_dir):
-        test_input_names = [var.name for var in list(self.test_inputs.values())]
+        test_input_names = [
+            var.name for var in list(self.test_inputs.values())
+        ]
         test_outputs = list(self.test_outputs.values())
         save_prog = self.test_prog.clone(for_test=True)
         with fluid.scope_guard(self.scope):
@@ -394,7 +398,8 @@ class BaseAPI:
 
         # 模型保存成功的标志
         open(osp.join(save_dir, '.success'), 'w').close()
-        logging.info("Model for inference deploy saved in {}.".format(save_dir))
+        logging.info("Model for inference deploy saved in {}.".format(
+            save_dir))
 
     def train_loop(self,
                    num_epochs,
@@ -418,11 +423,13 @@ class BaseAPI:
             from visualdl import LogWriter
             vdl_logdir = osp.join(save_dir, 'vdl_log')
         # 给transform添加arrange操作
+        input_channel = getattr(self, 'input_channel', 3)
         arrange_transforms(
             model_type=self.model_type,
             class_name=self.__class__.__name__,
             transforms=train_dataset.transforms,
-            mode='train')
+            mode='train',
+            input_channel=input_channel)
         # 构建train_data_loader
         self.build_train_data_loader(
             dataset=train_dataset, batch_size=train_batch_size)
@@ -524,11 +531,13 @@ class BaseAPI:
                         eta = ((num_epochs - i) * total_num_steps - step - 1
                                ) * avg_step_time
                     if time_eval_one_epoch is not None:
-                        eval_eta = (total_eval_times - i // save_interval_epochs
-                                    ) * time_eval_one_epoch
+                        eval_eta = (
+                            total_eval_times - i // save_interval_epochs
+                        ) * time_eval_one_epoch
                     else:
-                        eval_eta = (total_eval_times - i // save_interval_epochs
-                                    ) * total_num_steps_eval * avg_step_time
+                        eval_eta = (
+                            total_eval_times - i // save_interval_epochs
+                        ) * total_num_steps_eval * avg_step_time
                     eta_str = seconds_to_hms(eta + eval_eta)
 
                     logging.info(
@@ -577,8 +586,8 @@ class BaseAPI:
                                 if v.size > 1:
                                     continue
                             log_writer.add_scalar(
-                                "{}-Metrics/Eval(Epoch): {}".format(task_id, k),
-                                v, i + 1)
+                                "{}-Metrics/Eval(Epoch): {}".format(
+                                    task_id, k), v, i + 1)
                 self.save_model(save_dir=current_save_dir)
                 if getattr(self, 'use_ema', False):
                     self.exe.run(self.ema.restore_program)
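`getattr(self, 'input_channel', 3)` is used instead of a plain attribute access so that checkpoints created before the attribute existed still load and silently fall back to 3-channel input. A standalone illustration of the pattern (not PaddleX code):

```python
class LegacyModel:
    """Stands in for a model saved before input_channel was introduced."""
    pass

class MultiSpectralModel:
    """Stands in for a model constructed with the new argument."""
    input_channel = 4

for model in (LegacyModel(), MultiSpectralModel()):
    # Same fallback expression as used throughout this PR.
    channels = getattr(model, 'input_channel', 3)
    print(type(model).__name__, channels)   # LegacyModel 3 / MultiSpectralModel 4
```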

+ 17 - 5
paddlex/cv/models/deeplabv3p.py

@@ -459,12 +459,14 @@ class DeepLabv3p(BaseAPI):
                     transforms,
                     model_type,
                     class_name,
-                    thread_pool=None):
+                    thread_pool=None,
+                    input_channel=3):
         arrange_transforms(
             model_type=model_type,
             class_name=class_name,
             transforms=transforms,
-            mode='test')
+            mode='test',
+            input_channel=input_channel)
         if thread_pool is not None:
             batch_data = thread_pool.map(transforms, images)
         else:
@@ -523,8 +525,13 @@ class DeepLabv3p(BaseAPI):
 
         if transforms is None:
             transforms = self.test_transforms
+        input_channel = getattr(self, 'input_channel', 3)
         im, im_info = DeepLabv3p._preprocess(
-            images, transforms, self.model_type, self.__class__.__name__)
+            images,
+            transforms,
+            self.model_type,
+            self.__class__.__name__,
+            input_channel=input_channel)
 
         with fluid.scope_guard(self.scope):
             result = self.exe.run(self.test_prog,
@@ -553,9 +560,14 @@ class DeepLabv3p(BaseAPI):
             raise Exception("im_file must be list/tuple")
         if transforms is None:
             transforms = self.test_transforms
+        input_channel = getattr(self, 'input_channel', 3)
         im, im_info = DeepLabv3p._preprocess(
-            img_file_list, transforms, self.model_type,
-            self.__class__.__name__, self.thread_pool)
+            img_file_list,
+            transforms,
+            self.model_type,
+            self.__class__.__name__,
+            self.thread_pool,
+            input_channel=input_channel)
 
         with fluid.scope_guard(self.scope):
             result = self.exe.run(self.test_prog,

+ 139 - 16
paddlex/cv/models/faster_rcnn.py

@@ -35,10 +35,43 @@ class FasterRCNN(BaseAPI):
     Args:
         num_classes (int): Number of classes, including the background class. Defaults to 81.
         backbone (str): Backbone network of FasterRCNN, one of ['ResNet18', 'ResNet50',
-            'ResNet50_vd', 'ResNet101', 'ResNet101_vd', 'HRNet_W18']. Defaults to 'ResNet50'.
+            'ResNet50_vd', 'ResNet101', 'ResNet101_vd', 'HRNet_W18', 'ResNet50_vd_ssld']. Defaults to 'ResNet50'.
         with_fpn (bool): Whether to use the FPN structure. Defaults to True.
         aspect_ratios (list): Candidate aspect ratios for anchor generation. Defaults to [0.5, 1.0, 2.0].
         anchor_sizes (list): Candidate sizes for anchor generation. Defaults to [32, 64, 128, 256, 512].
+        with_dcn (bool): Whether to use deformable convolution network v2 in the backbone. Defaults to False.
+        rpn_cls_loss (str): Classification loss of the RPN, one of ['SigmoidCrossEntropy', 'SigmoidFocalLoss'].
+            When the model produces many false detections on background regions, consider 'SigmoidFocalLoss'
+            together with suitable `rpn_focal_loss_alpha` and `rpn_focal_loss_gamma`. Defaults to 'SigmoidCrossEntropy'.
+        rpn_focal_loss_alpha (float): When the RPN classification loss is 'SigmoidFocalLoss', the factor balancing
+            positive and negative samples. Defaults to 0.25. It has no effect when the RPN classification loss
+            is 'SigmoidCrossEntropy'.
+        rpn_focal_loss_gamma (float): When the RPN classification loss is 'SigmoidFocalLoss', the factor balancing
+            easy and hard samples. Defaults to 2. It has no effect when the RPN classification loss is
+            'SigmoidCrossEntropy'.
+        rcnn_bbox_loss (str): Bounding-box regression loss of the RCNN head, one of ['SmoothL1Loss', 'CIoULoss'].
+            Defaults to 'SmoothL1Loss'.
+        rcnn_nms (str): Non-maximum suppression method of the RCNN head, one of ['MultiClassNMS', 'MultiClassSoftNMS',
+            'MultiClassCiouNMS']. Defaults to 'MultiClassNMS'. With 'MultiClassNMS', reasonable settings are
+            `keep_top_k`=100, `nms_threshold`=0.5 and `score_threshold`=0.05. With 'MultiClassSoftNMS', consider
+            `keep_top_k`=300, `score_threshold`=0.01 and `softnms_sigma`=0.5. With 'MultiClassCiouNMS', consider
+            `keep_top_k`=100, `score_threshold`=0.05 and `nms_threshold`=0.5.
+        keep_top_k (int): Maximum number of detection boxes kept per image after NMS in the RCNN head. Defaults to 100.
+        nms_threshold (float): IoU threshold used to suppress detection boxes during NMS in the RCNN head.
+            It has no effect when `rcnn_nms` is 'MultiClassSoftNMS'. Defaults to 0.5.
+        score_threshold (float): Confidence threshold used to filter out low-confidence boxes before NMS in the
+            RCNN head. Defaults to 0.05.
+        softnms_sigma (float): When `rcnn_nms` is 'MultiClassSoftNMS', the factor used to rescale the confidence
+            of suppressed boxes as `score = score * weights, weights = exp(-(iou * iou) / softnms_sigma)`. Defaults to 0.5.
+        bbox_assigner (str): Sampling strategy for positive and negative proposals of the RCNN head during training,
+            one of ['BBoxAssigner', 'LibraBBoxAssigner']. 'LibraBBoxAssigner' usually works better when the objects
+            occupy only a small part of the image. Defaults to 'BBoxAssigner'.
+        fpn_num_channels (int): Number of channels of the FPN feature maps. Defaults to 256.
+        input_channel (int): Number of channels of the input image. Defaults to 3.
+        rpn_batch_size_per_im (int): Total number of positive and negative RPN samples per image during training.
+            Defaults to 256.
+        rpn_fg_fraction (float): Fraction of positive samples among the RPN samples of each image during training.
+            Defaults to 0.5.
+        test_pre_nms_top_n (int): Number of candidate boxes fed into NMS in the RPN during inference. If None,
+            it is set to 1000 when FPN is used and to 6000 otherwise. Defaults to None.
+        test_post_nms_top_n (int): Number of candidate boxes kept after NMS in the RPN during inference. Defaults to 1000.
     """
 
     def __init__(self,
@@ -46,12 +79,29 @@ class FasterRCNN(BaseAPI):
                  backbone='ResNet50',
                  with_fpn=True,
                  aspect_ratios=[0.5, 1.0, 2.0],
-                 anchor_sizes=[32, 64, 128, 256, 512]):
+                 anchor_sizes=[32, 64, 128, 256, 512],
+                 with_dcn=False,
+                 rpn_cls_loss='SigmoidCrossEntropy',
+                 rpn_focal_loss_alpha=0.25,
+                 rpn_focal_loss_gamma=2,
+                 rcnn_bbox_loss='SmoothL1Loss',
+                 rcnn_nms='MultiClassNMS',
+                 keep_top_k=100,
+                 nms_threshold=0.5,
+                 score_threshold=0.05,
+                 softnms_sigma=0.5,
+                 bbox_assigner='BBoxAssigner',
+                 fpn_num_channels=256,
+                 input_channel=3,
+                 rpn_batch_size_per_im=256,
+                 rpn_fg_fraction=0.5,
+                 test_pre_nms_top_n=None,
+                 test_post_nms_top_n=1000):
         self.init_params = locals()
         super(FasterRCNN, self).__init__('detector')
         backbones = [
             'ResNet18', 'ResNet50', 'ResNet50_vd', 'ResNet101', 'ResNet101_vd',
-            'HRNet_W18'
+            'HRNet_W18', 'ResNet50_vd_ssld'
         ]
         assert backbone in backbones, "backbone should be one of {}".format(
             backbones)
@@ -62,9 +112,30 @@ class FasterRCNN(BaseAPI):
         self.anchor_sizes = anchor_sizes
         self.labels = None
         self.fixed_input_shape = None
+        self.with_dcn = with_dcn
+        rpn_cls_losses = ['SigmoidFocalLoss', 'SigmoidCrossEntropy']
+        assert rpn_cls_loss in rpn_cls_losses, "rpn_cls_loss should be one of {}".format(
+            rpn_cls_losses)
+        self.rpn_cls_loss = rpn_cls_loss
+        self.rpn_focal_loss_alpha = rpn_focal_loss_alpha
+        self.rpn_focal_loss_gamma = rpn_focal_loss_gamma
+        self.rcnn_bbox_loss = rcnn_bbox_loss
+        self.rcnn_nms = rcnn_nms
+        self.keep_top_k = keep_top_k
+        self.nms_threshold = nms_threshold
+        self.score_threshold = score_threshold
+        self.softnms_sigma = softnms_sigma
+        self.bbox_assigner = bbox_assigner
+        self.fpn_num_channels = fpn_num_channels
+        self.input_channel = input_channel
+        self.rpn_batch_size_per_im = rpn_batch_size_per_im
+        self.rpn_fg_fraction = rpn_fg_fraction
+        self.test_pre_nms_top_n = test_pre_nms_top_n
+        self.test_post_nms_top_n = test_post_nms_top_n
 
     def _get_backbone(self, backbone_name):
         norm_type = None
+        lr_mult_list = [1.0, 1.0, 1.0, 1.0, 1.0]
         if backbone_name == 'ResNet18':
             layers = 18
             variant = 'b'
@@ -89,6 +160,11 @@ class FasterRCNN(BaseAPI):
             if self.with_fpn is False:
                 self.with_fpn = True
             return backbone
+        elif backbone_name == 'ResNet50_vd_ssld':
+            layers = 50
+            variant = 'd'
+            norm_type = 'bn'
+            lr_mult_list = [1.0, 0.05, 0.05, 0.1, 0.15]
         if self.with_fpn:
             backbone = paddlex.cv.nets.resnet.ResNet(
                 norm_type='bn' if norm_type is None else norm_type,
@@ -97,7 +173,9 @@ class FasterRCNN(BaseAPI):
                 freeze_norm=True,
                 norm_decay=0.,
                 feature_maps=[2, 3, 4, 5],
-                freeze_at=2)
+                freeze_at=2,
+                lr_mult_list=lr_mult_list,
+                dcn_v2_stages=[3, 4, 5] if self.with_dcn else [])
         else:
             backbone = paddlex.cv.nets.resnet.ResNet(
                 norm_type='affine_channel' if norm_type is None else norm_type,
@@ -106,12 +184,16 @@ class FasterRCNN(BaseAPI):
                 freeze_norm=True,
                 norm_decay=0.,
                 feature_maps=4,
-                freeze_at=2)
+                freeze_at=2,
+                lr_mult_list=lr_mult_list,
+                dcn_v2_stages=[3, 4, 5] if self.with_dcn else [])
         return backbone
 
     def build_net(self, mode='train'):
         train_pre_nms_top_n = 2000 if self.with_fpn else 12000
         test_pre_nms_top_n = 1000 if self.with_fpn else 6000
+        if self.test_pre_nms_top_n is not None:
+            test_pre_nms_top_n = self.test_pre_nms_top_n
         model = paddlex.cv.nets.detection.FasterRCNN(
             backbone=self._get_backbone(self.backbone),
             mode=mode,
@@ -121,7 +203,22 @@ class FasterRCNN(BaseAPI):
             anchor_sizes=self.anchor_sizes,
             train_pre_nms_top_n=train_pre_nms_top_n,
             test_pre_nms_top_n=test_pre_nms_top_n,
-            fixed_input_shape=self.fixed_input_shape)
+            fixed_input_shape=self.fixed_input_shape,
+            rpn_cls_loss=self.rpn_cls_loss,
+            rpn_focal_loss_alpha=self.rpn_focal_loss_alpha,
+            rpn_focal_loss_gamma=self.rpn_focal_loss_gamma,
+            rcnn_bbox_loss=self.rcnn_bbox_loss,
+            rcnn_nms=self.rcnn_nms,
+            keep_top_k=self.keep_top_k,
+            nms_threshold=self.nms_threshold,
+            score_threshold=self.score_threshold,
+            softnms_sigma=self.softnms_sigma,
+            bbox_assigner=self.bbox_assigner,
+            fpn_num_channels=self.fpn_num_channels,
+            input_channel=self.input_channel,
+            rpn_batch_size_per_im=self.rpn_batch_size_per_im,
+            rpn_fg_fraction=self.rpn_fg_fraction,
+            test_post_nms_top_n=self.test_post_nms_top_n)
         inputs = model.generate_inputs()
         if mode == 'train':
             model_out = model.build_net(inputs)
@@ -186,7 +283,9 @@ class FasterRCNN(BaseAPI):
               use_vdl=False,
               early_stop=False,
               early_stop_patience=5,
-              resume_checkpoint=None):
+              resume_checkpoint=None,
+              sensitivities_file=None,
+              eval_metric_loss=0.05):
         """训练。
 
         Args:
@@ -214,6 +313,9 @@ class FasterRCNN(BaseAPI):
             early_stop_patience (int): 当使用提前终止训练策略时,如果验证集精度在`early_stop_patience`个epoch内
                 连续下降或持平,则终止训练。默认值为5。
             resume_checkpoint (str): 恢复训练时指定上次训练保存的模型路径。若为None,则不会恢复训练。默认值为None。
+            sensitivities_file (str): If set to a path, the sensitivity information stored there is loaded for
+                pruning; if set to the string 'DEFAULT', sensitivity information computed on ImageNet images is
+                downloaded automatically and used for pruning; if None, no pruning is performed. Defaults to None.
+            eval_metric_loss (float): Tolerable loss of accuracy when pruning. Defaults to 0.05.
 
         Raises:
             ValueError: 评估类型不在指定列表中。
@@ -255,7 +357,9 @@ class FasterRCNN(BaseAPI):
             pretrain_weights=pretrain_weights,
             fuse_bn=fuse_bn,
             save_dir=save_dir,
-            resume_checkpoint=resume_checkpoint)
+            resume_checkpoint=resume_checkpoint,
+            sensitivities_file=sensitivities_file,
+            eval_metric_loss=eval_metric_loss)
 
         # 训练
         self.train_loop(
@@ -291,14 +395,17 @@ class FasterRCNN(BaseAPI):
             tuple (metrics, eval_details) /dict (metrics): 当return_details为True时,返回(metrics, eval_details),
                 当return_details为False时,返回metrics。metrics为dict,包含关键字:'bbox_mmap'或者’bbox_map‘,
                 分别表示平均准确率平均值在各个阈值下的结果取平均值的结果(mmAP)、平均准确率平均值(mAP)。
-                eval_details为dict,包含关键字:'bbox',对应元素预测结果列表,每个预测结果由图像id、
-                预测框类别id、预测框坐标、预测框得分;’gt‘:真实标注框相关信息。
+                eval_details is a dict with the keys 'bbox' and 'gt'. The value of 'bbox' is a list in which each
+                element is one prediction, itself a list of image id, predicted category id, predicted box
+                coordinates and prediction score. The value of 'gt' holds the ground-truth annotation information.
         """
+
+        input_channel = getattr(self, 'input_channel', 3)
         arrange_transforms(
             model_type=self.model_type,
             class_name=self.__class__.__name__,
             transforms=eval_dataset.transforms,
-            mode='eval')
+            mode='eval',
+            input_channel=input_channel)
         if metric is None:
             if hasattr(self, 'metric') and self.metric is not None:
                 metric = self.metric
@@ -376,12 +483,18 @@ class FasterRCNN(BaseAPI):
         return metrics
 
     @staticmethod
-    def _preprocess(images, transforms, model_type, class_name, thread_pool=None):
+    def _preprocess(images,
+                    transforms,
+                    model_type,
+                    class_name,
+                    thread_pool=None,
+                    input_channel=3):
         arrange_transforms(
             model_type=model_type,
             class_name=class_name,
             transforms=transforms,
-            mode='test')
+            mode='test',
+            input_channel=input_channel)
         if thread_pool is not None:
             batch_data = thread_pool.map(transforms, images)
         else:
@@ -429,8 +542,13 @@ class FasterRCNN(BaseAPI):
 
         if transforms is None:
             transforms = self.test_transforms
+        input_channel = getattr(self, 'input_channel', 3)
         im, im_resize_info, im_shape = FasterRCNN._preprocess(
-            images, transforms, self.model_type, self.__class__.__name__)
+            images,
+            transforms,
+            self.model_type,
+            self.__class__.__name__,
+            input_channel=input_channel)
 
         with fluid.scope_guard(self.scope):
             result = self.exe.run(self.test_prog,
@@ -476,9 +594,14 @@ class FasterRCNN(BaseAPI):
 
         if transforms is None:
             transforms = self.test_transforms
+        input_channel = getattr(self, 'input_channel', 3)
         im, im_resize_info, im_shape = FasterRCNN._preprocess(
-            img_file_list, transforms, self.model_type,
-            self.__class__.__name__, self.thread_pool)
+            img_file_list,
+            transforms,
+            self.model_type,
+            self.__class__.__name__,
+            self.thread_pool,
+            input_channel=input_channel)
 
         with fluid.scope_guard(self.scope):
             result = self.exe.run(self.test_prog,
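Taken together, the new arguments give a configuration surface like the following; a hedged construction/training sketch in which the dataset objects, class count and hyper-parameters are placeholders rather than values from this PR:

```python
import paddlex as pdx

# num_classes counts the background class, as the docstring above notes.
model = pdx.det.FasterRCNN(
    num_classes=11,
    backbone='ResNet50_vd_ssld',
    with_dcn=True,
    rpn_cls_loss='SigmoidFocalLoss',      # helps when background is often misdetected
    rcnn_nms='MultiClassSoftNMS',
    keep_top_k=300,
    score_threshold=0.01,
    softnms_sigma=0.5,
    bbox_assigner='LibraBBoxAssigner',    # objects small relative to the image
    input_channel=3)

model.train(
    num_epochs=50,
    train_dataset=train_dataset,          # assumed VOCDetection/CocoDetection datasets
    train_batch_size=2,
    eval_dataset=eval_dataset,
    learning_rate=0.0025,
    lr_decay_epochs=[30, 40],
    save_dir='output/faster_rcnn_r50_vd_ssld',
    sensitivities_file=None)              # 'DEFAULT' or a path enables pruning
```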

+ 28 - 9
paddlex/cv/models/mask_rcnn.py

@@ -38,6 +38,7 @@ class MaskRCNN(FasterRCNN):
         with_fpn (bool): 是否使用FPN结构。默认为True。
         aspect_ratios (list): 生成anchor高宽比的可选值。默认为[0.5, 1.0, 2.0]。
         anchor_sizes (list): 生成anchor大小的可选值。默认为[32, 64, 128, 256, 512]。
+        input_channel (int): Number of channels of the input image. Defaults to 3.
     """
 
     def __init__(self,
@@ -45,7 +46,8 @@ class MaskRCNN(FasterRCNN):
                  backbone='ResNet50',
                  with_fpn=True,
                  aspect_ratios=[0.5, 1.0, 2.0],
-                 anchor_sizes=[32, 64, 128, 256, 512]):
+                 anchor_sizes=[32, 64, 128, 256, 512],
+                 input_channel=3):
         self.init_params = locals()
         backbones = [
             'ResNet18', 'ResNet50', 'ResNet50_vd', 'ResNet101', 'ResNet101_vd',
@@ -64,6 +66,8 @@ class MaskRCNN(FasterRCNN):
         else:
             self.mask_head_resolution = 14
         self.fixed_input_shape = None
+        self.input_channel = input_channel
+        self.with_dcn = False
 
     def build_net(self, mode='train'):
         train_pre_nms_top_n = 2000 if self.with_fpn else 12000
@@ -78,7 +82,8 @@ class MaskRCNN(FasterRCNN):
             test_pre_nms_top_n=test_pre_nms_top_n,
             num_convs=num_convs,
             mask_head_resolution=self.mask_head_resolution,
-            fixed_input_shape=self.fixed_input_shape)
+            fixed_input_shape=self.fixed_input_shape,
+            input_channel=self.input_channel)
         inputs = model.generate_inputs()
         if mode == 'train':
             model_out = model.build_net(inputs)
@@ -251,15 +256,19 @@ class MaskRCNN(FasterRCNN):
                 当return_details为False时,返回metrics。metrics为dict,包含关键字:'bbox_mmap'和'segm_mmap'
                 或者’bbox_map‘和'segm_map',分别表示预测框和分割区域平均准确率平均值在
                 各个IoU阈值下的结果取平均值的结果(mmAP)、平均准确率平均值(mAP)。eval_details为dict,
-                包含关键字:'bbox',对应元素预测框结果列表,每个预测结果由图像id、预测框类别id、
-                预测框坐标、预测框得分;'mask',对应元素预测区域结果列表,每个预测结果由图像id、
-                预测区域类别id、预测区域坐标、预测区域得分;’gt‘:真实标注框和标注区域相关信息。
+                containing the keys 'bbox', 'mask' and 'gt'. The value of 'bbox' is a list in which each element
+                is one prediction, itself a list of image id, predicted category id, predicted box coordinates
+                and prediction score. The value of 'mask' is a list whose elements describe the segmentation of
+                each predicted box: image id, predicted category id, a binary map marking which pixels inside the
+                box belong to the object, and the prediction score. The value of 'gt' holds the ground-truth
+                annotation information.
         """
+        input_channel = getattr(self, 'input_channel', 3)
         arrange_transforms(
             model_type=self.model_type,
             class_name=self.__class__.__name__,
             transforms=eval_dataset.transforms,
-            mode='eval')
+            mode='eval',
+            input_channel=input_channel)
         if metric is None:
             if hasattr(self, 'metric') and self.metric is not None:
                 metric = self.metric
@@ -381,8 +390,13 @@ class MaskRCNN(FasterRCNN):
 
         if transforms is None:
             transforms = self.test_transforms
+        input_channel = getattr(self, 'input_channel', 3)
         im, im_resize_info, im_shape = FasterRCNN._preprocess(
-            images, transforms, self.model_type, self.__class__.__name__)
+            images,
+            transforms,
+            self.model_type,
+            self.__class__.__name__,
+            input_channel=input_channel)
 
         with fluid.scope_guard(self.scope):
             result = self.exe.run(self.test_prog,
@@ -429,9 +443,14 @@ class MaskRCNN(FasterRCNN):
 
         if transforms is None:
             transforms = self.test_transforms
+        input_channel = getattr(self, 'input_channel', 3)
         im, im_resize_info, im_shape = FasterRCNN._preprocess(
-            img_file_list, transforms, self.model_type,
-            self.__class__.__name__, self.thread_pool)
+            img_file_list,
+            transforms,
+            self.model_type,
+            self.__class__.__name__,
+            self.thread_pool,
+            input_channel=input_channel)
 
         with fluid.scope_guard(self.scope):
             result = self.exe.run(self.test_prog,
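The reworded return descriptions can be read against a small, hedged usage sketch; `model` and `eval_dataset` are assumed to exist from a previous training run:

```python
# return_details=True additionally returns the raw predictions described above.
metrics, eval_details = model.evaluate(
    eval_dataset, batch_size=1, return_details=True)

print(metrics)                        # e.g. {'bbox_mmap': ..., 'segm_mmap': ...} under the COCO metric
for pred in eval_details['bbox'][:5]:
    image_id, category_id, bbox, score = pred
    print(image_id, category_id, bbox, score)
gt_info = eval_details['gt']          # ground-truth annotations, COCO-style
```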

+ 30 - 9
paddlex/cv/models/ppyolo.py

@@ -58,6 +58,7 @@ class PPYOLO(BaseAPI):
         nms_iou_threshold (float): 进行NMS时,用于剔除检测框IOU的阈值。默认为0.45。
         label_smooth (bool): 是否使用label smooth。默认值为False。
         train_random_shapes (list|tuple): 训练时从列表中随机选择图像大小。默认值为[320, 352, 384, 416, 448, 480, 512, 544, 576, 608]。
+        input_channel (int): Number of channels of the input image. Defaults to 3.
     """
 
     def __init__(
@@ -85,7 +86,8 @@ class PPYOLO(BaseAPI):
             nms_iou_threshold=0.45,
             train_random_shapes=[
                 320, 352, 384, 416, 448, 480, 512, 544, 576, 608
-            ]):
+            ],
+            input_channel=3):
         self.init_params = locals()
         super(PPYOLO, self).__init__('detector')
         backbones = ['ResNet50_vd_ssld']
@@ -123,6 +125,7 @@ class PPYOLO(BaseAPI):
         self.use_matrix_nms = use_matrix_nms
         self.use_ema = False
         self.with_dcn_v2 = with_dcn_v2
+        self.input_channel = input_channel
 
         if paddle.__version__ < '1.8.4' and paddle.__version__ != '0.0.0':
             raise Exception(
@@ -163,7 +166,8 @@ class PPYOLO(BaseAPI):
             use_matrix_nms=self.use_matrix_nms,
             use_fine_grained_loss=self.use_fine_grained_loss,
             use_iou_loss=self.use_iou_loss,
-            batch_size=getattr(self, 'batch_size_per_gpu', 8))
+            batch_size=getattr(self, 'batch_size_per_gpu', None),
+            input_channel=self.input_channel)
         if mode == 'train' and self.use_iou_loss or self.use_iou_aware:
             model.max_height = self.max_height
             model.max_width = self.max_width
@@ -382,14 +386,16 @@ class PPYOLO(BaseAPI):
             tuple (metrics, eval_details) | dict (metrics): 当return_details为True时,返回(metrics, eval_details),
                 当return_details为False时,返回metrics。metrics为dict,包含关键字:'bbox_mmap'或者’bbox_map‘,
                 分别表示平均准确率平均值在各个IoU阈值下的结果取平均值的结果(mmAP)、平均准确率平均值(mAP)。
-                eval_details为dict,包含关键字:'bbox',对应元素预测结果列表,每个预测结果由图像id、
-                预测框类别id、预测框坐标、预测框得分;’gt‘:真实标注框相关信息。
+                eval_details is a dict with the keys 'bbox' and 'gt'. The value of 'bbox' is a list in which each
+                element is one prediction, itself a list of image id, predicted category id, predicted box
+                coordinates and prediction score. The value of 'gt' holds the ground-truth annotation information.
         """
+        input_channel = getattr(self, 'input_channel', 3)
         arrange_transforms(
             model_type=self.model_type,
             class_name=self.__class__.__name__,
             transforms=eval_dataset.transforms,
-            mode='eval')
+            mode='eval',
+            input_channel=input_channel)
         if metric is None:
             if hasattr(self, 'metric') and self.metric is not None:
                 metric = self.metric
@@ -454,12 +460,14 @@ class PPYOLO(BaseAPI):
                     transforms,
                     model_type,
                     class_name,
-                    thread_pool=None):
+                    thread_pool=None,
+                    input_channel=3):
         arrange_transforms(
             model_type=model_type,
             class_name=class_name,
             transforms=transforms,
-            mode='test')
+            mode='test',
+            input_channel=input_channel)
         if thread_pool is not None:
             batch_data = thread_pool.map(transforms, images)
         else:
@@ -508,8 +516,13 @@ class PPYOLO(BaseAPI):
 
         if transforms is None:
             transforms = self.test_transforms
-        im, im_size = PPYOLO._preprocess(images, transforms, self.model_type,
-                                         self.__class__.__name__)
+        input_channel = getattr(self, 'input_channel', 3)
+        im, im_size = PPYOLO._preprocess(
+            images,
+            transforms,
+            self.model_type,
+            self.__class__.__name__,
+            input_channel=input_channel)
 
         with fluid.scope_guard(self.scope):
             result = self.exe.run(self.test_prog,
@@ -549,6 +562,14 @@ class PPYOLO(BaseAPI):
 
         if transforms is None:
             transforms = self.test_transforms
+        input_channel = getattr(self, 'input_channel', 3)
+        im, im_size = PPYOLO._preprocess(
+            img_file_list,
+            transforms,
+            self.model_type,
+            self.__class__.__name__,
+            self.thread_pool,
+            input_channel=input_channel)
-        im, im_size = PPYOLO._preprocess(
-            img_file_list, transforms, self.model_type,
-            self.__class__.__name__, self.thread_pool)

+ 9 - 0
paddlex/cv/models/slim/prune_config.py

@@ -334,6 +334,15 @@ def get_prune_params(model):
         for i in params_not_prune:
             if i in prune_names:
                 prune_names.remove(i)
+    elif 'RCNN' in model_type:
+        for block in program.blocks:
+            for param in block.all_parameters():
+                pd_var = model.scope.find_var(param.name)
+                pd_param = pd_var.get_tensor()
+                if len(np.array(pd_param).shape) == 4:
+                    if 'fpn' in param.name or 'rpn' in param.name or 'fc' in param.name or 'cls' in param.name or 'bbox' in param.name:
+                        continue
+                    prune_names.append(param.name)
     else:
         raise Exception('The {} is not implemented yet!'.format(model_type))
     return prune_names
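The new RCNN branch keeps only 4-D convolution kernels whose names do not belong to the FPN, RPN or box/classification heads. A standalone sketch of that filter over made-up parameter names:

```python
# Hypothetical (name, shape) pairs standing in for the program's parameters.
params = [
    ('res2a_branch2a_weights', (64, 64, 3, 3)),
    ('fpn_inner_res5_sum_w', (256, 2048, 1, 1)),
    ('rpn_conv_3x3_w', (256, 256, 3, 3)),
    ('cls_score_w', (1024, 81)),
    ('res4f_branch2b_weights', (256, 256, 3, 3)),
]

skip_keywords = ('fpn', 'rpn', 'fc', 'cls', 'bbox')
prune_names = [
    name for name, shape in params
    if len(shape) == 4 and not any(key in name for key in skip_keywords)
]
print(prune_names)   # ['res2a_branch2a_weights', 'res4f_branch2b_weights']
```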

+ 239 - 0
paddlex/cv/models/utils/detection_eval.py

@@ -1,3 +1,4 @@
+# coding: utf8
 # copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -768,3 +769,241 @@ class DetectionMAP(object):
             accum_fp += 1 - int(pos)
             accum_fp_list.append(accum_fp)
         return accum_tp_list, accum_fp_list
+
+
+def makeplot(rs, ps, outDir, class_name, iou_type):
+    """针对某个特定类别,绘制不同评估要求下的准确率和召回率。
+       绘制结果说明参考COCODataset官网给出分析工具说明https://cocodataset.org/#detection-eval。
+
+       Refer to https://github.com/open-mmlab/mmdetection/blob/master/tools/coco_error_analysis.py
+
+       Args:
+           rs (np.array): 在不同置信度阈值下计算得到的召回率。
+           ps (np.array): 在不同置信度阈值下计算得到的准确率。ps与rs相同位置下的数值为同一个置信度阈值
+               计算得到的准确率与召回率。
+           outDir (str): 图表保存的路径。
+           class_name (str): 类别名。
+           iou_type (str): iou计算方式,若为检测框,则设置为'bbox',若为像素级分割结果,则设置为'segm'。
+
+    """
+
+    import matplotlib.pyplot as plt
+
+    cs = np.vstack([
+        np.ones((2, 3)), np.array([.31, .51, .74]), np.array([.75, .31, .30]),
+        np.array([.36, .90, .38]), np.array([.50, .39, .64]),
+        np.array([1, .6, 0])
+    ])
+    areaNames = ['allarea', 'small', 'medium', 'large']
+    types = ['C75', 'C50', 'Loc', 'Sim', 'Oth', 'BG', 'FN']
+    for i in range(len(areaNames)):
+        area_ps = ps[..., i, 0]
+        figure_tile = iou_type + '-' + class_name + '-' + areaNames[i]
+        aps = [ps_.mean() for ps_ in area_ps]
+        ps_curve = [
+            ps_.mean(axis=1) if ps_.ndim > 1 else ps_ for ps_ in area_ps
+        ]
+        ps_curve.insert(0, np.zeros(ps_curve[0].shape))
+        fig = plt.figure()
+        ax = plt.subplot(111)
+        for k in range(len(types)):
+            ax.plot(rs, ps_curve[k + 1], color=[0, 0, 0], linewidth=0.5)
+            ax.fill_between(
+                rs,
+                ps_curve[k],
+                ps_curve[k + 1],
+                color=cs[k],
+                label=str('[{:.3f}'.format(aps[k]) + ']' + types[k]))
+        plt.xlabel('recall')
+        plt.ylabel('precision')
+        plt.xlim(0, 1.)
+        plt.ylim(0, 1.)
+        plt.title(figure_tile)
+        plt.legend()
+        fig.savefig(outDir + '/{}.png'.format(figure_tile))
+        plt.close(fig)
+
+
+def analyze_individual_category(k, cocoDt, cocoGt, catId, iou_type):
+    """针对某个特定类别,分析忽略亚类混淆和类别混淆时的准确率。
+
+       Refer to https://github.com/open-mmlab/mmdetection/blob/master/tools/coco_error_analysis.py
+
+       Args:
+           k (int): 待分析类别的序号。
+           cocoDt (pycocotols.coco.COCO): 按COCO类存放的预测结果。
+           cocoGt (pycocotols.coco.COCO): 按COCO类存放的真值。
+           catId (int): 待分析类别在数据集中的类别id。
+           iou_type (str): iou计算方式,若为检测框,则设置为'bbox',若为像素级分割结果,则设置为'segm'。
+
+       Returns:
+           int:
+           dict: 有关键字'ps_supercategory'和'ps_allcategory'。关键字'ps_supercategory'的键值是忽略亚类间
+               混淆时的准确率,关键字'ps_allcategory'的键值是忽略类别间混淆时的准确率。
+
+    """
+
+    from pycocotools.coco import COCO
+    from pycocotools.cocoeval import COCOeval
+
+    nm = cocoGt.loadCats(catId)[0]
+    logging.info('--------------analyzing {}-{}---------------'.format(
+        k + 1, nm['name']))
+    ps_ = {}
+    dt = copy.deepcopy(cocoDt)
+    nm = cocoGt.loadCats(catId)[0]
+    imgIds = cocoGt.getImgIds()
+    dt_anns = dt.dataset['annotations']
+    select_dt_anns = []
+    for ann in dt_anns:
+        if ann['category_id'] == catId:
+            select_dt_anns.append(ann)
+    dt.dataset['annotations'] = select_dt_anns
+    dt.createIndex()
+    # compute precision but ignore superclass confusion
+    gt = copy.deepcopy(cocoGt)
+    child_catIds = gt.getCatIds(supNms=[nm['supercategory']])
+    for idx, ann in enumerate(gt.dataset['annotations']):
+        if (ann['category_id'] in child_catIds and
+                ann['category_id'] != catId):
+            gt.dataset['annotations'][idx]['ignore'] = 1
+            gt.dataset['annotations'][idx]['iscrowd'] = 1
+            gt.dataset['annotations'][idx]['category_id'] = catId
+    cocoEval = COCOeval(gt, copy.deepcopy(dt), iou_type)
+    cocoEval.params.imgIds = imgIds
+    cocoEval.params.maxDets = [100]
+    cocoEval.params.iouThrs = [.1]
+    cocoEval.params.useCats = 1
+    cocoEval.evaluate()
+    cocoEval.accumulate()
+    ps_supercategory = cocoEval.eval['precision'][0, :, k, :, :]
+    ps_['ps_supercategory'] = ps_supercategory
+    # compute precision but ignore any class confusion
+    gt = copy.deepcopy(cocoGt)
+    for idx, ann in enumerate(gt.dataset['annotations']):
+        if ann['category_id'] != catId:
+            gt.dataset['annotations'][idx]['ignore'] = 1
+            gt.dataset['annotations'][idx]['iscrowd'] = 1
+            gt.dataset['annotations'][idx]['category_id'] = catId
+    cocoEval = COCOeval(gt, copy.deepcopy(dt), iou_type)
+    cocoEval.params.imgIds = imgIds
+    cocoEval.params.maxDets = [100]
+    cocoEval.params.iouThrs = [.1]
+    cocoEval.params.useCats = 1
+    cocoEval.evaluate()
+    cocoEval.accumulate()
+    ps_allcategory = cocoEval.eval['precision'][0, :, k, :, :]
+    ps_['ps_allcategory'] = ps_allcategory
+    return k, ps_
+
+
+def coco_error_analysis(eval_details_file=None,
+                        gt=None,
+                        pred_bbox=None,
+                        pred_mask=None,
+                        save_dir='./output'):
+    """逐个分析模型预测错误的原因,并将分析结果以图表的形式展示。
+       分析结果说明参考COCODataset官网给出分析工具说明https://cocodataset.org/#detection-eval。
+
+       Refer to https://github.com/open-mmlab/mmdetection/blob/master/tools/coco_error_analysis.py
+
+       Args:
+           eval_details_file (str):  模型评估结果的保存路径,包含真值信息和预测结果。
+           gt (list): 数据集的真值信息。默认值为None。
+           pred_bbox (list): 模型在数据集上的预测框。默认值为None。
+           pred_mask (list): 模型在数据集上的预测mask。默认值为None。
+           save_dir (str): 可视化结果保存路径。默认值为'./output'。
+
+        Note:
+           eval_details_file的优先级更高,只要eval_details_file不为None,
+           就会从eval_details_file提取真值信息和预测结果做分析。
+           当eval_details_file为None时,则用gt、pred_mask、pred_mask做分析。
+
+    """
+
+    from multiprocessing import Pool
+    from pycocotools.coco import COCO
+    from pycocotools.cocoeval import COCOeval
+
+    if eval_details_file is not None:
+        import json
+        with open(eval_details_file, 'r') as f:
+            eval_details = json.load(f)
+            pred_bbox = eval_details['bbox']
+            if 'mask' in eval_details:
+                pred_mask = eval_details['mask']
+            gt = eval_details['gt']
+    if gt is None or pred_bbox is None:
+        raise Exception(
+            "gt/pred_bbox/pred_mask is None, please provide a valid eval_details_file or gt/pred_bbox/pred_mask."
+        )
+    if pred_bbox is not None and len(pred_bbox) == 0:
+        raise Exception("There is no predicted bbox.")
+    if pred_mask is not None and len(pred_mask) == 0:
+        raise Exception("There is no predicted mask.")
+
+    def _analyze_results(cocoGt, cocoDt, res_type, out_dir):
+        directory = os.path.dirname(out_dir + '/')
+        if not os.path.exists(directory):
+            logging.info('-------------create {}-----------------'.format(
+                out_dir))
+            os.makedirs(directory)
+
+        imgIds = cocoGt.getImgIds()
+        res_out_dir = out_dir + '/' + res_type + '/'
+        res_directory = os.path.dirname(res_out_dir)
+        if not os.path.exists(res_directory):
+            logging.info('-------------create {}-----------------'.format(
+                res_out_dir))
+            os.makedirs(res_directory)
+        iou_type = res_type
+        cocoEval = COCOeval(
+            copy.deepcopy(cocoGt), copy.deepcopy(cocoDt), iou_type)
+        cocoEval.params.imgIds = imgIds
+        cocoEval.params.iouThrs = [.75, .5, .1]
+        cocoEval.params.maxDets = [100]
+        cocoEval.evaluate()
+        cocoEval.accumulate()
+        ps = cocoEval.eval['precision']
+        ps = np.vstack([ps, np.zeros((4, *ps.shape[1:]))])
+        catIds = cocoGt.getCatIds()
+        recThrs = cocoEval.params.recThrs
+        with Pool(processes=48) as pool:
+            args = [(k, cocoDt, cocoGt, catId, iou_type)
+                    for k, catId in enumerate(catIds)]
+            analyze_results = pool.starmap(analyze_individual_category, args)
+        for k, catId in enumerate(catIds):
+            nm = cocoGt.loadCats(catId)[0]
+            logging.info('--------------saving {}-{}---------------'.format(
+                k + 1, nm['name']))
+            analyze_result = analyze_results[k]
+            assert k == analyze_result[0], ""
+            ps_supercategory = analyze_result[1]['ps_supercategory']
+            ps_allcategory = analyze_result[1]['ps_allcategory']
+            # compute precision but ignore superclass confusion
+            ps[3, :, k, :, :] = ps_supercategory
+            # compute precision but ignore any class confusion
+            ps[4, :, k, :, :] = ps_allcategory
+            # fill in background and false negative errors and plot
+            T, _, _, A, _ = ps.shape
+            for t in range(T):
+                for a in range(A):
+                    if np.sum(ps[t, :, k, a, :] ==
+                              -1) != len(ps[t, :, k, :, :]):
+                        ps[t, :, k, a, :][ps[t, :, k, a, :] == -1] = 0
+            ps[5, :, k, :, :] = (ps[4, :, k, :, :] > 0)
+            ps[6, :, k, :, :] = 1.0
+            makeplot(recThrs, ps[:, :, k], res_out_dir, nm['name'], iou_type)
+        makeplot(recThrs, ps, res_out_dir, 'allclass', iou_type)
+
+    coco_gt = COCO()
+    coco_gt.dataset = gt
+    coco_gt.createIndex()
+    from pycocotools.cocoeval import COCOeval
+    if pred_bbox is not None:
+        coco_dt = loadRes(coco_gt, pred_bbox)
+        _analyze_results(coco_gt, coco_dt, res_type='bbox', out_dir=save_dir)
+    if pred_mask is not None:
+        coco_dt = loadRes(coco_gt, pred_mask)
+        _analyze_results(coco_gt, coco_dt, res_type='segm', out_dir=save_dir)
+    logging.info("The analysis figures are saved in {}".format(save_dir))

+ 4 - 1
paddlex/cv/models/yolo_v3.py

@@ -38,6 +38,7 @@ class YOLOv3(PPYOLO):
         nms_iou_threshold (float): 进行NMS时,用于剔除检测框IoU的阈值。默认为0.45。
         label_smooth (bool): 是否使用label smooth。默认值为False。
         train_random_shapes (list|tuple): 训练时从列表中随机选择图像大小。默认值为[320, 352, 384, 416, 448, 480, 512, 544, 576, 608]。
+        input_channel (int): Number of channels of the input image. Defaults to 3.
     """
 
     def __init__(self,
@@ -53,7 +54,8 @@ class YOLOv3(PPYOLO):
                  label_smooth=False,
                  train_random_shapes=[
                      320, 352, 384, 416, 448, 480, 512, 544, 576, 608
-                 ]):
+                 ],
+                 input_channel=3):
         self.init_params = locals()
         backbones = [
             'DarkNet53', 'ResNet34', 'MobileNetV1', 'MobileNetV3_large'
@@ -84,6 +86,7 @@ class YOLOv3(PPYOLO):
         self.use_matrix_nms = False
         self.use_ema = False
         self.with_dcn_v2 = False
+        self.input_channel = input_channel
 
     def _get_backbone(self, backbone_name):
         if backbone_name == 'DarkNet53':
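With the forwarded argument, the channel count is declared at construction time; a minimal sketch in which the 4-channel input and the class count are placeholders, and the transforms feeding the model are assumed to produce arrays with the same number of channels:

```python
import paddlex as pdx

model = pdx.det.YOLOv3(
    num_classes=10,
    backbone='MobileNetV3_large',
    input_channel=4)   # e.g. RGB plus one extra band
```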

+ 109 - 59
paddlex/cv/nets/detection/bbox_head.py

@@ -24,6 +24,9 @@ from paddle.fluid.initializer import Normal, Xavier
 from paddle.fluid.regularizer import L2Decay
 from paddle.fluid.initializer import MSRA
 
+from .loss.diou_loss import DiouLoss
+from .ops import MultiClassNMS, MatrixNMS, MultiClassSoftNMS, MultiClassDiouNMS
+
 __all__ = ['BBoxHead', 'TwoFCHead']
 
 
@@ -42,23 +45,27 @@ class TwoFCHead(object):
     def __call__(self, roi_feat):
         fan = roi_feat.shape[1] * roi_feat.shape[2] * roi_feat.shape[3]
 
-        fc6 = fluid.layers.fc(
-            input=roi_feat,
-            size=self.mlp_dim,
-            act='relu',
-            name='fc6',
-            param_attr=ParamAttr(
-                name='fc6_w', initializer=Xavier(fan_out=fan)),
-            bias_attr=ParamAttr(
-                name='fc6_b', learning_rate=2., regularizer=L2Decay(0.)))
-        head_feat = fluid.layers.fc(
-            input=fc6,
-            size=self.mlp_dim,
-            act='relu',
-            name='fc7',
-            param_attr=ParamAttr(name='fc7_w', initializer=Xavier()),
-            bias_attr=ParamAttr(
-                name='fc7_b', learning_rate=2., regularizer=L2Decay(0.)))
+        fc6 = fluid.layers.fc(input=roi_feat,
+                              size=self.mlp_dim,
+                              act='relu',
+                              name='fc6',
+                              param_attr=ParamAttr(
+                                  name='fc6_w',
+                                  initializer=Xavier(fan_out=fan)),
+                              bias_attr=ParamAttr(
+                                  name='fc6_b',
+                                  learning_rate=2.,
+                                  regularizer=L2Decay(0.)))
+        head_feat = fluid.layers.fc(input=fc6,
+                                    size=self.mlp_dim,
+                                    act='relu',
+                                    name='fc7',
+                                    param_attr=ParamAttr(
+                                        name='fc7_w', initializer=Xavier()),
+                                    bias_attr=ParamAttr(
+                                        name='fc7_b',
+                                        learning_rate=2.,
+                                        regularizer=L2Decay(0.)))
 
         return head_feat
 
@@ -73,6 +80,7 @@ class BBoxHead(object):
             box_normalized=False,
             axis=1,
             #MultiClassNMS
+            rcnn_nms='MultiClassNMS',
             score_threshold=.05,
             nms_top_k=-1,
             keep_top_k=100,
@@ -80,25 +88,63 @@ class BBoxHead(object):
             normalized=False,
             nms_eta=1.0,
             background_label=0,
+            post_threshold=.05,
+            softnms_sigma=0.5,
             #bbox_loss
             sigma=1.0,
-            num_classes=81):
+            num_classes=81,
+            rcnn_bbox_loss='SmoothL1Loss',
+            diouloss_weight=10.0,
+            diouloss_is_cls_agnostic=False,
+            diouloss_use_complete_iou_loss=True):
         super(BBoxHead, self).__init__()
         self.head = head
         self.prior_box_var = prior_box_var
         self.code_type = code_type
         self.box_normalized = box_normalized
         self.axis = axis
-        self.score_threshold = score_threshold
-        self.nms_top_k = nms_top_k
-        self.keep_top_k = keep_top_k
-        self.nms_threshold = nms_threshold
-        self.normalized = normalized
-        self.nms_eta = nms_eta
-        self.background_label = background_label
         self.sigma = sigma
         self.num_classes = num_classes
         self.head_feat = None
+        self.rcnn_bbox_loss = rcnn_bbox_loss
+        self.diouloss_weight = diouloss_weight
+        self.diouloss_is_cls_agnostic = diouloss_is_cls_agnostic
+        self.diouloss_use_complete_iou_loss = diouloss_use_complete_iou_loss
+        if self.rcnn_bbox_loss == 'CIoULoss':
+            self.diou_loss = DiouLoss(
+                loss_weight=self.diouloss_weight,
+                is_cls_agnostic=self.diouloss_is_cls_agnostic,
+                num_classes=num_classes,
+                use_complete_iou_loss=self.diouloss_use_complete_iou_loss)
+        if rcnn_nms == 'MultiClassNMS':
+            self.nms = MultiClassNMS(
+                score_threshold=score_threshold,
+                keep_top_k=keep_top_k,
+                nms_threshold=nms_threshold,
+                normalized=normalized,
+                nms_eta=nms_eta,
+                background_label=background_label)
+        elif rcnn_nms == 'MultiClassSoftNMS':
+            self.nms = MultiClassSoftNMS(
+                score_threshold=score_threshold,
+                keep_top_k=keep_top_k,
+                softnms_sigma=softnms_sigma,
+                normalized=normalized,
+                background_label=background_label)
+        elif rcnn_nms == 'MatrixNMS':
+            self.nms = MatrixNMS(
+                score_threshold=score_threshold,
+                post_threshold=post_threshold,
+                keep_top_k=keep_top_k,
+                normalized=normalized,
+                background_label=background_label)
+        elif rcnn_nms == 'MultiClassCiouNMS':
+            self.nms = MultiClassDiouNMS(
+                score_threshold=score_threshold,
+                keep_top_k=keep_top_k,
+                nms_threshold=nms_threshold,
+                normalized=normalized,
+                background_label=background_label)
 
     def get_head_feat(self, input=None):
         """
@@ -130,24 +176,30 @@ class BBoxHead(object):
         if not isinstance(self.head, TwoFCHead):
             head_feat = fluid.layers.pool2d(
                 head_feat, pool_type='avg', global_pooling=True)
-        cls_score = fluid.layers.fc(
-            input=head_feat,
-            size=self.num_classes,
-            act=None,
-            name='cls_score',
-            param_attr=ParamAttr(
-                name='cls_score_w', initializer=Normal(loc=0.0, scale=0.01)),
-            bias_attr=ParamAttr(
-                name='cls_score_b', learning_rate=2., regularizer=L2Decay(0.)))
-        bbox_pred = fluid.layers.fc(
-            input=head_feat,
-            size=4 * self.num_classes,
-            act=None,
-            name='bbox_pred',
-            param_attr=ParamAttr(
-                name='bbox_pred_w', initializer=Normal(loc=0.0, scale=0.001)),
-            bias_attr=ParamAttr(
-                name='bbox_pred_b', learning_rate=2., regularizer=L2Decay(0.)))
+        cls_score = fluid.layers.fc(input=head_feat,
+                                    size=self.num_classes,
+                                    act=None,
+                                    name='cls_score',
+                                    param_attr=ParamAttr(
+                                        name='cls_score_w',
+                                        initializer=Normal(
+                                            loc=0.0, scale=0.01)),
+                                    bias_attr=ParamAttr(
+                                        name='cls_score_b',
+                                        learning_rate=2.,
+                                        regularizer=L2Decay(0.)))
+        bbox_pred = fluid.layers.fc(input=head_feat,
+                                    size=4 * self.num_classes,
+                                    act=None,
+                                    name='bbox_pred',
+                                    param_attr=ParamAttr(
+                                        name='bbox_pred_w',
+                                        initializer=Normal(
+                                            loc=0.0, scale=0.001)),
+                                    bias_attr=ParamAttr(
+                                        name='bbox_pred_b',
+                                        learning_rate=2.,
+                                        regularizer=L2Decay(0.)))
         return cls_score, bbox_pred
 
     def get_loss(self, roi_feat, labels_int32, bbox_targets,
@@ -179,12 +231,19 @@ class BBoxHead(object):
         loss_cls = fluid.layers.softmax_with_cross_entropy(
             logits=cls_score, label=labels_int64, numeric_stable_mode=True)
         loss_cls = fluid.layers.reduce_mean(loss_cls)
-        loss_bbox = fluid.layers.smooth_l1(
-            x=bbox_pred,
-            y=bbox_targets,
-            inside_weight=bbox_inside_weights,
-            outside_weight=bbox_outside_weights,
-            sigma=self.sigma)
+        if self.rcnn_bbox_loss == 'SmoothL1Loss':
+            loss_bbox = fluid.layers.smooth_l1(
+                x=bbox_pred,
+                y=bbox_targets,
+                inside_weight=bbox_inside_weights,
+                outside_weight=bbox_outside_weights,
+                sigma=self.sigma)
+        elif self.rcnn_bbox_loss == 'CIoULoss':
+            loss_bbox = self.diou_loss(
+                x=bbox_pred,
+                y=bbox_targets,
+                inside_weight=bbox_inside_weights,
+                outside_weight=bbox_outside_weights)
         loss_bbox = fluid.layers.reduce_mean(loss_bbox)
         return {'loss_cls': loss_cls, 'loss_bbox': loss_bbox}
 
@@ -229,14 +288,5 @@ class BBoxHead(object):
         cliped_box = fluid.layers.box_clip(input=decoded_box, im_info=im_shape)
         if return_box_score:
             return {'bbox': cliped_box, 'score': cls_prob}
-        pred_result = fluid.layers.multiclass_nms(
-            bboxes=cliped_box,
-            scores=cls_prob,
-            score_threshold=self.score_threshold,
-            nms_top_k=self.nms_top_k,
-            keep_top_k=self.keep_top_k,
-            nms_threshold=self.nms_threshold,
-            normalized=self.normalized,
-            nms_eta=self.nms_eta,
-            background_label=self.background_label)
+        pred_result = self.nms(bboxes=cliped_box, scores=cls_prob)
         return {'bbox': pred_result}
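Of the dispatched variants, `MultiClassSoftNMS` rescales overlapping boxes instead of dropping them; the weighting quoted in the FasterRCNN docstring, `score = score * exp(-(iou * iou) / softnms_sigma)`, can be checked numerically:

```python
import numpy as np

softnms_sigma = 0.5
ious = np.array([0.1, 0.3, 0.5, 0.7, 0.9])
weights = np.exp(-(ious * ious) / softnms_sigma)
print(np.round(weights, 3))   # [0.98  0.835 0.607 0.375 0.198]
```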

+ 63 - 16
paddlex/cv/nets/detection/faster_rcnn.py

@@ -26,6 +26,8 @@ from .rpn_head import (RPNHead, FPNRPNHead)
 from .roi_extractor import (RoIAlign, FPNRoIAlign)
 from .bbox_head import (BBoxHead, TwoFCHead)
 from ..resnet import ResNetC5
+from .loss.diou_loss import DiouLoss
+from .ops import BBoxAssigner, LibraBBoxAssigner
 
 __all__ = ['FasterRCNN']
 
@@ -44,6 +46,7 @@ class FasterRCNN(object):
     def __init__(
             self,
             backbone,
+            input_channel=3,
             mode='train',
             num_classes=81,
             with_fpn=False,
@@ -63,6 +66,9 @@ class FasterRCNN(object):
             test_pre_nms_top_n=6000,
             test_post_nms_top_n=1000,
             test_nms_thresh=0.7,
+            rpn_cls_loss='SigmoidCrossEntropy',
+            rpn_focal_loss_alpha=0.25,
+            rpn_focal_loss_gamma=2,
             #roi_extractor
             roi_extractor=None,
             #bbox_head
@@ -70,6 +76,9 @@ class FasterRCNN(object):
             keep_top_k=100,
             nms_threshold=0.5,
             score_threshold=0.05,
+            rcnn_nms='MultiClassNMS',
+            softnms_sigma=0.5,
+            post_threshold=.05,
             #bbox_assigner
             batch_size_per_im=512,
             fg_fraction=.25,
@@ -77,7 +86,13 @@ class FasterRCNN(object):
             bg_thresh_hi=.5,
             bg_thresh_lo=0.,
             bbox_reg_weights=[0.1, 0.1, 0.2, 0.2],
-            fixed_input_shape=None):
+            fixed_input_shape=None,
+            rcnn_bbox_loss='SmoothL1Loss',
+            diouloss_weight=10.0,
+            diouloss_is_cls_agnostic=False,
+            diouloss_use_complete_iou_loss=True,
+            bbox_assigner='BBoxAssigner',
+            fpn_num_channels=256):
         super(FasterRCNN, self).__init__()
         self.backbone = backbone
         self.mode = mode
@@ -89,6 +104,8 @@ class FasterRCNN(object):
             else:
                 fpn = FPN()
         self.fpn = fpn
+        if self.fpn is not None:
+            self.fpn.num_chan = fpn_num_channels
         self.num_classes = num_classes
         if rpn_head is None:
             if self.fpn is None:
@@ -104,7 +121,10 @@ class FasterRCNN(object):
                     train_nms_thresh=train_nms_thresh,
                     test_pre_nms_top_n=test_pre_nms_top_n,
                     test_post_nms_top_n=test_post_nms_top_n,
-                    test_nms_thresh=test_nms_thresh)
+                    test_nms_thresh=test_nms_thresh,
+                    rpn_cls_loss=rpn_cls_loss,
+                    rpn_focal_loss_alpha=rpn_focal_loss_alpha,
+                    rpn_focal_loss_gamma=rpn_focal_loss_gamma)
             else:
                 rpn_head = FPNRPNHead(
                     anchor_start_size=anchor_sizes[0],
@@ -121,7 +141,10 @@ class FasterRCNN(object):
                     train_nms_thresh=train_nms_thresh,
                     test_pre_nms_top_n=test_pre_nms_top_n,
                     test_post_nms_top_n=test_post_nms_top_n,
-                    test_nms_thresh=test_nms_thresh)
+                    test_nms_thresh=test_nms_thresh,
+                    rpn_cls_loss=rpn_cls_loss,
+                    rpn_focal_loss_alpha=rpn_focal_loss_alpha,
+                    rpn_focal_loss_gamma=rpn_focal_loss_gamma)
         self.rpn_head = rpn_head
         if roi_extractor is None:
             if self.fpn is None:
@@ -145,7 +168,15 @@ class FasterRCNN(object):
                 keep_top_k=keep_top_k,
                 nms_threshold=nms_threshold,
                 score_threshold=score_threshold,
-                num_classes=num_classes)
+                rcnn_nms=rcnn_nms,
+                softnms_sigma=softnms_sigma,
+                post_threshold=post_threshold,
+                num_classes=num_classes,
+                rcnn_bbox_loss=rcnn_bbox_loss,
+                diouloss_weight=diouloss_weight,
+                diouloss_is_cls_agnostic=diouloss_is_cls_agnostic,
+                diouloss_use_complete_iou_loss=diouloss_use_complete_iou_loss)
+
         self.bbox_head = bbox_head
         self.batch_size_per_im = batch_size_per_im
         self.fg_fraction = fg_fraction
@@ -155,6 +186,27 @@ class FasterRCNN(object):
         self.bbox_reg_weights = bbox_reg_weights
         self.rpn_only = rpn_only
         self.fixed_input_shape = fixed_input_shape
+        if bbox_assigner == 'BBoxAssigner':
+            self.bbox_assigner = BBoxAssigner(
+                batch_size_per_im=batch_size_per_im,
+                fg_fraction=fg_fraction,
+                fg_thresh=fg_thresh,
+                bg_thresh_hi=bg_thresh_hi,
+                bg_thresh_lo=bg_thresh_lo,
+                bbox_reg_weights=bbox_reg_weights,
+                num_classes=num_classes,
+                shuffle_before_sample=self.rpn_head.use_random)
+        elif bbox_assigner == 'LibraBBoxAssigner':
+            self.bbox_assigner = LibraBBoxAssigner(
+                batch_size_per_im=batch_size_per_im,
+                fg_fraction=fg_fraction,
+                fg_thresh=fg_thresh,
+                bg_thresh_hi=bg_thresh_hi,
+                bg_thresh_lo=bg_thresh_lo,
+                bbox_reg_weights=bbox_reg_weights,
+                num_classes=num_classes,
+                shuffle_before_sample=self.rpn_head.use_random)
+        self.input_channel = input_channel
 
     def build_net(self, inputs):
         im = inputs['image']
@@ -175,20 +227,12 @@ class FasterRCNN(object):
 
         if self.mode == 'train':
             rpn_loss = self.rpn_head.get_loss(im_info, gt_bbox, is_crowd)
-            outputs = fluid.layers.generate_proposal_labels(
+            outputs = self.bbox_assigner(
                 rpn_rois=rois,
                 gt_classes=inputs['gt_label'],
                 is_crowd=inputs['is_crowd'],
                 gt_boxes=inputs['gt_box'],
-                im_info=inputs['im_info'],
-                batch_size_per_im=self.batch_size_per_im,
-                fg_fraction=self.fg_fraction,
-                fg_thresh=self.fg_thresh,
-                bg_thresh_hi=self.bg_thresh_hi,
-                bg_thresh_lo=self.bg_thresh_lo,
-                bbox_reg_weights=self.bbox_reg_weights,
-                class_nums=self.num_classes,
-                use_random=self.rpn_head.use_random)
+                im_info=inputs['im_info'])
 
             rois = outputs[0]
             labels_int32 = outputs[1]
@@ -229,13 +273,16 @@ class FasterRCNN(object):
 
         if self.fixed_input_shape is not None:
             input_shape = [
-                None, 3, self.fixed_input_shape[1], self.fixed_input_shape[0]
+                None, self.input_channel, self.fixed_input_shape[1],
+                self.fixed_input_shape[0]
             ]
             inputs['image'] = fluid.data(
                 dtype='float32', shape=input_shape, name='image')
         else:
             inputs['image'] = fluid.data(
-                dtype='float32', shape=[None, 3, None, None], name='image')
+                dtype='float32',
+                shape=[None, self.input_channel, None, None],
+                name='image')
         if self.mode == 'train':
             inputs['im_info'] = fluid.data(
                 dtype='float32', shape=[None, 3], name='im_info')
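
With the changes above, the network-level FasterRCNN picks its box assigner by name ('BBoxAssigner' or 'LibraBBoxAssigner', both defined in ops.py below) and builds its image placeholder from input_channel. The following standalone sketch only illustrates the placeholder-shape logic; image_input_shape is a made-up helper, not a function from this repository. Note that fixed_input_shape is given as (width, height) while the placeholder is laid out as [N, C, H, W].

def image_input_shape(input_channel=3, fixed_input_shape=None):
    # fixed_input_shape is (width, height); the placeholder is [N, C, H, W].
    if fixed_input_shape is not None:
        width, height = fixed_input_shape
        return [None, input_channel, height, width]
    return [None, input_channel, None, None]

print(image_input_shape(input_channel=1, fixed_input_shape=(800, 600)))  # [None, 1, 600, 800]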

+ 121 - 0
paddlex/cv/nets/detection/loss/diou_loss.py

@@ -0,0 +1,121 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from paddle import fluid
+from .giou_loss import GiouLoss
+
+__all__ = ['DiouLoss']
+
+
+class DiouLoss(GiouLoss):
+    """
+    Distance-IoU Loss, see https://arxiv.org/abs/1911.08287
+    Args:
+        loss_weight (float): diou loss weight, default as 10 in faster-rcnn
+        is_cls_agnostic (bool): flag of class-agnostic
+        num_classes (int): class num
+        use_complete_iou_loss (bool): whether to use complete iou loss
+    """
+
+    def __init__(self,
+                 loss_weight=10.,
+                 is_cls_agnostic=False,
+                 num_classes=81,
+                 use_complete_iou_loss=True):
+        super(DiouLoss, self).__init__(
+            loss_weight=loss_weight,
+            is_cls_agnostic=is_cls_agnostic,
+            num_classes=num_classes)
+        self.use_complete_iou_loss = use_complete_iou_loss
+
+    def __call__(self,
+                 x,
+                 y,
+                 inside_weight=None,
+                 outside_weight=None,
+                 bbox_reg_weight=[0.1, 0.1, 0.2, 0.2]):
+        eps = 1.e-10
+        x1, y1, x2, y2 = self.bbox_transform(x, bbox_reg_weight)
+        x1g, y1g, x2g, y2g = self.bbox_transform(y, bbox_reg_weight)
+
+        cx = (x1 + x2) / 2
+        cy = (y1 + y2) / 2
+        w = x2 - x1
+        h = y2 - y1
+
+        cxg = (x1g + x2g) / 2
+        cyg = (y1g + y2g) / 2
+        wg = x2g - x1g
+        hg = y2g - y1g
+
+        x2 = fluid.layers.elementwise_max(x1, x2)
+        y2 = fluid.layers.elementwise_max(y1, y2)
+
+        # intersection of A and B
+        xkis1 = fluid.layers.elementwise_max(x1, x1g)
+        ykis1 = fluid.layers.elementwise_max(y1, y1g)
+        xkis2 = fluid.layers.elementwise_min(x2, x2g)
+        ykis2 = fluid.layers.elementwise_min(y2, y2g)
+
+        # smallest box enclosing A and B
+        xc1 = fluid.layers.elementwise_min(x1, x1g)
+        yc1 = fluid.layers.elementwise_min(y1, y1g)
+        xc2 = fluid.layers.elementwise_max(x2, x2g)
+        yc2 = fluid.layers.elementwise_max(y2, y2g)
+
+        intsctk = (xkis2 - xkis1) * (ykis2 - ykis1)
+        intsctk = intsctk * fluid.layers.greater_than(
+            xkis2, xkis1) * fluid.layers.greater_than(ykis2, ykis1)
+        unionk = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g
+                                                        ) - intsctk + eps
+        iouk = intsctk / unionk
+
+        # DIOU term
+        dist_intersection = (cx - cxg) * (cx - cxg) + (cy - cyg) * (cy - cyg)
+        dist_union = (xc2 - xc1) * (xc2 - xc1) + (yc2 - yc1) * (yc2 - yc1)
+        diou_term = (dist_intersection + eps) / (dist_union + eps)
+
+        # CIOU term
+        ciou_term = 0
+        if self.use_complete_iou_loss:
+            ar_gt = wg / hg
+            ar_pred = w / h
+            arctan = fluid.layers.atan(ar_gt) - fluid.layers.atan(ar_pred)
+            ar_loss = 4. / np.pi / np.pi * arctan * arctan
+            alpha = ar_loss / (1 - iouk + ar_loss + eps)
+            alpha.stop_gradient = True
+            ciou_term = alpha * ar_loss
+
+        iou_weights = 1
+        if inside_weight is not None and outside_weight is not None:
+            inside_weight = fluid.layers.reshape(inside_weight, shape=(-1, 4))
+            outside_weight = fluid.layers.reshape(
+                outside_weight, shape=(-1, 4))
+
+            inside_weight = fluid.layers.reduce_mean(inside_weight, dim=1)
+            outside_weight = fluid.layers.reduce_mean(outside_weight, dim=1)
+
+            iou_weights = inside_weight * outside_weight
+
+        class_weight = 2 if self.is_cls_agnostic else self.num_classes
+        diou = fluid.layers.reduce_mean(
+            (1 - iouk + ciou_term + diou_term) * iou_weights) * class_weight
+
+        return diou * self.loss_weight
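
The class above optimizes 1 - IoU plus a center-distance penalty (and, when use_complete_iou_loss is set, an aspect-ratio term). Below is a standalone NumPy check of that per-box quantity, for illustration only; it skips the bbox-delta decoding and class weighting that DiouLoss applies to network outputs.

import numpy as np

def diou_penalty(box, gt, complete=True, eps=1e-10):
    # box, gt: [x1, y1, x2, y2]; returns 1 - IoU + DIoU term (+ CIoU term).
    x1, y1, x2, y2 = box
    x1g, y1g, x2g, y2g = gt
    iw = max(0.0, min(x2, x2g) - max(x1, x1g))
    ih = max(0.0, min(y2, y2g) - max(y1, y1g))
    inter = iw * ih
    union = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g) - inter + eps
    iou = inter / union
    # DIoU term: squared center distance over squared diagonal of the enclosing box
    center_dist = (((x1 + x2) - (x1g + x2g)) ** 2
                   + ((y1 + y2) - (y1g + y2g)) ** 2) / 4
    xc1, yc1 = min(x1, x1g), min(y1, y1g)
    xc2, yc2 = max(x2, x2g), max(y2, y2g)
    diou_term = (center_dist + eps) / ((xc2 - xc1) ** 2 + (yc2 - yc1) ** 2 + eps)
    # optional CIoU aspect-ratio term
    ciou_term = 0.0
    if complete:
        v = 4 / np.pi ** 2 * (np.arctan((x2g - x1g) / (y2g - y1g))
                              - np.arctan((x2 - x1) / (y2 - y1))) ** 2
        alpha = v / (1 - iou + v + eps)
        ciou_term = alpha * v
    return 1 - iou + diou_term + ciou_term

print(diou_penalty([0, 0, 10, 10], [2, 2, 12, 12]))  # ~0.56: overlapping boxes, centers offset
print(diou_penalty([0, 0, 10, 10], [0, 0, 10, 10]))  # ~0.0: perfect match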

+ 145 - 0
paddlex/cv/nets/detection/loss/giou_loss.py

@@ -0,0 +1,145 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from paddle import fluid
+
+__all__ = ['GiouLoss']
+
+
+class GiouLoss(object):
+    '''
+    Generalized Intersection over Union, see https://arxiv.org/abs/1902.09630
+    Args:
+        loss_weight (float): giou loss weight, default as 10 in faster-rcnn
+        is_cls_agnostic (bool): flag of class-agnostic
+        num_classes (int): class num
+        do_average (bool): whether to average the loss
+        use_class_weight(bool): whether to use class weight
+    '''
+
+    def __init__(self,
+                 loss_weight=10.,
+                 is_cls_agnostic=False,
+                 num_classes=81,
+                 do_average=True,
+                 use_class_weight=True):
+        super(GiouLoss, self).__init__()
+        self.loss_weight = loss_weight
+        self.is_cls_agnostic = is_cls_agnostic
+        self.num_classes = num_classes
+        self.do_average = do_average
+        self.class_weight = 2 if is_cls_agnostic else num_classes
+        self.use_class_weight = use_class_weight
+
+    # deltas: NxMx4
+    def bbox_transform(self, deltas, weights):
+        wx, wy, ww, wh = weights
+
+        deltas = fluid.layers.reshape(deltas, shape=(0, -1, 4))
+
+        dx = fluid.layers.slice(deltas, axes=[2], starts=[0], ends=[1]) * wx
+        dy = fluid.layers.slice(deltas, axes=[2], starts=[1], ends=[2]) * wy
+        dw = fluid.layers.slice(deltas, axes=[2], starts=[2], ends=[3]) * ww
+        dh = fluid.layers.slice(deltas, axes=[2], starts=[3], ends=[4]) * wh
+
+        dw = fluid.layers.clip(dw, -1.e10, np.log(1000. / 16))
+        dh = fluid.layers.clip(dh, -1.e10, np.log(1000. / 16))
+
+        pred_ctr_x = dx
+        pred_ctr_y = dy
+        pred_w = fluid.layers.exp(dw)
+        pred_h = fluid.layers.exp(dh)
+
+        x1 = pred_ctr_x - 0.5 * pred_w
+        y1 = pred_ctr_y - 0.5 * pred_h
+        x2 = pred_ctr_x + 0.5 * pred_w
+        y2 = pred_ctr_y + 0.5 * pred_h
+
+        x1 = fluid.layers.reshape(x1, shape=(-1, ))
+        y1 = fluid.layers.reshape(y1, shape=(-1, ))
+        x2 = fluid.layers.reshape(x2, shape=(-1, ))
+        y2 = fluid.layers.reshape(y2, shape=(-1, ))
+
+        return x1, y1, x2, y2
+
+    def __call__(self,
+                 x,
+                 y,
+                 inside_weight=None,
+                 outside_weight=None,
+                 bbox_reg_weight=[0.1, 0.1, 0.2, 0.2],
+                 use_transform=True):
+        eps = 1.e-10
+        if use_transform:
+            x1, y1, x2, y2 = self.bbox_transform(x, bbox_reg_weight)
+            x1g, y1g, x2g, y2g = self.bbox_transform(y, bbox_reg_weight)
+        else:
+            x1, y1, x2, y2 = fluid.layers.split(x, num_or_sections=4, dim=1)
+            x1g, y1g, x2g, y2g = fluid.layers.split(
+                y, num_or_sections=4, dim=1)
+
+        x2 = fluid.layers.elementwise_max(x1, x2)
+        y2 = fluid.layers.elementwise_max(y1, y2)
+
+        xkis1 = fluid.layers.elementwise_max(x1, x1g)
+        ykis1 = fluid.layers.elementwise_max(y1, y1g)
+        xkis2 = fluid.layers.elementwise_min(x2, x2g)
+        ykis2 = fluid.layers.elementwise_min(y2, y2g)
+
+        xc1 = fluid.layers.elementwise_min(x1, x1g)
+        yc1 = fluid.layers.elementwise_min(y1, y1g)
+        xc2 = fluid.layers.elementwise_max(x2, x2g)
+        yc2 = fluid.layers.elementwise_max(y2, y2g)
+
+        intsctk = (xkis2 - xkis1) * (ykis2 - ykis1)
+        intsctk = intsctk * fluid.layers.greater_than(
+            xkis2, xkis1) * fluid.layers.greater_than(ykis2, ykis1)
+        unionk = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g
+                                                        ) - intsctk + eps
+
+        iouk = intsctk / unionk
+
+        area_c = (xc2 - xc1) * (yc2 - yc1) + eps
+        miouk = iouk - ((area_c - unionk) / area_c)
+
+        iou_weights = 1
+        if inside_weight is not None and outside_weight is not None:
+            inside_weight = fluid.layers.reshape(inside_weight, shape=(-1, 4))
+            outside_weight = fluid.layers.reshape(
+                outside_weight, shape=(-1, 4))
+
+            inside_weight = fluid.layers.reduce_mean(inside_weight, dim=1)
+            outside_weight = fluid.layers.reduce_mean(outside_weight, dim=1)
+
+            iou_weights = inside_weight * outside_weight
+        elif outside_weight is not None:
+            iou_weights = outside_weight
+
+        if self.do_average:
+            miouk = fluid.layers.reduce_mean((1 - miouk) * iou_weights)
+        else:
+            iou_distance = fluid.layers.elementwise_mul(
+                1 - miouk, iou_weights, axis=0)
+            miouk = fluid.layers.reduce_sum(iou_distance)
+
+        if self.use_class_weight:
+            miouk = miouk * self.class_weight
+
+        return miouk * self.loss_weight
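
GiouLoss similarly optimizes 1 - GIoU, where GIoU subtracts from the IoU the fraction of the smallest enclosing box not covered by the union. A standalone NumPy illustration (not part of the commit); the useful property is that disjoint boxes still get a negative GIoU and hence a usable training signal, unlike plain IoU.

import numpy as np

def giou(box, gt, eps=1e-10):
    # box, gt: [x1, y1, x2, y2]; GiouLoss above minimizes 1 - giou (before weighting).
    x1, y1, x2, y2 = box
    x1g, y1g, x2g, y2g = gt
    iw = max(0.0, min(x2, x2g) - max(x1, x1g))
    ih = max(0.0, min(y2, y2g) - max(y1, y1g))
    inter = iw * ih
    union = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g) - inter + eps
    # area of the smallest box enclosing both
    area_c = (max(x2, x2g) - min(x1, x1g)) * (max(y2, y2g) - min(y1, y1g)) + eps
    return inter / union - (area_c - union) / area_c

print(giou([0, 0, 10, 10], [2, 2, 12, 12]))    # ~0.42: overlapping boxes
print(giou([0, 0, 10, 10], [20, 20, 30, 30]))  # ~-0.78: disjoint boxes still give a signal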

+ 8 - 3
paddlex/cv/nets/detection/mask_rcnn.py

@@ -87,7 +87,8 @@ class MaskRCNN(object):
             bg_thresh_hi=.5,
             bg_thresh_lo=0.,
             bbox_reg_weights=[0.1, 0.1, 0.2, 0.2],
-            fixed_input_shape=None):
+            fixed_input_shape=None,
+            input_channel=3):
         super(MaskRCNN, self).__init__()
         self.backbone = backbone
         self.mode = mode
@@ -173,6 +174,7 @@ class MaskRCNN(object):
         self.bbox_reg_weights = bbox_reg_weights
         self.rpn_only = rpn_only
         self.fixed_input_shape = fixed_input_shape
+        self.input_channel = input_channel
 
     def build_net(self, inputs):
         im = inputs['image']
@@ -315,13 +317,16 @@ class MaskRCNN(object):
 
         if self.fixed_input_shape is not None:
             input_shape = [
-                None, 3, self.fixed_input_shape[1], self.fixed_input_shape[0]
+                None, self.input_channel, self.fixed_input_shape[1],
+                self.fixed_input_shape[0]
             ]
             inputs['image'] = fluid.data(
                 dtype='float32', shape=input_shape, name='image')
         else:
             inputs['image'] = fluid.data(
-                dtype='float32', shape=[None, 3, None, None], name='image')
+                dtype='float32',
+                shape=[None, self.input_channel, None, None],
+                name='image')
         if self.mode == 'train':
             inputs['im_info'] = fluid.data(
                 dtype='float32', shape=[None, 3], name='im_info')

+ 696 - 0
paddlex/cv/nets/detection/ops.py

@@ -21,6 +21,63 @@ import paddle
 from paddle import fluid
 
 
+def bbox_overlaps(boxes_1, boxes_2):
+    '''
+    bbox_overlaps
+        boxes_1: x1, y1, x2, y2
+        boxes_2: x1, y1, x2, y2
+    '''
+    assert boxes_1.shape[1] == 4 and boxes_2.shape[1] == 4
+
+    num_1 = boxes_1.shape[0]
+    num_2 = boxes_2.shape[0]
+
+    x1_1 = boxes_1[:, 0:1]
+    y1_1 = boxes_1[:, 1:2]
+    x2_1 = boxes_1[:, 2:3]
+    y2_1 = boxes_1[:, 3:4]
+    area_1 = (x2_1 - x1_1 + 1) * (y2_1 - y1_1 + 1)
+
+    x1_2 = boxes_2[:, 0].transpose()
+    y1_2 = boxes_2[:, 1].transpose()
+    x2_2 = boxes_2[:, 2].transpose()
+    y2_2 = boxes_2[:, 3].transpose()
+    area_2 = (x2_2 - x1_2 + 1) * (y2_2 - y1_2 + 1)
+
+    xx1 = np.maximum(x1_1, x1_2)
+    yy1 = np.maximum(y1_1, y1_2)
+    xx2 = np.minimum(x2_1, x2_2)
+    yy2 = np.minimum(y2_1, y2_2)
+
+    w = np.maximum(0.0, xx2 - xx1 + 1)
+    h = np.maximum(0.0, yy2 - yy1 + 1)
+    inter = w * h
+
+    ovr = inter / (area_1 + area_2 - inter)
+    return ovr
+
+
+def box_to_delta(ex_boxes, gt_boxes, weights):
+    """ box_to_delta """
+    ex_w = ex_boxes[:, 2] - ex_boxes[:, 0] + 1
+    ex_h = ex_boxes[:, 3] - ex_boxes[:, 1] + 1
+    ex_ctr_x = ex_boxes[:, 0] + 0.5 * ex_w
+    ex_ctr_y = ex_boxes[:, 1] + 0.5 * ex_h
+
+    gt_w = gt_boxes[:, 2] - gt_boxes[:, 0] + 1
+    gt_h = gt_boxes[:, 3] - gt_boxes[:, 1] + 1
+    gt_ctr_x = gt_boxes[:, 0] + 0.5 * gt_w
+    gt_ctr_y = gt_boxes[:, 1] + 0.5 * gt_h
+
+    dx = (gt_ctr_x - ex_ctr_x) / ex_w / weights[0]
+    dy = (gt_ctr_y - ex_ctr_y) / ex_h / weights[1]
+    dw = (np.log(gt_w / ex_w)) / weights[2]
+    dh = (np.log(gt_h / ex_h)) / weights[3]
+
+    targets = np.vstack([dx, dy, dw, dh]).transpose()
+    return targets
+
+
 def DropBlock(input, block_size, keep_prob, is_test):
     if is_test:
         return input
@@ -268,3 +325,642 @@ class MultiClassSoftNMS(object):
         fluid.layers.py_func(
             func=_batch_softnms, x=[bboxes, scores], out=pred_result)
         return pred_result
+
+
+class MultiClassDiouNMS(object):
+    def __init__(
+            self,
+            score_threshold=0.05,
+            keep_top_k=100,
+            nms_threshold=0.5,
+            normalized=False,
+            background_label=0, ):
+        super(MultiClassDiouNMS, self).__init__()
+        self.score_threshold = score_threshold
+        self.nms_threshold = nms_threshold
+        self.keep_top_k = keep_top_k
+        self.normalized = normalized
+        self.background_label = background_label
+
+    def __call__(self, bboxes, scores):
+        def create_tmp_var(program, name, dtype, shape, lod_level):
+            return program.current_block().create_var(
+                name=name, dtype=dtype, shape=shape, lod_level=lod_level)
+
+        def _calc_diou_term(dets1, dets2):
+            eps = 1.e-10
+            eta = 0 if self.normalized else 1
+
+            x1, y1, x2, y2 = dets1[0], dets1[1], dets1[2], dets1[3]
+            x1g, y1g, x2g, y2g = dets2[0], dets2[1], dets2[2], dets2[3]
+
+            cx = (x1 + x2) / 2
+            cy = (y1 + y2) / 2
+            w = x2 - x1 + eta
+            h = y2 - y1 + eta
+
+            cxg = (x1g + x2g) / 2
+            cyg = (y1g + y2g) / 2
+            wg = x2g - x1g + eta
+            hg = y2g - y1g + eta
+
+            x2 = np.maximum(x1, x2)
+            y2 = np.maximum(y1, y2)
+
+            # smallest box enclosing both boxes
+            xc1 = np.minimum(x1, x1g)
+            yc1 = np.minimum(y1, y1g)
+            xc2 = np.maximum(x2, x2g)
+            yc2 = np.maximum(y2, y2g)
+
+            # DIOU term
+            dist_intersection = (cx - cxg)**2 + (cy - cyg)**2
+            dist_union = (xc2 - xc1)**2 + (yc2 - yc1)**2
+            diou_term = (dist_intersection + eps) / (dist_union + eps)
+            return diou_term
+
+        def _diou_nms_for_cls(dets, thres):
+            """_diou_nms_for_cls"""
+            scores = dets[:, 0]
+            x1 = dets[:, 1]
+            y1 = dets[:, 2]
+            x2 = dets[:, 3]
+            y2 = dets[:, 4]
+            eta = 0 if self.normalized else 1
+            areas = (x2 - x1 + eta) * (y2 - y1 + eta)
+            dt_num = dets.shape[0]
+            order = np.array(range(dt_num))
+
+            keep = []
+            while order.size > 0:
+                i = order[0]
+                keep.append(i)
+                xx1 = np.maximum(x1[i], x1[order[1:]])
+                yy1 = np.maximum(y1[i], y1[order[1:]])
+                xx2 = np.minimum(x2[i], x2[order[1:]])
+                yy2 = np.minimum(y2[i], y2[order[1:]])
+
+                w = np.maximum(0.0, xx2 - xx1 + eta)
+                h = np.maximum(0.0, yy2 - yy1 + eta)
+                inter = w * h
+                ovr = inter / (areas[i] + areas[order[1:]] - inter)
+
+                diou_term = _calc_diou_term([x1[i], y1[i], x2[i], y2[i]], [
+                    x1[order[1:]], y1[order[1:]], x2[order[1:]], y2[order[1:]]
+                ])
+
+                inds = np.where(ovr - diou_term <= thres)[0]
+
+                order = order[inds + 1]
+
+            dets_final = dets[keep]
+            return dets_final
+
+        def _diou_nms(bboxes, scores):
+            bboxes = np.array(bboxes)
+            scores = np.array(scores)
+            class_nums = scores.shape[-1]
+
+            score_threshold = self.score_threshold
+            nms_threshold = self.nms_threshold
+            keep_top_k = self.keep_top_k
+
+            cls_boxes = [[] for _ in range(class_nums)]
+            cls_ids = [[] for _ in range(class_nums)]
+
+            start_idx = 1 if self.background_label == 0 else 0
+            for j in range(start_idx, class_nums):
+                inds = np.where(scores[:, j] >= score_threshold)[0]
+                scores_j = scores[inds, j]
+                rois_j = bboxes[inds, j, :]
+                dets_j = np.hstack((scores_j[:, np.newaxis], rois_j)).astype(
+                    np.float32, copy=False)
+                cls_rank = np.argsort(-dets_j[:, 0])
+                dets_j = dets_j[cls_rank]
+
+                cls_boxes[j] = _diou_nms_for_cls(dets_j, thres=nms_threshold)
+                cls_ids[j] = np.array([j] * cls_boxes[j].shape[0]).reshape(-1,
+                                                                           1)
+
+            cls_boxes = np.vstack(cls_boxes[start_idx:])
+            cls_ids = np.vstack(cls_ids[start_idx:])
+            pred_result = np.hstack([cls_ids, cls_boxes]).astype(np.float32)
+
+            # Limit to max_per_image detections **over all classes**
+            image_scores = cls_boxes[:, 0]
+            if len(image_scores) > keep_top_k:
+                image_thresh = np.sort(image_scores)[-keep_top_k]
+                keep = np.where(cls_boxes[:, 0] >= image_thresh)[0]
+                pred_result = pred_result[keep, :]
+
+            res = fluid.LoDTensor()
+            res.set_lod([[0, pred_result.shape[0]]])
+            if pred_result.shape[0] == 0:
+                pred_result = np.array([[1]], dtype=np.float32)
+            res.set(pred_result, fluid.CPUPlace())
+
+            return res
+
+        pred_result = create_tmp_var(
+            fluid.default_main_program(),
+            name='diou_nms_pred_result',
+            dtype='float32',
+            shape=[-1, 6],
+            lod_level=0)
+        fluid.layers.py_func(
+            func=_diou_nms, x=[bboxes, scores], out=pred_result)
+        return pred_result
+
+
+class LibraBBoxAssigner(object):
+    def __init__(self,
+                 batch_size_per_im=512,
+                 fg_fraction=.25,
+                 fg_thresh=.5,
+                 bg_thresh_hi=.5,
+                 bg_thresh_lo=0.,
+                 bbox_reg_weights=[0.1, 0.1, 0.2, 0.2],
+                 num_classes=81,
+                 shuffle_before_sample=True,
+                 is_cls_agnostic=False,
+                 num_bins=3):
+        super(LibraBBoxAssigner, self).__init__()
+        self.batch_size_per_im = batch_size_per_im
+        self.fg_fraction = fg_fraction
+        self.fg_thresh = fg_thresh
+        self.bg_thresh_hi = bg_thresh_hi
+        self.bg_thresh_lo = bg_thresh_lo
+        self.bbox_reg_weights = bbox_reg_weights
+        self.class_nums = num_classes
+        self.use_random = shuffle_before_sample
+        self.is_cls_agnostic = is_cls_agnostic
+        self.num_bins = num_bins
+
+    def __call__(
+            self,
+            rpn_rois,
+            gt_classes,
+            is_crowd,
+            gt_boxes,
+            im_info, ):
+        return self.generate_proposal_label_libra(
+            rpn_rois=rpn_rois,
+            gt_classes=gt_classes,
+            is_crowd=is_crowd,
+            gt_boxes=gt_boxes,
+            im_info=im_info,
+            batch_size_per_im=self.batch_size_per_im,
+            fg_fraction=self.fg_fraction,
+            fg_thresh=self.fg_thresh,
+            bg_thresh_hi=self.bg_thresh_hi,
+            bg_thresh_lo=self.bg_thresh_lo,
+            bbox_reg_weights=self.bbox_reg_weights,
+            class_nums=self.class_nums,
+            use_random=self.use_random,
+            is_cls_agnostic=self.is_cls_agnostic,
+            is_cascade_rcnn=False)
+
+    def generate_proposal_label_libra(
+            self, rpn_rois, gt_classes, is_crowd, gt_boxes, im_info,
+            batch_size_per_im, fg_fraction, fg_thresh, bg_thresh_hi,
+            bg_thresh_lo, bbox_reg_weights, class_nums, use_random,
+            is_cls_agnostic, is_cascade_rcnn):
+        num_bins = self.num_bins
+
+        def create_tmp_var(program, name, dtype, shape, lod_level=None):
+            return program.current_block().create_var(
+                name=name, dtype=dtype, shape=shape, lod_level=lod_level)
+
+        def _sample_pos(max_overlaps, max_classes, pos_inds, num_expected):
+            if len(pos_inds) <= num_expected:
+                return pos_inds
+            else:
+                unique_gt_inds = np.unique(max_classes[pos_inds])
+                num_gts = len(unique_gt_inds)
+                num_per_gt = int(round(num_expected / float(num_gts)) + 1)
+
+                sampled_inds = []
+                for i in unique_gt_inds:
+                    inds = np.nonzero(max_classes == i)[0]
+                    before_len = len(inds)
+                    inds = list(set(inds) & set(pos_inds))
+                    after_len = len(inds)
+                    if len(inds) > num_per_gt:
+                        inds = np.random.choice(
+                            inds, size=num_per_gt, replace=False)
+                    sampled_inds.extend(list(inds))  # combine as a new sampler
+                if len(sampled_inds) < num_expected:
+                    num_extra = num_expected - len(sampled_inds)
+                    extra_inds = np.array(
+                        list(set(pos_inds) - set(sampled_inds)))
+                    assert len(sampled_inds)+len(extra_inds) == len(pos_inds), \
+                        "sum of sampled_inds({}) and extra_inds({}) length must be equal with pos_inds({})!".format(
+                            len(sampled_inds), len(extra_inds), len(pos_inds))
+                    if len(extra_inds) > num_extra:
+                        extra_inds = np.random.choice(
+                            extra_inds, size=num_extra, replace=False)
+                    sampled_inds.extend(extra_inds.tolist())
+                elif len(sampled_inds) > num_expected:
+                    sampled_inds = np.random.choice(
+                        sampled_inds, size=num_expected, replace=False)
+                return sampled_inds
+
+        def sample_via_interval(max_overlaps, full_set, num_expected,
+                                floor_thr, num_bins, bg_thresh_hi):
+            max_iou = max_overlaps.max()
+            iou_interval = (max_iou - floor_thr) / num_bins
+            per_num_expected = int(num_expected / num_bins)
+
+            sampled_inds = []
+            for i in range(num_bins):
+                start_iou = floor_thr + i * iou_interval
+                end_iou = floor_thr + (i + 1) * iou_interval
+
+                tmp_set = set(
+                    np.where(
+                        np.logical_and(max_overlaps >= start_iou, max_overlaps
+                                       < end_iou))[0])
+                tmp_inds = list(tmp_set & full_set)
+
+                if len(tmp_inds) > per_num_expected:
+                    tmp_sampled_set = np.random.choice(
+                        tmp_inds, size=per_num_expected, replace=False)
+                else:
+                    tmp_sampled_set = np.array(tmp_inds, dtype=np.int)
+                sampled_inds.append(tmp_sampled_set)
+
+            sampled_inds = np.concatenate(sampled_inds)
+            if len(sampled_inds) < num_expected:
+                num_extra = num_expected - len(sampled_inds)
+                extra_inds = np.array(list(full_set - set(sampled_inds)))
+                assert len(sampled_inds)+len(extra_inds) == len(full_set), \
+                    "sum of sampled_inds({}) and extra_inds({}) length must be equal with full_set({})!".format(
+                            len(sampled_inds), len(extra_inds), len(full_set))
+
+                if len(extra_inds) > num_extra:
+                    extra_inds = np.random.choice(
+                        extra_inds, num_extra, replace=False)
+                sampled_inds = np.concatenate([sampled_inds, extra_inds])
+
+            return sampled_inds
+
+        def _sample_neg(max_overlaps,
+                        max_classes,
+                        neg_inds,
+                        num_expected,
+                        floor_thr=-1,
+                        floor_fraction=0,
+                        num_bins=3,
+                        bg_thresh_hi=0.5):
+            if len(neg_inds) <= num_expected:
+                return neg_inds
+            else:
+                # balance sampling for negative samples
+                neg_set = set(neg_inds)
+                if floor_thr > 0:
+                    floor_set = set(
+                        np.where(
+                            np.logical_and(max_overlaps >= 0, max_overlaps <
+                                           floor_thr))[0])
+                    iou_sampling_set = set(
+                        np.where(max_overlaps >= floor_thr)[0])
+                elif floor_thr == 0:
+                    floor_set = set(np.where(max_overlaps == 0)[0])
+                    iou_sampling_set = set(
+                        np.where(max_overlaps > floor_thr)[0])
+                else:
+                    floor_set = set()
+                    iou_sampling_set = set(
+                        np.where(max_overlaps > floor_thr)[0])
+                    floor_thr = 0
+
+                floor_neg_inds = list(floor_set & neg_set)
+                iou_sampling_neg_inds = list(iou_sampling_set & neg_set)
+
+                num_expected_iou_sampling = int(num_expected *
+                                                (1 - floor_fraction))
+                if len(iou_sampling_neg_inds) > num_expected_iou_sampling:
+                    if num_bins >= 2:
+                        iou_sampled_inds = sample_via_interval(
+                            max_overlaps,
+                            set(iou_sampling_neg_inds),
+                            num_expected_iou_sampling, floor_thr, num_bins,
+                            bg_thresh_hi)
+                    else:
+                        iou_sampled_inds = np.random.choice(
+                            iou_sampling_neg_inds,
+                            size=num_expected_iou_sampling,
+                            replace=False)
+                else:
+                    iou_sampled_inds = np.array(
+                        iou_sampling_neg_inds, dtype=np.int)
+                num_expected_floor = num_expected - len(iou_sampled_inds)
+                if len(floor_neg_inds) > num_expected_floor:
+                    sampled_floor_inds = np.random.choice(
+                        floor_neg_inds, size=num_expected_floor, replace=False)
+                else:
+                    sampled_floor_inds = np.array(floor_neg_inds, dtype=np.int)
+                sampled_inds = np.concatenate(
+                    (sampled_floor_inds, iou_sampled_inds))
+                if len(sampled_inds) < num_expected:
+                    num_extra = num_expected - len(sampled_inds)
+                    extra_inds = np.array(list(neg_set - set(sampled_inds)))
+                    if len(extra_inds) > num_extra:
+                        extra_inds = np.random.choice(
+                            extra_inds, size=num_extra, replace=False)
+                    sampled_inds = np.concatenate((sampled_inds, extra_inds))
+                return sampled_inds
+
+        def _sample_rois(rpn_rois, gt_classes, is_crowd, gt_boxes, im_info,
+                         batch_size_per_im, fg_fraction, fg_thresh,
+                         bg_thresh_hi, bg_thresh_lo, bbox_reg_weights,
+                         class_nums, use_random, is_cls_agnostic,
+                         is_cascade_rcnn):
+            rois_per_image = int(batch_size_per_im)
+            fg_rois_per_im = int(np.round(fg_fraction * rois_per_image))
+
+            # Roidb
+            im_scale = im_info[2]
+            inv_im_scale = 1. / im_scale
+            rpn_rois = rpn_rois * inv_im_scale
+            if is_cascade_rcnn:
+                rpn_rois = rpn_rois[gt_boxes.shape[0]:, :]
+            boxes = np.vstack([gt_boxes, rpn_rois])
+            gt_overlaps = np.zeros((boxes.shape[0], class_nums))
+            box_to_gt_ind_map = np.zeros((boxes.shape[0]), dtype=np.int32)
+            if len(gt_boxes) > 0:
+                proposal_to_gt_overlaps = bbox_overlaps(boxes, gt_boxes)
+
+                overlaps_argmax = proposal_to_gt_overlaps.argmax(axis=1)
+                overlaps_max = proposal_to_gt_overlaps.max(axis=1)
+                # Boxes with non-zero overlap with gt boxes
+                overlapped_boxes_ind = np.where(overlaps_max > 0)[0]
+
+                overlapped_boxes_gt_classes = gt_classes[overlaps_argmax[
+                    overlapped_boxes_ind]]
+
+                for idx in range(len(overlapped_boxes_ind)):
+                    gt_overlaps[overlapped_boxes_ind[
+                        idx], overlapped_boxes_gt_classes[idx]] = overlaps_max[
+                            overlapped_boxes_ind[idx]]
+                    box_to_gt_ind_map[overlapped_boxes_ind[
+                        idx]] = overlaps_argmax[overlapped_boxes_ind[idx]]
+
+            crowd_ind = np.where(is_crowd)[0]
+            gt_overlaps[crowd_ind] = -1
+
+            max_overlaps = gt_overlaps.max(axis=1)
+            max_classes = gt_overlaps.argmax(axis=1)
+
+            # Cascade RCNN Decode Filter
+            if is_cascade_rcnn:
+                ws = boxes[:, 2] - boxes[:, 0] + 1
+                hs = boxes[:, 3] - boxes[:, 1] + 1
+                keep = np.where((ws > 0) & (hs > 0))[0]
+                boxes = boxes[keep]
+                max_overlaps = max_overlaps[keep]
+                fg_inds = np.where(max_overlaps >= fg_thresh)[0]
+                bg_inds = np.where((max_overlaps < bg_thresh_hi) & (
+                    max_overlaps >= bg_thresh_lo))[0]
+                fg_rois_per_this_image = fg_inds.shape[0]
+                bg_rois_per_this_image = bg_inds.shape[0]
+            else:
+                # Foreground
+                fg_inds = np.where(max_overlaps >= fg_thresh)[0]
+                fg_rois_per_this_image = np.minimum(fg_rois_per_im,
+                                                    fg_inds.shape[0])
+                # Sample foreground if there are too many
+                if fg_inds.shape[0] > fg_rois_per_this_image:
+                    if use_random:
+                        fg_inds = _sample_pos(max_overlaps, max_classes,
+                                              fg_inds, fg_rois_per_this_image)
+                fg_inds = fg_inds[:fg_rois_per_this_image]
+
+                # Background
+                bg_inds = np.where((max_overlaps < bg_thresh_hi) & (
+                    max_overlaps >= bg_thresh_lo))[0]
+                bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image
+                bg_rois_per_this_image = np.minimum(bg_rois_per_this_image,
+                                                    bg_inds.shape[0])
+                assert bg_rois_per_this_image >= 0, "bg_rois_per_this_image must be >= 0 but got {}".format(
+                    bg_rois_per_this_image)
+
+                # Sample background if there are too many
+                if bg_inds.shape[0] > bg_rois_per_this_image:
+                    if use_random:
+                        # libra neg sample
+                        bg_inds = _sample_neg(
+                            max_overlaps,
+                            max_classes,
+                            bg_inds,
+                            bg_rois_per_this_image,
+                            num_bins=num_bins,
+                            bg_thresh_hi=bg_thresh_hi)
+                bg_inds = bg_inds[:bg_rois_per_this_image]
+
+            keep_inds = np.append(fg_inds, bg_inds)
+            sampled_labels = max_classes[keep_inds]  # N x 1
+            sampled_labels[fg_rois_per_this_image:] = 0
+            sampled_boxes = boxes[keep_inds]  # N x 324
+            sampled_gts = gt_boxes[box_to_gt_ind_map[keep_inds]]
+            sampled_gts[fg_rois_per_this_image:, :] = gt_boxes[0]
+            bbox_label_targets = _compute_targets(
+                sampled_boxes, sampled_gts, sampled_labels, bbox_reg_weights)
+            bbox_targets, bbox_inside_weights = _expand_bbox_targets(
+                bbox_label_targets, class_nums, is_cls_agnostic)
+            bbox_outside_weights = np.array(
+                bbox_inside_weights > 0, dtype=bbox_inside_weights.dtype)
+            # Scale rois
+            sampled_rois = sampled_boxes * im_scale
+
+            # Faster RCNN blobs
+            frcn_blobs = dict(
+                rois=sampled_rois,
+                labels_int32=sampled_labels,
+                bbox_targets=bbox_targets,
+                bbox_inside_weights=bbox_inside_weights,
+                bbox_outside_weights=bbox_outside_weights)
+            return frcn_blobs
+
+        def _compute_targets(roi_boxes, gt_boxes, labels, bbox_reg_weights):
+            assert roi_boxes.shape[0] == gt_boxes.shape[0]
+            assert roi_boxes.shape[1] == 4
+            assert gt_boxes.shape[1] == 4
+
+            targets = np.zeros(roi_boxes.shape)
+            bbox_reg_weights = np.asarray(bbox_reg_weights)
+            targets = box_to_delta(
+                ex_boxes=roi_boxes,
+                gt_boxes=gt_boxes,
+                weights=bbox_reg_weights)
+
+            return np.hstack([labels[:, np.newaxis], targets]).astype(
+                np.float32, copy=False)
+
+        def _expand_bbox_targets(bbox_targets_input, class_nums,
+                                 is_cls_agnostic):
+            class_labels = bbox_targets_input[:, 0]
+            fg_inds = np.where(class_labels > 0)[0]
+            bbox_targets = np.zeros((class_labels.shape[0], 4 * class_nums
+                                     if not is_cls_agnostic else 4 * 2))
+            bbox_inside_weights = np.zeros(bbox_targets.shape)
+            for ind in fg_inds:
+                class_label = int(class_labels[
+                    ind]) if not is_cls_agnostic else 1
+                start_ind = class_label * 4
+                end_ind = class_label * 4 + 4
+                bbox_targets[ind, start_ind:end_ind] = bbox_targets_input[ind,
+                                                                          1:]
+                bbox_inside_weights[ind, start_ind:end_ind] = (1.0, 1.0, 1.0,
+                                                               1.0)
+            return bbox_targets, bbox_inside_weights
+
+        def generate_func(
+                rpn_rois,
+                gt_classes,
+                is_crowd,
+                gt_boxes,
+                im_info, ):
+            rpn_rois_lod = rpn_rois.lod()[0]
+            gt_classes_lod = gt_classes.lod()[0]
+
+            # convert
+            rpn_rois = np.array(rpn_rois)
+            gt_classes = np.array(gt_classes)
+            is_crowd = np.array(is_crowd)
+            gt_boxes = np.array(gt_boxes)
+            im_info = np.array(im_info)
+
+            rois = []
+            labels_int32 = []
+            bbox_targets = []
+            bbox_inside_weights = []
+            bbox_outside_weights = []
+            lod = [0]
+
+            for idx in range(len(rpn_rois_lod) - 1):
+                rois_si = rpn_rois_lod[idx]
+                rois_ei = rpn_rois_lod[idx + 1]
+
+                gt_si = gt_classes_lod[idx]
+                gt_ei = gt_classes_lod[idx + 1]
+                frcn_blobs = _sample_rois(
+                    rpn_rois[rois_si:rois_ei], gt_classes[gt_si:gt_ei],
+                    is_crowd[gt_si:gt_ei], gt_boxes[gt_si:gt_ei], im_info[idx],
+                    batch_size_per_im, fg_fraction, fg_thresh, bg_thresh_hi,
+                    bg_thresh_lo, bbox_reg_weights, class_nums, use_random,
+                    is_cls_agnostic, is_cascade_rcnn)
+                lod.append(frcn_blobs['rois'].shape[0] + lod[-1])
+                rois.append(frcn_blobs['rois'])
+                labels_int32.append(frcn_blobs['labels_int32'].reshape(-1, 1))
+                bbox_targets.append(frcn_blobs['bbox_targets'])
+                bbox_inside_weights.append(frcn_blobs['bbox_inside_weights'])
+                bbox_outside_weights.append(frcn_blobs['bbox_outside_weights'])
+
+            rois = np.vstack(rois)
+            labels_int32 = np.vstack(labels_int32)
+            bbox_targets = np.vstack(bbox_targets)
+            bbox_inside_weights = np.vstack(bbox_inside_weights)
+            bbox_outside_weights = np.vstack(bbox_outside_weights)
+
+            # create lod-tensor for return
+            # notice that the func create_lod_tensor does not work well here
+            ret_rois = fluid.LoDTensor()
+            ret_rois.set_lod([lod])
+            ret_rois.set(rois.astype("float32"), fluid.CPUPlace())
+
+            ret_labels_int32 = fluid.LoDTensor()
+            ret_labels_int32.set_lod([lod])
+            ret_labels_int32.set(
+                labels_int32.astype("int32"), fluid.CPUPlace())
+
+            ret_bbox_targets = fluid.LoDTensor()
+            ret_bbox_targets.set_lod([lod])
+            ret_bbox_targets.set(
+                bbox_targets.astype("float32"), fluid.CPUPlace())
+
+            ret_bbox_inside_weights = fluid.LoDTensor()
+            ret_bbox_inside_weights.set_lod([lod])
+            ret_bbox_inside_weights.set(
+                bbox_inside_weights.astype("float32"), fluid.CPUPlace())
+
+            ret_bbox_outside_weights = fluid.LoDTensor()
+            ret_bbox_outside_weights.set_lod([lod])
+            ret_bbox_outside_weights.set(
+                bbox_outside_weights.astype("float32"), fluid.CPUPlace())
+
+            return ret_rois, ret_labels_int32, ret_bbox_targets, ret_bbox_inside_weights, ret_bbox_outside_weights
+
+        rois = create_tmp_var(
+            fluid.default_main_program(),
+            name=None,  #'rois',
+            dtype='float32',
+            shape=[-1, 4], )
+        bbox_inside_weights = create_tmp_var(
+            fluid.default_main_program(),
+            name=None,  #'bbox_inside_weights',
+            dtype='float32',
+            shape=[-1, 8 if self.is_cls_agnostic else self.class_nums * 4], )
+        bbox_outside_weights = create_tmp_var(
+            fluid.default_main_program(),
+            name=None,  #'bbox_outside_weights',
+            dtype='float32',
+            shape=[-1, 8 if self.is_cls_agnostic else self.class_nums * 4], )
+        bbox_targets = create_tmp_var(
+            fluid.default_main_program(),
+            name=None,  #'bbox_targets',
+            dtype='float32',
+            shape=[-1, 8 if self.is_cls_agnostic else self.class_nums * 4], )
+        labels_int32 = create_tmp_var(
+            fluid.default_main_program(),
+            name=None,  #'labels_int32',
+            dtype='int32',
+            shape=[-1, 1], )
+
+        outs = [
+            rois, labels_int32, bbox_targets, bbox_inside_weights,
+            bbox_outside_weights
+        ]
+
+        fluid.layers.py_func(
+            func=generate_func,
+            x=[rpn_rois, gt_classes, is_crowd, gt_boxes, im_info],
+            out=outs)
+        return outs
+
+
+class BBoxAssigner(object):
+    def __init__(self,
+                 batch_size_per_im=512,
+                 fg_fraction=.25,
+                 fg_thresh=.5,
+                 bg_thresh_hi=.5,
+                 bg_thresh_lo=0.,
+                 bbox_reg_weights=[0.1, 0.1, 0.2, 0.2],
+                 num_classes=81,
+                 shuffle_before_sample=True):
+        super(BBoxAssigner, self).__init__()
+        self.batch_size_per_im = batch_size_per_im
+        self.fg_fraction = fg_fraction
+        self.fg_thresh = fg_thresh
+        self.bg_thresh_hi = bg_thresh_hi
+        self.bg_thresh_lo = bg_thresh_lo
+        self.bbox_reg_weights = bbox_reg_weights
+        self.class_nums = num_classes
+        self.use_random = shuffle_before_sample
+
+    def __call__(self, rpn_rois, gt_classes, is_crowd, gt_boxes, im_info):
+        return fluid.layers.generate_proposal_labels(
+            rpn_rois=rpn_rois,
+            gt_classes=gt_classes,
+            is_crowd=is_crowd,
+            gt_boxes=gt_boxes,
+            im_info=im_info,
+            batch_size_per_im=self.batch_size_per_im,
+            fg_fraction=self.fg_fraction,
+            fg_thresh=self.fg_thresh,
+            bg_thresh_hi=self.bg_thresh_hi,
+            bg_thresh_lo=self.bg_thresh_lo,
+            bbox_reg_weights=self.bbox_reg_weights,
+            class_nums=self.class_nums,
+            use_random=self.use_random)
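
Besides the NumPy box utilities and MultiClassDiouNMS (which subtracts the same center-distance penalty used by DiouLoss from the IoU before applying the suppression threshold), this file adds the two proposal-target assigners selected by the FasterRCNN constructor above. The sketch below is a simplified, standalone rendering of the IoU-balanced negative sampling idea behind LibraBBoxAssigner._sample_neg / sample_via_interval; it uses equal-width bins and numpy's Generator API, and the data and counts are invented for illustration (the real code additionally handles a floor threshold and fallback paths).

import numpy as np

rng = np.random.default_rng(0)

def iou_balanced_neg_sample(max_overlaps, neg_inds, num_expected,
                            num_bins=3, bg_thresh_hi=0.5):
    # Split [0, bg_thresh_hi) into equal-width IoU bins and draw roughly the
    # same number of negatives from each, so hard negatives (higher IoU) are
    # not drowned out by the far more numerous easy ones.
    per_bin = num_expected // num_bins
    edges = np.linspace(0.0, bg_thresh_hi, num_bins + 1)
    sampled = []
    for lo, hi in zip(edges[:-1], edges[1:]):
        in_bin = neg_inds[(max_overlaps[neg_inds] >= lo)
                          & (max_overlaps[neg_inds] < hi)]
        if len(in_bin) > 0:
            sampled.append(rng.choice(in_bin, size=min(per_bin, len(in_bin)),
                                      replace=False))
    sampled = np.concatenate(sampled)
    if len(sampled) < num_expected:  # top up from the remaining negatives
        rest = np.setdiff1d(neg_inds, sampled)
        extra = rng.choice(rest, size=num_expected - len(sampled), replace=False)
        sampled = np.concatenate([sampled, extra])
    return sampled

# 900 easy negatives (IoU < 0.05) and 100 harder ones (0.05 <= IoU < 0.5).
overlaps = np.concatenate([rng.uniform(0.0, 0.05, 900), rng.uniform(0.05, 0.5, 100)])
picked = iou_balanced_neg_sample(overlaps, np.arange(1000), num_expected=60)
print(np.histogram(overlaps[picked], bins=[0.0, 0.05, 0.5])[0])
# roughly [20, 40]; uniform sampling would give about [54, 6]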

+ 100 - 38
paddlex/cv/nets/detection/rpn_head.py

@@ -16,10 +16,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
 from paddle import fluid
 from paddle.fluid.param_attr import ParamAttr
 from paddle.fluid.initializer import Normal
 from paddle.fluid.regularizer import L2Decay
+from paddle.fluid.initializer import Constant
 
 __all__ = ['RPNHead', 'FPNRPNHead']
 
@@ -39,6 +41,9 @@ class RPNHead(object):
             rpn_positive_overlap=0.7,
             rpn_negative_overlap=0.3,
             use_random=True,
+            rpn_cls_loss='SigmoidCrossEntropy',
+            rpn_focal_loss_gamma=2,
+            rpn_focal_loss_alpha=0.25,
             #train_proposal
             train_pre_nms_top_n=12000,
             train_post_nms_top_n=2000,
@@ -75,6 +80,9 @@ class RPNHead(object):
         self.test_min_size = test_min_size
         self.test_eta = test_eta
         self.num_classes = num_classes
+        self.rpn_cls_loss = rpn_cls_loss
+        self.rpn_focal_loss_gamma = rpn_focal_loss_gamma
+        self.rpn_focal_loss_alpha = rpn_focal_loss_alpha
 
     def _get_output(self, input):
         """
@@ -99,7 +107,8 @@ class RPNHead(object):
             act='relu',
             name='conv_rpn',
             param_attr=ParamAttr(
-                name="conv_rpn_w", initializer=Normal(loc=0., scale=0.01)),
+                name="conv_rpn_w", initializer=Normal(
+                    loc=0., scale=0.01)),
             bias_attr=ParamAttr(
                 name="conv_rpn_b", learning_rate=2., regularizer=L2Decay(0.)))
         # Generate anchors
@@ -111,6 +120,11 @@ class RPNHead(object):
             variance=self.variance)
         num_anchor = self.anchor.shape[2]
         # Proposal classification scores
+        if self.rpn_cls_loss == 'SigmoidCrossEntropy':
+            bias_init = None
+        elif self.rpn_cls_loss == 'SigmoidFocalLoss':
+            value = float(-np.log((1 - 0.01) / 0.01))
+            bias_init = Constant(value=value)
         self.rpn_cls_score = fluid.layers.conv2d(
             rpn_conv,
             num_filters=num_anchor * self.num_classes,
@@ -121,9 +135,11 @@ class RPNHead(object):
             name='rpn_cls_score',
             param_attr=ParamAttr(
                 name="rpn_cls_logits_w",
-                initializer=Normal(loc=0., scale=0.01)),
+                initializer=Normal(
+                    loc=0., scale=0.01)),
             bias_attr=ParamAttr(
                 name="rpn_cls_logits_b",
+                initializer=bias_init,
                 learning_rate=2.,
                 regularizer=L2Decay(0.)))
         # Proposal bbox regression deltas
@@ -136,8 +152,8 @@ class RPNHead(object):
             act=None,
             name='rpn_bbox_pred',
             param_attr=ParamAttr(
-                name="rpn_bbox_pred_w", initializer=Normal(loc=0.,
-                                                           scale=0.01)),
+                name="rpn_bbox_pred_w", initializer=Normal(
+                    loc=0., scale=0.01)),
             bias_attr=ParamAttr(
                 name="rpn_bbox_pred_b",
                 learning_rate=2.,
@@ -252,24 +268,48 @@ class RPNHead(object):
         rpn_cls, rpn_bbox, anchor, anchor_var = self._get_loss_input()
         if self.num_classes == 1:
             score_pred, loc_pred, score_tgt, loc_tgt, bbox_weight = \
-                fluid.layers.rpn_target_assign(
-                    bbox_pred=rpn_bbox,
-                    cls_logits=rpn_cls,
-                    anchor_box=anchor,
-                    anchor_var=anchor_var,
-                    gt_boxes=gt_box,
-                    is_crowd=is_crowd,
-                    im_info=im_info,
-                    rpn_batch_size_per_im=self.rpn_batch_size_per_im,
-                    rpn_straddle_thresh=self.rpn_straddle_thresh,
-                    rpn_fg_fraction=self.rpn_fg_fraction,
-                    rpn_positive_overlap=self.rpn_positive_overlap,
-                    rpn_negative_overlap=self.rpn_negative_overlap,
-                    use_random=self.use_random)
-            score_tgt = fluid.layers.cast(x=score_tgt, dtype='float32')
-            score_tgt.stop_gradient = True
-            rpn_cls_loss = fluid.layers.sigmoid_cross_entropy_with_logits(
-                x=score_pred, label=score_tgt)
+                    fluid.layers.rpn_target_assign(
+                        bbox_pred=rpn_bbox,
+                        cls_logits=rpn_cls,
+                        anchor_box=anchor,
+                        anchor_var=anchor_var,
+                        gt_boxes=gt_box,
+                        is_crowd=is_crowd,
+                        im_info=im_info,
+                        rpn_batch_size_per_im=self.rpn_batch_size_per_im,
+                        rpn_straddle_thresh=self.rpn_straddle_thresh,
+                        rpn_fg_fraction=self.rpn_fg_fraction,
+                        rpn_positive_overlap=self.rpn_positive_overlap,
+                        rpn_negative_overlap=self.rpn_negative_overlap,
+                        use_random=self.use_random)
+            if self.rpn_cls_loss == 'SigmoidCrossEntropy':
+                score_tgt = fluid.layers.cast(x=score_tgt, dtype='float32')
+                score_tgt.stop_gradient = True
+                rpn_cls_loss = fluid.layers.sigmoid_cross_entropy_with_logits(
+                    x=score_pred, label=score_tgt)
+            elif self.rpn_cls_loss == 'SigmoidFocalLoss':
+                data = fluid.layers.fill_constant(
+                    shape=[1], value=1, dtype='int32')
+                fg_label = fluid.layers.greater_equal(score_tgt, data)
+                fg_label = fluid.layers.cast(fg_label, dtype='int32')
+                fg_num = fluid.layers.reduce_sum(fg_label)
+                fg_num.stop_gradient = True
+                score_tgt = fluid.layers.cast(x=score_tgt, dtype='float32')
+                score_tgt.stop_gradient = True
+                loss = fluid.layers.sigmoid_cross_entropy_with_logits(
+                    x=score_pred, label=score_tgt)
+
+                pred = fluid.layers.sigmoid(score_pred)
+                p_t = pred * score_tgt + (1 - pred) * (1 - score_tgt)
+
+                if self.rpn_focal_loss_alpha is not None:
+                    alpha_t = self.rpn_focal_loss_alpha * score_tgt + (
+                        1 - self.rpn_focal_loss_alpha) * (1 - score_tgt)
+                    loss = alpha_t * loss
+                gamma_t = fluid.layers.pow((1 - p_t),
+                                           self.rpn_focal_loss_gamma)
+                loss = gamma_t * loss
+                rpn_cls_loss = loss / fg_num
         else:
             score_pred, loc_pred, score_tgt, loc_tgt, bbox_weight = \
                 fluid.layers.rpn_target_assign(
@@ -295,8 +335,12 @@ class RPNHead(object):
                 label=labels_int64,
                 numeric_stable_mode=True)
 
-        rpn_cls_loss = fluid.layers.reduce_mean(
-            rpn_cls_loss, name='loss_rpn_cls')
+        if self.rpn_cls_loss == 'SigmoidCrossEntropy':
+            rpn_cls_loss = fluid.layers.reduce_mean(
+                rpn_cls_loss, name='loss_rpn_cls')
+        elif self.rpn_cls_loss == 'SigmoidFocalLoss':
+            rpn_cls_loss = fluid.layers.reduce_sum(
+                rpn_cls_loss, name='loss_rpn_cls')
 
         loc_tgt = fluid.layers.cast(x=loc_tgt, dtype='float32')
         loc_tgt.stop_gradient = True
@@ -308,12 +352,15 @@ class RPNHead(object):
             outside_weight=bbox_weight)
         rpn_reg_loss = fluid.layers.reduce_sum(
             rpn_reg_loss, name='loss_rpn_bbox')
-        score_shape = fluid.layers.shape(score_tgt)
-        score_shape = fluid.layers.cast(x=score_shape, dtype='float32')
-        norm = fluid.layers.reduce_prod(score_shape)
-        norm.stop_gradient = True
-        rpn_reg_loss = rpn_reg_loss / norm
-
+        if self.rpn_cls_loss == 'SigmoidCrossEntropy':
+            score_shape = fluid.layers.shape(score_tgt)
+            score_shape = fluid.layers.cast(x=score_shape, dtype='float32')
+            norm = fluid.layers.reduce_prod(score_shape)
+            norm.stop_gradient = True
+            rpn_reg_loss = rpn_reg_loss / norm
+        elif self.rpn_cls_loss == 'SigmoidFocalLoss':
+            rpn_reg_loss = rpn_reg_loss / fluid.layers.cast(fg_num,
+                                                            rpn_reg_loss.dtype)
         return {'loss_rpn_cls': rpn_cls_loss, 'loss_rpn_bbox': rpn_reg_loss}
 
 
@@ -333,6 +380,9 @@ class FPNRPNHead(RPNHead):
             rpn_positive_overlap=0.7,
             rpn_negative_overlap=0.3,
             use_random=True,
+            rpn_cls_loss='SigmoidCrossEntropy',
+            rpn_focal_loss_gamma=2,
+            rpn_focal_loss_alpha=0.25,
             #train_proposal
             train_pre_nms_top_n=2000,
             train_post_nms_top_n=2000,
@@ -366,7 +416,10 @@ class FPNRPNHead(RPNHead):
             test_nms_thresh=test_nms_thresh,
             test_min_size=test_min_size,
             test_eta=test_eta,
-            num_classes=num_classes)
+            num_classes=num_classes,
+            rpn_cls_loss=rpn_cls_loss,
+            rpn_focal_loss_gamma=rpn_focal_loss_gamma,
+            rpn_focal_loss_alpha=rpn_focal_loss_alpha)
         self.anchor_start_size = anchor_start_size
         self.num_chan = num_chan
         self.min_level = min_level
@@ -410,7 +463,8 @@ class FPNRPNHead(RPNHead):
             name=conv_name,
             param_attr=ParamAttr(
                 name=conv_share_name + '_w',
-                initializer=Normal(loc=0., scale=0.01)),
+                initializer=Normal(
+                    loc=0., scale=0.01)),
             bias_attr=ParamAttr(
                 name=conv_share_name + '_b',
                 learning_rate=2.,
@@ -418,13 +472,18 @@ class FPNRPNHead(RPNHead):
 
         self.anchors, self.anchor_var = fluid.layers.anchor_generator(
             input=conv_rpn_fpn,
-            anchor_sizes=(self.anchor_start_size * 2.**
-                          (feat_lvl - self.min_level), ),
+            anchor_sizes=(self.anchor_start_size * 2.
+                          **(feat_lvl - self.min_level), ),
             stride=(2.**feat_lvl, 2.**feat_lvl),
             aspect_ratios=self.aspect_ratios,
             variance=self.variance)
 
         cls_num_filters = num_anchors * self.num_classes
+        if self.rpn_cls_loss == 'SigmoidCrossEntropy':
+            bias_init = None
+        elif self.rpn_cls_loss == 'SigmoidFocalLoss':
+            value = float(-np.log((1 - 0.01) / 0.01))
+            bias_init = Constant(value=value)
         self.rpn_cls_score = fluid.layers.conv2d(
             input=conv_rpn_fpn,
             num_filters=cls_num_filters,
@@ -433,9 +492,11 @@ class FPNRPNHead(RPNHead):
             name=cls_name,
             param_attr=ParamAttr(
                 name=cls_share_name + '_w',
-                initializer=Normal(loc=0., scale=0.01)),
+                initializer=Normal(
+                    loc=0., scale=0.01)),
             bias_attr=ParamAttr(
                 name=cls_share_name + '_b',
+                initializer=bias_init,
                 learning_rate=2.,
                 regularizer=L2Decay(0.)))
         self.rpn_bbox_pred = fluid.layers.conv2d(
@@ -446,7 +507,8 @@ class FPNRPNHead(RPNHead):
             name=bbox_name,
             param_attr=ParamAttr(
                 name=bbox_share_name + '_w',
-                initializer=Normal(loc=0., scale=0.01)),
+                initializer=Normal(
+                    loc=0., scale=0.01)),
             bias_attr=ParamAttr(
                 name=bbox_share_name + '_b',
                 learning_rate=2.,
@@ -471,8 +533,8 @@ class FPNRPNHead(RPNHead):
                 shape of (rois_num, 1).
         """
 
-        rpn_cls_score_fpn, rpn_bbox_pred_fpn = self._get_output(
-            body_feat, feat_lvl)
+        rpn_cls_score_fpn, rpn_bbox_pred_fpn = self._get_output(body_feat,
+                                                                feat_lvl)
 
         if self.num_classes == 1:
             rpn_cls_prob_fpn = fluid.layers.sigmoid(
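
The RPN head can now swap its per-anchor sigmoid cross entropy for a sigmoid focal loss. Below is a scalar NumPy rendering of the formula added above (alpha_t weighting, (1 - p_t)^gamma modulation, summed and normalized by the number of foreground anchors), plus the prior-probability bias initialization b = -log((1 - 0.01) / 0.01). It is an illustration only; the max(1, ...) guard on fg_num is added just to keep this standalone script safe.

import numpy as np

def rpn_sigmoid_focal_loss(logits, targets, alpha=0.25, gamma=2.0):
    # Per-anchor BCE scaled by alpha_t * (1 - p_t)^gamma, then summed and
    # divided by the number of foreground anchors.
    p = 1.0 / (1.0 + np.exp(-logits))
    bce = -(targets * np.log(p) + (1 - targets) * np.log(1 - p))
    p_t = p * targets + (1 - p) * (1 - targets)
    alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
    fg_num = max(1, int((targets >= 1).sum()))
    return np.sum(alpha_t * (1 - p_t) ** gamma * bce) / fg_num

# With bias b = -log((1 - 0.01) / 0.01) (~ -4.6), an untrained head predicts
# p ~= 0.01 for every anchor, so the loss does not blow up at the start of training.
logits = np.full(1000, -np.log((1 - 0.01) / 0.01))
targets = np.zeros(1000)
targets[:5] = 1.0  # a handful of foreground anchors
print(rpn_sigmoid_focal_loss(logits, targets))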

+ 8 - 3
paddlex/cv/nets/detection/yolo_v3.py

@@ -62,7 +62,8 @@ class YOLOv3:
             nms_topk=1000,
             nms_keep_topk=100,
             nms_iou_threshold=0.45,
-            fixed_input_shape=None):
+            fixed_input_shape=None,
+            input_channel=3):
         if anchors is None:
             anchors = [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45],
                        [59, 119], [116, 90], [156, 198], [373, 326]]
@@ -125,6 +126,7 @@ class YOLOv3:
         self.keep_prob = 0.9
         self.downsample = [32, 16, 8]
         self.clip_bbox = True
+        self.input_channel = input_channel
 
     def _head(self, input, is_train=True):
         outputs = []
@@ -446,13 +448,16 @@ class YOLOv3:
         inputs = OrderedDict()
         if self.fixed_input_shape is not None:
             input_shape = [
-                None, 3, self.fixed_input_shape[1], self.fixed_input_shape[0]
+                None, self.input_channel, self.fixed_input_shape[1],
+                self.fixed_input_shape[0]
             ]
             inputs['image'] = fluid.data(
                 dtype='float32', shape=input_shape, name='image')
         else:
             inputs['image'] = fluid.data(
-                dtype='float32', shape=[None, 3, None, None], name='image')
+                dtype='float32',
+                shape=[None, self.input_channel, None, None],
+                name='image')
         if self.mode == 'train':
             inputs['gt_box'] = fluid.data(
                 dtype='float32', shape=[None, None, 4], name='gt_box')
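
Note: with the new input_channel argument, the image placeholder is built from the channel count instead of a hard-coded 3. A small illustration of the resulting shapes (plain Python, mirroring the logic above):

    input_channel = 1                # e.g. grayscale industrial images
    fixed_input_shape = [608, 608]   # [width, height], as interpreted above

    if fixed_input_shape is not None:
        image_shape = [None, input_channel, fixed_input_shape[1], fixed_input_shape[0]]
    else:
        image_shape = [None, input_channel, None, None]

    print(image_shape)  # [None, 1, 608, 608]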

+ 3 - 6
paddlex/cv/nets/resnet.py

@@ -151,10 +151,8 @@ class ResNet(object):
                    groups=1,
                    act=None,
                    name=None,
-                   dcn_v2=False,
-                   use_lr_mult_list=False):
-        lr_mult = self.lr_mult_list[
-            self.curr_stage] if use_lr_mult_list else 1.0
+                   dcn_v2=False):
+        lr_mult = self.lr_mult_list[self.curr_stage]
         _name = self.prefix_name + name if self.prefix_name != '' else name
         if not dcn_v2:
             conv = fluid.layers.conv2d(
@@ -269,8 +267,7 @@ class ResNet(object):
                     pool_padding=0,
                     ceil_mode=True,
                     pool_type='avg')
-                return self._conv_norm(
-                    input, ch_out, 1, 1, name=name, use_lr_mult_list=True)
+                return self._conv_norm(input, ch_out, 1, 1, name=name)
             return self._conv_norm(input, ch_out, 1, stride, name=name)
         else:
             return input

+ 6 - 1
paddlex/cv/transforms/__init__.py

@@ -84,7 +84,12 @@ def build_transforms_v1(model_type, transforms_info, batch_transforms_info):
     return eval_transforms
 
 
-def arrange_transforms(model_type, class_name, transforms, mode='train'):
+def arrange_transforms(model_type,
+                       class_name,
+                       transforms,
+                       mode='train',
+                       input_channel=3):
+    transforms.input_channel = input_channel
     # Append the corresponding Arrange operation to transforms
     if model_type == 'classifier':
         arrange_transform = cls_transforms.ArrangeClassifier
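
Note: arrange_transforms now stamps the channel count onto the transforms object before appending the Arrange* op, so Compose knows how to decode the file. A rough sketch of the internal call; the 'detector' model_type and 'YOLOv3' class_name strings are assumptions:

    from paddlex.cv.transforms import arrange_transforms, det_transforms

    eval_transforms = det_transforms.Compose([det_transforms.Normalize()])
    arrange_transforms(
        model_type='detector',
        class_name='YOLOv3',
        transforms=eval_transforms,
        mode='eval',
        input_channel=1)
    # eval_transforms.input_channel is now 1, so decode_image reads with cv2.IMREAD_UNCHANGED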

+ 76 - 9
paddlex/cv/transforms/det_transforms.py

@@ -98,7 +98,7 @@ class Compose(DetTransform):
                 The returned fields are determined by the last data preprocessing operator in transforms.
         """
 
-        def decode_image(im_file, im_info, label_info):
+        def decode_image(im_file, im_info, label_info, input_channel=3):
             if im_info is None:
                 im_info = dict()
             if isinstance(im_file, np.ndarray):
@@ -109,12 +109,19 @@ class Compose(DetTransform):
                 im = im_file
             else:
                 try:
-                    im = cv2.imread(im_file).astype('float32')
+                    if input_channel == 3:
+                        im = cv2.imread(im_file).astype('float32')
+                    else:
+                        im = cv2.imread(im_file,
+                                        cv2.IMREAD_UNCHANGED).astype('float32')
+                        if im.ndim < 3:
+                            im = np.expand_dims(im, axis=-1)
                 except:
                     raise TypeError('Can\'t read The image file {}!'.format(
                         im_file))
             im = im.astype('float32')
-            im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
+            if input_channel == 3:
+                im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
             # make default im_info with [h, w, 1]
             im_info['im_resize_info'] = np.array(
                 [im.shape[0], im.shape[1], 1.], dtype=np.float32)
@@ -134,7 +141,8 @@ class Compose(DetTransform):
             else:
                 return (im, im_info, label_info)
 
-        outputs = decode_image(im, im_info, label_info)
+        input_channel = getattr(self, 'input_channel', 3)
+        outputs = decode_image(im, im_info, label_info, input_channel)
         im = outputs[0]
         im_info = outputs[1]
         if len(outputs) == 3:
@@ -146,6 +154,10 @@ class Compose(DetTransform):
                 outputs = op(im, im_info, label_info)
                 im = outputs[0]
             else:
+                if im.shape[-1] != 3:
+                    raise Exception(
+                        "Only 3-channel RGB images are supported by the imgaug operators, but the received image has {} channels.".
+                        format(im.shape[-1]))
                 im = execute_imgaug(op, im)
                 if label_info is not None:
                     outputs = (im, im_info, label_info)
@@ -172,12 +184,14 @@ class ResizeByShort(DetTransform):
     1. Get the lengths of the image's long and short sides.
     2. Compute the target length of the long side from the ratio between the short side and short_size;
        the resize ratio for height and width is short_size / (original short-side length).
+       If short_size is a list, one value is randomly chosen from it
+       and used as short_size.
     3. If max_size > 0, adjust the resize ratio:
        if the target long-side length exceeds max_size, the ratio becomes max_size / (original long-side length).
     4. Resize the image with the adjusted ratio.
 
     Args:
-        target_size (int): Target length of the short side. Defaults to 800.
+        short_size (int|list): Target length of the short side. Defaults to 800.
         max_size (int): Upper limit on the target length of the long side. Defaults to 1333.
 
      Raises:
@@ -186,9 +200,9 @@ class ResizeByShort(DetTransform):
 
     def __init__(self, short_size=800, max_size=1333):
         self.max_size = int(max_size)
-        if not isinstance(short_size, int):
+        if not (isinstance(short_size, int) or isinstance(short_size, list)):
             raise TypeError(
-                "Type of short_size is invalid. Must be Integer, now is {}".
+                "Type of short_size is invalid. Must be Integer or List, now is {}".
                 format(type(short_size)))
         self.short_size = short_size
         if not (isinstance(self.max_size, int)):
@@ -221,7 +235,12 @@ class ResizeByShort(DetTransform):
             raise ValueError('ResizeByShort: image is not 3-dimensional.')
         im_short_size = min(im.shape[0], im.shape[1])
         im_long_size = max(im.shape[0], im.shape[1])
-        scale = float(self.short_size) / im_short_size
+        if isinstance(self.short_size, list):
+            # Case for multi-scale training
+            selected_size = random.choice(self.short_size)
+        else:
+            selected_size = self.short_size
+        scale = float(selected_size) / im_short_size
         if self.max_size > 0 and np.round(scale *
                                           im_long_size) > self.max_size:
             scale = float(self.max_size) / float(im_long_size)
@@ -231,6 +250,8 @@ class ResizeByShort(DetTransform):
         im = cv2.resize(
             im, (resized_width, resized_height),
             interpolation=cv2.INTER_LINEAR)
+        if im.ndim < 3:
+            im = np.expand_dims(im, axis=-1)
         im_info['im_resize_info'] = np.array(im_resize_info).astype(np.float32)
         if label_info is None:
             return (im, im_info)
@@ -528,7 +549,9 @@ class Normalize(DetTransform):
         """
         mean = np.array(self.mean)[np.newaxis, np.newaxis, :]
         std = np.array(self.std)[np.newaxis, np.newaxis, :]
-        im = normalize(im, mean, std)
+        min_val = [0] * im.shape[-1]
+        max_val = [255] * im.shape[-1]
+        im = normalize(im, mean, std, min_val, max_val)
         if label_info is None:
             return (im, im_info)
         else:
@@ -582,6 +605,11 @@ class RandomDistort(DetTransform):
                   When label_info is not None, the returned tuple is (im, im_info, label_info), corresponding to
                   the image np.ndarray, the dict of image-related information, and the dict of annotation-related information.
         """
+        if im.shape[-1] != 3:
+            raise Exception(
+                "Only 3-channel RGB images are supported by the RandomDistort operator, but the received image has {} channels.".
+                format(im.shape[-1]))
+
         brightness_lower = 1 - self.brightness_range
         brightness_upper = 1 + self.brightness_range
         contrast_lower = 1 - self.contrast_range
@@ -1015,6 +1043,45 @@ class RandomCrop(DetTransform):
         return (im, im_info, label_info)
 
 
+class CLAHE(DetTransform):
+    """对图像进行对比度增强。
+    Args:
+        clip_limit (int|float): 颜色对比度的阈值,默认值为2.。
+        tile_grid_size (list|tuple): 进行像素均衡化的网格大小。默认值为(8, 8)。
+    Raises:
+        TypeError: 形参数据类型不满足需求。
+    """
+
+    def __init__(self, clip_limit=2., tile_grid_size=(8, 8)):
+        self.clip_limit = clip_limit
+        self.tile_grid_size = tile_grid_size
+
+    def __call__(self, im, im_info=None, label_info=None):
+        """
+        Args:
+            im (np.ndarray): Image data as an np.ndarray.
+            im_info (dict, optional): Dict of image-related information.
+            label_info (dict, optional): Dict of annotation-related information.
+
+        Returns:
+            tuple: When label_info is None, returns (im, im_info): the image np.ndarray and the dict of image-related information;
+                   when label_info is not None, returns (im, im_info, label_info), which additionally includes
+                   the dict of annotation-related information.
+        """
+        if im.shape[-1] != 1:
+            raise Exception(
+                "Only single-channel images are supported by the CLAHE operator, but the received image has {} channels.".
+                format(im.shape[-1]))
+        clahe = cv2.createCLAHE(
+            clipLimit=self.clip_limit, tileGridSize=self.tile_grid_size)
+        # Note: cv2's CLAHE only accepts 8-bit or 16-bit single-channel input, while
+        # images decoded by Compose are float32, so cast around the call.
+        im = clahe.apply(im.astype('uint8')).astype('float32')
+        if im.ndim < 3:
+            im = np.expand_dims(im, axis=-1)
+
+        if label_info is None:
+            return (im, im_info)
+        else:
+            return (im, im_info, label_info)
+
+
 class ArrangeFasterRCNN(DetTransform):
     """获取FasterRCNN模型训练/验证/预测所需信息。
 

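Note: taken together, the det_transforms changes above allow a single-channel pipeline: decode_image reads with IMREAD_UNCHANGED, CLAHE boosts local contrast on the 1-channel image, ResizeByShort can pick a random short side for multi-scale training, and Normalize scales with per-channel min/max. A hedged sketch of such a pipeline (mean/std values are placeholders, and the model would be built with input_channel=1):

    from paddlex.det import transforms

    train_transforms = transforms.Compose([
        transforms.CLAHE(clip_limit=2., tile_grid_size=(8, 8)),               # expects a 1-channel image
        transforms.ResizeByShort(short_size=[608, 736, 800], max_size=1333),  # multi-scale short side
        transforms.Normalize(mean=[0.5], std=[0.5]),                          # placeholder stats for 1 channel
        transforms.Padding(coarsest_stride=32),
    ])
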
+ 4 - 0
paddlex/cv/transforms/ops.py

@@ -45,6 +45,8 @@ def resize_long(im, long_size=224, interpolation=cv2.INTER_LINEAR):
 
     im = cv2.resize(
         im, (resized_width, resized_height), interpolation=interpolation)
+    if im.ndim < 3:
+        im = np.expand_dims(im, axis=-1)
     return im
 
 
@@ -56,6 +58,8 @@ def resize(im, target_size=608, interp=cv2.INTER_LINEAR):
         w = target_size
         h = target_size
     im = cv2.resize(im, (w, h), interpolation=interp)
+    if im.ndim < 3:
+        im = np.expand_dims(im, axis=-1)
     return im
 
 

+ 30 - 17
paddlex/cv/transforms/seg_transforms.py

@@ -65,7 +65,7 @@ class Compose(SegTransform):
                     )
 
     @staticmethod
-    def read_img(img_path):
+    def read_img(img_path, input_channel=3):
         img_format = imghdr.what(img_path)
         name, ext = osp.splitext(img_path)
         if img_format == 'tiff' or ext == '.img':
@@ -83,23 +83,26 @@ class Compose(SegTransform):
             im_data = dataset.ReadAsArray()
             return im_data.transpose((1, 2, 0))
         elif img_format in ['jpeg', 'bmp', 'png']:
-            return cv2.imread(img_path)
+            if input_channel == 3:
+                return cv2.imread(img_path)
+            else:
+                im = cv2.imread(img_path, cv2.IMREAD_UNCHANGED)
+                if im.ndim < 3:
+                    im = np.expand_dims(im, axis=-1)
+                return im
         elif ext == '.npy':
             return np.load(img_path)
         else:
             raise Exception('Image format {} is not supported!'.format(ext))
 
     @staticmethod
-    def decode_image(im, label):
-        if isinstance(im, np.ndarray):
-            if len(im.shape) != 3:
+    def decode_image(im_path, label, input_channel=3):
+        if isinstance(im_path, np.ndarray):
+            if len(im_path.shape) != 3:
                 raise Exception(
                     "im should be 3-dimensions, but now is {}-dimensions".
-                    format(len(im.shape)))
+                    format(len(im_path.shape)))
+            im = im_path
         else:
             try:
-                im_path = im
-                im = Compose.read_img(im).astype('float32')
+                im = Compose.read_img(im_path, input_channel).astype('float32')
             except:
                 raise ValueError('Can\'t read The image file {}!'.format(
                     im_path))
@@ -136,8 +139,9 @@ class Compose(SegTransform):
             tuple: A tuple of the fields required by the network; the fields are determined by the last preprocessing operator in transforms.
         """
 
-        im, label = self.decode_image(im, label)
-        if self.to_rgb:
+        input_channel = getattr(self, 'input_channel', 3)
+        im, label = self.decode_image(im, label, input_channel)
+        if self.to_rgb and input_channel == 3:
             im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
         if im_info is None:
             im_info = [('origin_shape', im.shape[0:2])]
@@ -343,6 +347,8 @@ class Resize(SegTransform):
             fx=im_scale_x,
             fy=im_scale_y,
             interpolation=self.interp_dict[self.interp])
+        if im.ndim < 3:
+            im = np.expand_dims(im, axis=-1)
         if label is not None:
             label = cv2.resize(
                 label,
@@ -457,13 +463,16 @@ class ResizeByShort(SegTransform):
         im_short_size = min(im.shape[0], im.shape[1])
         im_long_size = max(im.shape[0], im.shape[1])
         scale = float(self.short_size) / im_short_size
-        if self.max_size > 0 and np.round(scale * im_long_size) > self.max_size:
+        if self.max_size > 0 and np.round(scale *
+                                          im_long_size) > self.max_size:
             scale = float(self.max_size) / float(im_long_size)
         resized_width = int(round(im.shape[1] * scale))
         resized_height = int(round(im.shape[0] * scale))
         im = cv2.resize(
             im, (resized_width, resized_height),
             interpolation=cv2.INTER_NEAREST)
+        if im.ndim < 3:
+            im = np.expand_dims(im, axis=-1)
         if label is not None:
             label = cv2.resize(
                 label, (resized_width, resized_height),
@@ -585,6 +594,8 @@ class ResizeStepScaling(SegTransform):
             fx=scale_factor,
             fy=scale_factor,
             interpolation=cv2.INTER_LINEAR)
+        if im.ndim < 3:
+            im = np.expand_dims(im, axis=-1)
         if label is not None:
             label = cv2.resize(
                 label, (0, 0),
@@ -733,12 +744,12 @@ class Padding(SegTransform):
             im = np.zeros((im_height + pad_height, im_width + pad_width,
                            im_channel)).astype(orig_im.dtype)
             for i in range(im_channel):
-                im[:, :, i] = np.pad(orig_im[:, :, i],
-                                     pad_width=((0, pad_height),
-                                                (0, pad_width)),
-                                     mode='constant',
-                                     constant_values=(self.im_padding_value[i],
-                                                      self.im_padding_value[i]))
+                im[:, :, i] = np.pad(
+                    orig_im[:, :, i],
+                    pad_width=((0, pad_height), (0, pad_width)),
+                    mode='constant',
+                    constant_values=(self.im_padding_value[i],
+                                     self.im_padding_value[i]))
 
             if label is not None:
                 label = np.pad(label,
@@ -1031,6 +1042,8 @@ class RandomScaleAspect(SegTransform):
                     im = cv2.resize(
                         im, (img_width, img_height),
                         interpolation=cv2.INTER_LINEAR)
+                    if im.ndim < 3:
+                        im = np.expand_dims(im, axis=-1)
                     label = cv2.resize(
                         label, (img_width, img_height),
                         interpolation=cv2.INTER_NEAREST)

+ 18 - 13
paddlex/deploy.py

@@ -80,14 +80,16 @@ class Predictor:
             to_rgb = False
         self.transforms = build_transforms(self.model_type,
                                            self.info['Transforms'], to_rgb)
-        self.predictor = self.create_predictor(use_gpu, gpu_id, use_mkl,
-                                               mkl_thread_num, use_trt,
-                                               use_glog, memory_optimize,
-                                               max_trt_batch_size)
+        self.predictor = self.create_predictor(
+            use_gpu, gpu_id, use_mkl, mkl_thread_num, use_trt, use_glog,
+            memory_optimize, max_trt_batch_size)
         # Thread pool used during prediction to process the input data in parallel, one image per task.
         # Mainly used by the batch_predict interface.
         thread_num = mp.cpu_count() if mp.cpu_count() < 8 else 8
         self.thread_pool = mp.pool.ThreadPool(thread_num)
+        self.input_channel = 3
+        if 'input_channel' in self.info['_init_params']:
+            self.input_channel = self.info['_init_params']['input_channel']
 
     def reset_thread_pool(self, thread_num):
         self.thread_pool.close()
@@ -112,12 +114,12 @@ class Predictor:
             config.enable_use_gpu(100, gpu_id)
             if use_trt:
                 config.enable_tensorrt_engine(
-                            workspace_size=1<<10,
-                            max_batch_size=max_trt_batch_size,
-                            min_subgraph_size=3,
-                            precision_mode=fluid.core.AnalysisConfig.Precision.Float32,
-                            use_static=False,
-                            use_calib_mode=False)
+                    workspace_size=1 << 10,
+                    max_batch_size=max_trt_batch_size,
+                    min_subgraph_size=3,
+                    precision_mode=fluid.core.AnalysisConfig.Precision.Float32,
+                    use_static=False,
+                    use_calib_mode=False)
         else:
             config.disable_gpu()
         if use_mkl and not use_gpu:
@@ -165,7 +167,8 @@ class Predictor:
                     self.transforms,
                     self.model_type,
                     self.model_name,
-                    thread_pool=thread_pool)
+                    thread_pool=thread_pool,
+                    input_channel=self.input_channel)
                 res['image'] = im
                 res['im_size'] = im_size
             if self.model_name.count('RCNN') > 0:
@@ -174,7 +177,8 @@ class Predictor:
                     self.transforms,
                     self.model_type,
                     self.model_name,
-                    thread_pool=thread_pool)
+                    thread_pool=thread_pool,
+                    input_channel=self.input_channel)
                 res['image'] = im
                 res['im_info'] = im_resize_info
                 res['im_shape'] = im_shape
@@ -184,7 +188,8 @@ class Predictor:
                 self.transforms,
                 self.model_type,
                 self.model_name,
-                thread_pool=thread_pool)
+                thread_pool=thread_pool,
+                input_channel=self.input_channel)
             res['image'] = im
             res['im_info'] = im_info
         return res
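
Note: the exported inference model now records input_channel in its _init_params and the Predictor picks it up automatically, so deployment code stays the same. A hedged usage sketch (the model directory and image paths are placeholders):

    import paddlex as pdx

    predictor = pdx.deploy.Predictor('./inference_model', use_gpu=True, gpu_id=0)
    result = predictor.predict('test.jpg')                  # single image
    results = predictor.batch_predict(['a.jpg', 'b.jpg'])   # served by the thread pool above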

+ 3 - 0
paddlex/det.py

@@ -14,6 +14,7 @@
 
 from __future__ import absolute_import
 from . import cv
+from . import tools
 
 FasterRCNN = cv.models.FasterRCNN
 YOLOv3 = cv.models.YOLOv3
@@ -22,3 +23,5 @@ MaskRCNN = cv.models.MaskRCNN
 transforms = cv.transforms.det_transforms
 visualize = cv.models.utils.visualize.visualize_detection
 draw_pr_curve = cv.models.utils.visualize.draw_pr_curve
+coco_error_analysis = cv.models.utils.detection_eval.coco_error_analysis
+paste_objects = tools.dataset_generate.det.paste_objects
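
Note: both new entry points are re-exported on paddlex.det so they can be reached without importing submodules. A hedged sketch; the keyword names and file paths below are assumptions:

    import paddlex as pdx

    # COCO-style error analysis from the eval_details file written during model.evaluate()
    pdx.det.coco_error_analysis(
        eval_details_file='./output/eval_details.json',
        save_dir='./output/error_analysis')

    # pdx.det.paste_objects is demonstrated with the full templates/background structure further below.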

+ 1 - 0
paddlex/tools/__init__.py

@@ -16,3 +16,4 @@
 
 from .convert import *
 from .split import *
+from .dataset_generate import *

+ 5 - 2
paddlex/tools/base.py

@@ -18,6 +18,7 @@ import json
 import chardet
 import numpy as np
 
+
 class MyEncoder(json.JSONEncoder):
     def default(self, obj):
         if isinstance(obj, np.integer):
@@ -28,7 +29,8 @@ class MyEncoder(json.JSONEncoder):
             return obj.tolist()
         else:
             return super(MyEncoder, self).default(obj)
-        
+
+
 def is_pic(img_name):
     valid_suffix = ["JPEG", "jpeg", "JPG", "jpg", "BMP", "bmp", "PNG", "png"]
     suffix = img_name.split(".")[-1]
@@ -36,9 +38,10 @@ def is_pic(img_name):
         return False
     return True
 
+
 def get_encoding(path):
     f = open(path, 'rb')
     data = f.read()
     file_encoding = chardet.detect(data).get('encoding')
     f.close()
-    return file_encoding
+    return file_encoding

+ 2 - 1
paddlex/tools/convert.py

@@ -36,6 +36,7 @@ jingling2seg = JingLing2Seg().convert
 labelme2seg = LabelMe2Seg().convert
 easydata2seg = EasyData2Seg().convert
 
+
 def dataset_conversion(source, to, pics, anns, save_dir):
     if source == 'labelme' and to == 'PascalVOC':
         labelme2voc(pics, anns, save_dir)
@@ -56,4 +57,4 @@ def dataset_conversion(source, to, pics, anns, save_dir):
     elif source == 'easydata' and to == 'MSCOCO':
         easydata2coco(pics, anns, save_dir)
     elif source == 'easydata' and to == 'SEG':
-        easydata2seg(pics, anns, save_dir)
+        easydata2seg(pics, anns, save_dir)

+ 15 - 0
paddlex/tools/dataset_generate/__init__.py

@@ -0,0 +1,15 @@
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import det

+ 208 - 0
paddlex/tools/dataset_generate/det.py

@@ -0,0 +1,208 @@
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+import os
+import os.path as osp
+import random
+import cv2
+import time
+import numpy as np
+import xml.etree.ElementTree as ET
+import paddlex.utils.logging as logging
+
+
+def write_xml(im_info, label_info, anno_dir):
+    im_fname = im_info['file_name']
+    im_h, im_w, im_c = im_info['image_shape']
+    is_crowd = label_info['is_crowd']
+    gt_class = label_info['gt_class']
+    gt_bbox = label_info['gt_bbox']
+    gt_score = label_info['gt_score']
+    gt_poly = label_info['gt_poly']
+    difficult = label_info['difficult']
+    import xml.dom.minidom as minidom
+    xml_doc = minidom.Document()
+    root = xml_doc.createElement("annotation")
+    xml_doc.appendChild(root)
+    node_filename = xml_doc.createElement("filename")
+    node_filename.appendChild(xml_doc.createTextNode(im_fname))
+    root.appendChild(node_filename)
+    node_size = xml_doc.createElement("size")
+    node_width = xml_doc.createElement("width")
+    node_width.appendChild(xml_doc.createTextNode(str(im_w)))
+    node_size.appendChild(node_width)
+    node_height = xml_doc.createElement("height")
+    node_height.appendChild(xml_doc.createTextNode(str(im_h)))
+    node_size.appendChild(node_height)
+    node_depth = xml_doc.createElement("depth")
+    node_depth.appendChild(xml_doc.createTextNode(str(im_c)))
+    node_size.appendChild(node_depth)
+    root.appendChild(node_size)
+    for i in range(len(label_info['gt_class'])):
+        node_obj = xml_doc.createElement("object")
+        node_name = xml_doc.createElement("name")
+        label = gt_class[i]
+        node_name.appendChild(xml_doc.createTextNode(label))
+        node_obj.appendChild(node_name)
+        node_diff = xml_doc.createElement("difficult")
+        node_diff.appendChild(xml_doc.createTextNode(str(difficult[i][0])))
+        node_obj.appendChild(node_diff)
+        node_box = xml_doc.createElement("bndbox")
+        node_xmin = xml_doc.createElement("xmin")
+        node_xmin.appendChild(xml_doc.createTextNode(str(gt_bbox[i][0])))
+        node_box.appendChild(node_xmin)
+        node_ymin = xml_doc.createElement("ymin")
+        node_ymin.appendChild(xml_doc.createTextNode(str(gt_bbox[i][1])))
+        node_box.appendChild(node_ymin)
+        node_xmax = xml_doc.createElement("xmax")
+        node_xmax.appendChild(xml_doc.createTextNode(str(gt_bbox[i][2])))
+        node_box.appendChild(node_xmax)
+        node_ymax = xml_doc.createElement("ymax")
+        node_ymax.appendChild(xml_doc.createTextNode(str(gt_bbox[i][3])))
+        node_box.appendChild(node_ymax)
+        node_obj.appendChild(node_box)
+        root.appendChild(node_obj)
+    img_name_part = im_fname.split('.')[0]
+    with open(osp.join(anno_dir, img_name_part + ".xml"), 'w') as fxml:
+        xml_doc.writexml(
+            fxml, indent='\t', addindent='\t', newl='\n', encoding="utf-8")
+
+
+def paste_objects(templates, background, save_dir='dataset_clone'):
+    """将目标物体粘贴在背景图片上生成新的图片,并加入到数据集中
+
+    Args:
+        templates (list|tuple):可以将多张图像上的目标物体同时粘贴在同一个背景图片上,
+            因此templates是一个列表,其中每个元素是一个dict,表示一张图片的目标物体。
+            一张图片的目标物体有`image`和`annos`两个关键字,`image`的键值是图像的路径,
+            或者是解码后的排列格式为(H, W, C)且类型为uint8且为BGR格式的数组。
+            图像上可以有多个目标物体,因此`annos`的键值是一个列表,列表中每个元素是一个dict,
+            表示一个目标物体的信息。该dict包含`polygon`和`category`两个关键字,
+            其中`polygon`表示目标物体的边缘坐标,例如[[0, 0], [0, 1], [1, 1], [1, 0]],
+            `category`表示目标物体的类别,例如'dog'。
+        background (dict): 背景图片可以有真值,因此background是一个dict,包含`image`和`annos`
+            两个关键字,`image`的键值是背景图像的路径,或者是解码后的排列格式为(H, W, C)
+            且类型为uint8且为BGR格式的数组。若背景图片上没有真值,则`annos`的键值是空列表[],
+            若有,则`annos`的键值是由多个dict组成的列表,每个dict表示一个物体的信息,
+            包含`bbox`和`category`两个关键字,`bbox`的键值是物体框左上角和右下角的坐标,即
+            [x1, y1, x2, y2],`category`表示目标物体的类别,例如'dog'。
+        save_dir (str):新图片及其标注文件的存储目录。默认值为`dataset_clone`。
+
+    """
+    if not osp.exists(save_dir):
+        os.makedirs(save_dir)
+    image_dir = osp.join(save_dir, 'JPEGImages_clone')
+    anno_dir = osp.join(save_dir, 'Annotations_clone')
+    json_path = osp.join(save_dir, "annotations.json")
+    if not osp.exists(image_dir):
+        os.makedirs(image_dir)
+    if not osp.exists(anno_dir):
+        os.makedirs(anno_dir)
+
+    num_objs = len(background['annos'])
+    for temp in templates:
+        num_objs += len(temp['annos'])
+
+    gt_bbox = np.zeros((num_objs, 4), dtype=np.float32)
+    gt_class = list()
+    gt_score = np.ones((num_objs, 1), dtype=np.float32)
+    is_crowd = np.zeros((num_objs, 1), dtype=np.int32)
+    difficult = np.zeros((num_objs, 1), dtype=np.int32)
+    i = -1
+    for i, back_anno in enumerate(background['annos']):
+        gt_bbox[i] = back_anno['bbox']
+        gt_class.append(back_anno['category'])
+
+    back_im = background['image']
+    if isinstance(back_im, np.ndarray):
+        if len(back_im.shape) != 3:
+            raise Exception(
+                "background image should be 3-dimensions, but now is {}-dimensions".
+                format(len(back_im.shape)))
+    else:
+        try:
+            back_im = cv2.imread(back_im, cv2.IMREAD_UNCHANGED)
+        except:
+            raise TypeError('Can\'t read the image file {}!'.format(back_im))
+    back_annos = background['annos']
+    im_h, im_w, im_c = back_im.shape
+    for temp in templates:
+        temp_im = temp['image']
+        if isinstance(temp_im, np.ndarray):
+            if len(temp_im.shape) != 3:
+                raise Exception(
+                    "template image should be 3-dimensions, but now is {}-dimensions".
+                    format(len(temp_im.shape)))
+        else:
+            try:
+                temp_im = cv2.imread(temp_im, cv2.IMREAD_UNCHANGED)
+            except:
+                raise TypeError('Can\'t read the image file {}!'.format(
+                    temp_im))
+        temp_annos = temp['annos']
+        for temp_anno in temp_annos:
+            temp_mask = np.zeros(temp_im.shape, temp_im.dtype)
+            temp_poly = np.array(temp_anno['polygon'], np.int32)
+            temp_category = temp_anno['category']
+            cv2.fillPoly(temp_mask, [temp_poly], (255, 255, 255))
+            x_list = [temp_poly[i][0] for i in range(len(temp_poly))]
+            y_list = [temp_poly[i][1] for i in range(len(temp_poly))]
+            temp_poly_w = max(x_list) - min(x_list)
+            temp_poly_h = max(y_list) - min(y_list)
+            found = False
+            while not found:
+                center_x = random.randint(1, im_w - 1)
+                center_y = random.randint(1, im_h - 1)
+                if center_x < temp_poly_w / 2 or center_x > im_w - temp_poly_w / 2 - 1 or \
+                   center_y < temp_poly_h / 2 or center_y > im_h - temp_poly_h / 2 - 1:
+                    found = False
+                    continue
+                found = True
+                for back_anno in back_annos:
+                    x1, y1, x2, y2 = back_anno['bbox']
+                    if x1 < center_x < x2 and y1 < center_y < y2:
+                        # The sampled center falls inside an existing ground-truth box; resample.
+                        found = False
+                        break
+            center = (center_x, center_y)
+            back_im = cv2.seamlessClone(temp_im, back_im, temp_mask, center,
+                                        cv2.MIXED_CLONE)
+            i += 1
+            x1 = center[0] - temp_poly_w / 2
+            x2 = center[0] + temp_poly_w / 2
+            y1 = center[1] - temp_poly_h / 2
+            y2 = center[1] + temp_poly_h / 2
+            gt_bbox[i] = [x1, y1, x2, y2]
+            gt_class.append(temp_category)
+
+    im_fname = str(int(time.time() * 1000)) + '.jpg'
+    im_info = {
+        'file_name': im_fname,
+        'image_shape': [im_h, im_w, im_c],
+    }
+    label_info = {
+        'is_crowd': is_crowd,
+        'gt_class': gt_class,
+        'gt_bbox': gt_bbox,
+        'gt_score': gt_score,
+        'difficult': difficult,
+        'gt_poly': [],
+    }
+    cv2.imwrite(osp.join(image_dir, im_fname), back_im.astype('uint8'))
+    write_xml(im_info, label_info, anno_dir)
+    logging.info("Gegerated image is saved in {}".format(image_dir))
+    logging.info("Generated Annotation is saved as xml files in {}".format(
+        anno_dir))
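
Note: a hedged usage sketch of paste_objects built from the docstring above (file names are placeholders and the polygon is a simple rectangle):

    from paddlex.det import paste_objects

    templates = [{
        'image': 'defect_crop.jpg',          # image that contains the object to paste
        'annos': [{
            'polygon': [[10, 10], [10, 60], [80, 60], [80, 10]],
            'category': 'scratch'
        }]
    }]
    background = {
        'image': 'clean_board.jpg',          # background image
        'annos': []                          # no existing ground-truth boxes
    }
    paste_objects(templates, background, save_dir='dataset_clone')
    # The new JPEG is written to dataset_clone/JPEGImages_clone and the XML to dataset_clone/Annotations_clone.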

+ 52 - 23
paddlex/tools/x2voc.py

@@ -22,10 +22,11 @@ import shutil
 import numpy as np
 from .base import MyEncoder, is_pic, get_encoding
 
+
 class X2VOC(object):
     def __init__(self):
         pass
-    
+
     def convert(self, image_dir, json_dir, dataset_save_dir):
         """转换。
         Args:
@@ -45,32 +46,35 @@ class X2VOC(object):
         for img_name in os.listdir(image_dir):
             if is_pic(img_name):
                 shutil.copyfile(
-                            osp.join(image_dir, img_name),
-                            osp.join(new_image_dir, img_name))
+                    osp.join(image_dir, img_name),
+                    osp.join(new_image_dir, img_name))
         # Convert the json files.
         xml_dir = osp.join(dataset_save_dir, "Annotations")
         if osp.exists(xml_dir):
             shutil.rmtree(xml_dir)
         os.makedirs(xml_dir)
         self.json2xml(new_image_dir, json_dir, xml_dir)
-        
-        
+
+
 class LabelMe2VOC(X2VOC):
     """将使用LabelMe标注的数据集转换为VOC数据集。
     """
+
     def __init__(self):
         pass
-    
+
     def json2xml(self, image_dir, json_dir, xml_dir):
         import xml.dom.minidom as minidom
+        i = 0
         for img_name in os.listdir(image_dir):
             img_name_part = osp.splitext(img_name)[0]
             json_file = osp.join(json_dir, img_name_part + ".json")
+            i += 1
             if not osp.exists(json_file):
                 os.remove(osp.join(image_dir, img_name))
                 continue
-            xml_doc = minidom.Document() 
-            root = xml_doc.createElement("annotation") 
+            xml_doc = minidom.Document()
+            root = xml_doc.createElement("annotation")
             xml_doc.appendChild(root)
             node_folder = xml_doc.createElement("folder")
             node_folder.appendChild(xml_doc.createTextNode("JPEGImages"))
@@ -81,8 +85,13 @@ class LabelMe2VOC(X2VOC):
             with open(json_file, mode="r", \
                               encoding=get_encoding(json_file)) as j:
                 json_info = json.load(j)
-                h = json_info["imageHeight"]
-                w = json_info["imageWidth"]
+                if 'imageHeight' in json_info and 'imageWidth' in json_info:
+                    h = json_info["imageHeight"]
+                    w = json_info["imageWidth"]
+                else:
+                    img_file = osp.join(image_dir, img_name)
+                    im_data = cv2.imread(img_file)
+                    h, w, c = im_data.shape
                 node_size = xml_doc.createElement("size")
                 node_width = xml_doc.createElement("width")
                 node_width.appendChild(xml_doc.createTextNode(str(w)))
@@ -95,12 +104,22 @@ class LabelMe2VOC(X2VOC):
                 node_size.appendChild(node_depth)
                 root.appendChild(node_size)
                 for shape in json_info["shapes"]:
-                    if shape["shape_type"] != "rectangle":
-                        continue
+                    if 'shape_type' in shape:
+                        if shape["shape_type"] != "rectangle":
+                            continue
+                        (xmin, ymin), (xmax, ymax) = shape["points"]
+                        xmin, xmax = sorted([xmin, xmax])
+                        ymin, ymax = sorted([ymin, ymax])
+                    else:
+                        points = shape["points"]
+                        points_num = len(points)
+                        x = [points[i][0] for i in range(points_num)]
+                        y = [points[i][1] for i in range(points_num)]
+                        xmin = min(x)
+                        xmax = max(x)
+                        ymin = min(y)
+                        ymax = max(y)
                     label = shape["label"]
-                    (xmin, ymin), (xmax, ymax) = shape["points"]
-                    xmin, xmax = sorted([xmin, xmax])
-                    ymin, ymax = sorted([ymin, ymax])
                     node_obj = xml_doc.createElement("object")
                     node_name = xml_doc.createElement("name")
                     node_name.appendChild(xml_doc.createTextNode(label))
@@ -124,15 +143,21 @@ class LabelMe2VOC(X2VOC):
                     node_obj.appendChild(node_box)
                     root.appendChild(node_obj)
             with open(osp.join(xml_dir, img_name_part + ".xml"), 'w') as fxml:
-                xml_doc.writexml(fxml, indent='\t', addindent='\t', newl='\n', encoding="utf-8")
-                    
-                    
+                xml_doc.writexml(
+                    fxml,
+                    indent='\t',
+                    addindent='\t',
+                    newl='\n',
+                    encoding="utf-8")
+
+
 class EasyData2VOC(X2VOC):
     """将使用EasyData标注的分割数据集转换为VOC数据集。
     """
+
     def __init__(self):
         pass
-    
+
     def json2xml(self, image_dir, json_dir, xml_dir):
         import xml.dom.minidom as minidom
         for img_name in os.listdir(image_dir):
@@ -141,8 +166,8 @@ class EasyData2VOC(X2VOC):
             if not osp.exists(json_file):
                 os.remove(osp.join(image_dir, img_name))
                 continue
-            xml_doc = minidom.Document() 
-            root = xml_doc.createElement("annotation") 
+            xml_doc = minidom.Document()
+            root = xml_doc.createElement("annotation")
             xml_doc.appendChild(root)
             node_folder = xml_doc.createElement("folder")
             node_folder.appendChild(xml_doc.createTextNode("JPEGImages"))
@@ -196,5 +221,9 @@ class EasyData2VOC(X2VOC):
                     node_obj.appendChild(node_box)
                     root.appendChild(node_obj)
             with open(osp.join(xml_dir, img_name_part + ".xml"), 'w') as fxml:
-                xml_doc.writexml(fxml, indent='\t', addindent='\t', newl='\n', encoding="utf-8")                    
-                    
+                xml_doc.writexml(
+                    fxml,
+                    indent='\t',
+                    addindent='\t',
+                    newl='\n',
+                    encoding="utf-8")
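
Note: for LabelMe shapes without a shape_type, the converter above derives the box from the polygon's extrema. The same computation in isolation, with toy points:

    points = [[120.0, 40.0], [200.0, 40.0], [200.0, 95.0], [120.0, 95.0]]
    x = [p[0] for p in points]
    y = [p[1] for p in points]
    xmin, xmax, ymin, ymax = min(x), max(x), min(y), max(y)
    print(xmin, ymin, xmax, ymax)  # 120.0 40.0 200.0 95.0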