# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
import collections
import copy
import os
import os.path as osp

import numpy as np
import paddle
import six
from paddle.io import DistributedBatchSampler, DataLoader
from paddle.static import InputSpec

import paddlex
import paddlex.utils.logging as logging
from paddlex.cv.nets.ppdet.modeling.proposal_generator.target_layer import BBoxAssigner, MaskAssigner
from paddlex.cv.nets.ppdet.modeling import *
from paddlex.cv.nets.ppdet.modeling.post_process import *
from paddlex.cv.nets.ppdet.modeling.layers import YOLOBox, MultiClassNMS, MatrixNMS, RCNNBox
from paddlex.utils import get_single_card_bs, _get_shared_memory_size_in_M
from paddlex.cv.transforms.operators import _NormalizeBox, _PadBox, _BboxXYXY2XYWH
from paddlex.cv.transforms.batch_operators import BatchCompose, BatchRandomResize, BatchRandomResizeByShort, _BatchPadding, _Gt2YoloTarget, _Permute
from paddlex.cv.transforms import arrange_transforms
from .base import BaseModel
from .utils.det_metrics import VOCMetric, COCOMetric
from paddlex.utils.checkpoint import det_pretrain_weights_dict

__all__ = [
    "YOLOv3", "FasterRCNN", "PPYOLO", "PPYOLOTiny", "PPYOLOv2", "MaskRCNN"
]


class BaseDetector(BaseModel):
    def __init__(self, model_name, num_classes=80, **params):
        self.init_params.update(locals())
        del self.init_params['params']
        super(BaseDetector, self).__init__('detector')
        if not hasattr(architectures, model_name):
            raise Exception("ERROR: There's no model named {}.".format(
                model_name))
        self.model_name = model_name
        self.num_classes = num_classes
        self.labels = None
        self.net = self.build_net(**params)

    def build_net(self, **params):
        with paddle.utils.unique_name.guard():
            net = architectures.__dict__[self.model_name](**params)
        return net

    def get_test_inputs(self, image_shape):
        input_spec = [{
            "image": InputSpec(
                shape=[None, 3] + image_shape, name='image', dtype='float32'),
            "im_shape": InputSpec(
                shape=[None, 2], name='im_shape', dtype='float32'),
            "scale_factor": InputSpec(
                shape=[None, 2], name='scale_factor', dtype='float32')
        }]
        return input_spec

    def _get_backbone(self, backbone_name, **params):
        backbone = backbones.__dict__[backbone_name](**params)
        return backbone

    def run(self, net, inputs, mode):
        net_out = net(inputs)
        if mode in ['train', 'eval']:
            outputs = net_out
        else:
            for key in ['im_shape', 'scale_factor']:
                net_out[key] = inputs[key]
            outputs = dict()
            for key in net_out:
                outputs[key] = net_out[key].numpy()
        return outputs

    def default_optimizer(self, parameters, learning_rate, warmup_steps,
                          warmup_start_lr, lr_decay_epochs, lr_decay_gamma,
                          num_steps_each_epoch):
        boundaries = [b * num_steps_each_epoch for b in lr_decay_epochs]
        values = [(lr_decay_gamma**i) * learning_rate
                  for i in range(len(lr_decay_epochs) + 1)]
        scheduler = paddle.optimizer.lr.PiecewiseDecay(
            boundaries=boundaries, values=values)
        if warmup_steps > 0:
            if warmup_steps > lr_decay_epochs[0] * num_steps_each_epoch:
                logging.error(
                    "In function train(), parameters should satisfy: "
                    "warmup_steps <= lr_decay_epochs[0] * num_steps_each_epoch",
                    exit=False)
                logging.error(
                    "See this doc for more information: "
                    "https://github.com/PaddlePaddle/PaddleX/blob/develop/docs/appendix/parameters.md#notice",
                    exit=False)
            scheduler = paddle.optimizer.lr.LinearWarmup(
                learning_rate=scheduler,
                warmup_steps=warmup_steps,
                start_lr=warmup_start_lr,
                end_lr=learning_rate)
        optimizer = paddle.optimizer.Momentum(
            scheduler,
            momentum=.9,
            weight_decay=paddle.regularizer.L2Decay(coeff=1e-04),
            parameters=parameters)
        return optimizer
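
    # A worked example of the schedule built above (numbers are illustrative,
    # not part of the API): with learning_rate=0.001,
    # lr_decay_epochs=(216, 243), lr_decay_gamma=0.1 and
    # num_steps_each_epoch=500, PiecewiseDecay receives
    #     boundaries = [108000, 121500]
    #     values     = [0.001, 0.0001, 0.00001]
    # i.e. the learning rate is multiplied by lr_decay_gamma at each
    # milestone. If warmup_steps > 0, LinearWarmup first ramps the rate from
    # warmup_start_lr up to learning_rate over warmup_steps steps, then hands
    # over to the piecewise schedule.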
    def train(self,
              num_epochs,
              train_dataset,
              train_batch_size=64,
              eval_dataset=None,
              optimizer=None,
              save_interval_epochs=1,
              log_interval_steps=10,
              save_dir='output',
              pretrain_weights='IMAGENET',
              learning_rate=.001,
              warmup_steps=0,
              warmup_start_lr=0.0,
              lr_decay_epochs=(216, 243),
              lr_decay_gamma=0.1,
              metric=None,
              early_stop=False,
              early_stop_patience=5,
              use_vdl=True):
        """
        Train the model.

        Args:
            num_epochs(int): The number of epochs.
            train_dataset(paddlex.dataset): Training dataset.
            train_batch_size(int, optional): Total batch size among all cards used in training.
                Defaults to 64.
            eval_dataset(paddlex.dataset, optional): Evaluation dataset. If None, the model
                will not be evaluated during the training process. Defaults to None.
            optimizer(paddle.optimizer.Optimizer or None, optional): Optimizer used for
                training. If None, a default optimizer is used. Defaults to None.
            save_interval_epochs(int, optional): Epoch interval for saving the model. Defaults to 1.
            log_interval_steps(int, optional): Step interval for printing training information.
                Defaults to 10.
            save_dir(str, optional): Directory to save the model. Defaults to 'output'.
            pretrain_weights(str or None, optional): None or name/path of pretrained weights.
                If None, no pretrained weights will be loaded. Defaults to 'IMAGENET'.
            learning_rate(float, optional): Learning rate for training. Defaults to .001.
            warmup_steps(int, optional): The number of steps of warm-up training. Defaults to 0.
            warmup_start_lr(float, optional): Start learning rate of warm-up training.
                Defaults to 0.0.
            lr_decay_epochs(list or tuple, optional): Epoch milestones for learning rate decay.
                Defaults to (216, 243).
            lr_decay_gamma(float, optional): Gamma coefficient of learning rate decay. Defaults to .1.
            metric({'VOC', 'COCO', None}, optional): Evaluation metric. If None, the metric is
                determined according to the dataset format. Defaults to None.
            early_stop(bool, optional): Whether to adopt the early-stop strategy. Defaults to False.
            early_stop_patience(int, optional): Early stop patience. Defaults to 5.
            use_vdl(bool, optional): Whether to use VisualDL to monitor the training process.
                Defaults to True.
        """
        if train_dataset.__class__.__name__ == 'VOCDetection':
            train_dataset.data_fields = {
                'im_id', 'image_shape', 'image', 'gt_bbox', 'gt_class',
                'difficult'
            }
        elif train_dataset.__class__.__name__ == 'CocoDetection':
            if self.__class__.__name__ == 'MaskRCNN':
                train_dataset.data_fields = {
                    'im_id', 'image_shape', 'image', 'gt_bbox', 'gt_class',
                    'gt_poly', 'is_crowd'
                }
            else:
                train_dataset.data_fields = {
                    'im_id', 'image_shape', 'image', 'gt_bbox', 'gt_class',
                    'is_crowd'
                }
        if metric is None:
            if eval_dataset.__class__.__name__ == 'VOCDetection':
                self.metric = 'voc'
            elif eval_dataset.__class__.__name__ == 'CocoDetection':
                self.metric = 'coco'
        else:
            assert metric.lower() in ['coco', 'voc'], \
                "Evaluation metric {} is not supported. Please choose from " \
                "'COCO' and 'VOC'.".format(metric)
            self.metric = metric.lower()
        train_dataset.batch_transforms = self._compose_batch_transform(
            train_dataset.transforms, mode='train')
        self.labels = train_dataset.labels

        # build the optimizer if not given
        if optimizer is None:
            num_steps_each_epoch = len(train_dataset) // train_batch_size
            self.optimizer = self.default_optimizer(
                parameters=self.net.parameters(),
                learning_rate=learning_rate,
                warmup_steps=warmup_steps,
                warmup_start_lr=warmup_start_lr,
                lr_decay_epochs=lr_decay_epochs,
                lr_decay_gamma=lr_decay_gamma,
                num_steps_each_epoch=num_steps_each_epoch)
        else:
            self.optimizer = optimizer

        # initialize the weights
        if pretrain_weights is not None and not osp.exists(pretrain_weights):
            if pretrain_weights not in det_pretrain_weights_dict['_'.join(
                    [self.model_name, self.backbone_name])]:
                logging.warning(
                    "Path of pretrain_weights('{}') does not exist!".format(
                        pretrain_weights))
                pretrain_weights = det_pretrain_weights_dict['_'.join(
                    [self.model_name, self.backbone_name])][0]
                logging.warning("pretrain_weights is forcibly set to '{}'. "
                                "If you don't want to use pretrained weights, "
                                "set pretrain_weights to None.".format(
                                    pretrain_weights))
        pretrained_dir = osp.join(save_dir, 'pretrain')
        self.net_initialize(
            pretrain_weights=pretrain_weights, save_dir=pretrained_dir)

        # start the train loop
        self.train_loop(
            num_epochs=num_epochs,
            train_dataset=train_dataset,
            train_batch_size=train_batch_size,
            eval_dataset=eval_dataset,
            save_interval_epochs=save_interval_epochs,
            log_interval_steps=log_interval_steps,
            save_dir=save_dir,
            early_stop=early_stop,
            early_stop_patience=early_stop_patience,
            use_vdl=use_vdl)
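
    # A minimal training sketch (the paths, dataset and transform choices are
    # assumptions for illustration, not part of this module):
    #
    #     import paddlex as pdx
    #     from paddlex import transforms as T
    #
    #     train_transforms = T.Compose([
    #         T.RandomHorizontalFlip(), T.Resize(target_size=608),
    #         T.Normalize()])
    #     train_ds = pdx.datasets.VOCDetection(
    #         data_dir='dataset', file_list='dataset/train_list.txt',
    #         label_list='dataset/labels.txt', transforms=train_transforms)
    #     model = pdx.det.YOLOv3(num_classes=len(train_ds.labels))
    #     model.train(num_epochs=270, train_dataset=train_ds,
    #                 train_batch_size=8, learning_rate=0.001,
    #                 save_dir='output/yolov3')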
    def evaluate(self,
                 eval_dataset,
                 batch_size=1,
                 metric=None,
                 return_details=False):
        """
        Evaluate the model.

        Args:
            eval_dataset(paddlex.dataset): Evaluation dataset.
            batch_size(int, optional): Total batch size among all cards used for evaluation.
                Defaults to 1.
            metric({'VOC', 'COCO', None}, optional): Evaluation metric. If None, the metric is
                determined according to the dataset format. Defaults to None.
            return_details(bool, optional): Whether to return evaluation details. Defaults to False.

        Returns:
            collections.OrderedDict with key-value pairs:
            {"mAP(0.50, 11point)": `mean average precision`}.
        """
        if eval_dataset.__class__.__name__ == 'VOCDetection':
            eval_dataset.data_fields = {
                'im_id', 'image_shape', 'image', 'gt_bbox', 'gt_class',
                'difficult'
            }
        elif eval_dataset.__class__.__name__ == 'CocoDetection':
            if self.__class__.__name__ == 'MaskRCNN':
                eval_dataset.data_fields = {
                    'im_id', 'image_shape', 'image', 'gt_bbox', 'gt_class',
                    'gt_poly', 'is_crowd'
                }
            else:
                eval_dataset.data_fields = {
                    'im_id', 'image_shape', 'image', 'gt_bbox', 'gt_class',
                    'is_crowd'
                }
        eval_dataset.batch_transforms = self._compose_batch_transform(
            eval_dataset.transforms, mode='eval')
        arrange_transforms(
            model_type=self.model_type,
            transforms=eval_dataset.transforms,
            mode='eval')

        self.net.eval()
        nranks = paddle.distributed.get_world_size()
        local_rank = paddle.distributed.get_rank()
        if nranks > 1:
            # Initialize the parallel environment if not done yet.
            if not paddle.distributed.parallel.parallel_helper._is_parallel_ctx_initialized(
            ):
                paddle.distributed.init_parallel_env()

        if batch_size > 1:
            logging.warning(
                "Detector only supports single-card evaluation with "
                "batch_size=1, so batch_size is forcibly set to 1.")
            batch_size = 1

        if nranks < 2 or local_rank == 0:
            self.eval_data_loader = self.build_data_loader(
                eval_dataset, batch_size=batch_size, mode='eval')
            is_bbox_normalized = False
            if eval_dataset.batch_transforms is not None:
                is_bbox_normalized = any(
                    isinstance(t, _NormalizeBox)
                    for t in eval_dataset.batch_transforms.batch_transforms)
            if metric is None:
                if getattr(self, 'metric', None) is not None:
                    if self.metric == 'voc':
                        eval_metric = VOCMetric(
                            labels=eval_dataset.labels,
                            coco_gt=copy.deepcopy(eval_dataset.coco_gt),
                            is_bbox_normalized=is_bbox_normalized,
                            classwise=False)
                    else:
                        eval_metric = COCOMetric(
                            coco_gt=copy.deepcopy(eval_dataset.coco_gt),
                            classwise=False)
                else:
                    if eval_dataset.__class__.__name__ == 'VOCDetection':
                        eval_metric = VOCMetric(
                            labels=eval_dataset.labels,
                            coco_gt=copy.deepcopy(eval_dataset.coco_gt),
                            is_bbox_normalized=is_bbox_normalized,
                            classwise=False)
                    elif eval_dataset.__class__.__name__ == 'CocoDetection':
                        eval_metric = COCOMetric(
                            coco_gt=copy.deepcopy(eval_dataset.coco_gt),
                            classwise=False)
            else:
                assert metric.lower() in ['coco', 'voc'], \
                    "Evaluation metric {} is not supported. Please choose " \
                    "from 'COCO' and 'VOC'.".format(metric)
                if metric.lower() == 'coco':
                    eval_metric = COCOMetric(
                        coco_gt=copy.deepcopy(eval_dataset.coco_gt),
                        classwise=False)
                else:
                    eval_metric = VOCMetric(
                        labels=eval_dataset.labels,
                        is_bbox_normalized=is_bbox_normalized,
                        classwise=False)

            scores = collections.OrderedDict()
            with paddle.no_grad():
                for step, data in enumerate(self.eval_data_loader):
                    outputs = self.run(self.net, data, 'eval')
                    eval_metric.update(data, outputs)
                eval_metric.accumulate()
                self.eval_details = eval_metric.details
                scores.update(eval_metric.get())
                eval_metric.reset()
            if return_details:
                return scores, self.eval_details
            return scores
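
    # A hypothetical evaluation sketch (assumes `model` and an eval dataset
    # `eval_ds` built like the training sketch above):
    #
    #     metrics = model.evaluate(eval_ds, batch_size=1)
    #     print(metrics)  # e.g. OrderedDict([('bbox_map', 76.2)])
    #     metrics, details = model.evaluate(eval_ds, return_details=True)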
    def predict(self, img_file, transforms=None):
        """
        Do inference.

        Args:
            img_file(list, str or np.ndarray): Image path or decoded image data
                in BGR format, or a list of paths/arrays, meaning all images
                are to be predicted as a mini-batch.
            transforms(paddlex.transforms.Compose or None, optional): Transforms
                for the inputs. If None, the transforms for the evaluation
                process are used. Defaults to None.

        Returns:
            If img_file is a string or np.ndarray, the result is a list of dicts
            with key-value pairs:
            {"category_id": `category_id`, "category": `category`, "bbox": `[x, y, w, h]`, "score": `score`}.
            If img_file is a list, the result is a list of such lists, one per
            image, where each dict has:
                category_id(int): the predicted category ID
                category(str): category name
                bbox(list): bounding box in [x, y, w, h] format
                score(float): confidence
        """
        if transforms is None and not hasattr(self, 'test_transforms'):
            raise Exception("transforms need to be defined, now is None.")
        if transforms is None:
            transforms = self.test_transforms
        if isinstance(img_file, (str, np.ndarray)):
            images = [img_file]
        else:
            images = img_file
        batch_samples = self._preprocess(images, transforms)
        self.net.eval()
        outputs = self.run(self.net, batch_samples, 'test')
        prediction = self._postprocess(outputs)
        if isinstance(img_file, (str, np.ndarray)):
            prediction = prediction[0]
        return prediction
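
    # A hypothetical inference sketch (the image path is an assumption):
    #
    #     result = model.predict('demo.jpg')
    #     for det in result:
    #         if det['score'] > 0.5:
    #             x, y, w, h = det['bbox']
    #             print(det['category'], det['score'], (x, y, w, h))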
    def _preprocess(self, images, transforms):
        arrange_transforms(
            model_type=self.model_type, transforms=transforms, mode='test')
        batch_samples = list()
        for im in images:
            sample = {'image': im}
            batch_samples.append(transforms(sample))
        batch_transforms = self._compose_batch_transform(transforms, 'test')
        batch_samples = batch_transforms(batch_samples)
        for k, v in batch_samples.items():
            batch_samples[k] = paddle.to_tensor(v)
        return batch_samples

    def _postprocess(self, batch_pred):
        infer_result = {}
        if 'bbox' in batch_pred:
            bboxes = batch_pred['bbox']
            bbox_nums = batch_pred['bbox_num']
            det_res = []
            k = 0
            for i in range(len(bbox_nums)):
                det_nums = bbox_nums[i]
                for j in range(det_nums):
                    dt = bboxes[k]
                    k = k + 1
                    num_id, score, xmin, ymin, xmax, ymax = dt.tolist()
                    if int(num_id) < 0:
                        continue
                    category = self.labels[int(num_id)]
                    w = xmax - xmin
                    h = ymax - ymin
                    bbox = [xmin, ymin, w, h]
                    dt_res = {
                        'category_id': int(num_id),
                        'category': category,
                        'bbox': bbox,
                        'score': score
                    }
                    det_res.append(dt_res)
            infer_result['bbox'] = det_res

        if 'mask' in batch_pred:
            import pycocotools.mask as mask_util
            masks = batch_pred['mask']
            bboxes = batch_pred['bbox']
            mask_nums = batch_pred['bbox_num']
            seg_res = []
            k = 0
            for i in range(len(mask_nums)):
                det_nums = mask_nums[i]
                for j in range(det_nums):
                    mask = masks[k].astype(np.uint8)
                    score = float(bboxes[k][1])
                    label = int(bboxes[k][0])
                    k = k + 1
                    if label == -1:
                        continue
                    category = self.labels[int(label)]
                    rle = mask_util.encode(
                        np.array(
                            mask[:, :, None], order="F", dtype="uint8"))[0]
                    if six.PY3:
                        if 'counts' in rle:
                            rle['counts'] = rle['counts'].decode("utf8")
                    sg_res = {
                        'category': category,
                        'segmentation': rle,
                        'score': score
                    }
                    seg_res.append(sg_res)
            infer_result['mask'] = seg_res

        # regroup the flat per-detection results into one list per image
        bbox_num = batch_pred['bbox_num']
        results = []
        start = 0
        for num in bbox_num:
            end = start + num
            curr_res = infer_result['bbox'][start:end]
            if 'mask' in infer_result:
                mask_res = infer_result['mask'][start:end]
                for box, mask in zip(curr_res, mask_res):
                    box.update(mask)
            results.append(curr_res)
            start = end
        return results
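
    # For a batch of two images, _postprocess returns a nested structure like
    # the following (values are illustrative):
    #
    #     [
    #         [{'category_id': 0, 'category': 'person',
    #           'bbox': [12.1, 30.5, 48.0, 96.2], 'score': 0.91}],
    #         [],  # second image: no detections kept
    #     ]
    #
    # For MaskRCNN, each dict additionally carries a 'segmentation' entry
    # (an RLE dict) merged in from the mask branch.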


class YOLOv3(BaseDetector):
    def __init__(self,
                 num_classes=80,
                 backbone='MobileNetV1',
                 anchors=[[10, 13], [16, 30], [33, 23], [30, 61], [62, 45],
                          [59, 119], [116, 90], [156, 198], [373, 326]],
                 anchor_masks=[[6, 7, 8], [3, 4, 5], [0, 1, 2]],
                 ignore_threshold=0.7,
                 nms_score_threshold=0.01,
                 nms_topk=1000,
                 nms_keep_topk=100,
                 nms_iou_threshold=0.45,
                 label_smooth=False):
        self.init_params = locals()
        if backbone not in [
                'MobileNetV1', 'MobileNetV1_ssld', 'MobileNetV3',
                'MobileNetV3_ssld', 'DarkNet53', 'ResNet50_vd_dcn', 'ResNet34'
        ]:
            raise ValueError(
                "backbone: {} is not supported. Please choose one of "
                "('MobileNetV1', 'MobileNetV1_ssld', 'MobileNetV3', "
                "'MobileNetV3_ssld', 'DarkNet53', 'ResNet50_vd_dcn', "
                "'ResNet34')".format(backbone))
        if paddlex.env_info['place'] == 'gpu' and paddlex.env_info[
                'num'] > 1 and not os.environ.get('PADDLEX_EXPORT_STAGE'):
            norm_type = 'sync_bn'
        else:
            norm_type = 'bn'
        self.backbone_name = backbone
        if 'MobileNetV1' in backbone:
            norm_type = 'bn'
            backbone = self._get_backbone('MobileNet', norm_type=norm_type)
        elif 'MobileNetV3' in backbone:
            backbone = self._get_backbone(
                'MobileNetV3', norm_type=norm_type, feature_maps=[7, 13, 16])
        elif backbone == 'ResNet50_vd_dcn':
            backbone = self._get_backbone(
                'ResNet',
                norm_type=norm_type,
                variant='d',
                return_idx=[1, 2, 3],
                dcn_v2_stages=[3],
                freeze_at=-1,
                freeze_norm=False)
        elif backbone == 'ResNet34':
            backbone = self._get_backbone(
                'ResNet',
                depth=34,
                norm_type=norm_type,
                return_idx=[1, 2, 3],
                freeze_at=-1,
                freeze_norm=False,
                norm_decay=0.)
        else:
            backbone = self._get_backbone('DarkNet', norm_type=norm_type)
        neck = necks.YOLOv3FPN(
            norm_type=norm_type,
            in_channels=[i.channels for i in backbone.out_shape])
        loss = losses.YOLOv3Loss(
            num_classes=num_classes,
            ignore_thresh=ignore_threshold,
            label_smooth=label_smooth)
        yolo_head = heads.YOLOv3Head(
            in_channels=[i.channels for i in neck.out_shape],
            anchors=anchors,
            anchor_masks=anchor_masks,
            num_classes=num_classes,
            loss=loss)
        post_process = BBoxPostProcess(
            decode=YOLOBox(num_classes=num_classes),
            nms=MultiClassNMS(
                score_threshold=nms_score_threshold,
                nms_top_k=nms_topk,
                keep_top_k=nms_keep_topk,
                nms_threshold=nms_iou_threshold))
        params = {
            'backbone': backbone,
            'neck': neck,
            'yolo_head': yolo_head,
            'post_process': post_process
        }
        super(YOLOv3, self).__init__(
            model_name='YOLOv3', num_classes=num_classes, **params)
        self.anchors = anchors
        self.anchor_masks = anchor_masks

    def _compose_batch_transform(self, transforms, mode='train'):
        if mode == 'train':
            default_batch_transforms = [
                _BatchPadding(
                    pad_to_stride=-1, pad_gt=False), _NormalizeBox(),
                _PadBox(getattr(self, 'num_max_boxes', 50)), _BboxXYXY2XYWH(),
                _Gt2YoloTarget(
                    anchor_masks=self.anchor_masks,
                    anchors=self.anchors,
                    downsample_ratios=getattr(self, 'downsample_ratios',
                                              [32, 16, 8]),
                    num_classes=self.num_classes)
            ]
        else:
            default_batch_transforms = [
                _BatchPadding(
                    pad_to_stride=-1, pad_gt=False)
            ]
        custom_batch_transforms = []
        for i, op in enumerate(transforms.transforms):
            if isinstance(op, (BatchRandomResize, BatchRandomResizeByShort)):
                if mode != 'train':
                    raise Exception(
                        "{} cannot be present in the {} transforms. ".format(
                            op.__class__.__name__, mode) +
                        "Please check the {} transforms.".format(mode))
                custom_batch_transforms.insert(0, copy.deepcopy(op))
        batch_transforms = BatchCompose(custom_batch_transforms +
                                        default_batch_transforms)
        return batch_transforms
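
# A hedged construction sketch for YOLOv3 (argument values are illustrative;
# the defaults above already provide the standard anchors):
#
#     model = YOLOv3(num_classes=20, backbone='DarkNet53',
#                    nms_score_threshold=0.01, nms_iou_threshold=0.45)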


class FasterRCNN(BaseDetector):
    def __init__(self,
                 num_classes=80,
                 backbone='ResNet50',
                 with_fpn=True,
                 aspect_ratios=[0.5, 1.0, 2.0],
                 anchor_sizes=[[32], [64], [128], [256], [512]],
                 keep_top_k=100,
                 nms_threshold=0.5,
                 score_threshold=0.05,
                 fpn_num_channels=256,
                 rpn_batch_size_per_im=256,
                 rpn_fg_fraction=0.5,
                 test_pre_nms_top_n=None,
                 test_post_nms_top_n=1000):
        self.init_params = locals()
        if backbone not in [
                'ResNet50', 'ResNet50_vd', 'ResNet50_vd_ssld', 'ResNet34',
                'ResNet34_vd', 'ResNet101', 'ResNet101_vd'
        ]:
            raise ValueError(
                "backbone: {} is not supported. Please choose one of "
                "('ResNet50', 'ResNet50_vd', 'ResNet50_vd_ssld', 'ResNet34', "
                "'ResNet34_vd', 'ResNet101', 'ResNet101_vd')".format(backbone))
        self.backbone_name = backbone + '_fpn' if with_fpn else backbone
        if backbone == 'ResNet50_vd_ssld':
            if not with_fpn:
                logging.warning(
                    "Backbone {} should be used with FPN enabled; "
                    "'with_fpn' is forcibly set to True.".format(backbone))
                with_fpn = True
            backbone = self._get_backbone(
                'ResNet',
                variant='d',
                norm_type='bn',
                freeze_at=0,
                return_idx=[0, 1, 2, 3],
                num_stages=4,
                lr_mult_list=[0.05, 0.05, 0.1, 0.15])
        elif 'ResNet50' in backbone:
            if with_fpn:
                backbone = self._get_backbone(
                    'ResNet',
                    variant='d' if '_vd' in backbone else 'b',
                    norm_type='bn',
                    freeze_at=0,
                    return_idx=[0, 1, 2, 3],
                    num_stages=4)
            else:
                backbone = self._get_backbone(
                    'ResNet',
                    variant='d' if '_vd' in backbone else 'b',
                    norm_type='bn',
                    freeze_at=0,
                    return_idx=[2],
                    num_stages=3)
        elif 'ResNet34' in backbone:
            if not with_fpn:
                logging.warning(
                    "Backbone {} should be used with FPN enabled; "
                    "'with_fpn' is forcibly set to True.".format(backbone))
                with_fpn = True
            backbone = self._get_backbone(
                'ResNet',
                depth=34,
                variant='d' if 'vd' in backbone else 'b',
                norm_type='bn',
                freeze_at=0,
                return_idx=[0, 1, 2, 3],
                num_stages=4)
        else:
            if not with_fpn:
                logging.warning(
                    "Backbone {} should be used with FPN enabled; "
                    "'with_fpn' is forcibly set to True.".format(backbone))
                with_fpn = True
            backbone = self._get_backbone(
                'ResNet',
                depth=101,
                variant='d' if 'vd' in backbone else 'b',
                norm_type='bn',
                freeze_at=0,
                return_idx=[0, 1, 2, 3],
                num_stages=4)
        rpn_in_channel = backbone.out_shape[0].channels

        if with_fpn:
            neck = necks.FPN(
                in_channels=[i.channels for i in backbone.out_shape],
                out_channel=fpn_num_channels,
                spatial_scales=[1.0 / i.stride for i in backbone.out_shape])
            rpn_in_channel = neck.out_shape[0].channels
            anchor_generator_cfg = {
                'aspect_ratios': aspect_ratios,
                'anchor_sizes': anchor_sizes,
                'strides': [4, 8, 16, 32, 64]
            }
            train_proposal_cfg = {
                'min_size': 0.0,
                'nms_thresh': .7,
                'pre_nms_top_n': 2000,
                'post_nms_top_n': 1000,
                'topk_after_collect': True
            }
            test_proposal_cfg = {
                'min_size': 0.0,
                'nms_thresh': .7,
                'pre_nms_top_n': 1000
                if test_pre_nms_top_n is None else test_pre_nms_top_n,
                'post_nms_top_n': test_post_nms_top_n
            }
            head = heads.TwoFCHead(out_channel=1024)
            roi_extractor_cfg = {
                'resolution': 7,
                'spatial_scale': [1. / i.stride for i in neck.out_shape],
                'sampling_ratio': 0,
                'aligned': True
            }
            with_pool = False
        else:
            neck = None
            anchor_generator_cfg = {
                'aspect_ratios': aspect_ratios,
                'anchor_sizes': anchor_sizes,
                'strides': [16]
            }
            train_proposal_cfg = {
                'min_size': 0.0,
                'nms_thresh': .7,
                'pre_nms_top_n': 12000,
                'post_nms_top_n': 2000,
                'topk_after_collect': False
            }
            test_proposal_cfg = {
                'min_size': 0.0,
                'nms_thresh': .7,
                'pre_nms_top_n': 6000
                if test_pre_nms_top_n is None else test_pre_nms_top_n,
                'post_nms_top_n': test_post_nms_top_n
            }
            head = backbones.Res5Head()
            roi_extractor_cfg = {
                'resolution': 14,
                'spatial_scale': [1. / i.stride for i in backbone.out_shape],
                'sampling_ratio': 0,
                'aligned': True
            }
            with_pool = True

        rpn_target_assign_cfg = {
            'batch_size_per_im': rpn_batch_size_per_im,
            'fg_fraction': rpn_fg_fraction,
            'negative_overlap': .3,
            'positive_overlap': .7,
            'use_random': True
        }
        rpn_head = RPNHead(
            anchor_generator=anchor_generator_cfg,
            rpn_target_assign=rpn_target_assign_cfg,
            train_proposal=train_proposal_cfg,
            test_proposal=test_proposal_cfg,
            in_channel=rpn_in_channel)
        bbox_assigner = BBoxAssigner(num_classes=num_classes)
        bbox_head = heads.BBoxHead(
            head=head,
            in_channel=head.out_shape[0].channels,
            roi_extractor=roi_extractor_cfg,
            with_pool=with_pool,
            bbox_assigner=bbox_assigner,
            num_classes=num_classes)
        bbox_post_process = BBoxPostProcess(
            num_classes=num_classes,
            decode=RCNNBox(num_classes=num_classes),
            nms=MultiClassNMS(
                score_threshold=score_threshold,
                keep_top_k=keep_top_k,
                nms_threshold=nms_threshold))
        params = {
            'backbone': backbone,
            'neck': neck,
            'rpn_head': rpn_head,
            'bbox_head': bbox_head,
            'bbox_post_process': bbox_post_process
        }
        self.with_fpn = with_fpn
        super(FasterRCNN, self).__init__(
            model_name='FasterRCNN', num_classes=num_classes, **params)

    def _compose_batch_transform(self, transforms, mode='train'):
        if mode == 'train':
            default_batch_transforms = [
                _BatchPadding(
                    pad_to_stride=32 if self.with_fpn else -1, pad_gt=True)
            ]
        else:
            default_batch_transforms = [
                _BatchPadding(
                    pad_to_stride=32 if self.with_fpn else -1, pad_gt=False)
            ]
        custom_batch_transforms = []
        for i, op in enumerate(transforms.transforms):
            if isinstance(op, (BatchRandomResize, BatchRandomResizeByShort)):
                if mode != 'train':
                    raise Exception(
                        "{} cannot be present in the {} transforms. ".format(
                            op.__class__.__name__, mode) +
                        "Please check the {} transforms.".format(mode))
                custom_batch_transforms.insert(0, copy.deepcopy(op))
        batch_transforms = BatchCompose(custom_batch_transforms +
                                        default_batch_transforms)
        return batch_transforms
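
# A hypothetical FasterRCNN construction. Disabling FPN switches the model
# from a TwoFCHead over an FPN neck to a Res5Head over the single C4 feature
# map, with the proposal numbers adjusted accordingly:
#
#     model = FasterRCNN(num_classes=20, backbone='ResNet50', with_fpn=True)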


class PPYOLO(YOLOv3):
    def __init__(self,
                 num_classes=80,
                 backbone='ResNet50_vd_dcn',
                 anchors=None,
                 anchor_masks=None,
                 use_coord_conv=True,
                 use_iou_aware=True,
                 use_spp=True,
                 use_drop_block=True,
                 scale_x_y=1.05,
                 ignore_threshold=0.7,
                 label_smooth=False,
                 use_iou_loss=True,
                 use_matrix_nms=True,
                 nms_score_threshold=0.01,
                 nms_topk=-1,
                 nms_keep_topk=100,
                 nms_iou_threshold=0.45):
        self.init_params = locals()
        if backbone not in [
                'ResNet50_vd_dcn', 'ResNet18_vd', 'MobileNetV3_large',
                'MobileNetV3_small'
        ]:
            raise ValueError(
                "backbone: {} is not supported. Please choose one of "
                "('ResNet50_vd_dcn', 'ResNet18_vd', 'MobileNetV3_large', "
                "'MobileNetV3_small')".format(backbone))
        self.backbone_name = backbone
        if paddlex.env_info['place'] == 'gpu' and paddlex.env_info[
                'num'] > 1 and not os.environ.get('PADDLEX_EXPORT_STAGE'):
            norm_type = 'sync_bn'
        else:
            norm_type = 'bn'
        if anchors is None and anchor_masks is None:
            if 'MobileNetV3' in backbone:
                anchors = [[11, 18], [34, 47], [51, 126], [115, 71],
                           [120, 195], [254, 235]]
                anchor_masks = [[3, 4, 5], [0, 1, 2]]
            elif backbone == 'ResNet50_vd_dcn':
                anchors = [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45],
                           [59, 119], [116, 90], [156, 198], [373, 326]]
                anchor_masks = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
            else:
                anchors = [[10, 14], [23, 27], [37, 58], [81, 82], [135, 169],
                           [344, 319]]
                anchor_masks = [[3, 4, 5], [0, 1, 2]]
        elif anchors is None or anchor_masks is None:
            raise ValueError("Please define both anchors and anchor_masks.")
        if backbone == 'ResNet50_vd_dcn':
            backbone = self._get_backbone(
                'ResNet',
                variant='d',
                norm_type=norm_type,
                return_idx=[1, 2, 3],
                dcn_v2_stages=[3],
                freeze_at=-1,
                freeze_norm=False,
                norm_decay=0.)
            downsample_ratios = [32, 16, 8]
        elif backbone == 'ResNet18_vd':
            backbone = self._get_backbone(
                'ResNet',
                depth=18,
                variant='d',
                norm_type=norm_type,
                return_idx=[2, 3],
                freeze_at=-1,
                freeze_norm=False,
                norm_decay=0.)
            # two output levels (return_idx=[2, 3]), so two ratios,
            # matching the two anchor masks chosen above
            downsample_ratios = [32, 16]
        elif backbone == 'MobileNetV3_large':
            backbone = self._get_backbone(
                'MobileNetV3',
                model_name='large',
                norm_type=norm_type,
                scale=1,
                with_extra_blocks=False,
                extra_block_filters=[],
                feature_maps=[13, 16])
            downsample_ratios = [32, 16]
        elif backbone == 'MobileNetV3_small':
            backbone = self._get_backbone(
                'MobileNetV3',
                model_name='small',
                norm_type=norm_type,
                scale=1,
                with_extra_blocks=False,
                extra_block_filters=[],
                feature_maps=[9, 12])
            downsample_ratios = [32, 16]
        neck = necks.PPYOLOFPN(
            norm_type=norm_type,
            in_channels=[i.channels for i in backbone.out_shape],
            coord_conv=use_coord_conv,
            drop_block=use_drop_block,
            spp=use_spp,
            conv_block_num=0 if ('MobileNetV3' in self.backbone_name or
                                 self.backbone_name == 'ResNet18_vd') else 2)
        loss = losses.YOLOv3Loss(
            num_classes=num_classes,
            ignore_thresh=ignore_threshold,
            downsample=downsample_ratios,
            label_smooth=label_smooth,
            scale_x_y=scale_x_y,
            iou_loss=losses.IouLoss(
                loss_weight=2.5, loss_square=True) if use_iou_loss else None,
            iou_aware_loss=losses.IouAwareLoss(loss_weight=1.0)
            if use_iou_aware else None)
        yolo_head = heads.YOLOv3Head(
            in_channels=[i.channels for i in neck.out_shape],
            anchors=anchors,
            anchor_masks=anchor_masks,
            num_classes=num_classes,
            loss=loss,
            iou_aware=use_iou_aware)
        if use_matrix_nms:
            nms = MatrixNMS(
                keep_top_k=nms_keep_topk,
                score_threshold=nms_score_threshold,
                post_threshold=.05
                if 'MobileNetV3' in self.backbone_name else .01,
                nms_top_k=nms_topk,
                background_label=-1)
        else:
            nms = MultiClassNMS(
                score_threshold=nms_score_threshold,
                nms_top_k=nms_topk,
                keep_top_k=nms_keep_topk,
                nms_threshold=nms_iou_threshold)
        post_process = BBoxPostProcess(
            decode=YOLOBox(
                num_classes=num_classes,
                conf_thresh=.005
                if 'MobileNetV3' in self.backbone_name else .01,
                scale_x_y=scale_x_y),
            nms=nms)
        params = {
            'backbone': backbone,
            'neck': neck,
            'yolo_head': yolo_head,
            'post_process': post_process
        }
        # Deliberately call BaseDetector.__init__ (skipping YOLOv3.__init__)
        # so the YOLOv3 architecture is built with PPYOLO's components.
        super(YOLOv3, self).__init__(
            model_name='YOLOv3', num_classes=num_classes, **params)
        self.anchors = anchors
        self.anchor_masks = anchor_masks
        self.downsample_ratios = downsample_ratios
        self.model_name = 'PPYOLO'
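
# A hypothetical PPYOLO construction. With anchors and anchor_masks left as
# None, a backbone-specific default set is chosen automatically:
#
#     model = PPYOLO(num_classes=20, backbone='ResNet50_vd_dcn')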


class PPYOLOTiny(YOLOv3):
    def __init__(self,
                 num_classes=80,
                 backbone='MobileNetV3',
                 anchors=[[10, 15], [24, 36], [72, 42], [35, 87], [102, 96],
                          [60, 170], [220, 125], [128, 222], [264, 266]],
                 anchor_masks=[[6, 7, 8], [3, 4, 5], [0, 1, 2]],
                 use_iou_aware=False,
                 use_spp=True,
                 use_drop_block=True,
                 scale_x_y=1.05,
                 ignore_threshold=0.5,
                 label_smooth=False,
                 use_iou_loss=True,
                 use_matrix_nms=False,
                 nms_score_threshold=0.005,
                 nms_topk=1000,
                 nms_keep_topk=100,
                 nms_iou_threshold=0.45):
        self.init_params = locals()
        if backbone != 'MobileNetV3':
            logging.warning(
                "PPYOLOTiny only supports MobileNetV3 as backbone. "
                "Backbone is forcibly set to MobileNetV3.")
        self.backbone_name = 'MobileNetV3'
        if paddlex.env_info['place'] == 'gpu' and paddlex.env_info[
                'num'] > 1 and not os.environ.get('PADDLEX_EXPORT_STAGE'):
            norm_type = 'sync_bn'
        else:
            norm_type = 'bn'
        backbone = self._get_backbone(
            'MobileNetV3',
            model_name='large',
            norm_type=norm_type,
            scale=.5,
            with_extra_blocks=False,
            extra_block_filters=[],
            feature_maps=[7, 13, 16])
        downsample_ratios = [32, 16, 8]
        neck = necks.PPYOLOTinyFPN(
            detection_block_channels=[160, 128, 96],
            in_channels=[i.channels for i in backbone.out_shape],
            spp=use_spp,
            drop_block=use_drop_block)
        loss = losses.YOLOv3Loss(
            num_classes=num_classes,
            ignore_thresh=ignore_threshold,
            downsample=downsample_ratios,
            label_smooth=label_smooth,
            scale_x_y=scale_x_y,
            iou_loss=losses.IouLoss(
                loss_weight=2.5, loss_square=True) if use_iou_loss else None,
            iou_aware_loss=losses.IouAwareLoss(loss_weight=1.0)
            if use_iou_aware else None)
        yolo_head = heads.YOLOv3Head(
            in_channels=[i.channels for i in neck.out_shape],
            anchors=anchors,
            anchor_masks=anchor_masks,
            num_classes=num_classes,
            loss=loss,
            iou_aware=use_iou_aware)
        if use_matrix_nms:
            nms = MatrixNMS(
                keep_top_k=nms_keep_topk,
                score_threshold=nms_score_threshold,
                post_threshold=.05,
                nms_top_k=nms_topk,
                background_label=-1)
        else:
            nms = MultiClassNMS(
                score_threshold=nms_score_threshold,
                nms_top_k=nms_topk,
                keep_top_k=nms_keep_topk,
                nms_threshold=nms_iou_threshold)
        post_process = BBoxPostProcess(
            decode=YOLOBox(
                num_classes=num_classes,
                conf_thresh=.005,
                downsample_ratio=32,
                clip_bbox=True,
                scale_x_y=scale_x_y),
            nms=nms)
        params = {
            'backbone': backbone,
            'neck': neck,
            'yolo_head': yolo_head,
            'post_process': post_process
        }
        super(YOLOv3, self).__init__(
            model_name='YOLOv3', num_classes=num_classes, **params)
        self.anchors = anchors
        self.anchor_masks = anchor_masks
        self.downsample_ratios = downsample_ratios
        self.num_max_boxes = 100
        self.model_name = 'PPYOLOTiny'
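
# A hypothetical PPYOLOTiny construction (any backbone other than
# 'MobileNetV3' is overridden with a warning):
#
#     model = PPYOLOTiny(num_classes=20)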


class PPYOLOv2(YOLOv3):
    def __init__(self,
                 num_classes=80,
                 backbone='ResNet50_vd_dcn',
                 anchors=[[10, 13], [16, 30], [33, 23], [30, 61], [62, 45],
                          [59, 119], [116, 90], [156, 198], [373, 326]],
                 anchor_masks=[[6, 7, 8], [3, 4, 5], [0, 1, 2]],
                 use_iou_aware=True,
                 use_spp=True,
                 use_drop_block=True,
                 scale_x_y=1.05,
                 ignore_threshold=0.7,
                 label_smooth=False,
                 use_iou_loss=True,
                 use_matrix_nms=True,
                 nms_score_threshold=0.01,
                 nms_topk=-1,
                 nms_keep_topk=100,
                 nms_iou_threshold=0.45):
        self.init_params = locals()
        if backbone not in ['ResNet50_vd_dcn', 'ResNet101_vd_dcn']:
            raise ValueError(
                "backbone: {} is not supported. Please choose one of "
                "('ResNet50_vd_dcn', 'ResNet101_vd_dcn')".format(backbone))
        self.backbone_name = backbone
        if paddlex.env_info['place'] == 'gpu' and paddlex.env_info[
                'num'] > 1 and not os.environ.get('PADDLEX_EXPORT_STAGE'):
            norm_type = 'sync_bn'
        else:
            norm_type = 'bn'
        if backbone == 'ResNet50_vd_dcn':
            backbone = self._get_backbone(
                'ResNet',
                variant='d',
                norm_type=norm_type,
                return_idx=[1, 2, 3],
                dcn_v2_stages=[3],
                freeze_at=-1,
                freeze_norm=False,
                norm_decay=0.)
            downsample_ratios = [32, 16, 8]
        elif backbone == 'ResNet101_vd_dcn':
            backbone = self._get_backbone(
                'ResNet',
                depth=101,
                variant='d',
                norm_type=norm_type,
                return_idx=[1, 2, 3],
                dcn_v2_stages=[3],
                freeze_at=-1,
                freeze_norm=False,
                norm_decay=0.)
            downsample_ratios = [32, 16, 8]
        neck = necks.PPYOLOPAN(
            norm_type=norm_type,
            in_channels=[i.channels for i in backbone.out_shape],
            drop_block=use_drop_block,
            block_size=3,
            keep_prob=.9,
            spp=use_spp)
        loss = losses.YOLOv3Loss(
            num_classes=num_classes,
            ignore_thresh=ignore_threshold,
            downsample=downsample_ratios,
            label_smooth=label_smooth,
            scale_x_y=scale_x_y,
            iou_loss=losses.IouLoss(
                loss_weight=2.5, loss_square=True) if use_iou_loss else None,
            iou_aware_loss=losses.IouAwareLoss(loss_weight=1.0)
            if use_iou_aware else None)
        yolo_head = heads.YOLOv3Head(
            in_channels=[i.channels for i in neck.out_shape],
            anchors=anchors,
            anchor_masks=anchor_masks,
            num_classes=num_classes,
            loss=loss,
            iou_aware=use_iou_aware,
            iou_aware_factor=.5)
        if use_matrix_nms:
            nms = MatrixNMS(
                keep_top_k=nms_keep_topk,
                score_threshold=nms_score_threshold,
                post_threshold=.01,
                nms_top_k=nms_topk,
                background_label=-1)
        else:
            nms = MultiClassNMS(
                score_threshold=nms_score_threshold,
                nms_top_k=nms_topk,
                keep_top_k=nms_keep_topk,
                nms_threshold=nms_iou_threshold)
        post_process = BBoxPostProcess(
            decode=YOLOBox(
                num_classes=num_classes,
                conf_thresh=.01,
                downsample_ratio=32,
                clip_bbox=True,
                scale_x_y=scale_x_y),
            nms=nms)
        params = {
            'backbone': backbone,
            'neck': neck,
            'yolo_head': yolo_head,
            'post_process': post_process
        }
        super(YOLOv3, self).__init__(
            model_name='YOLOv3', num_classes=num_classes, **params)
        self.anchors = anchors
        self.anchor_masks = anchor_masks
        self.downsample_ratios = downsample_ratios
        self.num_max_boxes = 100
        self.model_name = 'PPYOLOv2'
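
# A hypothetical PPYOLOv2 construction:
#
#     model = PPYOLOv2(num_classes=20, backbone='ResNet101_vd_dcn')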


class MaskRCNN(BaseDetector):
    def __init__(self,
                 num_classes=80,
                 backbone='ResNet50_vd',
                 with_fpn=True,
                 aspect_ratios=[0.5, 1.0, 2.0],
                 anchor_sizes=[[32], [64], [128], [256], [512]],
                 keep_top_k=100,
                 nms_threshold=0.5,
                 score_threshold=0.05,
                 fpn_num_channels=256,
                 rpn_batch_size_per_im=256,
                 rpn_fg_fraction=0.5,
                 test_pre_nms_top_n=None,
                 test_post_nms_top_n=1000):
        self.init_params = locals()
        if backbone not in [
                'ResNet50', 'ResNet50_vd', 'ResNet50_vd_ssld', 'ResNet101',
                'ResNet101_vd'
        ]:
            raise ValueError(
                "backbone: {} is not supported. Please choose one of "
                "('ResNet50', 'ResNet50_vd', 'ResNet50_vd_ssld', 'ResNet101', "
                "'ResNet101_vd')".format(backbone))
        self.backbone_name = backbone + '_fpn' if with_fpn else backbone
        if backbone == 'ResNet50':
            if with_fpn:
                backbone = self._get_backbone(
                    'ResNet',
                    norm_type='bn',
                    freeze_at=0,
                    return_idx=[0, 1, 2, 3],
                    num_stages=4)
            else:
                backbone = self._get_backbone(
                    'ResNet',
                    norm_type='bn',
                    freeze_at=0,
                    return_idx=[2],
                    num_stages=3)
        elif 'ResNet50_vd' in backbone:
            if not with_fpn:
                logging.warning(
                    "Backbone {} should be used with FPN enabled; "
                    "'with_fpn' is forcibly set to True.".format(backbone))
                with_fpn = True
            backbone = self._get_backbone(
                'ResNet',
                variant='d',
                norm_type='bn',
                freeze_at=0,
                return_idx=[0, 1, 2, 3],
                num_stages=4,
                lr_mult_list=[0.05, 0.05, 0.1, 0.15]
                if '_ssld' in backbone else [1.0, 1.0, 1.0, 1.0])
        else:
            if not with_fpn:
                logging.warning(
                    "Backbone {} should be used with FPN enabled; "
                    "'with_fpn' is forcibly set to True.".format(backbone))
                with_fpn = True
            backbone = self._get_backbone(
                'ResNet',
                variant='d' if '_vd' in backbone else 'b',
                depth=101,
                norm_type='bn',
                freeze_at=0,
                return_idx=[0, 1, 2, 3],
                num_stages=4)
        rpn_in_channel = backbone.out_shape[0].channels

        if with_fpn:
            neck = necks.FPN(
                in_channels=[i.channels for i in backbone.out_shape],
                out_channel=fpn_num_channels,
                spatial_scales=[1.0 / i.stride for i in backbone.out_shape])
            rpn_in_channel = neck.out_shape[0].channels
            anchor_generator_cfg = {
                'aspect_ratios': aspect_ratios,
                'anchor_sizes': anchor_sizes,
                'strides': [4, 8, 16, 32, 64]
            }
            train_proposal_cfg = {
                'min_size': 0.0,
                'nms_thresh': .7,
                'pre_nms_top_n': 2000,
                'post_nms_top_n': 1000,
                'topk_after_collect': True
            }
            test_proposal_cfg = {
                'min_size': 0.0,
                'nms_thresh': .7,
                'pre_nms_top_n': 1000
                if test_pre_nms_top_n is None else test_pre_nms_top_n,
                'post_nms_top_n': test_post_nms_top_n
            }
            bb_head = heads.TwoFCHead(
                in_channel=neck.out_shape[0].channels, out_channel=1024)
            bb_roi_extractor_cfg = {
                'resolution': 7,
                'spatial_scale': [1. / i.stride for i in neck.out_shape],
                'sampling_ratio': 0,
                'aligned': True
            }
            with_pool = False
            m_head = heads.MaskFeat(
                in_channel=neck.out_shape[0].channels,
                out_channel=256,
                num_convs=4)
            m_roi_extractor_cfg = {
                'resolution': 14,
                'spatial_scale': [1. / i.stride for i in neck.out_shape],
                'sampling_ratio': 0,
                'aligned': True
            }
            mask_assigner = MaskAssigner(
                num_classes=num_classes, mask_resolution=28)
            share_bbox_feat = False
        else:
            neck = None
            anchor_generator_cfg = {
                'aspect_ratios': aspect_ratios,
                'anchor_sizes': anchor_sizes,
                'strides': [16]
            }
            train_proposal_cfg = {
                'min_size': 0.0,
                'nms_thresh': .7,
                'pre_nms_top_n': 12000,
                'post_nms_top_n': 2000,
                'topk_after_collect': False
            }
            test_proposal_cfg = {
                'min_size': 0.0,
                'nms_thresh': .7,
                'pre_nms_top_n': 6000
                if test_pre_nms_top_n is None else test_pre_nms_top_n,
                'post_nms_top_n': test_post_nms_top_n
            }
            bb_head = backbones.Res5Head()
            bb_roi_extractor_cfg = {
                'resolution': 14,
                'spatial_scale': [1. / i.stride for i in backbone.out_shape],
                'sampling_ratio': 0,
                'aligned': True
            }
            with_pool = True
            m_head = heads.MaskFeat(
                in_channel=bb_head.out_shape[0].channels,
                out_channel=256,
                num_convs=0)
            m_roi_extractor_cfg = {
                'resolution': 14,
                'spatial_scale': [1. / i.stride for i in backbone.out_shape],
                'sampling_ratio': 0,
                'aligned': True
            }
            mask_assigner = MaskAssigner(
                num_classes=num_classes, mask_resolution=14)
            share_bbox_feat = True

        rpn_target_assign_cfg = {
            'batch_size_per_im': rpn_batch_size_per_im,
            'fg_fraction': rpn_fg_fraction,
            'negative_overlap': .3,
            'positive_overlap': .7,
            'use_random': True
        }
        rpn_head = RPNHead(
            anchor_generator=anchor_generator_cfg,
            rpn_target_assign=rpn_target_assign_cfg,
            train_proposal=train_proposal_cfg,
            test_proposal=test_proposal_cfg,
            in_channel=rpn_in_channel)
        bbox_assigner = BBoxAssigner(num_classes=num_classes)
        bbox_head = heads.BBoxHead(
            head=bb_head,
            in_channel=bb_head.out_shape[0].channels,
            roi_extractor=bb_roi_extractor_cfg,
            with_pool=with_pool,
            bbox_assigner=bbox_assigner,
            num_classes=num_classes)
        mask_head = heads.MaskHead(
            head=m_head,
            roi_extractor=m_roi_extractor_cfg,
            mask_assigner=mask_assigner,
            share_bbox_feat=share_bbox_feat,
            num_classes=num_classes)
        bbox_post_process = BBoxPostProcess(
            num_classes=num_classes,
            decode=RCNNBox(num_classes=num_classes),
            nms=MultiClassNMS(
                score_threshold=score_threshold,
                keep_top_k=keep_top_k,
                nms_threshold=nms_threshold))
        mask_post_process = MaskPostProcess(binary_thresh=.5)
        params = {
            'backbone': backbone,
            'neck': neck,
            'rpn_head': rpn_head,
            'bbox_head': bbox_head,
            'mask_head': mask_head,
            'bbox_post_process': bbox_post_process,
            'mask_post_process': mask_post_process
        }
        self.with_fpn = with_fpn
        super(MaskRCNN, self).__init__(
            model_name='MaskRCNN', num_classes=num_classes, **params)

    def _compose_batch_transform(self, transforms, mode='train'):
        if mode == 'train':
            default_batch_transforms = [
                _BatchPadding(
                    pad_to_stride=32 if self.with_fpn else -1, pad_gt=True)
            ]
        else:
            default_batch_transforms = [
                _BatchPadding(
                    pad_to_stride=32 if self.with_fpn else -1, pad_gt=False)
            ]
        custom_batch_transforms = []
        for i, op in enumerate(transforms.transforms):
            if isinstance(op, (BatchRandomResize, BatchRandomResizeByShort)):
                if mode != 'train':
                    raise Exception(
                        "{} cannot be present in the {} transforms. ".format(
                            op.__class__.__name__, mode) +
                        "Please check the {} transforms.".format(mode))
                custom_batch_transforms.insert(0, copy.deepcopy(op))
        batch_transforms = BatchCompose(custom_batch_transforms +
                                        default_batch_transforms)
        return batch_transforms
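
# A hypothetical MaskRCNN sketch; its predictions carry both 'bbox' and
# 'segmentation' fields per instance (see BaseDetector._postprocess):
#
#     model = MaskRCNN(num_classes=80, backbone='ResNet50_vd', with_fpn=True)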