yolo_v3.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
  14. from paddle import fluid
  15. from paddle.fluid.param_attr import ParamAttr
  16. from paddle.fluid.regularizer import L2Decay
  17. from collections import OrderedDict
  18. class YOLOv3:
  19. def __init__(self,
  20. backbone,
  21. num_classes,
  22. mode='train',
  23. anchors=None,
  24. anchor_masks=None,
  25. ignore_threshold=0.7,
  26. label_smooth=False,
  27. nms_score_threshold=0.01,
  28. nms_topk=1000,
  29. nms_keep_topk=100,
  30. nms_iou_threshold=0.45,
  31. train_random_shapes=[
  32. 320, 352, 384, 416, 448, 480, 512, 544, 576, 608
  33. ],
  34. fixed_input_shape=None):
  35. if anchors is None:
  36. anchors = [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45],
  37. [59, 119], [116, 90], [156, 198], [373, 326]]
  38. if anchor_masks is None:
  39. anchor_masks = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
  40. self.anchors = anchors
  41. self.anchor_masks = anchor_masks
  42. self._parse_anchors(anchors)
  43. self.mode = mode
  44. self.num_classes = num_classes
  45. self.backbone = backbone
  46. self.ignore_thresh = ignore_threshold
  47. self.label_smooth = label_smooth
  48. self.nms_score_threshold = nms_score_threshold
  49. self.nms_topk = nms_topk
  50. self.nms_keep_topk = nms_keep_topk
  51. self.nms_iou_threshold = nms_iou_threshold
  52. self.norm_decay = 0.0
  53. self.prefix_name = ''
  54. self.train_random_shapes = train_random_shapes
  55. self.fixed_input_shape = fixed_input_shape
  56. def _head(self, feats):
  57. outputs = []
  58. out_layer_num = len(self.anchor_masks)
  59. blocks = feats[-1:-out_layer_num - 1:-1]
  60. route = None
  61. for i, block in enumerate(blocks):
  62. if i > 0:
  63. block = fluid.layers.concat(input=[route, block], axis=1)
  64. route, tip = self._detection_block(
  65. block,
  66. channel=512 // (2**i),
  67. name=self.prefix_name + 'yolo_block.{}'.format(i))
  68. num_filters = len(self.anchor_masks[i]) * (self.num_classes + 5)
  69. block_out = fluid.layers.conv2d(
  70. input=tip,
  71. num_filters=num_filters,
  72. filter_size=1,
  73. stride=1,
  74. padding=0,
  75. act=None,
  76. param_attr=ParamAttr(name=self.prefix_name +
  77. 'yolo_output.{}.conv.weights'.format(i)),
  78. bias_attr=ParamAttr(
  79. regularizer=L2Decay(0.0),
  80. name=self.prefix_name +
  81. 'yolo_output.{}.conv.bias'.format(i)))
  82. outputs.append(block_out)
  83. if i < len(blocks) - 1:
  84. route = self._conv_bn(
  85. input=route,
  86. ch_out=256 // (2**i),
  87. filter_size=1,
  88. stride=1,
  89. padding=0,
  90. name=self.prefix_name + 'yolo_transition.{}'.format(i))
  91. route = self._upsample(route)
  92. return outputs
  93. def _parse_anchors(self, anchors):
  94. self.anchors = []
  95. self.mask_anchors = []
  96. assert len(anchors) > 0, "ANCHORS not set."
  97. assert len(self.anchor_masks) > 0, "ANCHOR_MASKS not set."
  98. for anchor in anchors:
  99. assert len(anchor) == 2, "anchor {} len should be 2".format(anchor)
  100. self.anchors.extend(anchor)
  101. anchor_num = len(anchors)
  102. for masks in self.anchor_masks:
  103. self.mask_anchors.append([])
  104. for mask in masks:
  105. assert mask < anchor_num, "anchor mask index overflow"
  106. self.mask_anchors[-1].extend(anchors[mask])
  107. def _conv_bn(self,
  108. input,
  109. ch_out,
  110. filter_size,
  111. stride,
  112. padding,
  113. act='leaky',
  114. is_test=False,
  115. name=None):
  116. conv = fluid.layers.conv2d(
  117. input=input,
  118. num_filters=ch_out,
  119. filter_size=filter_size,
  120. stride=stride,
  121. padding=padding,
  122. act=None,
  123. param_attr=ParamAttr(name=name + '.conv.weights'),
  124. bias_attr=False)
  125. bn_name = name + '.bn'
  126. bn_param_attr = ParamAttr(
  127. regularizer=L2Decay(self.norm_decay), name=bn_name + '.scale')
  128. bn_bias_attr = ParamAttr(
  129. regularizer=L2Decay(self.norm_decay), name=bn_name + '.offset')
  130. out = fluid.layers.batch_norm(
  131. input=conv,
  132. act=None,
  133. is_test=is_test,
  134. param_attr=bn_param_attr,
  135. bias_attr=bn_bias_attr,
  136. moving_mean_name=bn_name + '.mean',
  137. moving_variance_name=bn_name + '.var')
  138. if act == 'leaky':
  139. out = fluid.layers.leaky_relu(x=out, alpha=0.1)
  140. return out
  141. def _upsample(self, input, scale=2, name=None):
  142. out = fluid.layers.resize_nearest(
  143. input=input, scale=float(scale), name=name, align_corners=False)
  144. return out
  145. def _detection_block(self, input, channel, name=None):
  146. assert channel % 2 == 0, "channel({}) cannot be divided by 2 in detection block({})".format(
  147. channel, name)
  148. is_test = False if self.mode == 'train' else True
  149. conv = input
  150. for i in range(2):
  151. conv = self._conv_bn(
  152. conv,
  153. channel,
  154. filter_size=1,
  155. stride=1,
  156. padding=0,
  157. is_test=is_test,
  158. name='{}.{}.0'.format(name, i))
  159. conv = self._conv_bn(
  160. conv,
  161. channel * 2,
  162. filter_size=3,
  163. stride=1,
  164. padding=1,
  165. is_test=is_test,
  166. name='{}.{}.1'.format(name, i))
  167. route = self._conv_bn(
  168. conv,
  169. channel,
  170. filter_size=1,
  171. stride=1,
  172. padding=0,
  173. is_test=is_test,
  174. name='{}.2'.format(name))
  175. tip = self._conv_bn(
  176. route,
  177. channel * 2,
  178. filter_size=3,
  179. stride=1,
  180. padding=1,
  181. is_test=is_test,
  182. name='{}.tip'.format(name))
  183. return route, tip
  184. def _get_loss(self, inputs, gt_box, gt_label, gt_score):
  185. losses = []
  186. downsample = 32
  187. for i, input in enumerate(inputs):
  188. loss = fluid.layers.yolov3_loss(
  189. x=input,
  190. gt_box=gt_box,
  191. gt_label=gt_label,
  192. gt_score=gt_score,
  193. anchors=self.anchors,
  194. anchor_mask=self.anchor_masks[i],
  195. class_num=self.num_classes,
  196. ignore_thresh=self.ignore_thresh,
  197. downsample_ratio=downsample,
  198. use_label_smooth=self.label_smooth,
  199. name=self.prefix_name + 'yolo_loss' + str(i))
  200. losses.append(fluid.layers.reduce_mean(loss))
  201. downsample //= 2
  202. return sum(losses)
  203. def _get_prediction(self, inputs, im_size):
  204. boxes = []
  205. scores = []
  206. downsample = 32
  207. for i, input in enumerate(inputs):
  208. box, score = fluid.layers.yolo_box(
  209. x=input,
  210. img_size=im_size,
  211. anchors=self.mask_anchors[i],
  212. class_num=self.num_classes,
  213. conf_thresh=self.nms_score_threshold,
  214. downsample_ratio=downsample,
  215. name=self.prefix_name + 'yolo_box' + str(i))
  216. boxes.append(box)
  217. scores.append(fluid.layers.transpose(score, perm=[0, 2, 1]))
  218. downsample //= 2
  219. yolo_boxes = fluid.layers.concat(boxes, axis=1)
  220. yolo_scores = fluid.layers.concat(scores, axis=2)
  221. pred = fluid.layers.multiclass_nms(
  222. bboxes=yolo_boxes,
  223. scores=yolo_scores,
  224. score_threshold=self.nms_score_threshold,
  225. nms_top_k=self.nms_topk,
  226. keep_top_k=self.nms_keep_topk,
  227. nms_threshold=self.nms_iou_threshold,
  228. normalized=True,
  229. nms_eta=1.0,
  230. background_label=-1)
  231. return pred
  232. def generate_inputs(self):
  233. inputs = OrderedDict()
  234. if self.fixed_input_shape is not None:
  235. input_shape = [
  236. None, 3, self.fixed_input_shape[1], self.fixed_input_shape[0]
  237. ]
  238. inputs['image'] = fluid.data(
  239. dtype='float32', shape=input_shape, name='image')
  240. else:
  241. inputs['image'] = fluid.data(
  242. dtype='float32', shape=[None, 3, None, None], name='image')
  243. if self.mode == 'train':
  244. inputs['gt_box'] = fluid.data(
  245. dtype='float32', shape=[None, None, 4], name='gt_box')
  246. inputs['gt_label'] = fluid.data(
  247. dtype='int32', shape=[None, None], name='gt_label')
  248. inputs['gt_score'] = fluid.data(
  249. dtype='float32', shape=[None, None], name='gt_score')
  250. inputs['im_size'] = fluid.data(
  251. dtype='int32', shape=[None, 2], name='im_size')
  252. elif self.mode == 'eval':
  253. inputs['im_size'] = fluid.data(
  254. dtype='int32', shape=[None, 2], name='im_size')
  255. inputs['im_id'] = fluid.data(
  256. dtype='int32', shape=[None, 1], name='im_id')
  257. inputs['gt_box'] = fluid.data(
  258. dtype='float32', shape=[None, None, 4], name='gt_box')
  259. inputs['gt_label'] = fluid.data(
  260. dtype='int32', shape=[None, None], name='gt_label')
  261. inputs['is_difficult'] = fluid.data(
  262. dtype='int32', shape=[None, None], name='is_difficult')
  263. elif self.mode == 'test':
  264. inputs['im_size'] = fluid.data(
  265. dtype='int32', shape=[None, 2], name='im_size')
  266. return inputs
  267. def build_net(self, inputs):
  268. image = inputs['image']
  269. if self.mode == 'train':
  270. if isinstance(self.train_random_shapes,
  271. (list, tuple)) and len(self.train_random_shapes) > 0:
  272. import numpy as np
  273. shapes = np.array(self.train_random_shapes)
  274. shapes = np.stack([shapes, shapes], axis=1).astype('float32')
  275. shapes_tensor = fluid.layers.assign(shapes)
  276. index = fluid.layers.uniform_random(
  277. shape=[1], dtype='float32', min=0.0, max=1)
  278. index = fluid.layers.cast(
  279. index * len(self.train_random_shapes), dtype='int32')
  280. shape = fluid.layers.gather(shapes_tensor, index)
  281. shape = fluid.layers.reshape(shape, [-1])
  282. shape = fluid.layers.cast(shape, dtype='int32')
  283. image = fluid.layers.resize_nearest(
  284. image, out_shape=shape, align_corners=False)
  285. feats = self.backbone(image)
  286. if isinstance(feats, OrderedDict):
  287. feat_names = list(feats.keys())
  288. feats = [feats[name] for name in feat_names]
  289. head_outputs = self._head(feats)
  290. if self.mode == 'train':
  291. gt_box = inputs['gt_box']
  292. gt_label = inputs['gt_label']
  293. gt_score = inputs['gt_score']
  294. im_size = inputs['im_size']
  295. num_boxes = fluid.layers.shape(gt_box)[1]
  296. im_size_wh = fluid.layers.reverse(im_size, axis=1)
  297. whwh = fluid.layers.concat([im_size_wh, im_size_wh], axis=1)
  298. whwh = fluid.layers.unsqueeze(whwh, axes=[1])
  299. whwh = fluid.layers.expand(whwh, expand_times=[1, num_boxes, 1])
  300. whwh = fluid.layers.cast(whwh, dtype='float32')
  301. whwh.stop_gradient = True
  302. normalized_box = fluid.layers.elementwise_div(gt_box, whwh)
  303. return self._get_loss(head_outputs, normalized_box, gt_label,
  304. gt_score)
  305. else:
  306. im_size = inputs['im_size']
  307. return self._get_prediction(head_outputs, im_size)