detr_head.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358
  1. # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. from __future__ import absolute_import
  15. from __future__ import division
  16. from __future__ import print_function
  17. import paddle
  18. import paddle.nn as nn
  19. import paddle.nn.functional as F
  20. from paddlex.ppdet.core.workspace import register
  21. import pycocotools.mask as mask_util
  22. from ..initializer import linear_init_, constant_
  23. from ..transformers.utils import inverse_sigmoid
  24. __all__ = ['DETRHead', 'DeformableDETRHead']
  25. class MLP(nn.Layer):
  26. def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
  27. super().__init__()
  28. self.num_layers = num_layers
  29. h = [hidden_dim] * (num_layers - 1)
  30. self.layers = nn.LayerList(
  31. nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
  32. self._reset_parameters()
  33. def _reset_parameters(self):
  34. for l in self.layers:
  35. linear_init_(l)
  36. def forward(self, x):
  37. for i, layer in enumerate(self.layers):
  38. x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
  39. return x
  40. class MultiHeadAttentionMap(nn.Layer):
  41. """This is a 2D attention module, which only returns the attention softmax (no multiplication by value)"""
  42. def __init__(self,
  43. query_dim,
  44. hidden_dim,
  45. num_heads,
  46. dropout=0.0,
  47. bias=True):
  48. super().__init__()
  49. self.num_heads = num_heads
  50. self.hidden_dim = hidden_dim
  51. self.dropout = nn.Dropout(dropout)
  52. weight_attr = paddle.ParamAttr(
  53. initializer=paddle.nn.initializer.XavierUniform())
  54. bias_attr = paddle.framework.ParamAttr(
  55. initializer=paddle.nn.initializer.Constant()) if bias else False
  56. self.q_proj = nn.Linear(query_dim, hidden_dim, weight_attr, bias_attr)
  57. self.k_proj = nn.Conv2D(
  58. query_dim,
  59. hidden_dim,
  60. 1,
  61. weight_attr=weight_attr,
  62. bias_attr=bias_attr)
  63. self.normalize_fact = float(hidden_dim / self.num_heads)**-0.5
  64. def forward(self, q, k, mask=None):
  65. q = self.q_proj(q)
  66. k = self.k_proj(k)
  67. bs, num_queries, n, c, h, w = q.shape[0], q.shape[1], self.num_heads,\
  68. self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1]
  69. qh = q.reshape([bs, num_queries, n, c])
  70. kh = k.reshape([bs, n, c, h, w])
  71. # weights = paddle.einsum("bqnc,bnchw->bqnhw", qh * self.normalize_fact, kh)
  72. qh = qh.transpose([0, 2, 1, 3]).reshape([-1, num_queries, c])
  73. kh = kh.reshape([-1, c, h * w])
  74. weights = paddle.bmm(qh * self.normalize_fact, kh).reshape(
  75. [bs, n, num_queries, h, w]).transpose([0, 2, 1, 3, 4])
  76. if mask is not None:
  77. weights += mask
  78. # fix a potenial bug: https://github.com/facebookresearch/detr/issues/247
  79. weights = F.softmax(weights.flatten(3), axis=-1).reshape(weights.shape)
  80. weights = self.dropout(weights)
  81. return weights
  82. class MaskHeadFPNConv(nn.Layer):
  83. """
  84. Simple convolutional head, using group norm.
  85. Upsampling is done using a FPN approach
  86. """
  87. def __init__(self, input_dim, fpn_dims, context_dim, num_groups=8):
  88. super().__init__()
  89. inter_dims = [input_dim,
  90. ] + [context_dim // (2**i) for i in range(1, 5)]
  91. weight_attr = paddle.ParamAttr(
  92. initializer=paddle.nn.initializer.KaimingUniform())
  93. bias_attr = paddle.framework.ParamAttr(
  94. initializer=paddle.nn.initializer.Constant())
  95. self.conv0 = self._make_layers(input_dim, input_dim, 3, num_groups,
  96. weight_attr, bias_attr)
  97. self.conv_inter = nn.LayerList()
  98. for in_dims, out_dims in zip(inter_dims[:-1], inter_dims[1:]):
  99. self.conv_inter.append(
  100. self._make_layers(in_dims, out_dims, 3, num_groups,
  101. weight_attr, bias_attr))
  102. self.conv_out = nn.Conv2D(
  103. inter_dims[-1],
  104. 1,
  105. 3,
  106. padding=1,
  107. weight_attr=weight_attr,
  108. bias_attr=bias_attr)
  109. self.adapter = nn.LayerList()
  110. for i in range(len(fpn_dims)):
  111. self.adapter.append(
  112. nn.Conv2D(
  113. fpn_dims[i],
  114. inter_dims[i + 1],
  115. 1,
  116. weight_attr=weight_attr,
  117. bias_attr=bias_attr))
  118. def _make_layers(self,
  119. in_dims,
  120. out_dims,
  121. kernel_size,
  122. num_groups,
  123. weight_attr=None,
  124. bias_attr=None):
  125. return nn.Sequential(
  126. nn.Conv2D(
  127. in_dims,
  128. out_dims,
  129. kernel_size,
  130. padding=kernel_size // 2,
  131. weight_attr=weight_attr,
  132. bias_attr=bias_attr),
  133. nn.GroupNorm(num_groups, out_dims),
  134. nn.ReLU())
  135. def forward(self, x, bbox_attention_map, fpns):
  136. x = paddle.concat([
  137. x.tile([bbox_attention_map.shape[1], 1, 1, 1]),
  138. bbox_attention_map.flatten(0, 1)
  139. ], 1)
  140. x = self.conv0(x)
  141. for inter_layer, adapter_layer, feat in zip(self.conv_inter[:-1],
  142. self.adapter, fpns):
  143. feat = adapter_layer(feat).tile(
  144. [bbox_attention_map.shape[1], 1, 1, 1])
  145. x = inter_layer(x)
  146. x = feat + F.interpolate(x, size=feat.shape[-2:])
  147. x = self.conv_inter[-1](x)
  148. x = self.conv_out(x)
  149. return x
  150. @register
  151. class DETRHead(nn.Layer):
  152. __shared__ = ['num_classes', 'hidden_dim', 'use_focal_loss']
  153. __inject__ = ['loss']
  154. def __init__(self,
  155. num_classes=80,
  156. hidden_dim=256,
  157. nhead=8,
  158. num_mlp_layers=3,
  159. loss='DETRLoss',
  160. fpn_dims=[1024, 512, 256],
  161. with_mask_head=False,
  162. use_focal_loss=False):
  163. super(DETRHead, self).__init__()
  164. # add background class
  165. self.num_classes = num_classes if use_focal_loss else num_classes + 1
  166. self.hidden_dim = hidden_dim
  167. self.loss = loss
  168. self.with_mask_head = with_mask_head
  169. self.use_focal_loss = use_focal_loss
  170. self.score_head = nn.Linear(hidden_dim, self.num_classes)
  171. self.bbox_head = MLP(hidden_dim,
  172. hidden_dim,
  173. output_dim=4,
  174. num_layers=num_mlp_layers)
  175. if self.with_mask_head:
  176. self.bbox_attention = MultiHeadAttentionMap(hidden_dim, hidden_dim,
  177. nhead)
  178. self.mask_head = MaskHeadFPNConv(hidden_dim + nhead, fpn_dims,
  179. hidden_dim)
  180. self._reset_parameters()
  181. def _reset_parameters(self):
  182. linear_init_(self.score_head)
  183. @classmethod
  184. def from_config(cls, cfg, hidden_dim, nhead, input_shape):
  185. return {
  186. 'hidden_dim': hidden_dim,
  187. 'nhead': nhead,
  188. 'fpn_dims': [i.channels for i in input_shape[::-1]][1:]
  189. }
  190. @staticmethod
  191. def get_gt_mask_from_polygons(gt_poly, pad_mask):
  192. out_gt_mask = []
  193. for polygons, padding in zip(gt_poly, pad_mask):
  194. height, width = int(padding[:, 0].sum()), int(padding[0, :].sum())
  195. masks = []
  196. for obj_poly in polygons:
  197. rles = mask_util.frPyObjects(obj_poly, height, width)
  198. rle = mask_util.merge(rles)
  199. masks.append(
  200. paddle.to_tensor(mask_util.decode(rle)).astype('float32'))
  201. masks = paddle.stack(masks)
  202. masks_pad = paddle.zeros(
  203. [masks.shape[0], pad_mask.shape[1], pad_mask.shape[2]])
  204. masks_pad[:, :height, :width] = masks
  205. out_gt_mask.append(masks_pad)
  206. return out_gt_mask
  207. def forward(self, out_transformer, body_feats, inputs=None):
  208. r"""
  209. Args:
  210. out_transformer (Tuple): (feats: [num_levels, batch_size,
  211. num_queries, hidden_dim],
  212. memory: [batch_size, hidden_dim, h, w],
  213. src_proj: [batch_size, h*w, hidden_dim],
  214. src_mask: [batch_size, 1, 1, h, w])
  215. body_feats (List(Tensor)): list[[B, C, H, W]]
  216. inputs (dict): dict(inputs)
  217. """
  218. feats, memory, src_proj, src_mask = out_transformer
  219. outputs_logit = self.score_head(feats)
  220. outputs_bbox = F.sigmoid(self.bbox_head(feats))
  221. outputs_seg = None
  222. if self.with_mask_head:
  223. bbox_attention_map = self.bbox_attention(feats[-1], memory,
  224. src_mask)
  225. fpn_feats = [a for a in body_feats[::-1]][1:]
  226. outputs_seg = self.mask_head(src_proj, bbox_attention_map,
  227. fpn_feats)
  228. outputs_seg = outputs_seg.reshape([
  229. feats.shape[1], feats.shape[2], outputs_seg.shape[-2],
  230. outputs_seg.shape[-1]
  231. ])
  232. if self.training:
  233. assert inputs is not None
  234. assert 'gt_bbox' in inputs and 'gt_class' in inputs
  235. gt_mask = self.get_gt_mask_from_polygons(
  236. inputs['gt_poly'],
  237. inputs['pad_mask']) if 'gt_poly' in inputs else None
  238. return self.loss(
  239. outputs_bbox,
  240. outputs_logit,
  241. inputs['gt_bbox'],
  242. inputs['gt_class'],
  243. masks=outputs_seg,
  244. gt_mask=gt_mask)
  245. else:
  246. return (outputs_bbox[-1], outputs_logit[-1], outputs_seg)
  247. @register
  248. class DeformableDETRHead(nn.Layer):
  249. __shared__ = ['num_classes', 'hidden_dim']
  250. __inject__ = ['loss']
  251. def __init__(self,
  252. num_classes=80,
  253. hidden_dim=512,
  254. nhead=8,
  255. num_mlp_layers=3,
  256. loss='DETRLoss'):
  257. super(DeformableDETRHead, self).__init__()
  258. self.num_classes = num_classes
  259. self.hidden_dim = hidden_dim
  260. self.nhead = nhead
  261. self.loss = loss
  262. self.score_head = nn.Linear(hidden_dim, self.num_classes)
  263. self.bbox_head = MLP(hidden_dim,
  264. hidden_dim,
  265. output_dim=4,
  266. num_layers=num_mlp_layers)
  267. self._reset_parameters()
  268. def _reset_parameters(self):
  269. linear_init_(self.score_head)
  270. constant_(self.score_head.bias, -4.595)
  271. constant_(self.bbox_head.layers[-1].weight)
  272. with paddle.no_grad():
  273. bias = paddle.zeros_like(self.bbox_head.layers[-1].bias)
  274. bias[2:] = -2.0
  275. self.bbox_head.layers[-1].bias.set_value(bias)
  276. @classmethod
  277. def from_config(cls, cfg, hidden_dim, nhead, input_shape):
  278. return {'hidden_dim': hidden_dim, 'nhead': nhead}
  279. def forward(self, out_transformer, body_feats, inputs=None):
  280. r"""
  281. Args:
  282. out_transformer (Tuple): (feats: [num_levels, batch_size,
  283. num_queries, hidden_dim],
  284. memory: [batch_size,
  285. \sum_{l=0}^{L-1} H_l \cdot W_l, hidden_dim],
  286. reference_points: [batch_size, num_queries, 2])
  287. body_feats (List(Tensor)): list[[B, C, H, W]]
  288. inputs (dict): dict(inputs)
  289. """
  290. feats, memory, reference_points = out_transformer
  291. reference_points = inverse_sigmoid(reference_points.unsqueeze(0))
  292. outputs_bbox = self.bbox_head(feats)
  293. # It's equivalent to "outputs_bbox[:, :, :, :2] += reference_points",
  294. # but the gradient is wrong in paddle.
  295. outputs_bbox = paddle.concat(
  296. [
  297. outputs_bbox[:, :, :, :2] + reference_points,
  298. outputs_bbox[:, :, :, 2:]
  299. ],
  300. axis=-1)
  301. outputs_bbox = F.sigmoid(outputs_bbox)
  302. outputs_logit = self.score_head(feats)
  303. if self.training:
  304. assert inputs is not None
  305. assert 'gt_bbox' in inputs and 'gt_class' in inputs
  306. return self.loss(outputs_bbox, outputs_logit, inputs['gt_bbox'],
  307. inputs['gt_class'])
  308. else:
  309. return (outputs_bbox[-1], outputs_logit[-1], None)