processors.py
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import os.path as osp
from typing import List, Optional, Sequence, Tuple, Union

import cv2
import numpy as np
import lazy_paddle as paddle


class ResizeVideo:
    """Resizes frames of a video to a specified target size.

    This class provides functionality to resize each frame of a video to
    a specified square dimension (height and width are equal).

    Attributes:
        target_size (int): The desired size (in pixels) for both the height
            and width of each frame in the video.
    """

    def __init__(self, target_size: int = 224) -> None:
        """Initializes the ResizeVideo with a target size.

        Args:
            target_size (int): The desired size in pixels for the output
                frames. Defaults to 224.
        """
        super().__init__()
        self.target_size = target_size

    def resize(self, video: List) -> List:
        """Resizes all frames of a single video in place.

        Args:
            video (list): A list of segments, where each segment is a list
                of frames represented as numpy arrays.

        Returns:
            list: The input video with each frame resized to the target size.

        Raises:
            NotImplementedError: If a frame is not an instance of numpy.ndarray.
        """
        num_seg = len(video)
        seg_len = len(video[0])
        for i in range(num_seg):
            for j in range(seg_len):
                img = video[i][j]
                if not isinstance(img, np.ndarray):
                    raise NotImplementedError(
                        "Currently, only numpy.ndarray frames are supported."
                    )
                video[i][j] = cv2.resize(
                    img,
                    (self.target_size, self.target_size),
                    interpolation=cv2.INTER_LINEAR,
                )
        return video

    def __call__(self, videos: List) -> List:
        """Resizes frames of multiple videos.

        Args:
            videos (list): A list containing multiple videos, where each video
                is a list of segments, and each segment is a list of frames.

        Returns:
            list: A list of videos with each frame resized to the target size.
        """
        return [self.resize(video) for video in videos]
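

# A minimal usage sketch (assumptions: frames are HWC uint8 numpy arrays and
# the nesting is videos -> segments -> frames; the shapes are illustrative):
#
#   resizer = ResizeVideo(target_size=224)
#   videos = [[[np.zeros((240, 320, 3), dtype=np.uint8) for _ in range(4)]]]
#   out = resizer(videos)
#   assert out[0][0][0].shape == (224, 224, 3)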


class Image2Array:
    """Convert a sequence of images to a numpy array, transposing each frame."""

    def __init__(self, data_format: str = "tchw") -> None:
        """
        Initializes the Image2Array class.

        Args:
            data_format (str): The target layout, either 'tchw' or 'cthw'.

        Raises:
            AssertionError: If data_format is not one of the allowed values.
        """
        super().__init__()
        assert data_format in [
            "tchw",
            "cthw",
        ], f"Target format must be in ['tchw', 'cthw'], but got {data_format}"
        self.data_format = data_format

    def img2array(self, video: List) -> List:
        """
        Converts each segment of a video to a numpy array with frames transposed.

        Args:
            video (List): A list of segments, where each segment is a list of
                HWC frames represented as numpy arrays.

        Returns:
            List: The input video with each segment replaced by a single numpy
                array of stacked, transposed frames.
        """
        num_seg = len(video)
        for i in range(num_seg):
            video_one = video[i]
            # Transpose each frame from HWC to CHW format.
            video_one = [img.transpose([2, 0, 1]) for img in video_one]
            # Stack the frames along a new time axis at position 1.
            video_one = np.concatenate(
                [np.expand_dims(img, axis=1) for img in video_one], axis=1
            )
            video[i] = video_one
        return video

    def __call__(self, videos: List[List[np.ndarray]]) -> List[np.ndarray]:
        """
        Process videos by converting each video to a transposed numpy array.

        Args:
            videos (List[List[np.ndarray]]): A list of videos, where each video
                is a list of segments, and each segment is a list of frames.

        Returns:
            List[np.ndarray]: A list of processed videos with transposed frames.
        """
        return [self.img2array(video) for video in videos]
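

# Shape walk-through (illustrative): four 224x224x3 HWC frames per segment each
# become CHW, and stacking along axis=1 yields a (3, 4, 224, 224) array, i.e.
# CTHW layout; note that img2array stacks along axis 1 regardless of the
# configured data_format.
#
#   to_array = Image2Array(data_format="cthw")
#   videos = [[[np.zeros((224, 224, 3), dtype=np.uint8) for _ in range(4)]]]
#   out = to_array(videos)
#   assert out[0][0].shape == (3, 4, 224, 224)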


class NormalizeVideo:
    """
    A class to normalize video frames by scaling the pixel values.
    """

    def __init__(self, scale: float = 255.0) -> None:
        """
        Initializes the NormalizeVideo class.

        Args:
            scale (float): The factor to divide pixel values by, usually the
                maximum pixel value.
        """
        super().__init__()
        self.scale = scale

    def normalize_video(self, video: List[np.ndarray]) -> List[np.ndarray]:
        """
        Normalizes a sequence of segments by scaling the pixel values.

        Args:
            video (List[np.ndarray]): A list of segments, where each segment is
                a numpy array to be normalized.

        Returns:
            List[np.ndarray]: The normalized segments as a list of numpy arrays.
        """
        num_seg = len(video)  # Number of segments in the video
        for i in range(num_seg):
            # Convert the segment to float32 and scale pixel values.
            video[i] = video[i].astype(np.float32) / self.scale
            # Add a leading batch axis.
            video[i] = np.expand_dims(video[i], axis=0)
        return video

    def __call__(self, videos: List[List[np.ndarray]]) -> List[List[np.ndarray]]:
        """
        Apply normalization to a list of videos.

        Args:
            videos (List[List[np.ndarray]]): A list of videos, where each video
                is a list of segments represented as numpy arrays.

        Returns:
            List[List[np.ndarray]]: A list of normalized videos, each
                represented as a list of normalized segments.
        """
        return [self.normalize_video(video) for video in videos]
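

# Sketch continuing the pipeline above (assumption: segments are already CTHW
# arrays): dividing by 255 maps pixels into [0, 1] and a batch axis is added.
#
#   norm = NormalizeVideo(scale=255.0)
#   videos = [[np.full((3, 4, 224, 224), 255, dtype=np.uint8)]]
#   out = norm(videos)
#   assert out[0][0].shape == (1, 3, 4, 224, 224) and out[0][0].max() == 1.0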


def convert2cpu(gpu_matrix):
    """Casts a tensor to float32 and moves it to CPU memory."""
    float_32_g = gpu_matrix.astype("float32")
    return float_32_g.cpu()


def convert2cpu_long(gpu_matrix):
    """Casts a tensor to int64 and moves it to CPU memory."""
    int_64_g = gpu_matrix.astype("int64")
    return int_64_g.cpu()


def get_region_boxes(
    output,
    conf_thresh=0.005,
    num_classes=24,
    anchors=[
        0.70458,
        1.18803,
        1.26654,
        2.55121,
        1.59382,
        4.08321,
        2.30548,
        4.94180,
        3.52332,
        5.91979,
    ],
    num_anchors=5,
    only_objectness=1,
):
    """
    Processes the output of a neural network to extract bounding box predictions.

    Args:
        output (Tensor): The output tensor from the neural network.
        conf_thresh (float): The confidence threshold for filtering predictions.
            Default is 0.005.
        num_classes (int): The number of classes for classification. Default is 24.
        anchors (List[float]): A list of anchor box dimensions used in the model.
            Default is a list of 10 predefined anchor values.
        num_anchors (int): The number of anchor boxes used in the model. Default is 5.
        only_objectness (int): If set to 1, only objectness scores are considered
            for filtering. Default is 1.

    Returns:
        all_boxes (List[List[List[float]]]): A list of predicted bounding boxes
            for each image in the batch.
    """
    anchor_step = len(anchors) // num_anchors
    if output.dim() == 3:
        output = output.unsqueeze(0)
    batch = output.shape[0]
    assert output.shape[1] == (5 + num_classes) * num_anchors
    h = output.shape[2]
    w = output.shape[3]
    all_boxes = []
    output = paddle.reshape(output, [batch * num_anchors, 5 + num_classes, h * w])
    output = paddle.transpose(output, (1, 0, 2))
    output = paddle.reshape(output, [5 + num_classes, batch * num_anchors * h * w])
    # Grid-cell offsets, one entry per (batch, anchor, cell) prediction.
    grid_x = paddle.linspace(0, w - 1, w)
    grid_x = paddle.tile(grid_x, [h, 1])
    grid_x = paddle.tile(grid_x, [batch * num_anchors, 1, 1])
    grid_x = paddle.reshape(grid_x, [batch * num_anchors * h * w]).cuda()
    grid_y = paddle.linspace(0, h - 1, h)
    grid_y = paddle.tile(grid_y, [w, 1]).t()
    grid_y = paddle.tile(grid_y, [batch * num_anchors, 1, 1])
    grid_y = paddle.reshape(grid_y, [batch * num_anchors * h * w]).cuda()
    sigmoid = paddle.nn.Sigmoid()
    xs = sigmoid(output[0]) + grid_x
    ys = sigmoid(output[1]) + grid_y
    # Anchor widths and heights, broadcast to every prediction.
    anchor_w = paddle.to_tensor(anchors)
    anchor_w = paddle.reshape(anchor_w, [num_anchors, anchor_step])
    anchor_w = paddle.index_select(
        anchor_w, index=paddle.to_tensor(np.array([0]).astype("int32")), axis=1
    )
    anchor_h = paddle.to_tensor(anchors)
    anchor_h = paddle.reshape(anchor_h, [num_anchors, anchor_step])
    anchor_h = paddle.index_select(
        anchor_h, index=paddle.to_tensor(np.array([1]).astype("int32")), axis=1
    )
    anchor_w = paddle.tile(anchor_w, [batch, 1])
    anchor_w = paddle.tile(anchor_w, [1, 1, h * w])
    anchor_w = paddle.reshape(anchor_w, [batch * num_anchors * h * w]).cuda()
    anchor_h = paddle.tile(anchor_h, [batch, 1])
    anchor_h = paddle.tile(anchor_h, [1, 1, h * w])
    anchor_h = paddle.reshape(anchor_h, [batch * num_anchors * h * w]).cuda()
    ws = paddle.exp(output[2]) * anchor_w
    hs = paddle.exp(output[3]) * anchor_h
    det_confs = sigmoid(output[4])
    cls_confs = paddle.to_tensor(output[5 : 5 + num_classes], stop_gradient=True)
    cls_confs = paddle.transpose(cls_confs, [1, 0])
    s = paddle.nn.Softmax()
    cls_confs = paddle.to_tensor(s(cls_confs))
    cls_max_confs = paddle.max(cls_confs, axis=1)
    cls_max_ids = paddle.argmax(cls_confs, axis=1)
    cls_max_confs = paddle.reshape(cls_max_confs, [-1])
    cls_max_ids = paddle.reshape(cls_max_ids, [-1])
    sz_hw = h * w
    sz_hwa = sz_hw * num_anchors
    det_confs = convert2cpu(det_confs)
    cls_max_confs = convert2cpu(cls_max_confs)
    cls_max_ids = convert2cpu_long(cls_max_ids)
    xs = convert2cpu(xs)
    ys = convert2cpu(ys)
    ws = convert2cpu(ws)
    hs = convert2cpu(hs)
    for b in range(batch):
        boxes = []
        for cy in range(h):
            for cx in range(w):
                for i in range(num_anchors):
                    ind = b * sz_hwa + i * sz_hw + cy * w + cx
                    det_conf = det_confs[ind]
                    if only_objectness:
                        conf = det_confs[ind]
                    else:
                        conf = det_confs[ind] * cls_max_confs[ind]
                    if conf > conf_thresh:
                        bcx = xs[ind]
                        bcy = ys[ind]
                        bw = ws[ind]
                        bh = hs[ind]
                        cls_max_conf = cls_max_confs[ind]
                        cls_max_id = cls_max_ids[ind]
                        box = [
                            bcx / w,
                            bcy / h,
                            bw / w,
                            bh / h,
                            det_conf,
                            cls_max_conf,
                            cls_max_id,
                        ]
                        boxes.append(box)
        all_boxes.append(boxes)
    return all_boxes
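

# Decoding sketch (illustrative shapes): with num_classes=24 and num_anchors=5
# the head output must have (5 + 24) * 5 = 145 channels; note the function
# moves its grid tensors to GPU via .cuda(), so a CUDA device is assumed.
#
#   out = paddle.rand([1, 145, 7, 7])
#   boxes = get_region_boxes(out, conf_thresh=0.005, num_classes=24)
#   # boxes[0] holds [cx, cy, w, h, det_conf, cls_conf, cls_id] entries with
#   # coordinates normalized by the grid size.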


def nms(boxes, nms_thresh):
    """
    Performs non-maximum suppression on the input boxes based on their IoUs.
    """
    if len(boxes) == 0:
        return boxes
    det_confs = paddle.zeros([len(boxes)])
    for i in range(len(boxes)):
        det_confs[i] = 1 - boxes[i][4]
    # Sorting 1 - confidence ascending visits boxes in descending confidence.
    sortIds = paddle.argsort(det_confs)
    out_boxes = []
    for i in range(len(boxes)):
        box_i = boxes[int(sortIds[i])]
        if box_i[4] > 0:
            out_boxes.append(box_i)
            # Zero out the confidence of any lower-ranked overlapping box.
            for j in range(i + 1, len(boxes)):
                box_j = boxes[int(sortIds[j])]
                if bbox_iou(box_i, box_j, x1y1x2y2=False) > nms_thresh:
                    box_j[4] = 0
    return out_boxes
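

# Usage sketch: suppress overlapping candidates from get_region_boxes, keeping
# the highest-confidence box among any group overlapping above nms_thresh.
#
#   kept = nms(boxes[0], nms_thresh=0.4)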


def bbox_iou(box1, box2, x1y1x2y2=True):
    """
    Returns the Intersection over Union (IoU) of two bounding boxes.
    """
    if x1y1x2y2:
        # Boxes given as corner coordinates (x1, y1, x2, y2).
        mx = min(box1[0], box2[0])
        Mx = max(box1[2], box2[2])
        my = min(box1[1], box2[1])
        My = max(box1[3], box2[3])
        w1 = box1[2] - box1[0]
        h1 = box1[3] - box1[1]
        w2 = box2[2] - box2[0]
        h2 = box2[3] - box2[1]
    else:
        # Boxes given as center coordinates (cx, cy, w, h).
        mx = min(float(box1[0] - box1[2] / 2.0), float(box2[0] - box2[2] / 2.0))
        Mx = max(float(box1[0] + box1[2] / 2.0), float(box2[0] + box2[2] / 2.0))
        my = min(float(box1[1] - box1[3] / 2.0), float(box2[1] - box2[3] / 2.0))
        My = max(float(box1[1] + box1[3] / 2.0), float(box2[1] + box2[3] / 2.0))
        w1 = box1[2]
        h1 = box1[3]
        w2 = box2[2]
        h2 = box2[3]
    uw = Mx - mx
    uh = My - my
    cw = w1 + w2 - uw
    ch = h1 + h2 - uh
    if cw <= 0 or ch <= 0:
        return paddle.to_tensor(0.0)
    area1 = w1 * h1
    area2 = w2 * h2
    carea = cw * ch
    uarea = area1 + area2 - carea
    return carea / uarea
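

# Worked example in center format (x1y1x2y2=False): two unit squares centered
# at (0.5, 0.5) and (1.0, 0.5) intersect on a 0.5 x 1.0 strip, so
# IoU = 0.5 / (1 + 1 - 0.5) = 1/3.
#
#   iou = bbox_iou([0.5, 0.5, 1.0, 1.0], [1.0, 0.5, 1.0, 1.0], x1y1x2y2=False)
#   # iou ~= 0.3333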


class DetVideoPostProcess:
    """
    A class used to perform post-processing on detection results in videos.
    """

    def __init__(
        self,
        label_list: List[str] = [],
    ) -> None:
        """
        Args:
            label_list (List[str]): A list of labels or class names associated
                with the detection results.
        """
        super().__init__()
        self.labels = label_list

    def postprocess(self, pred: List, nms_thresh: float, score_thresh: float) -> List:
        """Decodes raw detections for one video, applying NMS and score filtering."""
        num_seg = len(pred)
        pred_all = []
        for i in range(num_seg):
            outputs = pred[i]
            for out in outputs:
                preds = []
                out = paddle.to_tensor(out)
                all_boxes = get_region_boxes(out, 0.3, len(self.labels))
                for b in range(out.shape[0]):
                    boxes = nms(all_boxes[b], nms_thresh)
                    for box in boxes:
                        # Rescale normalized center-format boxes to a 320x240 canvas.
                        x1 = round(float(box[0] - box[2] / 2.0) * 320.0)
                        y1 = round(float(box[1] - box[3] / 2.0) * 240.0)
                        x2 = round(float(box[0] + box[2] / 2.0) * 320.0)
                        y2 = round(float(box[1] + box[3] / 2.0) * 240.0)
                        det_conf = float(box[4])
                        for j in range((len(box) - 5) // 2):
                            cls_conf = float(box[5 + 2 * j].item())
                            prob = det_conf * cls_conf
                            if prob > score_thresh:
                                preds.append(
                                    [[x1, y1, x2, y2], prob, self.labels[int(box[6])]]
                                )
                pred_all.append(preds)
        return pred_all

    def __call__(self, preds: List, nms_thresh: float, score_thresh: float) -> List:
        return [self.postprocess(pred, nms_thresh, score_thresh) for pred in preds]
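

# End-to-end sketch (assumptions: `raw_preds` nests raw detection-head outputs
# per video, and the label names are placeholders):
#
#   post = DetVideoPostProcess(label_list=["label_0", "label_1"])
#   results = post(raw_preds, nms_thresh=0.4, score_thresh=0.5)
#   # Each entry is a list of [[x1, y1, x2, y2], prob, label] detections on a
#   # 320x240 canvas.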