text_det.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539
  1. # copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import os
  15. import sys
  16. import cv2
  17. import copy
  18. import math
  19. import pyclipper
  20. import numpy as np
  21. from PIL import Image
  22. from shapely.geometry import Polygon
  23. from ...utils.io import ImageReader
  24. from ....utils import logging
  25. from ..base import BaseComponent
  26. __all__ = ["DetResizeForTest", "NormalizeImage", "DBPostProcess", "CropByPolys"]
  class DetResizeForTest(BaseComponent):
      """Resize an image to a size the text detection network accepts.

      The strategy is picked from the constructor keyword arguments, checked in
      this order:

      * ``image_shape`` -> type 1: resize to a fixed ``(height, width)``. With
        ``keep_ratio=True`` the width follows the aspect ratio and is rounded
        up to a multiple of 32.
      * ``limit_side_len`` (and optional ``limit_type``) -> type 0: scale so the
        limited side respects ``limit_side_len``, then round both sides to a
        multiple of 32.
      * ``resize_long`` -> type 2: scale the long side to ``resize_long`` and
        round both sides up to a multiple of 128.
      * none of the above -> type 0 with ``limit_side_len=736`` and
        ``limit_type="min"``.
      """

      INPUT_KEYS = ["img"]
      OUTPUT_KEYS = ["img", "img_shape"]
      DEAULT_INPUTS = {"img": "img"}
      DEAULT_OUTPUTS = {"img": "img", "img_shape": "img_shape"}

      def __init__(self, **kwargs):
          """Select the resize strategy from ``kwargs`` (see class docstring)."""
          super().__init__()
          self.resize_type = 0
          self.keep_ratio = False
          if "image_shape" in kwargs:
              self.image_shape = kwargs["image_shape"]
              self.resize_type = 1
              if "keep_ratio" in kwargs:
                  self.keep_ratio = kwargs["keep_ratio"]
          elif "limit_side_len" in kwargs:
              self.limit_side_len = kwargs["limit_side_len"]
              self.limit_type = kwargs.get("limit_type", "min")
          elif "resize_long" in kwargs:
              self.resize_type = 2
              self.resize_long = kwargs.get("resize_long", 960)
          else:
              self.limit_side_len = 736
              self.limit_type = "min"

      def apply(self, img):
          """Resize ``img`` and report the scale factors that were applied.

          Args:
              img: array indexed as ``[h, w, c]``.

          Returns:
              dict with ``"img"`` (the resized image) and ``"img_shape"``, an
              array ``[src_h, src_w, ratio_h, ratio_w]``. ``src_h``/``src_w``
              are the input size measured before any padding.
          """
          src_h, src_w, _ = img.shape
          # Very small inputs are padded to at least 32x32 before resizing.
          if sum([src_h, src_w]) < 64:
              img = self.image_padding(img)

          if self.resize_type == 0:
              img, [ratio_h, ratio_w] = self.resize_image_type0(img)
          elif self.resize_type == 2:
              img, [ratio_h, ratio_w] = self.resize_image_type2(img)
          else:
              img, [ratio_h, ratio_w] = self.resize_image_type1(img)
          return {"img": img, "img_shape": np.array([src_h, src_w, ratio_h, ratio_w])}

      def image_padding(self, im, value=0):
          """Pad ``im`` on the bottom/right with ``value`` to at least 32x32."""
          h, w, c = im.shape
          im_pad = np.zeros((max(32, h), max(32, w), c), np.uint8) + value
          im_pad[:h, :w, :] = im
          return im_pad

      def resize_image_type1(self, img):
          """Resize to ``self.image_shape``, optionally keeping the aspect ratio.

          Returns:
              ``(img, [ratio_h, ratio_w])``.
          """
          resize_h, resize_w = self.image_shape
          ori_h, ori_w = img.shape[:2]  # (h, w, c)
          if self.keep_ratio is True:
              # Derive the width from the target height, rounded up to x32.
              resize_w = ori_w * resize_h / ori_h
              resize_w = math.ceil(resize_w / 32) * 32
          ratio_h = float(resize_h) / ori_h
          ratio_w = float(resize_w) / ori_w
          img = cv2.resize(img, (int(resize_w), int(resize_h)))
          return img, [ratio_h, ratio_w]

      def resize_image_type0(self, img):
          """Resize so both sides are multiples of 32, honouring ``limit_type``.

          Args:
              img: array indexed as ``[h, w, c]``.

          Returns:
              ``(img, [ratio_h, ratio_w])``.

          Raises:
              Exception: if ``self.limit_type`` is not one of ``"max"``,
                  ``"min"`` or ``"resize_long"``.
              Any exception raised by ``cv2.resize`` is logged and re-raised.
          """
          limit_side_len = self.limit_side_len
          h, w, _ = img.shape

          if self.limit_type == "max":
              # Shrink only when the longer side exceeds the limit.
              if max(h, w) > limit_side_len:
                  ratio = float(limit_side_len) / (h if h > w else w)
              else:
                  ratio = 1.0
          elif self.limit_type == "min":
              # Enlarge only when the shorter side is below the limit.
              if min(h, w) < limit_side_len:
                  ratio = float(limit_side_len) / (h if h < w else w)
              else:
                  ratio = 1.0
          elif self.limit_type == "resize_long":
              ratio = float(limit_side_len) / max(h, w)
          else:
              raise Exception("not support limit type, image ")

          resize_h = int(h * ratio)
          resize_w = int(w * ratio)
          # Round to the nearest multiple of 32, never below 32, so the target
          # size is always strictly positive.
          resize_h = max(int(round(resize_h / 32) * 32), 32)
          resize_w = max(int(round(resize_w / 32) * 32), 32)

          try:
              img = cv2.resize(img, (int(resize_w), int(resize_h)))
          except Exception:
              # Surface the failure to the caller; a library component must not
              # terminate the whole process (and certainly not with status 0).
              logging.info(
                  f"failed to resize image of shape {img.shape} "
                  f"to (w={resize_w}, h={resize_h})"
              )
              raise
          ratio_h = resize_h / float(h)
          ratio_w = resize_w / float(w)
          return img, [ratio_h, ratio_w]

      def resize_image_type2(self, img):
          """Scale the long side to ``self.resize_long``, rounding up to x128.

          Returns:
              ``(img, [ratio_h, ratio_w])``.
          """
          h, w, _ = img.shape
          if h > w:
              ratio = float(self.resize_long) / h
          else:
              ratio = float(self.resize_long) / w
          resize_h = int(h * ratio)
          resize_w = int(w * ratio)

          # Round each side up to the next multiple of the stride.
          max_stride = 128
          resize_h = (resize_h + max_stride - 1) // max_stride * max_stride
          resize_w = (resize_w + max_stride - 1) // max_stride * max_stride
          img = cv2.resize(img, (int(resize_w), int(resize_h)))
          ratio_h = resize_h / float(h)
          ratio_w = resize_w / float(w)
          return img, [ratio_h, ratio_w]
  class NormalizeImage(BaseComponent):
      """Normalize an image: multiply by a scale, subtract mean, divide by std."""

      INPUT_KEYS = ["img"]
      OUTPUT_KEYS = ["img"]
      DEAULT_INPUTS = {"img": "img"}
      DEAULT_OUTPUTS = {"img": "img"}

      def __init__(self, scale=None, mean=None, std=None, order="chw", **kwargs):
          """Store the normalization constants as float32.

          Args:
              scale: multiplier applied to the pixels; a string is evaluated as
                  a Python expression. Defaults to ``1.0 / 255.0``.
              mean: per-channel mean, defaults to ``[0.485, 0.456, 0.406]``.
              std: per-channel std, defaults to ``[0.229, 0.224, 0.225]``.
              order: ``"chw"`` stores mean/std with shape ``(3, 1, 1)``; any
                  other value stores them with shape ``(1, 1, 3)``.
              **kwargs: accepted and ignored.
          """
          super().__init__()
          # NOTE(review): eval() executes arbitrary code. This is only safe
          # while `scale` comes from trusted configuration - confirm callers.
          if isinstance(scale, str):
              scale = eval(scale)
          if scale is None:
              scale = 1.0 / 255.0
          self.scale = np.float32(scale)

          if mean is None:
              mean = [0.485, 0.456, 0.406]
          if std is None:
              std = [0.229, 0.224, 0.225]

          stat_shape = (3, 1, 1) if order == "chw" else (1, 1, 3)
          self.mean = np.array(mean).reshape(stat_shape).astype("float32")
          self.std = np.array(std).reshape(stat_shape).astype("float32")

      def apply(self, img):
          """Return ``{"img": (img * scale - mean) / std}`` computed in float32.

          A ``PIL.Image.Image`` input is converted to a numpy array first; any
          other non-array input fails the assertion.
          """
          from PIL import Image

          if isinstance(img, Image.Image):
              img = np.array(img)
          assert isinstance(img, np.ndarray), "invalid input 'img' in NormalizeImage"

          scaled = img.astype("float32") * self.scale
          normalized = (scaled - self.mean) / self.std
          return {"img": normalized}
  class DBPostProcess(BaseComponent):
      """Post-processing for Differentiable Binarization (DB) text detection.

      Thresholds the probability map, extracts contours, scores and expands
      ("unclips") each candidate region, and rescales the resulting boxes or
      polygons to the source image size.
      """

      INPUT_KEYS = ["pred", "img_shape"]
      OUTPUT_KEYS = ["dt_polys", "dt_scores"]
      DEAULT_INPUTS = {"pred": "pred", "img_shape": "img_shape"}
      DEAULT_OUTPUTS = {"dt_polys": "dt_polys", "dt_scores": "dt_scores"}

      def __init__(
          self,
          thresh=0.3,
          box_thresh=0.7,
          max_candidates=1000,
          unclip_ratio=2.0,
          use_dilation=False,
          score_mode="fast",
          box_type="quad",
          **kwargs
      ):
          """Store the post-processing hyper-parameters.

          Args:
              thresh: binarization threshold applied to the probability map.
              box_thresh: candidates scoring below this value are dropped.
              max_candidates: maximum number of contours examined.
              unclip_ratio: expansion ratio used by :meth:`unclip`.
              use_dilation: dilate the binary map with a 2x2 kernel if true.
              score_mode: ``"fast"`` or ``"slow"``; only consulted by
                  :meth:`boxes_from_bitmap`.
              box_type: ``"quad"`` or ``"poly"``.
              **kwargs: accepted and ignored.
          """
          super().__init__()
          self.thresh = thresh
          self.box_thresh = box_thresh
          self.max_candidates = max_candidates
          self.unclip_ratio = unclip_ratio
          self.min_size = 3
          self.score_mode = score_mode
          self.box_type = box_type
          assert score_mode in [
              "slow",
              "fast",
          ], "Score mode must be in [slow, fast] but got: {}".format(score_mode)
          self.dilation_kernel = None if not use_dilation else np.array([[1, 1], [1, 1]])

      def _find_contours(self, bitmap):
          """Return the contours of a binary map for any OpenCV major version."""
          outs = cv2.findContours(
              (bitmap * 255).astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE
          )
          # OpenCV 3.x returns (image, contours, hierarchy); the other versions
          # return (contours, hierarchy).
          return outs[1] if len(outs) == 3 else outs[0]

      def _offset_paths(self, box, unclip_ratio):
          """Return the raw list of offset paths produced by pyclipper.

          The result may hold zero, one or several paths, and the paths may
          have different numbers of points.
          """
          poly = Polygon(box)
          if poly.length == 0:
              # Degenerate polygon: nothing to expand.
              return []
          distance = poly.area * unclip_ratio / poly.length
          offset = pyclipper.PyclipperOffset()
          offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
          return offset.Execute(distance)

      def polygons_from_bitmap(self, pred, _bitmap, dest_width, dest_height):
          """Extract polygons from a binary map.

          Args:
              pred: probability map used for scoring.
              _bitmap: 2-D binary map (unpacked as ``height, width``).
              dest_width: width the output coordinates are rescaled to.
              dest_height: height the output coordinates are rescaled to.

          Returns:
              ``(boxes, scores)`` where ``boxes`` is a list of polygons (each a
              list of ``[x, y]``) and ``scores`` the matching list of scores.
          """
          height, width = _bitmap.shape
          boxes = []
          scores = []

          contours = self._find_contours(_bitmap)
          for contour in contours[: self.max_candidates]:
              epsilon = 0.002 * cv2.arcLength(contour, True)
              approx = cv2.approxPolyDP(contour, epsilon, True)
              points = approx.reshape((-1, 2))
              if points.shape[0] < 4:
                  continue

              # Polygons are always scored with the fast (bounding box) method.
              score = self.box_score_fast(pred, points)
              if self.box_thresh > score:
                  continue

              # Keep the candidate only when the expansion yields exactly one
              # path; working on the raw list avoids building a ragged array.
              paths = self._offset_paths(points, self.unclip_ratio)
              if len(paths) != 1:
                  continue
              box = np.array(paths[0]).reshape(-1, 2)
              if len(box) == 0:
                  continue

              _, sside = self.get_mini_boxes(box.reshape((-1, 1, 2)))
              if sside < self.min_size + 2:
                  continue

              box[:, 0] = np.clip(np.round(box[:, 0] / width * dest_width), 0, dest_width)
              box[:, 1] = np.clip(
                  np.round(box[:, 1] / height * dest_height), 0, dest_height
              )
              boxes.append(box.tolist())
              scores.append(score)
          return boxes, scores

      def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height):
          """Extract quadrilateral boxes from a binary map.

          Args:
              pred: probability map used for scoring.
              _bitmap: 2-D binary map (unpacked as ``height, width``).
              dest_width: width the output coordinates are rescaled to.
              dest_height: height the output coordinates are rescaled to.

          Returns:
              ``(boxes, scores)`` where ``boxes`` is an ``int16`` array of
              4-point boxes and ``scores`` the matching list of scores.
          """
          height, width = _bitmap.shape
          contours = self._find_contours(_bitmap)

          boxes = []
          scores = []
          for contour in contours[: self.max_candidates]:
              points, sside = self.get_mini_boxes(contour)
              if sside < self.min_size:
                  continue
              points = np.array(points)

              if self.score_mode == "fast":
                  score = self.box_score_fast(pred, points.reshape(-1, 2))
              else:
                  score = self.box_score_slow(pred, contour)
              if self.box_thresh > score:
                  continue

              paths = self._offset_paths(points, self.unclip_ratio)
              if not paths:
                  continue
              # Pool the points of every offset path; equivalent to reshaping a
              # rectangular array, but also valid when path lengths differ.
              box = np.concatenate(
                  [np.array(path).reshape(-1, 2) for path in paths]
              ).reshape(-1, 1, 2)
              box, sside = self.get_mini_boxes(box)
              if sside < self.min_size + 2:
                  continue

              box = np.array(box)
              box[:, 0] = np.clip(np.round(box[:, 0] / width * dest_width), 0, dest_width)
              box[:, 1] = np.clip(
                  np.round(box[:, 1] / height * dest_height), 0, dest_height
              )
              # NOTE(review): int16 wraps for coordinates above 32767; kept to
              # preserve the returned dtype - confirm whether int32 is acceptable.
              boxes.append(box.astype(np.int16))
              scores.append(score)
          return np.array(boxes, dtype=np.int16), scores

      def unclip(self, box, unclip_ratio):
          """Expand ``box`` outwards by ``area * unclip_ratio / perimeter``.

          Returns:
              An array of the offset paths. When the paths have different
              lengths (which cannot be stacked), a 1-D object array holding
              one point array per path is returned instead.
          """
          paths = self._offset_paths(box, unclip_ratio)
          if len({len(path) for path in paths}) <= 1:
              return np.array(paths)
          ragged = np.empty(len(paths), dtype=object)
          for idx, path in enumerate(paths):
              ragged[idx] = np.array(path)
          return ragged

      def get_mini_boxes(self, contour):
          """Return the minimum-area rectangle of ``contour`` and its short side.

          Returns:
              ``(box, short_side)`` where ``box`` lists the four corners as
              [left pair's smaller-y point, right pair's smaller-y point,
              right pair's larger-y point, left pair's larger-y point].
          """
          bounding_box = cv2.minAreaRect(contour)
          # Sort by x so the first two points form the left pair.
          points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0])

          if points[1][1] > points[0][1]:
              index_1, index_4 = 0, 1
          else:
              index_1, index_4 = 1, 0
          if points[3][1] > points[2][1]:
              index_2, index_3 = 2, 3
          else:
              index_2, index_3 = 3, 2

          box = [points[index_1], points[index_2], points[index_3], points[index_4]]
          return box, min(bounding_box[1])

      def box_score_fast(self, bitmap, _box):
          """Mean of ``bitmap`` inside the polygon ``_box``.

          The polygon is rasterized within its axis-aligned bounding box, which
          is clipped to the bitmap bounds.
          """
          h, w = bitmap.shape[:2]
          box = _box.copy()
          xmin = np.clip(np.floor(box[:, 0].min()).astype("int"), 0, w - 1)
          xmax = np.clip(np.ceil(box[:, 0].max()).astype("int"), 0, w - 1)
          ymin = np.clip(np.floor(box[:, 1].min()).astype("int"), 0, h - 1)
          ymax = np.clip(np.ceil(box[:, 1].max()).astype("int"), 0, h - 1)

          mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8)
          # Shift the polygon into the local coordinates of the cropped window.
          box[:, 0] = box[:, 0] - xmin
          box[:, 1] = box[:, 1] - ymin
          cv2.fillPoly(mask, box.reshape(1, -1, 2).astype(np.int32), 1)
          return cv2.mean(bitmap[ymin : ymax + 1, xmin : xmax + 1], mask)[0]

      def box_score_slow(self, bitmap, contour):
          """Mean of ``bitmap`` inside the original ``contour`` polygon."""
          h, w = bitmap.shape[:2]
          contour = contour.copy()
          contour = np.reshape(contour, (-1, 2))

          xmin = np.clip(np.min(contour[:, 0]), 0, w - 1)
          xmax = np.clip(np.max(contour[:, 0]), 0, w - 1)
          ymin = np.clip(np.min(contour[:, 1]), 0, h - 1)
          ymax = np.clip(np.max(contour[:, 1]), 0, h - 1)

          mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8)
          # Shift the contour into the local coordinates of the cropped window.
          contour[:, 0] = contour[:, 0] - xmin
          contour[:, 1] = contour[:, 1] - ymin
          cv2.fillPoly(mask, contour.reshape(1, -1, 2).astype(np.int32), 1)
          return cv2.mean(bitmap[ymin : ymax + 1, xmin : xmax + 1], mask)[0]

      def apply(self, pred, img_shape):
          """Turn a prediction into detected boxes/polygons and their scores.

          Args:
              pred: prediction indexed as ``pred[0][0, :, :]`` to obtain the
                  2-D probability map.
              img_shape: four values ``(src_h, src_w, ratio_h, ratio_w)``; only
                  the source height and width are used here.

          Returns:
              dict with ``"dt_polys"`` and ``"dt_scores"``.

          Raises:
              ValueError: if ``self.box_type`` is neither ``"quad"`` nor
                  ``"poly"``.
          """
          pred = pred[0][0, :, :]
          segmentation = pred > self.thresh
          src_h, src_w, _, _ = img_shape

          if self.dilation_kernel is not None:
              mask = cv2.dilate(
                  np.array(segmentation).astype(np.uint8),
                  self.dilation_kernel,
              )
          else:
              mask = segmentation

          if self.box_type == "poly":
              boxes, scores = self.polygons_from_bitmap(pred, mask, src_w, src_h)
          elif self.box_type == "quad":
              boxes, scores = self.boxes_from_bitmap(pred, mask, src_w, src_h)
          else:
              raise ValueError("box_type can only be one of ['quad', 'poly']")
          return {"dt_polys": boxes, "dt_scores": scores}
  class CropByPolys(BaseComponent):
      """Crop text regions out of an image given detected polygons."""

      INPUT_KEYS = ["img_path", "dt_polys"]
      OUTPUT_KEYS = ["img"]
      DEAULT_INPUTS = {"img_path": "img_path", "dt_polys": "dt_polys"}
      DEAULT_OUTPUTS = {"img": "img"}

      def __init__(self, det_box_type="quad"):
          """Create the cropper.

          Args:
              det_box_type: ``"quad"`` crops each 4-point box directly; any
                  other value crops the minimum-area rectangle of the polygon.
          """
          super().__init__()
          self.det_box_type = det_box_type
          self._reader = ImageReader(backend="opencv")

      def apply(self, img_path, dt_polys):
          """Read the image at ``img_path`` and crop one patch per polygon.

          Args:
              img_path: passed to the image reader.
              dt_polys: iterable of polygons, each a sequence of ``[x, y]``
                  points. Polygons may have different numbers of points.

          Returns:
              A list with one dict per polygon: ``{"img": crop, "img_size":
              [crop_width, crop_height]}``.
          """
          img = self._reader.read(img_path)
          # TODO
          # dt_boxes = self.sorted_boxes(data[K.DT_POLYS])
          output_list = []
          # Convert polygon by polygon: stacking a ragged list of polygons into
          # a single array fails on recent NumPy versions.
          for poly in dt_polys:
              tmp_box = np.array(poly)  # private copy; the input is not modified
              if self.det_box_type == "quad":
                  img_crop = self.get_rotate_crop_image(img, tmp_box)
              else:
                  img_crop = self.get_minarea_rect_crop(img, tmp_box)
              output_list.append(
                  {"img": img_crop, "img_size": [img_crop.shape[1], img_crop.shape[0]]}
              )
          return output_list

      def sorted_boxes(self, dt_boxes):
          """Sort text boxes top-to-bottom, then left-to-right.

          Boxes are first ordered by the ``(y, x)`` of their first corner;
          neighbours whose first-corner ``y`` differs by less than 10 are then
          reordered by ``x``.

          Args:
              dt_boxes: detected boxes, indexed as ``[box][corner][x|y]``.

          Returns:
              list of the boxes in reading order.
          """
          dt_boxes = np.array(dt_boxes)
          num_boxes = dt_boxes.shape[0]
          _boxes = sorted(dt_boxes, key=lambda x: (x[0][1], x[0][0]))

          for i in range(num_boxes - 1):
              for j in range(i, -1, -1):
                  same_line = abs(_boxes[j + 1][0][1] - _boxes[j][0][1]) < 10
                  if same_line and _boxes[j + 1][0][0] < _boxes[j][0][0]:
                      _boxes[j], _boxes[j + 1] = _boxes[j + 1], _boxes[j]
                  else:
                      break
          return _boxes

      def get_minarea_rect_crop(self, img, points):
          """Crop the minimum-area rectangle enclosing ``points`` from ``img``."""
          bounding_box = cv2.minAreaRect(np.array(points).astype(np.int32))
          # Sort by x so the first two points form the left pair.
          points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0])

          if points[1][1] > points[0][1]:
              index_a, index_d = 0, 1
          else:
              index_a, index_d = 1, 0
          if points[3][1] > points[2][1]:
              index_b, index_c = 2, 3
          else:
              index_b, index_c = 3, 2

          box = [points[index_a], points[index_b], points[index_c], points[index_d]]
          return self.get_rotate_crop_image(img, np.array(box))

      def get_rotate_crop_image(self, img, points):
          """Warp the quadrilateral ``points`` of ``img`` into an upright crop.

          Args:
              img: source image.
              points: four ``[x, y]`` corners; they are mapped in order to the
                  top-left, top-right, bottom-right and bottom-left corners of
                  the output.

          Returns:
              The rectified crop. If its height is at least 1.5 times its
              width it is rotated with ``np.rot90``.
          """
          assert len(points) == 4, "shape of points must be 4*2"
          # cv2.getPerspectiveTransform only accepts float32 points, while the
          # detector may hand over integer coordinates.
          points = np.asarray(points, dtype=np.float32)

          img_crop_width = int(
              max(
                  np.linalg.norm(points[0] - points[1]),
                  np.linalg.norm(points[2] - points[3]),
              )
          )
          img_crop_height = int(
              max(
                  np.linalg.norm(points[0] - points[3]),
                  np.linalg.norm(points[1] - points[2]),
              )
          )
          pts_std = np.float32(
              [
                  [0, 0],
                  [img_crop_width, 0],
                  [img_crop_width, img_crop_height],
                  [0, img_crop_height],
              ]
          )
          M = cv2.getPerspectiveTransform(points, pts_std)
          dst_img = cv2.warpPerspective(
              img,
              M,
              (img_crop_width, img_crop_height),
              borderMode=cv2.BORDER_REPLICATE,
              flags=cv2.INTER_CUBIC,
          )
          dst_img_height, dst_img_width = dst_img.shape[0:2]
          if dst_img_height * 1.0 / dst_img_width >= 1.5:
              dst_img = np.rot90(dst_img)
          return dst_img
  461. if dst_img_height * 1.0 / dst_img_width >= 1.5:
  462. dst_img = np.rot90(dst_img)
  463. return dst_img