# common.py
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
  14. import math
  15. from pathlib import Path
  16. from copy import deepcopy
  17. import numpy as np
  18. import cv2
  19. from .....utils.flags import (
  20. INFER_BENCHMARK,
  21. INFER_BENCHMARK_ITER,
  22. INFER_BENCHMARK_DATA_SIZE,
  23. )
  24. from .....utils.cache import CACHE_DIR, temp_file_manager
  25. from ....utils.io import ImageReader, ImageWriter, PDFReader
  26. from ...base import BaseComponent
  27. from ..read_data import _BaseRead
  28. from . import funcs as F
# Names exported by a star-import of this module. Other definitions in this
# file (for example `GetImageInfo` and `_BaseResize`) are not listed here.
__all__ = [
    "ReadImage",
    "Flip",
    "Crop",
    "Resize",
    "ResizeByLong",
    "ResizeByShort",
    "Pad",
    "Normalize",
    "ToCHWImage",
    "PadStride",
]
  41. def _check_image_size(input_):
  42. """check image size"""
  43. if not (
  44. isinstance(input_, (list, tuple))
  45. and len(input_) == 2
  46. and isinstance(input_[0], int)
  47. and isinstance(input_[1], int)
  48. ):
  49. raise TypeError(f"{input_} cannot represent a valid image size.")
  50. class ReadImage(_BaseRead):
  51. """Load image from the file."""
  52. INPUT_KEYS = ["img"]
  53. OUTPUT_KEYS = ["img", "img_size", "ori_img", "ori_img_size"]
  54. DEAULT_INPUTS = {"img": "img"}
  55. DEAULT_OUTPUTS = {
  56. "img": "img",
  57. "input_path": "input_path",
  58. "img_size": "img_size",
  59. "ori_img": "ori_img",
  60. "ori_img_size": "ori_img_size",
  61. }
  62. _FLAGS_DICT = {
  63. "BGR": cv2.IMREAD_COLOR,
  64. "RGB": cv2.IMREAD_COLOR,
  65. "GRAY": cv2.IMREAD_GRAYSCALE,
  66. }
  67. SUFFIX = ["jpg", "png", "jpeg", "JPEG", "JPG", "bmp", "PDF", "pdf"]
  68. def __init__(self, batch_size=1, format="BGR"):
  69. """
  70. Initialize the instance.
  71. Args:
  72. format (str, optional): Target color format to convert the image to.
  73. Choices are 'BGR', 'RGB', and 'GRAY'. Default: 'BGR'.
  74. """
  75. super().__init__(batch_size)
  76. self.format = format
  77. flags = self._FLAGS_DICT[self.format]
  78. self._img_reader = ImageReader(backend="opencv", flags=flags)
  79. self._pdf_reader = PDFReader()
  80. self._writer = ImageWriter(backend="opencv")
  81. def apply(self, img):
  82. """apply"""
  83. def process_ndarray(img):
  84. with temp_file_manager.temp_file_context(suffix=".png") as temp_file:
  85. img_path = Path(temp_file.name)
  86. self._writer.write(img_path, img)
  87. if self.format == "RGB":
  88. img = img[:, :, ::-1]
  89. return {
  90. "input_path": img_path,
  91. "img": img,
  92. "img_size": [img.shape[1], img.shape[0]],
  93. "ori_img": deepcopy(img),
  94. "ori_img_size": deepcopy([img.shape[1], img.shape[0]]),
  95. }
  96. if INFER_BENCHMARK and img is None:
  97. size = int(INFER_BENCHMARK_DATA_SIZE)
  98. for _ in range(INFER_BENCHMARK_ITER):
  99. yield [
  100. process_ndarray(
  101. np.random.randint(0, 256, (size, size, 3), dtype=np.uint8)
  102. )
  103. for _ in range(self.batch_size)
  104. ]
  105. elif isinstance(img, np.ndarray):
  106. yield [process_ndarray(img)]
  107. elif isinstance(img, str):
  108. file_path = img
  109. file_path = self._download_from_url(file_path)
  110. file_list = self._get_files_list(file_path)
  111. batch = []
  112. for file_path in file_list:
  113. img = self._read(file_path)
  114. batch.extend(img)
  115. if len(batch) >= self.batch_size:
  116. yield batch
  117. batch = []
  118. if len(batch) > 0:
  119. yield batch
  120. else:
  121. raise TypeError(
  122. f"ReadImage only supports the following types:\n"
  123. f"1. str, indicating a image file path or a directory containing image files.\n"
  124. f"2. numpy.ndarray.\n"
  125. f"However, got type: {type(img).__name__}."
  126. )
  127. def _read(self, file_path):
  128. if str(file_path).lower().endswith(".pdf"):
  129. return self._read_pdf(file_path)
  130. else:
  131. return self._read_img(file_path)
  132. def _read_img(self, img_path):
  133. blob = self._img_reader.read(img_path)
  134. if blob is None:
  135. raise Exception("Image read Error")
  136. if self.format == "RGB":
  137. if blob.ndim != 3:
  138. raise RuntimeError("Array is not 3-dimensional.")
  139. # BGR to RGB
  140. blob = blob[..., ::-1]
  141. return [
  142. {
  143. "input_path": img_path,
  144. "img": blob,
  145. "img_size": [blob.shape[1], blob.shape[0]],
  146. "ori_img": deepcopy(blob),
  147. "ori_img_size": deepcopy([blob.shape[1], blob.shape[0]]),
  148. }
  149. ]
  150. def _read_pdf(self, pdf_path):
  151. img_list = self._pdf_reader.read(pdf_path)
  152. return [
  153. {
  154. "input_path": pdf_path,
  155. "img": img,
  156. "img_size": [img.shape[1], img.shape[0]],
  157. "ori_img": deepcopy(img),
  158. "ori_img_size": deepcopy([img.shape[1], img.shape[0]]),
  159. }
  160. for img in img_list
  161. ]
  162. class GetImageInfo(BaseComponent):
  163. """Get Image Info"""
  164. INPUT_KEYS = "img"
  165. OUTPUT_KEYS = "img_size"
  166. DEAULT_INPUTS = {"img": "img"}
  167. DEAULT_OUTPUTS = {"img_size": "img_size"}
  168. def __init__(self):
  169. super().__init__()
  170. def apply(self, img):
  171. """apply"""
  172. return {"img_size": [img.shape[1], img.shape[0]]}
  173. class Flip(BaseComponent):
  174. """Flip the image vertically or horizontally."""
  175. INPUT_KEYS = "img"
  176. OUTPUT_KEYS = "img"
  177. DEAULT_INPUTS = {"img": "img"}
  178. DEAULT_OUTPUTS = {"img": "img"}
  179. def __init__(self, mode="H"):
  180. """
  181. Initialize the instance.
  182. Args:
  183. mode (str, optional): 'H' for horizontal flipping and 'V' for vertical
  184. flipping. Default: 'H'.
  185. """
  186. super().__init__()
  187. if mode not in ("H", "V"):
  188. raise ValueError("`mode` should be 'H' or 'V'.")
  189. self.mode = mode
  190. def apply(self, img):
  191. """apply"""
  192. if self.mode == "H":
  193. img = F.flip_h(img)
  194. elif self.mode == "V":
  195. img = F.flip_v(img)
  196. return {"img": img}
  197. class Crop(BaseComponent):
  198. """Crop region from the image."""
  199. INPUT_KEYS = "img"
  200. OUTPUT_KEYS = ["img", "img_size"]
  201. DEAULT_INPUTS = {"img": "img"}
  202. DEAULT_OUTPUTS = {"img": "img", "img_size": "img_size"}
  203. def __init__(self, crop_size, mode="C"):
  204. """
  205. Initialize the instance.
  206. Args:
  207. crop_size (list|tuple|int): Width and height of the region to crop.
  208. mode (str, optional): 'C' for cropping the center part and 'TL' for
  209. cropping the top left part. Default: 'C'.
  210. """
  211. super().__init__()
  212. if isinstance(crop_size, int):
  213. crop_size = [crop_size, crop_size]
  214. _check_image_size(crop_size)
  215. self.crop_size = crop_size
  216. if mode not in ("C", "TL"):
  217. raise ValueError("Unsupported interpolation method")
  218. self.mode = mode
  219. def apply(self, img):
  220. """apply"""
  221. h, w = img.shape[:2]
  222. cw, ch = self.crop_size
  223. if self.mode == "C":
  224. x1 = max(0, (w - cw) // 2)
  225. y1 = max(0, (h - ch) // 2)
  226. elif self.mode == "TL":
  227. x1, y1 = 0, 0
  228. x2 = min(w, x1 + cw)
  229. y2 = min(h, y1 + ch)
  230. coords = (x1, y1, x2, y2)
  231. if coords == (0, 0, w, h):
  232. raise ValueError(
  233. f"Input image ({w}, {h}) smaller than the target size ({cw}, {ch})."
  234. )
  235. img = F.slice(img, coords=coords)
  236. return {"img": img, "img_size": [img.shape[1], img.shape[0]]}
  237. class _BaseResize(BaseComponent):
  238. _INTERP_DICT = {
  239. "NEAREST": cv2.INTER_NEAREST,
  240. "LINEAR": cv2.INTER_LINEAR,
  241. "CUBIC": cv2.INTER_CUBIC,
  242. "AREA": cv2.INTER_AREA,
  243. "LANCZOS4": cv2.INTER_LANCZOS4,
  244. }
  245. def __init__(self, size_divisor, interp):
  246. super().__init__()
  247. if size_divisor is not None:
  248. assert isinstance(
  249. size_divisor, int
  250. ), "`size_divisor` should be None or int."
  251. self.size_divisor = size_divisor
  252. try:
  253. interp = self._INTERP_DICT[interp]
  254. except KeyError:
  255. raise ValueError(
  256. "`interp` should be one of {}.".format(self._INTERP_DICT.keys())
  257. )
  258. self.interp = interp
  259. @staticmethod
  260. def _rescale_size(img_size, target_size):
  261. """rescale size"""
  262. scale = min(max(target_size) / max(img_size), min(target_size) / min(img_size))
  263. rescaled_size = [round(i * scale) for i in img_size]
  264. return rescaled_size, scale
  265. class Resize(_BaseResize):
  266. """Resize the image."""
  267. INPUT_KEYS = "img"
  268. OUTPUT_KEYS = ["img", "img_size", "scale_factors"]
  269. DEAULT_INPUTS = {"img": "img"}
  270. DEAULT_OUTPUTS = {
  271. "img": "img",
  272. "img_size": "img_size",
  273. "scale_factors": "scale_factors",
  274. }
  275. def __init__(
  276. self, target_size, keep_ratio=False, size_divisor=None, interp="LINEAR"
  277. ):
  278. """
  279. Initialize the instance.
  280. Args:
  281. target_size (list|tuple|int): Target width and height.
  282. keep_ratio (bool, optional): Whether to keep the aspect ratio of resized
  283. image. Default: False.
  284. size_divisor (int|None, optional): Divisor of resized image size.
  285. Default: None.
  286. interp (str, optional): Interpolation method. Choices are 'NEAREST',
  287. 'LINEAR', 'CUBIC', 'AREA', and 'LANCZOS4'. Default: 'LINEAR'.
  288. """
  289. super().__init__(size_divisor=size_divisor, interp=interp)
  290. if isinstance(target_size, int):
  291. target_size = [target_size, target_size]
  292. _check_image_size(target_size)
  293. self.target_size = target_size
  294. self.keep_ratio = keep_ratio
  295. def apply(self, img):
  296. """apply"""
  297. target_size = self.target_size
  298. original_size = img.shape[:2][::-1]
  299. if self.keep_ratio:
  300. h, w = img.shape[0:2]
  301. target_size, _ = self._rescale_size((w, h), self.target_size)
  302. if self.size_divisor:
  303. target_size = [
  304. math.ceil(i / self.size_divisor) * self.size_divisor
  305. for i in target_size
  306. ]
  307. img_scale_w, img_scale_h = [
  308. target_size[0] / original_size[0],
  309. target_size[1] / original_size[1],
  310. ]
  311. img = F.resize(img, target_size, interp=self.interp)
  312. return {
  313. "img": img,
  314. "img_size": [img.shape[1], img.shape[0]],
  315. "scale_factors": [img_scale_w, img_scale_h],
  316. }
  317. class ResizeByLong(_BaseResize):
  318. """
  319. Proportionally resize the image by specifying the target length of the
  320. longest side.
  321. """
  322. INPUT_KEYS = "img"
  323. OUTPUT_KEYS = ["img", "img_size"]
  324. DEAULT_INPUTS = {"img": "img"}
  325. DEAULT_OUTPUTS = {"img": "img", "img_size": "img_size"}
  326. def __init__(self, target_long_edge, size_divisor=None, interp="LINEAR"):
  327. """
  328. Initialize the instance.
  329. Args:
  330. target_long_edge (int): Target length of the longest side of image.
  331. size_divisor (int|None, optional): Divisor of resized image size.
  332. Default: None.
  333. interp (str, optional): Interpolation method. Choices are 'NEAREST',
  334. 'LINEAR', 'CUBIC', 'AREA', and 'LANCZOS4'. Default: 'LINEAR'.
  335. """
  336. super().__init__(size_divisor=size_divisor, interp=interp)
  337. self.target_long_edge = target_long_edge
  338. def apply(self, img):
  339. """apply"""
  340. h, w = img.shape[:2]
  341. scale = self.target_long_edge / max(h, w)
  342. h_resize = round(h * scale)
  343. w_resize = round(w * scale)
  344. if self.size_divisor is not None:
  345. h_resize = math.ceil(h_resize / self.size_divisor) * self.size_divisor
  346. w_resize = math.ceil(w_resize / self.size_divisor) * self.size_divisor
  347. img = F.resize(img, (w_resize, h_resize), interp=self.interp)
  348. return {"img": img, "img_size": [img.shape[1], img.shape[0]]}
  349. class ResizeByShort(_BaseResize):
  350. """
  351. Proportionally resize the image by specifying the target length of the
  352. shortest side.
  353. """
  354. INPUT_KEYS = "img"
  355. OUTPUT_KEYS = ["img", "img_size"]
  356. DEAULT_INPUTS = {"img": "img"}
  357. DEAULT_OUTPUTS = {"img": "img", "img_size": "img_size"}
  358. def __init__(self, target_short_edge, size_divisor=None, interp="LINEAR"):
  359. """
  360. Initialize the instance.
  361. Args:
  362. target_short_edge (int): Target length of the shortest side of image.
  363. size_divisor (int|None, optional): Divisor of resized image size.
  364. Default: None.
  365. interp (str, optional): Interpolation method. Choices are 'NEAREST',
  366. 'LINEAR', 'CUBIC', 'AREA', and 'LANCZOS4'. Default: 'LINEAR'.
  367. """
  368. super().__init__(size_divisor=size_divisor, interp=interp)
  369. self.target_short_edge = target_short_edge
  370. def apply(self, img):
  371. """apply"""
  372. h, w = img.shape[:2]
  373. scale = self.target_short_edge / min(h, w)
  374. h_resize = round(h * scale)
  375. w_resize = round(w * scale)
  376. if self.size_divisor is not None:
  377. h_resize = math.ceil(h_resize / self.size_divisor) * self.size_divisor
  378. w_resize = math.ceil(w_resize / self.size_divisor) * self.size_divisor
  379. img = F.resize(img, (w_resize, h_resize), interp=self.interp)
  380. return {"img": img, "img_size": [img.shape[1], img.shape[0]]}
  381. class Pad(BaseComponent):
  382. """Pad the image."""
  383. INPUT_KEYS = "img"
  384. OUTPUT_KEYS = ["img", "img_size"]
  385. DEAULT_INPUTS = {"img": "img"}
  386. DEAULT_OUTPUTS = {"img": "img", "img_size": "img_size"}
  387. def __init__(self, target_size, val=127.5):
  388. """
  389. Initialize the instance.
  390. Args:
  391. target_size (list|tuple|int): Target width and height of the image after
  392. padding.
  393. val (float, optional): Value to fill the padded area. Default: 127.5.
  394. """
  395. super().__init__()
  396. if isinstance(target_size, int):
  397. target_size = [target_size, target_size]
  398. _check_image_size(target_size)
  399. self.target_size = target_size
  400. self.val = val
  401. def apply(self, img):
  402. """apply"""
  403. h, w = img.shape[:2]
  404. tw, th = self.target_size
  405. ph = th - h
  406. pw = tw - w
  407. if ph < 0 or pw < 0:
  408. raise ValueError(
  409. f"Input image ({w}, {h}) smaller than the target size ({tw}, {th})."
  410. )
  411. else:
  412. img = F.pad(img, pad=(0, ph, 0, pw), val=self.val)
  413. return {"img": img, "img_size": [img.shape[1], img.shape[0]]}
  414. class PadStride(BaseComponent):
  415. """padding image for model with FPN , instead PadBatch(pad_to_stride, pad_gt) in original config
  416. Args:
  417. stride (bool): model with FPN need image shape % stride == 0
  418. """
  419. INPUT_KEYS = "img"
  420. OUTPUT_KEYS = "img"
  421. DEAULT_INPUTS = {"img": "img"}
  422. DEAULT_OUTPUTS = {"img": "img"}
  423. def __init__(self, stride=0):
  424. super().__init__()
  425. self.coarsest_stride = stride
  426. def apply(self, img):
  427. """
  428. Args:
  429. im (np.ndarray): image (np.ndarray)
  430. Returns:
  431. im (np.ndarray): processed image (np.ndarray)
  432. """
  433. im = img
  434. coarsest_stride = self.coarsest_stride
  435. if coarsest_stride <= 0:
  436. return {"img": im}
  437. im_c, im_h, im_w = im.shape
  438. pad_h = int(np.ceil(float(im_h) / coarsest_stride) * coarsest_stride)
  439. pad_w = int(np.ceil(float(im_w) / coarsest_stride) * coarsest_stride)
  440. padding_im = np.zeros((im_c, pad_h, pad_w), dtype=np.float32)
  441. padding_im[:, :im_h, :im_w] = im
  442. return {"img": padding_im}
  443. class Normalize(BaseComponent):
  444. """Normalize the image."""
  445. INPUT_KEYS = "img"
  446. OUTPUT_KEYS = "img"
  447. DEAULT_INPUTS = {"img": "img"}
  448. DEAULT_OUTPUTS = {"img": "img"}
  449. def __init__(self, scale=1.0 / 255, mean=0.5, std=0.5, preserve_dtype=False):
  450. """
  451. Initialize the instance.
  452. Args:
  453. scale (float, optional): Scaling factor to apply to the image before
  454. applying normalization. Default: 1/255.
  455. mean (float|tuple|list, optional): Means for each channel of the image.
  456. Default: 0.5.
  457. std (float|tuple|list, optional): Standard deviations for each channel
  458. of the image. Default: 0.5.
  459. preserve_dtype (bool, optional): Whether to preserve the original dtype
  460. of the image.
  461. """
  462. super().__init__()
  463. self.scale = np.float32(scale)
  464. if isinstance(mean, float):
  465. mean = [mean]
  466. self.mean = np.asarray(mean).astype("float32")
  467. if isinstance(std, float):
  468. std = [std]
  469. self.std = np.asarray(std).astype("float32")
  470. self.preserve_dtype = preserve_dtype
  471. def apply(self, img):
  472. """apply"""
  473. old_type = img.dtype
  474. # XXX: If `old_type` has higher precision than float32,
  475. # we will lose some precision.
  476. img = img.astype("float32", copy=False)
  477. img *= self.scale
  478. img -= self.mean
  479. img /= self.std
  480. if self.preserve_dtype:
  481. img = img.astype(old_type, copy=False)
  482. return {"img": img}
  483. class ToCHWImage(BaseComponent):
  484. """Reorder the dimensions of the image from HWC to CHW."""
  485. INPUT_KEYS = "img"
  486. OUTPUT_KEYS = "img"
  487. DEAULT_INPUTS = {"img": "img"}
  488. DEAULT_OUTPUTS = {"img": "img"}
  489. def apply(self, img):
  490. """apply"""
  491. img = img.transpose((2, 0, 1))
  492. return {"img": img}