| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615 |
- # copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- import os
- import sys
- import cv2
- import copy
- import math
- import pyclipper
- import numpy as np
- from PIL import Image
- from shapely.geometry import Polygon
- from ....utils import logging
- from ...base.predictor.io.writers import ImageWriter
- from ...base.predictor.io.readers import ImageReader
- from ...base.predictor import BaseTransform
- from .keys import TextDetKeys as K
# Names exported by ``from <this module> import *``.
__all__ = [
    'DetResizeForTest',
    'NormalizeImage',
    'DBPostProcess',
    'SaveTextDetResults',
]
class DetResizeForTest(BaseTransform):
    """Resize an image before it is fed to the text-detection model.

    The strategy is chosen from the keyword arguments given to ``__init__``:

    * ``image_shape`` (optionally ``keep_ratio``): resize to a fixed
      ``(height, width)``; ``resize_type`` is 1.
    * ``limit_side_len`` (optionally ``limit_type``, default ``'min'``):
      scale by a side-length limit and round each side to a multiple of 32;
      ``resize_type`` is 0.
    * ``resize_long`` : scale the longer side to ``resize_long`` and round each
      side up to a multiple of 128; ``resize_type`` is 2.
    * none of the above: ``limit_side_len=736`` with ``limit_type='min'``.
    """

    def __init__(self, **kwargs):
        super(DetResizeForTest, self).__init__()
        self.resize_type = 0
        self.keep_ratio = False
        if 'image_shape' in kwargs:
            self.image_shape = kwargs['image_shape']
            self.resize_type = 1
            if 'keep_ratio' in kwargs:
                self.keep_ratio = kwargs['keep_ratio']
        elif 'limit_side_len' in kwargs:
            self.limit_side_len = kwargs['limit_side_len']
            self.limit_type = kwargs.get('limit_type', 'min')
        elif 'resize_long' in kwargs:
            self.resize_type = 2
            self.resize_long = kwargs.get('resize_long', 960)
        else:
            self.limit_side_len = 736
            self.limit_type = 'min'

    def apply(self, data):
        """Resize ``data[K.IMAGE]`` and record the scaling in ``data[K.SHAPE]``.

        ``data[K.SHAPE]`` is ``np.array([src_h, src_w, ratio_h, ratio_w])``.
        ``src_h``/``src_w`` are read before any padding, while the ratios are
        computed by the resize helper on the (possibly padded) image.
        """
        img = data[K.IMAGE]
        src_h, src_w, _ = img.shape
        # Very small inputs are padded up to at least 32x32 before resizing.
        if sum([src_h, src_w]) < 64:
            img = self.image_padding(img)

        if self.resize_type == 0:
            img, [ratio_h, ratio_w] = self.resize_image_type0(img)
        elif self.resize_type == 2:
            img, [ratio_h, ratio_w] = self.resize_image_type2(img)
        else:
            img, [ratio_h, ratio_w] = self.resize_image_type1(img)

        data[K.IMAGE] = img
        data[K.SHAPE] = np.array([src_h, src_w, ratio_h, ratio_w])
        return data

    @classmethod
    def get_input_keys(cls):
        """ get input keys """
        return [K.IMAGE]

    @classmethod
    def get_output_keys(cls):
        """ get output keys """
        return [K.IMAGE, K.SHAPE]

    def image_padding(self, im, value=0):
        """Pad ``im`` (h, w, c) to at least 32x32, filling with ``value``.

        The original pixels are placed in the top-left corner.
        """
        h, w, c = im.shape
        im_pad = np.zeros((max(32, h), max(32, w), c), np.uint8) + value
        im_pad[:h, :w, :] = im
        return im_pad

    def resize_image_type1(self, img):
        """Resize ``img`` to ``self.image_shape`` (height, width).

        When ``self.keep_ratio`` is ``True`` the target width is derived from
        the target height and the source aspect ratio, then rounded up to a
        multiple of 32.

        Returns:
            tuple: ``(img, [ratio_h, ratio_w])``.
        """
        resize_h, resize_w = self.image_shape
        ori_h, ori_w = img.shape[:2]  # (h, w, c)
        if self.keep_ratio is True:
            resize_w = ori_w * resize_h / ori_h
            resize_w = math.ceil(resize_w / 32) * 32
        ratio_h = float(resize_h) / ori_h
        ratio_w = float(resize_w) / ori_w
        img = cv2.resize(img, (int(resize_w), int(resize_h)))
        return img, [ratio_h, ratio_w]

    def resize_image_type0(self, img):
        """Resize ``img`` so that both sides are multiples of 32.

        The scale ratio depends on ``self.limit_type``:

        * ``'max'``: shrink so the longer side equals ``limit_side_len`` when it
          exceeds it, otherwise keep the size.
        * ``'min'``: enlarge so the shorter side equals ``limit_side_len`` when
          it is below it, otherwise keep the size.
        * ``'resize_long'``: always scale the longer side to ``limit_side_len``.

        Args:
            img (np.ndarray): array with shape [h, w, c].

        Returns:
            tuple: ``(img, [ratio_h, ratio_w])``.

        Raises:
            ValueError: if ``self.limit_type`` is not one of the values above.
            Exception: whatever ``cv2.resize`` raises is logged and re-raised.
        """
        limit_side_len = self.limit_side_len
        h, w, _ = img.shape

        if self.limit_type == 'max':
            longer = max(h, w)
            ratio = float(limit_side_len) / longer if longer > limit_side_len else 1.
        elif self.limit_type == 'min':
            shorter = min(h, w)
            ratio = float(limit_side_len) / shorter if shorter < limit_side_len else 1.
        elif self.limit_type == 'resize_long':
            ratio = float(limit_side_len) / max(h, w)
        else:
            raise ValueError('not support limit type, image ')

        # Round each side to the nearest multiple of 32, never below 32, so the
        # target size is always strictly positive.
        resize_h = max(int(round(int(h * ratio) / 32) * 32), 32)
        resize_w = max(int(round(int(w * ratio) / 32) * 32), 32)

        try:
            img = cv2.resize(img, (resize_w, resize_h))
        except Exception:
            # Report the offending sizes, then let the caller see the real
            # error instead of terminating the interpreter.
            logging.warning(
                "Failed to resize image with shape {} to (w={}, h={}).".format(
                    img.shape, resize_w, resize_h))
            raise

        ratio_h = resize_h / float(h)
        ratio_w = resize_w / float(w)
        return img, [ratio_h, ratio_w]

    def resize_image_type2(self, img):
        """Scale the longer side to ``self.resize_long``, rounding up to 128.

        Returns:
            tuple: ``(img, [ratio_h, ratio_w])``.
        """
        h, w, _ = img.shape
        ratio = float(self.resize_long) / max(h, w)
        resize_h = int(h * ratio)
        resize_w = int(w * ratio)

        # Round both sides up to the next multiple of ``max_stride``.
        max_stride = 128
        resize_h = (resize_h + max_stride - 1) // max_stride * max_stride
        resize_w = (resize_w + max_stride - 1) // max_stride * max_stride
        img = cv2.resize(img, (int(resize_w), int(resize_h)))

        ratio_h = resize_h / float(h)
        ratio_w = resize_w / float(w)
        return img, [ratio_h, ratio_w]
class NormalizeImage(BaseTransform):
    """Normalize an image: multiply by ``scale``, subtract ``mean``, divide by ``std``.

    Args:
        scale (float | str | None): multiplicative factor; a string is
            evaluated as a Python expression. Defaults to ``1.0 / 255.0``.
        mean (list | None): per-channel mean. Defaults to
            ``[0.485, 0.456, 0.406]``.
        std (list | None): per-channel std. Defaults to
            ``[0.229, 0.224, 0.225]``.
        order (str): ``'chw'`` reshapes mean/std to ``(3, 1, 1)``; any other
            value reshapes them to ``(1, 1, 3)``.
    """

    def __init__(self, scale=None, mean=None, std=None, order='chw', **kwargs):
        super().__init__()
        if isinstance(scale, str):
            # NOTE(review): ``eval`` executes arbitrary code. This is only
            # acceptable if ``scale`` always comes from trusted configuration;
            # confirm against callers before exposing it to external input.
            scale = eval(scale)
        self.scale = np.float32(scale if scale is not None else 1.0 / 255.0)
        mean = mean if mean is not None else [0.485, 0.456, 0.406]
        std = std if std is not None else [0.229, 0.224, 0.225]

        shape = (3, 1, 1) if order == 'chw' else (1, 1, 3)
        self.mean = np.array(mean).reshape(shape).astype('float32')
        self.std = np.array(std).reshape(shape).astype('float32')

    def apply(self, data):
        """Replace ``data[K.IMAGE]`` with its normalized float32 version.

        A PIL image is converted to a NumPy array first; any other non-array
        input trips the assertion.
        """
        img = data[K.IMAGE]
        if isinstance(img, Image.Image):
            img = np.array(img)
        assert isinstance(img,
                          np.ndarray), "invalid input 'img' in NormalizeImage"
        data[K.IMAGE] = (
            img.astype('float32') * self.scale - self.mean) / self.std
        return data

    @classmethod
    def get_input_keys(cls):
        """ get input keys """
        return [K.IMAGE]

    @classmethod
    def get_output_keys(cls):
        """ get output keys """
        return [K.IMAGE]
class DBPostProcess(BaseTransform):
    """
    The post process for Differentiable Binarization (DB).

    Args:
        thresh (float): the probability map is binarized as ``pred > thresh``.
        box_thresh (float): candidates whose score is below this are dropped.
        max_candidates (int): maximum number of contours examined.
        unclip_ratio (float): expansion ratio passed to ``unclip``.
        use_dilation (bool): dilate the binary map with a 2x2 kernel of ones
            before extracting contours.
        score_mode (str): ``"fast"`` or ``"slow"``; selects the scoring
            function used by ``boxes_from_bitmap``.
        box_type (str): ``'quad'`` or ``'poly'``; selects the extraction
            method used by ``apply``.
    """

    def __init__(self,
                 thresh=0.3,
                 box_thresh=0.7,
                 max_candidates=1000,
                 unclip_ratio=2.0,
                 use_dilation=False,
                 score_mode="fast",
                 box_type='quad',
                 **kwargs):
        super().__init__()
        self.thresh = thresh
        self.box_thresh = box_thresh
        self.max_candidates = max_candidates
        self.unclip_ratio = unclip_ratio
        self.min_size = 3
        self.score_mode = score_mode
        self.box_type = box_type
        assert score_mode in [
            "slow", "fast"
        ], "Score mode must be in [slow, fast] but got: {}".format(score_mode)

        self.dilation_kernel = None if not use_dilation else np.array([[1, 1],
                                                                       [1, 1]])

    @staticmethod
    def _find_contours(bitmap):
        """Return the contours of a {0, 1} bitmap.

        ``cv2.findContours`` returns either ``(image, contours, hierarchy)`` or
        ``(contours, hierarchy)`` depending on the OpenCV version; the contours
        are the second-to-last element in both layouts.
        """
        outs = cv2.findContours((bitmap * 255).astype(np.uint8), cv2.RETR_LIST,
                                cv2.CHAIN_APPROX_SIMPLE)
        return outs[-2]

    @staticmethod
    def _rescale_box(box, width, height, dest_width, dest_height):
        """Map ``box`` in place from bitmap to destination coordinates, clipped."""
        box[:, 0] = np.clip(
            np.round(box[:, 0] / width * dest_width), 0, dest_width)
        box[:, 1] = np.clip(
            np.round(box[:, 1] / height * dest_height), 0, dest_height)
        return box

    def polygons_from_bitmap(self, pred, _bitmap, dest_width, dest_height):
        """Extract polygon candidates from a binary map.

        Args:
            pred: score map indexed like ``_bitmap``.
            _bitmap: 2-D map (H, W) whose values are binarized as {0, 1}.
            dest_width, dest_height: size the coordinates are rescaled to.

        Returns:
            tuple: ``(boxes, scores)`` where each box is a list of ``[x, y]``.
        """
        bitmap = _bitmap
        height, width = bitmap.shape

        boxes = []
        scores = []

        contours = self._find_contours(bitmap)
        for contour in contours[:self.max_candidates]:
            epsilon = 0.002 * cv2.arcLength(contour, True)
            approx = cv2.approxPolyDP(contour, epsilon, True)
            points = approx.reshape((-1, 2))
            if points.shape[0] < 4:
                continue

            score = self.box_score_fast(pred, points.reshape(-1, 2))
            if self.box_thresh > score:
                continue

            box = self.unclip(points, self.unclip_ratio)
            # Keep only results made of exactly one path: several paths were
            # already rejected, and an empty result cannot be processed by
            # cv2.minAreaRect below.
            if len(box) != 1:
                continue
            box = box.reshape(-1, 2)

            _, sside = self.get_mini_boxes(box.reshape((-1, 1, 2)))
            if sside < self.min_size + 2:
                continue

            box = self._rescale_box(
                np.array(box), width, height, dest_width, dest_height)
            boxes.append(box.tolist())
            scores.append(score)
        return boxes, scores

    def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height):
        """Extract quadrilateral candidates from a binary map.

        Args:
            pred: score map indexed like ``_bitmap``.
            _bitmap: 2-D map (H, W) whose values are binarized as {0, 1}.
            dest_width, dest_height: size the coordinates are rescaled to.

        Returns:
            tuple: ``(boxes, scores)`` with ``boxes`` an ``np.int16`` array.
        """
        bitmap = _bitmap
        height, width = bitmap.shape

        contours = self._find_contours(bitmap)
        num_contours = min(len(contours), self.max_candidates)

        boxes = []
        scores = []
        for contour in contours[:num_contours]:
            points, sside = self.get_mini_boxes(contour)
            if sside < self.min_size:
                continue
            points = np.array(points)

            if self.score_mode == "fast":
                score = self.box_score_fast(pred, points.reshape(-1, 2))
            else:
                score = self.box_score_slow(pred, contour)
            if self.box_thresh > score:
                continue

            expanded = self.unclip(points, self.unclip_ratio)
            # An empty offset result cannot be turned into a rectangle.
            if expanded.size == 0:
                continue
            box, sside = self.get_mini_boxes(expanded.reshape(-1, 1, 2))
            if sside < self.min_size + 2:
                continue

            box = self._rescale_box(
                np.array(box), width, height, dest_width, dest_height)
            boxes.append(box.astype(np.int16))
            scores.append(score)
        return np.array(boxes, dtype=np.int16), scores

    def unclip(self, box, unclip_ratio):
        """Expand ``box`` outward with a pyclipper offset.

        The offset distance is ``poly.area * unclip_ratio / poly.length`` of
        the shapely polygon built from ``box``.

        Returns:
            np.ndarray: the array built from the offset result paths.
        """
        poly = Polygon(box)
        distance = poly.area * unclip_ratio / poly.length
        offset = pyclipper.PyclipperOffset()
        offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
        expanded = np.array(offset.Execute(distance))
        return expanded

    def get_mini_boxes(self, contour):
        """Return the min-area rectangle of ``contour`` and its shorter side.

        The four corners are sorted by x; within the two left-most and the two
        right-most points the one with the smaller y comes first, giving the
        order [left/small-y, right/small-y, right/large-y, left/large-y].

        Returns:
            tuple: ``(box, short_side)`` with ``box`` a list of four points.
        """
        bounding_box = cv2.minAreaRect(contour)
        points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0])

        if points[1][1] > points[0][1]:
            index_1, index_4 = 0, 1
        else:
            index_1, index_4 = 1, 0
        if points[3][1] > points[2][1]:
            index_2, index_3 = 2, 3
        else:
            index_2, index_3 = 3, 2

        box = [
            points[index_1], points[index_2], points[index_3], points[index_4]
        ]
        return box, min(bounding_box[1])

    def box_score_fast(self, bitmap, _box):
        """ box_score_fast: use bbox mean score as the mean score """
        h, w = bitmap.shape[:2]
        box = _box.copy()
        # Axis-aligned bounds of the box, clamped to the bitmap.
        xmin = np.clip(np.floor(box[:, 0].min()).astype("int"), 0, w - 1)
        xmax = np.clip(np.ceil(box[:, 0].max()).astype("int"), 0, w - 1)
        ymin = np.clip(np.floor(box[:, 1].min()).astype("int"), 0, h - 1)
        ymax = np.clip(np.ceil(box[:, 1].max()).astype("int"), 0, h - 1)

        # Rasterize the box into a local mask and average the scores under it.
        mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8)
        box[:, 0] = box[:, 0] - xmin
        box[:, 1] = box[:, 1] - ymin
        cv2.fillPoly(mask, box.reshape(1, -1, 2).astype(np.int32), 1)
        return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0]

    def box_score_slow(self, bitmap, contour):
        """ box_score_slow: use polygon mean score as the mean score """
        h, w = bitmap.shape[:2]
        contour = contour.copy()
        contour = np.reshape(contour, (-1, 2))

        xmin = np.clip(np.min(contour[:, 0]), 0, w - 1)
        xmax = np.clip(np.max(contour[:, 0]), 0, w - 1)
        ymin = np.clip(np.min(contour[:, 1]), 0, h - 1)
        ymax = np.clip(np.max(contour[:, 1]), 0, h - 1)

        mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8)

        contour[:, 0] = contour[:, 0] - xmin
        contour[:, 1] = contour[:, 1] - ymin

        cv2.fillPoly(mask, contour.reshape(1, -1, 2).astype(np.int32), 1)
        return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0]

    def apply(self, data):
        """Turn ``data[K.PROB_MAP]`` into detections.

        Writes ``data[K.DT_POLYS]`` and ``data[K.DT_SCORES]``.

        Raises:
            ValueError: if ``self.box_type`` is neither 'quad' nor 'poly'.
        """
        pred = data[K.PROB_MAP]
        # Only one shape entry is available, so only a batch of one sample can
        # be processed; a larger batch fails on the shape lookup below.
        shape_list = [data[K.SHAPE]]
        pred = pred[0][:, 0, :, :]
        segmentation = pred > self.thresh

        for batch_index in range(pred.shape[0]):
            src_h, src_w, ratio_h, ratio_w = shape_list[batch_index]
            if self.dilation_kernel is not None:
                mask = cv2.dilate(
                    np.array(segmentation[batch_index]).astype(np.uint8),
                    self.dilation_kernel)
            else:
                mask = segmentation[batch_index]

            if self.box_type == 'poly':
                boxes, scores = self.polygons_from_bitmap(pred[batch_index],
                                                          mask, src_w, src_h)
            elif self.box_type == 'quad':
                boxes, scores = self.boxes_from_bitmap(pred[batch_index], mask,
                                                       src_w, src_h)
            else:
                raise ValueError("box_type can only be one of ['quad', 'poly']")

            data[K.DT_POLYS] = boxes
            data[K.DT_SCORES] = scores

        return data

    @classmethod
    def get_input_keys(cls):
        """ get input keys """
        # NOTE(review): ``apply`` also reads ``K.SHAPE``, which is not declared
        # here - confirm whether it should be listed.
        return [K.PROB_MAP]

    @classmethod
    def get_output_keys(cls):
        """ get output keys """
        return [K.DT_POLYS, K.DT_SCORES]
class CropByPolys(BaseTransform):
    """Crop Image by Polys

    Args:
        det_box_type (str): ``'quad'`` crops each box directly with a
            perspective warp; any other value first reduces the polygon to its
            minimum-area rectangle.
    """

    def __init__(self, det_box_type='quad'):
        super().__init__()
        self.det_box_type = det_box_type

    def apply(self, data):
        """Crop every polygon in ``data[K.DT_POLYS]`` out of ``data[K.ORI_IM]``.

        The crops are stored as a list in ``data[K.SUB_IMGS]``.
        """
        ori_im = data[K.ORI_IM]
        # TODO
        # dt_boxes = self.sorted_boxes(data[K.DT_POLYS])
        img_crop_list = []
        # Polygons are converted one at a time: they may have different point
        # counts, so they cannot always be stacked into a single array.
        for poly in data[K.DT_POLYS]:
            # ``np.array`` makes a private float32 copy of the polygon.
            tmp_box = np.array(poly, dtype=np.float32)
            if self.det_box_type == "quad":
                img_crop = self.get_rotate_crop_image(ori_im, tmp_box)
            else:
                img_crop = self.get_minarea_rect_crop(ori_im, tmp_box)
            img_crop_list.append(img_crop)
        data[K.SUB_IMGS] = img_crop_list
        return data

    @classmethod
    def get_input_keys(cls):
        """ get input keys """
        # NOTE(review): ``apply`` reads ``K.ORI_IM`` rather than ``K.IM_PATH``
        # - confirm which key should be declared here.
        return [K.IM_PATH, K.DT_POLYS]

    @classmethod
    def get_output_keys(cls):
        """ get output keys """
        return [K.SUB_IMGS]

    def sorted_boxes(self, dt_boxes):
        """
        Sort text boxes in order from top to bottom, left to right

        Boxes are first ordered by the (y, x) of their first point; neighbours
        whose first-point y differs by less than 10 are then reordered by x.

        args:
            dt_boxes(array): detected text boxes; each box is indexed as
                ``box[0][0]`` (x) and ``box[0][1]`` (y)
        return:
            list of the sorted boxes
        """
        dt_boxes = np.array(dt_boxes)
        num_boxes = dt_boxes.shape[0]
        _boxes = sorted(dt_boxes, key=lambda x: (x[0][1], x[0][0]))

        for i in range(num_boxes - 1):
            for j in range(i, -1, -1):
                same_line = abs(_boxes[j + 1][0][1] - _boxes[j][0][1]) < 10
                if same_line and _boxes[j + 1][0][0] < _boxes[j][0][0]:
                    _boxes[j], _boxes[j + 1] = _boxes[j + 1], _boxes[j]
                else:
                    break
        return _boxes

    def get_minarea_rect_crop(self, img, points):
        """Crop the minimum-area rectangle enclosing ``points`` from ``img``.

        The rectangle corners are sorted by x; within the two left-most and the
        two right-most points the one with the smaller y comes first, then the
        result is handed to ``get_rotate_crop_image``.
        """
        bounding_box = cv2.minAreaRect(np.array(points).astype(np.int32))
        points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0])

        if points[1][1] > points[0][1]:
            index_a, index_d = 0, 1
        else:
            index_a, index_d = 1, 0
        if points[3][1] > points[2][1]:
            index_b, index_c = 2, 3
        else:
            index_b, index_c = 3, 2

        box = [
            points[index_a], points[index_b], points[index_c], points[index_d]
        ]
        crop_img = self.get_rotate_crop_image(img, np.array(box))
        return crop_img

    def get_rotate_crop_image(self, img, points):
        """Warp the quadrilateral ``points`` of ``img`` into an upright crop.

        The crop width/height are the longer of the two opposite edge lengths.
        A crop whose height is at least 1.5x its width is rotated by 90
        degrees with ``np.rot90``.

        Args:
            img (np.ndarray): source image.
            points: four ``[x, y]`` corners; converted to float32 because
                ``cv2.getPerspectiveTransform`` needs floating-point input.

        Returns:
            np.ndarray: the cropped (and possibly rotated) image.
        """
        points = np.asarray(points, dtype=np.float32)
        assert len(points) == 4, "shape of points must be 4*2"
        img_crop_width = int(
            max(
                np.linalg.norm(points[0] - points[1]),
                np.linalg.norm(points[2] - points[3])))
        img_crop_height = int(
            max(
                np.linalg.norm(points[0] - points[3]),
                np.linalg.norm(points[1] - points[2])))
        pts_std = np.float32([
            [0, 0],
            [img_crop_width, 0],
            [img_crop_width, img_crop_height],
            [0, img_crop_height],
        ])
        M = cv2.getPerspectiveTransform(points, pts_std)
        dst_img = cv2.warpPerspective(
            img,
            M,
            (img_crop_width, img_crop_height),
            borderMode=cv2.BORDER_REPLICATE,
            flags=cv2.INTER_CUBIC, )
        dst_img_height, dst_img_width = dst_img.shape[0:2]
        if dst_img_height * 1.0 / dst_img_width >= 1.5:
            dst_img = np.rot90(dst_img)
        return dst_img
class SaveTextDetResults(BaseTransform):
    """ Save Text Det Results

    Draws the detected polygons on the input image and writes the result to
    ``save_dir/file_name``.

    Args:
        save_dir (str | None): output directory; ``None`` disables saving.
        file_name (str | None): output file name; defaults to
            ``_DEFAULT_FILE_NAME``.
    """
    _DEFAULT_FILE_NAME = 'text_det_out.png'

    def __init__(self, save_dir, file_name=None):
        super().__init__()
        self.save_dir = save_dir
        if file_name is None:
            file_name = self._DEFAULT_FILE_NAME
        self.file_name = file_name
        # The visualisation is a NumPy array built with cv2 (see
        # ``draw_rectangle``), so it is written with the opencv backend.
        self._writer = ImageWriter(backend='opencv')

    def apply(self, data):
        """Draw ``data[K.DT_POLYS]`` on the image at ``data[K.IM_PATH]`` and save it.

        ``data`` is returned unchanged; nothing is written when ``save_dir``
        is ``None``.
        """
        if self.save_dir is None:
            logging.warning(
                "The `save_dir` has been set to None, so the text detection result won't to be saved."
            )
            return data
        save_path = os.path.join(self.save_dir, self.file_name)
        bbox_res = data[K.DT_POLYS]
        vis_img = self.draw_rectangle(data[K.IM_PATH], bbox_res)
        self._writer.write(save_path, vis_img)
        return data

    @classmethod
    def get_input_keys(cls):
        """ get input keys """
        return [K.IM_PATH, K.DT_POLYS, K.DT_SCORES]

    @classmethod
    def get_output_keys(cls):
        """ get output keys """
        return []

    def draw_rectangle(self, img_path, boxes):
        """Return a copy of the image at ``img_path`` with ``boxes`` outlined.

        Args:
            img_path (str): path of the image to load with ``cv2.imread``.
            boxes: iterable of polygons, each a sequence of ``[x, y]`` points.

        Returns:
            np.ndarray: the image with every polygon drawn as a closed outline
            of thickness 2 in colour ``(0, 0, 255)``.

        Raises:
            ValueError: if the image cannot be read.
        """
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals failure by returning None, not by raising.
            raise ValueError(
                "Failed to read image from: {}".format(img_path))
        img_show = img.copy()
        # Convert each polygon on its own: polygons may have different point
        # counts and then cannot be stacked into one array.
        for box in boxes:
            pts = np.array(box).astype(np.int32).reshape(-1, 1, 2)
            cv2.polylines(img_show, [pts], True, (0, 0, 255), 2)
        return img_show
|