# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
import cv2
import copy
import math
import pyclipper
import numpy as np
from PIL import Image
from shapely.geometry import Polygon

from ....utils import logging
from ...base.predictor.io.writers import ImageWriter
from ...base.predictor.io.readers import ImageReader
from ...base.predictor import BaseTransform
from .keys import TextDetKeys as K

__all__ = [
    'DetResizeForTest', 'NormalizeImage', 'DBPostProcess', 'CropByPolys',
    'SaveTextDetResults'
]


class DetResizeForTest(BaseTransform):
    """ Resize an input image before it is fed to the text detection model.

    The resize strategy is selected from the keyword arguments:

    * ``image_shape`` (optionally with ``keep_ratio``): resize to a fixed
      ``(h, w)`` (``resize_type == 1``).
    * ``limit_side_len`` (optionally with ``limit_type``): scale so that a
      side meets the limit, then snap to a multiple of 32
      (``resize_type == 0``).
    * ``resize_long``: scale the long side to ``resize_long`` and round both
      sides up to a multiple of 128 (``resize_type == 2``).
    * nothing recognised: ``limit_side_len=736`` with ``limit_type='min'``.
    """

    def __init__(self, **kwargs):
        super().__init__()
        self.resize_type = 0
        self.keep_ratio = False
        if 'image_shape' in kwargs:
            self.image_shape = kwargs['image_shape']
            self.resize_type = 1
            if 'keep_ratio' in kwargs:
                self.keep_ratio = kwargs['keep_ratio']
        elif 'limit_side_len' in kwargs:
            self.limit_side_len = kwargs['limit_side_len']
            self.limit_type = kwargs.get('limit_type', 'min')
        elif 'resize_long' in kwargs:
            self.resize_type = 2
            self.resize_long = kwargs.get('resize_long', 960)
        else:
            self.limit_side_len = 736
            self.limit_type = 'min'

    def apply(self, data):
        """ Resize ``data[K.IMAGE]`` and record the shape information.

        Writes the resized image back to ``data[K.IMAGE]`` and stores
        ``np.array([src_h, src_w, ratio_h, ratio_w])`` in ``data[K.SHAPE]``,
        where ``src_h``/``src_w`` are the height/width of the image as it
        arrived (before the optional padding below).
        """
        img = data[K.IMAGE]
        src_h, src_w, _ = img.shape
        # Very small images are padded up to at least 32x32 before resizing.
        if src_h + src_w < 64:
            img = self.image_padding(img)

        if self.resize_type == 0:
            img, [ratio_h, ratio_w] = self.resize_image_type0(img)
        elif self.resize_type == 2:
            img, [ratio_h, ratio_w] = self.resize_image_type2(img)
        else:
            img, [ratio_h, ratio_w] = self.resize_image_type1(img)

        data[K.IMAGE] = img
        data[K.SHAPE] = np.array([src_h, src_w, ratio_h, ratio_w])
        return data

    @classmethod
    def get_input_keys(cls):
        """ get input keys """
        return [K.IMAGE]

    @classmethod
    def get_output_keys(cls):
        """ get output keys """
        return [K.IMAGE, K.SHAPE]

    def image_padding(self, im, value=0):
        """ Pad ``im`` at the bottom/right so each side is at least 32.

        Args:
            im (np.ndarray): image indexed as ``[h, w, c]``.
            value: fill value for the padded area.

        Returns:
            np.ndarray: padded image with the original content in the
            top-left corner.
        """
        h, w, c = im.shape
        im_pad = np.zeros((max(32, h), max(32, w), c), np.uint8) + value
        im_pad[:h, :w, :] = im
        return im_pad

    def resize_image_type1(self, img):
        """ Resize the image to ``self.image_shape``.

        When ``self.keep_ratio`` is ``True`` the target width is derived from
        the target height and the source aspect ratio, then rounded up to a
        multiple of 32.

        Returns:
            tuple: ``(img, [ratio_h, ratio_w])``.
        """
        resize_h, resize_w = self.image_shape
        ori_h, ori_w = img.shape[:2]  # (h, w, c)
        if self.keep_ratio is True:
            resize_w = ori_w * resize_h / ori_h
            resize_w = math.ceil(resize_w / 32) * 32
        ratio_h = float(resize_h) / ori_h
        ratio_w = float(resize_w) / ori_w
        img = cv2.resize(img, (int(resize_w), int(resize_h)))
        return img, [ratio_h, ratio_w]

    def resize_image_type0(self, img):
        """ Resize the image so that both sides are multiples of 32.

        Args:
            img (np.ndarray): array indexed as ``[h, w, c]``.

        Returns:
            tuple: ``(img, [ratio_h, ratio_w])``.

        Raises:
            Exception: if ``self.limit_type`` is not one of ``'max'``,
                ``'min'`` or ``'resize_long'``; any error raised by
                ``cv2.resize`` is logged and re-raised.
        """
        limit_side_len = self.limit_side_len
        h, w, c = img.shape

        if self.limit_type == 'max':
            # Shrink only when the longer side exceeds the limit.
            if max(h, w) > limit_side_len:
                if h > w:
                    ratio = float(limit_side_len) / h
                else:
                    ratio = float(limit_side_len) / w
            else:
                ratio = 1.
        elif self.limit_type == 'min':
            # Enlarge only when the shorter side is below the limit.
            if min(h, w) < limit_side_len:
                if h < w:
                    ratio = float(limit_side_len) / h
                else:
                    ratio = float(limit_side_len) / w
            else:
                ratio = 1.
        elif self.limit_type == 'resize_long':
            ratio = float(limit_side_len) / max(h, w)
        else:
            raise Exception('not support limit type, image ')

        resize_h = int(h * ratio)
        resize_w = int(w * ratio)

        # Snap to the nearest multiple of 32, never going below 32, so the
        # target size is always strictly positive.
        resize_h = max(int(round(resize_h / 32) * 32), 32)
        resize_w = max(int(round(resize_w / 32) * 32), 32)

        try:
            img = cv2.resize(img, (resize_w, resize_h))
        except Exception:
            # Log the offending sizes and propagate the error; the previous
            # implementation terminated the whole process with exit code 0.
            logging.info(
                "Failed to resize image of shape {} to (w={}, h={})".format(
                    img.shape, resize_w, resize_h))
            raise
        ratio_h = resize_h / float(h)
        ratio_w = resize_w / float(w)
        return img, [ratio_h, ratio_w]

    def resize_image_type2(self, img):
        """ Scale the long side to ``self.resize_long``.

        Both sides are then rounded up to a multiple of 128.

        Returns:
            tuple: ``(img, [ratio_h, ratio_w])``.
        """
        h, w, _ = img.shape

        resize_w = w
        resize_h = h

        if resize_h > resize_w:
            ratio = float(self.resize_long) / resize_h
        else:
            ratio = float(self.resize_long) / resize_w

        resize_h = int(resize_h * ratio)
        resize_w = int(resize_w * ratio)

        max_stride = 128
        resize_h = (resize_h + max_stride - 1) // max_stride * max_stride
        resize_w = (resize_w + max_stride - 1) // max_stride * max_stride
        img = cv2.resize(img, (int(resize_w), int(resize_h)))
        ratio_h = resize_h / float(h)
        ratio_w = resize_w / float(w)

        return img, [ratio_h, ratio_w]


class NormalizeImage(BaseTransform):
    """ Normalize an image: ``(img * scale - mean) / std``.

    Args:
        scale (float|str|None): multiplier applied to the pixels; a string is
            evaluated as a Python expression. Defaults to ``1.0 / 255.0``.
        mean (list|None): per-channel mean, default
            ``[0.485, 0.456, 0.406]``.
        std (list|None): per-channel std, default ``[0.229, 0.224, 0.225]``.
        order (str): ``'chw'`` reshapes mean/std to ``(3, 1, 1)``; any other
            value reshapes them to ``(1, 1, 3)``.
    """

    def __init__(self, scale=None, mean=None, std=None, order='chw',
                 **kwargs):
        super().__init__()
        if isinstance(scale, str):
            # NOTE(review): `eval` executes arbitrary code. It is kept so
            # that expression strings such as "1./255." keep working, but
            # `scale` must only come from trusted configuration.
            scale = eval(scale)
        self.scale = np.float32(scale if scale is not None else 1.0 / 255.0)
        mean = mean if mean is not None else [0.485, 0.456, 0.406]
        std = std if std is not None else [0.229, 0.224, 0.225]

        shape = (3, 1, 1) if order == 'chw' else (1, 1, 3)
        self.mean = np.array(mean).reshape(shape).astype('float32')
        self.std = np.array(std).reshape(shape).astype('float32')

    def apply(self, data):
        """ Normalize ``data[K.IMAGE]`` in place and return ``data``.

        A ``PIL.Image.Image`` input is converted to a numpy array first.
        """
        img = data[K.IMAGE]
        if isinstance(img, Image.Image):
            img = np.array(img)
        assert isinstance(img,
                          np.ndarray), "invalid input 'img' in NormalizeImage"
        data[K.IMAGE] = (
            img.astype('float32') * self.scale - self.mean) / self.std
        return data

    @classmethod
    def get_input_keys(cls):
        """ get input keys """
        return [K.IMAGE]

    @classmethod
    def get_output_keys(cls):
        """ get output keys """
        return [K.IMAGE]


class DBPostProcess(BaseTransform):
    """ The post process for Differentiable Binarization (DB).

    Args:
        thresh (float): probability threshold used to binarize the map.
        box_thresh (float): boxes whose score is below this are dropped.
        max_candidates (int): maximum number of contours that are examined.
        unclip_ratio (float): expansion ratio passed to :meth:`unclip`.
        use_dilation (bool): dilate the binary map with a 2x2 kernel first.
        score_mode (str): ``'fast'`` or ``'slow'`` (quad boxes only).
        box_type (str): ``'quad'`` or ``'poly'``.
    """

    def __init__(self,
                 thresh=0.3,
                 box_thresh=0.7,
                 max_candidates=1000,
                 unclip_ratio=2.0,
                 use_dilation=False,
                 score_mode="fast",
                 box_type='quad',
                 **kwargs):
        super().__init__()
        self.thresh = thresh
        self.box_thresh = box_thresh
        self.max_candidates = max_candidates
        self.unclip_ratio = unclip_ratio
        self.min_size = 3
        self.score_mode = score_mode
        self.box_type = box_type
        assert score_mode in [
            "slow", "fast"
        ], "Score mode must be in [slow, fast] but got: {}".format(score_mode)

        self.dilation_kernel = None if not use_dilation else np.array(
            [[1, 1], [1, 1]])

    def polygons_from_bitmap(self, pred, _bitmap, dest_width, dest_height):
        """ Extract polygon boxes from a binarized map.

        Args:
            pred: probability map used for scoring.
            _bitmap: 2-D binarized map (unpacked as ``height, width``).
            dest_width, dest_height: size the coordinates are rescaled to.

        Returns:
            tuple: ``(boxes, scores)`` where each box is a list of
            ``[x, y]`` points.
        """
        bitmap = _bitmap
        height, width = bitmap.shape

        boxes = []
        scores = []

        contours, _ = cv2.findContours((bitmap * 255).astype(np.uint8),
                                       cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)

        for contour in contours[:self.max_candidates]:
            epsilon = 0.002 * cv2.arcLength(contour, True)
            approx = cv2.approxPolyDP(contour, epsilon, True)
            points = approx.reshape((-1, 2))
            # Polygons with fewer than 4 vertices are ignored.
            if points.shape[0] < 4:
                continue

            score = self.box_score_fast(pred, points.reshape(-1, 2))
            if self.box_thresh > score:
                continue

            box = self.unclip(points, self.unclip_ratio)
            # Skip results where the offset produced more than one path.
            if len(box) > 1:
                continue
            box = box.reshape(-1, 2)

            _, sside = self.get_mini_boxes(box.reshape((-1, 1, 2)))
            if sside < self.min_size + 2:
                continue

            box = np.array(box)
            box[:, 0] = np.clip(
                np.round(box[:, 0] / width * dest_width), 0, dest_width)
            box[:, 1] = np.clip(
                np.round(box[:, 1] / height * dest_height), 0, dest_height)
            boxes.append(box.tolist())
            scores.append(score)
        return boxes, scores

    def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height):
        """ Extract quadrilateral boxes from a binarized map.

        Args:
            pred: probability map used for scoring.
            _bitmap: 2-D binarized map (unpacked as ``height, width``).
            dest_width, dest_height: size the coordinates are rescaled to.

        Returns:
            tuple: ``(boxes, scores)`` where ``boxes`` is an ``int16`` array
            of 4-point boxes.
        """
        bitmap = _bitmap
        height, width = bitmap.shape

        outs = cv2.findContours((bitmap * 255).astype(np.uint8), cv2.RETR_LIST,
                                cv2.CHAIN_APPROX_SIMPLE)
        # `findContours` may return 3 values (image, contours, hierarchy) or
        # 2 values (contours, hierarchy); both layouts are handled.
        if len(outs) == 3:
            contours = outs[1]
        elif len(outs) == 2:
            contours = outs[0]

        num_contours = min(len(contours), self.max_candidates)

        boxes = []
        scores = []
        for index in range(num_contours):
            contour = contours[index]
            points, sside = self.get_mini_boxes(contour)
            if sside < self.min_size:
                continue
            points = np.array(points)
            if self.score_mode == "fast":
                score = self.box_score_fast(pred, points.reshape(-1, 2))
            else:
                score = self.box_score_slow(pred, contour)
            if self.box_thresh > score:
                continue

            box = self.unclip(points, self.unclip_ratio).reshape(-1, 1, 2)
            box, sside = self.get_mini_boxes(box)
            if sside < self.min_size + 2:
                continue
            box = np.array(box)

            box[:, 0] = np.clip(
                np.round(box[:, 0] / width * dest_width), 0, dest_width)
            box[:, 1] = np.clip(
                np.round(box[:, 1] / height * dest_height), 0, dest_height)
            boxes.append(box.astype(np.int16))
            scores.append(score)
        return np.array(boxes, dtype=np.int16), scores

    def unclip(self, box, unclip_ratio):
        """ Expand ``box`` outwards with a pyclipper offset.

        The offset distance is ``area * unclip_ratio / perimeter`` of the
        polygon.

        Returns:
            np.ndarray: the offset path(s) returned by pyclipper.
        """
        poly = Polygon(box)
        distance = poly.area * unclip_ratio / poly.length
        offset = pyclipper.PyclipperOffset()
        offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
        expanded = np.array(offset.Execute(distance))
        return expanded

    def get_mini_boxes(self, contour):
        """ Compute the minimum-area rectangle of ``contour``.

        Returns:
            tuple: ``(box, short_side)``. ``box`` holds the four corners in
            the order: left point with the smaller y, right point with the
            smaller y, right point with the larger y, left point with the
            larger y. ``short_side`` is the shorter side of the rectangle.
        """
        bounding_box = cv2.minAreaRect(contour)
        # Sort by x so that points[0:2] are the left pair and points[2:4]
        # the right pair.
        points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0])

        if points[1][1] > points[0][1]:
            index_1 = 0
            index_4 = 1
        else:
            index_1 = 1
            index_4 = 0
        if points[3][1] > points[2][1]:
            index_2 = 2
            index_3 = 3
        else:
            index_2 = 3
            index_3 = 2

        box = [
            points[index_1], points[index_2], points[index_3], points[index_4]
        ]
        return box, min(bounding_box[1])

    def box_score_fast(self, bitmap, _box):
        """ box_score_fast: use bbox mean score as the mean score

        The mean of ``bitmap`` is taken over the pixels of the polygon
        ``_box``, evaluated inside its axis-aligned bounding rectangle.
        """
        h, w = bitmap.shape[:2]
        box = _box.copy()
        xmin = np.clip(np.floor(box[:, 0].min()).astype("int"), 0, w - 1)
        xmax = np.clip(np.ceil(box[:, 0].max()).astype("int"), 0, w - 1)
        ymin = np.clip(np.floor(box[:, 1].min()).astype("int"), 0, h - 1)
        ymax = np.clip(np.ceil(box[:, 1].max()).astype("int"), 0, h - 1)

        mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8)
        # Shift the polygon into the coordinate frame of the cropped mask.
        box[:, 0] = box[:, 0] - xmin
        box[:, 1] = box[:, 1] - ymin
        cv2.fillPoly(mask, box.reshape(1, -1, 2).astype(np.int32), 1)
        return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0]

    def box_score_slow(self, bitmap, contour):
        """ box_score_slow: use polygon mean score as the mean score

        Same as :meth:`box_score_fast` but the mask is filled from the full
        ``contour`` instead of a 4-point box.
        """
        h, w = bitmap.shape[:2]
        contour = contour.copy()
        contour = np.reshape(contour, (-1, 2))

        xmin = np.clip(np.min(contour[:, 0]), 0, w - 1)
        xmax = np.clip(np.max(contour[:, 0]), 0, w - 1)
        ymin = np.clip(np.min(contour[:, 1]), 0, h - 1)
        ymax = np.clip(np.max(contour[:, 1]), 0, h - 1)

        mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8)

        contour[:, 0] = contour[:, 0] - xmin
        contour[:, 1] = contour[:, 1] - ymin

        cv2.fillPoly(mask, contour.reshape(1, -1, 2).astype(np.int32), 1)
        return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0]

    def apply(self, data):
        """ Convert ``data[K.PROB_MAP]`` into boxes and scores.

        Reads ``data[K.PROB_MAP]`` and ``data[K.SHAPE]`` and writes the
        results to ``data[K.DT_POLYS]`` and ``data[K.DT_SCORES]``.

        Raises:
            ValueError: if ``self.box_type`` is neither ``'quad'`` nor
                ``'poly'``.
        """
        pred = data[K.PROB_MAP]
        # NOTE(review): only one shape entry is available, so this appears to
        # assume a batch of exactly one image -- confirm against the caller.
        shape_list = [data[K.SHAPE]]
        # Channel 0 of the first output is used as the probability map.
        pred = pred[0][:, 0, :, :]
        segmentation = pred > self.thresh

        for batch_index in range(pred.shape[0]):
            src_h, src_w, ratio_h, ratio_w = shape_list[batch_index]
            if self.dilation_kernel is not None:
                mask = cv2.dilate(
                    np.array(segmentation[batch_index]).astype(np.uint8),
                    self.dilation_kernel)
            else:
                mask = segmentation[batch_index]
            if self.box_type == 'poly':
                boxes, scores = self.polygons_from_bitmap(pred[batch_index],
                                                          mask, src_w, src_h)
            elif self.box_type == 'quad':
                boxes, scores = self.boxes_from_bitmap(pred[batch_index], mask,
                                                       src_w, src_h)
            else:
                raise ValueError("box_type can only be one of ['quad', 'poly']")

        data[K.DT_POLYS] = boxes
        data[K.DT_SCORES] = scores

        return data

    @classmethod
    def get_input_keys(cls):
        """ get input keys """
        return [K.PROB_MAP]

    @classmethod
    def get_output_keys(cls):
        """ get output keys """
        return [K.DT_POLYS, K.DT_SCORES]


class CropByPolys(BaseTransform):
    """Crop Image by Polys

    Args:
        det_box_type (str): ``'quad'`` crops each box directly; any other
            value first fits a minimum-area rectangle to the polygon.
    """

    def __init__(self, det_box_type='quad'):
        super().__init__()
        self.det_box_type = det_box_type

    def apply(self, data):
        """ Crop every detected box out of ``data[K.ORI_IM]``.

        The list of crops is stored in ``data[K.SUB_IMGS]``.
        """
        ori_im = data[K.ORI_IM]
        # TODO
        # dt_boxes = self.sorted_boxes(data[K.DT_POLYS])
        dt_boxes = np.array(data[K.DT_POLYS])
        img_crop_list = []
        for bno in range(len(dt_boxes)):
            tmp_box = copy.deepcopy(dt_boxes[bno])
            if self.det_box_type == "quad":
                img_crop = self.get_rotate_crop_image(ori_im, tmp_box)
            else:
                img_crop = self.get_minarea_rect_crop(ori_im, tmp_box)
            img_crop_list.append(img_crop)
        data[K.SUB_IMGS] = img_crop_list
        return data

    @classmethod
    def get_input_keys(cls):
        """ get input keys """
        # NOTE(review): `apply` reads K.ORI_IM, not K.IM_PATH -- confirm
        # whether this list should name K.ORI_IM instead.
        return [K.IM_PATH, K.DT_POLYS]

    @classmethod
    def get_output_keys(cls):
        """ get output keys """
        return [K.SUB_IMGS]

    def sorted_boxes(self, dt_boxes):
        """
        Sort text boxes in order from top to bottom, left to right

        Boxes are first sorted by the ``(y, x)`` of their first point; boxes
        whose first-point ``y`` differs by less than 10 are then reordered
        by ``x``.

        args:
            dt_boxes(array): detected text boxes, one box per entry
        return:
            list of boxes in reading order
        """
        dt_boxes = np.array(dt_boxes)
        num_boxes = dt_boxes.shape[0]
        sorted_boxes = sorted(dt_boxes, key=lambda x: (x[0][1], x[0][0]))
        _boxes = list(sorted_boxes)

        for i in range(num_boxes - 1):
            for j in range(i, -1, -1):
                if abs(_boxes[j + 1][0][1] - _boxes[j][0][1]) < 10 and (
                        _boxes[j + 1][0][0] < _boxes[j][0][0]):
                    _boxes[j], _boxes[j + 1] = _boxes[j + 1], _boxes[j]
                else:
                    break
        return _boxes

    def get_minarea_rect_crop(self, img, points):
        """ Crop the minimum-area rectangle that encloses ``points``.

        The rectangle corners are ordered left/smaller-y, right/smaller-y,
        right/larger-y, left/larger-y before being handed to
        :meth:`get_rotate_crop_image`.
        """
        bounding_box = cv2.minAreaRect(np.array(points).astype(np.int32))
        points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0])

        if points[1][1] > points[0][1]:
            index_a = 0
            index_d = 1
        else:
            index_a = 1
            index_d = 0
        if points[3][1] > points[2][1]:
            index_b = 2
            index_c = 3
        else:
            index_b = 3
            index_c = 2

        box = [
            points[index_a], points[index_b], points[index_c], points[index_d]
        ]
        crop_img = self.get_rotate_crop_image(img, np.array(box))
        return crop_img

    def get_rotate_crop_image(self, img, points):
        """ Warp the quadrilateral ``points`` of ``img`` into an upright crop.

        Args:
            img (np.ndarray): source image.
            points: four ``[x, y]`` corners; ``points[0]`` maps to the
                top-left of the crop, followed by top-right, bottom-right
                and bottom-left.

        Returns:
            np.ndarray: the rectified crop, rotated by 90 degrees with
            ``np.rot90`` when its height/width ratio is at least 1.5.
        """
        assert len(points) == 4, "shape of points must be 4*2"
        # `cv2.getPerspectiveTransform` only accepts float32 points, while
        # detected quad boxes may arrive as integer arrays.
        points = np.asarray(points, dtype=np.float32)
        img_crop_width = int(
            max(
                np.linalg.norm(points[0] - points[1]),
                np.linalg.norm(points[2] - points[3])))
        img_crop_height = int(
            max(
                np.linalg.norm(points[0] - points[3]),
                np.linalg.norm(points[1] - points[2])))
        pts_std = np.float32([
            [0, 0],
            [img_crop_width, 0],
            [img_crop_width, img_crop_height],
            [0, img_crop_height],
        ])
        M = cv2.getPerspectiveTransform(points, pts_std)
        dst_img = cv2.warpPerspective(
            img,
            M,
            (img_crop_width, img_crop_height),
            borderMode=cv2.BORDER_REPLICATE,
            flags=cv2.INTER_CUBIC, )
        dst_img_height, dst_img_width = dst_img.shape[0:2]
        if dst_img_height * 1.0 / dst_img_width >= 1.5:
            dst_img = np.rot90(dst_img)
        return dst_img


class SaveTextDetResults(BaseTransform):
    """ Save Text Det Results

    Args:
        save_dir (str|None): output directory; ``None`` disables saving.
        file_name (str|None): output file name, defaults to
            ``_DEFAULT_FILE_NAME``.
    """
    _DEFAULT_FILE_NAME = 'text_det_out.png'

    def __init__(self, save_dir, file_name=None):
        super().__init__()
        self.save_dir = save_dir
        if file_name is None:
            file_name = self._DEFAULT_FILE_NAME
        self.file_name = file_name
        # The visualisation is a numpy array produced with OpenCV, so the
        # writer uses the opencv backend.
        self._writer = ImageWriter(backend='opencv')

    def apply(self, data):
        """ Draw ``data[K.DT_POLYS]`` on the input image and save it.

        Nothing is written (and a warning is logged) when ``save_dir`` is
        ``None``. ``data`` is returned unchanged.
        """
        if self.save_dir is None:
            logging.warning(
                "The `save_dir` has been set to None, so the text detection result won't to be saved."
            )
            return data
        save_path = os.path.join(self.save_dir, self.file_name)
        bbox_res = data[K.DT_POLYS]
        vis_img = self.draw_rectangle(data[K.IM_PATH], bbox_res)
        self._writer.write(save_path, vis_img)
        return data

    @classmethod
    def get_input_keys(cls):
        """ get input keys """
        return [K.IM_PATH, K.DT_POLYS, K.DT_SCORES]

    @classmethod
    def get_output_keys(cls):
        """ get output keys """
        return []

    def draw_rectangle(self, img_path, boxes):
        """ Draw every box as a closed red outline on the image.

        Args:
            img_path (str): path of the image to load with ``cv2.imread``.
            boxes: iterable of boxes, each a sequence of ``[x, y]`` points.
                Boxes may have different numbers of points.

        Returns:
            np.ndarray: a copy of the image with the boxes drawn.

        Raises:
            ValueError: if the image cannot be read.
        """
        img = cv2.imread(img_path)
        if img is None:
            # `cv2.imread` reports failure by returning None.
            raise ValueError("Failed to read image: {}".format(img_path))
        img_show = img.copy()
        # Convert box by box so that polygons with differing vertex counts
        # do not have to fit into one rectangular array.
        for box in boxes:
            pts = np.reshape(np.array(box).astype(int),
                             [-1, 1, 2]).astype(np.int64)
            cv2.polylines(img_show, [pts], True, (0, 0, 255), 2)
        return img_show