
predictor -> model

gaotingquan, 1 year ago
commit 00afa21e33
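
This commit renames the inference entry point create_predictor to create_model and moves paddlex.inference.predictors to paddlex.inference.models. A minimal usage sketch of the renamed API (the model name appears in this diff; the input image path is hypothetical):

    from paddlex import create_model  # formerly: from paddlex import create_predictor

    model = create_model("PP-OCRv4_mobile_seal_det")
    for res in model("doc_img.png"):  # "doc_img.png" is an illustrative input
        res.print()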

+ 1 - 2
paddlex/__init__.py

@@ -25,11 +25,10 @@ from .modules import (
     build_dataset_checker,
     build_trainer,
     build_evaluater,
-    build_predictor,
 )
 
 
-from .inference import create_predictor, create_pipeline
+from .inference import create_model, create_pipeline
 
 
 def _initialize():

+ 1 - 1
paddlex/inference/__init__.py

@@ -12,6 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .predictors import create_predictor
+from .models import create_model
 from .pipelines import create_pipeline
 from .utils.pp_option import PaddlePredictorOption

+ 1 - 1
paddlex/inference/predictors/__init__.py → paddlex/inference/models/__init__.py

@@ -56,7 +56,7 @@ def _create_hp_predictor(
     )
 
 
-def create_predictor(
+def create_model(
     model: str, device: str = None, *args, use_hpip=False, hpi_params=None, **kwargs
 ) -> BasePredictor:
     model_dir = check_model(model)
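
Per the signature above, the device and the high-performance inference switches are keyword arguments, and check_model resolves a model name or local directory to a model_dir. A hedged sketch (the contents of hpi_params are not shown in this diff and are omitted):

    from paddlex.inference import create_model

    model = create_model(
        "PP-OCRv4_server_seal_det",  # resolved via check_model / official_models
        device="gpu",
        use_hpip=False,  # True would route through _create_hp_predictor
    )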

+ 0 - 0
paddlex/inference/predictors/base.py → paddlex/inference/models/base.py


+ 0 - 0
paddlex/inference/predictors/general_recognition.py → paddlex/inference/models/general_recognition.py


+ 0 - 0
paddlex/inference/predictors/image_classification.py → paddlex/inference/models/image_classification.py


+ 0 - 0
paddlex/inference/predictors/image_unwarping.py → paddlex/inference/models/image_unwarping.py


+ 0 - 0
paddlex/inference/predictors/instance_segmentation.py → paddlex/inference/models/instance_segmentation.py


+ 0 - 0
paddlex/inference/predictors/object_detection.py → paddlex/inference/models/object_detection.py


+ 1 - 1
paddlex/inference/predictors/official_models.py → paddlex/inference/models/official_models.py

@@ -254,4 +254,4 @@ class OfficialModelsDict(dict):
         return save_dir / f"{key}"
 
 
-official_models = OfficialModelsDict(OFFICIAL_MODELS)
+official_models = OfficialModelsDict(OFFICIAL_MODELS)

+ 0 - 0
paddlex/inference/predictors/semantic_segmentation.py → paddlex/inference/models/semantic_segmentation.py


+ 0 - 0
paddlex/inference/predictors/table_recognition.py → paddlex/inference/models/table_recognition.py


+ 0 - 0
paddlex/inference/predictors/text_detection.py → paddlex/inference/models/text_detection.py


+ 0 - 0
paddlex/inference/predictors/text_recognition.py → paddlex/inference/models/text_recognition.py


+ 0 - 0
paddlex/inference/predictors/ts.py → paddlex/inference/models/ts.py


+ 0 - 0
paddlex/inference/predictors/ts_cls.py → paddlex/inference/models/ts_cls.py


+ 0 - 0
paddlex/inference/predictors/ts_fc.py → paddlex/inference/models/ts_fc.py


+ 3 - 3
paddlex/inference/pipelines/base.py

@@ -16,7 +16,7 @@ from abc import ABC
 from typing import Any, Dict, Optional
 
 from ...utils.subclass_register import AutoRegisterABCMetaClass
-from ..predictors import create_predictor
+from ..models import create_model
 
 
 class BasePipeline(ABC, metaclass=AutoRegisterABCMetaClass):
@@ -34,5 +34,5 @@ class BasePipeline(ABC, metaclass=AutoRegisterABCMetaClass):
     def __call__(self, *args, **kwargs):
         yield from self.predict(*args, **kwargs)
 
-    def _create_predictor(self, *args, **kwargs):
-        return create_predictor(*args, **kwargs, **self._predictor_kwargs)
+    def _create_model(self, *args, **kwargs):
+        return create_model(*args, **kwargs, **self._predictor_kwargs)
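
Since __call__ yields from predict, a pipeline instance can be consumed directly as an iterator, and the renamed _create_model helper merges the pipeline-level _predictor_kwargs into every model it builds. A sketch, with pipeline construction as in the subclasses below:

    for res in pipeline("doc_img.png"):  # equivalent to iterating pipeline.predict(...)
        res.print()  # assuming results expose the same print() as model results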

+ 2 - 2
paddlex/inference/pipelines/ocr.py

@@ -31,8 +31,8 @@ class OCRPipeline(BasePipeline):
         predictor_kwargs=None,
     ):
         super().__init__(predictor_kwargs)
-        self._det_predict = self._create_predictor(det_model, device=device)
-        self._rec_predict = self._create_predictor(
+        self._det_predict = self._create_model(det_model, device=device)
+        self._rec_predict = self._create_model(
             rec_model, batch_size=rec_batch_size, device=device
         )
         self.is_curve = self._det_predict.model_name in [
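
A construction sketch based on the parameters visible in this hunk; the detection and recognition model names here are assumptions, not defaults taken from this diff:

    pipeline = OCRPipeline(
        det_model="PP-OCRv4_mobile_det",  # hypothetical detection model name
        rec_model="PP-OCRv4_mobile_rec",  # hypothetical recognition model name
        rec_batch_size=1,
        device="gpu",
    )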

+ 1 - 3
paddlex/inference/pipelines/single_model_pipeline.py

@@ -26,9 +26,7 @@ class SingleModelPipeline(BasePipeline):
 
     def __init__(self, model, batch_size=1, device="gpu", predictor_kwargs=None):
         super().__init__(predictor_kwargs)
-        self._predict = self._create_predictor(
-            model, batch_size=batch_size, device=device
-        )
+        self._predict = self._create_model(model, batch_size=batch_size, device=device)
 
     def predict(self, x):
         yield from self._predict(x)

+ 2 - 2
paddlex/inference/pipelines/table_recognition/table_recognition.py

@@ -38,7 +38,7 @@ class TableRecPipeline(BasePipeline):
     ):
         super().__init__(predictor_kwargs)
 
-        self.layout_predictor = self._create_predictor(
+        self.layout_predictor = self._create_model(
             model=layout_model, device=device, batch_size=batch_size
         )
 
@@ -50,7 +50,7 @@ class TableRecPipeline(BasePipeline):
             det_device=device,
             predictor_kwargs=predictor_kwargs,
         )
-        self.table_predictor = self._create_predictor(
+        self.table_predictor = self._create_model(
             model=table_model, device=device, batch_size=batch_size
         )
         self._crop_by_boxes = CropByBoxes()

+ 3 - 3
paddlex/modules/predictor.py

@@ -13,7 +13,7 @@
 # limitations under the License.
 
 from copy import deepcopy
-from ..inference.predictors import create_predictor
+from ..inference.models import create_model
 from ..inference.utils.pp_option import PaddlePredictorOption
 from ..utils.config import AttrDict
 
@@ -28,10 +28,10 @@ class Predictor(object):
         model = model_name if model_dir is None else model_dir
         self.input_path = predict_config.pop("input_path")
         pp_option = PaddlePredictorOption(**predict_config.pop("kernel_option", {}))
-        self.predictor = create_predictor(model, pp_option=pp_option, **predict_config)
+        self.model = create_model(model, pp_option=pp_option, **predict_config)
 
     def predict(self):
-        for res in self.predictor(self.input_path):
+        for res in self.model(self.input_path):
             res.print()
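
From the constructor above, the predict config carries an input_path plus an optional kernel_option dict that feeds PaddlePredictorOption, and the remaining keys are forwarded to create_model. A hedged sketch of that shape (the field names inside kernel_option are assumptions):

    predict_config = {
        "input_path": "doc_img.png",  # consumed by Predictor.predict
        "kernel_option": {"run_mode": "paddle"},  # hypothetical PaddlePredictorOption field
        "batch_size": 1,  # assumed pass-through to create_model
    }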
 
 

+ 0 - 115
paddlex/modules/text_detection/predictor/predictor.py

@@ -1,115 +0,0 @@
-# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from operator import le
-import os
-
-import numpy as np
-from . import transforms as T
-from ....utils import logging
-from ...base import BasePredictor
-from ...base.predictor.transforms import image_common
-from .keys import TextDetKeys as K
-from ..model_list import MODELS
-
-
-class TextDetPredictor(BasePredictor):
-    """TextDetPredictor"""
-
-    entities = MODELS
-
-    @classmethod
-    def get_input_keys(cls):
-        """get input keys"""
-        return [[K.IMAGE], [K.IM_PATH]]
-
-    @classmethod
-    def get_output_keys(cls):
-        """get output keys"""
-        return [K.PROB_MAP, K.SHAPE]
-
-    def _run(self, batch_input):
-        """_run"""
-        if len(batch_input) != 1:
-            raise ValueError(
-                f"For `{self.__class__.__name__}`, batch size can only be set to 1."
-            )
-        images = [data[K.IMAGE] for data in batch_input]
-        input_ = np.stack(images, axis=0)
-        if input_.ndim == 3:
-            input_ = input_[:, np.newaxis]
-        input_ = input_.astype(dtype=np.float32, copy=False)
-        outputs = self._predictor.predict([input_])
-
-        pred = batch_input
-        pred[0][K.PROB_MAP] = outputs
-
-        return pred
-
-    def _get_pre_transforms_from_config(self):
-        """get preprocess transforms"""
-
-        if self.model_name in ['PP-OCRv4_server_seal_det', 'PP-OCRv4_mobile_seal_det']:
-            limit_side_len = 736
-        else:
-            limit_side_len = 960
-    
-        return [
-            image_common.ReadImage(),
-            T.DetResizeForTest(limit_side_len=limit_side_len, limit_type="max"),
-            T.NormalizeImage(
-                mean=[0.485, 0.456, 0.406],
-                std=[0.229, 0.224, 0.225],
-                scale=1.0 / 255,
-                order="hwc",
-            ),
-            image_common.ToCHWImage(),
-        ]
-
-    def _get_post_transforms_from_config(self):
-        """get postprocess transforms"""
-        if self.model_name in ['PP-OCRv4_server_seal_det', 'PP-OCRv4_mobile_seal_det']:
-            task = 'poly'
-            post_transforms = [
-                T.DBPostProcess(
-                    thresh=0.2,
-                    box_thresh=0.6,
-                    max_candidates=1000,
-                    unclip_ratio=1.5,
-                    use_dilation=False,
-                    score_mode="fast",
-                    box_type="poly",
-                )
-            ]
-        else:
-            task = 'quad'
-            post_transforms = [
-                T.DBPostProcess(
-                    thresh=0.3,
-                    box_thresh=0.6,
-                    max_candidates=1000,
-                    unclip_ratio=1.5,
-                    use_dilation=False,
-                    score_mode="fast",
-                    box_type="quad",
-                )
-            ]
-        if not self.disable_print:
-            post_transforms.append(T.PrintResult())
-        if not self.disable_save:
-            post_transforms.append(
-                T.SaveTextDetResults(self.output, task),
-            )
-        return post_transforms

+ 0 - 987
paddlex/modules/text_detection/predictor/transforms.py

@@ -1,987 +0,0 @@
-# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import os
-import sys
-import cv2
-import copy
-import math
-import pyclipper
-import numpy as np
-from numpy.linalg import norm
-from PIL import Image
-from shapely.geometry import Polygon
-
-from ....utils import logging
-from ...base.predictor.io.writers import ImageWriter
-from ...base.predictor.io.readers import ImageReader
-from ...base.predictor import BaseTransform
-from .keys import TextDetKeys as K
-from .utils import AutoRectifier
-
-__all__ = [
-    "DetResizeForTest",
-    "NormalizeImage",
-    "DBPostProcess",
-    "SaveTextDetResults",
-    "PrintResult",
-]
-
-
-class DetResizeForTest(BaseTransform):
-    """DetResizeForTest"""
-
-    def __init__(self, **kwargs):
-        super(DetResizeForTest, self).__init__()
-        self.resize_type = 0
-        self.keep_ratio = False
-        if "image_shape" in kwargs:
-            self.image_shape = kwargs["image_shape"]
-            self.resize_type = 1
-            if "keep_ratio" in kwargs:
-                self.keep_ratio = kwargs["keep_ratio"]
-        elif "limit_side_len" in kwargs:
-            self.limit_side_len = kwargs["limit_side_len"]
-            self.limit_type = kwargs.get("limit_type", "min")
-        elif "resize_long" in kwargs:
-            self.resize_type = 2
-            self.resize_long = kwargs.get("resize_long", 960)
-        else:
-            self.limit_side_len = 736
-            self.limit_type = "min"
-
-    def apply(self, data):
-        """apply"""
-        img = data[K.IMAGE]
-        src_h, src_w, _ = img.shape
-        if sum([src_h, src_w]) < 64:
-            img = self.image_padding(img)
-
-        if self.resize_type == 0:
-            # img, shape = self.resize_image_type0(img)
-            img, [ratio_h, ratio_w] = self.resize_image_type0(img)
-        elif self.resize_type == 2:
-            img, [ratio_h, ratio_w] = self.resize_image_type2(img)
-        else:
-            # img, shape = self.resize_image_type1(img)
-            img, [ratio_h, ratio_w] = self.resize_image_type1(img)
-        data[K.IMAGE] = img
-        data[K.SHAPE] = np.array([src_h, src_w, ratio_h, ratio_w])
-        return data
-
-    @classmethod
-    def get_input_keys(cls):
-        """get input keys"""
-
-        return [K.IMAGE]
-
-    @classmethod
-    def get_output_keys(cls):
-        """get output keys"""
-
-        return [K.IMAGE, K.SHAPE]
-
-    def image_padding(self, im, value=0):
-        """padding image"""
-        h, w, c = im.shape
-        im_pad = np.zeros((max(32, h), max(32, w), c), np.uint8) + value
-        im_pad[:h, :w, :] = im
-        return im_pad
-
-    def resize_image_type1(self, img):
-        """resize the image"""
-        resize_h, resize_w = self.image_shape
-        ori_h, ori_w = img.shape[:2]  # (h, w, c)
-        if self.keep_ratio is True:
-            resize_w = ori_w * resize_h / ori_h
-            N = math.ceil(resize_w / 32)
-            resize_w = N * 32
-        ratio_h = float(resize_h) / ori_h
-        ratio_w = float(resize_w) / ori_w
-        img = cv2.resize(img, (int(resize_w), int(resize_h)))
-        # return img, np.array([ori_h, ori_w])
-        return img, [ratio_h, ratio_w]
-
-    def resize_image_type0(self, img):
-        """
-        resize image to a size multiple of 32 which is required by the network
-        args:
-            img(array): array with shape [h, w, c]
-        return(tuple):
-            img, (ratio_h, ratio_w)
-        """
-        limit_side_len = self.limit_side_len
-        h, w, c = img.shape
-
-        # limit the max side
-        if self.limit_type == "max":
-            if max(h, w) > limit_side_len:
-                if h > w:
-                    ratio = float(limit_side_len) / h
-                else:
-                    ratio = float(limit_side_len) / w
-            else:
-                ratio = 1.0
-        elif self.limit_type == "min":
-            if min(h, w) < limit_side_len:
-                if h < w:
-                    ratio = float(limit_side_len) / h
-                else:
-                    ratio = float(limit_side_len) / w
-            else:
-                ratio = 1.0
-        elif self.limit_type == "resize_long":
-            ratio = float(limit_side_len) / max(h, w)
-        else:
-            raise Exception("unsupported limit type")
-        resize_h = int(h * ratio)
-        resize_w = int(w * ratio)
-
-        resize_h = max(int(round(resize_h / 32) * 32), 32)
-        resize_w = max(int(round(resize_w / 32) * 32), 32)
-
-        try:
-            if int(resize_w) <= 0 or int(resize_h) <= 0:
-                return None, (None, None)
-            img = cv2.resize(img, (int(resize_w), int(resize_h)))
-        except:
-            logging.info(img.shape, resize_w, resize_h)
-            sys.exit(0)
-        ratio_h = resize_h / float(h)
-        ratio_w = resize_w / float(w)
-        return img, [ratio_h, ratio_w]
-
-    def resize_image_type2(self, img):
-        """resize image size"""
-        h, w, _ = img.shape
-
-        resize_w = w
-        resize_h = h
-
-        if resize_h > resize_w:
-            ratio = float(self.resize_long) / resize_h
-        else:
-            ratio = float(self.resize_long) / resize_w
-
-        resize_h = int(resize_h * ratio)
-        resize_w = int(resize_w * ratio)
-
-        max_stride = 128
-        resize_h = (resize_h + max_stride - 1) // max_stride * max_stride
-        resize_w = (resize_w + max_stride - 1) // max_stride * max_stride
-        img = cv2.resize(img, (int(resize_w), int(resize_h)))
-        ratio_h = resize_h / float(h)
-        ratio_w = resize_w / float(w)
-
-        return img, [ratio_h, ratio_w]
-
-
-class NormalizeImage(BaseTransform):
-    """normalize image such as substract mean, divide std"""
-
-    def __init__(self, scale=None, mean=None, std=None, order="chw", **kwargs):
-        if isinstance(scale, str):
-            scale = eval(scale)
-        self.scale = np.float32(scale if scale is not None else 1.0 / 255.0)
-        mean = mean if mean is not None else [0.485, 0.456, 0.406]
-        std = std if std is not None else [0.229, 0.224, 0.225]
-
-        shape = (3, 1, 1) if order == "chw" else (1, 1, 3)
-        self.mean = np.array(mean).reshape(shape).astype("float32")
-        self.std = np.array(std).reshape(shape).astype("float32")
-
-    def apply(self, data):
-        """apply"""
-        img = data[K.IMAGE]
-        from PIL import Image
-
-        if isinstance(img, Image.Image):
-            img = np.array(img)
-        assert isinstance(img, np.ndarray), "invalid input 'img' in NormalizeImage"
-        data[K.IMAGE] = (img.astype("float32") * self.scale - self.mean) / self.std
-        return data
-
-    @classmethod
-    def get_input_keys(cls):
-        """get input keys"""
-        return [K.IMAGE]
-
-    @classmethod
-    def get_output_keys(cls):
-        """get output keys"""
-        return [K.IMAGE]
-
-
-class DBPostProcess(BaseTransform):
-    """
-    The post process for Differentiable Binarization (DB).
-    """
-
-    def __init__(
-        self,
-        thresh=0.3,
-        box_thresh=0.7,
-        max_candidates=1000,
-        unclip_ratio=2.0,
-        use_dilation=False,
-        score_mode="fast",
-        box_type="quad",
-        **kwargs
-    ):
-        self.thresh = thresh
-        self.box_thresh = box_thresh
-        self.max_candidates = max_candidates
-        self.unclip_ratio = unclip_ratio
-        self.min_size = 3
-        self.score_mode = score_mode
-        self.box_type = box_type
-        assert score_mode in [
-            "slow",
-            "fast",
-        ], "Score mode must be in [slow, fast] but got: {}".format(score_mode)
-
-        self.dilation_kernel = None if not use_dilation else np.array([[1, 1], [1, 1]])
-
-    def polygons_from_bitmap(self, pred, _bitmap, dest_width, dest_height):
-        """_bitmap: single map with shape (1, H, W), whose values are binarized as {0, 1}"""
-
-        bitmap = _bitmap
-        height, width = bitmap.shape
-
-        boxes = []
-        scores = []
-
-        contours, _ = cv2.findContours(
-            (bitmap * 255).astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE
-        )
-
-        for contour in contours[: self.max_candidates]:
-            epsilon = 0.002 * cv2.arcLength(contour, True)
-            approx = cv2.approxPolyDP(contour, epsilon, True)
-            points = approx.reshape((-1, 2))
-            if points.shape[0] < 4:
-                continue
-
-            score = self.box_score_fast(pred, points.reshape(-1, 2))
-            if self.box_thresh > score:
-                continue
-
-            if points.shape[0] > 2:
-                box = self.unclip(points, self.unclip_ratio)
-                if len(box) > 1:
-                    continue
-            else:
-                continue
-            box = box.reshape(-1, 2)
-
-            _, sside = self.get_mini_boxes(box.reshape((-1, 1, 2)))
-            if sside < self.min_size + 2:
-                continue
-
-            box = np.array(box)
-            box[:, 0] = np.clip(np.round(box[:, 0] / width * dest_width), 0, dest_width)
-            box[:, 1] = np.clip(
-                np.round(box[:, 1] / height * dest_height), 0, dest_height
-            )
-            boxes.append(box.tolist())
-            scores.append(score)
-        return boxes, scores
-
-    def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height):
-        """_bitmap: single map with shape (1, H, W), whose values are binarized as {0, 1}"""
-
-        bitmap = _bitmap
-        height, width = bitmap.shape
-
-        outs = cv2.findContours(
-            (bitmap * 255).astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE
-        )
-        if len(outs) == 3:
-            img, contours, _ = outs[0], outs[1], outs[2]
-        elif len(outs) == 2:
-            contours, _ = outs[0], outs[1]
-
-        num_contours = min(len(contours), self.max_candidates)
-
-        boxes = []
-        scores = []
-        for index in range(num_contours):
-            contour = contours[index]
-            points, sside = self.get_mini_boxes(contour)
-            if sside < self.min_size:
-                continue
-            points = np.array(points)
-            if self.score_mode == "fast":
-                score = self.box_score_fast(pred, points.reshape(-1, 2))
-            else:
-                score = self.box_score_slow(pred, contour)
-            if self.box_thresh > score:
-                continue
-
-            box = self.unclip(points, self.unclip_ratio).reshape(-1, 1, 2)
-            box, sside = self.get_mini_boxes(box)
-            if sside < self.min_size + 2:
-                continue
-            box = np.array(box)
-
-            box[:, 0] = np.clip(np.round(box[:, 0] / width * dest_width), 0, dest_width)
-            box[:, 1] = np.clip(
-                np.round(box[:, 1] / height * dest_height), 0, dest_height
-            )
-            boxes.append(box.astype(np.int16))
-            scores.append(score)
-        return np.array(boxes, dtype=np.int16), scores
-
-    def unclip(self, box, unclip_ratio):
-        """unclip"""
-        poly = Polygon(box)
-        distance = poly.area * unclip_ratio / poly.length
-        offset = pyclipper.PyclipperOffset()
-        offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-        expanded = np.array(offset.Execute(distance))
-        return expanded
-
-    def get_mini_boxes(self, contour):
-        """get mini boxes"""
-        bounding_box = cv2.minAreaRect(contour)
-        points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0])
-
-        index_1, index_2, index_3, index_4 = 0, 1, 2, 3
-        if points[1][1] > points[0][1]:
-            index_1 = 0
-            index_4 = 1
-        else:
-            index_1 = 1
-            index_4 = 0
-        if points[3][1] > points[2][1]:
-            index_2 = 2
-            index_3 = 3
-        else:
-            index_2 = 3
-            index_3 = 2
-
-        box = [points[index_1], points[index_2], points[index_3], points[index_4]]
-        return box, min(bounding_box[1])
-
-    def box_score_fast(self, bitmap, _box):
-        """box_score_fast: use bbox mean score as the mean score"""
-        h, w = bitmap.shape[:2]
-        box = _box.copy()
-        xmin = np.clip(np.floor(box[:, 0].min()).astype("int"), 0, w - 1)
-        xmax = np.clip(np.ceil(box[:, 0].max()).astype("int"), 0, w - 1)
-        ymin = np.clip(np.floor(box[:, 1].min()).astype("int"), 0, h - 1)
-        ymax = np.clip(np.ceil(box[:, 1].max()).astype("int"), 0, h - 1)
-
-        mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8)
-        box[:, 0] = box[:, 0] - xmin
-        box[:, 1] = box[:, 1] - ymin
-        cv2.fillPoly(mask, box.reshape(1, -1, 2).astype(np.int32), 1)
-        return cv2.mean(bitmap[ymin : ymax + 1, xmin : xmax + 1], mask)[0]
-
-    def box_score_slow(self, bitmap, contour):
-        """box_score_slow: use polyon mean score as the mean score"""
-        h, w = bitmap.shape[:2]
-        contour = contour.copy()
-        contour = np.reshape(contour, (-1, 2))
-
-        xmin = np.clip(np.min(contour[:, 0]), 0, w - 1)
-        xmax = np.clip(np.max(contour[:, 0]), 0, w - 1)
-        ymin = np.clip(np.min(contour[:, 1]), 0, h - 1)
-        ymax = np.clip(np.max(contour[:, 1]), 0, h - 1)
-
-        mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8)
-
-        contour[:, 0] = contour[:, 0] - xmin
-        contour[:, 1] = contour[:, 1] - ymin
-
-        cv2.fillPoly(mask, contour.reshape(1, -1, 2).astype(np.int32), 1)
-        return cv2.mean(bitmap[ymin : ymax + 1, xmin : xmax + 1], mask)[0]
-
-    def apply(self, data):
-        """apply"""
-        pred = data[K.PROB_MAP]
-        shape_list = [data[K.SHAPE]]
-        pred = pred[0][:, 0, :, :]
-        segmentation = pred > self.thresh
-
-        boxes_batch = []
-        for batch_index in range(pred.shape[0]):
-            src_h, src_w, ratio_h, ratio_w = shape_list[batch_index]
-            if self.dilation_kernel is not None:
-                mask = cv2.dilate(
-                    np.array(segmentation[batch_index]).astype(np.uint8),
-                    self.dilation_kernel,
-                )
-            else:
-                mask = segmentation[batch_index]
-            if self.box_type == "poly":
-                boxes, scores = self.polygons_from_bitmap(
-                    pred[batch_index], mask, src_w, src_h
-                )
-            elif self.box_type == "quad":
-                boxes, scores = self.boxes_from_bitmap(
-                    pred[batch_index], mask, src_w, src_h
-                )
-            else:
-                raise ValueError("box_type can only be one of ['quad', 'poly']")
-
-            data[K.DT_POLYS] = boxes
-            data[K.DT_SCORES] = scores
-
-        return data
-
-    @classmethod
-    def get_input_keys(cls):
-        """get input keys"""
-        return [K.PROB_MAP]
-
-    @classmethod
-    def get_output_keys(cls):
-        """get output keys"""
-        return [K.DT_POLYS, K.DT_SCORES]
-
-
-class CropByPolys(BaseTransform):
-    """Crop Image by Polys"""
-
-    def __init__(self, det_box_type="quad"):
-        super().__init__()
-        self.det_box_type = det_box_type
-
-    def apply(self, data):
-        """apply"""
-        ori_im = data[K.ORI_IM]
-        if self.det_box_type == "quad":
-            dt_boxes = self.sorted_boxes(data[K.DT_POLYS])
-            dt_boxes = np.array(dt_boxes)
-            img_crop_list = []
-            for bno in range(len(dt_boxes)):
-                tmp_box = copy.deepcopy(dt_boxes[bno])
-                img_crop = self.get_minarea_rect_crop(ori_im, tmp_box)
-                img_crop_list.append(img_crop)
-        elif self.det_box_type == "poly":
-            img_crop_list = []
-            dt_boxes = data[K.DT_POLYS]
-            for bno in range(len(dt_boxes)):
-                tmp_box = copy.deepcopy(dt_boxes[bno])
-                img_crop = self.get_poly_rect_crop(ori_im.copy(), tmp_box)
-                img_crop_list.append(img_crop)
-        else:
-            raise NotImplementedError
-        data[K.SUB_IMGS] = img_crop_list
-        return data
-
-    @classmethod
-    def get_input_keys(cls):
-        """get input keys"""
-        return [K.IM_PATH, K.DT_POLYS]
-
-    @classmethod
-    def get_output_keys(cls):
-        """get output keys"""
-        return [K.SUB_IMGS]
-
-    def sorted_boxes(self, dt_boxes):
-        """
-        Sort text boxes in order from top to bottom, left to right
-        args:
-            dt_boxes(array):detected text boxes with shape [4, 2]
-        return:
-            sorted boxes(array) with shape [4, 2]
-        """
-        dt_boxes = np.array(dt_boxes)
-        num_boxes = dt_boxes.shape[0]
-        sorted_boxes = sorted(dt_boxes, key=lambda x: (x[0][1], x[0][0]))
-        _boxes = list(sorted_boxes)
-
-        for i in range(num_boxes - 1):
-            for j in range(i, -1, -1):
-                if abs(_boxes[j + 1][0][1] - _boxes[j][0][1]) < 10 and (
-                    _boxes[j + 1][0][0] < _boxes[j][0][0]
-                ):
-                    tmp = _boxes[j]
-                    _boxes[j] = _boxes[j + 1]
-                    _boxes[j + 1] = tmp
-                else:
-                    break
-        return _boxes
-
-    def get_minarea_rect_crop(self, img, points):
-        """get_minarea_rect_crop"""
-        bounding_box = cv2.minAreaRect(np.array(points).astype(np.int32))
-        points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0])
-
-        index_a, index_b, index_c, index_d = 0, 1, 2, 3
-        if points[1][1] > points[0][1]:
-            index_a = 0
-            index_d = 1
-        else:
-            index_a = 1
-            index_d = 0
-        if points[3][1] > points[2][1]:
-            index_b = 2
-            index_c = 3
-        else:
-            index_b = 3
-            index_c = 2
-
-        box = [points[index_a], points[index_b], points[index_c], points[index_d]]
-        crop_img = self.get_rotate_crop_image(img, np.array(box))
-        return crop_img
-
-
-    def get_rotate_crop_image(self, img, points):
-        """
-        img_height, img_width = img.shape[0:2]
-        left = int(np.min(points[:, 0]))
-        right = int(np.max(points[:, 0]))
-        top = int(np.min(points[:, 1]))
-        bottom = int(np.max(points[:, 1]))
-        img_crop = img[top:bottom, left:right, :].copy()
-        points[:, 0] = points[:, 0] - left
-        points[:, 1] = points[:, 1] - top
-        """
-        assert len(points) == 4, "shape of points must be 4*2"
-        img_crop_width = int(
-            max(
-                np.linalg.norm(points[0] - points[1]),
-                np.linalg.norm(points[2] - points[3]),
-            )
-        )
-        img_crop_height = int(
-            max(
-                np.linalg.norm(points[0] - points[3]),
-                np.linalg.norm(points[1] - points[2]),
-            )
-        )
-        pts_std = np.float32(
-            [
-                [0, 0],
-                [img_crop_width, 0],
-                [img_crop_width, img_crop_height],
-                [0, img_crop_height],
-            ]
-        )
-        M = cv2.getPerspectiveTransform(points, pts_std)
-        dst_img = cv2.warpPerspective(
-            img,
-            M,
-            (img_crop_width, img_crop_height),
-            borderMode=cv2.BORDER_REPLICATE,
-            flags=cv2.INTER_CUBIC,
-        )
-        dst_img_height, dst_img_width = dst_img.shape[0:2]
-        if dst_img_height * 1.0 / dst_img_width >= 1.5:
-            dst_img = np.rot90(dst_img)
-        return dst_img
-
-    def reorder_poly_edge(self, points):
-        """Get the respective points composing head edge, tail edge, top
-        sideline and bottom sideline.
-
-        Args:
-            points (ndarray): The points composing a text polygon.
-
-        Returns:
-            head_edge (ndarray): The two points composing the head edge of text
-                polygon.
-            tail_edge (ndarray): The two points composing the tail edge of text
-                polygon.
-            top_sideline (ndarray): The points composing top curved sideline of
-                text polygon.
-            bot_sideline (ndarray): The points composing bottom curved sideline
-                of text polygon.
-        """
-
-        assert points.ndim == 2
-        assert points.shape[0] >= 4
-        assert points.shape[1] == 2
-
-        orientation_thr = 2.0  # an empirical hyperparameter
-
-        head_inds, tail_inds = self.find_head_tail(points, orientation_thr)
-        head_edge, tail_edge = points[head_inds], points[tail_inds]
-
-
-        pad_points = np.vstack([points, points])
-        if tail_inds[1] < 1:
-            tail_inds[1] = len(points)
-        sideline1 = pad_points[head_inds[1]:tail_inds[1]]
-        sideline2 = pad_points[tail_inds[1]:(head_inds[1] + len(points))]
-        return head_edge, tail_edge, sideline1, sideline2
-
-    def vector_slope(self, vec):
-        assert len(vec) == 2
-        return abs(vec[1] / (vec[0] + 1e-8)) 
-
-    def find_head_tail(self, points, orientation_thr):
-        """Find the head edge and tail edge of a text polygon.
-
-        Args:
-            points (ndarray): The points composing a text polygon.
-            orientation_thr (float): The threshold for distinguishing between
-                head edge and tail edge among the horizontal and vertical edges
-                of a quadrangle.
-
-        Returns:
-            head_inds (list): The indexes of two points composing head edge.
-            tail_inds (list): The indexes of two points composing tail edge.
-        """
-
-        assert points.ndim == 2
-        assert points.shape[0] >= 4
-        assert points.shape[1] == 2
-        assert isinstance(orientation_thr, float)
-
-        if len(points) > 4:
-            pad_points = np.vstack([points, points[0]])
-            edge_vec = pad_points[1:] - pad_points[:-1]
-
-            theta_sum = []
-            adjacent_vec_theta = []
-            for i, edge_vec1 in enumerate(edge_vec):
-                adjacent_ind = [x % len(edge_vec) for x in [i - 1, i + 1]]
-                adjacent_edge_vec = edge_vec[adjacent_ind]
-                temp_theta_sum = np.sum(
-                    self.vector_angle(edge_vec1, adjacent_edge_vec))
-                temp_adjacent_theta = self.vector_angle(adjacent_edge_vec[0],
-                                                        adjacent_edge_vec[1])
-                theta_sum.append(temp_theta_sum)
-                adjacent_vec_theta.append(temp_adjacent_theta)
-            theta_sum_score = np.array(theta_sum) / np.pi
-            adjacent_theta_score = np.array(adjacent_vec_theta) / np.pi
-            poly_center = np.mean(points, axis=0)
-            edge_dist = np.maximum(
-                norm(
-                    pad_points[1:] - poly_center, axis=-1),
-                norm(
-                    pad_points[:-1] - poly_center, axis=-1))
-            dist_score = edge_dist / np.max(edge_dist)
-            position_score = np.zeros(len(edge_vec))
-            score = 0.5 * theta_sum_score + 0.15 * adjacent_theta_score
-            score += 0.35 * dist_score
-            if len(points) % 2 == 0:
-                position_score[(len(score) // 2 - 1)] += 1
-                position_score[-1] += 1
-            score += 0.1 * position_score
-            pad_score = np.concatenate([score, score])
-            score_matrix = np.zeros((len(score), len(score) - 3))
-            x = np.arange(len(score) - 3) / float(len(score) - 4)
-            gaussian = 1. / (np.sqrt(2. * np.pi) * 0.5) * np.exp(-np.power(
-                (x - 0.5) / 0.5, 2.) / 2)
-            gaussian = gaussian / np.max(gaussian)
-            for i in range(len(score)):
-                score_matrix[i, :] = score[i] + pad_score[(i + 2):(i + len(
-                    score) - 1)] * gaussian * 0.3
-
-            head_start, tail_increment = np.unravel_index(score_matrix.argmax(),
-                                                            score_matrix.shape)
-            tail_start = (head_start + tail_increment + 2) % len(points)
-            head_end = (head_start + 1) % len(points)
-            tail_end = (tail_start + 1) % len(points)
-
-            if head_end > tail_end:
-                head_start, tail_start = tail_start, head_start
-                head_end, tail_end = tail_end, head_end
-            head_inds = [head_start, head_end]
-            tail_inds = [tail_start, tail_end]
-        else:
-            if self.vector_slope(points[1] - points[0]) + self.vector_slope(
-                    points[3] - points[2]) < self.vector_slope(
-                        points[2] - points[1]) + self.vector_slope(points[0] - points[3]):
-                horizontal_edge_inds = [[0, 1], [2, 3]]
-                vertical_edge_inds = [[3, 0], [1, 2]]
-            else:
-                horizontal_edge_inds = [[3, 0], [1, 2]]
-                vertical_edge_inds = [[0, 1], [2, 3]]
-
-            vertical_len_sum = norm(points[vertical_edge_inds[0][0]] - points[
-                vertical_edge_inds[0][1]]) + norm(points[vertical_edge_inds[1][
-                    0]] - points[vertical_edge_inds[1][1]])
-            horizontal_len_sum = norm(points[horizontal_edge_inds[0][
-                0]] - points[horizontal_edge_inds[0][1]]) + norm(points[
-                    horizontal_edge_inds[1][0]] - points[horizontal_edge_inds[1]
-                                                            [1]])
-
-            if vertical_len_sum > horizontal_len_sum * orientation_thr:
-                head_inds = horizontal_edge_inds[0]
-                tail_inds = horizontal_edge_inds[1]
-            else:
-                head_inds = vertical_edge_inds[0]
-                tail_inds = vertical_edge_inds[1]
-
-        return head_inds, tail_inds
-
-    def vector_angle(self, vec1, vec2):
-        if vec1.ndim > 1:
-            unit_vec1 = vec1 / (norm(vec1, axis=-1) + 1e-8).reshape((-1, 1))
-        else:
-            unit_vec1 = vec1 / (norm(vec1, axis=-1) + 1e-8)
-        if vec2.ndim > 1:
-            unit_vec2 = vec2 / (norm(vec2, axis=-1) + 1e-8).reshape((-1, 1))
-        else:
-            unit_vec2 = vec2 / (norm(vec2, axis=-1) + 1e-8)
-        return np.arccos(np.clip(np.sum(unit_vec1 * unit_vec2, axis=-1), -1.0, 1.0))
-
-
-    def get_minarea_rect(self, img, points):
-        bounding_box = cv2.minAreaRect(points)
-        points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0])
-
-        index_a, index_b, index_c, index_d = 0, 1, 2, 3
-        if points[1][1] > points[0][1]:
-            index_a = 0
-            index_d = 1
-        else:
-            index_a = 1
-            index_d = 0
-        if points[3][1] > points[2][1]:
-            index_b = 2
-            index_c = 3
-        else:
-            index_b = 3
-            index_c = 2
-
-        box = [points[index_a], points[index_b], points[index_c], points[index_d]]
-        crop_img = self.get_rotate_crop_image(img, np.array(box))
-        return crop_img, box
-
-    def sample_points_on_bbox_bp(self, line, n=50):
-        """Resample n points on a line.
-
-        Args:
-            line (ndarray): The points composing a line.
-            n (int): The resampled points number.
-
-        Returns:
-            resampled_line (ndarray): The points composing the resampled line.
-        """
-        from numpy.linalg import norm
-        # sanity-check the validity of the input arguments
-        assert line.ndim == 2
-        assert line.shape[0] >= 2
-        assert line.shape[1] == 2
-        assert isinstance(n, int)
-        assert n > 0
-
-        length_list = [
-            norm(line[i + 1] - line[i]) for i in range(len(line) - 1)
-        ]
-        total_length = sum(length_list)
-        length_cumsum = np.cumsum([0.0] + length_list)
-        delta_length = total_length / (float(n) + 1e-8)
-        current_edge_ind = 0
-        resampled_line = [line[0]]
-
-        for i in range(1, n):
-            current_line_len = i * delta_length
-            while current_edge_ind + 1 < len(
-                    length_cumsum) and current_line_len >= length_cumsum[
-                        current_edge_ind + 1]:
-                current_edge_ind += 1
-            current_edge_end_shift = current_line_len - length_cumsum[
-                current_edge_ind]
-            if current_edge_ind >= len(length_list):
-                break
-            end_shift_ratio = current_edge_end_shift / length_list[
-                current_edge_ind]
-            current_point = line[current_edge_ind] + (line[current_edge_ind + 1]
-                                                    - line[current_edge_ind]
-                                                    ) * end_shift_ratio
-            resampled_line.append(current_point)
-        resampled_line.append(line[-1])
-        resampled_line = np.array(resampled_line)
-        return resampled_line
-
-    def sample_points_on_bbox(self, line, n=50):
-        """Resample n points on a line.
-
-        Args:
-            line (ndarray): The points composing a line.
-            n (int): The resampled points number.
-
-        Returns:
-            resampled_line (ndarray): The points composing the resampled line.
-        """
-        assert line.ndim == 2
-        assert line.shape[0] >= 2
-        assert line.shape[1] == 2
-        assert isinstance(n, int)
-        assert n > 0
-
-        length_list = [
-            norm(line[i + 1] - line[i]) for i in range(len(line) - 1)
-        ]
-        total_length = sum(length_list)
-        mean_length = total_length / (len(length_list) + 1e-8)
-        group = [[0]]
-        for i in range(len(length_list)):
-            point_id = i+1
-            if length_list[i] < 0.9 * mean_length:
-                for g in group:
-                    if i in g:
-                        g.append(point_id)
-                        break
-            else:
-                g = [point_id]
-                group.append(g)
-
-        top_tail_len = norm(line[0] - line[-1])
-        if top_tail_len < 0.9 * mean_length:
-            group[0].extend(g)
-            group.remove(g)
-        mean_positions = []  
-        for indices in group:  
-            x_sum = 0  
-            y_sum = 0  
-            for index in indices:  
-                x, y = line[index]  
-                x_sum += x  
-                y_sum += y  
-            num_points = len(indices)  
-            mean_x = x_sum / num_points  
-            mean_y = y_sum / num_points  
-            mean_positions.append((mean_x, mean_y)) 
-        resampled_line = np.array(mean_positions)
-        return resampled_line
-
-    def get_poly_rect_crop(self, img, points):
-        '''
-            Rectify and crop irregular, curved text regions using the polygon.
-            args: img: the image, as an ndarray
-            points: polygon vertex coordinates with shape N*2, as an ndarray
-            return: the rectified image crop, as an ndarray
-        '''
-        points = np.array(points).astype(np.int32).reshape(-1, 2)
-        temp_crop_img, temp_box = self.get_minarea_rect(img, points)
-        # compute the IoU between the minimum-area rectangle and the polygon
-        def get_union(pD, pG):
-            return Polygon(pD).union(Polygon(pG)).area
-
-        def get_intersection_over_union(pD, pG):
-            return get_intersection(pD, pG) / (get_union(pD, pG)+ 1e-10)
-
-        def get_intersection(pD, pG):
-            return Polygon(pD).intersection(Polygon(pG)).area
-
-        cal_IoU = get_intersection_over_union(points, temp_box)
-
-        if cal_IoU >= 0.7:
-            points = self.sample_points_on_bbox_bp(points, 31)
-            return temp_crop_img
-
-        points_sample = self.sample_points_on_bbox(points)
-        points_sample = points_sample.astype(np.int32)
-        head_edge, tail_edge, top_line, bot_line = self.reorder_poly_edge(points_sample)
-
-        resample_top_line = self.sample_points_on_bbox_bp(top_line, 15)
-        resample_bot_line = self.sample_points_on_bbox_bp(bot_line, 15)
-
-        sideline_mean_shift = np.mean(
-            resample_top_line, axis=0) - np.mean(
-                resample_bot_line, axis=0)
-        if sideline_mean_shift[1] > 0:
-            resample_bot_line, resample_top_line = resample_top_line, resample_bot_line
-        rectifier = AutoRectifier()
-        new_points = np.concatenate([resample_top_line, resample_bot_line])
-        new_points_list = list(new_points.astype(np.float32).reshape(1, -1).tolist())
-
-        if len(img.shape) == 2:
-            img = np.stack((img,)*3, axis=-1)
-        img_crop, image = rectifier.run(img, new_points_list, mode='homography')
-        return img_crop[0]
-
-
-class SaveTextDetResults(BaseTransform):
-    """Save Text Det Results"""
-
-    def __init__(self, save_dir, task='quad'):
-        super().__init__()
-        self.save_dir = save_dir
-        self.task = task
-        # Use the OpenCV backend to save the visualization results
-        self._writer = ImageWriter(backend="opencv")
-
-    def apply(self, data):
-        """apply"""
-        if self.save_dir is None:
-            logging.warning(
-                "The `save_dir` has been set to None, so the text detection result won't to be saved."
-            )
-            return data
-        fn = os.path.basename(data["input_path"])
-        save_path = os.path.join(self.save_dir, fn)
-        bbox_res = data[K.DT_POLYS]
-        if self.task == "quad":
-            vis_img = self.draw_rectangle(data[K.IM_PATH], bbox_res)
-        else:
-            vis_img = self.draw_polyline(data[K.IM_PATH], bbox_res)
-        self._writer.write(save_path, vis_img)
-        return data
-
-    @classmethod
-    def get_input_keys(cls):
-        """get input keys"""
-        return [K.IM_PATH, K.DT_POLYS, K.DT_SCORES]
-
-    @classmethod
-    def get_output_keys(cls):
-        """get output keys"""
-        return []
-
-    def draw_rectangle(self, img_path, boxes):
-        """draw rectangle"""
-        boxes = np.array(boxes)
-        img = cv2.imread(img_path)
-        img_show = img.copy()
-        for box in boxes.astype(int):
-            box = np.reshape(np.array(box), [-1, 1, 2]).astype(np.int64)
-            cv2.polylines(img_show, [box], True, (0, 0, 255), 2)
-        return img_show
-    
-    def draw_polyline(self, img_path, boxes):
-        """draw polyline"""
-        img = cv2.imread(img_path)
-        img_show = img.copy()
-        for box in boxes:
-            box = np.array(box).astype(int)
-            box = np.reshape(box, [-1, 1, 2]).astype(np.int64)
-            cv2.polylines(img_show, [box], True, (0, 0, 255), 2)
-        return img_show
-
-
-class PrintResult(BaseTransform):
-    """Print Result Transform"""
-
-    def apply(self, data):
-        """apply"""
-        logging.info("The prediction result is:")
-        logging.info(data[K.DT_POLYS])
-        return data
-
-    @classmethod
-    def get_input_keys(cls):
-        """get input keys"""
-        return [K.DT_SCORES]
-
-    @classmethod
-    def get_output_keys(cls):
-        """get output keys"""
-        return []
-
-    # DT_SCORES = 'dt_scores'
-    # DT_POLYS = 'dt_polys'
-
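
One detail worth keeping from the deleted DBPostProcess: box expansion follows the DB unclip rule, offsetting each polygon outward by distance = area * unclip_ratio / perimeter. A small worked check using the same shapely dependency as the deleted code (values illustrative):

    from shapely.geometry import Polygon

    quad = [(0, 0), (100, 0), (100, 20), (0, 20)]  # a 100x20 text box
    poly = Polygon(quad)
    distance = poly.area * 1.5 / poly.length  # 2000 * 1.5 / 240 = 12.5 px outward
    print(distance)

Likewise, DetResizeForTest with limit_type="max" and limit_side_len=960 maps a 1080x1920 input to 544x960: the ratio 960/1920 = 0.5 gives 540x960, then each side is rounded to the nearest multiple of 32.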
-

+ 0 - 698
paddlex/modules/text_detection/predictor/utils.py

@@ -1,698 +0,0 @@
-import os, sys
-import numpy as np
-from numpy import cos, sin, arctan, sqrt
-import cv2
-import copy
-import time
-
-def Homography(image, img_points, world_width, world_height,
-               interpolation=cv2.INTER_CUBIC, ratio_width=1.0, ratio_height=1.0):
-    """
-    Warp the image to a new view via a perspective (homography) transform and return the transformed image.
-
-    Args:
-        image (np.ndarray): The input image, as a numpy array.
-        img_points (List[Tuple[int, int]]): Coordinates of four points on the image, ordered top-left, top-right, bottom-right, bottom-left.
-        world_width (int): Width of the transformed image in world coordinates.
-        world_height (int): Height of the transformed image in world coordinates.
-        interpolation (int, optional): Interpolation method; defaults to cv2.INTER_CUBIC.
-        ratio_width (float, optional): Scaling ratio of the transformed image along the x axis; defaults to 1.0.
-        ratio_height (float, optional): Scaling ratio of the transformed image along the y axis; defaults to 1.0.
-
-    Returns:
-        np.ndarray: The transformed image, as a numpy array.
-    
-    """
-    _points = np.array(img_points).reshape(-1, 2).astype(np.float32)
-
-    expand_x = int(0.5 * world_width * (ratio_width - 1))
-    expand_y = int(0.5 * world_height * (ratio_height - 1))
-
-    pt_lefttop = [expand_x, expand_y]
-    pt_righttop = [expand_x + world_width, expand_y]
-    pt_leftbottom = [expand_x + world_width, expand_y + world_height]
-    pt_rightbottom = [expand_x, expand_y + world_height]
-
-    pts_std = np.float32([pt_lefttop, pt_righttop,
-                          pt_leftbottom, pt_rightbottom])
-
-    img_crop_width = int(world_width * ratio_width)
-    img_crop_height = int(world_height * ratio_height)
-
-    M = cv2.getPerspectiveTransform(_points, pts_std)
-
-    dst_img = cv2.warpPerspective(
-        image,
-        M, (img_crop_width, img_crop_height),
-        borderMode=cv2.BORDER_CONSTANT,  # BORDER_CONSTANT BORDER_REPLICATE
-        flags=interpolation)
-
-    return dst_img
-
-
-class CurveTextRectifier:
-    """
-    spatial transformer via monocular vision
-    """
-    def __init__(self):
-        self.get_virtual_camera_parameter()
-
-
-    def get_virtual_camera_parameter(self):
-        vcam_thz = 0
-        vcam_thx1 = 180
-        vcam_thy = 180
-        vcam_thx2 = 0
-
-        vcam_x = 0
-        vcam_y = 0
-        vcam_z = 100
-
-        radian = np.pi / 180
-
-        angle_z = radian * vcam_thz
-        angle_x1 = radian * vcam_thx1
-        angle_y = radian * vcam_thy
-        angle_x2 = radian * vcam_thx2
-
-        optic_x = vcam_x
-        optic_y = vcam_y
-        optic_z = vcam_z
-
-        fu = 100
-        fv = 100
-
-        matT = np.zeros((4, 4))
-        matT[0, 0] = cos(angle_z) * cos(angle_y) - sin(angle_z) * sin(angle_x1) * sin(angle_y)
-        matT[0, 1] = cos(angle_z) * sin(angle_y) * sin(angle_x2) - sin(angle_z) * (
-                    cos(angle_x1) * cos(angle_x2) - sin(angle_x1) * cos(angle_y) * sin(angle_x2))
-        matT[0, 2] = cos(angle_z) * sin(angle_y) * cos(angle_x2) + sin(angle_z) * (
-                    cos(angle_x1) * sin(angle_x2) + sin(angle_x1) * cos(angle_y) * cos(angle_x2))
-        matT[0, 3] = optic_x
-        matT[1, 0] = sin(angle_z) * cos(angle_y) + cos(angle_z) * sin(angle_x1) * sin(angle_y)
-        matT[1, 1] = sin(angle_z) * sin(angle_y) * sin(angle_x2) + cos(angle_z) * (
-                    cos(angle_x1) * cos(angle_x2) - sin(angle_x1) * cos(angle_y) * sin(angle_x2))
-        matT[1, 2] = sin(angle_z) * sin(angle_y) * cos(angle_x2) - cos(angle_z) * (
-                    cos(angle_x1) * sin(angle_x2) + sin(angle_x1) * cos(angle_y) * cos(angle_x2))
-        matT[1, 3] = optic_y
-        matT[2, 0] = -cos(angle_x1) * sin(angle_y)
-        matT[2, 1] = cos(angle_x1) * cos(angle_y) * sin(angle_x2) + sin(angle_x1) * cos(angle_x2)
-        matT[2, 2] = cos(angle_x1) * cos(angle_y) * cos(angle_x2) - sin(angle_x1) * sin(angle_x2)
-        matT[2, 3] = optic_z
-        matT[3, 0] = 0
-        matT[3, 1] = 0
-        matT[3, 2] = 0
-        matT[3, 3] = 1
-
-        matS = np.zeros((4, 4))
-        matS[2, 3] = 0.5
-        matS[3, 2] = 0.5
-
-        self.ifu = 1 / fu
-        self.ifv = 1 / fv
-
-        self.matT = matT
-        self.matS = matS
-        self.K = np.dot(matT.T, matS)
-        self.K = np.dot(self.K, matT)
-
-
-    def vertical_text_process(self, points, org_size):
-        """
-        change the point sequence and process
-        :param points:
-        :param org_size:
-        :return:
-        """
-        org_w, org_h = org_size
-        _points = np.array(points).reshape(-1).tolist()
-        _points = np.array(_points[2:] + _points[:2]).reshape(-1, 2)
-
-        # convert to horizontal points
-        adjusted_points = np.zeros(_points.shape, dtype=np.float32)
-        adjusted_points[:, 0] = _points[:, 1]
-        adjusted_points[:, 1] = org_h - _points[:, 0] - 1
-
-        _image_coord, _world_coord, _new_image_size = self.horizontal_text_process(adjusted_points)
-
-        # # convert to vertical points back
-        image_coord = _points.reshape(1, -1, 2)
-        world_coord = np.zeros(_world_coord.shape, dtype=np.float32)
-        world_coord[:, :, 0] = 0 - _world_coord[:, :, 1]
-        world_coord[:, :, 1] = _world_coord[:, :, 0]
-        world_coord[:, :, 2] = _world_coord[:, :, 2]
-        new_image_size = (_new_image_size[1], _new_image_size[0])
-
-        return image_coord, world_coord, new_image_size
-
-
-    def horizontal_text_process(self, points):
-        """
-        get image coordinate and world coordinate
-        :param points:
-        :return:
-        """
-        poly = np.array(points).reshape(-1)
-
-        dx_list = []
-        dy_list = []
-        for i in range(1, len(poly) // 2):
-            xdx = poly[i * 2] - poly[(i - 1) * 2]
-            xdy = poly[i * 2 + 1] - poly[(i - 1) * 2 + 1]
-            d = sqrt(xdx ** 2 + xdy ** 2)
-            dx_list.append(d)
-
-        for i in range(0, len(poly) // 4):
-            ydx = poly[i * 2] - poly[len(poly) - 1 - (i * 2 + 1)]
-            ydy = poly[i * 2 + 1] - poly[len(poly) - 1 - (i * 2)]
-            d = sqrt(ydx ** 2 + ydy ** 2)
-            dy_list.append(d)
-
-        dx_list = [(dx_list[i] + dx_list[len(dx_list) - 1 - i]) / 2 for i in range(len(dx_list) // 2)]
-
-        height = np.around(np.mean(dy_list))
-
-        rect_coord = [0, 0]
-        for i in range(0, len(poly) // 4 - 1):
-            x = rect_coord[-2]
-            x += dx_list[i]
-            y = 0
-            rect_coord.append(x)
-            rect_coord.append(y)
-
-        rect_coord_half = copy.deepcopy(rect_coord)
-        for i in range(0, len(poly) // 4):
-            x = rect_coord_half[len(rect_coord_half) - 2 * i - 2]
-            y = height
-            rect_coord.append(x)
-            rect_coord.append(y)
-
-        np_rect_coord = np.array(rect_coord).reshape(-1, 2)
-        x_min = np.min(np_rect_coord[:, 0])
-        y_min = np.min(np_rect_coord[:, 1])
-        x_max = np.max(np_rect_coord[:, 0])
-        y_max = np.max(np_rect_coord[:, 1])
-        new_image_size = (int(x_max - x_min + 0.5), int(y_max - y_min + 0.5))
-        x_mean = (x_max - x_min) / 2
-        y_mean = (y_max - y_min) / 2
-        np_rect_coord[:, 0] -= x_mean
-        np_rect_coord[:, 1] -= y_mean
-        rect_coord = np_rect_coord.reshape(-1).tolist()
-
-        rect_coord = np.array(rect_coord).reshape(-1, 2)
-        world_coord = np.ones((len(rect_coord), 3)) * 0
-
-        world_coord[:, :2] = rect_coord
-
-        image_coord = np.array(poly).reshape(1, -1, 2)
-        world_coord = world_coord.reshape(1, -1, 3)
-
-        return image_coord, world_coord, new_image_size
-
-
-    def horizontal_text_estimate(self, points):
-        """
-        horizontal or vertical text
-        :param points:
-        :return:
-        """
-        pts = np.array(points).reshape(-1, 2)
-        x_min = int(np.min(pts[:, 0]))
-        y_min = int(np.min(pts[:, 1]))
-        x_max = int(np.max(pts[:, 0]))
-        y_max = int(np.max(pts[:, 1]))
-        x = x_max - x_min
-        y = y_max - y_min
-        is_horizontal_text = True
-        if y / x > 1.5: # vertical text condition
-            is_horizontal_text = False
-        return is_horizontal_text
-
-
-    def virtual_camera_to_world(self, size):
-        ifu, ifv = self.ifu, self.ifv
-        K, matT = self.K, self.matT
-
-        ppu = size[0] / 2 + 1e-6
-        ppv = size[1] / 2 + 1e-6
-
-        P = np.zeros((size[1], size[0], 3))
-
-        lu = np.array([i for i in range(size[0])])
-        lv = np.array([i for i in range(size[1])])
-        u, v = np.meshgrid(lu, lv)
-
-        yp = (v - ppv) * ifv
-        xp = (u - ppu) * ifu
-        angle_a = arctan(sqrt(xp * xp + yp * yp))
-        angle_b = arctan(yp / xp)
-
-        D0 = sin(angle_a) * cos(angle_b)
-        D1 = sin(angle_a) * sin(angle_b)
-        D2 = cos(angle_a)
-
-        D0[xp <= 0] = -D0[xp <= 0]
-        D1[xp <= 0] = -D1[xp <= 0]
-
-        ratio_a = K[0, 0] * D0 * D0 + K[1, 1] * D1 * D1 + K[2, 2] * D2 * D2 + \
-                  (K[0, 1] + K[1, 0]) * D0 * D1 + (K[0, 2] + K[2, 0]) * D0 * D2 + (K[1, 2] + K[2, 1]) * D1 * D2
-        ratio_b = (K[0, 3] + K[3, 0]) * D0 + (K[1, 3] + K[3, 1]) * D1 + (K[2, 3] + K[3, 2]) * D2
-        ratio_c = K[3, 3] * np.ones(ratio_b.shape)
-
-        delta = ratio_b * ratio_b - 4 * ratio_a * ratio_c
-        t = np.zeros(delta.shape)
-        t[ratio_a == 0] = -ratio_c[ratio_a == 0] / ratio_b[ratio_a == 0]
-        t[ratio_a != 0] = (-ratio_b[ratio_a != 0] + sqrt(delta[ratio_a != 0])) / (2 * ratio_a[ratio_a != 0])
-        t[delta < 0] = 0
-
-        P[:, :, 0] = matT[0, 3] + t * (matT[0, 0] * D0 + matT[0, 1] * D1 + matT[0, 2] * D2)
-        P[:, :, 1] = matT[1, 3] + t * (matT[1, 0] * D0 + matT[1, 1] * D1 + matT[1, 2] * D2)
-        P[:, :, 2] = matT[2, 3] + t * (matT[2, 0] * D0 + matT[2, 1] * D1 + matT[2, 2] * D2)
-
-        return P
-
-
-    def world_to_image(self, image_size, world, intrinsic, distCoeffs, rotation, tvec):
-        r11 = rotation[0, 0]
-        r12 = rotation[0, 1]
-        r13 = rotation[0, 2]
-        r21 = rotation[1, 0]
-        r22 = rotation[1, 1]
-        r23 = rotation[1, 2]
-        r31 = rotation[2, 0]
-        r32 = rotation[2, 1]
-        r33 = rotation[2, 2]
-
-        t1 = tvec[0]
-        t2 = tvec[1]
-        t3 = tvec[2]
-
-        k1 = distCoeffs[0]
-        k2 = distCoeffs[1]
-        p1 = distCoeffs[2]
-        p2 = distCoeffs[3]
-        k3 = distCoeffs[4]
-        k4 = distCoeffs[5]
-        k5 = distCoeffs[6]
-        k6 = distCoeffs[7]
-
-        if len(distCoeffs) > 8:
-            s1 = distCoeffs[8]
-            s2 = distCoeffs[9]
-            s3 = distCoeffs[10]
-            s4 = distCoeffs[11]
-        else:
-            s1 = s2 = s3 = s4 = 0
-
-        if len(distCoeffs) > 12:
-            tx = distCoeffs[12]
-            ty = distCoeffs[13]
-        else:
-            tx = ty = 0
-
-        fu = intrinsic[0, 0]
-        fv = intrinsic[1, 1]
-        ppu = intrinsic[0, 2]
-        ppv = intrinsic[1, 2]
-
-        cos_tx = cos(tx)
-        cos_ty = cos(ty)
-        sin_tx = sin(tx)
-        sin_ty = sin(ty)
-
-        # tilted-sensor correction terms, following OpenCV's tilted distortion model
-        tao11 = cos_ty * cos_tx * cos_ty + sin_ty * cos_tx * sin_ty
-        tao12 = cos_ty * cos_tx * sin_ty * sin_tx - sin_ty * cos_tx * cos_ty * sin_tx
-        tao13 = -cos_ty * cos_tx * sin_ty * cos_tx + sin_ty * cos_tx * cos_ty * cos_tx
-        tao21 = -sin_tx * sin_ty
-        tao22 = cos_ty * cos_tx * cos_tx + sin_tx * cos_ty * sin_tx
-        tao23 = cos_ty * cos_tx * sin_tx - sin_tx * cos_ty * cos_tx
-
-        P = np.zeros((image_size[1], image_size[0], 2))
-
-        c3 = r31 * world[:, :, 0] + r32 * world[:, :, 1] + r33 * world[:, :, 2] + t3
-        c1 = r11 * world[:, :, 0] + r12 * world[:, :, 1] + r13 * world[:, :, 2] + t1
-        c2 = r21 * world[:, :, 0] + r22 * world[:, :, 1] + r23 * world[:, :, 2] + t2
-
-        x1 = c1 / c3
-        y1 = c2 / c3
-        x12 = x1 * x1
-        y12 = y1 * y1
-        x1y1 = 2 * x1 * y1
-        r2 = x12 + y12
-        r4 = r2 * r2
-        r6 = r2 * r4
-
-        # rational radial distortion combined with tangential and thin-prism terms
-        radial_distortion = (1 + k1 * r2 + k2 * r4 + k3 * r6) / (1 + k4 * r2 + k5 * r4 + k6 * r6)
-        x2 = x1 * radial_distortion + p1 * x1y1 + p2 * (r2 + 2 * x12) + s1 * r2 + s2 * r4
-        y2 = y1 * radial_distortion + p2 * x1y1 + p1 * (r2 + 2 * y12) + s3 * r2 + s4 * r4
-
-        x3 = tao11 * x2 + tao12 * y2 + tao13
-        y3 = tao21 * x2 + tao22 * y2 + tao23
-
-        P[:, :, 0] = fu * x3 + ppu
-        P[:, :, 1] = fv * y3 + ppv
-        P[c3 <= 0] = 0  # zero out pixels whose world points lie behind the camera
-
-        return P
-
-
-    def spatial_transform(self, image_data, new_image_size, mtx, dist, rvecs, tvecs, interpolation):
-        rotation, _ = cv2.Rodrigues(rvecs)
-        world_map = self.virtual_camera_to_world(new_image_size)
-        image_map = self.world_to_image(new_image_size, world_map, mtx, dist, rotation, tvecs)
-        image_map = image_map.astype(np.float32)
-        dst = cv2.remap(image_data, image_map[:, :, 0], image_map[:, :, 1], interpolation)
-        return dst
-
-
-    def calibrate(self, org_size, image_coord, world_coord):
-        """
-        calibration
-        :param org_size:
-        :param image_coord:
-        :param world_coord:
-        :return:
-        """
-        # flag = cv2.CALIB_RATIONAL_MODEL | cv2.CALIB_TILTED_MODEL  | cv2.CALIB_THIN_PRISM_MODEL
-        flag = cv2.CALIB_RATIONAL_MODEL
-        flag2 = cv2.CALIB_RATIONAL_MODEL | cv2.CALIB_TILTED_MODEL
-        flag3 = cv2.CALIB_RATIONAL_MODEL | cv2.CALIB_THIN_PRISM_MODEL
-        flag4 = cv2.CALIB_RATIONAL_MODEL | cv2.CALIB_ZERO_TANGENT_DIST | cv2.CALIB_FIX_ASPECT_RATIO
-        flag5 = cv2.CALIB_RATIONAL_MODEL | cv2.CALIB_TILTED_MODEL | cv2.CALIB_ZERO_TANGENT_DIST
-        flag6 = cv2.CALIB_RATIONAL_MODEL | cv2.CALIB_FIX_ASPECT_RATIO
-        flag_list = [flag2, flag3, flag4, flag5, flag6]
-
-        ret, mtx, dist, rvecs, tvecs = cv2.calibrateCamera(world_coord.astype(np.float32),
-                                                                image_coord.astype(np.float32),
-                                                                org_size,
-                                                                None,
-                                                                None,
-                                                                flags=flag)
-        if ret > 2:
-            # strategies
-            min_ret = ret
-            for i, flag in enumerate(flag_list):
-                _ret, _mtx, _dist, _rvecs, _tvecs = cv2.calibrateCamera(world_coord.astype(np.float32),
-                                                                   image_coord.astype(np.float32),
-                                                                   org_size,
-                                                                   None,
-                                                                   None,
-                                                                   flags=flag)
-                if _ret < min_ret:
-                    min_ret = _ret
-                    ret, mtx, dist, rvecs, tvecs = _ret, _mtx, _dist, _rvecs, _tvecs
-
-        return ret, mtx, dist, rvecs, tvecs
-
-
-    def dc_homo(self, img, img_points, obj_points, is_horizontal_text, interpolation=cv2.INTER_LINEAR,
-                ratio_width=1.0, ratio_height=1.0):
-        """
-        divide and conquer: homography
-        # ratio_width and ratio_height must be 1.0 here
-        """
-        _img_points = img_points.reshape(-1, 2)
-        _obj_points = obj_points.reshape(-1, 3)
-
-        homo_img_list = []
-        width_list = []
-        height_list = []
-        # divide and conquer
-        for i in range(len(_img_points) // 2 - 1):
-            new_img_points = np.zeros((4, 2)).astype(np.float32)
-            new_obj_points = np.zeros((4, 2)).astype(np.float32)
-
-            new_img_points[0:2, :] = _img_points[i:(i + 2), :2]
-            new_img_points[2:4, :] = _img_points[::-1, :][i:(i + 2), :2][::-1, :]
-
-            new_obj_points[0:2, :] = _obj_points[i:(i + 2), :2]
-            new_obj_points[2:4, :] = _obj_points[::-1, :][i:(i + 2), :2][::-1, :]
-
-            if is_horizontal_text:
-                world_width = np.abs(new_obj_points[1, 0] - new_obj_points[0, 0])
-                world_height = np.abs(new_obj_points[3, 1] - new_obj_points[0, 1])
-            else:
-                world_width = np.abs(new_obj_points[1, 1] - new_obj_points[0, 1])
-                world_height = np.abs(new_obj_points[3, 0] - new_obj_points[0, 0])
-
-            homo_img = self.Homography(img, new_img_points, world_width, world_height,
-                                       interpolation=interpolation,
-                                       ratio_width=ratio_width, ratio_height=ratio_height)
-
-            homo_img_list.append(homo_img)
-            _h, _w = homo_img.shape[:2]
-            width_list.append(_w)
-            height_list.append(_h)
-
-        # stitching
-        rectified_image = np.zeros((np.max(height_list), sum(width_list), 3)).astype(np.uint8)
-
-        st = 0
-        for (homo_img, w, h) in zip(homo_img_list, width_list, height_list):
-            rectified_image[:h, st:st + w, :] = homo_img
-            st += w
-
-        if not is_horizontal_text:
-            # vertical rotation
-            rectified_image = np.rot90(rectified_image, 3)
-
-        return rectified_image
-
-    def Homography(self, image, img_points, world_width, world_height,
-                interpolation=cv2.INTER_CUBIC, ratio_width=1.0, ratio_height=1.0):
-        """
-        将图像透视变换到新的视角,返回变换后的图像。
-        
-        Args:
-            image (np.ndarray): 输入的图像,应为numpy数组类型。
-            img_points (List[Tuple[int, int]]): 图像上的四个点的坐标,顺序为左上角、右上角、右下角、左下角。
-            world_width (int): 变换后图像在世界坐标系中的宽度。
-            world_height (int): 变换后图像在世界坐标系中的高度。
-            interpolation (int, optional): 插值方式,默认为cv2.INTER_CUBIC。
-            ratio_width (float, optional): 变换后图像在x轴上的缩放比例,默认为1.0。
-            ratio_height (float, optional): 变换后图像在y轴上的缩放比例,默认为1.0。
-        
-        Returns:
-            np.ndarray: 变换后的图像,为numpy数组类型。
-        
-        """
-        _points = np.array(img_points).reshape(-1, 2).astype(np.float32)
-
-        expand_x = int(0.5 * world_width * (ratio_width - 1))
-        expand_y = int(0.5 * world_height * (ratio_height - 1))
-
-        pt_lefttop = [expand_x, expand_y]
-        pt_righttop = [expand_x + world_width, expand_y]
-        pt_rightbottom = [expand_x + world_width, expand_y + world_height]
-        pt_leftbottom = [expand_x, expand_y + world_height]
-
-        pts_std = np.float32([pt_lefttop, pt_righttop,
-                            pt_rightbottom, pt_leftbottom])
-
-        img_crop_width = int(world_width * ratio_width)
-        img_crop_height = int(world_height * ratio_height)
-
-        M = cv2.getPerspectiveTransform(_points, pts_std)
-
-        dst_img = cv2.warpPerspective(
-            image,
-            M, (img_crop_width, img_crop_height),
-            borderMode=cv2.BORDER_CONSTANT,  # BORDER_CONSTANT BORDER_REPLICATE
-            flags=interpolation)
-
-        return dst_img
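
For reference, the core of the method above is cv2.getPerspectiveTransform followed by cv2.warpPerspective. A self-contained sketch (not part of this diff; the corner coordinates and image path are made up for illustration):

import cv2
import numpy as np

# hypothetical source quadrilateral, ordered top-left, top-right, bottom-right, bottom-left
src = np.float32([[120, 80], [360, 90], [355, 150], [118, 142]])
world_w, world_h = 240, 60
dst_pts = np.float32([[0, 0], [world_w, 0], [world_w, world_h], [0, world_h]])

img = cv2.imread("sample.jpg")  # assumed input image
M = cv2.getPerspectiveTransform(src, dst_pts)
warped = cv2.warpPerspective(img, M, (world_w, world_h),
                             borderMode=cv2.BORDER_CONSTANT,
                             flags=cv2.INTER_CUBIC)
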
-
-
-    def __call__(self, image_data, points, interpolation=cv2.INTER_LINEAR, ratio_width=1.0, ratio_height=1.0, mode='calibration'):
-        """
-        spatial transform for a poly text
-        :param image_data:
-        :param points: [x1,y1,x2,y2,x3,y3,...], clockwise order, (x1,y1) must be the top-left of first char.
-        :param interpolation: cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_AREA, cv2.INTER_CUBIC, cv2.INTER_LANCZOS4
-        :param ratio_width:  roi_image width expansion. It should not be smaller than 1.0
-        :param ratio_height: roi_image height expansion. It should not be smaller than 1.0
-        :param mode: 'calibration' or 'homography'. when homography, ratio_width and ratio_height must be 1.0
-        :return:
-        """
-        org_h, org_w = image_data.shape[:2]
-        org_size = (org_w, org_h)
-        self.image = image_data
-
-        is_horizontal_text = self.horizontal_text_estimate(points)
-        if is_horizontal_text:
-            image_coord, world_coord, new_image_size = self.horizontal_text_process(points)
-        else:
-            image_coord, world_coord, new_image_size = self.vertical_text_process(points, org_size)
-
-        if mode.lower() == 'calibration':
-            ret, mtx, dist, rvecs, tvecs = self.calibrate(org_size, image_coord, world_coord)
-
-            st_size = (int(new_image_size[0]*ratio_width), int(new_image_size[1]*ratio_height))
-            dst = self.spatial_transform(image_data, st_size, mtx, dist[0], rvecs[0], tvecs[0], interpolation)
-        elif mode.lower() == 'homography':
-            # ratio_width and ratio_height must be 1.0 here; ret is set to a small
-            # dummy loss (0.01) since homography mode reports no calibration loss
-            ret = 0.01
-            dst = self.dc_homo(image_data, image_coord, world_coord, is_horizontal_text,
-                               interpolation=interpolation, ratio_width=1.0, ratio_height=1.0)
-        else:
-            raise ValueError('mode must be ["calibration", "homography"], but got {}'.format(mode))
-
-        return dst, ret
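
A short call sketch for the rectifier above (illustrative only; the image path and polygon coordinates are made up, and the class is assumed importable from this module):

import cv2

image_data = cv2.imread("curved_text.jpg")  # assumed input image
# hypothetical 10-point polygon around a curved word, clockwise from top-left
points = [10, 40, 60, 30, 110, 28, 160, 33, 210, 45,
          205, 90, 158, 78, 108, 72, 58, 75, 12, 85]
rectifier = CurveTextRectifier()
flat, loss = rectifier(image_data, points, mode='calibration')
flat_h, _ = rectifier(image_data, points, mode='homography')  # ratios must stay 1.0
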
-
-
-class AutoRectifier:
-    def __init__(self):
-        self.npoints = 10
-        self.curveTextRectifier = CurveTextRectifier()
-
-    @staticmethod
-    def get_rotate_crop_image(img, points, interpolation=cv2.INTER_CUBIC, ratio_width=1.0, ratio_height=1.0):
-        """
-        crop or homography
-        :param img:
-        :param points:
-        :param interpolation:
-        :param ratio_width:
-        :param ratio_height:
-        :return:
-        """
-        h, w = img.shape[:2]
-        _points = np.array(points).reshape(-1, 2).astype(np.float32)
-
-        if len(_points) != 4:
-            x_min = int(np.min(_points[:, 0]))
-            y_min = int(np.min(_points[:, 1]))
-            x_max = int(np.max(_points[:, 0]))
-            y_max = int(np.max(_points[:, 1]))
-            dx = x_max - x_min
-            dy = y_max - y_min
-            expand_x = int(0.5 * dx * (ratio_width - 1))
-            expand_y = int(0.5 * dy * (ratio_height - 1))
-            x_min = np.clip(int(x_min - expand_x), 0, w - 1)
-            y_min = np.clip(int(y_min - expand_y), 0, h - 1)
-            x_max = np.clip(int(x_max + expand_x), 0, w - 1)
-            y_max = np.clip(int(y_max + expand_y), 0, h - 1)
-
-            dst_img = img[y_min:y_max, x_min:x_max, :].copy()
-        else:
-            img_crop_width = int(
-                max(
-                    np.linalg.norm(_points[0] - _points[1]),
-                    np.linalg.norm(_points[2] - _points[3])))
-            img_crop_height = int(
-                max(
-                    np.linalg.norm(_points[0] - _points[3]),
-                    np.linalg.norm(_points[1] - _points[2])))
-
-            dst_img = Homography(img, _points, img_crop_width, img_crop_height, interpolation, ratio_width, ratio_height)
-
-        return dst_img
-
-
-    def visualize(self, image_data, points_list):
-        visualization = image_data.copy()
-
-        for box in points_list:
-            box = np.array(box).reshape(-1, 2).astype(np.int32)
-            cv2.drawContours(visualization, [np.array(box).reshape((-1, 1, 2))], -1, (0, 0, 255), 2)
-            for i, p in enumerate(box):
-                if i != 0:
-                    cv2.circle(visualization, tuple(p), radius=1, color=(255, 0, 0), thickness=2)
-                else:
-                    cv2.circle(visualization, tuple(p), radius=1, color=(255, 255, 0), thickness=2)
-        return visualization
-
-
-    def __call__(self, image_data, points, interpolation=cv2.INTER_LINEAR,
-                 ratio_width=1.0, ratio_height=1.0, loss_thresh=5.0, mode='calibration'):
-        """
-        rectification in strategies for a poly text
-        :param image_data:
-        :param points: [x1,y1,x2,y2,x3,y3,...], clockwise order, (x1,y1) must be the top-left of first char.
-        :param interpolation: cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_AREA, cv2.INTER_CUBIC, cv2.INTER_LANCZOS4
-        :param ratio_width:  roi_image width expansion. It should not be smaller than 1.0
-        :param ratio_height: roi_image height expansion. It should not be smaller than 1.0
-        :param loss_thresh: if loss greater than loss_thresh --> get_rotate_crop_image
-        :param mode: 'calibration' or 'homography'. when homography, ratio_width and ratio_height must be 1.0
-        :return:
-        """
-        _points = np.array(points).reshape(-1, 2)
-        if len(_points) >= self.npoints and len(_points) % 2 == 0:
-            try:
-                curveTextRectifier = CurveTextRectifier()
-
-                dst_img, loss = curveTextRectifier(image_data, points, interpolation, ratio_width, ratio_height, mode)
-                if loss >= 2:
-                    # for robustness: a large loss means the region cannot be
-                    # reconstructed correctly, so try alternative strategies
-                    img_list, loss_list = [dst_img], [loss]
-                    _dst_img, _loss = PlanB()(image_data, points, curveTextRectifier,
-                                              interpolation, ratio_width, ratio_height,
-                                              loss_thresh=loss_thresh,
-                                              square=True)
-                    img_list += [_dst_img]
-                    loss_list += [_loss]
-
-                    _dst_img, _loss = PlanB()(image_data, points, curveTextRectifier,
-                                              interpolation, ratio_width, ratio_height,
-                                              loss_thresh=loss_thresh, square=False)
-                    img_list += [_dst_img]
-                    loss_list += [_loss]
-
-                    min_loss = min(loss_list)
-                    dst_img = img_list[loss_list.index(min_loss)]
-
-                    if min_loss >= loss_thresh:
-                        print('calibration loss {} is too large for the spatial transformer; falling back to get_rotate_crop_image'.format(min_loss))
-                        dst_img = self.get_rotate_crop_image(image_data, points, interpolation, ratio_width, ratio_height)
-            except Exception as e:
-                print(e)
-                dst_img = self.get_rotate_crop_image(image_data, points, interpolation, ratio_width, ratio_height)
-        else:
-            dst_img = self.get_rotate_crop_image(image_data, _points, interpolation, ratio_width, ratio_height)
-
-        return dst_img
-
-
-    def run(self, image_data, points_list, interpolation=cv2.INTER_LINEAR,
-            ratio_width=1.0, ratio_height=1.0, loss_thresh=5.0, mode='calibration'):
-        """
-        run for texts in an image
-        :param image_data: numpy.ndarray. The shape is [h, w, 3]
-        :param points_list: [[x1,y1,x2,y2,x3,y3,...], [x1,y1,x2,y2,x3,y3,...], ...], clockwise order, (x1,y1) must be the top-left of first char.
-        :param interpolation: cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_AREA, cv2.INTER_CUBIC, cv2.INTER_LANCZOS4
-        :param ratio_width:  roi_image width expansion. It should not be smaller than 1.0
-        :param ratio_height: roi_image height expansion. It should not be smaller than 1.0
-        :param loss_thresh: if loss greater than loss_thresh --> get_rotate_crop_image
-        :param mode: 'calibration' or 'homography'. when homography, ratio_width and ratio_height must be 1.0
-        :return: res: roi-image list, visualized_image: draw polys in original image
-        """
-        if image_data is None:
-            raise ValueError('image_data cannot be None')
-        if not isinstance(points_list, list):
-            raise ValueError('points_list must be a list')
-        for points in points_list:
-            if not isinstance(points, list):
-                raise ValueError('each element of points_list must be a list')
-
-        if ratio_width < 1.0 or ratio_height < 1.0:
-            raise ValueError('ratio_width and ratio_height cannot be smaller than 1, but got ({}, {})'.format(ratio_width, ratio_height))
-
-        if mode.lower() != 'calibration' and mode.lower() != 'homography':
-            raise ValueError('mode must be ["calibration", "homography"], but got {}'.format(mode))
-
-        if mode.lower() == 'homography' and (ratio_width != 1.0 or ratio_height != 1.0):
-            raise ValueError('ratio_width and ratio_height must be 1.0 when mode is homography, but got mode:{}, ratio:({},{})'.format(mode, ratio_width, ratio_height))
-
-        res = []
-        for points in points_list:
-            rectified_img = self(image_data, points, interpolation, ratio_width, ratio_height,
-                                 loss_thresh=loss_thresh, mode=mode)
-            res.append(rectified_img)
-
-        # visualize
-        visualized_image = self.visualize(image_data, points_list)
-
-        return res, visualized_image
-
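
Putting it together, a minimal usage sketch for AutoRectifier (not part of this diff; the image path and the detection polygon are placeholders):

import cv2

image = cv2.imread("doc_photo.jpg")  # assumed input photo with curved text
polys = [[10, 40, 60, 30, 110, 28, 160, 33, 210, 45,
          205, 90, 158, 78, 108, 72, 58, 75, 12, 85]]
rectifier = AutoRectifier()
crops, vis = rectifier.run(image, polys, mode='calibration')
cv2.imwrite("visualization.jpg", vis)
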

+ 0 - 175
paddlex/pipelines/OCR/pipeline.py

@@ -1,175 +0,0 @@
-# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import cv2
-
-from ..base import BasePipeline
-from ...modules.text_detection.model_list import MODELS as text_det_models
-from ...modules.text_recognition.model_list import MODELS as text_rec_models
-from ...modules import create_model, PaddleInferenceOption
-from ...modules.text_detection import transforms as text_det_T
-from .utils import draw_ocr_box_txt
-
-
-class OCRPipeline(BasePipeline):
-    """OCR Pipeline"""
-
-    entities = "OCR"
-
-    def __init__(
-        self,
-        text_det_model_name=None,
-        text_rec_model_name=None,
-        text_det_model_dir=None,
-        text_rec_model_dir=None,
-        text_det_kernel_option=None,
-        text_rec_kernel_option=None,
-        output="./",
-        device="gpu",
-        **kwargs,
-    ):
-        super().__init__(**kwargs)
-        self.text_det_model_name = text_det_model_name
-        self.text_rec_model_name = text_rec_model_name
-        self.text_det_model_dir = text_det_model_dir
-        self.text_rec_model_dir = text_rec_model_dir
-        self.output = output
-        self.device = device
-        self.text_det_kernel_option = text_det_kernel_option
-        self.text_rec_kernel_option = text_rec_kernel_option
-        if self.text_det_model_name in ['PP-OCRv4_server_seal_det', 'PP-OCRv4_mobile_seal_det']:
-            self.task = "poly"
-        else:
-            self.task = "quad"
-        
-        if (
-            self.text_det_model_name is not None
-            and self.text_rec_model_name is not None
-        ):
-            self.load_model()
-
-    def check_model_name(self):
-        """check that model name is valid"""
-        assert (
-            self.text_det_model_name in text_det_models
-        ), f"Invalid text detection model name ({self.text_det_model_name}). \
-Supported models: {text_det_models}."
-
-        assert (
-            self.text_rec_model_name in text_rec_models
-        ), f"Invalid text recognition model name ({self.text_rec_model_name}). \
-Supported models: {text_rec_models}."
-
-    def load_model(self):
-        """load model predictor"""
-        self.check_model_name()
-        text_det_kernel_option = (
-            self.get_kernel_option()
-            if self.text_det_kernel_option is None
-            else self.text_det_kernel_option
-        )
-        text_rec_kernel_option = (
-            self.get_kernel_option()
-            if self.text_rec_kernel_option is None
-            else self.text_rec_kernel_option
-        )
-        if self.task == "poly":
-            text_det_post_transforms = [
-                text_det_T.DBPostProcess(
-                    thresh=0.2,
-                    box_thresh=0.6,
-                    max_candidates=1000,
-                    unclip_ratio=1.5,
-                    use_dilation=False,
-                    score_mode="fast",
-                    box_type="poly",
-                ),
-                # TODO
-                text_det_T.CropByPolys(det_box_type="poly"),
-            ]
-        else:
-            text_det_post_transforms = [
-                text_det_T.DBPostProcess(
-                    thresh=0.3,
-                    box_thresh=0.6,
-                    max_candidates=1000,
-                    unclip_ratio=1.5,
-                    use_dilation=False,
-                    score_mode="fast",
-                    box_type="quad",
-                ),
-                # TODO
-                text_det_T.CropByPolys(det_box_type="quad"),
-            ]
-
-        self.text_det_model = create_model(
-            self.text_det_model_name,
-            self.text_det_model_dir,
-            kernel_option=text_det_kernel_option,
-            post_transforms=text_det_post_transforms,
-        )
-        self.text_rec_model = create_model(
-            self.text_rec_model_name,
-            self.text_rec_model_dir,
-            kernel_option=text_rec_kernel_option,
-            disable_print=self.disable_print,
-            disable_save=self.disable_save,
-        )
-
-    def predict(self, input):
-        """predict"""
-        result = self.text_det_model.predict(input)
-        all_rec_result = []
-        for i, img in enumerate(result["sub_imgs"]):
-            rec_result = self.text_rec_model.predict({"image": img})
-            all_rec_result.append(rec_result["rec_text"][0])
-        result["rec_text"] = all_rec_result
-
-        if self.output is not None:
-            draw_img = draw_ocr_box_txt(
-                result["original_image"], result["dt_polys"], result["rec_text"]
-            )
-            fn = os.path.basename(result["input_path"])
-            cv2.imwrite(
-                os.path.join(self.output, fn),
-                draw_img[:, :, ::-1],
-            )
-
-        return result
-
-    def update_model(self, model_name_list, model_dir_list):
-        """update model
-
-        Args:
-            model_name_list (list): list of model name.
-            model_dir_list (list): list of model directory.
-        """
-        assert len(model_name_list) == 2
-        self.text_det_model_name = model_name_list[0]
-        self.text_rec_model_name = model_name_list[1]
-        if model_dir_list:
-            assert len(model_dir_list) == 2
-            self.text_det_model_dir = model_dir_list[0]
-            self.text_rec_model_dir = model_dir_list[1]
-
-    def get_kernel_option(self):
-        """get kernel option"""
-        kernel_option = PaddleInferenceOption()
-        kernel_option.set_device(self.device)
-        return kernel_option
-
-    def get_input_keys(self):
-        """get dict keys of input argument input"""
-        return self.text_det_model.get_input_keys()

+ 0 - 148
paddlex/pipelines/OCR/utils.py

@@ -1,148 +0,0 @@
-# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import PIL
-from PIL import Image, ImageDraw, ImageFont
-import cv2
-import numpy as np
-import random
-import math
-import copy
-
-from ...utils.fonts import PINGFANG_FONT_FILE_PATH
-
-
-def get_minarea_rect(points):
-    bounding_box = cv2.minAreaRect(points)
-    points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0])
-
-    index_a, index_b, index_c, index_d = 0, 1, 2, 3
-    if points[1][1] > points[0][1]:
-        index_a = 0
-        index_d = 1
-    else:
-        index_a = 1
-        index_d = 0
-    if points[3][1] > points[2][1]:
-        index_b = 2
-        index_c = 3
-    else:
-        index_b = 3
-        index_c = 2
-
-    box = np.array([points[index_a], points[index_b], points[index_c], points[index_d]]).astype(np.int32)
-
-    return box
-
-def draw_ocr_box_txt(
-    img,
-    boxes,
-    txts=None,
-    scores=None,
-    drop_score=0.5,
-    font_path=PINGFANG_FONT_FILE_PATH,
-):
-    """draw ocr result"""
-    image = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
-    h, w = image.height, image.width
-    img_left = image.copy()
-    img_right = np.ones((h, w, 3), dtype=np.uint8) * 255
-    random.seed(0)
-
-    draw_left = ImageDraw.Draw(img_left)
-    if txts is None or len(txts) != len(boxes):
-        txts = [None] * len(boxes)
-    for idx, (box, txt) in enumerate(zip(boxes, txts)):
-        try:
-            if scores is not None and scores[idx] < drop_score:
-                continue
-            color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
-            box = np.array(box)
-            if len(box) > 4:
-                pts = [(x, y) for x, y in box.tolist()]
-                draw_left.polygon(pts, outline=color, width=8)
-                box = get_minarea_rect(box)
-                height = int(0.5 * (max(box[:,1]) - min(box[:,1])))
-                box[:2,1] = np.mean(box[:,1])
-                box[2:,1] = np.mean(box[:,1]) + min(20, height)
-            draw_left.polygon(box, fill=color)
-            img_right_text = draw_box_txt_fine((w, h), box, txt, font_path)
-            pts = np.array(box, np.int32).reshape((-1, 1, 2))
-            cv2.polylines(img_right_text, [pts], True, color, 1)
-            img_right = cv2.bitwise_and(img_right, img_right_text)
-        except Exception:
-            continue
-
-    img_left = Image.blend(image, img_left, 0.5)
-    img_show = Image.new("RGB", (w * 2, h), (255, 255, 255))
-    img_show.paste(img_left, (0, 0, w, h))
-    img_show.paste(Image.fromarray(img_right), (w, 0, w * 2, h))
-    return np.array(img_show)
-
-
-def draw_box_txt_fine(img_size, box, txt, font_path=PINGFANG_FONT_FILE_PATH):
-    """draw box text"""
-    box_height = int(
-        math.sqrt((box[0][0] - box[3][0]) ** 2 + (box[0][1] - box[3][1]) ** 2)
-    )
-    box_width = int(
-        math.sqrt((box[0][0] - box[1][0]) ** 2 + (box[0][1] - box[1][1]) ** 2)
-    )
-
-    if box_height > 2 * box_width and box_height > 30:
-        img_text = Image.new("RGB", (box_height, box_width), (255, 255, 255))
-        draw_text = ImageDraw.Draw(img_text)
-        if txt:
-            font = create_font(txt, (box_height, box_width), font_path)
-            draw_text.text([0, 0], txt, fill=(0, 0, 0), font=font)
-        img_text = img_text.transpose(Image.ROTATE_270)
-    else:
-        img_text = Image.new("RGB", (box_width, box_height), (255, 255, 255))
-        draw_text = ImageDraw.Draw(img_text)
-        if txt:
-            font = create_font(txt, (box_width, box_height), font_path)
-            draw_text.text([0, 0], txt, fill=(0, 0, 0), font=font)
-
-    pts1 = np.float32(
-        [[0, 0], [box_width, 0], [box_width, box_height], [0, box_height]]
-    )
-    pts2 = np.array(box, dtype=np.float32)
-    M = cv2.getPerspectiveTransform(pts1, pts2)
-
-    img_text = np.array(img_text, dtype=np.uint8)
-    img_right_text = cv2.warpPerspective(
-        img_text,
-        M,
-        img_size,
-        flags=cv2.INTER_NEAREST,
-        borderMode=cv2.BORDER_CONSTANT,
-        borderValue=(255, 255, 255),
-    )
-    return img_right_text
-
-
-def create_font(txt, sz, font_path=PINGFANG_FONT_FILE_PATH):
-    """create font"""
-    font_size = int(sz[1] * 0.8)
-    font = ImageFont.truetype(font_path, font_size, encoding="utf-8")
-    if int(PIL.__version__.split(".")[0]) < 10:
-        length = font.getsize(txt)[0]
-    else:
-        length = font.getlength(txt)
-
-    if length > sz[0]:
-        font_size = int(font_size * sz[0] / length)
-        font = ImageFont.truetype(font_path, font_size, encoding="utf-8")
-    return font
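
A usage sketch for the visualization helpers above (illustrative; the boxes, texts, and scores are made up):

import cv2

img = cv2.imread("page.jpg")  # assumed BGR input, as draw_ocr_box_txt expects
boxes = [[[60, 40], [300, 40], [300, 90], [60, 90]]]  # one quad, clockwise
txts = ["hello paddle"]
canvas = draw_ocr_box_txt(img, boxes, txts, scores=[0.98], drop_score=0.5)
cv2.imwrite("ocr_vis.jpg", canvas[:, :, ::-1])  # output is RGB; flip back to BGR
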