zhengchun
/
PaddleX


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465
							# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math

import numpy as np
import cv2
from PIL import Image

from ..base import PyOnlyProcessor

__all__ = [
    "GetImageInfo",
    "Flip",
    "Crop",
    "Resize",
    "ResizeByLong",
    "ResizeByShort",
    "Pad",
    "PadStride",
    "Normalize",
    "ToCHWImage",
    "LaTeXOCRReisizeNormImg",
]


def _resize(im, target_size, interp):
    w, h = target_size
    im = cv2.resize(im, (w, h), interpolation=interp)
    return im


def _flip_h(im):
    if len(im.shape) == 3:
        im = im[:, ::-1, :]
    elif len(im.shape) == 2:
        im = im[:, ::-1]
    return im


def _flip_v(im):
    if len(im.shape) == 3:
        im = im[::-1, :, :]
    elif len(im.shape) == 2:
        im = im[::-1, :]
    return im


def _slice(im, coords):
    x1, y1, x2, y2 = coords
    im = im[y1:y2, x1:x2, ...]
    return im


def _pad(im, pad, val):
    if isinstance(pad, int):
        pad = [pad] * 4
    if len(pad) != 4:
        raise ValueError
    chns = 1 if im.ndim == 2 else im.shape[2]
    im = cv2.copyMakeBorder(im, *pad, cv2.BORDER_CONSTANT, value=(val,) * chns)
    return im


def _check_image_size(input_):
    if not (
        isinstance(input_, (list, tuple))
        and len(input_) == 2
        and isinstance(input_[0], int)
        and isinstance(input_[1], int)
    ):
        raise TypeError(f"{input_} cannot represent a valid image size.")


class GetImageInfo(PyOnlyProcessor):
    def __call__(self, data):
        img = data["img"]

        return {**data, "img_size": [img.shape[1], img.shape[0]]}


class Flip(PyOnlyProcessor):
    def __init__(self, mode="H"):
        super().__init__()
        if mode not in ("H", "V"):
            raise ValueError("`mode` should be 'H' or 'V'.")
        self._mode = mode

    def __call__(self, data):
        img = data["img"]

        if self._mode == "H":
            img = _flip_h(img)
        elif self._mode == "V":
            img = _flip_v(img)

        return {**data, "img": img}


class Crop(PyOnlyProcessor):
    def __init__(self, crop_size, mode="C"):
        super().__init__()
        if isinstance(crop_size, int):
            crop_size = [crop_size, crop_size]
        _check_image_size(crop_size)

        self._crop_size = crop_size

        if mode not in ("C", "TL"):
            raise ValueError("Unsupported interpolation method")
        self._mode = mode

    def __call__(self, data):
        img = data["img"]

        h, w = img.shape[:2]
        cw, ch = self._crop_size
        if self._mode == "C":
            x1 = max(0, (w - cw) // 2)
            y1 = max(0, (h - ch) // 2)
        elif self._mode == "TL":
            x1, y1 = 0, 0
        x2 = min(w, x1 + cw)
        y2 = min(h, y1 + ch)
        coords = (x1, y1, x2, y2)
        if coords == (0, 0, w, h):
            raise ValueError(
                f"Input image ({w}, {h}) smaller than the target size ({cw}, {ch})."
            )
        img = _slice(img, coords=coords)

        return {**data, "img": img, "img_size": [img.shape[1], img.shape[0]]}


class _BaseResize(PyOnlyProcessor):
    _INTERP_DICT = {
        "NEAREST": cv2.INTER_NEAREST,
        "LINEAR": cv2.INTER_LINEAR,
        "CUBIC": cv2.INTER_CUBIC,
        "AREA": cv2.INTER_AREA,
        "LANCZOS4": cv2.INTER_LANCZOS4,
    }

    def __init__(self, size_divisor, interp):
        super().__init__()

        if size_divisor is not None:
            assert isinstance(
                size_divisor, int
            ), "`size_divisor` should be None or int."
        self._size_divisor = size_divisor

        try:
            interp = self._INTERP_DICT[interp]
        except KeyError:
            raise ValueError(
                "`interp` should be one of {}.".format(self._INTERP_DICT.keys())
            )
        self._interp = interp

    @staticmethod
    def _rescale_size(img_size, target_size):
        scale = min(max(target_size) / max(img_size), min(target_size) / min(img_size))
        rescaled_size = [round(i * scale) for i in img_size]
        return rescaled_size, scale


class Resize(_BaseResize):
    def __init__(
        self, target_size, keep_ratio=False, size_divisor=None, interp="LINEAR"
    ):
        super().__init__(size_divisor=size_divisor, interp=interp)

        if isinstance(target_size, int):
            target_size = [target_size, target_size]
        _check_image_size(target_size)
        self._target_size = target_size

        self._keep_ratio = keep_ratio

    def __call__(self, data):
        img = data["img"]

        target_size = self._target_size
        original_size = img.shape[:2][::-1]

        if self._keep_ratio:
            h, w = img.shape[0:2]
            target_size, _ = self._rescale_size((w, h), self._target_size)

        if self._size_divisor:
            target_size = [
                math.ceil(i / self._size_divisor) * self._size_divisor
                for i in target_size
            ]

        img_scale_w, img_scale_h = [
            target_size[0] / original_size[0],
            target_size[1] / original_size[1],
        ]
        img = _resize(img, target_size, interp=self._interp)

        return {
            **data,
            "img": img,
            "img_size": [img.shape[1], img.shape[0]],
            "scale_factors": [img_scale_w, img_scale_h],
        }


class ResizeByLong(_BaseResize):
    def __init__(self, target_long_edge, size_divisor=None, interp="LINEAR"):
        super().__init__(size_divisor=size_divisor, interp=interp)
        self._target_long_edge = target_long_edge

    def __call__(self, data):
        img = data["img"]

        h, w = img.shape[:2]
        scale = self._target_long_edge / max(h, w)
        h_resize = round(h * scale)
        w_resize = round(w * scale)
        if self._size_divisor is not None:
            h_resize = math.ceil(h_resize / self._size_divisor) * self._size_divisor
            w_resize = math.ceil(w_resize / self._size_divisor) * self._size_divisor

        img = _resize(img, (w_resize, h_resize), interp=self._interp)

        return {**data, "img": img, "img_size": [img.shape[1], img.shape[0]]}


class ResizeByShort(_BaseResize):
    INPUT_KEYS = "img"
    OUTPUT_KEYS = ["img", "img_size"]
    DEAULT_INPUTS = {"img": "img"}
    DEAULT_OUTPUTS = {"img": "img", "img_size": "img_size"}

    def __init__(self, target_short_edge, size_divisor=None, interp="LINEAR"):
        super().__init__(size_divisor=size_divisor, interp=interp)
        self._target_short_edge = target_short_edge

    def __call__(self, data):
        img = data["img"]

        h, w = img.shape[:2]
        scale = self._target_short_edge / min(h, w)
        h_resize = round(h * scale)
        w_resize = round(w * scale)
        if self._size_divisor is not None:
            h_resize = math.ceil(h_resize / self._size_divisor) * self._size_divisor
            w_resize = math.ceil(w_resize / self._size_divisor) * self._size_divisor

        img = _resize(img, (w_resize, h_resize), interp=self._interp)

        return {**data, "img": img, "img_size": [img.shape[1], img.shape[0]]}


class Pad(PyOnlyProcessor):
    def __init__(self, target_size, val=127.5):
        super().__init__()

        if isinstance(target_size, int):
            target_size = [target_size, target_size]
        _check_image_size(target_size)
        self._target_size = target_size

        self._val = val

    def __call__(self, data):
        img = data["img"]

        h, w = img.shape[:2]
        tw, th = self._target_size
        ph = th - h
        pw = tw - w

        if ph < 0 or pw < 0:
            raise ValueError(
                f"Input image ({w}, {h}) smaller than the target size ({tw}, {th})."
            )
        else:
            img = _pad(img, pad=(0, ph, 0, pw), val=self._val)

        return {**data, "img": img, "img_size": [img.shape[1], img.shape[0]]}


class PadStride(PyOnlyProcessor):
    INPUT_KEYS = "img"
    OUTPUT_KEYS = "img"
    DEAULT_INPUTS = {"img": "img"}
    DEAULT_OUTPUTS = {"img": "img"}

    def __init__(self, stride=0):
        super().__init__()
        self._coarsest_stride = stride

    def __call__(self, data):
        img = data["img"]

        im = img
        coarsest_stride = self._coarsest_stride
        if coarsest_stride <= 0:
            return {"img": im}
        im_c, im_h, im_w = im.shape
        pad_h = int(np.ceil(float(im_h) / coarsest_stride) * coarsest_stride)
        pad_w = int(np.ceil(float(im_w) / coarsest_stride) * coarsest_stride)
        padding_im = np.zeros((im_c, pad_h, pad_w), dtype=np.float32)
        padding_im[:, :im_h, :im_w] = im

        return {**data, "img": padding_im}


class Normalize(PyOnlyProcessor):
    def __init__(self, scale=1.0 / 255, mean=0.5, std=0.5, preserve_dtype=False):
        super().__init__()
        self._scale = np.float32(scale)
        if isinstance(mean, float):
            mean = [mean]
        self._mean = np.asarray(mean).astype("float32")
        if isinstance(std, float):
            std = [std]
        self._std = np.asarray(std).astype("float32")
        self._preserve_dtype = preserve_dtype

    def __call__(self, data):
        img = data["img"]

        old_type = img.dtype
        # XXX: If `old_type` has higher precision than float32,
        # we will lose some precision.
        img = img.astype("float32", copy=False)
        img *= self._scale
        img -= self._mean
        img /= self._std
        if self._preserve_dtype:
            img = img.astype(old_type, copy=False)

        return {**data, "img": img}


class ToCHWImage(PyOnlyProcessor):
    def __call__(self, data):
        img = data["img"]

        img = img.transpose((2, 0, 1))

        return {**data, "img": img}


class BGR2RGB(PyOnlyProcessor):
    def __call__(self, data):
        img = data["img"]

        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        return {**data, "img": img}


class LaTeXOCRReisizeNormImg(PyOnlyProcessor):
    """for ocr image resize and normalization"""

    def __init__(self, rec_image_shape=(3, 48, 320)):
        super().__init__()
        self.rec_image_shape = rec_image_shape

    def pad_(self, img, divable=32):
        threshold = 128
        data = np.array(img.convert("LA"))
        if data[..., -1].var() == 0:
            data = (data[..., 0]).astype(np.uint8)
        else:
            data = (255 - data[..., -1]).astype(np.uint8)
        data = (data - data.min()) / (data.max() - data.min()) * 255
        if data.mean() > threshold:
            # To invert the text to white
            gray = 255 * (data < threshold).astype(np.uint8)
        else:
            gray = 255 * (data > threshold).astype(np.uint8)
            data = 255 - data

        coords = cv2.findNonZero(gray)  # Find all non-zero points (text)
        a, b, w, h = cv2.boundingRect(coords)  # Find minimum spanning bounding box
        rect = data[b : b + h, a : a + w]
        im = Image.fromarray(rect).convert("L")
        dims = []
        for x in [w, h]:
            div, mod = divmod(x, divable)
            dims.append(divable * (div + (1 if mod > 0 else 0)))
        padded = Image.new("L", dims, 255)
        padded.paste(im, (0, 0, im.size[0], im.size[1]))
        return padded

    def minmax_size_(
        self,
        img,
        max_dimensions,
        min_dimensions,
    ):
        if max_dimensions is not None:
            ratios = [a / b for a, b in zip(img.size, max_dimensions)]
            if any([r > 1 for r in ratios]):
                size = np.array(img.size) // max(ratios)
                img = img.resize(tuple(size.astype(int)), Image.BILINEAR)
        if min_dimensions is not None:
            # hypothesis: there is a dim in img smaller than min_dimensions, and return a proper dim >= min_dimensions
            padded_size = [
                max(img_dim, min_dim)
                for img_dim, min_dim in zip(img.size, min_dimensions)
            ]
            if padded_size != list(img.size):  # assert hypothesis
                padded_im = Image.new("L", padded_size, 255)
                padded_im.paste(img, img.getbbox())
                img = padded_im
        return img

    def norm_img_latexocr(self, img):
        # CAN only predict gray scale image
        shape = (1, 1, 3)
        mean = [0.7931, 0.7931, 0.7931]
        std = [0.1738, 0.1738, 0.1738]
        scale = np.float32(1.0 / 255.0)
        min_dimensions = [32, 32]
        max_dimensions = [672, 192]
        mean = np.array(mean).reshape(shape).astype("float32")
        std = np.array(std).reshape(shape).astype("float32")

        im_h, im_w = img.shape[:2]
        if (
            min_dimensions[0] <= im_w <= max_dimensions[0]
            and min_dimensions[1] <= im_h <= max_dimensions[1]
        ):
            pass
        else:
            img = Image.fromarray(np.uint8(img))
            img = self.minmax_size_(self.pad_(img), max_dimensions, min_dimensions)
            img = np.array(img)
            im_h, im_w = img.shape[:2]
            img = np.dstack([img, img, img])
        img = (img.astype("float32") * scale - mean) / std
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        divide_h = math.ceil(im_h / 16) * 16
        divide_w = math.ceil(im_w / 16) * 16
        img = np.pad(
            img, ((0, divide_h - im_h), (0, divide_w - im_w)), constant_values=(1, 1)
        )
        img = img[:, :, np.newaxis].transpose(2, 0, 1)
        img = img.astype("float32")
        return img

    def __call__(self, data):
        """apply"""
        img = data["img"]
        img = self.norm_img_latexocr(img)
        return {"img": img}