Răsfoiți Sursa

support xycut enhanced for region

zhouchangda 5 luni în urmă
părinte
comite
1781887f73

+ 859 - 0
paddlex/inference/pipelines/layout_parsing/layout_objects.py

@@ -0,0 +1,859 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, List, Union
+
+import numpy as np
+
+from .setting import BLOCK_LABEL_MAP, LINE_SETTINGS
+from .utils import (
+    caculate_euclidean_dist,
+    calculate_projection_overlap_ratio,
+    is_english_letter,
+    is_non_breaking_punctuation,
+    is_numeric,
+)
+
+__all__ = [
+    "TextSpan",
+    "TextLine",
+    "LayoutBlock",
+    "LayoutRegion",
+]
+
+
+class TextSpan(object):
+    """Text span class"""
+
+    def __init__(self, box, text, label):
+        """
+        Initialize a TextSpan object.
+
+        Args:
+            box (list): The bounding box of the text span.
+            text (str): The text content of the text span.
+            label (int): The label of the text span.
+        """
+        self.box = box
+        self.text = text
+        self.label = label
+
+    def __str__(self) -> str:
+        return f"{self.text}"
+
+    def __repr__(self) -> str:
+        return f"{self.text}"
+
+
+class TextLine(object):
+    """Text line class"""
+
+    def __init__(self, spans: List[TextSpan] = [], direction="horizontal"):
+        """
+        Initialize a TextLine object.
+
+        Args:
+            spans (List[TextSpan]): A list of TextSpan objects. Defaults to [].
+            direction (str): The direction of the text line. Defaults to "horizontal".
+        """
+        self.spans = spans
+        self.direction = direction
+        self.region_box = self.get_region_box()
+        self.need_new_line = False
+
+    @property
+    def labels(self):
+        return [span.label for span in self.spans]
+
+    @property
+    def boxes(self):
+        return [span.box for span in self.spans]
+
+    @property
+    def height(self):
+        start_idx = 1 if self.direction == "horizontal" else 0
+        end_idx = 3 if self.direction == "horizontal" else 2
+        return abs(self.region_box[end_idx] - self.region_box[start_idx])
+
+    @property
+    def width(self):
+        start_idx = 0 if self.direction == "horizontal" else 1
+        end_idx = 2 if self.direction == "horizontal" else 3
+        return abs(self.region_box[end_idx] - self.region_box[start_idx])
+
+    def __str__(self) -> str:
+        return f"{' '.join([str(span.text) for span in self.spans])}\n"
+
+    def __repr__(self) -> str:
+        return f"{' '.join([str(span.text) for span in self.spans])}\n"
+
+    def add_span(self, span: Union[TextSpan, List[TextSpan]]):
+        """
+        Add a span to the text line.
+
+        Args:
+            span (Union[TextSpan, List[TextSpan]]): A single TextSpan object or a list of TextSpan objects.
+        """
+        if isinstance(span, list):
+            self.spans.extend(span)
+        else:
+            self.spans.append(span)
+        self.region_box = self.get_region_box()
+
+    def get_region_box(self):
+        """
+        Get the region box of the text line.
+
+        Returns:
+            list: The region box of the text line.
+        """
+        if not self.spans:
+            return None  # or an empty list, or however you want to handle no spans
+
+        # Initialize min and max values with the first span's box
+        x_min, y_min, x_max, y_max = self.spans[0].box
+
+        for span in self.spans:
+            x_min = min(x_min, span.box[0])
+            y_min = min(y_min, span.box[1])
+            x_max = max(x_max, span.box[2])
+            y_max = max(y_max, span.box[3])
+
+        return [x_min, y_min, x_max, y_max]
+
+    def get_texts(
+        self,
+        block_label: str,
+        block_text_width: int,
+        block_start_coordinate: int,
+        block_stop_coordinate: int,
+        ori_image,
+        text_rec_model=None,
+        text_rec_score_thresh=None,
+    ):
+        """
+        Get the text of the text line.
+
+        The spans are sorted in place first. If the line contains a span
+        labelled "formula" and splitting by projection changes the number of
+        spans, each span labelled "text" is cropped from `ori_image` and
+        recognized again with `text_rec_model`; re-recognized spans whose score
+        is below `text_rec_score_thresh` are dropped. The resulting spans are
+        then formatted by `format_line`.
+
+        Args:
+            block_label (str): The label of the block.
+            block_text_width (int): The width of the block.
+            block_start_coordinate (int): The starting coordinate of the block.
+            block_stop_coordinate (int): The stopping coordinate of the block.
+            ori_image (np.ndarray): The original image.
+            text_rec_model (Any): The text recognition model.
+            text_rec_score_thresh (float): The text recognition score threshold.
+
+        Returns:
+            str: The text of the text line.
+        """
+        # Coordinate used as the primary sort key: x_min for horizontal lines,
+        # y_min for vertical ones.
+        span_box_start_index = 0 if self.direction == "horizontal" else 1
+        # Coordinate used as the tie-breaker: y_min (ascending) for horizontal
+        # lines, y_max (descending, via negation) for vertical ones.
+        lines_start_index = 1 if self.direction == "horizontal" else 3
+        self.spans.sort(
+            key=lambda span: (
+                # Floor division by 2 makes start coordinates in the same
+                # 2-unit bucket compare equal, so the tie-breaker decides.
+                span.box[span_box_start_index] // 2,
+                (
+                    span.box[lines_start_index]
+                    if self.direction == "horizontal"
+                    else -span.box[lines_start_index]
+                ),
+            )
+        )
+        if "formula" in self.labels:
+            sort_index = 0 if self.direction == "horizontal" else 1
+            splited_spans = self.split_boxes_by_projection()
+            # Only re-recognize when the split changed the number of spans.
+            if len(self.spans) != len(splited_spans):
+                splited_spans.sort(key=lambda span: span.box[sort_index])
+                new_spans = []
+                for span in splited_spans:
+                    bbox = span.box
+                    if span.label == "text":
+                        # Rows are sliced with the y range, columns with the x range.
+                        crop_img = ori_image[
+                            int(bbox[1]) : int(bbox[3]),
+                            int(bbox[0]) : int(bbox[2]),
+                        ]
+                        # NOTE(review): text_rec_model and text_rec_score_thresh
+                        # default to None but are used here without a guard;
+                        # confirm callers always supply both on this path.
+                        # The model is called with a one-element list and the
+                        # first result it yields is used.
+                        crop_img_rec_res = next(text_rec_model([crop_img]))
+                        crop_img_rec_score = crop_img_rec_res["rec_score"]
+                        crop_img_rec_text = crop_img_rec_res["rec_text"]
+                        # The span text is overwritten before the score check.
+                        span.text = crop_img_rec_text
+                        if crop_img_rec_score < text_rec_score_thresh:
+                            # Low-score text span: leave it out of the line.
+                            continue
+                    new_spans.append(span)
+                self.spans = new_spans
+        line_text = self.format_line(
+            block_text_width,
+            block_start_coordinate,
+            block_stop_coordinate,
+            # Gap limit scales with the line: 1.5 times its height.
+            line_gap_limit=self.height * 1.5,
+            block_label=block_label,
+        )
+        return line_text
+
+    def is_projection_contained(self, box_a, box_b, start_idx, end_idx):
+        """Check if box_a completely contains box_b in the x-direction."""
+        return box_a[start_idx] <= box_b[start_idx] and box_a[end_idx] >= box_b[end_idx]
+
+    def split_boxes_by_projection(self, offset=1e-5):
+        """
+        Split spans whose projection on the line axis contains another span's projection.
+
+        The axis is x (indices 0 and 2) for horizontal lines and y (indices 1
+        and 3) otherwise. For every span, each span at the same or a later
+        position is examined; when the current span's interval contains the
+        other one, the part before the other span may be emitted as a new span
+        and the part after it may become the current span.
+
+        Args:
+            offset (float): Small gap left between a split piece and the box it was split around.
+        Returns:
+            A new list of spans, including split pieces, which keep the `text` and `label` of the span they came from.
+        """
+
+        new_spans = []
+        if self.direction == "horizontal":
+            projection_start_index, projection_end_index = 0, 2
+        else:
+            projection_start_index, projection_end_index = 1, 3
+
+        for i in range(len(self.spans)):
+            span = self.spans[i]
+            is_split = False
+            # NOTE(review): the inner loop starts at j == i, so every span is
+            # also compared with itself, and containment uses <= / >=. That
+            # looks like it makes `is_split` true for every span and the
+            # `if not is_split` branch below unreachable. Confirm whether
+            # `i + 1` was intended.
+            for j in range(i, len(self.spans)):
+                box_b = self.spans[j].box
+                # `span` may have been replaced by a remainder piece in an
+                # earlier iteration, so its fields are re-read every time.
+                box_a, text, label = span.box, span.text, span.label
+                if self.is_projection_contained(
+                    box_a, box_b, projection_start_index, projection_end_index
+                ):
+                    is_split = True
+                    # Split box_a based on the x-coordinates of box_b
+                    if box_a[projection_start_index] < box_b[projection_start_index]:
+                        # Length of the piece of box_a that lies before box_b.
+                        w = (
+                            box_b[projection_start_index]
+                            - offset
+                            - box_a[projection_start_index]
+                        )
+                        # Pieces no longer than one unit are discarded.
+                        if w > 1:
+                            new_bbox = box_a.copy()
+                            new_bbox[projection_end_index] = (
+                                box_b[projection_start_index] - offset
+                            )
+                            new_spans.append(
+                                TextSpan(
+                                    box=np.array(new_bbox),
+                                    text=text,
+                                    label=label,
+                                )
+                            )
+                    if box_a[projection_end_index] > box_b[projection_end_index]:
+                        # Length of the piece of box_a that lies after box_b.
+                        w = (
+                            box_a[projection_end_index]
+                            - box_b[projection_end_index]
+                            + offset
+                        )
+                        if w > 1:
+                            # NOTE(review): this writes into box_a itself, i.e.
+                            # the box object of the current span, before a new
+                            # span is built from it. Confirm the in-place
+                            # change of the original span's box is intended.
+                            box_a[projection_start_index] = (
+                                box_b[projection_end_index] + offset
+                            )
+                            # The remainder becomes the span that is compared
+                            # against the following boxes.
+                            span = TextSpan(
+                                box=np.array(box_a),
+                                text=text,
+                                label=label,
+                            )
+                # On the last inner iteration, emit whatever `span` now refers to.
+                if j == len(self.spans) - 1 and is_split:
+                    new_spans.append(span)
+            if not is_split:
+                new_spans.append(span)
+
+        return new_spans
+
+    def format_line(
+        self,
+        block_text_width: int,
+        block_start_coordinate: int,
+        block_stop_coordinate: int,
+        line_gap_limit: int = 10,
+        block_label: str = "text",
+    ) -> str:
+        """
+        Format a line of text spans based on layout constraints.
+
+        Concatenates the span texts (wrapping formula spans in "$" when the
+        block itself is not a formula), then decides from the last character
+        and from the gap between the last span and the block end whether a
+        trailing space or newline is appended. Sets `self.need_new_line` to
+        True when a line break is judged necessary. Formula span texts may be
+        modified in place.
+
+        Args:
+            block_text_width (int): The width of the block.
+            block_start_coordinate (int): The starting coordinate of the block.
+            block_stop_coordinate (int): The stopping coordinate of the block.
+            line_gap_limit (int): The limit for the number of pixels after the last span that should be considered part of the last line. Default is 10.
+            block_label (str): The label associated with the entire block. Default is 'text'.
+        Returns:
+            str: Formatted line of text.
+        Raises:
+            IndexError: If the line has no spans (the first and last span are read unconditionally).
+        """
+        first_span_box = self.spans[0].box
+        last_span_box = self.spans[-1].box
+
+        line_text = ""
+        for span in self.spans:
+            if span.label == "formula" and block_label != "formula":
+                formula_rec = span.text
+                # Wrap only when the text neither starts nor ends with "$".
+                if not formula_rec.startswith("$") and not formula_rec.endswith("$"):
+                    if len(self.spans) > 1:
+                        # Formula sharing the line with other spans.
+                        span.text = f"${span.text}$"
+                    else:
+                        # Formula alone on the line: also prefixed with a newline.
+                        span.text = f"\n${span.text}$"
+            line_text += span.text
+            # Parsed as `(A and B) or C`: a space follows a non-empty span that
+            # leaves an English letter at the end, and every formula span.
+            if (
+                len(span.text) > 0
+                and is_english_letter(line_text[-1])
+                or span.label == "formula"
+            ):
+                line_text += " "
+
+        # Index of the end coordinate along the line: x_max or y_max.
+        if self.direction == "horizontal":
+            text_stop_index = 2
+        else:
+            text_stop_index = 3
+
+        # Drop a single trailing space added by the loop above.
+        if line_text.endswith(" "):
+            line_text = line_text[:-1]
+
+        if len(line_text) == 0:
+            return ""
+
+        last_char = line_text[-1]
+
+        # First decision: flag the line as needing a break. The outer test
+        # passes when the last character is not a letter, non-breaking
+        # punctuation or digit, or when the gap to the block end exceeds 30% of
+        # the block text width.
+        if (
+            not is_english_letter(last_char)
+            and not is_non_breaking_punctuation(last_char)
+            and not is_numeric(last_char)
+        ) or (
+            block_stop_coordinate - last_span_box[text_stop_index]
+            > block_text_width * 0.3
+        ):
+            # Horizontal: the trailing gap must exceed line_gap_limit.
+            # Vertical: either the trailing gap or the gap between the block
+            # start and the first span's box[1] must exceed it.
+            if (
+                self.direction == "horizontal"
+                and block_stop_coordinate - last_span_box[text_stop_index]
+                > line_gap_limit
+            ) or (
+                self.direction == "vertical"
+                and (
+                    block_stop_coordinate - last_span_box[text_stop_index]
+                    > line_gap_limit
+                    or first_span_box[1] - block_start_coordinate > line_gap_limit
+                )
+            ):
+                self.need_new_line = True
+
+        # A trailing "-" is removed and the text returned with nothing appended
+        # (presumably so a hyphenated word joins the next line; confirm).
+        if line_text.endswith("-"):
+            line_text = line_text[:-1]
+            return line_text
+
+        # `last_char` below still refers to the character computed above, even
+        # after a space is appended here.
+        if (len(line_text) > 0 and is_english_letter(last_char)) or line_text.endswith(
+            "$"
+        ):
+            line_text += " "
+        # Second decision: append a newline. For lines ending in something
+        # other than a letter or digit (and for every vertical line), break
+        # when the trailing gap exceeds 30% of the block text width and the
+        # last character is not non-breaking punctuation.
+        if (
+            len(line_text) > 0
+            and not is_english_letter(last_char)
+            and not is_numeric(last_char)
+        ) or self.direction == "vertical":
+            if (
+                block_stop_coordinate - last_span_box[text_stop_index]
+                > block_text_width * 0.3
+                and len(line_text) > 0
+                and not is_non_breaking_punctuation(last_char)
+            ):
+                line_text += "\n"
+                self.need_new_line = True
+        # Otherwise (horizontal line ending in a letter or digit), break only
+        # when the trailing gap exceeds half of the block extent.
+        elif (
+            block_stop_coordinate - last_span_box[text_stop_index]
+            > (block_stop_coordinate - block_start_coordinate) * 0.5
+        ):
+            line_text += "\n"
+            self.need_new_line = True
+
+        return line_text
+
+
+class LayoutBlock(object):
+    """Layout Block Class"""
+
+    def __init__(self, label, bbox, content="") -> None:
+        """
+        Initialize a LayoutBlock object.
+
+        Args:
+            label (str): Label assigned to the block.
+            bbox (list): Bounding box coordinates of the block.
+            content (str, optional): Content of the block. Defaults to an empty string.
+        """
+        self.label = label
+        self.order_label = None
+        self.bbox = list(map(int, bbox))
+        self.content = content
+        self.seg_start_coordinate = float("inf")
+        self.seg_end_coordinate = float("-inf")
+        self.width = bbox[2] - bbox[0]
+        self.height = bbox[3] - bbox[1]
+        self.area = self.width * self.height
+        self.num_of_lines = 1
+        self.image = None
+        self.index = None
+        self.order_index = None
+        self.text_line_width = 1
+        self.text_line_height = 1
+        self.child_blocks = []
+        self.update_direction()
+
+    def __str__(self) -> str:
+        _str = f"\n\n#################\nindex:\t{self.index}\nlabel:\t{self.label}\nregion_label:\t{self.order_label}\nbbox:\t{self.bbox}\ncontent:\t{self.content}\n#################"
+        return _str
+
+    def __repr__(self) -> str:
+        _str = f"\n\n#################\nindex:\t{self.index}\nlabel:\t{self.label}\nregion_label:\t{self.order_label}\nbbox:\t{self.bbox}\ncontent:\t{self.content}\n#################"
+        return _str
+
+    def to_dict(self) -> dict:
+        return self.__dict__
+
+    def update_direction(self, direction=None) -> None:
+        """
+        Update the direction of the block based on its bounding box.
+
+        Args:
+            direction (str, optional): Direction of the block. If not provided, it will be determined automatically using the bounding box. Defaults to None.
+        """
+        if not direction:
+            direction = self.get_bbox_direction()
+        self.direction = direction
+        self.update_direction_info()
+
+    def update_direction_info(self) -> None:
+        """Update the direction information of the block based on its direction."""
+        if self.direction == "horizontal":
+            self.secondary_direction = "vertical"
+            self.short_side_length = self.height
+            self.long_side_length = self.width
+            self.start_coordinate = self.bbox[0]
+            self.end_coordinate = self.bbox[2]
+            self.secondary_direction_start_coordinate = self.bbox[1]
+            self.secondary_direction_end_coordinate = self.bbox[3]
+        else:
+            self.secondary_direction = "horizontal"
+            self.short_side_length = self.width
+            self.long_side_length = self.height
+            self.start_coordinate = self.bbox[1]
+            self.end_coordinate = self.bbox[3]
+            self.secondary_direction_start_coordinate = self.bbox[0]
+            self.secondary_direction_end_coordinate = self.bbox[2]
+
+    def append_child_block(self, child_block) -> None:
+        """
+        Append a child block to the current block.
+
+        Args:
+            child_block (LayoutBlock): Child block to be added.
+        Returns:
+            None
+        """
+        if not self.child_blocks:
+            self.ori_bbox = self.bbox.copy()
+        x1, y1, x2, y2 = self.bbox
+        x1_child, y1_child, x2_child, y2_child = child_block.bbox
+        union_bbox = (
+            min(x1, x1_child),
+            min(y1, y1_child),
+            max(x2, x2_child),
+            max(y2, y2_child),
+        )
+        self.bbox = union_bbox
+        self.update_direction_info()
+        child_blocks = [child_block]
+        if child_block.child_blocks:
+            child_blocks.extend(child_block.get_child_blocks())
+        self.child_blocks.extend(child_blocks)
+
+    def get_child_blocks(self) -> list:
+        """Get all child blocks of the current block."""
+        self.bbox = self.ori_bbox
+        child_blocks = self.child_blocks.copy()
+        self.child_blocks = []
+        return child_blocks
+
+    def get_centroid(self) -> tuple:
+        """Get the centroid of the bounding box of the block."""
+        x1, y1, x2, y2 = self.bbox
+        centroid = ((x1 + x2) / 2, (y1 + y2) / 2)
+        return centroid
+
+    def get_bbox_direction(self, direction_ratio: float = 1.0) -> str:
+        """
+        Determine if a bounding box is horizontal or vertical.
+
+        Args:
+            direction_ratio (float): Ratio for determining direction. Default is 1.0.
+
+        Returns:
+            str: "horizontal" or "vertical".
+        """
+        return (
+            "horizontal" if self.width * direction_ratio >= self.height else "vertical"
+        )
+
+    def calculate_text_line_direction(
+        self, bboxes: List[List[int]], direction_ratio: float = 1.5
+    ) -> bool:
+        """
+        Calculate the direction of the text based on the bounding boxes.
+
+        Args:
+            bboxes (list): A list of bounding boxes.
+            direction_ratio (float): Ratio for determining direction. Default is 1.5.
+
+        Returns:
+            str: "horizontal" or "vertical".
+        """
+
+        horizontal_box_num = 0
+        for bbox in bboxes:
+            if len(bbox) != 4:
+                raise ValueError(
+                    "Invalid bounding box format. Expected a list of length 4."
+                )
+            x1, y1, x2, y2 = bbox
+            width = x2 - x1
+            height = y2 - y1
+            horizontal_box_num += 1 if width * direction_ratio >= height else 0
+
+        return "horizontal" if horizontal_box_num >= len(bboxes) * 0.5 else "vertical"
+
+    def group_boxes_into_lines(
+        self, ocr_rec_res, line_height_iou_threshold
+    ) -> List[TextLine]:
+        """
+        Group the bounding boxes into lines based on their direction.
+
+        Also updates the block: its direction is set from the boxes labelled
+        "text", and `text_line_height` / `text_line_width` are set to the mean
+        height / width of the resulting lines (left unchanged when there are
+        no spans at all).
+
+        Args:
+            ocr_rec_res (dict): The result of OCR recognition; the keys
+                "boxes", "rec_texts" and "rec_labels" are read.
+            line_height_iou_threshold (float): The minimum IOU value required for two spans to belong to the same line.
+
+        Returns:
+            list: A list of TextLines.
+        """
+        rec_boxes = ocr_rec_res["boxes"]
+        rec_texts = ocr_rec_res["rec_texts"]
+        rec_labels = ocr_rec_res["rec_labels"]
+
+        # Only boxes labelled "text" vote on the block direction.
+        text_boxes = [
+            rec_boxes[i] for i in range(len(rec_boxes)) if rec_labels[i] == "text"
+        ]
+        direction = self.calculate_text_line_direction(text_boxes)
+        self.update_direction(direction)
+
+        spans = [TextSpan(*span) for span in zip(rec_boxes, rec_texts, rec_labels)]
+
+        if not spans:
+            return []
+
+        # sort spans by direction
+        if self.direction == "vertical":
+            # Largest x_min first.
+            spans.sort(
+                key=lambda span: span.box[0], reverse=True
+            )  # sort by x coordinate
+            match_direction = "horizontal"
+        else:
+            spans.sort(
+                key=lambda span: span.box[1], reverse=False
+            )  # sort by y coordinate
+            match_direction = "vertical"
+
+        lines = []
+        current_line = TextLine([spans[0]], direction=self.direction)
+
+        # Greedy grouping in sorted order: a span joins the current line when
+        # its overlap ratio with the line's region box reaches the threshold;
+        # otherwise the current line is closed and a new one is started.
+        for span in spans[1:]:
+            # The overlap is measured along the axis opposite to the block
+            # direction (exact semantics are defined by the imported helper).
+            overlap_ratio = calculate_projection_overlap_ratio(
+                current_line.region_box, span.box, match_direction, mode="small"
+            )
+
+            if overlap_ratio >= line_height_iou_threshold:
+                current_line.add_span(span)
+            else:
+                lines.append(current_line)
+                current_line = TextLine([span], direction=self.direction)
+
+        lines.append(current_line)
+
+        if lines and self.direction == "vertical":
+            line_heights = np.array([line.height for line in lines])
+            min_height = np.min(line_heights)
+            max_height = np.max(line_heights)
+
+            # if height is too large, filter out the line
+            if max_height > min_height * 2:
+                normal_height_threshold = min_height * 1.1
+                normal_height_count = np.sum(line_heights < normal_height_threshold)
+
+                # if the number of lines with height less than the threshold is less than 40%, then filter out the line
+                if normal_height_count < len(lines) * 0.4:
+                    # Keep only the lines whose height is within 110% of the smallest one.
+                    keep_condition = line_heights <= normal_height_threshold
+                    lines = [line for line, keep in zip(lines, keep_condition) if keep]
+
+        # calculate the average height of the text line
+        if lines:
+            line_heights = [line.height for line in lines]
+            line_widths = [line.width for line in lines]
+            self.text_line_height = np.mean(line_heights)
+            self.text_line_width = np.mean(line_widths)
+        else:
+            self.text_line_height = 0
+            self.text_line_width = 0
+
+        return lines
+
+    def update_text_content(
+        self,
+        image: list,
+        ocr_rec_res: dict,
+        text_rec_model: Any,
+        text_rec_score_thresh: Union[float, None] = None,
+    ) -> None:
+        """
+        Update the text content of the block based on the OCR result.
+
+        Groups the OCR spans into lines, formats every line, and joins the
+        line texts into `self.content`. Also sets `self.num_of_lines` and
+        the segment start / end coordinates.
+
+        Args:
+            image (list): The input image.
+            ocr_rec_res (dict): The result of OCR recognition.
+            text_rec_model (Any): The model used for text recognition.
+            text_rec_score_thresh (Union[float, None]): The score threshold for text recognition. If None, use the default setting.
+                NOTE(review): this method passes the value on unchanged and
+                substitutes no default itself; confirm where the default
+                is applied.
+
+        Returns:
+            None
+        """
+
+        if len(ocr_rec_res["rec_texts"]) == 0:
+            self.content = ""
+            return
+
+        lines = self.group_boxes_into_lines(
+            ocr_rec_res,
+            LINE_SETTINGS.get("line_height_iou_threshold", 0.8),
+        )
+
+        # words start coordinate and stop coordinate in the line
+        coord_start_idx = 0 if self.direction == "horizontal" else 1
+        coord_end_idx = coord_start_idx + 2
+
+        if self.label == "reference":
+            # For "reference" blocks the extent is taken from the OCR boxes
+            # rather than from the block's own bbox.
+            rec_boxes = ocr_rec_res["boxes"]
+            block_start = min([box[coord_start_idx] for box in rec_boxes])
+            block_stop = max([box[coord_end_idx] for box in rec_boxes])
+        else:
+            block_start = self.bbox[coord_start_idx]
+            block_stop = self.bbox[coord_end_idx]
+
+        text_lines = []
+        text_width_list = []
+        need_new_line_num = 0
+
+        for line_idx, line in enumerate(lines):
+            line: TextLine = line
+            text_width_list.append(line.width)
+            # get text from line
+            line_text = line.get_texts(
+                block_label=self.label,
+                # Running maximum: the widest line seen so far, this one included.
+                block_text_width=max(text_width_list),
+                block_start_coordinate=block_start,
+                block_stop_coordinate=block_stop,
+                ori_image=image,
+                text_rec_model=text_rec_model,
+                text_rec_score_thresh=text_rec_score_thresh,
+            )
+
+            if line.need_new_line:
+                need_new_line_num += 1
+
+            # set segment start and end coordinate
+            # NOTE(review): because of the `elif`, a block with a single line
+            # never updates seg_end_coordinate. Both values also use the x
+            # coordinates (box[0] / box[2]) regardless of direction. Confirm
+            # both are intended.
+            if line_idx == 0:
+                self.seg_start_coordinate = line.spans[0].box[0]
+            elif line_idx == len(lines) - 1:
+                self.seg_end_coordinate = line.spans[-1].box[2]
+
+            text_lines.append(line_text)
+
+        delim = LINE_SETTINGS["delimiter_map"].get(self.label, "")
+
+        if delim == "":
+            # No delimiter configured for this label: concatenate the lines and
+            # insert paragraph breaks heuristically.
+            content = ""
+            # True once some earlier line stopped well before the block end.
+            pre_line_end = False
+            last_char = ""
+            for idx, line_text in enumerate(text_lines):
+                if len(line_text) == 0:
+                    continue
+
+                line: TextLine = lines[idx]
+                if pre_line_end:
+                    # Gap between the block start and the start of this line.
+                    start_gep_len = line.region_box[coord_start_idx] - block_start
+                    # Prefix a newline when the line starts with a noticeable
+                    # gap (more than 1.5 line heights, and the previous line
+                    # did not end in a letter or digit) or a very large one
+                    # (more than 40% of the block extent), unless the content
+                    # already ends with a newline.
+                    if (
+                        (
+                            start_gep_len > line.height * 1.5
+                            and not is_english_letter(last_char)
+                            and not is_numeric(last_char)
+                        )
+                        or start_gep_len > (block_stop - block_start) * 0.4
+                    ) and not content.endswith("\n"):
+                        line_text = "\n" + line_text
+                content += f"{line_text}"
+
+                # Last character of the line, skipping one trailing space when
+                # the line is longer than two characters.
+                if len(line_text) > 2 and line_text.endswith(" "):
+                    last_char = line_text[-2]
+                else:
+                    last_char = line_text[-1]
+                # Append a newline when more than 60% of the lines asked for a
+                # break, or when more than 50% did and this line ends in
+                # something other than a letter, non-breaking punctuation,
+                # digit or newline.
+                if (
+                    len(line_text) > 0
+                    and not line_text.endswith("\n")
+                    and not is_english_letter(last_char)
+                    and not is_non_breaking_punctuation(last_char)
+                    and not is_numeric(last_char)
+                    and need_new_line_num > len(text_lines) * 0.5
+                ) or need_new_line_num > len(text_lines) * 0.6:
+                    content += f"\n"
+                # The flag is never reset once set.
+                if (
+                    block_stop - line.region_box[coord_end_idx]
+                    > (block_stop - block_start) * 0.3
+                ):
+                    pre_line_end = True
+        else:
+            content = delim.join(text_lines)
+
+        self.content = content
+        self.num_of_lines = len(text_lines)
+
+
+class LayoutRegion(LayoutBlock):
+    """LayoutRegion class"""
+
+    def __init__(
+        self,
+        bbox,
+        blocks: List[LayoutBlock] = [],
+    ) -> None:
+        """
+        Initialize a LayoutRegion object.
+
+        Args:
+            bbox (List[int]): The bounding box of the region.
+            blocks (List[LayoutBlock]): A list of blocks that belong to this region.
+        """
+        super().__init__("region", bbox, content="")
+        self.bbox = bbox
+        self.block_map = {}
+        self.direction = "horizontal"
+        self.doc_title_block_idxes = []
+        self.paragraph_title_block_idxes = []
+        self.vision_block_idxes = []
+        self.unordered_block_idxes = []
+        self.vision_title_block_idxes = []
+        self.normal_text_block_idxes = []
+        self.euclidean_distance = float(np.inf)
+        self.header_block_idxes = []
+        self.footer_block_idxes = []
+        self.text_line_width = 20
+        self.text_line_height = 10
+        self.num_of_lines = 10
+        self.init_region_info_from_layout(blocks)
+        self.update_euclidean_distance()
+
+    def init_region_info_from_layout(self, blocks: List[LayoutBlock]) -> None:
+        """Initialize the information about the layout region from the given blocks.
+
+        Args:
+            blocks (List[LayoutBlock]): A list of blocks that belong to this region.
+        Returns:
+            None
+        """
+        horizontal_normal_text_block_num = 0
+        text_line_height_list = []
+        text_line_width_list = []
+        for idx, block in enumerate(blocks):
+            self.block_map[idx] = block
+            block.index = idx
+            if block.label in BLOCK_LABEL_MAP["header_labels"]:
+                self.header_block_idxes.append(idx)
+            elif block.label in BLOCK_LABEL_MAP["doc_title_labels"]:
+                self.doc_title_block_idxes.append(idx)
+            elif block.label in BLOCK_LABEL_MAP["paragraph_title_labels"]:
+                self.paragraph_title_block_idxes.append(idx)
+            elif block.label in BLOCK_LABEL_MAP["vision_labels"]:
+                self.vision_block_idxes.append(idx)
+            elif block.label in BLOCK_LABEL_MAP["vision_title_labels"]:
+                self.vision_title_block_idxes.append(idx)
+            elif block.label in BLOCK_LABEL_MAP["footer_labels"]:
+                self.footer_block_idxes.append(idx)
+            elif block.label in BLOCK_LABEL_MAP["unordered_labels"]:
+                self.unordered_block_idxes.append(idx)
+            else:
+                self.normal_text_block_idxes.append(idx)
+                text_line_height_list.append(block.text_line_height)
+                text_line_width_list.append(block.text_line_width)
+                if block.direction == "horizontal":
+                    horizontal_normal_text_block_num += 1
+        direction = (
+            "horizontal"
+            if horizontal_normal_text_block_num
+            >= len(self.normal_text_block_idxes) * 0.5
+            else "vertical"
+        )
+        self.update_direction(direction)
+        self.text_line_width = (
+            np.mean(text_line_width_list) if text_line_width_list else 20
+        )
+        self.text_line_height = (
+            np.mean(text_line_height_list) if text_line_height_list else 10
+        )
+
+    def update_euclidean_distance(self):
+        """Update euclidean distance between each block and the reference point"""
+        blocks: List[LayoutBlock] = list(self.block_map.values())
+        if self.direction == "horizontal":
+            ref_point = (0, 0)
+            block_distance = [
+                caculate_euclidean_dist((block.bbox[0], block.bbox[1]), ref_point)
+                for block in blocks
+            ]
+        else:
+            ref_point = (self.bbox[2], 0)
+            block_distance = [
+                caculate_euclidean_dist((block.bbox[2], block.bbox[1]), ref_point)
+                for block in blocks
+            ]
+        self.euclidean_distance = min(block_distance)
+
+    def update_direction(self, direction=None):
+        """
+        Update the direction of the layout region.
+
+        Args:
+            direction (str): The new direction of the layout region.
+        """
+        super().update_direction(direction=direction)
+        if self.direction == "horizontal":
+            self.direction_start_index = 0
+            self.direction_end_index = 2
+            self.secondary_direction_start_index = 1
+            self.secondary_direction_end_index = 3
+            self.secondary_direction = "vertical"
+        else:
+            self.direction_start_index = 1
+            self.direction_end_index = 3
+            self.secondary_direction_start_index = 0
+            self.secondary_direction_end_index = 2
+            self.secondary_direction = "horizontal"
+
+        self.direction_center_coordinate = (
+            self.bbox[self.direction_start_index] + self.bbox[self.direction_end_index]
+        ) / 2
+        self.secondary_direction_center_coordinate = (
+            self.bbox[self.secondary_direction_start_index]
+            + self.bbox[self.secondary_direction_end_index]
+        ) / 2

+ 132 - 198
paddlex/inference/pipelines/layout_parsing/pipeline_v2.py

@@ -30,23 +30,22 @@ from ...utils.pp_option import PaddlePredictorOption
 from .._parallel import AutoParallelImageSimpleInferencePipeline
 from ..base import BasePipeline
 from ..ocr.result import OCRResult
-from .result_v2 import LayoutParsingBlock, LayoutParsingRegion, LayoutParsingResultV2
-from .setting import BLOCK_LABEL_MAP, BLOCK_SETTINGS, LINE_SETTINGS, REGION_SETTINGS
+from .layout_objects import LayoutBlock, LayoutRegion
+from .result_v2 import LayoutParsingResultV2
+from .setting import BLOCK_LABEL_MAP, BLOCK_SETTINGS, REGION_SETTINGS
 from .utils import (
     caculate_bbox_area,
     calculate_minimum_enclosing_bbox,
     calculate_overlap_ratio,
     convert_formula_res_to_ocr_format,
-    format_line,
     gather_imgs,
     get_bbox_intersection,
     get_sub_regions_ocr_res,
-    group_boxes_into_lines,
     remove_overlap_blocks,
     shrink_supplement_region_bbox,
-    split_boxes_by_projection,
     update_region_box,
 )
+from .xycut_enhanced import xycut_enhanced
 
 
 class _LayoutParsingPipelineV2(BasePipeline):
@@ -485,6 +484,11 @@ class _LayoutParsingPipelineV2(BasePipeline):
                 )
                 block_to_ocr_map[idx] = [idx]
 
+        mask_labels = (
+            BLOCK_LABEL_MAP.get("unordered_labels", [])
+            + BLOCK_LABEL_MAP.get("header_labels", [])
+            + BLOCK_LABEL_MAP.get("footer_labels", [])
+        )
         block_bboxes = [box["coordinate"] for box in layout_det_res["boxes"]]
         region_det_res["boxes"] = sorted(
             region_det_res["boxes"],
@@ -507,58 +511,117 @@ class _LayoutParsingPipelineV2(BasePipeline):
                 region_to_block_map[region_idx] = []
                 region_bbox = region_info["coordinate"]
                 for block_idx in block_idxes_set:
+                    if layout_det_res["boxes"][block_idx]["label"] in mask_labels:
+                        continue
                     overlap_ratio = calculate_overlap_ratio(
                         region_bbox, block_bboxes[block_idx], mode="small"
                     )
                     if overlap_ratio > REGION_SETTINGS.get(
                         "match_block_overlap_ratio_threshold", 0.8
                     ):
-                        region_to_block_map[region_idx].append(block_idx)
                         matched_idxes.append(block_idx)
+                old_region_bbox_matched_idxes = []
                 if len(matched_idxes) > 0:
+                    while len(old_region_bbox_matched_idxes) != len(matched_idxes):
+                        old_region_bbox_matched_idxes = copy.deepcopy(matched_idxes)
+                        matched_idxes = []
+                        matched_bboxes = [
+                            block_bboxes[idx] for idx in old_region_bbox_matched_idxes
+                        ]
+                        new_region_bbox = calculate_minimum_enclosing_bbox(
+                            matched_bboxes
+                        )
+                        for block_idx in block_idxes_set:
+                            if (
+                                layout_det_res["boxes"][block_idx]["label"]
+                                in mask_labels
+                            ):
+                                continue
+                            overlap_ratio = calculate_overlap_ratio(
+                                new_region_bbox, block_bboxes[block_idx], mode="small"
+                            )
+                            if overlap_ratio > REGION_SETTINGS.get(
+                                "match_block_overlap_ratio_threshold", 0.8
+                            ):
+                                matched_idxes.append(block_idx)
                     for block_idx in matched_idxes:
                         block_idxes_set.remove(block_idx)
-                    matched_bboxes = [block_bboxes[idx] for idx in matched_idxes]
-                    new_region_bbox = calculate_minimum_enclosing_bbox(matched_bboxes)
+                    region_to_block_map[region_idx] = matched_idxes
                     region_det_res["boxes"][region_idx]["coordinate"] = new_region_bbox
             # Supplement region when there is no matched block
-            if len(block_idxes_set) > 0:
-                while len(block_idxes_set) > 0:
-                    matched_idxes = []
-                    unmatched_bboxes = [block_bboxes[idx] for idx in block_idxes_set]
-                    supplement_region_bbox = calculate_minimum_enclosing_bbox(
-                        unmatched_bboxes
+            while len(block_idxes_set) > 0:
+                unmatched_bboxes = [block_bboxes[idx] for idx in block_idxes_set]
+                if len(unmatched_bboxes) == 0:
+                    break
+                supplement_region_bbox = calculate_minimum_enclosing_bbox(
+                    unmatched_bboxes
+                )
+                matched_idxes = []
+                # check if the new region bbox is overlapped with other region bbox, if have, then shrink the new region bbox
+                for region_idx, region_info in enumerate(region_det_res["boxes"]):
+                    if len(region_to_block_map[region_idx]) == 0:
+                        continue
+                    region_bbox = region_info["coordinate"]
+                    overlap_ratio = calculate_overlap_ratio(
+                        supplement_region_bbox, region_bbox
                     )
-                    # check if the new region bbox is overlapped with other region bbox, if have, then shrink the new region bbox
-                    for region_info in region_det_res["boxes"]:
-                        region_bbox = region_info["coordinate"]
-                        overlap_ratio = calculate_overlap_ratio(
-                            supplement_region_bbox, region_bbox
-                        )
-                        if overlap_ratio > 0:
-                            supplement_region_bbox, matched_idxes = (
-                                shrink_supplement_region_bbox(
-                                    supplement_region_bbox,
-                                    region_bbox,
-                                    image.shape[1],
-                                    image.shape[0],
-                                    block_idxes_set,
-                                    block_bboxes,
-                                )
+                    if overlap_ratio > 0:
+                        supplement_region_bbox, matched_idxes = (
+                            shrink_supplement_region_bbox(
+                                supplement_region_bbox,
+                                region_bbox,
+                                image.shape[1],
+                                image.shape[0],
+                                block_idxes_set,
+                                block_bboxes,
                             )
+                        )
+
+                matched_idxes = [
+                    idx
+                    for idx in matched_idxes
+                    if layout_det_res["boxes"][idx]["label"] not in mask_labels
+                ]
+                if len(matched_idxes) == 0:
+                    matched_idxes = [
+                        idx
+                        for idx in block_idxes_set
+                        if layout_det_res["boxes"][idx]["label"] not in mask_labels
+                    ]
                     if len(matched_idxes) == 0:
-                        matched_idxes = list(block_idxes_set)
-                    region_idx = len(region_det_res["boxes"])
-                    region_to_block_map[region_idx] = list(matched_idxes)
-                    for block_idx in matched_idxes:
-                        block_idxes_set.remove(block_idx)
-                    region_det_res["boxes"].append(
-                        {
-                            "coordinate": supplement_region_bbox,
-                            "label": "SupplementaryRegion",
-                            "score": 1,
-                        }
-                    )
+                        break
+                matched_bboxes = [block_bboxes[idx] for idx in matched_idxes]
+                supplement_region_bbox = calculate_minimum_enclosing_bbox(
+                    matched_bboxes
+                )
+                region_idx = len(region_det_res["boxes"])
+                region_to_block_map[region_idx] = list(matched_idxes)
+                for block_idx in matched_idxes:
+                    block_idxes_set.remove(block_idx)
+                region_det_res["boxes"].append(
+                    {
+                        "coordinate": supplement_region_bbox,
+                        "label": "SupplementaryRegion",
+                        "score": 1,
+                    }
+                )
+
+            mask_idxes = [
+                idx
+                for idx in range(len(layout_det_res["boxes"]))
+                if layout_det_res["boxes"][idx]["label"] in mask_labels
+            ]
+            for idx in mask_idxes:
+                bbox = layout_det_res["boxes"][idx]["coordinate"]
+                region_idx = len(region_det_res["boxes"])
+                region_to_block_map[region_idx] = [idx]
+                region_det_res["boxes"].append(
+                    {
+                        "coordinate": bbox,
+                        "label": "SupplementaryRegion",
+                        "score": 1,
+                    }
+                )
 
         region_block_ocr_idx_map = dict(
             region_to_block_map=region_to_block_map,
@@ -567,142 +630,7 @@ class _LayoutParsingPipelineV2(BasePipeline):
 
         return region_block_ocr_idx_map, region_det_res, layout_det_res
 
-    def sort_line_by_projection(
-        self,
-        line: List[List[Union[List[int], str]]],
-        input_img: np.ndarray,
-        text_rec_model: Any,
-        text_rec_score_thresh: Union[float, None] = None,
-        direction: str = "vertical",
-    ) -> None:
-        """
-        Sort a line of text spans based on their vertical position within the layout bounding box.
-
-        Args:
-            line (list): A list of spans, where each span is a list containing a bounding box and text.
-            input_img (ndarray): The input image used for OCR.
-            general_ocr_pipeline (Any): The general OCR pipeline used for text recognition.
-
-        Returns:
-            list: The sorted line of text spans.
-        """
-        sort_index = 0 if direction == "horizontal" else 1
-        splited_boxes = split_boxes_by_projection(line, direction)
-        splited_lines = []
-        if len(line) != len(splited_boxes):
-            splited_boxes.sort(key=lambda span: span[0][sort_index])
-            for span in splited_boxes:
-                bbox, text, label = span
-                if label == "text":
-                    crop_img = input_img[
-                        int(bbox[1]) : int(bbox[3]),
-                        int(bbox[0]) : int(bbox[2]),
-                    ]
-                    crop_img_rec_res = list(text_rec_model([crop_img]))[0]
-                    crop_img_rec_score = crop_img_rec_res["rec_score"]
-                    crop_img_rec_text = crop_img_rec_res["rec_text"]
-                    text = (
-                        crop_img_rec_text
-                        if crop_img_rec_score >= text_rec_score_thresh
-                        else ""
-                    )
-                    span[1] = text
-
-                splited_lines.append(span)
-        else:
-            splited_lines = line
-
-        return splited_lines
-
-    def get_block_rec_content(
-        self,
-        image: list,
-        ocr_rec_res: dict,
-        block: LayoutParsingBlock,
-        text_rec_model: Any,
-        text_rec_score_thresh: Union[float, None] = None,
-    ) -> str:
-
-        if len(ocr_rec_res["rec_texts"]) == 0:
-            block.content = ""
-            return block
-
-        lines, text_direction, text_line_height = group_boxes_into_lines(
-            ocr_rec_res,
-            LINE_SETTINGS.get("line_height_iou_threshold", 0.8),
-        )
-
-        # format line
-        text_lines = []
-        need_new_line_num = 0
-        # words start coordinate and stop coordinate in the line
-        words_start_index = 0 if text_direction == "horizontal" else 1
-        words_stop_index = words_start_index + 2
-        lines_start_index = 1 if text_direction == "horizontal" else 3
-        line_width_list = []
-
-        if block.label == "reference":
-            rec_boxes = ocr_rec_res["boxes"]
-            block_start_coordinate = min([box[words_start_index] for box in rec_boxes])
-            block_stop_coordinate = max([box[words_stop_index] for box in rec_boxes])
-        else:
-            block_start_coordinate = block.bbox[words_start_index]
-            block_stop_coordinate = block.bbox[words_stop_index]
-
-        for idx, line in enumerate(lines):
-            line.sort(
-                key=lambda span: (
-                    span[0][words_start_index] // 2,
-                    (
-                        span[0][lines_start_index]
-                        if text_direction == "horizontal"
-                        else -span[0][lines_start_index]
-                    ),
-                )
-            )
-
-            line_width = line[-1][0][words_stop_index] - line[0][0][words_start_index]
-            line_width_list.append(line_width)
-            # merge formula and text
-            ocr_labels = [span[2] for span in line]
-            if "formula" in ocr_labels:
-                line = self.sort_line_by_projection(
-                    line, image, text_rec_model, text_rec_score_thresh, text_direction
-                )
-
-            line_text, need_new_line = format_line(
-                line,
-                text_direction,
-                np.max(line_width_list),
-                block_start_coordinate,
-                block_stop_coordinate,
-                line_gap_limit=text_line_height * 1.5,
-                block_label=block.label,
-            )
-            if need_new_line:
-                need_new_line_num += 1
-            if idx == 0:
-                line_start_coordinate = line[0][0][0]
-                block.seg_start_coordinate = line_start_coordinate
-            elif idx == len(lines) - 1:
-                line_end_coordinate = line[-1][0][2]
-                block.seg_end_coordinate = line_end_coordinate
-            text_lines.append(line_text)
-
-        delim = LINE_SETTINGS["delimiter_map"].get(block.label, "")
-        if need_new_line_num > len(text_lines) * 0.5 and delim == "":
-            text_lines = [text.replace("\n", "") for text in text_lines]
-            delim = "\n"
-        content = delim.join(text_lines)
-        block.content = content
-        block.num_of_lines = len(text_lines)
-        block.direction = text_direction
-        block.text_line_height = text_line_height
-        block.text_line_width = np.mean(line_width_list)
-
-        return block
-
-    def get_layout_parsing_blocks(
+    def get_layout_parsing_objects(
         self,
         image: list,
         region_block_ocr_idx_map: dict,
@@ -746,7 +674,7 @@ class _LayoutParsingPipelineV2(BasePipeline):
         table_index = 0
         seal_index = 0
         chart_index = 0
-        layout_parsing_blocks: List[LayoutParsingBlock] = []
+        layout_parsing_blocks: List[LayoutBlock] = []
 
         for box_idx, box_info in enumerate(layout_det_res["boxes"]):
 
@@ -754,7 +682,7 @@ class _LayoutParsingPipelineV2(BasePipeline):
             block_bbox = box_info["coordinate"]
             rec_res = {"boxes": [], "rec_texts": [], "rec_labels": []}
 
-            block = LayoutParsingBlock(label=label, bbox=block_bbox)
+            block = LayoutBlock(label=label, bbox=block_bbox)
 
             if label == "table" and len(table_res_list) > 0:
                 block.content = table_res_list[table_index]["pred_html"]
@@ -783,9 +711,8 @@ class _LayoutParsingPipelineV2(BasePipeline):
                     rec_res["rec_labels"].append(
                         overall_ocr_res["rec_labels"][box_no],
                     )
-                block = self.get_block_rec_content(
+                block.update_text_content(
                     image=image,
-                    block=block,
                     ocr_rec_res=rec_res,
                     text_rec_model=text_rec_model,
                     text_rec_score_thresh=text_rec_score_thresh,
@@ -805,26 +732,35 @@ class _LayoutParsingPipelineV2(BasePipeline):
 
             layout_parsing_blocks.append(block)
 
-        region_list: List[LayoutParsingRegion] = []
+        page_region_bbox = [65535, 65535, 0, 0]
+        layout_parsing_regions: List[LayoutRegion] = []
         for region_idx, region_info in enumerate(region_det_res["boxes"]):
-            region_bbox = region_info["coordinate"]
+            region_bbox = np.array(region_info["coordinate"]).astype("int")
             region_blocks = [
                 layout_parsing_blocks[idx]
                 for idx in region_block_ocr_idx_map["region_to_block_map"][region_idx]
             ]
-            region = LayoutParsingRegion(
-                bbox=region_bbox,
-                blocks=region_blocks,
-                image_shape=image.shape[:2],
-            )
-            region_list.append(region)
+            if region_blocks:
+                page_region_bbox = update_region_box(region_bbox, page_region_bbox)
+                region = LayoutRegion(bbox=region_bbox, blocks=region_blocks)
+                layout_parsing_regions.append(region)
 
-        region_list = sorted(
-            region_list,
-            key=lambda r: (r.weighted_distance),
+        layout_parsing_page = LayoutRegion(
+            bbox=np.array(page_region_bbox).astype("int"), blocks=layout_parsing_regions
         )
 
-        return region_list
+        return layout_parsing_page
+
+    def sort_layout_parsing_blocks(
+        self, layout_parsing_page: LayoutRegion
+    ) -> List[LayoutBlock]:
+        """
+        Flatten a page-level region into one list of blocks using two
+        ``xycut_enhanced`` passes.
+
+        ``xycut_enhanced`` is first applied to the page to obtain its regions,
+        then applied to each region in turn; the per-region results are
+        concatenated in the order the regions were returned.
+
+        Args:
+            layout_parsing_page (LayoutRegion): The page-level region passed
+                to the first ``xycut_enhanced`` call.
+
+        Returns:
+            List[LayoutBlock]: The concatenated per-region results.
+        """
+        # NOTE(review): presumably xycut_enhanced returns its children in reading
+        # order — confirm against xycut_enhanced.py.
+        layout_parsing_regions = xycut_enhanced(layout_parsing_page)
+        parsing_res_list = []
+        for region in layout_parsing_regions:
+            layout_parsing_blocks = xycut_enhanced(region)
+            parsing_res_list.extend(layout_parsing_blocks)
+
+        return parsing_res_list
 
     def get_layout_parsing_res(
         self,
@@ -866,7 +802,7 @@ class _LayoutParsingPipelineV2(BasePipeline):
         )
 
         # Format layout parsing block
-        region_list = self.get_layout_parsing_blocks(
+        layout_parsing_page = self.get_layout_parsing_objects(
             image=image,
             region_block_ocr_idx_map=region_block_ocr_idx_map,
             region_det_res=region_det_res,
@@ -879,9 +815,7 @@ class _LayoutParsingPipelineV2(BasePipeline):
             text_rec_score_thresh=self.general_ocr_pipeline.text_rec_score_thresh,
         )
 
-        parsing_res_list = []
-        for region in region_list:
-            parsing_res_list.extend(region.sort())
+        parsing_res_list = self.sort_layout_parsing_blocks(layout_parsing_page)
 
         index = 1
         for block in parsing_res_list:

+ 6 - 270
paddlex/inference/pipelines/layout_parsing/result_v2.py

@@ -14,7 +14,6 @@
 from __future__ import annotations
 
 import copy
-import math
 import re
 from functools import partial
 from typing import List
@@ -30,7 +29,8 @@ from ...common.result import (
     MarkdownMixin,
     XlsxMixin,
 )
-from .setting import BLOCK_LABEL_MAP
+from .layout_objects import LayoutBlock
+from .utils import get_seg_flag
 
 
 def compile_title_pattern():
@@ -140,58 +140,6 @@ def format_first_line_func(block, templates, format_func, spliter):
     return spliter.join(lines)
 
 
-def get_seg_flag(block: LayoutParsingBlock, prev_block: LayoutParsingBlock):
-
-    seg_start_flag = True
-    seg_end_flag = True
-
-    block_box = block.bbox
-    context_left_coordinate = block_box[0]
-    context_right_coordinate = block_box[2]
-    seg_start_coordinate = block.seg_start_coordinate
-    seg_end_coordinate = block.seg_end_coordinate
-
-    if prev_block is not None:
-        prev_block_bbox = prev_block.bbox
-        num_of_prev_lines = prev_block.num_of_lines
-        pre_block_seg_end_coordinate = prev_block.seg_end_coordinate
-        prev_end_space_small = (
-            abs(prev_block_bbox[2] - pre_block_seg_end_coordinate) < 10
-        )
-        prev_lines_more_than_one = num_of_prev_lines > 1
-
-        overlap_blocks = context_left_coordinate < prev_block_bbox[2]
-
-        # update context_left_coordinate and context_right_coordinate
-        if overlap_blocks:
-            context_left_coordinate = min(prev_block_bbox[0], context_left_coordinate)
-            context_right_coordinate = max(prev_block_bbox[2], context_right_coordinate)
-            prev_end_space_small = (
-                abs(context_right_coordinate - pre_block_seg_end_coordinate) < 10
-            )
-            edge_distance = 0
-        else:
-            edge_distance = abs(block_box[0] - prev_block_bbox[2])
-
-        current_start_space_small = seg_start_coordinate - context_left_coordinate < 10
-
-        if (
-            prev_end_space_small
-            and current_start_space_small
-            and prev_lines_more_than_one
-            and edge_distance < max(prev_block.width, block.width)
-        ):
-            seg_start_flag = False
-    else:
-        if seg_start_coordinate - context_left_coordinate < 10:
-            seg_start_flag = False
-
-    if context_right_coordinate - seg_end_coordinate < 10:
-        seg_end_flag = False
-
-    return seg_start_flag, seg_end_flag
-
-
 class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
     """Layout Parsing Result V2"""
 
@@ -247,7 +195,7 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
         draw = ImageDraw.Draw(image, "RGBA")
         font_size = int(0.018 * int(image.width)) + 2
         font = ImageFont.truetype(PINGFANG_FONT_FILE_PATH, font_size, encoding="utf-8")
-        parsing_result: List[LayoutParsingBlock] = self["parsing_res_list"]
+        parsing_result: List[LayoutBlock] = self["parsing_res_list"]
         for block in parsing_result:
             bbox = block.bbox
             index = block.order_index
@@ -456,6 +404,9 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
             "table_title": format_text_func,
             "figure_title": format_text_func,
             "chart_title": format_text_func,
+            "vision_footnote": lambda block: block.content.replace(
+                "\n\n", "\n"
+            ).replace("\n", "\n\n"),
             "text": lambda block: block.content.replace("\n\n", "\n").replace(
                 "\n", "\n\n"
             ),
@@ -528,218 +479,3 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
             markdown_info["markdown_images"][img["path"]] = img["img"]
 
         return markdown_info
-
-
-class LayoutParsingBlock:
-
-    def __init__(self, label, bbox, content="") -> None:
-        self.label = label
-        self.order_label = None
-        self.bbox = list(map(int, bbox))
-        self.content = content
-        self.seg_start_coordinate = float("inf")
-        self.seg_end_coordinate = float("-inf")
-        self.width = bbox[2] - bbox[0]
-        self.height = bbox[3] - bbox[1]
-        self.area = self.width * self.height
-        self.num_of_lines = 1
-        self.image = None
-        self.index = None
-        self.order_index = None
-        self.text_line_width = 1
-        self.text_line_height = 1
-        self.direction = self.get_bbox_direction()
-        self.child_blocks = []
-        self.update_direction_info()
-
-    def __str__(self) -> str:
-        return f"{self.__dict__}"
-
-    def __repr__(self) -> str:
-        _str = f"\n\n#################\nindex:\t{self.index}\nlabel:\t{self.label}\nregion_label:\t{self.order_label}\nbbox:\t{self.bbox}\ncontent:\t{self.content}\n#################"
-        return _str
-
-    def to_dict(self) -> dict:
-        return self.__dict__
-
-    def update_direction_info(self) -> None:
-        if self.direction == "horizontal":
-            self.secondary_direction = "vertical"
-            self.short_side_length = self.height
-            self.long_side_length = self.width
-            self.start_coordinate = self.bbox[0]
-            self.end_coordinate = self.bbox[2]
-            self.secondary_direction_start_coordinate = self.bbox[1]
-            self.secondary_direction_end_coordinate = self.bbox[3]
-        else:
-            self.secondary_direction = "horizontal"
-            self.short_side_length = self.width
-            self.long_side_length = self.height
-            self.start_coordinate = self.bbox[1]
-            self.end_coordinate = self.bbox[3]
-            self.secondary_direction_start_coordinate = self.bbox[0]
-            self.secondary_direction_end_coordinate = self.bbox[2]
-
-    def append_child_block(self, child_block: LayoutParsingBlock) -> None:
-        if not self.child_blocks:
-            self.ori_bbox = self.bbox.copy()
-        x1, y1, x2, y2 = self.bbox
-        x1_child, y1_child, x2_child, y2_child = child_block.bbox
-        union_bbox = (
-            min(x1, x1_child),
-            min(y1, y1_child),
-            max(x2, x2_child),
-            max(y2, y2_child),
-        )
-        self.bbox = union_bbox
-        self.update_direction_info()
-        child_blocks = [child_block]
-        if child_block.child_blocks:
-            child_blocks.extend(child_block.get_child_blocks())
-        self.child_blocks.extend(child_blocks)
-
-    def get_child_blocks(self) -> list:
-        self.bbox = self.ori_bbox
-        child_blocks = self.child_blocks.copy()
-        self.child_blocks = []
-        return child_blocks
-
-    def get_centroid(self) -> tuple:
-        x1, y1, x2, y2 = self.bbox
-        centroid = ((x1 + x2) / 2, (y1 + y2) / 2)
-        return centroid
-
-    def get_bbox_direction(self, direction_ratio: float = 1.0) -> bool:
-        """
-        Determine if a bounding box is horizontal or vertical.
-
-        Args:
-            bbox (List[float]): Bounding box [x_min, y_min, x_max, y_max].
-            direction_ratio (float): Ratio for determining direction. Default is 1.0.
-
-        Returns:
-            str: "horizontal" or "vertical".
-        """
-        return (
-            "horizontal" if self.width * direction_ratio >= self.height else "vertical"
-        )
-
-
-class LayoutParsingRegion:
-
-    def __init__(
-        self, bbox, blocks: List[LayoutParsingBlock] = [], image_shape=None
-    ) -> None:
-        self.bbox = bbox
-        self.block_map = {}
-        self.direction = "horizontal"
-        self.calculate_bbox_metrics(image_shape)
-        self.doc_title_block_idxes = []
-        self.paragraph_title_block_idxes = []
-        self.vision_block_idxes = []
-        self.unordered_block_idxes = []
-        self.vision_title_block_idxes = []
-        self.normal_text_block_idxes = []
-        self.header_block_idxes = []
-        self.footer_block_idxes = []
-        self.text_line_width = 20
-        self.text_line_height = 10
-        self.init_region_info_from_layout(blocks)
-        self.init_direction_info()
-
-    def init_region_info_from_layout(self, blocks: List[LayoutParsingBlock]):
-        horizontal_normal_text_block_num = 0
-        text_line_height_list = []
-        text_line_width_list = []
-        for idx, block in enumerate(blocks):
-            self.block_map[idx] = block
-            block.index = idx
-            if block.label in BLOCK_LABEL_MAP["header_labels"]:
-                self.header_block_idxes.append(idx)
-            elif block.label in BLOCK_LABEL_MAP["doc_title_labels"]:
-                self.doc_title_block_idxes.append(idx)
-            elif block.label in BLOCK_LABEL_MAP["paragraph_title_labels"]:
-                self.paragraph_title_block_idxes.append(idx)
-            elif block.label in BLOCK_LABEL_MAP["vision_labels"]:
-                self.vision_block_idxes.append(idx)
-            elif block.label in BLOCK_LABEL_MAP["vision_title_labels"]:
-                self.vision_title_block_idxes.append(idx)
-            elif block.label in BLOCK_LABEL_MAP["footer_labels"]:
-                self.footer_block_idxes.append(idx)
-            elif block.label in BLOCK_LABEL_MAP["unordered_labels"]:
-                self.unordered_block_idxes.append(idx)
-            else:
-                self.normal_text_block_idxes.append(idx)
-                text_line_height_list.append(block.text_line_height)
-                text_line_width_list.append(block.text_line_width)
-                if block.direction == "horizontal":
-                    horizontal_normal_text_block_num += 1
-        self.direction = (
-            "horizontal"
-            if horizontal_normal_text_block_num
-            >= len(self.normal_text_block_idxes) * 0.5
-            else "vertical"
-        )
-        self.text_line_width = (
-            np.mean(text_line_width_list) if text_line_width_list else 20
-        )
-        self.text_line_height = (
-            np.mean(text_line_height_list) if text_line_height_list else 10
-        )
-
-    def init_direction_info(self):
-        if self.direction == "horizontal":
-            self.direction_start_index = 0
-            self.direction_end_index = 2
-            self.secondary_direction_start_index = 1
-            self.secondary_direction_end_index = 3
-            self.secondary_direction = "vertical"
-        else:
-            self.direction_start_index = 1
-            self.direction_end_index = 3
-            self.secondary_direction_start_index = 0
-            self.secondary_direction_end_index = 2
-            self.secondary_direction = "horizontal"
-
-        self.direction_center_coordinate = (
-            self.bbox[self.direction_start_index] + self.bbox[self.direction_end_index]
-        ) / 2
-        self.secondary_direction_center_coordinate = (
-            self.bbox[self.secondary_direction_start_index]
-            + self.bbox[self.secondary_direction_end_index]
-        ) / 2
-
-    def calculate_bbox_metrics(self, image_shape):
-        x1, y1, x2, y2 = self.bbox
-        image_height, image_width = image_shape
-        width = x2 - x1
-        x_center, y_center = (x1 + x2) / 2, (y1 + y2) / 2
-        self.euclidean_distance = math.sqrt(((x1) ** 2 + (y1) ** 2))
-        self.center_euclidean_distance = math.sqrt(((x_center) ** 2 + (y_center) ** 2))
-        self.angle_rad = math.atan2(y_center, x_center)
-        self.weighted_distance = (
-            y2 + width + (x1 // (image_width // 10)) * (image_width // 10) * 1.5
-        )
-
-    def sort_normal_blocks(self, blocks):
-        if self.direction == "horizontal":
-            blocks.sort(
-                key=lambda x: (
-                    x.bbox[1] // self.text_line_height,
-                    x.bbox[0] // self.text_line_width,
-                    x.bbox[1] ** 2 + x.bbox[0] ** 2,
-                ),
-            )
-        else:
-            blocks.sort(
-                key=lambda x: (
-                    -x.bbox[0] // self.text_line_width,
-                    x.bbox[1] // self.text_line_height,
-                    -(x.bbox[2] ** 2 + x.bbox[1] ** 2),
-                ),
-            )
-
-    def sort(self):
-        from .xycut_enhanced import xycut_enhanced
-
-        return xycut_enhanced(self)

+ 1 - 0
paddlex/inference/pipelines/layout_parsing/setting.py

@@ -21,6 +21,7 @@ XYCUT_SETTINGS = {
         "up_edge_weight": 1,
         "down_edge_weight": 0.0001,
     },
+    "cross_layout_ref_text_block_words_num_threshold": 10,
 }
 
 REGION_SETTINGS = {

+ 104 - 306
paddlex/inference/pipelines/layout_parsing/utils.py

@@ -262,76 +262,6 @@ def calculate_overlap_ratio(
     return inter_area / ref_area
 
 
-def group_boxes_into_lines(ocr_rec_res, line_height_iou_threshold):
-    rec_boxes = ocr_rec_res["boxes"]
-    rec_texts = ocr_rec_res["rec_texts"]
-    rec_labels = ocr_rec_res["rec_labels"]
-
-    text_boxes = [
-        rec_boxes[i] for i in range(len(rec_boxes)) if rec_labels[i] == "text"
-    ]
-    text_orientation = calculate_text_orientation(text_boxes)
-
-    match_direction = "vertical" if text_orientation == "horizontal" else "horizontal"
-
-    line_start_index = 1 if text_orientation == "horizontal" else 0
-    line_end_index = 3 if text_orientation == "horizontal" else 2
-
-    spans = list(zip(rec_boxes, rec_texts, rec_labels))
-    sort_index = 1
-    reverse = False
-    if text_orientation == "vertical":
-        sort_index = 0
-        reverse = True
-    spans.sort(key=lambda span: span[0][sort_index], reverse=reverse)
-    spans = [list(span) for span in spans]
-
-    lines = []
-    line = [spans[0]]
-    line_region_box = spans[0][0].copy()
-    line_heights = []
-    # merge line
-    for span in spans[1:]:
-        rec_bbox = span[0]
-        if (
-            calculate_projection_overlap_ratio(
-                line_region_box, rec_bbox, match_direction, mode="small"
-            )
-            >= line_height_iou_threshold
-        ):
-            line.append(span)
-            line_region_box[line_start_index] = min(
-                line_region_box[line_start_index], rec_bbox[line_start_index]
-            )
-            line_region_box[line_end_index] = max(
-                line_region_box[line_end_index], rec_bbox[line_end_index]
-            )
-        else:
-            line_heights.append(
-                line_region_box[line_end_index] - line_region_box[line_start_index]
-            )
-            lines.append(line)
-            line = [span]
-            line_region_box = rec_bbox.copy()
-
-    lines.append(line)
-    line_heights.append(
-        line_region_box[line_end_index] - line_region_box[line_start_index]
-    )
-
-    min_height = min(line_heights) if line_heights else 0
-    max_height = max(line_heights) if line_heights else 0
-
-    if max_height > min_height * 2 and text_orientation == "vertical":
-        line_heights = np.array(line_heights)
-        min_height_num = np.sum(line_heights < min_height * 1.1)
-        if min_height_num < len(lines) * 0.4:
-            condition = line_heights > min_height * 1.1
-            lines = [value for value, keep in zip(lines, condition) if keep]
-
-    return lines, text_orientation, np.mean(line_heights)
-
-
 def calculate_minimum_enclosing_bbox(bboxes):
     """
     Calculate the minimum enclosing bounding box for a list of bounding boxes.
@@ -358,257 +288,41 @@ def calculate_minimum_enclosing_bbox(bboxes):
     return [min_x, min_y, max_x, max_y]
 
 
-def calculate_text_orientation(
-    bboxes: List[List[int]], orientation_ratio: float = 1.5
-) -> bool:
-    """
-    Calculate the orientation of the text based on the bounding boxes.
-
-    Args:
-        bboxes (list): A list of bounding boxes.
-        orientation_ratio (float): Ratio for determining orientation. Default is 1.5.
-
-    Returns:
-        str: "horizontal" or "vertical".
-    """
-
-    horizontal_box_num = 0
-    for bbox in bboxes:
-        if len(bbox) != 4:
-            raise ValueError(
-                "Invalid bounding box format. Expected a list of length 4."
-            )
-        x1, y1, x2, y2 = bbox
-        width = x2 - x1
-        height = y2 - y1
-        horizontal_box_num += 1 if width * orientation_ratio >= height else 0
-
-    return "horizontal" if horizontal_box_num >= len(bboxes) * 0.5 else "vertical"
-
-
 def is_english_letter(char):
+    """check if the char is english letter"""
     return bool(re.match(r"^[A-Za-z]$", char))
 
 
 def is_numeric(char):
-    return bool(re.match(r"^[\d.]+$", char))
+    """check if the char is numeric"""
+    return bool(re.match(r"^[\d]+$", char))
 
 
 def is_non_breaking_punctuation(char):
     """
-    判断一个字符是否是不需要换行的标点符号,包括全角和半角的符号。
-
-    :param char: str, 单个字符
-    :return: bool, 如果字符是不需要换行的标点符号,返回True,否则返回False
-    """
-    non_breaking_punctuations = {
-        ",",  # 半角逗号
-        ",",  # 全角逗号
-        "、",  # 顿号
-        ";",  # 半角分号
-        ";",  # 全角分号
-        ":",  # 半角冒号
-        ":",  # 全角冒号
-        "-",  # 连字符
-    }
-
-    return char in non_breaking_punctuations
-
-
-def format_line(
-    line: List[List[Union[List[int], str]]],
-    text_direction: int,
-    block_width: int,
-    block_start_coordinate: int,
-    block_stop_coordinate: int,
-    line_gap_limit: int = 10,
-    block_label: str = "text",
-) -> None:
-    """
-    Format a line of text spans based on layout constraints.
+    check if the char is non-breaking punctuation
 
     Args:
-        line (list): A list of spans, where each span is a list containing a bounding box and text.
-        block_left_coordinate (int): The text line directional minimum coordinate of the layout bounding box.
-        block_stop_coordinate (int): The text line directional maximum x-coordinate of the layout bounding box.
-        first_line_span_limit (int): The limit for the number of pixels before the first span that should be considered part of the first line. Default is 10.
-        line_gap_limit (int): The limit for the number of pixels after the last span that should be considered part of the last line. Default is 10.
-        block_label (str): The label associated with the entire block. Default is 'text'.
-    Returns:
-        None: The function modifies the line in place.
-    """
-    first_span_box = line[0][0]
-    last_span_box = line[-1][0]
-
-    for span in line:
-        if span[2] == "formula" and block_label != "formula":
-            formula_rec = span[1]
-            if not formula_rec.startswith("$") and not formula_rec.endswith("$"):
-                if len(line) > 1:
-                    span[1] = f"${span[1]}$"
-                else:
-                    span[1] = f"\n${span[1]}$"
-
-    line_text = ""
-    for span in line:
-        _, text, label = span
-        line_text += text
-        if len(text) > 0 and is_english_letter(line_text[-1]) or label == "formula":
-            line_text += " "
-
-    if text_direction == "horizontal":
-        text_start_index = 0
-        text_stop_index = 2
-    else:
-        text_start_index = 1
-        text_stop_index = 3
-
-    need_new_line = False
-    if (
-        len(line_text) > 0
-        and not is_english_letter(line_text[-1])
-        and not is_non_breaking_punctuation(line_text[-1])
-    ):
-        if (
-            text_direction == "horizontal"
-            and block_stop_coordinate - last_span_box[text_stop_index] > line_gap_limit
-        ) or (
-            text_direction == "vertical"
-            and (
-                block_stop_coordinate - last_span_box[text_stop_index] > line_gap_limit
-                or first_span_box[1] - block_start_coordinate > line_gap_limit
-            )
-        ):
-            need_new_line = True
-
-    if line_text.endswith("-"):
-        line_text = line_text[:-1]
-    elif (
-        len(line_text) > 0 and is_english_letter(line_text[-1])
-    ) or line_text.endswith("$"):
-        line_text += " "
-    elif (
-        len(line_text) > 0
-        and not is_english_letter(line_text[-1])
-        and not is_non_breaking_punctuation(line_text[-1])
-        and not is_numeric(line_text[-1])
-    ) or text_direction == "vertical":
-        if block_stop_coordinate - last_span_box[text_stop_index] > block_width * 0.4:
-            line_text += "\n"
-        if (
-            first_span_box[text_start_index] - block_start_coordinate
-            > block_width * 0.4
-        ):
-            line_text = "\n" + line_text
-
-    return line_text, need_new_line
+        char (str): character to check
 
-
-def split_boxes_by_projection(spans: List[List[int]], direction, offset=1e-5):
-    """
-    Check if there is any complete containment in the x-direction
-    between the bounding boxes and split the containing box accordingly.
-
-    Args:
-        spans (list of lists): Each element is a list containing an ndarray of length 4, a text string, and a label.
-        direction: 'horizontal' or 'vertical', indicating whether the spans are arranged horizontally or vertically.
-        offset (float): A small offset value to ensure that the split boxes are not too close to the original boxes.
     Returns:
-        A new list of boxes, including split boxes, with the same `rec_text` and `label` attributes.
+        bool: True if the char is non-breaking punctuation
     """
+    non_breaking_punctuations = {
+        ",",
+        ",",
+        "、",
+        ";",
+        ";",
+        ":",
+        ":",
+        "-",
+        "'",
+        '"',
+        "“",
+    }
 
-    def is_projection_contained(box_a, box_b, start_idx, end_idx):
-        """Check if box_a completely contains box_b in the x-direction."""
-        return box_a[start_idx] <= box_b[start_idx] and box_a[end_idx] >= box_b[end_idx]
-
-    new_boxes = []
-    if direction == "horizontal":
-        projection_start_index, projection_end_index = 0, 2
-    else:
-        projection_start_index, projection_end_index = 1, 3
-
-    for i in range(len(spans)):
-        span = spans[i]
-        is_split = False
-        for j in range(i, len(spans)):
-            box_b = spans[j][0]
-            box_a, text, label = span
-            if is_projection_contained(
-                box_a, box_b, projection_start_index, projection_end_index
-            ):
-                is_split = True
-                # Split box_a based on the x-coordinates of box_b
-                if box_a[projection_start_index] < box_b[projection_start_index]:
-                    w = (
-                        box_b[projection_start_index]
-                        - offset
-                        - box_a[projection_start_index]
-                    )
-                    if w > 1:
-                        new_bbox = box_a.copy()
-                        new_bbox[projection_end_index] = (
-                            box_b[projection_start_index] - offset
-                        )
-                        new_boxes.append(
-                            [
-                                np.array(new_bbox),
-                                text,
-                                label,
-                            ]
-                        )
-                if box_a[projection_end_index] > box_b[projection_end_index]:
-                    w = (
-                        box_a[projection_end_index]
-                        - box_b[projection_end_index]
-                        + offset
-                    )
-                    if w > 1:
-                        box_a[projection_start_index] = (
-                            box_b[projection_end_index] + offset
-                        )
-                        span = [
-                            np.array(box_a),
-                            text,
-                            label,
-                        ]
-            if j == len(spans) - 1 and is_split:
-                new_boxes.append(span)
-        if not is_split:
-            new_boxes.append(span)
-
-    return new_boxes
-
-
-def remove_extra_space(input_text: str) -> str:
-    """
-    Process the input text to handle spaces.
-
-    The function removes multiple consecutive spaces between Chinese characters and ensures that
-    only a single space is retained between Chinese and non-Chinese characters.
-
-    Args:
-        input_text (str): The text to be processed.
-
-    Returns:
-        str: The processed text with properly formatted spaces.
-    """
-
-    # Remove spaces between Chinese characters
-    text_without_spaces = re.sub(
-        r"(?<=[\u4e00-\u9fff])\s+(?=[\u4e00-\u9fff])", "", input_text
-    )
-
-    # Ensure single space between Chinese and non-Chinese characters
-    text_with_single_spaces = re.sub(
-        r"(?<=[\u4e00-\u9fff])\s+(?=[^\u4e00-\u9fff])|(?<=[^\u4e00-\u9fff])\s+(?=[\u4e00-\u9fff])",
-        " ",
-        text_without_spaces,
-    )
-
-    # Reduce any remaining consecutive spaces to a single space
-    final_text = re.sub(r"\s+", " ", text_with_single_spaces).strip()
-
-    return final_text
+    return char in non_breaking_punctuations
 
 
 def gather_imgs(original_img, layout_det_objs):
@@ -856,6 +570,7 @@ def shrink_supplement_region_bbox(
 
 
 def update_region_box(bbox, region_box):
+    """Update region box with bbox"""
     if region_box is None:
         return bbox
 
@@ -873,6 +588,14 @@ def update_region_box(bbox, region_box):
 
 
 def convert_formula_res_to_ocr_format(formula_res_list: List, ocr_res: dict):
+    """Convert formula result to OCR result format
+
+    Args:
+        formula_res_list (List): Formula results
+        ocr_res (dict): OCR result
+    Returns:
+        ocr_res (dict): Updated OCR result
+    """
     for formula_res in formula_res_list:
         x_min, y_min, x_max, y_max = list(map(int, formula_res["dt_polys"]))
         poly_points = [
@@ -896,11 +619,86 @@ def convert_formula_res_to_ocr_format(formula_res_list: List, ocr_res: dict):
 
 
 def caculate_bbox_area(bbox):
+    """Calculate bounding box area"""
     x1, y1, x2, y2 = map(float, bbox)
     area = abs((x2 - x1) * (y2 - y1))
     return area
 
 
+def caculate_euclidean_dist(point1, point2):
+    """Return the straight-line (Euclidean) distance between two 2-D points.
+
+    Args:
+        point1: A two-item sequence ``(x, y)``.
+        point2: A two-item sequence ``(x, y)``.
+
+    Returns:
+        The square root of the sum of the squared coordinate differences.
+    """
+    (first_x, first_y), (second_x, second_y) = point1, point2
+    delta_x = first_x - second_x
+    delta_y = first_y - second_y
+    return (delta_x**2 + delta_y**2) ** 0.5
+
+
+def get_seg_flag(block, prev_block, space_threshold=10):
+    """Get segment start flag and end flag based on previous block
+
+    A flag stays True unless the corresponding gap between the block's
+    segment coordinates and its context edges is smaller than
+    ``space_threshold``.
+
+    Args:
+        block (Block): Current block. Reads ``start_coordinate``,
+            ``end_coordinate``, ``seg_start_coordinate`` and
+            ``seg_end_coordinate`` (and ``width`` when compared with
+            ``prev_block``).
+        prev_block (Block): Previous block, or None when there is none. Reads
+            ``start_coordinate``, ``end_coordinate``, ``seg_end_coordinate``,
+            ``num_of_lines`` and ``width``.
+        space_threshold (int): Gaps strictly smaller than this value count as
+            "small". Defaults to 10, the value that was previously hard-coded,
+            so existing two-argument calls behave exactly as before.
+
+    Returns:
+        seg_start_flag (bool): Segment start flag
+        seg_end_flag (bool): Segment end flag
+    """
+
+    seg_start_flag = True
+    seg_end_flag = True
+
+    context_left_coordinate = block.start_coordinate
+    context_right_coordinate = block.end_coordinate
+    seg_start_coordinate = block.seg_start_coordinate
+    seg_end_coordinate = block.seg_end_coordinate
+
+    if prev_block is None:
+        # No predecessor: only this block's own leading gap is considered.
+        if seg_start_coordinate - context_left_coordinate < space_threshold:
+            seg_start_flag = False
+    else:
+        prev_seg_end_coordinate = prev_block.seg_end_coordinate
+        prev_lines_more_than_one = prev_block.num_of_lines > 1
+
+        overlap_blocks = (
+            context_left_coordinate < prev_block.end_coordinate
+            and context_right_coordinate > prev_block.start_coordinate
+        )
+
+        if overlap_blocks:
+            # The two extents overlap: widen the context to cover both blocks
+            # and measure the previous block's trailing gap against it.
+            context_left_coordinate = min(
+                prev_block.start_coordinate, context_left_coordinate
+            )
+            context_right_coordinate = max(
+                prev_block.end_coordinate, context_right_coordinate
+            )
+            prev_end_space_small = (
+                abs(context_right_coordinate - prev_seg_end_coordinate)
+                < space_threshold
+            )
+            edge_distance = 0
+        else:
+            prev_end_space_small = (
+                abs(prev_block.end_coordinate - prev_seg_end_coordinate)
+                < space_threshold
+            )
+            edge_distance = abs(block.start_coordinate - prev_block.end_coordinate)
+
+        # NOTE: the leading gap is a signed difference (no abs()), unlike the
+        # previous block's trailing gap above.
+        current_start_space_small = (
+            seg_start_coordinate - context_left_coordinate < space_threshold
+        )
+
+        if (
+            prev_end_space_small
+            and current_start_space_small
+            and prev_lines_more_than_one
+            and edge_distance < max(prev_block.width, block.width)
+        ):
+            seg_start_flag = False
+
+    # The trailing gap uses the (possibly widened) right context edge.
+    if context_right_coordinate - seg_end_coordinate < space_threshold:
+        seg_end_flag = False
+
+    return seg_start_flag, seg_end_flag
+
+
 def get_show_color(label: str, order_label=False) -> Tuple:
     if order_label:
         label_colors = {

+ 302 - 247
paddlex/inference/pipelines/layout_parsing/xycut_enhanced/utils.py

@@ -16,9 +16,13 @@ from typing import List, Tuple
 
 import numpy as np
 
-from ..result_v2 import LayoutParsingBlock, LayoutParsingRegion
+from ..layout_objects import LayoutBlock, LayoutRegion
 from ..setting import BLOCK_LABEL_MAP, XYCUT_SETTINGS
-from ..utils import calculate_projection_overlap_ratio
+from ..utils import (
+    calculate_overlap_ratio,
+    calculate_projection_overlap_ratio,
+    get_seg_flag,
+)
 
 
 def get_nearest_edge_distance(
@@ -264,8 +268,8 @@ def recursive_xy_cut(
 
 
 def reference_insert(
-    block: LayoutParsingBlock,
-    sorted_blocks: List[LayoutParsingBlock],
+    block: LayoutBlock,
+    sorted_blocks: List[LayoutBlock],
     **kwargs,
 ):
     """
@@ -294,8 +298,8 @@ def reference_insert(
 
 
 def manhattan_insert(
-    block: LayoutParsingBlock,
-    sorted_blocks: List[LayoutParsingBlock],
+    block: LayoutBlock,
+    sorted_blocks: List[LayoutBlock],
     **kwargs,
 ):
     """
@@ -322,10 +326,38 @@ def manhattan_insert(
     return sorted_blocks
 
 
+def euclidean_insert(
+    block: LayoutRegion,
+    sorted_blocks: List[LayoutRegion],
+    **kwargs,
+):
+    """
+    Insert a block in front of the first sorted block that lies farther away.
+
+    Blocks are compared by their ``euclidean_distance`` attribute. The new
+    block goes immediately before the first entry whose distance is strictly
+    greater than its own, or at the end when no such entry exists.
+
+    Args:
+        block: The block to insert; its ``euclidean_distance`` is the key.
+        sorted_blocks: The list to insert into. It is modified in place.
+        **kwargs: Accepted but not used by this function.
+
+    Returns:
+        sorted_blocks: The same list object, after the insertion.
+    """
+    target_distance = block.euclidean_distance
+    insert_position = next(
+        (
+            position
+            for position, candidate in enumerate(sorted_blocks)
+            if candidate.euclidean_distance > target_distance
+        ),
+        len(sorted_blocks),
+    )
+    sorted_blocks.insert(insert_position, block)
+    return sorted_blocks
+
+
 def weighted_distance_insert(
-    block: LayoutParsingBlock,
-    sorted_blocks: List[LayoutParsingBlock],
-    region: LayoutParsingRegion,
+    block: LayoutBlock,
+    sorted_blocks: List[LayoutBlock],
+    region: LayoutRegion,
 ):
     """
     Insert a block into a sorted list of blocks based on the weighted distance between the block and the nearest sorted block.
@@ -398,18 +430,75 @@ def weighted_distance_insert(
         if weighted_distance < min_weighted_distance:
             nearest_sorted_block_index = sorted_block_idx
             min_weighted_distance = weighted_distance
-            if y1 > y1_prime or (y1 == y1_prime and x1 > x1_prime):
+            if abs(y1 // 2 - y1_prime // 2) > 0:
+                sorted_distance = y1_prime
+                block_distance = y1
+            else:
+                if region.direction == "horizontal":
+                    if abs(x1 // 2 - x2 // 2) > 0:
+                        sorted_distance = x1_prime
+                        block_distance = x1
+                    else:
+                        # distance with (0,0)
+                        sorted_block_center_x, sorted_block_center_y = (
+                            sorted_block.get_centroid()
+                        )
+                        block_center_x, block_center_y = block.get_centroid()
+                        sorted_distance = (
+                            sorted_block_center_x**2 + sorted_block_center_y**2
+                        )
+                        block_distance = block_center_x**2 + block_center_y**2
+                else:
+                    if abs(x1 - x2) > 0:
+                        sorted_distance = -x2_prime
+                        block_distance = -x2
+                    else:
+                        # distance with (max,0)
+                        sorted_block_center_x, sorted_block_center_y = (
+                            sorted_block.get_centroid()
+                        )
+                        block_center_x, block_center_y = block.get_centroid()
+                        sorted_distance = (
+                            sorted_block_center_x**2 + sorted_block_center_y**2
+                        )
+                        block_distance = block_center_x**2 + block_center_y**2
+            if block_distance > sorted_distance:
                 nearest_sorted_block_index = sorted_block_idx + 1
+                if (
+                    sorted_block_idx < len(sorted_blocks) - 1
+                    and block.label
+                    in BLOCK_LABEL_MAP["vision_labels"]
+                    + BLOCK_LABEL_MAP["vision_title_labels"]
+                ):
+                    seg_start_flag, _ = get_seg_flag(
+                        sorted_blocks[sorted_block_idx + 1],
+                        sorted_blocks[sorted_block_idx],
+                    )
+                    if not seg_start_flag:
+                        nearest_sorted_block_index += 1
+            else:
+                if (
+                    sorted_block_idx > 0
+                    and block.label
+                    in BLOCK_LABEL_MAP["vision_labels"]
+                    + BLOCK_LABEL_MAP["vision_title_labels"]
+                ):
+                    seg_start_flag, _ = get_seg_flag(
+                        sorted_blocks[sorted_block_idx],
+                        sorted_blocks[sorted_block_idx - 1],
+                    )
+                    if not seg_start_flag:
+                        nearest_sorted_block_index = sorted_block_idx - 1
 
     sorted_blocks.insert(nearest_sorted_block_index, block)
     return sorted_blocks
 
 
 def insert_child_blocks(
-    block: LayoutParsingBlock,
+    block: LayoutBlock,
     block_idx: int,
-    sorted_blocks: List[LayoutParsingBlock],
-) -> List[LayoutParsingBlock]:
+    sorted_blocks: List[LayoutBlock],
+) -> List[LayoutBlock]:
     """
     Insert child blocks of a block into the sorted blocks list.
 
@@ -432,34 +521,37 @@ def insert_child_blocks(
     return sorted_blocks
 
 
-def sort_child_blocks(blocks, direction="horizontal") -> List[LayoutParsingBlock]:
+def sort_child_blocks(
+    blocks: List[LayoutRegion], direction="horizontal"
+) -> List[LayoutBlock]:
     """
     Sort child blocks based on their bounding box coordinates.
 
     Args:
-        blocks: A list of LayoutParsingBlock objects representing the child blocks.
+        blocks: A list of LayoutBlock objects representing the child blocks.
         direction: direction of the blocks ('horizontal' or 'vertical'). Default is 'horizontal'.
     Returns:
-        sorted_blocks: A sorted list of LayoutParsingBlock objects.
+        sorted_blocks: A sorted list of LayoutBlock objects.
     """
-    if direction == "horizontal":
-        # from top to bottom
-        blocks.sort(
-            key=lambda x: (
-                x.bbox[1],  # y_min
-                x.bbox[0],  # x_min
-                x.bbox[1] ** 2 + x.bbox[0] ** 2,  # distance with (0,0)
-            ),
-        )
+    if blocks[0].label != "region":
+        if direction == "horizontal":
+            blocks.sort(
+                key=lambda x: (
+                    x.bbox[1],
+                    x.bbox[0],
+                    x.get_centroid()[0] ** 2 + x.get_centroid()[1] ** 2,
+                ),  # distance with (0,0)
+            )
+        else:
+            blocks.sort(
+                key=lambda x: (
+                    -x.bbox[2],
+                    x.bbox[1],
+                    -x.get_centroid()[0] ** 2 + x.get_centroid()[1] ** 2,
+                ),  # distance with (max,0)
+            )
     else:
-        # from right to left
-        blocks.sort(
-            key=lambda x: (
-                -x.bbox[0],  # x_min
-                x.bbox[1],  # y_min
-                x.bbox[1] ** 2 - x.bbox[0] ** 2,  # distance with (max,0)
-            ),
-        )
+        blocks.sort(key=lambda x: x.euclidean_distance)
     return blocks
 
 
@@ -504,41 +596,34 @@ def _manhattan_distance(
     return weight_x * abs(point1[0] - point2[0]) + weight_y * abs(point1[1] - point2[1])
 
 
-def sort_normal_blocks(blocks, text_line_height, text_line_width, region_direction):
-    if region_direction == "horizontal":
-        blocks.sort(
-            key=lambda x: (
-                x.bbox[1] // text_line_height,
-                x.bbox[0] // text_line_width,
-                x.bbox[1] ** 2 + x.bbox[0] ** 2,
-            ),
-        )
-    else:
-        blocks.sort(
-            key=lambda x: (
-                -x.bbox[0] // text_line_width,
-                x.bbox[1] // text_line_height,
-                x.bbox[1] ** 2 - x.bbox[2] ** 2,  # distance with (max,0)
-            ),
-        )
-    return blocks
+def sort_normal_blocks(
+    blocks, text_line_height, text_line_width, region_direction
+) -> List[LayoutBlock]:
+    """Sort blocks by their position within the page
 
+    Args:
+        blocks (List[LayoutBlock]): List of blocks to be sorted.
+        text_line_height (int): Height of each line of text.
+        text_line_width (int): Width of each line of text.
+        region_direction (str): Direction of the region, either "horizontal" or "vertical".
 
-def sort_normal_blocks(blocks, text_line_height, text_line_width, region_direction):
+    Returns:
+        List[LayoutBlock]: Sorted list of blocks.
+    """
     if region_direction == "horizontal":
         blocks.sort(
             key=lambda x: (
                 x.bbox[1] // text_line_height,
                 x.bbox[0] // text_line_width,
-                x.bbox[1] ** 2 + x.bbox[0] ** 2,
+                x.get_centroid()[0] ** 2 + x.get_centroid()[1] ** 2,
             ),
         )
     else:
         blocks.sort(
             key=lambda x: (
-                -x.bbox[0] // text_line_width,
+                -x.bbox[2] // text_line_width,
                 x.bbox[1] // text_line_height,
-                -(x.bbox[2] ** 2 + x.bbox[1] ** 2),
+                -x.get_centroid()[0] ** 2 + x.get_centroid()[1] ** 2,
             ),
         )
     return blocks
@@ -585,45 +670,54 @@ def get_cut_blocks(blocks, cut_direction, cut_coordinates, mask_labels=[]):
     return cuted_list
 
 
-def add_split_block(
-    blocks: List[LayoutParsingBlock], region_bbox: List[int]
-) -> List[LayoutParsingBlock]:
-    block_bboxes = np.array([block.bbox for block in blocks])
-    discontinuous = calculate_discontinuous_projection(
-        block_bboxes, direction="vertical"
-    )
-    current_interval = discontinuous[0]
-    for interval in discontinuous[1:]:
-        gap_len = interval[0] - current_interval[1]
-        if gap_len > 40:
-            x1, _, x2, __ = region_bbox
-            y1 = current_interval[1] + 5
-            y2 = interval[0] - 5
-            bbox = [x1, y1, x2, y2]
-            split_block = LayoutParsingBlock(label="split", bbox=bbox)
-            blocks.append(split_block)
-        current_interval = interval
+def get_blocks_by_direction_interval(
+    blocks: List[LayoutBlock],
+    start_index: int,
+    end_index: int,
+    direction: str = "horizontal",
+) -> List[LayoutBlock]:
+    """
+    Collect the blocks whose extent lies entirely inside an interval.
+
+    Args:
+        blocks (List[LayoutBlock]): Candidate blocks. The list is sorted in
+            place by each block's far edge (``bbox[2]`` for "horizontal",
+            ``bbox[3]`` otherwise) before filtering.
+        start_index (int): Lower bound; a block is kept only if its near edge
+            (``bbox[0]`` or ``bbox[1]``) is greater than or equal to it.
+        end_index (int): Upper bound; a block is kept only if its far edge is
+            less than or equal to it.
+        direction (str, optional): "horizontal" selects the ``bbox[0]`` /
+            ``bbox[2]`` pair; any other value selects ``bbox[1]`` /
+            ``bbox[3]``. Defaults to "horizontal".
+
+    Returns:
+        List[LayoutBlock]: The blocks inside the interval, ordered by far edge.
+    """
+    near_edge = 0 if direction == "horizontal" else 1
+    far_edge = near_edge + 2
+    # list.sort reorders the caller's list itself, not a copy.
+    blocks.sort(key=lambda candidate: candidate.bbox[far_edge])
+    return [
+        candidate
+        for candidate in blocks
+        if candidate.bbox[near_edge] >= start_index
+        and candidate.bbox[far_edge] <= end_index
+    ]
 
 
 def get_nearest_blocks(
-    block: LayoutParsingBlock,
-    ref_blocks: List[LayoutParsingBlock],
+    block: LayoutBlock,
+    ref_blocks: List[LayoutBlock],
     overlap_threshold,
     direction="horizontal",
 ) -> List:
     """
     Get the adjacent blocks with the same direction as the current block.
     Args:
-        block (LayoutParsingBlock): The current block.
-        blocks (List[LayoutParsingBlock]): A list of all blocks.
+        block (LayoutBlock): The current block.
+        blocks (List[LayoutBlock]): A list of all blocks.
         ref_block_idxes (List[int]): A list of indices of reference blocks.
         iou_threshold (float): The IOU threshold to determine if two blocks are considered adjacent.
     Returns:
         Int: The index of the previous block with same direction.
         Int: The index of the following block with same direction.
     """
-    prev_blocks: List[LayoutParsingBlock] = []
-    post_blocks: List[LayoutParsingBlock] = []
+    prev_blocks: List[LayoutBlock] = []
+    post_blocks: List[LayoutBlock] = []
     sort_index = 1 if direction == "horizontal" else 0
     for ref_block in ref_blocks:
         if ref_block.index == block.index:
@@ -645,110 +739,9 @@ def get_nearest_blocks(
     return prev_blocks, post_blocks
 
 
-def get_adjacent_blocks_by_direction(
-    blocks: List[LayoutParsingBlock],
-    block_idx: int,
-    ref_block_idxes: List[int],
-    iou_threshold,
-) -> List:
-    """
-    Get the adjacent blocks with the same direction as the current block.
-    Args:
-        block (LayoutParsingBlock): The current block.
-        blocks (List[LayoutParsingBlock]): A list of all blocks.
-        ref_block_idxes (List[int]): A list of indices of reference blocks.
-        iou_threshold (float): The IOU threshold to determine if two blocks are considered adjacent.
-    Returns:
-        Int: The index of the previous block with same direction.
-        Int: The index of the following block with same direction.
-    """
-    min_prev_block_distance = float("inf")
-    prev_block_index = None
-    min_post_block_distance = float("inf")
-    post_block_index = None
-    block = blocks[block_idx]
-    child_labels = [
-        "vision_footnote",
-        "sub_paragraph_title",
-        "doc_title_text",
-        "vision_title",
-    ]
-
-    # find the nearest text block with same direction to the current block
-    for ref_block_idx in ref_block_idxes:
-        ref_block = blocks[ref_block_idx]
-        ref_block_direction = ref_block.direction
-        if ref_block.order_label in child_labels:
-            continue
-        match_block_iou = calculate_projection_overlap_ratio(
-            block.bbox,
-            ref_block.bbox,
-            ref_block_direction,
-        )
-
-        child_match_distance_tolerance_len = block.short_side_length / 10
-
-        if block.order_label == "vision":
-            if ref_block.num_of_lines == 1:
-                gap_tolerance_len = ref_block.short_side_length * 2
-            else:
-                gap_tolerance_len = block.short_side_length / 10
-        else:
-            gap_tolerance_len = block.short_side_length * 2
-
-        if match_block_iou >= iou_threshold:
-            prev_distance = (
-                block.secondary_direction_start_coordinate
-                - ref_block.secondary_direction_end_coordinate
-                + child_match_distance_tolerance_len
-            ) // 5 + ref_block.start_coordinate / 5000
-            next_distance = (
-                ref_block.secondary_direction_start_coordinate
-                - block.secondary_direction_end_coordinate
-                + child_match_distance_tolerance_len
-            ) // 5 + ref_block.start_coordinate / 5000
-            if (
-                ref_block.secondary_direction_end_coordinate
-                <= block.secondary_direction_start_coordinate
-                + child_match_distance_tolerance_len
-                and prev_distance < min_prev_block_distance
-            ):
-                min_prev_block_distance = prev_distance
-                if (
-                    block.secondary_direction_start_coordinate
-                    - ref_block.secondary_direction_end_coordinate
-                    < gap_tolerance_len
-                ):
-                    prev_block_index = ref_block_idx
-            elif (
-                ref_block.secondary_direction_start_coordinate
-                > block.secondary_direction_end_coordinate
-                - child_match_distance_tolerance_len
-                and next_distance < min_post_block_distance
-            ):
-                min_post_block_distance = next_distance
-                if (
-                    ref_block.secondary_direction_start_coordinate
-                    - block.secondary_direction_end_coordinate
-                    < gap_tolerance_len
-                ):
-                    post_block_index = ref_block_idx
-
-    diff_dist = abs(min_prev_block_distance - min_post_block_distance)
-
-    # if the difference in distance is too large, only consider the nearest one
-    if diff_dist * 5 > block.short_side_length:
-        if min_prev_block_distance < min_post_block_distance:
-            post_block_index = None
-        else:
-            prev_block_index = None
-
-    return prev_block_index, post_block_index
-
-
 def update_doc_title_child_blocks(
-    block: LayoutParsingBlock,
-    region: LayoutParsingRegion,
+    block: LayoutBlock,
+    region: LayoutRegion,
 ) -> None:
     """
     Update the child blocks of a document title block.
@@ -762,8 +755,8 @@ def update_doc_title_child_blocks(
         6. The nearest edge distance should be less than 2 times of the text line height.
 
     Args:
-        blocks (List[LayoutParsingBlock]): overall blocks.
-        block (LayoutParsingBlock): document title block.
+        blocks (List[LayoutBlock]): overall blocks.
+        block (LayoutBlock): document title block.
         prev_idx (int): previous block index, None if not exist.
         post_idx (int): post block index, None if not exist.
         config (dict): configurations.
@@ -813,10 +806,24 @@ def update_doc_title_child_blocks(
             block.append_child_block(ref_block)
             region.normal_text_block_idxes.remove(ref_block.index)
 
+    for ref_block in ref_blocks:
+        if ref_block.order_label == "doc_title_text":
+            continue
+        with_seem_direction = ref_block.direction == block.direction
+
+        overlap_ratio = calculate_overlap_ratio(
+            block.bbox, ref_block.bbox, mode="small"
+        )
+
+        if overlap_ratio > 0.9 and with_seem_direction:
+            ref_block.order_label = "doc_title_text"
+            block.append_child_block(ref_block)
+            region.normal_text_block_idxes.remove(ref_block.index)
+
 
 def update_paragraph_title_child_blocks(
-    block: LayoutParsingBlock,
-    region: LayoutParsingRegion,
+    block: LayoutBlock,
+    region: LayoutRegion,
 ) -> None:
     """
     Update the child blocks of a paragraph title block.
@@ -827,8 +834,8 @@ def update_paragraph_title_child_blocks(
         3. The child block must be paragraph title block.
 
     Args:
-        blocks (List[LayoutParsingBlock]): overall blocks.
-        block (LayoutParsingBlock): document title block.
+        blocks (List[LayoutBlock]): overall blocks.
+        block (LayoutBlock): document title block.
         prev_idx (int): previous block index, None if not exist.
         post_idx (int): post block index, None if not exist.
         config (dict): configurations.
@@ -858,8 +865,13 @@ def update_paragraph_title_child_blocks(
                 block.bbox, ref_block.bbox
             )
             with_seem_direction = ref_block.direction == block.direction
+            with_seem_start = (
+                abs(ref_block.start_coordinate - block.start_coordinate)
+                < min_text_line_height * 2
+            )
             if (
                 with_seem_direction
+                and with_seem_start
                 and nearest_edge_distance <= min_text_line_height * 1.5
             ):
                 ref_block.order_label = "sub_paragraph_title"
@@ -868,8 +880,8 @@ def update_paragraph_title_child_blocks(
 
 
 def update_vision_child_blocks(
-    block: LayoutParsingBlock,
-    region: LayoutParsingRegion,
+    block: LayoutBlock,
+    region: LayoutRegion,
 ) -> None:
     """
     Update the child blocks of a paragraph title block.
@@ -887,8 +899,8 @@ def update_vision_child_blocks(
         4. The difference between their centers is very small.
 
     Args:
-        blocks (List[LayoutParsingBlock]): overall blocks.
-        block (LayoutParsingBlock): document title block.
+        blocks (List[LayoutBlock]): overall blocks.
+        block (LayoutBlock): document title block.
         ref_block_idxes (List[int]): A list of indices of reference blocks.
         prev_idx (int): previous block index, None if not exist.
         post_idx (int): post block index, None if not exist.
@@ -934,11 +946,11 @@ def update_vision_child_blocks(
                     not has_vision_footnote
                     and ref_block.direction == block.direction
                     and ref_block.long_side_length < block.long_side_length
+                    and nearest_edge_distance <= ref_block.text_line_height * 2
                 ):
                     if (
                         (
-                            nearest_edge_distance <= block.text_line_height * 2
-                            and ref_block.short_side_length < block.short_side_length
+                            ref_block.short_side_length < block.short_side_length
                             and ref_block.long_side_length
                             < 0.5 * block.long_side_length
                             and abs(block_center[0] - ref_block_center[0]) < 10
@@ -979,12 +991,17 @@ def update_vision_child_blocks(
             if ref_block.label in BLOCK_LABEL_MAP["text_labels"]:
                 if (
                     not has_vision_footnote
-                    and nearest_edge_distance <= block.text_line_height * 2
-                    and ref_block.short_side_length < block.short_side_length
-                    and ref_block.long_side_length < 0.5 * block.long_side_length
                     and ref_block.direction == block.direction
-                    and (
-                        abs(block_center[0] - ref_block_center[0]) < 10
+                    and ref_block.long_side_length < block.long_side_length
+                    and nearest_edge_distance <= ref_block.text_line_height * 2
+                ):
+                    if (
+                        (
+                            ref_block.short_side_length < block.short_side_length
+                            and ref_block.long_side_length
+                            < 0.5 * block.long_side_length
+                            and abs(block_center[0] - ref_block_center[0]) < 10
+                        )
                         or (
                             block.bbox[0] - ref_block.bbox[0] < 10
                             and ref_block.num_of_lines == 1
@@ -993,16 +1010,56 @@ def update_vision_child_blocks(
                             block.bbox[2] - ref_block.bbox[2] < 10
                             and ref_block.num_of_lines == 1
                         )
-                    )
-                ):
-                    has_vision_footnote = True
-                    ref_block.order_label = "vision_footnote"
-                    block.append_child_block(ref_block)
-                    region.normal_text_block_idxes.remove(ref_block.index)
+                    ):
+                        has_vision_footnote = True
+                        ref_block.label = "vision_footnote"
+                        ref_block.order_label = "vision_footnote"
+                        block.append_child_block(ref_block)
+                        region.normal_text_block_idxes.remove(ref_block.index)
                 break
         if has_vision_title:
             break
 
+    for ref_block in ref_blocks:
+        if ref_block.index not in region.normal_text_block_idxes:
+            continue
+
+        overlap_ratio = calculate_overlap_ratio(
+            block.bbox, ref_block.bbox, mode="small"
+        )
+
+        if overlap_ratio > 0.9:
+            ref_block.label = "vision_footnote"
+            ref_block.order_label = "vision_footnote"
+            block.append_child_block(ref_block)
+            region.normal_text_block_idxes.remove(ref_block.index)
+
+
+def update_region_child_blocks(
+    block: LayoutBlock,
+    region: LayoutRegion,
+) -> None:
+    """Attach every smaller, overlapping block to a region block as a child.
+
+    Every block in ``region.block_map`` other than ``block`` itself is
+    compared with ``block``. When the two bounding boxes overlap, ``block``
+    has the larger area, and the other block has not already been labelled
+    ``"sub_region"``, that block is relabelled ``"sub_region"``, appended to
+    ``block`` as a child and removed from ``region.normal_text_block_idxes``
+    (if it is listed there).
+
+    Args:
+        block (LayoutBlock): the parent block that collects the children.
+        region (LayoutRegion): layout region that owns ``block_map`` and
+            ``normal_text_block_idxes``; both blocks and the index list are
+            modified in place.
+
+    Returns:
+        None
+    """
+    for ref_block in region.block_map.values():
+        if ref_block.index == block.index:
+            continue
+        # Skip blocks that were already claimed as a child of some region.
+        if ref_block.order_label == "sub_region":
+            continue
+        if block.area <= ref_block.area:
+            continue
+        if calculate_overlap_ratio(block.bbox, ref_block.bbox) <= 0:
+            continue
+
+        ref_block.order_label = "sub_region"
+        block.append_child_block(ref_block)
+        # All blocks of the region are visited here, so the index may not be
+        # in the normal-text list; an unguarded list.remove() would raise
+        # ValueError in that case.
+        if ref_block.index in region.normal_text_block_idxes:
+            region.normal_text_block_idxes.remove(ref_block.index)
+
 
 def calculate_discontinuous_projection(
     boxes, direction="horizontal", return_num=False
@@ -1049,44 +1106,6 @@ def calculate_discontinuous_projection(
     return merged_intervals
 
 
-def is_projection_consistent(blocks, intervals, direction="horizontal"):
-
-    for interval in intervals:
-        if direction == "horizontal":
-            start_index, stop_index = 0, 2
-            interval_box = [interval[0], 0, interval[1], 1]
-        else:
-            start_index, stop_index = 1, 3
-            interval_box = [0, interval[0], 1, interval[1]]
-        same_interval_bboxes = []
-        for block in blocks:
-            overlap_ratio = calculate_projection_overlap_ratio(
-                interval_box, block.bbox, direction=direction
-            )
-            if overlap_ratio > 0 and block.label in BLOCK_LABEL_MAP["text_labels"]:
-                same_interval_bboxes.append(block.bbox)
-        start_coordinates = [bbox[start_index] for bbox in same_interval_bboxes]
-        if start_coordinates:
-            min_start_coordinate = min(start_coordinates)
-            max_start_coordinate = max(start_coordinates)
-            is_start_consistent = (
-                False
-                if max_start_coordinate - min_start_coordinate
-                >= abs(interval[0] - interval[1]) * 0.05
-                else True
-            )
-            stop_coordinates = [bbox[stop_index] for bbox in same_interval_bboxes]
-            min_stop_coordinate = min(stop_coordinates)
-            max_stop_coordinate = max(stop_coordinates)
-            if (
-                max_stop_coordinate - min_stop_coordinate
-                >= abs(interval[0] - interval[1]) * 0.05
-                and is_start_consistent
-            ):
-                return False
-    return True
-
-
 def shrink_overlapping_boxes(
     boxes, direction="horizontal", min_threshold=0, max_threshold=0.1
 ) -> List:
@@ -1125,8 +1144,12 @@ def shrink_overlapping_boxes(
                 split_y = int((overlap_y_min + overlap_y_max) / 2)
                 overlap_y_min = split_y - 1
                 overlap_y_max = split_y + 1
-                current_block.bbox = [x1, y1, x2, overlap_y_min]
-                block.bbox = [x1_prime, overlap_y_max, x2_prime, y2_prime]
+                if y1 < y1_prime:
+                    current_block.bbox = [x1, y1, x2, overlap_y_min]
+                    block.bbox = [x1_prime, overlap_y_max, x2_prime, y2_prime]
+                else:
+                    current_block.bbox = [x1, overlap_y_min, x2, y2]
+                    block.bbox = [x1_prime, y1_prime, x2_prime, overlap_y_max]
         else:
             if (
                 (match_iou > 0 and cut_iou > min_threshold and cut_iou < max_threshold)
@@ -1138,7 +1161,39 @@ def shrink_overlapping_boxes(
                 split_x = int((overlap_x_min + overlap_x_max) / 2)
                 overlap_x_min = split_x - 1
                 overlap_x_max = split_x + 1
-                current_block.bbox = [x1, y1, overlap_x_min, y2]
-                block.bbox = [overlap_x_max, y1_prime, x2_prime, y2_prime]
+                if x1 < x1_prime:
+                    current_block.bbox = [x1, y1, overlap_x_min, y2]
+                    block.bbox = [overlap_x_max, y1_prime, x2_prime, y2_prime]
+                else:
+                    current_block.bbox = [overlap_x_min, y1, x2, y2]
+                    block.bbox = [x1_prime, y1_prime, overlap_x_max, y2_prime]
         current_block = block
     return boxes
+
+
+def find_local_minima_flat_regions(arr) -> List:
+    """
+    Find flat (constant-valued) local-minimum runs in a 1-D sequence.
+
+    A maximal run of equal values ``arr[start..end]`` is recorded when the
+    value right after it is strictly greater and either the run starts at
+    index 0 or the value right before it is strictly greater. The run that
+    reaches the end of ``arr`` is never evaluated, and the first recorded run
+    is dropped from the result.
+
+    Args:
+        arr (list): The input sequence; it must support ``len`` and integer
+            indexing.
+
+    Returns:
+        list: ``(start, end)`` inclusive index pairs of the recorded runs
+            except the first one. ``[]`` is returned for an empty input and
+            ``None`` when fewer than two runs were recorded.
+    """
+    n = len(arr)
+    if n == 0:
+        return []
+
+    flat_minima_regions = []
+    start = 0
+
+    for i in range(1, n):
+        if arr[i] == arr[i - 1]:
+            continue
+        # arr[start..i-1] is a maximal run of equal values; arr[i] is the
+        # first value after it.
+        lower_than_prev = start == 0 or arr[start - 1] > arr[start]
+        if lower_than_prev and arr[i] > arr[start]:
+            flat_minima_regions.append((start, i - 1))
+        start = i
+
+    # NOTE(review): the first recorded run is discarded and the trailing run
+    # is never considered - presumably these are margins rather than gaps
+    # between columns; confirm against the caller.
+    if len(flat_minima_regions) > 1:
+        return flat_minima_regions[1:]
+    return None

+ 156 - 104
paddlex/inference/pipelines/layout_parsing/xycut_enhanced/xycuts.py

@@ -17,11 +17,14 @@ from typing import Dict, List, Tuple
 
 import numpy as np
 
-from ..result_v2 import LayoutParsingBlock, LayoutParsingRegion
-from ..setting import BLOCK_LABEL_MAP
+from ..layout_objects import LayoutBlock, LayoutRegion
+from ..setting import BLOCK_LABEL_MAP, XYCUT_SETTINGS
 from ..utils import calculate_overlap_ratio, calculate_projection_overlap_ratio
 from .utils import (
     calculate_discontinuous_projection,
+    euclidean_insert,
+    find_local_minima_flat_regions,
+    get_blocks_by_direction_interval,
     get_cut_blocks,
     insert_child_blocks,
     manhattan_insert,
@@ -31,16 +34,16 @@ from .utils import (
     reference_insert,
     shrink_overlapping_boxes,
     sort_normal_blocks,
-    split_projection_profile,
     update_doc_title_child_blocks,
     update_paragraph_title_child_blocks,
+    update_region_child_blocks,
     update_vision_child_blocks,
     weighted_distance_insert,
 )
 
 
 def pre_process(
-    region: LayoutParsingRegion,
+    region: LayoutRegion,
 ) -> List:
     """
     Preprocess the layout for sorting purposes.
@@ -63,10 +66,11 @@ def pre_process(
         "sub_paragraph_title",
         "doc_title_text",
         "vision_title",
+        "sub_region",
     ]
     pre_cut_block_idxes = []
     block_map = region.block_map
-    blocks: List[LayoutParsingBlock] = list(block_map.values())
+    blocks: List[LayoutBlock] = list(block_map.values())
     for block in blocks:
         if block.order_label not in mask_labels:
             update_region_label(block, region)
@@ -83,7 +87,6 @@ def pre_process(
         ) / 2
         center_offset = abs(block_center - region.direction_center_coordinate)
         is_centered = center_offset <= tolerance_len
-
         if is_centered:
             pre_cut_block_idxes.append(block.index)
 
@@ -121,60 +124,83 @@ def pre_process(
                             block.secondary_direction_start_coordinate
                         )
                         cut_coordinates.append(block.secondary_direction_end_coordinate)
-    secondary_discontinuous = calculate_discontinuous_projection(
-        all_boxes, direction=region.direction
+    secondary_check_bboxes = np.array(
+        [
+            block.bbox
+            for block in blocks
+            if block.order_label not in mask_labels + ["vision"]
+        ]
     )
-    if len(secondary_discontinuous) == 1:
-        if not discontinuous:
-            discontinuous = calculate_discontinuous_projection(
-                all_boxes, direction=cut_direction
-            )
-        current_interval = discontinuous[0]
-        for interval in discontinuous[1:]:
-            gap_len = interval[0] - current_interval[1]
-            if gap_len >= region.text_line_height * 3:
-                cut_coordinates.append(current_interval[1])
-            elif gap_len > region.text_line_height * 1.2:
-                (pre_blocks, post_blocks) = get_cut_blocks(
-                    list(block_map.values()), cut_direction, [current_interval[1]], []
-                )
-                pre_bboxes = np.array([block.bbox for block in pre_blocks])
-                post_bboxes = np.array([block.bbox for block in post_blocks])
-                projection_index = 1 if cut_direction == "horizontal" else 0
-                pre_projection = projection_by_bboxes(pre_bboxes, projection_index)
-                post_projection = projection_by_bboxes(post_bboxes, projection_index)
-                pre_projection_min = np.min(pre_projection)
-                post_projection_min = np.min(post_projection)
-                pre_projection_min += 5 if pre_projection_min != 0 else 0
-                post_projection_min += 5 if post_projection_min != 0 else 0
-                pre_intervals = split_projection_profile(
-                    pre_projection, pre_projection_min, 1
-                )
-                post_intervals = split_projection_profile(
-                    post_projection, post_projection_min, 1
+    if len(secondary_check_bboxes) > 0 or blocks[0].label == "region":
+        secondary_discontinuous = calculate_discontinuous_projection(
+            secondary_check_bboxes, direction=region.direction
+        )
+        if len(secondary_discontinuous) == 1 or blocks[0].label == "region":
+            if not discontinuous:
+                discontinuous = calculate_discontinuous_projection(
+                    all_boxes, direction=cut_direction
                 )
-                pre_gap_boxes = []
-                if pre_intervals is not None:
-                    for start, end in zip(*pre_intervals):
-                        bbox = [0] * 4
-                        bbox[projection_index] = start
-                        bbox[projection_index + 2] = end
-                        pre_gap_boxes.append(bbox)
-                post_gap_boxes = []
-                if post_intervals is not None:
-                    for start, end in zip(*post_intervals):
-                        bbox = [0] * 4
-                        bbox[projection_index] = start
-                        bbox[projection_index + 2] = end
-                        post_gap_boxes.append(bbox)
-                max_gap_boxes_num = max(len(pre_gap_boxes), len(post_gap_boxes))
-                if max_gap_boxes_num > 0:
-                    discontinuous_intervals = calculate_discontinuous_projection(
-                        pre_gap_boxes + post_gap_boxes, direction=region.direction
+            current_interval = discontinuous[0]
+            pre_cut_coordinates = [
+                cood for cood in cut_coordinates if cood < current_interval[1]
+            ]
+            if not pre_cut_coordinates:
+                pre_cut_coordinate = 0
+            else:
+                pre_cut_coordinate = max(pre_cut_coordinates)
+            pre_cut_coordinate = max(current_interval[0], pre_cut_coordinate)
+            for interval in discontinuous[1:]:
+                gap_len = interval[0] - current_interval[1]
+                if (
+                    gap_len >= region.text_line_height * 3
+                    or blocks[0].label == "region"
+                ):
+                    cut_coordinates.append(current_interval[1])
+                elif gap_len > region.text_line_height * 1.2:
+                    pre_blocks = get_blocks_by_direction_interval(
+                        list(block_map.values()),
+                        pre_cut_coordinate,
+                        current_interval[1],
+                        cut_direction,
+                    )
+                    post_blocks = get_blocks_by_direction_interval(
+                        list(block_map.values()),
+                        current_interval[1],
+                        interval[1],
+                        cut_direction,
                     )
-                    if len(discontinuous_intervals) != max_gap_boxes_num:
-                        cut_coordinates.append(current_interval[1])
-            current_interval = interval
+                    pre_bboxes = np.array([block.bbox for block in pre_blocks])
+                    post_bboxes = np.array([block.bbox for block in post_blocks])
+                    projection_index = 1 if cut_direction == "horizontal" else 0
+                    pre_projection = projection_by_bboxes(pre_bboxes, projection_index)
+                    post_projection = projection_by_bboxes(
+                        post_bboxes, projection_index
+                    )
+                    pre_intervals = find_local_minima_flat_regions(pre_projection)
+                    post_intervals = find_local_minima_flat_regions(post_projection)
+                    pre_gap_boxes = []
+                    if pre_intervals is not None:
+                        for start, end in pre_intervals:
+                            bbox = [0] * 4
+                            bbox[projection_index] = start
+                            bbox[projection_index + 2] = end
+                            pre_gap_boxes.append(bbox)
+                    post_gap_boxes = []
+                    if post_intervals is not None:
+                        for start, end in post_intervals:
+                            bbox = [0] * 4
+                            bbox[projection_index] = start
+                            bbox[projection_index + 2] = end
+                            post_gap_boxes.append(bbox)
+                    max_gap_boxes_num = max(len(pre_gap_boxes), len(post_gap_boxes))
+                    if max_gap_boxes_num > 0:
+                        discontinuous_intervals = calculate_discontinuous_projection(
+                            pre_gap_boxes + post_gap_boxes, direction=region.direction
+                        )
+                        if len(discontinuous_intervals) != max_gap_boxes_num:
+                            pre_cut_coordinate = current_interval[1]
+                            cut_coordinates.append(current_interval[1])
+                current_interval = interval
     cut_list = get_cut_blocks(blocks, cut_direction, cut_coordinates, mask_labels)
     pre_cut_list.extend(cut_list)
     if region.direction == "vertical":
@@ -184,14 +210,14 @@ def pre_process(
 
 
 def update_region_label(
-    block: LayoutParsingBlock,
-    region: LayoutParsingRegion,
+    block: LayoutBlock,
+    region: LayoutRegion,
 ) -> None:
     """
     Update the region label of a block based on its label and match the block with its children.
 
     Args:
-        blocks (List[LayoutParsingBlock]): The list of blocks to process.
+        blocks (List[LayoutBlock]): The list of blocks to process.
         config (Dict[str, Any]): The configuration dictionary containing the necessary information.
         block_idx (int): The index of the current block being processed.
 
@@ -210,17 +236,18 @@ def update_region_label(
     elif block.label in BLOCK_LABEL_MAP["vision_labels"]:
         block.order_label = "vision"
         block.num_of_lines = 1
-        block.direction = region.direction
-        block.update_direction_info()
+        block.update_direction(region.direction)
     elif block.label in BLOCK_LABEL_MAP["footer_labels"]:
         block.order_label = "footer"
     elif block.label in BLOCK_LABEL_MAP["unordered_labels"]:
         block.order_label = "unordered"
+    elif block.label == "region":
+        block.order_label = "region"
     else:
         block.order_label = "normal_text"
 
     # only vision and doc title block can have child block
-    if block.order_label not in ["vision", "doc_title", "paragraph_title"]:
+    if block.order_label not in ["vision", "doc_title", "paragraph_title", "region"]:
         return
 
     # match doc title text block
@@ -232,10 +259,12 @@ def update_region_label(
     # match vision title block and vision footnote block
     elif block.order_label == "vision":
         update_vision_child_blocks(block, region)
+    elif block.order_label == "region":
+        update_region_child_blocks(block, region)
 
 
 def get_layout_structure(
-    blocks: List[LayoutParsingBlock],
+    blocks: List[LayoutBlock],
     region_direction: str,
     region_secondary_direction: str,
 ) -> Tuple[List[Dict[str, any]], bool]:
@@ -263,11 +292,11 @@ def get_layout_structure(
                 continue
 
             bbox_iou = calculate_overlap_ratio(block.bbox, ref_block.bbox)
-            if bbox_iou > 0:
+            if bbox_iou:
                 if ref_block.order_label == "vision":
                     ref_block.order_label = "cross_layout"
                     break
-                if block.order_label == "vision" or block.area < ref_block.area:
+                if bbox_iou > 0.1 and block.area < ref_block.area:
                     block.order_label = "cross_layout"
                     break
 
@@ -320,13 +349,19 @@ def get_layout_structure(
                         and ref_match_projection_iou == 0
                         and secondary_direction_ref_match_projection_overlap_ratio > 0
                     ):
-                        if block.order_label == "vision" or (
+                        if block.order_label in ["vision", "region"] or (
                             ref_block.order_label == "normal_text"
                             and second_ref_block.order_label == "normal_text"
-                            and ref_block.text_line_width
-                            > ref_block.text_line_height * 5
-                            and second_ref_block.text_line_width
-                            > second_ref_block.text_line_height * 5
+                            and ref_block.long_side_length
+                            > ref_block.text_line_height
+                            * XYCUT_SETTINGS.get(
+                                "cross_layout_ref_text_block_words_num_threshold", 8
+                            )
+                            and second_ref_block.long_side_length
+                            > second_ref_block.text_line_height
+                            * XYCUT_SETTINGS.get(
+                                "cross_layout_ref_text_block_words_num_threshold", 8
+                            )
                         ):
                             block.order_label = (
                                 "cross_reference"
@@ -374,20 +409,20 @@ def sort_by_xycut(
 
 
 def match_unsorted_blocks(
-    sorted_blocks: List[LayoutParsingBlock],
-    unsorted_blocks: List[LayoutParsingBlock],
-    region: LayoutParsingRegion,
-) -> List[LayoutParsingBlock]:
+    sorted_blocks: List[LayoutBlock],
+    unsorted_blocks: List[LayoutBlock],
+    region: LayoutRegion,
+) -> List[LayoutBlock]:
     """
     Match special blocks with the sorted blocks based on their region labels.
     Args:
-        sorted_blocks (List[LayoutParsingBlock]): Sorted blocks to be matched.
-        unsorted_blocks (List[LayoutParsingBlock]): Unsorted blocks to be matched.
+        sorted_blocks (List[LayoutBlock]): Sorted blocks to be matched.
+        unsorted_blocks (List[LayoutBlock]): Unsorted blocks to be matched.
         config (Dict): Configuration dictionary containing various parameters.
         median_width (int): Median width value used for calculations.
 
     Returns:
-        List[LayoutParsingBlock]: The updated sorted blocks after matching special blocks.
+        List[LayoutBlock]: The updated sorted blocks after matching special blocks.
     """
     distance_type_map = {
         "cross_layout": weighted_distance_insert,
@@ -398,6 +433,7 @@ def match_unsorted_blocks(
         "cross_reference": reference_insert,
         "unordered": manhattan_insert,
         "other": manhattan_insert,
+        "region": euclidean_insert,
     }
 
     unsorted_blocks = sort_normal_blocks(
@@ -407,17 +443,19 @@ def match_unsorted_blocks(
         region.direction,
     )
     for idx, block in enumerate(unsorted_blocks):
-        order_label = block.order_label
+        order_label = block.order_label if block.label != "region" else "region"
         if idx == 0 and order_label == "doc_title":
             sorted_blocks.insert(0, block)
             continue
-        sorted_blocks = distance_type_map[order_label](block, sorted_blocks, region)
+        sorted_blocks = distance_type_map[order_label](
+            block=block, sorted_blocks=sorted_blocks, region=region
+        )
     return sorted_blocks
 
 
 def xycut_enhanced(
-    region: LayoutParsingRegion,
-) -> LayoutParsingRegion:
+    region: LayoutRegion,
+) -> LayoutRegion:
     """
     xycut_enhance function performs the following steps:
         1. Preprocess the input blocks by extracting headers, footers, and pre-cut blocks.
@@ -428,34 +466,34 @@ def xycut_enhanced(
         6. Return the ordered result list.
 
     Args:
-        blocks (List[LayoutParsingBlock]): Input blocks to be processed.
+        blocks (List[LayoutBlock]): Input blocks to be processed.
 
     Returns:
-        List[LayoutParsingBlock]: Ordered result list after processing.
+        List[LayoutBlock]: Ordered result list after processing.
     """
     if len(region.block_map) == 0:
         return []
 
-    pre_cut_list: List[List[LayoutParsingBlock]] = pre_process(region)
-    final_order_res_list: List[LayoutParsingBlock] = []
+    pre_cut_list: List[List[LayoutBlock]] = pre_process(region)
+    final_order_res_list: List[LayoutBlock] = []
 
-    header_blocks: List[LayoutParsingBlock] = [
+    header_blocks: List[LayoutBlock] = [
         region.block_map[idx] for idx in region.header_block_idxes
     ]
-    unordered_blocks: List[LayoutParsingBlock] = [
+    unordered_blocks: List[LayoutBlock] = [
         region.block_map[idx] for idx in region.unordered_block_idxes
     ]
-    footer_blocks: List[LayoutParsingBlock] = [
+    footer_blocks: List[LayoutBlock] = [
         region.block_map[idx] for idx in region.footer_block_idxes
     ]
 
-    header_blocks: List[LayoutParsingBlock] = sort_normal_blocks(
+    header_blocks: List[LayoutBlock] = sort_normal_blocks(
         header_blocks, region.text_line_height, region.text_line_width, region.direction
     )
-    footer_blocks: List[LayoutParsingBlock] = sort_normal_blocks(
+    footer_blocks: List[LayoutBlock] = sort_normal_blocks(
         footer_blocks, region.text_line_height, region.text_line_width, region.direction
     )
-    unordered_blocks: List[LayoutParsingBlock] = sort_normal_blocks(
+    unordered_blocks: List[LayoutBlock] = sort_normal_blocks(
         unordered_blocks,
         region.text_line_height,
         region.text_line_width,
@@ -463,16 +501,26 @@ def xycut_enhanced(
     )
     final_order_res_list.extend(header_blocks)
 
-    unsorted_blocks: List[LayoutParsingBlock] = []
-    sorted_blocks_by_pre_cuts: List[LayoutParsingBlock] = []
+    unsorted_blocks: List[LayoutBlock] = []
+    sorted_blocks_by_pre_cuts: List[LayoutBlock] = []
     for pre_cut_blocks in pre_cut_list:
-        sorted_blocks: List[LayoutParsingBlock] = []
-        doc_title_blocks: List[LayoutParsingBlock] = []
-        xy_cut_blocks: List[LayoutParsingBlock] = []
+        sorted_blocks: List[LayoutBlock] = []
+        doc_title_blocks: List[LayoutBlock] = []
+        xy_cut_blocks: List[LayoutBlock] = []
 
-        get_layout_structure(
-            pre_cut_blocks, region.direction, region.secondary_direction
-        )
+        if pre_cut_blocks and pre_cut_blocks[0].label == "region":
+            block_bboxes = np.array([block.bbox for block in pre_cut_blocks])
+            discontinuous = calculate_discontinuous_projection(
+                block_bboxes, direction=region.direction
+            )
+            if len(discontinuous) == 1:
+                get_layout_structure(
+                    pre_cut_blocks, region.direction, region.secondary_direction
+                )
+        else:
+            get_layout_structure(
+                pre_cut_blocks, region.direction, region.secondary_direction
+            )
 
         # Get xy cut blocks and add other blocks in special_block_map
         for block in pre_cut_blocks:
@@ -494,8 +542,6 @@ def xycut_enhanced(
             discontinuous = calculate_discontinuous_projection(
                 block_bboxes, direction=region.direction
             )
-            if len(discontinuous) > 1:
-                xy_cut_blocks = [block for block in xy_cut_blocks]
             blocks_to_sort = deepcopy(xy_cut_blocks)
             if region.direction == "vertical":
                 for block in blocks_to_sort:
@@ -526,7 +572,7 @@ def xycut_enhanced(
                     )
                 )
                 blocks_to_sort = shrink_overlapping_boxes(
-                    blocks_to_sort, region.direction
+                    blocks_to_sort, region.secondary_direction
                 )
                 block_bboxes = np.array([block.bbox for block in blocks_to_sort])
                 sorted_indexes = sort_by_xycut(
@@ -536,13 +582,19 @@ def xycut_enhanced(
             sorted_blocks = [
                 region.block_map[blocks_to_sort[i].index] for i in sorted_indexes
             ]
-
         sorted_blocks = match_unsorted_blocks(
             sorted_blocks,
             doc_title_blocks,
             region=region,
         )
 
+        if unsorted_blocks and unsorted_blocks[0].label == "region":
+            sorted_blocks = match_unsorted_blocks(
+                sorted_blocks,
+                unsorted_blocks,
+                region=region,
+            )
+            unsorted_blocks = []
         sorted_blocks_by_pre_cuts.extend(sorted_blocks)
 
     final_sorted_blocks = match_unsorted_blocks(