zhouchangda пре 7 месеци
родитељ
комит
e243a7e536

+ 72 - 66
paddlex/inference/models/formula_recognition/processors.py

@@ -631,74 +631,80 @@ class UniMERNetDecode(object):
         self.pad_token_type_id = 0
         self.pad_to_multiple_of = None
 
-        temp_path = tempfile.gettempdir()
-        fast_tokenizer_file = os.path.join(temp_path, "tokenizer.json")
-        tokenizer_config_file = os.path.join(temp_path, "tokenizer_config.json")
-        try:
-            with open(fast_tokenizer_file, "w") as f:
-                json.dump(character_list["fast_tokenizer_file"], f)
-            with open(tokenizer_config_file, "w") as f:
-                json.dump(character_list["tokenizer_config_file"], f)
-        except Exception as e:
-            print(
-                f"创建 tokenizer.json 和 tokenizer_config.json 文件失败, 原因{str(e)}"
-            )
-
-        self.tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
-        added_tokens_decoder = {}
-        added_tokens_map = {}
-        if tokenizer_config_file is not None:
-            with open(
-                tokenizer_config_file, encoding="utf-8"
-            ) as tokenizer_config_handle:
-                init_kwargs = json.load(tokenizer_config_handle)
-                if "added_tokens_decoder" in init_kwargs:
-                    for idx, token in init_kwargs["added_tokens_decoder"].items():
-                        if isinstance(token, dict):
-                            token = AddedToken(**token)
-                        if isinstance(token, AddedToken):
-                            added_tokens_decoder[int(idx)] = token
-                            added_tokens_map[str(token)] = token
-                        else:
-                            raise ValueError(
-                                f"Found a {token.__class__} in the saved `added_tokens_decoder`, should be a dictionary or an AddedToken instance"
-                            )
-                init_kwargs["added_tokens_decoder"] = added_tokens_decoder
-                added_tokens_decoder = init_kwargs.pop("added_tokens_decoder", {})
-                tokens_to_add = [
-                    token
-                    for index, token in sorted(
-                        added_tokens_decoder.items(), key=lambda x: x[0]
-                    )
-                    if token not in added_tokens_decoder
-                ]
-                added_tokens_encoder = self.added_tokens_encoder(added_tokens_decoder)
-                encoder = list(added_tokens_encoder.keys()) + [
-                    str(token) for token in tokens_to_add
-                ]
-                tokens_to_add += [
-                    token
-                    for token in self.all_special_tokens_extended
-                    if token not in encoder and token not in tokens_to_add
-                ]
-                if len(tokens_to_add) > 0:
-                    is_last_special = None
-                    tokens = []
-                    special_tokens = self.all_special_tokens
-                    for token in tokens_to_add:
-                        is_special = (
-                            (token.special or str(token) in special_tokens)
-                            if isinstance(token, AddedToken)
-                            else str(token) in special_tokens
+        with tempfile.NamedTemporaryFile(
+            mode="w", suffix=".json", delete=True
+        ) as temp_file1, tempfile.NamedTemporaryFile(
+            mode="w", suffix=".json", delete=True
+        ) as temp_file2:
+            fast_tokenizer_file = temp_file1.name
+            tokenizer_config_file = temp_file2.name
+            try:
+                with open(fast_tokenizer_file, "w") as f:
+                    json.dump(character_list["fast_tokenizer_file"], f)
+                with open(tokenizer_config_file, "w") as f:
+                    json.dump(character_list["tokenizer_config_file"], f)
+            except Exception as e:
+                print(
+                    f"创建 tokenizer.json 和 tokenizer_config.json 文件失败, 原因{str(e)}"
+                )
+
+            self.tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
+            added_tokens_decoder = {}
+            added_tokens_map = {}
+            if tokenizer_config_file is not None:
+                with open(
+                    tokenizer_config_file, encoding="utf-8"
+                ) as tokenizer_config_handle:
+                    init_kwargs = json.load(tokenizer_config_handle)
+                    if "added_tokens_decoder" in init_kwargs:
+                        for idx, token in init_kwargs["added_tokens_decoder"].items():
+                            if isinstance(token, dict):
+                                token = AddedToken(**token)
+                            if isinstance(token, AddedToken):
+                                added_tokens_decoder[int(idx)] = token
+                                added_tokens_map[str(token)] = token
+                            else:
+                                raise ValueError(
+                                    f"Found a {token.__class__} in the saved `added_tokens_decoder`, should be a dictionary or an AddedToken instance"
+                                )
+                    init_kwargs["added_tokens_decoder"] = added_tokens_decoder
+                    added_tokens_decoder = init_kwargs.pop("added_tokens_decoder", {})
+                    tokens_to_add = [
+                        token
+                        for index, token in sorted(
+                            added_tokens_decoder.items(), key=lambda x: x[0]
                         )
-                        if is_last_special is None or is_last_special == is_special:
-                            tokens.append(token)
-                        else:
+                        if token not in added_tokens_decoder
+                    ]
+                    added_tokens_encoder = self.added_tokens_encoder(
+                        added_tokens_decoder
+                    )
+                    encoder = list(added_tokens_encoder.keys()) + [
+                        str(token) for token in tokens_to_add
+                    ]
+                    tokens_to_add += [
+                        token
+                        for token in self.all_special_tokens_extended
+                        if token not in encoder and token not in tokens_to_add
+                    ]
+                    if len(tokens_to_add) > 0:
+                        is_last_special = None
+                        tokens = []
+                        special_tokens = self.all_special_tokens
+                        for token in tokens_to_add:
+                            is_special = (
+                                (token.special or str(token) in special_tokens)
+                                if isinstance(token, AddedToken)
+                                else str(token) in special_tokens
+                            )
+                            if is_last_special is None or is_last_special == is_special:
+                                tokens.append(token)
+                            else:
+                                self._add_tokens(tokens, special_tokens=is_last_special)
+                                tokens = [token]
+                            is_last_special = is_special
+                        if tokens:
                             self._add_tokens(tokens, special_tokens=is_last_special)
-                            tokens = [token]
-                        is_last_special = is_special
-                    if tokens:
-                        self._add_tokens(tokens, special_tokens=is_last_special)
 
     def _add_tokens(
         self, new_tokens: "List[Union[AddedToken, str]]", special_tokens: bool = False

+ 2 - 2
paddlex/inference/pipelines/layout_parsing/pipeline.py

@@ -240,10 +240,10 @@ class LayoutParsingPipeline(BasePipeline):
                     )
                     seal_index += 1
             else:
-                ocr_res_in_box, matched_idxs = get_sub_regions_ocr_res(
+                ocr_res_in_box, matched_idxes = get_sub_regions_ocr_res(
                     overall_ocr_res, [box], return_match_idx=True
                 )
-                for matched_idx in matched_idxs:
+                for matched_idx in matched_idxes:
                     if matched_ocr_dict.get(matched_idx, None) is None:
                         matched_ocr_dict[matched_idx] = [object_box_idx]
                     else:

+ 438 - 100
paddlex/inference/pipelines/layout_parsing/pipeline_v2.py

@@ -15,9 +15,10 @@ from __future__ import annotations
 
 import copy
 import re
-from typing import Any, Dict, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import numpy as np
+from PIL import Image
 
 from ....utils import logging
 from ....utils.deps import pipeline_requires_extra
@@ -28,8 +29,22 @@ from ...utils.hpi import HPIConfig
 from ...utils.pp_option import PaddlePredictorOption
 from ..base import BasePipeline
 from ..ocr.result import OCRResult
-from .result_v2 import LayoutParsingResultV2
-from .utils import gather_imgs, get_single_block_parsing_res, get_sub_regions_ocr_res
+from .result_v2 import LayoutParsingBlock, LayoutParsingResultV2
+from .utils import (
+    caculate_bbox_area,
+    calculate_text_orientation,
+    convert_formula_res_to_ocr_format,
+    format_line,
+    gather_imgs,
+    get_bbox_intersection,
+    get_sub_regions_ocr_res,
+    group_boxes_into_lines,
+    remove_overlap_blocks,
+    split_boxes_if_x_contained,
+    update_layout_order_config_block_index,
+    update_region_box,
+)
+from .xycut_enhanced import xycut_enhanced
 
 
 @pipeline_requires_extra("ocr")
@@ -67,7 +82,6 @@ class LayoutParsingPipelineV2(BasePipeline):
         )
 
         self.inintial_predictor(config)
-
         self.batch_sampler = ImageBatchSampler(batch_size=1)
 
         self.img_reader = ReadImage(format="BGR")
@@ -229,147 +243,477 @@ class LayoutParsingPipelineV2(BasePipeline):
 
         return True
 
-    def get_layout_parsing_res(
+    def standardized_data(
         self,
         image: list,
+        layout_order_config: dict,
         layout_det_res: DetResult,
         overall_ocr_res: OCRResult,
-        table_res_list: list,
-        seal_res_list: list,
         formula_res_list: list,
-        imgs_in_doc: list,
-        text_det_limit_side_len: Optional[int] = None,
-        text_det_limit_type: Optional[str] = None,
-        text_det_thresh: Optional[float] = None,
-        text_det_box_thresh: Optional[float] = None,
-        text_det_unclip_ratio: Optional[float] = None,
-        text_rec_score_thresh: Optional[float] = None,
+        text_rec_model: Any,
+        text_rec_score_thresh: Union[float, None] = None,
     ) -> list:
         """
         Retrieves the layout parsing result based on the layout detection result, OCR result, and other recognition results.
         Args:
             image (list): The input image.
-            layout_det_res (DetResult): The detection result containing the layout information of the document.
-            overall_ocr_res (OCRResult): The overall OCR result containing text information.
-            table_res_list (list): A list of table recognition results.
-            seal_res_list (list): A list of seal recognition results.
+            layout_order_config (dict): Ordering configuration, updated in place. This method reads
+                "doc_title_block_idxes", "paragraph_title_block_idxes" and "title_area_max_block_threshold",
+                appends to / removes from "text_block_idxes", "footer_block_idxes", "doc_title_block_idxes" and
+                "paragraph_title_block_idxes", and writes "all_layout_region_box", "layout_to_ocr_mapping"
+                and "matched_ocr_dict".
+
+            layout_det_res (DetResult): The layout detection result. It is first passed through
+                remove_overlap_blocks; each entry of "boxes" in the filtered result is read through its
+                "coordinate" and "label" keys, and some labels are rewritten (footnote -> text,
+                paragraph_title -> doc_title).
+
+            overall_ocr_res (OCRResult): The overall OCR result, updated in place. The entries "dt_polys",
+                "rec_boxes", "rec_polys", "rec_scores", "rec_texts" and "rec_labels" are read and/or modified.
+
             formula_res_list (list): A list of formula recognition results.
-            text_det_limit_side_len (Optional[int], optional): The maximum side length of the text detection region. Defaults to None.
-            text_det_limit_type (Optional[str], optional): The type of limit for the text detection region. Defaults to None.
-            text_det_thresh (Optional[float], optional): The confidence threshold for text detection. Defaults to None.
-            text_det_box_thresh (Optional[float], optional): The confidence threshold for text detection bounding boxes. Defaults to None
-            text_det_unclip_ratio (Optional[float], optional): The unclip ratio for text detection. Defaults to None.
+            text_rec_model (Any): The text recognition model, called as text_rec_model([crop_img]); each
+                result is read through its "rec_score" and "rec_text" keys.
             text_rec_score_thresh (Optional[float], optional): The score threshold for text recognition. Defaults to None.
+                When None, self.general_ocr_pipeline.text_rec_score_thresh is used.
         Returns:
             list: A list of dictionaries representing the layout parsing result.
+                NOTE(review): the code actually returns the tuple (layout_order_config, layout_det_res).
         """
+
         matched_ocr_dict = {}
-        image = np.array(image)
+        layout_to_ocr_mapping = {}
+        # NOTE(review): object_boxes is still filled in the loop below but is not read anywhere else in this method.
         object_boxes = []
         footnote_list = []
-        max_bottom_text_coordinate = 0
+        bottom_text_y_max = 0
+        max_block_area = 0.0
+
+        # start value handed to update_region_box; presumably [x_min, y_min, x_max, y_max] — TODO confirm
+        region_box = [65535, 65535, 0, 0]
+        layout_det_res = remove_overlap_blocks(
+            layout_det_res,
+            threshold=0.5,
+            smaller=True,
+        )
+
+        # convert formula_res_list to OCRResult format
+        convert_formula_res_to_ocr_format(formula_res_list, overall_ocr_res)
 
-        for object_box_idx, box_info in enumerate(layout_det_res["boxes"]):
+        # match layout boxes and ocr boxes and get some information for layout_order_config
+        for box_idx, box_info in enumerate(layout_det_res["boxes"]):
             box = box_info["coordinate"]
             label = box_info["label"].lower()
             object_boxes.append(box)
+            _, _, _, y2 = box
+
+            # update the region box and max_block_area according to the layout boxes
+            region_box = update_region_box(box, region_box)
+            max_block_area = max(max_block_area, caculate_bbox_area(box))
+
+            update_layout_order_config_block_index(layout_order_config, label, box_idx)
 
             # set the label of footnote to text, when it is above the text boxes
             if label == "footnote":
-                footnote_list.append(object_box_idx)
-            if label == "text" and box[3] > max_bottom_text_coordinate:
-                max_bottom_text_coordinate = box[3]
+                footnote_list.append(box_idx)
+            if label == "text":
+                bottom_text_y_max = max(y2, bottom_text_y_max)
 
             if label not in ["formula", "table", "seal"]:
-                _, matched_idxs = get_sub_regions_ocr_res(
+                _, matched_idxes = get_sub_regions_ocr_res(
                     overall_ocr_res, [box], return_match_idx=True
                 )
-                for matched_idx in matched_idxs:
+                layout_to_ocr_mapping[box_idx] = matched_idxes
+                # matched_ocr_dict is the inverse mapping: OCR index -> list of layout box indexes it matched
+                for matched_idx in matched_idxes:
                     if matched_ocr_dict.get(matched_idx, None) is None:
-                        matched_ocr_dict[matched_idx] = [object_box_idx]
+                        matched_ocr_dict[matched_idx] = [box_idx]
                     else:
-                        matched_ocr_dict[matched_idx].append(object_box_idx)
+                        matched_ocr_dict[matched_idx].append(box_idx)
 
+        # fix the footnote label
+        # NOTE(review): the index is removed from "footer_block_idxes" although the label was "footnote";
+        # confirm that update_layout_order_config_block_index files footnotes under that key.
         for footnote_idx in footnote_list:
             if (
                 layout_det_res["boxes"][footnote_idx]["coordinate"][3]
-                < max_bottom_text_coordinate
+                < bottom_text_y_max
             ):
                 layout_det_res["boxes"][footnote_idx]["label"] = "text"
+                layout_order_config["text_block_idxes"].append(footnote_idx)
+                layout_order_config["footer_block_idxes"].remove(footnote_idx)
 
-        already_processed = set()
-        for matched_idx, layout_box_ids in matched_ocr_dict.items():
-            if len(layout_box_ids) <= 1:
-                continue
-
-            # one ocr is matched to multiple layout boxes, split the text into multiple lines
-            for idx in layout_box_ids:
-                if idx in already_processed:
-                    continue
-
-                already_processed.add(idx)
-                wht_im = np.ones(image.shape, dtype=image.dtype) * 255
-                box = object_boxes[idx]
-                x1, y1, x2, y2 = [int(i) for i in box]
-                wht_im[y1:y2, x1:x2, :] = image[y1:y2, x1:x2, :]
-                sub_ocr_res = next(
-                    self.general_ocr_pipeline(
-                        wht_im,
-                        text_det_limit_side_len=text_det_limit_side_len,
-                        text_det_limit_type=text_det_limit_type,
-                        text_det_thresh=text_det_thresh,
-                        text_det_box_thresh=text_det_box_thresh,
-                        text_det_unclip_ratio=text_det_unclip_ratio,
-                        text_rec_score_thresh=text_rec_score_thresh,
-                    )
+        # fix the doc_title label
+        doc_title_idxes = layout_order_config.get("doc_title_block_idxes", [])
+        paragraph_title_idxes = layout_order_config.get(
+            "paragraph_title_block_idxes", []
+        )
+        # check if there is only one paragraph title and without doc_title
+        only_one_paragraph_title = (
+            len(paragraph_title_idxes) == 1 and len(doc_title_idxes) == 0
+        )
+        if only_one_paragraph_title:
+            paragraph_title_block_area = caculate_bbox_area(
+                layout_det_res["boxes"][paragraph_title_idxes[0]]["coordinate"]
+            )
+            title_area_max_block_threshold = layout_order_config.get(
+                "title_area_max_block_threshold", 0.3
+            )
+            # promote the single paragraph title when its area exceeds the configured fraction of the largest block area
+            # NOTE(review): "doc_title_block_idxes" was read with .get(..., []) above but is indexed directly
+            # below, which raises KeyError when the key is absent.
+            if (
+                paragraph_title_block_area
+                > max_block_area * title_area_max_block_threshold
+            ):
+                layout_det_res["boxes"][paragraph_title_idxes[0]]["label"] = "doc_title"
+                layout_order_config["doc_title_block_idxes"].append(
+                    paragraph_title_idxes[0]
                 )
-                _, matched_idxs = get_sub_regions_ocr_res(
-                    overall_ocr_res, [box], return_match_idx=True
+                layout_order_config["paragraph_title_block_idxes"].remove(
+                    paragraph_title_idxes[0]
                 )
-                for matched_idx in sorted(matched_idxs, reverse=True):
-                    del overall_ocr_res["dt_polys"][matched_idx]
-                    del overall_ocr_res["rec_texts"][matched_idx]
-                    overall_ocr_res["rec_boxes"] = np.delete(
-                        overall_ocr_res["rec_boxes"], matched_idx, axis=0
+
+        # Re-recognize every OCR box that was matched to more than one layout box: for each of those
+        # layout boxes, crop the intersection with the OCR box and run text_rec_model on the crop.
+        for overall_ocr_idx, layout_box_ids in matched_ocr_dict.items():
+            if len(layout_box_ids) > 1:
+                matched_no = 0
+                overall_ocr_box = copy.deepcopy(
+                    overall_ocr_res["rec_boxes"][overall_ocr_idx]
+                )
+                overall_ocr_dt_poly = copy.deepcopy(
+                    overall_ocr_res["dt_polys"][overall_ocr_idx]
+                )
+                for box_idx in layout_box_ids:
+                    layout_box = layout_det_res["boxes"][box_idx]["coordinate"]
+                    crop_box = get_bbox_intersection(overall_ocr_box, layout_box)
+                    x1, y1, x2, y2 = [int(i) for i in crop_box]
+                    crop_img = np.array(image)[y1:y2, x1:x2]
+                    crop_img_rec_res = next(text_rec_model([crop_img]))
+                    crop_img_dt_poly = get_bbox_intersection(
+                        overall_ocr_dt_poly, layout_box, return_format="poly"
                     )
-                    del overall_ocr_res["rec_polys"][matched_idx]
-                    del overall_ocr_res["rec_scores"][matched_idx]
+                    crop_img_rec_score = crop_img_rec_res["rec_score"]
+                    crop_img_rec_text = crop_img_rec_res["rec_text"]
+                    # fall back to the OCR pipeline's threshold when the caller passed None
+                    text_rec_score_thresh = (
+                        text_rec_score_thresh
+                        if text_rec_score_thresh is not None
+                        else (self.general_ocr_pipeline.text_rec_score_thresh)
+                    )
+                    if crop_img_rec_score >= text_rec_score_thresh:
+                        matched_no += 1
+                        if matched_no == 1:
+                            # the first matched ocr be replaced by the first matched layout box
+                            overall_ocr_res["dt_polys"][
+                                overall_ocr_idx
+                            ] = crop_img_dt_poly
+                            overall_ocr_res["rec_boxes"][overall_ocr_idx] = crop_box
+                            overall_ocr_res["rec_polys"][
+                                overall_ocr_idx
+                            ] = crop_img_dt_poly
+                            overall_ocr_res["rec_scores"][
+                                overall_ocr_idx
+                            ] = crop_img_rec_score
+                            overall_ocr_res["rec_texts"][
+                                overall_ocr_idx
+                            ] = crop_img_rec_text
+                        else:
+                            # the other matched ocr be appended to the overall ocr result
+                            overall_ocr_res["dt_polys"].append(crop_img_dt_poly)
+                            overall_ocr_res["rec_boxes"] = np.vstack(
+                                (overall_ocr_res["rec_boxes"], crop_box)
+                            )
+                            overall_ocr_res["rec_polys"].append(crop_img_dt_poly)
+                            overall_ocr_res["rec_scores"].append(crop_img_rec_score)
+                            overall_ocr_res["rec_texts"].append(crop_img_rec_text)
+                            overall_ocr_res["rec_labels"].append("text")
+                            # point this layout box at the newly appended OCR entry instead of the shared one
+                            layout_to_ocr_mapping[box_idx].remove(overall_ocr_idx)
+                            layout_to_ocr_mapping[box_idx].append(
+                                len(overall_ocr_res["rec_texts"]) - 1
+                            )
+
+        layout_order_config["all_layout_region_box"] = region_box
+        layout_order_config["layout_to_ocr_mapping"] = layout_to_ocr_mapping
+        layout_order_config["matched_ocr_dict"] = matched_ocr_dict
+
+        # NOTE(review): a 2-tuple is returned although the signature is annotated "-> list".
+        return layout_order_config, layout_det_res
+
+    def sort_line_by_x_projection(
+        self,
+        line: List[List[Union[List[int], str]]],
+        input_img: np.ndarray,
+        text_rec_model: Any,
+        text_rec_score_thresh: Union[float, None] = None,
+    ) -> List[List[Union[List[int], str]]]:
+        """
+        Run split_boxes_if_x_contained on a line and return its spans ordered by their left x coordinate.
 
-                if sub_ocr_res["rec_boxes"].size > 0:
-                    sub_ocr_res["rec_labels"] = ["text"] * len(sub_ocr_res["rec_texts"])
+        When the split changes the number of spans, the spans are sorted by span[0][0] and every span
+        labelled "text" is re-recognized on its own crop of input_img; a result scoring below the
+        threshold is replaced by an empty string. When the number of spans is unchanged, the input
+        line is returned as is.
+
+        Args:
+            line (list): Spans of one text line. Each span is indexed as span[0] (bounding box, used as
+                [x1, y1, x2, y2]), span[1] (text) and span[2] (label).
+            input_img (ndarray): The image the span boxes refer to; cropped as input_img[y1:y2, x1:x2].
+            text_rec_model (Any): The text recognition model, called as text_rec_model([crop_img]); each
+                result is read through its "rec_score" and "rec_text" keys.
+            text_rec_score_thresh (Union[float, None]): Minimum score for keeping a re-recognized text.
+                When None, self.general_ocr_pipeline.text_rec_score_thresh is used.
 
-                    overall_ocr_res["dt_polys"].extend(sub_ocr_res["dt_polys"])
-                    overall_ocr_res["rec_texts"].extend(sub_ocr_res["rec_texts"])
-                    overall_ocr_res["rec_boxes"] = np.concatenate(
-                        [overall_ocr_res["rec_boxes"], sub_ocr_res["rec_boxes"]], axis=0
+        Returns:
+            list: The processed line of text spans.
+        """
+        splited_boxes = split_boxes_if_x_contained(line)
+        splited_lines = []
+        if len(line) != len(splited_boxes):
+            # None means "use the pipeline default"; comparing a score against None would raise TypeError.
+            if text_rec_score_thresh is None:
+                text_rec_score_thresh = self.general_ocr_pipeline.text_rec_score_thresh
+            splited_boxes.sort(key=lambda span: span[0][0])
+            for span in splited_boxes:
+                if span[2] == "text":
+                    # the split changed the box, so the old text no longer matches: recognize the new crop
+                    crop_img = input_img[
+                        int(span[0][1]) : int(span[0][3]),
+                        int(span[0][0]) : int(span[0][2]),
+                    ]
+                    crop_img_rec_res = next(text_rec_model([crop_img]))
+                    crop_img_rec_score = crop_img_rec_res["rec_score"]
+                    crop_img_rec_text = crop_img_rec_res["rec_text"]
+                    span[1] = (
+                        crop_img_rec_text
+                        if crop_img_rec_score >= text_rec_score_thresh
+                        else ""
                     )
-                    overall_ocr_res["rec_polys"].extend(sub_ocr_res["rec_polys"])
-                    overall_ocr_res["rec_scores"].extend(sub_ocr_res["rec_scores"])
-                    overall_ocr_res["rec_labels"].extend(sub_ocr_res["rec_labels"])
-
-        for formula_res in formula_res_list:
-            x_min, y_min, x_max, y_max = list(map(int, formula_res["dt_polys"]))
-            poly_points = [
-                (x_min, y_min),
-                (x_max, y_min),
-                (x_max, y_max),
-                (x_min, y_max),
+
+                splited_lines.append(span)
+        else:
+            splited_lines = line
+
+        return splited_lines
+
+    def get_block_rec_content(
+        self,
+        image: list,
+        layout_order_config: dict,
+        ocr_rec_res: dict,
+        block: LayoutParsingBlock,
+        text_rec_model: Any,
+        text_rec_score_thresh: Union[float, None] = None,
+    ) -> LayoutParsingBlock:
+        """
+        Build the text content of a layout block from the OCR results matched to it.
+
+        Sets block.content, block.num_of_lines and block.direction and returns the same block.
+        When ocr_rec_res["rec_texts"] is empty, only block.content is set (to "") and the block
+        is returned immediately.
+
+        Args:
+            image (list): The input image; forwarded to sort_line_by_x_projection for lines that
+                contain a formula span.
+            layout_order_config (dict): Read for "line_height_iou_threshold" (default 0.4).
+            ocr_rec_res (dict): OCR results of this block with the keys "boxes" and "rec_texts";
+                "rec_texts" is rewritten without "$" for formula blocks. The whole dict is passed
+                to group_boxes_into_lines.
+            block (LayoutParsingBlock): The block to fill; its label and bbox are read.
+            text_rec_model (Any): The text recognition model, forwarded to sort_line_by_x_projection.
+            text_rec_score_thresh (Union[float, None]): Score threshold, forwarded to
+                sort_line_by_x_projection.
+
+        Returns:
+            LayoutParsingBlock: The block passed in, updated in place.
+        """
+
+        text_delimiter_map = {
+            "content": "\n",
+        }
+        # delimiter used to join the formatted lines, by block label; labels not listed join with ""
+        line_delimiter_map = {
+            "doc_title": " ",
+            "content": "\n",
+        }
+        if len(ocr_rec_res["rec_texts"]) == 0:
+            block.content = ""
+            return block
+
+        label = block.label
+        # horizontal bounds of the block: taken from the OCR boxes for references, from the layout bbox otherwise
+        # NOTE(review): the trailing commas below make the span limits 1-tuples, not ints —
+        # confirm that format_line expects that.
+        if label == "reference":
+            rec_boxes = ocr_rec_res["boxes"]
+            block_left_coordinate = min([box[0] for box in rec_boxes])
+            block_right_coordinate = max([box[2] for box in rec_boxes])
+            first_line_span_limit = (5,)
+            last_line_span_limit = (20,)
+        else:
+            block_left_coordinate, _, block_right_coordinate, _ = block.bbox
+            first_line_span_limit = (10,)
+            last_line_span_limit = (10,)
+
+        if label == "formula":
+            ocr_rec_res["rec_texts"] = [
+                rec_res_text.replace("$", "")
+                for rec_res_text in ocr_rec_res["rec_texts"]
             ]
-            overall_ocr_res["dt_polys"].append(poly_points)
-            overall_ocr_res["rec_texts"].append(f"${formula_res['rec_formula']}$")
-            overall_ocr_res["rec_boxes"] = np.vstack(
-                (overall_ocr_res["rec_boxes"], [formula_res["dt_polys"]])
+        lines = group_boxes_into_lines(
+            ocr_rec_res,
+            block,
+            layout_order_config.get("line_height_iou_threshold", 0.4),
+        )
+
+        block.num_of_lines = len(lines)
+
+        # format line
+        new_lines = []
+        horizontal_text_line_num = 0
+        for line in lines:
+            # order the spans of the line by the first coordinate of their box
+            line.sort(key=lambda span: span[0][0])
+
+            # merge formula and text
+            ocr_labels = [span[2] for span in line]
+            if "formula" in ocr_labels:
+                line = self.sort_line_by_x_projection(
+                    line, image, text_rec_model, text_rec_score_thresh
+                )
+
+            text_orientation = calculate_text_orientation([span[0] for span in line])
+            horizontal_text_line_num += 1 if text_orientation == "horizontal" else 0
+
+            line_text = format_line(
+                line,
+                block_left_coordinate,
+                block_right_coordinate,
+                first_line_span_limit=first_line_span_limit,
+                last_line_span_limit=last_line_span_limit,
+                block_label=block.label,
+                delimiter_map=text_delimiter_map,
             )
-            overall_ocr_res["rec_labels"].append("formula")
-            overall_ocr_res["rec_polys"].append(poly_points)
-            overall_ocr_res["rec_scores"].append(1)
+            new_lines.append(line_text)
+
+        delim = line_delimiter_map.get(label, "")
+        content = delim.join(new_lines)
+        block.content = content
+        # the block counts as horizontal only when more than half of its lines are horizontal
+        block.direction = (
+            "horizontal"
+            if horizontal_text_line_num > len(new_lines) * 0.5
+            else "vertical"
+        )
 
-        parsing_res_list = get_single_block_parsing_res(
-            self.general_ocr_pipeline,
+        return block
+
+    def get_layout_parsing_blocks(
+        self,
+        image: list,
+        layout_order_config: dict,
+        overall_ocr_res: OCRResult,
+        layout_det_res: DetResult,
+        table_res_list: list,
+        seal_res_list: list,
+        text_rec_model: Any,
+        text_rec_score_thresh: Union[float, None] = None,
+    ) -> list:
+        """
+        Extract structured information from OCR and layout detection results.
+
+        Args:
+            image (list): The input image.
+            overall_ocr_res (OCRResult): An object containing the overall OCR results, including detected text boxes and recognized text. The structure is expected to have:
+                - "input_img": The image on which OCR was performed.
+                - "dt_boxes": A list of detected text box coordinates.
+                - "rec_texts": A list of recognized text corresponding to the detected boxes.
+
+            layout_det_res (DetResult): An object containing the layout detection results, including detected layout boxes and their labels. The structure is expected to have:
+                - "boxes": A list of dictionaries with keys "coordinate" for box coordinates and "block_label" for the type of content.
+
+            table_res_list (list): A list of table detection results, where each item is a dictionary containing:
+                - "block_bbox": The bounding box of the table layout.
+                - "pred_html": The predicted HTML representation of the table.
+
+            seal_res_list (List): A list of seal detection results. The details of each item depend on the specific application context.
+            text_rec_model (Any): A model for text recognition.
+            text_rec_score_thresh (Union[float, None]): The minimum score required for a recognized character to be considered valid. If None, use the default value specified during initialization. Default is None.
+
+        Returns:
+            list: A list of structured boxes where each item is a dictionary containing:
+                - "block_label": The label of the content (e.g., 'table', 'chart', 'image').
+                - The label as a key with either table HTML or image data and text.
+                - "block_bbox": The coordinates of the layout box.
+        """
+
+        table_index = 0
+        seal_index = 0
+        layout_parsing_blocks: List[LayoutParsingBlock] = []
+
+        for box_idx, box_info in enumerate(layout_det_res["boxes"]):
+
+            label = box_info["label"]
+            block_bbox = box_info["coordinate"]
+            rec_res = {"boxes": [], "rec_texts": [], "rec_labels": []}
+
+            block = LayoutParsingBlock(label=label, bbox=block_bbox)
+
+            if label == "table" and len(table_res_list) > 0:
+                block.content = table_res_list[table_index]["pred_html"]
+                table_index += 1
+            elif label == "seal" and len(seal_res_list) > 0:
+                block.content = seal_res_list[seal_index]["rec_texts"]
+                seal_index += 1
+            else:
+                if label == "formula":
+                    _, ocr_idx_list = get_sub_regions_ocr_res(
+                        overall_ocr_res, [block_bbox], return_match_idx=True
+                    )
+                    layout_order_config["layout_to_ocr_mapping"][box_idx] = ocr_idx_list
+                else:
+                    ocr_idx_list = layout_order_config["layout_to_ocr_mapping"].get(
+                        box_idx, []
+                    )
+                for box_no in ocr_idx_list:
+                    rec_res["boxes"].append(overall_ocr_res["rec_boxes"][box_no])
+                    rec_res["rec_texts"].append(
+                        overall_ocr_res["rec_texts"][box_no],
+                    )
+                    rec_res["rec_labels"].append(
+                        overall_ocr_res["rec_labels"][box_no],
+                    )
+                block = self.get_block_rec_content(
+                    image=image,
+                    block=block,
+                    layout_order_config=layout_order_config,
+                    ocr_rec_res=rec_res,
+                    text_rec_model=text_rec_model,
+                    text_rec_score_thresh=text_rec_score_thresh,
+                )
+
+            if label in ["chart", "image"]:
+                x_min, y_min, x_max, y_max = list(map(int, block_bbox))
+                img_path = f"imgs/img_in_table_box_{x_min}_{y_min}_{x_max}_{y_max}.jpg"
+                img = Image.fromarray(image[y_min:y_max, x_min:x_max, ::-1])
+                block.image = {img_path: img}
+
+            layout_parsing_blocks.append(block)
+
+        # when there is no layout detection result but there is ocr result, use ocr result
+        if len(layout_det_res["boxes"]) == 0:
+            region_box = [65535, 65535, 0, 0]
+            for ocr_idx, (ocr_rec_box, ocr_rec_text) in enumerate(
+                zip(overall_ocr_res["rec_boxes"], overall_ocr_res["rec_texts"])
+            ):
+                update_layout_order_config_block_index(
+                    layout_order_config, "text", ocr_idx
+                )
+                region_box = update_region_box(ocr_rec_box, region_box)
+                layout_parsing_blocks.append(
+                    LayoutParsingBlock(
+                        label="text", bbox=ocr_rec_box, content=ocr_rec_text
+                    )
+                )
+            layout_order_config["all_layout_region_box"] = region_box
+
+        return layout_parsing_blocks, layout_order_config
+
+    def get_layout_parsing_res(
+        self,
+        image: list,
+        layout_det_res: DetResult,
+        overall_ocr_res: OCRResult,
+        table_res_list: list,
+        seal_res_list: list,
+        formula_res_list: list,
+        text_rec_score_thresh: Union[float, None] = None,
+    ) -> list:
+        """
+        Build the layout parsing result for one image.
+
+        The work is done in three steps: `standardized_data` normalizes the layout/OCR/formula inputs,
+        `get_layout_parsing_blocks` converts every layout box into a parsing block, and
+        `xycut_enhanced` receives the block list together with the per-call layout order config.
+
+        Args:
+            image (list): The input image.
+            layout_det_res (DetResult): The detection result containing the layout information of the document.
+            overall_ocr_res (OCRResult): The overall OCR result containing text information.
+            table_res_list (list): A list of table recognition results.
+            seal_res_list (list): A list of seal recognition results.
+            formula_res_list (list): A list of formula recognition results.
+            text_rec_score_thresh (Optional[float], optional): The score threshold for text recognition, forwarded to `standardized_data`. Defaults to None.
+        Returns:
+            list: The value returned by `xycut_enhanced` for the parsed blocks (presumably the blocks in reading order; confirm against `xycut_enhanced`).
+        """
+        # Function-scope import; the config is deep-copied below, so this call works on its own copy.
+        from .setting import layout_order_config
+
+        # Standardize data
+        layout_order_config, layout_det_res = self.standardized_data(
+            image=image,
+            layout_order_config=copy.deepcopy(layout_order_config),
+            layout_det_res=layout_det_res,
+            overall_ocr_res=overall_ocr_res,
+            formula_res_list=formula_res_list,
+            text_rec_model=self.general_ocr_pipeline.text_rec_model,
+            text_rec_score_thresh=text_rec_score_thresh,
+        )
+
+        # Format layout parsing block
+        parsing_res_list, layout_order_config = self.get_layout_parsing_blocks(
+            image=image,
+            layout_order_config=layout_order_config,
             overall_ocr_res=overall_ocr_res,
             layout_det_res=layout_det_res,
             table_res_list=table_res_list,
             seal_res_list=seal_res_list,
+            # NOTE(review): the threshold passed below is the OCR pipeline's own value, not the
+            # `text_rec_score_thresh` argument that is forwarded to `standardized_data` above.
+            text_rec_model=self.general_ocr_pipeline.text_rec_model,
+            text_rec_score_thresh=self.general_ocr_pipeline.text_rec_score_thresh,
+        )
+
+        # Hand the blocks and the (now populated) config to xycut_enhanced.
+        parsing_res_list = xycut_enhanced(
+            parsing_res_list,
+            layout_order_config,
         )
 
         return parsing_res_list
@@ -663,12 +1007,6 @@ class LayoutParsingPipelineV2(BasePipeline):
                 table_res_list=table_res_list,
                 seal_res_list=seal_res_list,
                 formula_res_list=formula_res_list,
-                imgs_in_doc=imgs_in_doc,
-                text_det_limit_side_len=text_det_limit_side_len,
-                text_det_limit_type=text_det_limit_type,
-                text_det_thresh=text_det_thresh,
-                text_det_box_thresh=text_det_box_thresh,
-                text_det_unclip_ratio=text_det_unclip_ratio,
                 text_rec_score_thresh=text_rec_score_thresh,
             )
 

+ 135 - 42
paddlex/inference/pipelines/layout_parsing/result_v2.py

@@ -16,6 +16,7 @@ from __future__ import annotations
 import copy
 import re
 from pathlib import Path
+from typing import List
 
 import numpy as np
 from PIL import Image, ImageDraw
@@ -27,7 +28,6 @@ from ...common.result import (
     MarkdownMixin,
     XlsxMixin,
 )
-from .utils import get_show_color
 
 
 class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
@@ -64,6 +64,8 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
             return fn
 
     def _to_img(self) -> dict[str, np.ndarray]:
+        from .utils import get_show_color
+
         res_img_dict = {}
         model_settings = self["model_settings"]
         if model_settings["use_doc_preprocessor"]:
@@ -101,11 +103,11 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
         # for layout ordering image
         image = Image.fromarray(self["doc_preprocessor_res"]["output_img"][:, :, ::-1])
         draw = ImageDraw.Draw(image, "RGBA")
-        parsing_result = self["parsing_res_list"]
+        parsing_result: List[LayoutParsingBlock] = self["parsing_res_list"]
         for block in parsing_result:
-            bbox = block["block_bbox"]
-            index = block.get("index", None)
-            label = block["sub_label"]
+            bbox = block.bbox
+            index = block.index
+            label = block.label
             fill_color = get_show_color(label)
             draw.rectangle(bbox, fill=fill_color)
             if index is not None:
@@ -176,9 +178,9 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
         parsing_res_list = self["parsing_res_list"]
         parsing_res_list = [
             {
-                "block_label": parsing_res["block_label"],
-                "block_content": parsing_res["block_content"],
-                "block_bbox": parsing_res["block_bbox"],
+                "block_label": parsing_res.label,
+                "block_content": parsing_res.content,
+                "block_bbox": parsing_res.bbox,
             }
             for parsing_res in parsing_res_list
         ]
@@ -281,18 +283,18 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
                     " ",
                 )
 
-            def format_centered_text(key):
+            def format_centered_text():
                 return (
-                    f'<div style="text-align: center;">{block[key]}</div>'.replace(
+                    f'<div style="text-align: center;">{block.content}</div>'.replace(
                         "-\n",
                         "",
                     ).replace("\n", " ")
                     + "\n"
                 )
 
-            def format_image(label):
+            def format_image():
                 img_tags = []
-                image_path = "".join(block[label].keys())
+                image_path = "".join(block.image.keys())
                 img_tags.append(
                     '<div style="text-align: center;"><img src="{}" alt="Image" /></div>'.format(
                         image_path.replace("-\n", "").replace("\n", " "),
@@ -301,7 +303,7 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
                 return "\n".join(img_tags)
 
             def format_first_line(templates, format_func, spliter):
-                lines = block["block_content"].split(spliter)
+                lines = block.content.split(spliter)
                 for idx in range(len(lines)):
                     line = lines[idx]
                     if line.strip() == "":
@@ -312,23 +314,23 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
                 return spliter.join(lines)
 
             def format_table():
-                return "\n" + block["block_content"]
+                return "\n" + block.content
 
-            def get_seg_flag(block, prev_block):
+            def get_seg_flag(block: LayoutParsingBlock, prev_block: LayoutParsingBlock):
 
                 seg_start_flag = True
                 seg_end_flag = True
 
-                block_box = block["block_bbox"]
+                block_box = block.bbox
                 context_left_coordinate = block_box[0]
                 context_right_coordinate = block_box[2]
-                seg_start_coordinate = block.get("seg_start_coordinate")
-                seg_end_coordinate = block.get("seg_end_coordinate")
+                seg_start_coordinate = block.seg_start_coordinate
+                seg_end_coordinate = block.seg_end_coordinate
 
                 if prev_block is not None:
-                    prev_block_bbox = prev_block["block_bbox"]
-                    num_of_prev_lines = prev_block.get("num_of_lines")
-                    pre_block_seg_end_coordinate = prev_block.get("seg_end_coordinate")
+                    prev_block_bbox = prev_block.bbox
+                    num_of_prev_lines = prev_block.num_of_lines
+                    pre_block_seg_end_coordinate = prev_block.seg_end_coordinate
                     prev_end_space_small = (
                         context_right_coordinate - pre_block_seg_end_coordinate < 10
                     )
@@ -368,32 +370,30 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
                 return seg_start_flag, seg_end_flag
 
             handlers = {
-                "paragraph_title": lambda: format_title(block["block_content"]),
-                "doc_title": lambda: f"# {block['block_content']}".replace(
+                "paragraph_title": lambda: format_title(block.content),
+                "doc_title": lambda: f"# {block.content}".replace(
                     "-\n",
                     "",
                 ).replace("\n", " "),
-                "table_title": lambda: format_centered_text("block_content"),
-                "figure_title": lambda: format_centered_text("block_content"),
-                "chart_title": lambda: format_centered_text("block_content"),
-                "text": lambda: block["block_content"]
-                .replace("-\n", " ")
-                .replace("\n", " "),
+                "table_title": lambda: format_centered_text(),
+                "figure_title": lambda: format_centered_text(),
+                "chart_title": lambda: format_centered_text(),
+                "text": lambda: block.content.replace("-\n", " ").replace("\n", " "),
                 "abstract": lambda: format_first_line(
                     ["摘要", "abstract"], lambda l: f"## {l}\n", " "
                 ),
-                "content": lambda: block["block_content"]
-                .replace("-\n", "  \n")
-                .replace("\n", "  \n"),
-                "image": lambda: format_image("block_image"),
-                "chart": lambda: format_image("block_image"),
-                "formula": lambda: f"$${block['block_content']}$$",
+                "content": lambda: block.content.replace("-\n", "  \n").replace(
+                    "\n", "  \n"
+                ),
+                "image": lambda: format_image(),
+                "chart": lambda: format_image(),
+                "formula": lambda: f"$${block.content}$$",
                 "table": format_table,
                 "reference": lambda: format_first_line(
                     ["参考文献", "references"], lambda l: f"## {l}", "\n"
                 ),
-                "algorithm": lambda: block["block_content"].strip("\n"),
-                "seal": lambda: f"Words of Seals:\n{block['block_content']}",
+                "algorithm": lambda: block.content.strip("\n"),
+                "seal": lambda: f"Words of Seals:\n{block.content}",
             }
             parsing_res_list = obj["parsing_res_list"]
             markdown_content = ""
@@ -403,14 +403,10 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
             prev_block = None
             page_first_element_seg_start_flag = None
             page_last_element_seg_end_flag = None
-            parsing_res_list = sorted(
-                parsing_res_list,
-                key=lambda x: x.get("sub_index", 999),
-            )
             for block in parsing_res_list:
                 seg_start_flag, seg_end_flag = get_seg_flag(block, prev_block)
 
-                label = block.get("block_label")
+                label = block.label
                 page_first_element_seg_start_flag = (
                     seg_start_flag
                     if (page_first_element_seg_start_flag is None)
@@ -465,3 +461,100 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
             markdown_info["markdown_images"][img["path"]] = img["img"]
 
         return markdown_info
+
+
+class LayoutParsingBlock:
+    """
+    One layout region (text, table, image, ...) with its geometry and content.
+
+    Args:
+        label (str): Layout label of the block, e.g. "text" or "table".
+        bbox (Sequence[float]): Bounding box [x_min, y_min, x_max, y_max].
+        content (str): Content of the block. Default is "".
+    """
+
+    def __init__(self, label, bbox, content="") -> None:
+        self.label = label
+        self.region_label = "other"
+        # The stored bbox always uses integer coordinates.
+        self.bbox = [int(item) for item in bbox]
+        self.content = content
+        # Placeholders until real segment coordinates are assigned from outside.
+        self.seg_start_coordinate = float("inf")
+        self.seg_end_coordinate = float("-inf")
+        # Width/height/area are derived from the bbox exactly as passed in
+        # (i.e. before the integer conversion above).
+        self.width = bbox[2] - bbox[0]
+        self.height = bbox[3] - bbox[1]
+        self.area = self.width * self.height
+        self.num_of_lines = 1
+        self.image = None
+        self.index = None
+        self.visual_index = None
+        self.direction = self.get_bbox_direction()
+        self.child_blocks = []
+        self.update_direction_info()
+
+    def __str__(self) -> str:
+        return f"{self.__dict__}"
+
+    def __repr__(self) -> str:
+        _str = f"\n\n#################\nlabel:\t{self.label}\nregion_label:\t{self.region_label}\nbbox:\t{self.bbox}\ncontent:\t{self.content}\n#################"
+        return _str
+
+    def to_dict(self) -> dict:
+        """Return the instance attribute dictionary (the live `__dict__`, not a copy)."""
+        return self.__dict__
+
+    def update_direction_info(self) -> None:
+        """
+        Refresh the direction-dependent attributes from `direction`, `bbox`, `width` and `height`.
+
+        Blocks whose `region_label` is "vision" are always treated as horizontal.
+        """
+        # NOTE(review): `width`/`height` are only computed in __init__, so the side lengths set
+        # here do not follow a bbox that was enlarged by `append_child_block`.
+        if self.region_label == "vision":
+            self.direction = "horizontal"
+        if self.direction == "horizontal":
+            self.secondary_direction = "vertical"
+            self.short_side_length = self.height
+            self.long_side_length = self.width
+            self.start_coordinate = self.bbox[0]
+            self.end_coordinate = self.bbox[2]
+            self.secondary_direction_start_coordinate = self.bbox[1]
+            self.secondary_direction_end_coordinate = self.bbox[3]
+        else:
+            self.secondary_direction = "horizontal"
+            self.short_side_length = self.width
+            self.long_side_length = self.height
+            self.start_coordinate = self.bbox[1]
+            self.end_coordinate = self.bbox[3]
+            self.secondary_direction_start_coordinate = self.bbox[0]
+            self.secondary_direction_end_coordinate = self.bbox[2]
+
+    def append_child_block(self, child_block: LayoutParsingBlock) -> None:
+        """
+        Attach `child_block` (and any children it already holds) to this block.
+
+        The bbox of this block grows to the union of both boxes; the bbox it had before the
+        first child was attached is kept in `ori_bbox`.
+
+        Args:
+            child_block (LayoutParsingBlock): The block to attach.
+        """
+        if not self.child_blocks:
+            self.ori_bbox = self.bbox.copy()
+        x1, y1, x2, y2 = self.bbox
+        x1_child, y1_child, x2_child, y2_child = child_block.bbox
+        # Keep bbox a list (as created in __init__) so later `.copy()` calls keep working.
+        self.bbox = [
+            min(x1, x1_child),
+            min(y1, y1_child),
+            max(x2, x2_child),
+            max(y2, y2_child),
+        ]
+        self.update_direction_info()
+        child_blocks = [child_block]
+        if child_block.child_blocks:
+            # Flatten: take over the grandchildren (this also resets the child's own bbox).
+            child_blocks.extend(child_block.get_child_blocks())
+        self.child_blocks.extend(child_blocks)
+
+    def get_child_blocks(self) -> list:
+        """
+        Detach and return all child blocks, restoring the bbox saved before the first child.
+
+        Returns:
+            list: The detached child blocks (empty if no child was ever attached).
+        """
+        # `ori_bbox` only exists once a child has been appended; fall back to the current bbox.
+        self.bbox = getattr(self, "ori_bbox", self.bbox)
+        child_blocks = self.child_blocks.copy()
+        self.child_blocks = []
+        return child_blocks
+
+    def get_centroid(self) -> tuple:
+        """Return the center point (x, y) of the current bbox."""
+        x1, y1, x2, y2 = self.bbox
+        centroid = ((x1 + x2) / 2, (y1 + y2) / 2)
+        return centroid
+
+    def get_bbox_direction(self, orientation_ratio: float = 1.0) -> str:
+        """
+        Determine if the block is horizontal or vertical from its width and height.
+
+        Args:
+            orientation_ratio (float): Factor applied to the width before it is compared with the height. Default is 1.0.
+
+        Returns:
+            str: "horizontal" or "vertical".
+        """
+        return (
+            "horizontal"
+            if self.width * orientation_ratio >= self.height
+            else "vertical"
+        )

+ 70 - 0
paddlex/inference/pipelines/layout_parsing/setting.py

@@ -0,0 +1,70 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Default settings used for layout ordering. The first group of keys is configured by hand;
+# the second group starts empty and is filled in while a page is processed.
+layout_order_config = {
+    # Manually configured options
+    "line_height_iou_threshold": 0.4,  # For line segmentation of OCR results
+    "title_area_max_block_threshold": 0.3,  # update paragraph_title -> doc_title
+    "block_label_match_iou_threshold": 0.1,
+    "block_title_match_iou_threshold": 0.1,
+    "doc_title_labels": ["doc_title"],  # document title
+    "paragraph_title_labels": ["paragraph_title"],  # paragraph title
+    "vision_labels": [
+        "image",
+        "table",
+        "chart",
+        "figure",
+    ],  # visual content: image, table, chart, figure
+    "vision_title_labels": ["table_title", "chart_title", "figure_title"],  # titles of tables, charts and figures
+    "unordered_labels": [
+        "aside_text",
+        "seal",
+        "number",
+        "formula_number",
+    ],
+    "text_labels": ["text"],
+    "header_labels": ["header", "header_image"],
+    "footer_labels": ["footer", "footer_image", "footnote"],
+    "visualize_index_labels": [
+        "text",
+        "formula",
+        "algorithm",
+        "reference",
+        "content",
+        "abstract",
+        "paragraph_title",
+        "doc_title",
+        "table_title",
+        "chart_title",
+        "figure_title",
+        "image",
+        "table",
+        "chart",
+        "figure",
+    ],
+    # Automatically populated options
+    "layout_to_ocr_mapping": {},
+    "all_layout_region_box": [],  # region box
+    "doc_title_block_idxes": [],
+    "paragraph_title_block_idxes": [],
+    "text_title_labels": [],  # doc_title_labels+paragraph_title_labels
+    "text_title_block_idxes": [],
+    "vision_block_idxes": [],
+    "vision_title_block_idxes": [],
+    "vision_footnote_block_idxes": [],
+    "text_block_idxes": [],
+    "header_block_idxes": [],
+    "footer_block_idxes": [],
+    "unordered_block_idxes": [],
+}

+ 203 - 1919
paddlex/inference/pipelines/layout_parsing/utils.py

@@ -14,21 +14,21 @@
 
 __all__ = [
     "get_sub_regions_ocr_res",
-    "get_layout_ordering",
-    "get_single_block_parsing_res",
     "get_show_color",
     "sorted_layout_boxes",
+    "update_layout_order_config_block_index",
 ]
 
 import re
 from copy import deepcopy
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Dict, List, Optional, Tuple, Union
 
 import numpy as np
 from PIL import Image
 
-from ...models.object_detection.result import DetResult
+from ..components import convert_points_to_boxes
 from ..ocr.result import OCRResult
+from .xycut_enhanced import calculate_projection_iou
 
 
 def get_overlap_boxes_idx(src_boxes: np.ndarray, ref_boxes: np.ndarray) -> List:
@@ -209,88 +209,107 @@ def _calculate_overlap_area_div_minbox_area_ratio(
     return intersection_area / min_box_area
 
 
-def _whether_y_overlap_exceeds_threshold(
-    bbox1: Union[list, tuple],
-    bbox2: Union[list, tuple],
-    overlap_ratio_threshold: float = 0.6,
-) -> bool:
-    """
-    Determines whether the vertical overlap between two bounding boxes exceeds a given threshold.
+def group_boxes_into_lines(ocr_rec_res, block_info, line_height_iou_threshold):
+    """
+    Group the OCR spans of one block into text lines.
+
+    Spans are sorted by the top coordinate of their box and merged into the current line
+    while `calculate_projection_iou(..., "vertical")` between the line box and the span box
+    is at least `line_height_iou_threshold`.
+
+    Args:
+        ocr_rec_res (dict): OCR results with the parallel lists "boxes", "rec_texts" and "rec_labels".
+            Must contain at least one span; an empty result raises IndexError.
+        block_info: The block being processed; its `seg_start_coordinate` and
+            `seg_end_coordinate` attributes are overwritten.
+        line_height_iou_threshold (float): Minimum projection IoU for a span to join the current line.
+
+    Returns:
+        list: A list of lines, each a list of `[box, text, label]` spans.
+    """
+    rec_boxes = ocr_rec_res["boxes"]
+    rec_texts = ocr_rec_res["rec_texts"]
+    rec_labels = ocr_rec_res["rec_labels"]
 
-    Args:
-        bbox1 (list or tuple): The first bounding box defined as (left, top, right, bottom).
-        bbox2 (list or tuple): The second bounding box defined as (left, top, right, bottom).
-        overlap_ratio_threshold (float): The threshold ratio to determine if the overlap is significant.
-                                         Defaults to 0.6.
+    spans = list(zip(rec_boxes, rec_texts, rec_labels))
 
-    Returns:
-        bool: True if the vertical overlap divided by the minimum height of the two bounding boxes
-              exceeds the overlap_ratio_threshold, otherwise False.
-    """
-    _, y1_0, _, y1_1 = bbox1
-    _, y2_0, _, y2_1 = bbox2
+    # Sort top-to-bottom, then make every span a mutable list
+    spans.sort(key=lambda span: span[0][1])
+    spans = [list(span) for span in spans]
 
-    overlap = max(0, min(y1_1, y2_1) - max(y1_0, y2_0))
-    min_height = min(y1_1 - y1_0, y2_1 - y2_0)
+    lines = []
+    line = [spans[0]]
+    # NOTE(review): `[:]` copies a list but only creates a view of a NumPy array; if the boxes
+    # are arrays, the in-place updates of line_region_box below also change the span box - confirm.
+    line_region_box = spans[0][0][:]
+    # Left edge of the topmost span and right edge of the bottommost span
+    block_info.seg_start_coordinate = spans[0][0][0]
+    block_info.seg_end_coordinate = spans[-1][0][2]
 
-    return (overlap / min_height) > overlap_ratio_threshold
+    # merge line
+    for span in spans[1:]:
+        rec_bbox = span[0]
+        if (
+            calculate_projection_iou(line_region_box, rec_bbox, "vertical")
+            >= line_height_iou_threshold
+        ):
+            line.append(span)
+            line_region_box[1] = min(line_region_box[1], rec_bbox[1])
+            line_region_box[3] = max(line_region_box[3], rec_bbox[3])
+        else:
+            lines.append(line)
+            line = [span]
+            line_region_box = rec_bbox[:]
 
+    lines.append(line)
+    return lines
 
-def _adjust_span_text(span: List[str], prepend: bool = False, append: bool = False):
+
+def calculate_text_orientation(
+    bboxes: List[List[int]], orientation_ratio: float = 1.5
+) -> str:
     """
-    Adjust the text of a span by prepending or appending a newline.
+    Calculate the orientation of the text from the overall extent of its bounding boxes.
 
     Args:
-        span (list): A list where the second element is the text of the span.
-        prepend (bool): If True, prepend a newline to the text.
-        append (bool): If True, append a newline to the text.
+        bboxes (list): A non-empty list of bounding boxes [x_min, y_min, x_max, y_max].
+        orientation_ratio (float): Factor applied to the overall width before it is compared with the overall height. Default is 1.5.
 
     Returns:
-        None: The function modifies the span in place.
+        str: "horizontal" or "vertical".
     """
-    if prepend:
-        span[1] = "\n" + span[1]
-    if append:
-        span[1] = span[1] + "\n"
-    return span
 
+    # Width and height of the region spanned by all boxes together
+    bboxes = np.array(bboxes)
+    x_min = np.min(bboxes[:, 0])
+    x_max = np.max(bboxes[:, 2])
+    width = x_max - x_min
+    y_min = np.min(bboxes[:, 1])
+    y_max = np.max(bboxes[:, 3])
+    height = y_max - y_min
+    return "horizontal" if width * orientation_ratio >= height else "vertical"
 
-def _format_line(
+
+def format_line(
     line: List[List[Union[List[int], str]]],
-    layout_min: int,
-    layout_max: int,
-    is_reference: bool = False,
+    block_left_coordinate: int,
+    block_right_coordinate: int,
+    first_line_span_limit: int = 10,
+    last_line_span_limit: int = 10,
+    block_label: str = "text",
+    delimiter_map: Dict = {},
 ) -> None:
     """
     Format a line of text spans based on layout constraints.
 
     Args:
         line (list): A list of spans, where each span is a list containing a bounding box and text.
-        layout_min (int): The minimum x-coordinate of the layout bounding box.
-        layout_max (int): The maximum x-coordinate of the layout bounding box.
-        is_reference (bool): A flag indicating whether the line is a reference line, which affects formatting rules.
-
+        block_left_coordinate (int): The minimum x-coordinate of the layout bounding box.
+        block_right_coordinate (int): The maximum x-coordinate of the layout bounding box.
+        first_line_span_limit (int): If the first span starts more than this many pixels to the right of block_left_coordinate, a newline is prepended to its text. Default is 10.
+        last_line_span_limit (int): If the last span ends more than this many pixels to the left of block_right_coordinate, a newline is appended to its text. Default is 10.
+        block_label (str): The label associated with the entire block. Default is 'text'.
+        delimiter_map (Dict): Maps a block label to the delimiter used to join the span texts; labels that are missing use a single space. It is only read here, never modified. Default is {}.
     Returns:
         None: The function modifies the line in place.
     """
     first_span = line[0]
-    end_span = line[-1]
+    last_span = line[-1]
 
-    if not is_reference:
-        if first_span[0][0] - layout_min > 10:
-            first_span = _adjust_span_text(first_span, prepend=True)
-        if layout_max - end_span[0][2] > 10:
-            end_span = _adjust_span_text(end_span, append=True)
-    else:
-        if first_span[0][0] - layout_min < 5:
-            first_span = _adjust_span_text(first_span, prepend=True)
-        if layout_max - end_span[0][2] > 20:
-            end_span = _adjust_span_text(end_span, append=True)
+    if first_span[0][0] - block_left_coordinate > first_line_span_limit:
+        first_span[1] = "\n" + first_span[1]
+    if block_right_coordinate - last_span[0][2] > last_line_span_limit:
+        last_span[1] = last_span[1] + "\n"
 
     line[0] = first_span
-    line[-1] = end_span
+    line[-1] = last_span
+
+    # Join the span texts with the delimiter configured for this block label
+    delim = delimiter_map.get(block_label, " ")
+    line_text = delim.join([span[1] for span in line])
+
+    if block_label != "reference":
+        line_text = remove_extra_space(line_text)
 
-    return line
+    # Drop one trailing hyphen (presumably an end-of-line word break - confirm)
+    if line_text.endswith("-"):
+        line_text = line_text[:-1]
+    # NOTE(review): the joined text is returned, while the signature (-> None) and the
+    # "Returns" section of the docstring still describe the old in-place behavior.
+    return line_text
 
 
 def split_boxes_if_x_contained(boxes, offset=1e-5):
@@ -361,132 +380,7 @@ def split_boxes_if_x_contained(boxes, offset=1e-5):
     return new_boxes
 
 
-def _sort_line_by_x_projection(
-    input_img: np.ndarray,
-    general_ocr_pipeline: Any,
-    line: List[List[Union[List[int], str]]],
-) -> None:
-    """
-    Sort a line of text spans based on their vertical position within the layout bounding box.
-
-    Args:
-        input_img (ndarray): The input image used for OCR.
-        general_ocr_pipeline (Any): The general OCR pipeline used for text recognition.
-        line (list): A list of spans, where each span is a list containing a bounding box and text.
-
-    Returns:
-        list: The sorted line of text spans.
-    """
-    splited_boxes = split_boxes_if_x_contained(line)
-    splited_lines = []
-    if len(line) != len(splited_boxes):
-        splited_boxes.sort(key=lambda span: span[0][0])
-        text_rec_model = general_ocr_pipeline.text_rec_model
-        for span in splited_boxes:
-            if span[2] == "text":
-                crop_img = input_img[
-                    int(span[0][1]) : int(span[0][3]),
-                    int(span[0][0]) : int(span[0][2]),
-                ]
-                span[1] = next(text_rec_model([crop_img]))["rec_text"]
-            splited_lines.append(span)
-    else:
-        splited_lines = line
-
-    return splited_lines
-
-
-def _sort_ocr_res_by_y_projection(
-    input_img: np.ndarray,
-    general_ocr_pipeline: Any,
-    label: Any,
-    block_bbox: Tuple[int, int, int, int],
-    ocr_res: Dict[str, List[Any]],
-    line_height_iou_threshold: float = 0.7,
-) -> Dict[str, List[Any]]:
-    """
-    Sorts OCR results based on their spatial arrangement, grouping them into lines and blocks.
-
-    Args:
-        input_img (ndarray): The input image used for OCR.
-        general_ocr_pipeline (Any): The general OCR pipeline used for text recognition.
-        label (Any): The label associated with the OCR results. It's not used in the function but might be
-                     relevant for other parts of the calling context.
-        block_bbox (Tuple[int, int, int, int]): A tuple representing the layout bounding box, defined as
-                                                 (left, top, right, bottom).
-        ocr_res (Dict[str, List[Any]]): A dictionary containing OCR results with the following keys:
-            - "boxes": A list of bounding boxes, each defined as [left, top, right, bottom].
-            - "rec_texts": A corresponding list of recognized text strings for each box.
-        line_height_iou_threshold (float): The threshold for determining whether two boxes belong to
-                                           the same line based on their vertical overlap. Defaults to 0.7.
-
-    Returns:
-        Dict[str, List[Any]]: A dictionary with the same structure as `ocr_res`, but with boxes and texts sorted
-                              and grouped into lines and blocks.
-    """
-    assert (
-        ocr_res["boxes"] and ocr_res["rec_texts"]
-    ), "OCR results must contain 'boxes' and 'rec_texts'"
-
-    boxes = ocr_res["boxes"]
-    rec_texts = ocr_res["rec_texts"]
-    rec_labels = ocr_res["rec_labels"]
-
-    x_min, _, x_max, _ = block_bbox
-    inline_x_min = min([box[0] for box in boxes])
-    inline_x_max = max([box[2] for box in boxes])
-
-    spans = list(zip(boxes, rec_texts, rec_labels))
-
-    spans.sort(key=lambda span: span[0][1])
-    spans = [list(span) for span in spans]
-
-    lines = []
-    current_line = [spans[0]]
-    current_y0, current_y1 = spans[0][0][1], spans[0][0][3]
-
-    for span in spans[1:]:
-        y0, y1 = span[0][1], span[0][3]
-        if _whether_y_overlap_exceeds_threshold(
-            (0, current_y0, 0, current_y1),
-            (0, y0, 0, y1),
-            line_height_iou_threshold,
-        ):
-            current_line.append(span)
-            current_y0 = min(current_y0, y0)
-            current_y1 = max(current_y1, y1)
-        else:
-            lines.append(current_line)
-            current_line = [span]
-            current_y0, current_y1 = y0, y1
-
-    if current_line:
-        lines.append(current_line)
-
-    new_lines = []
-    for line in lines:
-        line.sort(key=lambda span: span[0][0])
-
-        ocr_labels = [span[2] for span in line]
-        if "formula" in ocr_labels:
-            line = _sort_line_by_x_projection(input_img, general_ocr_pipeline, line)
-        if label == "reference":
-            line = _format_line(line, inline_x_min, inline_x_max, is_reference=True)
-        elif label != "content":
-            line = _format_line(line, x_min, x_max)
-        new_lines.append(line)
-
-    ocr_res["boxes"] = [span[0] for line in new_lines for span in line]
-    if label == "content":
-        ocr_res["rec_texts"] = [
-            "".join(f"{span[1]} " for span in line).rstrip() for line in new_lines
-        ]
-    else:
-        ocr_res["rec_texts"] = [span[1] + " " for line in new_lines for span in line]
-    return ocr_res, len(new_lines)
-
-
-def _process_text(input_text: str) -> str:
+def remove_extra_space(input_text: str) -> str:
     """
     Process the input text to handle spaces.
 
@@ -500,472 +394,22 @@ def _process_text(input_text: str) -> str:
         str: The processed text with properly formatted spaces.
     """
 
-    def handle_spaces_(text: str) -> str:
-        """
-        Handle spaces in the text by removing multiple consecutive spaces and inserting a single space
-        between Chinese and non-Chinese characters.
-
-        Args:
-            text (str): The text to handle spaces for.
-
-        Returns:
-            str: The text with properly formatted spaces.
-        """
-        spaces = re.finditer(r"\s+", text)
-        processed_text = list(text)
-
-        for space in reversed(list(spaces)):
-            start, end = space.span()
-            prev_char = processed_text[start - 1] if start > 0 else ""
-            next_char = processed_text[end] if end < len(processed_text) else ""
-
-            is_prev_chinese = (
-                re.match(r"[\u4e00-\u9fff]", prev_char) if prev_char else False
-            )
-            is_next_chinese = (
-                re.match(r"[\u4e00-\u9fff]", next_char) if next_char else False
-            )
-
-            if is_prev_chinese and is_next_chinese:
-                processed_text[start:end] = []
-            else:
-                processed_text[start:end] = [" "]
-
-        return "".join(processed_text)
-
-    text_without_spaces = handle_spaces_(input_text)
-
-    final_text = re.sub(r"\s+", " ", text_without_spaces).strip()
-    return final_text
-
-
-def get_single_block_parsing_res(
-    general_ocr_pipeline: Any,
-    overall_ocr_res: OCRResult,
-    layout_det_res: DetResult,
-    table_res_list: list,
-    seal_res_list: list,
-) -> OCRResult:
-    """
-    Extract structured information from OCR and layout detection results.
-
-    Args:
-        overall_ocr_res (OCRResult): An object containing the overall OCR results, including detected text boxes and recognized text. The structure is expected to have:
-            - "input_img": The image on which OCR was performed.
-            - "dt_boxes": A list of detected text box coordinates.
-            - "rec_texts": A list of recognized text corresponding to the detected boxes.
-
-        layout_det_res (DetResult): An object containing the layout detection results, including detected layout boxes and their labels. The structure is expected to have:
-            - "boxes": A list of dictionaries with keys "coordinate" for box coordinates and "block_label" for the type of content.
-
-        table_res_list (list): A list of table detection results, where each item is a dictionary containing:
-            - "block_bbox": The bounding box of the table layout.
-            - "pred_html": The predicted HTML representation of the table.
-
-        seal_res_list (List): A list of seal detection results. The details of each item depend on the specific application context.
-
-    Returns:
-        list: A list of structured boxes where each item is a dictionary containing:
-            - "block_label": The label of the content (e.g., 'table', 'chart', 'image').
-            - The label as a key with either table HTML or image data and text.
-            - "block_bbox": The coordinates of the layout box.
-    """
-
-    single_block_layout_parsing_res = []
-    input_img = overall_ocr_res["doc_preprocessor_res"]["output_img"]
-    seal_index = 0
-    with_doc_title = False
-    max_block_area = 0.0
-    paragraph_title_indexs = []
-
-    layout_det_res_list, _ = _remove_overlap_blocks(
-        deepcopy(layout_det_res["boxes"]),
-        threshold=0.5,
-        smaller=True,
+    # Remove spaces between Chinese characters
+    text_without_spaces = re.sub(
+        r"(?<=[\u4e00-\u9fff])\s+(?=[\u4e00-\u9fff])", "", input_text
     )
 
-    for box_idx, box_info in enumerate(layout_det_res_list):
-        block_bbox = box_info["coordinate"]
-        label = box_info["label"]
-        rec_res = {"boxes": [], "rec_texts": [], "rec_labels": [], "flag": False}
-        seg_start_coordinate = float("inf")
-        seg_end_coordinate = float("-inf")
-        num_of_lines = 1
-
-        if label == "doc_title":
-            with_doc_title = True
-        elif label == "paragraph_title":
-            paragraph_title_indexs.append(box_idx)
-
-        block_area = (block_bbox[2] - block_bbox[0]) * (block_bbox[3] - block_bbox[1])
-        max_block_area = max(max_block_area, block_area)
-
-        if label == "table":
-            for table_res in table_res_list:
-                if len(table_res["cell_box_list"]) == 0:
-                    continue
-                if (
-                    _calculate_overlap_area_div_minbox_area_ratio(
-                        block_bbox, table_res["cell_box_list"][0]
-                    )
-                    > 0.5
-                ):
-                    single_block_layout_parsing_res.append(
-                        {
-                            "block_label": label,
-                            "block_content": table_res["pred_html"],
-                            "block_bbox": block_bbox,
-                        },
-                    )
-                    break
-        elif label == "seal":
-            if len(seal_res_list) > 0:
-                single_block_layout_parsing_res.append(
-                    {
-                        "block_label": label,
-                        "block_content": _process_text(
-                            ", ".join(seal_res_list[seal_index]["rec_texts"])
-                        ),
-                        "block_bbox": block_bbox,
-                    },
-                )
-                seal_index += 1
-        else:
-            overall_text_boxes = overall_ocr_res["rec_boxes"]
-            for box_no in range(len(overall_text_boxes)):
-                if (
-                    _calculate_overlap_area_div_minbox_area_ratio(
-                        block_bbox, overall_text_boxes[box_no]
-                    )
-                    > 0.5
-                ):
-                    rec_res["boxes"].append(overall_text_boxes[box_no])
-                    rec_res["rec_texts"].append(
-                        overall_ocr_res["rec_texts"][box_no],
-                    )
-                    rec_res["rec_labels"].append(
-                        overall_ocr_res["rec_labels"][box_no],
-                    )
-                    rec_res["flag"] = True
-
-            if rec_res["flag"]:
-                rec_res, num_of_lines = _sort_ocr_res_by_y_projection(
-                    input_img, general_ocr_pipeline, label, block_bbox, rec_res, 0.7
-                )
-                seg_start_coordinate = rec_res["boxes"][0][0]
-                seg_end_coordinate = rec_res["boxes"][-1][2]
-                if label == "formula":
-                    rec_res["rec_texts"] = [
-                        rec_res_text.replace("$", "")
-                        for rec_res_text in rec_res["rec_texts"]
-                    ]
-
-            if label in ["chart", "image"]:
-                x_min, y_min, x_max, y_max = list(map(int, block_bbox))
-                img_path = f"imgs/img_in_table_box_{x_min}_{y_min}_{x_max}_{y_max}.jpg"
-                img = Image.fromarray(input_img[y_min:y_max, x_min:x_max, ::-1])
-                single_block_layout_parsing_res.append(
-                    {
-                        "block_label": label,
-                        "block_content": _process_text("".join(rec_res["rec_texts"])),
-                        "block_image": {img_path: img},
-                        "block_bbox": block_bbox,
-                    },
-                )
-            else:
-                if label in ["doc_title"]:
-                    content = " ".join(rec_res["rec_texts"])
-                elif label in ["content"]:
-                    content = "\n".join(rec_res["rec_texts"])
-                else:
-                    content = "".join(rec_res["rec_texts"])
-                    if label != "reference":
-                        content = _process_text(content)
-                single_block_layout_parsing_res.append(
-                    {
-                        "block_label": label,
-                        "block_content": content,
-                        "block_bbox": block_bbox,
-                        "seg_start_coordinate": seg_start_coordinate,
-                        "seg_end_coordinate": seg_end_coordinate,
-                        "num_of_lines": num_of_lines,
-                        "block_area": block_area,
-                    },
-                )
-
-    if (
-        not with_doc_title
-        and len(paragraph_title_indexs) == 1
-        and single_block_layout_parsing_res[paragraph_title_indexs[0]].get(
-            "block_area", 0
-        )
-        > max_block_area * 0.3
-    ):
-        single_block_layout_parsing_res[paragraph_title_indexs[0]][
-            "block_label"
-        ] = "doc_title"
-
-    if len(layout_det_res_list) == 0:
-        for ocr_rec_box, ocr_rec_text in zip(
-            overall_ocr_res["rec_boxes"], overall_ocr_res["rec_texts"]
-        ):
-            single_block_layout_parsing_res.append(
-                {
-                    "block_label": "text",
-                    "block_content": ocr_rec_text,
-                    "block_bbox": ocr_rec_box,
-                    "seg_start_coordinate": ocr_rec_box[0],
-                    "seg_end_coordinate": ocr_rec_box[2],
-                },
-            )
-
-    single_block_layout_parsing_res = get_layout_ordering(
-        single_block_layout_parsing_res,
-        no_mask_labels=[
-            "text",
-            "formula",
-            "algorithm",
-            "reference",
-            "content",
-            "abstract",
-        ],
+    # Ensure single space between Chinese and non-Chinese characters
+    text_with_single_spaces = re.sub(
+        r"(?<=[\u4e00-\u9fff])\s+(?=[^\u4e00-\u9fff])|(?<=[^\u4e00-\u9fff])\s+(?=[\u4e00-\u9fff])",
+        " ",
+        text_without_spaces,
     )
 
-    return single_block_layout_parsing_res
-
-
-def _projection_by_bboxes(boxes: np.ndarray, axis: int) -> np.ndarray:
-    """
-    Generate a 1D projection histogram from bounding boxes along a specified axis.
-
-    Args:
-        boxes: A (N, 4) array of bounding boxes defined by [x_min, y_min, x_max, y_max].
-        axis: Axis for projection; 0 for horizontal (x-axis), 1 for vertical (y-axis).
-
-    Returns:
-        A 1D numpy array representing the projection histogram based on bounding box intervals.
-    """
-    assert axis in [0, 1]
-    max_length = np.max(boxes[:, axis::2])
-    projection = np.zeros(max_length, dtype=int)
-
-    # Increment projection histogram over the interval defined by each bounding box
-    for start, end in boxes[:, axis::2]:
-        projection[start:end] += 1
-
-    return projection
-
-
-def _split_projection_profile(arr_values: np.ndarray, min_value: float, min_gap: float):
-    """
-    Split the projection profile into segments based on specified thresholds.
-
-    Args:
-        arr_values: 1D array representing the projection profile.
-        min_value: Minimum value threshold to consider a profile segment significant.
-        min_gap: Minimum gap width to consider a separation between segments.
-
-    Returns:
-        A tuple of start and end indices for each segment that meets the criteria.
-    """
-    # Identify indices where the projection exceeds the minimum value
-    significant_indices = np.where(arr_values > min_value)[0]
-    if not len(significant_indices):
-        return
-
-    # Calculate gaps between significant indices
-    index_diffs = significant_indices[1:] - significant_indices[:-1]
-    gap_indices = np.where(index_diffs > min_gap)[0]
-
-    # Determine start and end indices of segments
-    segment_starts = np.insert(
-        significant_indices[gap_indices + 1],
-        0,
-        significant_indices[0],
-    )
-    segment_ends = np.append(
-        significant_indices[gap_indices],
-        significant_indices[-1] + 1,
-    )
-
-    return segment_starts, segment_ends
-
-
-def _recursive_yx_cut(
-    boxes: np.ndarray, indices: List[int], res: List[int], min_gap: int = 1
-):
-    """
-    Recursively project and segment bounding boxes, starting with Y-axis and followed by X-axis.
-
-    Args:
-        boxes: A (N, 4) array representing bounding boxes.
-        indices: List of indices indicating the original position of boxes.
-        res: List to store indices of the final segmented bounding boxes.
-        min_gap (int): Minimum gap width to consider a separation between segments on the X-axis. Defaults to 1.
-
-    Returns:
-        None: This function modifies the `res` list in place.
-    """
-    assert len(boxes) == len(
-        indices
-    ), "The length of boxes and indices must be the same."
-
-    # Sort by y_min for Y-axis projection
-    y_sorted_indices = boxes[:, 1].argsort()
-    y_sorted_boxes = boxes[y_sorted_indices]
-    y_sorted_indices = np.array(indices)[y_sorted_indices]
-
-    # Perform Y-axis projection
-    y_projection = _projection_by_bboxes(boxes=y_sorted_boxes, axis=1)
-    y_intervals = _split_projection_profile(y_projection, 0, 1)
-
-    if not y_intervals:
-        return
-
-    # Process each segment defined by Y-axis projection
-    for y_start, y_end in zip(*y_intervals):
-        # Select boxes within the current y interval
-        y_interval_indices = (y_start <= y_sorted_boxes[:, 1]) & (
-            y_sorted_boxes[:, 1] < y_end
-        )
-        y_boxes_chunk = y_sorted_boxes[y_interval_indices]
-        y_indices_chunk = y_sorted_indices[y_interval_indices]
-
-        # Sort by x_min for X-axis projection
-        x_sorted_indices = y_boxes_chunk[:, 0].argsort()
-        x_sorted_boxes_chunk = y_boxes_chunk[x_sorted_indices]
-        x_sorted_indices_chunk = y_indices_chunk[x_sorted_indices]
-
-        # Perform X-axis projection
-        x_projection = _projection_by_bboxes(boxes=x_sorted_boxes_chunk, axis=0)
-        x_intervals = _split_projection_profile(x_projection, 0, min_gap)
-
-        if not x_intervals:
-            continue
-
-        # If X-axis cannot be further segmented, add current indices to results
-        if len(x_intervals[0]) == 1:
-            res.extend(x_sorted_indices_chunk)
-            continue
-
-        # Recursively process each segment defined by X-axis projection
-        for x_start, x_end in zip(*x_intervals):
-            x_interval_indices = (x_start <= x_sorted_boxes_chunk[:, 0]) & (
-                x_sorted_boxes_chunk[:, 0] < x_end
-            )
-            _recursive_yx_cut(
-                x_sorted_boxes_chunk[x_interval_indices],
-                x_sorted_indices_chunk[x_interval_indices],
-                res,
-            )
-
-
-def _recursive_xy_cut(
-    boxes: np.ndarray, indices: List[int], res: List[int], min_gap: int = 1
-):
-    """
-    Recursively performs X-axis projection followed by Y-axis projection to segment bounding boxes.
-
-    Args:
-        boxes: A (N, 4) array representing bounding boxes with [x_min, y_min, x_max, y_max].
-        indices: A list of indices representing the position of boxes in the original data.
-        res: A list to store indices of bounding boxes that meet the criteria.
-        min_gap (int): Minimum gap width to consider a separation between segments on the X-axis. Defaults to 1.
+    # Reduce any remaining consecutive spaces to a single space
+    final_text = re.sub(r"\s+", " ", text_with_single_spaces).strip()
 
-    Returns:
-        None: This function modifies the `res` list in place.
-    """
-    # Ensure boxes and indices have the same length
-    assert len(boxes) == len(
-        indices
-    ), "The length of boxes and indices must be the same."
-
-    # Sort by x_min to prepare for X-axis projection
-    x_sorted_indices = boxes[:, 0].argsort()
-    x_sorted_boxes = boxes[x_sorted_indices]
-    x_sorted_indices = np.array(indices)[x_sorted_indices]
-
-    # Perform X-axis projection
-    x_projection = _projection_by_bboxes(boxes=x_sorted_boxes, axis=0)
-    x_intervals = _split_projection_profile(x_projection, 0, 1)
-
-    if not x_intervals:
-        return
-
-    # Process each segment defined by X-axis projection
-    for x_start, x_end in zip(*x_intervals):
-        # Select boxes within the current x interval
-        x_interval_indices = (x_start <= x_sorted_boxes[:, 0]) & (
-            x_sorted_boxes[:, 0] < x_end
-        )
-        x_boxes_chunk = x_sorted_boxes[x_interval_indices]
-        x_indices_chunk = x_sorted_indices[x_interval_indices]
-
-        # Sort selected boxes by y_min to prepare for Y-axis projection
-        y_sorted_indices = x_boxes_chunk[:, 1].argsort()
-        y_sorted_boxes_chunk = x_boxes_chunk[y_sorted_indices]
-        y_sorted_indices_chunk = x_indices_chunk[y_sorted_indices]
-
-        # Perform Y-axis projection
-        y_projection = _projection_by_bboxes(boxes=y_sorted_boxes_chunk, axis=1)
-        y_intervals = _split_projection_profile(y_projection, 0, min_gap)
-
-        if not y_intervals:
-            continue
-
-        # If Y-axis cannot be further segmented, add current indices to results
-        if len(y_intervals[0]) == 1:
-            res.extend(y_sorted_indices_chunk)
-            continue
-
-        # Recursively process each segment defined by Y-axis projection
-        for y_start, y_end in zip(*y_intervals):
-            y_interval_indices = (y_start <= y_sorted_boxes_chunk[:, 1]) & (
-                y_sorted_boxes_chunk[:, 1] < y_end
-            )
-            _recursive_xy_cut(
-                y_sorted_boxes_chunk[y_interval_indices],
-                y_sorted_indices_chunk[y_interval_indices],
-                res,
-            )
-
-
-def sort_by_xycut(
-    block_bboxes: Union[np.ndarray, List[List[int]]],
-    direction: int = 0,
-    min_gap: int = 1,
-) -> List[int]:
-    """
-    Sort bounding boxes using recursive XY cut method based on the specified direction.
-
-    Args:
-        block_bboxes (Union[np.ndarray, List[List[int]]]): An array or list of bounding boxes,
-                                                           where each box is represented as
-                                                           [x_min, y_min, x_max, y_max].
-        direction (int): Direction for the initial cut. Use 1 for Y-axis first and 0 for X-axis first.
-                         Defaults to 0.
-        min_gap (int): Minimum gap width to consider a separation between segments. Defaults to 1.
-
-    Returns:
-        List[int]: A list of indices representing the order of sorted bounding boxes.
-    """
-    block_bboxes = np.asarray(block_bboxes).astype(int)
-    res = []
-    if direction == 1:
-        _recursive_yx_cut(
-            block_bboxes,
-            np.arange(len(block_bboxes)).tolist(),
-            res,
-            min_gap,
-        )
-    else:
-        _recursive_xy_cut(
-            block_bboxes,
-            np.arange(len(block_bboxes)).tolist(),
-            res,
-            min_gap,
-        )
-    return res
+    return final_text
 
 
 def gather_imgs(original_img, layout_det_objs):
@@ -1020,7 +464,7 @@ def _get_minbox_if_overlap_by_ratio(
     return None
 
 
-def _remove_overlap_blocks(
+def remove_overlap_blocks(
     blocks: List[Dict[str, List[int]]], threshold: float = 0.65, smaller: bool = True
 ) -> Tuple[List[Dict[str, List[int]]], List[Dict[str, List[int]]]]:
     """
@@ -1035,13 +479,12 @@ def _remove_overlap_blocks(
         Tuple[List[Dict[str, List[int]]], List[Dict[str, List[int]]]]:
             A tuple containing the updated list of blocks and a list of dropped blocks.
     """
-    dropped_blocks = []
     dropped_indexes = set()
-
+    blocks = deepcopy(blocks)
     # Iterate over each pair of blocks to find overlaps
-    for i, block1 in enumerate(blocks):
-        for j in range(i + 1, len(blocks)):
-            block2 = blocks[j]
+    for i, block1 in enumerate(blocks["boxes"]):
+        for j in range(i + 1, len(blocks["boxes"])):
+            block2 = blocks["boxes"][j]
             # Skip blocks that are already marked for removal
             if i in dropped_indexes or j in dropped_indexes:
                 continue
@@ -1062,1291 +505,132 @@ def _remove_overlap_blocks(
 
     # Remove marked blocks from the original list
     for index in sorted(dropped_indexes, reverse=True):
-        dropped_blocks.append(blocks[index])
-        del blocks[index]
+        del blocks["boxes"][index]
 
-    return blocks, dropped_blocks
+    return blocks
 
 
-def _get_text_median_width(blocks: List[Dict[str, any]]) -> float:
+def get_bbox_intersection(bbox1, bbox2, return_format="bbox"):
     """
-    Calculate the median width of blocks labeled as "text".
+    Compute the intersection of two bounding boxes, supporting both 4-coordinate and 8-coordinate formats.
 
     Args:
-        blocks (List[Dict[str, any]]): List of block dictionaries, each containing a 'block_bbox' and 'label'.
+        bbox1 (tuple): The first bounding box, either in 4-coordinate format (x_min, y_min, x_max, y_max)
+                       or 8-coordinate format (x1, y1, x2, y2, x3, y3, x4, y4).
+        bbox2 (tuple): The second bounding box in the same format as bbox1.
+        return_format (str): The format of the output intersection, either 'bbox' or 'poly'.
 
     Returns:
-        float: The median width of text blocks, or infinity if no text blocks are found.
-    """
-    widths = [
-        block["block_bbox"][2] - block["block_bbox"][0]
-        for block in blocks
-        if block.get("block_label") == "text"
-    ]
-    return np.median(widths) if widths else float("inf")
-
-
-def _get_layout_property(
-    blocks: List[Dict[str, any]],
-    median_width: float,
-    no_mask_labels: List[str],
-    threshold: float = 0.8,
-) -> Tuple[List[Dict[str, any]], bool]:
-    """
-    Determine the layout (single or double column) of text blocks.
-
-    Args:
-        blocks (List[Dict[str, any]]): List of block dictionaries containing 'label' and 'block_bbox'.
-        median_width (float): Median width of text blocks.
-        no_mask_labels (List[str]): Labels of blocks to be considered for layout analysis.
-        threshold (float): Threshold for determining layout overlap.
-
-    Returns:
-        Tuple[List[Dict[str, any]], bool]: Updated list of blocks with layout information and a boolean
-        indicating if the double layout area is greater than the single layout area.
-    """
-    blocks.sort(
-        key=lambda x: (
-            x["block_bbox"][0],
-            (x["block_bbox"][2] - x["block_bbox"][0]),
-        ),
-    )
-    check_single_layout = {}
-    page_min_x, page_max_x = float("inf"), 0
-    double_label_area = 0
-    single_label_area = 0
-
-    for i, block in enumerate(blocks):
-        page_min_x = min(page_min_x, block["block_bbox"][0])
-        page_max_x = max(page_max_x, block["block_bbox"][2])
-    page_width = page_max_x - page_min_x
-
-    for i, block in enumerate(blocks):
-        if block["block_label"] not in no_mask_labels:
-            continue
-
-        x_min_i, _, x_max_i, _ = block["block_bbox"]
-        layout_length = x_max_i - x_min_i
-        cover_count, cover_with_threshold_count = 0, 0
-        match_block_with_threshold_indexes = []
-
-        for j, other_block in enumerate(blocks):
-            if i == j or other_block["block_label"] not in no_mask_labels:
-                continue
-
-            x_min_j, _, x_max_j, _ = other_block["block_bbox"]
-            x_match_min, x_match_max = max(
-                x_min_i,
-                x_min_j,
-            ), min(x_max_i, x_max_j)
-            match_block_iou = (x_match_max - x_match_min) / (x_max_j - x_min_j)
-
-            if match_block_iou > 0:
-                cover_count += 1
-                if match_block_iou > threshold:
-                    cover_with_threshold_count += 1
-                    match_block_with_threshold_indexes.append(
-                        (j, match_block_iou),
-                    )
-                x_min_i = x_match_max
-                if x_min_i >= x_max_i:
-                    break
-
-        if (
-            layout_length > median_width * 1.3
-            and (cover_with_threshold_count >= 2 or cover_count >= 2)
-        ) or layout_length > 0.6 * page_width:
-            # if layout_length > median_width * 1.3 and (cover_with_threshold_count >= 2):
-            block["layout"] = "double"
-            double_label_area += (block["block_bbox"][2] - block["block_bbox"][0]) * (
-                block["block_bbox"][3] - block["block_bbox"][1]
-            )
-        else:
-            block["layout"] = "single"
-            check_single_layout[i] = match_block_with_threshold_indexes
-
-    # Check single-layout block
-    for i, single_layout in check_single_layout.items():
-        if single_layout:
-            index, match_iou = single_layout[-1]
-            if match_iou > 0.9 and blocks[index]["layout"] == "double":
-                blocks[i]["layout"] = "double"
-                double_label_area += (
-                    blocks[i]["block_bbox"][2] - blocks[i]["block_bbox"][0]
-                ) * (blocks[i]["block_bbox"][3] - blocks[i]["block_bbox"][1])
-            else:
-                single_label_area += (
-                    blocks[i]["block_bbox"][2] - blocks[i]["block_bbox"][0]
-                ) * (blocks[i]["block_bbox"][3] - blocks[i]["block_bbox"][1])
-
-    return blocks, (double_label_area > single_label_area)
-
-
-def _get_bbox_direction(input_bbox: List[float], ratio: float = 1.0) -> bool:
-    """
-    Determine if a bounding box is horizontal or vertical.
-
-    Args:
-        input_bbox (List[float]): Bounding box [x_min, y_min, x_max, y_max].
-        ratio (float): Ratio for determining orientation. Default is 1.0.
-
-    Returns:
-        bool: True if the bounding box is considered horizontal, False if vertical.
-    """
-    width = input_bbox[2] - input_bbox[0]
-    height = input_bbox[3] - input_bbox[1]
-    return width * ratio >= height
-
-
-def _get_projection_iou(
-    input_bbox: List[float], match_bbox: List[float], is_horizontal: bool = True
-) -> float:
-    """
-    Calculate the IoU of lines between two bounding boxes.
-
-    Args:
-        input_bbox (List[float]): First bounding box [x_min, y_min, x_max, y_max].
-        match_bbox (List[float]): Second bounding box [x_min, y_min, x_max, y_max].
-        is_horizontal (bool): Whether to compare horizontally or vertically.
-
-    Returns:
-        float: Line IoU. Returns 0 if there is no overlap.
-    """
-    if is_horizontal:
-        x_match_min = max(input_bbox[0], match_bbox[0])
-        x_match_max = min(input_bbox[2], match_bbox[2])
-        overlap = max(0, x_match_max - x_match_min)
-        input_width = min(input_bbox[2] - input_bbox[0], match_bbox[2] - match_bbox[0])
-    else:
-        y_match_min = max(input_bbox[1], match_bbox[1])
-        y_match_max = min(input_bbox[3], match_bbox[3])
-        overlap = max(0, y_match_max - y_match_min)
-        input_width = min(input_bbox[3] - input_bbox[1], match_bbox[3] - match_bbox[1])
-
-    return overlap / input_width if input_width > 0 else 0.0
-
-
-def _get_sub_category(
-    blocks: List[Dict[str, Any]], title_labels: List[str]
-) -> Tuple[List[Dict[str, Any]], List[float]]:
-    """
-    Determine the layout of title and text blocks and collect pre_cuts.
-
-    Args:
-        blocks (List[Dict[str, Any]]): List of block dictionaries.
-        title_labels (List[str]): List of labels considered as titles.
-
-    Returns:
-        List[Dict[str, Any]]: Updated list of blocks with title-text layout information.
-        Dict[float]: Dict of pre_cuts coordinates.
-    """
-
-    sub_title_labels = ["paragraph_title"]
-    vision_labels = ["image", "table", "chart", "figure"]
-    vision_title_labels = ["figure_title", "chart_title", "table_title"]
-    all_labels = title_labels + sub_title_labels + vision_labels + vision_title_labels
-    special_pre_cut_labels = sub_title_labels
-
-    # single doc title is irregular,pre cut not applicable
-    num_doc_title = 0
-    for block in blocks:
-        if block["block_label"] == "doc_title":
-            num_doc_title += 1
-            if num_doc_title == 2:
-                special_pre_cut_labels = title_labels + sub_title_labels
-                break
-    if len(blocks) == 0:
-        return blocks, {}
-
-    min_x = min(block["block_bbox"][0] for block in blocks)
-    min_y = min(block["block_bbox"][1] for block in blocks)
-    max_x = max(block["block_bbox"][2] for block in blocks)
-    max_y = max(block["block_bbox"][3] for block in blocks)
-    region_bbox = (min_x, min_y, max_x, max_y)
-    region_x_center = (region_bbox[0] + region_bbox[2]) / 2
-    region_y_center = (region_bbox[1] + region_bbox[3]) / 2
-    region_width = region_bbox[2] - region_bbox[0]
-    region_height = region_bbox[3] - region_bbox[1]
-
-    pre_cuts = {}
-
-    for i, block1 in enumerate(blocks):
-        block1.setdefault("title_text", [])
-        block1.setdefault("sub_title", [])
-        block1.setdefault("vision_footnote", [])
-        block1.setdefault("sub_label", block1["block_label"])
-
-        if block1["block_label"] not in all_labels:
-            continue
-
-        bbox1 = block1["block_bbox"]
-        x1, y1, x2, y2 = bbox1
-        is_horizontal_1 = _get_bbox_direction(block1["block_bbox"])
-        left_up_title_text_distance = float("inf")
-        left_up_title_text_index = -1
-        left_up_title_text_direction = None
-        right_down_title_text_distance = float("inf")
-        right_down_title_text_index = -1
-        right_down_title_text_direction = None
-
-        # pre-cuts
-        # Condition 1: Length is greater than half of the layout region
-        if is_horizontal_1:
-            block_length = x2 - x1
-            required_length = region_width / 2
-        else:
-            block_length = y2 - y1
-            required_length = region_height / 2
-        if block1["block_label"] in special_pre_cut_labels:
-            length_condition = True
-        else:
-            length_condition = block_length > required_length
-
-        # Condition 2: Centered check (must be within ±20 in both horizontal and vertical directions)
-        block_x_center = (x1 + x2) / 2
-        block_y_center = (y1 + y2) / 2
-        tolerance_len = block_length // 5
-        if block1["block_label"] in special_pre_cut_labels:
-            tolerance_len = block_length // 10
-        if is_horizontal_1:
-            is_centered = abs(block_x_center - region_x_center) <= tolerance_len
-        else:
-            is_centered = abs(block_y_center - region_y_center) <= tolerance_len
-
-        # Condition 3: Check for surrounding text
-        has_left_text = False
-        has_right_text = False
-        has_above_text = False
-        has_below_text = False
-        for block2 in blocks:
-            if block2["block_label"] != "text":
-                continue
-            bbox2 = block2["block_bbox"]
-            x1_2, y1_2, x2_2, y2_2 = bbox2
-            if is_horizontal_1:
-                if x2_2 <= x1 and not (y2_2 <= y1 or y1_2 >= y2):
-                    has_left_text = True
-                if x1_2 >= x2 and not (y2_2 <= y1 or y1_2 >= y2):
-                    has_right_text = True
-            else:
-                if y2_2 <= y1 and not (x2_2 <= x1 or x1_2 >= x2):
-                    has_above_text = True
-                if y1_2 >= y2 and not (x2_2 <= x1 or x1_2 >= x2):
-                    has_below_text = True
-
-            if (is_horizontal_1 and has_left_text and has_right_text) or (
-                not is_horizontal_1 and has_above_text and has_below_text
-            ):
-                break
-
-        no_text_on_sides = (
-            not (has_left_text or has_right_text)
-            if is_horizontal_1
-            else not (has_above_text or has_below_text)
-        )
-
-        # Add coordinates if all conditions are met
-        if is_centered and length_condition and no_text_on_sides:
-            if is_horizontal_1:
-                pre_cuts.setdefault("y", []).append(y1)
-            else:
-                pre_cuts.setdefault("x", []).append(x1)
-
-        for j, block2 in enumerate(blocks):
-            if i == j:
-                continue
-
-            bbox2 = block2["block_bbox"]
-            x1_prime, y1_prime, x2_prime, y2_prime = bbox2
-            is_horizontal_2 = _get_bbox_direction(bbox2)
-            match_block_iou = _get_projection_iou(
-                bbox2,
-                bbox1,
-                is_horizontal_1,
-            )
-
-            def distance_(is_horizontal, is_left_up):
-                if is_horizontal:
-                    if is_left_up:
-                        return (y1 - y2_prime + 2) // 5 + x1_prime / 5000
-                    else:
-                        return (y1_prime - y2 + 2) // 5 + x1_prime / 5000
-
-                else:
-                    if is_left_up:
-                        return (x1 - x2_prime + 2) // 5 + y1_prime / 5000
-                    else:
-                        return (x1_prime - x2 + 2) // 5 + y1_prime / 5000
-
-            block_iou_threshold = 0.1
-            if block1["block_label"] in sub_title_labels:
-                block_iou_threshold = 0.5
-
-            if is_horizontal_1:
-                if match_block_iou >= block_iou_threshold:
-                    left_up_distance = distance_(True, True)
-                    right_down_distance = distance_(True, False)
-                    if (
-                        y2_prime <= y1
-                        and left_up_distance <= left_up_title_text_distance
-                    ):
-                        left_up_title_text_distance = left_up_distance
-                        left_up_title_text_index = j
-                        left_up_title_text_direction = is_horizontal_2
-                    elif (
-                        y1_prime > y2
-                        and right_down_distance < right_down_title_text_distance
-                    ):
-                        right_down_title_text_distance = right_down_distance
-                        right_down_title_text_index = j
-                        right_down_title_text_direction = is_horizontal_2
-            else:
-                if match_block_iou >= block_iou_threshold:
-                    left_up_distance = distance_(False, True)
-                    right_down_distance = distance_(False, False)
-                    if (
-                        x2_prime <= x1
-                        and left_up_distance <= left_up_title_text_distance
-                    ):
-                        left_up_title_text_distance = left_up_distance
-                        left_up_title_text_index = j
-                        left_up_title_text_direction = is_horizontal_2
-                    elif (
-                        x1_prime > x2
-                        and right_down_distance < right_down_title_text_distance
-                    ):
-                        right_down_title_text_distance = right_down_distance
-                        right_down_title_text_index = j
-                        right_down_title_text_direction = is_horizontal_2
-
-        height = bbox1[3] - bbox1[1]
-        width = bbox1[2] - bbox1[0]
-        title_text_weight = [0.8, 0.8]
-
-        title_text, sub_title, vision_footnote = [], [], []
-
-        def get_sub_category_(
-            title_text_direction,
-            title_text_index,
-            label,
-            is_left_up=True,
-        ):
-            direction_ = [1, 3] if is_left_up else [2, 4]
-            if (
-                title_text_direction == is_horizontal_1
-                and title_text_index != -1
-                and (label == "text" or label == "paragraph_title")
-            ):
-                bbox2 = blocks[title_text_index]["block_bbox"]
-                if is_horizontal_1:
-                    height1 = bbox2[3] - bbox2[1]
-                    width1 = bbox2[2] - bbox2[0]
-                    if label == "text":
-                        if (
-                            _nearest_edge_distance(bbox1, bbox2)[0] <= 15
-                            and block1["block_label"] in vision_labels
-                            and width1 < width
-                            and height1 < 0.5 * height
-                        ):
-                            blocks[title_text_index]["sub_label"] = "vision_footnote"
-                            vision_footnote.append(bbox2)
-                        elif (
-                            height1 < height * title_text_weight[0]
-                            and (width1 < width or width1 > 1.5 * width)
-                            and block1["block_label"] in title_labels
-                        ):
-                            blocks[title_text_index]["sub_label"] = "title_text"
-                            title_text.append((direction_[0], bbox2))
-                    elif (
-                        label == "paragraph_title"
-                        and block1["block_label"] in sub_title_labels
-                    ):
-                        sub_title.append(bbox2)
-                else:
-                    height1 = bbox2[3] - bbox2[1]
-                    width1 = bbox2[2] - bbox2[0]
-                    if label == "text":
-                        if (
-                            _nearest_edge_distance(bbox1, bbox2)[0] <= 15
-                            and block1["block_label"] in vision_labels
-                            and height1 < height
-                            and width1 < 0.5 * width
-                        ):
-                            blocks[title_text_index]["sub_label"] = "vision_footnote"
-                            vision_footnote.append(bbox2)
-                        elif (
-                            width1 < width * title_text_weight[1]
-                            and block1["block_label"] in title_labels
-                        ):
-                            blocks[title_text_index]["sub_label"] = "title_text"
-                            title_text.append((direction_[1], bbox2))
-                    elif (
-                        label == "paragraph_title"
-                        and block1["block_label"] in sub_title_labels
-                    ):
-                        sub_title.append(bbox2)
-
-        if (
-            is_horizontal_1
-            and abs(left_up_title_text_distance - right_down_title_text_distance) * 5
-            > height
-        ) or (
-            not is_horizontal_1
-            and abs(left_up_title_text_distance - right_down_title_text_distance) * 5
-            > width
-        ):
-            if left_up_title_text_distance < right_down_title_text_distance:
-                get_sub_category_(
-                    left_up_title_text_direction,
-                    left_up_title_text_index,
-                    blocks[left_up_title_text_index]["block_label"],
-                    True,
-                )
-            else:
-                get_sub_category_(
-                    right_down_title_text_direction,
-                    right_down_title_text_index,
-                    blocks[right_down_title_text_index]["block_label"],
-                    False,
-                )
-        else:
-            get_sub_category_(
-                left_up_title_text_direction,
-                left_up_title_text_index,
-                blocks[left_up_title_text_index]["block_label"],
-                True,
-            )
-            get_sub_category_(
-                right_down_title_text_direction,
-                right_down_title_text_index,
-                blocks[right_down_title_text_index]["block_label"],
-                False,
-            )
-
-        if block1["block_label"] in title_labels:
-            if blocks[i].get("title_text") == []:
-                blocks[i]["title_text"] = title_text
-
-        if block1["block_label"] in sub_title_labels:
-            if blocks[i].get("sub_title") == []:
-                blocks[i]["sub_title"] = sub_title
-
-        if block1["block_label"] in vision_labels:
-            if blocks[i].get("vision_footnote") == []:
-                blocks[i]["vision_footnote"] = vision_footnote
-
-    return blocks, pre_cuts
-
-
-def get_layout_ordering(
-    parsing_res_list: List[Dict[str, Any]],
-    no_mask_labels: List[str] = [],
-) -> None:
-    """
-    Process layout parsing results to remove overlapping bounding boxes
-    and assign an ordering index based on their positions.
-
-    Modifies:
-        The 'parsing_res_list' list by adding an 'index' to each block.
-
-    Args:
-        parsing_res_list (List[Dict[str, Any]]): List of block dictionaries with 'block_bbox' and 'block_label'.
-        no_mask_labels (List[str]): Labels for which overlapping removal is not performed.
-    """
-    title_text_labels = ["doc_title"]
-    title_labels = ["doc_title", "paragraph_title"]
-    vision_labels = ["image", "table", "seal", "chart", "figure"]
-    vision_title_labels = ["table_title", "chart_title", "figure_title"]
-
-    parsing_res_list, pre_cuts = _get_sub_category(parsing_res_list, title_text_labels)
-
-    parsing_res_by_pre_cuts_list = []
-    if len(pre_cuts) > 0:
-        block_bboxes = [block["block_bbox"] for block in parsing_res_list]
-        for axis, cuts in pre_cuts.items():
-            axis_index = 1 if axis == "y" else 0
-
-            max_val = max(bbox[axis_index + 2] for bbox in block_bboxes)
-
-            intervals = []
-            prev = 0
-            for cut in sorted(cuts):
-                intervals.append((prev, cut))
-                prev = cut
-            intervals.append((prev, max_val))
-
-            for start, end in intervals:
-                mask = [
-                    (bbox[axis_index] >= start) and (bbox[axis_index] < end)
-                    for bbox in block_bboxes
-                ]
-                parsing_res_by_pre_cuts_list.append(
-                    [parsing_res_list[i] for i, m in enumerate(mask) if m]
-                )
-    else:
-        parsing_res_by_pre_cuts_list = [parsing_res_list]
-
-    final_parsing_res_list = []
-    num_index = 0
-    num_sub_index = 0
-    for parsing_res_by_pre_cuts in parsing_res_by_pre_cuts_list:
-
-        doc_flag = False
-        median_width = _get_text_median_width(parsing_res_by_pre_cuts)
-        parsing_res_by_pre_cuts, projection_direction = _get_layout_property(
-            parsing_res_by_pre_cuts,
-            median_width,
-            no_mask_labels=no_mask_labels,
-            threshold=0.3,
-        )
-        # Buckets that group the blocks of this segment by role; they are filled by the loop below
-        (
-            double_text_blocks,
-            title_text_blocks,
-            title_blocks,
-            vision_blocks,
-            vision_title_blocks,
-            vision_footnote_blocks,
-            other_blocks,
-        ) = ([], [], [], [], [], [], [])
-
-        drop_indexes = []
-
-        for index, block in enumerate(parsing_res_by_pre_cuts):
-            label = block["sub_label"]
-            block["block_bbox"] = list(map(int, block["block_bbox"]))
-
-            if label == "doc_title":
-                doc_flag = True
-
-            if label in no_mask_labels:
-                if block["layout"] == "double":
-                    double_text_blocks.append(block)
-                    drop_indexes.append(index)
-            elif label == "title_text":
-                title_text_blocks.append(block)
-                drop_indexes.append(index)
-            elif label == "vision_footnote":
-                vision_footnote_blocks.append(block)
-                drop_indexes.append(index)
-            elif label in vision_title_labels:
-                vision_title_blocks.append(block)
-                drop_indexes.append(index)
-            elif label in title_labels:
-                title_blocks.append(block)
-                drop_indexes.append(index)
-            elif label in vision_labels:
-                vision_blocks.append(block)
-                drop_indexes.append(index)
-            else:
-                other_blocks.append(block)
-                drop_indexes.append(index)
-
-        for index in sorted(drop_indexes, reverse=True):
-            del parsing_res_by_pre_cuts[index]
-
-        if len(parsing_res_by_pre_cuts) > 0:
-            # single text label
-            if (
-                len(double_text_blocks) > len(parsing_res_by_pre_cuts)
-                or projection_direction
-            ):
-                parsing_res_by_pre_cuts.extend(title_blocks + double_text_blocks)
-                title_blocks = []
-                double_text_blocks = []
-                block_bboxes = [
-                    block["block_bbox"] for block in parsing_res_by_pre_cuts
-                ]
-                block_bboxes.sort(
-                    key=lambda x: (
-                        x[0] // max(20, median_width),
-                        x[1],
-                    ),
-                )
-                block_bboxes = np.array(block_bboxes)
-                sorted_indices = sort_by_xycut(block_bboxes, direction=1, min_gap=1)
-            else:
-                block_bboxes = [
-                    block["block_bbox"] for block in parsing_res_by_pre_cuts
-                ]
-                block_bboxes.sort(key=lambda x: (x[0] // 20, x[1]))
-                block_bboxes = np.array(block_bboxes)
-                sorted_indices = sort_by_xycut(block_bboxes, direction=0, min_gap=20)
-
-            sorted_boxes = block_bboxes[sorted_indices].tolist()
-
-            for block in parsing_res_by_pre_cuts:
-                block["index"] = num_index + sorted_boxes.index(block["block_bbox"]) + 1
-                block["sub_index"] = (
-                    num_sub_index + sorted_boxes.index(block["block_bbox"]) + 1
-                )
-
-        def nearest_match_(input_blocks, distance_type="manhattan", is_add_index=True):
-            for block in input_blocks:
-                bbox = block["block_bbox"]
-                min_distance = float("inf")
-                min_distance_config = [
-                    [float("inf"), float("inf")],
-                    float("inf"),
-                    float("inf"),
-                ]  # for double text
-                nearest_gt_index = 0
-                for match_block in parsing_res_by_pre_cuts:
-                    match_bbox = match_block["block_bbox"]
-                    if distance_type == "nearest_iou_edge_distance":
-                        distance, min_distance_config = _nearest_iou_edge_distance(
-                            bbox,
-                            match_bbox,
-                            block["sub_label"],
-                            vision_labels=vision_labels,
-                            no_mask_labels=no_mask_labels,
-                            median_width=median_width,
-                            title_labels=title_labels,
-                            title_text=block["title_text"],
-                            sub_title=block["sub_title"],
-                            min_distance_config=min_distance_config,
-                            tolerance_len=10,
-                        )
-                    elif distance_type == "title_text":
-                        if (
-                            match_block["block_label"] in title_labels + ["abstract"]
-                            and match_block["title_text"] != []
-                        ):
-                            iou_left_up = _calculate_overlap_area_div_minbox_area_ratio(
-                                bbox,
-                                match_block["title_text"][0][1],
-                            )
-                            iou_right_down = (
-                                _calculate_overlap_area_div_minbox_area_ratio(
-                                    bbox,
-                                    match_block["title_text"][-1][1],
-                                )
-                            )
-                            iou = 1 - max(iou_left_up, iou_right_down)
-                            distance = _manhattan_distance(bbox, match_bbox) * iou
-                        else:
-                            distance = float("inf")
-                    elif distance_type == "manhattan":
-                        distance = _manhattan_distance(bbox, match_bbox)
-                    elif distance_type == "vision_footnote":
-                        if (
-                            match_block["block_label"] in vision_labels
-                            and match_block["vision_footnote"] != []
-                        ):
-                            iou_left_up = _calculate_overlap_area_div_minbox_area_ratio(
-                                bbox,
-                                match_block["vision_footnote"][0],
-                            )
-                            iou_right_down = (
-                                _calculate_overlap_area_div_minbox_area_ratio(
-                                    bbox,
-                                    match_block["vision_footnote"][-1],
-                                )
-                            )
-                            iou = 1 - max(iou_left_up, iou_right_down)
-                            distance = _manhattan_distance(bbox, match_bbox) * iou
-                        else:
-                            distance = float("inf")
-                    elif distance_type == "vision_body":
-                        if (
-                            match_block["block_label"] in vision_title_labels
-                            and block["vision_footnote"] != []
-                        ):
-                            iou_left_up = _calculate_overlap_area_div_minbox_area_ratio(
-                                match_bbox,
-                                block["vision_footnote"][0],
-                            )
-                            iou_right_down = (
-                                _calculate_overlap_area_div_minbox_area_ratio(
-                                    match_bbox,
-                                    block["vision_footnote"][-1],
-                                )
-                            )
-                            iou = 1 - max(iou_left_up, iou_right_down)
-                            distance = _manhattan_distance(bbox, match_bbox) * iou
-                        else:
-                            distance = float("inf")
-                    # When a reference block crosses multiple columns, it should be ordered after the blocks above it.
-                    elif distance_type == "append":
-                        if match_bbox[3] <= bbox[1]:
-                            distance = -(match_bbox[2] * 10 + match_bbox[3])
-                        else:
-                            distance = float("inf")
-                    else:
-                        raise NotImplementedError
-
-                    if distance < min_distance:
-                        min_distance = distance
-                        if is_add_index:
-                            nearest_gt_index = match_block.get("index", 999)
-                        else:
-                            nearest_gt_index = match_block.get("sub_index", 999)
-
-                if is_add_index:
-                    block["index"] = nearest_gt_index
-                else:
-                    block["sub_index"] = nearest_gt_index
-
-                parsing_res_by_pre_cuts.append(block)
-
-        # double text label
-        double_text_blocks.sort(
-            key=lambda x: (
-                x["block_bbox"][1] // 10,
-                x["block_bbox"][0] // median_width,
-                x["block_bbox"][1] ** 2 + x["block_bbox"][0] ** 2,
-            ),
-        )
-        # Separate the reference blocks from the other blocks that cross multiple columns.
-        # They are ordered with the "append" distance type instead.
-        double_text_reference_blocks = []
-        i = 0
-        while i < len(double_text_blocks):
-            if double_text_blocks[i]["block_label"] == "reference":
-                double_text_reference_blocks.append(double_text_blocks.pop(i))
-            else:
-                i += 1
-        nearest_match_(
-            double_text_blocks,
-            distance_type="nearest_iou_edge_distance",
-        )
-        nearest_match_(
-            double_text_reference_blocks,
-            distance_type="append",
-        )
-        parsing_res_by_pre_cuts.sort(
-            key=lambda x: (x["index"], x["block_bbox"][1], x["block_bbox"][0]),
-        )
-
-        for idx, block in enumerate(parsing_res_by_pre_cuts):
-            block["index"] = num_index + idx + 1
-            block["sub_index"] = num_sub_index + idx + 1
-
-        # title label
-        title_blocks.sort(
-            key=lambda x: (
-                x["block_bbox"][1] // 10,
-                x["block_bbox"][0] // median_width,
-                x["block_bbox"][1] ** 2 + x["block_bbox"][0] ** 2,
-            ),
-        )
-        nearest_match_(title_blocks, distance_type="nearest_iou_edge_distance")
-
-        if doc_flag:
-            text_sort_labels = ["doc_title"]
-            text_label_priority = {
-                label: priority for priority, label in enumerate(text_sort_labels)
-            }
-            doc_titles = []
-            for i, block in enumerate(parsing_res_by_pre_cuts):
-                if block["block_label"] == "doc_title":
-                    doc_titles.append(
-                        (i, block["block_bbox"][1], block["block_bbox"][0]),
-                    )
-            doc_titles.sort(key=lambda x: (x[1], x[2]))
-            first_doc_title_index = doc_titles[0][0]
-            parsing_res_by_pre_cuts[first_doc_title_index]["index"] = 1
-            parsing_res_by_pre_cuts.sort(
-                key=lambda x: (
-                    x["index"],
-                    text_label_priority.get(x["block_label"], 9999),
-                    x["block_bbox"][1],
-                    x["block_bbox"][0],
-                ),
-            )
-        else:
-            parsing_res_by_pre_cuts.sort(
-                key=lambda x: (
-                    x["index"],
-                    x["block_bbox"][1],
-                    x["block_bbox"][0],
-                ),
-            )
-
-        for idx, block in enumerate(parsing_res_by_pre_cuts):
-            block["index"] = num_index + idx + 1
-            block["sub_index"] = num_sub_index + idx + 1
-
-        # title-text label
-        nearest_match_(title_text_blocks, distance_type="title_text")
-
-        def hor_tb_and_ver_lr(x):
-            input_bbox = x["block_bbox"]
-            is_horizontal = _get_bbox_direction(input_bbox)
-            if is_horizontal:
-                return input_bbox[1]
-            else:
-                return input_bbox[0]
-
-        parsing_res_by_pre_cuts.sort(
-            key=lambda x: (x["index"], hor_tb_and_ver_lr(x)),
-        )
-
-        for idx, block in enumerate(parsing_res_by_pre_cuts):
-            block["index"] = num_index + idx + 1
-            block["sub_index"] = num_sub_index + idx + 1
-
-        # image,figure,chart,seal label
-        nearest_match_(
-            vision_blocks,
-            distance_type="nearest_iou_edge_distance",
-            is_add_index=False,
-        )
-        parsing_res_by_pre_cuts.sort(
-            key=lambda x: (
-                x["sub_index"],
-                x["block_bbox"][1],
-                x["block_bbox"][0],
-            ),
-        )
-
-        for idx, block in enumerate(parsing_res_by_pre_cuts):
-            block["sub_index"] = num_sub_index + idx + 1
-
-        # image,figure,chart,seal title label
-        nearest_match_(
-            vision_title_blocks,
-            distance_type="nearest_iou_edge_distance",
-            is_add_index=False,
-        )
-        parsing_res_by_pre_cuts.sort(
-            key=lambda x: (
-                x["sub_index"],
-                x["block_bbox"][1],
-                x["block_bbox"][0],
-            ),
+        tuple or None: The intersection bounding box in the specified format, or None if there is no intersection.
+    """
+    bbox1 = np.array(bbox1)
+    bbox2 = np.array(bbox2)
+    # Convert both bounding boxes to rectangles
+    rect1 = bbox1 if len(bbox1.shape) == 1 else convert_points_to_boxes([bbox1])[0]
+    rect2 = bbox2 if len(bbox2.shape) == 1 else convert_points_to_boxes([bbox2])[0]
+
+    # Calculate the intersection rectangle
+
+    x_min_inter = max(rect1[0], rect2[0])
+    y_min_inter = max(rect1[1], rect2[1])
+    x_max_inter = min(rect1[2], rect2[2])
+    y_max_inter = min(rect1[3], rect2[3])
+
+    # Check if there is an intersection
+    if x_min_inter >= x_max_inter or y_min_inter >= y_max_inter:
+        return None
+
+    if return_format == "bbox":
+        return np.array([x_min_inter, y_min_inter, x_max_inter, y_max_inter])
+    elif return_format == "poly":
+        return np.array(
+            [
+                [x_min_inter, y_min_inter],
+                [x_max_inter, y_min_inter],
+                [x_max_inter, y_max_inter],
+                [x_min_inter, y_max_inter],
+            ],
+            dtype=np.int16,
         )
-
-        for idx, block in enumerate(parsing_res_by_pre_cuts):
-            block["sub_index"] = num_sub_index + idx + 1
-
-        # vision footnote label
-        nearest_match_(
-            vision_footnote_blocks,
-            distance_type="vision_footnote",
-            is_add_index=False,
-        )
-        text_label_priority = {"vision_footnote": 9999}
-        parsing_res_by_pre_cuts.sort(
-            key=lambda x: (
-                x["sub_index"],
-                text_label_priority.get(x["sub_label"], 0),
-                x["block_bbox"][1],
-                x["block_bbox"][0],
-            ),
-        )
-
-        for idx, block in enumerate(parsing_res_by_pre_cuts):
-            block["sub_index"] = num_sub_index + idx + 1
-
-        # header、footnote、header_image... label
-        nearest_match_(other_blocks, distance_type="manhattan", is_add_index=False)
-
-        # add all parsing result
-        final_parsing_res_list.extend(parsing_res_by_pre_cuts)
-
-        # update num index
-        num_sub_index += len(parsing_res_by_pre_cuts)
-        for parsing_res in parsing_res_by_pre_cuts:
-            if parsing_res.get("index"):
-                num_index += 1
-
-    parsing_res_list = [
-        {
-            "block_label": parsing_res["block_label"],
-            "block_content": parsing_res["block_content"],
-            "block_bbox": parsing_res["block_bbox"],
-            "block_image": parsing_res.get("block_image", None),
-            "sub_label": parsing_res["sub_label"],
-            "sub_index": parsing_res["sub_index"],
-            "index": parsing_res.get("index", None),
-            "seg_start_coordinate": parsing_res.get(
-                "seg_start_coordinate", float("inf")
-            ),
-            "seg_end_coordinate": parsing_res.get("seg_end_coordinate", float("-inf")),
-            "num_of_lines": parsing_res.get("num_of_lines", 1),
-        }
-        for parsing_res in final_parsing_res_list
-    ]
-
-    return parsing_res_list
-
-
-def _manhattan_distance(
-    point1: Tuple[float, float],
-    point2: Tuple[float, float],
-    weight_x: float = 1.0,
-    weight_y: float = 1.0,
-) -> float:
-    """
-    Calculate the weighted Manhattan distance between two points.
-
-    Only elements 0 and 1 of each argument are read, so longer sequences are
-    accepted as well. `get_layout_ordering` passes `[x1, y1, x2, y2]` boxes,
-    in which case the result is the distance between the two (x1, y1) corners.
-
-    Args:
-        point1 (Tuple[float, float]): The first point as (x, y).
-        point2 (Tuple[float, float]): The second point as (x, y).
-        weight_x (float): The weight for the x-axis distance. Default is 1.0.
-        weight_y (float): The weight for the y-axis distance. Default is 1.0.
-
-    Returns:
-        float: weight_x * |dx| + weight_y * |dy| for the two points.
-    """
-    return weight_x * abs(point1[0] - point2[0]) + weight_y * abs(point1[1] - point2[1])
-
-
-def _calculate_horizontal_distance(
-    input_bbox: List[int],
-    match_bbox: List[int],
-    height: int,
-    disperse: int,
-    title_text: List[Tuple[int, List[int]]],
-) -> float:
-    """
-    Calculate the horizontal distance between two bounding boxes, considering title text adjustments.
-
-    The score is the sum of three terms: the gap between the match box's right
-    edge and the input box's left edge floor-divided by `disperse`, a vertical
-    adjustment floor-divided by `height`, and the same vertical adjustment
-    divided by 5000.
-
-    Args:
-        input_bbox (List[int]): The bounding box coordinates [x1, y1, x2, y2] of the input object.
-        match_bbox (List[int]): The bounding box coordinates [x1', y1', x2', y2'] of the object to match against.
-        height (int): Divisor applied (floor division) to the vertical adjustment.
-        disperse (int): Divisor applied (floor division) to the horizontal gap.
-        title_text (List[Tuple[int, List[int]]]): A list of tuples containing title text information and their bounding box coordinates.
-                                                  Format: [(position_indicator, [x1, y1, x2, y2]), ...].
-                                                  Only the first and last entries are inspected.
-
-    Returns:
-        float: The calculated horizontal distance taking into account the title text adjustments.
-    """
-    # NOTE: x2 and x1_prime are unpacked but never read below.
-    x1, y1, x2, y2 = input_bbox
-    x1_prime, y1_prime, x2_prime, y2_prime = match_bbox
-
-    # Determine vertical distance adjustment based on title text
-    if y2 < y1_prime:
-        # The match box starts below the bottom edge of the input box.
-        # Indicator 2 is the value `get_sub_category_` records for title text on the
-        # right/down side of a horizontal block; extend the input box downward by
-        # that title text's height before measuring the gap.
-        if title_text and title_text[-1][0] == 2:
-            y2 += title_text[-1][1][3] - title_text[-1][1][1]
-        # The remaining gap is halved in this branch only.
-        vertical_adjustment = (y1_prime - y2) * 0.5
-    else:
-        # Indicator 1 is the value recorded for title text on the left/up side of a
-        # horizontal block; extend the input box upward by that title text's height.
-        if title_text and title_text[0][0] == 1:
-            y1 -= title_text[0][1][3] - title_text[0][1][1]
-        # Not halved here; negative when the match box's bottom edge lies below y1.
-        vertical_adjustment = y1 - y2_prime
-
-    # Calculate horizontal distance with adjustments
-    # The two floor divisions produce coarse integer steps; the /5000 term adds a
-    # small fractional part (presumably to order candidates that share a step).
-    horizontal_distance = (
-        abs(x2_prime - x1) // disperse
-        + vertical_adjustment // height
-        + vertical_adjustment / 5000
-    )
-
-    return horizontal_distance
-
-
-def _calculate_vertical_distance(
-    input_bbox: List[int],
-    match_bbox: List[int],
-    width: int,
-    disperse: int,
-    title_text: List[Tuple[int, List[int]]],
-) -> float:
-    """
-    Calculate the vertical distance between two bounding boxes, considering title text adjustments.
-
-    The score is the sum of three terms: the gap between the match box's bottom
-    edge and the input box's top edge floor-divided by `disperse`, a horizontal
-    adjustment floor-divided by `width`, and the same horizontal adjustment
-    divided by 5000.
-
-    Args:
-        input_bbox (List[int]): The bounding box coordinates [x1, y1, x2, y2] of the input object.
-        match_bbox (List[int]): The bounding box coordinates [x1', y1', x2', y2'] of the object to match against.
-        width (int): Divisor applied (floor division) to the horizontal adjustment.
-        disperse (int): Divisor applied (floor division) to the vertical gap.
-        title_text (List[Tuple[int, List[int]]]): A list of tuples containing title text information and their bounding box coordinates.
-                                                  Format: [(position_indicator, [x1, y1, x2, y2]), ...].
-                                                  Only the first and last entries are inspected.
-
-    Returns:
-        float: The calculated vertical distance taking into account the title text adjustments.
-    """
-    # NOTE: y2 and y1_prime are unpacked but never read below.
-    x1, y1, x2, y2 = input_bbox
-    x1_prime, y1_prime, x2_prime, y2_prime = match_bbox
-
-    # Determine horizontal distance adjustment based on title text
-    if x1 > x2_prime:
-        # The match box ends to the left of the input box.
-        # Indicator 3 is the value `get_sub_category_` records for title text on the
-        # left/up side of a vertical block; extend the input box leftward by that
-        # title text's width before measuring the gap.
-        if title_text and title_text[0][0] == 3:
-            x1 -= title_text[0][1][2] - title_text[0][1][0]
-        # The remaining gap is halved in this branch only.
-        horizontal_adjustment = (x1 - x2_prime) * 0.5
     else:
-        # Indicator 4 is the value recorded for title text on the right/down side of
-        # a vertical block; extend the input box rightward by that title text's width.
-        if title_text and title_text[-1][0] == 4:
-            x2 += title_text[-1][1][2] - title_text[-1][1][0]
-        # Not halved here; negative when the match box's left edge lies left of x2.
-        horizontal_adjustment = x1_prime - x2
-
-    # Calculate vertical distance with adjustments
-    # The two floor divisions produce coarse integer steps; the /5000 term adds a
-    # small fractional part (presumably to order candidates that share a step).
-    vertical_distance = (
-        abs(y2_prime - y1) // disperse
-        + horizontal_adjustment // width
-        + horizontal_adjustment / 5000
-    )
-
-    return vertical_distance
+        raise ValueError("return_format must be either 'bbox' or 'poly'.")
 
 
-def _nearest_edge_distance(
-    input_bbox: List[int],
-    match_bbox: List[int],
-    weight: List[float] = [1.0, 1.0, 1.0, 1.0],
-    label: str = "text",
-    no_mask_labels: List[str] = [],
-    min_edge_distance_config: List[float] = [],
-    tolerance_len: float = 10.0,
-) -> Tuple[float, List[float]]:
-    """
-    Calculate the nearest edge distance between two bounding boxes, considering directional weights.
-
-    Args:
-        input_bbox (list): The bounding box coordinates [x1, y1, x2, y2] of the input object.
-        match_bbox (list): The bounding box coordinates [x1', y1', x2', y2'] of the object to match against.
-        weight (list, optional): Directional weights for the edge distances [left, right, up, down]. Defaults to [1, 1, 1, 1].
-        label (str, optional): The label/type of the object in the bounding box (e.g., 'text'). Defaults to 'text'.
-        no_mask_labels (list, optional): Labels for which no masking is applied when calculating edge distances. Defaults to an empty list.
-        min_edge_distance_config (list, optional): Configuration for minimum edge distances [min_edge_distance_x, min_edge_distance_y].
-        Defaults to [float('inf'), float('inf')].
-        tolerance_len (float, optional): The tolerance length for adjusting edge distances. Defaults to 10.
+def update_layout_order_config_block_index(
+    config: dict, block_label: str, block_idx: int
+) -> None:
 
-    Returns:
-        Tuple[float, List[float]]: A tuple containing:
-            - The calculated minimum edge distance between the bounding boxes.
-            - A list with the minimum edge distances in the x and y directions.
-    """
-    match_bbox_iou = _calculate_overlap_area_div_minbox_area_ratio(
-        input_bbox,
-        match_bbox,
-    )
-    if match_bbox_iou > 0 and label not in no_mask_labels:
-        return 0, [0, 0]
-
-    if not min_edge_distance_config:
-        min_edge_distance_config = [float("inf"), float("inf")]
-    min_edge_distance_x, min_edge_distance_y = min_edge_distance_config
-
-    x1, y1, x2, y2 = input_bbox
-    x1_prime, y1_prime, x2_prime, y2_prime = match_bbox
-
-    direction_num = 0
-    distance_x = float("inf")
-    distance_y = float("inf")
-    distance = [float("inf")] * 4
-
-    # input_bbox is to the left of match_bbox
-    if x2 < x1_prime:
-        direction_num += 1
-        distance[0] = x1_prime - x2
-        if abs(distance[0] - min_edge_distance_x) <= tolerance_len:
-            distance_x = min_edge_distance_x * weight[0]
-        else:
-            distance_x = distance[0] * weight[0]
-    # input_bbox is to the right of match_bbox
-    elif x1 > x2_prime:
-        direction_num += 1
-        distance[1] = x1 - x2_prime
-        if abs(distance[1] - min_edge_distance_x) <= tolerance_len:
-            distance_x = min_edge_distance_x * weight[1]
-        else:
-            distance_x = distance[1] * weight[1]
-    elif match_bbox_iou > 0:
-        distance[0] = 0
-        distance_x = 0
-
-    # input_bbox is above match_bbox
-    if y2 < y1_prime:
-        direction_num += 1
-        distance[2] = y1_prime - y2
-        if abs(distance[2] - min_edge_distance_y) <= tolerance_len:
-            distance_y = min_edge_distance_y * weight[2]
-        else:
-            distance_y = distance[2] * weight[2]
-        if label in no_mask_labels:
-            distance_y = max(0.1, distance_y) * 10  # for abstract
-    # input_bbox is below match_bbox
-    elif y1 > y2_prime:
-        direction_num += 1
-        distance[3] = y1 - y2_prime
-        if abs(distance[3] - min_edge_distance_y) <= tolerance_len:
-            distance_y = min_edge_distance_y * weight[3]
-        else:
-            distance_y = distance[3] * weight[3]
-    elif match_bbox_iou > 0:
-        distance[2] = 0
-        distance_y = 0
-
-    if direction_num == 2:
-        return (distance_x + distance_y), [
-            min(distance[0], distance[1]),
-            min(distance[2], distance[3]),
+    doc_title_labels = config["doc_title_labels"]
+    paragraph_title_labels = config["paragraph_title_labels"]
+    vision_labels = config["vision_labels"]
+    vision_title_labels = config["vision_title_labels"]
+    header_labels = config["header_labels"]
+    unordered_labels = config["unordered_labels"]
+    footer_labels = config["footer_labels"]
+    text_labels = config["text_labels"]
+    text_title_labels = doc_title_labels + paragraph_title_labels
+    config["text_title_labels"] = text_title_labels
+
+    if block_label in doc_title_labels:
+        config["doc_title_block_idxes"].append(block_idx)
+    if block_label in paragraph_title_labels:
+        config["paragraph_title_block_idxes"].append(block_idx)
+    if block_label in vision_labels:
+        config["vision_block_idxes"].append(block_idx)
+    if block_label in vision_title_labels:
+        config["vision_title_block_idxes"].append(block_idx)
+    if block_label in unordered_labels:
+        config["unordered_block_idxes"].append(block_idx)
+    if block_label in text_title_labels:
+        config["text_title_block_idxes"].append(block_idx)
+    if block_label in text_labels:
+        config["text_block_idxes"].append(block_idx)
+    if block_label in header_labels:
+        config["header_block_idxes"].append(block_idx)
+    if block_label in footer_labels:
+        config["footer_block_idxes"].append(block_idx)
+
+
+def update_region_box(bbox, region_box):
+    """
+    Grow a region box so that it also encloses ``bbox``.
+
+    Args:
+        bbox: Box to absorb, as [x1, y1, x2, y2].
+        region_box: Current region as [x1, y1, x2, y2], or None when no region exists yet.
+
+    Returns:
+        ``bbox`` itself (unchanged) when ``region_box`` is None; otherwise a new
+        list holding the union of both boxes with every coordinate passed through ``int``.
+    """
+    if region_box is None:
+        return bbox
+
+    box_left, box_top, box_right, box_bottom = bbox
+    region_left, region_top, region_right, region_bottom = region_box
+
+    return [
+        int(min(box_left, region_left)),
+        int(min(box_top, region_top)),
+        int(max(box_right, region_right)),
+        int(max(box_bottom, region_bottom)),
+    ]
+
+
+def convert_formula_res_to_ocr_format(formula_res_list: List, ocr_res: dict):
+    for formula_res in formula_res_list:
+        x_min, y_min, x_max, y_max = list(map(int, formula_res["dt_polys"]))
+        poly_points = [
+            (x_min, y_min),
+            (x_max, y_min),
+            (x_max, y_max),
+            (x_min, y_max),
         ]
-    else:
-        return min(distance_x, distance_y), [
-            min(distance[0], distance[1]),
-            min(distance[2], distance[3]),
-        ]
-
-
-def _get_weights(label, horizontal):
-    """Define weights based on the label and orientation."""
-    if label == "doc_title":
-        return (
-            [1, 0.1, 0.1, 1] if horizontal else [0.2, 0.1, 1, 1]
-        )  # left-down ,  right-left
-    elif label in [
-        "paragraph_title",
-        "table_title",
-        "abstract",
-        "image",
-        "seal",
-        "chart",
-        "figure",
-    ]:
-        return [1, 1, 0.1, 1]  # down
-    else:
-        return [1, 1, 1, 0.1]  # up
-
-
-def _nearest_iou_edge_distance(
-    input_bbox: List[int],
-    match_bbox: List[int],
-    label: str,
-    vision_labels: List[str],
-    no_mask_labels: List[str],
-    median_width: int = -1,
-    title_labels: List[str] = [],
-    title_text: List[Tuple[int, List[int]]] = [],
-    sub_title: List[List[int]] = [],
-    min_distance_config: List[float] = [],
-    tolerance_len: float = 10.0,
-) -> Tuple[float, List[float]]:
-    """
-    Calculate the nearest IOU edge distance between two bounding boxes, considering label types, title adjustments, and minimum distance configurations.
-    This function computes the edge distance between two bounding boxes while considering their overlap (IOU) and various adjustments based on label types,
-    title text, and subtitle information. It also applies minimum distance configurations and tolerance adjustments.
-
-    Args:
-        input_bbox (List[int]): The bounding box coordinates [x1, y1, x2, y2] of the input object.
-        match_bbox (List[int]): The bounding box coordinates [x1', y1', x2', y2'] of the object to match against.
-        label (str): The label/type of the object in the bounding box (e.g., 'image', 'text', etc.).
-        vision_labels (List[str]): List of labels for vision-related objects (e.g., images, icons).
-        no_mask_labels (List[str]): Labels for which no masking is applied when calculating edge distances.
-        median_width (int, optional): The median width for title dispersion calculation. Defaults to -1.
-        title_labels (List[str], optional): Labels that indicate the object is a title. Defaults to an empty list.
-        title_text (List[Tuple[int, List[int]]], optional): Text content associated with title labels, in the format [(position_indicator, [x1, y1, x2, y2]), ...].
-        sub_title (List[List[int]], optional): List of subtitle bounding boxes to adjust the input_bbox. Defaults to an empty list.
-        min_distance_config (List[float], optional): Configuration for minimum distances [min_edge_distance_config, up_edge_distances_config, total_distance].
-        tolerance_len (float, optional): The tolerance length for adjusting edge distances. Defaults to 10.0.
-
-    Returns:
-        Tuple[float, List[float]]: A tuple containing:
-            - The calculated distance considering IOU and adjustments.
-            - The updated minimum distance configuration.
-    """
-
-    x1, y1, x2, y2 = input_bbox
-    x1_prime, y1_prime, x2_prime, y2_prime = match_bbox
-
-    min_edge_distance_config, up_edge_distances_config, total_distance = (
-        min_distance_config
-    )
-
-    iou_distance = 0
-
-    if label in vision_labels:
-        horizontal1 = horizontal2 = True
-    else:
-        horizontal1 = _get_bbox_direction(input_bbox)
-        horizontal2 = _get_bbox_direction(match_bbox, 3)
-
-    if (
-        horizontal1 != horizontal2
-        or _get_projection_iou(input_bbox, match_bbox, horizontal1) < 0.01
-    ):
-        iou_distance = 1
-
-    if label == "doc_title":
-        # Calculate distance for titles
-        disperse = max(1, median_width)
-        tolerance_len = max(tolerance_len, disperse)
-
-    # Adjust input_bbox based on sub_title
-    if sub_title:
-        for sub in sub_title:
-            x1_, y1_, x2_, y2_ = sub
-            x1, y1, x2, y2 = (
-                min(x1, x1_),
-                min(y1, y1_),
-                min(x2, x2_),
-                max(y2, y2_),
-            )
-        input_bbox = [x1, y1, x2, y2]
-
-    if title_text:
-        for sub in title_text:
-            x1_, y1_, x2_, y2_ = sub[1]
-            if horizontal1:
-                x1, y1, x2, y2 = (
-                    min(x1, x1_),
-                    min(y1, y1_),
-                    min(x2, x2_),
-                    max(y2, y2_),
-                )
-            else:
-                x1, y1, x2, y2 = (
-                    min(x1, x1_),
-                    min(y1, y1_),
-                    max(x2, x2_),
-                    min(y2, y2_),
-                )
-        input_bbox = [x1, y1, x2, y2]
-
-    # Calculate edge distance
-    weight = _get_weights(label, horizontal1)
-    if label == "abstract":
-        tolerance_len *= 2
-
-    edge_distance, edge_distance_config = _nearest_edge_distance(
-        input_bbox,
-        match_bbox,
-        weight,
-        label=label,
-        no_mask_labels=no_mask_labels,
-        min_edge_distance_config=min_edge_distance_config,
-        tolerance_len=tolerance_len,
-    )
-
-    # Weights for combining distances
-    iou_edge_weight = [10**8, 10**4, 1, 0.0001]
-
-    # Calculate up and left edge distances
-    up_edge_distance = y1_prime
-    left_edge_distance = x1_prime
-    if (
-        label in no_mask_labels or label in title_labels or label in vision_labels
-    ) and y1 > y2_prime:
-        up_edge_distance = -y2_prime
-        left_edge_distance = -x2_prime
-
-    min_up_edge_distance = up_edge_distances_config
-    if abs(min_up_edge_distance - up_edge_distance) <= tolerance_len:
-        up_edge_distance = min_up_edge_distance
-
-    # Calculate total distance
-    distance = (
-        iou_distance * iou_edge_weight[0]
-        + edge_distance * iou_edge_weight[1]
-        + up_edge_distance * iou_edge_weight[2]
-        + left_edge_distance * iou_edge_weight[3]
-    )
+        ocr_res["dt_polys"].append(poly_points)
+        ocr_res["rec_texts"].append(f"${formula_res['rec_formula']}$")
+        ocr_res["rec_boxes"] = np.vstack(
+            (ocr_res["rec_boxes"], [formula_res["dt_polys"]])
+        )
+        ocr_res["rec_labels"].append("formula")
+        ocr_res["rec_polys"].append(poly_points)
+        ocr_res["rec_scores"].append(1)
 
-    # Update minimum distance configuration if a smaller distance is found
-    if total_distance > distance:
-        edge_distance_config = [
-            edge_distance_config[0],
-            edge_distance_config[1],
-        ]
-        min_distance_config = [
-            edge_distance_config,
-            up_edge_distance,
-            distance,
-        ]
 
-    return distance, min_distance_config
+def caculate_bbox_area(bbox):
+    """Return the absolute area of ``bbox`` given as [x1, y1, x2, y2]."""
+    left, top, right, bottom = bbox
+    width = right - left
+    height = bottom - top
+    return abs(width * height)
 
 
 def get_show_color(label: str) -> Tuple:

+ 16 - 0
paddlex/inference/pipelines/layout_parsing/xycut_enhanced/__init__.py

@@ -0,0 +1,16 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .utils import *
+from .xycuts import *

+ 1030 - 0
paddlex/inference/pipelines/layout_parsing/xycut_enhanced/utils.py

@@ -0,0 +1,1030 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, List, Tuple, Union
+
+import numpy as np
+
+from ..result_v2 import LayoutParsingBlock
+
+
+def calculate_projection_iou(
+    bbox1: List[float], bbox2: List[float], direction: str = "horizontal"
+) -> float:
+    """
+    Compute the 1D IoU of two boxes after projecting them onto one axis.
+
+    Args:
+        bbox1 (List[float]): First bounding box [x_min, y_min, x_max, y_max].
+        bbox2 (List[float]): Second bounding box [x_min, y_min, x_max, y_max].
+        direction (str): "horizontal" compares the x extents; any other value
+            compares the y extents.
+
+    Returns:
+        float: Overlap length divided by the length of the combined extent.
+            Returns 0 when the projections do not overlap.
+    """
+    low, high = (0, 2) if direction == "horizontal" else (1, 3)
+
+    overlap = min(bbox1[high], bbox2[high]) - max(bbox1[low], bbox2[low])
+    if overlap <= 0:
+        return 0
+
+    combined_extent = max(bbox1[high], bbox2[high]) - min(bbox1[low], bbox2[low])
+    if combined_extent > 0:
+        return overlap / combined_extent
+    return 0.0
+
+
+def calculate_iou(
+    bbox1: Union[list, tuple],
+    bbox2: Union[list, tuple],
+) -> float:
+    """
+    Compute the Intersection over Union (IoU) of two axis-aligned boxes.
+
+    Args:
+        bbox1 (list or tuple): First box as [x_min, y_min, x_max, y_max].
+        bbox2 (list or tuple): Second box as [x_min, y_min, x_max, y_max].
+
+    Returns:
+        float: Intersection area divided by union area, or 0.0 when the
+            union area is zero.
+    """
+    # Negative extents mean the boxes are disjoint on that axis; clamp to 0.
+    overlap_width = max(0, min(bbox1[2], bbox2[2]) - max(bbox1[0], bbox2[0]))
+    overlap_height = max(0, min(bbox1[3], bbox2[3]) - max(bbox1[1], bbox2[1]))
+    overlap_area = overlap_width * overlap_height
+
+    first_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
+    second_area = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
+    combined_area = first_area + second_area - overlap_area
+
+    return overlap_area / combined_area if combined_area != 0 else 0.0
+
+
+def get_nearest_edge_distance(
+    bbox1: List[int],
+    bbox2: List[int],
+    weight: Union[List[float], Tuple[float, ...]] = (1.0, 1.0, 1.0, 1.0),
+) -> float:
+    """
+    Calculate the nearest edge distance between two bounding boxes, considering directional weights.
+
+    Args:
+        bbox1 (list): The bounding box coordinates [x1, y1, x2, y2] of the input object.
+        bbox2 (list): The bounding box coordinates [x1', y1', x2', y2'] of the object to match against.
+        weight (sequence, optional): Directional weights for the edge distances [left, right, up, down].
+            weight[0] is used when bbox1 lies left of bbox2, weight[1] otherwise;
+            weight[2] is used when bbox1 lies above bbox2, weight[3] otherwise.
+            Defaults to (1, 1, 1, 1).
+
+    Returns:
+        float: 0.0 when the boxes overlap on both axes; otherwise the sum of the
+            weighted gaps along every axis on which the projections do not overlap.
+    """
+    x1, y1, x2, y2 = bbox1
+    x1_prime, y1_prime, x2_prime, y2_prime = bbox2
+    min_x_distance, min_y_distance = 0, 0
+    horizontal_iou = calculate_projection_iou(bbox1, bbox2, "horizontal")
+    vertical_iou = calculate_projection_iou(bbox1, bbox2, "vertical")
+    if horizontal_iou > 0 and vertical_iou > 0:
+        return 0.0
+    if horizontal_iou == 0:
+        # No overlap on x: the gap is the smaller of the two facing-edge distances.
+        min_x_distance = min(abs(x1 - x2_prime), abs(x2 - x1_prime)) * (
+            weight[0] if x2 < x1_prime else weight[1]
+        )
+    if vertical_iou == 0:
+        # No overlap on y: same rule applied to the top/bottom edges.
+        min_y_distance = min(abs(y1 - y2_prime), abs(y2 - y1_prime)) * (
+            weight[2] if y2 < y1_prime else weight[3]
+        )
+
+    return min_x_distance + min_y_distance
+
+
+def _projection_by_bboxes(boxes: np.ndarray, axis: int) -> np.ndarray:
+    """
+    Build a 1D coverage histogram of bounding boxes along one axis.
+
+    Args:
+        boxes: A (N, 4) array of bounding boxes defined by [x_min, y_min, x_max, y_max].
+            Coordinates are used as array sizes and slice bounds, so they must be integers.
+        axis: Axis for projection; 0 for horizontal (x-axis), 1 for vertical (y-axis).
+
+    Returns:
+        A 1D integer array whose value at position i is the number of boxes
+        whose [min, max) interval on the chosen axis covers i.
+    """
+    assert axis in [0, 1]
+    # Columns (axis, axis + 2) hold the min/max pair of the chosen axis.
+    intervals = boxes[:, axis::2]
+    histogram = np.zeros(np.max(intervals), dtype=int)
+
+    for lower, upper in intervals:
+        histogram[lower:upper] += 1
+
+    return histogram
+
+
+def _split_projection_profile(
+    arr_values: np.ndarray, min_value: float, min_gap: float
+) -> Union[Tuple[np.ndarray, np.ndarray], None]:
+    """
+    Split the projection profile into segments based on specified thresholds.
+
+    Args:
+        arr_values: 1D array representing the projection profile.
+        min_value: Minimum value threshold to consider a profile segment significant.
+        min_gap: Minimum gap width to consider a separation between segments.
+
+    Returns:
+        None when no position exceeds `min_value`; otherwise a tuple
+        (segment_starts, segment_ends) of equally long arrays. Every segment is
+        the half-open range [start, end), i.e. `end` is one past the last
+        significant index of the segment.
+    """
+    # Identify indices where the projection exceeds the minimum value
+    significant_indices = np.where(arr_values > min_value)[0]
+    if not len(significant_indices):
+        return None
+
+    # Calculate gaps between significant indices
+    index_diffs = significant_indices[1:] - significant_indices[:-1]
+    gap_indices = np.where(index_diffs > min_gap)[0]
+
+    # A segment starts at the first significant index and right after every gap.
+    segment_starts = np.insert(
+        significant_indices[gap_indices + 1],
+        0,
+        significant_indices[0],
+    )
+    # A segment ends right before every gap and at the last significant index.
+    # The `+ 1` makes ALL ends exclusive; previously only the final end was
+    # exclusive, so interior segments lost their last covered position.
+    segment_ends = (
+        np.append(
+            significant_indices[gap_indices],
+            significant_indices[-1],
+        )
+        + 1
+    )
+
+    return segment_starts, segment_ends
+
+
+def recursive_yx_cut(
+    boxes: np.ndarray, indices: List[int], res: List[int], min_gap: int = 1
+):
+    """
+    Recursively project and segment bounding boxes, starting with Y-axis and followed by X-axis.
+
+    Each call first splits the boxes into horizontal bands using the Y projection,
+    then splits every band into columns using the X projection. A band that
+    cannot be split into more than one column is emitted to `res` in x_min
+    order; otherwise every column is processed recursively.
+
+    Args:
+        boxes: A (N, 4) array representing bounding boxes. Coordinates are used
+            as slice bounds by `_projection_by_bboxes`, so they must be integers.
+        indices: List of indices indicating the original position of boxes.
+        res: List to store indices of the final segmented bounding boxes.
+        min_gap (int): Minimum gap width to consider a separation between segments on the X-axis. Defaults to 1.
+            The Y-axis split always uses a gap of 1.
+
+    Returns:
+        None: This function modifies the `res` list in place.
+    """
+    assert len(boxes) == len(
+        indices
+    ), "The length of boxes and indices must be the same."
+
+    # Sort by y_min for Y-axis projection
+    y_sorted_indices = boxes[:, 1].argsort()
+    y_sorted_boxes = boxes[y_sorted_indices]
+    # Re-map the sort permutation onto the caller-supplied original indices.
+    y_sorted_indices = np.array(indices)[y_sorted_indices]
+
+    # Perform Y-axis projection
+    y_projection = _projection_by_bboxes(boxes=y_sorted_boxes, axis=1)
+    y_intervals = _split_projection_profile(y_projection, 0, 1)
+
+    # `_split_projection_profile` returns None when nothing is covered.
+    if not y_intervals:
+        return
+
+    # Process each segment defined by Y-axis projection
+    for y_start, y_end in zip(*y_intervals):
+        # Select boxes within the current y interval
+        # (membership is decided by the box's y_min only)
+        y_interval_indices = (y_start <= y_sorted_boxes[:, 1]) & (
+            y_sorted_boxes[:, 1] < y_end
+        )
+        y_boxes_chunk = y_sorted_boxes[y_interval_indices]
+        y_indices_chunk = y_sorted_indices[y_interval_indices]
+
+        # Sort by x_min for X-axis projection
+        x_sorted_indices = y_boxes_chunk[:, 0].argsort()
+        x_sorted_boxes_chunk = y_boxes_chunk[x_sorted_indices]
+        x_sorted_indices_chunk = y_indices_chunk[x_sorted_indices]
+
+        # Perform X-axis projection
+        x_projection = _projection_by_bboxes(boxes=x_sorted_boxes_chunk, axis=0)
+        x_intervals = _split_projection_profile(x_projection, 0, min_gap)
+
+        if not x_intervals:
+            continue
+
+        # If X-axis cannot be further segmented, add current indices to results
+        if len(x_intervals[0]) == 1:
+            res.extend(x_sorted_indices_chunk)
+            continue
+
+        # Recursively process each segment defined by X-axis projection
+        # NOTE(review): `min_gap` is not forwarded here, so nested levels fall
+        # back to the default of 1 - confirm this is intentional.
+        for x_start, x_end in zip(*x_intervals):
+            x_interval_indices = (x_start <= x_sorted_boxes_chunk[:, 0]) & (
+                x_sorted_boxes_chunk[:, 0] < x_end
+            )
+            recursive_yx_cut(
+                x_sorted_boxes_chunk[x_interval_indices],
+                x_sorted_indices_chunk[x_interval_indices],
+                res,
+            )
+
+
+def recursive_xy_cut(
+    boxes: np.ndarray, indices: List[int], res: List[int], min_gap: int = 1
+):
+    """
+    Recursively performs X-axis projection followed by Y-axis projection to segment bounding boxes.
+
+    Each call first splits the boxes into columns using the X projection, then
+    splits every column into horizontal bands using the Y projection. A column
+    that cannot be split into more than one band is emitted to `res` in y_min
+    order; otherwise every band is processed recursively.
+
+    Args:
+        boxes: A (N, 4) array representing bounding boxes with [x_min, y_min, x_max, y_max].
+            Coordinates are used as slice bounds by `_projection_by_bboxes`, so they must be integers.
+        indices: A list of indices representing the position of boxes in the original data.
+        res: A list to store indices of bounding boxes that meet the criteria.
+        min_gap (int): Minimum gap width to consider a separation between segments on the Y-axis. Defaults to 1.
+            The X-axis split always uses a gap of 1.
+
+    Returns:
+        None: This function modifies the `res` list in place.
+    """
+    # Ensure boxes and indices have the same length
+    assert len(boxes) == len(
+        indices
+    ), "The length of boxes and indices must be the same."
+
+    # Sort by x_min to prepare for X-axis projection
+    x_sorted_indices = boxes[:, 0].argsort()
+    x_sorted_boxes = boxes[x_sorted_indices]
+    # Re-map the sort permutation onto the caller-supplied original indices.
+    x_sorted_indices = np.array(indices)[x_sorted_indices]
+
+    # Perform X-axis projection
+    x_projection = _projection_by_bboxes(boxes=x_sorted_boxes, axis=0)
+    x_intervals = _split_projection_profile(x_projection, 0, 1)
+
+    # `_split_projection_profile` returns None when nothing is covered.
+    if not x_intervals:
+        return
+
+    # Process each segment defined by X-axis projection
+    for x_start, x_end in zip(*x_intervals):
+        # Select boxes within the current x interval
+        # (membership is decided by the box's x_min only)
+        x_interval_indices = (x_start <= x_sorted_boxes[:, 0]) & (
+            x_sorted_boxes[:, 0] < x_end
+        )
+        x_boxes_chunk = x_sorted_boxes[x_interval_indices]
+        x_indices_chunk = x_sorted_indices[x_interval_indices]
+
+        # Sort selected boxes by y_min to prepare for Y-axis projection
+        y_sorted_indices = x_boxes_chunk[:, 1].argsort()
+        y_sorted_boxes_chunk = x_boxes_chunk[y_sorted_indices]
+        y_sorted_indices_chunk = x_indices_chunk[y_sorted_indices]
+
+        # Perform Y-axis projection
+        y_projection = _projection_by_bboxes(boxes=y_sorted_boxes_chunk, axis=1)
+        y_intervals = _split_projection_profile(y_projection, 0, min_gap)
+
+        if not y_intervals:
+            continue
+
+        # If Y-axis cannot be further segmented, add current indices to results
+        if len(y_intervals[0]) == 1:
+            res.extend(y_sorted_indices_chunk)
+            continue
+
+        # Recursively process each segment defined by Y-axis projection
+        # NOTE(review): `min_gap` is not forwarded here, so nested levels fall
+        # back to the default of 1 - confirm this is intentional.
+        for y_start, y_end in zip(*y_intervals):
+            y_interval_indices = (y_start <= y_sorted_boxes_chunk[:, 1]) & (
+                y_sorted_boxes_chunk[:, 1] < y_end
+            )
+            recursive_xy_cut(
+                y_sorted_boxes_chunk[y_interval_indices],
+                y_sorted_indices_chunk[y_interval_indices],
+                res,
+            )
+
+
+def reference_insert(
+    block: LayoutParsingBlock,
+    sorted_blocks: List[LayoutParsingBlock],
+    config: Dict,
+    median_width: float = 0.0,
+):
+    """
+    Insert a reference block after the best-matching sorted block that lies above it.
+
+    Only sorted blocks whose bottom edge is at or above the top edge of `block`
+    are candidates. Among them the one with the largest `x2 * 10 + y2` wins
+    (the first such block on ties), and `block` is inserted directly after it.
+    When there is no candidate, `block` is inserted at index 1 (or appended if
+    `sorted_blocks` is empty).
+
+    Args:
+        block: The block to insert into the sorted blocks.
+        sorted_blocks: The sorted blocks where the new block will be inserted.
+        config: Configuration dictionary containing parameters related to the layout parsing.
+            Not used by this strategy; kept for a uniform insert-function signature.
+        median_width: Median width of the document. Defaults to 0.0. Not used by this strategy.
+
+    Returns:
+        sorted_blocks: The same list, updated in place with `block` inserted.
+    """
+    min_distance = float("inf")
+    nearest_sorted_block_index = 0
+    for sorted_block_idx, sorted_block in enumerate(sorted_blocks):
+        if sorted_block.bbox[3] <= block.bbox[1]:
+            distance = -(sorted_block.bbox[2] * 10 + sorted_block.bbox[3])
+            # The comparison must stay inside the candidate check: outside of
+            # it `distance` is unbound (or stale) for non-candidate blocks.
+            if distance < min_distance:
+                min_distance = distance
+                nearest_sorted_block_index = sorted_block_idx
+
+    sorted_blocks.insert(nearest_sorted_block_index + 1, block)
+    return sorted_blocks
+
+
+def manhattan_insert(
+    block: LayoutParsingBlock,
+    sorted_blocks: List[LayoutParsingBlock],
+    config: Dict,
+    median_width: float = 0.0,
+):
+    """
+    Place a block directly after the sorted block that is closest to it by Manhattan distance.
+
+    Args:
+        block: The block to insert into the sorted blocks.
+        sorted_blocks: The sorted blocks where the new block will be inserted.
+        config: Configuration dictionary containing parameters related to the layout parsing.
+            Not read by this function.
+        median_width: Median width of the document. Defaults to 0.0. Not read by this function.
+
+    Returns:
+        sorted_blocks: The same list, updated in place with `block` inserted.
+    """
+    best_distance, best_index = float("inf"), 0
+    for candidate_index, candidate in enumerate(sorted_blocks):
+        candidate_distance = _manhattan_distance(block.bbox, candidate.bbox)
+        # Strict comparison keeps the earliest candidate on ties.
+        if candidate_distance < best_distance:
+            best_distance, best_index = candidate_distance, candidate_index
+
+    sorted_blocks.insert(best_index + 1, block)
+    return sorted_blocks
+
+
+def weighted_distance_insert(
+    block: LayoutParsingBlock,
+    sorted_blocks: List[LayoutParsingBlock],
+    config: Dict,
+    median_width: float = 0.0,
+):
+    """
+    Insert a block into a sorted list of blocks based on the weighted distance between the block and the nearest sorted block.
+
+    For every sorted block a score is computed from the directional edge
+    distance, the candidate's vertical position and the candidate's horizontal
+    position. The block is inserted before or after the candidate with the
+    smallest score, depending on which of the two starts first in
+    top-to-bottom, left-to-right order.
+
+    Args:
+        block: The block to insert into the sorted blocks.
+        sorted_blocks: The sorted blocks where the new block will be inserted.
+        config: Configuration dictionary containing parameters related to the layout parsing.
+            Optional keys read here: "doc_title_labels", "paragraph_title_labels",
+            "vision_labels", "xy_cut_block_labels", "tolerance_len" (default 2),
+            "edge_weight" (default 10**4), "up_edge_weight" (default 1) and
+            "left_edge_weight" (default 0.0001).
+        median_width: Median width of the document. Defaults to 0.0. Used as the
+            minimum tolerance for document-title blocks.
+
+    Returns:
+        sorted_blocks: The same list, updated in place with `block` inserted.
+    """
+    doc_title_labels = config.get("doc_title_labels", [])
+    paragraph_title_labels = config.get("paragraph_title_labels", [])
+    vision_labels = config.get("vision_labels", [])
+    xy_cut_block_labels = config.get("xy_cut_block_labels", [])
+    edge_weight = config.get("edge_weight", 10**4)
+    up_edge_weight = config.get("up_edge_weight", 1)
+    left_edge_weight = config.get("left_edge_weight", 0.0001)
+
+    # The tolerance depends only on the inserted block, so derive it once.
+    # Adjusting it inside the candidate loop would double it again for every
+    # sorted block when the label is "abstract".
+    tolerance_len = config.get("tolerance_len", 2)
+    if block.label in doc_title_labels:
+        disperse = max(1, median_width)
+        tolerance_len = max(tolerance_len, disperse)
+    is_abstract = block.label == "abstract"
+    if is_abstract:
+        tolerance_len *= 2
+
+    # These labels are measured against the bottom-right corner of a candidate
+    # that lies entirely above the block (see the loop below).
+    anchors_to_block_above = (
+        block.label in xy_cut_block_labels
+        or block.label in doc_title_labels
+        or block.label in paragraph_title_labels
+        or block.label in vision_labels
+    )
+
+    weight = _get_weights(block.region_label, block.direction)
+    x1, y1, _, _ = block.bbox
+    min_weighted_distance = float("inf")
+    min_up_edge_distance = float("inf")
+    nearest_sorted_block_index = 0
+    for sorted_block_idx, sorted_block in enumerate(sorted_blocks):
+
+        x1_prime, y1_prime, x2_prime, y2_prime = sorted_block.bbox
+
+        # Calculate edge distance
+        edge_distance = get_nearest_edge_distance(block.bbox, sorted_block.bbox, weight)
+        if is_abstract:
+            edge_distance = max(0.1, edge_distance) * 10
+
+        # Calculate up edge distances
+        up_edge_distance = y1_prime
+        left_edge_distance = x1_prime
+        if anchors_to_block_above and y1 > y2_prime:
+            up_edge_distance = -y2_prime
+            left_edge_distance = -x2_prime
+
+        # Snap to the best value seen so far when within tolerance, so that
+        # near-equal vertical positions are treated as a tie.
+        if abs(min_up_edge_distance - up_edge_distance) <= tolerance_len:
+            up_edge_distance = min_up_edge_distance
+
+        # Calculate weighted distance
+        weighted_distance = (
+            edge_distance * edge_weight
+            + up_edge_distance * up_edge_weight
+            + left_edge_distance * left_edge_weight
+        )
+
+        min_up_edge_distance = min(up_edge_distance, min_up_edge_distance)
+
+        if weighted_distance < min_weighted_distance:
+            nearest_sorted_block_index = sorted_block_idx
+            min_weighted_distance = weighted_distance
+            # Insert after the candidate when the block starts below it, or on
+            # the same top edge but further right.
+            if y1 > y1_prime or (y1 == y1_prime and x1 > x1_prime):
+                nearest_sorted_block_index = sorted_block_idx + 1
+
+    sorted_blocks.insert(nearest_sorted_block_index, block)
+    return sorted_blocks
+
+
+def insert_child_blocks(
+    block: LayoutParsingBlock,
+    block_idx: int,
+    sorted_blocks: List[LayoutParsingBlock],
+) -> List[LayoutParsingBlock]:
+    """
+    Expand a parent block in the sorted list into itself plus its child blocks.
+
+    The parent and its children are ordered with `sort_child_blocks` using the
+    parent's direction; the first of them replaces the entry at `block_idx`
+    and the remaining ones are inserted right behind it.
+
+    Args:
+        block: The parent block whose child blocks need to be inserted.
+        block_idx: Index at which the parent block exists in the sorted blocks list.
+        sorted_blocks: Sorted blocks list where the child blocks are to be inserted.
+
+    Returns:
+        sorted_blocks: The same list; unchanged when the block has no children.
+    """
+    if not block.child_blocks:
+        return sorted_blocks
+
+    family = block.get_child_blocks()
+    family.append(block)
+    family = sort_child_blocks(family, block.direction)
+
+    sorted_blocks[block_idx] = family[0]
+    for offset, member in enumerate(family[1:], start=1):
+        sorted_blocks.insert(block_idx + offset, member)
+    return sorted_blocks
+
+
+def sort_child_blocks(blocks, direction="horizontal") -> List[LayoutParsingBlock]:
+    """
+    Order child blocks in place by the top-left corner of their bounding box.
+
+    Args:
+        blocks: Child blocks; each one exposes ``bbox`` whose first two entries
+            are x_min and y_min.
+        direction: 'horizontal' sorts ascending by (y_min, x_min); any other
+            value sorts descending by (x_min, y_min).
+    Returns:
+        sorted_blocks: The same list object, sorted.
+    """
+
+    def _corner_norm(item):
+        # squared distance of the top-left corner from (0, 0)
+        return item.bbox[1] ** 2 + item.bbox[0] ** 2
+
+    if direction == "horizontal":
+        # rows first: top to bottom, then left to right
+        blocks.sort(
+            key=lambda item: (item.bbox[1], item.bbox[0], _corner_norm(item)),
+        )
+    else:
+        # columns first, in descending order: right to left
+        blocks.sort(
+            key=lambda item: (item.bbox[0], item.bbox[1], _corner_norm(item)),
+            reverse=True,
+        )
+    return blocks
+
+
+def _get_weights(label, dircetion="horizontal"):
+    """Return the four-element weight list for a block label and orientation."""
+    if label == "doc_title":
+        # left-down ,  right-left
+        if dircetion == "horizontal":
+            return [1, 0.1, 0.1, 1]
+        return [0.2, 0.1, 1, 1]
+
+    down_weighted_labels = (
+        "paragraph_title",
+        "table_title",
+        "abstract",
+        "image",
+        "seal",
+        "chart",
+        "figure",
+    )
+    if label in down_weighted_labels:
+        return [1, 1, 0.1, 1]  # down
+    return [1, 1, 1, 0.1]  # up
+
+
+def _manhattan_distance(
+    point1: Tuple[float, float],
+    point2: Tuple[float, float],
+    weight_x: float = 1.0,
+    weight_y: float = 1.0,
+) -> float:
+    """
+    Compute the per-axis weighted Manhattan (L1) distance between two points.
+
+    Args:
+        point1 (Tuple[float, float]): First point, given as (x, y).
+        point2 (Tuple[float, float]): Second point, given as (x, y).
+        weight_x (float): Multiplier for the x-axis offset. Default is 1.0.
+        weight_y (float): Multiplier for the y-axis offset. Default is 1.0.
+
+    Returns:
+        float: ``weight_x * |dx| + weight_y * |dy|``.
+    """
+    offset_x = abs(point1[0] - point2[0])
+    offset_y = abs(point1[1] - point2[1])
+    return weight_x * offset_x + weight_y * offset_y
+
+
+def sort_blocks(blocks, median_width=None, reverse=False):
+    """
+    Sort blocks in place by coarse row, coarse column and distance to (0,0).
+
+    Args:
+        blocks (list): list of blocks to be sorted; each exposes ``bbox`` whose
+            first two entries are x_min and y_min.
+        median_width (int): the median width of the text blocks, used as the
+            column bucket size. ``None`` or ``0`` falls back to 1.
+        reverse (bool, optional): whether to sort in descending order. Default is False.
+
+    Returns:
+        list: the same list object, sorted.
+    """
+    # A zero bucket size would raise ZeroDivisionError in the floor division
+    # below, so treat it like a missing value.
+    if not median_width:
+        median_width = 1
+    blocks.sort(
+        key=lambda x: (
+            x.bbox[1] // 10,  # y_min, bucketed into rows of 10
+            x.bbox[0] // median_width,  # x_min, bucketed by median width
+            x.bbox[1] ** 2 + x.bbox[0] ** 2,  # distance with (0,0)
+        ),
+        reverse=reverse,
+    )
+    return blocks
+
+
+def get_cut_blocks(
+    blocks, cut_direction, cut_coordinates, overall_region_box, mask_labels=None
+):
+    """
+    Group blocks into consecutive bands delimited by the cut coordinates.
+
+    Blocks are sorted in place by their maximum coordinate on the cut axis and
+    then handed out, band by band, to the first cut coordinate that is not
+    smaller than that maximum. The maximum coordinate of the overall region is
+    always used as the final boundary.
+
+    Args:
+        blocks (list): list of blocks to be cut; sorted in place.
+        cut_direction (str): cut direction, either "horizontal" or "vertical".
+        cut_coordinates (list): list of cut coordinates; left unmodified.
+        overall_region_box (list): the overall region box that contains all blocks.
+        mask_labels (list, optional): region labels to leave out of the groups,
+            e.g. header, footer, unordered and child blocks. Default is None.
+
+    Returns:
+        list: a list of non-empty block lists, one per band, in band order.
+    """
+    masked_labels = () if mask_labels is None else mask_labels
+
+    # 0: horizontal, 1: vertical; "+ 2" selects the max coordinate on that axis
+    cut_axis = 0 if cut_direction == "horizontal" else 1
+    max_coord_idx = cut_axis + 2
+    blocks.sort(key=lambda x: x.bbox[max_coord_idx])
+
+    # Build the boundaries on a fresh collection so the caller's list is not
+    # extended as a side effect.
+    boundaries = sorted(set(cut_coordinates) | {overall_region_box[max_coord_idx]})
+
+    cut_list = []
+    block_idx = 0
+    for boundary in boundaries:
+        group_blocks = []
+        while block_idx < len(blocks):
+            block = blocks[block_idx]
+            if block.bbox[max_coord_idx] > boundary:
+                break
+            if block.region_label not in masked_labels:
+                group_blocks.append(block)
+            block_idx += 1
+        if group_blocks:
+            cut_list.append(group_blocks)
+
+    return cut_list
+
+
+def split_sub_region_blocks(
+    blocks: List[LayoutParsingBlock],
+    config: Dict,
+) -> List:
+    """
+    Split blocks into sub regions based on the all layout region bbox.
+
+    A region that is taller than it is wide is returned as a single sub region.
+    Otherwise the blocks are projected and split wherever two consecutive
+    projection intervals are more than 100 apart. When no such gap exists the
+    whole region is returned as a single sub region.
+
+    Args:
+        blocks (List[LayoutParsingBlock]): A list of blocks.
+        config (Dict): Configuration dictionary; "all_layout_region_box" must
+            hold the region as [x1, y1, x2, y2].
+    Returns:
+        List: A list of (blocks, region_box) tuples, one per sub region.
+    """
+
+    region_bbox = config.get("all_layout_region_box", None)
+    x1, y1, x2, y2 = region_bbox
+    region_width = x2 - x1
+    region_height = y2 - y1
+
+    if region_width < region_height:
+        return [(blocks, region_bbox)]
+
+    all_boxes = np.array([block.bbox for block in blocks])
+    discontinuous = calculate_discontinuous_projection(all_boxes, direction="vertical")
+    if len(discontinuous) <= 1:
+        return [(blocks, region_bbox)]
+
+    cut_coordinates = []
+    region_boxes = []
+    previous_end = discontinuous[0][1]
+    # NOTE(review): the intervals come from the "vertical" projection but are
+    # stored in the x slots of the sub region boxes - confirm this is intended.
+    for start, end in discontinuous[1:]:
+        if start - previous_end > 100:
+            cut_coordinates.extend([start, end])
+            region_boxes.append([start, y1, end, y2])
+        previous_end = end
+
+    # Without any wide gap there is nothing to split on; pairing the groups
+    # with an empty box list would silently drop every block.
+    if not region_boxes:
+        return [(blocks, region_bbox)]
+
+    region_blocks = get_cut_blocks(blocks, "vertical", cut_coordinates, region_bbox)
+
+    # NOTE(review): zip stops at the shorter sequence, so surplus groups or
+    # boxes are discarded - confirm the two are expected to line up.
+    return list(zip(region_blocks, region_boxes))
+
+
+def get_adjacent_blocks_by_direction(
+    blocks: List[LayoutParsingBlock],
+    block_idx: int,
+    ref_block_idxes: List[int],
+    iou_threshold,
+) -> List:
+    """
+    Find the nearest preceding and following reference blocks of a block.
+
+    Reference blocks already tagged as children (vision footnote, sub paragraph
+    title, doc title text, vision title) are ignored. A reference block is only
+    considered when its projection IoU with the current block reaches
+    ``iou_threshold``.
+
+    Args:
+        blocks (List[LayoutParsingBlock]): A list of all blocks.
+        block_idx (int): Index of the current block inside ``blocks``.
+        ref_block_idxes (List[int]): A list of indices of reference blocks.
+        iou_threshold (float): The IOU threshold to determine if two blocks are considered adjacent.
+    Returns:
+        A ``(prev_block_index, post_block_index)`` pair:
+        Int: The index of the previous block, or None if there is none.
+        Int: The index of the following block, or None if there is none.
+    """
+    min_prev_block_distance = float("inf")
+    prev_block_index = None
+    min_post_block_distance = float("inf")
+    post_block_index = None
+    block = blocks[block_idx]
+    # region labels that mark a block as already attached to a parent
+    child_labels = [
+        "vision_footnote",
+        "sub_paragraph_title",
+        "doc_title_text",
+        "vision_title",
+    ]
+
+    # find the nearest text block with same direction to the current block
+    for ref_block_idx in ref_block_idxes:
+        ref_block = blocks[ref_block_idx]
+        ref_block_direction = ref_block.direction
+        if ref_block.region_label in child_labels:
+            continue
+        # the projection is taken along the reference block's direction
+        match_block_iou = calculate_projection_iou(
+            block.bbox,
+            ref_block.bbox,
+            ref_block_direction,
+        )
+
+        # slack allowed when deciding on which side the reference block lies
+        child_match_distance_tolerance_len = block.short_side_length / 10
+
+        # largest gap at which a neighbour is still accepted
+        if block.region_label == "vision":
+            if ref_block.num_of_lines == 1:
+                gap_tolerance_len = ref_block.short_side_length * 2
+            else:
+                gap_tolerance_len = block.short_side_length / 10
+        else:
+            gap_tolerance_len = block.short_side_length * 2
+
+        if match_block_iou >= iou_threshold:
+            # Distances are bucketed in steps of 5; the small start_coordinate
+            # term presumably breaks ties between candidates in one bucket.
+            prev_distance = (
+                block.secondary_direction_start_coordinate
+                - ref_block.secondary_direction_end_coordinate
+                + child_match_distance_tolerance_len
+            ) // 5 + ref_block.start_coordinate / 5000
+            next_distance = (
+                ref_block.secondary_direction_start_coordinate
+                - block.secondary_direction_end_coordinate
+                + child_match_distance_tolerance_len
+            ) // 5 + ref_block.start_coordinate / 5000
+            # reference block ends before the current block starts
+            if (
+                ref_block.secondary_direction_end_coordinate
+                <= block.secondary_direction_start_coordinate
+                + child_match_distance_tolerance_len
+                and prev_distance < min_prev_block_distance
+            ):
+                # the minimum is recorded even when the gap check below fails
+                min_prev_block_distance = prev_distance
+                if (
+                    block.secondary_direction_start_coordinate
+                    - ref_block.secondary_direction_end_coordinate
+                    < gap_tolerance_len
+                ):
+                    prev_block_index = ref_block_idx
+            # reference block starts after the current block ends
+            elif (
+                ref_block.secondary_direction_start_coordinate
+                > block.secondary_direction_end_coordinate
+                - child_match_distance_tolerance_len
+                and next_distance < min_post_block_distance
+            ):
+                # the minimum is recorded even when the gap check below fails
+                min_post_block_distance = next_distance
+                if (
+                    ref_block.secondary_direction_start_coordinate
+                    - block.secondary_direction_end_coordinate
+                    < gap_tolerance_len
+                ):
+                    post_block_index = ref_block_idx
+
+    diff_dist = abs(min_prev_block_distance - min_post_block_distance)
+
+    # if the difference in distance is too large, only consider the nearest one
+    if diff_dist * 5 > block.short_side_length:
+        if min_prev_block_distance < min_post_block_distance:
+            post_block_index = None
+        else:
+            prev_block_index = None
+
+    return prev_block_index, post_block_index
+
+
+def update_doc_title_child_blocks(
+    blocks: List[LayoutParsingBlock],
+    block: LayoutParsingBlock,
+    prev_idx: int,
+    post_idx: int,
+    config: dict,
+) -> None:
+    """
+    Attach neighbouring text blocks to a document title block as children.
+
+    A candidate (the previous or the following block) is attached when:
+        1. It has the same direction as the title block.
+        2. Its short side length is less than 80% of the title's short side length.
+        3. Its long side length is either less than the title's long side
+           length or greater than 150% of it.
+        4. It has fewer than 3 lines.
+
+    An attached candidate gets the region label "doc_title_text", is appended
+    to the title's child blocks and its index is removed from
+    ``config["text_block_idxes"]``.
+
+    Args:
+        blocks (List[LayoutParsingBlock]): overall blocks.
+        block (LayoutParsingBlock): document title block.
+        prev_idx (int): previous block index, None if not exist.
+        post_idx (int): post block index, None if not exist.
+        config (dict): configurations.
+
+    Returns:
+        None
+
+    """
+    for idx in (prev_idx, post_idx):
+        if idx is None:
+            continue
+        candidate = blocks[idx]
+
+        same_direction = candidate.direction == block.direction
+        is_thinner = candidate.short_side_length < block.short_side_length * 0.8
+        long_side_fits = (
+            candidate.long_side_length < block.long_side_length
+            or candidate.long_side_length > 1.5 * block.long_side_length
+        )
+
+        if not (same_direction and is_thinner and long_side_fits):
+            continue
+        if candidate.num_of_lines < 3:
+            candidate.region_label = "doc_title_text"
+            block.append_child_block(candidate)
+            config["text_block_idxes"].remove(idx)
+
+
+def update_paragraph_title_child_blocks(
+    blocks: List[LayoutParsingBlock],
+    block: LayoutParsingBlock,
+    prev_idx: int,
+    post_idx: int,
+    config: dict,
+) -> None:
+    """
+    Attach neighbouring paragraph titles to a paragraph title block as children.
+
+    A candidate (the previous or the following block) is attached when:
+        1. It has the same direction as the parent block.
+        2. Its label is one of ``config["paragraph_title_labels"]``.
+
+    An attached candidate gets the region label "sub_paragraph_title", is
+    appended to the parent's child blocks and its index is removed from
+    ``config["paragraph_title_block_idxes"]``.
+
+    Args:
+        blocks (List[LayoutParsingBlock]): overall blocks.
+        block (LayoutParsingBlock): paragraph title block.
+        prev_idx (int): previous block index, None if not exist.
+        post_idx (int): post block index, None if not exist.
+        config (dict): configurations.
+
+    Returns:
+        None
+
+    """
+    title_labels = config.get("paragraph_title_labels", [])
+    for idx in (prev_idx, post_idx):
+        if idx is None:
+            continue
+        candidate = blocks[idx]
+        same_direction = candidate.direction == block.direction
+        if not same_direction:
+            continue
+        if candidate.label in title_labels:
+            candidate.region_label = "sub_paragraph_title"
+            block.append_child_block(candidate)
+            config["paragraph_title_block_idxes"].remove(idx)
+
+
+def update_vision_child_blocks(
+    blocks: List[LayoutParsingBlock],
+    block: LayoutParsingBlock,
+    ref_block_idxes: List[int],
+    prev_idx: int,
+    post_idx: int,
+    config: dict,
+) -> None:
+    """
+    Update the child blocks of a vision block.
+
+    Each candidate (the previous or the following block) is checked as follows:
+    - As a vision title (its label is in ``config["vision_title_labels"]``):
+        1. The nearest edge distance to the parent must not exceed the smaller
+           of half the parent's height and twice the candidate's height.
+    - Otherwise, as a vision footnote:
+        1. The nearest edge distance to the parent must not exceed 15.
+        2. The child short_side_length should be less than the parent's short side length.
+        3. The child long_side_length should be less than 50% of the parent's long side length.
+        4. The child must have the same direction as the parent.
+        5. Either the x centers differ by less than 10, or the child has one
+           line and its left or right edge passes the edge comparison below.
+        6. The parent must not yet have a child whose label is in
+           ``config["text_labels"]``.
+
+    Args:
+        blocks (List[LayoutParsingBlock]): overall blocks.
+        block (LayoutParsingBlock): vision block.
+        ref_block_idxes (List[int]): A list of indices of reference blocks;
+            not used by the body.
+        prev_idx (int): previous block index, None if not exist.
+        post_idx (int): post block index, None if not exist.
+        config (dict): configurations.
+
+    Returns:
+        None
+
+    """
+    vision_title_labels = config.get("vision_title_labels", [])
+    text_labels = config.get("text_labels", [])
+    for idx in [prev_idx, post_idx]:
+        if idx is None:
+            continue
+        ref_block = blocks[idx]
+        nearest_edge_distance = get_nearest_edge_distance(block.bbox, ref_block.bbox)
+        block_center = block.get_centroid()
+        ref_block_center = ref_block.get_centroid()
+        # vision title: close enough relative to both heights
+        if ref_block.label in vision_title_labels and nearest_edge_distance <= min(
+            block.height * 0.5, ref_block.height * 2
+        ):
+            ref_block.region_label = "vision_title"
+            block.append_child_block(ref_block)
+            config["vision_title_block_idxes"].remove(idx)
+        # vision footnote: small, close, same direction and aligned
+        # NOTE(review): the two edge differences below are signed (no abs()),
+        # so a candidate whose edge lies far to the right of the parent's edge
+        # also passes - confirm this is intended.
+        elif (
+            nearest_edge_distance <= 15
+            and ref_block.short_side_length < block.short_side_length
+            and ref_block.long_side_length < 0.5 * block.long_side_length
+            and ref_block.direction == block.direction
+            and (
+                abs(block_center[0] - ref_block_center[0]) < 10
+                or (
+                    block.bbox[0] - ref_block.bbox[0] < 10
+                    and ref_block.num_of_lines == 1
+                )
+                or (
+                    block.bbox[2] - ref_block.bbox[2] < 10
+                    and ref_block.num_of_lines == 1
+                )
+            )
+        ):
+            # only one text child (footnote) is attached per vision block
+            has_vision_footnote = False
+            if len(block.child_blocks) > 0:
+                for child_block in block.child_blocks:
+                    if child_block.label in text_labels:
+                        has_vision_footnote = True
+            if not has_vision_footnote:
+                ref_block.region_label = "vision_footnote"
+                block.append_child_block(ref_block)
+                config["text_block_idxes"].remove(idx)
+
+
+def calculate_discontinuous_projection(boxes, direction="horizontal") -> List:
+    """
+    Calculate the discontinuous projection of boxes along the specified direction.
+
+    The boxes are projected onto one axis and every run of overlapping or
+    touching intervals is merged into a single interval.
+
+    Args:
+        boxes (ndarray): Array of bounding boxes represented by [[x_min, y_min, x_max, y_max]].
+            A nested list is accepted as well.
+        direction (str): Direction along which to perform the projection ('horizontal' or 'vertical').
+            'horizontal' uses the x extents, 'vertical' uses the y extents.
+
+    Returns:
+        list: List of (start, end) tuples representing the merged intervals,
+            ordered by start; empty when no boxes are given.
+
+    Raises:
+        ValueError: If ``direction`` is neither 'horizontal' nor 'vertical'.
+    """
+    if direction == "horizontal":
+        columns = [0, 2]
+    elif direction == "vertical":
+        columns = [1, 3]
+    else:
+        raise ValueError("Direction must be 'horizontal' or 'vertical'")
+
+    boxes = np.asarray(boxes)
+    # An empty array is one-dimensional, so the column indexing below would
+    # fail; no boxes simply means no intervals.
+    if boxes.size == 0:
+        return []
+
+    intervals = boxes[:, columns]
+    intervals = intervals[np.argsort(intervals[:, 0])]
+
+    merged_intervals = []
+    current_start, current_end = intervals[0]
+
+    for start, end in intervals[1:]:
+        if start <= current_end:
+            # overlapping or touching: extend the running interval
+            current_end = max(current_end, end)
+        else:
+            merged_intervals.append((current_start, current_end))
+            current_start, current_end = start, end
+
+    merged_intervals.append((current_start, current_end))
+    return merged_intervals
+
+
+def shrink_overlapping_boxes(
+    boxes, direction="horizontal", min_threshold=0, max_threshold=0.1
+) -> List:
+    """
+    Shrink overlapping boxes along the specified direction.
+
+    Consecutive blocks are compared pairwise. When a pair overlaps only
+    slightly along the cut direction, or the end of the first lies within 3 of
+    the start of the second, both bounding boxes are trimmed around the middle
+    of the overlap so that a gap of 2 separates them.
+
+    Args:
+        boxes (list): Blocks exposing a ``bbox`` attribute of the form
+            [x_min, y_min, x_max, y_max]; the bboxes are updated in place.
+        direction (str): Direction along which to perform the shrinking ('horizontal' or 'vertical').
+            'vertical' trims the y extents; any other value trims the x extents.
+        min_threshold (float): Minimum threshold for shrinking. Default is 0.
+        max_threshold (float): Maximum threshold for shrinking. Default is 0.1.
+
+    Returns:
+        list: The same ``boxes`` sequence that was passed in.
+    """
+    # Nothing to compare; indexing the first element would raise.
+    if len(boxes) == 0:
+        return boxes
+
+    # offset of the trimmed coordinate inside a bbox: 1 -> y, 0 -> x
+    axis = 1 if direction == "vertical" else 0
+    match_direction = "horizontal" if direction == "vertical" else "vertical"
+
+    current_block = boxes[0]
+    for block in boxes[1:]:
+        current_bbox = list(current_block.bbox)
+        next_bbox = list(block.bbox)
+        cut_iou = calculate_projection_iou(
+            current_block.bbox, block.bbox, direction=direction
+        )
+        match_iou = calculate_projection_iou(
+            current_block.bbox,
+            block.bbox,
+            direction=match_direction,
+        )
+        slight_overlap = match_iou > 0 and min_threshold < cut_iou < max_threshold
+        # boxes that touch or nearly touch are separated as well
+        nearly_touching = abs(current_bbox[axis + 2] - next_bbox[axis]) <= 3
+        if slight_overlap or nearly_touching:
+            overlap_min = max(current_bbox[axis], next_bbox[axis])
+            overlap_max = min(current_bbox[axis + 2], next_bbox[axis + 2])
+            split = int((overlap_min + overlap_max) / 2)
+            # leave one unit on each side of the split line
+            current_bbox[axis + 2] = split - 1
+            next_bbox[axis] = split + 1
+            current_block.bbox = current_bbox
+            block.bbox = next_bbox
+        current_block = block
+    return boxes

+ 512 - 0
paddlex/inference/pipelines/layout_parsing/xycut_enhanced/xycuts.py

@@ -0,0 +1,512 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Dict, List, Tuple
+
+import numpy as np
+
+from ..result_v2 import LayoutParsingBlock
+from .utils import (
+    calculate_discontinuous_projection,
+    calculate_iou,
+    calculate_projection_iou,
+    get_adjacent_blocks_by_direction,
+    get_cut_blocks,
+    insert_child_blocks,
+    manhattan_insert,
+    recursive_xy_cut,
+    recursive_yx_cut,
+    reference_insert,
+    shrink_overlapping_boxes,
+    sort_blocks,
+    update_doc_title_child_blocks,
+    update_paragraph_title_child_blocks,
+    update_vision_child_blocks,
+    weighted_distance_insert,
+)
+
+
+def pre_process(
+    blocks: List[LayoutParsingBlock],
+    config: Dict,
+) -> Tuple[List, List, List, List]:
+    """
+    Preprocess the layout for sorting purposes.
+
+    This function performs two main tasks:
+    1. Pre-cuts the layout into groups, along centered blocks and wide gaps.
+    2. Match the blocks with their children.
+
+    Args:
+        blocks (List[LayoutParsingBlock]): A list of LayoutParsingBlock objects representing the layout.
+            Region labels are updated and the list may be re-sorted in place.
+        config (Dict): Configuration parameters that include settings for pre-cutting and sorting.
+
+    Returns:
+        Tuple: ``(header_blocks, pre_cut_list, footer_blocks, unordered_blocks)``,
+            where ``pre_cut_list`` is a list of pre-cutted layout blocks lists.
+            ``pre_cut_list`` is empty when no block is left after masking.
+    """
+    region_bbox = config.get("all_layout_region_box", None)
+    region_x_center = (region_bbox[0] + region_bbox[2]) / 2
+    region_y_center = (region_bbox[1] + region_bbox[3]) / 2
+
+    def _collect(config_key, region_label):
+        # tag the blocks listed under config_key and return them in order
+        collected = []
+        for idx in config.get(config_key, []):
+            blocks[idx].region_label = region_label
+            collected.append(blocks[idx])
+        return collected
+
+    header_blocks = _collect("header_block_idxes", "header")
+    unordered_blocks = _collect("unordered_block_idxes", "unordered")
+    footer_blocks = _collect("footer_block_idxes", "footer")
+
+    mask_labels = ["header", "unordered", "footer"]
+    child_labels = [
+        "vision_footnote",
+        "sub_paragraph_title",
+        "doc_title_text",
+        "vision_title",
+    ]
+    pre_cut_block_idxes = []
+    for block_idx, block in enumerate(blocks):
+        if block.label in mask_labels:
+            continue
+
+        if block.region_label not in child_labels:
+            update_region_label(blocks, config, block_idx)
+
+        # a block centered in the region is a candidate cut position
+        block_direction = block.direction
+        if block_direction == "horizontal":
+            region_bbox_center = region_x_center
+            tolerance_len = block.long_side_length // 5
+        else:
+            region_bbox_center = region_y_center
+            tolerance_len = block.short_side_length // 10
+
+        block_center = (block.start_coordinate + block.end_coordinate) / 2
+        center_offset = abs(block_center - region_bbox_center)
+        is_centered = center_offset <= tolerance_len
+
+        if is_centered:
+            pre_cut_block_idxes.append(block_idx)
+
+    pre_cut_list = []
+    cut_direction = "vertical"
+    cut_coordinates = []
+    discontinuous = []
+    mask_labels = child_labels + mask_labels
+    all_boxes = np.array(
+        [block.bbox for block in blocks if block.region_label not in mask_labels]
+    )
+    if len(all_boxes) == 0:
+        # Every block is masked (or there are none): there are no projection
+        # intervals to cut on, and indexing the first interval would fail.
+        return header_blocks, pre_cut_list, footer_blocks, unordered_blocks
+
+    if pre_cut_block_idxes:
+        # majority vote of the centered blocks decides the cut direction
+        horizontal_cut_num = 0
+        for block_idx in pre_cut_block_idxes:
+            block = blocks[block_idx]
+            horizontal_cut_num += 1 if block.secondary_direction == "horizontal" else 0
+        cut_direction = (
+            "horizontal"
+            if horizontal_cut_num > len(pre_cut_block_idxes) * 0.5
+            else "vertical"
+        )
+        discontinuous = calculate_discontinuous_projection(
+            all_boxes, direction=cut_direction
+        )
+        for idx in pre_cut_block_idxes:
+            block = blocks[idx]
+            if (
+                block.region_label not in mask_labels
+                and block.secondary_direction == cut_direction
+            ):
+                # cut around a centered block only if it forms a projection
+                # interval on its own
+                if (
+                    block.secondary_direction_start_coordinate,
+                    block.secondary_direction_end_coordinate,
+                ) in discontinuous:
+                    cut_coordinates.append(block.secondary_direction_start_coordinate)
+                    cut_coordinates.append(block.secondary_direction_end_coordinate)
+    if not discontinuous:
+        discontinuous = calculate_discontinuous_projection(
+            all_boxes, direction=cut_direction
+        )
+    # additionally cut wherever two projection intervals are more than 40 apart
+    current_interval = discontinuous[0]
+    for interval in discontinuous[1:]:
+        gap_len = interval[0] - current_interval[1]
+        if gap_len > 40:
+            cut_coordinates.append(current_interval[1])
+        current_interval = interval
+    overall_region_box = config.get("all_layout_region_box")
+    cut_list = get_cut_blocks(
+        blocks, cut_direction, cut_coordinates, overall_region_box, mask_labels
+    )
+    pre_cut_list.extend(cut_list)
+
+    return header_blocks, pre_cut_list, footer_blocks, unordered_blocks
+
+
+def update_region_label(
+    blocks: List[LayoutParsingBlock], config: Dict[str, Any], block_idx: int
+) -> None:
+    """
+    Assign a region label to one block and attach its matching child blocks.
+
+    Args:
+        blocks (List[LayoutParsingBlock]): The list of blocks to process.
+        config (Dict[str, Any]): The configuration dictionary holding the label
+            groups, the candidate index lists and the IoU thresholds.
+        block_idx (int): The index of the current block being processed.
+
+    Returns:
+        None
+    """
+    block = blocks[block_idx]
+
+    if block.label in config.get("doc_title_labels", []):
+        block.region_label = "doc_title"
+    if block.label in config.get("vision_labels", []):
+        block.region_label = "vision"
+        # refresh the direction info after relabeling (the original note says
+        # this forces vision blocks to be horizontal)
+        block.update_direction_info()
+    # a paragraph title that was already relabeled (e.g. as a sub title) keeps
+    # that label; only the default "other" is replaced
+    if (
+        block.label in config.get("paragraph_title_labels", [])
+        and block.region_label == "other"
+    ):
+        block.region_label = "paragraph_title"
+
+    # only doc title, paragraph title and vision blocks can have child blocks
+    region_label = block.region_label
+    if region_label == "doc_title":
+        # candidates: plain text blocks
+        threshold = config.get("child_block_match_iou_threshold", 0.1)
+        candidate_idxes = config.get("text_block_idxes", [])
+        prev_idx, post_idx = get_adjacent_blocks_by_direction(
+            blocks, block_idx, candidate_idxes, threshold
+        )
+        update_doc_title_child_blocks(blocks, block, prev_idx, post_idx, config)
+    elif region_label == "paragraph_title":
+        # candidates: text blocks followed by the other paragraph titles
+        threshold = config.get("sub_title_match_iou_threshold", 0.1)
+        title_idxes = config.get("paragraph_title_block_idxes", [])
+        candidate_idxes = config.get("text_block_idxes", []) + title_idxes
+        prev_idx, post_idx = get_adjacent_blocks_by_direction(
+            blocks, block_idx, candidate_idxes, threshold
+        )
+        update_paragraph_title_child_blocks(blocks, block, prev_idx, post_idx, config)
+    elif region_label == "vision":
+        # candidates: text blocks (footnotes) followed by vision titles
+        threshold = config.get("child_block_match_iou_threshold", 0.1)
+        vision_title_idxes = config.get("vision_title_block_idxes", [])
+        candidate_idxes = config.get("text_block_idxes", []) + vision_title_idxes
+        # a vision block may own several titles/footnotes, so match repeatedly
+        for _ in range(3):
+            prev_idx, post_idx = get_adjacent_blocks_by_direction(
+                blocks, block_idx, candidate_idxes, threshold
+            )
+            update_vision_child_blocks(
+                blocks, block, candidate_idxes, prev_idx, post_idx, config
+            )
+
+
+def get_layout_structure(
+    blocks: List[LayoutParsingBlock],
+    median_width: float,
+    config: dict,
+    threshold: float = 0.8,
+) -> None:
+    """
+    Determine the layout cross column of blocks.
+
+    The blocks are sorted in place by (x_min, width) and the ``region_label``
+    of every block that spans several columns is set to "cross_text" (or
+    "cross_reference" for blocks labelled "reference").
+
+    Args:
+        blocks (List[LayoutParsingBlock]): Blocks to analyse; modified in place.
+        median_width (float): Median width of text blocks.
+        config (dict): Configuration; "doc_title_labels" and
+            "all_layout_region_box" are read from it.
+        threshold (float): Threshold for determining layout overlap.
+
+    Returns:
+        None
+    """
+    blocks.sort(
+        key=lambda x: (x.bbox[0], x.width),
+    )
+    check_single_layout = {}
+
+    doc_title_labels = config.get("doc_title_labels", [])
+    region_box = config.get("all_layout_region_box", [0, 0, 0, 0])
+    region_bbox_center = (region_box[0] + region_box[2]) / 2
+    for block_idx, block in enumerate(blocks):
+        cover_count = 0
+        match_block_with_threshold_indexes = []
+
+        for ref_idx, ref_block in enumerate(blocks):
+            if block_idx == ref_idx:
+                continue
+
+            # a vision block, or the smaller block of an overlapping pair, is
+            # marked as cross layout straight away
+            bbox_iou = calculate_iou(block.bbox, ref_block.bbox)
+            if bbox_iou > 0:
+                if block.region_label == "vision" or block.area < ref_block.area:
+                    block.region_label = "cross_text"
+                    break
+
+            match_projection_iou = calculate_projection_iou(
+                block.bbox,
+                ref_block.bbox,
+                "horizontal",
+            )
+
+            if match_projection_iou > 0:
+                cover_count += 1
+                if match_projection_iou > threshold:
+                    match_block_with_threshold_indexes.append(
+                        (ref_idx, match_projection_iou),
+                    )
+                # blocks are sorted by x_min, so stop once a reference block
+                # reaches past this block's right edge
+                if ref_block.bbox[2] >= block.bbox[2]:
+                    break
+
+        block_center = (block.bbox[0] + block.bbox[2]) / 2
+        center_offset = abs(block_center - region_bbox_center)
+        is_centered = center_offset <= median_width * 0.05
+        width_gather_than_median = block.width > median_width * 1.3
+
+        if (
+            cover_count >= 2
+            and block.label not in doc_title_labels
+            and (width_gather_than_median != is_centered)
+        ):
+            block.region_label = (
+                "cross_reference" if block.label == "reference" else "cross_text"
+            )
+        else:
+            check_single_layout[block_idx] = match_block_with_threshold_indexes
+
+    # Check single-layout block
+    for idx, single_layout in check_single_layout.items():
+        if single_layout:
+            index, match_iou = single_layout[-1]
+            if match_iou > 0.9 and blocks[index].region_label == "cross_text":
+                # Use the block being relabelled, not the variable left over
+                # from the loop above (which is always the last block).
+                target_block = blocks[idx]
+                target_block.region_label = (
+                    "cross_reference"
+                    if target_block.label == "reference"
+                    else "cross_text"
+                )
+
+
+def sort_by_xycut(
+    block_bboxes: List,
+    direction: int = 0,
+    min_gap: int = 1,
+) -> List[int]:
+    """
+    Sort bounding boxes using recursive XY cut method based on the specified direction.
+
+    Args:
+        block_bboxes (Union[np.ndarray, List[List[int]]]): An array or list of bounding boxes,
+                                                           where each box is represented as
+                                                           [x_min, y_min, x_max, y_max].
+        direction (int): Direction for the initial cut. Use 1 for Y-axis first and 0 for X-axis first.
+                         Defaults to 0.
+        min_gap (int): Minimum gap width to consider a separation between segments. Defaults to 1.
+
+    Returns:
+        List[int]: A list of indices representing the order of sorted bounding boxes.
+    """
+    block_bboxes = np.asarray(block_bboxes).astype(int)
+    res = []
+    if direction == 1:
+        recursive_yx_cut(
+            block_bboxes,
+            np.arange(len(block_bboxes)).tolist(),
+            res,
+            min_gap,
+        )
+    else:
+        recursive_xy_cut(
+            block_bboxes,
+            np.arange(len(block_bboxes)).tolist(),
+            res,
+            min_gap,
+        )
+    return res
+
+
+def match_unsorted_blocks(
+    sorted_blocks: List[LayoutParsingBlock],
+    unsorted_blocks: List[LayoutParsingBlock],
+    config: Dict,
+    median_width: int,
+) -> List[LayoutParsingBlock]:
+    """
+    Match special blocks with the sorted blocks based on their region labels.
+    Args:
+        sorted_blocks (List[LayoutParsingBlock]): Sorted blocks to be matched.
+        unsorted_blocks (List[LayoutParsingBlock]): Unsorted blocks to be matched.
+        config (Dict): Configuration dictionary containing various parameters.
+        median_width (int): Median width value used for calculations.
+
+    Returns:
+        List[LayoutParsingBlock]: The updated sorted blocks after matching special blocks.
+    """
+    distance_type_map = {
+        "cross_text": weighted_distance_insert,
+        "paragraph_title": weighted_distance_insert,
+        "doc_title": weighted_distance_insert,
+        "vision_title": weighted_distance_insert,
+        "vision": weighted_distance_insert,
+        "cross_reference": reference_insert,
+        "unordered": manhattan_insert,
+        "other": manhattan_insert,
+    }
+
+    unsorted_blocks = sort_blocks(unsorted_blocks, median_width, reverse=False)
+    for idx, block in enumerate(unsorted_blocks):
+        region_label = block.region_label
+        if idx == 0 and region_label == "doc_title":
+            sorted_blocks.insert(0, block)
+            continue
+        sorted_blocks = distance_type_map[region_label](
+            block, sorted_blocks, config, median_width
+        )
+    return sorted_blocks
+
+
+def xycut_enhanced(
+    blocks: List[LayoutParsingBlock], config: Dict
+) -> List[LayoutParsingBlock]:
+    """
+    xycut_enhance function performs the following steps:
+        1. Preprocess the input blocks by extracting headers, footers, and pre-cut blocks.
+        2. Mask blocks that are crossing different blocks.
+        3. Perform xycut_enhanced algorithm on the remaining blocks.
+        4. Match special blocks with the sorted blocks based on their region labels.
+        5. Update child blocks of the sorted blocks based on their parent blocks.
+        6. Return the ordered result list.
+
+    Args:
+        blocks (List[LayoutParsingBlock]): Input blocks to be processed.
+
+    Returns:
+        List[LayoutParsingBlock]: Ordered result list after processing.
+    """
+    if len(blocks) == 0:
+        return blocks
+
+    text_labels = config.get("text_labels", [])
+    header_blocks, pre_cut_list, footer_blocks, unordered_blocks = pre_process(
+        blocks, config
+    )
+    final_order_res_list: List[LayoutParsingBlock] = []
+
+    header_blocks = sort_blocks(header_blocks)
+    footer_blocks = sort_blocks(footer_blocks)
+    unordered_blocks = sort_blocks(unordered_blocks)
+
+    final_order_res_list.extend(header_blocks)
+
+    unsorted_blocks: List[LayoutParsingBlock] = []
+    sorted_blocks_by_pre_cuts = []
+    for pre_cut_blocks in pre_cut_list:
+        sorted_blocks: List[LayoutParsingBlock] = []
+        doc_title_blocks: List[LayoutParsingBlock] = []
+        xy_cut_blocks: List[LayoutParsingBlock] = []
+        pre_cut_blocks: List[LayoutParsingBlock]
+        median_width = 1
+        text_block_width = [
+            block.width for block in pre_cut_blocks if block.label in text_labels
+        ]
+        if len(text_block_width) > 0:
+            median_width = int(np.median(text_block_width))
+
+        get_layout_structure(
+            pre_cut_blocks,
+            median_width,
+            config,
+        )
+
+        # Get xy cut blocks and add other blocks in special_block_map
+        for block in pre_cut_blocks:
+            if block.region_label not in [
+                "cross_text",
+                "cross_reference",
+                "doc_title",
+                "unordered",
+            ]:
+                xy_cut_blocks.append(block)
+            elif block.label == "doc_title":
+                doc_title_blocks.append(block)
+            else:
+                unsorted_blocks.append(block)
+
+        if len(xy_cut_blocks) > 0:
+            block_bboxes = np.array([block.bbox for block in xy_cut_blocks])
+            block_text_lines = [block.num_of_lines for block in xy_cut_blocks]
+            discontinuous = calculate_discontinuous_projection(
+                block_bboxes, direction="horizontal"
+            )
+            if len(discontinuous) == 1 or max(block_text_lines) == 1:
+                xy_cut_blocks.sort(key=lambda x: (x.bbox[1] // 5, x.bbox[0]))
+                xy_cut_blocks = shrink_overlapping_boxes(xy_cut_blocks, "vertical")
+                block_bboxes = np.array([block.bbox for block in xy_cut_blocks])
+                sorted_indexes = sort_by_xycut(block_bboxes, direction=1, min_gap=1)
+            else:
+                xy_cut_blocks.sort(key=lambda x: (x.bbox[0] // 20, x.bbox[1]))
+                xy_cut_blocks = shrink_overlapping_boxes(xy_cut_blocks, "horizontal")
+                block_bboxes = np.array([block.bbox for block in xy_cut_blocks])
+                sorted_indexes = sort_by_xycut(block_bboxes, direction=0, min_gap=20)
+
+            sorted_blocks = [xy_cut_blocks[i] for i in sorted_indexes]
+
+        sorted_blocks = match_unsorted_blocks(
+            sorted_blocks,
+            doc_title_blocks,
+            config,
+            median_width,
+        )
+
+        sorted_blocks_by_pre_cuts.extend(sorted_blocks)
+
+    median_width = 1
+    text_block_width = [block.width for block in blocks if block.label in text_labels]
+    if len(text_block_width) > 0:
+        median_width = int(np.median(text_block_width))
+    final_order_res_list = match_unsorted_blocks(
+        sorted_blocks_by_pre_cuts,
+        unsorted_blocks,
+        config,
+        median_width,
+    )
+
+    final_order_res_list.extend(footer_blocks)
+    final_order_res_list.extend(unordered_blocks)
+
+    index = 0
+    visualize_index_labels = config.get("visualize_index_labels", [])
+    for block_idx, block in enumerate(final_order_res_list):
+        if block.label not in visualize_index_labels:
+            continue
+        final_order_res_list = insert_child_blocks(
+            block, block_idx, final_order_res_list
+        )
+        block = final_order_res_list[block_idx]
+        index += 1
+        block.index = index
+    return final_order_res_list