zhengchun
/
PaddleX


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956
							# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

__all__ = [
    "get_sub_regions_ocr_res",
    "get_layout_ordering",
    "get_single_block_parsing_res",
    "recursive_img_array2path",
    "get_show_color",
    "sorted_layout_boxes",
]

import numpy as np
import copy
import cv2
import uuid
from pathlib import Path
from typing import List
from ..ocr.result import OCRResult
from ...models.object_detection.result import DetResult


def get_overlap_boxes_idx(src_boxes: np.ndarray, ref_boxes: np.ndarray) -> List:
    """
    Get the indices of source boxes that overlap with reference boxes based on a specified threshold.

    Args:
        src_boxes (np.ndarray): A 2D numpy array of source bounding boxes.
        ref_boxes (np.ndarray): A 2D numpy array of reference bounding boxes.
    Returns:
        match_idx_list (list): A list of indices of source boxes that overlap with reference boxes.
    """
    match_idx_list = []
    src_boxes_num = len(src_boxes)
    if src_boxes_num > 0 and len(ref_boxes) > 0:
        for rno in range(len(ref_boxes)):
            ref_box = ref_boxes[rno]
            x1 = np.maximum(ref_box[0], src_boxes[:, 0])
            y1 = np.maximum(ref_box[1], src_boxes[:, 1])
            x2 = np.minimum(ref_box[2], src_boxes[:, 2])
            y2 = np.minimum(ref_box[3], src_boxes[:, 3])
            pub_w = x2 - x1
            pub_h = y2 - y1
            match_idx = np.where((pub_w > 3) & (pub_h > 3))[0]
            match_idx_list.extend(match_idx)
    return match_idx_list


def get_sub_regions_ocr_res(
    overall_ocr_res: OCRResult,
    object_boxes: List,
    flag_within: bool = True,
    return_match_idx: bool = False,
) -> OCRResult:
    """
    Filters OCR results to only include text boxes within specified object boxes based on a flag.

    Args:
        overall_ocr_res (OCRResult): The original OCR result containing all text boxes.
        object_boxes (list): A list of bounding boxes for the objects of interest.
        flag_within (bool): If True, only include text boxes within the object boxes. If False, exclude text boxes within the object boxes.
        return_match_idx (bool): If True, return the list of matching indices.

    Returns:
        OCRResult: A filtered OCR result containing only the relevant text boxes.
    """
    sub_regions_ocr_res = {}
    sub_regions_ocr_res["rec_polys"] = []
    sub_regions_ocr_res["rec_texts"] = []
    sub_regions_ocr_res["rec_scores"] = []
    sub_regions_ocr_res["rec_boxes"] = []

    overall_text_boxes = overall_ocr_res["rec_boxes"]
    match_idx_list = get_overlap_boxes_idx(overall_text_boxes, object_boxes)
    match_idx_list = list(set(match_idx_list))
    for box_no in range(len(overall_text_boxes)):
        if flag_within:
            if box_no in match_idx_list:
                flag_match = True
            else:
                flag_match = False
        else:
            if box_no not in match_idx_list:
                flag_match = True
            else:
                flag_match = False
        if flag_match:
            sub_regions_ocr_res["rec_polys"].append(
                overall_ocr_res["rec_polys"][box_no]
            )
            sub_regions_ocr_res["rec_texts"].append(
                overall_ocr_res["rec_texts"][box_no]
            )
            sub_regions_ocr_res["rec_scores"].append(
                overall_ocr_res["rec_scores"][box_no]
            )
            sub_regions_ocr_res["rec_boxes"].append(
                overall_ocr_res["rec_boxes"][box_no]
            )
    return (
        (sub_regions_ocr_res, match_idx_list)
        if return_match_idx
        else sub_regions_ocr_res
    )


def sorted_layout_boxes(res, w):
    """
    Sort text boxes in order from top to bottom, left to right
    Args:
        res: List of dictionaries containing layout information.
        w: Width of image.

    Returns:
        List of dictionaries containing sorted layout information.
    """
    num_boxes = len(res)
    if num_boxes == 1:
        res[0]["layout"] = "single"
        return res

    # Sort on the y axis first or sort it on the x axis
    sorted_boxes = sorted(res, key=lambda x: (x["layout_bbox"][1], x["layout_bbox"][0]))
    _boxes = list(sorted_boxes)

    new_res = []
    res_left = []
    res_right = []
    i = 0

    while True:
        if i >= num_boxes:
            break
        # Check that the bbox is on the left
        elif (
            _boxes[i]["layout_bbox"][0] < w / 4
            and _boxes[i]["layout_bbox"][2] < 3 * w / 5
        ):
            _boxes[i]["layout"] = "double"
            res_left.append(_boxes[i])
            i += 1
        elif _boxes[i]["layout_bbox"][0] > 2 * w / 5:
            _boxes[i]["layout"] = "double"
            res_right.append(_boxes[i])
            i += 1
        else:
            new_res += res_left
            new_res += res_right
            _boxes[i]["layout"] = "single"
            new_res.append(_boxes[i])
            res_left = []
            res_right = []
            i += 1

    res_left = sorted(res_left, key=lambda x: (x["layout_bbox"][1]))
    res_right = sorted(res_right, key=lambda x: (x["layout_bbox"][1]))

    if res_left:
        new_res += res_left
    if res_right:
        new_res += res_right

    return new_res


def _calculate_iou(box1, box2):
    """
    Calculate Intersection over Union (IoU) between two bounding boxes.

    Args:
        box1, box2: Lists or tuples representing bounding boxes [x_min, y_min, x_max, y_max].

    Returns:
        float: The IoU value.
    """
    box1 = list(map(int, box1))
    box2 = list(map(int, box2))

    x1_min, y1_min, x1_max, y1_max = box1
    x2_min, y2_min, x2_max, y2_max = box2

    inter_x_min = max(x1_min, x2_min)
    inter_y_min = max(y1_min, y2_min)
    inter_x_max = min(x1_max, x2_max)
    inter_y_max = min(y1_max, y2_max)

    if inter_x_max <= inter_x_min or inter_y_max <= inter_y_min:
        return 0.0

    inter_area = (inter_x_max - inter_x_min) * (inter_y_max - inter_y_min)
    box1_area = (x1_max - x1_min) * (y1_max - y1_min)
    box2_area = (x2_max - x2_min) * (y2_max - y2_min)
    min_area = min(box1_area, box2_area)

    if min_area <= 0:
        return 0.0

    iou = inter_area / min_area
    return iou


def _whether_y_overlap_exceeds_threshold(bbox1, bbox2, overlap_ratio_threshold=0.6):
    """
    Determines whether the vertical overlap between two bounding boxes exceeds a given threshold.

    Args:
        bbox1 (tuple): The first bounding box defined as (left, top, right, bottom).
        bbox2 (tuple): The second bounding box defined as (left, top, right, bottom).
        overlap_ratio_threshold (float): The threshold ratio to determine if the overlap is significant.
                                         Defaults to 0.6.

    Returns:
        bool: True if the vertical overlap divided by the minimum height of the two bounding boxes
              exceeds the overlap_ratio_threshold, otherwise False.
    """
    _, y1_0, _, y1_1 = bbox1
    _, y2_0, _, y2_1 = bbox2

    overlap = max(0, min(y1_1, y2_1) - max(y1_0, y2_0))
    min_height = min(y1_1 - y1_0, y2_1 - y2_0)

    return (overlap / min_height) > overlap_ratio_threshold


def _adjust_span_text(span, prepend=False, append=False):
    """
    Adjust the text of a span by prepending or appending a newline.

    Args:
        span (list): A list where the second element is the text of the span.
        prepend (bool): If True, prepend a newline to the text.
        append (bool): If True, append a newline to the text.

    Returns:
        None: The function modifies the span in place.
    """
    if prepend:
        span[1] = "\n" + span[1]
    if append:
        span[1] = span[1] + "\n"


def _format_line(line, layout_min, layout_max, is_reference=False):
    """
    Format a line of text spans based on layout constraints.

    Args:
        line (list): A list of spans, where each span is a list containing a bounding box and text.
        layout_min (int): The minimum x-coordinate of the layout bounding box.
        layout_max (int): The maximum x-coordinate of the layout bounding box.
        is_reference (bool): A flag indicating whether the line is a reference line, which affects formatting rules.

    Returns:
        None: The function modifies the line in place.
    """
    first_span = line[0]
    end_span = line[-1]

    if not is_reference:
        if first_span[0][0] - layout_min > 10:
            _adjust_span_text(first_span, prepend=True)
        if layout_max - end_span[0][2] > 10:
            _adjust_span_text(end_span, append=True)
    else:
        if first_span[0][0] - layout_min < 5:
            _adjust_span_text(first_span, prepend=True)
        if layout_max - end_span[0][2] > 20:
            _adjust_span_text(end_span, append=True)


def _sort_ocr_res_by_y_projection(
    label, layout_bbox, ocr_res, line_height_iou_threshold=0.7
):
    """
    Sorts OCR results based on their spatial arrangement, grouping them into lines and blocks.

    Args:
        layout_bbox (tuple): A tuple representing the layout bounding box, defined as (left, top, right, bottom).
        ocr_res (dict): A dictionary containing OCR results with the following keys:
                        - "boxes": A list of bounding boxes, each defined as [left, top, right, bottom].
                        - "rec_texts": A corresponding list of recognized text strings for each box.
        line_height_iou_threshold (float): The threshold for determining whether two boxes belong to
                                           the same line based on their vertical overlap. Defaults to 0.7.

    Returns:
        dict: A dictionary with the same structure as `ocr_res`, but with boxes and texts sorted
              and grouped into lines and blocks.
    """
    assert (
        ocr_res["boxes"] and ocr_res["rec_texts"]
    ), "OCR results must contain 'boxes' and 'rec_texts'"

    boxes = ocr_res["boxes"]
    rec_texts = ocr_res["rec_texts"]

    x_min, _, x_max, _ = layout_bbox
    inline_x_min = min([box[0] for box in boxes])
    inline_x_max = max([box[2] for box in boxes])

    spans = list(zip(boxes, rec_texts))

    spans.sort(key=lambda span: span[0][1])
    spans = [list(span) for span in spans]

    lines = []
    current_line = [spans[0]]
    current_y0, current_y1 = spans[0][0][1], spans[0][0][3]

    for span in spans[1:]:
        y0, y1 = span[0][1], span[0][3]
        if _whether_y_overlap_exceeds_threshold(
            (0, current_y0, 0, current_y1),
            (0, y0, 0, y1),
            line_height_iou_threshold,
        ):
            current_line.append(span)
            current_y0 = min(current_y0, y0)
            current_y1 = max(current_y1, y1)
        else:
            lines.append(current_line)
            current_line = [span]
            current_y0, current_y1 = y0, y1

    if current_line:
        lines.append(current_line)

    for line in lines:
        line.sort(key=lambda span: span[0][0])
        if label == "reference":
            line = _format_line(line, inline_x_min, inline_x_max, is_reference=True)
        else:
            line = _format_line(line, x_min, x_max)

    # Flatten lines back into a single list for boxes and texts
    ocr_res["boxes"] = [span[0] for line in lines for span in line]
    ocr_res["rec_texts"] = [span[1] + " " for line in lines for span in line]

    return ocr_res


def get_single_block_parsing_res(
    overall_ocr_res: OCRResult,
    layout_det_res: DetResult,
    table_res_list: list,
    seal_res_list: list,
) -> OCRResult:
    """
    Extract structured information from OCR and layout detection results.

    Args:
        overall_ocr_res (OCRResult): An object containing the overall OCR results, including detected text boxes and recognized text. The structure is expected to have:
            - "input_img": The image on which OCR was performed.
            - "dt_boxes": A list of detected text box coordinates.
            - "rec_texts": A list of recognized text corresponding to the detected boxes.

        layout_det_res (DetResult): An object containing the layout detection results, including detected layout boxes and their labels. The structure is expected to have:
            - "boxes": A list of dictionaries with keys "coordinate" for box coordinates and "label" for the type of content.

        table_res_list (list): A list of table detection results, where each item is a dictionary containing:
            - "layout_bbox": The bounding box of the table layout.
            - "pred_html": The predicted HTML representation of the table.

    Returns:
        list: A list of structured boxes where each item is a dictionary containing:
            - "label": The label of the content (e.g., 'table', 'chart', 'image').
            - The label as a key with either table HTML or image data and text.
            - "layout_bbox": The coordinates of the layout box.
    """

    single_block_layout_parsing_res = []
    input_img = overall_ocr_res["doc_preprocessor_res"]["output_img"]

    for box_info in layout_det_res["boxes"]:
        layout_bbox = box_info["coordinate"]
        label = box_info["label"]
        rec_res = {"boxes": [], "rec_texts": [], "flag": False}
        seg_start_flag = True
        seg_end_flag = True

        if label == "table":
            for i, table_res in enumerate(table_res_list):
                if (
                    _calculate_iou(
                        layout_bbox, table_res["table_ocr_pred"]["rec_boxes"][0]
                    )
                    > 0.5
                ):
                    single_block_layout_parsing_res.append(
                        {
                            "label": label,
                            f"{label}": table_res["pred_html"],
                            "layout_bbox": layout_bbox,
                            "seg_start_flag": seg_start_flag,
                            "seg_end_flag": seg_end_flag,
                        },
                    )
                    del table_res_list[i]
                    break
        else:
            overall_text_boxes = overall_ocr_res["rec_boxes"]
            for box_no in range(len(overall_text_boxes)):
                if _calculate_iou(layout_bbox, overall_text_boxes[box_no]) > 0.5:
                    rec_res["boxes"].append(overall_text_boxes[box_no])
                    rec_res["rec_texts"].append(
                        overall_ocr_res["rec_texts"][box_no],
                    )
                    rec_res["flag"] = True

            if rec_res["flag"]:
                rec_res = _sort_ocr_res_by_y_projection(
                    label, layout_bbox, rec_res, 0.7
                )
                rec_res_first_bbox = rec_res["boxes"][0]
                rec_res_end_bbox = rec_res["boxes"][-1]
                if rec_res_first_bbox[0] - layout_bbox[0] < 10:
                    seg_start_flag = False
                if layout_bbox[2] - rec_res_end_bbox[2] < 10:
                    seg_end_flag = False
                if label == "formula":
                    rec_res["rec_texts"] = [
                        rec_res_text.replace("$", "")
                        for rec_res_text in rec_res["rec_texts"]
                    ]

            if label in ["chart", "image"]:
                single_block_layout_parsing_res.append(
                    {
                        "label": label,
                        f"{label}": {
                            "img": input_img[
                                int(layout_bbox[1]) : int(layout_bbox[3]),
                                int(layout_bbox[0]) : int(layout_bbox[2]),
                            ],
                        },
                        "layout_bbox": layout_bbox,
                        "seg_start_flag": seg_start_flag,
                        "seg_end_flag": seg_end_flag,
                    },
                )
            else:
                single_block_layout_parsing_res.append(
                    {
                        "label": label,
                        f"{label}": "".join(rec_res["rec_texts"]),
                        "layout_bbox": layout_bbox,
                        "seg_start_flag": seg_start_flag,
                        "seg_end_flag": seg_end_flag,
                    },
                )

    return single_block_layout_parsing_res


def _projection_by_bboxes(boxes: np.ndarray, axis: int) -> np.ndarray:
    """
    Generate a 1D projection histogram from bounding boxes along a specified axis.

    Args:
        boxes: A (N, 4) array of bounding boxes defined by [x_min, y_min, x_max, y_max].
        axis: Axis for projection; 0 for horizontal (x-axis), 1 for vertical (y-axis).

    Returns:
        A 1D numpy array representing the projection histogram based on bounding box intervals.
    """
    assert axis in [0, 1]
    max_length = np.max(boxes[:, axis::2])
    projection = np.zeros(max_length, dtype=int)

    # Increment projection histogram over the interval defined by each bounding box
    for start, end in boxes[:, axis::2]:
        projection[start:end] += 1

    return projection


def _split_projection_profile(arr_values: np.ndarray, min_value: float, min_gap: float):
    """
    Split the projection profile into segments based on specified thresholds.

    Args:
        arr_values: 1D array representing the projection profile.
        min_value: Minimum value threshold to consider a profile segment significant.
        min_gap: Minimum gap width to consider a separation between segments.

    Returns:
        A tuple of start and end indices for each segment that meets the criteria.
    """
    # Identify indices where the projection exceeds the minimum value
    significant_indices = np.where(arr_values > min_value)[0]
    if not len(significant_indices):
        return

    # Calculate gaps between significant indices
    index_diffs = significant_indices[1:] - significant_indices[:-1]
    gap_indices = np.where(index_diffs > min_gap)[0]

    # Determine start and end indices of segments
    segment_starts = np.insert(
        significant_indices[gap_indices + 1],
        0,
        significant_indices[0],
    )
    segment_ends = np.append(
        significant_indices[gap_indices],
        significant_indices[-1] + 1,
    )

    return segment_starts, segment_ends


def _recursive_yx_cut(boxes: np.ndarray, indices: List[int], res: List[int], min_gap=1):
    """
    Recursively project and segment bounding boxes, starting with Y-axis and followed by X-axis.

    Args:
        boxes: A (N, 4) array representing bounding boxes.
        indices: List of indices indicating the original position of boxes.
        res: List to store indices of the final segmented bounding boxes.
    """
    assert len(boxes) == len(indices)

    # Sort by y_min for Y-axis projection
    y_sorted_indices = boxes[:, 1].argsort()
    y_sorted_boxes = boxes[y_sorted_indices]
    y_sorted_indices = np.array(indices)[y_sorted_indices]

    # Perform Y-axis projection
    y_projection = _projection_by_bboxes(boxes=y_sorted_boxes, axis=1)
    y_intervals = _split_projection_profile(y_projection, 0, 1)

    if not y_intervals:
        return

    # Process each segment defined by Y-axis projection
    for y_start, y_end in zip(*y_intervals):
        # Select boxes within the current y interval
        y_interval_indices = (y_start <= y_sorted_boxes[:, 1]) & (
            y_sorted_boxes[:, 1] < y_end
        )
        y_boxes_chunk = y_sorted_boxes[y_interval_indices]
        y_indices_chunk = y_sorted_indices[y_interval_indices]

        # Sort by x_min for X-axis projection
        x_sorted_indices = y_boxes_chunk[:, 0].argsort()
        x_sorted_boxes_chunk = y_boxes_chunk[x_sorted_indices]
        x_sorted_indices_chunk = y_indices_chunk[x_sorted_indices]

        # Perform X-axis projection
        x_projection = _projection_by_bboxes(boxes=x_sorted_boxes_chunk, axis=0)
        x_intervals = _split_projection_profile(x_projection, 0, min_gap)

        if not x_intervals:
            continue

        # If X-axis cannot be further segmented, add current indices to results
        if len(x_intervals[0]) == 1:
            res.extend(x_sorted_indices_chunk)
            continue

        # Recursively process each segment defined by X-axis projection
        for x_start, x_end in zip(*x_intervals):
            x_interval_indices = (x_start <= x_sorted_boxes_chunk[:, 0]) & (
                x_sorted_boxes_chunk[:, 0] < x_end
            )
            _recursive_yx_cut(
                x_sorted_boxes_chunk[x_interval_indices],
                x_sorted_indices_chunk[x_interval_indices],
                res,
            )


def _recursive_xy_cut(boxes: np.ndarray, indices: List[int], res: List[int], min_gap=1):
    """
    Recursively performs X-axis projection followed by Y-axis projection to segment bounding boxes.

    Args:
        boxes: A (N, 4) array representing bounding boxes with [x_min, y_min, x_max, y_max].
        indices: A list of indices representing the position of boxes in the original data.
        res: A list to store indices of bounding boxes that meet the criteria.
    """
    # Ensure boxes and indices have the same length
    assert len(boxes) == len(indices)

    # Sort by x_min to prepare for X-axis projection
    x_sorted_indices = boxes[:, 0].argsort()
    x_sorted_boxes = boxes[x_sorted_indices]
    x_sorted_indices = np.array(indices)[x_sorted_indices]

    # Perform X-axis projection
    x_projection = _projection_by_bboxes(boxes=x_sorted_boxes, axis=0)
    x_intervals = _split_projection_profile(x_projection, 0, 1)

    if not x_intervals:
        return

    # Process each segment defined by X-axis projection
    for x_start, x_end in zip(*x_intervals):
        # Select boxes within the current x interval
        x_interval_indices = (x_start <= x_sorted_boxes[:, 0]) & (
            x_sorted_boxes[:, 0] < x_end
        )
        x_boxes_chunk = x_sorted_boxes[x_interval_indices]
        x_indices_chunk = x_sorted_indices[x_interval_indices]

        # Sort selected boxes by y_min to prepare for Y-axis projection
        y_sorted_indices = x_boxes_chunk[:, 1].argsort()
        y_sorted_boxes_chunk = x_boxes_chunk[y_sorted_indices]
        y_sorted_indices_chunk = x_indices_chunk[y_sorted_indices]

        # Perform Y-axis projection
        y_projection = _projection_by_bboxes(boxes=y_sorted_boxes_chunk, axis=1)
        y_intervals = _split_projection_profile(y_projection, 0, min_gap)

        if not y_intervals:
            continue

        # If Y-axis cannot be further segmented, add current indices to results
        if len(y_intervals[0]) == 1:
            res.extend(y_sorted_indices_chunk)
            continue

        # Recursively process each segment defined by Y-axis projection
        for y_start, y_end in zip(*y_intervals):
            y_interval_indices = (y_start <= y_sorted_boxes_chunk[:, 1]) & (
                y_sorted_boxes_chunk[:, 1] < y_end
            )
            _recursive_xy_cut(
                y_sorted_boxes_chunk[y_interval_indices],
                y_sorted_indices_chunk[y_interval_indices],
                res,
            )


def sort_by_xycut(block_bboxes, direction=0, min_gap=1):
    block_bboxes = np.asarray(block_bboxes).astype(int)
    res = []
    if direction == 1:
        _recursive_yx_cut(
            block_bboxes,
            np.arange(
                len(block_bboxes),
            ),
            res,
            min_gap,
        )
    else:
        _recursive_xy_cut(
            block_bboxes,
            np.arange(
                len(block_bboxes),
            ),
            res,
            min_gap,
        )
    return res


def _img_array2path(data, save_path):
    """
    Save an image array to disk and return the file path.
    Args:
        data (np.ndarray): An image represented as a numpy array.
        save_path (str or Path): The base path where images should be saved.

    Returns:
        str: The relative path of the saved image file.
    """
    if isinstance(data, np.ndarray) and data.ndim == 3:
        # Generate a unique filename using UUID
        img_name = f"image_{uuid.uuid4().hex}.png"
        img_path = Path(save_path) / "imgs" / img_name
        img_path.parent.mkdir(
            parents=True,
            exist_ok=True,
        )  # Ensure the directory exists
        cv2.imwrite(str(img_path), data)
        return f"imgs/{img_name}"
    else:
        return ValueError


def recursive_img_array2path(data, save_path, labels=[]):
    """
    Process a dictionary or list to save image arrays to disk and replace them with file paths.

    Args:
        data (dict or list): The data structure that may contain image arrays.
        save_path (str or Path): The base path where images should be saved.
    """
    if isinstance(data, dict):
        for k, v in data.items():
            if k in labels and isinstance(v, np.ndarray) and v.ndim == 3:
                data[k] = _img_array2path(v, save_path)
            else:
                recursive_img_array2path(v, save_path, labels)
    elif isinstance(data, list):
        for item in data:
            recursive_img_array2path(item, save_path, labels)


def _calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2):
    """
    Calculate the ratio of the overlap area between bbox1 and bbox2
    to the area of the smaller bounding box.

    Args:
        bbox1 (list or tuple): Coordinates of the first bounding box [x_min, y_min, x_max, y_max].
        bbox2 (list or tuple): Coordinates of the second bounding box [x_min, y_min, x_max, y_max].

    Returns:
        float: The ratio of the overlap area to the area of the smaller bounding box.
    """
    x_left = max(bbox1[0], bbox2[0])
    y_top = max(bbox1[1], bbox2[1])
    x_right = min(bbox1[2], bbox2[2])
    y_bottom = min(bbox1[3], bbox2[3])
    if x_right <= x_left or y_bottom <= y_top:
        return 0.0
    # Calculate the area of the overlap
    intersection_area = (x_right - x_left) * (y_bottom - y_top)
    # Calculate the areas of both bounding boxes
    area_bbox1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
    area_bbox2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
    # Determine the minimum non-zero box area
    min_box_area = min(area_bbox1, area_bbox2)
    # Avoid division by zero in case of zero-area boxes
    if min_box_area == 0:
        return 0.0
    return intersection_area / min_box_area


def _get_minbox_if_overlap_by_ratio(bbox1, bbox2, ratio, smaller=True):
    """
    Determine if the overlap area between two bounding boxes exceeds a given ratio
    and return the smaller (or larger) bounding box based on the `smaller` flag.

    Args:
        bbox1 (list or tuple): Coordinates of the first bounding box [x_min, y_min, x_max, y_max].
        bbox2 (list or tuple): Coordinates of the second bounding box [x_min, y_min, x_max, y_max].
        ratio (float): The overlap ratio threshold.
        smaller (bool): If True, return the smaller bounding box; otherwise, return the larger one.

    Returns:
        list or tuple: The selected bounding box or None if the overlap ratio is not exceeded.
    """
    # Calculate the areas of both bounding boxes
    area1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
    area2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
    # Calculate the overlap ratio using a helper function
    overlap_ratio = _calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2)
    # Check if the overlap ratio exceeds the threshold
    if overlap_ratio > ratio:
        if (area1 <= area2 and smaller) or (area1 >= area2 and not smaller):
            return 1
        else:
            return 2
    return None


def _remove_overlap_blocks(blocks, threshold=0.65, smaller=True):
    """
    Remove overlapping blocks based on a specified overlap ratio threshold.

    Args:
        blocks (list): List of block dictionaries, each containing a 'layout_bbox' key.
        threshold (float): Ratio threshold to determine significant overlap.
        smaller (bool): If True, the smaller block in overlap is removed.

    Returns:
        tuple: A tuple containing the updated list of blocks and a list of dropped blocks.
    """
    dropped_blocks = []
    dropped_indexes = []
    # Iterate over each pair of blocks to find overlaps
    for i in range(len(blocks)):
        block1 = blocks[i]
        for j in range(i + 1, len(blocks)):
            block2 = blocks[j]
            # Skip blocks that are already marked for removal
            if i in dropped_indexes or j in dropped_indexes:
                continue
            # Check for overlap and determine which block to remove
            overlap_box_index = _get_minbox_if_overlap_by_ratio(
                block1["layout_bbox"],
                block2["layout_bbox"],
                threshold,
                smaller=smaller,
            )
            if overlap_box_index is not None:
                if overlap_box_index == 1:
                    block_to_remove = block1
                    drop_index = i
                else:
                    block_to_remove = block2
                    drop_index = j
                if drop_index not in dropped_indexes:
                    dropped_indexes.append(drop_index)
                    dropped_blocks.append(block_to_remove)

    dropped_indexes.sort()
    for i in reversed(dropped_indexes):
        del blocks[i]

    return blocks, dropped_blocks


def _text_median_width(blocks):
    widths = [
        block["layout_bbox"][2] - block["layout_bbox"][0]
        for block in blocks
        if block["label"] in ["text"]
    ]
    return np.median(widths) if widths else float("inf")


def _get_layout_property(blocks, median_width, no_mask_labels, threshold=0.8):
    """
    Determine the layout (single or double column) of text blocks.

    Args:
        blocks (list): List of block dictionaries containing 'label' and 'layout_bbox'.
        median_width (float): Median width of text blocks.
        threshold (float): Threshold for determining layout overlap.

    Returns:
        list: Updated list of blocks with layout information.
    """
    blocks.sort(
        key=lambda x: (
            x["layout_bbox"][0],
            (x["layout_bbox"][2] - x["layout_bbox"][0]),
        ),
    )
    check_single_layout = {}
    page_min_x, page_max_x = float("inf"), 0
    double_label_height = 0
    double_label_area = 0
    single_label_area = 0

    for i, block in enumerate(blocks):
        page_min_x = min(page_min_x, block["layout_bbox"][0])
        page_max_x = max(page_max_x, block["layout_bbox"][2])
    page_width = page_max_x - page_min_x

    for i, block in enumerate(blocks):
        if block["label"] not in no_mask_labels:
            continue

        x_min_i, _, x_max_i, _ = block["layout_bbox"]
        layout_length = x_max_i - x_min_i
        cover_count, cover_with_threshold_count = 0, 0
        match_block_with_threshold_indexes = []

        for j, other_block in enumerate(blocks):
            if i == j or other_block["label"] not in no_mask_labels:
                continue

            x_min_j, _, x_max_j, _ = other_block["layout_bbox"]
            x_match_min, x_match_max = max(
                x_min_i,
                x_min_j,
            ), min(x_max_i, x_max_j)
            match_block_iou = (x_match_max - x_match_min) / (x_max_j - x_min_j)

            if match_block_iou > 0:
                cover_count += 1
                if match_block_iou > threshold:
                    cover_with_threshold_count += 1
                    match_block_with_threshold_indexes.append(
                        (j, match_block_iou),
                    )
                x_min_i = x_match_max
                if x_min_i >= x_max_i:
                    break

        if (
            layout_length > median_width * 1.3
            and (cover_with_threshold_count >= 2 or cover_count >= 2)
        ) or layout_length > 0.6 * page_width:
            # if layout_length > median_width * 1.3 and (cover_with_threshold_count >= 2):
            block["layout"] = "double"
            double_label_height += block["layout_bbox"][3] - block["layout_bbox"][1]
            double_label_area += (block["layout_bbox"][2] - block["layout_bbox"][0]) * (
                block["layout_bbox"][3] - block["layout_bbox"][1]
            )
        else:
            block["layout"] = "single"
            check_single_layout[i] = match_block_with_threshold_indexes

    # Check single-layout block
    for i, single_layout in check_single_layout.items():
        if single_layout:
            index, match_iou = single_layout[-1]
            if match_iou > 0.9 and blocks[index]["layout"] == "double":
                blocks[i]["layout"] = "double"
                double_label_height += (
                    blocks[i]["layout_bbox"][3] - blocks[i]["layout_bbox"][1]
                )
                double_label_area += (
                    blocks[i]["layout_bbox"][2] - blocks[i]["layout_bbox"][0]
                ) * (blocks[i]["layout_bbox"][3] - blocks[i]["layout_bbox"][1])
            else:
                single_label_area += (
                    blocks[i]["layout_bbox"][2] - blocks[i]["layout_bbox"][0]
                ) * (blocks[i]["layout_bbox"][3] - blocks[i]["layout_bbox"][1])

    return blocks, (double_label_area > single_label_area)


def _get_bbox_direction(input_bbox, ratio=1):
    """
    Determine if a bounding box is horizontal or vertical.

    Args:
        input_bbox (list): Bounding box [x_min, y_min, x_max, y_max].
        ratio (float): Ratio for determining orientation.

    Returns:
        bool: True if horizontal, False if vertical.
    """
    return (input_bbox[2] - input_bbox[0]) * ratio >= (input_bbox[3] - input_bbox[1])


def _get_projection_iou(input_bbox, match_bbox, is_horizontal=True):
    """
    Calculate the IoU of lines between two bounding boxes.

    Args:
        input_bbox (list): First bounding box [x_min, y_min, x_max, y_max].
        match_bbox (list): Second bounding box [x_min, y_min, x_max, y_max].
        is_horizontal (bool): Whether to compare horizontally or vertically.

    Returns:
        float: Line IoU.
    """
    if is_horizontal:
        x_match_min = max(input_bbox[0], match_bbox[0])
        x_match_max = min(input_bbox[2], match_bbox[2])
        return (x_match_max - x_match_min) / (input_bbox[2] - input_bbox[0])
    else:
        y_match_min = max(input_bbox[1], match_bbox[1])
        y_match_max = min(input_bbox[3], match_bbox[3])
        return (y_match_max - y_match_min) / (input_bbox[3] - input_bbox[1])


def _get_sub_category(blocks, title_labels):
    """
    Determine the layout of title and text blocks.

    Args:
        blocks (list): List of block dictionaries.
        title_labels (list): List of labels considered as titles.

    Returns:
        list: Updated list of blocks with title-text layout information.
    """

    sub_title_labels = ["paragraph_title"]
    vision_labels = ["image", "table", "chart", "figure"]

    for i, block1 in enumerate(blocks):
        if block1.get("title_text") is None:
            block1["title_text"] = []
        if block1.get("sub_title") is None:
            block1["sub_title"] = []
        if block1.get("vision_footnote") is None:
            block1["vision_footnote"] = []
        if block1.get("sub_label") is None:
            block1["sub_label"] = block1["label"]

        if (
            block1["label"] not in title_labels
            and block1["label"] not in sub_title_labels
            and block1["label"] not in vision_labels
        ):
            continue

        bbox1 = block1["layout_bbox"]
        x1, y1, x2, y2 = bbox1
        is_horizontal_1 = _get_bbox_direction(block1["layout_bbox"])
        left_up_title_text_distance = float("inf")
        left_up_title_text_index = -1
        left_up_title_text_direction = None
        right_down_title_text_distance = float("inf")
        right_down_title_text_index = -1
        right_down_title_text_direction = None

        for j, block2 in enumerate(blocks):
            if i == j:
                continue

            bbox2 = block2["layout_bbox"]
            x1_prime, y1_prime, x2_prime, y2_prime = bbox2
            is_horizontal_2 = _get_bbox_direction(bbox2)
            match_block_iou = _get_projection_iou(
                bbox2,
                bbox1,
                is_horizontal_1,
            )

            def distance_(is_horizontal, is_left_up):
                if is_horizontal:
                    if is_left_up:
                        return (y1 - y2_prime + 2) // 5 + x1_prime / 5000
                    else:
                        return (y1_prime - y2 + 2) // 5 + x1_prime / 5000

                else:
                    if is_left_up:
                        return (x1 - x2_prime + 2) // 5 + y1_prime / 5000
                    else:
                        return (x1_prime - x2 + 2) // 5 + y1_prime / 5000

            block_iou_threshold = 0.1
            if block1["label"] in sub_title_labels:
                match_block_iou = _calculate_overlap_area_2_minbox_area_ratio(
                    bbox2,
                    bbox1,
                )
                block_iou_threshold = 0.7

            if is_horizontal_1:
                if match_block_iou >= block_iou_threshold:
                    left_up_distance = distance_(True, True)
                    right_down_distance = distance_(True, False)
                    if (
                        y2_prime <= y1
                        and left_up_distance <= left_up_title_text_distance
                    ):
                        left_up_title_text_distance = left_up_distance
                        left_up_title_text_index = j
                        left_up_title_text_direction = is_horizontal_2
                    elif (
                        y1_prime > y2
                        and right_down_distance < right_down_title_text_distance
                    ):
                        right_down_title_text_distance = right_down_distance
                        right_down_title_text_index = j
                        right_down_title_text_direction = is_horizontal_2
            else:
                if match_block_iou >= block_iou_threshold:
                    left_up_distance = distance_(False, True)
                    right_down_distance = distance_(False, False)
                    if (
                        x2_prime <= x1
                        and left_up_distance <= left_up_title_text_distance
                    ):
                        left_up_title_text_distance = left_up_distance
                        left_up_title_text_index = j
                        left_up_title_text_direction = is_horizontal_2
                    elif (
                        x1_prime > x2
                        and right_down_distance < right_down_title_text_distance
                    ):
                        right_down_title_text_distance = right_down_distance
                        right_down_title_text_index = j
                        right_down_title_text_direction = is_horizontal_2

        height = bbox1[3] - bbox1[1]
        width = bbox1[2] - bbox1[0]
        title_text_weight = [0.8, 0.8]
        # title_text_weight = [2, 2]

        title_text = []
        sub_title = []
        vision_footnote = []

        def get_sub_category_(
            title_text_direction,
            title_text_index,
            label,
            is_left_up=True,
        ):
            direction_ = [1, 3] if is_left_up else [2, 4]
            if (
                title_text_direction == is_horizontal_1
                and title_text_index != -1
                and (label == "text" or label == "paragraph_title")
            ):
                bbox2 = blocks[title_text_index]["layout_bbox"]
                if is_horizontal_1:
                    height1 = bbox2[3] - bbox2[1]
                    width1 = bbox2[2] - bbox2[0]
                    if label == "text":
                        if (
                            _nearest_edge_distance(bbox1, bbox2)[0] <= 15
                            and block1["label"] in vision_labels
                            and width1 < width
                            and height1 < 0.5 * height
                        ):
                            blocks[title_text_index]["sub_label"] = "vision_footnote"
                            vision_footnote.append(bbox2)
                        elif (
                            height1 < height * title_text_weight[0]
                            and (width1 < width or width1 > 1.5 * width)
                            and block1["label"] in title_labels
                        ):
                            blocks[title_text_index]["sub_label"] = "title_text"
                            title_text.append((direction_[0], bbox2))
                    elif (
                        label == "paragraph_title"
                        and block1["label"] in sub_title_labels
                    ):
                        sub_title.append(bbox2)
                else:
                    height1 = bbox2[3] - bbox2[1]
                    width1 = bbox2[2] - bbox2[0]
                    if label == "text":
                        if (
                            _nearest_edge_distance(bbox1, bbox2)[0] <= 15
                            and block1["label"] in vision_labels
                            and height1 < height
                            and width1 < 0.5 * width
                        ):
                            blocks[title_text_index]["sub_label"] = "vision_footnote"
                            vision_footnote.append(bbox2)
                        elif (
                            width1 < width * title_text_weight[1]
                            and block1["label"] in title_labels
                        ):
                            blocks[title_text_index]["sub_label"] = "title_text"
                            title_text.append((direction_[1], bbox2))
                    elif (
                        label == "paragraph_title"
                        and block1["label"] in sub_title_labels
                    ):
                        sub_title.append(bbox2)

        if (
            is_horizontal_1
            and abs(left_up_title_text_distance - right_down_title_text_distance) * 5
            > height
        ) or (
            not is_horizontal_1
            and abs(left_up_title_text_distance - right_down_title_text_distance) * 5
            > width
        ):
            if left_up_title_text_distance < right_down_title_text_distance:
                get_sub_category_(
                    left_up_title_text_direction,
                    left_up_title_text_index,
                    blocks[left_up_title_text_index]["label"],
                    True,
                )
            else:
                get_sub_category_(
                    right_down_title_text_direction,
                    right_down_title_text_index,
                    blocks[right_down_title_text_index]["label"],
                    False,
                )
        else:
            get_sub_category_(
                left_up_title_text_direction,
                left_up_title_text_index,
                blocks[left_up_title_text_index]["label"],
                True,
            )
            get_sub_category_(
                right_down_title_text_direction,
                right_down_title_text_index,
                blocks[right_down_title_text_index]["label"],
                False,
            )

        if block1["label"] in title_labels:
            if blocks[i].get("title_text") == []:
                blocks[i]["title_text"] = title_text

        if block1["label"] in sub_title_labels:
            if blocks[i].get("sub_title") == []:
                blocks[i]["sub_title"] = sub_title

        if block1["label"] in vision_labels:
            if blocks[i].get("vision_footnote") == []:
                blocks[i]["vision_footnote"] = vision_footnote

    return blocks


def get_layout_ordering(data, no_mask_labels=[], already_sorted=False):
    """
    Process layout parsing results to remove overlapping bounding boxes
    and assign an ordering index based on their positions.

    Modifies:
        The 'parsing_result' list in 'parsing_res_list' by adding an 'index' to each block.

    """
    if already_sorted:
        return data
    title_text_labels = ["doc_title"]
    title_labels = ["doc_title", "paragraph_title"]
    vision_labels = ["image", "table", "seal", "chart", "figure"]
    vision_title_labels = ["table_title", "chart_title", "figure_title"]

    parsing_result = data["sub_blocks"]
    parsing_result, _ = _remove_overlap_blocks(
        parsing_result,
        threshold=0.5,
        smaller=True,
    )
    parsing_result = _get_sub_category(parsing_result, title_text_labels)

    doc_flag = False
    median_width = _text_median_width(parsing_result)
    parsing_result, projection_direction = _get_layout_property(
        parsing_result,
        median_width,
        no_mask_labels=no_mask_labels,
        threshold=0.3,
    )
    # Convert bounding boxes to float and remove overlaps
    (
        double_text_blocks,
        title_text_blocks,
        title_blocks,
        vision_blocks,
        vision_title_blocks,
        vision_footnote_blocks,
        other_blocks,
    ) = ([], [], [], [], [], [], [])

    drop_indexes = []

    for index, block in enumerate(parsing_result):
        label = block["sub_label"]
        block["layout_bbox"] = list(map(int, block["layout_bbox"]))

        if label == "doc_title":
            doc_flag = True

        if label in no_mask_labels:
            if block["layout"] == "double":
                double_text_blocks.append(block)
                drop_indexes.append(index)
        elif label == "title_text":
            title_text_blocks.append(block)
            drop_indexes.append(index)
        elif label == "vision_footnote":
            vision_footnote_blocks.append(block)
            drop_indexes.append(index)
        elif label in vision_title_labels:
            vision_title_blocks.append(block)
            drop_indexes.append(index)
        elif label in title_labels:
            title_blocks.append(block)
            drop_indexes.append(index)
        elif label in vision_labels:
            vision_blocks.append(block)
            drop_indexes.append(index)
        else:
            other_blocks.append(block)
            drop_indexes.append(index)

    for index in sorted(drop_indexes, reverse=True):
        del parsing_result[index]

    if len(parsing_result) > 0:
        # single text label
        if len(double_text_blocks) > len(parsing_result) or projection_direction:
            parsing_result.extend(title_blocks + double_text_blocks)
            title_blocks = []
            double_text_blocks = []
            block_bboxes = [block["layout_bbox"] for block in parsing_result]
            block_bboxes.sort(
                key=lambda x: (
                    x[0] // max(20, median_width),
                    x[1],
                ),
            )
            block_bboxes = np.array(block_bboxes)
            sorted_indices = sort_by_xycut(
                block_bboxes,
                direction=1,
                min_gap=1,
            )
        else:
            block_bboxes = [block["layout_bbox"] for block in parsing_result]
            block_bboxes.sort(key=lambda x: (x[0] // 20, x[1]))
            block_bboxes = np.array(block_bboxes)
            sorted_indices = sort_by_xycut(
                block_bboxes,
                direction=0,
                min_gap=20,
            )

        sorted_boxes = block_bboxes[sorted_indices].tolist()

        for block in parsing_result:
            block["index"] = sorted_boxes.index(block["layout_bbox"]) + 1
            block["sub_index"] = sorted_boxes.index(block["layout_bbox"]) + 1

    def nearest_match_(input_blocks, distance_type="manhattan", is_add_index=True):
        for block in input_blocks:
            bbox = block["layout_bbox"]
            min_distance = float("inf")
            min_distance_config = [
                [float("inf"), float("inf")],
                float("inf"),
                float("inf"),
            ]  # for double text
            nearest_gt_index = 0
            for match_block in parsing_result:
                match_bbox = match_block["layout_bbox"]
                if distance_type == "nearest_iou_edge_distance":
                    distance, min_distance_config = _nearest_iou_edge_distance(
                        bbox,
                        match_bbox,
                        block["sub_label"],
                        vision_labels=vision_labels,
                        no_mask_labels=no_mask_labels,
                        median_width=median_width,
                        title_labels=title_labels,
                        title_text=block["title_text"],
                        sub_title=block["sub_title"],
                        min_distance_config=min_distance_config,
                        tolerance_len=10,
                    )
                elif distance_type == "title_text":
                    if (
                        match_block["label"] in title_labels + ["abstract"]
                        and match_block["title_text"] != []
                    ):
                        iou_left_up = _calculate_overlap_area_2_minbox_area_ratio(
                            bbox,
                            match_block["title_text"][0][1],
                        )
                        iou_right_down = _calculate_overlap_area_2_minbox_area_ratio(
                            bbox,
                            match_block["title_text"][-1][1],
                        )
                        iou = 1 - max(iou_left_up, iou_right_down)
                        distance = _manhattan_distance(bbox, match_bbox) * iou
                    else:
                        distance = float("inf")
                elif distance_type == "manhattan":
                    distance = _manhattan_distance(bbox, match_bbox)
                elif distance_type == "vision_footnote":
                    if (
                        match_block["label"] in vision_labels
                        and match_block["vision_footnote"] != []
                    ):
                        iou_left_up = _calculate_overlap_area_2_minbox_area_ratio(
                            bbox,
                            match_block["vision_footnote"][0],
                        )
                        iou_right_down = _calculate_overlap_area_2_minbox_area_ratio(
                            bbox,
                            match_block["vision_footnote"][-1],
                        )
                        iou = 1 - max(iou_left_up, iou_right_down)
                        distance = _manhattan_distance(bbox, match_bbox) * iou
                    else:
                        distance = float("inf")
                elif distance_type == "vision_body":
                    if (
                        match_block["label"] in vision_title_labels
                        and block["vision_footnote"] != []
                    ):
                        iou_left_up = _calculate_overlap_area_2_minbox_area_ratio(
                            match_bbox,
                            block["vision_footnote"][0],
                        )
                        iou_right_down = _calculate_overlap_area_2_minbox_area_ratio(
                            match_bbox,
                            block["vision_footnote"][-1],
                        )
                        iou = 1 - max(iou_left_up, iou_right_down)
                        distance = _manhattan_distance(bbox, match_bbox) * iou
                    else:
                        distance = float("inf")
                else:
                    raise NotImplementedError

                if distance < min_distance:
                    min_distance = distance
                    if is_add_index:
                        nearest_gt_index = match_block.get("index", 999)
                    else:
                        nearest_gt_index = match_block.get("sub_index", 999)

            if is_add_index:
                block["index"] = nearest_gt_index
            else:
                block["sub_index"] = nearest_gt_index

            parsing_result.append(block)

    # double text label
    double_text_blocks.sort(
        key=lambda x: (
            x["layout_bbox"][1] // 10,
            x["layout_bbox"][0] // median_width,
            x["layout_bbox"][1] ** 2 + x["layout_bbox"][0] ** 2,
        ),
    )
    nearest_match_(
        double_text_blocks,
        distance_type="nearest_iou_edge_distance",
    )
    parsing_result.sort(
        key=lambda x: (x["index"], x["layout_bbox"][1], x["layout_bbox"][0]),
    )

    for idx, block in enumerate(parsing_result):
        block["index"] = idx + 1
        block["sub_index"] = idx + 1

    # title label
    title_blocks.sort(
        key=lambda x: (
            x["layout_bbox"][1] // 10,
            x["layout_bbox"][0] // median_width,
            x["layout_bbox"][1] ** 2 + x["layout_bbox"][0] ** 2,
        ),
    )
    nearest_match_(title_blocks, distance_type="nearest_iou_edge_distance")

    if doc_flag:
        # text_sort_labels = ["doc_title","paragraph_title","abstract"]
        text_sort_labels = ["doc_title"]
        text_label_priority = {
            label: priority for priority, label in enumerate(text_sort_labels)
        }
        doc_titles = []
        for i, block in enumerate(parsing_result):
            if block["label"] == "doc_title":
                doc_titles.append(
                    (i, block["layout_bbox"][1], block["layout_bbox"][0]),
                )
        doc_titles.sort(key=lambda x: (x[1], x[2]))
        first_doc_title_index = doc_titles[0][0]
        parsing_result[first_doc_title_index]["index"] = 1
        parsing_result.sort(
            key=lambda x: (
                x["index"],
                text_label_priority.get(x["label"], 9999),
                x["layout_bbox"][1],
                x["layout_bbox"][0],
            ),
        )
    else:
        parsing_result.sort(
            key=lambda x: (
                x["index"],
                x["layout_bbox"][1],
                x["layout_bbox"][0],
            ),
        )

    for idx, block in enumerate(parsing_result):
        block["index"] = idx + 1
        block["sub_index"] = idx + 1

    # title-text label
    nearest_match_(title_text_blocks, distance_type="title_text")
    text_sort_labels = ["doc_title", "paragraph_title", "title_text"]
    text_label_priority = {
        label: priority for priority, label in enumerate(text_sort_labels)
    }
    parsing_result.sort(
        key=lambda x: (
            x["index"],
            text_label_priority.get(x["sub_label"], 9999),
            x["layout_bbox"][1],
            x["layout_bbox"][0],
        ),
    )

    for idx, block in enumerate(parsing_result):
        block["index"] = idx + 1
        block["sub_index"] = idx + 1

    # image,figure,chart,seal label
    nearest_match_(
        vision_title_blocks,
        distance_type="nearest_iou_edge_distance",
        is_add_index=False,
    )
    parsing_result.sort(
        key=lambda x: (
            x["sub_index"],
            x["layout_bbox"][1],
            x["layout_bbox"][0],
        ),
    )

    for idx, block in enumerate(parsing_result):
        block["sub_index"] = idx + 1

    # image,figure,chart,seal label
    nearest_match_(
        vision_blocks,
        distance_type="nearest_iou_edge_distance",
        is_add_index=False,
    )
    parsing_result.sort(
        key=lambda x: (
            x["sub_index"],
            x["layout_bbox"][1],
            x["layout_bbox"][0],
        ),
    )

    for idx, block in enumerate(parsing_result):
        block["sub_index"] = idx + 1

    # vision footnote label
    nearest_match_(
        vision_footnote_blocks,
        distance_type="vision_footnote",
        is_add_index=False,
    )
    text_label_priority = {"vision_footnote": 9999}
    parsing_result.sort(
        key=lambda x: (
            x["sub_index"],
            text_label_priority.get(x["sub_label"], 0),
            x["layout_bbox"][1],
            x["layout_bbox"][0],
        ),
    )

    for idx, block in enumerate(parsing_result):
        block["sub_index"] = idx + 1

    # header、footnote、header_image... label
    nearest_match_(other_blocks, distance_type="manhattan", is_add_index=False)

    return data


def _generate_input_data(parsing_result):
    """
    The evaluation input data is generated based on the parsing results.

    :param parsing_result: A list containing the results of the layout parsing
    :return: A formatted list of input data
    """
    input_data = [
        {
            "block_bbox": block["block_bbox"],
            "sub_indices": [],
            "sub_bboxes": [],
        }
        for block in parsing_result
    ]

    for block_index, block in enumerate(parsing_result):
        sub_blocks = block["sub_blocks"]
        get_layout_ordering(
            block_index=block_index,
            no_mask_labels=[
                "text",
                "formula",
                "algorithm",
                "reference",
                "content",
                "abstract",
            ],
        )
        for sub_block in sub_blocks:
            input_data[block_index]["sub_bboxes"].append(
                list(map(int, sub_block["layout_bbox"])),
            )
            input_data[block_index]["sub_indices"].append(
                int(sub_block["index"]),
            )

    return input_data


def _manhattan_distance(point1, point2, weight_x=1, weight_y=1):
    return weight_x * abs(point1[0] - point2[0]) + weight_y * abs(point1[1] - point2[1])


def _calculate_horizontal_distance(
    input_bbox,
    match_bbox,
    height,
    disperse,
    title_text,
):
    """
    Calculate the horizontal distance between two bounding boxes, considering title text adjustments.

    Args:
        input_bbox (list): The bounding box coordinates [x1, y1, x2, y2] of the input object.
        match_bbox (list): The bounding box coordinates [x1', y1', x2', y2'] of the object to match against.
        height (int): The height of the input bounding box used for normalization.
        disperse (int): The dispersion factor used to normalize the horizontal distance.
        title_text (list): A list of tuples containing title text information and their bounding box coordinates.
                        Format: [(position_indicator, [x1, y1, x2, y2]), ...].

    Returns:
        float: The calculated horizontal distance taking into account the title text adjustments.
    """
    x1, y1, x2, y2 = input_bbox
    x1_prime, y1_prime, x2_prime, y2_prime = match_bbox

    if y2 < y1_prime:
        if title_text and title_text[-1][0] == 2:
            y2 += title_text[-1][1][3] - title_text[-1][1][1]
        distance1 = (y1_prime - y2) * 0.5
    else:
        if title_text and title_text[0][0] == 1:
            y1 -= title_text[0][1][3] - title_text[0][1][1]
        distance1 = y1 - y2_prime

    return (
        abs(x2_prime - x1) // disperse + distance1 // height + distance1 / 5000
    )  # if page max size == 5000


def _calculate_vertical_distance(input_bbox, match_bbox, width, disperse, title_text):
    """
    Calculate the vertical distance between two bounding boxes, considering title text adjustments.

    Args:
        input_bbox (list): The bounding box coordinates [x1, y1, x2, y2] of the input object.
        match_bbox (list): The bounding box coordinates [x1', y1', x2', y2'] of the object to match against.
        width (int): The width of the input bounding box used for normalization.
        disperse (int): The dispersion factor used to normalize the vertical distance.
        title_text (list): A list of tuples containing title text information and their bounding box coordinates.
                        Format: [(position_indicator, [x1, y1, x2, y2]), ...].

    Returns:
        float: The calculated vertical distance taking into account the title text adjustments.
    """
    x1, y1, x2, y2 = input_bbox
    x1_prime, y1_prime, x2_prime, y2_prime = match_bbox

    if x1 > x2_prime:
        if title_text and title_text[0][0] == 3:
            x1 -= title_text[0][1][2] - title_text[0][1][0]
        distance2 = (x1 - x2_prime) * 0.5
    else:
        if title_text and title_text[-1][0] == 4:
            x2 += title_text[-1][1][2] - title_text[-1][1][0]
        distance2 = x1_prime - x2

    return abs(y2_prime - y1) // disperse + distance2 // width + distance2 / 5000


def _nearest_edge_distance(
    input_bbox,
    match_bbox,
    weight=[1, 1, 1, 1],
    label="text",
    no_mask_labels=[],
    min_edge_distances_config=[],
    tolerance_len=10,
):
    """
    Calculate the nearest edge distance between two bounding boxes, considering directional weights.

    Args:
        input_bbox (list): The bounding box coordinates [x1, y1, x2, y2] of the input object.
        match_bbox (list): The bounding box coordinates [x1', y1', x2', y2'] of the object to match against.
        weight (list, optional): Directional weights for the edge distances [left, right, up, down]. Defaults to [1, 1, 1, 1].
        label (str, optional): The label/type of the object in the bounding box (e.g., 'text'). Defaults to 'text'.
        no_mask_labels (list, optional): Labels for which no masking is applied when calculating edge distances. Defaults to an empty list.
        min_edge_distances_config (list, optional): Configuration for minimum edge distances [min_edge_distance_x, min_edge_distance_y].
        Defaults to [float('inf'), float('inf')].

    Returns:
        tuple: A tuple containing:
            - The calculated minimum edge distance between the bounding boxes.
            - A list with the minimum edge distances in the x and y directions.
    """
    match_bbox_iou = _calculate_overlap_area_2_minbox_area_ratio(
        input_bbox,
        match_bbox,
    )
    if match_bbox_iou > 0 and label not in no_mask_labels:
        return 0, [0, 0]

    if not min_edge_distances_config:
        min_edge_distances_config = [float("inf"), float("inf")]
    min_edge_distance_x, min_edge_distance_y = min_edge_distances_config

    x1, y1, x2, y2 = input_bbox
    x1_prime, y1_prime, x2_prime, y2_prime = match_bbox

    direction_num = 0
    distance_x = float("inf")
    distance_y = float("inf")
    distance = [float("inf")] * 4

    # input_bbox is to the left of match_bbox
    if x2 < x1_prime:
        direction_num += 1
        distance[0] = x1_prime - x2
        if abs(distance[0] - min_edge_distance_x) <= tolerance_len:
            distance_x = min_edge_distance_x * weight[0]
        else:
            distance_x = distance[0] * weight[0]
    # input_bbox is to the right of match_bbox
    elif x1 > x2_prime:
        direction_num += 1
        distance[1] = x1 - x2_prime
        if abs(distance[1] - min_edge_distance_x) <= tolerance_len:
            distance_x = min_edge_distance_x * weight[1]
        else:
            distance_x = distance[1] * weight[1]
    elif match_bbox_iou > 0:
        distance[0] = 0
        distance_x = 0

    # input_bbox is above match_bbox
    if y2 < y1_prime:
        direction_num += 1
        distance[2] = y1_prime - y2
        if abs(distance[2] - min_edge_distance_y) <= tolerance_len:
            distance_y = min_edge_distance_y * weight[2]
        else:
            distance_y = distance[2] * weight[2]
        if label in no_mask_labels:
            distance_y = max(0.1, distance_y) * 100
    # input_bbox is below match_bbox
    elif y1 > y2_prime:
        direction_num += 1
        distance[3] = y1 - y2_prime
        if abs(distance[3] - min_edge_distance_y) <= tolerance_len:
            distance_y = min_edge_distance_y * weight[3]
        else:
            distance_y = distance[3] * weight[3]
    elif match_bbox_iou > 0:
        distance[2] = 0
        distance_y = 0

    if direction_num == 2:
        return (distance_x + distance_y), [
            min(distance[0], distance[1]),
            min(distance[2], distance[3]),
        ]
    else:
        return min(distance_x, distance_y), [
            min(distance[0], distance[1]),
            min(distance[2], distance[3]),
        ]


def _get_weights(label, horizontal):
    """Define weights based on the label and orientation."""
    if label == "doc_title":
        return (
            [1, 0.1, 0.1, 1] if horizontal else [0.2, 0.1, 1, 1]
        )  # left-down ,  right-left
    elif label in [
        "paragraph_title",
        "abstract",
        "figure_title",
        "chart_title",
        "image",
        "seal",
        "chart",
        "figure",
    ]:
        return [1, 1, 0.1, 1]  # down
    else:
        return [1, 1, 1, 0.1]  # up


def _nearest_iou_edge_distance(
    input_bbox,
    match_bbox,
    label,
    vision_labels,
    no_mask_labels,
    median_width=-1,
    title_labels=[],
    title_text=[],
    sub_title=[],
    min_distance_config=[],
    tolerance_len=10,
):
    """
    Calculate the nearest IOU edge distance between two bounding boxes.

    Args:
        input_bbox (list): The bounding box coordinates [x1, y1, x2, y2] of the input object.
        match_bbox (list): The bounding box coordinates [x1', y1', x2', y2'] of the object to match against.
        label (str): The label/type of the object in the bounding box (e.g., 'image', 'text', etc.).
        no_mask_labels (list): Labels for which no masking is applied when calculating edge distances.
        median_width (int, optional): The median width for title dispersion calculation. Defaults to -1.
        title_labels (list, optional): Labels that indicate the object is a title. Defaults to an empty list.
        title_text (list, optional): Text content associated with title labels. Defaults to an empty list.
        sub_title (list, optional): List of subtitle bounding boxes to adjust the input_bbox. Defaults to an empty list.
        min_distance_config (list, optional): Configuration for minimum distances [min_edge_distances_config, up_edge_distances_config, total_distance].

    Returns:
        tuple: A tuple containing the calculated distance and updated minimum distance configuration.
    """

    x1, y1, x2, y2 = input_bbox
    x1_prime, y1_prime, x2_prime, y2_prime = match_bbox

    min_edge_distances_config, up_edge_distances_config, total_distance = (
        min_distance_config
    )

    iou_distance = 0

    if label in vision_labels:
        horizontal1 = horizontal2 = True
    else:
        horizontal1 = _get_bbox_direction(input_bbox)
        horizontal2 = _get_bbox_direction(match_bbox, 3)

    if (
        horizontal1 != horizontal2
        or _get_projection_iou(input_bbox, match_bbox, horizontal1) < 0.01
    ):
        iou_distance = 1
    elif label == "doc_title" or (label in title_labels and title_text):
        # Calculate distance for titles
        disperse = max(1, median_width)
        width = x2 - x1
        height = y2 - y1
        if horizontal1:
            return (
                _calculate_horizontal_distance(
                    input_bbox,
                    match_bbox,
                    height,
                    disperse,
                    title_text,
                ),
                min_distance_config,
            )
        else:
            return (
                _calculate_vertical_distance(
                    input_bbox,
                    match_bbox,
                    width,
                    disperse,
                    title_text,
                ),
                min_distance_config,
            )

    # Adjust input_bbox based on sub_title
    if sub_title:
        for sub in sub_title:
            x1_, y1_, x2_, y2_ = sub
            x1, y1, x2, y2 = (
                min(x1, x1_),
                min(
                    y1,
                    y1_,
                ),
                max(x2, x2_),
                max(y2, y2_),
            )
        input_bbox = [x1, y1, x2, y2]

    # Calculate edge distance
    weight = _get_weights(label, horizontal1)
    if label == "abstract":
        tolerance_len *= 3
    edge_distance, edge_distance_config = _nearest_edge_distance(
        input_bbox,
        match_bbox,
        weight,
        label=label,
        no_mask_labels=no_mask_labels,
        min_edge_distances_config=min_edge_distances_config,
        tolerance_len=tolerance_len,
    )

    # Weights for combining distances
    iou_edge_weight = [10**6, 10**3, 1, 0.001]

    # Calculate up and left edge distances
    up_edge_distance = y1_prime
    left_edge_distance = x1_prime
    if (
        label in no_mask_labels or label == "paragraph_title" or label in vision_labels
    ) and y1 > y2_prime:
        up_edge_distance = -y2_prime
        left_edge_distance = -x2_prime

    min_up_edge_distance = up_edge_distances_config
    if abs(min_up_edge_distance - up_edge_distance) <= tolerance_len:
        up_edge_distance = min_up_edge_distance

    # Calculate total distance
    distance = (
        iou_distance * iou_edge_weight[0]
        + edge_distance * iou_edge_weight[1]
        + up_edge_distance * iou_edge_weight[2]
        + left_edge_distance * iou_edge_weight[3]
    )

    # Update minimum distance configuration if a smaller distance is found
    if total_distance > distance:
        edge_distance_config = [
            min(min_edge_distances_config[0], edge_distance_config[0]),
            min(min_edge_distances_config[1], edge_distance_config[1]),
        ]
        min_distance_config = [
            edge_distance_config,
            min(up_edge_distance, up_edge_distances_config),
            distance,
        ]

    return distance, min_distance_config


def get_show_color(label):
    label_colors = {
        # Medium Blue (from 'titles_list')
        "paragraph_title": (102, 102, 255, 100),
        "doc_title": (255, 248, 220, 100),  # Cornsilk
        # Light Yellow (from 'tables_caption_list')
        "table_title": (255, 255, 102, 100),
        # Sky Blue (from 'imgs_caption_list')
        "figure_title": (102, 178, 255, 100),
        "chart_title": (221, 160, 221, 100),  # Plum
        "vision_footnote": (144, 238, 144, 100),  # Light Green
        # Deep Purple (from 'texts_list')
        "text": (153, 0, 76, 100),
        # Bright Green (from 'interequations_list')
        "formula": (0, 255, 0, 100),
        "abstract": (255, 239, 213, 100),  # Papaya Whip
        # Medium Green (from 'lists_list' and 'indexs_list')
        "content": (40, 169, 92, 100),
        # Neutral Gray (from 'dropped_bbox_list')
        "seal": (158, 158, 158, 100),
        # Olive Yellow (from 'tables_body_list')
        "table": (204, 204, 0, 100),
        # Bright Green (from 'imgs_body_list')
        "image": (153, 255, 51, 100),
        # Bright Green (from 'imgs_body_list')
        "figure": (153, 255, 51, 100),
        "chart": (216, 191, 216, 100),  # Thistle
        # Pale Yellow-Green (from 'tables_footnote_list')
        "reference": (229, 255, 204, 100),
        "algorithm": (255, 250, 240, 100),  # Floral White
    }
    default_color = (158, 158, 158, 100)
    return label_colors.get(label, default_color)