8 months ago · de05aaac84
--- a/paddlex/inference/pipelines/layout_parsing/utils.py
+++ b/paddlex/inference/pipelines/layout_parsing/utils.py
@@ -760,7 +760,6 @@ def sort_by_xycut(
 
															     block_bboxes: Union[np.ndarray, List[List[int]]],
														
 
															     direction: int = 0,
														
 
															     min_gap: int = 1,
														
 
															-    pre_cuts: Optional[Dict[str, List[int]]] = None,
														
 
															 ) -> List[int]:
														
 
															     """
														
 
															     Sort bounding boxes using recursive XY cut method based on the specified direction.
														
@@ -772,56 +771,26 @@ def sort_by_xycut(
 
															         direction (int): Direction for the initial cut. Use 1 for Y-axis first and 0 for X-axis first.
														
 
															                          Defaults to 0.
														
 
															         min_gap (int): Minimum gap width to consider a separation between segments. Defaults to 1.
														
 
															-        pre_cuts (Optional[Dict[str, List[int]]]): A dictionary specifying pre-cut points along the axes.
														
 
															-                                                  The keys are 'x' or 'y', representing the axis to pre-cut,
														
 
															-                                                  and the values are lists of integers specifying the cut points.
														
 
															-                                                  For example, {'y': [100, 200]} will pre-cut the y-axis at
														
 
															-                                                  positions 100 and 200 before applying the main XY cut algorithm.
														
 
															-                                                  Defaults to None.
														
 
															     Returns:
														
 
															         List[int]: A list of indices representing the order of sorted bounding boxes.
														
 
															     """
														
 
															     block_bboxes = np.asarray(block_bboxes).astype(int)
														
 
															     res = []
														
 
															-    axis = "x" if direction == 1 else "y"
														
 
															-    if len(pre_cuts[axis]) > 0:
														
 
															-        cuts = sorted(pre_cuts[axis])
														
 
															-        axis_index = 1 if axis == "y" else 0
														
 
															-        max_val = block_bboxes[:, 3].max() if axis == "y" else block_bboxes[:, 2].max()
														
 
															-        intervals = []
														
 
															-        prev = 0
														
 
															-        for cut in cuts:
														
 
															-            intervals.append((prev, cut))
														
 
															-            prev = cut
														
 
															-        intervals.append((prev, max_val))
														
 
															-        for start, end in intervals:
														
 
															-            mask = (block_bboxes[:, axis_index] >= start) & (
														
 
															-                block_bboxes[:, axis_index] < end
														
 
															-            )
														
 
															-            sub_boxes = block_bboxes[mask]
														
 
															-            sub_indices = np.arange(len(block_bboxes))[mask].tolist()
														
 
															-            if len(sub_boxes) > 0:
														
 
															-                if direction == 1:
														
 
															-                    _recursive_yx_cut(sub_boxes, sub_indices, res, min_gap)
														
 
															-                else:
														
 
															-                    _recursive_xy_cut(sub_boxes, sub_indices, res, min_gap)
														
 
															+    if direction == 1:
														
 
															+        _recursive_yx_cut(
														
 
															+            block_bboxes,
														
 
															+            np.arange(len(block_bboxes)).tolist(),
														
 
															+            res,
														
 
															+            min_gap,
														
 
															+        )
														
 
															     else:
														
 
															-        if direction == 1:
														
 
															-            _recursive_yx_cut(
														
 
															-                block_bboxes,
														
 
															-                np.arange(len(block_bboxes)).tolist(),
														
 
															-                res,
														
 
															-                min_gap,
														
 
															-            )
														
 
															-        else:
														
 
															-            _recursive_xy_cut(
														
 
															-                block_bboxes,
														
 
															-                np.arange(len(block_bboxes)).tolist(),
														
 
															-                res,
														
 
															-                min_gap,
														
 
															-            )
														
 
															-
														
 
															+        _recursive_xy_cut(
														
 
															+            block_bboxes,
														
 
															+            np.arange(len(block_bboxes)).tolist(),
														
 
															+            res,
														
 
															+            min_gap,
														
 
															+        )
														
 
															     return res
														
@@ -842,6 +811,7 @@ def _img_array2path(data: np.ndarray) -> str:
 
															     if isinstance(data, np.ndarray) and data.ndim == 3:
														
 
															         # Generate a unique filename using UUID
														
 
															         img_name = f"image_{uuid.uuid4().hex}.png"
														
 
															+
														
 
															         return {f"imgs/{img_name}": Image.fromarray(data[:, :, ::-1])}
														
 
															     else:
														
 
															         raise ValueError(
														
@@ -1106,12 +1076,12 @@ def _get_projection_iou(
 
															         x_match_min = max(input_bbox[0], match_bbox[0])
														
 
															         x_match_max = min(input_bbox[2], match_bbox[2])
														
 
															         overlap = max(0, x_match_max - x_match_min)
														
 
															-        input_width = input_bbox[2] - input_bbox[0]
														
 
															+        input_width = min(input_bbox[2] - input_bbox[0], match_bbox[2] - match_bbox[0])
														
 
															     else:
														
 
															         y_match_min = max(input_bbox[1], match_bbox[1])
														
 
															         y_match_max = min(input_bbox[3], match_bbox[3])
														
 
															         overlap = max(0, y_match_max - y_match_min)
														
 
															-        input_width = input_bbox[3] - input_bbox[1]
														
 
															+        input_width = min(input_bbox[3] - input_bbox[1], match_bbox[3] - match_bbox[1])
														
 
															     return overlap / input_width if input_width > 0 else 0.0
														
@@ -1128,29 +1098,26 @@ def _get_sub_category(
 
															     Returns:
														
 
															         List[Dict[str, Any]]: Updated list of blocks with title-text layout information.
														
 
															-        List[float]: List of pre_cuts coordinates.
														
 
															+        Dict[str, List[float]]: Pre-cut coordinates keyed by axis ("x" or "y"); empty if no block qualifies.
														
 
															     """
														
 
															     sub_title_labels = ["paragraph_title"]
														
 
															     vision_labels = ["image", "table", "chart", "figure"]
														
 
															     vision_title_labels = ["figure_title", "chart_title", "table_title"]
														
 
															     all_labels = title_labels + sub_title_labels + vision_labels + vision_title_labels
														
 
															+    special_pre_cut_labels = title_labels + sub_title_labels
														
 
															-    relevant_blocks = [block for block in blocks if block["block_label"] in all_labels]
														
 
															+    min_x = min(block["block_bbox"][0] for block in blocks)
														
 
															+    min_y = min(block["block_bbox"][1] for block in blocks)
														
 
															+    max_x = max(block["block_bbox"][2] for block in blocks)
														
 
															+    max_y = max(block["block_bbox"][3] for block in blocks)
														
 
															+    region_bbox = (min_x, min_y, max_x, max_y)
														
 
															+    region_x_center = (region_bbox[0] + region_bbox[2]) / 2
														
 
															+    region_y_center = (region_bbox[1] + region_bbox[3]) / 2
														
 
															+    region_width = region_bbox[2] - region_bbox[0]
														
 
															+    region_height = region_bbox[3] - region_bbox[1]
														
 
															-    region_bbox = None
														
 
															-    if relevant_blocks:
														
 
															-        min_x = min(block["block_bbox"][0] for block in relevant_blocks)
														
 
															-        min_y = min(block["block_bbox"][1] for block in relevant_blocks)
														
 
															-        max_x = max(block["block_bbox"][2] for block in relevant_blocks)
														
 
															-        max_y = max(block["block_bbox"][3] for block in relevant_blocks)
														
 
															-        region_bbox = (min_x, min_y, max_x, max_y)
														
 
															-        region_x_center = (region_bbox[0] + region_bbox[2]) / 2
														
 
															-        region_y_center = (region_bbox[1] + region_bbox[3]) / 2
														
 
															-        region_width = region_bbox[2] - region_bbox[0]
														
 
															-        region_height = region_bbox[3] - region_bbox[1]
														
 
															-
														
 
															-    pre_cuts = []
														
 
															+    pre_cuts = {}
														
 
															     for i, block1 in enumerate(blocks):
														
 
															         block1.setdefault("title_text", [])
														
@@ -1179,16 +1146,21 @@ def _get_sub_category(
 
															         else:
														
 
															             block_length = y2 - y1
														
 
															             required_length = region_height / 2
														
 
															-        length_condition = block_length > required_length
														
 
															+        if block1["block_label"] in special_pre_cut_labels:
														
 
															+            length_condition = True
														
 
															+        else:
														
 
															+            length_condition = block_length > required_length
														
 
															         # Condition 2: Centered check (block center must lie within tolerance_len of the region center)
														
 
															         block_x_center = (x1 + x2) / 2
														
 
															         block_y_center = (y1 + y2) / 2
														
 
															         tolerance_len = block_length // 5
														
 
															-        is_centered = (
														
 
															-            abs(block_x_center - region_x_center) <= tolerance_len
														
 
															-            and abs(block_y_center - region_y_center) <= tolerance_len
														
 
															-        )
														
 
															+        if block1["block_label"] in special_pre_cut_labels:
														
 
															+            tolerance_len = block_length // 10
														
 
															+        if is_horizontal_1:
														
 
															+            is_centered = abs(block_x_center - region_x_center) <= tolerance_len
														
 
															+        else:
														
 
															+            is_centered = abs(block_y_center - region_y_center) <= tolerance_len
														
 
															         # Condition 3: Check for surrounding text
														
 
															         has_left_text = False
														
@@ -1225,9 +1197,9 @@ def _get_sub_category(
 
															         # Add coordinates if all conditions are met
														
 
															         if is_centered and length_condition and no_text_on_sides:
														
 
															             if is_horizontal_1:
														
 
															-                pre_cuts.append(y1)
														
 
															+                pre_cuts.setdefault("y", []).append(y1)
														
 
															             else:
														
 
															-                pre_cuts.append(x1)
														
 
															+                pre_cuts.setdefault("x", []).append(x1)
														
 
															         for j, block2 in enumerate(blocks):
														
 
															             if i == j:
														
@@ -1257,11 +1229,7 @@ def _get_sub_category(
 
															             block_iou_threshold = 0.1
														
 
															             if block1["block_label"] in sub_title_labels:
														
 
															-                match_block_iou = _calculate_overlap_area_div_minbox_area_ratio(
														
 
															-                    bbox2,
														
 
															-                    bbox1,
														
 
															-                )
														
 
															-                block_iou_threshold = 0.7
														
 
															+                block_iou_threshold = 0.5
														
 
															             if is_horizontal_1:
														
 
															                 if match_block_iou >= block_iou_threshold:
														
@@ -1446,326 +1414,377 @@ def get_layout_ordering(
 
															     )
														
 
															     parsing_res_list, pre_cuts = _get_sub_category(parsing_res_list, title_text_labels)
														
 
															-    doc_flag = False
														
 
															-    median_width = _get_text_median_width(parsing_res_list)
														
 
															-    parsing_res_list, projection_direction = _get_layout_property(
														
 
															-        parsing_res_list,
														
 
															-        median_width,
														
 
															-        no_mask_labels=no_mask_labels,
														
 
															-        threshold=0.3,
														
 
															-    )
														
 
															-    # Convert bounding boxes to float and remove overlaps
														
 
															-    (
														
 
															-        double_text_blocks,
														
 
															-        title_text_blocks,
														
 
															-        title_blocks,
														
 
															-        vision_blocks,
														
 
															-        vision_title_blocks,
														
 
															-        vision_footnote_blocks,
														
 
															-        other_blocks,
														
 
															-    ) = ([], [], [], [], [], [], [])
														
 
															-
														
 
															-    drop_indexes = []
														
 
															-
														
 
															-    for index, block in enumerate(parsing_res_list):
														
 
															-        label = block["sub_label"]
														
 
															-        block["block_bbox"] = list(map(int, block["block_bbox"]))
														
 
															-
														
 
															-        if label == "doc_title":
														
 
															-            doc_flag = True
														
 
															-
														
 
															-        if label in no_mask_labels:
														
 
															-            if block["layout"] == "double":
														
 
															-                double_text_blocks.append(block)
														
 
															+    parsing_res_by_pre_cuts_list = []
														
 
															+    if len(pre_cuts) > 0:
														
 
															+        block_bboxes = [block["block_bbox"] for block in parsing_res_list]
														
 
															+        for axis, cuts in pre_cuts.items():
														
 
															+            axis_index = 1 if axis == "y" else 0
														
 
															+
														
 
															+            max_val = max(bbox[axis_index + 2] for bbox in block_bboxes)
														
 
															+
														
 
															+            intervals = []
														
 
															+            prev = 0
														
 
															+            for cut in sorted(cuts):
														
 
															+                intervals.append((prev, cut))
														
 
															+                prev = cut
														
 
															+            intervals.append((prev, max_val))
														
 
															+
														
 
															+            for start, end in intervals:
														
 
															+                mask = [
														
 
															+                    (bbox[axis_index] >= start) and (bbox[axis_index] < end)
														
 
															+                    for bbox in block_bboxes
														
 
															+                ]
														
 
															+                parsing_res_by_pre_cuts_list.append(
														
 
															+                    [parsing_res_list[i] for i, m in enumerate(mask) if m]
														
 
															+                )
														
 
															+    else:
														
 
															+        parsing_res_by_pre_cuts_list = [parsing_res_list]
														
 
															+
														
 
															+    final_parsing_res_list = []
														
 
															+    num_index = 0
														
 
															+    num_sub_index = 0
														
 
															+    for parsing_res_by_pre_cuts in parsing_res_by_pre_cuts_list:
														
 
															+
														
 
															+        doc_flag = False
														
 
															+        median_width = _get_text_median_width(parsing_res_by_pre_cuts)
														
 
															+        parsing_res_by_pre_cuts, projection_direction = _get_layout_property(
														
 
															+            parsing_res_by_pre_cuts,
														
 
															+            median_width,
														
 
															+            no_mask_labels=no_mask_labels,
														
 
															+            threshold=0.3,
														
 
															+        )
														
 
															+        # Cast bounding boxes to int and group blocks into per-category lists by sub_label
														
 
															+        (
														
 
															+            double_text_blocks,
														
 
															+            title_text_blocks,
														
 
															+            title_blocks,
														
 
															+            vision_blocks,
														
 
															+            vision_title_blocks,
														
 
															+            vision_footnote_blocks,
														
 
															+            other_blocks,
														
 
															+        ) = ([], [], [], [], [], [], [])
														
 
															+
														
 
															+        drop_indexes = []
														
 
															+
														
 
															+        for index, block in enumerate(parsing_res_by_pre_cuts):
														
 
															+            label = block["sub_label"]
														
 
															+            block["block_bbox"] = list(map(int, block["block_bbox"]))
														
 
															+
														
 
															+            if label == "doc_title":
														
 
															+                doc_flag = True
														
 
															+
														
 
															+            if label in no_mask_labels:
														
 
															+                if block["layout"] == "double":
														
 
															+                    double_text_blocks.append(block)
														
 
															+                    drop_indexes.append(index)
														
 
															+            elif label == "title_text":
														
 
															+                title_text_blocks.append(block)
														
 
															+                drop_indexes.append(index)
														
 
															+            elif label == "vision_footnote":
														
 
															+                vision_footnote_blocks.append(block)
														
 
															+                drop_indexes.append(index)
														
 
															+            elif label in vision_title_labels:
														
 
															+                vision_title_blocks.append(block)
														
 
															+                drop_indexes.append(index)
														
 
															+            elif label in title_labels:
														
 
															+                title_blocks.append(block)
														
 
															+                drop_indexes.append(index)
														
 
															+            elif label in vision_labels:
														
 
															+                vision_blocks.append(block)
														
 
															+                drop_indexes.append(index)
														
 
															+            else:
														
 
															+                other_blocks.append(block)
														
 
															                 drop_indexes.append(index)
														
 
															-        elif label == "title_text":
														
 
															-            title_text_blocks.append(block)
														
 
															-            drop_indexes.append(index)
														
 
															-        elif label == "vision_footnote":
														
 
															-            vision_footnote_blocks.append(block)
														
 
															-            drop_indexes.append(index)
														
 
															-        elif label in vision_title_labels:
														
 
															-            vision_title_blocks.append(block)
														
 
															-            drop_indexes.append(index)
														
 
															-        elif label in title_labels:
														
 
															-            title_blocks.append(block)
														
 
															-            drop_indexes.append(index)
														
 
															-        elif label in vision_labels:
														
 
															-            vision_blocks.append(block)
														
 
															-            drop_indexes.append(index)
														
 
															-        else:
														
 
															-            other_blocks.append(block)
														
 
															-            drop_indexes.append(index)
														
 
															-
														
 
															-    for index in sorted(drop_indexes, reverse=True):
														
 
															-        del parsing_res_list[index]
														
 
															-
														
 
															-    if len(parsing_res_list) > 0:
														
 
															-        # single text label
														
 
															-        if len(double_text_blocks) > len(parsing_res_list) or projection_direction:
														
 
															-            parsing_res_list.extend(title_blocks + double_text_blocks)
														
 
															-            title_blocks = []
														
 
															-            double_text_blocks = []
														
 
															-            block_bboxes = [block["block_bbox"] for block in parsing_res_list]
														
 
															-            block_bboxes.sort(
														
 
															-                key=lambda x: (
														
 
															-                    x[0] // max(20, median_width),
														
 
															-                    x[1],
														
 
															-                ),
														
 
															-            )
														
 
															-            block_bboxes = np.array(block_bboxes)
														
 
															-            sorted_indices = sort_by_xycut(
														
 
															-                block_bboxes, direction=1, min_gap=1, pre_cuts={"x": pre_cuts}
														
 
															-            )
														
 
															-        else:
														
 
															-            block_bboxes = [block["block_bbox"] for block in parsing_res_list]
														
 
															-            block_bboxes.sort(key=lambda x: (x[0] // 20, x[1]))
														
 
															-            block_bboxes = np.array(block_bboxes)
														
 
															-            sorted_indices = sort_by_xycut(
														
 
															-                block_bboxes, direction=0, min_gap=20, pre_cuts={"y": pre_cuts}
														
 
															-            )
														
 
															-        sorted_boxes = block_bboxes[sorted_indices].tolist()
														
 
															-
														
 
															-        for block in parsing_res_list:
														
 
															-            block["index"] = sorted_boxes.index(block["block_bbox"]) + 1
														
 
															-            block["sub_index"] = sorted_boxes.index(block["block_bbox"]) + 1
														
 
															-
														
 
															-    def nearest_match_(input_blocks, distance_type="manhattan", is_add_index=True):
														
 
															-        for block in input_blocks:
														
 
															-            bbox = block["block_bbox"]
														
 
															-            min_distance = float("inf")
														
 
															-            min_distance_config = [
														
 
															-                [float("inf"), float("inf")],
														
 
															-                float("inf"),
														
 
															-                float("inf"),
														
 
															-            ]  # for double text
														
 
															-            nearest_gt_index = 0
														
 
															-            for match_block in parsing_res_list:
														
 
															-                match_bbox = match_block["block_bbox"]
														
 
															-                if distance_type == "nearest_iou_edge_distance":
														
 
															-                    distance, min_distance_config = _nearest_iou_edge_distance(
														
 
															-                        bbox,
														
 
															-                        match_bbox,
														
 
															-                        block["sub_label"],
														
 
															-                        vision_labels=vision_labels,
														
 
															-                        no_mask_labels=no_mask_labels,
														
 
															-                        median_width=median_width,
														
 
															-                        title_labels=title_labels,
														
 
															-                        title_text=block["title_text"],
														
 
															-                        sub_title=block["sub_title"],
														
 
															-                        min_distance_config=min_distance_config,
														
 
															-                        tolerance_len=10,
														
 
															-                    )
														
 
															-                elif distance_type == "title_text":
														
 
															-                    if (
														
 
															-                        match_block["block_label"] in title_labels + ["abstract"]
														
 
															-                        and match_block["title_text"] != []
														
 
															-                    ):
														
 
															-                        iou_left_up = _calculate_overlap_area_div_minbox_area_ratio(
														
 
															-                            bbox,
														
 
															-                            match_block["title_text"][0][1],
														
 
															-                        )
														
 
															-                        iou_right_down = _calculate_overlap_area_div_minbox_area_ratio(
														
 
															-                            bbox,
														
 
															-                            match_block["title_text"][-1][1],
														
 
															-                        )
														
 
															-                        iou = 1 - max(iou_left_up, iou_right_down)
														
 
															-                        distance = _manhattan_distance(bbox, match_bbox) * iou
														
 
															-                    else:
														
 
															-                        distance = float("inf")
														
 
															-                elif distance_type == "manhattan":
														
 
															-                    distance = _manhattan_distance(bbox, match_bbox)
														
 
															-                elif distance_type == "vision_footnote":
														
 
															-                    if (
														
 
															-                        match_block["block_label"] in vision_labels
														
 
															-                        and match_block["vision_footnote"] != []
														
 
															-                    ):
														
 
															-                        iou_left_up = _calculate_overlap_area_div_minbox_area_ratio(
														
 
															-                            bbox,
														
 
															-                            match_block["vision_footnote"][0],
														
 
															-                        )
														
 
															-                        iou_right_down = _calculate_overlap_area_div_minbox_area_ratio(
														
 
															+        for index in sorted(drop_indexes, reverse=True):
														
 
															+            del parsing_res_by_pre_cuts[index]
														
 
															+
														
 
															+        if len(parsing_res_by_pre_cuts) > 0:
														
 
															+            # single text label
														
 
															+            if (
														
 
															+                len(double_text_blocks) > len(parsing_res_by_pre_cuts)
														
 
															+                or projection_direction
														
 
															+            ):
														
 
															+                parsing_res_by_pre_cuts.extend(title_blocks + double_text_blocks)
														
 
															+                title_blocks = []
														
 
															+                double_text_blocks = []
														
 
															+                block_bboxes = [
														
 
															+                    block["block_bbox"] for block in parsing_res_by_pre_cuts
														
 
															+                ]
														
 
															+                block_bboxes.sort(
														
 
															+                    key=lambda x: (
														
 
															+                        x[0] // max(20, median_width),
														
 
															+                        x[1],
														
 
															+                    ),
														
 
															+                )
														
 
															+                block_bboxes = np.array(block_bboxes)
														
 
															+                sorted_indices = sort_by_xycut(block_bboxes, direction=1, min_gap=1)
														
 
															+            else:
														
 
															+                block_bboxes = [
														
 
															+                    block["block_bbox"] for block in parsing_res_by_pre_cuts
														
 
															+                ]
														
 
															+                block_bboxes.sort(key=lambda x: (x[0] // 20, x[1]))
														
 
															+                block_bboxes = np.array(block_bboxes)
														
 
															+                sorted_indices = sort_by_xycut(block_bboxes, direction=0, min_gap=20)
														
 
															+
														
 
															+            sorted_boxes = block_bboxes[sorted_indices].tolist()
														
 
															+
														
 
															+            for block in parsing_res_by_pre_cuts:
														
 
															+                block["index"] = num_index + sorted_boxes.index(block["block_bbox"]) + 1
														
 
															+                block["sub_index"] = (
														
 
															+                    num_sub_index + sorted_boxes.index(block["block_bbox"]) + 1
														
 
															+                )
														
 
															+
														
 
															+        def nearest_match_(input_blocks, distance_type="manhattan", is_add_index=True):
														
 
															+            for block in input_blocks:
														
 
															+                bbox = block["block_bbox"]
														
 
															+                min_distance = float("inf")
														
 
															+                min_distance_config = [
														
 
															+                    [float("inf"), float("inf")],
														
 
															+                    float("inf"),
														
 
															+                    float("inf"),
														
 
															+                ]  # for double text
														
 
															+                nearest_gt_index = 0
														
 
															+                for match_block in parsing_res_by_pre_cuts:
														
 
															+                    match_bbox = match_block["block_bbox"]
														
 
															+                    if distance_type == "nearest_iou_edge_distance":
														
 
															+                        distance, min_distance_config = _nearest_iou_edge_distance(
														
 
															                             bbox,
														
 
															-                            match_block["vision_footnote"][-1],
														
 
															-                        )
														
 
															-                        iou = 1 - max(iou_left_up, iou_right_down)
														
 
															-                        distance = _manhattan_distance(bbox, match_bbox) * iou
														
 
															-                    else:
														
 
															-                        distance = float("inf")
														
 
															-                elif distance_type == "vision_body":
														
 
															-                    if (
														
 
															-                        match_block["block_label"] in vision_title_labels
														
 
															-                        and block["vision_footnote"] != []
														
 
															-                    ):
														
 
															-                        iou_left_up = _calculate_overlap_area_div_minbox_area_ratio(
														
 
															-                            match_bbox,
														
 
															-                            block["vision_footnote"][0],
														
 
															-                        )
														
 
															-                        iou_right_down = _calculate_overlap_area_div_minbox_area_ratio(
														
 
															                             match_bbox,
														
 
															-                            block["vision_footnote"][-1],
														
 
															+                            block["sub_label"],
														
 
															+                            vision_labels=vision_labels,
														
 
															+                            no_mask_labels=no_mask_labels,
														
 
															+                            median_width=median_width,
														
 
															+                            title_labels=title_labels,
														
 
															+                            title_text=block["title_text"],
														
 
															+                            sub_title=block["sub_title"],
														
 
															+                            min_distance_config=min_distance_config,
														
 
															+                            tolerance_len=10,
														
 
															                         )
														
 
															-                        iou = 1 - max(iou_left_up, iou_right_down)
														
 
															-                        distance = _manhattan_distance(bbox, match_bbox) * iou
														
 
															+                    elif distance_type == "title_text":
														
 
															+                        if (
														
 
															+                            match_block["block_label"] in title_labels + ["abstract"]
														
 
															+                            and match_block["title_text"] != []
														
 
															+                        ):
														
 
															+                            iou_left_up = _calculate_overlap_area_div_minbox_area_ratio(
														
 
															+                                bbox,
														
 
															+                                match_block["title_text"][0][1],
														
 
															+                            )
														
 
															+                            iou_right_down = (
														
 
															+                                _calculate_overlap_area_div_minbox_area_ratio(
														
 
															+                                    bbox,
														
 
															+                                    match_block["title_text"][-1][1],
														
 
															+                                )
														
 
															+                            )
														
 
															+                            iou = 1 - max(iou_left_up, iou_right_down)
														
 
															+                            distance = _manhattan_distance(bbox, match_bbox) * iou
														
 
															+                        else:
														
 
															+                            distance = float("inf")
														
 
															+                    elif distance_type == "manhattan":
														
 
															+                        distance = _manhattan_distance(bbox, match_bbox)
														
 
															+                    elif distance_type == "vision_footnote":
														
 
															+                        if (
														
 
															+                            match_block["block_label"] in vision_labels
														
 
															+                            and match_block["vision_footnote"] != []
														
 
															+                        ):
														
 
															+                            iou_left_up = _calculate_overlap_area_div_minbox_area_ratio(
														
 
															+                                bbox,
														
 
															+                                match_block["vision_footnote"][0],
														
 
															+                            )
														
 
															+                            iou_right_down = (
														
 
															+                                _calculate_overlap_area_div_minbox_area_ratio(
														
 
															+                                    bbox,
														
 
															+                                    match_block["vision_footnote"][-1],
														
 
															+                                )
														
 
															+                            )
														
 
															+                            iou = 1 - max(iou_left_up, iou_right_down)
														
 
															+                            distance = _manhattan_distance(bbox, match_bbox) * iou
														
 
															+                        else:
														
 
															+                            distance = float("inf")
														
 
															+                    elif distance_type == "vision_body":
														
 
															+                        if (
														
 
															+                            match_block["block_label"] in vision_title_labels
														
 
															+                            and block["vision_footnote"] != []
														
 
															+                        ):
														
 
															+                            iou_left_up = _calculate_overlap_area_div_minbox_area_ratio(
														
 
															+                                match_bbox,
														
 
															+                                block["vision_footnote"][0],
														
 
															+                            )
														
 
															+                            iou_right_down = (
														
 
															+                                _calculate_overlap_area_div_minbox_area_ratio(
														
 
															+                                    match_bbox,
														
 
															+                                    block["vision_footnote"][-1],
														
 
															+                                )
														
 
															+                            )
														
 
															+                            iou = 1 - max(iou_left_up, iou_right_down)
														
 
															+                            distance = _manhattan_distance(bbox, match_bbox) * iou
														
 
															+                        else:
														
 
															+                            distance = float("inf")
														
 
															                     else:
														
 
															-                        distance = float("inf")
														
 
															+                        raise NotImplementedError
														
 
															+
														
 
															+                    if distance < min_distance:
														
 
															+                        min_distance = distance
														
 
															+                        if is_add_index:
														
 
															+                            nearest_gt_index = match_block.get("index", 999)
														
 
															+                        else:
														
 
															+                            nearest_gt_index = match_block.get("sub_index", 999)
														
 
															+
														
 
															+                if is_add_index:
														
 
															+                    block["index"] = nearest_gt_index
														
 
															                 else:
														
 
															-                    raise NotImplementedError
														
 
															+                    block["sub_index"] = nearest_gt_index
														
 
															-                if distance < min_distance:
														
 
															-                    min_distance = distance
														
 
															-                    if is_add_index:
														
 
															-                        nearest_gt_index = match_block.get("index", 999)
														
 
															-                    else:
														
 
															-                        nearest_gt_index = match_block.get("sub_index", 999)
														
 
															+                parsing_res_by_pre_cuts.append(block)
														
 
															-            if is_add_index:
														
 
															-                block["index"] = nearest_gt_index
														
 
															-            else:
														
 
															-                block["sub_index"] = nearest_gt_index
														
 
															+        # double text label
														
 
															+        double_text_blocks.sort(
														
 
															+            key=lambda x: (
														
 
															+                x["block_bbox"][1] // 10,
														
 
															+                x["block_bbox"][0] // median_width,
														
 
															+                x["block_bbox"][1] ** 2 + x["block_bbox"][0] ** 2,
														
 
															+            ),
														
 
															+        )
														
 
															+        nearest_match_(
														
 
															+            double_text_blocks,
														
 
															+            distance_type="nearest_iou_edge_distance",
														
 
															+        )
														
 
															+        parsing_res_by_pre_cuts.sort(
														
 
															+            key=lambda x: (x["index"], x["block_bbox"][1], x["block_bbox"][0]),
														
 
															+        )
														
 
															-            parsing_res_list.append(block)
														
 
															+        for idx, block in enumerate(parsing_res_by_pre_cuts):
														
 
															+            block["index"] = num_index + idx + 1
														
 
															+            block["sub_index"] = num_sub_index + idx + 1
														
 
															-    # double text label
														
 
															-    double_text_blocks.sort(
														
 
															-        key=lambda x: (
														
 
															-            x["block_bbox"][1] // 10,
														
 
															-            x["block_bbox"][0] // median_width,
														
 
															-            x["block_bbox"][1] ** 2 + x["block_bbox"][0] ** 2,
														
 
															-        ),
														
 
															-    )
														
 
															-    nearest_match_(
														
 
															-        double_text_blocks,
														
 
															-        distance_type="nearest_iou_edge_distance",
														
 
															-    )
														
 
															-    parsing_res_list.sort(
														
 
															-        key=lambda x: (x["index"], x["block_bbox"][1], x["block_bbox"][0]),
														
 
															-    )
														
 
															+        # title label
														
 
															+        title_blocks.sort(
														
 
															+            key=lambda x: (
														
 
															+                x["block_bbox"][1] // 10,
														
 
															+                x["block_bbox"][0] // median_width,
														
 
															+                x["block_bbox"][1] ** 2 + x["block_bbox"][0] ** 2,
														
 
															+            ),
														
 
															+        )
														
 
															+        nearest_match_(title_blocks, distance_type="nearest_iou_edge_distance")
														
 
															+
														
 
															+        if doc_flag:
														
 
															+            text_sort_labels = ["doc_title"]
														
 
															+            text_label_priority = {
														
 
															+                label: priority for priority, label in enumerate(text_sort_labels)
														
 
															+            }
														
 
															+            doc_titles = []
														
 
															+            for i, block in enumerate(parsing_res_by_pre_cuts):
														
 
															+                if block["block_label"] == "doc_title":
														
 
															+                    doc_titles.append(
														
 
															+                        (i, block["block_bbox"][1], block["block_bbox"][0]),
														
 
															+                    )
														
 
															+            doc_titles.sort(key=lambda x: (x[1], x[2]))
														
 
															+            first_doc_title_index = doc_titles[0][0]
														
 
															+            parsing_res_by_pre_cuts[first_doc_title_index]["index"] = 1
														
 
															+            parsing_res_by_pre_cuts.sort(
														
 
															+                key=lambda x: (
														
 
															+                    x["index"],
														
 
															+                    text_label_priority.get(x["block_label"], 9999),
														
 
															+                    x["block_bbox"][1],
														
 
															+                    x["block_bbox"][0],
														
 
															+                ),
														
 
															+            )
														
 
															+        else:
														
 
															+            parsing_res_by_pre_cuts.sort(
														
 
															+                key=lambda x: (
														
 
															+                    x["index"],
														
 
															+                    x["block_bbox"][1],
														
 
															+                    x["block_bbox"][0],
														
 
															+                ),
														
 
															+            )
														
 
															-    for idx, block in enumerate(parsing_res_list):
														
 
															-        block["index"] = idx + 1
														
 
															-        block["sub_index"] = idx + 1
														
 
															+        for idx, block in enumerate(parsing_res_by_pre_cuts):
														
 
															+            block["index"] = num_index + idx + 1
														
 
															+            block["sub_index"] = num_sub_index + idx + 1
														
 
															-    # title label
														
 
															-    title_blocks.sort(
														
 
															-        key=lambda x: (
														
 
															-            x["block_bbox"][1] // 10,
														
 
															-            x["block_bbox"][0] // median_width,
														
 
															-            x["block_bbox"][1] ** 2 + x["block_bbox"][0] ** 2,
														
 
															-        ),
														
 
															-    )
														
 
															-    nearest_match_(title_blocks, distance_type="nearest_iou_edge_distance")
														
 
															+        # title-text label
														
 
															+        nearest_match_(title_text_blocks, distance_type="title_text")
														
 
															-    if doc_flag:
														
 
															-        text_sort_labels = ["doc_title"]
														
 
															-        text_label_priority = {
														
 
															-            label: priority for priority, label in enumerate(text_sort_labels)
														
 
															-        }
														
 
															-        doc_titles = []
														
 
															-        for i, block in enumerate(parsing_res_list):
														
 
															-            if block["block_label"] == "doc_title":
														
 
															-                doc_titles.append(
														
 
															-                    (i, block["block_bbox"][1], block["block_bbox"][0]),
														
 
															-                )
														
 
															-        doc_titles.sort(key=lambda x: (x[1], x[2]))
														
 
															-        first_doc_title_index = doc_titles[0][0]
														
 
															-        parsing_res_list[first_doc_title_index]["index"] = 1
														
 
															-        parsing_res_list.sort(
														
 
															+        def hor_tb_and_ver_lr(x):
														
 
															+            input_bbox = x["block_bbox"]
														
 
															+            is_horizontal = _get_bbox_direction(input_bbox)
														
 
															+            if is_horizontal:
														
 
															+                return input_bbox[1]
														
 
															+            else:
														
 
															+                return input_bbox[0]
														
 
															+
														
 
															+        parsing_res_by_pre_cuts.sort(
														
 
															+            key=lambda x: (x["index"], hor_tb_and_ver_lr(x)),
														
 
															+        )
														
 
															+
														
 
															+        for idx, block in enumerate(parsing_res_by_pre_cuts):
														
 
															+            block["index"] = num_index + idx + 1
														
 
															+            block["sub_index"] = num_sub_index + idx + 1
														
 
															+
														
 
															+        # image,figure,chart,seal label
														
 
															+        nearest_match_(
														
 
															+            vision_blocks,
														
 
															+            distance_type="nearest_iou_edge_distance",
														
 
															+            is_add_index=False,
														
 
															+        )
														
 
															+        parsing_res_by_pre_cuts.sort(
														
 
															             key=lambda x: (
														
 
															-                x["index"],
														
 
															-                text_label_priority.get(x["block_label"], 9999),
														
 
															+                x["sub_index"],
														
 
															                 x["block_bbox"][1],
														
 
															                 x["block_bbox"][0],
														
 
															             ),
														
 
															         )
														
 
															-    else:
														
 
															-        parsing_res_list.sort(
														
 
															+
														
 
															+        for idx, block in enumerate(parsing_res_by_pre_cuts):
														
 
															+            block["sub_index"] = num_sub_index + idx + 1
														
 
															+
														
 
															+        # image,figure,chart,seal title label
														
 
															+        nearest_match_(
														
 
															+            vision_title_blocks,
														
 
															+            distance_type="nearest_iou_edge_distance",
														
 
															+            is_add_index=False,
														
 
															+        )
														
 
															+        parsing_res_by_pre_cuts.sort(
														
 
															             key=lambda x: (
														
 
															-                x["index"],
														
 
															+                x["sub_index"],
														
 
															                 x["block_bbox"][1],
														
 
															                 x["block_bbox"][0],
														
 
															             ),
														
 
															         )
														
 
															-    for idx, block in enumerate(parsing_res_list):
														
 
															-        block["index"] = idx + 1
														
 
															-        block["sub_index"] = idx + 1
														
 
															-
														
 
															-    # title-text label
														
 
															-    nearest_match_(title_text_blocks, distance_type="title_text")
														
 
															-    text_sort_labels = ["doc_title", "paragraph_title", "title_text"]
														
 
															-    text_label_priority = {
														
 
															-        label: priority for priority, label in enumerate(text_sort_labels)
														
 
															-    }
														
 
															-    parsing_res_list.sort(
														
 
															-        key=lambda x: (
														
 
															-            x["index"],
														
 
															-            text_label_priority.get(x["sub_label"], 9999),
														
 
															-            x["block_bbox"][1],
														
 
															-            x["block_bbox"][0],
														
 
															-        ),
														
 
															-    )
														
 
															+        for idx, block in enumerate(parsing_res_by_pre_cuts):
														
 
															+            block["sub_index"] = num_sub_index + idx + 1
														
 
															-    for idx, block in enumerate(parsing_res_list):
														
 
															-        block["index"] = idx + 1
														
 
															-        block["sub_index"] = idx + 1
														
 
															-
														
 
															-    # image,figure,chart,seal label
														
 
															-    nearest_match_(
														
 
															-        vision_blocks,
														
 
															-        distance_type="nearest_iou_edge_distance",
														
 
															-        is_add_index=False,
														
 
															-    )
														
 
															-    parsing_res_list.sort(
														
 
															-        key=lambda x: (
														
 
															-            x["sub_index"],
														
 
															-            x["block_bbox"][1],
														
 
															-            x["block_bbox"][0],
														
 
															-        ),
														
 
															-    )
														
 
															-
														
 
															-    for idx, block in enumerate(parsing_res_list):
														
 
															-        block["sub_index"] = idx + 1
														
 
															-
														
 
															-    # image,figure,chart,seal title label
														
 
															-    nearest_match_(
														
 
															-        vision_title_blocks,
														
 
															-        distance_type="nearest_iou_edge_distance",
														
 
															-        is_add_index=False,
														
 
															-    )
														
 
															-    parsing_res_list.sort(
														
 
															-        key=lambda x: (
														
 
															-            x["sub_index"],
														
 
															-            x["block_bbox"][1],
														
 
															-            x["block_bbox"][0],
														
 
															-        ),
														
 
															-    )
														
 
															+        # vision footnote label
														
 
															+        nearest_match_(
														
 
															+            vision_footnote_blocks,
														
 
															+            distance_type="vision_footnote",
														
 
															+            is_add_index=False,
														
 
															+        )
														
 
															+        text_label_priority = {"vision_footnote": 9999}
														
 
															+        parsing_res_by_pre_cuts.sort(
														
 
															+            key=lambda x: (
														
 
															+                x["sub_index"],
														
 
															+                text_label_priority.get(x["sub_label"], 0),
														
 
															+                x["block_bbox"][1],
														
 
															+                x["block_bbox"][0],
														
 
															+            ),
														
 
															+        )
														
 
															-    for idx, block in enumerate(parsing_res_list):
														
 
															-        block["sub_index"] = idx + 1
														
 
															+        for idx, block in enumerate(parsing_res_by_pre_cuts):
														
 
															+            block["sub_index"] = num_sub_index + idx + 1
														
 
															-    # vision footnote label
														
 
															-    nearest_match_(
														
 
															-        vision_footnote_blocks,
														
 
															-        distance_type="vision_footnote",
														
 
															-        is_add_index=False,
														
 
															-    )
														
 
															-    text_label_priority = {"vision_footnote": 9999}
														
 
															-    parsing_res_list.sort(
														
 
															-        key=lambda x: (
														
 
															-            x["sub_index"],
														
 
															-            text_label_priority.get(x["sub_label"], 0),
														
 
															-            x["block_bbox"][1],
														
 
															-            x["block_bbox"][0],
														
 
															-        ),
														
 
															-    )
														
 
															+        # header、footnote、header_image... label
														
 
															+        nearest_match_(other_blocks, distance_type="manhattan", is_add_index=False)
														
 
															-    for idx, block in enumerate(parsing_res_list):
														
 
															-        block["sub_index"] = idx + 1
														
 
															+        # add all parsing result
														
 
															+        final_parsing_res_list.extend(parsing_res_by_pre_cuts)
														
 
															-    # header、footnote、header_image... label
														
 
															-    nearest_match_(other_blocks, distance_type="manhattan", is_add_index=False)
														
 
															+        # update num index
														
 
															+        num_sub_index += len(parsing_res_by_pre_cuts)
														
 
															+        for parsing_res in parsing_res_by_pre_cuts:
														
 
															+            if parsing_res.get("index"):
														
 
															+                num_index += 1
														
 
															     parsing_res_list = [
														
 
															         {
														
@@ -1779,7 +1798,7 @@ def get_layout_ordering(
 
															             "sub_index": parsing_res["sub_index"],
														
 
															             "index": parsing_res.get("index", None),
														
 
															         }
														
 
															-        for parsing_res in parsing_res_list
														
 
															+        for parsing_res in final_parsing_res_list
														
 
															     ]
														
 
															     return parsing_res_list
														
@@ -1969,7 +1988,7 @@ def _nearest_edge_distance(
 
															         else:
														
 
															             distance_y = distance[2] * weight[2]
														
 
															         if label in no_mask_labels:
														
 
															-            distance_y = max(0.1, distance_y) * 100
														
 
															+            distance_y = max(0.1, distance_y) * 10  # for abstract
														
 
															     # input_bbox is below match_bbox
														
 
															     elif y1 > y2_prime:
														
 
															         direction_num += 1
														
@@ -2071,33 +2090,11 @@ def _nearest_iou_edge_distance(
 
															         or _get_projection_iou(input_bbox, match_bbox, horizontal1) < 0.01
														
 
															     ):
														
 
															         iou_distance = 1
														
 
															-    elif label == "doc_title" or (label in title_labels and title_text):
														
 
															+
														
 
															+    if label == "doc_title":
														
 
															         # Calculate distance for titles
														
 
															         disperse = max(1, median_width)
														
 
															-        width = x2 - x1
														
 
															-        height = y2 - y1
														
 
															-        if horizontal1:
														
 
															-            return (
														
 
															-                _calculate_horizontal_distance(
														
 
															-                    input_bbox,
														
 
															-                    match_bbox,
														
 
															-                    height,
														
 
															-                    disperse,
														
 
															-                    title_text,
														
 
															-                ),
														
 
															-                min_distance_config,
														
 
															-            )
														
 
															-        else:
														
 
															-            return (
														
 
															-                _calculate_vertical_distance(
														
 
															-                    input_bbox,
														
 
															-                    match_bbox,
														
 
															-                    width,
														
 
															-                    disperse,
														
 
															-                    title_text,
														
 
															-                ),
														
 
															-                min_distance_config,
														
 
															-            )
														
 
															+        tolerance_len = max(tolerance_len, disperse)
														
 
															     # Adjust input_bbox based on sub_title
														
 
															     if sub_title:
														
@@ -2105,19 +2102,36 @@ def _nearest_iou_edge_distance(
 
															             x1_, y1_, x2_, y2_ = sub
														
 
															             x1, y1, x2, y2 = (
														
 
															                 min(x1, x1_),
														
 
															-                min(
														
 
															-                    y1,
														
 
															-                    y1_,
														
 
															-                ),
														
 
															-                max(x2, x2_),
														
 
															+                min(y1, y1_),
														
 
															+                min(x2, x2_),
														
 
															                 max(y2, y2_),
														
 
															             )
														
 
															         input_bbox = [x1, y1, x2, y2]
														
 
															+    if title_text:
														
 
															+        for sub in title_text:
														
 
															+            x1_, y1_, x2_, y2_ = sub[1]
														
 
															+            if horizontal1:
														
 
															+                x1, y1, x2, y2 = (
														
 
															+                    min(x1, x1_),
														
 
															+                    min(y1, y1_),
														
 
															+                    min(x2, x2_),
														
 
															+                    max(y2, y2_),
														
 
															+                )
														
 
															+            else:
														
 
															+                x1, y1, x2, y2 = (
														
 
															+                    min(x1, x1_),
														
 
															+                    min(y1, y1_),
														
 
															+                    max(x2, x2_),
														
 
															+                    min(y2, y2_),
														
 
															+                )
														
 
															+        input_bbox = [x1, y1, x2, y2]
														
 
															+
														
 
															     # Calculate edge distance
														
 
															     weight = _get_weights(label, horizontal1)
														
 
															     if label == "abstract":
														
 
															-        tolerance_len *= 3
														
 
															+        tolerance_len *= 2
														
 
															+
														
 
															     edge_distance, edge_distance_config = _nearest_edge_distance(
														
 
															         input_bbox,
														
 
															         match_bbox,
														
@@ -2129,13 +2143,13 @@ def _nearest_iou_edge_distance(
 
															     )
														
 
															     # Weights for combining distances
														
 
															-    iou_edge_weight = [10**6, 10**3, 1, 0.001]
														
 
															+    iou_edge_weight = [10**8, 10**4, 1, 0.0001]
														
 
															     # Calculate up and left edge distances
														
 
															     up_edge_distance = y1_prime
														
 
															     left_edge_distance = x1_prime
														
 
															     if (
														
 
															-        label in no_mask_labels or label == "paragraph_title" or label in vision_labels
														
 
															+        label in no_mask_labels or label in title_labels or label in vision_labels
														
 
															     ) and y1 > y2_prime:
														
 
															         up_edge_distance = -y2_prime
														
 
															         left_edge_distance = -x2_prime
														
@@ -2155,12 +2169,12 @@ def _nearest_iou_edge_distance(
 
															     # Update minimum distance configuration if a smaller distance is found
														
 
															     if total_distance > distance:
														
 
															         edge_distance_config = [
														
 
															-            min(min_edge_distance_config[0], edge_distance_config[0]),
														
 
															-            min(min_edge_distance_config[1], edge_distance_config[1]),
														
 
															+            edge_distance_config[0],
														
 
															+            edge_distance_config[1],
														
 
															         ]
														
 
															         min_distance_config = [
														
 
															             edge_distance_config,
														
 
															-            min(up_edge_distance, up_edge_distances_config),
														
 
															+            up_edge_distance,
														
 
															             distance,
														
 
															         ]