před 8 měsíci · de05aaac84
--- a/paddlex/inference/pipelines/layout_parsing/utils.py
+++ b/paddlex/inference/pipelines/layout_parsing/utils.py
@@ -760,7 +760,6 @@ def sort_by_xycut(
 
				     block_bboxes: Union[np.ndarray, List[List[int]]],
			
 
				     direction: int = 0,
			
 
				     min_gap: int = 1,
			
 
				-    pre_cuts: Optional[Dict[str, List[int]]] = None,
			
 
				 ) -> List[int]:
			
 
				     """
			
 
				     Sort bounding boxes using recursive XY cut method based on the specified direction.
			
@@ -772,56 +771,26 @@ def sort_by_xycut(
 
				         direction (int): Direction for the initial cut. Use 1 for Y-axis first and 0 for X-axis first.
			
 
				                          Defaults to 0.
			
 
				         min_gap (int): Minimum gap width to consider a separation between segments. Defaults to 1.
			
 
				-        pre_cuts (Optional[Dict[str, List[int]]]): A dictionary specifying pre-cut points along the axes.
			
 
				-                                                  The keys are 'x' or 'y', representing the axis to pre-cut,
			
 
				-                                                  and the values are lists of integers specifying the cut points.
			
 
				-                                                  For example, {'y': [100, 200]} will pre-cut the y-axis at
			
 
				-                                                  positions 100 and 200 before applying the main XY cut algorithm.
			
 
				-                                                  Defaults to None.
			
 
				 
			
 
				     Returns:
			
 
				         List[int]: A list of indices representing the order of sorted bounding boxes.
			
 
				     """
			
 
				     block_bboxes = np.asarray(block_bboxes).astype(int)
			
 
				     res = []
			
 
				-    axis = "x" if direction == 1 else "y"
			
 
				-    if len(pre_cuts[axis]) > 0:
			
 
				-        cuts = sorted(pre_cuts[axis])
			
 
				-        axis_index = 1 if axis == "y" else 0
			
 
				-        max_val = block_bboxes[:, 3].max() if axis == "y" else block_bboxes[:, 2].max()
			
 
				-        intervals = []
			
 
				-        prev = 0
			
 
				-        for cut in cuts:
			
 
				-            intervals.append((prev, cut))
			
 
				-            prev = cut
			
 
				-        intervals.append((prev, max_val))
			
 
				-        for start, end in intervals:
			
 
				-            mask = (block_bboxes[:, axis_index] >= start) & (
			
 
				-                block_bboxes[:, axis_index] < end
			
 
				-            )
			
 
				-            sub_boxes = block_bboxes[mask]
			
 
				-            sub_indices = np.arange(len(block_bboxes))[mask].tolist()
			
 
				-            if len(sub_boxes) > 0:
			
 
				-                if direction == 1:
			
 
				-                    _recursive_yx_cut(sub_boxes, sub_indices, res, min_gap)
			
 
				-                else:
			
 
				-                    _recursive_xy_cut(sub_boxes, sub_indices, res, min_gap)
			
 
				+    if direction == 1:
			
 
				+        _recursive_yx_cut(
			
 
				+            block_bboxes,
			
 
				+            np.arange(len(block_bboxes)).tolist(),
			
 
				+            res,
			
 
				+            min_gap,
			
 
				+        )
			
 
				     else:
			
 
				-        if direction == 1:
			
 
				-            _recursive_yx_cut(
			
 
				-                block_bboxes,
			
 
				-                np.arange(len(block_bboxes)).tolist(),
			
 
				-                res,
			
 
				-                min_gap,
			
 
				-            )
			
 
				-        else:
			
 
				-            _recursive_xy_cut(
			
 
				-                block_bboxes,
			
 
				-                np.arange(len(block_bboxes)).tolist(),
			
 
				-                res,
			
 
				-                min_gap,
			
 
				-            )
			
 
				-
			
 
				+        _recursive_xy_cut(
			
 
				+            block_bboxes,
			
 
				+            np.arange(len(block_bboxes)).tolist(),
			
 
				+            res,
			
 
				+            min_gap,
			
 
				+        )
			
 
				     return res
			
 
				 
			
 
				 
			
@@ -842,6 +811,7 @@ def _img_array2path(data: np.ndarray) -> str:
 
				     if isinstance(data, np.ndarray) and data.ndim == 3:
			
 
				         # Generate a unique filename using UUID
			
 
				         img_name = f"image_{uuid.uuid4().hex}.png"
			
 
				+
			
 
				         return {f"imgs/{img_name}": Image.fromarray(data[:, :, ::-1])}
			
 
				     else:
			
 
				         raise ValueError(
			
@@ -1106,12 +1076,12 @@ def _get_projection_iou(
 
				         x_match_min = max(input_bbox[0], match_bbox[0])
			
 
				         x_match_max = min(input_bbox[2], match_bbox[2])
			
 
				         overlap = max(0, x_match_max - x_match_min)
			
 
				-        input_width = input_bbox[2] - input_bbox[0]
			
 
				+        input_width = min(input_bbox[2] - input_bbox[0], match_bbox[2] - match_bbox[0])
			
 
				     else:
			
 
				         y_match_min = max(input_bbox[1], match_bbox[1])
			
 
				         y_match_max = min(input_bbox[3], match_bbox[3])
			
 
				         overlap = max(0, y_match_max - y_match_min)
			
 
				-        input_width = input_bbox[3] - input_bbox[1]
			
 
				+        input_width = min(input_bbox[3] - input_bbox[1], match_bbox[3] - match_bbox[1])
			
 
				 
			
 
				     return overlap / input_width if input_width > 0 else 0.0
			
 
				 
			
@@ -1128,29 +1098,26 @@ def _get_sub_category(
 
				 
			
 
				     Returns:
			
 
				         List[Dict[str, Any]]: Updated list of blocks with title-text layout information.
			
 
				-        List[float]: List of pre_cuts coordinates.
			
 
				+        Dict[str, List[float]]: Pre-cut coordinates keyed by axis ("x" or "y").
			
 
				     """
			
 
				 
			
 
				     sub_title_labels = ["paragraph_title"]
			
 
				     vision_labels = ["image", "table", "chart", "figure"]
			
 
				     vision_title_labels = ["figure_title", "chart_title", "table_title"]
			
 
				     all_labels = title_labels + sub_title_labels + vision_labels + vision_title_labels
			
 
				+    special_pre_cut_labels = title_labels + sub_title_labels
			
 
				 
			
 
				-    relevant_blocks = [block for block in blocks if block["block_label"] in all_labels]
			
 
				+    min_x = min(block["block_bbox"][0] for block in blocks)
			
 
				+    min_y = min(block["block_bbox"][1] for block in blocks)
			
 
				+    max_x = max(block["block_bbox"][2] for block in blocks)
			
 
				+    max_y = max(block["block_bbox"][3] for block in blocks)
			
 
				+    region_bbox = (min_x, min_y, max_x, max_y)
			
 
				+    region_x_center = (region_bbox[0] + region_bbox[2]) / 2
			
 
				+    region_y_center = (region_bbox[1] + region_bbox[3]) / 2
			
 
				+    region_width = region_bbox[2] - region_bbox[0]
			
 
				+    region_height = region_bbox[3] - region_bbox[1]
			
 
				 
			
 
				-    region_bbox = None
			
 
				-    if relevant_blocks:
			
 
				-        min_x = min(block["block_bbox"][0] for block in relevant_blocks)
			
 
				-        min_y = min(block["block_bbox"][1] for block in relevant_blocks)
			
 
				-        max_x = max(block["block_bbox"][2] for block in relevant_blocks)
			
 
				-        max_y = max(block["block_bbox"][3] for block in relevant_blocks)
			
 
				-        region_bbox = (min_x, min_y, max_x, max_y)
			
 
				-        region_x_center = (region_bbox[0] + region_bbox[2]) / 2
			
 
				-        region_y_center = (region_bbox[1] + region_bbox[3]) / 2
			
 
				-        region_width = region_bbox[2] - region_bbox[0]
			
 
				-        region_height = region_bbox[3] - region_bbox[1]
			
 
				-
			
 
				-    pre_cuts = []
			
 
				+    pre_cuts = {}
			
 
				 
			
 
				     for i, block1 in enumerate(blocks):
			
 
				         block1.setdefault("title_text", [])
			
@@ -1179,16 +1146,21 @@ def _get_sub_category(
 
				         else:
			
 
				             block_length = y2 - y1
			
 
				             required_length = region_height / 2
			
 
				-        length_condition = block_length > required_length
			
 
				+        if block1["block_label"] in special_pre_cut_labels:
			
 
				+            length_condition = True
			
 
				+        else:
			
 
				+            length_condition = block_length > required_length
			
 
				 
			
 
				         # Condition 2: Centered check along one axis (x-center for horizontal blocks, y-center otherwise), within a tolerance derived from block_length
			
 
				         block_x_center = (x1 + x2) / 2
			
 
				         block_y_center = (y1 + y2) / 2
			
 
				         tolerance_len = block_length // 5
			
 
				-        is_centered = (
			
 
				-            abs(block_x_center - region_x_center) <= tolerance_len
			
 
				-            and abs(block_y_center - region_y_center) <= tolerance_len
			
 
				-        )
			
 
				+        if block1["block_label"] in special_pre_cut_labels:
			
 
				+            tolerance_len = block_length // 10
			
 
				+        if is_horizontal_1:
			
 
				+            is_centered = abs(block_x_center - region_x_center) <= tolerance_len
			
 
				+        else:
			
 
				+            is_centered = abs(block_y_center - region_y_center) <= tolerance_len
			
 
				 
			
 
				         # Condition 3: Check for surrounding text
			
 
				         has_left_text = False
			
@@ -1225,9 +1197,9 @@ def _get_sub_category(
 
				         # Add coordinates if all conditions are met
			
 
				         if is_centered and length_condition and no_text_on_sides:
			
 
				             if is_horizontal_1:
			
 
				-                pre_cuts.append(y1)
			
 
				+                pre_cuts.setdefault("y", []).append(y1)
			
 
				             else:
			
 
				-                pre_cuts.append(x1)
			
 
				+                pre_cuts.setdefault("x", []).append(x1)
			
 
				 
			
 
				         for j, block2 in enumerate(blocks):
			
 
				             if i == j:
			
@@ -1257,11 +1229,7 @@ def _get_sub_category(
 
				 
			
 
				             block_iou_threshold = 0.1
			
 
				             if block1["block_label"] in sub_title_labels:
			
 
				-                match_block_iou = _calculate_overlap_area_div_minbox_area_ratio(
			
 
				-                    bbox2,
			
 
				-                    bbox1,
			
 
				-                )
			
 
				-                block_iou_threshold = 0.7
			
 
				+                block_iou_threshold = 0.5
			
 
				 
			
 
				             if is_horizontal_1:
			
 
				                 if match_block_iou >= block_iou_threshold:
			
@@ -1446,326 +1414,377 @@ def get_layout_ordering(
 
				     )
			
 
				     parsing_res_list, pre_cuts = _get_sub_category(parsing_res_list, title_text_labels)
			
 
				 
			
 
				-    doc_flag = False
			
 
				-    median_width = _get_text_median_width(parsing_res_list)
			
 
				-    parsing_res_list, projection_direction = _get_layout_property(
			
 
				-        parsing_res_list,
			
 
				-        median_width,
			
 
				-        no_mask_labels=no_mask_labels,
			
 
				-        threshold=0.3,
			
 
				-    )
			
 
				-    # Convert bounding boxes to float and remove overlaps
			
 
				-    (
			
 
				-        double_text_blocks,
			
 
				-        title_text_blocks,
			
 
				-        title_blocks,
			
 
				-        vision_blocks,
			
 
				-        vision_title_blocks,
			
 
				-        vision_footnote_blocks,
			
 
				-        other_blocks,
			
 
				-    ) = ([], [], [], [], [], [], [])
			
 
				-
			
 
				-    drop_indexes = []
			
 
				-
			
 
				-    for index, block in enumerate(parsing_res_list):
			
 
				-        label = block["sub_label"]
			
 
				-        block["block_bbox"] = list(map(int, block["block_bbox"]))
			
 
				-
			
 
				-        if label == "doc_title":
			
 
				-            doc_flag = True
			
 
				-
			
 
				-        if label in no_mask_labels:
			
 
				-            if block["layout"] == "double":
			
 
				-                double_text_blocks.append(block)
			
 
				+    parsing_res_by_pre_cuts_list = []
			
 
				+    if len(pre_cuts) > 0:
			
 
				+        block_bboxes = [block["block_bbox"] for block in parsing_res_list]
			
 
				+        for axis, cuts in pre_cuts.items():
			
 
				+            axis_index = 1 if axis == "y" else 0
			
 
				+
			
 
				+            max_val = max(bbox[axis_index + 2] for bbox in block_bboxes)
			
 
				+
			
 
				+            intervals = []
			
 
				+            prev = 0
			
 
				+            for cut in sorted(cuts):
			
 
				+                intervals.append((prev, cut))
			
 
				+                prev = cut
			
 
				+            intervals.append((prev, max_val))
			
 
				+
			
 
				+            for start, end in intervals:
			
 
				+                mask = [
			
 
				+                    (bbox[axis_index] >= start) and (bbox[axis_index] < end)
			
 
				+                    for bbox in block_bboxes
			
 
				+                ]
			
 
				+                parsing_res_by_pre_cuts_list.append(
			
 
				+                    [parsing_res_list[i] for i, m in enumerate(mask) if m]
			
 
				+                )
			
 
				+    else:
			
 
				+        parsing_res_by_pre_cuts_list = [parsing_res_list]
			
 
				+
			
 
				+    final_parsing_res_list = []
			
 
				+    num_index = 0
			
 
				+    num_sub_index = 0
			
 
				+    for parsing_res_by_pre_cuts in parsing_res_by_pre_cuts_list:
			
 
				+
			
 
				+        doc_flag = False
			
 
				+        median_width = _get_text_median_width(parsing_res_by_pre_cuts)
			
 
				+        parsing_res_by_pre_cuts, projection_direction = _get_layout_property(
			
 
				+            parsing_res_by_pre_cuts,
			
 
				+            median_width,
			
 
				+            no_mask_labels=no_mask_labels,
			
 
				+            threshold=0.3,
			
 
				+        )
			
 
				+        # Convert bounding boxes to int and group blocks by label category
			
 
				+        (
			
 
				+            double_text_blocks,
			
 
				+            title_text_blocks,
			
 
				+            title_blocks,
			
 
				+            vision_blocks,
			
 
				+            vision_title_blocks,
			
 
				+            vision_footnote_blocks,
			
 
				+            other_blocks,
			
 
				+        ) = ([], [], [], [], [], [], [])
			
 
				+
			
 
				+        drop_indexes = []
			
 
				+
			
 
				+        for index, block in enumerate(parsing_res_by_pre_cuts):
			
 
				+            label = block["sub_label"]
			
 
				+            block["block_bbox"] = list(map(int, block["block_bbox"]))
			
 
				+
			
 
				+            if label == "doc_title":
			
 
				+                doc_flag = True
			
 
				+
			
 
				+            if label in no_mask_labels:
			
 
				+                if block["layout"] == "double":
			
 
				+                    double_text_blocks.append(block)
			
 
				+                    drop_indexes.append(index)
			
 
				+            elif label == "title_text":
			
 
				+                title_text_blocks.append(block)
			
 
				+                drop_indexes.append(index)
			
 
				+            elif label == "vision_footnote":
			
 
				+                vision_footnote_blocks.append(block)
			
 
				+                drop_indexes.append(index)
			
 
				+            elif label in vision_title_labels:
			
 
				+                vision_title_blocks.append(block)
			
 
				+                drop_indexes.append(index)
			
 
				+            elif label in title_labels:
			
 
				+                title_blocks.append(block)
			
 
				+                drop_indexes.append(index)
			
 
				+            elif label in vision_labels:
			
 
				+                vision_blocks.append(block)
			
 
				+                drop_indexes.append(index)
			
 
				+            else:
			
 
				+                other_blocks.append(block)
			
 
				                 drop_indexes.append(index)
			
 
				-        elif label == "title_text":
			
 
				-            title_text_blocks.append(block)
			
 
				-            drop_indexes.append(index)
			
 
				-        elif label == "vision_footnote":
			
 
				-            vision_footnote_blocks.append(block)
			
 
				-            drop_indexes.append(index)
			
 
				-        elif label in vision_title_labels:
			
 
				-            vision_title_blocks.append(block)
			
 
				-            drop_indexes.append(index)
			
 
				-        elif label in title_labels:
			
 
				-            title_blocks.append(block)
			
 
				-            drop_indexes.append(index)
			
 
				-        elif label in vision_labels:
			
 
				-            vision_blocks.append(block)
			
 
				-            drop_indexes.append(index)
			
 
				-        else:
			
 
				-            other_blocks.append(block)
			
 
				-            drop_indexes.append(index)
			
 
				-
			
 
				-    for index in sorted(drop_indexes, reverse=True):
			
 
				-        del parsing_res_list[index]
			
 
				-
			
 
				-    if len(parsing_res_list) > 0:
			
 
				-        # single text label
			
 
				-        if len(double_text_blocks) > len(parsing_res_list) or projection_direction:
			
 
				-            parsing_res_list.extend(title_blocks + double_text_blocks)
			
 
				-            title_blocks = []
			
 
				-            double_text_blocks = []
			
 
				-            block_bboxes = [block["block_bbox"] for block in parsing_res_list]
			
 
				-            block_bboxes.sort(
			
 
				-                key=lambda x: (
			
 
				-                    x[0] // max(20, median_width),
			
 
				-                    x[1],
			
 
				-                ),
			
 
				-            )
			
 
				-            block_bboxes = np.array(block_bboxes)
			
 
				-            sorted_indices = sort_by_xycut(
			
 
				-                block_bboxes, direction=1, min_gap=1, pre_cuts={"x": pre_cuts}
			
 
				-            )
			
 
				-        else:
			
 
				-            block_bboxes = [block["block_bbox"] for block in parsing_res_list]
			
 
				-            block_bboxes.sort(key=lambda x: (x[0] // 20, x[1]))
			
 
				-            block_bboxes = np.array(block_bboxes)
			
 
				-            sorted_indices = sort_by_xycut(
			
 
				-                block_bboxes, direction=0, min_gap=20, pre_cuts={"y": pre_cuts}
			
 
				-            )
			
 
				 
			
 
				-        sorted_boxes = block_bboxes[sorted_indices].tolist()
			
 
				-
			
 
				-        for block in parsing_res_list:
			
 
				-            block["index"] = sorted_boxes.index(block["block_bbox"]) + 1
			
 
				-            block["sub_index"] = sorted_boxes.index(block["block_bbox"]) + 1
			
 
				-
			
 
				-    def nearest_match_(input_blocks, distance_type="manhattan", is_add_index=True):
			
 
				-        for block in input_blocks:
			
 
				-            bbox = block["block_bbox"]
			
 
				-            min_distance = float("inf")
			
 
				-            min_distance_config = [
			
 
				-                [float("inf"), float("inf")],
			
 
				-                float("inf"),
			
 
				-                float("inf"),
			
 
				-            ]  # for double text
			
 
				-            nearest_gt_index = 0
			
 
				-            for match_block in parsing_res_list:
			
 
				-                match_bbox = match_block["block_bbox"]
			
 
				-                if distance_type == "nearest_iou_edge_distance":
			
 
				-                    distance, min_distance_config = _nearest_iou_edge_distance(
			
 
				-                        bbox,
			
 
				-                        match_bbox,
			
 
				-                        block["sub_label"],
			
 
				-                        vision_labels=vision_labels,
			
 
				-                        no_mask_labels=no_mask_labels,
			
 
				-                        median_width=median_width,
			
 
				-                        title_labels=title_labels,
			
 
				-                        title_text=block["title_text"],
			
 
				-                        sub_title=block["sub_title"],
			
 
				-                        min_distance_config=min_distance_config,
			
 
				-                        tolerance_len=10,
			
 
				-                    )
			
 
				-                elif distance_type == "title_text":
			
 
				-                    if (
			
 
				-                        match_block["block_label"] in title_labels + ["abstract"]
			
 
				-                        and match_block["title_text"] != []
			
 
				-                    ):
			
 
				-                        iou_left_up = _calculate_overlap_area_div_minbox_area_ratio(
			
 
				-                            bbox,
			
 
				-                            match_block["title_text"][0][1],
			
 
				-                        )
			
 
				-                        iou_right_down = _calculate_overlap_area_div_minbox_area_ratio(
			
 
				-                            bbox,
			
 
				-                            match_block["title_text"][-1][1],
			
 
				-                        )
			
 
				-                        iou = 1 - max(iou_left_up, iou_right_down)
			
 
				-                        distance = _manhattan_distance(bbox, match_bbox) * iou
			
 
				-                    else:
			
 
				-                        distance = float("inf")
			
 
				-                elif distance_type == "manhattan":
			
 
				-                    distance = _manhattan_distance(bbox, match_bbox)
			
 
				-                elif distance_type == "vision_footnote":
			
 
				-                    if (
			
 
				-                        match_block["block_label"] in vision_labels
			
 
				-                        and match_block["vision_footnote"] != []
			
 
				-                    ):
			
 
				-                        iou_left_up = _calculate_overlap_area_div_minbox_area_ratio(
			
 
				-                            bbox,
			
 
				-                            match_block["vision_footnote"][0],
			
 
				-                        )
			
 
				-                        iou_right_down = _calculate_overlap_area_div_minbox_area_ratio(
			
 
				+        for index in sorted(drop_indexes, reverse=True):
			
 
				+            del parsing_res_by_pre_cuts[index]
			
 
				+
			
 
				+        if len(parsing_res_by_pre_cuts) > 0:
			
 
				+            # single text label
			
 
				+            if (
			
 
				+                len(double_text_blocks) > len(parsing_res_by_pre_cuts)
			
 
				+                or projection_direction
			
 
				+            ):
			
 
				+                parsing_res_by_pre_cuts.extend(title_blocks + double_text_blocks)
			
 
				+                title_blocks = []
			
 
				+                double_text_blocks = []
			
 
				+                block_bboxes = [
			
 
				+                    block["block_bbox"] for block in parsing_res_by_pre_cuts
			
 
				+                ]
			
 
				+                block_bboxes.sort(
			
 
				+                    key=lambda x: (
			
 
				+                        x[0] // max(20, median_width),
			
 
				+                        x[1],
			
 
				+                    ),
			
 
				+                )
			
 
				+                block_bboxes = np.array(block_bboxes)
			
 
				+                sorted_indices = sort_by_xycut(block_bboxes, direction=1, min_gap=1)
			
 
				+            else:
			
 
				+                block_bboxes = [
			
 
				+                    block["block_bbox"] for block in parsing_res_by_pre_cuts
			
 
				+                ]
			
 
				+                block_bboxes.sort(key=lambda x: (x[0] // 20, x[1]))
			
 
				+                block_bboxes = np.array(block_bboxes)
			
 
				+                sorted_indices = sort_by_xycut(block_bboxes, direction=0, min_gap=20)
			
 
				+
			
 
				+            sorted_boxes = block_bboxes[sorted_indices].tolist()
			
 
				+
			
 
				+            for block in parsing_res_by_pre_cuts:
			
 
				+                block["index"] = num_index + sorted_boxes.index(block["block_bbox"]) + 1
			
 
				+                block["sub_index"] = (
			
 
				+                    num_sub_index + sorted_boxes.index(block["block_bbox"]) + 1
			
 
				+                )
			
 
				+
			
 
				+        def nearest_match_(input_blocks, distance_type="manhattan", is_add_index=True):
			
 
				+            for block in input_blocks:
			
 
				+                bbox = block["block_bbox"]
			
 
				+                min_distance = float("inf")
			
 
				+                min_distance_config = [
			
 
				+                    [float("inf"), float("inf")],
			
 
				+                    float("inf"),
			
 
				+                    float("inf"),
			
 
				+                ]  # for double text
			
 
				+                nearest_gt_index = 0
			
 
				+                for match_block in parsing_res_by_pre_cuts:
			
 
				+                    match_bbox = match_block["block_bbox"]
			
 
				+                    if distance_type == "nearest_iou_edge_distance":
			
 
				+                        distance, min_distance_config = _nearest_iou_edge_distance(
			
 
				                             bbox,
			
 
				-                            match_block["vision_footnote"][-1],
			
 
				-                        )
			
 
				-                        iou = 1 - max(iou_left_up, iou_right_down)
			
 
				-                        distance = _manhattan_distance(bbox, match_bbox) * iou
			
 
				-                    else:
			
 
				-                        distance = float("inf")
			
 
				-                elif distance_type == "vision_body":
			
 
				-                    if (
			
 
				-                        match_block["block_label"] in vision_title_labels
			
 
				-                        and block["vision_footnote"] != []
			
 
				-                    ):
			
 
				-                        iou_left_up = _calculate_overlap_area_div_minbox_area_ratio(
			
 
				-                            match_bbox,
			
 
				-                            block["vision_footnote"][0],
			
 
				-                        )
			
 
				-                        iou_right_down = _calculate_overlap_area_div_minbox_area_ratio(
			
 
				                             match_bbox,
			
 
				-                            block["vision_footnote"][-1],
			
 
				+                            block["sub_label"],
			
 
				+                            vision_labels=vision_labels,
			
 
				+                            no_mask_labels=no_mask_labels,
			
 
				+                            median_width=median_width,
			
 
				+                            title_labels=title_labels,
			
 
				+                            title_text=block["title_text"],
			
 
				+                            sub_title=block["sub_title"],
			
 
				+                            min_distance_config=min_distance_config,
			
 
				+                            tolerance_len=10,
			
 
				                         )
			
 
				-                        iou = 1 - max(iou_left_up, iou_right_down)
			
 
				-                        distance = _manhattan_distance(bbox, match_bbox) * iou
			
 
				+                    elif distance_type == "title_text":
			
 
				+                        if (
			
 
				+                            match_block["block_label"] in title_labels + ["abstract"]
			
 
				+                            and match_block["title_text"] != []
			
 
				+                        ):
			
 
				+                            iou_left_up = _calculate_overlap_area_div_minbox_area_ratio(
			
 
				+                                bbox,
			
 
				+                                match_block["title_text"][0][1],
			
 
				+                            )
			
 
				+                            iou_right_down = (
			
 
				+                                _calculate_overlap_area_div_minbox_area_ratio(
			
 
				+                                    bbox,
			
 
				+                                    match_block["title_text"][-1][1],
			
 
				+                                )
			
 
				+                            )
			
 
				+                            iou = 1 - max(iou_left_up, iou_right_down)
			
 
				+                            distance = _manhattan_distance(bbox, match_bbox) * iou
			
 
				+                        else:
			
 
				+                            distance = float("inf")
			
 
				+                    elif distance_type == "manhattan":
			
 
				+                        distance = _manhattan_distance(bbox, match_bbox)
			
 
				+                    elif distance_type == "vision_footnote":
			
 
				+                        if (
			
 
				+                            match_block["block_label"] in vision_labels
			
 
				+                            and match_block["vision_footnote"] != []
			
 
				+                        ):
			
 
				+                            iou_left_up = _calculate_overlap_area_div_minbox_area_ratio(
			
 
				+                                bbox,
			
 
				+                                match_block["vision_footnote"][0],
			
 
				+                            )
			
 
				+                            iou_right_down = (
			
 
				+                                _calculate_overlap_area_div_minbox_area_ratio(
			
 
				+                                    bbox,
			
 
				+                                    match_block["vision_footnote"][-1],
			
 
				+                                )
			
 
				+                            )
			
 
				+                            iou = 1 - max(iou_left_up, iou_right_down)
			
 
				+                            distance = _manhattan_distance(bbox, match_bbox) * iou
			
 
				+                        else:
			
 
				+                            distance = float("inf")
			
 
				+                    elif distance_type == "vision_body":
			
 
				+                        if (
			
 
				+                            match_block["block_label"] in vision_title_labels
			
 
				+                            and block["vision_footnote"] != []
			
 
				+                        ):
			
 
				+                            iou_left_up = _calculate_overlap_area_div_minbox_area_ratio(
			
 
				+                                match_bbox,
			
 
				+                                block["vision_footnote"][0],
			
 
				+                            )
			
 
				+                            iou_right_down = (
			
 
				+                                _calculate_overlap_area_div_minbox_area_ratio(
			
 
				+                                    match_bbox,
			
 
				+                                    block["vision_footnote"][-1],
			
 
				+                                )
			
 
				+                            )
			
 
				+                            iou = 1 - max(iou_left_up, iou_right_down)
			
 
				+                            distance = _manhattan_distance(bbox, match_bbox) * iou
			
 
				+                        else:
			
 
				+                            distance = float("inf")
			
 
				                     else:
			
 
				-                        distance = float("inf")
			
 
				+                        raise NotImplementedError
			
 
				+
			
 
				+                    if distance < min_distance:
			
 
				+                        min_distance = distance
			
 
				+                        if is_add_index:
			
 
				+                            nearest_gt_index = match_block.get("index", 999)
			
 
				+                        else:
			
 
				+                            nearest_gt_index = match_block.get("sub_index", 999)
			
 
				+
			
 
				+                if is_add_index:
			
 
				+                    block["index"] = nearest_gt_index
			
 
				                 else:
			
 
				-                    raise NotImplementedError
			
 
				+                    block["sub_index"] = nearest_gt_index
			
 
				 
			
 
				-                if distance < min_distance:
			
 
				-                    min_distance = distance
			
 
				-                    if is_add_index:
			
 
				-                        nearest_gt_index = match_block.get("index", 999)
			
 
				-                    else:
			
 
				-                        nearest_gt_index = match_block.get("sub_index", 999)
			
 
				+                parsing_res_by_pre_cuts.append(block)
			
 
				 
			
 
				-            if is_add_index:
			
 
				-                block["index"] = nearest_gt_index
			
 
				-            else:
			
 
				-                block["sub_index"] = nearest_gt_index
			
 
				+        # double text label
			
 
				+        double_text_blocks.sort(
			
 
				+            key=lambda x: (
			
 
				+                x["block_bbox"][1] // 10,
			
 
				+                x["block_bbox"][0] // median_width,
			
 
				+                x["block_bbox"][1] ** 2 + x["block_bbox"][0] ** 2,
			
 
				+            ),
			
 
				+        )
			
 
				+        nearest_match_(
			
 
				+            double_text_blocks,
			
 
				+            distance_type="nearest_iou_edge_distance",
			
 
				+        )
			
 
				+        parsing_res_by_pre_cuts.sort(
			
 
				+            key=lambda x: (x["index"], x["block_bbox"][1], x["block_bbox"][0]),
			
 
				+        )
			
 
				 
			
 
				-            parsing_res_list.append(block)
			
 
				+        for idx, block in enumerate(parsing_res_by_pre_cuts):
			
 
				+            block["index"] = num_index + idx + 1
			
 
				+            block["sub_index"] = num_sub_index + idx + 1
			
 
				 
			
 
				-    # double text label
			
 
				-    double_text_blocks.sort(
			
 
				-        key=lambda x: (
			
 
				-            x["block_bbox"][1] // 10,
			
 
				-            x["block_bbox"][0] // median_width,
			
 
				-            x["block_bbox"][1] ** 2 + x["block_bbox"][0] ** 2,
			
 
				-        ),
			
 
				-    )
			
 
				-    nearest_match_(
			
 
				-        double_text_blocks,
			
 
				-        distance_type="nearest_iou_edge_distance",
			
 
				-    )
			
 
				-    parsing_res_list.sort(
			
 
				-        key=lambda x: (x["index"], x["block_bbox"][1], x["block_bbox"][0]),
			
 
				-    )
			
 
				+        # title label
			
 
				+        title_blocks.sort(
			
 
				+            key=lambda x: (
			
 
				+                x["block_bbox"][1] // 10,
			
 
				+                x["block_bbox"][0] // median_width,
			
 
				+                x["block_bbox"][1] ** 2 + x["block_bbox"][0] ** 2,
			
 
				+            ),
			
 
				+        )
			
 
				+        nearest_match_(title_blocks, distance_type="nearest_iou_edge_distance")
			
 
				+
			
 
				+        if doc_flag:
			
 
				+            text_sort_labels = ["doc_title"]
			
 
				+            text_label_priority = {
			
 
				+                label: priority for priority, label in enumerate(text_sort_labels)
			
 
				+            }
			
 
				+            doc_titles = []
			
 
				+            for i, block in enumerate(parsing_res_by_pre_cuts):
			
 
				+                if block["block_label"] == "doc_title":
			
 
				+                    doc_titles.append(
			
 
				+                        (i, block["block_bbox"][1], block["block_bbox"][0]),
			
 
				+                    )
			
 
				+            doc_titles.sort(key=lambda x: (x[1], x[2]))
			
 
				+            first_doc_title_index = doc_titles[0][0]
			
 
				+            parsing_res_by_pre_cuts[first_doc_title_index]["index"] = 1
			
 
				+            parsing_res_by_pre_cuts.sort(
			
 
				+                key=lambda x: (
			
 
				+                    x["index"],
			
 
				+                    text_label_priority.get(x["block_label"], 9999),
			
 
				+                    x["block_bbox"][1],
			
 
				+                    x["block_bbox"][0],
			
 
				+                ),
			
 
				+            )
			
 
				+        else:
			
 
				+            parsing_res_by_pre_cuts.sort(
			
 
				+                key=lambda x: (
			
 
				+                    x["index"],
			
 
				+                    x["block_bbox"][1],
			
 
				+                    x["block_bbox"][0],
			
 
				+                ),
			
 
				+            )
			
 
				 
			
 
				-    for idx, block in enumerate(parsing_res_list):
			
 
				-        block["index"] = idx + 1
			
 
				-        block["sub_index"] = idx + 1
			
 
				+        for idx, block in enumerate(parsing_res_by_pre_cuts):
			
 
				+            block["index"] = num_index + idx + 1
			
 
				+            block["sub_index"] = num_sub_index + idx + 1
			
 
				 
			
 
				-    # title label
			
 
				-    title_blocks.sort(
			
 
				-        key=lambda x: (
			
 
				-            x["block_bbox"][1] // 10,
			
 
				-            x["block_bbox"][0] // median_width,
			
 
				-            x["block_bbox"][1] ** 2 + x["block_bbox"][0] ** 2,
			
 
				-        ),
			
 
				-    )
			
 
				-    nearest_match_(title_blocks, distance_type="nearest_iou_edge_distance")
			
 
				+        # title-text label
			
 
				+        nearest_match_(title_text_blocks, distance_type="title_text")
			
 
				 
			
 
				-    if doc_flag:
			
 
				-        text_sort_labels = ["doc_title"]
			
 
				-        text_label_priority = {
			
 
				-            label: priority for priority, label in enumerate(text_sort_labels)
			
 
				-        }
			
 
				-        doc_titles = []
			
 
				-        for i, block in enumerate(parsing_res_list):
			
 
				-            if block["block_label"] == "doc_title":
			
 
				-                doc_titles.append(
			
 
				-                    (i, block["block_bbox"][1], block["block_bbox"][0]),
			
 
				-                )
			
 
				-        doc_titles.sort(key=lambda x: (x[1], x[2]))
			
 
				-        first_doc_title_index = doc_titles[0][0]
			
 
				-        parsing_res_list[first_doc_title_index]["index"] = 1
			
 
				-        parsing_res_list.sort(
			
 
				+        def hor_tb_and_ver_lr(x):
			
 
				+            input_bbox = x["block_bbox"]
			
 
				+            is_horizontal = _get_bbox_direction(input_bbox)
			
 
				+            if is_horizontal:
			
 
				+                return input_bbox[1]
			
 
				+            else:
			
 
				+                return input_bbox[0]
			
 
				+
			
 
				+        parsing_res_by_pre_cuts.sort(
			
 
				+            key=lambda x: (x["index"], hor_tb_and_ver_lr(x)),
			
 
				+        )
			
 
				+
			
 
				+        for idx, block in enumerate(parsing_res_by_pre_cuts):
			
 
				+            block["index"] = num_index + idx + 1
			
 
				+            block["sub_index"] = num_sub_index + idx + 1
			
 
				+
			
 
				+        # image,figure,chart,seal label
			
 
				+        nearest_match_(
			
 
				+            vision_blocks,
			
 
				+            distance_type="nearest_iou_edge_distance",
			
 
				+            is_add_index=False,
			
 
				+        )
			
 
				+        parsing_res_by_pre_cuts.sort(
			
 
				             key=lambda x: (
			
 
				-                x["index"],
			
 
				-                text_label_priority.get(x["block_label"], 9999),
			
 
				+                x["sub_index"],
			
 
				                 x["block_bbox"][1],
			
 
				                 x["block_bbox"][0],
			
 
				             ),
			
 
				         )
			
 
				-    else:
			
 
				-        parsing_res_list.sort(
			
 
				+
			
 
				+        for idx, block in enumerate(parsing_res_by_pre_cuts):
			
 
				+            block["sub_index"] = num_sub_index + idx + 1
			
 
				+
			
 
				+        # image,figure,chart,seal title label
			
 
				+        nearest_match_(
			
 
				+            vision_title_blocks,
			
 
				+            distance_type="nearest_iou_edge_distance",
			
 
				+            is_add_index=False,
			
 
				+        )
			
 
				+        parsing_res_by_pre_cuts.sort(
			
 
				             key=lambda x: (
			
 
				-                x["index"],
			
 
				+                x["sub_index"],
			
 
				                 x["block_bbox"][1],
			
 
				                 x["block_bbox"][0],
			
 
				             ),
			
 
				         )
			
 
				 
			
 
				-    for idx, block in enumerate(parsing_res_list):
			
 
				-        block["index"] = idx + 1
			
 
				-        block["sub_index"] = idx + 1
			
 
				-
			
 
				-    # title-text label
			
 
				-    nearest_match_(title_text_blocks, distance_type="title_text")
			
 
				-    text_sort_labels = ["doc_title", "paragraph_title", "title_text"]
			
 
				-    text_label_priority = {
			
 
				-        label: priority for priority, label in enumerate(text_sort_labels)
			
 
				-    }
			
 
				-    parsing_res_list.sort(
			
 
				-        key=lambda x: (
			
 
				-            x["index"],
			
 
				-            text_label_priority.get(x["sub_label"], 9999),
			
 
				-            x["block_bbox"][1],
			
 
				-            x["block_bbox"][0],
			
 
				-        ),
			
 
				-    )
			
 
				+        for idx, block in enumerate(parsing_res_by_pre_cuts):
			
 
				+            block["sub_index"] = num_sub_index + idx + 1
			
 
				 
			
 
				-    for idx, block in enumerate(parsing_res_list):
			
 
				-        block["index"] = idx + 1
			
 
				-        block["sub_index"] = idx + 1
			
 
				-
			
 
				-    # image,figure,chart,seal label
			
 
				-    nearest_match_(
			
 
				-        vision_blocks,
			
 
				-        distance_type="nearest_iou_edge_distance",
			
 
				-        is_add_index=False,
			
 
				-    )
			
 
				-    parsing_res_list.sort(
			
 
				-        key=lambda x: (
			
 
				-            x["sub_index"],
			
 
				-            x["block_bbox"][1],
			
 
				-            x["block_bbox"][0],
			
 
				-        ),
			
 
				-    )
			
 
				-
			
 
				-    for idx, block in enumerate(parsing_res_list):
			
 
				-        block["sub_index"] = idx + 1
			
 
				-
			
 
				-    # image,figure,chart,seal title label
			
 
				-    nearest_match_(
			
 
				-        vision_title_blocks,
			
 
				-        distance_type="nearest_iou_edge_distance",
			
 
				-        is_add_index=False,
			
 
				-    )
			
 
				-    parsing_res_list.sort(
			
 
				-        key=lambda x: (
			
 
				-            x["sub_index"],
			
 
				-            x["block_bbox"][1],
			
 
				-            x["block_bbox"][0],
			
 
				-        ),
			
 
				-    )
			
 
				+        # vision footnote label
			
 
				+        nearest_match_(
			
 
				+            vision_footnote_blocks,
			
 
				+            distance_type="vision_footnote",
			
 
				+            is_add_index=False,
			
 
				+        )
			
 
				+        text_label_priority = {"vision_footnote": 9999}
			
 
				+        parsing_res_by_pre_cuts.sort(
			
 
				+            key=lambda x: (
			
 
				+                x["sub_index"],
			
 
				+                text_label_priority.get(x["sub_label"], 0),
			
 
				+                x["block_bbox"][1],
			
 
				+                x["block_bbox"][0],
			
 
				+            ),
			
 
				+        )
			
 
				 
			
 
				-    for idx, block in enumerate(parsing_res_list):
			
 
				-        block["sub_index"] = idx + 1
			
 
				+        for idx, block in enumerate(parsing_res_by_pre_cuts):
			
 
				+            block["sub_index"] = num_sub_index + idx + 1
			
 
				 
			
 
				-    # vision footnote label
			
 
				-    nearest_match_(
			
 
				-        vision_footnote_blocks,
			
 
				-        distance_type="vision_footnote",
			
 
				-        is_add_index=False,
			
 
				-    )
			
 
				-    text_label_priority = {"vision_footnote": 9999}
			
 
				-    parsing_res_list.sort(
			
 
				-        key=lambda x: (
			
 
				-            x["sub_index"],
			
 
				-            text_label_priority.get(x["sub_label"], 0),
			
 
				-            x["block_bbox"][1],
			
 
				-            x["block_bbox"][0],
			
 
				-        ),
			
 
				-    )
			
 
				+        # header、footnote、header_image... label
			
 
				+        nearest_match_(other_blocks, distance_type="manhattan", is_add_index=False)
			
 
				 
			
 
				-    for idx, block in enumerate(parsing_res_list):
			
 
				-        block["sub_index"] = idx + 1
			
 
				+        # add all parsing result
			
 
				+        final_parsing_res_list.extend(parsing_res_by_pre_cuts)
			
 
				 
			
 
				-    # header、footnote、header_image... label
			
 
				-    nearest_match_(other_blocks, distance_type="manhattan", is_add_index=False)
			
 
				+        # update num index
			
 
				+        num_sub_index += len(parsing_res_by_pre_cuts)
			
 
				+        for parsing_res in parsing_res_by_pre_cuts:
			
 
				+            if parsing_res.get("index"):
			
 
				+                num_index += 1
			
 
				 
			
 
				     parsing_res_list = [
			
 
				         {
			
@@ -1779,7 +1798,7 @@ def get_layout_ordering(
 
				             "sub_index": parsing_res["sub_index"],
			
 
				             "index": parsing_res.get("index", None),
			
 
				         }
			
 
				-        for parsing_res in parsing_res_list
			
 
				+        for parsing_res in final_parsing_res_list
			
 
				     ]
			
 
				 
			
 
				     return parsing_res_list
			
@@ -1969,7 +1988,7 @@ def _nearest_edge_distance(
 
				         else:
			
 
				             distance_y = distance[2] * weight[2]
			
 
				         if label in no_mask_labels:
			
 
				-            distance_y = max(0.1, distance_y) * 100
			
 
				+            distance_y = max(0.1, distance_y) * 10  # for abstract
			
 
				     # input_bbox is below match_bbox
			
 
				     elif y1 > y2_prime:
			
 
				         direction_num += 1
			
@@ -2071,33 +2090,11 @@ def _nearest_iou_edge_distance(
 
				         or _get_projection_iou(input_bbox, match_bbox, horizontal1) < 0.01
			
 
				     ):
			
 
				         iou_distance = 1
			
 
				-    elif label == "doc_title" or (label in title_labels and title_text):
			
 
				+
			
 
				+    if label == "doc_title":
			
 
				         # Calculate distance for titles
			
 
				         disperse = max(1, median_width)
			
 
				-        width = x2 - x1
			
 
				-        height = y2 - y1
			
 
				-        if horizontal1:
			
 
				-            return (
			
 
				-                _calculate_horizontal_distance(
			
 
				-                    input_bbox,
			
 
				-                    match_bbox,
			
 
				-                    height,
			
 
				-                    disperse,
			
 
				-                    title_text,
			
 
				-                ),
			
 
				-                min_distance_config,
			
 
				-            )
			
 
				-        else:
			
 
				-            return (
			
 
				-                _calculate_vertical_distance(
			
 
				-                    input_bbox,
			
 
				-                    match_bbox,
			
 
				-                    width,
			
 
				-                    disperse,
			
 
				-                    title_text,
			
 
				-                ),
			
 
				-                min_distance_config,
			
 
				-            )
			
 
				+        tolerance_len = max(tolerance_len, disperse)
			
 
				 
			
 
				     # Adjust input_bbox based on sub_title
			
 
				     if sub_title:
			
@@ -2105,19 +2102,36 @@ def _nearest_iou_edge_distance(
 
				             x1_, y1_, x2_, y2_ = sub
			
 
				             x1, y1, x2, y2 = (
			
 
				                 min(x1, x1_),
			
 
				-                min(
			
 
				-                    y1,
			
 
				-                    y1_,
			
 
				-                ),
			
 
				-                max(x2, x2_),
			
 
				+                min(y1, y1_),
			
 
				+                min(x2, x2_),
			
 
				                 max(y2, y2_),
			
 
				             )
			
 
				         input_bbox = [x1, y1, x2, y2]
			
 
				 
			
 
				+    if title_text:
			
 
				+        for sub in title_text:
			
 
				+            x1_, y1_, x2_, y2_ = sub[1]
			
 
				+            if horizontal1:
			
 
				+                x1, y1, x2, y2 = (
			
 
				+                    min(x1, x1_),
			
 
				+                    min(y1, y1_),
			
 
				+                    min(x2, x2_),
			
 
				+                    max(y2, y2_),
			
 
				+                )
			
 
				+            else:
			
 
				+                x1, y1, x2, y2 = (
			
 
				+                    min(x1, x1_),
			
 
				+                    min(y1, y1_),
			
 
				+                    max(x2, x2_),
			
 
				+                    min(y2, y2_),
			
 
				+                )
			
 
				+        input_bbox = [x1, y1, x2, y2]
			
 
				+
			
 
				     # Calculate edge distance
			
 
				     weight = _get_weights(label, horizontal1)
			
 
				     if label == "abstract":
			
 
				-        tolerance_len *= 3
			
 
				+        tolerance_len *= 2
			
 
				+
			
 
				     edge_distance, edge_distance_config = _nearest_edge_distance(
			
 
				         input_bbox,
			
 
				         match_bbox,
			
@@ -2129,13 +2143,13 @@ def _nearest_iou_edge_distance(
 
				     )
			
 
				 
			
 
				     # Weights for combining distances
			
 
				-    iou_edge_weight = [10**6, 10**3, 1, 0.001]
			
 
				+    iou_edge_weight = [10**8, 10**4, 1, 0.0001]
			
 
				 
			
 
				     # Calculate up and left edge distances
			
 
				     up_edge_distance = y1_prime
			
 
				     left_edge_distance = x1_prime
			
 
				     if (
			
 
				-        label in no_mask_labels or label == "paragraph_title" or label in vision_labels
			
 
				+        label in no_mask_labels or label in title_labels or label in vision_labels
			
 
				     ) and y1 > y2_prime:
			
 
				         up_edge_distance = -y2_prime
			
 
				         left_edge_distance = -x2_prime
			
@@ -2155,12 +2169,12 @@ def _nearest_iou_edge_distance(
 
				     # Update minimum distance configuration if a smaller distance is found
			
 
				     if total_distance > distance:
			
 
				         edge_distance_config = [
			
 
				-            min(min_edge_distance_config[0], edge_distance_config[0]),
			
 
				-            min(min_edge_distance_config[1], edge_distance_config[1]),
			
 
				+            edge_distance_config[0],
			
 
				+            edge_distance_config[1],
			
 
				         ]
			
 
				         min_distance_config = [
			
 
				             edge_distance_config,
			
 
				-            min(up_edge_distance, up_edge_distances_config),
			
 
				+            up_edge_distance,
			
 
				             distance,
			
 
				         ]