Explorar o código

refactor: implement block preprocessing utilities for improved bounding box management

myhloli hai 5 meses
pai
achega
ae7b0a6eba

+ 61 - 2
mineru/backend/pipeline/model_json_to_middle_json.py

@@ -1,4 +1,5 @@
 # Copyright (c) Opendatalab. All rights reserved.
+from mineru.utils.block_pre_proc import prepare_block_bboxes
 from mineru.utils.pipeline_magic_model import MagicModel
 from mineru.version import __version__
 from mineru.utils.hash_utils import str_md5
@@ -8,9 +9,51 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
     scale = image_dict["scale"]
     page_pil_img = image_dict["img_pil"]
     page_img_md5 = str_md5(image_dict["img_base64"])
-    width, height = map(int, page.get_size())
+    page_w, page_h = map(int, page.get_size())
     magic_model = MagicModel(page_model_info, scale)
 
+    """从magic_model对象中获取后面会用到的区块信息"""
+    img_groups = magic_model.get_imgs()
+    table_groups = magic_model.get_tables()
+
+    """对image和table的区块分组"""
+    img_body_blocks, img_caption_blocks, img_footnote_blocks = process_groups(
+        img_groups, 'image_body', 'image_caption_list', 'image_footnote_list'
+    )
+
+    table_body_blocks, table_caption_blocks, table_footnote_blocks = process_groups(
+        table_groups, 'table_body', 'table_caption_list', 'table_footnote_list'
+    )
+
+    discarded_blocks = magic_model.get_discarded()
+    text_blocks = magic_model.get_text_blocks()
+    title_blocks = magic_model.get_title_blocks()
+    inline_equations, interline_equations, interline_equation_blocks = magic_model.get_equations()
+
+    """将所有区块的bbox整理到一起"""
+    interline_equation_blocks = []
+    if len(interline_equation_blocks) > 0:
+        all_bboxes, all_discarded_blocks, footnote_blocks = prepare_block_bboxes(
+            img_body_blocks, img_caption_blocks, img_footnote_blocks,
+            table_body_blocks, table_caption_blocks, table_footnote_blocks,
+            discarded_blocks,
+            text_blocks,
+            title_blocks,
+            interline_equation_blocks,
+            page_w,
+            page_h,
+        )
+    else:
+        all_bboxes, all_discarded_blocks, footnote_blocks = prepare_block_bboxes(
+            img_body_blocks, img_caption_blocks, img_footnote_blocks,
+            table_body_blocks, table_caption_blocks, table_footnote_blocks,
+            discarded_blocks,
+            text_blocks,
+            title_blocks,
+            interline_equations,
+            page_w,
+            page_h,
+        )
 
 
 def result_to_middle_json(model_list, images_list, pdf_doc, image_writer, lang=None, ocr=False):
@@ -22,4 +65,20 @@ def result_to_middle_json(model_list, images_list, pdf_doc, image_writer, lang=N
             page_model_info, image_dict, page, image_writer, page_index, lang=lang, ocr=ocr
         )
         middle_json["pdf_info"].append(page_info)
-    return middle_json
+    return middle_json
+
+
def process_groups(groups, body_key, caption_key, footnote_key):
    """Flatten grouped blocks into three parallel lists, tagging each with its group index.

    Args:
        groups: Sequence of dicts; each holds one body block under ``body_key``
            and iterables of blocks under ``caption_key`` and ``footnote_key``.
        body_key: Key of the body block inside every group.
        caption_key: Key of the caption block list inside every group.
        footnote_key: Key of the footnote block list inside every group.

    Returns:
        tuple: ``(body_blocks, caption_blocks, footnote_blocks)``. The block
        dicts are the same objects as in ``groups``; each one is mutated in
        place to carry a ``'group_id'`` equal to the position of its group.
    """
    bodies, captions, footnotes = [], [], []
    for group_id, group in enumerate(groups):
        body = group[body_key]
        body['group_id'] = group_id
        bodies.append(body)
        # Captions first, then footnotes, so mutation order matches the group layout.
        for member_key, bucket in ((caption_key, captions), (footnote_key, footnotes)):
            for member in group[member_key]:
                member['group_id'] = group_id
                bucket.append(member)
    return bodies, captions, footnotes

+ 224 - 0
mineru/utils/block_pre_proc.py

@@ -0,0 +1,224 @@
+# Copyright (c) Opendatalab. All rights reserved.
+from mineru.utils.boxbase import (
+    calculate_iou,
+    calculate_overlap_area_in_bbox1_area_ratio,
+    calculate_vertical_projection_overlap_ratio,
+    get_minbox_if_overlap_by_ratio
+)
+from mineru.utils.enum_class import BlockType
+
+
def prepare_block_bboxes(
    img_body_blocks,
    img_caption_blocks,
    img_footnote_blocks,
    table_body_blocks,
    table_caption_blocks,
    table_footnote_blocks,
    discarded_blocks,
    text_blocks,
    title_blocks,
    interline_equation_blocks,
    page_w,
    page_h,
):
    """Merge all layout blocks of a page into flat bbox records and resolve overlaps.

    Every input block is a dict with at least ``'bbox'`` and ``'score'``;
    image/table blocks additionally carry ``'group_id'``. Records are the
    lists produced by ``add_bboxes``: coordinates at indices 0-3 and the
    block type at index 7.

    Args:
        img_body_blocks: Image body blocks.
        img_caption_blocks: Image caption blocks.
        img_footnote_blocks: Image footnote blocks.
        table_body_blocks: Table body blocks.
        table_caption_blocks: Table caption blocks.
        table_footnote_blocks: Table footnote blocks.
        discarded_blocks: Blocks the layout model marked as discarded.
        text_blocks: Plain text blocks.
        title_blocks: Title blocks.
        interline_equation_blocks: Interline (display) equation blocks.
        page_w: Page width, used for the footnote width heuristic.
        page_h: Page height, used for the footnote position heuristic.

    Returns:
        tuple: ``(all_bboxes, all_discarded_blocks, footnote_blocks)`` where
        ``all_bboxes`` holds the surviving records sorted by ``x0 + y0``,
        ``all_discarded_blocks`` holds records for discarded blocks plus any
        record found below a footnote, and ``footnote_blocks`` is a list of
        ``[x0, y0, x1, y1]`` boxes for discarded blocks that look like
        page footnotes.
    """
    all_bboxes = []

    add_bboxes(img_body_blocks, BlockType.IMAGE_BODY, all_bboxes)
    add_bboxes(img_caption_blocks, BlockType.IMAGE_CAPTION, all_bboxes)
    # Fix: image footnotes were previously registered as IMAGE_CAPTION, which
    # lost the footnote type (table footnotes already use TABLE_FOOTNOTE).
    add_bboxes(img_footnote_blocks, BlockType.IMAGE_FOOTNOTE, all_bboxes)
    add_bboxes(table_body_blocks, BlockType.TABLE_BODY, all_bboxes)
    add_bboxes(table_caption_blocks, BlockType.TABLE_CAPTION, all_bboxes)
    add_bboxes(table_footnote_blocks, BlockType.TABLE_FOOTNOTE, all_bboxes)
    add_bboxes(text_blocks, BlockType.TEXT, all_bboxes)
    add_bboxes(title_blocks, BlockType.TITLE, all_bboxes)
    add_bboxes(interline_equation_blocks, BlockType.INTERLINE_EQUATION, all_bboxes)

    # --- Resolve nested / conflicting blocks ---
    # Text box overlapping a title box: trust the text box.
    all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
    # Any box overlapping a discarded box: trust the discarded box.
    all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)

    # Interline equation vs. text conflicts come in two flavours:
    # 1) IoU close to 1 -> trust the interline equation box.
    all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
    # 2) Equation contained in a much larger text box -> trust the text box;
    #    the equation is dropped later by the big-box-contains-small-box pass.

    # Records for the discarded blocks themselves.
    all_discarded_blocks = []
    add_bboxes(discarded_blocks, BlockType.DISCARDED, all_discarded_blocks)

    # Footnote heuristic: wider than 1/3 of the page, taller than 10 units,
    # and starting in the bottom 30% of the page.
    footnote_blocks = []
    for discarded in discarded_blocks:
        x0, y0, x1, y1 = discarded['bbox']
        if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h * 0.7):
            footnote_blocks.append([x0, y0, x1, y1])

    # Anything located below a footnote is moved to the discarded list.
    for block in find_blocks_under_footnote(all_bboxes, footnote_blocks):
        all_bboxes.remove(block)
        all_discarded_blocks.append(block)

    # After the passes above, remaining big-box-contains-small-box cases are
    # collapsed by removing the smaller box.
    all_bboxes = remove_overlaps_min_blocks(all_bboxes)
    all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)

    # NOTE: the follow-up step that separates the remaining overlapping boxes
    # (remove_overlap_between_bbox_for_block) is currently disabled.
    all_bboxes.sort(key=lambda x: x[0] + x[1])
    return all_bboxes, all_discarded_blocks, footnote_blocks
+
+
def add_bboxes(blocks, block_type, bboxes):
    """Append one flat record per block to ``bboxes`` (mutated in place).

    Record layout: ``[x0, y0, x1, y1, None, None, None, block_type, None,
    None, None, None, score]``; image/table body, caption and footnote
    records get the block's ``group_id`` appended as a 14th element.

    Args:
        blocks: Iterable of dicts with ``'bbox'`` and ``'score'`` (plus
            ``'group_id'`` for image/table block types).
        block_type: The ``BlockType`` stored at index 7 of every record.
        bboxes: Destination list that receives the records.
    """
    for entry in blocks:
        left, top, right, bottom = entry['bbox']
        # Only image/table parts belong to a group and carry its id.
        carries_group = block_type in (
            BlockType.IMAGE_BODY,
            BlockType.IMAGE_CAPTION,
            BlockType.IMAGE_FOOTNOTE,
            BlockType.TABLE_BODY,
            BlockType.TABLE_CAPTION,
            BlockType.TABLE_FOOTNOTE,
        )
        record = [
            left, top, right, bottom,
            None, None, None,
            block_type,
            None, None, None, None,
            entry['score'],
        ]
        if carries_group:
            record.append(entry['group_id'])
        bboxes.append(record)
+
+
def fix_text_overlap_title_blocks(all_bboxes):
    """Drop title records that almost coincide with a text record.

    When a text record and a title record have IoU above 0.8, the text record
    is trusted and the title record is removed.

    Args:
        all_bboxes: List of records (coordinates at indices 0-3, block type
            at index 7). Modified in place.

    Returns:
        list: The same ``all_bboxes`` list after removal.
    """
    texts = [record for record in all_bboxes if record[7] == BlockType.TEXT]
    titles = [record for record in all_bboxes if record[7] == BlockType.TITLE]

    shadowed_titles = []
    for text in texts:
        for title in titles:
            near_identical = calculate_iou(text[:4], title[:4]) > 0.8
            if near_identical and title not in shadowed_titles:
                shadowed_titles.append(title)

    for title in shadowed_titles:
        all_bboxes.remove(title)

    return all_bboxes
+
+
def remove_need_drop_blocks(all_bboxes, discarded_blocks):
    """Remove records that lie mostly inside a discarded block.

    A record is dropped when more than 60% of its own area overlaps the
    ``'bbox'`` of any discarded block.

    Args:
        all_bboxes: List of records with coordinates at indices 0-3.
            Modified in place.
        discarded_blocks: Iterable of dicts with a ``'bbox'`` entry.

    Returns:
        list: The same ``all_bboxes`` list after removal.
    """
    doomed = []
    for candidate in all_bboxes:
        for discarded in discarded_blocks:
            covered = calculate_overlap_area_in_bbox1_area_ratio(
                candidate[:4], discarded['bbox']
            )
            if covered > 0.6 and candidate not in doomed:
                doomed.append(candidate)
                # One covering discarded block is enough for this record.
                break

    for candidate in doomed:
        all_bboxes.remove(candidate)
    return all_bboxes
+
+
def fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes):
    """Drop text records that almost coincide with an interline equation record.

    When an interline equation record and a text record have IoU above 0.8,
    the equation record is trusted and the text record is removed.

    Args:
        all_bboxes: List of records (coordinates at indices 0-3, block type
            at index 7). Modified in place.

    Returns:
        list: The same ``all_bboxes`` list after removal.
    """
    texts = [record for record in all_bboxes if record[7] == BlockType.TEXT]
    equations = [
        record for record in all_bboxes if record[7] == BlockType.INTERLINE_EQUATION
    ]

    shadowed_texts = []
    for equation in equations:
        for text in texts:
            near_identical = calculate_iou(equation[:4], text[:4]) > 0.8
            if near_identical and text not in shadowed_texts:
                shadowed_texts.append(text)

    for text in shadowed_texts:
        all_bboxes.remove(text)

    return all_bboxes
+
+
def find_blocks_under_footnote(all_bboxes, footnote_blocks):
    """Collect records that sit below a footnote box and share its horizontal span.

    A record qualifies when its ``y0`` is at or below the footnote's ``y1``
    and ``calculate_vertical_projection_overlap_ratio(record, footnote)`` is
    at least 0.8.

    Args:
        all_bboxes: List of records with coordinates at indices 0-3.
        footnote_blocks: List of ``[x0, y0, x1, y1]`` footnote boxes.

    Returns:
        list: Qualifying records, in ``all_bboxes`` order, without duplicates
        (by equality). ``all_bboxes`` itself is not modified.
    """
    under_footnote = []
    for candidate in all_bboxes:
        left, top, right, bottom = candidate[:4]
        for footnote in footnote_blocks:
            _f_left, _f_top, _f_right, footnote_bottom = footnote
            # Cheap vertical test first; the projection ratio is only
            # computed for records that start below the footnote.
            starts_below = top >= footnote_bottom
            if (
                starts_below
                and calculate_vertical_projection_overlap_ratio(
                    (left, top, right, bottom), footnote
                ) >= 0.8
                and candidate not in under_footnote
            ):
                under_footnote.append(candidate)
                break
    return under_footnote
+
+
def remove_overlaps_min_blocks(all_bboxes):
    """Merge heavily overlapping records, keeping the larger one.

    For every ordered pair of unequal records,
    ``get_minbox_if_overlap_by_ratio(bbox_a, bbox_b, 0.8)`` is consulted
    (presumably it returns the smaller box when the overlap ratio passes the
    threshold - confirm against its definition). When it returns a box, the
    first record whose coordinates equal that box is treated as the small
    one: the other record of the pair is grown in place to the union of both
    boxes, and the small record is removed once the scan finishes.

    Args:
        all_bboxes: List of records with coordinates at indices 0-3.
            Modified in place.

    Returns:
        list: The same ``all_bboxes`` list after merging and removal.
    """
    absorbed = []
    for first in all_bboxes:
        for second in all_bboxes:
            if first == second:
                continue
            min_box = get_minbox_if_overlap_by_ratio(first[:4], second[:4], 0.8)
            if min_box is None:
                continue
            small = next(
                (record for record in all_bboxes if record[:4] == min_box),
                None,
            )
            if small is None or small in absorbed:
                continue
            big = second if first == small else first
            # The small box is not simply dropped: the big one is expanded
            # to cover both before the small one is scheduled for removal.
            big_x0, big_y0, big_x1, big_y1 = big[:4]
            small_x0, small_y0, small_x1, small_y1 = small[:4]
            big[:4] = [
                min(big_x0, small_x0),
                min(big_y0, small_y0),
                max(big_x1, small_x1),
                max(big_y1, small_y1),
            ]
            absorbed.append(small)

    for small in absorbed:
        all_bboxes.remove(small)

    return all_bboxes

+ 56 - 1
mineru/utils/boxbase.py

@@ -156,4 +156,59 @@ def _is_in(box1, box2) -> bool:
     return (x0_1 >= x0_2 and  # box1的左边界不在box2的左边外
             y0_1 >= y0_2 and  # box1的上边界不在box2的上边外
             x1_1 <= x1_2 and  # box1的右边界不在box2的右边外
-            y1_1 <= y1_2)  # box1的下边界不在box2的下边外
+            y1_1 <= y1_2)  # box1的下边界不在box2的下边外
+
+
def calculate_overlap_area_in_bbox1_area_ratio(bbox1, bbox2):
    """Return the fraction of bbox1's area that is covered by bbox2.

    Args:
        bbox1: Reference box as ``(x0, y0, x1, y1)``.
        bbox2: Other box as ``(x0, y0, x1, y1)``.

    Returns:
        float: ``intersection_area / bbox1_area``; ``0.0`` when the boxes do
        not intersect or when bbox1 has zero area.
    """
    # Coordinates of the intersection rectangle.
    x_left = max(bbox1[0], bbox2[0])
    y_top = max(bbox1[1], bbox2[1])
    x_right = min(bbox1[2], bbox2[2])
    y_bottom = min(bbox1[3], bbox2[3])

    if x_right < x_left or y_bottom < y_top:
        return 0.0

    bbox1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
    if bbox1_area == 0:
        # Always return a float so callers get a consistent type.
        return 0.0

    intersection_area = (x_right - x_left) * (y_bottom - y_top)
    return intersection_area / bbox1_area
+
+
def calculate_vertical_projection_overlap_ratio(block1, block2):
    """Return how much of block1's horizontal extent is overlapped by block2.

    Both blocks are projected onto the x-axis; the result is the length of
    the shared x-interval divided by the width of ``block1``.

    Args:
        block1 (tuple): First block as (x0, y0, x1, y1).
        block2 (tuple): Second block as (x0, y0, x1, y1).

    Returns:
        float: Overlap length over block1's width; 0.0 when the projections
        are disjoint or block1 has zero width.
    """
    left_1, _, right_1, _ = block1
    left_2, _, right_2, _ = block2

    # Shared x-interval of the two projections.
    shared_left = max(left_1, left_2)
    shared_right = min(right_1, right_2)
    if shared_right < shared_left:
        return 0.0

    width_1 = right_1 - left_1
    if width_1 == 0:
        return 0.0

    return (shared_right - shared_left) / width_1