|
|
@@ -0,0 +1,224 @@
|
|
|
+# Copyright (c) Opendatalab. All rights reserved.
|
|
|
+from mineru.utils.boxbase import (
|
|
|
+ calculate_iou,
|
|
|
+ calculate_overlap_area_in_bbox1_area_ratio,
|
|
|
+ calculate_vertical_projection_overlap_ratio,
|
|
|
+ get_minbox_if_overlap_by_ratio
|
|
|
+)
|
|
|
+from mineru.utils.enum_class import BlockType
|
|
|
+
|
|
|
+
|
|
|
+def prepare_block_bboxes(
|
|
|
+ img_body_blocks,
|
|
|
+ img_caption_blocks,
|
|
|
+ img_footnote_blocks,
|
|
|
+ table_body_blocks,
|
|
|
+ table_caption_blocks,
|
|
|
+ table_footnote_blocks,
|
|
|
+ discarded_blocks,
|
|
|
+ text_blocks,
|
|
|
+ title_blocks,
|
|
|
+ interline_equation_blocks,
|
|
|
+ page_w,
|
|
|
+ page_h,
|
|
|
+):
|
|
|
+ all_bboxes = []
|
|
|
+
|
|
|
+ add_bboxes(img_body_blocks, BlockType.IMAGE_BODY, all_bboxes)
|
|
|
+ add_bboxes(img_caption_blocks, BlockType.IMAGE_CAPTION, all_bboxes)
|
|
|
+ add_bboxes(img_footnote_blocks, BlockType.IMAGE_CAPTION, all_bboxes)
|
|
|
+ add_bboxes(table_body_blocks, BlockType.TABLE_BODY, all_bboxes)
|
|
|
+ add_bboxes(table_caption_blocks, BlockType.TABLE_CAPTION, all_bboxes)
|
|
|
+ add_bboxes(table_footnote_blocks, BlockType.TABLE_FOOTNOTE, all_bboxes)
|
|
|
+ add_bboxes(text_blocks, BlockType.TEXT, all_bboxes)
|
|
|
+ add_bboxes(title_blocks, BlockType.TITLE, all_bboxes)
|
|
|
+ add_bboxes(interline_equation_blocks, BlockType.INTERLINE_EQUATION, all_bboxes)
|
|
|
+
|
|
|
+ """block嵌套问题解决"""
|
|
|
+ """文本框与标题框重叠,优先信任文本框"""
|
|
|
+ all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
|
|
|
+ """任何框体与舍弃框重叠,优先信任舍弃框"""
|
|
|
+ all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)
|
|
|
+
|
|
|
+ # interline_equation 与title或text框冲突的情况,分两种情况处理
|
|
|
+ """interline_equation框与文本类型框iou比较接近1的时候,信任行间公式框"""
|
|
|
+ all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
|
|
|
+ """interline_equation框被包含在文本类型框内,且interline_equation比文本区块小很多时信任文本框,这时需要舍弃公式框"""
|
|
|
+ # 通过后续大框套小框逻辑删除
|
|
|
+
|
|
|
+ """discarded_blocks"""
|
|
|
+ all_discarded_blocks = []
|
|
|
+ add_bboxes(discarded_blocks, BlockType.DISCARDED, all_discarded_blocks)
|
|
|
+
|
|
|
+ """footnote识别:宽度超过1/3页面宽度的,高度超过10的,处于页面下半30%区域的"""
|
|
|
+ footnote_blocks = []
|
|
|
+ for discarded in discarded_blocks:
|
|
|
+ x0, y0, x1, y1 = discarded['bbox']
|
|
|
+ if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h * 0.7):
|
|
|
+ footnote_blocks.append([x0, y0, x1, y1])
|
|
|
+
|
|
|
+ """移除在footnote下面的任何框"""
|
|
|
+ need_remove_blocks = find_blocks_under_footnote(all_bboxes, footnote_blocks)
|
|
|
+ if len(need_remove_blocks) > 0:
|
|
|
+ for block in need_remove_blocks:
|
|
|
+ all_bboxes.remove(block)
|
|
|
+ all_discarded_blocks.append(block)
|
|
|
+
|
|
|
+ """经过以上处理后,还存在大框套小框的情况,则删除小框"""
|
|
|
+ all_bboxes = remove_overlaps_min_blocks(all_bboxes)
|
|
|
+ all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
|
|
|
+ """将剩余的bbox做分离处理,防止后面分layout时出错"""
|
|
|
+ # all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
|
|
|
+ all_bboxes.sort(key=lambda x: x[0]+x[1])
|
|
|
+ return all_bboxes, all_discarded_blocks, footnote_blocks
|
|
|
+
|
|
|
+
|
|
|
+def add_bboxes(blocks, block_type, bboxes):
|
|
|
+ for block in blocks:
|
|
|
+ x0, y0, x1, y1 = block['bbox']
|
|
|
+ if block_type in [
|
|
|
+ BlockType.IMAGE_BODY,
|
|
|
+ BlockType.IMAGE_CAPTION,
|
|
|
+ BlockType.IMAGE_FOOTNOTE,
|
|
|
+ BlockType.TABLE_BODY,
|
|
|
+ BlockType.TABLE_CAPTION,
|
|
|
+ BlockType.TABLE_FOOTNOTE,
|
|
|
+ ]:
|
|
|
+ bboxes.append([x0, y0, x1, y1, None, None, None, block_type, None, None, None, None, block['score'], block['group_id']])
|
|
|
+ else:
|
|
|
+ bboxes.append([x0, y0, x1, y1, None, None, None, block_type, None, None, None, None, block['score']])
|
|
|
+
|
|
|
+
|
|
|
+def fix_text_overlap_title_blocks(all_bboxes):
|
|
|
+ # 先提取所有text和title block
|
|
|
+ text_blocks = []
|
|
|
+ for block in all_bboxes:
|
|
|
+ if block[7] == BlockType.TEXT:
|
|
|
+ text_blocks.append(block)
|
|
|
+ title_blocks = []
|
|
|
+ for block in all_bboxes:
|
|
|
+ if block[7] == BlockType.TITLE:
|
|
|
+ title_blocks.append(block)
|
|
|
+
|
|
|
+ need_remove = []
|
|
|
+
|
|
|
+ for text_block in text_blocks:
|
|
|
+ for title_block in title_blocks:
|
|
|
+ text_block_bbox = text_block[:4]
|
|
|
+ title_block_bbox = title_block[:4]
|
|
|
+ if calculate_iou(text_block_bbox, title_block_bbox) > 0.8:
|
|
|
+ if title_block not in need_remove:
|
|
|
+ need_remove.append(title_block)
|
|
|
+
|
|
|
+ if len(need_remove) > 0:
|
|
|
+ for block in need_remove:
|
|
|
+ all_bboxes.remove(block)
|
|
|
+
|
|
|
+ return all_bboxes
|
|
|
+
|
|
|
+
|
|
|
+def remove_need_drop_blocks(all_bboxes, discarded_blocks):
|
|
|
+ need_remove = []
|
|
|
+ for block in all_bboxes:
|
|
|
+ for discarded_block in discarded_blocks:
|
|
|
+ block_bbox = block[:4]
|
|
|
+ if (
|
|
|
+ calculate_overlap_area_in_bbox1_area_ratio(
|
|
|
+ block_bbox, discarded_block['bbox']
|
|
|
+ )
|
|
|
+ > 0.6
|
|
|
+ ):
|
|
|
+ if block not in need_remove:
|
|
|
+ need_remove.append(block)
|
|
|
+ break
|
|
|
+
|
|
|
+ if len(need_remove) > 0:
|
|
|
+ for block in need_remove:
|
|
|
+ all_bboxes.remove(block)
|
|
|
+ return all_bboxes
|
|
|
+
|
|
|
+
|
|
|
+def fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes):
|
|
|
+ # 先提取所有text和interline block
|
|
|
+ text_blocks = []
|
|
|
+ for block in all_bboxes:
|
|
|
+ if block[7] == BlockType.TEXT:
|
|
|
+ text_blocks.append(block)
|
|
|
+ interline_equation_blocks = []
|
|
|
+ for block in all_bboxes:
|
|
|
+ if block[7] == BlockType.INTERLINE_EQUATION:
|
|
|
+ interline_equation_blocks.append(block)
|
|
|
+
|
|
|
+ need_remove = []
|
|
|
+
|
|
|
+ for interline_equation_block in interline_equation_blocks:
|
|
|
+ for text_block in text_blocks:
|
|
|
+ interline_equation_block_bbox = interline_equation_block[:4]
|
|
|
+ text_block_bbox = text_block[:4]
|
|
|
+ if calculate_iou(interline_equation_block_bbox, text_block_bbox) > 0.8:
|
|
|
+ if text_block not in need_remove:
|
|
|
+ need_remove.append(text_block)
|
|
|
+
|
|
|
+ if len(need_remove) > 0:
|
|
|
+ for block in need_remove:
|
|
|
+ all_bboxes.remove(block)
|
|
|
+
|
|
|
+ return all_bboxes
|
|
|
+
|
|
|
+
|
|
|
+def find_blocks_under_footnote(all_bboxes, footnote_blocks):
|
|
|
+ need_remove_blocks = []
|
|
|
+ for block in all_bboxes:
|
|
|
+ block_x0, block_y0, block_x1, block_y1 = block[:4]
|
|
|
+ for footnote_bbox in footnote_blocks:
|
|
|
+ footnote_x0, footnote_y0, footnote_x1, footnote_y1 = footnote_bbox
|
|
|
+ # 如果footnote的纵向投影覆盖了block的纵向投影的80%且block的y0大于等于footnote的y1
|
|
|
+ if (
|
|
|
+ block_y0 >= footnote_y1
|
|
|
+ and calculate_vertical_projection_overlap_ratio(
|
|
|
+ (block_x0, block_y0, block_x1, block_y1), footnote_bbox
|
|
|
+ )
|
|
|
+ >= 0.8
|
|
|
+ ):
|
|
|
+ if block not in need_remove_blocks:
|
|
|
+ need_remove_blocks.append(block)
|
|
|
+ break
|
|
|
+ return need_remove_blocks
|
|
|
+
|
|
|
+
|
|
|
+def remove_overlaps_min_blocks(all_bboxes):
|
|
|
+ # 重叠block,小的不能直接删除,需要和大的那个合并成一个更大的。
|
|
|
+ # 删除重叠blocks中较小的那些
|
|
|
+ need_remove = []
|
|
|
+ for block1 in all_bboxes:
|
|
|
+ for block2 in all_bboxes:
|
|
|
+ if block1 != block2:
|
|
|
+ block1_bbox = block1[:4]
|
|
|
+ block2_bbox = block2[:4]
|
|
|
+ overlap_box = get_minbox_if_overlap_by_ratio(
|
|
|
+ block1_bbox, block2_bbox, 0.8
|
|
|
+ )
|
|
|
+ if overlap_box is not None:
|
|
|
+ block_to_remove = next(
|
|
|
+ (block for block in all_bboxes if block[:4] == overlap_box),
|
|
|
+ None,
|
|
|
+ )
|
|
|
+ if (
|
|
|
+ block_to_remove is not None
|
|
|
+ and block_to_remove not in need_remove
|
|
|
+ ):
|
|
|
+ large_block = block1 if block1 != block_to_remove else block2
|
|
|
+ x1, y1, x2, y2 = large_block[:4]
|
|
|
+ sx1, sy1, sx2, sy2 = block_to_remove[:4]
|
|
|
+ x1 = min(x1, sx1)
|
|
|
+ y1 = min(y1, sy1)
|
|
|
+ x2 = max(x2, sx2)
|
|
|
+ y2 = max(y2, sy2)
|
|
|
+ large_block[:4] = [x1, y1, x2, y2]
|
|
|
+ need_remove.append(block_to_remove)
|
|
|
+
|
|
|
+ if len(need_remove) > 0:
|
|
|
+ for block in need_remove:
|
|
|
+ all_bboxes.remove(block)
|
|
|
+
|
|
|
+ return all_bboxes
|