|
|
@@ -1,7 +1,7 @@
|
|
|
# Copyright (c) Opendatalab. All rights reserved.
|
|
|
from mineru.utils.boxbase import calculate_overlap_area_in_bbox1_area_ratio
|
|
|
from mineru.utils.enum_class import BlockType, ContentType
|
|
|
-from mineru.utils.ocr_utils import __is_overlaps_y_exceeds_threshold
|
|
|
+from mineru.utils.ocr_utils import __is_overlaps_y_exceeds_threshold, __is_overlaps_x_exceeds_threshold
|
|
|
|
|
|
|
|
|
def fill_spans_in_blocks(blocks, spans, radio):
|
|
|
@@ -71,8 +71,26 @@ def fix_text_block(block):
|
|
|
for span in block['spans']:
|
|
|
if span['type'] == ContentType.INTERLINE_EQUATION:
|
|
|
span['type'] = ContentType.INLINE_EQUATION
|
|
|
- block_lines = merge_spans_to_line(block['spans'])
|
|
|
- sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
|
|
|
+
|
|
|
+ # 假设block中的span超过80%的数量高度是宽度的两倍以上,则认为是纵向文本块
|
|
|
+ vertical_span_count = sum(
|
|
|
+ 1 for span in block['spans']
|
|
|
+ if (span['bbox'][3] - span['bbox'][1]) / (span['bbox'][2] - span['bbox'][0]) > 2
|
|
|
+ )
|
|
|
+ total_span_count = len(block['spans'])
|
|
|
+ if total_span_count == 0:
|
|
|
+ vertical_ratio = 0
|
|
|
+ else:
|
|
|
+ vertical_ratio = vertical_span_count / total_span_count
|
|
|
+
|
|
|
+ if vertical_ratio > 0.8:
|
|
|
+ # 如果是纵向文本块,则按纵向lines处理
|
|
|
+ block_lines = merge_spans_to_vertical_line(block['spans'])
|
|
|
+ sort_block_lines = vertical_line_sort_spans_from_top_to_bottom(block_lines)
|
|
|
+ else:
|
|
|
+ block_lines = merge_spans_to_line(block['spans'])
|
|
|
+ sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
|
|
|
+
|
|
|
block['lines'] = sort_block_lines
|
|
|
del block['spans']
|
|
|
return block
|
|
|
@@ -117,6 +135,44 @@ def merge_spans_to_line(spans, threshold=0.6):
|
|
|
return lines
|
|
|
|
|
|
|
|
|
def merge_spans_to_vertical_line(spans, threshold=0.6):
    """Group the spans of a vertical text block into columns, right to left.

    Args:
        spans: List of span dicts. Each span is read through ``span['bbox']``
            (indexed as ``[x0, y0, x1, y1]``) and ``span['type']``.
            NOTE: the list is sorted IN PLACE by ``bbox[2]`` in descending
            order, so the caller's list is reordered.
        threshold: Passed unchanged to ``__is_overlaps_x_exceeds_threshold``
            when deciding whether two spans belong to the same column.

    Returns:
        A list of columns, each a non-empty list of spans, in the order the
        columns were encountered (largest right edge first). Spans inside a
        column are NOT yet ordered top to bottom. An empty input yields ``[]``.
    """
    if not spans:
        return []

    # Span types that must always sit in a column of their own.
    standalone_types = (
        ContentType.INTERLINE_EQUATION,
        ContentType.IMAGE,
        ContentType.TABLE,
    )

    # Vertical text is read right to left, so visit spans by descending x1.
    spans.sort(key=lambda span: span['bbox'][2], reverse=True)

    vertical_lines = []
    current_line = [spans[0]]

    for span in spans[1:]:
        # A new column starts when:
        #   * the incoming span is a standalone type, or
        #   * the column being built already holds a standalone span, or
        #   * the incoming span does not overlap the column's last span
        #     along the x-axis by more than `threshold` (as judged by the
        #     x-overlap helper; spans stacked in one column share x-range).
        starts_new_column = (
            span['type'] in standalone_types
            or any(s['type'] in standalone_types for s in current_line)
            or not __is_overlaps_x_exceeds_threshold(
                span['bbox'], current_line[-1]['bbox'], threshold
            )
        )
        if starts_new_column:
            vertical_lines.append(current_line)
            current_line = [span]
        else:
            current_line.append(span)

    # Flush the last column; it always contains at least one span here.
    vertical_lines.append(current_line)

    return vertical_lines
|
|
|
+
|
|
|
+
|
|
|
# 将每一个line中的span从左到右排序
|
|
|
def line_sort_spans_by_left_to_right(lines):
|
|
|
line_objects = []
|
|
|
@@ -136,6 +192,28 @@ def line_sort_spans_by_left_to_right(lines):
|
|
|
return line_objects
|
|
|
|
|
|
|
|
|
def vertical_line_sort_spans_from_top_to_bottom(vertical_lines):
    """Order the spans of each vertical column top to bottom and box them.

    Args:
        vertical_lines: List of columns, each a list of span dicts whose
            ``span['bbox']`` is indexed as ``[x0, y0, x1, y1]``. Each column
            list is sorted IN PLACE by ``bbox[1]`` (ascending).

    Returns:
        A list with one dict per non-empty column, in input order:
        ``{'bbox': [x0, y0, x1, y1], 'spans': column}`` where ``bbox`` is the
        union of the column's span boxes and ``spans`` is the (now sorted)
        column list itself. Empty columns are skipped.
    """
    line_objects = []
    for line in vertical_lines:
        if not line:
            # An empty column has no bounding box; min()/max() on an empty
            # sequence would raise ValueError, so skip it.
            continue

        # Reading order inside a vertical column is top to bottom (by y0).
        line.sort(key=lambda span: span['bbox'][1])

        # Union of all span boxes in the column, gathered in a single pass.
        x0s, y0s, x1s, y1s = zip(*(span['bbox'][:4] for span in line))
        line_bbox = [min(x0s), min(y0s), max(x1s), max(y1s)]

        line_objects.append({
            'bbox': line_bbox,
            'spans': line,
        })
    return line_objects
|
|
|
+
|
|
|
+
|
|
|
def fix_block_spans(block_with_spans):
|
|
|
fix_blocks = []
|
|
|
for block in block_with_spans:
|