Forráskód Böngészése

fix: some text chars were wrongly removed when they overlapped interline equation bboxes

赵小蒙 1 éve
szülő
commit
3c145ba0ca

+ 5 - 4
magic_pdf/pre_proc/equations_replace.py

@@ -107,6 +107,7 @@ def _is_in_or_part_overlap(box1, box2) -> bool:
         or y0_1 > y1_2
     )  # box1 is below box2
 
+
 def remove_text_block_overlap_interline_equation_bbox(
     interline_eq_bboxes, pymu_block_list
 ):
@@ -122,10 +123,10 @@ def remove_text_block_overlap_interline_equation_bbox(
                 deleted_chars = []
                 for char in span["chars"]:
                     if any(
-                        [
-                            _is_in_or_part_overlap(char["bbox"], eq_bbox["bbox"])
-                            for eq_bbox in interline_eq_bboxes
-                        ]
+                            [
+                                (calculate_overlap_area_2_minbox_area_ratio(eq_bbox["bbox"], char["bbox"]) > 0.5)
+                                for eq_bbox in interline_eq_bboxes
+                            ]
                     ):
                         deleted_chars.append(char)
                 # if the span has no chars left, delete the span

+ 19 - 5
magic_pdf/pre_proc/ocr_dict_merge.py

@@ -160,12 +160,12 @@ def fill_spans_in_blocks(blocks, spans, radio):
                 block_spans.append(span)
 
         '''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
-        displayed_list = []
-        text_inline_lines = []
-        modify_y_axis(block_spans, displayed_list, text_inline_lines)
+        # displayed_list = []
+        # text_inline_lines = []
+        # modify_y_axis(block_spans, displayed_list, text_inline_lines)
 
         '''模型识别错误的行间公式, type类型转换成行内公式'''
-        block_spans = modify_inline_equation(block_spans, displayed_list, text_inline_lines)
+        # block_spans = modify_inline_equation(block_spans, displayed_list, text_inline_lines)
 
         '''bbox去除粘连'''  # de-overlapping changes the spans' bboxes, which causes errors in the later fill step
         # block_spans = remove_overlap_between_bbox_for_span(block_spans)
@@ -196,8 +196,10 @@ def fix_block_spans(block_with_spans, img_blocks, table_blocks):
             block = fix_image_block(block, img_blocks)
         elif block_type == BlockType.Table:
             block = fix_table_block(block, table_blocks)
-        elif block_type in [BlockType.Text, BlockType.Title, BlockType.InterlineEquation]:
+        elif block_type in [BlockType.Text, BlockType.Title]:
             block = fix_text_block(block)
+        elif block_type == BlockType.InterlineEquation:
+            block = fix_interline_block(block)
         else:
             continue
         fix_blocks.append(block)
@@ -315,6 +317,18 @@ def fix_table_block(block, table_blocks):
 
 
 def fix_text_block(block):
+    # every equation span inside a text block should be converted to the inline type
+    for span in block['spans']:
+        if span['type'] == ContentType.InterlineEquation:
+            span['type'] = ContentType.InlineEquation
+    block_lines = merge_spans_to_line(block['spans'])
+    sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
+    block['lines'] = sort_block_lines
+    del block['spans']
+    return block
+
+
+def fix_interline_block(block):
     block_lines = merge_spans_to_line(block['spans'])
     sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
     block['lines'] = sort_block_lines