Ver código fonte

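Fix lost image bug: match image/table blocks via _is_in_or_part_overlap_with_area_ratio (0.95) instead of exact bbox equality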

赵小蒙 1 ano atrás
pai
commit
f1252e71bd
1 arquivo alterado com 6 adições e 5 exclusões
  1. 6 5
      magic_pdf/pre_proc/ocr_dict_merge.py

+ 6 - 5
magic_pdf/pre_proc/ocr_dict_merge.py

@@ -1,7 +1,7 @@
 from loguru import logger
 from loguru import logger
 
 
 from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \
 from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \
-    calculate_overlap_area_in_bbox1_area_ratio
+    calculate_overlap_area_in_bbox1_area_ratio, _is_in_or_part_overlap_with_area_ratio
 from magic_pdf.libs.drop_tag import DropTag
 from magic_pdf.libs.drop_tag import DropTag
 from magic_pdf.libs.ocr_content_type import ContentType, BlockType
 from magic_pdf.libs.ocr_content_type import ContentType, BlockType
 from magic_pdf.pre_proc.ocr_span_list_modify import modify_y_axis, modify_inline_equation
 from magic_pdf.pre_proc.ocr_span_list_modify import modify_y_axis, modify_inline_equation
@@ -247,10 +247,11 @@ def fix_image_block(block, img_blocks):
     block['blocks'] = []
     block['blocks'] = []
     # 遍历img_blocks,找到与当前block匹配的img_block
     # 遍历img_blocks,找到与当前block匹配的img_block
     for img_block in img_blocks:
     for img_block in img_blocks:
-        if img_block['bbox'] == block['bbox']:
+        if _is_in_or_part_overlap_with_area_ratio(block['bbox'], img_block['bbox'], 0.95):
+
             # 创建img_body_block
             # 创建img_body_block
             for span in block['spans']:
             for span in block['spans']:
-                if span['type'] == ContentType.Image and span['bbox'] == img_block['img_body_bbox']:
+                if span['type'] == ContentType.Image and img_block['img_body_bbox'] == span['bbox']:
                     # 创建img_body_block
                     # 创建img_body_block
                     img_body_block = make_body_block(span, img_block['img_body_bbox'], BlockType.ImageBody)
                     img_body_block = make_body_block(span, img_block['img_body_bbox'], BlockType.ImageBody)
                     block['blocks'].append(img_body_block)
                     block['blocks'].append(img_body_block)
@@ -275,11 +276,11 @@ def fix_table_block(block, table_blocks):
     block['blocks'] = []
     block['blocks'] = []
     # 遍历table_blocks,找到与当前block匹配的table_block
     # 遍历table_blocks,找到与当前block匹配的table_block
     for table_block in table_blocks:
     for table_block in table_blocks:
-        if table_block['bbox'] == block['bbox']:
+        if _is_in_or_part_overlap_with_area_ratio(block['bbox'], table_block['bbox'], 0.95):
 
 
             # 创建table_body_block
             # 创建table_body_block
             for span in block['spans']:
             for span in block['spans']:
-                if span['type'] == ContentType.Table and span['bbox'] == table_block['table_body_bbox']:
+                if span['type'] == ContentType.Table and table_block['table_body_bbox'] == span['bbox']:
                     # 创建table_body_block
                     # 创建table_body_block
                     table_body_block = make_body_block(span, table_block['table_body_bbox'], BlockType.TableBody)
                     table_body_block = make_body_block(span, table_block['table_body_bbox'], BlockType.TableBody)
                     block['blocks'].append(table_body_block)
                     block['blocks'].append(table_body_block)