Przeglądaj źródła

Merge pull request #830 from myhloli/dev

fix(pdf_parse): improve span removal logic for all content types
Xiaomeng Zhao 1 rok temu
rodzic
commit
73afb7d6e1
1 zmieniony plik, 8 dodań i 2 usunięcia
  1. magic_pdf/pdf_parse_union_core_v2.py (+8 −2)

+ 8 - 2
magic_pdf/pdf_parse_union_core_v2.py

@@ -385,17 +385,20 @@ def revert_group_blocks(blocks):
 def remove_outside_spans(spans, all_bboxes):
     image_bboxes = []
     table_bboxes = []
+    other_block_bboxes = []
     for block in all_bboxes:
         block_type = block[7]
         block_bbox = block[0:4]
+
         if block_type == BlockType.ImageBody:
             image_bboxes.append(block_bbox)
         elif block_type == BlockType.TableBody:
             table_bboxes.append(block_bbox)
         else:
-            continue
+            other_block_bboxes.append(block_bbox)
 
     new_spans = []
+
     for span in spans:
         if span['type'] == ContentType.Image:
             for block_bbox in image_bboxes:
@@ -408,7 +411,10 @@ def remove_outside_spans(spans, all_bboxes):
                     new_spans.append(span)
                     break
         else:
-            new_spans.append(span)
+            for block_bbox in other_block_bboxes:
+                if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.5:
+                    new_spans.append(span)
+                    break
 
     return new_spans