Bladeren bron

fix(pdf_parse): improve span removal logic for all content types

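- Update remove_outside_spans to collect the bboxes of all blocks (all_block_bboxes), not only image and table bodies
- Add filtering for text, inline-equation and interline-equation spans: a span is kept only if it overlaps some block
- Reuse the existing overlap check (calculate_overlap_area_in_bbox1_area_ratio > 0.5) for these spans, so spans lying mostly outside every block are dropped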
myhloli 1 jaar geleden
bovenliggende
commit
eeda90af31
1 gewijzigd bestand met 8 toevoegingen en 0 verwijderingen
  1. 8 0
      magic_pdf/pdf_parse_union_core_v2.py

+ 8 - 0
magic_pdf/pdf_parse_union_core_v2.py

@@ -385,9 +385,11 @@ def revert_group_blocks(blocks):
 def remove_outside_spans(spans, all_bboxes):
     image_bboxes = []
     table_bboxes = []
+    all_block_bboxes = []
     for block in all_bboxes:
         block_type = block[7]
         block_bbox = block[0:4]
+        all_block_bboxes.append(block_bbox)
         if block_type == BlockType.ImageBody:
             image_bboxes.append(block_bbox)
         elif block_type == BlockType.TableBody:
@@ -396,6 +398,7 @@ def remove_outside_spans(spans, all_bboxes):
             continue
 
     new_spans = []
+
     for span in spans:
         if span['type'] == ContentType.Image:
             for block_bbox in image_bboxes:
@@ -407,6 +410,11 @@ def remove_outside_spans(spans, all_bboxes):
                 if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.5:
                     new_spans.append(span)
                     break
+        elif span['type'] in [ContentType.Text, ContentType.InlineEquation, ContentType.InterlineEquation]:
+            for block_bbox in all_block_bboxes:
+                if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.5:
+                    new_spans.append(span)
+                    break
         else:
             new_spans.append(span)