瀏覽代碼

feat(pre_proc): add block type compatibility check for span allocation

- Introduce span_block_type_compatible function to check compatibility between span and block types
- Update fill_spans_in_blocks function to use the new compatibility check
- Improve accuracy of span allocation to blocks based on content type
myhloli 8 月之前
父節點
當前提交
19916856e7
共有 1 個文件被更改,包括 14 次插入和 2 次删除
  1. 14 2
      magic_pdf/pre_proc/ocr_dict_merge.py

+ 14 - 2
magic_pdf/pre_proc/ocr_dict_merge.py

@@ -60,6 +60,19 @@ def merge_spans_to_line(spans, threshold=0.6):
         return lines
 
 
+def span_block_type_compatible(span_type, block_type):
+    """Return whether a span of span_type may be assigned to a block of block_type."""
+    if span_type in (ContentType.Text, ContentType.InlineEquation):
+        return block_type in (BlockType.Text, BlockType.Title, BlockType.ImageCaption, BlockType.ImageFootnote, BlockType.TableCaption, BlockType.TableFootnote)
+    if span_type == ContentType.InterlineEquation:
+        return block_type in (BlockType.InterlineEquation,)
+    if span_type == ContentType.Image:
+        return block_type in (BlockType.ImageBody,)
+    if span_type == ContentType.Table:
+        return block_type in (BlockType.TableBody,)
+    return False
+
+
 def fill_spans_in_blocks(blocks, spans, radio):
     """将allspans中的span按位置关系,放入blocks中."""
     block_with_spans = []
@@ -78,8 +91,7 @@ def fill_spans_in_blocks(blocks, spans, radio):
         block_spans = []
         for span in spans:
             span_bbox = span['bbox']
-            if calculate_overlap_area_in_bbox1_area_ratio(
-                    span_bbox, block_bbox) > radio:
+            if calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > radio and span_block_type_compatible(span['type'], block_type):
                 block_spans.append(span)
 
         block_dict['spans'] = block_spans