소스 검색

refactor(magic_pdf): improve title block merging logic

- Rename and update merge_title_blocks function
- Implement merge_two_bbox helper function
- Refactor merging logic to preserve original block structure
- Update function calls and integrate with existing pipeline
myhloli 10 달 전
부모
커밋
8570e006f8
1개의 변경된 파일, 33개의 추가 그리고 21개의 삭제
  1. 33 21
      magic_pdf/pdf_parse_union_core_v2.py

+ 33 - 21
magic_pdf/pdf_parse_union_core_v2.py

@@ -674,38 +674,48 @@ def parse_page_core(
     page_w, page_h = magic_model.get_page_size(page_id)
 
     def merge_title_blocks(blocks, x_distance_threshold=0.1*page_w):
-        def merge_two_blocks(b1, b2):
-            # 合并两个标题块的边界框
+        def merge_two_bbox(b1, b2):
             x_min = min(b1['bbox'][0], b2['bbox'][0])
             y_min = min(b1['bbox'][1], b2['bbox'][1])
             x_max = max(b1['bbox'][2], b2['bbox'][2])
             y_max = max(b1['bbox'][3], b2['bbox'][3])
-            merged_bbox = (x_min, y_min, x_max, y_max)
+            return x_min, y_min, x_max, y_max
+
+        def merge_two_blocks(b1, b2):
+            # 合并两个标题块的边界框
+            b1['bbox'] = merge_two_bbox(b1, b2)
 
             # 合并两个标题块的文本内容
-            merged_score = (b1['score'] + b2['score']) / 2
+            line1 = b1['lines'][0]
+            line2 = b2['lines'][0]
+            line1['bbox'] = merge_two_bbox(line1, line2)
+            line1['spans'].extend(line2['spans'])
 
-            return {'bbox': merged_bbox, 'score': merged_score}
+            return b1, b2
 
         # 按 y 轴重叠度聚集标题块
         y_overlapping_blocks = []
-        while blocks:
-            block1 = blocks.pop(0)
+        title_bs = [b for b in blocks if b['type'] == BlockType.Title]
+        while title_bs:
+            block1 = title_bs.pop(0)
             current_row = [block1]
             to_remove = []
-            for block2 in blocks:
-                if __is_overlaps_y_exceeds_threshold(block1['bbox'], block2['bbox'], 0.9):
+            for block2 in title_bs:
+                if (
+                    __is_overlaps_y_exceeds_threshold(block1['bbox'], block2['bbox'], 0.9)
+                    and len(block1['lines']) == 1
+                    and len(block2['lines']) == 1
+                ):
                     current_row.append(block2)
                     to_remove.append(block2)
             for b in to_remove:
-                blocks.remove(b)
+                title_bs.remove(b)
             y_overlapping_blocks.append(current_row)
 
         # 按x轴坐标排序并合并标题块
-        merged_blocks = []
+        to_remove_blocks = []
         for row in y_overlapping_blocks:
             if len(row) == 1:
-                merged_blocks.append(row[0])
                 continue
 
             # 按x轴坐标排序
@@ -719,18 +729,17 @@ def parse_page_core(
                 left_height = left_block['bbox'][3] - left_block['bbox'][1]
                 right_height = right_block['bbox'][3] - right_block['bbox'][1]
 
-                if right_block['bbox'][0] - left_block['bbox'][2] < x_distance_threshold and left_height * 0.95 < right_height < left_height * 1.05:
-                    merged_block = merge_two_blocks(merged_block, right_block)
+                if (
+                    right_block['bbox'][0] - left_block['bbox'][2] < x_distance_threshold
+                    and left_height * 0.95 < right_height < left_height * 1.05
+                ):
+                    merged_block, to_remove_block = merge_two_blocks(merged_block, right_block)
+                    to_remove_blocks.append(to_remove_block)
                 else:
-                    merged_blocks.append(merged_block)
                     merged_block = right_block
 
-            merged_blocks.append(merged_block)
-
-        return merged_blocks
-
-    """同一行被断开的titile合并"""
-    title_blocks = merge_title_blocks(title_blocks)
+        for b in to_remove_blocks:
+            blocks.remove(b)
 
     """将所有区块的bbox整理到一起"""
     # interline_equation_blocks参数不够准,后面切换到interline_equations上
@@ -816,6 +825,9 @@ def parse_page_core(
     """对block进行fix操作"""
     fix_blocks = fix_block_spans_v2(block_with_spans)
 
+    """同一行被断开的titile合并"""
+    merge_title_blocks(fix_blocks)
+
     """获取所有line并计算正文line的高度"""
     line_height = get_line_height(fix_blocks)