Przeglądaj źródła

refactor(para): improve block merging logic in para_split_v3.py

- Add checks for uppercase character start in the first span of a block
myhloli 11 miesięcy temu
rodzic
commit
160624bd36
1 zmienionych plików z 7 dodań i 2 usunięć
  1. 7 2
      magic_pdf/para/para_split_v3.py

+ 7 - 2
magic_pdf/para/para_split_v3.py

@@ -271,13 +271,18 @@ def __merge_2_text_blocks(block1, block2):
                     first_span = first_line['spans'][0]
                     if len(first_span['content']) > 0:
                         span_start_with_num = first_span['content'][0].isdigit()
+                        span_start_with_big_char = first_span['content'][0].isupper()
                         if (
-                            abs(block2['bbox_fs'][2] - last_line['bbox'][2])
-                            < line_height
+                            # 上一个block的最后一个line的右边界和block的右边界差距不超过line_height
+                            abs(block2['bbox_fs'][2] - last_line['bbox'][2]) < line_height
+                            # 上一个block的最后一个span不是以特定符号结尾
                             and not last_span['content'].endswith(LINE_STOP_FLAG)
                             # 两个block宽度差距超过2倍也不合并
                             and abs(block1_weight - block2_weight) < min_block_weight
+                            # the first character of the next block must not be a digit
                             and not span_start_with_num
+                            # the first character of the next block must not be an uppercase letter
+                            and not span_start_with_big_char
                         ):
                             if block1['page_num'] != block2['page_num']:
                                 for line in block1['lines']: