Просмотр исходного кода

refactor(magic_pdf): improve paragraph splitting logic and update dependencies

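- Optimize paragraph splitting algorithm for better text block separation: detect the block language once, after the text of all lines has been collected, instead of re-running detection on every line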
- Pin fast-langdetect to >=0.2.3,<0.3.0 to ensure compatibility
myhloli 8 месяцев назад
Родитель
Коммит
842483ccb3
2 изменённых файла: 17 добавлений и 14 удалений
  1. 16 13
      magic_pdf/post_proc/para_split_v3.py
  2. 1 1
      requirements.txt

+ 16 - 13
magic_pdf/post_proc/para_split_v3.py

@@ -108,29 +108,32 @@ def __is_list_or_index_block(block):
         ):
             multiple_para_flag = True
 
-        for line in block['lines']:
-            line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
-            block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
-            if (
-                line['bbox'][0] - block['bbox_fs'][0] > 0.7 * line_height
-                and block['bbox_fs'][2] - line['bbox'][2] > 0.7 * line_height
-            ):
-                external_sides_not_close_num += 1
-            if abs(line_mid_x - block_mid_x) < line_height / 2:
-                center_close_num += 1
+        block_text = ''
 
+        for line in block['lines']:
             line_text = ''
 
             for span in line['spans']:
                 span_type = span['type']
                 if span_type == ContentType.Text:
                     line_text += span['content'].strip()
-
             # 添加所有文本,包括空行,保持与block['lines']长度一致
             lines_text_list.append(line_text)
             block_text = ''.join(lines_text_list)
-            block_lang = detect_lang(block_text)
-            # logger.info(f"block_lang: {block_lang}")
+
+        block_lang = detect_lang(block_text)
+        # logger.info(f"block_lang: {block_lang}")
+
+        for line in block['lines']:
+            line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
+            block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
+            if (
+                line['bbox'][0] - block['bbox_fs'][0] > 0.7 * line_height
+                and block['bbox_fs'][2] - line['bbox'][2] > 0.7 * line_height
+            ):
+                external_sides_not_close_num += 1
+            if abs(line_mid_x - block_mid_x) < line_height / 2:
+                center_close_num += 1
 
             # 计算line左侧顶格数量是否大于2,是否顶格用abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2 来判断
             if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height / 2:

+ 1 - 1
requirements.txt

@@ -1,7 +1,7 @@
 boto3>=1.28.43
 Brotli>=1.1.0
 click>=8.1.7
-fast-langdetect>=0.2.3
+fast-langdetect>=0.2.3,<0.3.0
 loguru>=0.6.0
 numpy>=1.21.6,<2.0.0
 pydantic>=2.7.2