Kaynağa Gözat

refactor(para): improve language detection and block splitting

- Add language detection for each block of text
- Implement language-specific logic for right margin alignment
- Import the loguru logger for debugging purposes (the only log call added is left commented out)
myhloli 11 ay önce
ebeveyn
işleme
f674b8d413
1 değiştirilmiş dosya ile 14 ekleme ve 5 silme
  1. magic_pdf/para/para_split_v3.py (+14 −5)

+ 14 - 5
magic_pdf/para/para_split_v3.py

@@ -1,7 +1,10 @@
 import copy
 
+from loguru import logger
+
 from magic_pdf.config.constants import CROSS_PAGE, LINES_DELETED
 from magic_pdf.config.ocr_content_type import BlockType, ContentType
+from magic_pdf.libs.language import detect_lang
 
 LINE_STOP_FLAG = (
     '.',
@@ -125,6 +128,9 @@ def __is_list_or_index_block(block):
 
             # 添加所有文本,包括空行,保持与block['lines']长度一致
             lines_text_list.append(line_text)
+            block_text = ''.join(lines_text_list)
+            block_lang = detect_lang(block_text)
+            # logger.info(f"block_lang: {block_lang}")
 
             # 计算line左侧顶格数量是否大于2,是否顶格用abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2 来判断
             if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height / 2:
@@ -136,13 +142,16 @@ def __is_list_or_index_block(block):
             if abs(block['bbox_fs'][2] - line['bbox'][2]) < line_height:
                 right_close_num += 1
             else:
-                # 右侧不顶格情况下是否有一段距离,拍脑袋用0.3block宽度做阈值
-                # block宽的阈值可以小些,block窄的阈值要大
-
-                if block_weight_radio >= 0.5:
+                # 类中文没有超长单词的情况,可以用统一的阈值
+                if block_lang in ['zh', 'ja', 'ko']:
                     closed_area = 0.26 * block_weight
                 else:
-                    closed_area = 0.36 * block_weight
+                    # 右侧不顶格情况下是否有一段距离,拍脑袋用0.3block宽度做阈值
+                    # block宽的阈值可以小些,block窄的阈值要大
+                    if block_weight_radio >= 0.5:
+                        closed_area = 0.26 * block_weight
+                    else:
+                        closed_area = 0.36 * block_weight
                 if block['bbox_fs'][2] - line['bbox'][2] > closed_area:
                     right_not_close_num += 1