瀏覽代碼

refactor(mkcontent): optimize paragraph text merging and language detection

- Extract language detection to block level instead of line level
- Improve logic for handling Chinese, Japanese, and Korean languages
- Refactor code for better readability and performance
- Optimize handling of hyphenated words at line ends
myhloli 11 月之前
父節點
當前提交
b80befe9cf
共有 1 個文件被更改,包括 14 次插入和 12 次删除
  1. 14 12
      magic_pdf/dict2md/ocr_mkcontent.py

+ 14 - 12
magic_pdf/dict2md/ocr_mkcontent.py

@@ -136,21 +136,19 @@ def __replace_ligatures(text: str):
 
 
 def merge_para_with_text(para_block):
+    block_text = ''
+    for line in para_block['lines']:
+        for span in line['spans']:
+            if span['type'] in [ContentType.Text]:
+                block_text += span['content']
+    block_lang = detect_lang(block_text)
+
     para_text = ''
     for i, line in enumerate(para_block['lines']):
 
         if i >= 1 and line.get(ListLineTag.IS_LIST_START_LINE, False):
             para_text += '  \n'
 
-        line_text = ''
-        line_lang = ''
-        for span in line['spans']:
-            span_type = span['type']
-            if span_type == ContentType.Text:
-                line_text += span['content'].strip()
-
-        if line_text != '':
-            line_lang = detect_lang(line_text)
         for j, span in enumerate(line['spans']):
 
             span_type = span['type']
@@ -166,12 +164,16 @@ def merge_para_with_text(para_block):
 
             if content:
                 langs = ['zh', 'ja', 'ko']
-                if line_lang in langs: # 中文/日语/韩文语境下,换行不需要空格分隔
-                    para_text += content if j == len(line['spans']) - 1 else f'{content} '
+                # logger.info(f'block_lang: {block_lang}, content: {content}')
+                if block_lang in langs: # 中文/日语/韩文语境下,换行不需要空格分隔
+                    if j == len(line['spans']) - 1:
+                        para_text += content
+                    else:
+                        para_text += f'{content} '
                 else:
                     if span_type in [ContentType.Text, ContentType.InlineEquation]:
                         # 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
-                        if j == len(line['spans'])-1 and __is_hyphen_at_line_end(content):
+                        if j == len(line['spans'])-1 and span_type == ContentType.Text and __is_hyphen_at_line_end(content):
                             para_text += content[:-1]
                         else:  # 西方文本语境下 content间需要空格分隔
                             para_text += f'{content} '