11 ay önce · c8cabb3cf6
--- a/magic_pdf/dict2md/ocr_mkcontent.py
+++ b/magic_pdf/dict2md/ocr_mkcontent.py
@@ -5,6 +5,7 @@ from loguru import logger
 
															 from magic_pdf.config.make_content_config import DropMode, MakeMode
														
 
															 from magic_pdf.config.ocr_content_type import BlockType, ContentType
														
 
															 from magic_pdf.libs.commons import join_path
														
 
															+from magic_pdf.libs.language import detect_lang
														
 
															 from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
														
 
															 from magic_pdf.para.para_split_v3 import ListLineTag
														
@@ -142,11 +143,14 @@ def merge_para_with_text(para_block):
 
															             para_text += '  \n'
														
 
															         line_text = ''
														
 
															+        line_lang = ''
														
 
															         for span in line['spans']:
														
 
															             span_type = span['type']
														
 
															             if span_type == ContentType.Text:
														
 
															                 line_text += span['content'].strip()
														
 
															+        if line_text != '':
														
 
															+            line_lang = detect_lang(line_text)
														
 
															         for j, span in enumerate(line['spans']):
														
 
															             span_type = span['type']
														
@@ -159,15 +163,20 @@ def merge_para_with_text(para_block):
 
															                 content = f"\n$$\n{span['content']}\n$$\n"
														
 
															             content = content.strip()
														
 
															-            if content != '':
														
 
															-                if span_type in [ContentType.Text, ContentType.InlineEquation]:
														
 
															-                    # 如果span是line的最后一个且末尾带有-连字符，那么末尾不应该加空格,同时应该把-删除
														
 
															-                    if j == len(line['spans'])-1 and __is_hyphen_at_line_end(content):
														
 
															-                        para_text += content[:-1]
														
 
															-                    else:  # content间需要空格分隔
														
 
															-                        para_text += f'{content} '
														
 
															-                elif span_type == ContentType.InterlineEquation:
														
 
															-                    para_text += content
														
 
															+
														
 
															+            if content:
														
 
															+                langs = ['zh', 'ja', 'ko']
														
 
															+                if line_lang in langs: # 中文/日语/韩文语境下，换行不需要空格分隔
														
 
															+                    para_text += content if j == len(line['spans']) - 1 else f'{content} '
														
 
															+                else:
														
 
															+                    if span_type in [ContentType.Text, ContentType.InlineEquation]:
														
 
															+                        # 如果span是line的最后一个且末尾带有-连字符，那么末尾不应该加空格,同时应该把-删除
														
 
															+                        if j == len(line['spans'])-1 and __is_hyphen_at_line_end(content):
														
 
															+                            para_text += content[:-1]
														
 
															+                        else:  # 西方文本语境下 content间需要空格分隔
														
 
															+                            para_text += f'{content} '
														
 
															+                    elif span_type == ContentType.InterlineEquation:
														
 
															+                        para_text += content
														
 
															             else:
														
 
															                 continue
														
 
															     # 连写字符拆分