Kaynağa Gözat

feat(ocr_mkcontent): add language detection for line spacing

- Introduce language detection to determine line spacing based on language context
- Implement different spacing rules for Chinese/Japanese/Korean and Western texts
- Adjust span content handling based on detected language and span type
myhloli 11 ay önce
ebeveyn
işleme
c8cabb3cf6
1 değiştirilmiş dosya ile 18 ekleme ve 9 silme
  1. 18 9
      magic_pdf/dict2md/ocr_mkcontent.py

+ 18 - 9
magic_pdf/dict2md/ocr_mkcontent.py

@@ -5,6 +5,7 @@ from loguru import logger
 from magic_pdf.config.make_content_config import DropMode, MakeMode
 from magic_pdf.config.make_content_config import DropMode, MakeMode
 from magic_pdf.config.ocr_content_type import BlockType, ContentType
 from magic_pdf.config.ocr_content_type import BlockType, ContentType
 from magic_pdf.libs.commons import join_path
 from magic_pdf.libs.commons import join_path
+from magic_pdf.libs.language import detect_lang
 from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
 from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
 from magic_pdf.para.para_split_v3 import ListLineTag
 from magic_pdf.para.para_split_v3 import ListLineTag
 
 
@@ -142,11 +143,14 @@ def merge_para_with_text(para_block):
             para_text += '  \n'
             para_text += '  \n'
 
 
         line_text = ''
         line_text = ''
+        line_lang = ''
         for span in line['spans']:
         for span in line['spans']:
             span_type = span['type']
             span_type = span['type']
             if span_type == ContentType.Text:
             if span_type == ContentType.Text:
                 line_text += span['content'].strip()
                 line_text += span['content'].strip()
 
 
+        if line_text != '':
+            line_lang = detect_lang(line_text)
         for j, span in enumerate(line['spans']):
         for j, span in enumerate(line['spans']):
 
 
             span_type = span['type']
             span_type = span['type']
@@ -159,15 +163,20 @@ def merge_para_with_text(para_block):
                 content = f"\n$$\n{span['content']}\n$$\n"
                 content = f"\n$$\n{span['content']}\n$$\n"
 
 
             content = content.strip()
             content = content.strip()
-            if content != '':
-                if span_type in [ContentType.Text, ContentType.InlineEquation]:
-                    # 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
-                    if j == len(line['spans'])-1 and __is_hyphen_at_line_end(content):
-                        para_text += content[:-1]
-                    else:  # content间需要空格分隔
-                        para_text += f'{content} '
-                elif span_type == ContentType.InterlineEquation:
-                    para_text += content
+
+            if content:
+                langs = ['zh', 'ja', 'ko']
+                if line_lang in langs: # 中文/日语/韩文语境下,换行不需要空格分隔
+                    para_text += content if j == len(line['spans']) - 1 else f'{content} '
+                else:
+                    if span_type in [ContentType.Text, ContentType.InlineEquation]:
+                        # 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
+                        if j == len(line['spans'])-1 and __is_hyphen_at_line_end(content):
+                            para_text += content[:-1]
+                        else:  # 西方文本语境下 content间需要空格分隔
+                            para_text += f'{content} '
+                    elif span_type == ContentType.InterlineEquation:
+                        para_text += content
             else:
             else:
                 continue
                 continue
     # 连写字符拆分
     # 连写字符拆分