Selaa lähdekoodia

fix(dict2md): add space for inline equations in CJK contexts

- In Chinese, Japanese, and Korean (CJK) text, consecutive lines within a paragraph are joined without inserting a space at the line break.
- However, when a line ends with an inline equation, a space is still added so that the equation is separated from the text that follows.
- This change improves the formatting of documents containing both CJK text and inline equations.
myhloli 11 kuukautta sitten
vanhempi
commit
74ee428bbb
1 muutettua tiedostoa jossa 2 lisäystä ja 2 poistoa
  1. 2 2
      magic_pdf/dict2md/ocr_mkcontent.py

+ 2 - 2
magic_pdf/dict2md/ocr_mkcontent.py

@@ -165,8 +165,8 @@ def merge_para_with_text(para_block):
             if content:
                 langs = ['zh', 'ja', 'ko']
                 # logger.info(f'block_lang: {block_lang}, content: {content}')
-                if block_lang in langs: # 中文/日语/韩文语境下,换行不需要空格分隔
-                    if j == len(line['spans']) - 1:
+                if block_lang in langs: # 中文/日语/韩文语境下,换行不需要空格分隔,但是如果是行内公式结尾,还是要加空格
+                    if j == len(line['spans']) - 1 and span_type not in [ContentType.InlineEquation]:
                         para_text += content
                     else:
                         para_text += f'{content} '