|
|
@@ -5,6 +5,7 @@ from loguru import logger
|
|
|
from magic_pdf.config.make_content_config import DropMode, MakeMode
|
|
|
from magic_pdf.config.ocr_content_type import BlockType, ContentType
|
|
|
from magic_pdf.libs.commons import join_path
|
|
|
+from magic_pdf.libs.language import detect_lang
|
|
|
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
|
|
|
from magic_pdf.para.para_split_v3 import ListLineTag
|
|
|
|
|
|
@@ -135,18 +136,19 @@ def __replace_ligatures(text: str):
|
|
|
|
|
|
|
|
|
def merge_para_with_text(para_block):
|
|
|
+ block_text = ''
|
|
|
+ for line in para_block['lines']:
|
|
|
+ for span in line['spans']:
|
|
|
+ if span['type'] in [ContentType.Text]:
|
|
|
+ block_text += span['content']
|
|
|
+ block_lang = detect_lang(block_text)
|
|
|
+
|
|
|
para_text = ''
|
|
|
for i, line in enumerate(para_block['lines']):
|
|
|
|
|
|
if i >= 1 and line.get(ListLineTag.IS_LIST_START_LINE, False):
|
|
|
para_text += ' \n'
|
|
|
|
|
|
- line_text = ''
|
|
|
- for span in line['spans']:
|
|
|
- span_type = span['type']
|
|
|
- if span_type == ContentType.Text:
|
|
|
- line_text += span['content'].strip()
|
|
|
-
|
|
|
for j, span in enumerate(line['spans']):
|
|
|
|
|
|
span_type = span['type']
|
|
|
@@ -159,15 +161,24 @@ def merge_para_with_text(para_block):
|
|
|
content = f"\n$$\n{span['content']}\n$$\n"
|
|
|
|
|
|
content = content.strip()
|
|
|
- if content != '':
|
|
|
- if span_type in [ContentType.Text, ContentType.InlineEquation]:
|
|
|
- # 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
|
|
|
- if j == len(line['spans'])-1 and __is_hyphen_at_line_end(content):
|
|
|
- para_text += content[:-1]
|
|
|
- else: # content间需要空格分隔
|
|
|
+
|
|
|
+ if content:
|
|
|
+ langs = ['zh', 'ja', 'ko']
|
|
|
+ # logger.info(f'block_lang: {block_lang}, content: {content}')
|
|
|
+ if block_lang in langs: # 中文/日语/韩文语境下,换行不需要空格分隔
|
|
|
+ if j == len(line['spans']) - 1:
|
|
|
+ para_text += content
|
|
|
+ else:
|
|
|
para_text += f'{content} '
|
|
|
- elif span_type == ContentType.InterlineEquation:
|
|
|
- para_text += content
|
|
|
+ else:
|
|
|
+ if span_type in [ContentType.Text, ContentType.InlineEquation]:
|
|
|
+ # 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
|
|
|
+ if j == len(line['spans'])-1 and span_type == ContentType.Text and __is_hyphen_at_line_end(content):
|
|
|
+ para_text += content[:-1]
|
|
|
+ else: # 西方文本语境下 content间需要空格分隔
|
|
|
+ para_text += f'{content} '
|
|
|
+ elif span_type == ContentType.InterlineEquation:
|
|
|
+ para_text += content
|
|
|
else:
|
|
|
continue
|
|
|
# 连写字符拆分
|