|
|
@@ -14,7 +14,7 @@ def split_long_words(text):
|
|
|
for i in range(len(segments)):
|
|
|
words = re.findall(r'\w+|[^\w]', segments[i], re.UNICODE)
|
|
|
for j in range(len(words)):
|
|
|
- if len(words[j]) > 15:
|
|
|
+ if len(words[j]) > 10:
|
|
|
words[j] = ' '.join(wordninja.split(words[j]))
|
|
|
segments[i] = ''.join(words)
|
|
|
return ' '.join(segments)
|
|
|
@@ -147,6 +147,18 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
|
|
|
|
|
|
|
|
|
def merge_para_with_text(para_block):
|
|
|
def detect_language(text, threshold=0.5):
    """Classify ``text`` as English based on its share of ASCII letters.

    The ratio is computed as (number of ``a-z``/``A-Z`` characters) divided
    by the total length of ``text``. Whitespace, digits and punctuation count
    toward the total length, so they lower the ratio.

    Args:
        text: The string to inspect. ``None`` is treated like an empty string.
        threshold: Minimum fraction of ASCII letters (inclusive) required for
            the text to be reported as English. Defaults to ``0.5``.

    Returns:
        ``'en'`` if the ASCII-letter ratio is at least ``threshold``,
        ``"empty"`` if ``text`` is ``None`` or has zero length,
        ``"unknown"`` otherwise.
    """
    # Guard first: avoids a division by zero and a TypeError from re on None.
    if not text:
        return "empty"

    # Only unaccented Latin letters are counted; accented letters, digits and
    # punctuation do not contribute to the English character total.
    en_length = sum(len(match) for match in re.findall(r'[a-zA-Z]+', text))

    if en_length / len(text) >= threshold:
        return 'en'
    return "unknown"
|
|
|
+
|
|
|
para_text = ''
|
|
|
for line in para_block['lines']:
|
|
|
line_text = ""
|
|
|
@@ -162,7 +174,8 @@ def merge_para_with_text(para_block):
|
|
|
content = ''
|
|
|
if span_type == ContentType.Text:
|
|
|
content = span['content']
|
|
|
- language = detect_lang(content)
|
|
|
+ # language = detect_lang(content)
|
|
|
+ language = detect_language(content)
|
|
|
if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本
|
|
|
content = ocr_escape_special_markdown_char(split_long_words(content))
|
|
|
else:
|
|
|
@@ -171,12 +184,12 @@ def merge_para_with_text(para_block):
|
|
|
content = f" ${span['content']}$ "
|
|
|
elif span_type == ContentType.InterlineEquation:
|
|
|
content = f"\n$$\n{span['content']}\n$$\n"
|
|
|
-
|
|
|
if content != '':
|
|
|
- if 'zh' in line_lang: # 遇到一些一个字一个span的文档,这种单字语言判断不准,需要用整行文本判断
|
|
|
- para_text += content # 中文语境下,content间不需要空格分隔
|
|
|
+ langs = ['zh', 'ja', 'ko']
|
|
|
+ if line_lang in langs: # 遇到一些一个字一个span的文档,这种单字语言判断不准,需要用整行文本判断
|
|
|
+ para_text += content # 中文/日语/韩文语境下,content间不需要空格分隔
|
|
|
else:
|
|
|
- para_text += content + ' ' # 英文语境下 content间需要空格分隔
|
|
|
+ para_text += content + ' ' # 西方文本语境下 content间需要空格分隔
|
|
|
return para_text
|
|
|
|
|
|
|
|
|
@@ -202,7 +215,6 @@ def para_to_standard_format(para, img_buket_path):
|
|
|
elif span_type == ContentType.InlineEquation:
|
|
|
content = f"${span['content']}$"
|
|
|
inline_equation_num += 1
|
|
|
-
|
|
|
if language == 'en': # 英文语境下 content间需要空格分隔
|
|
|
para_text += content + ' '
|
|
|
else: # 中文语境下,content间不需要空格分隔
|