1 gadu atpakaļ · 66e3ce9c4a
--- a/magic_pdf/dict2md/ocr_mkcontent.py
+++ b/magic_pdf/dict2md/ocr_mkcontent.py
@@ -14,7 +14,7 @@ def split_long_words(text):
 
				     for i in range(len(segments)):
			
 
				         words = re.findall(r'\w+|[^\w]', segments[i], re.UNICODE)
			
 
				         for j in range(len(words)):
			
 
				-            if len(words[j]) > 15:
			
 
				+            if len(words[j]) > 10:
			
 
				                 words[j] = ' '.join(wordninja.split(words[j]))
			
 
				         segments[i] = ''.join(words)
			
 
				     return ' '.join(segments)
			
@@ -147,6 +147,18 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
 
				 
			
 
				 
			
 
				 def merge_para_with_text(para_block):
			
 
				+    def detect_language(text):
			
 
				+        """Classify text as 'en', "unknown" or "empty" by its share of ASCII letters."""
			
 
				+        # Collect every run of ASCII letters; all other characters are ignored.
			
 
				+        letter_runs = re.findall(r'[a-zA-Z]+', text)
			
 
				+        total_chars = len(text)
			
 
				+        if total_chars == 0:
			
 
				+            return "empty"
			
 
				+        letter_total = sum(map(len, letter_runs))
			
 
				+        # English when ASCII letters make up at least half of all characters.
			
 
				+        is_english = letter_total / total_chars >= 0.5
			
 
				+        return 'en' if is_english else "unknown"
			
 
				+
			
 
				     para_text = ''
			
 
				     for line in para_block['lines']:
			
 
				         line_text = ""
			
@@ -162,7 +174,8 @@ def merge_para_with_text(para_block):
 
				             content = ''
			
 
				             if span_type == ContentType.Text:
			
 
				                 content = span['content']
			
 
				-                language = detect_lang(content)
			
 
				+                # language = detect_lang(content)
			
 
				+                language = detect_language(content)
			
 
				                 if language == 'en':  # 只对英文长词进行分词处理，中文分词会丢失文本
			
 
				                     content = ocr_escape_special_markdown_char(split_long_words(content))
			
 
				                 else:
			
@@ -171,12 +184,12 @@ def merge_para_with_text(para_block):
 
				                 content = f" ${span['content']}$ "
			
 
				             elif span_type == ContentType.InterlineEquation:
			
 
				                 content = f"\n$$\n{span['content']}\n$$\n"
			
 
				-
			
 
				             if content != '':
			
 
				-                if 'zh' in line_lang:  # 遇到一些一个字一个span的文档，这种单字语言判断不准，需要用整行文本判断
			
 
				-                    para_text += content  # 中文语境下，content间不需要空格分隔
			
 
				+                langs = ['zh', 'ja', 'ko']
			
 
				+                if line_lang in langs:  # 遇到一些一个字一个span的文档，这种单字语言判断不准，需要用整行文本判断
			
 
				+                    para_text += content  # 中文/日语/韩文语境下，content间不需要空格分隔
			
 
				                 else:
			
 
				-                    para_text += content + ' '  # 英文语境下 content间需要空格分隔
			
 
				+                    para_text += content + ' '  # 西方文本语境下 content间需要空格分隔
			
 
				     return para_text
			
 
				 
			
 
				 
			
@@ -202,7 +215,6 @@ def para_to_standard_format(para, img_buket_path):
 
				                 elif span_type == ContentType.InlineEquation:
			
 
				                     content = f"${span['content']}$"
			
 
				                     inline_equation_num += 1
			
 
				-
			
 
				                 if language == 'en':  # 英文语境下 content间需要空格分隔
			
 
				                     para_text += content + ' '
			
 
				                 else:  # 中文语境下，content间不需要空格分隔