hace 1 año · 05fe0548b1
--- a/magic_pdf/dict2md/ocr_mkcontent.py
+++ b/magic_pdf/dict2md/ocr_mkcontent.py
@@ -1,4 +1,5 @@
 
				 from magic_pdf.libs.commons import s3_image_save_path, join_path
			
 
				+from magic_pdf.libs.language import detect_lang
			
 
				 from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
			
 
				 from magic_pdf.libs.ocr_content_type import ContentType
			
 
				 import wordninja
			
@@ -108,8 +109,14 @@ def ocr_mk_mm_markdown_with_para_core(paras_of_layout, mode):
 
				                 for span in line['spans']:
			
 
				                     span_type = span.get('type')
			
 
				                     content = ''
			
 
				+                    language = ''
			
 
				                     if span_type == ContentType.Text:
			
 
				-                        content = ocr_escape_special_markdown_char(split_long_words(span['content']))
			
 
				+                        content = span['content']
			
 
				+                        language = detect_lang(content)
			
 
				+                        if language == 'en':  # 只对英文长词进行分词处理，中文分词会丢失文本
			
 
				+                            content = ocr_escape_special_markdown_char(split_long_words(content))
			
 
				+                        else:
			
 
				+                            content = ocr_escape_special_markdown_char(content)
			
 
				                     elif span_type == ContentType.InlineEquation:
			
 
				                         content = f"${ocr_escape_special_markdown_char(span['content'])}$"
			
 
				                     elif span_type == ContentType.InterlineEquation:
			
@@ -120,7 +127,10 @@ def ocr_mk_mm_markdown_with_para_core(paras_of_layout, mode):
 
				                         elif mode == 'nlp':
			
 
				                             pass
			
 
				                     if content != '':
			
 
				-                        para_text += content + ' '
			
 
				+                        if language == 'en':  # 英文语境下 content间需要空格分隔
			
 
				+                            para_text += content + ' '
			
 
				+                        else:  # 中文语境下，content间不需要空格分隔
			
 
				+                            para_text += content
			
 
				             if para_text.strip() == '':
			
 
				                 continue
			
 
				             else: