Explorar el Código

修复中文语境下长文本因分词导致文本丢失和content间被增加额外空格的问题

赵小蒙 hace 1 año
padre
commit
05fe0548b1
Se ha modificado 1 fichero con 12 adiciones y 2 borrados
  1. 12 2
      magic_pdf/dict2md/ocr_mkcontent.py

+ 12 - 2
magic_pdf/dict2md/ocr_mkcontent.py

@@ -1,4 +1,5 @@
 from magic_pdf.libs.commons import s3_image_save_path, join_path
+from magic_pdf.libs.language import detect_lang
 from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
 from magic_pdf.libs.ocr_content_type import ContentType
 import wordninja
@@ -108,8 +109,14 @@ def ocr_mk_mm_markdown_with_para_core(paras_of_layout, mode):
                 for span in line['spans']:
                     span_type = span.get('type')
                     content = ''
+                    language = ''
                     if span_type == ContentType.Text:
-                        content = ocr_escape_special_markdown_char(split_long_words(span['content']))
+                        content = span['content']
+                        language = detect_lang(content)
+                        if language == 'en':  # 只对英文长词进行分词处理,中文分词会丢失文本
+                            content = ocr_escape_special_markdown_char(split_long_words(content))
+                        else:
+                            content = ocr_escape_special_markdown_char(content)
                     elif span_type == ContentType.InlineEquation:
                         content = f"${ocr_escape_special_markdown_char(span['content'])}$"
                     elif span_type == ContentType.InterlineEquation:
@@ -120,7 +127,10 @@ def ocr_mk_mm_markdown_with_para_core(paras_of_layout, mode):
                         elif mode == 'nlp':
                             pass
                     if content != '':
-                        para_text += content + ' '
+                        if language == 'en':  # 英文语境下 content间需要空格分隔
+                            para_text += content + ' '
+                        else:  # 中文语境下,content间不需要空格分隔
+                            para_text += content
             if para_text.strip() == '':
                 continue
             else: