فهرست منبع

Merge pull request #79 from myhloli/master

fix UNIPipe and spans space with language
myhloli 1 سال پیش
والد
کامیت
83b96f2fcf
2 فایل تغییر یافته به همراه 21 افزوده شده و 13 حذف شده
  1. +4 -4
      magic_pdf/dict2md/ocr_mkcontent.py
  2. +17 -9
      magic_pdf/pipe/UNIPipe.py

+ 4 - 4
magic_pdf/dict2md/ocr_mkcontent.py

@@ -159,10 +159,10 @@ def merge_para_with_text(para_block):
                 content = f"\n$$\n{span['content']}\n$$\n"
 
             if content != '':
-                if language == 'en':  # 英文语境下 content间需要空格分隔
-                    para_text += content + ' '
-                else:  # 中文语境下,content间不需要空格分隔
-                    para_text += content
+                if 'zh' in language:
+                    para_text += content  # 中文语境下,content间不需要空格分隔
+                else:
+                    para_text += content + ' '  # 英文语境下 content间需要空格分隔
     return para_text
 
 

+ 17 - 9
magic_pdf/pipe/UNIPipe.py

@@ -10,11 +10,12 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
 
 class UNIPipe(AbsPipe):
 
-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False):
-        super().__init__(pdf_bytes, model_list, image_writer, is_debug)
+    def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: AbsReaderWriter, is_debug: bool = False):
+        self.pdf_type = jso_useful_key["_pdf_type"]
+        super().__init__(pdf_bytes, jso_useful_key["model_list"], image_writer, is_debug)
 
     def pipe_classify(self):
-        self.pdf_type = UNIPipe.classify(self.pdf_bytes)
+        self.pdf_type = AbsPipe.classify(self.pdf_bytes)
 
     def pipe_parse(self):
         if self.pdf_type == self.PIP_TXT:
@@ -46,14 +47,21 @@ if __name__ == '__main__':
     img_bucket_path = "imgs"
     img_writer = DiskReaderWriter(join_path(write_path, img_bucket_path))
 
-    pipe = UNIPipe(pdf_bytes, model_list, img_writer, img_bucket_path)
+    # pdf_type = UNIPipe.classify(pdf_bytes)
+    # jso_useful_key = {
+    #     "_pdf_type": pdf_type,
+    #     "model_list": model_list
+    # }
+
+    jso_useful_key = {
+        "_pdf_type": "",
+        "model_list": model_list
+    }
+    pipe = UNIPipe(pdf_bytes, jso_useful_key, img_writer)
     pipe.pipe_classify()
     pipe.pipe_parse()
-    md_content = pipe.pipe_mk_markdown()
-    try:
-        content_list = pipe.pipe_mk_uni_format()
-    except Exception as e:
-        logger.exception(e)
+    md_content = pipe.pipe_mk_markdown(img_bucket_path)
+    content_list = pipe.pipe_mk_uni_format(img_bucket_path)
 
     md_writer = DiskReaderWriter(write_path)
     md_writer.write(md_content, "19983-00.md", AbsReaderWriter.MODE_TXT)