浏览代码

update: Add an md make-mode config option to do_parse. You can control whether the produced md is for NLP or MM by changing the value of f_make_md_mode.

myhloli 1 年之前
父节点
当前提交
f8f6ba6fd3
共有 5 个文件被更改,包括 44 次插入和 39 次删除
  1. 3 2
      magic_pdf/cli/magicpdf.py
  2. 8 14
      magic_pdf/pipe/AbsPipe.py
  3. 12 8
      magic_pdf/pipe/OCRPipe.py
  4. 12 8
      magic_pdf/pipe/TXTPipe.py
  5. 9 7
      magic_pdf/pipe/UNIPipe.py

+ 3 - 2
magic_pdf/cli/magicpdf.py

@@ -28,7 +28,7 @@ from loguru import logger
 from pathlib import Path
 from magic_pdf.libs.version import __version__
 
-from magic_pdf.libs.MakeContentConfig import DropMode
+from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
 from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
 from magic_pdf.pipe.UNIPipe import UNIPipe
 from magic_pdf.pipe.OCRPipe import OCRPipe
@@ -81,6 +81,7 @@ def do_parse(
         f_dump_model_json=True,
         f_dump_orig_pdf=True,
         f_dump_content_list=True,
+        f_make_md_mode=MakeMode.MM_MD,
 ):
     orig_model_list = copy.deepcopy(model_list)
 
@@ -118,7 +119,7 @@ def do_parse(
     if f_draw_span_bbox:
         draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
 
-    md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE)
+    md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode)
     if f_dump_md:
         """写markdown"""
         md_writer.write(

+ 8 - 14
magic_pdf/pipe/AbsPipe.py

@@ -47,19 +47,13 @@ class AbsPipe(ABC):
         """
         raise NotImplementedError
 
-    @abstractmethod
-    def pipe_mk_uni_format(self, img_parent_path, drop_mode):
-        """
-        有状态的组装统一格式
-        """
-        raise NotImplementedError
+    def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
+        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
+        return content_list
 
-    @abstractmethod
-    def pipe_mk_markdown(self, img_parent_path, drop_mode):
-        """
-        有状态的组装markdown
-        """
-        raise NotImplementedError
+    def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
+        md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode, md_make_mode)
+        return md_content
 
     @staticmethod
     def classify(pdf_bytes: bytes) -> str:
@@ -101,13 +95,13 @@ class AbsPipe(ABC):
         return content_list
 
     @staticmethod
-    def mk_markdown(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF) -> list:
+    def mk_markdown(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD) -> list:
         """
         根据pdf类型,markdown
         """
         pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
         pdf_info_list = pdf_mid_data["pdf_info"]
-        md_content = union_make(pdf_info_list, MakeMode.MM_MD, drop_mode, img_buket_path)
+        md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_buket_path)
         return md_content
 
 

+ 12 - 8
magic_pdf/pipe/OCRPipe.py

@@ -1,4 +1,6 @@
-from magic_pdf.libs.MakeContentConfig import DropMode
+from loguru import logger
+
+from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
 from magic_pdf.pipe.AbsPipe import AbsPipe
@@ -7,7 +9,7 @@ from magic_pdf.user_api import parse_ocr_pdf
 
 class OCRPipe(AbsPipe):
 
-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool=False):
+    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False):
         super().__init__(pdf_bytes, model_list, image_writer, is_debug)
 
     def pipe_classify(self):
@@ -20,9 +22,11 @@ class OCRPipe(AbsPipe):
         self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
 
     def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
-        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
-        return content_list
-
-    def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
-        md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
-        return md_content
+        result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
+        logger.info("ocr_pipe mk content list finished")
+        return result
+
+    def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
+        result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
+        logger.info(f"ocr_pipe mk {md_make_mode} finished")
+        return result

+ 12 - 8
magic_pdf/pipe/TXTPipe.py

@@ -1,4 +1,6 @@
-from magic_pdf.libs.MakeContentConfig import DropMode
+from loguru import logger
+
+from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
 from magic_pdf.libs.json_compressor import JsonCompressor
@@ -8,7 +10,7 @@ from magic_pdf.user_api import parse_txt_pdf
 
 class TXTPipe(AbsPipe):
 
-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool=False):
+    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False):
         super().__init__(pdf_bytes, model_list, image_writer, is_debug)
 
     def pipe_classify(self):
@@ -21,9 +23,11 @@ class TXTPipe(AbsPipe):
         self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
 
     def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
-        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
-        return content_list
-
-    def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
-        md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
-        return md_content
+        result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
+        logger.info("txt_pipe mk content list finished")
+        return result
+
+    def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
+        result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
+        logger.info(f"txt_pipe mk {md_make_mode} finished")
+        return result

+ 9 - 7
magic_pdf/pipe/UNIPipe.py

@@ -2,7 +2,7 @@ import json
 
 from loguru import logger
 
-from magic_pdf.libs.MakeContentConfig import DropMode
+from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
 from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
@@ -39,12 +39,14 @@ class UNIPipe(AbsPipe):
                                               is_debug=self.is_debug)
 
     def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
-        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
-        return content_list
-
-    def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
-        markdown_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
-        return markdown_content
+        result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
+        logger.info("uni_pipe mk content list finished")
+        return result
+
+    def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
+        result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
+        logger.info(f"uni_pipe mk {md_make_mode} finished")
+        return result
 
 
 if __name__ == '__main__':