Bladeren bron

Change content-making logic to use union_make

赵小蒙 1 jaar geleden
bovenliggende
commit
4bd31ceda4
5 gewijzigde bestanden met 28 toevoegingen en 33 verwijderingen
  1. 4 3
      magic_pdf/cli/magicpdf.py
  2. 8 18
      magic_pdf/pipe/AbsPipe.py
  3. 5 4
      magic_pdf/pipe/OCRPipe.py
  4. 5 4
      magic_pdf/pipe/TXTPipe.py
  5. 6 4
      magic_pdf/pipe/UNIPipe.py

+ 4 - 3
magic_pdf/cli/magicpdf.py

@@ -28,6 +28,7 @@ import click
 from loguru import logger
 from pathlib import Path
 
+from magic_pdf.libs.MakeContentConfig import DropMode
 from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
 from magic_pdf.pipe.UNIPipe import UNIPipe
 from magic_pdf.pipe.OCRPipe import OCRPipe
@@ -78,8 +79,8 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
     pdf_info = pipe.pdf_mid_data['pdf_info']
     draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
     draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
-    md_content = pipe.pipe_mk_markdown(image_dir)
-    #part_file_name = datetime.now().strftime("%H-%M-%S")
+    md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE)
+
     md_writer.write(
         content=md_content, path=f"{pdf_file_name}.md", mode=AbsReaderWriter.MODE_TXT
     )
@@ -89,7 +90,7 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
         mode=AbsReaderWriter.MODE_TXT,
     )
     try:
-        content_list = pipe.pipe_mk_uni_format(image_dir)
+        content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
     except Exception as e:
         logger.exception(e)
     md_writer.write(

+ 8 - 18
magic_pdf/pipe/AbsPipe.py

@@ -1,9 +1,10 @@
 from abc import ABC, abstractmethod
 
 from magic_pdf.dict2md.mkcontent import mk_universal_format, mk_mm_markdown
-from magic_pdf.dict2md.ocr_mkcontent import make_standard_format_with_para, ocr_mk_mm_markdown_with_para
+from magic_pdf.dict2md.ocr_mkcontent import make_standard_format_with_para, ocr_mk_mm_markdown_with_para, union_make
 from magic_pdf.filter.pdf_classify_by_type import classify
 from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
+from magic_pdf.libs.MakeContentConfig import MakeMode, DropMode
 from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
 from magic_pdf.libs.drop_reason import DropReason
 from magic_pdf.libs.json_compressor import JsonCompressor
@@ -41,14 +42,14 @@ class AbsPipe(ABC):
         raise NotImplementedError
 
     @abstractmethod
-    def pipe_mk_uni_format(self):
+    def pipe_mk_uni_format(self, img_parent_path, drop_mode):
         """
         有状态的组装统一格式
         """
         raise NotImplementedError
 
     @abstractmethod
-    def pipe_mk_markdown(self):
+    def pipe_mk_markdown(self, img_parent_path, drop_mode):
         """
         有状态的组装markdown
         """
@@ -83,34 +84,23 @@ class AbsPipe(ABC):
                     return AbsPipe.PIP_OCR
 
     @staticmethod
-    def mk_uni_format(compressed_pdf_mid_data: str, img_buket_path: str) -> list:
+    def mk_uni_format(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF) -> list:
         """
         根据pdf类型,生成统一格式content_list
         """
         pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
-        parse_type = pdf_mid_data["_parse_type"]
         pdf_info_list = pdf_mid_data["pdf_info"]
-        if parse_type == AbsPipe.PIP_TXT:
-            # content_list = mk_universal_format(pdf_info_list, img_buket_path)
-            content_list = make_standard_format_with_para(pdf_info_list, img_buket_path)
-        elif parse_type == AbsPipe.PIP_OCR:
-            content_list = make_standard_format_with_para(pdf_info_list, img_buket_path)
+        content_list = union_make(pdf_info_list, MakeMode.STANDARD_FORMAT, drop_mode, img_buket_path)
         return content_list
 
     @staticmethod
-    def mk_markdown(compressed_pdf_mid_data: str, img_buket_path: str) -> list:
+    def mk_markdown(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF) -> list:
         """
         根据pdf类型,markdown
         """
         pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
-        parse_type = pdf_mid_data["_parse_type"]
         pdf_info_list = pdf_mid_data["pdf_info"]
-        if parse_type == AbsPipe.PIP_TXT:
-            # content_list = mk_universal_format(pdf_info_list, img_buket_path)
-            # md_content = mk_mm_markdown(content_list)
-            md_content = ocr_mk_mm_markdown_with_para(pdf_info_list, img_buket_path)
-        elif parse_type == AbsPipe.PIP_OCR:
-            md_content = ocr_mk_mm_markdown_with_para(pdf_info_list, img_buket_path)
+        md_content = union_make(pdf_info_list, MakeMode.MM_MD, drop_mode, img_buket_path)
         return md_content
 
 

+ 5 - 4
magic_pdf/pipe/OCRPipe.py

@@ -1,3 +1,4 @@
+from magic_pdf.libs.MakeContentConfig import DropMode
 from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
 from magic_pdf.libs.json_compressor import JsonCompressor
 from magic_pdf.pipe.AbsPipe import AbsPipe
@@ -15,10 +16,10 @@ class OCRPipe(AbsPipe):
     def pipe_parse(self):
         self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
 
-    def pipe_mk_uni_format(self, img_parent_path: str):
-        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path)
+    def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
+        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
         return content_list
 
-    def pipe_mk_markdown(self, img_parent_path: str):
-        md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path)
+    def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
+        md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
         return md_content

+ 5 - 4
magic_pdf/pipe/TXTPipe.py

@@ -1,3 +1,4 @@
+from magic_pdf.libs.MakeContentConfig import DropMode
 from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
 from magic_pdf.libs.json_compressor import JsonCompressor
 from magic_pdf.pipe.AbsPipe import AbsPipe
@@ -15,10 +16,10 @@ class TXTPipe(AbsPipe):
     def pipe_parse(self):
         self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
 
-    def pipe_mk_uni_format(self, img_parent_path: str):
-        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path)
+    def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
+        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
         return content_list
 
-    def pipe_mk_markdown(self, img_parent_path: str):
-        md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path)
+    def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
+        md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
         return md_content

+ 6 - 4
magic_pdf/pipe/UNIPipe.py

@@ -1,6 +1,8 @@
 import json
 
 from loguru import logger
+
+from magic_pdf.libs.MakeContentConfig import DropMode
 from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
 from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
 from magic_pdf.libs.commons import join_path
@@ -25,12 +27,12 @@ class UNIPipe(AbsPipe):
             self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer,
                                               is_debug=self.is_debug)
 
-    def pipe_mk_uni_format(self, img_parent_path: str):
-        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path)
+    def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
+        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
         return content_list
 
-    def pipe_mk_markdown(self, img_parent_path: str):
-        markdown_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path)
+    def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
+        markdown_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
         return markdown_content