Răsfoiți Sursa

pipe初始化移除img_parent_path参数

赵小蒙 1 an în urmă
părinte
comite
5f3cf14afc

+ 5 - 5
magic_pdf/cli/magicpdf.py

@@ -60,11 +60,11 @@ def prepare_env(pdf_file_name, method):
 
 def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer, md_writer, image_dir, local_md_dir):
     if parse_method == "auto":
-        pipe = UNIPipe(pdf_bytes, model_list, image_writer, image_dir, is_debug=True)
+        pipe = UNIPipe(pdf_bytes, model_list, image_writer, is_debug=True)
     elif parse_method == "txt":
-        pipe = TXTPipe(pdf_bytes, model_list, image_writer, image_dir, is_debug=True)
+        pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True)
     elif parse_method == "ocr":
-        pipe = OCRPipe(pdf_bytes, model_list, image_writer, image_dir, is_debug=True)
+        pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
     else:
         print("unknow parse method")
         os.exit(1)
@@ -74,7 +74,7 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
     pdf_info = pipe.pdf_mid_data['pdf_info']
     draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
     draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
-    md_content = pipe.pipe_mk_markdown()
+    md_content = pipe.pipe_mk_markdown(image_dir)
     #part_file_name = datetime.now().strftime("%H-%M-%S")
     md_writer.write(
         content=md_content, path=f"{pdf_file_name}.md", mode=AbsReaderWriter.MODE_TXT
@@ -85,7 +85,7 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
         mode=AbsReaderWriter.MODE_TXT,
     )
     try:
-        content_list = pipe.pipe_mk_uni_format()
+        content_list = pipe.pipe_mk_uni_format(image_dir)
     except Exception as e:
         logger.exception(e)
     md_writer.write(

+ 2 - 3
magic_pdf/pipe/AbsPipe.py

@@ -16,12 +16,11 @@ class AbsPipe(ABC):
     PIP_OCR = "ocr"
     PIP_TXT = "txt"
 
-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path:str, is_debug:bool=False):
+    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False):
         self.pdf_bytes = pdf_bytes
         self.model_list = model_list
         self.image_writer = image_writer
-        self.img_parent_path = img_parent_path
-        self.pdf_mid_data = None # 未压缩
+        self.pdf_mid_data = None  # 未压缩
         self.is_debug = is_debug
     
     def get_compress_pdf_mid_data(self):

+ 6 - 6
magic_pdf/pipe/OCRPipe.py

@@ -6,8 +6,8 @@ from magic_pdf.user_api import parse_ocr_pdf
 
 class OCRPipe(AbsPipe):
 
-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path: str, is_debug:bool=False):
-        super().__init__(pdf_bytes, model_list, image_writer, img_parent_path, is_debug)
+    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool=False):
+        super().__init__(pdf_bytes, model_list, image_writer, is_debug)
 
     def pipe_classify(self):
         pass
@@ -15,10 +15,10 @@ class OCRPipe(AbsPipe):
     def pipe_parse(self):
         self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
 
-    def pipe_mk_uni_format(self):
-        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), self.img_parent_path)
+    def pipe_mk_uni_format(self, img_parent_path: str):
+        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path)
         return content_list
 
-    def pipe_mk_markdown(self):
-        md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), self.img_parent_path)
+    def pipe_mk_markdown(self, img_parent_path: str):
+        md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path)
         return md_content

+ 6 - 6
magic_pdf/pipe/TXTPipe.py

@@ -6,8 +6,8 @@ from magic_pdf.user_api import parse_txt_pdf
 
 class TXTPipe(AbsPipe):
 
-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path: str, is_debug:bool=False):
-        super().__init__(pdf_bytes, model_list, image_writer, img_parent_path, is_debug)
+    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool=False):
+        super().__init__(pdf_bytes, model_list, image_writer, is_debug)
 
     def pipe_classify(self):
         pass
@@ -15,10 +15,10 @@ class TXTPipe(AbsPipe):
     def pipe_parse(self):
         self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
 
-    def pipe_mk_uni_format(self):
-        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), self.img_parent_path)
+    def pipe_mk_uni_format(self, img_parent_path: str):
+        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path)
         return content_list
 
-    def pipe_mk_markdown(self):
-        md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), self.img_parent_path)
+    def pipe_mk_markdown(self, img_parent_path: str):
+        md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path)
         return md_content

+ 6 - 8
magic_pdf/pipe/UNIPipe.py

@@ -10,10 +10,8 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
 
 class UNIPipe(AbsPipe):
 
-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path: str,
-                 is_debug: bool = False):
-        self.pdf_type = self.PIP_OCR
-        super().__init__(pdf_bytes, model_list, image_writer, img_parent_path, is_debug)
+    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False):
+        super().__init__(pdf_bytes, model_list, image_writer, is_debug)
 
     def pipe_classify(self):
         self.pdf_type = UNIPipe.classify(self.pdf_bytes)
@@ -26,12 +24,12 @@ class UNIPipe(AbsPipe):
             self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer,
                                               is_debug=self.is_debug)
 
-    def pipe_mk_uni_format(self):
-        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), self.img_parent_path)
+    def pipe_mk_uni_format(self, img_parent_path: str):
+        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path)
         return content_list
 
-    def pipe_mk_markdown(self):
-        markdown_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), self.img_parent_path)
+    def pipe_mk_markdown(self, img_parent_path: str):
+        markdown_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path)
         return markdown_content