Pārlūkot izejas kodu

Merge pull request #77 from myhloli/master

fix
myhloli 1 gadu atpakaļ
vecāks
revīzija
683fa63370

+ 5 - 5
magic_pdf/cli/magicpdf.py

@@ -60,11 +60,11 @@ def prepare_env(pdf_file_name, method):
 
 def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer, md_writer, image_dir, local_md_dir):
     if parse_method == "auto":
-        pipe = UNIPipe(pdf_bytes, model_list, image_writer, image_dir, is_debug=True)
+        pipe = UNIPipe(pdf_bytes, model_list, image_writer, is_debug=True)
     elif parse_method == "txt":
-        pipe = TXTPipe(pdf_bytes, model_list, image_writer, image_dir, is_debug=True)
+        pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True)
     elif parse_method == "ocr":
-        pipe = OCRPipe(pdf_bytes, model_list, image_writer, image_dir, is_debug=True)
+        pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
     else:
         print("unknow parse method")
         os.exit(1)
@@ -74,7 +74,7 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
     pdf_info = pipe.pdf_mid_data['pdf_info']
     draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
     draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
-    md_content = pipe.pipe_mk_markdown()
+    md_content = pipe.pipe_mk_markdown(image_dir)
     #part_file_name = datetime.now().strftime("%H-%M-%S")
     md_writer.write(
         content=md_content, path=f"{pdf_file_name}.md", mode=AbsReaderWriter.MODE_TXT
@@ -85,7 +85,7 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
         mode=AbsReaderWriter.MODE_TXT,
     )
     try:
-        content_list = pipe.pipe_mk_uni_format()
+        content_list = pipe.pipe_mk_uni_format(image_dir)
     except Exception as e:
         logger.exception(e)
     md_writer.write(

+ 2 - 2
magic_pdf/filter/pdf_meta_scan.py

@@ -305,7 +305,7 @@ def pdf_meta_scan(pdf_bytes: bytes):
         page_width_pts, page_height_pts = get_pdf_page_size_pts(doc)
         # logger.info(f"page_width_pts: {page_width_pts}, page_height_pts: {page_height_pts}")
 
-        svgs_per_page = get_svgs_per_page(doc)
+        # svgs_per_page = get_svgs_per_page(doc)
         # logger.info(f"svgs_per_page: {svgs_per_page}")
         imgs_per_page = get_imgs_per_page(doc)
         # logger.info(f"imgs_per_page: {imgs_per_page}")
@@ -331,7 +331,7 @@ def pdf_meta_scan(pdf_bytes: bytes):
             "text_len_per_page": text_len_per_page,
             "text_layout_per_page": text_layout_per_page,
             "text_language": text_language,
-            "svgs_per_page": svgs_per_page,
+            # "svgs_per_page": svgs_per_page,
             "imgs_per_page": imgs_per_page,  # 增加每页img数量list
             "junk_img_bojids": junk_img_bojids,  # 增加垃圾图片的bojid list
             "metadata": doc.metadata

+ 2 - 3
magic_pdf/pipe/AbsPipe.py

@@ -16,12 +16,11 @@ class AbsPipe(ABC):
     PIP_OCR = "ocr"
     PIP_TXT = "txt"
 
-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path:str, is_debug:bool=False):
+    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False):
         self.pdf_bytes = pdf_bytes
         self.model_list = model_list
         self.image_writer = image_writer
-        self.img_parent_path = img_parent_path
-        self.pdf_mid_data = None # 未压缩
+        self.pdf_mid_data = None  # 未压缩
         self.is_debug = is_debug
     
     def get_compress_pdf_mid_data(self):

+ 6 - 6
magic_pdf/pipe/OCRPipe.py

@@ -6,8 +6,8 @@ from magic_pdf.user_api import parse_ocr_pdf
 
 class OCRPipe(AbsPipe):
 
-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path: str, is_debug:bool=False):
-        super().__init__(pdf_bytes, model_list, image_writer, img_parent_path, is_debug)
+    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool=False):
+        super().__init__(pdf_bytes, model_list, image_writer, is_debug)
 
     def pipe_classify(self):
         pass
@@ -15,10 +15,10 @@ class OCRPipe(AbsPipe):
     def pipe_parse(self):
         self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
 
-    def pipe_mk_uni_format(self):
-        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), self.img_parent_path)
+    def pipe_mk_uni_format(self, img_parent_path: str):
+        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path)
         return content_list
 
-    def pipe_mk_markdown(self):
-        md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), self.img_parent_path)
+    def pipe_mk_markdown(self, img_parent_path: str):
+        md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path)
         return md_content

+ 6 - 6
magic_pdf/pipe/TXTPipe.py

@@ -6,8 +6,8 @@ from magic_pdf.user_api import parse_txt_pdf
 
 class TXTPipe(AbsPipe):
 
-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path: str, is_debug:bool=False):
-        super().__init__(pdf_bytes, model_list, image_writer, img_parent_path, is_debug)
+    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool=False):
+        super().__init__(pdf_bytes, model_list, image_writer, is_debug)
 
     def pipe_classify(self):
         pass
@@ -15,10 +15,10 @@ class TXTPipe(AbsPipe):
     def pipe_parse(self):
         self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
 
-    def pipe_mk_uni_format(self):
-        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), self.img_parent_path)
+    def pipe_mk_uni_format(self, img_parent_path: str):
+        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path)
         return content_list
 
-    def pipe_mk_markdown(self):
-        md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), self.img_parent_path)
+    def pipe_mk_markdown(self, img_parent_path: str):
+        md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path)
         return md_content

+ 6 - 8
magic_pdf/pipe/UNIPipe.py

@@ -10,10 +10,8 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
 
 class UNIPipe(AbsPipe):
 
-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path: str,
-                 is_debug: bool = False):
-        self.pdf_type = self.PIP_OCR
-        super().__init__(pdf_bytes, model_list, image_writer, img_parent_path, is_debug)
+    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False):
+        super().__init__(pdf_bytes, model_list, image_writer, is_debug)
 
     def pipe_classify(self):
         self.pdf_type = UNIPipe.classify(self.pdf_bytes)
@@ -26,12 +24,12 @@ class UNIPipe(AbsPipe):
             self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer,
                                               is_debug=self.is_debug)
 
-    def pipe_mk_uni_format(self):
-        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), self.img_parent_path)
+    def pipe_mk_uni_format(self, img_parent_path: str):
+        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path)
         return content_list
 
-    def pipe_mk_markdown(self):
-        markdown_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), self.img_parent_path)
+    def pipe_mk_markdown(self, img_parent_path: str):
+        markdown_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path)
         return markdown_content