1 gadu atpakaļ · 683fa63370
--- a/magic_pdf/cli/magicpdf.py
+++ b/magic_pdf/cli/magicpdf.py
@@ -60,11 +60,11 @@ def prepare_env(pdf_file_name, method):
 
				 
			
 
				 def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer, md_writer, image_dir, local_md_dir):
			
 
				     if parse_method == "auto":
			
 
				-        pipe = UNIPipe(pdf_bytes, model_list, image_writer, image_dir, is_debug=True)
			
 
				+        pipe = UNIPipe(pdf_bytes, model_list, image_writer, is_debug=True)
			
 
				     elif parse_method == "txt":
			
 
				-        pipe = TXTPipe(pdf_bytes, model_list, image_writer, image_dir, is_debug=True)
			
 
				+        pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True)
			
 
				     elif parse_method == "ocr":
			
 
				-        pipe = OCRPipe(pdf_bytes, model_list, image_writer, image_dir, is_debug=True)
			
 
				+        pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
			
 
				     else:
			
 
				         print("unknow parse method")
			
 
				         os.exit(1)
			
@@ -74,7 +74,7 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
 
				     pdf_info = pipe.pdf_mid_data['pdf_info']
			
 
				     draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
			
 
				     draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
			
 
				-    md_content = pipe.pipe_mk_markdown()
			
 
				+    md_content = pipe.pipe_mk_markdown(image_dir)
			
 
				     #part_file_name = datetime.now().strftime("%H-%M-%S")
			
 
				     md_writer.write(
			
 
				         content=md_content, path=f"{pdf_file_name}.md", mode=AbsReaderWriter.MODE_TXT
			
@@ -85,7 +85,7 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
 
				         mode=AbsReaderWriter.MODE_TXT,
			
 
				     )
			
 
				     try:
			
 
				-        content_list = pipe.pipe_mk_uni_format()
			
 
				+        content_list = pipe.pipe_mk_uni_format(image_dir)
			
 
				     except Exception as e:
			
 
				         logger.exception(e)
			
 
				     md_writer.write(
			
--- a/magic_pdf/filter/pdf_meta_scan.py
+++ b/magic_pdf/filter/pdf_meta_scan.py
@@ -305,7 +305,7 @@ def pdf_meta_scan(pdf_bytes: bytes):
 
				         page_width_pts, page_height_pts = get_pdf_page_size_pts(doc)
			
 
				         # logger.info(f"page_width_pts: {page_width_pts}, page_height_pts: {page_height_pts}")
			
 
				 
			
 
				-        svgs_per_page = get_svgs_per_page(doc)
			
 
				+        # svgs_per_page = get_svgs_per_page(doc)
			
 
				         # logger.info(f"svgs_per_page: {svgs_per_page}")
			
 
				         imgs_per_page = get_imgs_per_page(doc)
			
 
				         # logger.info(f"imgs_per_page: {imgs_per_page}")
			
@@ -331,7 +331,7 @@ def pdf_meta_scan(pdf_bytes: bytes):
 
				             "text_len_per_page": text_len_per_page,
			
 
				             "text_layout_per_page": text_layout_per_page,
			
 
				             "text_language": text_language,
			
 
				-            "svgs_per_page": svgs_per_page,
			
 
				+            # "svgs_per_page": svgs_per_page,
			
 
				             "imgs_per_page": imgs_per_page,  # 增加每页img数量list
			
 
				             "junk_img_bojids": junk_img_bojids,  # 增加垃圾图片的bojid list
			
 
				             "metadata": doc.metadata
			
--- a/magic_pdf/pipe/AbsPipe.py
+++ b/magic_pdf/pipe/AbsPipe.py
@@ -16,12 +16,11 @@ class AbsPipe(ABC):
 
				     PIP_OCR = "ocr"
			
 
				     PIP_TXT = "txt"
			
 
				 
			
 
				-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path:str, is_debug:bool=False):
			
 
				+    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False):
			
 
				         self.pdf_bytes = pdf_bytes
			
 
				         self.model_list = model_list
			
 
				         self.image_writer = image_writer
			
 
				-        self.img_parent_path = img_parent_path
			
 
				-        self.pdf_mid_data = None # 未压缩
			
 
				+        self.pdf_mid_data = None  # 未压缩
			
 
				         self.is_debug = is_debug
			
 
				     
			
 
				     def get_compress_pdf_mid_data(self):
			
--- a/magic_pdf/pipe/OCRPipe.py
+++ b/magic_pdf/pipe/OCRPipe.py
@@ -6,8 +6,8 @@ from magic_pdf.user_api import parse_ocr_pdf
 
				 
			
 
				 class OCRPipe(AbsPipe):
			
 
				 
			
 
				-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path: str, is_debug:bool=False):
			
 
				-        super().__init__(pdf_bytes, model_list, image_writer, img_parent_path, is_debug)
			
 
				+    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool=False):
			
 
				+        super().__init__(pdf_bytes, model_list, image_writer, is_debug)
			
 
				 
			
 
				     def pipe_classify(self):
			
 
				         pass
			
@@ -15,10 +15,10 @@ class OCRPipe(AbsPipe):
 
				     def pipe_parse(self):
			
 
				         self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
			
 
				 
			
 
				-    def pipe_mk_uni_format(self):
			
 
				-        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), self.img_parent_path)
			
 
				+    def pipe_mk_uni_format(self, img_parent_path: str):
			
 
				+        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path)
			
 
				         return content_list
			
 
				 
			
 
				-    def pipe_mk_markdown(self):
			
 
				-        md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), self.img_parent_path)
			
 
				+    def pipe_mk_markdown(self, img_parent_path: str):
			
 
				+        md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path)
			
 
				         return md_content
			
--- a/magic_pdf/pipe/TXTPipe.py
+++ b/magic_pdf/pipe/TXTPipe.py
@@ -6,8 +6,8 @@ from magic_pdf.user_api import parse_txt_pdf
 
				 
			
 
				 class TXTPipe(AbsPipe):
			
 
				 
			
 
				-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path: str, is_debug:bool=False):
			
 
				-        super().__init__(pdf_bytes, model_list, image_writer, img_parent_path, is_debug)
			
 
				+    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool=False):
			
 
				+        super().__init__(pdf_bytes, model_list, image_writer, is_debug)
			
 
				 
			
 
				     def pipe_classify(self):
			
 
				         pass
			
@@ -15,10 +15,10 @@ class TXTPipe(AbsPipe):
 
				     def pipe_parse(self):
			
 
				         self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
			
 
				 
			
 
				-    def pipe_mk_uni_format(self):
			
 
				-        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), self.img_parent_path)
			
 
				+    def pipe_mk_uni_format(self, img_parent_path: str):
			
 
				+        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path)
			
 
				         return content_list
			
 
				 
			
 
				-    def pipe_mk_markdown(self):
			
 
				-        md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), self.img_parent_path)
			
 
				+    def pipe_mk_markdown(self, img_parent_path: str):
			
 
				+        md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path)
			
 
				         return md_content
			
--- a/magic_pdf/pipe/UNIPipe.py
+++ b/magic_pdf/pipe/UNIPipe.py
@@ -10,10 +10,8 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
 
				 
			
 
				 class UNIPipe(AbsPipe):
			
 
				 
			
 
				-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path: str,
			
 
				-                 is_debug: bool = False):
			
 
				-        self.pdf_type = self.PIP_OCR
			
 
				-        super().__init__(pdf_bytes, model_list, image_writer, img_parent_path, is_debug)
			
 
				+    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False):
			
 
				+        super().__init__(pdf_bytes, model_list, image_writer, is_debug)
			
 
				 
			
 
				     def pipe_classify(self):
			
 
				         self.pdf_type = UNIPipe.classify(self.pdf_bytes)
			
@@ -26,12 +24,12 @@ class UNIPipe(AbsPipe):
 
				             self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer,
			
 
				                                               is_debug=self.is_debug)
			
 
				 
			
 
				-    def pipe_mk_uni_format(self):
			
 
				-        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), self.img_parent_path)
			
 
				+    def pipe_mk_uni_format(self, img_parent_path: str):
			
 
				+        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path)
			
 
				         return content_list
			
 
				 
			
 
				-    def pipe_mk_markdown(self):
			
 
				-        markdown_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), self.img_parent_path)
			
 
				+    def pipe_mk_markdown(self, img_parent_path: str):
			
 
				+        markdown_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path)
			
 
				         return markdown_content