Przeglądaj źródła

修改pipe模块

kernel.h@qq.com 1 rok temu
rodzic
commit
698c4a83d2

+ 3 - 3
magic_pdf/cli/magicpdf.py

@@ -62,13 +62,13 @@ def _do_parse(pdf_bytes, model_list, parse_method, image_writer, md_writer, imag
     if parse_method == "ocr":
         jso_useful_key["_pdf_type"] = "ocr"
 
-    pdf_mid_data = uni_pipe.pipe_parse()
-    md_content = UNIPipe.mk_markdown(pdf_mid_data, image_dir)
+    uni_pipe.pipe_parse()
+    md_content = uni_pipe.pipe_mk_markdown()
     part_file_name = datetime.now().strftime("%H-%M-%S")
     md_writer.write(content=md_content, path=f"{part_file_name}.md", mode=MODE_TXT)
     md_writer.write(
         content=json_parse.dumps(
-            JsonCompressor.decompress_json(pdf_mid_data), ensure_ascii=False, indent=4
+            uni_pipe.pdf_mid_data, ensure_ascii=False, indent=4
         ),
         path=f"{part_file_name}.json",
         mode=MODE_TXT,

+ 2 - 0
magic_pdf/para/para_split.py

@@ -589,6 +589,8 @@ def __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang):
     3. 参照上述行尾特征进行分段。
     4. 图、表,目前独占一行,不考虑分段。
     """
+    if page_num==343:
+        pass
     lines_group = __group_line_by_layout(blocks, layout_bboxes, lang) # block内分段
     layout_paras, layout_list_info = __split_para_in_layoutbox(lines_group, new_layout_bbox, lang) # layout内分段
     layout_paras2, page_list_info = __connect_list_inter_layout(layout_paras, new_layout_bbox, layout_list_info, page_num, lang) # layout之间连接列表段落

+ 8 - 1
magic_pdf/pipe/AbsPipe.py

@@ -13,11 +13,18 @@ class AbsPipe(ABC):
     """
     txt和ocr处理的抽象类
     """
+    PIP_OCR = "ocr"
+    PIP_TXT = "txt"
 
-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter):
+    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path:str, ):
         self.pdf_bytes = pdf_bytes
         self.model_list = model_list
         self.image_writer = image_writer
+        self.img_parent_path = img_parent_path
+        self.pdf_mid_data = None # 未压缩
+    
+    def get_compress_pdf_mid_data(self):
+        return JsonCompressor.compress_json(self.pdf_mid_data)
 
     @abstractmethod
     def pipe_classify(self):

+ 4 - 10
magic_pdf/pipe/OCRPipe.py

@@ -6,25 +6,19 @@ from magic_pdf.user_api import parse_ocr_pdf
 
 class OCRPipe(AbsPipe):
 
-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_bucket_path: str):
-        self.compressed_pdf_mid_data = None
-        self.pdf_mid_data = None
-        self.pdf_bytes = pdf_bytes
-        self.model_list = model_list
-        self.image_writer = image_writer
-        self.img_bucket_path = img_bucket_path
+    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path: str):
+        super().__init__(pdf_bytes, model_list, image_writer, img_parent_path)
 
     def pipe_classify(self):
         pass
 
     def pipe_parse(self):
         self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer)
-        self.compressed_pdf_mid_data = JsonCompressor.compress_json(self.pdf_mid_data)
 
     def pipe_mk_uni_format(self):
-        content_list = AbsPipe.mk_uni_format(self.compressed_pdf_mid_data, self.img_bucket_path)
+        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), self.img_parent_path)
         return content_list
 
     def pipe_mk_markdown(self):
-        md_content = AbsPipe.mk_markdown(self.compressed_pdf_mid_data, self.img_bucket_path)
+        md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), self.img_parent_path)
         return md_content

+ 4 - 10
magic_pdf/pipe/TXTPipe.py

@@ -6,25 +6,19 @@ from magic_pdf.user_api import parse_txt_pdf
 
 class TXTPipe(AbsPipe):
 
-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_bucket_path: str):
-        self.compressed_pdf_mid_data = None
-        self.pdf_mid_data = None
-        self.pdf_bytes = pdf_bytes
-        self.model_list = model_list
-        self.image_writer = image_writer
-        self.img_bucket_path = img_bucket_path
+    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path: str):
+        super().__init__(pdf_bytes, model_list, image_writer, img_parent_path)
 
     def pipe_classify(self):
         pass
 
     def pipe_parse(self):
         self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer)
-        self.compressed_pdf_mid_data = JsonCompressor.compress_json(self.pdf_mid_data)
 
     def pipe_mk_uni_format(self):
-        content_list = AbsPipe.mk_uni_format(self.compressed_pdf_mid_data, self.img_bucket_path)
+        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), self.img_parent_path)
         return content_list
 
     def pipe_mk_markdown(self):
-        md_content = AbsPipe.mk_markdown(self.compressed_pdf_mid_data, self.img_bucket_path)
+        md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), self.img_parent_path)
         return md_content

+ 7 - 13
magic_pdf/pipe/UNIPipe.py

@@ -15,31 +15,25 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
 
 class UNIPipe(AbsPipe):
 
-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_bucket_path: str):
-        self.pdf_type = "ocr"
-        self.compressed_pdf_mid_data = None
-        self.pdf_mid_data = None
-        self.pdf_bytes = pdf_bytes
-        self.model_list = model_list
-        self.image_writer = image_writer
-        self.img_bucket_path = img_bucket_path
+    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, img_parent_path: str):
+        self.pdf_type = self.PIP_OCR
+        super().__init__(pdf_bytes, model_list, image_writer, img_parent_path)
 
     def pipe_classify(self):
         self.pdf_type = UNIPipe.classify(self.pdf_bytes)
 
     def pipe_parse(self):
-        if self.pdf_type == "txt":
+        if self.pdf_type == self.PIP_TXT:
             self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer)
-        elif self.pdf_type == "ocr":
+        elif self.pdf_type == self.PIP_OCR:
             self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer)
-        self.compressed_pdf_mid_data = JsonCompressor.compress_json(self.pdf_mid_data)
 
     def pipe_mk_uni_format(self):
-        content_list = AbsPipe.mk_uni_format(self.compressed_pdf_mid_data, self.img_bucket_path)
+        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), self.img_parent_path)
         return content_list
 
     def pipe_mk_markdown(self):
-        markdown_content = AbsPipe.mk_markdown(self.compressed_pdf_mid_data, self.img_bucket_path)
+        markdown_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), self.img_parent_path)
         return markdown_content
 
 if __name__ == '__main__':

+ 0 - 2
magic_pdf/rw/DiskReaderWriter.py

@@ -41,12 +41,10 @@ class DiskReaderWriter(AbsReaderWriter):
         if mode == MODE_TXT:
             with open(abspath, "w", encoding=self.encoding) as f:
                 f.write(content)
-                logger.info(f"内容已成功写入 {abspath}")
 
         elif mode == MODE_BIN:
             with open(abspath, "wb") as f:
                 f.write(content)
-                logger.info(f"内容已成功写入 {abspath}")
         else:
             raise ValueError("Invalid mode. Use 'text' or 'binary'.")