
feat: add function definitions

icecraft · 11 months ago
Commit 4a82d6a07a

+ 36 - 0
magic_pdf/data/dataset.py

@@ -32,10 +32,28 @@ class PageableData(ABC):
 
     @abstractmethod
     def draw_rect(self, rect_coords, color, fill, fill_opacity, width, overlay):
+        """draw rectangle.
+
+        Args:
+            rect_coords (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
+            color (list[float] | None): three element tuple which descript the RGB of the board line, None means no board line
+            fill (list[float] | None): fill the board with RGB, None means will not fill with color
+            fill_opacity (float): opacity of the fill, range from [0, 1]
+            width (float): the width of board
+            overlay (bool): fill the color in foreground or background. True means fill in background.
+        """
         pass
 
     @abstractmethod
     def insert_text(self, coord, content, fontsize, color):
+        """insert text.
+
+        Args:
+            coord (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
+            content (str): the text content
+            fontsize (int): font size of the text
+            color (list[float] | None):  three element tuple which descript the RGB of the board line, None will use the default font color!
+        """
         pass
 
 
@@ -244,6 +262,16 @@ class Doc(PageableData):
             return getattr(self._doc, name)
 
     def draw_rect(self, rect_coords, color, fill, fill_opacity, width, overlay):
+        """draw rectangle.
+
+        Args:
+            rect_coords (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
+            color (list[float] | None): three element tuple which descript the RGB of the board line, None means no board line
+            fill (list[float] | None): fill the board with RGB, None means will not fill with color
+            fill_opacity (float): opacity of the fill, range from [0, 1]
+            width (float): the width of board
+            overlay (bool): fill the color in foreground or background. True means fill in background.
+        """
         self._doc.draw_rect(
             rect_coords,
             color=color,
@@ -254,4 +282,12 @@ class Doc(PageableData):
         )
 
     def insert_text(self, coord, content, fontsize, color):
+        """insert text.
+
+        Args:
+            coord (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
+            content (str): the text content
+            fontsize (int): font size of the text
+            color (list[float] | None):  three element tuple which descript the RGB of the board line, None will use the default font color!
+        """
         self._doc.insert_text(coord, content, fontsize=fontsize, color=color)
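
For reference, a minimal usage sketch of the documented drawing helpers. It assumes PymuDocDataset accepts raw PDF bytes and that get_page(i) returns a PageableData (Doc) instance; the file name and coordinates are illustrative only.

# Sketch only: exercise the draw_rect / insert_text contract documented above.
from magic_pdf.data.dataset import PymuDocDataset

with open('demo.pdf', 'rb') as f:          # illustrative input file
    ds = PymuDocDataset(f.read())

page = ds.get_page(0)                       # assumed accessor returning a Doc

# Red border, no fill, 1.5 pt wide, drawn in the foreground (overlay=False).
page.draw_rect([50, 50, 200, 120], color=[1, 0, 0], fill=None,
               fill_opacity=1.0, width=1.5, overlay=False)

# No border, translucent yellow fill, drawn in the background (overlay=True).
page.draw_rect([50, 150, 200, 220], color=None, fill=[1, 1, 0],
               fill_opacity=0.3, width=0.5, overlay=True)

# Caption in the default font color.
page.insert_text([50, 45, 200, 50], 'figure 1', fontsize=9, color=None)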

+ 1 - 1
magic_pdf/model/doc_analyze_by_custom_model.py

@@ -13,7 +13,7 @@ from magic_pdf.libs.config_reader import (get_device, get_formula_config,
                                           get_local_models_dir,
                                           get_table_recog_config)
 from magic_pdf.model.model_list import MODEL
-from magic_pdf.model.types import InferenceResult
+from magic_pdf.model.operators import InferenceResult
 
 
 def dict_compare(d1, d2):
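
Downstream code that imported InferenceResult from the old module needs the same one-line change; a sketch of the migration:

# InferenceResult now lives in magic_pdf.model.operators
# (magic_pdf.model.types has been removed in this commit).
# Old:
# from magic_pdf.model.types import InferenceResult
# New:
from magic_pdf.model.operators import InferenceResult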

+ 177 - 0
magic_pdf/model/operators.py

@@ -0,0 +1,177 @@
+import copy
+import json
+import os
+from typing import Callable
+
+from magic_pdf.config.enums import SupportedPdfParseMethod
+from magic_pdf.data.data_reader_writer import DataWriter
+from magic_pdf.data.dataset import Dataset
+from magic_pdf.filter import classify
+from magic_pdf.libs.draw_bbox import draw_model_bbox
+from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
+from magic_pdf.pipe.operators import PipeResult
+
+
+class InferenceResult:
+    def __init__(self, inference_results: list, dataset: Dataset):
+        """Initialized method.
+
+        Args:
+            inference_results (list): the inference result generated by model
+            dataset (Dataset): the dataset related with model inference result
+        """
+        self._infer_res = inference_results
+        self._dataset = dataset
+
+    def draw_model(self, file_path: str) -> None:
+        """Draw model inference result.
+
+        Args:
+            file_path (str): the output file path
+        """
+        dir_name = os.path.dirname(file_path)
+        base_name = os.path.basename(file_path)
+        if not os.path.exists(dir_name):
+            os.makedirs(dir_name, exist_ok=True)
+        draw_model_bbox(
+            copy.deepcopy(self._infer_res), self._dataset, dir_name, base_name
+        )
+
+    def dump_model(self, writer: DataWriter, file_path: str):
+        """Dump model inference result to file.
+
+        Args:
+            writer (DataWriter): writer handle
+            file_path (str): the location of target file
+        """
+        writer.write_string(
+            file_path, json.dumps(self._infer_res, ensure_ascii=False, indent=4)
+        )
+
+    def get_infer_res(self):
+        """Get the inference result.
+
+        Returns:
+            list[dict]: the inference result generated by the model
+        """
+        return self._infer_res
+
+    def apply(self, proc: Callable, *args, **kwargs):
+        """Apply callable method which.
+
+        Args:
+            proc (Callable): invoke proc as follows:
+                proc(inference_result, *args, **kwargs)
+
+        Returns:
+            Any: return the result generated by proc
+        """
+        return proc(copy.deepcopy(self._infer_res), *args, **kwargs)
+
+    def pipe_auto_mode(
+        self,
+        imageWriter: DataWriter,
+        start_page_id=0,
+        end_page_id=None,
+        debug_mode=False,
+        lang=None,
+    ) -> PipeResult:
+        """Post-proc the model inference result.
+            step1: classify the dataset type
+            step2: based the result of step1, using `pipe_txt_mode` or `pipe_ocr_mode`
+
+        Args:
+            imageWriter (DataWriter): the image writer handle
+            start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process
+            end_page_id (_type_, optional):  Defaults to the last page index of dataset. Let user select some pages He/She want to process
+            debug_mode (bool, optional): Defaults to False. will dump more log if enabled
+            lang (_type_, optional): Defaults to None.
+
+        Returns:
+            PipeResult: the result
+        """
+
+        pdf_proc_method = classify(self._dataset.data_bits())
+
+        if pdf_proc_method == SupportedPdfParseMethod.TXT:
+            return self.pipe_txt_mode(
+                imageWriter, start_page_id, end_page_id, debug_mode, lang
+            )
+        else:
+            return self.pipe_ocr_mode(
+                imageWriter, start_page_id, end_page_id, debug_mode, lang
+            )
+
+    def pipe_txt_mode(
+        self,
+        imageWriter: DataWriter,
+        start_page_id=0,
+        end_page_id=None,
+        debug_mode=False,
+        lang=None,
+    ) -> PipeResult:
+        """Post-proc the model inference result, Extract the text using the
+        third library, such as `pymupdf`
+
+        Args:
+            imageWriter (DataWriter): the image writer handle
+            start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process
+            end_page_id (_type_, optional):  Defaults to the last page index of dataset. Let user select some pages He/She want to process
+            debug_mode (bool, optional): Defaults to False. will dump more log if enabled
+            lang (_type_, optional): Defaults to None.
+
+        Returns:
+            PipeResult: the result
+        """
+
+        def proc(*args, **kwargs) -> PipeResult:
+            res = pdf_parse_union(*args, **kwargs)
+            return PipeResult(res, self._dataset)
+
+        return self.apply(
+            proc,
+            self._dataset,
+            imageWriter,
+            SupportedPdfParseMethod.TXT,
+            start_page_id=start_page_id,
+            end_page_id=end_page_id,
+            debug_mode=debug_mode,
+            lang=lang,
+        )
+
+    def pipe_ocr_mode(
+        self,
+        imageWriter: DataWriter,
+        start_page_id=0,
+        end_page_id=None,
+        debug_mode=False,
+        lang=None,
+    ) -> PipeResult:
+        """Post-proc the model inference result, Extract the text using `OCR`
+        technical.
+
+        Args:
+            imageWriter (DataWriter): the image writer handle
+            start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process
+            end_page_id (_type_, optional):  Defaults to the last page index of dataset. Let user select some pages He/She want to process
+            debug_mode (bool, optional): Defaults to False. will dump more log if enabled
+            lang (_type_, optional): Defaults to None.
+
+        Returns:
+            PipeResult: the result
+        """
+
+        def proc(*args, **kwargs) -> PipeResult:
+            res = pdf_parse_union(*args, **kwargs)
+            return PipeResult(res, self._dataset)
+
+        return self.apply(
+            proc,
+            self._dataset,
+            imageWriter,
+            SupportedPdfParseMethod.OCR,
+            start_page_id=start_page_id,
+            end_page_id=end_page_id,
+            debug_mode=debug_mode,
+            lang=lang,
+        )
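
A hedged sketch of how the new InferenceResult is meant to be consumed. doc_analyze is assumed to return an InferenceResult (it imports the class, see the diff above), FileBasedDataWriter is assumed to take a local output directory, and the file names are illustrative.

# Sketch only: wiring InferenceResult into a dataset and writers.
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze

with open('demo.pdf', 'rb') as f:
    ds = PymuDocDataset(f.read())

# doc_analyze is assumed to wrap the raw model output in an InferenceResult.
infer_result = doc_analyze(ds, ocr=False)

out_writer = FileBasedDataWriter('./output')          # assumed local-dir writer
infer_result.dump_model(out_writer, 'model.json')     # raw inference result as JSON
infer_result.draw_model('./output/model_bbox.pdf')    # visualize detected boxes

# classify() picks TXT vs OCR and dispatches to pipe_txt_mode / pipe_ocr_mode.
image_writer = FileBasedDataWriter('./output/images')
pipe_result = infer_result.pipe_auto_mode(image_writer)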

+ 0 - 122
magic_pdf/model/types.py

@@ -1,122 +0,0 @@
-import copy
-import json
-import os
-from typing import Callable
-
-from magic_pdf.config.enums import SupportedPdfParseMethod
-from magic_pdf.data.data_reader_writer import DataWriter
-from magic_pdf.data.dataset import Dataset
-from magic_pdf.filter import classify
-from magic_pdf.libs.draw_bbox import draw_model_bbox
-from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
-from magic_pdf.pipe.types import PipeResult
-
-
-class InferenceResult:
-    def __init__(self, inference_results: list, dataset: Dataset):
-        self._infer_res = inference_results
-        self._dataset = dataset
-
-    def draw_model(self, file_path: str) -> None:
-        dir_name = os.path.dirname(file_path)
-        base_name = os.path.basename(file_path)
-        if not os.path.exists(dir_name):
-            os.makedirs(dir_name, exist_ok=True)
-        draw_model_bbox(
-            copy.deepcopy(self._infer_res), self._dataset, dir_name, base_name
-        )
-
-    def dump_model(self, writer: DataWriter, file_path: str):
-        writer.write_string(
-            file_path, json.dumps(self._infer_res, ensure_ascii=False, indent=4)
-        )
-
-    def get_infer_res(self):
-        return self._infer_res
-
-    def apply(self, proc: Callable, *args, **kwargs):
-        return proc(copy.deepcopy(self._infer_res), *args, **kwargs)
-
-    def pipe_auto_mode(
-        self,
-        imageWriter: DataWriter,
-        start_page_id=0,
-        end_page_id=None,
-        debug_mode=False,
-        lang=None,
-    ) -> PipeResult:
-        def proc(*args, **kwargs) -> PipeResult:
-            res = pdf_parse_union(*args, **kwargs)
-            return PipeResult(res, self._dataset)
-
-        pdf_proc_method = classify(self._dataset.data_bits())
-
-        if pdf_proc_method == SupportedPdfParseMethod.TXT:
-            return self.apply(
-                proc,
-                self._dataset,
-                imageWriter,
-                SupportedPdfParseMethod.TXT,
-                start_page_id=0,
-                end_page_id=None,
-                debug_mode=False,
-                lang=None,
-            )
-        else:
-            return self.apply(
-                proc,
-                self._dataset,
-                imageWriter,
-                SupportedPdfParseMethod.OCR,
-                start_page_id=0,
-                end_page_id=None,
-                debug_mode=False,
-                lang=None,
-            )
-
-    def pipe_txt_mode(
-        self,
-        imageWriter: DataWriter,
-        start_page_id=0,
-        end_page_id=None,
-        debug_mode=False,
-        lang=None,
-    ) -> PipeResult:
-        def proc(*args, **kwargs) -> PipeResult:
-            res = pdf_parse_union(*args, **kwargs)
-            return PipeResult(res, self._dataset)
-
-        return self.apply(
-            proc,
-            self._dataset,
-            imageWriter,
-            SupportedPdfParseMethod.TXT,
-            start_page_id=0,
-            end_page_id=None,
-            debug_mode=False,
-            lang=None,
-        )
-
-    def pipe_ocr_mode(
-        self,
-        imageWriter: DataWriter,
-        start_page_id=0,
-        end_page_id=None,
-        debug_mode=False,
-        lang=None,
-    ) -> PipeResult:
-
-        def proc(*args, **kwargs) -> PipeResult:
-            res = pdf_parse_union(*args, **kwargs)
-            return PipeResult(res, self._dataset)
-
-        return self.apply(
-            proc,
-            self._dataset,
-            imageWriter,
-            SupportedPdfParseMethod.TXT,
-            start_page_id=0,
-            end_page_id=None,
-            debug_mode=False,
-            lang=None,
-        )

+ 13 - 12
magic_pdf/pdf_parse_union_core_v2.py

@@ -4,8 +4,8 @@ import statistics
 import time
 from typing import List
 
-import torch
 import fitz
+import torch
 from loguru import logger
 
 from magic_pdf.config.enums import SupportedPdfParseMethod
@@ -16,17 +16,13 @@ from magic_pdf.libs.clean_memory import clean_memory
 from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir
 from magic_pdf.libs.convert_utils import dict_to_list
 from magic_pdf.libs.hash_utils import compute_md5
-
 from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image
 from magic_pdf.model.magic_model import MagicModel
 
-os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # 禁止albumentations检查更新
-os.environ['YOLO_VERBOSE'] = 'False'  # disable yolo logger
-
 try:
     import torchtext
 
-    if torchtext.__version__ >= "0.18.0":
+    if torchtext.__version__ >= '0.18.0':
         torchtext.disable_torchtext_deprecation_warning()
 except ImportError:
     pass
@@ -39,6 +35,9 @@ from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layo
 from magic_pdf.pre_proc.ocr_dict_merge import fill_spans_in_blocks, fix_block_spans_v2, fix_discarded_block
 from magic_pdf.pre_proc.ocr_span_list_modify import get_qa_need_list_v2, remove_overlaps_low_confidence_spans, remove_overlaps_min_spans
 
+os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # 禁止albumentations检查更新
+os.environ['YOLO_VERBOSE'] = 'False'  # disable yolo logger
+
 
 def __replace_STX_ETX(text_str: str):
     """Replace \u0002 and \u0003, as these characters become garbled when extracted using pymupdf. In fact, they were originally quotation marks.
@@ -90,7 +89,10 @@ def chars_to_content(span):
 
 
 LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';', ']', '】', '}', '}', '>', '》', '、', ',', ',', '-', '—', '–',)
 LINE_START_FLAG = ('(', '(', '"', '“', '【', '{', '《', '<', '「', '『', '【', '[',)
 
 
 def fill_char_in_spans(spans, all_chars):
@@ -233,7 +235,7 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
         # 初始化ocr模型
         atom_model_manager = AtomModelSingleton()
         ocr_model = atom_model_manager.get_atom_model(
-            atom_model_name="ocr",
+            atom_model_name='ocr',
             ocr_show_log=False,
             det_db_box_thresh=0.3,
             lang=lang
@@ -241,7 +243,7 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
 
         for span in empty_spans:
             # 对span的bbox截图再ocr
-            span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode="cv2")
+            span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode='cv2')
             ocr_res = ocr_model.ocr(span_img, det=False)
             if ocr_res and len(ocr_res) > 0:
                 if len(ocr_res[0]) > 0:
@@ -681,7 +683,7 @@ def parse_page_core(
     """根据parse_mode,构造spans,主要是文本类的字符填充"""
     if parse_mode == SupportedPdfParseMethod.TXT:
 
-        """使用新版本的混合ocr方案"""
+        """使用新版本的混合ocr方案."""
         spans = txt_spans_extract_v2(page_doc, spans, all_bboxes, all_discarded_blocks, lang)
 
     elif parse_mode == SupportedPdfParseMethod.OCR:
@@ -689,7 +691,6 @@ def parse_page_core(
     else:
         raise Exception('parse_mode must be txt or ocr')
 
-
     """先处理不需要排版的discarded_blocks"""
     discarded_block_with_spans, spans = fill_spans_in_blocks(
         all_discarded_blocks, spans, 0.4
@@ -762,8 +763,8 @@ def parse_page_core(
 
 
 def pdf_parse_union(
-    dataset: Dataset,
     model_list,
+    dataset: Dataset,
     imageWriter,
     parse_mode,
     start_page_id=0,
@@ -832,4 +833,4 @@ def pdf_parse_union(
 
 
 if __name__ == '__main__':
-    pass
+    pass
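
Note the reordered positional parameters: pdf_parse_union now takes model_list before dataset, matching how InferenceResult.apply forwards the deep-copied inference result as the first argument. A hedged wrapper sketch (the helper name is hypothetical):

# Hypothetical helper illustrating only the new argument order.
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.data_reader_writer import DataWriter
from magic_pdf.data.dataset import Dataset
from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union


def parse_txt_pages(model_list: list, dataset: Dataset, image_writer: DataWriter):
    # model_list (the inference result) now comes first, dataset second.
    return pdf_parse_union(
        model_list,
        dataset,
        image_writer,
        SupportedPdfParseMethod.TXT,
        start_page_id=0,
        end_page_id=None,
        debug_mode=False,
        lang=None,
    )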

+ 124 - 0
magic_pdf/pipe/operators.py

@@ -0,0 +1,124 @@
+import json
+import os
+
+from magic_pdf.config.make_content_config import DropMode, MakeMode
+from magic_pdf.data.data_reader_writer import DataWriter
+from magic_pdf.data.dataset import Dataset
+from magic_pdf.dict2md.ocr_mkcontent import union_make
+from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_line_sort_bbox,
+                                      draw_span_bbox)
+from magic_pdf.libs.json_compressor import JsonCompressor
+
+
+class PipeResult:
+    def __init__(self, pipe_res, dataset: Dataset):
+        """Initialized.
+
+        Args:
+            pipe_res (list[dict]): the pipeline processed result of model inference result
+            dataset (Dataset): the dataset associated with pipe_res
+        """
+        self._pipe_res = pipe_res
+        self._dataset = dataset
+
+    def dump_md(
+        self,
+        writer: DataWriter,
+        file_path: str,
+        img_dir_or_bucket_prefix: str,
+        drop_mode=DropMode.WHOLE_PDF,
+        md_make_mode=MakeMode.MM_MD,
+    ):
+        """Dump The Markdown.
+
+        Args:
+            writer (DataWriter): File writer handle
+            file_path (str): The file location of markdown
+            img_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
+            drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.WHOLE_PDF.
+            md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.MM_MD.
+        """
+        pdf_info_list = self._pipe_res['pdf_info']
+        md_content = union_make(
+            pdf_info_list, md_make_mode, drop_mode, img_dir_or_bucket_prefix
+        )
+        writer.write_string(file_path, md_content)
+
+    def dump_content_list(
+        self, writer: DataWriter, file_path: str, image_dir_or_bucket_prefix: str
+    ):
+        """Dump Content List.
+
+        Args:
+            writer (DataWriter): File writer handle
+            file_path (str): The file location of content list
+            image_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
+        """
+        pdf_info_list = self._pipe_res['pdf_info']
+        content_list = union_make(
+            pdf_info_list,
+            MakeMode.STANDARD_FORMAT,
+            DropMode.NONE,
+            image_dir_or_bucket_prefix,
+        )
+        writer.write_string(
+            file_path, json.dumps(content_list, ensure_ascii=False, indent=4)
+        )
+
+    def dump_middle_json(self, writer: DataWriter, file_path: str):
+        """Dump the result of pipeline.
+
+        Args:
+            writer (DataWriter): File writer handler
+            file_path (str): The file location of middle json
+        """
+        writer.write_string(
+            file_path, json.dumps(self._pipe_res, ensure_ascii=False, indent=4)
+        )
+
+    def draw_layout(self, file_path: str) -> None:
+        """Draw the layout.
+
+        Args:
+            file_path (str): the file location of the layout result file
+        """
+        dir_name = os.path.dirname(file_path)
+        base_name = os.path.basename(file_path)
+        if not os.path.exists(dir_name):
+            os.makedirs(dir_name, exist_ok=True)
+        pdf_info = self._pipe_res['pdf_info']
+        draw_layout_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
+
+    def draw_span(self, file_path: str):
+        """Draw the Span.
+
+        Args:
+            file_path (str): the file location of the span result file
+        """
+        dir_name = os.path.dirname(file_path)
+        base_name = os.path.basename(file_path)
+        if not os.path.exists(dir_name):
+            os.makedirs(dir_name, exist_ok=True)
+        pdf_info = self._pipe_res['pdf_info']
+        draw_span_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
+
+    def draw_line_sort(self, file_path: str):
+        """Draw line sort.
+
+        Args:
+            file_path (str): the file location of the line sort result file
+        """
+        dir_name = os.path.dirname(file_path)
+        base_name = os.path.basename(file_path)
+        if not os.path.exists(dir_name):
+            os.makedirs(dir_name, exist_ok=True)
+        pdf_info = self._pipe_res['pdf_info']
+        draw_line_sort_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
+
+    def get_compress_pdf_mid_data(self):
+        """Compress the pipeline result.
+
+        Returns:
+            str: the compressed pipeline result
+        """
+        return JsonCompressor.compress_json(self._pipe_res)
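
A hedged sketch of consuming a PipeResult once the pipeline has run; the helper name, writer setup, and paths are illustrative.

# Hypothetical helper: write out every artifact PipeResult can produce.
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.pipe.operators import PipeResult


def dump_all(pipe_result: PipeResult, out_dir: str = './output') -> None:
    writer = FileBasedDataWriter(out_dir)              # assumed local-dir writer
    pipe_result.dump_md(writer, 'demo.md', 'images')   # markdown with figure links
    pipe_result.dump_content_list(writer, 'demo_content_list.json', 'images')
    pipe_result.dump_middle_json(writer, 'demo_middle.json')
    pipe_result.draw_layout(f'{out_dir}/demo_layout.pdf')        # layout bboxes
    pipe_result.draw_span(f'{out_dir}/demo_span.pdf')            # span bboxes
    pipe_result.draw_line_sort(f'{out_dir}/demo_line_sort.pdf')  # reading order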

+ 0 - 62
magic_pdf/pipe/types.py

@@ -1,62 +0,0 @@
-
-import json
-import os
-
-from magic_pdf.config.make_content_config import DropMode, MakeMode
-from magic_pdf.data.data_reader_writer import DataWriter
-from magic_pdf.data.dataset import Dataset
-from magic_pdf.dict2md.ocr_mkcontent import union_make
-from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_line_sort_bbox,
-                                      draw_span_bbox)
-from magic_pdf.libs.json_compressor import JsonCompressor
-
-
-class PipeResult:
-    def __init__(self, pipe_res, dataset: Dataset):
-        self._pipe_res = pipe_res
-        self._dataset = dataset
-
-    def dump_md(self, writer: DataWriter, file_path: str, img_dir_or_bucket_prefix: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
-        pdf_info_list = self._pipe_res['pdf_info']
-        md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_dir_or_bucket_prefix)
-        writer.write_string(file_path, md_content)
-
-    def dump_content_list(self, writer: DataWriter, file_path: str, image_dir_or_bucket_prefix: str, drop_mode=DropMode.NONE):
-        pdf_info_list = self._pipe_res['pdf_info']
-        content_list = union_make(pdf_info_list, MakeMode.STANDARD_FORMAT, drop_mode, image_dir_or_bucket_prefix)
-        writer.write_string(file_path, json.dumps(content_list, ensure_ascii=False, indent=4))
-
-    def dump_middle_json(self, writer: DataWriter, file_path: str):
-        writer.write_string(file_path, json.dumps(self._pipe_res, ensure_ascii=False, indent=4))
-
-    def draw_layout(self, file_path: str) -> None:
-        dir_name = os.path.dirname(file_path)
-        base_name = os.path.basename(file_path)
-        if not os.path.exists(dir_name):
-            os.makedirs(dir_name, exist_ok=True)
-        pdf_info = self._pipe_res['pdf_info']
-        draw_layout_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
-
-    def draw_span(self, file_path: str):
-        dir_name = os.path.dirname(file_path)
-        base_name = os.path.basename(file_path)
-        if not os.path.exists(dir_name):
-            os.makedirs(dir_name, exist_ok=True)
-        pdf_info = self._pipe_res['pdf_info']
-        draw_span_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
-
-    def draw_line_sort(self, file_path: str):
-        dir_name = os.path.dirname(file_path)
-        base_name = os.path.basename(file_path)
-        if not os.path.exists(dir_name):
-            os.makedirs(dir_name, exist_ok=True)
-        pdf_info = self._pipe_res['pdf_info']
-        draw_line_sort_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
-
-    def draw_content_list(self, writer: DataWriter, file_path: str, img_dir_or_bucket_prefix: str, drop_mode=DropMode.WHOLE_PDF):
-        pdf_info_list = self._pipe_res['pdf_info']
-        content_list = union_make(pdf_info_list, MakeMode.STANDARD_FORMAT, drop_mode, img_dir_or_bucket_prefix)
-        writer.write_string(file_path, json.dumps(content_list, ensure_ascii=False, indent=4))
-
-    def get_compress_pdf_mid_data(self):
-        return JsonCompressor.compress_json(self.pdf_mid_data)

+ 2 - 3
magic_pdf/tools/common.py

@@ -10,7 +10,7 @@ from magic_pdf.config.make_content_config import DropMode, MakeMode
 from magic_pdf.data.data_reader_writer import FileBasedDataWriter
 from magic_pdf.data.dataset import PymuDocDataset
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-from magic_pdf.model.types import InferenceResult
+from magic_pdf.model.operators import InferenceResult
 
 # from io import BytesIO
 # from pypdf import PdfReader, PdfWriter
@@ -223,8 +223,7 @@ def do_parse(
         pipe_result.dump_content_list(
             md_writer,
             f'{pdf_file_name}_content_list.json',
-            image_dir,
-            drop_mode=DropMode.NONE,
+            image_dir
         )
 
     logger.info(f'local output dir is {local_md_dir}')
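
Putting the pieces together, an end-to-end sketch mirroring the updated do_parse flow. The key change in this file is that drop_mode is no longer passed to dump_content_list, since the new PipeResult.dump_content_list applies DropMode.NONE internally; doc_analyze's keyword arguments and the paths below are assumptions.

# End-to-end sketch (assumptions noted inline; adjust to your installed version).
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze

with open('demo.pdf', 'rb') as f:
    ds = PymuDocDataset(f.read())

image_writer = FileBasedDataWriter('./output/images')
md_writer = FileBasedDataWriter('./output')

infer_result = doc_analyze(ds, ocr=False)                # assumed to return InferenceResult
pipe_result = infer_result.pipe_auto_mode(image_writer)  # txt vs ocr chosen by classify()

pipe_result.dump_md(md_writer, 'demo.md', 'images')
# drop_mode is no longer passed here; DropMode.NONE is applied internally.
pipe_result.dump_content_list(md_writer, 'demo_content_list.json', 'images')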