
feat: add function definitions

icecraft 11 months ago
parent
commit
4a82d6a07a

+ 36 - 0
magic_pdf/data/dataset.py

@@ -32,10 +32,28 @@ class PageableData(ABC):

     @abstractmethod
     def draw_rect(self, rect_coords, color, fill, fill_opacity, width, overlay):
+        """Draw a rectangle.
+
+        Args:
+            rect_coords (list[float]): four-element array with the top-left and bottom-right coordinates, [x0, y0, x1, y1]
+            color (list[float] | None): three-element tuple describing the RGB of the border line; None means no border line
+            fill (list[float] | None): RGB color to fill the rectangle with; None means no fill
+            fill_opacity (float): opacity of the fill, in the range [0, 1]
+            width (float): width of the border line
+            overlay (bool): whether to draw in the foreground or the background; True means draw in the background
+        """
         pass

     @abstractmethod
     def insert_text(self, coord, content, fontsize, color):
+        """Insert text.
+
+        Args:
+            coord (list[float]): four-element array with the top-left and bottom-right coordinates, [x0, y0, x1, y1]
+            content (str): the text content
+            fontsize (int): font size of the text
+            color (list[float] | None): three-element tuple describing the RGB of the text; None means the default font color is used
+        """
         pass

@@ -244,6 +262,16 @@ class Doc(PageableData):
             return getattr(self._doc, name)

     def draw_rect(self, rect_coords, color, fill, fill_opacity, width, overlay):
+        """Draw a rectangle.
+
+        Args:
+            rect_coords (list[float]): four-element array with the top-left and bottom-right coordinates, [x0, y0, x1, y1]
+            color (list[float] | None): three-element tuple describing the RGB of the border line; None means no border line
+            fill (list[float] | None): RGB color to fill the rectangle with; None means no fill
+            fill_opacity (float): opacity of the fill, in the range [0, 1]
+            width (float): width of the border line
+            overlay (bool): whether to draw in the foreground or the background; True means draw in the background
+        """
         self._doc.draw_rect(
             rect_coords,
             color=color,
@@ -254,4 +282,12 @@ class Doc(PageableData):
         )

     def insert_text(self, coord, content, fontsize, color):
+        """Insert text.
+
+        Args:
+            coord (list[float]): four-element array with the top-left and bottom-right coordinates, [x0, y0, x1, y1]
+            content (str): the text content
+            fontsize (int): font size of the text
+            color (list[float] | None): three-element tuple describing the RGB of the text; None means the default font color is used
+        """
         self._doc.insert_text(coord, content, fontsize=fontsize, color=color)
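
For reference, a minimal usage sketch of these two page-level primitives; the page accessor, coordinates, and colors are illustrative assumptions (with `ds` assumed to be a `PymuDocDataset` and `get_page(0)` assumed to return a `Doc`-backed `PageableData`):

# Illustrative values only; `ds` and `get_page` are assumptions, not shown in this diff.
page = ds.get_page(0)

# Outline a region with a red border and a translucent yellow fill.
page.draw_rect([72.0, 72.0, 300.0, 144.0], color=[1, 0, 0], fill=[1, 1, 0],
               fill_opacity=0.3, width=1.5, overlay=True)

# Label the region at 10 pt in black.
page.insert_text([72.0, 60.0, 300.0, 72.0], 'figure 1', fontsize=10, color=[0, 0, 0])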

+ 1 - 1
magic_pdf/model/doc_analyze_by_custom_model.py

@@ -13,7 +13,7 @@ from magic_pdf.libs.config_reader import (get_device, get_formula_config,
                                           get_local_models_dir,
                                           get_table_recog_config)
 from magic_pdf.model.model_list import MODEL
-from magic_pdf.model.types import InferenceResult
+from magic_pdf.model.operators import InferenceResult


 def dict_compare(d1, d2):

+ 177 - 0
magic_pdf/model/operators.py

@@ -0,0 +1,177 @@
+import copy
+import json
+import os
+from typing import Callable
+
+from magic_pdf.config.enums import SupportedPdfParseMethod
+from magic_pdf.data.data_reader_writer import DataWriter
+from magic_pdf.data.dataset import Dataset
+from magic_pdf.filter import classify
+from magic_pdf.libs.draw_bbox import draw_model_bbox
+from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
+from magic_pdf.pipe.operators import PipeResult
+
+
+class InferenceResult:
+    def __init__(self, inference_results: list, dataset: Dataset):
+        """Initialization method.
+
+        Args:
+            inference_results (list): the inference result generated by the model
+            dataset (Dataset): the dataset associated with the model inference result
+        """
+        self._infer_res = inference_results
+        self._dataset = dataset
+
+    def draw_model(self, file_path: str) -> None:
+        """Draw model inference result.
+
+        Args:
+            file_path (str): the output file path
+        """
+        dir_name = os.path.dirname(file_path)
+        base_name = os.path.basename(file_path)
+        if not os.path.exists(dir_name):
+            os.makedirs(dir_name, exist_ok=True)
+        draw_model_bbox(
+            copy.deepcopy(self._infer_res), self._dataset, dir_name, base_name
+        )
+
+    def dump_model(self, writer: DataWriter, file_path: str):
+        """Dump model inference result to file.
+
+        Args:
+            writer (DataWriter): writer handle
+            file_path (str): the location of target file
+        """
+        writer.write_string(
+            file_path, json.dumps(self._infer_res, ensure_ascii=False, indent=4)
+        )
+
+    def get_infer_res(self):
+        """Get the inference result.
+
+        Returns:
+            list[dict]: the inference result generated by model
+        """
+        return self._infer_res
+
+    def apply(self, proc: Callable, *args, **kwargs):
+        """Apply a callable to the inference result.
+
+        Args:
+            proc (Callable): invoked as
+                proc(inference_result, *args, **kwargs)
+
+        Returns:
+            Any: the result returned by proc
+        """
+        return proc(copy.deepcopy(self._infer_res), *args, **kwargs)
+
+    def pipe_auto_mode(
+        self,
+        imageWriter: DataWriter,
+        start_page_id=0,
+        end_page_id=None,
+        debug_mode=False,
+        lang=None,
+    ) -> PipeResult:
+        """Post-process the model inference result.
+
+        Step 1: classify the dataset type.
+        Step 2: based on the result of step 1, run `pipe_txt_mode` or `pipe_ocr_mode`.
+
+        Args:
+            imageWriter (DataWriter): the image writer handle
+            start_page_id (int, optional): the first page to process. Defaults to 0.
+            end_page_id (int, optional): the last page to process. Defaults to the last page index of the dataset.
+            debug_mode (bool, optional): dump more logs if enabled. Defaults to False.
+            lang (str, optional): language hint passed to the OCR model. Defaults to None.
+
+        Returns:
+            PipeResult: the pipeline result
+        """
+
+        pdf_proc_method = classify(self._dataset.data_bits())
+
+        if pdf_proc_method == SupportedPdfParseMethod.TXT:
+            return self.pipe_txt_mode(
+                imageWriter, start_page_id, end_page_id, debug_mode, lang
+            )
+        else:
+            return self.pipe_ocr_mode(
+                imageWriter, start_page_id, end_page_id, debug_mode, lang
+            )
+
+    def pipe_txt_mode(
+        self,
+        imageWriter: DataWriter,
+        start_page_id=0,
+        end_page_id=None,
+        debug_mode=False,
+        lang=None,
+    ) -> PipeResult:
+        """Post-process the model inference result and extract the text using a
+        third-party library such as `pymupdf`.
+
+        Args:
+            imageWriter (DataWriter): the image writer handle
+            start_page_id (int, optional): the first page to process. Defaults to 0.
+            end_page_id (int, optional): the last page to process. Defaults to the last page index of the dataset.
+            debug_mode (bool, optional): dump more logs if enabled. Defaults to False.
+            lang (str, optional): language hint passed to the OCR model. Defaults to None.
+
+        Returns:
+            PipeResult: the pipeline result
+        """
+
+        def proc(*args, **kwargs) -> PipeResult:
+            res = pdf_parse_union(*args, **kwargs)
+            return PipeResult(res, self._dataset)
+
+        return self.apply(
+            proc,
+            self._dataset,
+            imageWriter,
+            SupportedPdfParseMethod.TXT,
+            start_page_id=start_page_id,
+            end_page_id=end_page_id,
+            debug_mode=debug_mode,
+            lang=lang,
+        )
+
+    def pipe_ocr_mode(
+        self,
+        imageWriter: DataWriter,
+        start_page_id=0,
+        end_page_id=None,
+        debug_mode=False,
+        lang=None,
+    ) -> PipeResult:
+        """Post-process the model inference result and extract the text using
+        OCR.
+
+        Args:
+            imageWriter (DataWriter): the image writer handle
+            start_page_id (int, optional): the first page to process. Defaults to 0.
+            end_page_id (int, optional): the last page to process. Defaults to the last page index of the dataset.
+            debug_mode (bool, optional): dump more logs if enabled. Defaults to False.
+            lang (str, optional): language hint passed to the OCR model. Defaults to None.
+
+        Returns:
+            PipeResult: the pipeline result
+        """
+
+        def proc(*args, **kwargs) -> PipeResult:
+            res = pdf_parse_union(*args, **kwargs)
+            return PipeResult(res, self._dataset)
+
+        return self.apply(
+            proc,
+            self._dataset,
+            imageWriter,
+            SupportedPdfParseMethod.OCR,
+            start_page_id=start_page_id,
+            end_page_id=end_page_id,
+            debug_mode=debug_mode,
+            lang=lang,
+        )
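
For reference, a minimal end-to-end sketch of the new `InferenceResult` API; the input path, output directories, and the source of the model output list are assumptions (the diff does not show whether `doc_analyze` returns the raw list or an `InferenceResult`):

# Illustrative sketch; paths and the origin of `infer_res` are assumptions.
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.operators import InferenceResult

with open('demo.pdf', 'rb') as f:  # assumed local input file
    ds = PymuDocDataset(f.read())

infer_res = []  # placeholder: per-page model output, e.g. produced by doc_analyze
result = InferenceResult(infer_res, ds)

result.dump_model(FileBasedDataWriter('output'), 'model.json')  # raw model output as JSON
result.draw_model('output/model_bbox.pdf')                      # visualize model bboxes

# Auto mode classifies the dataset and dispatches to pipe_txt_mode or pipe_ocr_mode.
pipe_result = result.pipe_auto_mode(FileBasedDataWriter('output/images'))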

+ 0 - 122
magic_pdf/model/types.py

@@ -1,122 +0,0 @@
-import copy
-import json
-import os
-from typing import Callable
-
-from magic_pdf.config.enums import SupportedPdfParseMethod
-from magic_pdf.data.data_reader_writer import DataWriter
-from magic_pdf.data.dataset import Dataset
-from magic_pdf.filter import classify
-from magic_pdf.libs.draw_bbox import draw_model_bbox
-from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
-from magic_pdf.pipe.types import PipeResult
-
-
-class InferenceResult:
-    def __init__(self, inference_results: list, dataset: Dataset):
-        self._infer_res = inference_results
-        self._dataset = dataset
-
-    def draw_model(self, file_path: str) -> None:
-        dir_name = os.path.dirname(file_path)
-        base_name = os.path.basename(file_path)
-        if not os.path.exists(dir_name):
-            os.makedirs(dir_name, exist_ok=True)
-        draw_model_bbox(
-            copy.deepcopy(self._infer_res), self._dataset, dir_name, base_name
-        )
-
-    def dump_model(self, writer: DataWriter, file_path: str):
-        writer.write_string(
-            file_path, json.dumps(self._infer_res, ensure_ascii=False, indent=4)
-        )
-
-    def get_infer_res(self):
-        return self._infer_res
-
-    def apply(self, proc: Callable, *args, **kwargs):
-        return proc(copy.deepcopy(self._infer_res), *args, **kwargs)
-
-    def pipe_auto_mode(
-        self,
-        imageWriter: DataWriter,
-        start_page_id=0,
-        end_page_id=None,
-        debug_mode=False,
-        lang=None,
-    ) -> PipeResult:
-        def proc(*args, **kwargs) -> PipeResult:
-            res = pdf_parse_union(*args, **kwargs)
-            return PipeResult(res, self._dataset)
-
-        pdf_proc_method = classify(self._dataset.data_bits())
-
-        if pdf_proc_method == SupportedPdfParseMethod.TXT:
-            return self.apply(
-                proc,
-                self._dataset,
-                imageWriter,
-                SupportedPdfParseMethod.TXT,
-                start_page_id=0,
-                end_page_id=None,
-                debug_mode=False,
-                lang=None,
-            )
-        else:
-            return self.apply(
-                proc,
-                self._dataset,
-                imageWriter,
-                SupportedPdfParseMethod.OCR,
-                start_page_id=0,
-                end_page_id=None,
-                debug_mode=False,
-                lang=None,
-            )
-
-    def pipe_txt_mode(
-        self,
-        imageWriter: DataWriter,
-        start_page_id=0,
-        end_page_id=None,
-        debug_mode=False,
-        lang=None,
-    ) -> PipeResult:
-        def proc(*args, **kwargs) -> PipeResult:
-            res = pdf_parse_union(*args, **kwargs)
-            return PipeResult(res, self._dataset)
-
-        return self.apply(
-            proc,
-            self._dataset,
-            imageWriter,
-            SupportedPdfParseMethod.TXT,
-            start_page_id=0,
-            end_page_id=None,
-            debug_mode=False,
-            lang=None,
-        )
-
-    def pipe_ocr_mode(
-        self,
-        imageWriter: DataWriter,
-        start_page_id=0,
-        end_page_id=None,
-        debug_mode=False,
-        lang=None,
-    ) -> PipeResult:
-
-        def proc(*args, **kwargs) -> PipeResult:
-            res = pdf_parse_union(*args, **kwargs)
-            return PipeResult(res, self._dataset)
-
-        return self.apply(
-            proc,
-            self._dataset,
-            imageWriter,
-            SupportedPdfParseMethod.TXT,
-            start_page_id=0,
-            end_page_id=None,
-            debug_mode=False,
-            lang=None,
-        )

+ 13 - 12
magic_pdf/pdf_parse_union_core_v2.py

@@ -4,8 +4,8 @@ import statistics
 import time
 from typing import List

-import torch
 import fitz
+import torch
 from loguru import logger

 from magic_pdf.config.enums import SupportedPdfParseMethod
@@ -16,17 +16,13 @@ from magic_pdf.libs.clean_memory import clean_memory
 from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir
 from magic_pdf.libs.convert_utils import dict_to_list
 from magic_pdf.libs.hash_utils import compute_md5
-
 from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image
 from magic_pdf.model.magic_model import MagicModel

-os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # 禁止albumentations检查更新
-os.environ['YOLO_VERBOSE'] = 'False'  # disable yolo logger
-
 try:
     import torchtext

-    if torchtext.__version__ >= "0.18.0":
+    if torchtext.__version__ >= '0.18.0':
         torchtext.disable_torchtext_deprecation_warning()
 except ImportError:
     pass
@@ -39,6 +35,9 @@ from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layo
 from magic_pdf.pre_proc.ocr_dict_merge import fill_spans_in_blocks, fix_block_spans_v2, fix_discarded_block
 from magic_pdf.pre_proc.ocr_span_list_modify import get_qa_need_list_v2, remove_overlaps_low_confidence_spans, remove_overlaps_min_spans

+os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # 禁止albumentations检查更新
+os.environ['YOLO_VERBOSE'] = 'False'  # disable yolo logger
+

 def __replace_STX_ETX(text_str: str):
     """Replace \u0002 and \u0003, as these characters become garbled when extracted using pymupdf. In fact, they were originally quotation marks.
@@ -90,7 +89,7 @@ def chars_to_content(span):


 LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';', ']', '】', '}', '}', '>', '》', '、', ',', ',', '-', '—', '–',)
+<<<<<<< HEAD
 LINE_START_FLAG = ('(', '(', '"', '“', '【', '{', '《', '<', '「', '『', '【', '[',)
+=======
+>>>>>>> 731f4bf (feat: add function definitions)


 def fill_char_in_spans(spans, all_chars):
@@ -233,7 +235,7 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
         # 初始化ocr模型
         atom_model_manager = AtomModelSingleton()
         ocr_model = atom_model_manager.get_atom_model(
-            atom_model_name="ocr",
+            atom_model_name='ocr',
             ocr_show_log=False,
             det_db_box_thresh=0.3,
             lang=lang
@@ -241,7 +243,7 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang

         for span in empty_spans:
             # 对span的bbox截图再ocr
-            span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode="cv2")
+            span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode='cv2')
             ocr_res = ocr_model.ocr(span_img, det=False)
             if ocr_res and len(ocr_res) > 0:
                 if len(ocr_res[0]) > 0:
@@ -681,7 +683,7 @@ def parse_page_core(
     """根据parse_mode,构造spans,主要是文本类的字符填充"""
     if parse_mode == SupportedPdfParseMethod.TXT:

-        """使用新版本的混合ocr方案"""
+        """使用新版本的混合ocr方案."""
         spans = txt_spans_extract_v2(page_doc, spans, all_bboxes, all_discarded_blocks, lang)

     elif parse_mode == SupportedPdfParseMethod.OCR:
@@ -689,7 +691,6 @@ def parse_page_core(
     else:
         raise Exception('parse_mode must be txt or ocr')

-
     """先处理不需要排版的discarded_blocks"""
     discarded_block_with_spans, spans = fill_spans_in_blocks(
         all_discarded_blocks, spans, 0.4
@@ -762,8 +763,8 @@ def parse_page_core(


 def pdf_parse_union(
-    dataset: Dataset,
     model_list,
+    dataset: Dataset,
     imageWriter,
     parse_mode,
     start_page_id=0,
@@ -832,4 +833,4 @@ def pdf_parse_union(


 if __name__ == '__main__':
-    pass
+    pass
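
The swap of the first two parameters lines up with how `InferenceResult.apply` invokes `pdf_parse_union` in `operators.py` above: `apply` prepends the deep-copied inference result to the positional arguments, so the model list now arrives first and the dataset second. A minimal sketch of the resulting call, reusing `ds` and `infer_res` from the earlier sketch; `image_writer` is an assumed `DataWriter`:

# Sketch of the positional order pdf_parse_union now expects; values are assumptions.
result = pdf_parse_union(
    infer_res,                    # model_list: per-page model output (prepended by apply)
    ds,                           # dataset: the PymuDocDataset wrapping the PDF
    image_writer,                 # imageWriter: where cut figures are written
    SupportedPdfParseMethod.TXT,  # parse_mode
    start_page_id=0,
    end_page_id=None,
    debug_mode=False,
    lang=None,
)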

+ 124 - 0
magic_pdf/pipe/operators.py

@@ -0,0 +1,124 @@
+import json
+import os
+
+from magic_pdf.config.make_content_config import DropMode, MakeMode
+from magic_pdf.data.data_reader_writer import DataWriter
+from magic_pdf.data.dataset import Dataset
+from magic_pdf.dict2md.ocr_mkcontent import union_make
+from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_line_sort_bbox,
+                                      draw_span_bbox)
+from magic_pdf.libs.json_compressor import JsonCompressor
+
+
+class PipeResult:
+    def __init__(self, pipe_res, dataset: Dataset):
+        """Initialization method.
+
+        Args:
+            pipe_res (list[dict]): the pipeline-processed result of the model inference
+            dataset (Dataset): the dataset associated with pipe_res
+        """
+        self._pipe_res = pipe_res
+        self._dataset = dataset
+
+    def dump_md(
+        self,
+        writer: DataWriter,
+        file_path: str,
+        img_dir_or_bucket_prefix: str,
+        drop_mode=DropMode.WHOLE_PDF,
+        md_make_mode=MakeMode.MM_MD,
+    ):
+        """Dump the markdown.
+
+        Args:
+            writer (DataWriter): file writer handle
+            file_path (str): the file location of the markdown
+            img_dir_or_bucket_prefix (str): the S3 bucket prefix or local directory used to store the figures
+            drop_mode (str, optional): drop strategy for pages that are corrupted or inappropriate. Defaults to DropMode.WHOLE_PDF.
+            md_make_mode (str, optional): the content type of the markdown to be made. Defaults to MakeMode.MM_MD.
+        """
+        pdf_info_list = self._pipe_res['pdf_info']
+        md_content = union_make(
+            pdf_info_list, md_make_mode, drop_mode, img_dir_or_bucket_prefix
+        )
+        writer.write_string(file_path, md_content)
+
+    def dump_content_list(
+        self, writer: DataWriter, file_path: str, image_dir_or_bucket_prefix: str
+    ):
+        """Dump the content list.
+
+        Args:
+            writer (DataWriter): file writer handle
+            file_path (str): the file location of the content list
+            image_dir_or_bucket_prefix (str): the S3 bucket prefix or local directory used to store the figures
+        """
+        pdf_info_list = self._pipe_res['pdf_info']
+        content_list = union_make(
+            pdf_info_list,
+            MakeMode.STANDARD_FORMAT,
+            DropMode.NONE,
+            image_dir_or_bucket_prefix,
+        )
+        writer.write_string(
+            file_path, json.dumps(content_list, ensure_ascii=False, indent=4)
+        )
+
+    def dump_middle_json(self, writer: DataWriter, file_path: str):
+        """Dump the result of the pipeline.
+
+        Args:
+            writer (DataWriter): file writer handle
+            file_path (str): the file location of the middle JSON
+        """
+        writer.write_string(
+            file_path, json.dumps(self._pipe_res, ensure_ascii=False, indent=4)
+        )
+
+    def draw_layout(self, file_path: str) -> None:
+        """Draw the layout.
+
+        Args:
+            file_path (str): The file location of layout result file
+        """
+        dir_name = os.path.dirname(file_path)
+        base_name = os.path.basename(file_path)
+        if not os.path.exists(dir_name):
+            os.makedirs(dir_name, exist_ok=True)
+        pdf_info = self._pipe_res['pdf_info']
+        draw_layout_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
+
+    def draw_span(self, file_path: str):
+        """Draw the Span.
+
+        Args:
+            file_path (str): The file location of span result file
+        """
+        dir_name = os.path.dirname(file_path)
+        base_name = os.path.basename(file_path)
+        if not os.path.exists(dir_name):
+            os.makedirs(dir_name, exist_ok=True)
+        pdf_info = self._pipe_res['pdf_info']
+        draw_span_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
+
+    def draw_line_sort(self, file_path: str):
+        """Draw line sort.
+
+        Args:
+            file_path (str): The file location of line sort result file
+        """
+        dir_name = os.path.dirname(file_path)
+        base_name = os.path.basename(file_path)
+        if not os.path.exists(dir_name):
+            os.makedirs(dir_name, exist_ok=True)
+        pdf_info = self._pipe_res['pdf_info']
+        draw_line_sort_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
+
+    def get_compress_pdf_mid_data(self):
+        """Compress the pipeline result.
+
+        Returns:
+            str: the compressed pipeline result
+        """
+        return JsonCompressor.compress_json(self._pipe_res)
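
For reference, a minimal sketch of the downstream `PipeResult` calls, assuming `pipe_result` came from one of the `pipe_*_mode` methods above; the writer directory and file names are illustrative:

# Illustrative sketch; directories and file names are assumptions.
from magic_pdf.data.data_reader_writer import FileBasedDataWriter

md_writer = FileBasedDataWriter('output')

pipe_result.draw_layout('output/layout.pdf')         # layout bboxes
pipe_result.draw_span('output/spans.pdf')            # span bboxes
pipe_result.draw_line_sort('output/line_sort.pdf')   # reading-order visualization
pipe_result.dump_md(md_writer, 'demo.md', 'images')  # markdown; figures under images/
pipe_result.dump_content_list(md_writer, 'demo_content_list.json', 'images')
pipe_result.dump_middle_json(md_writer, 'demo_middle.json')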

+ 0 - 62
magic_pdf/pipe/types.py

@@ -1,62 +0,0 @@
-
-import json
-import os
-
-from magic_pdf.config.make_content_config import DropMode, MakeMode
-from magic_pdf.data.data_reader_writer import DataWriter
-from magic_pdf.data.dataset import Dataset
-from magic_pdf.dict2md.ocr_mkcontent import union_make
-from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_line_sort_bbox,
-                                      draw_span_bbox)
-from magic_pdf.libs.json_compressor import JsonCompressor
-
-
-class PipeResult:
-    def __init__(self, pipe_res, dataset: Dataset):
-        self._pipe_res = pipe_res
-        self._dataset = dataset
-
-    def dump_md(self, writer: DataWriter, file_path: str, img_dir_or_bucket_prefix: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
-        pdf_info_list = self._pipe_res['pdf_info']
-        md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_dir_or_bucket_prefix)
-        writer.write_string(file_path, md_content)
-
-    def dump_content_list(self, writer: DataWriter, file_path: str, image_dir_or_bucket_prefix: str, drop_mode=DropMode.NONE):
-        pdf_info_list = self._pipe_res['pdf_info']
-        content_list = union_make(pdf_info_list, MakeMode.STANDARD_FORMAT, drop_mode, image_dir_or_bucket_prefix)
-        writer.write_string(file_path, json.dumps(content_list, ensure_ascii=False, indent=4))
-
-    def dump_middle_json(self, writer: DataWriter, file_path: str):
-        writer.write_string(file_path, json.dumps(self._pipe_res, ensure_ascii=False, indent=4))
-
-    def draw_layout(self, file_path: str) -> None:
-        dir_name = os.path.dirname(file_path)
-        base_name = os.path.basename(file_path)
-        if not os.path.exists(dir_name):
-            os.makedirs(dir_name, exist_ok=True)
-        pdf_info = self._pipe_res['pdf_info']
-        draw_layout_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
-
-    def draw_span(self, file_path: str):
-        dir_name = os.path.dirname(file_path)
-        base_name = os.path.basename(file_path)
-        if not os.path.exists(dir_name):
-            os.makedirs(dir_name, exist_ok=True)
-        pdf_info = self._pipe_res['pdf_info']
-        draw_span_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
-
-    def draw_line_sort(self, file_path: str):
-        dir_name = os.path.dirname(file_path)
-        base_name = os.path.basename(file_path)
-        if not os.path.exists(dir_name):
-            os.makedirs(dir_name, exist_ok=True)
-        pdf_info = self._pipe_res['pdf_info']
-        draw_line_sort_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
-
-    def draw_content_list(self, writer: DataWriter, file_path: str, img_dir_or_bucket_prefix: str, drop_mode=DropMode.WHOLE_PDF):
-        pdf_info_list = self._pipe_res['pdf_info']
-        content_list = union_make(pdf_info_list, MakeMode.STANDARD_FORMAT, drop_mode, img_dir_or_bucket_prefix)
-        writer.write_string(file_path, json.dumps(content_list, ensure_ascii=False, indent=4))
-
-    def get_compress_pdf_mid_data(self):
-        return JsonCompressor.compress_json(self.pdf_mid_data)

+ 2 - 3
magic_pdf/tools/common.py

@@ -10,7 +10,7 @@ from magic_pdf.config.make_content_config import DropMode, MakeMode
 from magic_pdf.data.data_reader_writer import FileBasedDataWriter
 from magic_pdf.data.dataset import PymuDocDataset
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-from magic_pdf.model.types import InferenceResult
+from magic_pdf.model.operators import InferenceResult

 # from io import BytesIO
 # from pypdf import PdfReader, PdfWriter
@@ -223,8 +223,7 @@ def do_parse(
         pipe_result.dump_content_list(
             md_writer,
             f'{pdf_file_name}_content_list.json',
-            image_dir,
-            drop_mode=DropMode.NONE,
+            image_dir
         )

     logger.info(f'local output dir is {local_md_dir}')