
feat: add function definitions

icecraft · 11 months ago
Commit 4a82d6a07a

+ 36 - 0
magic_pdf/data/dataset.py

@@ -32,10 +32,28 @@ class PageableData(ABC):
 
     @abstractmethod
     def draw_rect(self, rect_coords, color, fill, fill_opacity, width, overlay):
+        """draw rectangle.
+
+        Args:
+            rect_coords (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
+            color (list[float] | None): three element tuple which descript the RGB of the board line, None means no board line
+            fill (list[float] | None): fill the board with RGB, None means will not fill with color
+            fill_opacity (float): opacity of the fill, range from [0, 1]
+            width (float): the width of board
+            overlay (bool): fill the color in foreground or background. True means fill in background.
+        """
         pass
 
     @abstractmethod
     def insert_text(self, coord, content, fontsize, color):
+        """insert text.
+
+        Args:
+            coord (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
+            content (str): the text content
+            fontsize (int): font size of the text
+            color (list[float] | None):  three element tuple which descript the RGB of the board line, None will use the default font color!
+        """
         pass
 
 
@@ -244,6 +262,16 @@ class Doc(PageableData):
             return getattr(self._doc, name)
 
     def draw_rect(self, rect_coords, color, fill, fill_opacity, width, overlay):
+        """draw rectangle.
+
+        Args:
+            rect_coords (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
+            color (list[float] | None): three element tuple which descript the RGB of the board line, None means no board line
+            fill (list[float] | None): fill the board with RGB, None means will not fill with color
+            fill_opacity (float): opacity of the fill, range from [0, 1]
+            width (float): the width of board
+            overlay (bool): fill the color in foreground or background. True means fill in background.
+        """
         self._doc.draw_rect(
             rect_coords,
             color=color,
@@ -254,4 +282,12 @@ class Doc(PageableData):
         )
 
     def insert_text(self, coord, content, fontsize, color):
+        """insert text.
+
+        Args:
+            coord (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
+            content (str): the text content
+            fontsize (int): font size of the text
+            color (list[float] | None):  three element tuple which descript the RGB of the board line, None will use the default font color!
+        """
         self._doc.insert_text(coord, content, fontsize=fontsize, color=color)
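
For reference, a minimal usage sketch of the documented drawing helpers. It assumes PymuDocDataset accepts raw PDF bytes and that get_page(i) returns a PageableData (Doc) instance; the file name and coordinates are illustrative only.

# Sketch only: exercise the draw_rect / insert_text contract documented above.
from magic_pdf.data.dataset import PymuDocDataset

with open('demo.pdf', 'rb') as f:          # illustrative input file
    ds = PymuDocDataset(f.read())

page = ds.get_page(0)                       # assumed accessor returning a Doc

# Red border, no fill, 1.5 pt wide, drawn in the foreground (overlay=False).
page.draw_rect([50, 50, 200, 120], color=[1, 0, 0], fill=None,
               fill_opacity=1.0, width=1.5, overlay=False)

# No border, translucent yellow fill, drawn in the background (overlay=True).
page.draw_rect([50, 150, 200, 220], color=None, fill=[1, 1, 0],
               fill_opacity=0.3, width=0.5, overlay=True)

# Caption in the default font color.
page.insert_text([50, 45, 200, 50], 'figure 1', fontsize=9, color=None)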

+ 1 - 1
magic_pdf/model/doc_analyze_by_custom_model.py

@@ -13,7 +13,7 @@ from magic_pdf.libs.config_reader import (get_device, get_formula_config,
                                           get_local_models_dir,
                                           get_table_recog_config)
 from magic_pdf.model.model_list import MODEL
-from magic_pdf.model.types import InferenceResult
+from magic_pdf.model.operators import InferenceResult
 
 
 def dict_compare(d1, d2):
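
Downstream code that imported InferenceResult from the old module needs the same one-line change; a sketch of the migration:

# InferenceResult now lives in magic_pdf.model.operators
# (magic_pdf.model.types has been removed in this commit).
# Old:
# from magic_pdf.model.types import InferenceResult
# New:
from magic_pdf.model.operators import InferenceResult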

+ 177 - 0
magic_pdf/model/operators.py

@@ -0,0 +1,177 @@
+import copy
+import json
+import os
+from typing import Callable
+
+from magic_pdf.config.enums import SupportedPdfParseMethod
+from magic_pdf.data.data_reader_writer import DataWriter
+from magic_pdf.data.dataset import Dataset
+from magic_pdf.filter import classify
+from magic_pdf.libs.draw_bbox import draw_model_bbox
+from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
+from magic_pdf.pipe.operators import PipeResult
+
+
+class InferenceResult:
+    def __init__(self, inference_results: list, dataset: Dataset):
+        """Initialized method.
+
+        Args:
+            inference_results (list): the inference result generated by model
+            dataset (Dataset): the dataset related with model inference result
+        """
+        self._infer_res = inference_results
+        self._dataset = dataset
+
+    def draw_model(self, file_path: str) -> None:
+        """Draw model inference result.
+
+        Args:
+            file_path (str): the output file path
+        """
+        dir_name = os.path.dirname(file_path)
+        base_name = os.path.basename(file_path)
+        if not os.path.exists(dir_name):
+            os.makedirs(dir_name, exist_ok=True)
+        draw_model_bbox(
+            copy.deepcopy(self._infer_res), self._dataset, dir_name, base_name
+        )
+
+    def dump_model(self, writer: DataWriter, file_path: str):
+        """Dump model inference result to file.
+
+        Args:
+            writer (DataWriter): writer handle
+            file_path (str): the location of target file
+        """
+        writer.write_string(
+            file_path, json.dumps(self._infer_res, ensure_ascii=False, indent=4)
+        )
+
+    def get_infer_res(self):
+        """Get the inference result.
+
+        Returns:
+            list[dict]: the inference result generated by the model
+        """
+        return self._infer_res
+
+    def apply(self, proc: Callable, *args, **kwargs):
+        """Apply callable method which.
+
+        Args:
+            proc (Callable): invoke proc as follows:
+                proc(inference_result, *args, **kwargs)
+
+        Returns:
+            Any: return the result generated by proc
+        """
+        return proc(copy.deepcopy(self._infer_res), *args, **kwargs)
+
+    def pipe_auto_mode(
+        self,
+        imageWriter: DataWriter,
+        start_page_id=0,
+        end_page_id=None,
+        debug_mode=False,
+        lang=None,
+    ) -> PipeResult:
+        """Post-proc the model inference result.
+            step1: classify the dataset type
+            step2: based the result of step1, using `pipe_txt_mode` or `pipe_ocr_mode`
+
+        Args:
+            imageWriter (DataWriter): the image writer handle
+            start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process
+            end_page_id (_type_, optional):  Defaults to the last page index of dataset. Let user select some pages He/She want to process
+            debug_mode (bool, optional): Defaults to False. will dump more log if enabled
+            lang (_type_, optional): Defaults to None.
+
+        Returns:
+            PipeResult: the result
+        """
+
+        pdf_proc_method = classify(self._dataset.data_bits())
+
+        if pdf_proc_method == SupportedPdfParseMethod.TXT:
+            return self.pipe_txt_mode(
+                imageWriter, start_page_id, end_page_id, debug_mode, lang
+            )
+        else:
+            return self.pipe_ocr_mode(
+                imageWriter, start_page_id, end_page_id, debug_mode, lang
+            )
+
+    def pipe_txt_mode(
+        self,
+        imageWriter: DataWriter,
+        start_page_id=0,
+        end_page_id=None,
+        debug_mode=False,
+        lang=None,
+    ) -> PipeResult:
+        """Post-proc the model inference result, Extract the text using the
+        third library, such as `pymupdf`
+
+        Args:
+            imageWriter (DataWriter): the image writer handle
+            start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process
+            end_page_id (_type_, optional):  Defaults to the last page index of dataset. Let user select some pages He/She want to process
+            debug_mode (bool, optional): Defaults to False. will dump more log if enabled
+            lang (_type_, optional): Defaults to None.
+
+        Returns:
+            PipeResult: the result
+        """
+
+        def proc(*args, **kwargs) -> PipeResult:
+            res = pdf_parse_union(*args, **kwargs)
+            return PipeResult(res, self._dataset)
+
+        return self.apply(
+            proc,
+            self._dataset,
+            imageWriter,
+            SupportedPdfParseMethod.TXT,
+            start_page_id=start_page_id,
+            end_page_id=end_page_id,
+            debug_mode=debug_mode,
+            lang=lang,
+        )
+
+    def pipe_ocr_mode(
+        self,
+        imageWriter: DataWriter,
+        start_page_id=0,
+        end_page_id=None,
+        debug_mode=False,
+        lang=None,
+    ) -> PipeResult:
+        """Post-proc the model inference result, Extract the text using `OCR`
+        technical.
+
+        Args:
+            imageWriter (DataWriter): the image writer handle
+            start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process
+            end_page_id (_type_, optional):  Defaults to the last page index of dataset. Let user select some pages He/She want to process
+            debug_mode (bool, optional): Defaults to False. will dump more log if enabled
+            lang (_type_, optional): Defaults to None.
+
+        Returns:
+            PipeResult: the result
+        """
+
+        def proc(*args, **kwargs) -> PipeResult:
+            res = pdf_parse_union(*args, **kwargs)
+            return PipeResult(res, self._dataset)
+
+        return self.apply(
+            proc,
+            self._dataset,
+            imageWriter,
+            SupportedPdfParseMethod.OCR,
+            start_page_id=start_page_id,
+            end_page_id=end_page_id,
+            debug_mode=debug_mode,
+            lang=lang,
+        )
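
A hedged sketch of how the new InferenceResult is meant to be consumed. doc_analyze is assumed to return an InferenceResult (it imports the class, see the diff above), FileBasedDataWriter is assumed to take a local output directory, and the file names are illustrative.

# Sketch only: wiring InferenceResult into a dataset and writers.
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze

with open('demo.pdf', 'rb') as f:
    ds = PymuDocDataset(f.read())

# doc_analyze is assumed to wrap the raw model output in an InferenceResult.
infer_result = doc_analyze(ds, ocr=False)

out_writer = FileBasedDataWriter('./output')          # assumed local-dir writer
infer_result.dump_model(out_writer, 'model.json')     # raw inference result as JSON
infer_result.draw_model('./output/model_bbox.pdf')    # visualize detected boxes

# classify() picks TXT vs OCR and dispatches to pipe_txt_mode / pipe_ocr_mode.
image_writer = FileBasedDataWriter('./output/images')
pipe_result = infer_result.pipe_auto_mode(image_writer)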

+ 0 - 122
magic_pdf/model/types.py

@@ -1,122 +0,0 @@
-import copy
-import json
-import os
-from typing import Callable
-
-from magic_pdf.config.enums import SupportedPdfParseMethod
-from magic_pdf.data.data_reader_writer import DataWriter
-from magic_pdf.data.dataset import Dataset
-from magic_pdf.filter import classify
-from magic_pdf.libs.draw_bbox import draw_model_bbox
-from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
-from magic_pdf.pipe.types import PipeResult
-
-
-class InferenceResult:
-    def __init__(self, inference_results: list, dataset: Dataset):
-        self._infer_res = inference_results
-        self._dataset = dataset
-
-    def draw_model(self, file_path: str) -> None:
-        dir_name = os.path.dirname(file_path)
-        base_name = os.path.basename(file_path)
-        if not os.path.exists(dir_name):
-            os.makedirs(dir_name, exist_ok=True)
-        draw_model_bbox(
-            copy.deepcopy(self._infer_res), self._dataset, dir_name, base_name
-        )
-
-    def dump_model(self, writer: DataWriter, file_path: str):
-        writer.write_string(
-            file_path, json.dumps(self._infer_res, ensure_ascii=False, indent=4)
-        )
-
-    def get_infer_res(self):
-        return self._infer_res
-
-    def apply(self, proc: Callable, *args, **kwargs):
-        return proc(copy.deepcopy(self._infer_res), *args, **kwargs)
-
-    def pipe_auto_mode(
-        self,
-        imageWriter: DataWriter,
-        start_page_id=0,
-        end_page_id=None,
-        debug_mode=False,
-        lang=None,
-    ) -> PipeResult:
-        def proc(*args, **kwargs) -> PipeResult:
-            res = pdf_parse_union(*args, **kwargs)
-            return PipeResult(res, self._dataset)
-
-        pdf_proc_method = classify(self._dataset.data_bits())
-
-        if pdf_proc_method == SupportedPdfParseMethod.TXT:
-            return self.apply(
-                proc,
-                self._dataset,
-                imageWriter,
-                SupportedPdfParseMethod.TXT,
-                start_page_id=0,
-                end_page_id=None,
-                debug_mode=False,
-                lang=None,
-            )
-        else:
-            return self.apply(
-                proc,
-                self._dataset,
-                imageWriter,
-                SupportedPdfParseMethod.OCR,
-                start_page_id=0,
-                end_page_id=None,
-                debug_mode=False,
-                lang=None,
-            )
-
-    def pipe_txt_mode(
-        self,
-        imageWriter: DataWriter,
-        start_page_id=0,
-        end_page_id=None,
-        debug_mode=False,
-        lang=None,
-    ) -> PipeResult:
-        def proc(*args, **kwargs) -> PipeResult:
-            res = pdf_parse_union(*args, **kwargs)
-            return PipeResult(res, self._dataset)
-
-        return self.apply(
-            proc,
-            self._dataset,
-            imageWriter,
-            SupportedPdfParseMethod.TXT,
-            start_page_id=0,
-            end_page_id=None,
-            debug_mode=False,
-            lang=None,
-        )
-
-    def pipe_ocr_mode(
-        self,
-        imageWriter: DataWriter,
-        start_page_id=0,
-        end_page_id=None,
-        debug_mode=False,
-        lang=None,
-    ) -> PipeResult:
-
-        def proc(*args, **kwargs) -> PipeResult:
-            res = pdf_parse_union(*args, **kwargs)
-            return PipeResult(res, self._dataset)
-
-        return self.apply(
-            proc,
-            self._dataset,
-            imageWriter,
-            SupportedPdfParseMethod.TXT,
-            start_page_id=0,
-            end_page_id=None,
-            debug_mode=False,
-            lang=None,
-        )

+ 13 - 12
magic_pdf/pdf_parse_union_core_v2.py

@@ -4,8 +4,8 @@ import statistics
 import time
 from typing import List
 
-import torch
 import fitz
+import torch
 from loguru import logger
 
 from magic_pdf.config.enums import SupportedPdfParseMethod
@@ -16,17 +16,13 @@ from magic_pdf.libs.clean_memory import clean_memory
 from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir
 from magic_pdf.libs.convert_utils import dict_to_list
 from magic_pdf.libs.hash_utils import compute_md5
-
 from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image
 from magic_pdf.model.magic_model import MagicModel
 
-os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # 禁止albumentations检查更新
-os.environ['YOLO_VERBOSE'] = 'False'  # disable yolo logger
-
 try:
     import torchtext
 
-    if torchtext.__version__ >= "0.18.0":
+    if torchtext.__version__ >= '0.18.0':
         torchtext.disable_torchtext_deprecation_warning()
 except ImportError:
     pass
@@ -39,6 +35,9 @@ from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layo
 from magic_pdf.pre_proc.ocr_dict_merge import fill_spans_in_blocks, fix_block_spans_v2, fix_discarded_block
 from magic_pdf.pre_proc.ocr_span_list_modify import get_qa_need_list_v2, remove_overlaps_low_confidence_spans, remove_overlaps_min_spans
 
+os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # 禁止albumentations检查更新
+os.environ['YOLO_VERBOSE'] = 'False'  # disable yolo logger
+
 
 def __replace_STX_ETX(text_str: str):
     """Replace \u0002 and \u0003, as these characters become garbled when extracted using pymupdf. In fact, they were originally quotation marks.
@@ -90,7 +89,10 @@ def chars_to_content(span):
 
 
 LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';', ']', '】', '}', '}', '>', '》', '、', ',', ',', '-', '—', '–',)
 LINE_START_FLAG = ('(', '(', '"', '“', '【', '{', '《', '<', '「', '『', '【', '[',)
 
 
 def fill_char_in_spans(spans, all_chars):
@@ -233,7 +235,7 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
         # 初始化ocr模型
         atom_model_manager = AtomModelSingleton()
         ocr_model = atom_model_manager.get_atom_model(
-            atom_model_name="ocr",
+            atom_model_name='ocr',
             ocr_show_log=False,
             det_db_box_thresh=0.3,
             lang=lang
@@ -241,7 +243,7 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
 
         for span in empty_spans:
             # 对span的bbox截图再ocr
-            span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode="cv2")
+            span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode='cv2')
             ocr_res = ocr_model.ocr(span_img, det=False)
             if ocr_res and len(ocr_res) > 0:
                 if len(ocr_res[0]) > 0:
@@ -681,7 +683,7 @@ def parse_page_core(
     """根据parse_mode,构造spans,主要是文本类的字符填充"""
     if parse_mode == SupportedPdfParseMethod.TXT:
 
-        """使用新版本的混合ocr方案"""
+        """使用新版本的混合ocr方案."""
         spans = txt_spans_extract_v2(page_doc, spans, all_bboxes, all_discarded_blocks, lang)
 
     elif parse_mode == SupportedPdfParseMethod.OCR:
@@ -689,7 +691,6 @@ def parse_page_core(
     else:
         raise Exception('parse_mode must be txt or ocr')
 
-
     """先处理不需要排版的discarded_blocks"""
     discarded_block_with_spans, spans = fill_spans_in_blocks(
         all_discarded_blocks, spans, 0.4
@@ -762,8 +763,8 @@ def parse_page_core(
 
 
 def pdf_parse_union(
-    dataset: Dataset,
     model_list,
+    dataset: Dataset,
     imageWriter,
     parse_mode,
     start_page_id=0,
@@ -832,4 +833,4 @@ def pdf_parse_union(
 
 
 if __name__ == '__main__':
-    pass
+    pass
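
Note the reordered positional parameters: pdf_parse_union now takes model_list before dataset, matching how InferenceResult.apply forwards the deep-copied inference result as the first argument. A hedged wrapper sketch (the helper name is hypothetical):

# Hypothetical helper illustrating only the new argument order.
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.data_reader_writer import DataWriter
from magic_pdf.data.dataset import Dataset
from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union


def parse_txt_pages(model_list: list, dataset: Dataset, image_writer: DataWriter):
    # model_list (the inference result) now comes first, dataset second.
    return pdf_parse_union(
        model_list,
        dataset,
        image_writer,
        SupportedPdfParseMethod.TXT,
        start_page_id=0,
        end_page_id=None,
        debug_mode=False,
        lang=None,
    )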

+ 124 - 0
magic_pdf/pipe/operators.py

@@ -0,0 +1,124 @@
+import json
+import os
+
+from magic_pdf.config.make_content_config import DropMode, MakeMode
+from magic_pdf.data.data_reader_writer import DataWriter
+from magic_pdf.data.dataset import Dataset
+from magic_pdf.dict2md.ocr_mkcontent import union_make
+from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_line_sort_bbox,
+                                      draw_span_bbox)
+from magic_pdf.libs.json_compressor import JsonCompressor
+
+
+class PipeResult:
+    def __init__(self, pipe_res, dataset: Dataset):
+        """Initialized.
+
+        Args:
+            pipe_res (list[dict]): the pipeline processed result of model inference result
+            dataset (Dataset): the dataset associated with pipe_res
+        """
+        self._pipe_res = pipe_res
+        self._dataset = dataset
+
+    def dump_md(
+        self,
+        writer: DataWriter,
+        file_path: str,
+        img_dir_or_bucket_prefix: str,
+        drop_mode=DropMode.WHOLE_PDF,
+        md_make_mode=MakeMode.MM_MD,
+    ):
+        """Dump The Markdown.
+
+        Args:
+            writer (DataWriter): File writer handle
+            file_path (str): The file location of markdown
+            img_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
+            drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.WHOLE_PDF.
+            md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.MM_MD.
+        """
+        pdf_info_list = self._pipe_res['pdf_info']
+        md_content = union_make(
+            pdf_info_list, md_make_mode, drop_mode, img_dir_or_bucket_prefix
+        )
+        writer.write_string(file_path, md_content)
+
+    def dump_content_list(
+        self, writer: DataWriter, file_path: str, image_dir_or_bucket_prefix: str
+    ):
+        """Dump Content List.
+
+        Args:
+            writer (DataWriter): File writer handle
+            file_path (str): The file location of content list
+            image_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
+        """
+        pdf_info_list = self._pipe_res['pdf_info']
+        content_list = union_make(
+            pdf_info_list,
+            MakeMode.STANDARD_FORMAT,
+            DropMode.NONE,
+            image_dir_or_bucket_prefix,
+        )
+        writer.write_string(
+            file_path, json.dumps(content_list, ensure_ascii=False, indent=4)
+        )
+
+    def dump_middle_json(self, writer: DataWriter, file_path: str):
+        """Dump the result of pipeline.
+
+        Args:
+            writer (DataWriter): File writer handler
+            file_path (str): The file location of middle json
+        """
+        writer.write_string(
+            file_path, json.dumps(self._pipe_res, ensure_ascii=False, indent=4)
+        )
+
+    def draw_layout(self, file_path: str) -> None:
+        """Draw the layout.
+
+        Args:
+            file_path (str): the file location of the layout result file
+        """
+        dir_name = os.path.dirname(file_path)
+        base_name = os.path.basename(file_path)
+        if not os.path.exists(dir_name):
+            os.makedirs(dir_name, exist_ok=True)
+        pdf_info = self._pipe_res['pdf_info']
+        draw_layout_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
+
+    def draw_span(self, file_path: str):
+        """Draw the Span.
+
+        Args:
+            file_path (str): the file location of the span result file
+        """
+        dir_name = os.path.dirname(file_path)
+        base_name = os.path.basename(file_path)
+        if not os.path.exists(dir_name):
+            os.makedirs(dir_name, exist_ok=True)
+        pdf_info = self._pipe_res['pdf_info']
+        draw_span_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
+
+    def draw_line_sort(self, file_path: str):
+        """Draw line sort.
+
+        Args:
+            file_path (str): the file location of the line sort result file
+        """
+        dir_name = os.path.dirname(file_path)
+        base_name = os.path.basename(file_path)
+        if not os.path.exists(dir_name):
+            os.makedirs(dir_name, exist_ok=True)
+        pdf_info = self._pipe_res['pdf_info']
+        draw_line_sort_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
+
+    def get_compress_pdf_mid_data(self):
+        """Compress the pipeline result.
+
+        Returns:
+            str: the compressed pipeline result
+        """
+        return JsonCompressor.compress_json(self._pipe_res)
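
A hedged sketch of consuming a PipeResult once the pipeline has run; the helper name, writer setup, and paths are illustrative.

# Hypothetical helper: write out every artifact PipeResult can produce.
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.pipe.operators import PipeResult


def dump_all(pipe_result: PipeResult, out_dir: str = './output') -> None:
    writer = FileBasedDataWriter(out_dir)              # assumed local-dir writer
    pipe_result.dump_md(writer, 'demo.md', 'images')   # markdown with figure links
    pipe_result.dump_content_list(writer, 'demo_content_list.json', 'images')
    pipe_result.dump_middle_json(writer, 'demo_middle.json')
    pipe_result.draw_layout(f'{out_dir}/demo_layout.pdf')        # layout bboxes
    pipe_result.draw_span(f'{out_dir}/demo_span.pdf')            # span bboxes
    pipe_result.draw_line_sort(f'{out_dir}/demo_line_sort.pdf')  # reading order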

+ 0 - 62
magic_pdf/pipe/types.py

@@ -1,62 +0,0 @@
-
-import json
-import os
-
-from magic_pdf.config.make_content_config import DropMode, MakeMode
-from magic_pdf.data.data_reader_writer import DataWriter
-from magic_pdf.data.dataset import Dataset
-from magic_pdf.dict2md.ocr_mkcontent import union_make
-from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_line_sort_bbox,
-                                      draw_span_bbox)
-from magic_pdf.libs.json_compressor import JsonCompressor
-
-
-class PipeResult:
-    def __init__(self, pipe_res, dataset: Dataset):
-        self._pipe_res = pipe_res
-        self._dataset = dataset
-
-    def dump_md(self, writer: DataWriter, file_path: str, img_dir_or_bucket_prefix: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
-        pdf_info_list = self._pipe_res['pdf_info']
-        md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_dir_or_bucket_prefix)
-        writer.write_string(file_path, md_content)
-
-    def dump_content_list(self, writer: DataWriter, file_path: str, image_dir_or_bucket_prefix: str, drop_mode=DropMode.NONE):
-        pdf_info_list = self._pipe_res['pdf_info']
-        content_list = union_make(pdf_info_list, MakeMode.STANDARD_FORMAT, drop_mode, image_dir_or_bucket_prefix)
-        writer.write_string(file_path, json.dumps(content_list, ensure_ascii=False, indent=4))
-
-    def dump_middle_json(self, writer: DataWriter, file_path: str):
-        writer.write_string(file_path, json.dumps(self._pipe_res, ensure_ascii=False, indent=4))
-
-    def draw_layout(self, file_path: str) -> None:
-        dir_name = os.path.dirname(file_path)
-        base_name = os.path.basename(file_path)
-        if not os.path.exists(dir_name):
-            os.makedirs(dir_name, exist_ok=True)
-        pdf_info = self._pipe_res['pdf_info']
-        draw_layout_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
-
-    def draw_span(self, file_path: str):
-        dir_name = os.path.dirname(file_path)
-        base_name = os.path.basename(file_path)
-        if not os.path.exists(dir_name):
-            os.makedirs(dir_name, exist_ok=True)
-        pdf_info = self._pipe_res['pdf_info']
-        draw_span_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
-
-    def draw_line_sort(self, file_path: str):
-        dir_name = os.path.dirname(file_path)
-        base_name = os.path.basename(file_path)
-        if not os.path.exists(dir_name):
-            os.makedirs(dir_name, exist_ok=True)
-        pdf_info = self._pipe_res['pdf_info']
-        draw_line_sort_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
-
-    def draw_content_list(self, writer: DataWriter, file_path: str, img_dir_or_bucket_prefix: str, drop_mode=DropMode.WHOLE_PDF):
-        pdf_info_list = self._pipe_res['pdf_info']
-        content_list = union_make(pdf_info_list, MakeMode.STANDARD_FORMAT, drop_mode, img_dir_or_bucket_prefix)
-        writer.write_string(file_path, json.dumps(content_list, ensure_ascii=False, indent=4))
-
-    def get_compress_pdf_mid_data(self):
-        return JsonCompressor.compress_json(self.pdf_mid_data)

+ 2 - 3
magic_pdf/tools/common.py

@@ -10,7 +10,7 @@ from magic_pdf.config.make_content_config import DropMode, MakeMode
 from magic_pdf.data.data_reader_writer import FileBasedDataWriter
 from magic_pdf.data.dataset import PymuDocDataset
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-from magic_pdf.model.types import InferenceResult
+from magic_pdf.model.operators import InferenceResult
 
 # from io import BytesIO
 # from pypdf import PdfReader, PdfWriter
@@ -223,8 +223,7 @@ def do_parse(
         pipe_result.dump_content_list(
             md_writer,
             f'{pdf_file_name}_content_list.json',
-            image_dir,
-            drop_mode=DropMode.NONE,
+            image_dir
         )
 
     logger.info(f'local output dir is {local_md_dir}')
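
Putting the pieces together, an end-to-end sketch mirroring the updated do_parse flow. The key change in this file is that drop_mode is no longer passed to dump_content_list, since the new PipeResult.dump_content_list applies DropMode.NONE internally; doc_analyze's keyword arguments and the paths below are assumptions.

# End-to-end sketch (assumptions noted inline; adjust to your installed version).
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze

with open('demo.pdf', 'rb') as f:
    ds = PymuDocDataset(f.read())

image_writer = FileBasedDataWriter('./output/images')
md_writer = FileBasedDataWriter('./output')

infer_result = doc_analyze(ds, ocr=False)                # assumed to return InferenceResult
pipe_result = infer_result.pipe_auto_mode(image_writer)  # txt vs ocr chosen by classify()

pipe_result.dump_md(md_writer, 'demo.md', 'images')
# drop_mode is no longer passed here; DropMode.NONE is applied internally.
pipe_result.dump_content_list(md_writer, 'demo_content_list.json', 'images')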