@@ -1,5 +1,5 @@
-Api Usage
+Api Usage
 ===========

@@ -16,6 +16,7 @@ Local File Example
     from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
     from magic_pdf.data.dataset import PymuDocDataset
     from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+    from magic_pdf.config.enums import SupportedPdfParseMethod

     # args
     pdf_file_name = "abc.pdf" # replace with the real pdf path
@@ -40,15 +41,22 @@ Local File Example
     ## Create Dataset Instance
     ds = PymuDocDataset(pdf_bytes)

-    ## inference
-    infer_result = ds.apply(doc_analyze, ocr=True)
+    ## inference
+    if ds.classify() == SupportedPdfParseMethod.OCR:
+        infer_result = ds.apply(doc_analyze, ocr=True)
+
+        ## pipeline
+        pipe_result = infer_result.pipe_ocr_mode(image_writer)
+
+    else:
+        infer_result = ds.apply(doc_analyze, ocr=False)
+
+        ## pipeline
+        pipe_result = infer_result.pipe_txt_mode(image_writer)

     ### draw model result on each page
     infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))

-    ## pipeline
-    pipe_result = infer_result.pipe_ocr_mode(image_writer)
-
     ### draw layout result on each page
     pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))

@@ -58,6 +66,9 @@ Local File Example
     ### dump markdown
     pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)

+    ### dump content list
+    pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
+

 S3 File Example
 ^^^^^^^^^^^^^^^^
@@ -96,30 +107,39 @@ S3 File Example
     ## Create Dataset Instance
     ds = PymuDocDataset(pdf_bytes)

-    ## inference
-    infer_result = ds.apply(doc_analyze, ocr=True)
+    ## inference
+    if ds.classify() == SupportedPdfParseMethod.OCR:
+        infer_result = ds.apply(doc_analyze, ocr=True)

-    ### draw model result on each page
-    infer_result.draw_model(os.path.join(local_dir, f'{name_without_suff}_model.pdf')) # dump to local
+        ## pipeline
+        pipe_result = infer_result.pipe_ocr_mode(image_writer)
+
+    else:
+        infer_result = ds.apply(doc_analyze, ocr=False)

-    ## pipeline
-    pipe_result = infer_result.pipe_ocr_mode(image_writer)
+        ## pipeline
+        pipe_result = infer_result.pipe_txt_mode(image_writer)
+
+    ### draw model result on each page
+    infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))

     ### draw layout result on each page
-    pipe_result.draw_layout(os.path.join(local_dir, f'{name_without_suff}_layout.pdf')) # dump to local
+    pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))

     ### draw spans result on each page
-    pipe_result.draw_span(os.path.join(local_dir, f'{name_without_suff}_spans.pdf')) # dump to local
+    pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))

     ### dump markdown
-    pipe_result.dump_md(writer, f'{name_without_suff}.md', "unittest/tmp/images") # dump to remote s3
+    pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)

+    ### dump content list
+    pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
+

-MS-Office
+MS-Office
 ----------

-.. code:: python
+.. code:: python

     import os
@@ -144,7 +164,7 @@ MS-Office
     input_file_name = input_file.split(".")[0]
     ds = read_local_office(input_file)[0]

-    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+    ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
         md_writer, f"{input_file_name}.md", image_dir
     )

@@ -154,7 +174,7 @@ This code snippet can be used to manipulate **ppt**, **pptx**, **doc**, **docx**
 Image
 ---------

-Single Image File
+Single Image File
 ^^^^^^^^^^^^^^^^^^^

 .. code:: python
@@ -182,12 +202,12 @@ Single Image File
     input_file_name = input_file.split(".")[0]
     ds = read_local_images(input_file)[0]

-    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+    ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
         md_writer, f"{input_file_name}.md", image_dir
     )


-Directory That Contains Images
+Directory That Contains Images
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 .. code:: python
@@ -217,7 +237,7 @@ Directory That Contains Images

     count = 0
     for ds in dss:
-        ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+        ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
             md_writer, f"{count}.md", image_dir
         )
         count += 1