소스 검색

Merge pull request #1280 from icecraft/docs/tune_docs

Docs/tune docs
Xiaomeng Zhao 11달 전
부모
커밋
d0a3058ba8

+ 55 - 13
magic_pdf/pipe/operators.py

@@ -1,7 +1,7 @@
+import copy
 import json
 import os
 from typing import Callable
-import copy
 
 from magic_pdf.config.make_content_config import DropMode, MakeMode
 from magic_pdf.data.data_reader_writer import DataWriter
@@ -23,6 +23,26 @@ class PipeResult:
         self._pipe_res = pipe_res
         self._dataset = dataset
 
+    def get_markdown(self,
+                    img_dir_or_bucket_prefix: str,
+                    drop_mode=DropMode.WHOLE_PDF,
+                    md_make_mode=MakeMode.MM_MD) -> str:
+        """Get markdown content.
+
+        Args:
+            img_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
+            drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.WHOLE_PDF.
+            md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.MM_MD.
+
+        Returns:
+            str: return markdown content
+        """
+        pdf_info_list = self._pipe_res['pdf_info']
+        md_content = union_make(
+            pdf_info_list, md_make_mode, drop_mode, img_dir_or_bucket_prefix
+        )
+        return md_content
+
     def dump_md(
         self,
         writer: DataWriter,
@@ -40,14 +60,40 @@ class PipeResult:
             drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.WHOLE_PDF.
             md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.MM_MD.
         """
+
+        md_content = self.get_markdown(img_dir_or_bucket_prefix, drop_mode=drop_mode, md_make_mode=md_make_mode)
+        writer.write_string(file_path, md_content)
+
+    def get_content_list(self,
+                        image_dir_or_bucket_prefix: str,
+                        drop_mode=DropMode.NONE,
+                        md_make_mode=MakeMode.STANDARD_FORMAT) -> str:
+        """Get Content List.
+
+        Args:
+            image_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
+            drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE.
+            md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.STANDARD_FORMAT.
+
+        Returns:
+            str: content list content
+        """
         pdf_info_list = self._pipe_res['pdf_info']
-        md_content = union_make(
-            pdf_info_list, md_make_mode, drop_mode, img_dir_or_bucket_prefix
+        content_list = union_make(
+            pdf_info_list,
+            md_make_mode,
+            drop_mode,
+            image_dir_or_bucket_prefix,
         )
-        writer.write_string(file_path, md_content)
+        return content_list
 
     def dump_content_list(
-        self, writer: DataWriter, file_path: str, image_dir_or_bucket_prefix: str
+        self,
+        writer: DataWriter,
+        file_path: str,
+        image_dir_or_bucket_prefix: str,
+        drop_mode=DropMode.NONE,
+        md_make_mode=MakeMode.STANDARD_FORMAT
     ):
         """Dump Content List.
 
@@ -55,14 +101,10 @@ class PipeResult:
             writer (DataWriter): File writer handle
             file_path (str): The file location of content list
             image_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
+            drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE.
+            md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.STANDARD_FORMAT.
         """
-        pdf_info_list = self._pipe_res['pdf_info']
-        content_list = union_make(
-            pdf_info_list,
-            MakeMode.STANDARD_FORMAT,
-            DropMode.NONE,
-            image_dir_or_bucket_prefix,
-        )
+        content_list = self.get_content_list(image_dir_or_bucket_prefix, drop_mode=drop_mode, md_make_mode=md_make_mode)
         writer.write_string(
             file_path, json.dumps(content_list, ensure_ascii=False, indent=4)
         )
@@ -123,7 +165,7 @@ class PipeResult:
         Returns:
             str: compress the pipeline result and return
         """
-        return JsonCompressor.compress_json(self.pdf_mid_data)
+        return JsonCompressor.compress_json(self._pipe_res)
 
     def apply(self, proc: Callable, *args, **kwargs):
         """Apply callable method which.

파일 크기가 너무 크기 때문에 변경 상태를 표시하지 않습니다.
+ 0 - 16
next_docs/README.md


파일 크기가 너무 크기 때문에 변경 상태를 표시하지 않습니다.
+ 0 - 16
next_docs/README_zh-CN.md


+ 18 - 1
next_docs/en/user_guide/data/read_api.rst

@@ -83,7 +83,24 @@ Read images from path or directory
     datasets = read_local_images("tt.png")  # replace with real file path
 
     # read files from directory that endswith suffix in suffixes array 
-    datasets = read_local_images("images/", suffixes=["png", "jpg"])  # replace with real directory 
+    datasets = read_local_images("images/", suffixes=[".png", ".jpg"])  # replace with real directory 
+
+
+read_local_office
+^^^^^^^^^^^^^^^^^^^^
+Read MS-Office files from path or directory
+
+.. code:: python 
+
+    from magic_pdf.data.read_api import *
+
+    # read from image path 
+    datasets = read_local_office("tt.doc")  # replace with real file path
+
+    # read files from directory that endswith suffix in suffixes array 
+    datasets = read_local_office("docs/")  # replace with real directory 
+
+
 
 
 Check :doc:`../../api/read_api` for more details

+ 8 - 0
next_docs/en/user_guide/install/config.rst

@@ -5,6 +5,14 @@ Config
 
 File **magic-pdf.json** is typically located in the **${HOME}** directory under a Linux system or in the **C:\Users\{username}** directory under a Windows system.
 
+.. admonition:: Tip 
+    :class: tip
+
+    You can override the default location of config file via the following command:
+    
+    export MINERU_TOOLS_CONFIG_JSON=new_magic_pdf.json
+
+
 
 magic-pdf.json
 ----------------

+ 10 - 7
next_docs/en/user_guide/quick_start/convert_doc.rst

@@ -7,7 +7,7 @@ Convert Doc
     :class: tip
 
     When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
-    
+
     For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
 
 
@@ -15,15 +15,15 @@ Convert Doc
 Command Line
 ^^^^^^^^^^^^^
 
-.. code:: python 
+.. code:: python
 
     # make sure the file have correct suffix
     magic-pdf -p a.doc -o output -m auto
 
 
-API 
+API
 ^^^^^^^^
-.. code:: python 
+.. code:: python
 
     import os
 
@@ -44,13 +44,16 @@ API
     # proc
     ## Create Dataset Instance
     input_file = "some_doc.doc"     # replace with real ms-office file
-    
+
     input_file_name = input_file.split(".")[0]
     ds = read_local_office(input_file)[0]
 
+    # ocr mode
     ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
         md_writer, f"{input_file_name}.md", image_dir
     )
 
-
-
+    # txt mode
+    ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
+        md_writer, f"{input_file_name}.md", image_dir
+    )

+ 10 - 5
next_docs/en/user_guide/quick_start/convert_docx.rst

@@ -6,23 +6,23 @@ Convert DocX
     :class: tip
 
     When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
-    
+
     For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
 
 
 Command Line
 ^^^^^^^^^^^^^
 
-.. code:: python 
+.. code:: python
 
     # make sure the file have correct suffix
     magic-pdf -p a.docx -o output -m auto
 
 
-API 
+API
 ^^^^^
 
-.. code:: python 
+.. code:: python
 
     import os
 
@@ -43,11 +43,16 @@ API
     # proc
     ## Create Dataset Instance
     input_file = "some_docx.docx"     # replace with real ms-office file
-    
+
     input_file_name = input_file.split(".")[0]
     ds = read_local_office(input_file)[0]
 
+    # ocr mode
     ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
         md_writer, f"{input_file_name}.md", image_dir
     )
 
+    # txt mode
+    ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
+        md_writer, f"{input_file_name}.md", image_dir
+    )

+ 8 - 2
next_docs/en/user_guide/quick_start/convert_image.rst

@@ -7,13 +7,13 @@ Convert Image
 Command Line
 ^^^^^^^^^^^^^
 
-.. code:: python 
+.. code:: python
 
     # make sure the file have correct suffix
     magic-pdf -p a.png -o output -m auto
 
 
-API 
+API
 ^^^^^^
 
 .. code:: python
@@ -41,6 +41,12 @@ API
     input_file_name = input_file.split(".")[0]
     ds = read_local_images(input_file)[0]
 
+    # ocr mode
     ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
         md_writer, f"{input_file_name}.md", image_dir
     )
+
+    # txt mode
+    ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
+        md_writer, f"{input_file_name}.md", image_dir
+    )

+ 10 - 4
next_docs/en/user_guide/quick_start/convert_pdf.rst

@@ -1,12 +1,12 @@
 
 
-Convert PDF 
+Convert PDF
 ============
 
 Command Line
 ^^^^^^^^^^^^^
 
-.. code:: python 
+.. code:: python
 
     # make sure the file have correct suffix
     magic-pdf -p a.pdf -o output -m auto
@@ -44,6 +44,12 @@ API
     ## Create Dataset Instance
     ds = PymuDocDataset(pdf_bytes)
 
-    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)
-
+    # ocr mode
+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+        md_writer, f"{name_without_suff}.md", image_dir
+    )
 
+    # txt mode
+    ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
+        md_writer, f"{name_without_suff}.md", image_dir
+    )

+ 12 - 6
next_docs/en/user_guide/quick_start/convert_ppt.rst

@@ -1,28 +1,28 @@
 
 
-Convert PPT 
+Convert PPT
 ============
 
 .. admonition:: Warning
     :class: tip
 
     When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
-    
+
     For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
 
 Command Line
 ^^^^^^^^^^^^^
 
-.. code:: python 
+.. code:: python
 
     # make sure the file have correct suffix
     magic-pdf -p a.ppt -o output -m auto
 
 
-API 
+API
 ^^^^^
 
-.. code:: python 
+.. code:: python
 
     import os
 
@@ -43,10 +43,16 @@ API
     # proc
     ## Create Dataset Instance
     input_file = "some_ppt.ppt"     # replace with real ms-office file
-    
+
     input_file_name = input_file.split(".")[0]
     ds = read_local_office(input_file)[0]
 
+    # ocr mode
     ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
         md_writer, f"{input_file_name}.md", image_dir
     )
+
+    # txt mode
+    ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
+        md_writer, f"{input_file_name}.md", image_dir
+    )

+ 11 - 5
next_docs/en/user_guide/quick_start/convert_pptx.rst

@@ -7,14 +7,14 @@ Convert PPTX
     :class: tip
 
     When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
-    
+
     For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
 
 
 Command Line
 ^^^^^^^^^^^^^
 
-.. code:: python 
+.. code:: python
 
     # make sure the file have correct suffix
     magic-pdf -p a.pptx -o output -m auto
@@ -22,10 +22,10 @@ Command Line
 
 
 
-API 
+API
 ^^^^^^
 
-.. code:: python 
+.. code:: python
 
     import os
 
@@ -46,10 +46,16 @@ API
     # proc
     ## Create Dataset Instance
     input_file = "some_pptx.pptx"     # replace with real ms-office file
-    
+
     input_file_name = input_file.split(".")[0]
     ds = read_local_office(input_file)[0]
 
+    # ocr mode
     ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
         md_writer, f"{input_file_name}.md", image_dir
     )
+
+    # txt mode
+    ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
+        md_writer, f"{input_file_name}.md", image_dir
+    )

+ 41 - 21
next_docs/en/user_guide/usage/api.rst

@@ -1,5 +1,5 @@
 
-Api Usage 
+Api Usage
 ===========
 
 
@@ -16,6 +16,7 @@ Local File Example
     from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
     from magic_pdf.data.dataset import PymuDocDataset
     from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+    from magic_pdf.config.enums import SupportedPdfParseMethod
 
     # args
     pdf_file_name = "abc.pdf"  # replace with the real pdf path
@@ -40,15 +41,22 @@ Local File Example
     ## Create Dataset Instance
     ds = PymuDocDataset(pdf_bytes)
 
-    ## inference 
-    infer_result = ds.apply(doc_analyze, ocr=True)
+    ## inference
+    if ds.classify() == SupportedPdfParseMethod.OCR:
+        infer_result = ds.apply(doc_analyze, ocr=True)
+
+        ## pipeline
+        pipe_result = infer_result.pipe_ocr_mode(image_writer)
+
+    else:
+        infer_result = ds.apply(doc_analyze, ocr=False)
+
+        ## pipeline
+        pipe_result = infer_result.pipe_txt_mode(image_writer)
 
     ### draw model result on each page
     infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
 
-    ## pipeline
-    pipe_result = infer_result.pipe_ocr_mode(image_writer)
-
     ### draw layout result on each page
     pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
 
@@ -58,6 +66,9 @@ Local File Example
     ### dump markdown
     pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
 
+    ### dump content list
+    pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
+
 
 S3 File Example
 ^^^^^^^^^^^^^^^^
@@ -96,30 +107,39 @@ S3 File Example
     ## Create Dataset Instance
     ds = PymuDocDataset(pdf_bytes)
 
-    ## inference 
-    infer_result = ds.apply(doc_analyze, ocr=True)
+    ## inference
+    if ds.classify() == SupportedPdfParseMethod.OCR:
+        infer_result = ds.apply(doc_analyze, ocr=True)
 
-    ### draw model result on each page
-    infer_result.draw_model(os.path.join(local_dir, f'{name_without_suff}_model.pdf'))  # dump to local
+        ## pipeline
+        pipe_result = infer_result.pipe_ocr_mode(image_writer)
 
-    ## pipeline
-    pipe_result = infer_result.pipe_ocr_mode(image_writer)
+    else:
+        infer_result = ds.apply(doc_analyze, ocr=False)
+
+        ## pipeline
+        pipe_result = infer_result.pipe_txt_mode(image_writer)
+
+    ### draw model result on each page
+    infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
 
     ### draw layout result on each page
-    pipe_result.draw_layout(os.path.join(local_dir, f'{name_without_suff}_layout.pdf'))  # dump to local
+    pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
 
     ### draw spans result on each page
-    pipe_result.draw_span(os.path.join(local_dir, f'{name_without_suff}_spans.pdf'))   # dump to local 
+    pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
 
     ### dump markdown
-    pipe_result.dump_md(writer, f'{name_without_suff}.md', "unittest/tmp/images")    # dump to remote s3
+    pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
 
+    ### dump content list
+    pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
 
 
-MS-Office 
+MS-Office
 ----------
 
-.. code:: python 
+.. code:: python
 
     import os
 
@@ -144,7 +164,7 @@ MS-Office
     input_file_name = input_file.split(".")[0]
     ds = read_local_office(input_file)[0]
 
-    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+    ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
         md_writer, f"{input_file_name}.md", image_dir
     )
 
@@ -154,7 +174,7 @@ This code snippet can be used to manipulate **ppt**, **pptx**, **doc**, **docx**
 Image
 ---------
 
-Single Image File 
+Single Image File
 ^^^^^^^^^^^^^^^^^^^
 
 .. code:: python
@@ -187,7 +207,7 @@ Single Image File
     )
 
 
-Directory That Contains Images 
+Directory That Contains Images
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 .. code:: python
@@ -213,7 +233,7 @@ Directory That Contains Images
     input_directory = "some_image_dir/"       # replace with real directory that contains images
 
 
-    dss = read_local_images(input_directory, suffixes=['.png', '.jpg'])[0]  
+    dss = read_local_images(input_directory, suffixes=['.png', '.jpg'])
 
     count = 0
     for ds in dss:

+ 1 - 0
next_docs/requirements.txt

@@ -8,6 +8,7 @@ myst-parser
 Pillow==8.4.0
 pydantic>=2.7.2,<2.8.0
 PyMuPDF>=1.24.9
+pdfminer.six==20231228
 sphinx
 sphinx-argparse>=0.5.2
 sphinx-book-theme>=1.1.3

+ 33 - 27
projects/web_api/app.py

@@ -9,10 +9,11 @@ from fastapi.responses import JSONResponse
 from loguru import logger
 
 import magic_pdf.model as model_config
+from magic_pdf.config.enums import SupportedPdfParseMethod
 from magic_pdf.data.data_reader_writer import FileBasedDataWriter
-from magic_pdf.pipe.OCRPipe import OCRPipe
-from magic_pdf.pipe.TXTPipe import TXTPipe
-from magic_pdf.pipe.UNIPipe import UNIPipe
+from magic_pdf.data.dataset import PymuDocDataset
+from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+from magic_pdf.model.operators import InferenceResult
 
 model_config.__use_inside_model__ = True
 
@@ -20,14 +21,15 @@ app = FastAPI()
 
 
 def json_md_dump(
-    pipe,
+    model_json,
+    middle_json,
     md_writer,
     pdf_name,
     content_list,
     md_content,
 ):
     # Write model results to model.json
-    orig_model_list = copy.deepcopy(pipe.model_list)
+    orig_model_list = copy.deepcopy(model_json)
     md_writer.write_string(
         f'{pdf_name}_model.json',
         json.dumps(orig_model_list, ensure_ascii=False, indent=4),
@@ -36,7 +38,7 @@ def json_md_dump(
     # Write intermediate results to middle.json
     md_writer.write_string(
         f'{pdf_name}_middle.json',
-        json.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
+        json.dumps(middle_json, ensure_ascii=False, indent=4),
     )
 
     # Write text content results to content_list.json
@@ -100,45 +102,49 @@ async def pdf_parse_main(
             output_image_path
         ), FileBasedDataWriter(output_path)
 
+        ds = PymuDocDataset(pdf_bytes)
         # Choose parsing method
         if parse_method == 'auto':
-            jso_useful_key = {'_pdf_type': '', 'model_list': model_json}
-            pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
-        elif parse_method == 'txt':
-            pipe = TXTPipe(pdf_bytes, model_json, image_writer)
-        elif parse_method == 'ocr':
-            pipe = OCRPipe(pdf_bytes, model_json, image_writer)
-        else:
+            if ds.classify() == SupportedPdfParseMethod.OCR:
+                parse_method = 'ocr'
+            else:
+                parse_method = 'txt'
+
+        if parse_method not in ['txt', 'ocr']:
             logger.error('Unknown parse method, only auto, ocr, txt allowed')
             return JSONResponse(
                 content={'error': 'Invalid parse method'}, status_code=400
             )
 
-        # Execute classification
-        pipe.pipe_classify()
-
-        # If no model data is provided, use built-in model for parsing
-        if not model_json:
-            if model_config.__use_inside_model__:
-                pipe.pipe_analyze()  # Parse
+        if len(model_json) == 0:
+            if parse_method == 'ocr':
+                infer_result = ds.apply(doc_analyze, ocr=True)
             else:
+                infer_result = ds.apply(doc_analyze, ocr=False)
+
+        else:
+            infer_result = InferenceResult(model_json, ds)
+
+        if len(model_json) == 0 and not model_config.__use_inside_model__:
                 logger.error('Need model list input')
                 return JSONResponse(
                     content={'error': 'Model list input required'}, status_code=400
                 )
+        if parse_method == 'ocr':
+            pipe_res = infer_result.pipe_ocr_mode(image_writer)
+        else:
+            pipe_res = infer_result.pipe_txt_mode(image_writer)
 
-        # Execute parsing
-        pipe.pipe_parse()
 
         # Save results in text and md format
-        content_list = pipe.pipe_mk_uni_format(image_path_parent, drop_mode='none')
-        md_content = pipe.pipe_mk_markdown(image_path_parent, drop_mode='none')
+        content_list = pipe_res.get_content_list(image_path_parent, drop_mode='none')
+        md_content = pipe_res.get_markdown(image_path_parent, drop_mode='none')
 
         if is_json_md_dump:
-            json_md_dump(pipe, md_writer, pdf_name, content_list, md_content)
+            json_md_dump(infer_result._infer_res, pipe_res._pipe_res, md_writer, pdf_name, content_list, md_content)
         data = {
-            'layout': copy.deepcopy(pipe.model_list),
-            'info': pipe.pdf_mid_data,
+            'layout': copy.deepcopy(infer_result._infer_res),
+            'info': pipe_res._pipe_res,
             'content_list': content_list,
             'md_content': md_content,
         }

+ 23 - 18
projects/web_demo/web_demo/api/analysis/pdf_ext.py

@@ -11,9 +11,12 @@ from flask import current_app, url_for
 from loguru import logger
 
 import magic_pdf.model as model_config
+from magic_pdf.config.enums import SupportedPdfParseMethod
 from magic_pdf.data.data_reader_writer import FileBasedDataWriter
+from magic_pdf.data.dataset import PymuDocDataset
 from magic_pdf.libs.json_compressor import JsonCompressor
-from magic_pdf.pipe.UNIPipe import UNIPipe
+from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+from magic_pdf.model.operators import InferenceResult
 
 from ..extentions import app, db
 from .ext import find_file
@@ -25,25 +28,28 @@ model_config.__use_inside_model__ = True
 def analysis_pdf(image_url_prefix, image_dir, pdf_bytes, is_ocr=False):
     try:
         model_json = []  # model_json传空list使用内置模型解析
+        image_writer = FileBasedDataWriter(image_dir)
         logger.info(f'is_ocr: {is_ocr}')
+        parse_method = 'ocr'
+        ds = PymuDocDataset(pdf_bytes)
+        # Choose parsing method
         if not is_ocr:
-            jso_useful_key = {'_pdf_type': '', 'model_list': model_json}
-            image_writer = FileBasedDataWriter(image_dir)
-            pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
-            pipe.pipe_classify()
-        else:
-            jso_useful_key = {'_pdf_type': 'ocr', 'model_list': model_json}
-            image_writer = FileBasedDataWriter(image_dir)
-            pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
-        """如果没有传入有效的模型数据,则使用内置model解析"""
-        if len(model_json) == 0:
-            if model_config.__use_inside_model__:
-                pipe.pipe_analyze()
+            if ds.classify() == SupportedPdfParseMethod.OCR:
+                parse_method = 'ocr'
             else:
-                logger.error('need model list input')
-                exit(1)
-        pipe.pipe_parse()
-        pdf_mid_data = JsonCompressor.decompress_json(pipe.get_compress_pdf_mid_data())
+                parse_method = 'txt'
+
+        if parse_method == 'ocr':
+            infer_result = ds.apply(doc_analyze, ocr=True)
+        else:
+            infer_result = ds.apply(doc_analyze, ocr=False)
+
+        if parse_method == 'ocr':
+            pipe_res = infer_result.pipe_ocr_mode(image_writer)
+        else:
+            pipe_res = infer_result.pipe_txt_mode(image_writer)
+
+        pdf_mid_data = pipe_res._pipe_res
         pdf_info_list = pdf_mid_data['pdf_info']
         md_content = json.dumps(ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_list, image_url_prefix),
                                 ensure_ascii=False)
@@ -52,7 +58,6 @@ def analysis_pdf(image_url_prefix, image_dir, pdf_bytes, is_ocr=False):
     except Exception as e:  # noqa: F841
         logger.error(traceback.format_exc())
 
-
 def get_bbox_info(data):
     bbox_info = []
     for page in data:

이 변경점에서 너무 많은 파일들이 변경되어 몇몇 파일들은 표시되지 않았습니다.