
Merge pull request #1257 from icecraft/docs/refactor_en_docs

Docs/refactor en docs
Xiaomeng Zhao 11 months ago
parent
commit
bdacf29179
36 changed files with 1575 additions and 284 deletions
  1. 1 1
      magic_pdf/data/data_reader_writer/filebase.py
  2. 59 12
      magic_pdf/data/read_api.py
  3. 0 25
      magic_pdf/model/__init__.py
  4. 0 34
      magic_pdf/model/operators.py
  5. 36 11
      magic_pdf/tools/cli.py
  6. 10 3
      magic_pdf/tools/common.py
  7. 29 0
      magic_pdf/utils/office_to_pdf.py
  8. BIN
      next_docs/en/_static/image/inference_result.png
  9. 6 3
      next_docs/en/additional_notes/glossary.rst
  10. 6 0
      next_docs/en/index.rst
  11. 3 1
      next_docs/en/user_guide.rst
  12. 90 62
      next_docs/en/user_guide/data/data_reader_writer.rst
  13. 35 9
      next_docs/en/user_guide/data/read_api.rst
  14. 145 0
      next_docs/en/user_guide/inference_result.rst
  15. 1 1
      next_docs/en/user_guide/install.rst
  16. 0 18
      next_docs/en/user_guide/install/boost_with_cuda.rst
  17. 160 0
      next_docs/en/user_guide/install/config.rst
  18. 32 3
      next_docs/en/user_guide/install/install.rst
  19. 335 0
      next_docs/en/user_guide/pipe_result.rst
  20. 7 5
      next_docs/en/user_guide/quick_start.rst
  21. 56 0
      next_docs/en/user_guide/quick_start/convert_doc.rst
  22. 53 0
      next_docs/en/user_guide/quick_start/convert_docx.rst
  23. 46 0
      next_docs/en/user_guide/quick_start/convert_image.rst
  24. 49 0
      next_docs/en/user_guide/quick_start/convert_pdf.rst
  25. 52 0
      next_docs/en/user_guide/quick_start/convert_ppt.rst
  26. 55 0
      next_docs/en/user_guide/quick_start/convert_pptx.rst
  27. 0 1
      next_docs/en/user_guide/tutorial.rst
  28. 0 3
      next_docs/en/user_guide/tutorial/pipeline.rst
  29. 12 0
      next_docs/en/user_guide/usage.rst
  30. 112 2
      next_docs/en/user_guide/usage/api.rst
  31. 18 3
      next_docs/en/user_guide/usage/command_line.rst
  32. 24 0
      next_docs/en/user_guide/usage/docker.rst
  33. BIN
      next_docs/zh_cn/_static/image/inference_result.png
  34. 103 75
      next_docs/zh_cn/user_guide/data/data_reader_writer.rst
  35. 39 11
      next_docs/zh_cn/user_guide/data/read_api.rst
  36. 1 1
      tests/unittest/test_data/test_read_api.py

+ 1 - 1
magic_pdf/data/data_reader_writer/filebase.py

@@ -55,7 +55,7 @@ class FileBasedDataWriter(DataWriter):
         if not os.path.isabs(fn_path) and len(self._parent_dir) > 0:
             fn_path = os.path.join(self._parent_dir, path)
 
-        if not os.path.exists(os.path.dirname(fn_path)):
+        if not os.path.exists(os.path.dirname(fn_path)) and os.path.dirname(fn_path) != "":
             os.makedirs(os.path.dirname(fn_path), exist_ok=True)
 
         with open(fn_path, 'wb') as f:
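
A note on the guard: ``os.path.dirname`` returns an empty string for a bare filename, and ``os.makedirs('')`` raises ``FileNotFoundError``, which is the failure this hunk avoids. A minimal sketch of the case (paths are illustrative):

.. code:: python

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter

    # Writer rooted at the current working directory.
    writer = FileBasedDataWriter('')

    # 'abc' has no directory component, so os.path.dirname('abc') == ''.
    # Before this change, os.makedirs('') raised FileNotFoundError; the added
    # check skips directory creation and the write proceeds.
    writer.write('abc', b'123')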

+ 59 - 12
magic_pdf/data/read_api.py

@@ -1,12 +1,14 @@
 import json
 import os
+import tempfile
+import shutil
 from pathlib import Path
 
 from magic_pdf.config.exceptions import EmptyData, InvalidParams
 from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
                                                MultiBucketS3DataReader)
 from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
-
+from magic_pdf.utils.office_to_pdf import convert_file_to_pdf, ConvertToPdfError
 
 def read_jsonl(
     s3_path_or_local: str, s3_client: MultiBucketS3DataReader | None = None
@@ -58,23 +60,68 @@ def read_local_pdfs(path: str) -> list[PymuDocDataset]:
         list[PymuDocDataset]: each pdf file will be converted to a PymuDocDataset
     """
     if os.path.isdir(path):
-        reader = FileBasedDataReader(path)
-        return [
-            PymuDocDataset(reader.read(doc_path.name))
-            for doc_path in Path(path).glob('*.pdf')
-        ]
+        reader = FileBasedDataReader()
+        ret = []
+        for root, _, files in os.walk(path):
+            for file in files:
+                suffix = file.split('.')
+                if suffix[-1] == 'pdf':
+                    ret.append(PymuDocDataset(reader.read(os.path.join(root, file))))
+        return ret
     else:
         reader = FileBasedDataReader()
         bits = reader.read(path)
         return [PymuDocDataset(bits)]
 
+def read_local_office(path: str) -> list[PymuDocDataset]:
+    """Read ms-office file (ppt, pptx, doc, docx) from path or directory.
 
-def read_local_images(path: str, suffixes: list[str]) -> list[ImageDataset]:
+    Args:
+        path (str): ms-office file or directory that contains ms-office files
+
+    Returns:
+        list[PymuDocDataset]: each ms-office file will be converted to a PymuDocDataset
+
+    Raises:
+        ConvertToPdfError: failed to convert ms-office file to pdf via libreoffice
+        FileNotFoundError: file not found
+        Exception: unknown exception raised
+    """
+    suffixes = ['.ppt', '.pptx', '.doc', '.docx']
+    fns = []
+    ret = []
+    if os.path.isdir(path):
+        for root, _, files in os.walk(path):
+            for file in files:
+                suffix = Path(file).suffix
+                if suffix in suffixes:
+                    fns.append((os.path.join(root, file)))
+    else:
+        fns.append(path)
+        
+    reader = FileBasedDataReader()
+    temp_dir = tempfile.mkdtemp()
+    for fn in fns:
+        try:
+            convert_file_to_pdf(fn, temp_dir)
+        except ConvertToPdfError as e:
+            raise e
+        except FileNotFoundError as e:
+            raise e
+        except Exception as e:
+            raise e
+        fn_path = Path(fn)
+        pdf_fn = f"{temp_dir}/{fn_path.stem}.pdf"
+        ret.append(PymuDocDataset(reader.read(pdf_fn)))
+    shutil.rmtree(temp_dir)
+    return ret
+
+def read_local_images(path: str, suffixes: list[str]=['.png', '.jpg']) -> list[ImageDataset]:
     """Read images from path or directory.
 
     Args:
         path (str): image file path or directory that contains image files
-        suffixes (list[str]): the suffixes of the image files used to filter the files. Example: ['jpg', 'png']
+        suffixes (list[str]): the suffixes of the image files used to filter the files. Example: ['.jpg', '.png']
 
     Returns:
         list[ImageDataset]: each image file will be converted to an ImageDataset
@@ -82,12 +129,12 @@ def read_local_images(path: str, suffixes: list[str]) -> list[ImageDataset]:
     if os.path.isdir(path):
         imgs_bits = []
         s_suffixes = set(suffixes)
-        reader = FileBasedDataReader(path)
+        reader = FileBasedDataReader()
         for root, _, files in os.walk(path):
             for file in files:
-                suffix = file.split('.')
-                if suffix[-1] in s_suffixes:
-                    imgs_bits.append(reader.read(file))
+                suffix = Path(file).suffix
+                if suffix in s_suffixes:
+                    imgs_bits.append(reader.read(os.path.join(root, file)))
         return [ImageDataset(bits) for bits in imgs_bits]
     else:
         reader = FileBasedDataReader()
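
A short sketch of the updated readers in use; the directory and file names are placeholders, and ``read_local_office`` assumes LibreOffice is installed:

.. code:: python

    from magic_pdf.data.read_api import (read_local_images,
                                         read_local_office, read_local_pdfs)

    # now walks subdirectories, collecting every *.pdf under the path
    pdf_datasets = read_local_pdfs('pdfs/')

    # converts the office file to pdf via libreoffice before wrapping it
    office_datasets = read_local_office('docs/a.docx')

    # suffixes now carry the leading dot, matching Path(file).suffix
    image_datasets = read_local_images('images/', suffixes=['.png', '.jpg'])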

+ 0 - 25
magic_pdf/model/__init__.py

@@ -66,31 +66,6 @@ class InferenceResultBase(ABC):
         pass
 
     @abstractmethod
-    def pipe_auto_mode(
-        self,
-        imageWriter: DataWriter,
-        start_page_id=0,
-        end_page_id=None,
-        debug_mode=False,
-        lang=None,
-    ) -> PipeResult:
-        """Post-proc the model inference result.
-            step1: classify the dataset type
-            step2: based the result of step1, using `pipe_txt_mode` or `pipe_ocr_mode`
-
-        Args:
-            imageWriter (DataWriter): the image writer handle
-            start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process
-            end_page_id (int, optional):  Defaults to the last page index of dataset. Let user select some pages He/She want to process
-            debug_mode (bool, optional): Defaults to False. will dump more log if enabled
-            lang (str, optional): Defaults to None.
-
-        Returns:
-            PipeResult: the result
-        """
-        pass
-
-    @abstractmethod
     def pipe_txt_mode(
         self,
         imageWriter: DataWriter,

+ 0 - 34
magic_pdf/model/operators.py

@@ -71,40 +71,6 @@ class InferenceResult(InferenceResultBase):
         """
         return proc(copy.deepcopy(self._infer_res), *args, **kwargs)
 
-    def pipe_auto_mode(
-        self,
-        imageWriter: DataWriter,
-        start_page_id=0,
-        end_page_id=None,
-        debug_mode=False,
-        lang=None,
-    ) -> PipeResult:
-        """Post-proc the model inference result.
-            step1: classify the dataset type
-            step2: based the result of step1, using `pipe_txt_mode` or `pipe_ocr_mode`
-
-        Args:
-            imageWriter (DataWriter): the image writer handle
-            start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process
-            end_page_id (int, optional):  Defaults to the last page index of dataset. Let user select some pages He/She want to process
-            debug_mode (bool, optional): Defaults to False. will dump more log if enabled
-            lang (str, optional): Defaults to None.
-
-        Returns:
-            PipeResult: the result
-        """
-
-        pdf_proc_method = classify(self._dataset.data_bits())
-
-        if pdf_proc_method == SupportedPdfParseMethod.TXT:
-            return self.pipe_txt_mode(
-                imageWriter, start_page_id, end_page_id, debug_mode, lang
-            )
-        else:
-            return self.pipe_ocr_mode(
-                imageWriter, start_page_id, end_page_id, debug_mode, lang
-            )
-
     def pipe_txt_mode(
         self,
         imageWriter: DataWriter,

+ 36 - 11
magic_pdf/tools/cli.py

@@ -1,13 +1,20 @@
 import os
-from pathlib import Path
-
+import shutil
+import tempfile
 import click
+import fitz
 from loguru import logger
+from pathlib import Path
 
 import magic_pdf.model as model_config
 from magic_pdf.data.data_reader_writer import FileBasedDataReader
 from magic_pdf.libs.version import __version__
 from magic_pdf.tools.common import do_parse, parse_pdf_methods
+from magic_pdf.utils.office_to_pdf import convert_file_to_pdf
+
+pdf_suffixes = ['.pdf']
+ms_office_suffixes = ['.ppt', '.pptx', '.doc', '.docx']
+image_suffixes = ['.png', '.jpg']
 
 
 @click.command()
@@ -21,7 +28,7 @@ from magic_pdf.tools.common import do_parse, parse_pdf_methods
     'path',
     type=click.Path(exists=True),
     required=True,
-    help='local pdf filepath or directory',
+    help='local filepath or directory. support PDF, PPT, PPTX, DOC, DOCX, PNG, JPG files',
 )
 @click.option(
     '-o',
@@ -83,12 +90,27 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
     model_config.__use_inside_model__ = True
     model_config.__model_mode__ = 'full'
     os.makedirs(output_dir, exist_ok=True)
+    temp_dir = tempfile.mkdtemp()
+    def read_fn(path: Path):
+        if path.suffix in ms_office_suffixes:
+            convert_file_to_pdf(str(path), temp_dir)
+            fn = os.path.join(temp_dir, f"{path.stem}.pdf")
+        elif path.suffix in image_suffixes:
+            with open(str(path), 'rb') as f:
+                bits = f.read()
+            pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
+            fn = os.path.join(temp_dir, f"{path.stem}.pdf")
+            with open(fn, 'wb') as f:
+                f.write(pdf_bytes)
+        elif path.suffix in pdf_suffixes:
+            fn = str(path)
+        else:
+            raise Exception(f"Unknown file suffix: {path.suffix}")
+        
+        disk_rw = FileBasedDataReader(os.path.dirname(fn))
+        return disk_rw.read(os.path.basename(fn))
 
-    def read_fn(path):
-        disk_rw = FileBasedDataReader(os.path.dirname(path))
-        return disk_rw.read(os.path.basename(path))
-
-    def parse_doc(doc_path: str):
+    def parse_doc(doc_path: Path):
         try:
             file_name = str(Path(doc_path).stem)
             pdf_data = read_fn(doc_path)
@@ -108,10 +130,13 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
             logger.exception(e)
 
     if os.path.isdir(path):
-        for doc_path in Path(path).glob('*.pdf'):
-            parse_doc(doc_path)
+        for doc_path in Path(path).glob('*'):
+            if doc_path.suffix in pdf_suffixes + image_suffixes + ms_office_suffixes:
+                parse_doc(doc_path)
     else:
-        parse_doc(path)
+        parse_doc(Path(path))
+
+    shutil.rmtree(temp_dir)
 
 
 if __name__ == '__main__':

+ 10 - 3
magic_pdf/tools/common.py

@@ -170,6 +170,7 @@ def do_parse(
             logger.error('need model list input')
             exit(2)
     else:
+        
         infer_result = InferenceResult(model_list, ds)
         if parse_method == 'ocr':
             pipe_result = infer_result.pipe_ocr_mode(
@@ -180,9 +181,15 @@ def do_parse(
                 image_writer, debug_mode=True, lang=lang
             )
         else:
-            pipe_result = infer_result.pipe_auto_mode(
-                image_writer, debug_mode=True, lang=lang
-            )
+            if ds.classify() == SupportedPdfParseMethod.TXT:
+                pipe_result = infer_result.pipe_txt_mode(
+                        image_writer, debug_mode=True, lang=lang
+                    )
+            else:
+                pipe_result = infer_result.pipe_ocr_mode(
+                        image_writer, debug_mode=True, lang=lang
+                    )
+            
 
     if f_draw_model_bbox:
         infer_result.draw_model(

+ 29 - 0
magic_pdf/utils/office_to_pdf.py

@@ -0,0 +1,29 @@
+import os
+import subprocess
+from pathlib import Path
+
+
+class ConvertToPdfError(Exception):
+    def __init__(self, msg):
+        self.msg = msg
+        super().__init__(self.msg)
+
+
+def convert_file_to_pdf(input_path, output_dir):
+    if not os.path.isfile(input_path):
+        raise FileNotFoundError(f"The input file {input_path} does not exist.")
+
+    os.makedirs(output_dir, exist_ok=True)
+    
+    cmd = [
+        'soffice',
+        '--headless',
+        '--convert-to', 'pdf',
+        '--outdir', str(output_dir),
+        str(input_path)
+    ]
+    
+    process = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    
+    if process.returncode != 0:
+        raise ConvertToPdfError(process.stderr.decode())
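
A hedged usage sketch of the new helper; it shells out to ``soffice``, so LibreOffice must be on the PATH (file names are illustrative):

.. code:: python

    from magic_pdf.utils.office_to_pdf import (ConvertToPdfError,
                                               convert_file_to_pdf)

    try:
        # writes some_doc.pdf into ./pdfs (the directory is created if missing)
        convert_file_to_pdf('some_doc.docx', 'pdfs')
    except ConvertToPdfError as e:
        print(f'libreoffice conversion failed: {e.msg}')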

BIN
next_docs/en/_static/image/inference_result.png


+ 6 - 3
next_docs/en/additional_notes/glossary.rst

@@ -4,8 +4,11 @@ Glossary
 ===========
 
 1. jsonl 
-    TODO: add description
+    Newline-delimited (\n); each line must be a valid, self-contained JSON object.
+    Currently, all functions shipped with **MinerU** assume that each JSON object contains a field named either **path** or **file_location** (see the example after this list).
+
+
+2. magic-pdf.json 
+    The user-local configuration file of **MinerU**; see the Config section of the user guide for the full schema.
 
-2. magic-pdf.json
-    TODO: add description
 
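An illustrative pair of jsonl lines matching the description above (bucket and paths are placeholders):

.. code:: text

    {"path": "s3://bucket-1/files/a.pdf"}
    {"file_location": "/tmp/files/b.pdf"}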

+ 6 - 0
next_docs/en/index.rst

@@ -70,6 +70,12 @@ Key Features
 -  Supports both CPU and GPU environments.
 -  Compatible with Windows, Linux, and Mac platforms.
 
+
+.. tip::
+
+   Get started with MinerU by trying the `online demo <https://www.modelscope.cn/studios/OpenDataLab/MinerU>`_ or :doc:`installing it locally <user_guide/install/install>`.
+
+
 User Guide
 -------------
 .. toctree::

+ 3 - 1
next_docs/en/user_guide.rst

@@ -4,7 +4,9 @@
     :maxdepth: 2
 
     user_guide/install
+    user_guide/usage
     user_guide/quick_start
     user_guide/tutorial
     user_guide/data
-    
+    user_guide/inference_result
+    user_guide/pipe_result

+ 90 - 62
next_docs/en/user_guide/data/data_reader_writer.rst

@@ -87,56 +87,70 @@ Read Examples
 
 .. code:: python
 
+    import os 
     from magic_pdf.data.data_reader_writer import *
+    from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
+    from magic_pdf.data.schemas import S3Config
 
-    # file based related 
+    # file based related
     file_based_reader1 = FileBasedDataReader('')
 
-    ## will read file abc 
-    file_based_reader1.read('abc') 
+    ## will read file abc
+    file_based_reader1.read('abc')
 
     file_based_reader2 = FileBasedDataReader('/tmp')
 
     ## will read /tmp/abc
     file_based_reader2.read('abc')
 
-    ## will read /var/logs/message.txt
-    file_based_reader2.read('/var/logs/message.txt')
+    ## will read /tmp/logs/message.txt
+    file_based_reader2.read('/tmp/logs/message.txt')
 
     # multi bucket s3 releated
-    multi_bucket_s3_reader1 = MultiBucketS3DataReader("test_bucket1/test_prefix", list[S3Config(
-            bucket_name=test_bucket1, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
+    bucket = "bucket"               # replace with real bucket
+    ak = "ak"                       # replace with real access key
+    sk = "sk"                       # replace with real secret key
+    endpoint_url = "endpoint_url"   # replace with real endpoint_url
+
+    bucket_2 = "bucket_2"               # replace with real bucket
+    ak_2 = "ak_2"                       # replace with real access key
+    sk_2 = "sk_2"                       # replace with real secret key 
+    endpoint_url_2 = "endpoint_url_2"   # replace with real endpoint_url
+
+    test_prefix = 'test/unittest'
+    multi_bucket_s3_reader1 = MultiBucketS3DataReader(f"{bucket}/{test_prefix}", [S3Config(
+            bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
         ),
         S3Config(
-            bucket_name=test_bucket_2,
+            bucket_name=bucket_2,
             access_key=ak_2,
             secret_key=sk_2,
             endpoint_url=endpoint_url_2,
         )])
-    
-    ## will read s3://test_bucket1/test_prefix/abc
+
+    ## will read s3://{bucket}/{test_prefix}/abc
     multi_bucket_s3_reader1.read('abc')
 
-    ## will read s3://test_bucket1/efg
-    multi_bucket_s3_reader1.read('s3://test_bucket1/efg')
+    ## will read s3://{bucket}/{test_prefix}/efg
+    multi_bucket_s3_reader1.read(f's3://{bucket}/{test_prefix}/efg')
 
-    ## will read s3://test_bucket2/abc
-    multi_bucket_s3_reader1.read('s3://test_bucket2/abc')
+    ## will read s3://{bucket_2}/{test_prefix}/abc
+    multi_bucket_s3_reader1.read(f's3://{bucket_2}/{test_prefix}/abc')
 
     # s3 related
     s3_reader1 = S3DataReader(
-        default_prefix_without_bucket = "test_prefix"
-        bucket: "test_bucket",
-        ak: "ak",
-        sk: "sk",
-        endpoint_url: "localhost"
+        test_prefix,
+        bucket,
+        ak,
+        sk,
+        endpoint_url
     )
 
-    ## will read s3://test_bucket/test_prefix/abc 
+    ## will read s3://{bucket}/{test_prefix}/abc
     s3_reader1.read('abc')
-   
-    ## will read s3://test_bucket/efg
-    s3_reader1.read('s3://test_bucket/efg')
+
+    ## will read s3://{bucket}/efg
+    s3_reader1.read(f's3://{bucket}/efg')
 
 
 Write Examples
@@ -144,65 +158,79 @@ Write Examples
 
 .. code:: python
 
+    import os
     from magic_pdf.data.data_reader_writer import *
+    from magic_pdf.data.data_reader_writer import MultiBucketS3DataWriter
+    from magic_pdf.data.schemas import S3Config
 
-    # file based related 
-    file_based_writer1 = FileBasedDataWriter('')
+    # file based related
+    file_based_writer1 = FileBasedDataWriter("")
 
     ## will write 123 to abc
-    file_based_writer1.write('abc', '123'.encode()) 
+    file_based_writer1.write("abc", "123".encode())
 
     ## will write 123 to abc
-    file_based_writer1.write_string('abc', '123') 
+    file_based_writer1.write_string("abc", "123")
 
-    file_based_writer2 = FileBasedDataWriter('/tmp')
+    file_based_writer2 = FileBasedDataWriter("/tmp")
 
     ## will write 123 to /tmp/abc
-    file_based_writer2.write_string('abc', '123')
+    file_based_writer2.write_string("abc", "123")
 
-    ## will write 123 to /var/logs/message.txt
-    file_based_writer2.write_string('/var/logs/message.txt', '123')
+    ## will write 123 to /tmp/logs/message.txt
+    file_based_writer2.write_string("/tmp/logs/message.txt", "123")
 
     # multi bucket s3 releated
-    multi_bucket_s3_writer1 = MultiBucketS3DataWriter("test_bucket1/test_prefix", list[S3Config(
-            bucket_name=test_bucket1, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
-        ),
-        S3Config(
-            bucket_name=test_bucket_2,
-            access_key=ak_2,
-            secret_key=sk_2,
-            endpoint_url=endpoint_url_2,
-        )])
-    
-    ## will write 123 to s3://test_bucket1/test_prefix/abc
-    multi_bucket_s3_writer1.write_string('abc', '123')
+    bucket = "bucket"               # replace with real bucket
+    ak = "ak"                       # replace with real access key
+    sk = "sk"                       # replace with real secret key
+    endpoint_url = "endpoint_url"   # replace with real endpoint_url
+
+    bucket_2 = "bucket_2"               # replace with real bucket
+    ak_2 = "ak_2"                       # replace with real access key
+    sk_2 = "sk_2"                       # replace with real secret key 
+    endpoint_url_2 = "endpoint_url_2"   # replace with real endpoint_url
+
+    test_prefix = "test/unittest"
+    multi_bucket_s3_writer1 = MultiBucketS3DataWriter(
+        f"{bucket}/{test_prefix}",
+        [
+            S3Config(
+                bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
+            ),
+            S3Config(
+                bucket_name=bucket_2,
+                access_key=ak_2,
+                secret_key=sk_2,
+                endpoint_url=endpoint_url_2,
+            ),
+        ],
+    )
 
-    ## will write 123 to s3://test_bucket1/test_prefix/abc
-    multi_bucket_s3_writer1.write('abc', '123'.encode())
+    ## will write 123 to s3://{bucket}/{test_prefix}/abc
+    multi_bucket_s3_writer1.write_string("abc", "123")
 
-    ## will write 123 to s3://test_bucket1/efg
-    multi_bucket_s3_writer1.write('s3://test_bucket1/efg', '123'.encode())
+    ## will write 123 to s3://{bucket}/{test_prefix}/abc
+    multi_bucket_s3_writer1.write("abc", "123".encode())
 
-    ## will write 123 to s3://test_bucket2/abc
-    multi_bucket_s3_writer1.write('s3://test_bucket2/abc', '123'.encode())
+    ## will write 123 to s3://{bucket}/{test_prefix}/efg
+    multi_bucket_s3_writer1.write(f"s3://{bucket}/{test_prefix}/efg", "123".encode())
+
+    ## will write 123 to s3://{bucket_2}/{test_prefix}/abc
+    multi_bucket_s3_writer1.write(f's3://{bucket_2}/{test_prefix}/abc', '123'.encode())
 
     # s3 related
-    s3_writer1 = S3DataWriter(
-        default_prefix_without_bucket = "test_prefix"
-        bucket: "test_bucket",
-        ak: "ak",
-        sk: "sk",
-        endpoint_url: "localhost"
-    )
+    s3_writer1 = S3DataWriter(test_prefix, bucket, ak, sk, endpoint_url)
+
+    ## will write 123 to s3://{bucket}/{test_prefix}/abc
+    s3_writer1.write("abc", "123".encode())
 
-    ## will write 123 to s3://test_bucket/test_prefix/abc 
-    s3_writer1.write('abc', '123'.encode())
+    ## will write 123 to s3://{bucket}/{test_prefix}/abc
+    s3_writer1.write_string("abc", "123")
 
-    ## will write 123 to s3://test_bucket/test_prefix/abc 
-    s3_writer1.write_string('abc', '123')
+    ## will write 123 to s3://{bucket}/efg
+    s3_writer1.write(f"s3://{bucket}/efg", "123".encode())
 
-    ## will write 123 to s3://test_bucket/efg
-    s3_writer1.write('s3://test_bucket/efg', '123'.encode())
 
 
 Check :doc:`../../api/data_reader_writer` for more details

+ 35 - 9
next_docs/en/user_guide/data/read_api.rst

@@ -18,24 +18,50 @@ Read the content from jsonl which may be located on the local machine or remote s3. if y
 
 .. code:: python
 
-    from magic_pdf.data.io.read_api import *
+    from magic_pdf.data.read_api import *
+    from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
+    from magic_pdf.data.schemas import S3Config
 
-    # read jsonl from local machine 
-    datasets = read_jsonl("tt.jsonl", None)
+    # read jsonl from local machine
+    datasets = read_jsonl("tt.jsonl", None)   # replace with real jsonl file
 
     # read jsonl from remote s3
-    datasets = read_jsonl("s3://bucket_1/tt.jsonl", s3_reader)
 
+    bucket = "bucket_1"                     # replace with real s3 bucket
+    ak = "access_key_1"                     # replace with real s3 access key
+    sk = "secret_key_1"                     # replace with real s3 secret key
+    endpoint_url = "endpoint_url_1"         # replace with real s3 endpoint url
+
+    bucket_2 = "bucket_2"                   # replace with real s3 bucket
+    ak_2 = "access_key_2"                   # replace with real s3 access key
+    sk_2 = "secret_key_2"                   # replace with real s3 secret key
+    endpoint_url_2 = "endpoint_url_2"       # replace with real s3 endpoint url
+
+    s3configs = [
+        S3Config(
+            bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
+        ),
+        S3Config(
+            bucket_name=bucket_2,
+            access_key=ak_2,
+            secret_key=sk_2,
+            endpoint_url=endpoint_url_2,
+        ),
+    ]
+
+    s3_reader = MultiBucketS3DataReader(bucket, s3configs)
+
+    datasets = read_jsonl(f"s3://{bucket}/tt.jsonl", s3_reader)  # replace with real s3 jsonl file
 
 read_local_pdfs
-^^^^^^^^^^^^^^^^
+^^^^^^^^^^^^^^^^^
 
 Read pdf from path or directory.
 
 
 .. code:: python
 
-    from magic_pdf.data.io.read_api import *
+    from magic_pdf.data.read_api import *
 
     # read pdf path
     datasets = read_local_pdfs("tt.pdf")
@@ -51,13 +77,13 @@ Read images from path or directory
 
 .. code:: python 
 
-    from magic_pdf.data.io.read_api import *
+    from magic_pdf.data.read_api import *
 
     # read from image path 
-    datasets = read_local_images("tt.png")
+    datasets = read_local_images("tt.png")  # replace with real file path
 
     # read files from directory that endswith suffix in suffixes array 
-    datasets = read_local_images("images/", suffixes=["png", "jpg"])
+    datasets = read_local_images("images/", suffixes=[".png", ".jpg"])  # replace with real directory
 
 
 Check :doc:`../../api/read_api` for more details

+ 145 - 0
next_docs/en/user_guide/inference_result.rst

@@ -0,0 +1,145 @@
+
+Inference Result 
+==================
+
+.. admonition:: Tip
+    :class: tip
+
+    Please first navigate to :doc:`tutorial/pipeline` to get an initial understanding of how the pipeline works; this will help in understanding the content of this section.
+
+The **InferenceResult** class is a container for storing model inference results and implements a series of related methods, such as draw_model and dump_model.
+Check out :doc:`../api/model_operators` for more details about **InferenceResult**.
+
+
+Model Inference Result
+-----------------------
+
+Structure Definition
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code:: python
+
+    from pydantic import BaseModel, Field
+    from enum import IntEnum
+
+    class CategoryType(IntEnum):
+        title = 0               # Title
+        plain_text = 1          # Text
+        abandon = 2             # Includes headers, footers, page numbers, and page annotations
+        figure = 3              # Image
+        figure_caption = 4      # Image description
+        table = 5               # Table
+        table_caption = 6       # Table description
+        table_footnote = 7      # Table footnote
+        isolate_formula = 8     # Block formula
+        formula_caption = 9     # Formula label
+
+        embedding = 13          # Inline formula
+        isolated = 14           # Block formula
+        text = 15               # OCR recognition result
+
+
+    class PageInfo(BaseModel):
+        page_no: int = Field(description="Page number, the first page is 0", ge=0)
+        height: int = Field(description="Page height", gt=0)
+        width: int = Field(description="Page width", ge=0)
+
+    class ObjectInferenceResult(BaseModel):
+        category_id: CategoryType = Field(description="Category", ge=0)
+        poly: list[float] = Field(description="Quadrilateral coordinates, representing the coordinates of the top-left, top-right, bottom-right, and bottom-left points respectively")
+        score: float = Field(description="Confidence of the inference result")
+        latex: str | None = Field(description="LaTeX parsing result", default=None)
+        html: str | None = Field(description="HTML parsing result", default=None)
+
+    class PageInferenceResults(BaseModel):
+        layout_dets: list[ObjectInferenceResult] = Field(description="Page recognition results")
+        page_info: PageInfo = Field(description="Page metadata")
+
+
+Example 
+^^^^^^^^^^^
+
+.. code:: json
+
+    [
+        {
+            "layout_dets": [
+                {
+                    "category_id": 2,
+                    "poly": [
+                        99.1906967163086,
+                        100.3119125366211,
+                        730.3707885742188,
+                        100.3119125366211,
+                        730.3707885742188,
+                        245.81326293945312,
+                        99.1906967163086,
+                        245.81326293945312
+                    ],
+                    "score": 0.9999997615814209
+                }
+            ],
+            "page_info": {
+                "page_no": 0,
+                "height": 2339,
+                "width": 1654
+            }
+        },
+        {
+            "layout_dets": [
+                {
+                    "category_id": 5,
+                    "poly": [
+                        99.13092803955078,
+                        2210.680419921875,
+                        497.3183898925781,
+                        2210.680419921875,
+                        497.3183898925781,
+                        2264.78076171875,
+                        99.13092803955078,
+                        2264.78076171875
+                    ],
+                    "score": 0.9999997019767761
+                }
+            ],
+            "page_info": {
+                "page_no": 1,
+                "height": 2339,
+                "width": 1654
+            }
+        }
+    ]
+
+The format of the poly coordinates is [x0, y0, x1, y1, x2, y2, x3, y3],
+representing the coordinates of the top-left, top-right, bottom-right,
+and bottom-left points respectively. |Poly Coordinate Diagram|
+
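+A quick sketch of turning one poly quad into an axis-aligned bbox, assuming the quadrilateral is not rotated:
+
+.. code:: python
+
+    def poly_to_bbox(poly: list[float]) -> list[float]:
+        """[x0, y0, x1, y1, x2, y2, x3, y3] -> [x_min, y_min, x_max, y_max]."""
+        xs, ys = poly[0::2], poly[1::2]
+        return [min(xs), min(ys), max(xs), max(ys)]
+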
+
+
+Creating an InferenceResult
+----------------------------
+
+
+.. code:: python
+
+    from magic_pdf.model.operators import InferenceResult
+    from magic_pdf.data.dataset import Dataset 
+    
+    dataset : Dataset = some_data_set    # not real dataset
+
+    # The inference results of all pages, ordered by page number, are stored in a list as the inference results of MinerU
+    model_inference_result: list[PageInferenceResults] = []
+
+    inference_result = InferenceResult(model_inference_result, dataset)
+
+
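+The draw/dump helpers mentioned above can then be applied; a hedged sketch, with illustrative writer and file names:
+
+.. code:: python
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
+
+    writer = FileBasedDataWriter('output')   # illustrative output directory
+
+    # persist the raw inference result as json
+    inference_result.dump_model(writer, 'model.json')
+
+    # draw the detected layout boxes onto a copy of the source pdf
+    inference_result.draw_model('output/model.pdf')
+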
+
+some_model.pdf
+^^^^^^^^^^^^^^^^^^^^
+
+.. figure:: ../_static/image/inference_result.png
+
+
+
+.. |Poly Coordinate Diagram| image:: ../_static/image/poly.png
+

+ 1 - 1
next_docs/en/user_guide/install.rst

@@ -8,5 +8,5 @@ Installation
    install/install
   install/boost_with_cuda
    install/download_model_weight_files
-
+   install/config
 

+ 0 - 18
next_docs/en/user_guide/install/boost_with_cuda.rst

@@ -9,25 +9,7 @@ appropriate guide based on your system:
 
 -  :ref:`ubuntu_22_04_lts_section`
 -  :ref:`windows_10_or_11_section`
--  Quick Deployment with Docker
 
-.. admonition:: Important
-   :class: tip
-
-   Docker requires a GPU with at least 16GB of VRAM, and all acceleration features are enabled by default.
-
-   Before running this Docker, you can use the following command to check if your device supports CUDA acceleration on Docker. 
-
-   .. code-block:: bash
-
-      bash  docker run --rm --gpus=all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
-
-.. code:: sh
-
-   wget https://github.com/opendatalab/MinerU/raw/master/Dockerfile
-   docker build -t mineru:latest .
-   docker run --rm -it --gpus=all mineru:latest /bin/bash
-   magic-pdf --help
 
 .. _ubuntu_22_04_lts_section:
 

+ 160 - 0
next_docs/en/user_guide/install/config.rst

@@ -0,0 +1,160 @@
+
+
+Config
+=========
+
+File **magic-pdf.json** is typically located in the **${HOME}** directory under a Linux system or in the **C:\Users\{username}** directory under a Windows system.
+
+
+magic-pdf.json
+----------------
+
+.. code:: json 
+
+    {
+        "bucket_info":{
+            "bucket-name-1":["ak", "sk", "endpoint"],
+            "bucket-name-2":["ak", "sk", "endpoint"]
+        },
+        "models-dir":"/tmp/models",
+        "layoutreader-model-dir":"/tmp/layoutreader",
+        "device-mode":"cpu",
+        "layout-config": {
+            "model": "layoutlmv3"
+        },
+        "formula-config": {
+            "mfd_model": "yolo_v8_mfd",
+            "mfr_model": "unimernet_small",
+            "enable": true
+        },
+        "table-config": {
+            "model": "rapid_table",
+            "enable": false,
+            "max_time": 400    
+        },
+        "config_version": "1.0.0"
+    }
+
+
+
+
+bucket_info
+^^^^^^^^^^^^^^
+Stores the access_key, secret_key, and endpoint of your AWS S3-compatible storage.
+
+Example: 
+
+.. code:: text
+
+        {
+            "image_bucket":[{access_key}, {secret_key}, {endpoint}],
+            "video_bucket":[{access_key}, {secret_key}, {endpoint}]
+        }
+
+
+models-dir
+^^^^^^^^^^^^
+
+Stores the models downloaded from **Hugging Face** or **ModelScope**. You do not need to modify this field if you downloaded the models using the scripts shipped with **MinerU**.
+
+
+layoutreader-model-dir
+^^^^^^^^^^^^^^^^^^^^^^^
+
+Stores the models downloaded from **Hugging Face** or **ModelScope**. You do not need to modify this field if you downloaded the models using the scripts shipped with **MinerU**.
+
+
+device-mode
+^^^^^^^^^^^^^^
+
+This field has two options: **cpu** or **cuda**.
+
+**cpu**: inference via cpu
+
+**cuda**: using cuda to accelerate inference
+
+
+layout-config 
+^^^^^^^^^^^^^^^
+
+.. code:: json
+
+    {
+        "model": "layoutlmv3"  
+    }
+
+The layout model cannot be disabled at present, and only one layout model is currently available.
+
+
+formula-config
+^^^^^^^^^^^^^^^^
+
+.. code:: json
+
+    {
+        "mfd_model": "yolo_v8_mfd",   
+        "mfr_model": "unimernet_small",
+        "enable": true 
+    }
+
+
+mfd_model
+""""""""""
+
+Specify the formula detection model, options are ['yolo_v8_mfd']
+
+
+mfr_model
+""""""""""
+Specify the formula recognition model, options are ['unimernet_small']
+
+Check `UniMERNet <https://github.com/opendatalab/UniMERNet>`_ for more details
+
+
+enable
+""""""""
+
+On-off flag, options are [true, false]. **true** enables formula inference; **false** disables it.
+
+
+table-config
+^^^^^^^^^^^^^^^^
+
+.. code:: json
+
+   {
+        "model": "rapid_table",
+        "enable": false,
+        "max_time": 400    
+    }
+
+model
+""""""""
+
+Specify the table inference model, options are ['rapid_table', 'tablemaster', 'struct_eqtable']
+
+
+max_time
+"""""""""
+
+Since table recognition is a time-consuming process, we set a timeout period. If the process exceeds this time, the table recognition will be terminated.
+
+
+
+enable
+"""""""
+
+On-off flag, options are [true, false]. **true** enables table inference; **false** disables it.
+
+
+config_version
+^^^^^^^^^^^^^^^^
+
+The version of the config schema.
+
+
+.. admonition:: Tip
+    :class: tip
+    
+    Check `Config Schema <https://github.com/opendatalab/MinerU/blob/master/magic-pdf.template.json>`_ for the latest details
+
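+A small sketch of locating and loading the config from the home directory, per the path rule at the top of this page:
+
+.. code:: python
+
+    import json
+    import os
+
+    config_path = os.path.join(os.path.expanduser('~'), 'magic-pdf.json')
+    with open(config_path, encoding='utf-8') as f:
+        config = json.load(f)
+
+    print(config.get('device-mode', 'cpu'))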

+ 32 - 3
next_docs/en/user_guide/install/install.rst

@@ -4,6 +4,7 @@ Install
 If you encounter any installation issues, please first consult the :doc:`../../additional_notes/faq`.
 If the parsing results are not as expected, refer to the :doc:`../../additional_notes/known_issues`.
 
+You can also try the `online demo <https://www.modelscope.cn/studios/OpenDataLab/MinerU>`_ without installing anything.
 
 .. admonition:: Warning
     :class: tip
@@ -88,7 +89,7 @@ If the parsing results are not as expected, refer to the :doc:`../../additional_
 
 
 Create an environment
-~~~~~~~~~~~~~~~~~~~~~
+---------------------------
 
 .. code-block:: shell
 
@@ -98,7 +99,7 @@ Create an environment
 
 
 Download model weight files
-~~~~~~~~~~~~~~~~~~~~~~~~~~
+------------------------------
 
 .. code-block:: shell
 
@@ -107,4 +108,32 @@ Download model weight files
     python download_models_hf.py    
 
 
-The MinerU is installed, Check out :doc:`../quick_start` or reading :doc:`boost_with_cuda` for accelerate inference
+
+Install LibreOffice [Optional]
+----------------------------------
+
+This section is required for handling **doc**, **docx**, **ppt**, and **pptx** file types. You can **skip** it if you do not need to process those file types.
+
+
+Linux/macOS Platform
+""""""""""""""""""""""
+
+.. code::
+
+    apt-get/yum/brew install libreoffice
+
+
+Windows Platform 
+""""""""""""""""""""
+
+.. code::
+
+    Install LibreOffice
+    Append "install_dir\LibreOffice\program" to the PATH environment variable
+
+
+.. tip::
+
+    MinerU is now installed. Check out :doc:`../usage/command_line` to convert your first PDF, **or** read the following sections for more installation details.
+
+

+ 335 - 0
next_docs/en/user_guide/pipe_result.rst

@@ -0,0 +1,335 @@
+
+
+Pipe Result 
+==============
+
+.. admonition:: Tip
+    :class: tip
+
+    Please first navigate to :doc:`tutorial/pipeline` to get an initial understanding of how the pipeline works; this will help in understanding the content of this section.
+
+
+The **PipeResult** class is a container for storing pipeline processing results and implements a series of related methods, such as draw_layout and draw_span.
+Check out :doc:`../api/pipe_operators` for more details about **PipeResult**.
+
+
+
+Structure Definitions
+-------------------------------
+
+**some_pdf_middle.json**
+
++----------------+--------------------------------------------------------------+
+| Field Name     | Description                                                  |
++================+==============================================================+
+| pdf_info       | list, each element is a dict representing the parsing result |
+|                | of each PDF page, see the table below for details            |
++----------------+--------------------------------------------------------------+
+| \_parse_type   | ocr \| txt, used to indicate the mode used in this           |
+|                | intermediate parsing state                                   |
++----------------+--------------------------------------------------------------+
+| \_version_name | string, indicates the version of magic-pdf used in this      |
+|                | parsing                                                      |
++----------------+--------------------------------------------------------------+
+
+**pdf_info**
+
+Field structure description
+
++-------------------------+------------------------------------------------------------+
+| Field Name              | Description                                                |
++=========================+============================================================+
+| preproc_blocks          | Intermediate result after PDF preprocessing, not yet       |
+|                         | segmented                                                  |
++-------------------------+------------------------------------------------------------+
+| layout_bboxes           | Layout segmentation results, containing layout direction   |
+|                         | (vertical, horizontal), and bbox, sorted by reading order  |
++-------------------------+------------------------------------------------------------+
+| page_idx                | Page number, starting from 0                               |
+|                         |                                                            |
++-------------------------+------------------------------------------------------------+
+| page_size               | Page width and height                                      |
+|                         |                                                            |
++-------------------------+------------------------------------------------------------+
+| \_layout_tree           | Layout tree structure                                      |
+|                         |                                                            |
++-------------------------+------------------------------------------------------------+
+| images                  | list, each element is a dict representing an img_block     |
++-------------------------+------------------------------------------------------------+
+| tables                  | list, each element is a dict representing a table_block    |
++-------------------------+------------------------------------------------------------+
+| interline_equation      | list, each element is a dict representing an               |
+|                         | interline_equation_block                                   |
+|                         |                                                            |
++-------------------------+------------------------------------------------------------+
+| discarded_blocks        | List, block information returned by the model that needs   |
+|                         | to be dropped                                              |
+|                         |                                                            |
++-------------------------+------------------------------------------------------------+
+| para_blocks             | Result after segmenting preproc_blocks                     |
+|                         |                                                            |
++-------------------------+------------------------------------------------------------+
+
+In the above table, ``para_blocks`` is an array of dicts, each dict
+representing a block structure. A block can support up to one level of
+nesting.
+
+**block**
+
+The outer block is referred to as a first-level block, and the fields in
+the first-level block include:
+
++------------------------+-------------------------------------------------------------+
+| Field Name             | Description                                                 |
++========================+=============================================================+
+| type                   | Block type (table|image)                                    |
++------------------------+-------------------------------------------------------------+
+| bbox                   | Block bounding box coordinates                              |
++------------------------+-------------------------------------------------------------+
+| blocks                 | list, each element is a dict representing a second-level    |
+|                        | block                                                       |
++------------------------+-------------------------------------------------------------+
+
+There are only two types of first-level blocks: “table” and “image”. All
+other blocks are second-level blocks.
+
+The fields in a second-level block include:
+
++----------------------+----------------------------------------------------------------+
+| Field Name           | Description                                                    |
++======================+================================================================+
+| type                 | Block type                                                     |
++----------------------+----------------------------------------------------------------+
+| bbox                 | Block bounding box coordinates                                 |
++----------------------+----------------------------------------------------------------+
+| lines                | list, each element is a dict representing a line, used to      |
+|                      | describe the composition of a line of information              |
++----------------------+----------------------------------------------------------------+
+
+Detailed explanation of second-level block types
+
+================== ======================
+type               Description
+================== ======================
+image_body         Main body of the image
+image_caption      Image description text
+table_body         Main body of the table
+table_caption      Table description text
+table_footnote     Table footnote
+text               Text block
+title              Title block
+interline_equation Block formula
+================== ======================
+
+**line**
+
+The field format of a line is as follows:
+
++---------------------+----------------------------------------------------------------+
+| Field Name          | Description                                                    |
++=====================+================================================================+
+| bbox                | Bounding box coordinates of the line                           |
++---------------------+----------------------------------------------------------------+
+| spans               | list, each element is a dict representing a span, used to      |
+|                     | describe the composition of the smallest unit                  |
++---------------------+----------------------------------------------------------------+
+
+**span**
+
++---------------------+-----------------------------------------------------------+
+| Field Name          | Description                                               |
++=====================+===========================================================+
+| bbox                | Bounding box coordinates of the span                      |
++---------------------+-----------------------------------------------------------+
+| type                | Type of the span                                          |
++---------------------+-----------------------------------------------------------+
+| content \| img_path | Text spans use content, chart spans use img_path to store |
+|                     | the actual text or screenshot path information            |
++---------------------+-----------------------------------------------------------+
+
+The types of spans are as follows:
+
+================== ==============
+type               Description
+================== ==============
+image              Image
+table              Table
+text               Text
+inline_equation    Inline formula
+interline_equation Block formula
+================== ==============
+
+**Summary**
+
+A span is the smallest storage unit for all elements.
+
+The elements stored within para_blocks are block information.
+
+The block structure is as follows:
+
+First-level block (if any) -> Second-level block -> Line -> Span
+
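+Given that nesting, a hedged sketch of collecting the text of one page from its para_blocks (field names follow the tables above; ``page`` is one element of ``pdf_info``):
+
+.. code:: python
+
+    def page_text(page: dict) -> str:
+        parts = []
+        for block in page['para_blocks']:
+            # first-level table/image blocks nest their real blocks one level down
+            subs = block['blocks'] if block['type'] in ('table', 'image') else [block]
+            for sub in subs:
+                for line in sub.get('lines', []):
+                    for span in line['spans']:
+                        if 'content' in span:
+                            parts.append(span['content'])
+        return ' '.join(parts)
+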
+.. _example-1:
+
+Example
+^^^^^^^
+
+.. code:: json
+
+   {
+       "pdf_info": [
+           {
+               "preproc_blocks": [
+                   {
+                       "type": "text",
+                       "bbox": [
+                           52,
+                           61.956024169921875,
+                           294,
+                           82.99800872802734
+                       ],
+                       "lines": [
+                           {
+                               "bbox": [
+                                   52,
+                                   61.956024169921875,
+                                   294,
+                                   72.0000228881836
+                               ],
+                               "spans": [
+                                   {
+                                       "bbox": [
+                                           54.0,
+                                           61.956024169921875,
+                                           296.2261657714844,
+                                           72.0000228881836
+                                       ],
+                                       "content": "dependent on the service headway and the reliability of the departure ",
+                                       "type": "text",
+                                       "score": 1.0
+                                   }
+                               ]
+                           }
+                       ]
+                   }
+               ],
+               "layout_bboxes": [
+                   {
+                       "layout_bbox": [
+                           52,
+                           61,
+                           294,
+                           731
+                       ],
+                       "layout_label": "V",
+                       "sub_layout": []
+                   }
+               ],
+               "page_idx": 0,
+               "page_size": [
+                   612.0,
+                   792.0
+               ],
+               "_layout_tree": [],
+               "images": [],
+               "tables": [],
+               "interline_equations": [],
+               "discarded_blocks": [],
+               "para_blocks": [
+                   {
+                       "type": "text",
+                       "bbox": [
+                           52,
+                           61.956024169921875,
+                           294,
+                           82.99800872802734
+                       ],
+                       "lines": [
+                           {
+                               "bbox": [
+                                   52,
+                                   61.956024169921875,
+                                   294,
+                                   72.0000228881836
+                               ],
+                               "spans": [
+                                   {
+                                       "bbox": [
+                                           54.0,
+                                           61.956024169921875,
+                                           296.2261657714844,
+                                           72.0000228881836
+                                       ],
+                                       "content": "dependent on the service headway and the reliability of the departure ",
+                                       "type": "text",
+                                       "score": 1.0
+                                   }
+                               ]
+                           }
+                       ]
+                   }
+               ]
+           }
+       ],
+       "_parse_type": "txt",
+       "_version_name": "0.6.1"
+   }
+
+
+Pipeline Result 
+------------------
+
+.. code:: python 
+
+    from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
+    from magic_pdf.pipe.operators import PipeResult
+    from magic_pdf.data.dataset import Dataset 
+
+    res = pdf_parse_union(*args, **kwargs)
+    res['_parse_type'] = PARSE_TYPE_OCR
+    res['_version_name'] = __version__
+    if 'lang' in kwargs and kwargs['lang'] is not None:
+        res['lang'] = kwargs['lang']
+
+    dataset : Dataset = some_dataset   # not real dataset
+    pipe_result = PipeResult(res, dataset)
+
+
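+The layout and span PDFs described below come from the draw helpers; a hedged sketch with illustrative output paths:
+
+.. code:: python
+
+    pipe_result.draw_layout('output/some_pdf_layout.pdf')
+    pipe_result.draw_span('output/some_pdf_spans.pdf')
+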
+
+some_pdf_layout.pdf
+~~~~~~~~~~~~~~~~~~~
+
+Each page layout consists of one or more boxes. The number at the top
+left of each box indicates its sequence number. Additionally, in
+``layout.pdf``, different content blocks are highlighted with different
+background colors.
+
+.. figure:: ../_static/image/layout_example.png
+   :alt: layout example
+
+   layout example
+
+some_pdf_spans.pdf
+~~~~~~~~~~~~~~~~~~
+
+All spans on the page are drawn with different colored line frames
+according to the span type. This file can be used for quality control,
+allowing for quick identification of issues such as missing text or
+unrecognized inline formulas.
+
+.. figure:: ../_static/image/spans_example.png
+   :alt: spans example
+
+   spans example

+ 7 - 5
next_docs/en/user_guide/quick_start.rst

@@ -2,12 +2,14 @@
 Quick Start 
 ==============
 
-Eager to get started? This page gives a good introduction to MinerU. Follow Installation to set up a project and install MinerU first.
-
+Want to learn how to use MinerU in different scenarios? This page gives examples covering multiple use cases to match your needs.
 
 .. toctree::
     :maxdepth: 1
 
-    quick_start/command_line
-    quick_start/to_markdown
-
+    quick_start/convert_pdf 
+    quick_start/convert_image
+    quick_start/convert_ppt
+    quick_start/convert_pptx
+    quick_start/convert_doc
+    quick_start/convert_docx

+ 56 - 0
next_docs/en/user_guide/quick_start/convert_doc.rst

@@ -0,0 +1,56 @@
+
+
+Convert Doc
+=============
+
+.. admonition:: Warning
+    :class: tip
+
+    When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
+    
+    For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
+
+
+
+Command Line
+^^^^^^^^^^^^^
+
+.. code:: bash
+
+    # make sure the file has the correct suffix
+    magic-pdf -p a.doc -o output -m auto
+
+
+API 
+^^^^^^^^
+.. code:: python 
+
+    import os
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+    from magic_pdf.data.read_api import read_local_office
+
+    # prepare env
+    local_image_dir, local_md_dir = "output/images", "output"
+    image_dir = str(os.path.basename(local_image_dir))
+
+    os.makedirs(local_image_dir, exist_ok=True)
+
+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
+        local_md_dir
+    )
+
+    # proc
+    ## Create Dataset Instance
+    input_file = "some_doc.doc"     # replace with real ms-office file
+    
+    input_file_name = input_file.split(".")[0]
+    ds = read_local_office(input_file)[0]
+
+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+        md_writer, f"{input_file_name}.md", image_dir
+    )
+
+
+

+ 53 - 0
next_docs/en/user_guide/quick_start/convert_docx.rst

@@ -0,0 +1,53 @@
+
+Convert DocX
+=============
+
+.. admonition:: Warning
+    :class: tip
+
+    When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
+    
+    For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
+
+
+Command Line
+^^^^^^^^^^^^^
+
+.. code:: bash
+
+    # make sure the file has the correct suffix
+    magic-pdf -p a.docx -o output -m auto
+
+
+API 
+^^^^^
+
+.. code:: python 
+
+    import os
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+    from magic_pdf.data.read_api import read_local_office
+
+    # prepare env
+    local_image_dir, local_md_dir = "output/images", "output"
+    image_dir = str(os.path.basename(local_image_dir))
+
+    os.makedirs(local_image_dir, exist_ok=True)
+
+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
+        local_md_dir
+    )
+
+    # proc
+    ## Create Dataset Instance
+    input_file = "some_docx.docx"     # replace with real ms-office file
+    
+    input_file_name = input_file.split(".")[0]
+    ds = read_local_office(input_file)[0]
+
+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+        md_writer, f"{input_file_name}.md", image_dir
+    )
+
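+Since the conversion relies on external software, it can fail at runtime (for
+example when the converter is missing). A defensive variant of the snippet
+above, assuming ``read_local_office`` propagates ``ConvertToPdfError`` from
+``magic_pdf.utils.office_to_pdf``:
+
+.. code:: python
+
+    from magic_pdf.data.read_api import read_local_office
+    from magic_pdf.utils.office_to_pdf import ConvertToPdfError
+
+    try:
+        ds = read_local_office("some_docx.docx")[0]  # replace with a real ms-office file
+    except ConvertToPdfError as e:
+        # the office-to-pdf conversion failed; the docx was not processed
+        print(f"conversion failed: {e}")
+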

+ 46 - 0
next_docs/en/user_guide/quick_start/convert_image.rst

@@ -0,0 +1,46 @@
+
+
+Convert Image
+===============
+
+
+Command Line
+^^^^^^^^^^^^^
+
+.. code:: bash
+
+    # make sure the file has the correct suffix
+    magic-pdf -p a.png -o output -m auto
+
+
+API 
+^^^^^^
+
+.. code:: python
+
+    import os
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+    from magic_pdf.data.read_api import read_local_images
+
+    # prepare env
+    local_image_dir, local_md_dir = "output/images", "output"
+    image_dir = str(os.path.basename(local_image_dir))
+
+    os.makedirs(local_image_dir, exist_ok=True)
+
+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
+        local_md_dir
+    )
+
+    # proc
+    ## Create Dataset Instance
+    input_file = "some_image.jpg"       # replace with real image file
+
+    input_file_name = input_file.split(".")[0]
+    ds = read_local_images(input_file)[0]
+
+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+        md_writer, f"{input_file_name}.md", image_dir
+    )
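+
+To process a whole directory of images instead of a single file, point
+``read_local_images`` at the directory; it returns one dataset per image. A
+minimal sketch (the directory name is a placeholder; note the suffixes include
+the leading dot):
+
+.. code:: python
+
+    import os
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+    from magic_pdf.data.read_api import read_local_images
+
+    local_image_dir, local_md_dir = "output/images", "output"
+    image_dir = str(os.path.basename(local_image_dir))
+    os.makedirs(local_image_dir, exist_ok=True)
+
+    image_writer = FileBasedDataWriter(local_image_dir)
+    md_writer = FileBasedDataWriter(local_md_dir)
+
+    # one dataset per image file under the directory
+    for idx, ds in enumerate(read_local_images("some_image_dir/", suffixes=[".png", ".jpg"])):
+        ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+            md_writer, f"{idx}.md", image_dir
+        )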

+ 49 - 0
next_docs/en/user_guide/quick_start/convert_pdf.rst

@@ -0,0 +1,49 @@
+
+
+Convert PDF 
+============
+
+Command Line
+^^^^^^^^^^^^^
+
+.. code:: bash
+
+    # make sure the file has the correct suffix
+    magic-pdf -p a.pdf -o output -m auto
+
+
+API
+^^^^^^
+.. code:: python
+
+    import os
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
+    from magic_pdf.data.dataset import PymuDocDataset
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+
+    # args
+    pdf_file_name = "abc.pdf"  # replace with the real pdf path
+    name_without_suff = os.path.splitext(pdf_file_name)[0]
+
+    # prepare env
+    local_image_dir, local_md_dir = "output/images", "output"
+    image_dir = str(os.path.basename(local_image_dir))
+
+    os.makedirs(local_image_dir, exist_ok=True)
+
+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
+        local_md_dir
+    )
+
+    # read bytes
+    reader1 = FileBasedDataReader("")
+    pdf_bytes = reader1.read(pdf_file_name)  # read the pdf content
+
+    # proc
+    ## Create Dataset Instance
+    ds = PymuDocDataset(pdf_bytes)
+
+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)
+
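+To convert every PDF under a directory, ``read_local_pdfs`` returns one dataset
+per file, so the same pipeline can run in a loop. A minimal sketch (``pdfs/``
+is a placeholder directory):
+
+.. code:: python
+
+    import os
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+    from magic_pdf.data.read_api import read_local_pdfs
+
+    local_image_dir, local_md_dir = "output/images", "output"
+    image_dir = str(os.path.basename(local_image_dir))
+    os.makedirs(local_image_dir, exist_ok=True)
+
+    image_writer = FileBasedDataWriter(local_image_dir)
+    md_writer = FileBasedDataWriter(local_md_dir)
+
+    # one dataset per pdf file under the directory
+    for idx, ds in enumerate(read_local_pdfs("pdfs/")):
+        ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+            md_writer, f"{idx}.md", image_dir
+        )
+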
+

+ 52 - 0
next_docs/en/user_guide/quick_start/convert_ppt.rst

@@ -0,0 +1,52 @@
+
+
+Convert PPT 
+============
+
+.. admonition:: Warning
+    :class: tip
+
+    When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
+    
+    For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
+
+Command Line
+^^^^^^^^^^^^^
+
+.. code:: bash
+
+    # make sure the file has the correct suffix
+    magic-pdf -p a.ppt -o output -m auto
+
+
+API 
+^^^^^
+
+.. code:: python 
+
+    import os
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+    from magic_pdf.data.read_api import read_local_office
+
+    # prepare env
+    local_image_dir, local_md_dir = "output/images", "output"
+    image_dir = str(os.path.basename(local_image_dir))
+
+    os.makedirs(local_image_dir, exist_ok=True)
+
+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
+        local_md_dir
+    )
+
+    # proc
+    ## Create Dataset Instance
+    input_file = "some_ppt.ppt"     # replace with real ms-office file
+    
+    input_file_name = input_file.split(".")[0]
+    ds = read_local_office(input_file)[0]
+
+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+        md_writer, f"{input_file_name}.md", image_dir
+    )
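+
+To handle a folder of presentations, loop over the directory yourself, since
+``read_local_office`` takes one file at a time. A minimal sketch, assuming
+every matching file converts cleanly (``some_ppt_dir/`` is a placeholder):
+
+.. code:: python
+
+    import os
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+    from magic_pdf.data.read_api import read_local_office
+
+    local_image_dir, local_md_dir = "output/images", "output"
+    image_dir = str(os.path.basename(local_image_dir))
+    os.makedirs(local_image_dir, exist_ok=True)
+
+    image_writer = FileBasedDataWriter(local_image_dir)
+    md_writer = FileBasedDataWriter(local_md_dir)
+
+    input_directory = "some_ppt_dir/"
+    for fname in os.listdir(input_directory):
+        if not fname.endswith((".ppt", ".pptx")):
+            continue
+        name_without_suff = os.path.splitext(fname)[0]
+        ds = read_local_office(os.path.join(input_directory, fname))[0]
+        ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+            md_writer, f"{name_without_suff}.md", image_dir
+        )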

+ 55 - 0
next_docs/en/user_guide/quick_start/convert_pptx.rst

@@ -0,0 +1,55 @@
+
+
+Convert PPTX
+=================
+
+.. admonition:: Warning
+    :class: tip
+
+    When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
+    
+    For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
+
+
+Command Line
+^^^^^^^^^^^^^
+
+.. code:: bash
+
+    # make sure the file has the correct suffix
+    magic-pdf -p a.pptx -o output -m auto
+
+
+
+
+API 
+^^^^^^
+
+.. code:: python 
+
+    import os
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+    from magic_pdf.data.read_api import read_local_office
+
+    # prepare env
+    local_image_dir, local_md_dir = "output/images", "output"
+    image_dir = str(os.path.basename(local_image_dir))
+
+    os.makedirs(local_image_dir, exist_ok=True)
+
+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
+        local_md_dir
+    )
+
+    # proc
+    ## Create Dataset Instance
+    input_file = "some_pptx.pptx"     # replace with real ms-office file
+    
+    input_file_name = input_file.split(".")[0]
+    ds = read_local_office(input_file)[0]
+
+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+        md_writer, f"{input_file_name}.md", image_dir
+    )
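+
+The writer argument of ``dump_md`` is interchangeable, so the generated
+markdown can be written straight to object storage instead of the local disk.
+A sketch using ``S3DataWriter`` (bucket, keys, endpoint and prefix are
+placeholders; the extracted images still go to the local image writer):
+
+.. code:: python
+
+    import os
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, S3DataWriter
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+    from magic_pdf.data.read_api import read_local_office
+
+    bucket = "bucket"               # replace with a valid bucket
+    ak = "ak"                       # replace with a valid access key
+    sk = "sk"                       # replace with a valid secret key
+    endpoint_url = "endpoint_url"   # replace with a valid endpoint url
+
+    local_image_dir = "output/images"
+    image_dir = str(os.path.basename(local_image_dir))
+    os.makedirs(local_image_dir, exist_ok=True)
+
+    image_writer = FileBasedDataWriter(local_image_dir)
+    md_writer = S3DataWriter("unittest/tmp", bucket, ak, sk, endpoint_url)
+
+    input_file = "some_pptx.pptx"   # replace with a real ms-office file
+    name_without_suff = os.path.splitext(input_file)[0]
+
+    ds = read_local_office(input_file)[0]
+    # the markdown lands at s3://{bucket}/unittest/tmp/{name}.md
+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+        md_writer, f"{name_without_suff}.md", image_dir
+    )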

+ 0 - 1
next_docs/en/user_guide/tutorial.rst

@@ -7,6 +7,5 @@ From the beginning to the end, Show how to using mineru via a minimal project
 .. toctree::
     :maxdepth: 1
 
-    tutorial/output_file_description
     tutorial/pipeline
 

+ 0 - 3
next_docs/en/user_guide/tutorial/pipeline.rst

@@ -28,7 +28,6 @@ Minimal Example
     image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
         local_md_dir
     )
-    image_dir = str(os.path.basename(local_image_dir))
 
     # read bytes
     reader1 = FileBasedDataReader("")
@@ -85,8 +84,6 @@ These stages are linked together through methods like ``apply``, ``doc_analyze``
 .. admonition:: Tip
     :class: tip
 
-    For more examples on how to use ``Dataset``, ``InferenceResult``, and ``PipeResult``, please refer to :doc:`../quick_start/to_markdown`
-
     For more detailed information about ``Dataset``, ``InferenceResult``, and ``PipeResult``, please refer to :doc:`../../api/dataset`, :doc:`../../api/model_operators`, :doc:`../../api/pipe_operators`
 
 

+ 12 - 0
next_docs/en/user_guide/usage.rst

@@ -0,0 +1,12 @@
+
+
+Usage
+========
+
+.. toctree::
+   :maxdepth: 1
+
+   usage/command_line
+   usage/api
+   usage/docker
+

+ 112 - 2
next_docs/en/user_guide/quick_start/to_markdown.rst → next_docs/en/user_guide/usage/api.rst

@@ -1,8 +1,10 @@
 
+API Usage
+===========
 
-Convert To Markdown
-========================
 
+PDF
+----
 
 Local File Example
 ^^^^^^^^^^^^^^^^^^
@@ -113,4 +115,112 @@ S3 File Example
     pipe_result.dump_md(writer, f'{name_without_suff}.md', "unittest/tmp/images")    # dump to remote s3
 
 
+
+MS-Office 
+----------
+
+.. code:: python 
+
+    import os
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+    from magic_pdf.data.read_api import read_local_office
+
+    # prepare env
+    local_image_dir, local_md_dir = "output/images", "output"
+    image_dir = str(os.path.basename(local_image_dir))
+
+    os.makedirs(local_image_dir, exist_ok=True)
+
+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
+        local_md_dir
+    )
+
+    # proc
+    ## Create Dataset Instance
+    input_file = "some_ppt.ppt"     # replace with real ms-office file
+
+    input_file_name = input_file.split(".")[0]
+    ds = read_local_office(input_file)[0]
+
+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+        md_writer, f"{input_file_name}.md", image_dir
+    )
+
+This code snippet can be used to process **ppt**, **pptx**, **doc**, and **docx** files.
+
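+Since each reader covers a different family of inputs, a tiny dispatch helper
+can unify them. ``load_datasets`` below is a hypothetical convenience function,
+not part of the library:
+
+.. code:: python
+
+    import os
+
+    from magic_pdf.data.read_api import (read_local_images, read_local_office,
+                                         read_local_pdfs)
+
+    def load_datasets(path: str):
+        """Pick the matching reader from the file suffix (hypothetical helper)."""
+        suffix = os.path.splitext(path)[1].lower()
+        if suffix == ".pdf":
+            return read_local_pdfs(path)
+        if suffix in (".png", ".jpg"):
+            return read_local_images(path)
+        if suffix in (".ppt", ".pptx", ".doc", ".docx"):
+            return read_local_office(path)
+        raise ValueError(f"unsupported suffix: {suffix}")
+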
+
+Image
+---------
+
+Single Image File 
+^^^^^^^^^^^^^^^^^^^
+
+.. code:: python
+
+    import os
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+    from magic_pdf.data.read_api import read_local_images
+
+    # prepare env
+    local_image_dir, local_md_dir = "output/images", "output"
+    image_dir = str(os.path.basename(local_image_dir))
+
+    os.makedirs(local_image_dir, exist_ok=True)
+
+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
+        local_md_dir
+    )
+
+    # proc
+    ## Create Dataset Instance
+    input_file = "some_image.jpg"       # replace with real image file
+
+    input_file_name = input_file.split(".")[0]
+    ds = read_local_images(input_file)[0]
+
+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+        md_writer, f"{input_file_name}.md", image_dir
+    )
+
+
+Directory That Contains Images 
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code:: python
+
+    import os
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+    from magic_pdf.data.read_api import read_local_images
+
+    # prepare env
+    local_image_dir, local_md_dir = "output/images", "output"
+    image_dir = str(os.path.basename(local_image_dir))
+
+    os.makedirs(local_image_dir, exist_ok=True)
+
+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
+        local_md_dir
+    )
+
+    # proc
+    ## Create Dataset Instance
+    input_directory = "some_image_dir/"       # replace with real directory that contains images
+
+
+    dss = read_local_images(input_directory, suffixes=['.png', '.jpg'])[0]  
+
+    count = 0
+    for ds in dss:
+        ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+            md_writer, f"{count}.md", image_dir
+        )
+        count += 1
+
+
 Check :doc:`../data/data_reader_writer` for more [reader | writer] examples and check :doc:`../../api/pipe_operators` or :doc:`../../api/model_operators` for api details

+ 18 - 3
next_docs/en/user_guide/quick_start/command_line.rst → next_docs/en/user_guide/usage/command_line.rst

@@ -10,7 +10,8 @@ Command Line
 
    Options:
      -v, --version                display the version and exit
-     -p, --path PATH              local pdf filepath or directory  [required]
+     -p, --path PATH              local filepath or directory. supports PDF, PPT,
+                                  PPTX, DOC, DOCX, PNG, JPG files  [required]
      -o, --output-dir PATH        output local directory  [required]
      -m, --method [ocr|txt|auto]  the method for parsing pdf. ocr: using ocr
                                   technique to extract information from pdf. txt:
@@ -40,6 +41,20 @@ Command Line
    ## command line example
    magic-pdf -p {some_pdf} -o {some_output_dir} -m auto
 
+
+.. admonition:: Important
+    :class: tip
+
+    The file must end with one of the following suffixes:
+       .pdf 
+       .png
+       .jpg
+       .ppt
+       .pptx
+       .doc
+       .docx
+
+
 ``{some_pdf}`` can be a single PDF file or a directory containing
 multiple PDFs. The results will be saved in the ``{some_output_dir}``
 directory. The output file list is as follows:
@@ -57,6 +72,6 @@ directory. The output file list is as follows:
 
 .. admonition:: Tip
    :class: tip
+   
 
-   For more information about the output files, please refer to the :doc:`../tutorial/output_file_description`
-
+   For more information about the output files, please refer to the :doc:`../inference_result` or :doc:`../pipe_result`

+ 24 - 0
next_docs/en/user_guide/usage/docker.rst

@@ -0,0 +1,24 @@
+
+
+Docker 
+=======
+
+.. admonition:: Important
+   :class: tip
+
+   Docker requires a GPU with at least 16GB of VRAM, and all acceleration features are enabled by default.
+
+   Before running this Docker image, you can use the following command to check whether your device supports CUDA acceleration in Docker.
+
+   .. code-block:: bash
+
+      docker run --rm --gpus=all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
+
+
+.. code:: sh
+
+   wget https://github.com/opendatalab/MinerU/raw/master/Dockerfile
+   docker build -t mineru:latest .
+   docker run --rm -it --gpus=all mineru:latest /bin/bash
+   magic-pdf --help
+

BIN
next_docs/zh_cn/_static/image/inference_result.png


+ 103 - 75
next_docs/zh_cn/user_guide/data/data_reader_writer.rst

@@ -73,118 +73,146 @@ S3DataReader 基于 MultiBucketS3DataReader 构建,但仅支持单个桶。S3D
 ---------
 .. code:: python
 
-    from magic_pdf.data.data_reader_writer import * 
+    import os 
+    from magic_pdf.data.data_reader_writer import *
+    from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
+    from magic_pdf.data.schemas import S3Config
 
-    # file-based readers
+    # initialize the reader
     file_based_reader1 = FileBasedDataReader('')
 
-    ## will read file abc
-    file_based_reader1.read('abc')
+    ## read local file abc
+    file_based_reader1.read('abc')
 
     file_based_reader2 = FileBasedDataReader('/tmp')
 
-    ## will read /tmp/abc
+    ## read local file /tmp/abc
     file_based_reader2.read('abc')
 
-    ## will read /var/logs/message.txt
-    file_based_reader2.read('/var/logs/message.txt')
+    ## read local file /tmp/logs/message.txt
+    file_based_reader2.read('/tmp/logs/message.txt')
+
+    # initialize the multi-bucket s3 reader
+    bucket = "bucket"               # replace with a valid bucket
+    ak = "ak"                       # replace with a valid access key
+    sk = "sk"                       # replace with a valid secret key
+    endpoint_url = "endpoint_url"   # replace with a valid endpoint_url
+
+    bucket_2 = "bucket_2"               # replace with a valid bucket
+    ak_2 = "ak_2"                       # replace with a valid access key
+    sk_2 = "sk_2"                       # replace with a valid secret key
+    endpoint_url_2 = "endpoint_url_2"   # replace with a valid endpoint_url
 
-    # multi-bucket S3 readers
-    multi_bucket_s3_reader1 = MultiBucketS3DataReader("test_bucket1/test_prefix", list[S3Config(
-            bucket_name=test_bucket1, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
+    test_prefix = 'test/unittest'
+    multi_bucket_s3_reader1 = MultiBucketS3DataReader(f"{bucket}/{test_prefix}", [S3Config(
+            bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
         ),
         S3Config(
-            bucket_name=test_bucket_2,
+            bucket_name=bucket_2,
             access_key=ak_2,
             secret_key=sk_2,
             endpoint_url=endpoint_url_2,
         )])
 
-    ## will read s3://test_bucket1/test_prefix/abc
+    ## read file s3://{bucket}/{test_prefix}/abc
     multi_bucket_s3_reader1.read('abc')
 
-    ## will read s3://test_bucket1/efg
-    multi_bucket_s3_reader1.read('s3://test_bucket1/efg')
+    ## read file s3://{bucket}/{test_prefix}/efg
+    multi_bucket_s3_reader1.read(f's3://{bucket}/{test_prefix}/efg')
 
-    ## will read s3://test_bucket2/abc
-    multi_bucket_s3_reader1.read('s3://test_bucket2/abc')
+    ## read file s3://{bucket_2}/{test_prefix}/abc
+    multi_bucket_s3_reader1.read(f's3://{bucket_2}/{test_prefix}/abc')
 
-    # S3 readers
+    # initialize the s3 reader
     s3_reader1 = S3DataReader(
-        default_prefix_without_bucket = "test_prefix",
-        bucket: "test_bucket",
-        ak: "ak",
-        sk: "sk",
-        endpoint_url: "localhost"
+        test_prefix,
+        bucket,
+        ak,
+        sk,
+        endpoint_url
     )
 
-    ## will read s3://test_bucket/test_prefix/abc
+    ## read file s3://{bucket}/{test_prefix}/abc
     s3_reader1.read('abc')
 
-    ## will read s3://test_bucket/efg
-    s3_reader1.read('s3://test_bucket/efg')
+    ## read file s3://{bucket}/efg
+    s3_reader1.read(f's3://{bucket}/efg')
+
 
 Write Example
 -------------
 .. code:: python
 
+    import os
     from magic_pdf.data.data_reader_writer import *
+    from magic_pdf.data.data_reader_writer import MultiBucketS3DataWriter
+    from magic_pdf.data.schemas import S3Config
+
+    # initialize the writer
+    file_based_writer1 = FileBasedDataWriter("")
+
+    ## write 123 to abc
+    file_based_writer1.write("abc", "123".encode())
+
+    ## write 123 to abc
+    file_based_writer1.write_string("abc", "123")
+
+    file_based_writer2 = FileBasedDataWriter("/tmp")
+
+    ## write 123 to /tmp/abc
+    file_based_writer2.write_string("abc", "123")
+
+    ## write 123 to /tmp/logs/message.txt
+    file_based_writer2.write_string("/tmp/logs/message.txt", "123")
+
+    # initialize the multi-bucket s3 writer
+    bucket = "bucket"               # replace with a valid bucket
+    ak = "ak"                       # replace with a valid access key
+    sk = "sk"                       # replace with a valid secret key
+    endpoint_url = "endpoint_url"   # replace with a valid endpoint_url
+
+    bucket_2 = "bucket_2"               # replace with a valid bucket
+    ak_2 = "ak_2"                       # replace with a valid access key
+    sk_2 = "sk_2"                       # replace with a valid secret key
+    endpoint_url_2 = "endpoint_url_2"   # replace with a valid endpoint_url
+
+    test_prefix = "test/unittest"
+    multi_bucket_s3_writer1 = MultiBucketS3DataWriter(
+        f"{bucket}/{test_prefix}",
+        [
+            S3Config(
+                bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
+            ),
+            S3Config(
+                bucket_name=bucket_2,
+                access_key=ak_2,
+                secret_key=sk_2,
+                endpoint_url=endpoint_url_2,
+            ),
+        ],
+    )
 
-    # file-based writers
-    file_based_writer1 = FileBasedDataWriter('')
-
-    ## will write 123 to abc
-    file_based_writer1.write('abc', '123'.encode())
-
-    ## will write 123 to abc
-    file_based_writer1.write_string('abc', '123')
-
-    file_based_writer2 = FileBasedDataWriter('/tmp')
-
-    ## will write 123 to /tmp/abc
-    file_based_writer2.write_string('abc', '123')
-
-    ## will write 123 to /var/logs/message.txt
-    file_based_writer2.write_string('/var/logs/message.txt', '123')
-
-    # multi-bucket S3 writers
-    multi_bucket_s3_writer1 = MultiBucketS3DataWriter("test_bucket1/test_prefix", list[S3Config(
-            bucket_name=test_bucket1, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
-        ),
-        S3Config(
-            bucket_name=test_bucket_2,
-            access_key=ak_2,
-            secret_key=sk_2,
-            endpoint_url=endpoint_url_2,
-        )])
-
-    ## will write 123 to s3://test_bucket1/test_prefix/abc
-    multi_bucket_s3_writer1.write_string('abc', '123')
+    ## write 123 to s3://{bucket}/{test_prefix}/abc
+    multi_bucket_s3_writer1.write_string("abc", "123")
 
-    ## will write 123 to s3://test_bucket1/test_prefix/abc
-    multi_bucket_s3_writer1.write('abc', '123'.encode())
+    ## write 123 to s3://{bucket}/{test_prefix}/abc
+    multi_bucket_s3_writer1.write("abc", "123".encode())
 
-    ## will write 123 to s3://test_bucket1/efg
-    multi_bucket_s3_writer1.write('s3://test_bucket1/efg', '123'.encode())
+    ## write 123 to s3://{bucket}/{test_prefix}/efg
+    multi_bucket_s3_writer1.write(f"s3://{bucket}/{test_prefix}/efg", "123".encode())
 
-    ## will write 123 to s3://test_bucket2/abc
-    multi_bucket_s3_writer1.write('s3://test_bucket2/abc', '123'.encode())
+    ## write 123 to s3://{bucket_2}/{test_prefix}/abc
+    multi_bucket_s3_writer1.write(f's3://{bucket_2}/{test_prefix}/abc', '123'.encode())
 
-    # S3 writers
-    s3_writer1 = S3DataWriter(
-        default_prefix_without_bucket = "test_prefix",
-        bucket: "test_bucket",
-        ak: "ak",
-        sk: "sk",
-        endpoint_url: "localhost"
-    )
+    # initialize the s3 writer
+    s3_writer1 = S3DataWriter(test_prefix, bucket, ak, sk, endpoint_url)
 
-    ## will write 123 to s3://test_bucket/test_prefix/abc
-    s3_writer1.write('abc', '123'.encode())
+    ## write 123 to s3://{bucket}/{test_prefix}/abc
+    s3_writer1.write("abc", "123".encode())
 
-    ## will write 123 to s3://test_bucket/test_prefix/abc
-    s3_writer1.write_string('abc', '123')
+    ## write 123 to s3://{bucket}/{test_prefix}/abc
+    s3_writer1.write_string("abc", "123")
 
-    ## will write 123 to s3://test_bucket/efg
-    s3_writer1.write('s3://test_bucket/efg', '123'.encode())
+    ## write 123 to s3://{bucket}/efg
+    s3_writer1.write(f"s3://{bucket}/efg", "123".encode())
 

+ 39 - 11
next_docs/zh_cn/user_guide/data/read_api.rst

@@ -15,13 +15,41 @@ read_jsonl
 
 .. code:: python
 
-    from magic_pdf.data.io.read_api import *
+    from magic_pdf.data.read_api import *
+    from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
+    from magic_pdf.data.schemas import S3Config
 
-    # read JSONL from the local machine
-    datasets = read_jsonl("tt.jsonl", None)
+    # read a local jsonl file
+    datasets = read_jsonl("tt.jsonl", None)   # replace with a valid file
+
+    # read a jsonl file on s3
+
+    bucket = "bucket_1"                     # replace with a valid s3 bucket
+    ak = "access_key_1"                     # replace with a valid s3 access key
+    sk = "secret_key_1"                     # replace with a valid s3 secret key
+    endpoint_url = "endpoint_url_1"         # replace with a valid s3 endpoint url
+
+    bucket_2 = "bucket_2"                   # replace with a valid s3 bucket
+    ak_2 = "access_key_2"                   # replace with a valid s3 access key
+    sk_2 = "secret_key_2"                   # replace with a valid s3 secret key
+    endpoint_url_2 = "endpoint_url_2"       # replace with a valid s3 endpoint url
+
+    s3configs = [
+        S3Config(
+            bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
+        ),
+        S3Config(
+            bucket_name=bucket_2,
+            access_key=ak_2,
+            secret_key=sk_2,
+            endpoint_url=endpoint_url_2,
+        ),
+    ]
+
+    s3_reader = MultiBucketS3DataReader(bucket, s3configs)
+
+    datasets = read_jsonl(f"s3://bucket_1/tt.jsonl", s3_reader)  # 替换为有效的 s3 jsonl file
 
-    # read JSONL from remote S3
-    datasets = read_jsonl("s3://bucket_1/tt.jsonl", s3_reader)
 
 read_local_pdfs
 ^^^^^^^^^^^^^^^^
@@ -30,13 +58,13 @@ read_local_pdfs
 
 .. code:: python
 
-    from magic_pdf.data.io.read_api import *
+    from magic_pdf.data.read_api import *
 
     # read from a pdf path
-    datasets = read_local_pdfs("tt.pdf")
+    datasets = read_local_pdfs("tt.pdf")  # replace with a valid file
 
     # read pdf files under a directory
-    datasets = read_local_pdfs("pdfs/")
+    datasets = read_local_pdfs("pdfs/")   # replace with a valid directory
 
 read_local_images
 ^^^^^^^^^^^^^^^^^^^
@@ -45,10 +73,10 @@ read_local_images
 
 .. code:: python
 
-    from magic_pdf.data.io.read_api import *
+    from magic_pdf.data.read_api import *
 
     # read from an image path
-    datasets = read_local_images("tt.png")
+    datasets = read_local_images("tt.png")  # replace with a valid file
 
     # read files under a directory that end with one of the given suffixes
-    datasets = read_local_images("images/", suffixes=["png", "jpg"])
+    datasets = read_local_images("images/", suffixes=[".png", ".jpg"])  # replace with a valid directory

+ 1 - 1
tests/unittest/test_data/test_read_api.py

@@ -19,7 +19,7 @@ def test_read_local_pdfs():
 
 
 def test_read_local_images():
-    datasets = read_local_images('tests/unittest/test_data/assets/pngs', suffixes=['png'])
+    datasets = read_local_images('tests/unittest/test_data/assets/pngs', suffixes=['.png'])
     assert len(datasets) == 2
     assert len(datasets[0]) == 1
     assert len(datasets[1]) == 1