
Merge pull request #1257 from icecraft/docs/refactor_en_docs

Docs/refactor en docs
Xiaomeng Zhao 11 months ago
parent
commit
bdacf29179
36 changed files with 1575 additions and 284 deletions
  1. 1 1
      magic_pdf/data/data_reader_writer/filebase.py
  2. 59 12
      magic_pdf/data/read_api.py
  3. 0 25
      magic_pdf/model/__init__.py
  4. 0 34
      magic_pdf/model/operators.py
  5. 36 11
      magic_pdf/tools/cli.py
  6. 10 3
      magic_pdf/tools/common.py
  7. 29 0
      magic_pdf/utils/office_to_pdf.py
  8. BIN
      next_docs/en/_static/image/inference_result.png
  9. 6 3
      next_docs/en/additional_notes/glossary.rst
  10. 6 0
      next_docs/en/index.rst
  11. 3 1
      next_docs/en/user_guide.rst
  12. 90 62
      next_docs/en/user_guide/data/data_reader_writer.rst
  13. 35 9
      next_docs/en/user_guide/data/read_api.rst
  14. 145 0
      next_docs/en/user_guide/inference_result.rst
  15. 1 1
      next_docs/en/user_guide/install.rst
  16. 0 18
      next_docs/en/user_guide/install/boost_with_cuda.rst
  17. 160 0
      next_docs/en/user_guide/install/config.rst
  18. 32 3
      next_docs/en/user_guide/install/install.rst
  19. 335 0
      next_docs/en/user_guide/pipe_result.rst
  20. 7 5
      next_docs/en/user_guide/quick_start.rst
  21. 56 0
      next_docs/en/user_guide/quick_start/convert_doc.rst
  22. 53 0
      next_docs/en/user_guide/quick_start/convert_docx.rst
  23. 46 0
      next_docs/en/user_guide/quick_start/convert_image.rst
  24. 49 0
      next_docs/en/user_guide/quick_start/convert_pdf.rst
  25. 52 0
      next_docs/en/user_guide/quick_start/convert_ppt.rst
  26. 55 0
      next_docs/en/user_guide/quick_start/convert_pptx.rst
  27. 0 1
      next_docs/en/user_guide/tutorial.rst
  28. 0 3
      next_docs/en/user_guide/tutorial/pipeline.rst
  29. 12 0
      next_docs/en/user_guide/usage.rst
  30. 112 2
      next_docs/en/user_guide/usage/api.rst
  31. 18 3
      next_docs/en/user_guide/usage/command_line.rst
  32. 24 0
      next_docs/en/user_guide/usage/docker.rst
  33. BIN
      next_docs/zh_cn/_static/image/inference_result.png
  34. 103 75
      next_docs/zh_cn/user_guide/data/data_reader_writer.rst
  35. 39 11
      next_docs/zh_cn/user_guide/data/read_api.rst
  36. 1 1
      tests/unittest/test_data/test_read_api.py

+ 1 - 1
magic_pdf/data/data_reader_writer/filebase.py

@@ -55,7 +55,7 @@ class FileBasedDataWriter(DataWriter):
         if not os.path.isabs(fn_path) and len(self._parent_dir) > 0:
             fn_path = os.path.join(self._parent_dir, path)
 
-        if not os.path.exists(os.path.dirname(fn_path)):
+        if not os.path.exists(os.path.dirname(fn_path)) and os.path.dirname(fn_path) != "":
             os.makedirs(os.path.dirname(fn_path), exist_ok=True)
 
         with open(fn_path, 'wb') as f:
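
A note on the guard: ``os.path.dirname`` returns an empty string for a bare filename, and ``os.makedirs('')`` raises ``FileNotFoundError``, which is the failure this hunk avoids. A minimal sketch of the case (paths are illustrative):

.. code:: python

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter

    # Writer rooted at the current working directory.
    writer = FileBasedDataWriter('')

    # 'abc' has no directory component, so os.path.dirname('abc') == ''.
    # Before this change, os.makedirs('') raised FileNotFoundError; the added
    # check skips directory creation and the write proceeds.
    writer.write('abc', b'123')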

+ 59 - 12
magic_pdf/data/read_api.py

@@ -1,12 +1,14 @@
 import json
 import os
+import tempfile
+import shutil
 from pathlib import Path
 
 from magic_pdf.config.exceptions import EmptyData, InvalidParams
 from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
                                                MultiBucketS3DataReader)
 from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
-
+from magic_pdf.utils.office_to_pdf import convert_file_to_pdf, ConvertToPdfError
 
 def read_jsonl(
     s3_path_or_local: str, s3_client: MultiBucketS3DataReader | None = None
@@ -58,23 +60,68 @@ def read_local_pdfs(path: str) -> list[PymuDocDataset]:
         list[PymuDocDataset]: each pdf file will be converted to a PymuDocDataset
     """
     if os.path.isdir(path):
-        reader = FileBasedDataReader(path)
-        return [
-            PymuDocDataset(reader.read(doc_path.name))
-            for doc_path in Path(path).glob('*.pdf')
-        ]
+        reader = FileBasedDataReader()
+        ret = []
+        for root, _, files in os.walk(path):
+            for file in files:
+                suffix = file.split('.')
+                if suffix[-1] == 'pdf':
+                    ret.append(PymuDocDataset(reader.read(os.path.join(root, file))))
+        return ret
     else:
         reader = FileBasedDataReader()
         bits = reader.read(path)
         return [PymuDocDataset(bits)]
 
+def read_local_office(path: str) -> list[PymuDocDataset]:
+    """Read ms-office file (ppt, pptx, doc, docx) from path or directory.
 
-def read_local_images(path: str, suffixes: list[str]) -> list[ImageDataset]:
+    Args:
+        path (str): ms-office file or directory that contains ms-office files
+
+    Returns:
+        list[PymuDocDataset]: each ms-office file will be converted to a PymuDocDataset
+
+    Raises:
+        ConvertToPdfError: failed to convert ms-office file to pdf via libreoffice
+        FileNotFoundError: file not found
+        Exception: unknown exception raised
+    """
+    suffixes = ['.ppt', '.pptx', '.doc', '.docx']
+    fns = []
+    ret = []
+    if os.path.isdir(path):
+        for root, _, files in os.walk(path):
+            for file in files:
+                suffix = Path(file).suffix
+                if suffix in suffixes:
+                    fns.append((os.path.join(root, file)))
+    else:
+        fns.append(path)
+        
+    reader = FileBasedDataReader()
+    temp_dir = tempfile.mkdtemp()
+    for fn in fns:
+        try:
+            convert_file_to_pdf(fn, temp_dir)
+        except ConvertToPdfError as e:
+            raise e
+        except FileNotFoundError as e:
+            raise e
+        except Exception as e:
+            raise e
+        fn_path = Path(fn)
+        pdf_fn = f"{temp_dir}/{fn_path.stem}.pdf"
+        ret.append(PymuDocDataset(reader.read(pdf_fn)))
+    shutil.rmtree(temp_dir)
+    return ret
+
+def read_local_images(path: str, suffixes: list[str]=['.png', '.jpg']) -> list[ImageDataset]:
     """Read images from path or directory.
 
     Args:
         path (str): image file path or directory that contains image files
-        suffixes (list[str]): the suffixes of the image files used to filter the files. Example: ['jpg', 'png']
+        suffixes (list[str]): the suffixes of the image files used to filter the files. Example: ['.jpg', '.png']
 
     Returns:
         list[ImageDataset]: each image file will be converted to an ImageDataset
@@ -82,12 +129,12 @@ def read_local_images(path: str, suffixes: list[str]) -> list[ImageDataset]:
     if os.path.isdir(path):
         imgs_bits = []
         s_suffixes = set(suffixes)
-        reader = FileBasedDataReader(path)
+        reader = FileBasedDataReader()
         for root, _, files in os.walk(path):
             for file in files:
-                suffix = file.split('.')
-                if suffix[-1] in s_suffixes:
-                    imgs_bits.append(reader.read(file))
+                suffix = Path(file).suffix
+                if suffix in s_suffixes:
+                    imgs_bits.append(reader.read(os.path.join(root, file)))
         return [ImageDataset(bits) for bits in imgs_bits]
     else:
         reader = FileBasedDataReader()
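
A short sketch of the updated readers in use; the directory and file names are placeholders, and ``read_local_office`` assumes LibreOffice is installed:

.. code:: python

    from magic_pdf.data.read_api import (read_local_images,
                                         read_local_office, read_local_pdfs)

    # now walks subdirectories, collecting every *.pdf under the path
    pdf_datasets = read_local_pdfs('pdfs/')

    # converts the office file to pdf via libreoffice before wrapping it
    office_datasets = read_local_office('docs/a.docx')

    # suffixes now carry the leading dot, matching Path(file).suffix
    image_datasets = read_local_images('images/', suffixes=['.png', '.jpg'])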

+ 0 - 25
magic_pdf/model/__init__.py

@@ -66,31 +66,6 @@ class InferenceResultBase(ABC):
         pass
 
     @abstractmethod
-    def pipe_auto_mode(
-        self,
-        imageWriter: DataWriter,
-        start_page_id=0,
-        end_page_id=None,
-        debug_mode=False,
-        lang=None,
-    ) -> PipeResult:
-        """Post-proc the model inference result.
-            step1: classify the dataset type
-            step2: based the result of step1, using `pipe_txt_mode` or `pipe_ocr_mode`
-
-        Args:
-            imageWriter (DataWriter): the image writer handle
-            start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process
-            end_page_id (int, optional):  Defaults to the last page index of dataset. Let user select some pages He/She want to process
-            debug_mode (bool, optional): Defaults to False. will dump more log if enabled
-            lang (str, optional): Defaults to None.
-
-        Returns:
-            PipeResult: the result
-        """
-        pass
-
-    @abstractmethod
     def pipe_txt_mode(
         self,
         imageWriter: DataWriter,

+ 0 - 34
magic_pdf/model/operators.py

@@ -71,40 +71,6 @@ class InferenceResult(InferenceResultBase):
         """
         return proc(copy.deepcopy(self._infer_res), *args, **kwargs)
 
-    def pipe_auto_mode(
-        self,
-        imageWriter: DataWriter,
-        start_page_id=0,
-        end_page_id=None,
-        debug_mode=False,
-        lang=None,
-    ) -> PipeResult:
-        """Post-proc the model inference result.
-            step1: classify the dataset type
-            step2: based the result of step1, using `pipe_txt_mode` or `pipe_ocr_mode`
-
-        Args:
-            imageWriter (DataWriter): the image writer handle
-            start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process
-            end_page_id (int, optional):  Defaults to the last page index of dataset. Let user select some pages He/She want to process
-            debug_mode (bool, optional): Defaults to False. will dump more log if enabled
-            lang (str, optional): Defaults to None.
-
-        Returns:
-            PipeResult: the result
-        """
-
-        pdf_proc_method = classify(self._dataset.data_bits())
-
-        if pdf_proc_method == SupportedPdfParseMethod.TXT:
-            return self.pipe_txt_mode(
-                imageWriter, start_page_id, end_page_id, debug_mode, lang
-            )
-        else:
-            return self.pipe_ocr_mode(
-                imageWriter, start_page_id, end_page_id, debug_mode, lang
-            )
-
     def pipe_txt_mode(
         self,
         imageWriter: DataWriter,

+ 36 - 11
magic_pdf/tools/cli.py

@@ -1,13 +1,20 @@
 import os
-from pathlib import Path
-
+import shutil
+import tempfile
 import click
+import fitz
 from loguru import logger
+from pathlib import Path
 
 import magic_pdf.model as model_config
 from magic_pdf.data.data_reader_writer import FileBasedDataReader
 from magic_pdf.libs.version import __version__
 from magic_pdf.tools.common import do_parse, parse_pdf_methods
+from magic_pdf.utils.office_to_pdf import convert_file_to_pdf
+
+pdf_suffixes = ['.pdf']
+ms_office_suffixes = ['.ppt', '.pptx', '.doc', '.docx']
+image_suffixes = ['.png', '.jpg']
 
 
 @click.command()
@@ -21,7 +28,7 @@ from magic_pdf.tools.common import do_parse, parse_pdf_methods
     'path',
     type=click.Path(exists=True),
     required=True,
-    help='local pdf filepath or directory',
+    help='local filepath or directory. support PDF, PPT, PPTX, DOC, DOCX, PNG, JPG files',
 )
 @click.option(
     '-o',
@@ -83,12 +90,27 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
     model_config.__use_inside_model__ = True
     model_config.__model_mode__ = 'full'
     os.makedirs(output_dir, exist_ok=True)
+    temp_dir = tempfile.mkdtemp()
+    def read_fn(path: Path):
+        if path.suffix in ms_office_suffixes:
+            convert_file_to_pdf(str(path), temp_dir)
+            fn = os.path.join(temp_dir, f"{path.stem}.pdf")
+        elif path.suffix in image_suffixes:
+            with open(str(path), 'rb') as f:
+                bits = f.read()
+            pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
+            fn = os.path.join(temp_dir, f"{path.stem}.pdf")
+            with open(fn, 'wb') as f:
+                f.write(pdf_bytes)
+        elif path.suffix in pdf_suffixes:
+            fn = str(path)
+        else:
+            raise Exception(f"Unknown file suffix: {path.suffix}")
+        
+        disk_rw = FileBasedDataReader(os.path.dirname(fn))
+        return disk_rw.read(os.path.basename(fn))
 
-    def read_fn(path):
-        disk_rw = FileBasedDataReader(os.path.dirname(path))
-        return disk_rw.read(os.path.basename(path))
-
-    def parse_doc(doc_path: str):
+    def parse_doc(doc_path: Path):
         try:
             file_name = str(Path(doc_path).stem)
             pdf_data = read_fn(doc_path)
@@ -108,10 +130,13 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
             logger.exception(e)
 
     if os.path.isdir(path):
-        for doc_path in Path(path).glob('*.pdf'):
-            parse_doc(doc_path)
+        for doc_path in Path(path).glob('*'):
+            if doc_path.suffix in pdf_suffixes + image_suffixes + ms_office_suffixes:
+                parse_doc(doc_path)
     else:
-        parse_doc(path)
+        parse_doc(Path(path))
+
+    shutil.rmtree(temp_dir)
 
 
 if __name__ == '__main__':

+ 10 - 3
magic_pdf/tools/common.py

@@ -170,6 +170,7 @@ def do_parse(
             logger.error('need model list input')
             exit(2)
     else:
+        
         infer_result = InferenceResult(model_list, ds)
         if parse_method == 'ocr':
             pipe_result = infer_result.pipe_ocr_mode(
@@ -180,9 +181,15 @@ def do_parse(
                 image_writer, debug_mode=True, lang=lang
             )
         else:
-            pipe_result = infer_result.pipe_auto_mode(
-                image_writer, debug_mode=True, lang=lang
-            )
+            if ds.classify() == SupportedPdfParseMethod.TXT:
+                pipe_result = infer_result.pipe_txt_mode(
+                        image_writer, debug_mode=True, lang=lang
+                    )
+            else:
+                pipe_result = infer_result.pipe_ocr_mode(
+                        image_writer, debug_mode=True, lang=lang
+                    )
+            
 
     if f_draw_model_bbox:
         infer_result.draw_model(

+ 29 - 0
magic_pdf/utils/office_to_pdf.py

@@ -0,0 +1,29 @@
+import os
+import subprocess
+from pathlib import Path
+
+
+class ConvertToPdfError(Exception):
+    def __init__(self, msg):
+        self.msg = msg
+        super().__init__(self.msg)
+
+
+def convert_file_to_pdf(input_path, output_dir):
+    if not os.path.isfile(input_path):
+        raise FileNotFoundError(f"The input file {input_path} does not exist.")
+
+    os.makedirs(output_dir, exist_ok=True)
+    
+    cmd = [
+        'soffice',
+        '--headless',
+        '--convert-to', 'pdf',
+        '--outdir', str(output_dir),
+        str(input_path)
+    ]
+    
+    process = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    
+    if process.returncode != 0:
+        raise ConvertToPdfError(process.stderr.decode())
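
A hedged usage sketch of the new helper; it shells out to ``soffice``, so LibreOffice must be on the PATH (file names are illustrative):

.. code:: python

    from magic_pdf.utils.office_to_pdf import (ConvertToPdfError,
                                               convert_file_to_pdf)

    try:
        # writes some_doc.pdf into ./pdfs (the directory is created if missing)
        convert_file_to_pdf('some_doc.docx', 'pdfs')
    except ConvertToPdfError as e:
        print(f'libreoffice conversion failed: {e.msg}')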

BIN
next_docs/en/_static/image/inference_result.png


+ 6 - 3
next_docs/en/additional_notes/glossary.rst

@@ -4,8 +4,11 @@ Glossary
 ===========
 
 1. jsonl 
-    TODO: add description
+    Newline-delimited (\n); each line must be a valid, self-contained JSON object.
+    Currently, all functions shipped with **MinerU** assume that each JSON object contains a field named either **path** or **file_location** (see the example after this list).
+
+
+2. magic-pdf.json 
+    The user-local configuration file of **MinerU**; see the Config section of the user guide for the full schema.
 
-2. magic-pdf.json
-    TODO: add description
 
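An illustrative pair of jsonl lines matching the description above (bucket and paths are placeholders):

.. code:: text

    {"path": "s3://bucket-1/files/a.pdf"}
    {"file_location": "/tmp/files/b.pdf"}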

+ 6 - 0
next_docs/en/index.rst

@@ -70,6 +70,12 @@ Key Features
 -  Supports both CPU and GPU environments.
 -  Compatible with Windows, Linux, and Mac platforms.
 
+
+.. tip::
+
+   Get started with MinerU by trying the `online demo <https://www.modelscope.cn/studios/OpenDataLab/MinerU>`_ or :doc:`installing it locally <user_guide/install/install>`.
+
+
 User Guide
 -------------
 .. toctree::

+ 3 - 1
next_docs/en/user_guide.rst

@@ -4,7 +4,9 @@
     :maxdepth: 2
 
     user_guide/install
+    user_guide/usage
     user_guide/quick_start
     user_guide/tutorial
     user_guide/data
-    
+    user_guide/inference_result
+    user_guide/pipe_result

+ 90 - 62
next_docs/en/user_guide/data/data_reader_writer.rst

@@ -87,56 +87,70 @@ Read Examples
 
 .. code:: python
 
+    import os 
     from magic_pdf.data.data_reader_writer import *
+    from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
+    from magic_pdf.data.schemas import S3Config
 
-    # file based related 
+    # file based related
     file_based_reader1 = FileBasedDataReader('')
 
-    ## will read file abc 
-    file_based_reader1.read('abc') 
+    ## will read file abc
+    file_based_reader1.read('abc')
 
     file_based_reader2 = FileBasedDataReader('/tmp')
 
     ## will read /tmp/abc
     file_based_reader2.read('abc')
 
-    ## will read /var/logs/message.txt
-    file_based_reader2.read('/var/logs/message.txt')
+    ## will read /tmp/logs/message.txt
+    file_based_reader2.read('/tmp/logs/message.txt')
 
     # multi bucket s3 releated
-    multi_bucket_s3_reader1 = MultiBucketS3DataReader("test_bucket1/test_prefix", list[S3Config(
-            bucket_name=test_bucket1, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
+    bucket = "bucket"               # replace with real bucket
+    ak = "ak"                       # replace with real access key
+    sk = "sk"                       # replace with real secret key
+    endpoint_url = "endpoint_url"   # replace with real endpoint_url
+
+    bucket_2 = "bucket_2"               # replace with real bucket
+    ak_2 = "ak_2"                       # replace with real access key
+    sk_2 = "sk_2"                       # replace with real secret key 
+    endpoint_url_2 = "endpoint_url_2"   # replace with real endpoint_url
+
+    test_prefix = 'test/unittest'
+    multi_bucket_s3_reader1 = MultiBucketS3DataReader(f"{bucket}/{test_prefix}", [S3Config(
+            bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
         ),
         S3Config(
-            bucket_name=test_bucket_2,
+            bucket_name=bucket_2,
             access_key=ak_2,
             secret_key=sk_2,
             endpoint_url=endpoint_url_2,
         )])
-    
-    ## will read s3://test_bucket1/test_prefix/abc
+
+    ## will read s3://{bucket}/{test_prefix}/abc
     multi_bucket_s3_reader1.read('abc')
 
-    ## will read s3://test_bucket1/efg
-    multi_bucket_s3_reader1.read('s3://test_bucket1/efg')
+    ## will read s3://{bucket}/{test_prefix}/efg
+    multi_bucket_s3_reader1.read(f's3://{bucket}/{test_prefix}/efg')
 
-    ## will read s3://test_bucket2/abc
-    multi_bucket_s3_reader1.read('s3://test_bucket2/abc')
+    ## will read s3://{bucket_2}/{test_prefix}/abc
+    multi_bucket_s3_reader1.read(f's3://{bucket_2}/{test_prefix}/abc')
 
     # s3 related
     s3_reader1 = S3DataReader(
-        default_prefix_without_bucket = "test_prefix"
-        bucket: "test_bucket",
-        ak: "ak",
-        sk: "sk",
-        endpoint_url: "localhost"
+        test_prefix,
+        bucket,
+        ak,
+        sk,
+        endpoint_url
     )
 
-    ## will read s3://test_bucket/test_prefix/abc 
+    ## will read s3://{bucket}/{test_prefix}/abc
     s3_reader1.read('abc')
-   
-    ## will read s3://test_bucket/efg
-    s3_reader1.read('s3://test_bucket/efg')
+
+    ## will read s3://{bucket}/efg
+    s3_reader1.read(f's3://{bucket}/efg')
 
 
 Write Examples
@@ -144,65 +158,79 @@ Write Examples
 
 .. code:: python
 
+    import os
     from magic_pdf.data.data_reader_writer import *
+    from magic_pdf.data.data_reader_writer import MultiBucketS3DataWriter
+    from magic_pdf.data.schemas import S3Config
 
-    # file based related 
-    file_based_writer1 = FileBasedDataWriter('')
+    # file based related
+    file_based_writer1 = FileBasedDataWriter("")
 
     ## will write 123 to abc
-    file_based_writer1.write('abc', '123'.encode()) 
+    file_based_writer1.write("abc", "123".encode())
 
     ## will write 123 to abc
-    file_based_writer1.write_string('abc', '123') 
+    file_based_writer1.write_string("abc", "123")
 
-    file_based_writer2 = FileBasedDataWriter('/tmp')
+    file_based_writer2 = FileBasedDataWriter("/tmp")
 
     ## will write 123 to /tmp/abc
-    file_based_writer2.write_string('abc', '123')
+    file_based_writer2.write_string("abc", "123")
 
-    ## will write 123 to /var/logs/message.txt
-    file_based_writer2.write_string('/var/logs/message.txt', '123')
+    ## will write 123 to /tmp/logs/message.txt
+    file_based_writer2.write_string("/tmp/logs/message.txt", "123")
 
     # multi bucket s3 releated
-    multi_bucket_s3_writer1 = MultiBucketS3DataWriter("test_bucket1/test_prefix", list[S3Config(
-            bucket_name=test_bucket1, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
-        ),
-        S3Config(
-            bucket_name=test_bucket_2,
-            access_key=ak_2,
-            secret_key=sk_2,
-            endpoint_url=endpoint_url_2,
-        )])
-    
-    ## will write 123 to s3://test_bucket1/test_prefix/abc
-    multi_bucket_s3_writer1.write_string('abc', '123')
+    bucket = "bucket"               # replace with real bucket
+    ak = "ak"                       # replace with real access key
+    sk = "sk"                       # replace with real secret key
+    endpoint_url = "endpoint_url"   # replace with real endpoint_url
+
+    bucket_2 = "bucket_2"               # replace with real bucket
+    ak_2 = "ak_2"                       # replace with real access key
+    sk_2 = "sk_2"                       # replace with real secret key 
+    endpoint_url_2 = "endpoint_url_2"   # replace with real endpoint_url
+
+    test_prefix = "test/unittest"
+    multi_bucket_s3_writer1 = MultiBucketS3DataWriter(
+        f"{bucket}/{test_prefix}",
+        [
+            S3Config(
+                bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
+            ),
+            S3Config(
+                bucket_name=bucket_2,
+                access_key=ak_2,
+                secret_key=sk_2,
+                endpoint_url=endpoint_url_2,
+            ),
+        ],
+    )
 
-    ## will write 123 to s3://test_bucket1/test_prefix/abc
-    multi_bucket_s3_writer1.write('abc', '123'.encode())
+    ## will write 123 to s3://{bucket}/{test_prefix}/abc
+    multi_bucket_s3_writer1.write_string("abc", "123")
 
-    ## will write 123 to s3://test_bucket1/efg
-    multi_bucket_s3_writer1.write('s3://test_bucket1/efg', '123'.encode())
+    ## will write 123 to s3://{bucket}/{test_prefix}/abc
+    multi_bucket_s3_writer1.write("abc", "123".encode())
 
-    ## will write 123 to s3://test_bucket2/abc
-    multi_bucket_s3_writer1.write('s3://test_bucket2/abc', '123'.encode())
+    ## will write 123 to s3://{bucket}/{test_prefix}/efg
+    multi_bucket_s3_writer1.write(f"s3://{bucket}/{test_prefix}/efg", "123".encode())
+
+    ## will write 123 to s3://{bucket_2}/{test_prefix}/abc
+    multi_bucket_s3_writer1.write(f's3://{bucket_2}/{test_prefix}/abc', '123'.encode())
 
     # s3 related
-    s3_writer1 = S3DataWriter(
-        default_prefix_without_bucket = "test_prefix"
-        bucket: "test_bucket",
-        ak: "ak",
-        sk: "sk",
-        endpoint_url: "localhost"
-    )
+    s3_writer1 = S3DataWriter(test_prefix, bucket, ak, sk, endpoint_url)
+
+    ## will write 123 to s3://{bucket}/{test_prefix}/abc
+    s3_writer1.write("abc", "123".encode())
 
-    ## will write 123 to s3://test_bucket/test_prefix/abc 
-    s3_writer1.write('abc', '123'.encode())
+    ## will write 123 to s3://{bucket}/{test_prefix}/abc
+    s3_writer1.write_string("abc", "123")
 
-    ## will write 123 to s3://test_bucket/test_prefix/abc 
-    s3_writer1.write_string('abc', '123')
+    ## will write 123 to s3://{bucket}/efg
+    s3_writer1.write(f"s3://{bucket}/efg", "123".encode())
 
-    ## will write 123 to s3://test_bucket/efg
-    s3_writer1.write('s3://test_bucket/efg', '123'.encode())
 
 
 Check :doc:`../../api/data_reader_writer` for more details

+ 35 - 9
next_docs/en/user_guide/data/read_api.rst

@@ -18,24 +18,50 @@ Read the content from jsonl which may be located on the local machine or remote s3. if y
 
 .. code:: python
 
-    from magic_pdf.data.io.read_api import *
+    from magic_pdf.data.read_api import *
+    from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
+    from magic_pdf.data.schemas import S3Config
 
-    # read jsonl from local machine 
-    datasets = read_jsonl("tt.jsonl", None)
+    # read jsonl from local machine
+    datasets = read_jsonl("tt.jsonl", None)   # replace with real jsonl file
 
     # read jsonl from remote s3
-    datasets = read_jsonl("s3://bucket_1/tt.jsonl", s3_reader)
 
+    bucket = "bucket_1"                     # replace with real s3 bucket
+    ak = "access_key_1"                     # replace with real s3 access key
+    sk = "secret_key_1"                     # replace with real s3 secret key
+    endpoint_url = "endpoint_url_1"         # replace with real s3 endpoint url
+
+    bucket_2 = "bucket_2"                   # replace with real s3 bucket
+    ak_2 = "access_key_2"                   # replace with real s3 access key
+    sk_2 = "secret_key_2"                   # replace with real s3 secret key
+    endpoint_url_2 = "endpoint_url_2"       # replace with real s3 endpoint url
+
+    s3configs = [
+        S3Config(
+            bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
+        ),
+        S3Config(
+            bucket_name=bucket_2,
+            access_key=ak_2,
+            secret_key=sk_2,
+            endpoint_url=endpoint_url_2,
+        ),
+    ]
+
+    s3_reader = MultiBucketS3DataReader(bucket, s3configs)
+
+    datasets = read_jsonl(f"s3://{bucket}/tt.jsonl", s3_reader)  # replace with real s3 jsonl file
 
 read_local_pdfs
-^^^^^^^^^^^^^^^^
+^^^^^^^^^^^^^^^^^
 
 Read pdf from path or directory.
 
 
 .. code:: python
 
-    from magic_pdf.data.io.read_api import *
+    from magic_pdf.data.read_api import *
 
     # read pdf path
     datasets = read_local_pdfs("tt.pdf")
@@ -51,13 +77,13 @@ Read images from path or directory
 
 .. code:: python 
 
-    from magic_pdf.data.io.read_api import *
+    from magic_pdf.data.read_api import *
 
     # read from image path 
-    datasets = read_local_images("tt.png")
+    datasets = read_local_images("tt.png")  # replace with real file path
 
     # read files from directory that endswith suffix in suffixes array 
-    datasets = read_local_images("images/", suffixes=["png", "jpg"])
+    datasets = read_local_images("images/", suffixes=[".png", ".jpg"])  # replace with real directory
 
 
 Check :doc:`../../api/read_api` for more details

+ 145 - 0
next_docs/en/user_guide/inference_result.rst

@@ -0,0 +1,145 @@
+
+Inference Result 
+==================
+
+.. admonition:: Tip
+    :class: tip
+
+    Please first navigate to :doc:`tutorial/pipeline` to get an initial understanding of how the pipeline works; this will help in understanding the content of this section.
+
+The **InferenceResult** class is a container for storing model inference results and implements a series of related methods, such as draw_model and dump_model.
+Check out :doc:`../api/model_operators` for more details about **InferenceResult**.
+
+
+Model Inference Result
+-----------------------
+
+Structure Definition
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code:: python
+
+    from pydantic import BaseModel, Field
+    from enum import IntEnum
+
+    class CategoryType(IntEnum):
+        title = 0               # Title
+        plain_text = 1          # Text
+        abandon = 2             # Includes headers, footers, page numbers, and page annotations
+        figure = 3              # Image
+        figure_caption = 4      # Image description
+        table = 5               # Table
+        table_caption = 6       # Table description
+        table_footnote = 7      # Table footnote
+        isolate_formula = 8     # Block formula
+        formula_caption = 9     # Formula label
+
+        embedding = 13          # Inline formula
+        isolated = 14           # Block formula
+        text = 15               # OCR recognition result
+
+
+    class PageInfo(BaseModel):
+        page_no: int = Field(description="Page number, the first page is 0", ge=0)
+        height: int = Field(description="Page height", gt=0)
+        width: int = Field(description="Page width", ge=0)
+
+    class ObjectInferenceResult(BaseModel):
+        category_id: CategoryType = Field(description="Category", ge=0)
+        poly: list[float] = Field(description="Quadrilateral coordinates, representing the coordinates of the top-left, top-right, bottom-right, and bottom-left points respectively")
+        score: float = Field(description="Confidence of the inference result")
+        latex: str | None = Field(description="LaTeX parsing result", default=None)
+        html: str | None = Field(description="HTML parsing result", default=None)
+
+    class PageInferenceResults(BaseModel):
+        layout_dets: list[ObjectInferenceResult] = Field(description="Page recognition results")
+        page_info: PageInfo = Field(description="Page metadata")
+
+
+Example 
+^^^^^^^^^^^
+
+.. code:: json
+
+    [
+        {
+            "layout_dets": [
+                {
+                    "category_id": 2,
+                    "poly": [
+                        99.1906967163086,
+                        100.3119125366211,
+                        730.3707885742188,
+                        100.3119125366211,
+                        730.3707885742188,
+                        245.81326293945312,
+                        99.1906967163086,
+                        245.81326293945312
+                    ],
+                    "score": 0.9999997615814209
+                }
+            ],
+            "page_info": {
+                "page_no": 0,
+                "height": 2339,
+                "width": 1654
+            }
+        },
+        {
+            "layout_dets": [
+                {
+                    "category_id": 5,
+                    "poly": [
+                        99.13092803955078,
+                        2210.680419921875,
+                        497.3183898925781,
+                        2210.680419921875,
+                        497.3183898925781,
+                        2264.78076171875,
+                        99.13092803955078,
+                        2264.78076171875
+                    ],
+                    "score": 0.9999997019767761
+                }
+            ],
+            "page_info": {
+                "page_no": 1,
+                "height": 2339,
+                "width": 1654
+            }
+        }
+    ]
+
+The format of the poly coordinates is [x0, y0, x1, y1, x2, y2, x3, y3],
+representing the coordinates of the top-left, top-right, bottom-right,
+and bottom-left points respectively. |Poly Coordinate Diagram|
+
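+A quick sketch of turning one poly quad into an axis-aligned bbox, assuming the quadrilateral is not rotated:
+
+.. code:: python
+
+    def poly_to_bbox(poly: list[float]) -> list[float]:
+        """[x0, y0, x1, y1, x2, y2, x3, y3] -> [x_min, y_min, x_max, y_max]."""
+        xs, ys = poly[0::2], poly[1::2]
+        return [min(xs), min(ys), max(xs), max(ys)]
+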
+
+
+Creating an InferenceResult
+----------------------------
+
+
+.. code:: python
+
+    from magic_pdf.model.operators import InferenceResult
+    from magic_pdf.data.dataset import Dataset 
+    
+    dataset : Dataset = some_data_set    # not real dataset
+
+    # The inference results of all pages, ordered by page number, are stored in a list as the inference results of MinerU
+    model_inference_result: list[PageInferenceResults] = []
+
+    inference_result = InferenceResult(model_inference_result, dataset)
+
+
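+The draw/dump helpers mentioned above can then be applied; a hedged sketch, with illustrative writer and file names:
+
+.. code:: python
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
+
+    writer = FileBasedDataWriter('output')   # illustrative output directory
+
+    # persist the raw inference result as json
+    inference_result.dump_model(writer, 'model.json')
+
+    # draw the detected layout boxes onto a copy of the source pdf
+    inference_result.draw_model('output/model.pdf')
+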
+
+some_model.pdf
+^^^^^^^^^^^^^^^^^^^^
+
+.. figure:: ../_static/image/inference_result.png
+
+
+
+.. |Poly Coordinate Diagram| image:: ../_static/image/poly.png
+

+ 1 - 1
next_docs/en/user_guide/install.rst

@@ -8,5 +8,5 @@ Installation
    install/install
   install/boost_with_cuda
    install/download_model_weight_files
-
+   install/config
 

+ 0 - 18
next_docs/en/user_guide/install/boost_with_cuda.rst

@@ -9,25 +9,7 @@ appropriate guide based on your system:
 
 -  :ref:`ubuntu_22_04_lts_section`
 -  :ref:`windows_10_or_11_section`
--  Quick Deployment with Docker
 
-.. admonition:: Important
-   :class: tip
-
-   Docker requires a GPU with at least 16GB of VRAM, and all acceleration features are enabled by default.
-
-   Before running this Docker, you can use the following command to check if your device supports CUDA acceleration on Docker. 
-
-   .. code-block:: bash
-
-      bash  docker run --rm --gpus=all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
-
-.. code:: sh
-
-   wget https://github.com/opendatalab/MinerU/raw/master/Dockerfile
-   docker build -t mineru:latest .
-   docker run --rm -it --gpus=all mineru:latest /bin/bash
-   magic-pdf --help
 
 .. _ubuntu_22_04_lts_section:
 

+ 160 - 0
next_docs/en/user_guide/install/config.rst

@@ -0,0 +1,160 @@
+
+
+Config
+=========
+
+File **magic-pdf.json** is typically located in the **${HOME}** directory under a Linux system or in the **C:\Users\{username}** directory under a Windows system.
+
+
+magic-pdf.json
+----------------
+
+.. code:: json 
+
+    {
+        "bucket_info":{
+            "bucket-name-1":["ak", "sk", "endpoint"],
+            "bucket-name-2":["ak", "sk", "endpoint"]
+        },
+        "models-dir":"/tmp/models",
+        "layoutreader-model-dir":"/tmp/layoutreader",
+        "device-mode":"cpu",
+        "layout-config": {
+            "model": "layoutlmv3"
+        },
+        "formula-config": {
+            "mfd_model": "yolo_v8_mfd",
+            "mfr_model": "unimernet_small",
+            "enable": true
+        },
+        "table-config": {
+            "model": "rapid_table",
+            "enable": false,
+            "max_time": 400    
+        },
+        "config_version": "1.0.0"
+    }
+
+
+
+
+bucket_info
+^^^^^^^^^^^^^^
+Stores the access_key, secret_key, and endpoint of your AWS S3-compatible storage.
+
+Example: 
+
+.. code:: text
+
+        {
+            "image_bucket":[{access_key}, {secret_key}, {endpoint}],
+            "video_bucket":[{access_key}, {secret_key}, {endpoint}]
+        }
+
+
+models-dir
+^^^^^^^^^^^^
+
+Stores the models downloaded from **Hugging Face** or **ModelScope**. You do not need to modify this field if you downloaded the models using the scripts shipped with **MinerU**.
+
+
+layoutreader-model-dir
+^^^^^^^^^^^^^^^^^^^^^^^
+
+Stores the models downloaded from **Hugging Face** or **ModelScope**. You do not need to modify this field if you downloaded the models using the scripts shipped with **MinerU**.
+
+
+device-mode
+^^^^^^^^^^^^^^
+
+This field has two options: **cpu** or **cuda**.
+
+**cpu**: inference via cpu
+
+**cuda**: using cuda to accelerate inference
+
+
+layout-config 
+^^^^^^^^^^^^^^^
+
+.. code:: json
+
+    {
+        "model": "layoutlmv3"  
+    }
+
+The layout model cannot be disabled at present, and only one layout model is currently available.
+
+
+formula-config
+^^^^^^^^^^^^^^^^
+
+.. code:: json
+
+    {
+        "mfd_model": "yolo_v8_mfd",   
+        "mfr_model": "unimernet_small",
+        "enable": true 
+    }
+
+
+mfd_model
+""""""""""
+
+Specify the formula detection model, options are ['yolo_v8_mfd']
+
+
+mfr_model
+""""""""""
+Specify the formula recognition model, options are ['unimernet_small']
+
+Check `UniMERNet <https://github.com/opendatalab/UniMERNet>`_ for more details
+
+
+enable
+""""""""
+
+On-off flag, options are [true, false]. **true** enables formula inference; **false** disables it.
+
+
+table-config
+^^^^^^^^^^^^^^^^
+
+.. code:: json
+
+   {
+        "model": "rapid_table",
+        "enable": false,
+        "max_time": 400    
+    }
+
+model
+""""""""
+
+Specify the table inference model, options are ['rapid_table', 'tablemaster', 'struct_eqtable']
+
+
+max_time
+"""""""""
+
+Since table recognition is a time-consuming process, we set a timeout period. If the process exceeds this time, the table recognition will be terminated.
+
+
+
+enable
+"""""""
+
+On-off flag, options are [true, false]. **true** enables table inference; **false** disables it.
+
+
+config_version
+^^^^^^^^^^^^^^^^
+
+The version of the config schema.
+
+
+.. admonition:: Tip
+    :class: tip
+    
+    Check `Config Schema <https://github.com/opendatalab/MinerU/blob/master/magic-pdf.template.json>`_ for the latest details
+
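+A small sketch of locating and loading the config from the home directory, per the path rule at the top of this page:
+
+.. code:: python
+
+    import json
+    import os
+
+    config_path = os.path.join(os.path.expanduser('~'), 'magic-pdf.json')
+    with open(config_path, encoding='utf-8') as f:
+        config = json.load(f)
+
+    print(config.get('device-mode', 'cpu'))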

+ 32 - 3
next_docs/en/user_guide/install/install.rst

@@ -4,6 +4,7 @@ Install
 If you encounter any installation issues, please first consult the :doc:`../../additional_notes/faq`.
 If the parsing results are not as expected, refer to the :doc:`../../additional_notes/known_issues`.
 
+You can also try the `online demo <https://www.modelscope.cn/studios/OpenDataLab/MinerU>`_ without installing anything.
 
 .. admonition:: Warning
     :class: tip
@@ -88,7 +89,7 @@ If the parsing results are not as expected, refer to the :doc:`../../additional_
 
 
 Create an environment
-~~~~~~~~~~~~~~~~~~~~~
+---------------------------
 
 .. code-block:: shell
 
@@ -98,7 +99,7 @@ Create an environment
 
 
 Download model weight files
-~~~~~~~~~~~~~~~~~~~~~~~~~~
+------------------------------
 
 .. code-block:: shell
 
@@ -107,4 +108,32 @@ Download model weight files
     python download_models_hf.py    
 
 
-The MinerU is installed, Check out :doc:`../quick_start` or reading :doc:`boost_with_cuda` for accelerate inference
+
+Install LibreOffice [Optional]
+----------------------------------
+
+This section is required for handling **doc**, **docx**, **ppt**, and **pptx** file types. You can **skip** it if you do not need to process those file types.
+
+
+Linux/macOS Platform
+""""""""""""""""""""""
+
+.. code::
+
+    apt-get/yum/brew install libreoffice
+
+
+Windows Platform 
+""""""""""""""""""""
+
+.. code::
+
+    Install LibreOffice
+    Append "install_dir\LibreOffice\program" to the PATH environment variable
+
+
+.. tip::
+
+    MinerU is now installed. Check out :doc:`../usage/command_line` to convert your first PDF, **or** read the following sections for more installation details.
+
+

+ 335 - 0
next_docs/en/user_guide/pipe_result.rst

@@ -0,0 +1,335 @@
+
+
+Pipe Result 
+==============
+
+.. admonition:: Tip
+    :class: tip
+
+    Please first navigate to :doc:`tutorial/pipeline` to get an initial understanding of how the pipeline works; this will help in understanding the content of this section.
+
+
+The **PipeResult** class is a container for storing pipeline processing results and implements a series of related methods, such as draw_layout and draw_span.
+Check out :doc:`../api/pipe_operators` for more details about **PipeResult**.
+
+
+
+Structure Definitions
+-------------------------------
+
+**some_pdf_middle.json**
+
++----------------+--------------------------------------------------------------+
+| Field Name     | Description                                                  |
++================+==============================================================+
+| pdf_info       | list, each element is a dict representing the parsing result |
+|                | of each PDF page, see the table below for details            |
++----------------+--------------------------------------------------------------+
+| \_parse_type   | ocr \| txt, used to indicate the mode used in this           |
+|                | intermediate parsing state                                   |
++----------------+--------------------------------------------------------------+
+| \_version_name | string, indicates the version of magic-pdf used in this      |
+|                | parsing                                                      |
++----------------+--------------------------------------------------------------+
+
+**pdf_info**
+
+Field structure description
+
++-------------------------+------------------------------------------------------------+
+| Field Name              | Description                                                |
++=========================+============================================================+
+| preproc_blocks          | Intermediate result after PDF preprocessing, not yet       |
+|                         | segmented                                                  |
++-------------------------+------------------------------------------------------------+
+| layout_bboxes           | Layout segmentation results, containing layout direction   |
+|                         | (vertical, horizontal), and bbox, sorted by reading order  |
++-------------------------+------------------------------------------------------------+
+| page_idx                | Page number, starting from 0                               |
+|                         |                                                            |
++-------------------------+------------------------------------------------------------+
+| page_size               | Page width and height                                      |
+|                         |                                                            |
++-------------------------+------------------------------------------------------------+
+| \_layout_tree           | Layout tree structure                                      |
+|                         |                                                            |
++-------------------------+------------------------------------------------------------+
+| images                  | list, each element is a dict representing an img_block     |
++-------------------------+------------------------------------------------------------+
+| tables                  | list, each element is a dict representing a table_block    |
++-------------------------+------------------------------------------------------------+
+| interline_equation      | list, each element is a dict representing an               |
+|                         | interline_equation_block                                   |
+|                         |                                                            |
++-------------------------+------------------------------------------------------------+
+| discarded_blocks        | List, block information returned by the model that needs   |
+|                         | to be dropped                                              |
+|                         |                                                            |
++-------------------------+------------------------------------------------------------+
+| para_blocks             | Result after segmenting preproc_blocks                     |
+|                         |                                                            |
++-------------------------+------------------------------------------------------------+
+
+In the above table, ``para_blocks`` is an array of dicts, each dict
+representing a block structure. A block can support up to one level of
+nesting.
+
+**block**
+
+The outer block is referred to as a first-level block, and the fields in
+the first-level block include:
+
++------------------------+-------------------------------------------------------------+
+| Field Name             | Description                                                 |
++========================+=============================================================+
+| type                   | Block type (table|image)                                    |
++------------------------+-------------------------------------------------------------+
+| bbox                   | Block bounding box coordinates                              |
++------------------------+-------------------------------------------------------------+
+| blocks                 | list, each element is a dict representing a second-level    |
+|                        | block                                                       |
++------------------------+-------------------------------------------------------------+
+
+There are only two types of first-level blocks: “table” and “image”. All
+other blocks are second-level blocks.
+
+The fields in a second-level block include:
+
++----------------------+----------------------------------------------------------------+
+| Field Name           | Description                                                    |
++======================+================================================================+
+| type                 | Block type                                                     |
++----------------------+----------------------------------------------------------------+
+| bbox                 | Block bounding box coordinates                                 |
++----------------------+----------------------------------------------------------------+
+| lines                | list, each element is a dict representing a line, used to      |
+|                      | describe the composition of a line of information              |
++----------------------+----------------------------------------------------------------+
+
+Detailed explanation of second-level block types
+
+================== ======================
+type               Description
+================== ======================
+image_body         Main body of the image
+image_caption      Image description text
+table_body         Main body of the table
+table_caption      Table description text
+table_footnote     Table footnote
+text               Text block
+title              Title block
+interline_equation Block formula
+================== ======================
+
+**line**
+
+The field format of a line is as follows:
+
++---------------------+----------------------------------------------------------------+
+| Field Name          | Description                                                    |
++=====================+================================================================+
+| bbox                | Bounding box coordinates of the line                           |
++---------------------+----------------------------------------------------------------+
+| spans               | list, each element is a dict representing a span, used to      |
+|                     | describe the composition of the smallest unit                  |
++---------------------+----------------------------------------------------------------+
+
+**span**
+
++---------------------+-----------------------------------------------------------+
+| Field Name          | Description                                               |
++=====================+===========================================================+
+| bbox                | Bounding box coordinates of the span                      |
++---------------------+-----------------------------------------------------------+
+| type                | Type of the span                                          |
++---------------------+-----------------------------------------------------------+
+| content \| img_path | Text spans use content, chart spans use img_path to store |
+|                     | the actual text or screenshot path information            |
++---------------------+-----------------------------------------------------------+
+
+The types of spans are as follows:
+
+================== ==============
+type               Description
+================== ==============
+image              Image
+table              Table
+text               Text
+inline_equation    Inline formula
+interline_equation Block formula
+================== ==============
+
+**Summary**
+
+A span is the smallest storage unit for all elements.
+
+The elements stored within para_blocks are block information.
+
+The block structure is as follows:
+
+First-level block (if any) -> Second-level block -> Line -> Span
+
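+Given that nesting, a hedged sketch of collecting the text of one page from its para_blocks (field names follow the tables above; ``page`` is one element of ``pdf_info``):
+
+.. code:: python
+
+    def page_text(page: dict) -> str:
+        parts = []
+        for block in page['para_blocks']:
+            # first-level table/image blocks nest their real blocks one level down
+            subs = block['blocks'] if block['type'] in ('table', 'image') else [block]
+            for sub in subs:
+                for line in sub.get('lines', []):
+                    for span in line['spans']:
+                        if 'content' in span:
+                            parts.append(span['content'])
+        return ' '.join(parts)
+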
+.. _example-1:
+
+Example
+^^^^^^^
+
+.. code:: json
+
+   {
+       "pdf_info": [
+           {
+               "preproc_blocks": [
+                   {
+                       "type": "text",
+                       "bbox": [
+                           52,
+                           61.956024169921875,
+                           294,
+                           82.99800872802734
+                       ],
+                       "lines": [
+                           {
+                               "bbox": [
+                                   52,
+                                   61.956024169921875,
+                                   294,
+                                   72.0000228881836
+                               ],
+                               "spans": [
+                                   {
+                                       "bbox": [
+                                           54.0,
+                                           61.956024169921875,
+                                           296.2261657714844,
+                                           72.0000228881836
+                                       ],
+                                       "content": "dependent on the service headway and the reliability of the departure ",
+                                       "type": "text",
+                                       "score": 1.0
+                                   }
+                               ]
+                           }
+                       ]
+                   }
+               ],
+               "layout_bboxes": [
+                   {
+                       "layout_bbox": [
+                           52,
+                           61,
+                           294,
+                           731
+                       ],
+                       "layout_label": "V",
+                       "sub_layout": []
+                   }
+               ],
+               "page_idx": 0,
+               "page_size": [
+                   612.0,
+                   792.0
+               ],
+               "_layout_tree": [],
+               "images": [],
+               "tables": [],
+               "interline_equations": [],
+               "discarded_blocks": [],
+               "para_blocks": [
+                   {
+                       "type": "text",
+                       "bbox": [
+                           52,
+                           61.956024169921875,
+                           294,
+                           82.99800872802734
+                       ],
+                       "lines": [
+                           {
+                               "bbox": [
+                                   52,
+                                   61.956024169921875,
+                                   294,
+                                   72.0000228881836
+                               ],
+                               "spans": [
+                                   {
+                                       "bbox": [
+                                           54.0,
+                                           61.956024169921875,
+                                           296.2261657714844,
+                                           72.0000228881836
+                                       ],
+                                       "content": "dependent on the service headway and the reliability of the departure ",
+                                       "type": "text",
+                                       "score": 1.0
+                                   }
+                               ]
+                           }
+                       ]
+                   }
+               ]
+           }
+       ],
+       "_parse_type": "txt",
+       "_version_name": "0.6.1"
+   }
+
+
+Pipeline Result 
+------------------
+
+.. code:: python 
+
+    from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
+    from magic_pdf.pipe.operators import PipeResult
+    from magic_pdf.data.dataset import Dataset 
+
+    res = pdf_parse_union(*args, **kwargs)
+    res['_parse_type'] = PARSE_TYPE_OCR
+    res['_version_name'] = __version__
+    if 'lang' in kwargs and kwargs['lang'] is not None:
+        res['lang'] = kwargs['lang']
+
+    dataset : Dataset = some_dataset   # not real dataset
+    pipe_result = PipeResult(res, dataset)
+
+
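+The layout and span PDFs described below come from the draw helpers; a hedged sketch with illustrative output paths:
+
+.. code:: python
+
+    pipe_result.draw_layout('output/some_pdf_layout.pdf')
+    pipe_result.draw_span('output/some_pdf_spans.pdf')
+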
+
+some_pdf_layout.pdf
+~~~~~~~~~~~~~~~~~~~
+
+Each page layout consists of one or more boxes. The number at the top
+left of each box indicates its sequence number. Additionally, in
+``layout.pdf``, different content blocks are highlighted with different
+background colors.
+
+.. figure:: ../_static/image/layout_example.png
+   :alt: layout example
+
+   layout example
+
+some_pdf_spans.pdf
+~~~~~~~~~~~~~~~~~~
+
+All spans on the page are drawn with different colored line frames
+according to the span type. This file can be used for quality control,
+allowing for quick identification of issues such as missing text or
+unrecognized inline formulas.
+
+.. figure:: ../_static/image/spans_example.png
+   :alt: spans example
+
+   spans example

+ 7 - 5
next_docs/en/user_guide/quick_start.rst

@@ -2,12 +2,14 @@
 Quick Start 
 ==============
 
-Eager to get started? This page gives a good introduction to MinerU. Follow Installation to set up a project and install MinerU first.
-
+Want to learn how to use MinerU in different scenarios? This page gives examples covering multiple use cases to match your needs.
 
 .. toctree::
     :maxdepth: 1
 
-    quick_start/command_line
-    quick_start/to_markdown
-
+    quick_start/convert_pdf 
+    quick_start/convert_image
+    quick_start/convert_ppt
+    quick_start/convert_pptx
+    quick_start/convert_doc
+    quick_start/convert_docx

+ 56 - 0
next_docs/en/user_guide/quick_start/convert_doc.rst

@@ -0,0 +1,56 @@
+
+
+Convert Doc
+=============
+
+.. admonition:: Warning
+    :class: tip
+
+    When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
+    
+    For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
+
+
+
+Command Line
+^^^^^^^^^^^^^
+
+.. code:: bash
+
+    # make sure the file has the correct suffix
+    magic-pdf -p a.doc -o output -m auto
+
+
+API 
+^^^^^^^^
+.. code:: python 
+
+    import os
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+    from magic_pdf.data.read_api import read_local_office
+
+    # prepare env
+    local_image_dir, local_md_dir = "output/images", "output"
+    image_dir = str(os.path.basename(local_image_dir))
+
+    os.makedirs(local_image_dir, exist_ok=True)
+
+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
+        local_md_dir
+    )
+
+    # proc
+    ## Create Dataset Instance
+    input_file = "some_doc.doc"     # replace with real ms-office file
+    
+    input_file_name = input_file.split(".")[0]
+    ds = read_local_office(input_file)[0]
+
+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+        md_writer, f"{input_file_name}.md", image_dir
+    )
+
+
+

+ 53 - 0
next_docs/en/user_guide/quick_start/convert_docx.rst

@@ -0,0 +1,53 @@
+
+Convert DocX
+=============
+
+.. admonition:: Warning
+    :class: tip
+
+    When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
+    
+    For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
+
+
+Command Line
+^^^^^^^^^^^^^
+
+.. code:: bash
+
+    # make sure the file has the correct suffix
+    magic-pdf -p a.docx -o output -m auto
+
+
+API 
+^^^^^
+
+.. code:: python 
+
+    import os
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+    from magic_pdf.data.read_api import read_local_office
+
+    # prepare env
+    local_image_dir, local_md_dir = "output/images", "output"
+    image_dir = str(os.path.basename(local_image_dir))
+
+    os.makedirs(local_image_dir, exist_ok=True)
+
+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
+        local_md_dir
+    )
+
+    # proc
+    ## Create Dataset Instance
+    input_file = "some_docx.docx"     # replace with real ms-office file
+    
+    input_file_name = input_file.split(".")[0]
+    ds = read_local_office(input_file)[0]
+
+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+        md_writer, f"{input_file_name}.md", image_dir
+    )
+
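+Since the conversion relies on external software, it can fail at runtime (for
+example when the converter is missing). A defensive variant of the snippet
+above, assuming ``read_local_office`` propagates ``ConvertToPdfError`` from
+``magic_pdf.utils.office_to_pdf``:
+
+.. code:: python
+
+    from magic_pdf.data.read_api import read_local_office
+    from magic_pdf.utils.office_to_pdf import ConvertToPdfError
+
+    try:
+        ds = read_local_office("some_docx.docx")[0]  # replace with a real ms-office file
+    except ConvertToPdfError as e:
+        # the office-to-pdf conversion failed; the docx was not processed
+        print(f"conversion failed: {e}")
+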

+ 46 - 0
next_docs/en/user_guide/quick_start/convert_image.rst

@@ -0,0 +1,46 @@
+
+
+Convert Image
+===============
+
+
+Command Line
+^^^^^^^^^^^^^
+
+.. code:: bash
+
+    # make sure the file has the correct suffix
+    magic-pdf -p a.png -o output -m auto
+
+
+API 
+^^^^^^
+
+.. code:: python
+
+    import os
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+    from magic_pdf.data.read_api import read_local_images
+
+    # prepare env
+    local_image_dir, local_md_dir = "output/images", "output"
+    image_dir = str(os.path.basename(local_image_dir))
+
+    os.makedirs(local_image_dir, exist_ok=True)
+
+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
+        local_md_dir
+    )
+
+    # proc
+    ## Create Dataset Instance
+    input_file = "some_image.jpg"       # replace with real image file
+
+    input_file_name = input_file.split(".")[0]
+    ds = read_local_images(input_file)[0]
+
+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+        md_writer, f"{input_file_name}.md", image_dir
+    )
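+
+To process a whole directory of images instead of a single file, point
+``read_local_images`` at the directory; it returns one dataset per image. A
+minimal sketch (the directory name is a placeholder; note the suffixes include
+the leading dot):
+
+.. code:: python
+
+    import os
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+    from magic_pdf.data.read_api import read_local_images
+
+    local_image_dir, local_md_dir = "output/images", "output"
+    image_dir = str(os.path.basename(local_image_dir))
+    os.makedirs(local_image_dir, exist_ok=True)
+
+    image_writer = FileBasedDataWriter(local_image_dir)
+    md_writer = FileBasedDataWriter(local_md_dir)
+
+    # one dataset per image file under the directory
+    for idx, ds in enumerate(read_local_images("some_image_dir/", suffixes=[".png", ".jpg"])):
+        ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+            md_writer, f"{idx}.md", image_dir
+        )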

+ 49 - 0
next_docs/en/user_guide/quick_start/convert_pdf.rst

@@ -0,0 +1,49 @@
+
+
+Convert PDF 
+============
+
+Command Line
+^^^^^^^^^^^^^
+
+.. code:: bash
+
+    # make sure the file has the correct suffix
+    magic-pdf -p a.pdf -o output -m auto
+
+
+API
+^^^^^^
+.. code:: python
+
+    import os
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
+    from magic_pdf.data.dataset import PymuDocDataset
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+
+    # args
+    pdf_file_name = "abc.pdf"  # replace with the real pdf path
+    name_without_suff = os.path.splitext(pdf_file_name)[0]
+
+    # prepare env
+    local_image_dir, local_md_dir = "output/images", "output"
+    image_dir = str(os.path.basename(local_image_dir))
+
+    os.makedirs(local_image_dir, exist_ok=True)
+
+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
+        local_md_dir
+    )
+
+    # read bytes
+    reader1 = FileBasedDataReader("")
+    pdf_bytes = reader1.read(pdf_file_name)  # read the pdf content
+
+    # proc
+    ## Create Dataset Instance
+    ds = PymuDocDataset(pdf_bytes)
+
+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)
+
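+To convert every PDF under a directory, ``read_local_pdfs`` returns one dataset
+per file, so the same pipeline can run in a loop. A minimal sketch (``pdfs/``
+is a placeholder directory):
+
+.. code:: python
+
+    import os
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+    from magic_pdf.data.read_api import read_local_pdfs
+
+    local_image_dir, local_md_dir = "output/images", "output"
+    image_dir = str(os.path.basename(local_image_dir))
+    os.makedirs(local_image_dir, exist_ok=True)
+
+    image_writer = FileBasedDataWriter(local_image_dir)
+    md_writer = FileBasedDataWriter(local_md_dir)
+
+    # one dataset per pdf file under the directory
+    for idx, ds in enumerate(read_local_pdfs("pdfs/")):
+        ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+            md_writer, f"{idx}.md", image_dir
+        )
+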
+

+ 52 - 0
next_docs/en/user_guide/quick_start/convert_ppt.rst

@@ -0,0 +1,52 @@
+
+
+Convert PPT 
+============
+
+.. admonition:: Warning
+    :class: tip
+
+    When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
+    
+    For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
+
+Command Line
+^^^^^^^^^^^^^
+
+.. code:: bash
+
+    # make sure the file has the correct suffix
+    magic-pdf -p a.ppt -o output -m auto
+
+
+API 
+^^^^^
+
+.. code:: python 
+
+    import os
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+    from magic_pdf.data.read_api import read_local_office
+
+    # prepare env
+    local_image_dir, local_md_dir = "output/images", "output"
+    image_dir = str(os.path.basename(local_image_dir))
+
+    os.makedirs(local_image_dir, exist_ok=True)
+
+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
+        local_md_dir
+    )
+
+    # proc
+    ## Create Dataset Instance
+    input_file = "some_ppt.ppt"     # replace with real ms-office file
+    
+    input_file_name = input_file.split(".")[0]
+    ds = read_local_office(input_file)[0]
+
+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+        md_writer, f"{input_file_name}.md", image_dir
+    )
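+
+To handle a folder of presentations, loop over the directory yourself, since
+``read_local_office`` takes one file at a time. A minimal sketch, assuming
+every matching file converts cleanly (``some_ppt_dir/`` is a placeholder):
+
+.. code:: python
+
+    import os
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+    from magic_pdf.data.read_api import read_local_office
+
+    local_image_dir, local_md_dir = "output/images", "output"
+    image_dir = str(os.path.basename(local_image_dir))
+    os.makedirs(local_image_dir, exist_ok=True)
+
+    image_writer = FileBasedDataWriter(local_image_dir)
+    md_writer = FileBasedDataWriter(local_md_dir)
+
+    input_directory = "some_ppt_dir/"
+    for fname in os.listdir(input_directory):
+        if not fname.endswith((".ppt", ".pptx")):
+            continue
+        name_without_suff = os.path.splitext(fname)[0]
+        ds = read_local_office(os.path.join(input_directory, fname))[0]
+        ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+            md_writer, f"{name_without_suff}.md", image_dir
+        )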

+ 55 - 0
next_docs/en/user_guide/quick_start/convert_pptx.rst

@@ -0,0 +1,55 @@
+
+
+Convert PPTX
+=================
+
+.. admonition:: Warning
+    :class: tip
+
+    When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
+    
+    For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
+
+
+Command Line
+^^^^^^^^^^^^^
+
+.. code:: bash
+
+    # make sure the file has the correct suffix
+    magic-pdf -p a.pptx -o output -m auto
+
+
+
+
+API 
+^^^^^^
+
+.. code:: python 
+
+    import os
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+    from magic_pdf.data.read_api import read_local_office
+
+    # prepare env
+    local_image_dir, local_md_dir = "output/images", "output"
+    image_dir = str(os.path.basename(local_image_dir))
+
+    os.makedirs(local_image_dir, exist_ok=True)
+
+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
+        local_md_dir
+    )
+
+    # proc
+    ## Create Dataset Instance
+    input_file = "some_pptx.pptx"     # replace with real ms-office file
+    
+    input_file_name = input_file.split(".")[0]
+    ds = read_local_office(input_file)[0]
+
+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+        md_writer, f"{input_file_name}.md", image_dir
+    )
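+
+The writer argument of ``dump_md`` is interchangeable, so the generated
+markdown can be written straight to object storage instead of the local disk.
+A sketch using ``S3DataWriter`` (bucket, keys, endpoint and prefix are
+placeholders; the extracted images still go to the local image writer):
+
+.. code:: python
+
+    import os
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, S3DataWriter
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+    from magic_pdf.data.read_api import read_local_office
+
+    bucket = "bucket"               # replace with a valid bucket
+    ak = "ak"                       # replace with a valid access key
+    sk = "sk"                       # replace with a valid secret key
+    endpoint_url = "endpoint_url"   # replace with a valid endpoint url
+
+    local_image_dir = "output/images"
+    image_dir = str(os.path.basename(local_image_dir))
+    os.makedirs(local_image_dir, exist_ok=True)
+
+    image_writer = FileBasedDataWriter(local_image_dir)
+    md_writer = S3DataWriter("unittest/tmp", bucket, ak, sk, endpoint_url)
+
+    input_file = "some_pptx.pptx"   # replace with a real ms-office file
+    name_without_suff = os.path.splitext(input_file)[0]
+
+    ds = read_local_office(input_file)[0]
+    # the markdown lands at s3://{bucket}/unittest/tmp/{name}.md
+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+        md_writer, f"{name_without_suff}.md", image_dir
+    )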

+ 0 - 1
next_docs/en/user_guide/tutorial.rst

@@ -7,6 +7,5 @@ From the beginning to the end, Show how to using mineru via a minimal project
 .. toctree::
     :maxdepth: 1
 
-    tutorial/output_file_description
     tutorial/pipeline
 

+ 0 - 3
next_docs/en/user_guide/tutorial/pipeline.rst

@@ -28,7 +28,6 @@ Minimal Example
     image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
         local_md_dir
     )
-    image_dir = str(os.path.basename(local_image_dir))
 
     # read bytes
     reader1 = FileBasedDataReader("")
@@ -85,8 +84,6 @@ These stages are linked together through methods like ``apply``, ``doc_analyze``
 .. admonition:: Tip
     :class: tip
 
-    For more examples on how to use ``Dataset``, ``InferenceResult``, and ``PipeResult``, please refer to :doc:`../quick_start/to_markdown`
-
     For more detailed information about ``Dataset``, ``InferenceResult``, and ``PipeResult``, please refer to :doc:`../../api/dataset`, :doc:`../../api/model_operators`, :doc:`../../api/pipe_operators`
 
 

+ 12 - 0
next_docs/en/user_guide/usage.rst

@@ -0,0 +1,12 @@
+
+
+Usage
+========
+
+.. toctree::
+   :maxdepth: 1
+
+   usage/command_line
+   usage/api
+   usage/docker
+

+ 112 - 2
next_docs/en/user_guide/quick_start/to_markdown.rst → next_docs/en/user_guide/usage/api.rst

@@ -1,8 +1,10 @@
 
+API Usage
+===========
 
-Convert To Markdown
-========================
 
+PDF
+----
 
 Local File Example
 ^^^^^^^^^^^^^^^^^^
@@ -113,4 +115,112 @@ S3 File Example
     pipe_result.dump_md(writer, f'{name_without_suff}.md', "unittest/tmp/images")    # dump to remote s3
 
 
+
+MS-Office 
+----------
+
+.. code:: python 
+
+    import os
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+    from magic_pdf.data.read_api import read_local_office
+
+    # prepare env
+    local_image_dir, local_md_dir = "output/images", "output"
+    image_dir = str(os.path.basename(local_image_dir))
+
+    os.makedirs(local_image_dir, exist_ok=True)
+
+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
+        local_md_dir
+    )
+
+    # proc
+    ## Create Dataset Instance
+    input_file = "some_ppt.ppt"     # replace with real ms-office file
+
+    input_file_name = input_file.split(".")[0]
+    ds = read_local_office(input_file)[0]
+
+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+        md_writer, f"{input_file_name}.md", image_dir
+    )
+
+This code snippet can be used to process **ppt**, **pptx**, **doc**, and **docx** files.
+
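+Since each reader covers a different family of inputs, a tiny dispatch helper
+can unify them. ``load_datasets`` below is a hypothetical convenience function,
+not part of the library:
+
+.. code:: python
+
+    import os
+
+    from magic_pdf.data.read_api import (read_local_images, read_local_office,
+                                         read_local_pdfs)
+
+    def load_datasets(path: str):
+        """Pick the matching reader from the file suffix (hypothetical helper)."""
+        suffix = os.path.splitext(path)[1].lower()
+        if suffix == ".pdf":
+            return read_local_pdfs(path)
+        if suffix in (".png", ".jpg"):
+            return read_local_images(path)
+        if suffix in (".ppt", ".pptx", ".doc", ".docx"):
+            return read_local_office(path)
+        raise ValueError(f"unsupported suffix: {suffix}")
+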
+
+Image
+---------
+
+Single Image File 
+^^^^^^^^^^^^^^^^^^^
+
+.. code:: python
+
+    import os
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+    from magic_pdf.data.read_api import read_local_images
+
+    # prepare env
+    local_image_dir, local_md_dir = "output/images", "output"
+    image_dir = str(os.path.basename(local_image_dir))
+
+    os.makedirs(local_image_dir, exist_ok=True)
+
+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
+        local_md_dir
+    )
+
+    # proc
+    ## Create Dataset Instance
+    input_file = "some_image.jpg"       # replace with real image file
+
+    input_file_name = input_file.split(".")[0]
+    ds = read_local_images(input_file)[0]
+
+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+        md_writer, f"{input_file_name}.md", image_dir
+    )
+
+
+Directory That Contains Images 
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code:: python
+
+    import os
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+    from magic_pdf.data.read_api import read_local_images
+
+    # prepare env
+    local_image_dir, local_md_dir = "output/images", "output"
+    image_dir = str(os.path.basename(local_image_dir))
+
+    os.makedirs(local_image_dir, exist_ok=True)
+
+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
+        local_md_dir
+    )
+
+    # proc
+    ## Create Dataset Instance
+    input_directory = "some_image_dir/"       # replace with real directory that contains images
+
+
+    dss = read_local_images(input_directory, suffixes=['.png', '.jpg'])[0]  
+
+    count = 0
+    for ds in dss:
+        ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+            md_writer, f"{count}.md", image_dir
+        )
+        count += 1
+
+
 Check :doc:`../data/data_reader_writer` for more [reader | writer] examples and check :doc:`../../api/pipe_operators` or :doc:`../../api/model_operators` for api details

+ 18 - 3
next_docs/en/user_guide/quick_start/command_line.rst → next_docs/en/user_guide/usage/command_line.rst

@@ -10,7 +10,8 @@ Command Line
 
    Options:
      -v, --version                display the version and exit
-     -p, --path PATH              local pdf filepath or directory  [required]
+     -p, --path PATH              local filepath or directory. supports PDF, PPT,
+                                  PPTX, DOC, DOCX, PNG, JPG files  [required]
      -o, --output-dir PATH        output local directory  [required]
      -m, --method [ocr|txt|auto]  the method for parsing pdf. ocr: using ocr
                                   technique to extract information from pdf. txt:
@@ -40,6 +41,20 @@ Command Line
    ## command line example
    magic-pdf -p {some_pdf} -o {some_output_dir} -m auto
 
+
+.. admonition:: Important
+    :class: tip
+
+    The file must end with one of the following suffixes:
+       .pdf 
+       .png
+       .jpg
+       .ppt
+       .pptx
+       .doc
+       .docx
+
+
 ``{some_pdf}`` can be a single PDF file or a directory containing
 multiple PDFs. The results will be saved in the ``{some_output_dir}``
 directory. The output file list is as follows:
@@ -57,6 +72,6 @@ directory. The output file list is as follows:
 
 .. admonition:: Tip
    :class: tip
+   
 
-   For more information about the output files, please refer to the :doc:`../tutorial/output_file_description`
-
+   For more information about the output files, please refer to the :doc:`../inference_result` or :doc:`../pipe_result`

+ 24 - 0
next_docs/en/user_guide/usage/docker.rst

@@ -0,0 +1,24 @@
+
+
+Docker 
+=======
+
+.. admonition:: Important
+   :class: tip
+
+   Docker requires a GPU with at least 16GB of VRAM, and all acceleration features are enabled by default.
+
+   Before running this Docker image, you can use the following command to check whether your device supports CUDA acceleration in Docker.
+
+   .. code-block:: bash
+
+      docker run --rm --gpus=all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
+
+
+.. code:: sh
+
+   wget https://github.com/opendatalab/MinerU/raw/master/Dockerfile
+   docker build -t mineru:latest .
+   docker run --rm -it --gpus=all mineru:latest /bin/bash
+   magic-pdf --help
+

BIN
next_docs/zh_cn/_static/image/inference_result.png


+ 103 - 75
next_docs/zh_cn/user_guide/data/data_reader_writer.rst

@@ -73,118 +73,146 @@ S3DataReader 基于 MultiBucketS3DataReader 构建,但仅支持单个桶。S3D
 ---------
 .. code:: python
 
-    from magic_pdf.data.data_reader_writer import * 
+    import os 
+    from magic_pdf.data.data_reader_writer import *
+    from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
+    from magic_pdf.data.schemas import S3Config
 
-    # file-based readers
+    # initialize the reader
     file_based_reader1 = FileBasedDataReader('')
 
-    ## will read file abc
-    file_based_reader1.read('abc')
+    ## read local file abc
+    file_based_reader1.read('abc')
 
     file_based_reader2 = FileBasedDataReader('/tmp')
 
-    ## will read /tmp/abc
+    ## read local file /tmp/abc
     file_based_reader2.read('abc')
 
-    ## will read /var/logs/message.txt
-    file_based_reader2.read('/var/logs/message.txt')
+    ## read local file /tmp/logs/message.txt
+    file_based_reader2.read('/tmp/logs/message.txt')
+
+    # initialize the multi-bucket s3 reader
+    bucket = "bucket"               # replace with a valid bucket
+    ak = "ak"                       # replace with a valid access key
+    sk = "sk"                       # replace with a valid secret key
+    endpoint_url = "endpoint_url"   # replace with a valid endpoint_url
+
+    bucket_2 = "bucket_2"               # replace with a valid bucket
+    ak_2 = "ak_2"                       # replace with a valid access key
+    sk_2 = "sk_2"                       # replace with a valid secret key
+    endpoint_url_2 = "endpoint_url_2"   # replace with a valid endpoint_url
 
-    # multi-bucket S3 readers
-    multi_bucket_s3_reader1 = MultiBucketS3DataReader("test_bucket1/test_prefix", list[S3Config(
-            bucket_name=test_bucket1, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
+    test_prefix = 'test/unittest'
+    multi_bucket_s3_reader1 = MultiBucketS3DataReader(f"{bucket}/{test_prefix}", [S3Config(
+            bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
         ),
         S3Config(
-            bucket_name=test_bucket_2,
+            bucket_name=bucket_2,
             access_key=ak_2,
             secret_key=sk_2,
             endpoint_url=endpoint_url_2,
         )])
 
-    ## will read s3://test_bucket1/test_prefix/abc
+    ## read file s3://{bucket}/{test_prefix}/abc
     multi_bucket_s3_reader1.read('abc')
 
-    ## will read s3://test_bucket1/efg
-    multi_bucket_s3_reader1.read('s3://test_bucket1/efg')
+    ## read file s3://{bucket}/{test_prefix}/efg
+    multi_bucket_s3_reader1.read(f's3://{bucket}/{test_prefix}/efg')
 
-    ## will read s3://test_bucket2/abc
-    multi_bucket_s3_reader1.read('s3://test_bucket2/abc')
+    ## read file s3://{bucket_2}/{test_prefix}/abc
+    multi_bucket_s3_reader1.read(f's3://{bucket_2}/{test_prefix}/abc')
 
-    # S3 readers
+    # initialize the s3 reader
     s3_reader1 = S3DataReader(
-        default_prefix_without_bucket = "test_prefix",
-        bucket: "test_bucket",
-        ak: "ak",
-        sk: "sk",
-        endpoint_url: "localhost"
+        test_prefix,
+        bucket,
+        ak,
+        sk,
+        endpoint_url
     )
 
-    ## will read s3://test_bucket/test_prefix/abc
+    ## read file s3://{bucket}/{test_prefix}/abc
     s3_reader1.read('abc')
 
-    ## will read s3://test_bucket/efg
-    s3_reader1.read('s3://test_bucket/efg')
+    ## read file s3://{bucket}/efg
+    s3_reader1.read(f's3://{bucket}/efg')
+
 
 Write Example
 -------------
 .. code:: python
 
+    import os
     from magic_pdf.data.data_reader_writer import *
+    from magic_pdf.data.data_reader_writer import MultiBucketS3DataWriter
+    from magic_pdf.data.schemas import S3Config
+
+    # initialize the writer
+    file_based_writer1 = FileBasedDataWriter("")
+
+    ## write 123 to abc
+    file_based_writer1.write("abc", "123".encode())
+
+    ## write 123 to abc
+    file_based_writer1.write_string("abc", "123")
+
+    file_based_writer2 = FileBasedDataWriter("/tmp")
+
+    ## write 123 to /tmp/abc
+    file_based_writer2.write_string("abc", "123")
+
+    ## write 123 to /tmp/logs/message.txt
+    file_based_writer2.write_string("/tmp/logs/message.txt", "123")
+
+    # initialize the multi-bucket s3 writer
+    bucket = "bucket"               # replace with a valid bucket
+    ak = "ak"                       # replace with a valid access key
+    sk = "sk"                       # replace with a valid secret key
+    endpoint_url = "endpoint_url"   # replace with a valid endpoint_url
+
+    bucket_2 = "bucket_2"               # replace with a valid bucket
+    ak_2 = "ak_2"                       # replace with a valid access key
+    sk_2 = "sk_2"                       # replace with a valid secret key
+    endpoint_url_2 = "endpoint_url_2"   # replace with a valid endpoint_url
+
+    test_prefix = "test/unittest"
+    multi_bucket_s3_writer1 = MultiBucketS3DataWriter(
+        f"{bucket}/{test_prefix}",
+        [
+            S3Config(
+                bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
+            ),
+            S3Config(
+                bucket_name=bucket_2,
+                access_key=ak_2,
+                secret_key=sk_2,
+                endpoint_url=endpoint_url_2,
+            ),
+        ],
+    )
 
-    # file-based writers
-    file_based_writer1 = FileBasedDataWriter('')
-
-    ## will write 123 to abc
-    file_based_writer1.write('abc', '123'.encode())
-
-    ## will write 123 to abc
-    file_based_writer1.write_string('abc', '123')
-
-    file_based_writer2 = FileBasedDataWriter('/tmp')
-
-    ## will write 123 to /tmp/abc
-    file_based_writer2.write_string('abc', '123')
-
-    ## will write 123 to /var/logs/message.txt
-    file_based_writer2.write_string('/var/logs/message.txt', '123')
-
-    # multi-bucket S3 writers
-    multi_bucket_s3_writer1 = MultiBucketS3DataWriter("test_bucket1/test_prefix", list[S3Config(
-            bucket_name=test_bucket1, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
-        ),
-        S3Config(
-            bucket_name=test_bucket_2,
-            access_key=ak_2,
-            secret_key=sk_2,
-            endpoint_url=endpoint_url_2,
-        )])
-
-    ## will write 123 to s3://test_bucket1/test_prefix/abc
-    multi_bucket_s3_writer1.write_string('abc', '123')
+    ## write 123 to s3://{bucket}/{test_prefix}/abc
+    multi_bucket_s3_writer1.write_string("abc", "123")
 
-    ## will write 123 to s3://test_bucket1/test_prefix/abc
-    multi_bucket_s3_writer1.write('abc', '123'.encode())
+    ## write 123 to s3://{bucket}/{test_prefix}/abc
+    multi_bucket_s3_writer1.write("abc", "123".encode())
 
-    ## will write 123 to s3://test_bucket1/efg
-    multi_bucket_s3_writer1.write('s3://test_bucket1/efg', '123'.encode())
+    ## write 123 to s3://{bucket}/{test_prefix}/efg
+    multi_bucket_s3_writer1.write(f"s3://{bucket}/{test_prefix}/efg", "123".encode())
 
-    ## will write 123 to s3://test_bucket2/abc
-    multi_bucket_s3_writer1.write('s3://test_bucket2/abc', '123'.encode())
+    ## write 123 to s3://{bucket_2}/{test_prefix}/abc
+    multi_bucket_s3_writer1.write(f's3://{bucket_2}/{test_prefix}/abc', '123'.encode())
 
-    # S3 writers
-    s3_writer1 = S3DataWriter(
-        default_prefix_without_bucket = "test_prefix",
-        bucket: "test_bucket",
-        ak: "ak",
-        sk: "sk",
-        endpoint_url: "localhost"
-    )
+    # initialize the s3 writer
+    s3_writer1 = S3DataWriter(test_prefix, bucket, ak, sk, endpoint_url)
 
-    ## will write 123 to s3://test_bucket/test_prefix/abc
-    s3_writer1.write('abc', '123'.encode())
+    ## write 123 to s3://{bucket}/{test_prefix}/abc
+    s3_writer1.write("abc", "123".encode())
 
-    ## will write 123 to s3://test_bucket/test_prefix/abc
-    s3_writer1.write_string('abc', '123')
+    ## write 123 to s3://{bucket}/{test_prefix}/abc
+    s3_writer1.write_string("abc", "123")
 
-    ## will write 123 to s3://test_bucket/efg
-    s3_writer1.write('s3://test_bucket/efg', '123'.encode())
+    ## write 123 to s3://{bucket}/efg
+    s3_writer1.write(f"s3://{bucket}/efg", "123".encode())
 

+ 39 - 11
next_docs/zh_cn/user_guide/data/read_api.rst

@@ -15,13 +15,41 @@ read_jsonl
 
 .. code:: python
 
-    from magic_pdf.data.io.read_api import *
+    from magic_pdf.data.read_api import *
+    from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
+    from magic_pdf.data.schemas import S3Config
 
-    # read JSONL from the local machine
-    datasets = read_jsonl("tt.jsonl", None)
+    # read a local jsonl file
+    datasets = read_jsonl("tt.jsonl", None)   # replace with a valid file
+
+    # read a jsonl file on s3
+
+    bucket = "bucket_1"                     # replace with a valid s3 bucket
+    ak = "access_key_1"                     # replace with a valid s3 access key
+    sk = "secret_key_1"                     # replace with a valid s3 secret key
+    endpoint_url = "endpoint_url_1"         # replace with a valid s3 endpoint url
+
+    bucket_2 = "bucket_2"                   # replace with a valid s3 bucket
+    ak_2 = "access_key_2"                   # replace with a valid s3 access key
+    sk_2 = "secret_key_2"                   # replace with a valid s3 secret key
+    endpoint_url_2 = "endpoint_url_2"       # replace with a valid s3 endpoint url
+
+    s3configs = [
+        S3Config(
+            bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
+        ),
+        S3Config(
+            bucket_name=bucket_2,
+            access_key=ak_2,
+            secret_key=sk_2,
+            endpoint_url=endpoint_url_2,
+        ),
+    ]
+
+    s3_reader = MultiBucketS3DataReader(bucket, s3configs)
+
+    datasets = read_jsonl(f"s3://bucket_1/tt.jsonl", s3_reader)  # 替换为有效的 s3 jsonl file
 
-    # read JSONL from remote S3
-    datasets = read_jsonl("s3://bucket_1/tt.jsonl", s3_reader)
 
 read_local_pdfs
 ^^^^^^^^^^^^^^^^
@@ -30,13 +58,13 @@ read_local_pdfs
 
 .. code:: python
 
-    from magic_pdf.data.io.read_api import *
+    from magic_pdf.data.read_api import *
 
     # read from a pdf path
-    datasets = read_local_pdfs("tt.pdf")
+    datasets = read_local_pdfs("tt.pdf")  # replace with a valid file
 
     # read pdf files under a directory
-    datasets = read_local_pdfs("pdfs/")
+    datasets = read_local_pdfs("pdfs/")   # replace with a valid directory
 
 read_local_images
 ^^^^^^^^^^^^^^^^^^^
@@ -45,10 +73,10 @@ read_local_images
 
 .. code:: python
 
-    from magic_pdf.data.io.read_api import *
+    from magic_pdf.data.read_api import *
 
     # read from an image path
-    datasets = read_local_images("tt.png")
+    datasets = read_local_images("tt.png")  # replace with a valid file
 
     # read files under a directory that end with one of the given suffixes
-    datasets = read_local_images("images/", suffixes=["png", "jpg"])
+    datasets = read_local_images("images/", suffixes=[".png", ".jpg"])  # replace with a valid directory

+ 1 - 1
tests/unittest/test_data/test_read_api.py

@@ -19,7 +19,7 @@ def test_read_local_pdfs():
 
 
 def test_read_local_images():
-    datasets = read_local_images('tests/unittest/test_data/assets/pngs', suffixes=['png'])
+    datasets = read_local_images('tests/unittest/test_data/assets/pngs', suffixes=['.png'])
     assert len(datasets) == 2
     assert len(datasets[0]) == 1
     assert len(datasets[1]) == 1