浏览代码

feat: add more docs about data releated api

xu rui 1 年之前
父节点
当前提交
47db844ca4
共有 34 个文件被更改,包括 1287 次插入45 次删除
  1. 33 0
      docs/en/api/io.rst
  2. 10 0
      docs/en/api/schemas.rst
  3. 25 19
      magic_pdf/data/data_reader_writer/multi_bucket_s3.py
  4. 6 2
      magic_pdf/data/data_reader_writer/s3.py
  5. 6 0
      magic_pdf/data/io/__init__.py
  6. 1 1
      magic_pdf/data/io/base.py
  7. 4 0
      magic_pdf/data/schemas.py
  8. 19 0
      next_docs/en/additional_notes/changelog.rst
  9. 76 0
      next_docs/en/additional_notes/faq.rst
  10. 9 0
      next_docs/en/additional_notes/glossary.rst
  11. 19 0
      next_docs/en/additional_notes/known_issues.rst
  12. 6 5
      next_docs/en/api.rst
  13. 14 0
      next_docs/en/api/classes.rst
  14. 9 9
      next_docs/en/api/data_reader_writer.rst
  15. 8 2
      next_docs/en/api/dataset.rst
  16. 2 2
      next_docs/en/api/read_api.rst
  17. 30 1
      next_docs/en/conf.py
  18. 66 0
      next_docs/en/index.rst
  19. 13 0
      next_docs/en/projects.rst
  20. 10 0
      next_docs/en/user_guide.rst
  21. 19 0
      next_docs/en/user_guide/data.rst
  22. 204 0
      next_docs/en/user_guide/data/data_reader_writer.rst
  23. 40 0
      next_docs/en/user_guide/data/dataset.rst
  24. 25 0
      next_docs/en/user_guide/data/io.rst
  25. 58 0
      next_docs/en/user_guide/data/read_api.rst
  26. 12 0
      next_docs/en/user_guide/install.rst
  27. 315 0
      next_docs/en/user_guide/install/boost_with_cuda.rst
  28. 48 0
      next_docs/en/user_guide/install/download_model_weight_files.rst
  29. 107 0
      next_docs/en/user_guide/install/install.rst
  30. 13 0
      next_docs/en/user_guide/quick_start.rst
  31. 60 0
      next_docs/en/user_guide/quick_start/command_line.rst
  32. 10 0
      next_docs/en/user_guide/quick_start/extract_text.rst
  33. 5 0
      next_docs/en/user_guide/tutorial.rst
  34. 5 4
      next_docs/requirements.txt

+ 33 - 0
docs/en/api/io.rst

@@ -0,0 +1,33 @@
+IO
+==
+
+.. autoclass:: magic_pdf.data.io.base.IOReader
+   :members:
+   :inherited-members:
+   :show-inheritance:
+
+.. autoclass:: magic_pdf.data.io.base.IOWriter
+   :members:
+   :inherited-members:
+   :show-inheritance:
+
+.. autoclass:: magic_pdf.data.io.s3.S3Reader
+   :members:
+   :inherited-members:
+   :show-inheritance:
+
+.. autoclass:: magic_pdf.data.io.s3.S3Writer
+   :members:
+   :inherited-members:
+   :show-inheritance:
+
+.. autoclass:: magic_pdf.data.io.http.HttpReader
+   :members:
+   :inherited-members:
+   :show-inheritance:
+
+.. autoclass:: magic_pdf.data.io.http.HttpWriter
+   :members:
+   :inherited-members:
+   :show-inheritance:
+

+ 10 - 0
docs/en/api/schemas.rst

@@ -0,0 +1,10 @@
+
+schemas 
+===========
+
+.. autopydantic_model:: magic_pdf.data.schemas.S3Config
+   :members:
+
+.. autopydantic_model:: magic_pdf.data.schemas.PageInfo
+   :members:
+

+ 25 - 19
magic_pdf/data/data_reader_writer/multi_bucket_s3.py

@@ -1,3 +1,4 @@
+import os
 from magic_pdf.config.exceptions import InvalidConfig, InvalidParams
 from magic_pdf.data.data_reader_writer.base import DataReader, DataWriter
 from magic_pdf.data.io.s3 import S3Reader, S3Writer
@@ -7,30 +8,34 @@ from magic_pdf.libs.path_utils import (parse_s3_range_params, parse_s3path,
 
 
 class MultiS3Mixin:
-    def __init__(self, default_bucket: str, s3_configs: list[S3Config]):
+    def __init__(self, default_prefix: str, s3_configs: list[S3Config]):
         """Initialized with multiple s3 configs.
 
         Args:
-            default_bucket (str): the default bucket name of the relative path
+            default_prefix (str): the default prefix of the relative path. for example, {some_bucket}/{some_prefix} or {some_bucket}
             s3_configs (list[S3Config]): list of s3 configs, the bucket_name must be unique in the list.
 
         Raises:
-            InvalidConfig: default bucket config not in s3_configs
-            InvalidConfig: bucket name not unique in s3_configs
-            InvalidConfig: default bucket must be provided
+            InvalidConfig: default bucket config not in s3_configs.
+            InvalidConfig: bucket name not unique in s3_configs.
+            InvalidConfig: default bucket must be provided.
         """
-        if len(default_bucket) == 0:
-            raise InvalidConfig('default_bucket must be provided')
+        if len(default_prefix) == 0:
+            raise InvalidConfig('default_prefix must be provided')
+    
+        arr = default_prefix.strip("/").split("/")
+        self.default_bucket = arr[0]
+        self.default_prefix = "/".join(arr[1:])
 
         found_default_bucket_config = False
         for conf in s3_configs:
-            if conf.bucket_name == default_bucket:
+            if conf.bucket_name == self.default_bucket:
                 found_default_bucket_config = True
                 break
 
         if not found_default_bucket_config:
             raise InvalidConfig(
-                f'default_bucket: {default_bucket} config must be provided in s3_configs: {s3_configs}'
+                f'default_bucket: {self.default_bucket} config must be provided in s3_configs: {s3_configs}'
             )
 
         uniq_bucket = set([conf.bucket_name for conf in s3_configs])
@@ -39,7 +44,6 @@ class MultiS3Mixin:
                 f'the bucket_name in s3_configs: {s3_configs} must be unique'
             )
 
-        self.default_bucket = default_bucket
         self.s3_configs = s3_configs
         self._s3_clients_h: dict = {}
 
@@ -47,14 +51,14 @@ class MultiS3Mixin:
 class MultiBucketS3DataReader(DataReader, MultiS3Mixin):
     def read(self, path: str) -> bytes:
         """Read the path from s3, select diffect bucket client for each request
-        based on the path, also support range read.
+        based on the bucket, also support range read.
 
         Args:
-            path (str): the s3 path of file, the path must be in the format of s3://bucket_name/path?offset,limit
-            for example: s3://bucket_name/path?0,100
+            path (str): the s3 path of file, the path must be in the format of s3://bucket_name/path?offset,limit.
+            for example: s3://bucket_name/path?0,100.
 
         Returns:
-            bytes: the content of s3 file
+            bytes: the content of s3 file.
         """
         may_range_params = parse_s3_range_params(path)
         if may_range_params is None or 2 != len(may_range_params):
@@ -84,21 +88,22 @@ class MultiBucketS3DataReader(DataReader, MultiS3Mixin):
 
     def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
         """Read the file with offset and limit, select diffect bucket client
-        for each request based on the path.
+        for each request based on the bucket.
 
         Args:
-            path (str): the file path
+            path (str): the file path.
             offset (int, optional): the number of bytes skipped. Defaults to 0.
             limit (int, optional): the number of bytes want to read. Defaults to -1 which means infinite.
 
         Returns:
-            bytes: the file content
+            bytes: the file content.
         """
         if path.startswith('s3://'):
             bucket_name, path = parse_s3path(path)
             s3_reader = self.__get_s3_client(bucket_name)
         else:
             s3_reader = self.__get_s3_client(self.default_bucket)
+            path = os.path.join(self.default_prefix, path)
         return s3_reader.read_at(path, offset, limit)
 
 
@@ -123,15 +128,16 @@ class MultiBucketS3DataWriter(DataWriter, MultiS3Mixin):
 
     def write(self, path: str, data: bytes) -> None:
         """Write file with data, also select diffect bucket client for each
-        request based on the path.
+        request based on the bucket.
 
         Args:
             path (str): the path of file, if the path is relative path, it will be joined with parent_dir.
-            data (bytes): the data want to write
+            data (bytes): the data want to write.
         """
         if path.startswith('s3://'):
             bucket_name, path = parse_s3path(path)
             s3_writer = self.__get_s3_client(bucket_name)
         else:
             s3_writer = self.__get_s3_client(self.default_bucket)
+            path = os.path.join(self.default_prefix, path)
         return s3_writer.write(path, data)

+ 6 - 2
magic_pdf/data/data_reader_writer/s3.py

@@ -6,6 +6,7 @@ from magic_pdf.data.schemas import S3Config
 class S3DataReader(MultiBucketS3DataReader):
     def __init__(
         self,
+        default_prefix_without_bucket: str,
         bucket: str,
         ak: str,
         sk: str,
@@ -15,6 +16,7 @@ class S3DataReader(MultiBucketS3DataReader):
         """s3 reader client.
 
         Args:
+            default_prefix_without_bucket: prefix that does not contain the bucket name
             bucket (str): bucket name
             ak (str): access key
             sk (str): secret key
@@ -23,7 +25,7 @@ class S3DataReader(MultiBucketS3DataReader):
             refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
         """
         super().__init__(
-            bucket,
+            f"{bucket}/{default_prefix_without_bucket}"
             [
                 S3Config(
                     bucket_name=bucket,
@@ -39,6 +41,7 @@ class S3DataReader(MultiBucketS3DataReader):
 class S3DataWriter(MultiBucketS3DataWriter):
     def __init__(
         self,
+        default_prefix_without_bucket: str,
         bucket: str,
         ak: str,
         sk: str,
@@ -48,6 +51,7 @@ class S3DataWriter(MultiBucketS3DataWriter):
         """s3 writer client.
 
         Args:
+            default_prefix_without_bucket: prefix that does not contain the bucket name
             bucket (str): bucket name
             ak (str): access key
             sk (str): secret key
@@ -56,7 +60,7 @@ class S3DataWriter(MultiBucketS3DataWriter):
             refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
         """
         super().__init__(
-            bucket,
+            f"{bucket}/{default_prefix_without_bucket}"
             [
                 S3Config(
                     bucket_name=bucket,

+ 6 - 0
magic_pdf/data/io/__init__.py

@@ -0,0 +1,6 @@
+
+from magic_pdf.data.io.base import IOReader, IOWriter  # noqa: F401
+from magic_pdf.data.io.http import HttpReader, HttpWriter  # noqa: F401
+from magic_pdf.data.io.s3 import S3Reader, S3Writer  # noqa: F401
+
+__all__ = ['IOReader', 'IOWriter', 'HttpReader', 'HttpWriter', 'S3Reader', 'S3Writer']

+ 1 - 1
magic_pdf/data/io/base.py

@@ -29,7 +29,7 @@ class IOReader(ABC):
         pass
 
 
-class IOWriter:
+class IOWriter(ABC):
 
     @abstractmethod
     def write(self, path: str, data: bytes) -> None:

+ 4 - 0
magic_pdf/data/schemas.py

@@ -3,6 +3,8 @@ from pydantic import BaseModel, Field
 
 
 class S3Config(BaseModel):
+    """S3 config
+    """
     bucket_name: str = Field(description='s3 bucket name', min_length=1)
     access_key: str = Field(description='s3 access key', min_length=1)
     secret_key: str = Field(description='s3 secret key', min_length=1)
@@ -11,5 +13,7 @@ class S3Config(BaseModel):
 
 
 class PageInfo(BaseModel):
+    """The width and height of page
+    """
     w: float = Field(description='the width of page')
     h: float = Field(description='the height of page')

+ 19 - 0
next_docs/en/additional_notes/changelog.rst

@@ -0,0 +1,19 @@
+
+
+Changelog
+=========
+
+-  2024/09/27 Version 0.8.1 released, Fixed some bugs, and providing a
+   `localized deployment version <projects/web_demo/README.md>`__ of the
+   `online
+   demo <https://opendatalab.com/OpenSourceTools/Extractor/PDF/>`__ and
+   the `front-end interface <projects/web/README.md>`__.
+-  2024/09/09: Version 0.8.0 released, supporting fast deployment with
+   Dockerfile, and launching demos on Huggingface and Modelscope.
+-  2024/08/30: Version 0.7.1 released, add paddle tablemaster table
+   recognition option
+-  2024/08/09: Version 0.7.0b1 released, simplified installation
+   process, added table recognition functionality
+-  2024/08/01: Version 0.6.2b1 released, optimized dependency conflict
+   issues and installation documentation
+-  2024/07/05: Initial open-source release

+ 76 - 0
next_docs/en/additional_notes/faq.rst

@@ -0,0 +1,76 @@
+FAQ
+==========================
+
+1. When using the command ``pip install magic-pdf[full]`` on newer versions of macOS, the error ``zsh: no matches found: magic-pdf[full]`` occurs.
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+On macOS, the default shell has switched from Bash to Z shell, which has
+special handling logic for certain types of string matching. This can
+lead to the “no matches found” error. You can try disabling the globbing
+feature in the command line and then run the installation command again.
+
+.. code:: bash
+
+   setopt no_nomatch
+   pip install magic-pdf[full]
+
+2. Encountering the error ``pickle.UnpicklingError: invalid load key, 'v'.`` during use
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This might be due to an incomplete download of the model file. You can
+try re-downloading the model file and then try again. Reference:
+https://github.com/opendatalab/MinerU/issues/143
+
+3. Where should the model files be downloaded and how should the ``/models-dir`` configuration be set?
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The path for the model files is configured in “magic-pdf.json”. just
+like:
+
+.. code:: json
+
+   {
+     "models-dir": "/tmp/models"
+   }
+
+This path is an absolute path, not a relative path. You can obtain the
+absolute path in the models directory using the “pwd” command.
+Reference:
+https://github.com/opendatalab/MinerU/issues/155#issuecomment-2230216874
+
+4. Encountered the error ``ImportError: libGL.so.1: cannot open shared object file: No such file or directory`` in Ubuntu 22.04 on WSL2
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``libgl`` library is missing in Ubuntu 22.04 on WSL2. You can
+install the ``libgl`` library with the following command to resolve the
+issue:
+
+.. code:: bash
+
+   sudo apt-get install libgl1-mesa-glx
+
+Reference: https://github.com/opendatalab/MinerU/issues/388
+
+5. Encountered error ``ModuleNotFoundError: No module named 'fairscale'``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+You need to uninstall the module and reinstall it:
+
+.. code:: bash
+
+   pip uninstall fairscale
+   pip install fairscale
+
+Reference: https://github.com/opendatalab/MinerU/issues/411
+
+6. On some newer devices like the H100, the text parsed during OCR using CUDA acceleration is garbled.
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The compatibility of cuda11 with new graphics cards is poor, and the
+CUDA version used by Paddle needs to be upgraded.
+
+.. code:: bash
+
+   pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu123/
+
+Reference: https://github.com/opendatalab/MinerU/issues/558

+ 9 - 0
next_docs/en/additional_notes/glossary.rst

@@ -0,0 +1,9 @@
+
+
+Glossary 
+===========
+
+1. jsonl 
+    TODO: add description
+
+

+ 19 - 0
next_docs/en/additional_notes/known_issues.rst

@@ -0,0 +1,19 @@
+Known Issues
+============
+
+-  Reading order is based on the model’s sorting of text distribution in
+   space, which may become disordered under extremely complex layouts.
+-  Vertical text is not supported.
+-  Tables of contents and lists are recognized through rules; a few
+   uncommon list formats may not be identified.
+-  Only one level of headings is supported; hierarchical heading levels
+   are currently not supported.
+-  Code blocks are not yet supported in the layout model.
+-  Comic books, art books, elementary school textbooks, and exercise
+   books are not well-parsed yet
+-  Enabling OCR may produce better results in PDFs with a high density
+   of formulas
+-  If you are processing PDFs with a large number of formulas, it is
+   strongly recommended to enable the OCR function. When using PyMuPDF
+   to extract text, overlapping text lines can occur, leading to
+   inaccurate formula insertion positions.

+ 6 - 5
next_docs/en/api.rst

@@ -1,9 +1,10 @@
-Data Api
-------------------
 
 .. toctree::
    :maxdepth: 2
 
-   api/dataset.rst
-   api/data_reader_writer.rst
-   api/read_api.rst
+   api/dataset
+   api/data_reader_writer
+   api/read_api
+   api/schemas
+   api/io
+   api/classes

+ 14 - 0
next_docs/en/api/classes.rst

@@ -0,0 +1,14 @@
+Class Hierarchy
+===============
+
+.. inheritance-diagram:: magic_pdf.data.io.base magic_pdf.data.io.http magic_pdf.data.io.s3
+   :parts: 2
+
+
+.. inheritance-diagram:: magic_pdf.data.dataset
+   :parts: 2
+
+
+.. inheritance-diagram:: magic_pdf.data.data_reader_writer.base magic_pdf.data.data_reader_writer.filebase magic_pdf.data.data_reader_writer.multi_bucket_s3
+   :parts: 2
+

+ 9 - 9
next_docs/en/api/data_reader_writer.rst

@@ -1,44 +1,44 @@
 
 Data Reader Writer
---------------------
+===================
 
 .. autoclass:: magic_pdf.data.data_reader_writer.DataReader
    :members:
    :inherited-members:
+   :show-inheritance:
 
 .. autoclass:: magic_pdf.data.data_reader_writer.DataWriter
    :members:
    :inherited-members:
+   :show-inheritance:
 
 .. autoclass:: magic_pdf.data.data_reader_writer.S3DataReader
    :members:
    :inherited-members:
+   :show-inheritance:
 
 .. autoclass:: magic_pdf.data.data_reader_writer.S3DataWriter
    :members:
    :inherited-members:
+   :show-inheritance:
 
 .. autoclass:: magic_pdf.data.data_reader_writer.FileBasedDataReader
    :members:
    :inherited-members:
+   :show-inheritance:
 
 .. autoclass:: magic_pdf.data.data_reader_writer.FileBasedDataWriter
    :members:
    :inherited-members:
-
-.. autoclass:: magic_pdf.data.data_reader_writer.S3DataReader
-   :members:
-   :inherited-members:
-
-.. autoclass:: magic_pdf.data.data_reader_writer.S3DataWriter
-   :members:
-   :inherited-members:
+   :show-inheritance:
 
 .. autoclass:: magic_pdf.data.data_reader_writer.MultiBucketS3DataReader
    :members:
    :inherited-members:
+   :show-inheritance:
 
 .. autoclass:: magic_pdf.data.data_reader_writer.MultiBucketS3DataWriter
    :members:
    :inherited-members:
+   :show-inheritance:
 

+ 8 - 2
next_docs/en/api/dataset.rst

@@ -1,22 +1,28 @@
-Dataset Api
-------------------
+Dataset
+========
 
 .. autoclass:: magic_pdf.data.dataset.PageableData
    :members:
    :inherited-members:
+   :show-inheritance:
+
 
 .. autoclass:: magic_pdf.data.dataset.Dataset
    :members:
    :inherited-members:
+   :show-inheritance:
 
 .. autoclass:: magic_pdf.data.dataset.ImageDataset
    :members:
    :inherited-members:
+   :show-inheritance:
 
 .. autoclass:: magic_pdf.data.dataset.PymuDocDataset
    :members:
    :inherited-members:
+   :show-inheritance:
 
 .. autoclass:: magic_pdf.data.dataset.Doc
    :members:
    :inherited-members:
+   :show-inheritance:

+ 2 - 2
next_docs/en/api/read_api.rst

@@ -1,5 +1,5 @@
-read_api Api
-------------------
+read_api
+=========
 
 .. automodule:: magic_pdf.data.read_api
    :members:

+ 30 - 1
next_docs/en/conf.py

@@ -15,7 +15,8 @@ import subprocess
 import sys
 
 from sphinx.ext import autodoc
-
+from docutils import nodes
+from docutils.parsers.rst import Directive
 
 def install(package):
     subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])
@@ -58,10 +59,20 @@ extensions = [
     'sphinx_copybutton',
     'sphinx.ext.autodoc',
     'sphinx.ext.autosummary',
+    'sphinx.ext.inheritance_diagram',
     'myst_parser',
     'sphinxarg.ext',
+    'sphinxcontrib.autodoc_pydantic',
 ]
 
+# class hierarchy diagram
+inheritance_graph_attrs = dict(rankdir="LR", size='"8.0, 12.0"', fontsize=14, ratio='compress')
+inheritance_node_attrs = dict(shape='ellipse', fontsize=14, height=0.75)
+inheritance_edge_attrs = dict(arrow='vee')
+
+autodoc_pydantic_model_show_json = True
+autodoc_pydantic_model_show_config_summary = False
+
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['_templates']
 
@@ -120,3 +131,21 @@ class MockedClassDocumenter(autodoc.ClassDocumenter):
 autodoc.ClassDocumenter = MockedClassDocumenter
 
 navigation_with_keys = False
+
+
+# add custom directive 
+
+
+class VideoDirective(Directive):
+    required_arguments = 1
+    optional_arguments = 0
+    final_argument_whitespace = True
+    option_spec = {}
+
+    def run(self):
+        url = self.arguments[0]
+        video_node = nodes.raw('', f'<iframe width="560" height="315" src="{url}" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>', format='html')
+        return [video_node]
+
+def setup(app):
+    app.add_directive('video', VideoDirective)

+ 66 - 0
next_docs/en/index.rst

@@ -26,6 +26,50 @@ Welcome to the MinerU Documentation
    </p>
 
 
+Project Introduction
+--------------------
+
+MinerU is a tool that converts PDFs into machine-readable formats (e.g.,
+markdown, JSON), allowing for easy extraction into any format. MinerU
+was born during the pre-training process of
+`InternLM <https://github.com/InternLM/InternLM>`__. We focus on solving
+symbol conversion issues in scientific literature and hope to contribute
+to technological development in the era of large models. Compared to
+well-known commercial products, MinerU is still young. If you encounter
+any issues or if the results are not as expected, please submit an issue
+on `issue <https://github.com/opendatalab/MinerU/issues>`__ and **attach
+the relevant PDF**.
+
+.. video:: https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
+
+
+Key Features
+------------
+
+-  Removes elements such as headers, footers, footnotes, and page
+   numbers while maintaining semantic continuity
+-  Outputs text in a human-readable order from multi-column documents
+-  Retains the original structure of the document, including titles,
+   paragraphs, and lists
+-  Extracts images, image captions, tables, and table captions
+-  Automatically recognizes formulas in the document and converts them
+   to LaTeX
+-  Automatically recognizes tables in the document and converts them to
+   LaTeX
+-  Automatically detects and enables OCR for corrupted PDFs
+-  Supports both CPU and GPU environments
+-  Supports Windows, Linux, and Mac platforms
+
+
+User Guide
+-------------
+.. toctree::
+   :maxdepth: 2
+   :caption: User Guide
+
+   user_guide
+
+
 API Reference
 -------------
 
@@ -34,5 +78,27 @@ method, this part of the documentation is for you.
 
 .. toctree::
    :maxdepth: 2
+   :caption: API
 
    api
+
+
+Additional Notes
+------------------
+.. toctree::
+   :maxdepth: 1
+   :caption: Additional Notes
+
+   additional_notes/known_issues
+   additional_notes/faq
+   additional_notes/changelog
+   additional_notes/glossary
+
+
+Projects 
+---------
+.. toctree::
+   :maxdepth: 1
+   :caption: Projects
+
+   projects

+ 13 - 0
next_docs/en/projects.rst

@@ -0,0 +1,13 @@
+
+
+
+llama_index_rag 
+===============
+
+
+gradio_app
+============
+
+
+other projects
+===============

+ 10 - 0
next_docs/en/user_guide.rst

@@ -0,0 +1,10 @@
+
+
+.. toctree::
+    :maxdepth: 2
+
+    user_guide/install
+    user_guide/quick_start
+    user_guide/tutorial
+    user_guide/data
+    

+ 19 - 0
next_docs/en/user_guide/data.rst

@@ -0,0 +1,19 @@
+
+
+Data
+=========
+
+.. toctree::
+   :maxdepth: 2
+
+   data/dataset
+
+   data/read_api
+
+   data/data_reader_writer 
+
+   data/io
+
+
+
+

+ 204 - 0
next_docs/en/user_guide/data/data_reader_writer.rst

@@ -0,0 +1,204 @@
+
+Data Reader Writer 
+====================
+
+Aims to read or write bytes from different media. You can implement new classes to meet the needs of your personal scenarios
+if MinerU has not provided a suitable class. It is easy to implement new classes; the only requirement is to inherit from
+``DataReader`` or ``DataWriter``.
+
+.. code:: python
+
+    class SomeReader(DataReader):
+        def read(self, path: str) -> bytes:
+            pass
+
+        def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
+            pass
+
+
+    class SomeWriter(DataWriter):
+        def write(self, path: str, data: bytes) -> None:
+            pass
+
+        def write_string(self, path: str, data: str) -> None:
+            pass
+
+
+Readers may be curious about the difference between :doc:`io` and this section. The two sections look very similar at first glance.
+:doc:`io` provides fundamental functions, while this section works more at the application level. Customers can build their own classes to meet
+their own application needs, which may share the same IO functions. That is why we have :doc:`io`.
+
+
+Important Classes
+-----------------
+
+.. code:: python
+
+    class FileBasedDataReader(DataReader):
+        def __init__(self, parent_dir: str = ''):
+            pass
+
+
+    class FileBasedDataWriter(DataWriter):
+        def __init__(self, parent_dir: str = '') -> None:
+            pass
+
+Class ``FileBasedDataReader`` is initialized with a single param ``parent_dir``. That means every method ``FileBasedDataReader`` provides will have the following features.
+
+Features:
+    #. read content from the absolute path file, ``parent_dir`` will be ignored.
+    #. read the relative path, file will first join with ``parent_dir``, then read content from the merged path
+
+
+.. note::
+
+    ``FileBasedDataWriter`` shares the same behavior with ``FileBaseDataReader``
+
+
+.. code:: python 
+
+    class MultiS3Mixin:
+        def __init__(self, default_prefix: str, s3_configs: list[S3Config]):
+            pass
+
+    class MultiBucketS3DataReader(DataReader, MultiS3Mixin):
+        pass
+
+All read-related methods provided by class ``MultiBucketS3DataReader`` will have the following features.
+
+Features:
+    #. read object with full s3-format path, for example ``s3://test_bucket/test_object``, ``default_prefix`` will be ignored.
+    #. read object with a relative path: the path is joined with ``default_prefix`` (with the ``bucket_name`` trimmed) before reading the content. ``bucket_name`` is the first element of the result after splitting ``default_prefix`` with the delimiter ``/``
+
+.. note::
+    ``MultiBucketS3DataWriter`` shares the same behavior with ``MultiBucketS3DataReader``
+
+
+.. code:: python
+
+    class S3DataReader(MultiBucketS3DataReader):
+        pass
+
+``S3DataReader`` is built on top of ``MultiBucketS3DataReader`` and only supports a single bucket. So is ``S3DataWriter``.
+
+
+Read Examples
+-------------
+
+.. code:: python
+
+    # file based related 
+    file_based_reader1 = FileBasedDataReader('')
+
+    ## will read file abc 
+    file_based_reader1.read('abc') 
+
+    file_based_reader2 = FileBasedDataReader('/tmp')
+
+    ## will read /tmp/abc
+    file_based_reader2.read('abc')
+
+    ## will read /var/logs/message.txt
+    file_based_reader2.read('/var/logs/message.txt')
+
+    # multi bucket s3 related
+    multi_bucket_s3_reader1 = MultiBucketS3DataReader("test_bucket1/test_prefix", [
+        S3Config(
+            bucket_name=test_bucket1, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
+        ),
+        S3Config(
+            bucket_name=test_bucket_2,
+            access_key=ak_2,
+            secret_key=sk_2,
+            endpoint_url=endpoint_url_2,
+        ),
+    ])
+    
+    ## will read s3://test_bucket1/test_prefix/abc
+    multi_bucket_s3_reader1.read('abc')
+
+    ## will read s3://test_bucket1/efg
+    multi_bucket_s3_reader1.read('s3://test_bucket1/efg')
+
+    ## will read s3://test_bucket2/abc
+    multi_bucket_s3_reader1.read('s3://test_bucket2/abc')
+
+    # s3 related
+    s3_reader1 = S3DataReader(
+        default_prefix_without_bucket="test_prefix",
+        bucket="test_bucket",
+        ak="ak",
+        sk="sk",
+        endpoint_url="localhost",
+    )
+
+    ## will read s3://test_bucket/test_prefix/abc 
+    s3_reader1.read('abc')
+   
+    ## will read s3://test_bucket/efg
+    s3_reader1.read('s3://test_bucket/efg')
+
+
+Write Examples
+---------------
+
+.. code:: python
+
+    # file based related 
+    file_based_writer1 = FileBasedDataWriter('')
+
+    ## will write 123 to abc
+    file_based_writer1.write('abc', '123'.encode()) 
+
+    ## will write 123 to abc
+    file_based_writer1.write_string('abc', '123') 
+
+    file_based_writer2 = FileBasedDataWriter('/tmp')
+
+    ## will write 123 to /tmp/abc
+    file_based_writer2.write_string('abc', '123')
+
+    ## will write 123 to /var/logs/message.txt
+    file_based_writer2.write_string('/var/logs/message.txt', '123')
+
+    # multi bucket s3 related
+    multi_bucket_s3_writer1 = MultiBucketS3DataWriter("test_bucket1/test_prefix", [
+        S3Config(
+            bucket_name=test_bucket1, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
+        ),
+        S3Config(
+            bucket_name=test_bucket_2,
+            access_key=ak_2,
+            secret_key=sk_2,
+            endpoint_url=endpoint_url_2,
+        ),
+    ])
+    
+    ## will write 123 to s3://test_bucket1/test_prefix/abc
+    multi_bucket_s3_writer1.write_string('abc', '123')
+
+    ## will write 123 to s3://test_bucket1/test_prefix/abc
+    multi_bucket_s3_writer1.write('abc', '123'.encode())
+
+    ## will write 123 to s3://test_bucket1/efg
+    multi_bucket_s3_writer1.write('s3://test_bucket1/efg', '123'.encode())
+
+    ## will write 123 to s3://test_bucket2/abc
+    multi_bucket_s3_writer1.write('s3://test_bucket2/abc', '123'.encode())
+
+    # s3 related
+    s3_writer1 = S3DataWriter(
+        default_prefix_without_bucket="test_prefix",
+        bucket="test_bucket",
+        ak="ak",
+        sk="sk",
+        endpoint_url="localhost",
+    )
+
+    ## will write 123 to s3://test_bucket/test_prefix/abc 
+    s3_writer1.write('abc', '123'.encode())
+
+    ## will write 123 to s3://test_bucket/test_prefix/abc 
+    s3_writer1.write_string('abc', '123')
+
+    ## will write 123 to s3://test_bucket/efg
+    s3_writer1.write('s3://test_bucket/efg', '123'.encode())
+
+
+Check :doc:`../../api/classes` for more intuitions or check :doc:`../../api/data_reader_writer` for more details

+ 40 - 0
next_docs/en/user_guide/data/dataset.rst

@@ -0,0 +1,40 @@
+
+
+Dataset 
+===========
+
+
+Import Classes 
+-----------------
+
+Dataset 
+^^^^^^^^
+
+Each pdf or image will form one ``Dataset``. As we all know, pdf has two categories, :ref:`digital_method_section` or :ref:`ocr_method_section`.
+You will get an ``ImageDataset``, which is a subclass of ``Dataset``, from images, and a ``PymuDocDataset`` from pdf files.
+The difference between ``ImageDataset`` and ``PymuDocDataset`` is that ``ImageDataset`` only supports the ``OCR`` parse method,
+while ``PymuDocDataset`` supports both ``OCR`` and ``TXT``.
+
+.. note::
+
+    In fact, some pdfs may be generated from images, which means they can not support the ``TXT`` method. Currently it is something the user needs to ensure does not happen.
+
+
+
+Pdf Parse Methods
+------------------
+
+.. _ocr_method_section:
+OCR 
+^^^^
+Extract chars via ``Optical Character Recognition`` technology.
+
+.. _digital_method_section:
+TXT
+^^^^^^^^
+Extract chars via third-party library, currently we use ``pymupdf``. 
+
+
+
+Check :doc:`../../api/classes` for more intuitions or check :doc:`../../api/dataset` for more details
+

+ 25 - 0
next_docs/en/user_guide/data/io.rst

@@ -0,0 +1,25 @@
+
+IO
+===
+
+Aims to read or write bytes from different media. Currently we provide ``S3Reader`` and ``S3Writer`` for AWS S3-compatible media,
+and ``HttpReader`` and ``HttpWriter`` for remote HTTP files. You can implement new classes to meet the needs of your own scenarios
+if MinerU does not provide a suitable class. It is easy to implement new classes; the only requirement is to inherit from
+``IOReader`` or ``IOWriter``.
+
+.. code:: python
+
+    class SomeReader(IOReader):
+        def read(self, path: str) -> bytes:
+            pass
+
+        def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
+            pass
+
+
+    class SomeWriter(IOWriter):
+        def write(self, path: str, data: bytes) -> None:
+            pass
+
+Check :doc:`../../api/classes` for more intuitions or check :doc:`../../api/io` for more details
+

+ 58 - 0
next_docs/en/user_guide/data/read_api.rst

@@ -0,0 +1,58 @@
+
+read_api 
+==========
+
+Read the content from a file or directory to create a ``Dataset``. Currently we provide several functions that cover common scenarios.
+If you have a new scenario that is common to most users, you can post it on the official GitHub issues with a detailed description.
+It is also easy to implement your own read-related functions.
+
+
+Important Functions
+-------------------
+
+
+read_jsonl
+^^^^^^^^^^^^^^^^
+
+Read the content from a jsonl file which may be located on the local machine or remote s3. If you want to know more about jsonl, please go to :doc:`../../additional_notes/glossary`.
+
+.. code:: python
+
+    # read jsonl from local machine 
+    datasets = read_jsonl("tt.jsonl", None)
+
+    # read jsonl from remote s3
+    datasets = read_jsonl("s3://bucket_1/tt.jsonl", s3_reader)
+
+
+read_local_pdfs
+^^^^^^^^^^^^^^^^
+
+Read pdf from path or directory.
+
+
+.. code:: python
+
+    # read pdf path
+    datasets = read_local_pdfs("tt.pdf")
+
+    # read pdfs under directory
+    datasets = read_local_pdfs("pdfs/")
+
+
+read_local_images
+^^^^^^^^^^^^^^^^^^^
+
+Read images from path or directory
+
+.. code:: python 
+
+    # read from image path 
+    datasets = read_local_images("tt.png")
+
+
+    # read files from directory that endswith suffix in suffixes array 
+    datasets = read_local_images("images/", suffixes=["png", "jpg"])
+
+
+Check :doc:`../../api/read_api` for more details

+ 12 - 0
next_docs/en/user_guide/install.rst

@@ -0,0 +1,12 @@
+
+Installation
+==============
+
+.. toctree::
+   :maxdepth: 1
+
+   install/install
+   install/boost_with_cuda
+   install/download_model_weight_files
+
+

+ 315 - 0
next_docs/en/user_guide/install/boost_with_cuda.rst

@@ -0,0 +1,315 @@
+
+Boost With Cuda 
+================
+
+
+If your device supports CUDA and meets the GPU requirements of the
+mainline environment, you can use GPU acceleration. Please select the
+appropriate guide based on your system:
+
+-  :ref:`ubuntu_22_04_lts_section`
+-  :ref:`windows_10_or_11_section`
+
+-  Quick Deployment with Docker > Docker requires a GPU with at least
+   16GB of VRAM, and all acceleration features are enabled by default.
+
+.. note:: 
+
+   Before running this Docker, you can use the following command to
+   check if your device supports CUDA acceleration on Docker. 
+
+   bash  docker run --rm --gpus=all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
+
+.. code:: sh
+
+   wget https://github.com/opendatalab/MinerU/raw/master/Dockerfile
+   docker build -t mineru:latest .
+   docker run --rm -it --gpus=all mineru:latest /bin/bash
+   magic-pdf --help
+
+.. _ubuntu_22_04_lts_section:
+
+Ubuntu 22.04 LTS
+-----------------
+
+1. Check if NVIDIA Drivers Are Installed
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code:: sh
+
+   nvidia-smi
+
+If you see information similar to the following, it means that the
+NVIDIA drivers are already installed, and you can skip Step 2.
+
+Notice: ``CUDA Version`` should be >= 12.1. If the displayed version
+number is less than 12.1, please upgrade the driver.
+
+.. code:: text
+
+   +---------------------------------------------------------------------------------------+
+   | NVIDIA-SMI 537.34                 Driver Version: 537.34       CUDA Version: 12.2     |
+   |-----------------------------------------+----------------------+----------------------+
+   | GPU  Name                     TCC/WDDM  | Bus-Id        Disp.A | Volatile Uncorr. ECC |
+   | Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
+   |                                         |                      |               MIG M. |
+   |=========================================+======================+======================|
+   |   0  NVIDIA GeForce RTX 3060 Ti   WDDM  | 00000000:01:00.0  On |                  N/A |
+   |  0%   51C    P8              12W / 200W |   1489MiB /  8192MiB |      5%      Default |
+   |                                         |                      |                  N/A |
+   +-----------------------------------------+----------------------+----------------------+
+
+2. Install the Driver
+~~~~~~~~~~~~~~~~~~~~~
+
+If no driver is installed, use the following command:
+
+.. code:: sh
+
+   sudo apt-get update
+   sudo apt-get install nvidia-driver-545
+
+Install the proprietary driver and restart your computer after
+installation.
+
+.. code:: sh
+
+   reboot
+
+3. Install Anaconda
+~~~~~~~~~~~~~~~~~~~
+
+If Anaconda is already installed, skip this step.
+
+.. code:: sh
+
+   wget https://repo.anaconda.com/archive/Anaconda3-2024.06-1-Linux-x86_64.sh
+   bash Anaconda3-2024.06-1-Linux-x86_64.sh
+
+In the final step, enter ``yes``, close the terminal, and reopen it.
+
+4. Create an Environment Using Conda
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Specify Python version 3.10.
+
+.. code:: sh
+
+   conda create -n MinerU python=3.10
+   conda activate MinerU
+
+5. Install Applications
+~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code:: sh
+
+   pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com
+
+❗ After installation, make sure to check the version of ``magic-pdf``
+using the following command:
+
+.. code:: sh
+
+   magic-pdf --version
+
+If the version number is less than 0.7.0, please report the issue.
+
+6. Download Models
+~~~~~~~~~~~~~~~~~~
+
+Refer to detailed instructions on :doc:`download_model_weight_files`
+
+7. Understand the Location of the Configuration File
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+After completing the `6. Download Models <#6-download-models>`__ step,
+the script will automatically generate a ``magic-pdf.json`` file in the
+user directory and configure the default model path. You can find the
+``magic-pdf.json`` file in your user directory.
+
+   The user directory for Linux is “/home/username”.
+
+8. First Run
+~~~~~~~~~~~~
+
+Download a sample file from the repository and test it.
+
+.. code:: sh
+
+   wget https://github.com/opendatalab/MinerU/raw/master/demo/small_ocr.pdf
+   magic-pdf -p small_ocr.pdf
+
+9. Test CUDA Acceleration
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If your graphics card has at least **8GB** of VRAM, follow these steps
+to test CUDA acceleration:
+
+   ❗ Due to the extremely limited nature of 8GB VRAM for running this
+   application, you need to close all other programs using VRAM to
+   ensure that 8GB of VRAM is available when running this application.
+
+1. Modify the value of ``"device-mode"`` in the ``magic-pdf.json``
+   configuration file located in your home directory.
+
+   .. code:: json
+
+      {
+        "device-mode": "cuda"
+      }
+
+2. Test CUDA acceleration with the following command:
+
+   .. code:: sh
+
+      magic-pdf -p small_ocr.pdf
+
+10. Enable CUDA Acceleration for OCR
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+1. Download ``paddlepaddle-gpu``. Installation will automatically enable
+   OCR acceleration.
+
+   .. code:: sh
+
+      python -m pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
+
+2. Test OCR acceleration with the following command:
+
+   .. code:: sh
+
+      magic-pdf -p small_ocr.pdf
+
+.. _windows_10_or_11_section:
+
+Windows 10/11
+--------------
+
+1. Install CUDA and cuDNN
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Required versions: CUDA 11.8 + cuDNN 8.7.0
+
+-  CUDA 11.8: https://developer.nvidia.com/cuda-11-8-0-download-archive
+-  cuDNN v8.7.0 (November 28th, 2022), for CUDA 11.x:
+   https://developer.nvidia.com/rdp/cudnn-archive
+
+2. Install Anaconda
+~~~~~~~~~~~~~~~~~~~
+
+If Anaconda is already installed, you can skip this step.
+
+Download link: https://repo.anaconda.com/archive/Anaconda3-2024.06-1-Windows-x86_64.exe
+
+3. Create an Environment Using Conda
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Python version must be 3.10.
+
+::
+
+   conda create -n MinerU python=3.10
+   conda activate MinerU
+
+4. Install Applications
+~~~~~~~~~~~~~~~~~~~~~~~
+
+::
+
+   pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com
+
+..
+
+   ❗️After installation, verify the version of ``magic-pdf``:
+
+   .. code:: bash
+
+      magic-pdf --version
+
+   If the version number is less than 0.7.0, please report it in the
+   issues section.
+
+5. Download Models
+~~~~~~~~~~~~~~~~~~
+
+Refer to detailed instructions on :doc:`download_model_weight_files`
+
+6. Understand the Location of the Configuration File
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+After completing the `5. Download Models <#5-download-models>`__ step,
+the script will automatically generate a ``magic-pdf.json`` file in the
+user directory and configure the default model path. You can find the
+``magic-pdf.json`` file in your 【user directory】 .
+
+   The user directory for Windows is “C:/Users/username”.
+
+7. First Run
+~~~~~~~~~~~~
+
+Download a sample file from the repository and test it.
+
+.. code:: powershell
+
+     wget https://github.com/opendatalab/MinerU/raw/master/demo/small_ocr.pdf -O small_ocr.pdf
+     magic-pdf -p small_ocr.pdf
+
+8. Test CUDA Acceleration
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If your graphics card has at least 8GB of VRAM, follow these steps to
+test CUDA-accelerated parsing performance.
+
+   ❗ Due to the extremely limited nature of 8GB VRAM for running this
+   application, you need to close all other programs using VRAM to
+   ensure that 8GB of VRAM is available when running this application.
+
+1. **Overwrite the installation of torch and torchvision** supporting
+   CUDA.
+
+   ::
+
+      pip install --force-reinstall torch==2.3.1 torchvision==0.18.1 --index-url https://download.pytorch.org/whl/cu118
+
+   ..
+
+      ❗️Ensure the following versions are specified in the command:
+
+      ::
+
+         torch==2.3.1 torchvision==0.18.1
+
+      These are the highest versions we support. Installing higher
+      versions without specifying them will cause the program to fail.
+
+2. **Modify the value of ``"device-mode"``** in the ``magic-pdf.json``
+   configuration file located in your user directory.
+
+   .. code:: json
+
+      {
+        "device-mode": "cuda"
+      }
+
+3. **Run the following command to test CUDA acceleration**:
+
+   ::
+
+      magic-pdf -p small_ocr.pdf
+
+9. Enable CUDA Acceleration for OCR
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+1. **Download paddlepaddle-gpu**, which will automatically enable OCR
+   acceleration upon installation.
+
+   ::
+
+      pip install paddlepaddle-gpu==2.6.1
+
+2. **Run the following command to test OCR acceleration**:
+
+   ::
+
+      magic-pdf -p small_ocr.pdf
+

+ 48 - 0
next_docs/en/user_guide/install/download_model_weight_files.rst

@@ -0,0 +1,48 @@
+
+Download Model Weight Files
+==============================
+
+Model downloads are divided into initial downloads and updates to the
+model directory. Please refer to the corresponding documentation for
+instructions on how to proceed.
+
+Initial download of model files
+-------------------------------
+
+1. Download the Model from Hugging Face
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Use a Python Script to Download Model Files from Hugging Face
+
+.. code:: bash
+
+   pip install huggingface_hub
+   wget https://github.com/opendatalab/MinerU/raw/master/docs/download_models_hf.py -O download_models_hf.py
+   python download_models_hf.py
+
+The Python script will automatically download the model files and
+configure the model directory in the configuration file.
+
+The configuration file can be found in the user directory, with the
+filename ``magic-pdf.json``.
+
+How to update models previously downloaded
+------------------------------------------
+
+1. Models downloaded via Git LFS
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+   Due to feedback from some users that downloading model files using
+   git lfs was incomplete or resulted in corrupted model files, this
+   method is no longer recommended.
+
+If you previously downloaded model files via git lfs, you can navigate
+to the previous download directory and use the ``git pull`` command to
+update the model.
+
+2. Models downloaded via Hugging Face or Model Scope
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If you previously downloaded models via Hugging Face or Model Scope, you
+can rerun the Python script used for the initial download. This will
+automatically update the model directory to the latest version.

+ 107 - 0
next_docs/en/user_guide/install/install.rst

@@ -0,0 +1,107 @@
+
+Install 
+===============================================================
+If you encounter any installation issues, please first consult the FAQ.
+If the parsing results are not as expected, refer to the Known Issues.
+There are three different ways to experience MinerU.
+
+Pre-installation Notice—Hardware and Software Environment Support
+------------------------------------------------------------------
+
+To ensure the stability and reliability of the project, we only optimize
+and test for specific hardware and software environments during
+development. This ensures that users deploying and running the project
+on recommended system configurations will get the best performance with
+the fewest compatibility issues.
+
+By focusing resources on the mainline environment, our team can more
+efficiently resolve potential bugs and develop new features.
+
+In non-mainline environments, due to the diversity of hardware and
+software configurations, as well as third-party dependency compatibility
+issues, we cannot guarantee 100% project availability. Therefore, for
+users who wish to use this project in non-recommended environments, we
+suggest carefully reading the documentation and FAQ first. Most issues
+already have corresponding solutions in the FAQ. We also encourage
+community feedback to help us gradually expand support.
+
+.. raw:: html
+
+   <style>
+      table, th, td {
+      border: 1px solid black;
+      border-collapse: collapse;
+      }
+   </style>
+   <table>
+    <tr>
+        <td colspan="3" rowspan="2">Operating System</td>
+    </tr>
+    <tr>
+        <td>Ubuntu 22.04 LTS</td>
+        <td>Windows 10 / 11</td>
+        <td>macOS 11+</td>
+    </tr>
+    <tr>
+        <td colspan="3">CPU</td>
+        <td>x86_64</td>
+        <td>x86_64</td>
+        <td>x86_64 / arm64</td>
+    </tr>
+    <tr>
+        <td colspan="3">Memory</td>
+        <td colspan="3">16GB or more, recommended 32GB+</td>
+    </tr>
+    <tr>
+        <td colspan="3">Python Version</td>
+        <td colspan="3">3.10</td>
+    </tr>
+    <tr>
+        <td colspan="3">Nvidia Driver Version</td>
+        <td>latest (Proprietary Driver)</td>
+        <td>latest</td>
+        <td>None</td>
+    </tr>
+    <tr>
+        <td colspan="3">CUDA Environment</td>
+        <td>Automatic installation [12.1 (pytorch) + 11.8 (paddle)]</td>
+        <td>11.8 (manual installation) + cuDNN v8.7.0 (manual installation)</td>
+        <td>None</td>
+    </tr>
+    <tr>
+        <td rowspan="2">GPU Hardware Support List</td>
+        <td colspan="2">Minimum Requirement 8G+ VRAM</td>
+        <td colspan="2">3060ti/3070/3080/3080ti/4060/4070/4070ti<br>
+        8G VRAM enables layout, formula recognition acceleration and OCR acceleration</td>
+        <td rowspan="2">None</td>
+    </tr>
+    <tr>
+        <td colspan="2">Recommended Configuration 16G+ VRAM</td>
+        <td colspan="2">3090/3090ti/4070ti super/4080/4090<br>
+        16G VRAM or more can enable layout, formula recognition, OCR acceleration and table recognition acceleration simultaneously
+        </td>
+    </tr>
+   </table>
+
+
+Create an environment
+~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: shell
+
+    conda create -n MinerU python=3.10
+    conda activate MinerU
+    pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com
+
+
+Download model weight files
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: shell
+
+    pip install huggingface_hub
+    wget https://github.com/opendatalab/MinerU/raw/master/docs/download_models_hf.py -O download_models_hf.py
+    python download_models_hf.py    
+
+
+The MinerU is installed, Check out :doc:`../quick_start` or reading :doc:`boost_with_cuda` for accelerate inference

+ 13 - 0
next_docs/en/user_guide/quick_start.rst

@@ -0,0 +1,13 @@
+
+Quick Start 
+==============
+
+Eager to get started? This page gives a good introduction to MinerU. Follow Installation to set up a project and install MinerU first.
+
+
+.. toctree::
+    :maxdepth: 1
+
+    quick_start/command_line
+    quick_start/extract_text
+

+ 60 - 0
next_docs/en/user_guide/quick_start/command_line.rst

@@ -0,0 +1,60 @@
+
+
+Command Line
+===================
+
+.. code:: bash
+
+   magic-pdf --help
+   Usage: magic-pdf [OPTIONS]
+
+   Options:
+     -v, --version                display the version and exit
+     -p, --path PATH              local pdf filepath or directory  [required]
+     -o, --output-dir PATH        output local directory  [required]
+     -m, --method [ocr|txt|auto]  the method for parsing pdf. ocr: using ocr
+                                  technique to extract information from pdf. txt:
+                                  suitable for the text-based pdf only and
+                                  outperform ocr. auto: automatically choose the
+                                  best method for parsing pdf from ocr and txt.
+                                  without method specified, auto will be used by
+                                  default.
+     -l, --lang TEXT              Input the languages in the pdf (if known) to
+                                  improve OCR accuracy.  Optional. You should
+                                  input "Abbreviation" with language form url: ht
+                                  tps://paddlepaddle.github.io/PaddleOCR/en/ppocr
+                                  /blog/multi_languages.html#5-support-languages-
+                                  and-abbreviations
+     -d, --debug BOOLEAN          Enables detailed debugging information during
+                                  the execution of the CLI commands.
+     -s, --start INTEGER          The starting page for PDF parsing, beginning
+                                  from 0.
+     -e, --end INTEGER            The ending page for PDF parsing, beginning from
+                                  0.
+     --help                       Show this message and exit.
+
+
+   ## show version
+   magic-pdf -v
+
+   ## command line example
+   magic-pdf -p {some_pdf} -o {some_output_dir} -m auto
+
+``{some_pdf}`` can be a single PDF file or a directory containing
+multiple PDFs. The results will be saved in the ``{some_output_dir}``
+directory. The output file list is as follows:
+
+.. code:: text
+
+   ├── some_pdf.md                          # markdown file
+   ├── images                               # directory for storing images
+   ├── some_pdf_layout.pdf                  # layout diagram
+   ├── some_pdf_middle.json                 # MinerU intermediate processing result
+   ├── some_pdf_model.json                  # model inference result
+   ├── some_pdf_origin.pdf                  # original PDF file
+   ├── some_pdf_spans.pdf                   # smallest granularity bbox position information diagram
+   └── some_pdf_content_list.json           # Rich text JSON arranged in reading order
+
+For more information about the output files, please refer to the `Output
+File Description <docs/output_file_en_us.md>`__.
+

+ 10 - 0
next_docs/en/user_guide/quick_start/extract_text.rst

@@ -0,0 +1,10 @@
+
+
+Extract Content from Pdf
+========================
+
+.. code:: python
+
+    from magic_pdf.data.read_api import read_local_pdfs
+    from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze

+ 5 - 0
next_docs/en/user_guide/tutorial.rst

@@ -0,0 +1,5 @@
+
+Tutorial
+----------
+
+From beginning to end, this shows how to use MinerU via a minimal project.

+ 5 - 4
next_docs/requirements.txt

@@ -5,7 +5,8 @@ Pillow==8.4.0
 pydantic>=2.7.2,<2.8.0
 PyMuPDF>=1.24.9
 sphinx
-sphinx-argparse
-sphinx-book-theme
-sphinx-copybutton
-sphinx_rtd_theme
+sphinx-argparse>=0.5.2
+sphinx-book-theme>=1.1.3
+sphinx-copybutton>=0.5.2
+sphinx_rtd_theme>=3.0.1
+autodoc_pydantic>=2.2.0