Ver código fonte

docs: rewrite install and usage docs

xu rui 11 meses atrás
pai
commit
6ca86beaa3

+ 9 - 6
magic_pdf/data/read_api.py

@@ -59,17 +59,20 @@ def read_local_pdfs(path: str) -> list[PymuDocDataset]:
     """
     if os.path.isdir(path):
         reader = FileBasedDataReader(path)
-        return [
-            PymuDocDataset(reader.read(doc_path.name))
-            for doc_path in Path(path).glob('*.pdf')
-        ]
+        ret = []
+        for root, _, files in os.walk(path):
+            for file in files:
+                suffix = file.split('.')
+                if suffix[-1] == 'pdf':
+                    ret.append( PymuDocDataset(reader.read(os.path.join(root, file))))
+        return ret
     else:
         reader = FileBasedDataReader()
         bits = reader.read(path)
         return [PymuDocDataset(bits)]
 
 
-def read_local_images(path: str, suffixes: list[str]) -> list[ImageDataset]:
+def read_local_images(path: str, suffixes: list[str]=[]) -> list[ImageDataset]:
     """Read images from path or directory.
 
     Args:
@@ -87,7 +90,7 @@ def read_local_images(path: str, suffixes: list[str]) -> list[ImageDataset]:
             for file in files:
                 suffix = file.split('.')
                 if suffix[-1] in s_suffixes:
-                    imgs_bits.append(reader.read(file))
+                    imgs_bits.append(reader.read(os.path.join(root, file)))
         return [ImageDataset(bits) for bits in imgs_bits]
     else:
         reader = FileBasedDataReader()

+ 6 - 0
next_docs/en/index.rst

@@ -70,6 +70,12 @@ Key Features
 -  Supports both CPU and GPU environments.
 -  Compatible with Windows, Linux, and Mac platforms.
 
+
+.. tip::
+
+   Get started with MinerU by trying the `online demo <https://www.modelscope.cn/studios/OpenDataLab/MinerU>`_ or :doc:`installing it locally <user_guide/install/install>`.
+
+
 User Guide
 -------------
 .. toctree::

+ 3 - 1
next_docs/en/user_guide.rst

@@ -4,7 +4,9 @@
     :maxdepth: 2
 
     user_guide/install
+    user_guide/usage
     user_guide/quick_start
     user_guide/tutorial
     user_guide/data
-    
+    user_guide/inference_result
+    user_guide/pipe_result

+ 11 - 11
next_docs/en/user_guide/data/data_reader_writer.rst

@@ -125,16 +125,16 @@ Read Examples
 
     # s3 related
     s3_reader1 = S3DataReader(
-        default_prefix_without_bucket = "test_prefix"
-        bucket: "test_bucket",
-        ak: "ak",
-        sk: "sk",
-        endpoint_url: "localhost"
+        "test_prefix",
+        "test_bucket",
+        "ak",
+        "sk",
+        "localhost"
     )
 
     ## will read s3://test_bucket/test_prefix/abc 
     s3_reader1.read('abc')
-   
+
     ## will read s3://test_bucket/efg
     s3_reader1.read('s3://test_bucket/efg')
 
@@ -188,11 +188,11 @@ Write Examples
 
     # s3 related
     s3_writer1 = S3DataWriter(
-        default_prefix_without_bucket = "test_prefix"
-        bucket: "test_bucket",
-        ak: "ak",
-        sk: "sk",
-        endpoint_url: "localhost"
+        "test_prefix",
+        "test_bucket",
+        "ak",
+        "sk",
+        "localhost"
     )
 
     ## will write 123 to s3://test_bucket/test_prefix/abc 

+ 33 - 7
next_docs/en/user_guide/data/read_api.rst

@@ -18,24 +18,50 @@ Read the contet from jsonl which may located on local machine or remote s3. if y
 
 .. code:: python
 
-    from magic_pdf.data.io.read_api import *
+    from magic_pdf.data.read_api import *
+    from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
+    from magic_pdf.data.schemas import S3Config
 
-    # read jsonl from local machine 
-    datasets = read_jsonl("tt.jsonl", None)
+    # read jsonl from local machine
+    datasets = read_jsonl("tt.jsonl", None)   # replace with real jsonl file
 
     # read jsonl from remote s3
-    datasets = read_jsonl("s3://bucket_1/tt.jsonl", s3_reader)
 
+    bucket = "bucket_1"                     # replace with real s3 bucket
+    ak = "access_key_1"                     # replace with real s3 access key
+    sk = "secret_key_1"                     # replace with real s3 secret key
+    endpoint_url = "endpoint_url_1"         # replace with real s3 endpoint url
+
+    bucket_2 = "bucket_2"                   # replace with real s3 bucket
+    ak_2 = "access_key_2"                   # replace with real s3 access key
+    sk_2 = "secret_key_2"                   # replace with real s3 secret key
+    endpoint_url_2 = "endpoint_url_2"       # replace with real s3 endpoint url
+
+    s3configs = [
+        S3Config(
+            bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
+        ),
+        S3Config(
+            bucket_name=bucket_2,
+            access_key=ak_2,
+            secret_key=sk_2,
+            endpoint_url=endpoint_url_2,
+        ),
+    ]
+
+    s3_reader = MultiBucketS3DataReader(bucket, s3configs)
+
+    datasets = read_jsonl(f"s3://{bucket}/tt.jsonl", s3_reader)  # replace with real s3 jsonl file
 
 read_local_pdfs
-^^^^^^^^^^^^^^^^
+^^^^^^^^^^^^^^^^^
 
 Read pdf from path or directory.
 
 
 .. code:: python
 
-    from magic_pdf.data.io.read_api import *
+    from magic_pdf.data.read_api import *
 
     # read pdf path
     datasets = read_local_pdfs("tt.pdf")
@@ -51,7 +77,7 @@ Read images from path or directory
 
 .. code:: python 
 
-    from magic_pdf.data.io.read_api import *
+    from magic_pdf.data.read_api import *
 
     # read from image path 
     datasets = read_local_images("tt.png")

+ 7 - 0
next_docs/en/user_guide/inference_result.rst

@@ -0,0 +1,7 @@
+
+
+Inference Result 
+==================
+
+
+

+ 1 - 1
next_docs/en/user_guide/install.rst

@@ -8,5 +8,5 @@ Installation
    install/install
    install//boost_with_cuda
    install/download_model_weight_files
-
+   install/config
 

+ 0 - 18
next_docs/en/user_guide/install/boost_with_cuda.rst

@@ -9,25 +9,7 @@ appropriate guide based on your system:
 
 -  :ref:`ubuntu_22_04_lts_section`
 -  :ref:`windows_10_or_11_section`
--  Quick Deployment with Docker
 
-.. admonition:: Important
-   :class: tip
-
-   Docker requires a GPU with at least 16GB of VRAM, and all acceleration features are enabled by default.
-
-   Before running this Docker, you can use the following command to check if your device supports CUDA acceleration on Docker. 
-
-   .. code-block:: bash
-
-      bash  docker run --rm --gpus=all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
-
-.. code:: sh
-
-   wget https://github.com/opendatalab/MinerU/raw/master/Dockerfile
-   docker build -t mineru:latest .
-   docker run --rm -it --gpus=all mineru:latest /bin/bash
-   magic-pdf --help
 
 .. _ubuntu_22_04_lts_section:
 

+ 157 - 0
next_docs/en/user_guide/install/config.rst

@@ -0,0 +1,157 @@
+
+
+Config
+=========
+
+The **magic-pdf.json** file is typically located in the ``${HOME}`` directory on Linux, or in the ``C:\Users\{username}`` directory on Windows.
+
+
+magic-pdf.json
+----------------
+
+.. code:: json 
+
+    {
+        "bucket_info":{
+            "bucket-name-1":["ak", "sk", "endpoint"],
+            "bucket-name-2":["ak", "sk", "endpoint"]
+        },
+        "models-dir":"/tmp/models",
+        "layoutreader-model-dir":"/tmp/layoutreader",
+        "device-mode":"cpu",
+        "layout-config": {
+            "model": "layoutlmv3"
+        },
+        "formula-config": {
+            "mfd_model": "yolo_v8_mfd",
+            "mfr_model": "unimernet_small",
+            "enable": true
+        },
+        "table-config": {
+            "model": "rapid_table",
+            "enable": false,
+            "max_time": 400    
+        },
+        "config_version": "1.0.0"
+    }
+
+
+
+
+bucket_info
+^^^^^^^^^^^^^^
+Stores the access_key, secret_key and endpoint for each AWS S3-compatible storage bucket.
+
+Example: 
+
+.. code:: text
+
+        {
+            "image_bucket":[{access_key}, {secret_key}, {endpoint}],
+            "video_bucket":[{access_key}, {secret_key}, {endpoint}]
+        }
+
+
+models-dir
+^^^^^^^^^^^^
+
+Stores the models downloaded from **huggingface** or **modelscope**. You do not need to modify this field if you download the models using the scripts shipped with **MinerU**.
+
+
+layoutreader-model-dir
+^^^^^^^^^^^^^^^^^^^^^^^
+
+Stores the layoutreader models downloaded from **huggingface** or **modelscope**. You do not need to modify this field if you download the models using the scripts shipped with **MinerU**.
+
+
+device-mode
+^^^^^^^^^^^^^^
+
+This field has two options, **cpu** or **cuda**.
+
+**cpu**: inference via cpu
+
+**cuda**: using cuda to accelerate inference
+
+
+layout-config 
+^^^^^^^^^^^^^^^
+
+.. code:: json
+
+    {
+        "model": "layoutlmv3"  
+    }
+
+The layout model cannot be disabled at present, and only one kind of layout model is currently available.
+
+
+formula-config
+^^^^^^^^^^^^^^^^
+
+.. code:: json
+
+    {
+        "mfd_model": "yolo_v8_mfd",   
+        "mfr_model": "unimernet_small",
+        "enable": true 
+    }
+
+
+mfd_model
+""""""""""
+
+Specify the formula detection model, options are ['yolo_v8_mfd']
+
+
+mfr_model
+""""""""""
+Specify the formula recognition model, options are ['unimernet_small']
+
+Check `UniMERNet <https://github.com/opendatalab/UniMERNet>`_ for more details
+
+
+enable
+""""""""
+
+on-off flag, options are [true, false]. **true** means enable formula inference, **false** means disable formula inference
+
+
+table-config
+^^^^^^^^^^^^^^^^
+
+.. code:: json
+
+    {
+        "model": "rapid_table",
+        "enable": false,
+        "max_time": 400    
+    }
+
+model
+""""""""
+
+Specify the table inference model, options are ['rapid_table', 'tablemaster', 'struct_eqtable']
+
+
+max_time
+"""""""""
+
+Since table recognition is a time-consuming process, we set a timeout period. If the process exceeds this time, the table recognition will be terminated.
+
+
+
+enable
+"""""""
+
+on-off flag, options are [true, false]. **true** means enable table inference, **false** means disable table inference
+
+
+config_version
+^^^^^^^^^^^^^^^^
+
+The version of config schema.
+
+
+
+Check `Config Schema <https://github.com/opendatalab/MinerU/blob/master/magic-pdf.template.json>`_ for the latest config schema.

+ 4 - 1
next_docs/en/user_guide/install/install.rst

@@ -4,6 +4,7 @@ Install
 If you encounter any installation issues, please first consult the :doc:`../../additional_notes/faq`.
 If the parsing results are not as expected, refer to the :doc:`../../additional_notes/known_issues`.
 
+You can also try the `online demo <https://www.modelscope.cn/studios/OpenDataLab/MinerU>`_ without installing anything.
 
 .. admonition:: Warning
     :class: tip
@@ -107,4 +108,6 @@ Download model weight files
     python download_models_hf.py    
 
 
-The MinerU is installed, Check out :doc:`../quick_start` or reading :doc:`boost_with_cuda` for accelerate inference
+.. tip::
+
+    MinerU is now installed. Check out :doc:`../usage/command_line` to convert your first PDF, **or** read the following sections for more details about installation.

+ 5 - 0
next_docs/en/user_guide/pipe_result.rst

@@ -0,0 +1,5 @@
+
+
+Pipe Result 
+==============
+

+ 6 - 4
next_docs/en/user_guide/quick_start.rst

@@ -2,12 +2,14 @@
 Quick Start 
 ==============
 
-Eager to get started? This page gives a good introduction to MinerU. Follow Installation to set up a project and install MinerU first.
-
+Want to learn how to use MinerU in different scenarios? This page provides examples covering multiple use cases to match your needs.
 
 .. toctree::
     :maxdepth: 1
 
-    quick_start/command_line
-    quick_start/to_markdown
+    quick_start/convert_pdf 
+    quick_start/convert_images
+    quick_start/convert_ppt
+    quick_start/convert_word 
+    quick_start/convert_directory
 

+ 8 - 0
next_docs/en/user_guide/quick_start/convert_directory.rst

@@ -0,0 +1,8 @@
+
+
+Convert Files Under Directory 
+=================================
+
+.. code:: python 
+
+    

+ 5 - 0
next_docs/en/user_guide/quick_start/convert_images.rst

@@ -0,0 +1,5 @@
+
+
+Convert Images 
+================
+

+ 5 - 0
next_docs/en/user_guide/quick_start/convert_pdf.rst

@@ -0,0 +1,5 @@
+
+
+Convert PDF 
+============
+

+ 5 - 0
next_docs/en/user_guide/quick_start/convert_ppt.rst

@@ -0,0 +1,5 @@
+
+
+Convert PPT 
+============
+

+ 6 - 0
next_docs/en/user_guide/quick_start/convert_word.rst

@@ -0,0 +1,6 @@
+
+
+Convert Word 
+=============
+
+

+ 12 - 0
next_docs/en/user_guide/usage.rst

@@ -0,0 +1,12 @@
+
+
+Usage
+========
+
+.. toctree::
+   :maxdepth: 1
+
+   usage/command_line
+   usage/api
+   usage/docker
+

+ 2 - 4
next_docs/en/user_guide/quick_start/to_markdown.rst → next_docs/en/user_guide/usage/api.rst

@@ -1,8 +1,6 @@
 
-
-Convert To Markdown
-========================
-
+API Usage
+===========
 
 Local File Example
 ^^^^^^^^^^^^^^^^^^

+ 2 - 2
next_docs/en/user_guide/quick_start/command_line.rst → next_docs/en/user_guide/usage/command_line.rst

@@ -57,6 +57,6 @@ directory. The output file list is as follows:
 
 .. admonition:: Tip
    :class: tip
+   
 
-   For more information about the output files, please refer to the :doc:`../tutorial/output_file_description`
-
+   For more information about the output files, please refer to the :doc:`TODO: modify link <../tutorial/output_file_description>`

+ 24 - 0
next_docs/en/user_guide/usage/docker.rst

@@ -0,0 +1,24 @@
+
+
+Docker 
+=======
+
+.. admonition:: Important
+   :class: tip
+
+   Docker requires a GPU with at least 16GB of VRAM, and all acceleration features are enabled by default.
+
+   Before running this Docker, you can use the following command to check if your device supports CUDA acceleration on Docker. 
+
+   .. code-block:: bash
+
+      docker run --rm --gpus=all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
+
+
+.. code:: sh
+
+   wget https://github.com/opendatalab/MinerU/raw/master/Dockerfile
+   docker build -t mineru:latest .
+   docker run --rm -it --gpus=all mineru:latest /bin/bash
+   magic-pdf --help
+

+ 10 - 10
next_docs/zh_cn/user_guide/data/data_reader_writer.rst

@@ -111,11 +111,11 @@ S3DataReader 基于 MultiBucketS3DataReader 构建,但仅支持单个桶。S3D
 
     # S3 相关的
     s3_reader1 = S3DataReader(
-        default_prefix_without_bucket = "test_prefix",
-        bucket: "test_bucket",
-        ak: "ak",
-        sk: "sk",
-        endpoint_url: "localhost"
+        "test_prefix",
+        "test_bucket",
+        "ak",
+        "sk",
+        "localhost"
     )
 
     ## 将读取 s3://test_bucket/test_prefix/abc 
@@ -172,11 +172,11 @@ S3DataReader 基于 MultiBucketS3DataReader 构建,但仅支持单个桶。S3D
 
     # S3 相关的
     s3_writer1 = S3DataWriter(
-        default_prefix_without_bucket = "test_prefix",
-        bucket: "test_bucket",
-        ak: "ak",
-        sk: "sk",
-        endpoint_url: "localhost"
+        "test_prefix",
+        "test_bucket",
+        "ak",
+        "sk",
+        "localhost"
     )
 
     ## 将写入 123 到 s3://test_bucket/test_prefix/abc 

+ 35 - 7
next_docs/zh_cn/user_guide/data/read_api.rst

@@ -15,13 +15,41 @@ read_jsonl
 
 .. code:: python
 
-    from magic_pdf.data.io.read_api import *
+    from magic_pdf.data.read_api import *
+    from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
+    from magic_pdf.data.schemas import S3Config
 
-    # 从本地机器读取 JSONL
-    datasets = read_jsonl("tt.jsonl", None)
+    # 读取本地 jsonl 文件
+    datasets = read_jsonl("tt.jsonl", None)   # 替换为有效的文件
+
+    # 读取 s3 jsonl 文件
+
+    bucket = "bucket_1"                     # 替换为有效的 s3 bucket
+    ak = "access_key_1"                     # 替换为有效的 s3 access key
+    sk = "secret_key_1"                     # 替换为有效的 s3 secret key
+    endpoint_url = "endpoint_url_1"         # 替换为有效的 s3 endpoint url
+
+    bucket_2 = "bucket_2"                   # 替换为有效的 s3 bucket
+    ak_2 = "access_key_2"                   # 替换为有效的 s3 access key
+    sk_2 = "secret_key_2"                   # 替换为有效的 s3 secret key
+    endpoint_url_2 = "endpoint_url_2"       # 替换为有效的 s3 endpoint url
+
+    s3configs = [
+        S3Config(
+            bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
+        ),
+        S3Config(
+            bucket_name=bucket_2,
+            access_key=ak_2,
+            secret_key=sk_2,
+            endpoint_url=endpoint_url_2,
+        ),
+    ]
+
+    s3_reader = MultiBucketS3DataReader(bucket, s3configs)
+
+    datasets = read_jsonl(f"s3://{bucket}/tt.jsonl", s3_reader)  # 替换为有效的 s3 jsonl file
 
-    # 从远程 S3 读取 JSONL
-    datasets = read_jsonl("s3://bucket_1/tt.jsonl", s3_reader)
 
 read_local_pdfs
 ^^^^^^^^^^^^^^^^
@@ -30,7 +58,7 @@ read_local_pdfs
 
 .. code:: python
 
-    from magic_pdf.data.io.read_api import *
+    from magic_pdf.data.read_api import *
 
     # 读取 PDF 路径
     datasets = read_local_pdfs("tt.pdf")
@@ -45,7 +73,7 @@ read_local_images
 
 .. code:: python
 
-    from magic_pdf.data.io.read_api import *
+    from magic_pdf.data.read_api import *
 
     # 从图像路径读取
     datasets = read_local_images("tt.png")