Procházet zdrojové kódy

Merge pull request #2622 from myhloli/dev

Dev
Xiaomeng Zhao před 5 měsíci
rodič
revize
bcbbee8cbd
100 změnil soubory, kde provedl 354 přidání a 4155 odebrání
  1. 9 8
      .github/workflows/python-package.yml
  2. 0 16
      .readthedocs.yaml
  3. 5 4
      README.md
  4. 5 4
      README_zh-CN.md
  5. 0 23
      demo/batch_demo.py
  6. 241 66
      demo/demo.py
  7. 0 51
      docker/ascend_npu/Dockerfile
  8. 6 24
      docker/china/Dockerfile
  9. 6 24
      docker/global/Dockerfile
  10. 0 51
      docs/README_Ascend_NPU_Acceleration_zh_CN.md
  11. 0 111
      docs/README_Ubuntu_CUDA_Acceleration_en_US.md
  12. 0 115
      docs/README_Ubuntu_CUDA_Acceleration_zh_CN.md
  13. 0 83
      docs/README_Windows_CUDA_Acceleration_en_US.md
  14. 0 86
      docs/README_Windows_CUDA_Acceleration_zh_CN.md
  15. 0 23
      docs/how_to_download_models_en.md
  16. 0 37
      docs/how_to_download_models_zh_cn.md
  17. 3 3
      mineru/backend/pipeline/batch_analyze.py
  18. 4 2
      mineru/backend/pipeline/model_json_to_middle_json.py
  19. 2 2
      mineru/backend/vlm/base_predictor.py
  20. 5 5
      mineru/backend/vlm/predictor.py
  21. 10 7
      mineru/backend/vlm/vlm_analyze.py
  22. 4 3
      mineru/cli/client.py
  23. 1 3
      mineru/cli/common.py
  24. 4 0
      mineru/model/vlm_sglang_model/model.py
  25. 15 4
      mineru/model/vlm_sglang_model/server.py
  26. 9 3
      mineru/utils/config_reader.py
  27. 4 2
      mineru/utils/draw_bbox.py
  28. 10 2
      mineru/utils/models_download_utils.py
  29. 11 1
      mineru/utils/ocr_utils.py
  30. 0 16
      next_docs/en/.readthedocs.yaml
  31. 0 20
      next_docs/en/Makefile
  32. binární
      next_docs/en/_static/image/MinerU-logo-hq.png
  33. binární
      next_docs/en/_static/image/MinerU-logo.png
  34. 0 13
      next_docs/en/_static/image/ReadTheDocs.svg
  35. binární
      next_docs/en/_static/image/datalab_logo.png
  36. binární
      next_docs/en/_static/image/flowchart_en.png
  37. binární
      next_docs/en/_static/image/flowchart_zh_cn.png
  38. binární
      next_docs/en/_static/image/inference_result.png
  39. binární
      next_docs/en/_static/image/layout_example.png
  40. binární
      next_docs/en/_static/image/logo.png
  41. 0 3
      next_docs/en/_static/image/pipeline.drawio.svg
  42. binární
      next_docs/en/_static/image/poly.png
  43. binární
      next_docs/en/_static/image/project_panorama_en.png
  44. binární
      next_docs/en/_static/image/project_panorama_zh_cn.png
  45. binární
      next_docs/en/_static/image/spans_example.png
  46. binární
      next_docs/en/_static/image/web_demo_1.png
  47. 0 88
      next_docs/en/additional_notes/faq.rst
  48. 0 14
      next_docs/en/additional_notes/glossary.rst
  49. 0 20
      next_docs/en/additional_notes/known_issues.rst
  50. 0 11
      next_docs/en/api.rst
  51. 0 44
      next_docs/en/api/data_reader_writer.rst
  52. 0 28
      next_docs/en/api/dataset.rst
  53. 0 33
      next_docs/en/api/io.rst
  54. 0 8
      next_docs/en/api/model_operators.rst
  55. 0 9
      next_docs/en/api/pipe_operators.rst
  56. 0 6
      next_docs/en/api/read_api.rst
  57. 0 10
      next_docs/en/api/schemas.rst
  58. 0 151
      next_docs/en/conf.py
  59. 0 111
      next_docs/en/index.rst
  60. 0 35
      next_docs/en/make.bat
  61. 0 12
      next_docs/en/user_guide.rst
  62. 0 19
      next_docs/en/user_guide/data.rst
  63. 0 236
      next_docs/en/user_guide/data/data_reader_writer.rst
  64. 0 40
      next_docs/en/user_guide/data/dataset.rst
  65. 0 25
      next_docs/en/user_guide/data/io.rst
  66. 0 106
      next_docs/en/user_guide/data/read_api.rst
  67. 0 144
      next_docs/en/user_guide/inference_result.rst
  68. 0 12
      next_docs/en/user_guide/install.rst
  69. 0 255
      next_docs/en/user_guide/install/boost_with_cuda.rst
  70. 0 168
      next_docs/en/user_guide/install/config.rst
  71. 0 37
      next_docs/en/user_guide/install/download_model_weight_files.rst
  72. 0 142
      next_docs/en/user_guide/install/install.rst
  73. 0 335
      next_docs/en/user_guide/pipe_result.rst
  74. 0 12
      next_docs/en/user_guide/quick_start.rst
  75. 0 47
      next_docs/en/user_guide/quick_start/convert_image.rst
  76. 0 60
      next_docs/en/user_guide/quick_start/convert_ms_office.rst
  77. 0 56
      next_docs/en/user_guide/quick_start/convert_pdf.rst
  78. 0 11
      next_docs/en/user_guide/tutorial.rst
  79. 0 412
      next_docs/en/user_guide/tutorial/output_file_description.rst
  80. 0 182
      next_docs/en/user_guide/tutorial/pipeline.rst
  81. 0 12
      next_docs/en/user_guide/usage.rst
  82. 0 279
      next_docs/en/user_guide/usage/api.rst
  83. 0 77
      next_docs/en/user_guide/usage/command_line.rst
  84. 0 24
      next_docs/en/user_guide/usage/docker.rst
  85. 0 17
      next_docs/requirements.txt
  86. 0 16
      next_docs/zh_cn/.readthedocs.yaml
  87. 0 20
      next_docs/zh_cn/Makefile
  88. binární
      next_docs/zh_cn/_static/image/MinerU-logo-hq.png
  89. binární
      next_docs/zh_cn/_static/image/MinerU-logo.png
  90. 0 13
      next_docs/zh_cn/_static/image/ReadTheDocs.svg
  91. binární
      next_docs/zh_cn/_static/image/datalab_logo.png
  92. binární
      next_docs/zh_cn/_static/image/flowchart_en.png
  93. binární
      next_docs/zh_cn/_static/image/flowchart_zh_cn.png
  94. binární
      next_docs/zh_cn/_static/image/inference_result.png
  95. binární
      next_docs/zh_cn/_static/image/layout_example.png
  96. binární
      next_docs/zh_cn/_static/image/logo.png
  97. 0 3
      next_docs/zh_cn/_static/image/pipeline.drawio.svg
  98. binární
      next_docs/zh_cn/_static/image/poly.png
  99. binární
      next_docs/zh_cn/_static/image/project_panorama_en.png
  100. binární
      next_docs/zh_cn/_static/image/project_panorama_zh_cn.png

+ 9 - 8
.github/workflows/python-package.yml

@@ -32,14 +32,14 @@ jobs:
 
       - name: Verify version.py
         run: |
-          ls -l magic_pdf/libs/version.py
-          cat magic_pdf/libs/version.py
+          ls -l mineru/version.py
+          cat mineru/version.py
 
       - name: Commit changes
         run: |
           git config --local user.email "moe@myhloli.com"
           git config --local user.name "myhloli"
-          git add magic_pdf/libs/version.py
+          git add mineru/version.py
           if git diff-index --quiet HEAD; then
             echo "No changes to commit"
           else
@@ -71,18 +71,18 @@ jobs:
 
     - name: Verify version.py
       run: |
-        ls -l magic_pdf/libs/version.py
-        cat magic_pdf/libs/version.py
+        ls -l mineru/version.py
+        cat mineru/version.py
 
     - name: Set up Python ${{ matrix.python-version }}
       uses: actions/setup-python@v5
       with:
         python-version: ${{ matrix.python-version }}
 
-    - name: Install magic-pdf
+    - name: Install mineru
       run: |
         python -m pip install --upgrade pip
-        pip install -e .[full]
+        pip install -e .[all]
 
   build:
     needs: [ check-install ]
@@ -103,10 +103,11 @@ jobs:
     - name: Install wheel
       run: |
         python -m pip install wheel
+        pip install build
 
     - name: Build wheel
       run: |
-        python setup.py bdist_wheel
+        python -m build --wheel
 
     - name: Upload artifact
       uses: actions/upload-artifact@v4

+ 0 - 16
.readthedocs.yaml

@@ -1,16 +0,0 @@
-version: 2
-
-build:
-  os: ubuntu-22.04
-  tools:
-    python: "3.10"
-
-formats:
-  - epub
-
-python:
-  install:
-    - requirements: next_docs/zh_cn/requirements.txt
-
-sphinx:
-  configuration: next_docs/zh_cn/conf.py

Rozdílová data souboru nebyla zobrazena, protože soubor je příliš velký
+ 5 - 4
README.md


Rozdílová data souboru nebyla zobrazena, protože soubor je příliš velký
+ 5 - 4
README_zh-CN.md


+ 0 - 23
demo/batch_demo.py

@@ -1,23 +0,0 @@
-import os
-from pathlib import Path
-from magic_pdf.data.batch_build_dataset import batch_build_dataset
-from magic_pdf.tools.common import batch_do_parse
-
-
-def batch(pdf_dir, output_dir, method, lang):
-    os.makedirs(output_dir, exist_ok=True)
-    doc_paths = []
-    for doc_path in Path(pdf_dir).glob('*'):
-        if doc_path.suffix == '.pdf':
-            doc_paths.append(doc_path)
-
-    # build dataset with 2 workers
-    datasets = batch_build_dataset(doc_paths, 4, lang)
-
-    # os.environ["MINERU_MIN_BATCH_INFERENCE_SIZE"] = "200"  # every 200 pages will be parsed in one batch
-    batch_do_parse(output_dir, [str(doc_path.stem) for doc_path in doc_paths], datasets, method)
-
-
-if __name__ == '__main__':
-    batch("pdfs", "output", "auto", "")
-

+ 241 - 66
demo/demo.py

@@ -1,68 +1,243 @@
 # Copyright (c) Opendatalab. All rights reserved.
+import copy
+import json
 import os
-
-from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
-from magic_pdf.data.dataset import PymuDocDataset
-from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-from magic_pdf.config.enums import SupportedPdfParseMethod
-
-# args
-__dir__ = os.path.dirname(os.path.abspath(__file__))
-pdf_file_name = os.path.join(__dir__, "pdfs", "demo1.pdf")  # replace with the real pdf path
-name_without_extension = os.path.basename(pdf_file_name).split('.')[0]
-
-# prepare env
-local_image_dir = os.path.join(__dir__, "output", name_without_extension, "images")
-local_md_dir = os.path.join(__dir__, "output", name_without_extension)
-image_dir = str(os.path.basename(local_image_dir))
-os.makedirs(local_image_dir, exist_ok=True)
-
-image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
-
-# read bytes
-reader1 = FileBasedDataReader("")
-pdf_bytes = reader1.read(pdf_file_name)  # read the pdf content
-
-# proc
-## Create Dataset Instance
-ds = PymuDocDataset(pdf_bytes)
-
-## inference
-if ds.classify() == SupportedPdfParseMethod.OCR:
-    infer_result = ds.apply(doc_analyze, ocr=True)
-
-    ## pipeline
-    pipe_result = infer_result.pipe_ocr_mode(image_writer)
-
-else:
-    infer_result = ds.apply(doc_analyze, ocr=False)
-
-    ## pipeline
-    pipe_result = infer_result.pipe_txt_mode(image_writer)
-
-### get model inference result
-model_inference_result = infer_result.get_infer_res()
-
-### draw layout result on each page
-pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_extension}_layout.pdf"))
-
-### draw spans result on each page
-pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_extension}_spans.pdf"))
-
-### get markdown content
-md_content = pipe_result.get_markdown(image_dir)
-
-### dump markdown
-pipe_result.dump_md(md_writer, f"{name_without_extension}.md", image_dir)
-
-### get content list content
-content_list_content = pipe_result.get_content_list(image_dir)
-
-### dump content list
-pipe_result.dump_content_list(md_writer, f"{name_without_extension}_content_list.json", image_dir)
-
-### get middle json
-middle_json_content = pipe_result.get_middle_json()
-
-### dump middle json
-pipe_result.dump_middle_json(md_writer, f'{name_without_extension}_middle.json')
+from pathlib import Path
+
+from loguru import logger
+
+from mineru.cli.common import convert_pdf_bytes_to_bytes_by_pypdfium2, prepare_env, read_fn
+from mineru.data.data_reader_writer import FileBasedDataWriter
+from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox
+from mineru.utils.enum_class import MakeMode
+from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
+from mineru.backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc_analyze
+from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as pipeline_union_make
+from mineru.backend.pipeline.model_json_to_middle_json import result_to_middle_json as pipeline_result_to_middle_json
+from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
+from mineru.utils.models_download_utils import auto_download_and_get_model_root_path
+
+
+def do_parse(
+    output_dir,  # Output directory for storing parsing results
+    pdf_file_names: list[str],  # List of PDF file names to be parsed
+    pdf_bytes_list: list[bytes],  # List of PDF bytes to be parsed
+    p_lang_list: list[str],  # List of languages for each PDF, default is 'ch' (Chinese)
+    backend="pipeline",  # The backend for parsing PDF, default is 'pipeline'
+    parse_method="auto",  # The method for parsing PDF, default is 'auto'
+    p_formula_enable=True,  # Enable formula parsing
+    p_table_enable=True,  # Enable table parsing
+    server_url=None,  # Server URL for vlm-sglang-client backend
+    f_draw_layout_bbox=True,  # Whether to draw layout bounding boxes
+    f_draw_span_bbox=True,  # Whether to draw span bounding boxes
+    f_dump_md=True,  # Whether to dump markdown files
+    f_dump_middle_json=True,  # Whether to dump middle JSON files
+    f_dump_model_output=True,  # Whether to dump model output files
+    f_dump_orig_pdf=True,  # Whether to dump original PDF files
+    f_dump_content_list=True,  # Whether to dump content list files
+    f_make_md_mode=MakeMode.MM_MD,  # The mode for making markdown content, default is MM_MD
+    start_page_id=0,  # Start page ID for parsing, default is 0
+    end_page_id=None,  # End page ID for parsing, default is None (parse all pages until the end of the document)
+):
+
+    if backend == "pipeline":
+        for idx, pdf_bytes in enumerate(pdf_bytes_list):
+            new_pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id, end_page_id)
+            pdf_bytes_list[idx] = new_pdf_bytes
+
+        infer_results, all_image_lists, all_pdf_docs, lang_list, ocr_enabled_list = pipeline_doc_analyze(pdf_bytes_list, p_lang_list, parse_method=parse_method, formula_enable=p_formula_enable,table_enable=p_table_enable)
+
+        for idx, model_list in enumerate(infer_results):
+            model_json = copy.deepcopy(model_list)
+            pdf_file_name = pdf_file_names[idx]
+            local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
+            image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
+
+            images_list = all_image_lists[idx]
+            pdf_doc = all_pdf_docs[idx]
+            _lang = lang_list[idx]
+            _ocr_enable = ocr_enabled_list[idx]
+            middle_json = pipeline_result_to_middle_json(model_list, images_list, pdf_doc, image_writer, _lang, _ocr_enable, p_formula_enable)
+
+            pdf_info = middle_json["pdf_info"]
+
+            pdf_bytes = pdf_bytes_list[idx]
+            if f_draw_layout_bbox:
+                draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf")
+
+            if f_draw_span_bbox:
+                draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf")
+
+            if f_dump_orig_pdf:
+                md_writer.write(
+                    f"{pdf_file_name}_origin.pdf",
+                    pdf_bytes,
+                )
+
+            if f_dump_md:
+                image_dir = str(os.path.basename(local_image_dir))
+                md_content_str = pipeline_union_make(pdf_info, f_make_md_mode, image_dir)
+                md_writer.write_string(
+                    f"{pdf_file_name}.md",
+                    md_content_str,
+                )
+
+            if f_dump_content_list:
+                image_dir = str(os.path.basename(local_image_dir))
+                content_list = pipeline_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
+                md_writer.write_string(
+                    f"{pdf_file_name}_content_list.json",
+                    json.dumps(content_list, ensure_ascii=False, indent=4),
+                )
+
+            if f_dump_middle_json:
+                md_writer.write_string(
+                    f"{pdf_file_name}_middle.json",
+                    json.dumps(middle_json, ensure_ascii=False, indent=4),
+                )
+
+            if f_dump_model_output:
+                md_writer.write_string(
+                    f"{pdf_file_name}_model.json",
+                    json.dumps(model_json, ensure_ascii=False, indent=4),
+                )
+
+            logger.info(f"local output dir is {local_md_dir}")
+    else:
+        if backend.startswith("vlm-"):
+            backend = backend[4:]
+
+        f_draw_span_bbox = False
+        parse_method = "vlm"
+        for idx, pdf_bytes in enumerate(pdf_bytes_list):
+            pdf_file_name = pdf_file_names[idx]
+            pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id, end_page_id)
+            local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
+            image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
+            middle_json, infer_result = vlm_doc_analyze(pdf_bytes, image_writer=image_writer, backend=backend, server_url=server_url)
+
+            pdf_info = middle_json["pdf_info"]
+
+            if f_draw_layout_bbox:
+                draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf")
+
+            if f_draw_span_bbox:
+                draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf")
+
+            if f_dump_orig_pdf:
+                md_writer.write(
+                    f"{pdf_file_name}_origin.pdf",
+                    pdf_bytes,
+                )
+
+            if f_dump_md:
+                image_dir = str(os.path.basename(local_image_dir))
+                md_content_str = vlm_union_make(pdf_info, f_make_md_mode, image_dir)
+                md_writer.write_string(
+                    f"{pdf_file_name}.md",
+                    md_content_str,
+                )
+
+            if f_dump_content_list:
+                image_dir = str(os.path.basename(local_image_dir))
+                content_list = vlm_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
+                md_writer.write_string(
+                    f"{pdf_file_name}_content_list.json",
+                    json.dumps(content_list, ensure_ascii=False, indent=4),
+                )
+
+            if f_dump_middle_json:
+                md_writer.write_string(
+                    f"{pdf_file_name}_middle.json",
+                    json.dumps(middle_json, ensure_ascii=False, indent=4),
+                )
+
+            if f_dump_model_output:
+                model_output = ("\n" + "-" * 50 + "\n").join(infer_result)
+                md_writer.write_string(
+                    f"{pdf_file_name}_model_output.txt",
+                    model_output,
+                )
+
+            logger.info(f"local output dir is {local_md_dir}")
+
+
+def parse_doc(
+        path_list: list[Path],
+        output_dir,
+        lang="ch",
+        backend="pipeline",
+        method="auto",
+        server_url=None,
+        start_page_id=0,  # Start page ID for parsing, default is 0
+        end_page_id=None  # End page ID for parsing, default is None (parse all pages until the end of the document)
+):
+    """
+        Parameter description:
+        path_list: List of document paths to be parsed, can be PDF or image files.
+        output_dir: Output directory for storing parsing results.
+        lang: Language option, default is 'ch', optional values include['ch', 'ch_server', 'ch_lite', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']。
+            Input the languages in the pdf (if known) to improve OCR accuracy.  Optional.
+            Adapted only for the case where the backend is set to "pipeline"
+        backend: the backend for parsing pdf:
+            pipeline: More general.
+            vlm-transformers: More general.
+            vlm-sglang-engine: Faster(engine).
+            vlm-sglang-client: Faster(client).
+            without method specified, pipeline will be used by default.
+        method: the method for parsing pdf:
+            auto: Automatically determine the method based on the file type.
+            txt: Use text extraction method.
+            ocr: Use OCR method for image-based PDFs.
+            Without method specified, 'auto' will be used by default.
+            Adapted only for the case where the backend is set to "pipeline".
+        server_url: When the backend is `sglang-client`, you need to specify the server_url, for example:`http://127.0.0.1:30000`
+    """
+    try:
+        file_name_list = []
+        pdf_bytes_list = []
+        lang_list = []
+        for path in path_list:
+            file_name = str(Path(path).stem)
+            pdf_bytes = read_fn(path)
+            file_name_list.append(file_name)
+            pdf_bytes_list.append(pdf_bytes)
+            lang_list.append(lang)
+        do_parse(
+            output_dir=output_dir,
+            pdf_file_names=file_name_list,
+            pdf_bytes_list=pdf_bytes_list,
+            p_lang_list=lang_list,
+            backend=backend,
+            parse_method=method,
+            server_url=server_url,
+            start_page_id=start_page_id,
+            end_page_id=end_page_id
+        )
+    except Exception as e:
+        logger.exception(e)
+
+
+if __name__ == '__main__':
+    # args
+    __dir__ = os.path.dirname(os.path.abspath(__file__))
+    pdf_files_dir = os.path.join(__dir__, "pdfs")
+    output_dir = os.path.join(__dir__, "output")
+    pdf_suffixes = [".pdf"]
+    image_suffixes = [".png", ".jpeg", ".jpg"]
+
+    doc_path_list = []
+    for doc_path in Path(pdf_files_dir).glob('*'):
+        if doc_path.suffix in pdf_suffixes + image_suffixes:
+            doc_path_list.append(doc_path)
+
+    """如果您由于网络问题无法下载模型,可以设置环境变量MINERU_MODEL_SOURCE为modelscope使用免代理仓库下载模型"""
+    # os.environ['MINERU_MODEL_SOURCE'] = "modelscope"
+
+    """Use pipeline mode if your environment does not support VLM"""
+    parse_doc(doc_path_list, output_dir, backend="pipeline")
+
+    """To enable VLM mode, change the backend to 'vlm-xxx'"""
+    # parse_doc(doc_path_list, output_dir, backend="vlm-transformers")  # more general.
+    # parse_doc(doc_path_list, output_dir, backend="vlm-sglang-engine")  # faster(engine).
+    # parse_doc(doc_path_list, output_dir, backend="vlm-sglang-client", server_url="http://127.0.0.1:30000")  # faster(client).

+ 0 - 51
docker/ascend_npu/Dockerfile

@@ -1,51 +0,0 @@
-# Use the official Ubuntu base image
-FROM swr.cn-central-221.ovaijisuan.com/mindformers/mindformers1.2_mindspore2.3:20240722
-
-USER root
-
-# Set environment variables to non-interactive to avoid prompts during installation
-ENV DEBIAN_FRONTEND=noninteractive
-
-# Update the package list and install necessary packages
-RUN apt-get update && \
-    apt-get install -y \
-        software-properties-common && \
-    add-apt-repository -y ppa:deadsnakes/ppa && \
-    apt-get update && \
-    apt-get install -y \
-        python3.10 \
-        python3.10-venv \
-        python3.10-distutils \
-        python3.10-dev \
-        python3-pip \
-        wget \
-        git \
-        libgl1 \
-        libglib2.0-0 \
-        && rm -rf /var/lib/apt/lists/*
-
-# Set Python 3.10 as the default python3
-RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
-
-# Create a virtual environment for MinerU
-RUN python3 -m venv /opt/mineru_venv
-
-# Copy the configuration file template and install magic-pdf latest
-RUN /bin/bash -c "wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/magic-pdf.template.json && \
-    cp magic-pdf.template.json /root/magic-pdf.json && \
-    source /opt/mineru_venv/bin/activate && \
-    pip3 install --upgrade pip -i https://mirrors.aliyun.com/pypi/simple && \
-    pip3 install torch==2.3.1 torchvision==0.18.1 -i https://mirrors.aliyun.com/pypi/simple && \
-    pip3 install -U magic-pdf[full] 'numpy<2' decorator attrs absl-py cloudpickle ml-dtypes tornado einops -i https://mirrors.aliyun.com/pypi/simple && \
-    wget https://gitee.com/ascend/pytorch/releases/download/v6.0.rc2-pytorch2.3.1/torch_npu-2.3.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl && \
-    pip3 install torch_npu-2.3.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl"
-
-# Download models and update the configuration file
-RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \
-    pip3 install modelscope -i https://mirrors.aliyun.com/pypi/simple && \
-    wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/scripts/download_models.py -O download_models.py && \
-    python3 download_models.py && \
-    sed -i 's|cpu|npu|g' /root/magic-pdf.json"
-
-# Set the entry point to activate the virtual environment and run the command line tool
-ENTRYPOINT ["/bin/bash", "-c", "source /opt/mineru_venv/bin/activate && exec \"$@\"", "--"]

+ 6 - 24
docker/china/Dockerfile

@@ -18,37 +18,19 @@ RUN apt-get update && \
         wget \
         git \
         libgl1 \
-        libreoffice \
-        fonts-noto-cjk \
-        fonts-wqy-zenhei \
-        fonts-wqy-microhei \
-        ttf-mscorefonts-installer \
-        fontconfig \
         libglib2.0-0 \
-        libxrender1 \
-        libsm6 \
-        libxext6 \
-        poppler-utils \
         && rm -rf /var/lib/apt/lists/*
 
 # Set Python 3.10 as the default python3
 RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
 
-# Create a virtual environment for MinerU
-RUN python3 -m venv /opt/mineru_venv
-
-# Copy the configuration file template and install magic-pdf latest
-RUN /bin/bash -c "wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/magic-pdf.template.json && \
-    cp magic-pdf.template.json /root/magic-pdf.json && \
-    source /opt/mineru_venv/bin/activate && \
-    pip3 install --upgrade pip -i https://mirrors.aliyun.com/pypi/simple && \
-    pip3 install -U magic-pdf[full] -i https://mirrors.aliyun.com/pypi/simple"
+# install mineru latest
+RUN /bin/bash -c "pip3 install --upgrade pip -i https://mirrors.aliyun.com/pypi/simple && \
+    pip3 install uv -i https://mirrors.aliyun.com/pypi/simple && \
+    uv pip install 'mineru[all]>=2.0.0' -i https://mirrors.aliyun.com/pypi/simple"
 
 # Download models and update the configuration file
-RUN /bin/bash -c "pip3 install modelscope -i https://mirrors.aliyun.com/pypi/simple && \
-    wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/scripts/download_models.py -O download_models.py && \
-    python3 download_models.py && \
-    sed -i 's|cpu|cuda|g' /root/magic-pdf.json"
+RUN /bin/bash -c "mineru-models-download -s modelscope -m all"
 
 # Set the entry point to activate the virtual environment and run the command line tool
-ENTRYPOINT ["/bin/bash", "-c", "source /opt/mineru_venv/bin/activate && exec \"$@\"", "--"]
+ENTRYPOINT ["/bin/bash", "-c", "export MINERU_MODEL_SOURCE=local && exec \"$@\"", "--"]

+ 6 - 24
docker/global/Dockerfile

@@ -18,37 +18,19 @@ RUN apt-get update && \
         wget \
         git \
         libgl1 \
-        libreoffice \
-        fonts-noto-cjk \
-        fonts-wqy-zenhei \
-        fonts-wqy-microhei \
-        ttf-mscorefonts-installer \
-        fontconfig \
         libglib2.0-0 \
-        libxrender1 \
-        libsm6 \
-        libxext6 \
-        poppler-utils \
         && rm -rf /var/lib/apt/lists/*
 
 # Set Python 3.10 as the default python3
 RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
 
-# Create a virtual environment for MinerU
-RUN python3 -m venv /opt/mineru_venv
-
-# Copy the configuration file template and install magic-pdf latest
-RUN /bin/bash -c "wget https://github.com/opendatalab/MinerU/raw/master/magic-pdf.template.json && \
-    cp magic-pdf.template.json /root/magic-pdf.json && \
-    source /opt/mineru_venv/bin/activate && \
-    pip3 install --upgrade pip && \
-    pip3 install -U magic-pdf[full]"
+# install mineru latest
+RUN /bin/bash -c "pip3 install --upgrade pip && \
+    pip3 install uv && \
+    uv pip install 'mineru[all]>=2.0.0'"
 
 # Download models and update the configuration file
-RUN /bin/bash -c "pip3 install huggingface_hub && \
-    wget https://github.com/opendatalab/MinerU/raw/master/scripts/download_models_hf.py -O download_models.py && \
-    python3 download_models.py && \
-    sed -i 's|cpu|cuda|g' /root/magic-pdf.json"
+RUN /bin/bash -c "mineru-models-download -s huggingface -m all"
 
 # Set the entry point to activate the virtual environment and run the command line tool
-ENTRYPOINT ["/bin/bash", "-c", "source /opt/mineru_venv/bin/activate && exec \"$@\"", "--"]
+ENTRYPOINT ["/bin/bash", "-c", "export MINERU_MODEL_SOURCE=local && exec \"$@\"", "--"]

+ 0 - 51
docs/README_Ascend_NPU_Acceleration_zh_CN.md

@@ -1,51 +0,0 @@
-# Ascend NPU 加速
-
-## 简介
-
-本文档介绍如何在 Ascend NPU 上使用 MinerU。本文档内容已在`华为 Atlas 800T A2`服务器上测试通过。
-```
-CPU:鲲鹏 920 aarch64 2.6GHz
-NPU:Ascend 910B 64GB
-OS:openEuler 22.03 (LTS-SP3)/ Ubuntu 22.04.5 LTS
-CANN:8.0.RC2
-驱动版本:24.1.rc2.1
-```
-由于适配 Ascend NPU 的环境较为复杂,建议使用 Docker 容器运行 MinerU。
-
-通过docker运行MinerU前需确保物理机已安装支持CANN 8.0.RC2的驱动和固件。
-
-
-## 构建镜像
-请保持网络状况良好,并执行以下代码构建镜像。    
-```bash
-wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/docker/ascend_npu/Dockerfile -O Dockerfile
-docker build -t mineru_npu:latest .
-```
-如果构建过程中未发生报错则说明镜像构建成功。
-
-
-## 运行容器
-
-```bash
-docker run -it -u root --name mineru-npu --privileged=true \
-    --ipc=host \
-    --network=host \
-    --device=/dev/davinci0 \
-    --device=/dev/davinci1 \
-    --device=/dev/davinci2 \
-    --device=/dev/davinci3 \
-    --device=/dev/davinci4 \
-    --device=/dev/davinci5 \
-    --device=/dev/davinci6 \
-    --device=/dev/davinci7 \
-    --device=/dev/davinci_manager \
-    --device=/dev/devmm_svm \
-    --device=/dev/hisi_hdc \
-    -v /var/log/npu/:/usr/slog \
-    -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-    -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
-    mineru_npu:latest \
-    /bin/bash -c "echo 'source /opt/mineru_venv/bin/activate' >> ~/.bashrc && exec bash"
-
-magic-pdf --help
-```

+ 0 - 111
docs/README_Ubuntu_CUDA_Acceleration_en_US.md

@@ -1,111 +0,0 @@
-# Ubuntu 22.04 LTS
-
-### 1. Check if NVIDIA Drivers Are Installed
-
-```sh
-nvidia-smi
-```
-
-If you see information similar to the following, it means that the NVIDIA drivers are already installed, and you can skip Step 2.
-
-> [!NOTE]
-> Notice:`CUDA Version` should be >= 12.4, If the displayed version number is less than 12.4, please upgrade the driver.
-
-```plaintext
-+---------------------------------------------------------------------------------------+
-| NVIDIA-SMI 570.133.07             Driver Version: 572.83         CUDA Version: 12.8   |
-|-----------------------------------------+----------------------+----------------------+
-| GPU  Name                     TCC/WDDM  | Bus-Id        Disp.A | Volatile Uncorr. ECC |
-| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
-|                                         |                      |               MIG M. |
-|=========================================+======================+======================|
-|   0  NVIDIA GeForce RTX 3060 Ti   WDDM  | 00000000:01:00.0  On |                  N/A |
-|  0%   51C    P8              12W / 200W |   1489MiB /  8192MiB |      5%      Default |
-|                                         |                      |                  N/A |
-+-----------------------------------------+----------------------+----------------------+
-```
-
-### 2. Install the Driver
-
-If no driver is installed, use the following command:
-
-```sh
-sudo apt-get update
-sudo apt-get install nvidia-driver-570-server
-```
-
-Install the proprietary driver and restart your computer after installation.
-
-```sh
-reboot
-```
-
-### 3. Install Anaconda
-
-If Anaconda is already installed, skip this step.
-
-```sh
-wget https://repo.anaconda.com/archive/Anaconda3-2024.06-1-Linux-x86_64.sh
-bash Anaconda3-2024.06-1-Linux-x86_64.sh
-```
-
-In the final step, enter `yes`, close the terminal, and reopen it.
-
-### 4. Create an Environment Using Conda
-
-```bash
-conda create -n mineru 'python=3.12' -y
-conda activate mineru
-```
-
-### 5. Install Applications
-
-```sh
-pip install -U magic-pdf[full]
-```
-> [!TIP]
-> After installation, you can check the version of `magic-pdf` using the following command:
->
-> ```sh
-> magic-pdf --version
-> ```
-
-
-### 6. Download Models
-
-
-Refer to detailed instructions on [how to download model files](how_to_download_models_en.md).
-
-
-## 7. Understand the Location of the Configuration File
-
-After completing the [6. Download Models](#6-download-models) step, the script will automatically generate a `magic-pdf.json` file in the user directory and configure the default model path.
-You can find the `magic-pdf.json` file in your user directory.
-
-> [!TIP]
-> The user directory for Linux is "/home/username".
-
-
-### 8. First Run
-
-Download a sample file from the repository and test it.
-
-```sh
-wget https://github.com/opendatalab/MinerU/raw/master/demo/pdfs/small_ocr.pdf
-magic-pdf -p small_ocr.pdf -o ./output
-```
-
-### 9. Test CUDA Acceleration
-
-If your graphics card has at least **6GB** of VRAM, follow these steps to test CUDA acceleration:
-
-1. Modify the value of `"device-mode"` in the `magic-pdf.json` configuration file located in your home directory.
-   ```json
-   {
-     "device-mode": "cuda"
-   }
-   ```
-2. Test CUDA acceleration with the following command:
-   ```sh
-   magic-pdf -p small_ocr.pdf -o ./output
-   ```

+ 0 - 115
docs/README_Ubuntu_CUDA_Acceleration_zh_CN.md

@@ -1,115 +0,0 @@
-# Ubuntu 22.04 LTS
-
-## 1. 检测是否已安装nvidia驱动
-
-```bash
-nvidia-smi
-```
-
-如果看到类似如下的信息,说明已经安装了nvidia驱动,可以跳过步骤2
-
-> [!NOTE]
-> `CUDA Version` 显示的版本号应 >= 12.4,如显示的版本号小于12.4,请升级驱动
-
-```plaintext
-+---------------------------------------------------------------------------------------+
-| NVIDIA-SMI 570.133.07             Driver Version: 572.83         CUDA Version: 12.8   |
-|-----------------------------------------+----------------------+----------------------+
-| GPU  Name                     TCC/WDDM  | Bus-Id        Disp.A | Volatile Uncorr. ECC |
-| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
-|                                         |                      |               MIG M. |
-|=========================================+======================+======================|
-|   0  NVIDIA GeForce RTX 3060 Ti   WDDM  | 00000000:01:00.0  On |                  N/A |
-|  0%   51C    P8              12W / 200W |   1489MiB /  8192MiB |      5%      Default |
-|                                         |                      |                  N/A |
-+-----------------------------------------+----------------------+----------------------+
-```
-
-## 2. 安装驱动
-
-如没有驱动,则通过如下命令
-
-```bash
-sudo apt-get update
-sudo apt-get install nvidia-driver-570-server
-```
-
-安装专有驱动,安装完成后,重启电脑
-
-```bash
-reboot
-```
-
-## 3. 安装anacoda
-
-如果已安装conda,可以跳过本步骤
-
-```bash
-wget -U NoSuchBrowser/1.0 https://mirrors.tuna.tsinghua.edu.cn/anaconda/archive/Anaconda3-2024.06-1-Linux-x86_64.sh
-bash Anaconda3-2024.06-1-Linux-x86_64.sh
-```
-
-最后一步输入yes,关闭终端重新打开
-
-## 4. 使用conda 创建环境
-
-```bash
-conda create -n mineru 'python=3.12' -y
-conda activate mineru
-```
-
-## 5. 安装应用
-
-```bash
-pip install -U magic-pdf[full] -i https://mirrors.aliyun.com/pypi/simple
-```
-
-> [!TIP]
-> 下载完成后,您可以通过以下命令检查`magic-pdf`的版本:
->
-> ```bash
-> magic-pdf --version
-> ```
-
-
-## 6. 下载模型
-
-
-详细参考 [如何下载模型文件](how_to_download_models_zh_cn.md)
-
-## 7. 了解配置文件存放的位置
-
-完成[6.下载模型](#6-下载模型)步骤后,脚本会自动生成用户目录下的magic-pdf.json文件,并自动配置默认模型路径。
-您可在【用户目录】下找到magic-pdf.json文件。
-
-> [!TIP]
-> linux用户目录为 "/home/用户名"
-
-## 8. 第一次运行
-
-从仓库中下载样本文件,并测试
-
-```bash
-wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/demo/pdfs/small_ocr.pdf
-magic-pdf -p small_ocr.pdf -o ./output
-```
-
-## 9. 测试CUDA加速
-
-如果您的显卡显存大于等于 **6GB** ,可以进行以下流程,测试CUDA解析加速效果
-
-**1.修改【用户目录】中配置文件magic-pdf.json中"device-mode"的值**
-
-```json
-{
-  "device-mode":"cuda"
-}
-```
-
-**2.运行以下命令测试cuda加速效果**
-
-```bash
-magic-pdf -p small_ocr.pdf -o ./output
-```
-> [!TIP]
-> CUDA加速是否生效可以根据log中输出的各个阶段cost耗时来简单判断,通常情况下,使用cuda加速会比cpu更快。

+ 0 - 83
docs/README_Windows_CUDA_Acceleration_en_US.md

@@ -1,83 +0,0 @@
-# Windows 10/11
-
-### 1. Install CUDA and cuDNN
-
-You need to install a CUDA version that is compatible with torch's requirements. For details, please refer to the [official PyTorch website](https://pytorch.org/get-started/locally/).
-
-- CUDA 11.8 https://developer.nvidia.com/cuda-11-8-0-download-archive
-- CUDA 12.4 https://developer.nvidia.com/cuda-12-4-0-download-archive
-- CUDA 12.6 https://developer.nvidia.com/cuda-12-6-0-download-archive
-- CUDA 12.8 https://developer.nvidia.com/cuda-12-8-0-download-archive
-
-### 2. Install Anaconda
-
-If Anaconda is already installed, you can skip this step.
-
-Download link: https://repo.anaconda.com/archive/Anaconda3-2024.06-1-Windows-x86_64.exe
-
-### 3. Create an Environment Using Conda
-
-```bash
-conda create -n mineru 'python=3.12' -y
-conda activate mineru
-```
-
-### 4. Install Applications
-
-```
-pip install -U magic-pdf[full]
-```
-
-> [!IMPORTANT]
-> After installation, you can check the version of `magic-pdf` using the following command:
->
-> ```bash
-> magic-pdf --version
-> ```
-
-
-### 5. Download Models
-
-Refer to detailed instructions on [how to download model files](how_to_download_models_en.md).
-
-### 6. Understand the Location of the Configuration File
-
-After completing the [5. Download Models](#5-download-models) step, the script will automatically generate a `magic-pdf.json` file in the user directory and configure the default model path.
-You can find the `magic-pdf.json` file in your 【user directory】 .
-
-> [!TIP]
-> The user directory for Windows is "C:/Users/username".
-
-### 7. First Run
-
-Download a sample file from the repository and test it.
-
-```powershell
-  wget https://github.com/opendatalab/MinerU/raw/master/demo/pdfs/small_ocr.pdf -O small_ocr.pdf
-  magic-pdf -p small_ocr.pdf -o ./output
-```
-
-### 8. Test CUDA Acceleration
-
-If your graphics card has at least 6GB of VRAM, follow these steps to test CUDA-accelerated parsing performance.
-
-1. **Overwrite the installation of torch and torchvision** supporting CUDA.(Please select the appropriate index-url based on your CUDA version. For more details, refer to the [PyTorch official website](https://pytorch.org/get-started/locally/).)
-
-   ```
-   pip install --force-reinstall torch torchvision --index-url https://download.pytorch.org/whl/cu124
-   ```
-
-2. **Modify the value of `"device-mode"`** in the `magic-pdf.json` configuration file located in your user directory.
-
-   ```json
-   {
-     "device-mode": "cuda"
-   }
-   ```
-
-
-3. **Run the following command to test CUDA acceleration**:
-
-   ```
-   magic-pdf -p small_ocr.pdf -o ./output
-   ```

+ 0 - 86
docs/README_Windows_CUDA_Acceleration_zh_CN.md

@@ -1,86 +0,0 @@
-# Windows10/11
-
-## 1. 安装cuda环境
-
-需要安装符合torch要求的cuda版本,具体可参考[torch官网](https://pytorch.org/get-started/locally/)
-
-- CUDA 11.8 https://developer.nvidia.com/cuda-11-8-0-download-archive
-- CUDA 12.4 https://developer.nvidia.com/cuda-12-4-0-download-archive
-- CUDA 12.6 https://developer.nvidia.com/cuda-12-6-0-download-archive
-- CUDA 12.8 https://developer.nvidia.com/cuda-12-8-0-download-archive
-
-## 2. 安装anaconda
-
-如果已安装conda,可以跳过本步骤
-
-下载链接:
-https://mirrors.tuna.tsinghua.edu.cn/anaconda/archive/Anaconda3-2024.06-1-Windows-x86_64.exe
-
-## 3. 使用conda 创建环境
-
-```bash
-conda create -n mineru 'python=3.12' -y
-conda activate mineru
-```
-
-## 4. 安装应用
-
-```bash
-pip install -U magic-pdf[full] -i https://mirrors.aliyun.com/pypi/simple
-```
-
-> [!IMPORTANT]
-> 下载完成后,您可以通过以下命令检查magic-pdf的版本
->
-> ```bash
-> magic-pdf --version
-> ```
-
-
-## 5. 下载模型
-
-详细参考 [如何下载模型文件](how_to_download_models_zh_cn.md)
-
-## 6. 了解配置文件存放的位置
-
-完成[5.下载模型](#5-下载模型)步骤后,脚本会自动生成用户目录下的magic-pdf.json文件,并自动配置默认模型路径。
-您可在【用户目录】下找到magic-pdf.json文件。
-
-> [!TIP]
-> windows用户目录为 "C:/Users/用户名"
-
-## 7. 第一次运行
-
-从仓库中下载样本文件,并测试
-
-```powershell
- wget https://github.com/opendatalab/MinerU/raw/master/demo/pdfs/small_ocr.pdf -O small_ocr.pdf
- magic-pdf -p small_ocr.pdf -o ./output
-```
-
-## 8. 测试CUDA加速
-
-如果您的显卡显存大于等于 **6GB** ,可以进行以下流程,测试CUDA解析加速效果
-
-**1.覆盖安装支持cuda的torch和torchvision**(请根据cuda版本选择合适的index-url,具体可参考[torch官网](https://pytorch.org/get-started/locally/))
-
-```bash
-pip install --force-reinstall torch torchvision --index-url https://download.pytorch.org/whl/cu124
-```
-
-**2.修改【用户目录】中配置文件magic-pdf.json中"device-mode"的值**
-
-```json
-{
-  "device-mode":"cuda"
-}
-```
-
-**3.运行以下命令测试cuda加速效果**
-
-```bash
-magic-pdf -p small_ocr.pdf -o ./output
-```
-
-> [!TIP]
-> CUDA加速是否生效可以根据log中输出的各个阶段的耗时来简单判断,通常情况下,cuda加速后运行速度比cpu更快。

+ 0 - 23
docs/how_to_download_models_en.md

@@ -1,23 +0,0 @@
-Model downloads are divided into initial downloads and updates to the model directory. Please refer to the corresponding documentation for instructions on how to proceed.
-
-
-# Initial download of model files
-
-### Download the Model from Hugging Face
-
-Use a Python Script to Download Model Files from Hugging Face
-```bash
-pip install huggingface_hub
-wget https://github.com/opendatalab/MinerU/raw/master/scripts/download_models_hf.py -O download_models_hf.py
-python download_models_hf.py
-```
-The Python script will automatically download the model files and configure the model directory in the configuration file.
-
-The configuration file can be found in the user directory, with the filename `magic-pdf.json`.
-
-
-# How to update models previously downloaded
-
-## 1. Models downloaded via Hugging Face or Model Scope
-
-If you previously downloaded models via Hugging Face or Model Scope, you can rerun the Python script used for the initial download. This will automatically update the model directory to the latest version.

+ 0 - 37
docs/how_to_download_models_zh_cn.md

@@ -1,37 +0,0 @@
-模型下载分为首次下载和更新模型目录,请参考对应的文档内容进行操作
-
-# 首次下载模型文件
-
-模型文件可以从 Hugging Face 或 Model Scope 下载,由于网络原因,国内用户访问HF可能会失败,请使用 ModelScope。
-
-<details>
-  <summary>方法一:从 Hugging Face 下载模型</summary>
-  <p>使用python脚本 从Hugging Face下载模型文件</p>
-  <pre><code>pip install huggingface_hub
-wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/scripts/download_models_hf.py -O download_models_hf.py
-python download_models_hf.py</code></pre>
-  <p>python脚本会自动下载模型文件并配置好配置文件中的模型目录</p>
-</details>
-
-## 方法二:从 ModelScope 下载模型
-
-### 使用python脚本 从ModelScope下载模型文件
-
-```bash
-pip install modelscope
-wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/scripts/download_models.py -O download_models.py
-python download_models.py
-```
-python脚本会自动下载模型文件并配置好配置文件中的模型目录
-
-配置文件可以在用户目录中找到,文件名为`magic-pdf.json`
-
-> [!TIP]
-> windows的用户目录为 "C:\\Users\\用户名", linux用户目录为 "/home/用户名", macOS用户目录为 "/Users/用户名"
-
-
-# 此前下载过模型,如何更新
-
-## 1. 通过 Hugging Face 或 Model Scope 下载过模型
-
-如此前通过 HuggingFace 或 Model Scope 下载过模型,可以重复执行此前的模型下载python脚本,将会自动将模型目录更新到最新版本。

+ 3 - 3
mineru/backend/pipeline/batch_analyze.py

@@ -5,8 +5,8 @@ from collections import defaultdict
 import numpy as np
 
 from .model_init import AtomModelSingleton
-from ...utils.model_utils import crop_img, get_res_list_from_layout_res, get_coords_and_area
-from ...utils.ocr_utils import get_adjusted_mfdetrec_res, get_ocr_result_list
+from ...utils.model_utils import crop_img, get_res_list_from_layout_res
+from ...utils.ocr_utils import get_adjusted_mfdetrec_res, get_ocr_result_list, OcrConfidence
 
 YOLO_LAYOUT_BASE_BATCH_SIZE = 1
 MFD_BASE_BATCH_SIZE = 1
@@ -315,7 +315,7 @@ class BatchAnalyze:
                         ocr_text, ocr_score = ocr_res_list[index]
                         layout_res_item['text'] = ocr_text
                         layout_res_item['score'] = float(f"{ocr_score:.3f}")
-                        if ocr_score < 0.6:
+                        if ocr_score < OcrConfidence.min_confidence:
                             layout_res_item['category_id'] = 16
 
                     total_processed += len(img_crop_list)

+ 4 - 2
mineru/backend/pipeline/model_json_to_middle_json.py

@@ -2,6 +2,7 @@
 import time
 
 from loguru import logger
+from tqdm import tqdm
 
 from mineru.utils.config_reader import get_device, get_llm_aided_config
 from mineru.backend.pipeline.model_init import AtomModelSingleton
@@ -14,6 +15,7 @@ from mineru.utils.enum_class import ContentType
 from mineru.utils.llm_aided import llm_aided_title
 from mineru.utils.model_utils import clean_memory
 from mineru.backend.pipeline.pipeline_magic_model import MagicModel
+from mineru.utils.ocr_utils import OcrConfidence
 from mineru.utils.span_block_fix import fill_spans_in_blocks, fix_discarded_block, fix_block_spans
 from mineru.utils.span_pre_proc import remove_outside_spans, remove_overlaps_low_confidence_spans, \
     remove_overlaps_min_spans, txt_spans_extract
@@ -163,7 +165,7 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
 
 def result_to_middle_json(model_list, images_list, pdf_doc, image_writer, lang=None, ocr_enable=False, formula_enabled=True):
     middle_json = {"pdf_info": [], "_backend":"pipeline", "_version_name": __version__}
-    for page_index, page_model_info in enumerate(model_list):
+    for page_index, page_model_info in tqdm(enumerate(model_list), total=len(model_list), desc="Processing pages"):
         page = pdf_doc[page_index]
         image_dict = images_list[page_index]
         page_info = page_model_info_to_page_info(
@@ -208,7 +210,7 @@ def result_to_middle_json(model_list, images_list, pdf_doc, image_writer, lang=N
             need_ocr_list), f'ocr_res_list: {len(ocr_res_list)}, need_ocr_list: {len(need_ocr_list)}'
         for index, span in enumerate(need_ocr_list):
             ocr_text, ocr_score = ocr_res_list[index]
-            if ocr_score > 0.6:
+            if ocr_score > OcrConfidence.min_confidence:
                 span['content'] = ocr_text
                 span['score'] = float(f"{ocr_score:.3f}")
             else:

+ 2 - 2
mineru/backend/vlm/base_predictor.py

@@ -7,8 +7,8 @@ DEFAULT_SYSTEM_PROMPT = (
 )
 DEFAULT_USER_PROMPT = "Document Parsing:"
 DEFAULT_TEMPERATURE = 0.0
-DEFAULT_TOP_P = 0.01
-DEFAULT_TOP_K = 1
+DEFAULT_TOP_P = 0.8
+DEFAULT_TOP_K = 20
 DEFAULT_REPETITION_PENALTY = 1.0
 DEFAULT_PRESENCE_PENALTY = 0.0
 DEFAULT_NO_REPEAT_NGRAM_SIZE = 100

+ 5 - 5
mineru/backend/vlm/predictor.py

@@ -22,7 +22,7 @@ try:
 
     hf_loaded = True
 except ImportError as e:
-    logger.warning("hf is not installed. If you are not using huggingface, you can ignore this warning.")
+    logger.warning("hf is not installed. If you are not using transformers, you can ignore this warning.")
 
 engine_loaded = False
 try:
@@ -51,9 +51,9 @@ def get_predictor(
 ) -> BasePredictor:
     start_time = time.time()
 
-    if backend == "huggingface":
+    if backend == "transformers":
         if not model_path:
-            raise ValueError("model_path must be provided for huggingface backend.")
+            raise ValueError("model_path must be provided for transformers backend.")
         if not hf_loaded:
             raise ImportError(
                 "transformers is not installed, so huggingface backend cannot be used. "
@@ -77,7 +77,7 @@ def get_predictor(
             raise ImportError(
                 "sglang is not installed, so sglang-engine backend cannot be used. "
                 "If you need to use sglang-engine backend for inference, "
-                "please install sglang[all]==0.4.6.post4 or a newer version."
+                "please install sglang[all]==0.4.7 or a newer version."
             )
         predictor = SglangEnginePredictor(
             server_args=ServerArgs(model_path, **kwargs),
@@ -104,7 +104,7 @@ def get_predictor(
             http_timeout=http_timeout,
         )
     else:
-        raise ValueError(f"Unsupported backend: {backend}. Supports: huggingface, sglang-engine, sglang-client.")
+        raise ValueError(f"Unsupported backend: {backend}. Supports: transformers, sglang-engine, sglang-client.")
 
     elapsed = round(time.time() - start_time, 2)
     logger.info(f"get_predictor cost: {elapsed}s")

+ 10 - 7
mineru/backend/vlm/vlm_analyze.py

@@ -9,6 +9,7 @@ from .base_predictor import BasePredictor
 from .predictor import get_predictor
 from .token_to_middle_json import result_to_middle_json
 from ...utils.enum_class import ModelPath
+from ...utils.models_download_utils import auto_download_and_get_model_root_path
 
 
 class ModelSingleton:
@@ -28,6 +29,8 @@ class ModelSingleton:
     ) -> BasePredictor:
         key = (backend,)
         if key not in self._models:
+            if not model_path:
+                model_path = auto_download_and_get_model_root_path("/","vlm")
             self._models[key] = get_predictor(
                 backend=backend,
                 model_path=model_path,
@@ -40,8 +43,8 @@ def doc_analyze(
     pdf_bytes,
     image_writer: DataWriter | None,
     predictor: BasePredictor | None = None,
-    backend="huggingface",
-    model_path=ModelPath.vlm_root_hf,
+    backend="transformers",
+    model_path: str | None = None,
     server_url: str | None = None,
 ):
     if predictor is None:
@@ -53,10 +56,10 @@ def doc_analyze(
     # load_images_time = round(time.time() - load_images_start, 2)
     # logger.info(f"load images cost: {load_images_time}, speed: {round(len(images_base64_list)/load_images_time, 3)} images/s")
 
-    infer_start = time.time()
+    # infer_start = time.time()
     results = predictor.batch_predict(images=images_base64_list)
-    infer_time = round(time.time() - infer_start, 2)
-    logger.info(f"infer finished, cost: {infer_time}, speed: {round(len(results)/infer_time, 3)} page/s")
+    # infer_time = round(time.time() - infer_start, 2)
+    # logger.info(f"infer finished, cost: {infer_time}, speed: {round(len(results)/infer_time, 3)} page/s")
 
     middle_json = result_to_middle_json(results, images_list, pdf_doc, image_writer)
     return middle_json, results
@@ -66,8 +69,8 @@ async def aio_doc_analyze(
     pdf_bytes,
     image_writer: DataWriter | None,
     predictor: BasePredictor | None = None,
-    backend="huggingface",
-    model_path=ModelPath.vlm_root_hf,
+    backend="transformers",
+    model_path: str | None = None,
     server_url: str | None = None,
 ):
     if predictor is None:

+ 4 - 3
mineru/cli/client.py

@@ -41,17 +41,18 @@ from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
     auto: Automatically determine the method based on the file type.
     txt: Use text extraction method.
     ocr: Use OCR method for image-based PDFs.
-    Without method specified, 'auto' will be used by default.""",
+    Without method specified, 'auto' will be used by default.
+    Adapted only for the case where the backend is set to "pipeline".""",
     default='auto',
 )
 @click.option(
     '-b',
     '--backend',
     'backend',
-    type=click.Choice(['pipeline', 'vlm-huggingface', 'vlm-sglang-engine', 'vlm-sglang-client']),
+    type=click.Choice(['pipeline', 'vlm-transformers', 'vlm-sglang-engine', 'vlm-sglang-client']),
     help="""the backend for parsing pdf:
     pipeline: More general.
-    vlm-huggingface: More general.
+    vlm-transformers: More general.
     vlm-sglang-engine: Faster(engine).
     vlm-sglang-client: Faster(client).
     without method specified, pipeline will be used by default.""",

+ 1 - 3
mineru/cli/common.py

@@ -16,7 +16,6 @@ from mineru.backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc
 from mineru.data.data_reader_writer import FileBasedDataWriter
 from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox
 from mineru.utils.enum_class import MakeMode
-from mineru.utils.models_download_utils import auto_download_and_get_model_root_path
 from mineru.utils.pdf_image_tools import images_bytes_to_pdf_bytes
 
 pdf_suffixes = [".pdf"]
@@ -173,8 +172,7 @@ def do_parse(
             pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id, end_page_id)
             local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
             image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
-            model_path = auto_download_and_get_model_root_path('/', 'vlm')
-            middle_json, infer_result = vlm_doc_analyze(pdf_bytes, image_writer=image_writer, backend=backend, model_path=model_path, server_url=server_url)
+            middle_json, infer_result = vlm_doc_analyze(pdf_bytes, image_writer=image_writer, backend=backend, server_url=server_url)
 
             pdf_info = middle_json["pdf_info"]
 

+ 4 - 0
mineru/model/vlm_sglang_model/model.py

@@ -22,6 +22,7 @@ from transformers import (
 
 from ..vlm_hf_model.configuration_mineru2 import Mineru2QwenConfig
 from ..vlm_hf_model.modeling_mineru2 import build_vision_projector
+from ...utils.models_download_utils import auto_download_and_get_model_root_path
 
 
 def flatten_nested_list(nested_list):
@@ -61,6 +62,9 @@ class Mineru2QwenForCausalLM(nn.Module):
 
         # load vision tower
         mm_vision_tower = self.config.mm_vision_tower
+        model_root_path = auto_download_and_get_model_root_path("/", "vlm")
+        mm_vision_tower = f"{model_root_path}/{mm_vision_tower}"
+
         if "clip" in mm_vision_tower:
             vision_config = CLIPVisionConfig.from_pretrained(mm_vision_tower)
             self.vision_tower = CLIPVisionModel(vision_config)  # type: ignore

+ 15 - 4
mineru/model/vlm_sglang_model/server.py

@@ -27,16 +27,27 @@ async def custom_generate_request(obj: GenerateReqInput, request: Request):
 
 
 def main():
-    server_args = prepare_server_args(sys.argv[1:])
+    # 检查命令行参数中是否包含--model-path
+    args = sys.argv[1:]
+    has_model_path_arg = False
+
+    for i, arg in enumerate(args):
+        if arg == "--model-path" or arg.startswith("--model-path="):
+            has_model_path_arg = True
+            break
+
+    # 如果没有--model-path参数,在参数列表中添加它
+    if not has_model_path_arg:
+        default_path = auto_download_and_get_model_root_path("/", "vlm")
+        args.extend(["--model-path", default_path])
+
+    server_args = prepare_server_args(args)
 
     if server_args.chat_template is None:
         server_args.chat_template = "chatml"
 
     server_args.enable_custom_logit_processor = True
 
-    if server_args.model_path is None:
-        server_args.model_path = auto_download_and_get_model_root_path("/","vlm")
-
     try:
         launch_server(server_args)
     finally:

+ 9 - 3
mineru/utils/config_reader.py

@@ -17,7 +17,7 @@ def read_config():
         config_file = os.path.join(home_dir, CONFIG_FILE_NAME)
 
     if not os.path.exists(config_file):
-        logger.warning(f'{config_file} not found, using default configuration')
+        # logger.warning(f'{config_file} not found, using default configuration')
         return None
     else:
         with open(config_file, 'r', encoding='utf-8') as f:
@@ -106,7 +106,9 @@ def get_formula_config():
 
 def get_latex_delimiter_config():
     config = read_config()
-    latex_delimiter_config = config.get('latex-delimiter-config')
+    if config is None:
+        return None
+    latex_delimiter_config = config.get('latex-delimiter-config', None)
     if latex_delimiter_config is None:
         # logger.warning(f"'latex-delimiter-config' not found in {CONFIG_FILE_NAME}, use 'None' as default")
         return None
@@ -116,7 +118,9 @@ def get_latex_delimiter_config():
 
 def get_llm_aided_config():
     config = read_config()
-    llm_aided_config = config.get('llm-aided-config')
+    if config is None:
+        return None
+    llm_aided_config = config.get('llm-aided-config', None)
     if llm_aided_config is None:
         # logger.warning(f"'llm-aided-config' not found in {CONFIG_FILE_NAME}, use 'None' as default")
         return None
@@ -126,6 +130,8 @@ def get_llm_aided_config():
 
 def get_local_models_dir():
     config = read_config()
+    if config is None:
+        return None
     models_dir = config.get('models-dir')
     if models_dir is None:
         logger.warning(f"'models-dir' not found in {CONFIG_FILE_NAME}, use None as default")

+ 4 - 2
mineru/utils/draw_bbox.py

@@ -188,7 +188,8 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
             page.merge_page(overlay_pdf.pages[0])
         else:
             # 记录日志并继续处理下一个页面
-            logger.warning(f"layout.pdf: 第{i + 1}页未能生成有效的overlay PDF")
+            # logger.warning(f"layout.pdf: 第{i + 1}页未能生成有效的overlay PDF")
+            pass
 
         output_pdf.add_page(page)
 
@@ -302,7 +303,8 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
             page.merge_page(overlay_pdf.pages[0])
         else:
             # 记录日志并继续处理下一个页面
-            logger.warning(f"span.pdf: 第{i + 1}页未能生成有效的overlay PDF")
+            # logger.warning(f"span.pdf: 第{i + 1}页未能生成有效的overlay PDF")
+            pass
 
         output_pdf.add_page(page)
 

+ 10 - 2
mineru/utils/models_download_utils.py

@@ -51,9 +51,17 @@ def auto_download_and_get_model_root_path(relative_path: str, repo_mode='pipelin
     else:
         raise ValueError(f"未知的仓库类型: {model_source}")
 
-    relative_path = relative_path.strip('/')
-    cache_dir = snapshot_download(repo, allow_patterns=[relative_path, relative_path+"/*"])
+    cache_dir = None
 
+    if repo_mode == 'pipeline':
+        relative_path = relative_path.strip('/')
+        cache_dir = snapshot_download(repo, allow_patterns=[relative_path, relative_path+"/*"])
+    elif repo_mode == 'vlm':
+        # VLM 模式下,直接下载整个模型目录
+        cache_dir = snapshot_download(repo)
+
+    if not cache_dir:
+        raise FileNotFoundError(f"Failed to download model: {relative_path} from {repo}")
     return cache_dir
 
 

+ 11 - 1
mineru/utils/ocr_utils.py

@@ -4,6 +4,11 @@ import cv2
 import numpy as np
 
 
+class OcrConfidence:
+    min_confidence = 0.68
+    min_width = 3
+
+
 def merge_spans_to_line(spans, threshold=0.6):
     if len(spans) == 0:
         return []
@@ -304,7 +309,7 @@ def get_ocr_result_list(ocr_res, useful_list, ocr_enable, new_image, lang):
             p1, p2, p3, p4 = box_ocr_res[0]
             text, score = box_ocr_res[1]
             # logger.info(f"text: {text}, score: {score}")
-            if score < 0.6:  # 过滤低置信度的结果
+            if score < OcrConfidence.min_confidence:  # 过滤低置信度的结果
                 continue
         else:
             p1, p2, p3, p4 = box_ocr_res
@@ -317,6 +322,11 @@ def get_ocr_result_list(ocr_res, useful_list, ocr_enable, new_image, lang):
         # average_angle_degrees = calculate_angle_degrees(box_ocr_res[0])
         # if average_angle_degrees > 0.5:
         poly = [p1, p2, p3, p4]
+
+        if (p3[0] - p1[0]) < OcrConfidence.min_width:
+            # logger.info(f"width too small: {p3[0] - p1[0]}, text: {text}")
+            continue
+
         if calculate_is_angle(poly):
             # logger.info(f"average_angle_degrees: {average_angle_degrees}, text: {text}")
             # 与x轴的夹角超过0.5度,对边界做一下矫正

+ 0 - 16
next_docs/en/.readthedocs.yaml

@@ -1,16 +0,0 @@
-version: 2
-
-build:
-  os: ubuntu-22.04
-  tools:
-    python: "3.10"
-
-formats:
-  - epub
-
-python:
-  install:
-    - requirements: next_docs/requirements.txt
-
-sphinx:
-  configuration: next_docs/en/conf.py

+ 0 - 20
next_docs/en/Makefile

@@ -1,20 +0,0 @@
-# Minimal makefile for Sphinx documentation
-#
-
-# You can set these variables from the command line, and also
-# from the environment for the first two.
-SPHINXOPTS    ?=
-SPHINXBUILD   ?= sphinx-build
-SOURCEDIR     = .
-BUILDDIR      = _build
-
-# Put it first so that "make" without argument is like "make help".
-help:
-	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-
-.PHONY: help Makefile
-
-# Catch-all target: route all unknown targets to Sphinx using the new
-# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
-%: Makefile
-	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

binární
next_docs/en/_static/image/MinerU-logo-hq.png


binární
next_docs/en/_static/image/MinerU-logo.png


Rozdílová data souboru nebyla zobrazena, protože soubor je příliš velký
+ 0 - 13
next_docs/en/_static/image/ReadTheDocs.svg


binární
next_docs/en/_static/image/datalab_logo.png


binární
next_docs/en/_static/image/flowchart_en.png


binární
next_docs/en/_static/image/flowchart_zh_cn.png


binární
next_docs/en/_static/image/inference_result.png


binární
next_docs/en/_static/image/layout_example.png


binární
next_docs/en/_static/image/logo.png


Rozdílová data souboru nebyla zobrazena, protože soubor je příliš velký
+ 0 - 3
next_docs/en/_static/image/pipeline.drawio.svg


binární
next_docs/en/_static/image/poly.png


binární
next_docs/en/_static/image/project_panorama_en.png


binární
next_docs/en/_static/image/project_panorama_zh_cn.png


binární
next_docs/en/_static/image/spans_example.png


binární
next_docs/en/_static/image/web_demo_1.png


+ 0 - 88
next_docs/en/additional_notes/faq.rst

@@ -1,88 +0,0 @@
-FAQ
-==========================
-
-1. When using the command ``pip install magic-pdf[full]`` on newer versions of macOS, the error ``zsh: no matches found: magic-pdf[full]`` occurs.
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-On macOS, the default shell has switched from Bash to Z shell, which has
-special handling logic for certain types of string matching. This can
-lead to the “no matches found” error. You can try disabling the globbing
-feature in the command line and then run the installation command again.
-
-.. code:: bash
-
-   setopt no_nomatch
-   pip install magic-pdf[full]
-
-2. Encountering the error ``pickle.UnpicklingError: invalid load key, 'v'.`` during use
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-This might be due to an incomplete download of the model file. You can
-try re-downloading the model file and then try again. Reference:
-https://github.com/opendatalab/MinerU/issues/143
-
-3. Where should the model files be downloaded and how should the ``/models-dir`` configuration be set?
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-The path for the model files is configured in “magic-pdf.json”. just
-like:
-
-.. code:: json
-
-   {
-     "models-dir": "/tmp/models"
-   }
-
-This path is an absolute path, not a relative path. You can obtain the
-absolute path in the models directory using the “pwd” command.
-Reference:
-https://github.com/opendatalab/MinerU/issues/155#issuecomment-2230216874
-
-4. Encountered the error ``ImportError: libGL.so.1: cannot open shared object file: No such file or directory`` in Ubuntu 22.04 on WSL2
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-The ``libgl`` library is missing in Ubuntu 22.04 on WSL2. You can
-install the ``libgl`` library with the following command to resolve the
-issue:
-
-.. code:: bash
-
-   sudo apt-get install libgl1-mesa-glx
-
-Reference: https://github.com/opendatalab/MinerU/issues/388
-
-5. Encountered error ``ModuleNotFoundError: No module named 'fairscale'``
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-You need to uninstall the module and reinstall it:
-
-.. code:: bash
-
-   pip uninstall fairscale
-   pip install fairscale
-
-Reference: https://github.com/opendatalab/MinerU/issues/411
-
-6. On some newer devices like the H100, the text parsed during OCR using CUDA acceleration is garbled.
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-The compatibility of cuda11 with new graphics cards is poor, and the
-CUDA version used by Paddle needs to be upgraded.
-
-.. code:: bash
-
-   pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu123/
-
-Reference: https://github.com/opendatalab/MinerU/issues/558
-
-
-7. On some Linux servers, the program immediately reports an error ``Illegal instruction (core dumped)``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-This might be because the server's CPU does not support the AVX/AVX2
-instruction set, or the CPU itself supports it but has been disabled by
-the system administrator. You can try contacting the system
-administrator to remove the restriction or change to a different server.
-
-References: https://github.com/opendatalab/MinerU/issues/591 ,
-https://github.com/opendatalab/MinerU/issues/736

+ 0 - 14
next_docs/en/additional_notes/glossary.rst

@@ -1,14 +0,0 @@
-
-
-Glossary 
-===========
-
-1. jsonl 
-    Newline-delimited (\n), and each line must be a valid, independent JSON object. 
-    Currently, All the function shipped with **MinerU** assume that json object must contain one field named with either **path** or **file_location**
-
-
-2. magic-pdf.json 
-    TODO
-
-

+ 0 - 20
next_docs/en/additional_notes/known_issues.rst

@@ -1,20 +0,0 @@
-Known Issues
-============
-
--  Reading order is determined by the model based on the spatial
-   distribution of readable content, and may be out of order in some
-   areas under extremely complex layouts.
--  Vertical text is not supported.
--  Tables of contents and lists are recognized through rules, and some
-   uncommon list formats may not be recognized.
--  Only one level of headings is supported; hierarchical headings are
-   not currently supported.
--  Code blocks are not yet supported in the layout model.
--  Comic books, art albums, primary school textbooks, and exercises
-   cannot be parsed well.
--  Table recognition may result in row/column recognition errors in
-   complex tables.
--  OCR recognition may produce inaccurate characters in PDFs of
-   lesser-known languages (e.g., diacritical marks in Latin script,
-   easily confused characters in Arabic script).
--  Some formulas may not render correctly in Markdown.

+ 0 - 11
next_docs/en/api.rst

@@ -1,11 +0,0 @@
-
-.. toctree::
-   :maxdepth: 2
-
-   api/dataset
-   api/data_reader_writer
-   api/read_api
-   api/schemas
-   api/io
-   api/pipe_operators
-   api/model_operators

+ 0 - 44
next_docs/en/api/data_reader_writer.rst

@@ -1,44 +0,0 @@
-
-Data Reader Writer
-===================
-
-.. autoclass:: magic_pdf.data.data_reader_writer.DataReader
-   :members:
-   :inherited-members:
-   :show-inheritance:
-
-.. autoclass:: magic_pdf.data.data_reader_writer.DataWriter
-   :members:
-   :inherited-members:
-   :show-inheritance:
-
-.. autoclass:: magic_pdf.data.data_reader_writer.S3DataReader
-   :members:
-   :inherited-members:
-   :show-inheritance:
-
-.. autoclass:: magic_pdf.data.data_reader_writer.S3DataWriter
-   :members:
-   :inherited-members:
-   :show-inheritance:
-
-.. autoclass:: magic_pdf.data.data_reader_writer.FileBasedDataReader
-   :members:
-   :inherited-members:
-   :show-inheritance:
-
-.. autoclass:: magic_pdf.data.data_reader_writer.FileBasedDataWriter
-   :members:
-   :inherited-members:
-   :show-inheritance:
-
-.. autoclass:: magic_pdf.data.data_reader_writer.MultiBucketS3DataReader
-   :members:
-   :inherited-members:
-   :show-inheritance:
-
-.. autoclass:: magic_pdf.data.data_reader_writer.MultiBucketS3DataWriter
-   :members:
-   :inherited-members:
-   :show-inheritance:
-

+ 0 - 28
next_docs/en/api/dataset.rst

@@ -1,28 +0,0 @@
-Dataset
-========
-
-.. autoclass:: magic_pdf.data.dataset.PageableData
-   :members:
-   :inherited-members:
-   :show-inheritance:
-
-
-.. autoclass:: magic_pdf.data.dataset.Dataset
-   :members:
-   :inherited-members:
-   :show-inheritance:
-
-.. autoclass:: magic_pdf.data.dataset.ImageDataset
-   :members:
-   :inherited-members:
-   :show-inheritance:
-
-.. autoclass:: magic_pdf.data.dataset.PymuDocDataset
-   :members:
-   :inherited-members:
-   :show-inheritance:
-
-.. autoclass:: magic_pdf.data.dataset.Doc
-   :members:
-   :inherited-members:
-   :show-inheritance:

+ 0 - 33
next_docs/en/api/io.rst

@@ -1,33 +0,0 @@
-IO
-==
-
-.. autoclass:: magic_pdf.data.io.base.IOReader
-   :members:
-   :inherited-members:
-   :show-inheritance:
-
-.. autoclass:: magic_pdf.data.io.base.IOWriter
-   :members:
-   :inherited-members:
-   :show-inheritance:
-
-.. autoclass:: magic_pdf.data.io.s3.S3Reader
-   :members:
-   :inherited-members:
-   :show-inheritance:
-
-.. autoclass:: magic_pdf.data.io.s3.S3Writer
-   :members:
-   :inherited-members:
-   :show-inheritance:
-
-.. autoclass:: magic_pdf.data.io.http.HttpReader
-   :members:
-   :inherited-members:
-   :show-inheritance:
-
-.. autoclass:: magic_pdf.data.io.http.HttpWriter
-   :members:
-   :inherited-members:
-   :show-inheritance:
-

+ 0 - 8
next_docs/en/api/model_operators.rst

@@ -1,8 +0,0 @@
-
-Model Api
-==========
-
-.. autoclass:: magic_pdf.operators.InferenceResultBase
-   :members:
-   :inherited-members:
-   :show-inheritance:

+ 0 - 9
next_docs/en/api/pipe_operators.rst

@@ -1,9 +0,0 @@
-
-
-Pipeline Api
-=============
-
-.. autoclass:: magic_pdf.operators.pipes.PipeResult
-   :members:
-   :inherited-members:
-   :show-inheritance:

+ 0 - 6
next_docs/en/api/read_api.rst

@@ -1,6 +0,0 @@
-read_api
-=========
-
-.. automodule:: magic_pdf.data.read_api
-   :members:
-   :inherited-members:

+ 0 - 10
next_docs/en/api/schemas.rst

@@ -1,10 +0,0 @@
-
-schemas 
-===========
-
-.. autopydantic_model:: magic_pdf.data.schemas.S3Config
-   :members:
-
-.. autopydantic_model:: magic_pdf.data.schemas.PageInfo
-   :members:
-

+ 0 - 151
next_docs/en/conf.py

@@ -1,151 +0,0 @@
-# Configuration file for the Sphinx documentation builder.
-#
-# This file only contains a selection of the most common options. For a full
-# list see the documentation:
-# https://www.sphinx-doc.org/en/master/usage/configuration.html
-
-# -- Path setup --------------------------------------------------------------
-
-# If extensions (or modules to document with autodoc) are in another directory,
-# add these directories to sys.path here. If the directory is relative to the
-# documentation root, use os.path.abspath to make it absolute, like shown here.
-
-import os
-import subprocess
-import sys
-
-from sphinx.ext import autodoc
-from docutils import nodes
-from docutils.parsers.rst import Directive
-
-def install(package):
-    subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])
-
-
-requirements_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'requirements.txt'))
-if os.path.exists(requirements_path):
-    with open(requirements_path) as f:
-        packages = f.readlines()
-    for package in packages:
-        install(package.strip())
-
-sys.path.insert(0, os.path.abspath('../..'))
-
-# -- Project information -----------------------------------------------------
-
-project = 'MinerU'
-copyright = '2024, MinerU Contributors'
-author = 'OpenDataLab'
-
-# The full version, including alpha/beta/rc tags
-version_file = '../../magic_pdf/libs/version.py'
-with open(version_file) as f:
-    exec(compile(f.read(), version_file, 'exec'))
-__version__ = locals()['__version__']
-# The short X.Y version
-version = __version__
-# The full version, including alpha/beta/rc tags
-release = __version__
-
-# -- General configuration ---------------------------------------------------
-
-# Add any Sphinx extension module names here, as strings. They can be
-# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
-# ones.
-extensions = [
-    'sphinx.ext.napoleon',
-    'sphinx.ext.viewcode',
-    'sphinx.ext.intersphinx',
-    'sphinx_copybutton',
-    'sphinx.ext.autodoc',
-    'sphinx.ext.autosummary',
-    'sphinx.ext.inheritance_diagram',
-    'myst_parser',
-    'sphinxarg.ext',
-    'sphinxcontrib.autodoc_pydantic',
-]
-
-# class hierarchy diagram
-inheritance_graph_attrs = dict(rankdir="LR", size='"8.0, 12.0"', fontsize=14, ratio='compress')
-inheritance_node_attrs = dict(shape='ellipse', fontsize=14, height=0.75)
-inheritance_edge_attrs = dict(arrow='vee')
-
-autodoc_pydantic_model_show_json = True
-autodoc_pydantic_model_show_config_summary = False
-
-# Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
-
-# List of patterns, relative to source directory, that match files and
-# directories to ignore when looking for source files.
-# This pattern also affects html_static_path and html_extra_path.
-exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
-
-# Exclude the prompt "$" when copying code
-copybutton_prompt_text = r'\$ '
-copybutton_prompt_is_regexp = True
-
-language = 'en'
-
-# -- Options for HTML output -------------------------------------------------
-
-# The theme to use for HTML and HTML Help pages.  See the documentation for
-# a list of builtin themes.
-#
-html_theme = 'sphinx_book_theme'
-html_logo = '_static/image/logo.png'
-html_theme_options = {
-    'path_to_docs': 'next_docs/en',
-    'repository_url': 'https://github.com/opendatalab/MinerU',
-    'use_repository_button': True,
-}
-# Add any paths that contain custom static files (such as style sheets) here,
-# relative to this directory. They are copied after the builtin static files,
-# so a file named "default.css" will overwrite the builtin "default.css".
-# html_static_path = ['_static']
-
-# Mock out external dependencies here.
-autodoc_mock_imports = [
-    'cpuinfo',
-    'torch',
-    'transformers',
-    'psutil',
-    'prometheus_client',
-    'sentencepiece',
-    'vllm.cuda_utils',
-    'vllm._C',
-    # 'numpy',
-    'tqdm',
-]
-
-
-class MockedClassDocumenter(autodoc.ClassDocumenter):
-    """Remove note about base class when a class is derived from object."""
-
-    def add_line(self, line: str, source: str, *lineno: int) -> None:
-        if line == '   Bases: :py:class:`object`':
-            return
-        super().add_line(line, source, *lineno)
-
-
-autodoc.ClassDocumenter = MockedClassDocumenter
-
-navigation_with_keys = False
-
-
-# add custom directive 
-
-
-class VideoDirective(Directive):
-    required_arguments = 1
-    optional_arguments = 0
-    final_argument_whitespace = True
-    option_spec = {}
-
-    def run(self):
-        url = self.arguments[0]
-        video_node = nodes.raw('', f'<iframe width="560" height="315" src="{url}" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>', format='html')
-        return [video_node]
-
-def setup(app):
-    app.add_directive('video', VideoDirective)

+ 0 - 111
next_docs/en/index.rst

@@ -1,111 +0,0 @@
-.. xtuner documentation master file, created by
-   sphinx-quickstart on Tue Jan  9 16:33:06 2024.
-   You can adapt this file completely to your liking, but it should at least
-   contain the root `toctree` directive.
-
-Welcome to the MinerU Documentation
-==============================================
-
-.. figure:: ./_static/image/logo.png
-  :align: center
-  :alt: mineru
-  :class: no-scaled-link
-
-.. raw:: html
-
-   <p style="text-align:center">
-   <strong>A one-stop, open-source, high-quality data extraction tool
-   </strong>
-   </p>
-
-   <p style="text-align:center">
-   <script async defer src="https://buttons.github.io/buttons.js"></script>
-   <a class="github-button" href="https://github.com/opendatalab/MinerU" data-show-count="true" data-size="large" aria-label="Star">Star</a>
-   <a class="github-button" href="https://github.com/opendatalab/MinerU/subscription" data-icon="octicon-eye" data-size="large" aria-label="Watch">Watch</a>
-   <a class="github-button" href="https://github.com/opendatalab/MinerU/fork" data-icon="octicon-repo-forked" data-size="large" aria-label="Fork">Fork</a>
-   </p>
-
-
-Project Introduction
---------------------
-
-MinerU is a tool that converts PDFs into machine-readable formats (e.g.,
-markdown, JSON), allowing for easy extraction into any format. MinerU
-was born during the pre-training process of
-`InternLM <https://github.com/InternLM/InternLM>`__. We focus on solving
-symbol conversion issues in scientific literature and hope to contribute
-to technological development in the era of large models. Compared to
-well-known commercial products, MinerU is still young. If you encounter
-any issues or if the results are not as expected, please submit an issue
-on `issue <https://github.com/opendatalab/MinerU/issues>`__ and **attach
-the relevant PDF**.
-
-.. video:: https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
-
-
-Key Features
-------------
-
--  Remove headers, footers, footnotes, page numbers, etc., to ensure
-   semantic coherence.
--  Output text in human-readable order, suitable for single-column,
-   multi-column, and complex layouts.
--  Preserve the structure of the original document, including headings,
-   paragraphs, lists, etc.
--  Extract images, image descriptions, tables, table titles, and
-   footnotes.
--  Automatically recognize and convert formulas in the document to LaTeX
-   format.
--  Automatically recognize and convert tables in the document to LaTeX
-   or HTML format.
--  Automatically detect scanned PDFs and garbled PDFs and enable OCR
-   functionality.
--  OCR supports detection and recognition of 84 languages.
--  Supports multiple output formats, such as multimodal and NLP
-   Markdown, JSON sorted by reading order, and rich intermediate
-   formats.
--  Supports various visualization results, including layout
-   visualization and span visualization, for efficient confirmation of
-   output quality.
--  Supports both CPU and GPU environments.
--  Compatible with Windows, Linux, and Mac platforms.
-
-
-.. tip::
-
-   Get started with MinerU by trying the `online demo <https://www.modelscope.cn/studios/OpenDataLab/MinerU>`_ or :doc:`installing it locally <user_guide/install/install>`.
-
-
-User Guide
--------------
-.. toctree::
-   :maxdepth: 2
-   :caption: User Guide
-
-   user_guide
-
-
-API Reference
--------------
-
-If you are looking for information on a specific function, class or
-method, this part of the documentation is for you.
-
-.. toctree::
-   :maxdepth: 2
-   :caption: API
-
-   api
-
-
-Additional Notes
-------------------
-.. toctree::
-   :maxdepth: 1
-   :caption: Additional Notes
-
-   additional_notes/known_issues
-   additional_notes/faq
-   additional_notes/glossary
-
-

+ 0 - 35
next_docs/en/make.bat

@@ -1,35 +0,0 @@
-@ECHO OFF
-
-pushd %~dp0
-
-REM Command file for Sphinx documentation
-
-if "%SPHINXBUILD%" == "" (
-	set SPHINXBUILD=sphinx-build
-)
-set SOURCEDIR=.
-set BUILDDIR=_build
-
-%SPHINXBUILD% >NUL 2>NUL
-if errorlevel 9009 (
-	echo.
-	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
-	echo.installed, then set the SPHINXBUILD environment variable to point
-	echo.to the full path of the 'sphinx-build' executable. Alternatively you
-	echo.may add the Sphinx directory to PATH.
-	echo.
-	echo.If you don't have Sphinx installed, grab it from
-	echo.https://www.sphinx-doc.org/
-	exit /b 1
-)
-
-if "%1" == "" goto help
-
-%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-goto end
-
-:help
-%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-
-:end
-popd

+ 0 - 12
next_docs/en/user_guide.rst

@@ -1,12 +0,0 @@
-
-
-.. toctree::
-    :maxdepth: 2
-
-    user_guide/install
-    user_guide/usage
-    user_guide/quick_start
-    user_guide/tutorial
-    user_guide/data
-    user_guide/inference_result
-    user_guide/pipe_result

+ 0 - 19
next_docs/en/user_guide/data.rst

@@ -1,19 +0,0 @@
-
-
-Data
-=========
-
-.. toctree::
-   :maxdepth: 2
-
-   data/dataset
-
-   data/read_api
-
-   data/data_reader_writer 
-
-   data/io
-
-
-
-

+ 0 - 236
next_docs/en/user_guide/data/data_reader_writer.rst

@@ -1,236 +0,0 @@
-
-Data Reader Writer 
-====================
-
-Aims for read or write bytes from different media, You can implement new classes to meet the needs of your personal scenarios 
-if MinerU have not provide the suitable classes. It is easy to implement new classes, the only one requirement is to inherit from
-``DataReader`` or ``DataWriter``
-
-.. code:: python
-
-    class SomeReader(DataReader):
-        def read(self, path: str) -> bytes:
-            pass
-
-        def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
-            pass
-
-
-    class SomeWriter(DataWriter):
-        def write(self, path: str, data: bytes) -> None:
-            pass
-
-        def write_string(self, path: str, data: str) -> None:
-            pass
-
-
-Reader may curious about the difference between :doc:`io` and this section. Those two sections look very similarity at first glance.
-:doc:`io` provides fundamental functions, while This section thinks more at application level. Customer can build they own classes to meet 
-their own applications need which may share same IO function. That is why we have :doc:`io`.
-
-
-Important Classes
------------------
-
-.. code:: python
-
-    class FileBasedDataReader(DataReader):
-        def __init__(self, parent_dir: str = ''):
-            pass
-
-
-    class FileBasedDataWriter(DataWriter):
-        def __init__(self, parent_dir: str = '') -> None:
-            pass
-
-Class ``FileBasedDataReader`` initialized with unary param ``parent_dir``, That means that every method ``FileBasedDataReader`` provided will have features as follow.
-
-Features:
-    #. read content from the absolute path file, ``parent_dir`` will be ignored.
-    #. read the relative path, file will first join with ``parent_dir``, then read content from the merged path
-
-
-.. note::
-
-    ``FileBasedDataWriter`` shares the same behavior with ``FileBaseDataReader``
-
-
-.. code:: python 
-
-    class MultiS3Mixin:
-        def __init__(self, default_prefix: str, s3_configs: list[S3Config]):
-            pass
-
-    class MultiBucketS3DataReader(DataReader, MultiS3Mixin):
-        pass
-
-All read-related method that class ``MultiBucketS3DataReader`` provided will have features as follow.
-
-Features:
-    #. read object with full s3-format path, for example ``s3://test_bucket/test_object``, ``default_prefix`` will be ignored.
-    #. read object with relative path, file will join ``default_prefix`` and trim the ``bucket_name`` firstly, then read the content. ``bucket_name`` is the first element of the result after split ``default_prefix`` with delimiter ``\`` 
-
-.. note::
-    ``MultiBucketS3DataWriter`` shares the same behavior with ``MultiBucketS3DataReader``
-
-
-.. code:: python
-
-    class S3DataReader(MultiBucketS3DataReader):
-        pass
-
-``S3DataReader`` is build on top of MultiBucketS3DataReader which only support for bucket. So is ``S3DataWriter``. 
-
-
-Read Examples
-------------
-
-.. code:: python
-
-    import os 
-    from magic_pdf.data.data_reader_writer import *
-    from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
-    from magic_pdf.data.schemas import S3Config
-
-    # file based related
-    file_based_reader1 = FileBasedDataReader('')
-
-    ## will read file abc
-    file_based_reader1.read('abc')
-
-    file_based_reader2 = FileBasedDataReader('/tmp')
-
-    ## will read /tmp/abc
-    file_based_reader2.read('abc')
-
-    ## will read /tmp/logs/message.txt
-    file_based_reader2.read('/tmp/logs/message.txt')
-
-    # multi bucket s3 releated
-    bucket = "bucket"               # replace with real bucket
-    ak = "ak"                       # replace with real access key
-    sk = "sk"                       # replace with real secret key
-    endpoint_url = "endpoint_url"   # replace with real endpoint_url
-
-    bucket_2 = "bucket_2"               # replace with real bucket
-    ak_2 = "ak_2"                       # replace with real access key
-    sk_2 = "sk_2"                       # replace with real secret key 
-    endpoint_url_2 = "endpoint_url_2"   # replace with real endpoint_url
-
-    test_prefix = 'test/unittest'
-    multi_bucket_s3_reader1 = MultiBucketS3DataReader(f"{bucket}/{test_prefix}", [S3Config(
-            bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
-        ),
-        S3Config(
-            bucket_name=bucket_2,
-            access_key=ak_2,
-            secret_key=sk_2,
-            endpoint_url=endpoint_url_2,
-        )])
-
-    ## will read s3://{bucket}/{test_prefix}/abc
-    multi_bucket_s3_reader1.read('abc')
-
-    ## will read s3://{bucket}/{test_prefix}/efg
-    multi_bucket_s3_reader1.read(f's3://{bucket}/{test_prefix}/efg')
-
-    ## will read s3://{bucket2}/{test_prefix}/abc
-    multi_bucket_s3_reader1.read(f's3://{bucket_2}/{test_prefix}/abc')
-
-    # s3 related
-    s3_reader1 = S3DataReader(
-        test_prefix,
-        bucket,
-        ak,
-        sk,
-        endpoint_url
-    )
-
-    ## will read s3://{bucket}/{test_prefix}/abc
-    s3_reader1.read('abc')
-
-    ## will read s3://{bucket}/efg
-    s3_reader1.read(f's3://{bucket}/efg')
-
-
-Write Examples
----------------
-
-.. code:: python
-
-    import os
-    from magic_pdf.data.data_reader_writer import *
-    from magic_pdf.data.data_reader_writer import MultiBucketS3DataWriter
-    from magic_pdf.data.schemas import S3Config
-
-    # file based related
-    file_based_writer1 = FileBasedDataWriter("")
-
-    ## will write 123 to abc
-    file_based_writer1.write("abc", "123".encode())
-
-    ## will write 123 to abc
-    file_based_writer1.write_string("abc", "123")
-
-    file_based_writer2 = FileBasedDataWriter("/tmp")
-
-    ## will write 123 to /tmp/abc
-    file_based_writer2.write_string("abc", "123")
-
-    ## will write 123 to /tmp/logs/message.txt
-    file_based_writer2.write_string("/tmp/logs/message.txt", "123")
-
-    # multi bucket s3 releated
-    bucket = "bucket"               # replace with real bucket
-    ak = "ak"                       # replace with real access key
-    sk = "sk"                       # replace with real secret key
-    endpoint_url = "endpoint_url"   # replace with real endpoint_url
-
-    bucket_2 = "bucket_2"               # replace with real bucket
-    ak_2 = "ak_2"                       # replace with real access key
-    sk_2 = "sk_2"                       # replace with real secret key 
-    endpoint_url_2 = "endpoint_url_2"   # replace with real endpoint_url
-
-    test_prefix = "test/unittest"
-    multi_bucket_s3_writer1 = MultiBucketS3DataWriter(
-        f"{bucket}/{test_prefix}",
-        [
-            S3Config(
-                bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
-            ),
-            S3Config(
-                bucket_name=bucket_2,
-                access_key=ak_2,
-                secret_key=sk_2,
-                endpoint_url=endpoint_url_2,
-            ),
-        ],
-    )
-
-    ## will write 123 to s3://{bucket}/{test_prefix}/abc
-    multi_bucket_s3_writer1.write_string("abc", "123")
-
-    ## will write 123 to s3://{bucket}/{test_prefix}/abc
-    multi_bucket_s3_writer1.write("abc", "123".encode())
-
-    ## will write 123 to s3://{bucket}/{test_prefix}/efg
-    multi_bucket_s3_writer1.write(f"s3://{bucket}/{test_prefix}/efg", "123".encode())
-
-    ## will write 123 to s3://{bucket_2}/{test_prefix}/abc
-    multi_bucket_s3_writer1.write(f's3://{bucket_2}/{test_prefix}/abc', '123'.encode())
-
-    # s3 related
-    s3_writer1 = S3DataWriter(test_prefix, bucket, ak, sk, endpoint_url)
-
-    ## will write 123 to s3://{bucket}/{test_prefix}/abc
-    s3_writer1.write("abc", "123".encode())
-
-    ## will write 123 to s3://{bucket}/{test_prefix}/abc
-    s3_writer1.write_string("abc", "123")
-
-    ## will write 123 to s3://{bucket}/efg
-    s3_writer1.write(f"s3://{bucket}/efg", "123".encode())
-
-
-
-Check :doc:`../../api/data_reader_writer` for more details

+ 0 - 40
next_docs/en/user_guide/data/dataset.rst

@@ -1,40 +0,0 @@
-
-
-Dataset 
-===========
-
-
-Import Classes 
------------------
-
-Dataset 
-^^^^^^^^
-
-Each pdfs or image will form one ``Dataset``. As we all know, Pdf has two categories, :ref:`digital_method_section` or :ref:`ocr_method_section`.
-Will get ``ImageDataset`` which is subclass of ``Dataset`` with images and get ``PymuDocDataset`` from pdf files.
-The difference between ``ImageDataset`` and ``PymuDocDataset`` is that ``ImageDataset`` only support ``OCR`` parse method, 
-while ``PymuDocDataset`` support both ``OCR`` and ``TXT``
-
-.. note::
-
-    In fact some pdf may generated by images, that means it can not support ``TXT`` methods. Currently it is something the user needs to ensure does not happen
-
-
-
-Pdf Parse Methods
-------------------
-
-.. _ocr_method_section:
-OCR 
-^^^^
-Extract chars via ``Optical Character Recognition`` technical.
-
-.. _digital_method_section:
-TXT
-^^^^^^^^
-Extract chars via third-party library, currently we use ``pymupdf``. 
-
-
-
-Check :doc:`../../api/dataset` for more details
-

+ 0 - 25
next_docs/en/user_guide/data/io.rst

@@ -1,25 +0,0 @@
-
-IO
-===
-
-Aims for read or write bytes from different media, Currently We provide ``S3Reader``, ``S3Writer`` for AWS S3 compatible media 
-and ``HttpReader``, ``HttpWriter`` for remote Http file. You can implement new classes to meet the needs of your personal scenarios 
-if MinerU have not provide the suitable classes. It is easy to implement new classes, the only one requirement is to inherit from
-``IOReader`` or ``IOWriter``
-
-.. code:: python
-
-    class SomeReader(IOReader):
-        def read(self, path: str) -> bytes:
-            pass
-
-        def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
-            pass
-
-
-    class SomeWriter(IOWriter):
-        def write(self, path: str, data: bytes) -> None:
-            pass
-
-Check :doc:`../../api/io` for more details
-

+ 0 - 106
next_docs/en/user_guide/data/read_api.rst

@@ -1,106 +0,0 @@
-
-read_api 
-==========
-
-Read the content from file or directory to create ``Dataset``, Currently we provided serval functions that cover some scenarios.
-if you have new scenarios that is common to most of the users, you can post it on the offical github issues with detail descriptions.
-Also it is easy to implement your own read-related funtions.
-
-
-Important Functions
--------------------
-
-
-read_jsonl
-^^^^^^^^^^^^^^^^
-
-Read the contet from jsonl which may located on local machine or remote s3. if you want to know more about jsonl, please goto :doc:`../../additional_notes/glossary`
-
-.. code:: python
-
-    from magic_pdf.data.read_api import *
-    from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
-    from magic_pdf.data.schemas import S3Config
-
-    # read jsonl from local machine
-    datasets = read_jsonl("tt.jsonl", None)   # replace with real jsonl file
-
-    # read jsonl from remote s3
-
-    bucket = "bucket_1"                     # replace with real s3 bucket
-    ak = "access_key_1"                     # replace with real s3 access key
-    sk = "secret_key_1"                     # replace with real s3 secret key
-    endpoint_url = "endpoint_url_1"         # replace with real s3 endpoint url
-
-    bucket_2 = "bucket_2"                   # replace with real s3 bucket
-    ak_2 = "access_key_2"                   # replace with real s3 access key
-    sk_2 = "secret_key_2"                   # replace with real s3 secret key
-    endpoint_url_2 = "endpoint_url_2"       # replace with real s3 endpoint url
-
-    s3configs = [
-        S3Config(
-            bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
-        ),
-        S3Config(
-            bucket_name=bucket_2,
-            access_key=ak_2,
-            secret_key=sk_2,
-            endpoint_url=endpoint_url_2,
-        ),
-    ]
-
-    s3_reader = MultiBucketS3DataReader(bucket, s3configs)
-
-    datasets = read_jsonl(f"s3://bucket_1/tt.jsonl", s3_reader)  # replace with real s3 jsonl file
-
-read_local_pdfs
-^^^^^^^^^^^^^^^^^
-
-Read pdf from path or directory.
-
-
-.. code:: python
-
-    from magic_pdf.data.read_api import *
-
-    # read pdf path
-    datasets = read_local_pdfs("tt.pdf")
-
-    # read pdfs under directory
-    datasets = read_local_pdfs("pdfs/")
-
-
-read_local_images
-^^^^^^^^^^^^^^^^^^^
-
-Read images from path or directory
-
-.. code:: python 
-
-    from magic_pdf.data.read_api import *
-
-    # read from image path 
-    datasets = read_local_images("tt.png")  # replace with real file path
-
-    # read files from directory that endswith suffix in suffixes array 
-    datasets = read_local_images("images/", suffixes=[".png", ".jpg"])  # replace with real directory 
-
-
-read_local_office
-^^^^^^^^^^^^^^^^^^^^
-Read MS-Office files from path or directory
-
-.. code:: python 
-
-    from magic_pdf.data.read_api import *
-
-    # read from image path 
-    datasets = read_local_office("tt.doc")  # replace with real file path
-
-    # read files from directory that endswith suffix in suffixes array 
-    datasets = read_local_office("docs/")  # replace with real directory 
-
-
-
-
-Check :doc:`../../api/read_api` for more details

+ 0 - 144
next_docs/en/user_guide/inference_result.rst

@@ -1,144 +0,0 @@
-
-Inference Result
-==================
-
-.. admonition:: Tip
-    :class: tip
-
-    Please first navigate to :doc:`tutorial/pipeline` to get an initial understanding of how the pipeline works; this will help in understanding the content of this section.
-
-The **InferenceResult** class is a container for storing model inference results and implements a series of methods related to these results, such as draw_model, dump_model.
-Checkout :doc:`../api/model_operators` for more details about **InferenceResult**
-
-
-Model Inference Result
------------------------
-
-Structure Definition
-^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. code:: python
-
-    from pydantic import BaseModel, Field
-    from enum import IntEnum
-
-    class CategoryType(IntEnum):
-            title = 0               # Title
-            plain_text = 1          # Text
-            abandon = 2             # Includes headers, footers, page numbers, and page annotations
-            figure = 3              # Image
-            figure_caption = 4      # Image description
-            table = 5               # Table
-            table_caption = 6       # Table description
-            table_footnote = 7      # Table footnote
-            isolate_formula = 8     # Block formula
-            formula_caption = 9     # Formula label
-
-            embedding = 13          # Inline formula
-            isolated = 14           # Block formula
-            text = 15               # OCR recognition result
-
-
-    class PageInfo(BaseModel):
-        page_no: int = Field(description="Page number, the first page is 0", ge=0)
-        height: int = Field(description="Page height", gt=0)
-        width: int = Field(description="Page width", ge=0)
-
-    class ObjectInferenceResult(BaseModel):
-        category_id: CategoryType = Field(description="Category", ge=0)
-        poly: list[float] = Field(description="Quadrilateral coordinates, representing the coordinates of the top-left, top-right, bottom-right, and bottom-left points respectively")
-        score: float = Field(description="Confidence of the inference result")
-        latex: str | None = Field(description="LaTeX parsing result", default=None)
-        html: str | None = Field(description="HTML parsing result", default=None)
-
-    class PageInferenceResults(BaseModel):
-            layout_dets: list[ObjectInferenceResult] = Field(description="Page recognition results", ge=0)
-            page_info: PageInfo = Field(description="Page metadata")
-
-
-Example
-^^^^^^^^^^^
-
-.. code:: json
-
-    [
-        {
-            "layout_dets": [
-                {
-                    "category_id": 2,
-                    "poly": [
-                        99.1906967163086,
-                        100.3119125366211,
-                        730.3707885742188,
-                        100.3119125366211,
-                        730.3707885742188,
-                        245.81326293945312,
-                        99.1906967163086,
-                        245.81326293945312
-                    ],
-                    "score": 0.9999997615814209
-                }
-            ],
-            "page_info": {
-                "page_no": 0,
-                "height": 2339,
-                "width": 1654
-            }
-        },
-        {
-            "layout_dets": [
-                {
-                    "category_id": 5,
-                    "poly": [
-                        99.13092803955078,
-                        2210.680419921875,
-                        497.3183898925781,
-                        2210.680419921875,
-                        497.3183898925781,
-                        2264.78076171875,
-                        99.13092803955078,
-                        2264.78076171875
-                    ],
-                    "score": 0.9999997019767761
-                }
-            ],
-            "page_info": {
-                "page_no": 1,
-                "height": 2339,
-                "width": 1654
-            }
-        }
-    ]
-
-The format of the poly coordinates is [x0, y0, x1, y1, x2, y2, x3, y3],
-representing the coordinates of the top-left, top-right, bottom-right,
-and bottom-left points respectively. |Poly Coordinate Diagram|
-
-
-
-Inference Result
--------------------------
-
-
-.. code:: python
-
-    from magic_pdf.operators.models import InferenceResult
-    from magic_pdf.data.dataset import Dataset
-
-    dataset : Dataset = some_data_set    # not real dataset
-
-    # The inference results of all pages, ordered by page number, are stored in a list as the inference results of MinerU
-    model_inference_result: list[PageInferenceResults] = []
-
-    Inference_result = InferenceResult(model_inference_result, dataset)
-
-
-
-some_model.pdf
-^^^^^^^^^^^^^^^^^^^^
-
-.. figure:: ../_static/image/inference_result.png
-
-
-
-.. |Poly Coordinate Diagram| image:: ../_static/image/poly.png

+ 0 - 12
next_docs/en/user_guide/install.rst

@@ -1,12 +0,0 @@
-
-Installation
-==============
-
-.. toctree::
-   :maxdepth: 1
-
-   install/install
-   install//boost_with_cuda
-   install/download_model_weight_files
-   install/config
-

+ 0 - 255
next_docs/en/user_guide/install/boost_with_cuda.rst

@@ -1,255 +0,0 @@
-
-Boost With Cuda 
-================
-
-
-If your device supports CUDA and meets the GPU requirements of the
-mainline environment, you can use GPU acceleration. Please select the
-appropriate guide based on your system:
-
--  :ref:`ubuntu_22_04_lts_section`
--  :ref:`windows_10_or_11_section`
-
-
-.. _ubuntu_22_04_lts_section:
-
-Ubuntu 22.04 LTS
------------------
-
-1. Check if NVIDIA Drivers Are Installed
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. code:: sh
-
-   nvidia-smi
-
-If you see information similar to the following, it means that the
-NVIDIA drivers are already installed, and you can skip Step 2.
-
-.. note::
-
-   ``CUDA Version`` should be >= 12.4, If the displayed version number is less than 12.4, please upgrade the driver.
-
-.. code:: text
-
-   +---------------------------------------------------------------------------------------+
-   | NVIDIA-SMI 570.133.07             Driver Version: 572.83         CUDA Version: 12.8   |
-   |-----------------------------------------+----------------------+----------------------+
-   | GPU  Name                     TCC/WDDM  | Bus-Id        Disp.A | Volatile Uncorr. ECC |
-   | Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
-   |                                         |                      |               MIG M. |
-   |=========================================+======================+======================|
-   |   0  NVIDIA GeForce RTX 3060 Ti   WDDM  | 00000000:01:00.0  On |                  N/A |
-   |  0%   51C    P8              12W / 200W |   1489MiB /  8192MiB |      5%      Default |
-   |                                         |                      |                  N/A |
-   +-----------------------------------------+----------------------+----------------------+
-
-2. Install the Driver
-~~~~~~~~~~~~~~~~~~~~~
-
-If no driver is installed, use the following command:
-
-.. code:: sh
-
-   sudo apt-get update
-   sudo apt-get install nvidia-driver-570-server
-
-Install the proprietary driver and restart your computer after
-installation.
-
-.. code:: sh
-
-   reboot
-
-3. Install Anaconda
-~~~~~~~~~~~~~~~~~~~
-
-If Anaconda is already installed, skip this step.
-
-.. code:: sh
-
-   wget https://repo.anaconda.com/archive/Anaconda3-2024.06-1-Linux-x86_64.sh
-   bash Anaconda3-2024.06-1-Linux-x86_64.sh
-
-In the final step, enter ``yes``, close the terminal, and reopen it.
-
-4. Create an Environment Using Conda
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Specify Python version 3.10~3.13.
-
-.. code:: sh
-
-    conda create -n mineru 'python=3.12' -y
-    conda activate mineru
-
-5. Install Applications
-~~~~~~~~~~~~~~~~~~~~~~~
-
-.. code:: sh
-
-   pip install -U magic-pdf[full]
-
-.. admonition:: TIP
-    :class: tip
-
-    After installation, you can check the version of ``magic-pdf`` using the following command:
-
-.. code:: sh
-
-   magic-pdf --version
-
-
-6. Download Models
-~~~~~~~~~~~~~~~~~~
-
-Refer to detailed instructions on :doc:`download_model_weight_files`
-
-7. Understand the Location of the Configuration File
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-After completing the `6. Download Models <#6-download-models>`__ step,
-the script will automatically generate a ``magic-pdf.json`` file in the
-user directory and configure the default model path. You can find the
-``magic-pdf.json`` file in your user directory.
-
-.. admonition:: TIP
-    :class: tip
-
-    The user directory for Linux is “/home/username”.
-
-8. First Run
-~~~~~~~~~~~~
-
-Download a sample file from the repository and test it.
-
-.. code:: sh
-
-   wget https://github.com/opendatalab/MinerU/raw/master/demo/pdfs/small_ocr.pdf
-   magic-pdf -p small_ocr.pdf -o ./output
-
-9. Test CUDA Acceleration
-~~~~~~~~~~~~~~~~~~~~~~~~~
-
-If your graphics card has at least **8GB** of VRAM, follow these steps
-to test CUDA acceleration:
-
-1. Modify the value of ``"device-mode"`` in the ``magic-pdf.json``
-   configuration file located in your home directory.
-
-   .. code:: json
-
-      {
-        "device-mode": "cuda"
-      }
-
-2. Test CUDA acceleration with the following command:
-
-   .. code:: sh
-
-      magic-pdf -p small_ocr.pdf -o ./output
-
-
-.. _windows_10_or_11_section:
-
-Windows 10/11
---------------
-
-1. Install CUDA
-~~~~~~~~~~~~~~~~~~~~~~~~~
-
-You need to install a CUDA version that is compatible with torch's requirements. For details, please refer to the [official PyTorch website](https://pytorch.org/get-started/locally/).
-
-- CUDA 11.8 https://developer.nvidia.com/cuda-11-8-0-download-archive
-- CUDA 12.4 https://developer.nvidia.com/cuda-12-4-0-download-archive
-- CUDA 12.6 https://developer.nvidia.com/cuda-12-6-0-download-archive
-- CUDA 12.8 https://developer.nvidia.com/cuda-12-8-0-download-archive
-
-
-2. Install Anaconda
-~~~~~~~~~~~~~~~~~~~
-
-If Anaconda is already installed, you can skip this step.
-
-Download link: https://repo.anaconda.com/archive/Anaconda3-2024.06-1-Windows-x86_64.exe
-
-3. Create an Environment Using Conda
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-::
-
-    conda create -n mineru 'python=3.12' -y
-    conda activate mineru
-
-4. Install Applications
-~~~~~~~~~~~~~~~~~~~~~~~
-
-::
-
-   pip install -U magic-pdf[full]
-
-.. admonition:: Tip
-    :class: tip
-
-    After installation, you can check the version of ``magic-pdf``:
-
-    .. code:: bash
-
-      magic-pdf --version
-
-
-5. Download Models
-~~~~~~~~~~~~~~~~~~
-
-Refer to detailed instructions on :doc:`download_model_weight_files`
-
-6. Understand the Location of the Configuration File
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-After completing the `5. Download Models <#5-download-models>`__ step,
-the script will automatically generate a ``magic-pdf.json`` file in the
-user directory and configure the default model path. You can find the
-``magic-pdf.json`` file in your 【user directory】 .
-
-.. admonition:: Tip
-    :class: tip
-
-    The user directory for Windows is “C:/Users/username”.
-
-7. First Run
-~~~~~~~~~~~~
-
-Download a sample file from the repository and test it.
-
-.. code:: powershell
-
-     wget https://github.com/opendatalab/MinerU/raw/master/demo/pdfs/small_ocr.pdf -O small_ocr.pdf
-     magic-pdf -p small_ocr.pdf -o ./output
-
-8. Test CUDA Acceleration
-~~~~~~~~~~~~~~~~~~~~~~~~~
-
-If your graphics card has at least 8GB of VRAM, follow these steps to
-test CUDA-accelerated parsing performance.
-
-1. **Overwrite the installation of torch and torchvision** supporting CUDA.(Please select the appropriate index-url based on your CUDA version. For more details, refer to the [PyTorch official website](https://pytorch.org/get-started/locally/).)
-
-.. code:: sh
-
-   pip install --force-reinstall torch torchvision --index-url https://download.pytorch.org/whl/cu124
-
-
-2. **Modify the value of ``"device-mode"``** in the ``magic-pdf.json``
-   configuration file located in your user directory.
-
-   .. code:: json
-
-      {
-        "device-mode": "cuda"
-      }
-
-3. **Run the following command to test CUDA acceleration**:
-
-   ::
-
-      magic-pdf -p small_ocr.pdf -o ./output

+ 0 - 168
next_docs/en/user_guide/install/config.rst

@@ -1,168 +0,0 @@
-
-
-Config
-=========
-
-File **magic-pdf.json** is typically located in the **${HOME}** directory under a Linux system or in the **C:\Users\{username}** directory under a Windows system.
-
-.. admonition:: Tip 
-    :class: tip
-
-    You can override the default location of config file via the following command:
-    
-    export MINERU_TOOLS_CONFIG_JSON=new_magic_pdf.json
-
-
-
-magic-pdf.json
-----------------
-
-.. code:: json 
-
-    {
-        "bucket_info":{
-            "bucket-name-1":["ak", "sk", "endpoint"],
-            "bucket-name-2":["ak", "sk", "endpoint"]
-        },
-        "models-dir":"/tmp/models",
-        "layoutreader-model-dir":"/tmp/layoutreader",
-        "device-mode":"cpu",
-        "layout-config": {
-            "model": "doclayout_yolo"
-        },
-        "formula-config": {
-            "mfd_model": "yolo_v8_mfd",
-            "mfr_model": "unimernet_small",
-            "enable": true
-        },
-        "table-config": {
-            "model": "rapid_table",
-            "enable": true,
-            "max_time": 400    
-        },
-        "config_version": "1.0.0"
-    }
-
-
-
-
-bucket_info
-^^^^^^^^^^^^^^
-Store the access_key, secret_key and endpoint of AWS S3 Compatible storage config
-
-Example: 
-
-.. code:: text
-
-        {
-            "image_bucket":[{access_key}, {secret_key}, {endpoint}],
-            "video_bucket":[{access_key}, {secret_key}, {endpoint}]
-        }
-
-
-models-dir
-^^^^^^^^^^^^
-
-Store the models download from **huggingface** or **modelshop**. You do not need to modify this field if you download the model using the scripts shipped with **MinerU**
-
-
-layoutreader-model-dir
-^^^^^^^^^^^^^^^^^^^^^^^
-
-Store the models download from **huggingface** or **modelshop**. You do not need to modify this field if you download the model using the scripts shipped with **MinerU**
-
-
-devide-mode
-^^^^^^^^^^^^^^
-
-This field have two options, **cpu** or **cuda**.
-
-**cpu**: inference via cpu
-
-**cuda**: using cuda to accelerate inference
-
-
-layout-config 
-^^^^^^^^^^^^^^^
-
-.. code:: json
-
-    {
-        "model": "doclayout_yolo"
-    }
-
-layout model can not be disabled now.
-
-
-formula-config
-^^^^^^^^^^^^^^^^
-
-.. code:: json
-
-    {
-        "mfd_model": "yolo_v8_mfd",   
-        "mfr_model": "unimernet_small",
-        "enable": true 
-    }
-
-
-mfd_model
-""""""""""
-
-Specify the formula detection model, options are ['yolo_v8_mfd']
-
-
-mfr_model
-""""""""""
-Specify the formula recognition model, options are ['unimernet_small']
-
-Check `UniMERNet <https://github.com/opendatalab/UniMERNet>`_ for more details
-
-
-enable
-""""""""
-
-on-off flag, options are [true, false]. **true** means enable formula inference, **false** means disable formula inference
-
-
-table-config
-^^^^^^^^^^^^^^^^
-
-.. code:: json
-
-   {
-        "model": "rapid_table",
-        "enable": true,
-        "max_time": 400    
-    }
-
-model
-""""""""
-
-Specify the table inference model, options are ['rapid_table']
-
-
-max_time
-"""""""""
-
-Since table recognition is a time-consuming process, we set a timeout period. If the process exceeds this time, the table recognition will be terminated.
-
-
-
-enable
-"""""""
-
-on-off flag, options are [true, false]. **true** means enable table inference, **false** means disable table inference
-
-
-config_version
-^^^^^^^^^^^^^^^^
-
-The version of config schema.
-
-
-.. admonition:: Tip
-    :class: tip
-    
-    Check `Config Schema <https://github.com/opendatalab/MinerU/blob/master/magic-pdf.template.json>`_ for the latest details
-

+ 0 - 37
next_docs/en/user_guide/install/download_model_weight_files.rst

@@ -1,37 +0,0 @@
-
-Download Model Weight Files
-==============================
-
-Model downloads are divided into initial downloads and updates to the
-model directory. Please refer to the corresponding documentation for
-instructions on how to proceed.
-
-Initial download of model files
-------------------------------
-
-1. Download the Model from Hugging Face
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Use a Python Script to Download Model Files from Hugging Face
-
-.. code:: bash
-
-   pip install huggingface_hub
-   wget https://github.com/opendatalab/MinerU/raw/master/scripts/download_models_hf.py -O download_models_hf.py
-   python download_models_hf.py
-
-The Python script will automatically download the model files and
-configure the model directory in the configuration file.
-
-The configuration file can be found in the user directory, with the
-filename ``magic-pdf.json``.
-
-How to update models previously downloaded
------------------------------------------
-
-1. Models downloaded via Hugging Face or Model Scope
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-If you previously downloaded models via Hugging Face or Model Scope, you
-can rerun the Python script used for the initial download. This will
-automatically update the model directory to the latest version.

+ 0 - 142
next_docs/en/user_guide/install/install.rst

@@ -1,142 +0,0 @@
-
-Install 
-===============================================================
-If you encounter any installation issues, please first consult the :doc:`../../additional_notes/faq`.
-If the parsing results are not as expected, refer to the :doc:`../../additional_notes/known_issues`.
-
-Also you can try `online demo <https://www.modelscope.cn/studios/OpenDataLab/MinerU>`_ without installation.
-
-.. admonition:: Warning
-    :class: tip
-
-    **Pre-installation Notice—Hardware and Software Environment Support**
-
-    To ensure the stability and reliability of the project, we only optimize
-    and test for specific hardware and software environments during
-    development. This ensures that users deploying and running the project
-    on recommended system configurations will get the best performance with
-    the fewest compatibility issues.
-
-    By focusing resources on the mainline environment, our team can more
-    efficiently resolve potential bugs and develop new features.
-
-    In non-mainline environments, due to the diversity of hardware and
-    software configurations, as well as third-party dependency compatibility
-    issues, we cannot guarantee 100% project availability. Therefore, for
-    users who wish to use this project in non-recommended environments, we
-    suggest carefully reading the documentation and FAQ first. Most issues
-    already have corresponding solutions in the FAQ. We also encourage
-    community feedback to help us gradually expand support.
-
-.. raw:: html
-
-    <style>
-        table, th, td {
-        border: 1px solid black;
-        border-collapse: collapse;
-        }
-    </style>
-    <table>
-    <tr>
-        <td colspan="3" rowspan="2">Operating System</td>
-    </tr>
-    <tr>
-        <td>Linux after 2019</td>
-        <td>Windows 10 / 11</td>
-        <td>macOS 11+</td>
-    </tr>
-    <tr>
-        <td colspan="3">CPU</td>
-        <td>x86_64 / arm64</td>
-        <td>x86_64(unsupported ARM Windows)</td>
-        <td>x86_64 / arm64</td>
-    </tr>
-    <tr>
-        <td colspan="3">Memory Requirements</td>
-        <td colspan="3">16GB or more, recommended 32GB+</td>
-    </tr>
-    <tr>
-        <td colspan="3">Storage Requirements</td>
-        <td colspan="3">20GB or more, with a preference for SSD</td>
-    </tr>
-    <tr>
-        <td colspan="3">Python Version</td>
-        <td colspan="3">3.10~3.13</td>
-    </tr>
-    <tr>
-        <td colspan="3">Nvidia Driver Version</td>
-        <td>latest (Proprietary Driver)</td>
-        <td>latest</td>
-        <td>None</td>
-    </tr>
-    <tr>
-        <td colspan="3">CUDA Environment</td>
-        <td colspan="2"><a href="https://pytorch.org/get-started/locally/">Refer to the PyTorch official website</a></td>
-        <td>None</td>
-    </tr>
-    <tr>
-        <td colspan="3">CANN Environment(NPU support)</td>
-        <td>8.0+(Ascend 910b)</td>
-        <td>None</td>
-        <td>None</td>
-    </tr>
-    <tr>
-        <td rowspan="2">GPU/MPS Hardware Support List</td>
-        <td colspan="2">GPU VRAM 6GB or more</td>
-        <td colspan="2">All GPUs with Tensor Cores produced from Volta(2017) onwards.<br>
-        More than 6GB VRAM </td>
-        <td rowspan="2">Apple silicon</td>
-    </tr>
-    </table>
-
-
-
-Create an environment
----------------------------
-
-.. code-block:: shell
-
-    conda create -n mineru 'python=3.12' -y
-    conda activate mineru
-    pip install -U "magic-pdf[full]"
-
-
-Download model weight files
-------------------------------
-
-.. code-block:: shell
-
-    pip install huggingface_hub
-    wget https://github.com/opendatalab/MinerU/raw/master/scripts/download_models_hf.py -O download_models_hf.py
-    python download_models_hf.py    
-
-
-
-Install LibreOffice[Optional]
-----------------------------------
-
-This section is required for handle **doc**, **docx**, **ppt**, **pptx** filetype, You can **skip** this section if no need for those filetype processing.
-
-
-Linux/Macos Platform
-""""""""""""""""""""""
-
-.. code::
-
-    apt-get/yum/brew install libreoffice
-
-
-Windows Platform 
-""""""""""""""""""""
-
-.. code::
-
-    install libreoffice 
-    append "install_dir\LibreOffice\program" to ENVIRONMENT PATH
-
-
-.. tip::
-
-    The MinerU is installed, Check out :doc:`../usage/command_line` to convert your first pdf **or** reading the following sections for more details about install
-
-

+ 0 - 335
next_docs/en/user_guide/pipe_result.rst

@@ -1,335 +0,0 @@
-
-
-Pipe Result
-==============
-
-.. admonition:: Tip
-    :class: tip
-
-    Please first navigate to :doc:`tutorial/pipeline` to get an initial understanding of how the pipeline works; this will help in understanding the content of this section.
-
-
-The **PipeResult** class is a container for storing pipeline processing results and implements a series of methods related to these results, such as draw_layout, draw_span.
-Checkout :doc:`../api/pipe_operators` for more details about **PipeResult**
-
-
-
-Structure Definitions
--------------------------------
-
-**some_pdf_middle.json**
-
-+----------------+--------------------------------------------------------------+
-| Field Name     | Description                                                  |
-|                |                                                              |
-+================+==============================================================+
-| pdf_info       | list, each element is a dict representing the parsing result |
-|                | of each PDF page, see the table below for details            |
-+----------------+--------------------------------------------------------------+
-| \_             | ocr \| txt, used to indicate the mode used in this           |
-| parse_type     | intermediate parsing state                                   |
-|                |                                                              |
-+----------------+--------------------------------------------------------------+
-| \_version_name | string, indicates the version of magic-pdf used in this      |
-|                | parsing                                                      |
-|                |                                                              |
-+----------------+--------------------------------------------------------------+
-
-**pdf_info**
-
-Field structure description
-
-+-------------------------+------------------------------------------------------------+
-| Field                   | Description                                                |
-| Name                    |                                                            |
-+=========================+============================================================+
-| preproc_blocks          | Intermediate result after PDF preprocessing, not yet       |
-|                         | segmented                                                  |
-+-------------------------+------------------------------------------------------------+
-| layout_bboxes           | Layout segmentation results, containing layout direction   |
-|                         | (vertical, horizontal), and bbox, sorted by reading order  |
-+-------------------------+------------------------------------------------------------+
-| page_idx                | Page number, starting from 0                               |
-|                         |                                                            |
-+-------------------------+------------------------------------------------------------+
-| page_size               | Page width and height                                      |
-|                         |                                                            |
-+-------------------------+------------------------------------------------------------+
-| \_layout_tree           | Layout tree structure                                      |
-|                         |                                                            |
-+-------------------------+------------------------------------------------------------+
-| images                  | list, each element is a dict representing an img_block     |
-+-------------------------+------------------------------------------------------------+
-| tables                  | list, each element is a dict representing a table_block    |
-+-------------------------+------------------------------------------------------------+
-| interline_equation      | list, each element is a dict representing an               |
-|                         | interline_equation_block                                   |
-|                         |                                                            |
-+-------------------------+------------------------------------------------------------+
-| discarded_blocks        | List, block information returned by the model that needs   |
-|                         | to be dropped                                              |
-|                         |                                                            |
-+-------------------------+------------------------------------------------------------+
-| para_blocks             | Result after segmenting preproc_blocks                     |
-|                         |                                                            |
-+-------------------------+------------------------------------------------------------+
-
-In the above table, ``para_blocks`` is an array of dicts, each dict
-representing a block structure. A block can support up to one level of
-nesting.
-
-**block**
-
-The outer block is referred to as a first-level block, and the fields in
-the first-level block include:
-
-+------------------------+-------------------------------------------------------------+
-| Field                  | Description                                                 |
-| Name                   |                                                             |
-+========================+=============================================================+
-| type                   | Block type (table|image)                                    |
-+------------------------+-------------------------------------------------------------+
-| bbox                   | Block bounding box coordinates                              |
-+------------------------+-------------------------------------------------------------+
-| blocks                 | list, each element is a dict representing a second-level    |
-|                        | block                                                       |
-+------------------------+-------------------------------------------------------------+
-
-There are only two types of first-level blocks: “table” and “image”. All
-other blocks are second-level blocks.
-
-The fields in a second-level block include:
-
-+----------------------+----------------------------------------------------------------+
-| Field                | Description                                                    |
-| Name                 |                                                                |
-+======================+================================================================+
-|                      | Block type                                                     |
-| type                 |                                                                |
-+----------------------+----------------------------------------------------------------+
-|                      | Block bounding box coordinates                                 |
-| bbox                 |                                                                |
-+----------------------+----------------------------------------------------------------+
-|                      | list, each element is a dict representing a line, used to      |
-| lines                | describe the composition of a line of information              |
-+----------------------+----------------------------------------------------------------+
-
-Detailed explanation of second-level block types
-
-================== ======================
-type               Description
-================== ======================
-image_body         Main body of the image
-image_caption      Image description text
-table_body         Main body of the table
-table_caption      Table description text
-table_footnote     Table footnote
-text               Text block
-title              Title block
-interline_equation Block formula
-================== ======================
-
-**line**
-
-The field format of a line is as follows:
-
-+---------------------+----------------------------------------------------------------+
-| Field               | Description                                                    |
-| Name                |                                                                |
-+=====================+================================================================+
-|                     | Bounding box coordinates of the line                           |
-| bbox                |                                                                |
-+---------------------+----------------------------------------------------------------+
-| spans               | list, each element is a dict representing a span, used to      |
-|                     | describe the composition of the smallest unit                  |
-+---------------------+----------------------------------------------------------------+
-
-**span**
-
-+---------------------+-----------------------------------------------------------+
-| Field               | Description                                               |
-| Name                |                                                           |
-+=====================+===========================================================+
-| bbox                | Bounding box coordinates of the span                      |
-+---------------------+-----------------------------------------------------------+
-| type                | Type of the span                                          |
-+---------------------+-----------------------------------------------------------+
-| content             | Text spans use content, chart spans use img_path to store |
-| \|                  | the actual text or screenshot path information            |
-| img_path            |                                                           |
-+---------------------+-----------------------------------------------------------+
-
-The types of spans are as follows:
-
-================== ==============
-type               Description
-================== ==============
-image              Image
-table              Table
-text               Text
-inline_equation    Inline formula
-interline_equation Block formula
-================== ==============
-
-**Summary**
-
-A span is the smallest storage unit for all elements.
-
-The elements stored within para_blocks are block information.
-
-The block structure is as follows:
-
-First-level block (if any) -> Second-level block -> Line -> Span
-
-.. _example-1:
-
-example
-^^^^^^^
-
-.. code:: json
-
-   {
-       "pdf_info": [
-           {
-               "preproc_blocks": [
-                   {
-                       "type": "text",
-                       "bbox": [
-                           52,
-                           61.956024169921875,
-                           294,
-                           82.99800872802734
-                       ],
-                       "lines": [
-                           {
-                               "bbox": [
-                                   52,
-                                   61.956024169921875,
-                                   294,
-                                   72.0000228881836
-                               ],
-                               "spans": [
-                                   {
-                                       "bbox": [
-                                           54.0,
-                                           61.956024169921875,
-                                           296.2261657714844,
-                                           72.0000228881836
-                                       ],
-                                       "content": "dependent on the service headway and the reliability of the departure ",
-                                       "type": "text",
-                                       "score": 1.0
-                                   }
-                               ]
-                           }
-                       ]
-                   }
-               ],
-               "layout_bboxes": [
-                   {
-                       "layout_bbox": [
-                           52,
-                           61,
-                           294,
-                           731
-                       ],
-                       "layout_label": "V",
-                       "sub_layout": []
-                   }
-               ],
-               "page_idx": 0,
-               "page_size": [
-                   612.0,
-                   792.0
-               ],
-               "_layout_tree": [],
-               "images": [],
-               "tables": [],
-               "interline_equations": [],
-               "discarded_blocks": [],
-               "para_blocks": [
-                   {
-                       "type": "text",
-                       "bbox": [
-                           52,
-                           61.956024169921875,
-                           294,
-                           82.99800872802734
-                       ],
-                       "lines": [
-                           {
-                               "bbox": [
-                                   52,
-                                   61.956024169921875,
-                                   294,
-                                   72.0000228881836
-                               ],
-                               "spans": [
-                                   {
-                                       "bbox": [
-                                           54.0,
-                                           61.956024169921875,
-                                           296.2261657714844,
-                                           72.0000228881836
-                                       ],
-                                       "content": "dependent on the service headway and the reliability of the departure ",
-                                       "type": "text",
-                                       "score": 1.0
-                                   }
-                               ]
-                           }
-                       ]
-                   }
-               ]
-           }
-       ],
-       "_parse_type": "txt",
-       "_version_name": "0.6.1"
-   }
-
-
-Pipeline Result
-------------------
-
-.. code:: python
-
-    from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
-    from magic_pdf.operators.pipes import PipeResult
-    from magic_pdf.data.dataset import Dataset
-
-    res = pdf_parse_union(*args, **kwargs)
-    res['_parse_type'] = PARSE_TYPE_OCR
-    res['_version_name'] = __version__
-    if 'lang' in kwargs and kwargs['lang'] is not None:
-        res['lang'] = kwargs['lang']
-
-    dataset : Dataset = some_dataset   # not real dataset
-    pipeResult = PipeResult(res, dataset)
-
-
-
-some_pdf_layout.pdf
-~~~~~~~~~~~~~~~~~~~
-
-Each page layout consists of one or more boxes. The number at the top
-left of each box indicates its sequence number. Additionally, in
-``layout.pdf``, different content blocks are highlighted with different
-background colors.
-
-.. figure:: ../_static/image/layout_example.png
-   :alt: layout example
-
-   layout example
-
-some_pdf_spans.pdf
-~~~~~~~~~~~~~~~~~~
-
-All spans on the page are drawn with different colored line frames
-according to the span type. This file can be used for quality control,
-allowing for quick identification of issues such as missing text or
-unrecognized inline formulas.
-
-.. figure:: ../_static/image/spans_example.png
-   :alt: spans example
-
-   spans example

+ 0 - 12
next_docs/en/user_guide/quick_start.rst

@@ -1,12 +0,0 @@
-
-Quick Start 
-==============
-
-Want to learn about the usage methods under different scenarios? This page gives good examples of multiple usage cases that match your needs.
-
-.. toctree::
-    :maxdepth: 1
-
-    quick_start/convert_pdf 
-    quick_start/convert_image
-    quick_start/convert_ms_office

+ 0 - 47
next_docs/en/user_guide/quick_start/convert_image.rst

@@ -1,47 +0,0 @@
-
-
-Convert Image
-===============
-
-
-Command Line
-^^^^^^^^^^^^^
-
-.. code:: python
-
-    # make sure the file have correct suffix
-    magic-pdf -p a.png -o output -m auto
-
-
-API
-^^^^^^
-
-.. code:: python
-
-    import os
-
-    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
-    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-    from magic_pdf.data.read_api import read_local_images
-
-    # prepare env
-    local_image_dir, local_md_dir = "output/images", "output"
-    image_dir = str(os.path.basename(local_image_dir))
-
-    os.makedirs(local_image_dir, exist_ok=True)
-
-    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
-        local_md_dir
-    )
-
-    # proc
-    ## Create Dataset Instance
-    input_file = "some_image.jpg"       # replace with real image file
-
-    input_file_name = input_file.split(".")[0]
-    ds = read_local_images(input_file)[0]
-
-    # ocr mode
-    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
-        md_writer, f"{input_file_name}.md", image_dir
-    )

+ 0 - 60
next_docs/en/user_guide/quick_start/convert_ms_office.rst

@@ -1,60 +0,0 @@
-
-
-Convert Doc
-=============
-
-.. admonition:: Warning
-    :class: tip
-
-    When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
-
-    For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
-
-
-
-Command Line
-^^^^^^^^^^^^^
-
-.. code:: python
-
-    # replace with real ms-office file, we support MS-DOC, MS-DOCX, MS-PPT, MS-PPTX now
-    magic-pdf -p a.doc -o output -m auto
-
-
-API
-^^^^^^^^
-.. code:: python
-
-    import os
-
-    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
-    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-    from magic_pdf.data.read_api import read_local_office
-    from magic_pdf.config.enums import SupportedPdfParseMethod
-
-
-    # prepare env
-    local_image_dir, local_md_dir = "output/images", "output"
-    image_dir = str(os.path.basename(local_image_dir))
-
-    os.makedirs(local_image_dir, exist_ok=True)
-
-    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
-        local_md_dir
-    )
-
-    # proc
-    ## Create Dataset Instance
-    input_file = "some_doc.doc"     # replace with real ms-office file, we support MS-DOC, MS-DOCX, MS-PPT, MS-PPTX now
-
-    input_file_name = input_file.split(".")[0]
-    ds = read_local_office(input_file)[0]
-
-
-    ## inference
-    if ds.classify() == SupportedPdfParseMethod.OCR:
-        ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
-        md_writer, f"{input_file_name}.md", image_dir)
-    else:
-        ds.apply(doc_analyze, ocr=False).pipe_txt_mode(image_writer).dump_md(
-        md_writer, f"{input_file_name}.md", image_dir)

+ 0 - 56
next_docs/en/user_guide/quick_start/convert_pdf.rst

@@ -1,56 +0,0 @@
-
-
-Convert PDF
-============
-
-Command Line
-^^^^^^^^^^^^^
-
-.. code:: python
-
-    # make sure the file have correct suffix
-    magic-pdf -p a.pdf -o output -m auto
-
-
-API
-^^^^^^
-.. code:: python
-
-    import os
-
-    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
-    from magic_pdf.data.dataset import PymuDocDataset
-    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-
-    # args
-    pdf_file_name = "abc.pdf"  # replace with the real pdf path
-    name_without_suff = pdf_file_name.split(".")[0]
-
-    # prepare env
-    local_image_dir, local_md_dir = "output/images", "output"
-    image_dir = str(os.path.basename(local_image_dir))
-
-    os.makedirs(local_image_dir, exist_ok=True)
-
-    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
-        local_md_dir
-    )
-
-    # read bytes
-    reader1 = FileBasedDataReader("")
-    pdf_bytes = reader1.read(pdf_file_name)  # read the pdf content
-
-    # proc
-    ## Create Dataset Instance
-    ds = PymuDocDataset(pdf_bytes)
-
-    ## inference
-    if ds.classify() == SupportedPdfParseMethod.OCR:
-        ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
-        md_writer, f"{name_without_suff}.md", image_dir
-    )
-
-    else:
-        ds.apply(doc_analyze, ocr=False).pipe_txt_mode(image_writer).dump_md(
-        md_writer, f"{name_without_suff}.md", image_dir
-    )

+ 0 - 11
next_docs/en/user_guide/tutorial.rst

@@ -1,11 +0,0 @@
-
-Tutorial
-===========
-
-From beginning to end, this shows how to use MinerU via a minimal project.
-
-.. toctree::
-    :maxdepth: 1
-
-    tutorial/pipeline
-

+ 0 - 412
next_docs/en/user_guide/tutorial/output_file_description.rst

@@ -1,412 +0,0 @@
-
-Output File Description
-=========================
-
-After executing the ``magic-pdf`` command, in addition to outputting
-files related to markdown, several other files unrelated to markdown
-will also be generated. These files will be introduced one by one.
-
-some_pdf_layout.pdf
-~~~~~~~~~~~~~~~~~~~
-
-Each page layout consists of one or more boxes. The number at the top
-left of each box indicates its sequence number. Additionally, in
-``layout.pdf``, different content blocks are highlighted with different
-background colors.
-
-.. figure:: ../../_static/image/layout_example.png
-   :alt: layout example
-
-   layout example
-
-some_pdf_spans.pdf
-~~~~~~~~~~~~~~~~~~
-
-All spans on the page are drawn with different colored line frames
-according to the span type. This file can be used for quality control,
-allowing for quick identification of issues such as missing text or
-unrecognized inline formulas.
-
-.. figure:: ../../_static/image/spans_example.png
-   :alt: spans example
-
-   spans example
-
-some_pdf_model.json
-~~~~~~~~~~~~~~~~~~~
-
-Structure Definition
-^^^^^^^^^^^^^^^^^^^^
-
-.. code:: python
-
-   from pydantic import BaseModel, Field
-   from enum import IntEnum
-
-   class CategoryType(IntEnum):
-        title = 0               # Title
-        plain_text = 1          # Text
-        abandon = 2             # Includes headers, footers, page numbers, and page annotations
-        figure = 3              # Image
-        figure_caption = 4      # Image description
-        table = 5               # Table
-        table_caption = 6       # Table description
-        table_footnote = 7      # Table footnote
-        isolate_formula = 8     # Block formula
-        formula_caption = 9     # Formula label
-
-        embedding = 13          # Inline formula
-        isolated = 14           # Block formula
-        text = 15               # OCR recognition result
-
-
-   class PageInfo(BaseModel):
-       page_no: int = Field(description="Page number, the first page is 0", ge=0)
-       height: int = Field(description="Page height", gt=0)
-       width: int = Field(description="Page width", ge=0)
-
-   class ObjectInferenceResult(BaseModel):
-       category_id: CategoryType = Field(description="Category", ge=0)
-       poly: list[float] = Field(description="Quadrilateral coordinates, representing the coordinates of the top-left, top-right, bottom-right, and bottom-left points respectively")
-       score: float = Field(description="Confidence of the inference result")
-       latex: str | None = Field(description="LaTeX parsing result", default=None)
-       html: str | None = Field(description="HTML parsing result", default=None)
-
-   class PageInferenceResults(BaseModel):
-        layout_dets: list[ObjectInferenceResult] = Field(description="Page recognition results", ge=0)
-        page_info: PageInfo = Field(description="Page metadata")
-
-
-   # The inference results of all pages, ordered by page number, are stored in a list as the inference results of MinerU
-   inference_result: list[PageInferenceResults] = []
-
-The format of the poly coordinates is [x0, y0, x1, y1, x2, y2, x3, y3],
-representing the coordinates of the top-left, top-right, bottom-right,
-and bottom-left points respectively. |Poly Coordinate Diagram|
-
-example
-^^^^^^^
-
-.. code:: json
-
-   [
-       {
-           "layout_dets": [
-               {
-                   "category_id": 2,
-                   "poly": [
-                       99.1906967163086,
-                       100.3119125366211,
-                       730.3707885742188,
-                       100.3119125366211,
-                       730.3707885742188,
-                       245.81326293945312,
-                       99.1906967163086,
-                       245.81326293945312
-                   ],
-                   "score": 0.9999997615814209
-               }
-           ],
-           "page_info": {
-               "page_no": 0,
-               "height": 2339,
-               "width": 1654
-           }
-       },
-       {
-           "layout_dets": [
-               {
-                   "category_id": 5,
-                   "poly": [
-                       99.13092803955078,
-                       2210.680419921875,
-                       497.3183898925781,
-                       2210.680419921875,
-                       497.3183898925781,
-                       2264.78076171875,
-                       99.13092803955078,
-                       2264.78076171875
-                   ],
-                   "score": 0.9999997019767761
-               }
-           ],
-           "page_info": {
-               "page_no": 1,
-               "height": 2339,
-               "width": 1654
-           }
-       }
-   ]
-
-some_pdf_middle.json
-~~~~~~~~~~~~~~~~~~~~
-
-+----------------+--------------------------------------------------------------+
-| Field Name     | Description                                                  |
-|                |                                                              |
-+================+==============================================================+
-| pdf_info       | list, each element is a dict representing the parsing result |
-|                | of each PDF page, see the table below for details            |
-+----------------+--------------------------------------------------------------+
-| \_             | ocr \| txt, used to indicate the mode used in this           |
-| parse_type     | intermediate parsing state                                   |
-|                |                                                              |
-+----------------+--------------------------------------------------------------+
-| \_version_name | string, indicates the version of magic-pdf used in this      |
-|                | parsing                                                      |
-|                |                                                              |
-+----------------+--------------------------------------------------------------+
-
-**pdf_info**
-
-Field structure description
-
-+-------------------------+------------------------------------------------------------+
-| Field                   | Description                                                |
-| Name                    |                                                            |
-+=========================+============================================================+
-| preproc_blocks          | Intermediate result after PDF preprocessing, not yet       |
-|                         | segmented                                                  |
-+-------------------------+------------------------------------------------------------+
-| layout_bboxes           | Layout segmentation results, containing layout direction   |
-|                         | (vertical, horizontal), and bbox, sorted by reading order  |
-+-------------------------+------------------------------------------------------------+
-| page_idx                | Page number, starting from 0                               |
-|                         |                                                            |
-+-------------------------+------------------------------------------------------------+
-| page_size               | Page width and height                                      |
-|                         |                                                            |
-+-------------------------+------------------------------------------------------------+
-| \_layout_tree           | Layout tree structure                                      |
-|                         |                                                            |
-+-------------------------+------------------------------------------------------------+
-| images                  | list, each element is a dict representing an img_block     |
-+-------------------------+------------------------------------------------------------+
-| tables                  | list, each element is a dict representing a table_block    |
-+-------------------------+------------------------------------------------------------+
-| interline_equation      | list, each element is a dict representing an               |
-|                         | interline_equation_block                                   |
-|                         |                                                            |
-+-------------------------+------------------------------------------------------------+
-| discarded_blocks        | List, block information returned by the model that needs   |
-|                         | to be dropped                                              |
-|                         |                                                            |
-+-------------------------+------------------------------------------------------------+
-| para_blocks             | Result after segmenting preproc_blocks                     |
-|                         |                                                            |
-+-------------------------+------------------------------------------------------------+
-
-In the above table, ``para_blocks`` is an array of dicts, each dict
-representing a block structure. A block can support up to one level of
-nesting.
-
-**block**
-
-The outer block is referred to as a first-level block, and the fields in
-the first-level block include:
-
-+------------------------+-------------------------------------------------------------+
-| Field                  | Description                                                 |
-| Name                   |                                                             |
-+========================+=============================================================+
-| type                   | Block type (table|image)                                    |
-+------------------------+-------------------------------------------------------------+
-| bbox                   | Block bounding box coordinates                              |
-+------------------------+-------------------------------------------------------------+
-| blocks                 | list, each element is a dict representing a second-level    |
-|                        | block                                                       |
-+------------------------+-------------------------------------------------------------+
-
-There are only two types of first-level blocks: “table” and “image”. All
-other blocks are second-level blocks.
-
-The fields in a second-level block include:
-
-+----------------------+----------------------------------------------------------------+
-| Field                | Description                                                    |
-| Name                 |                                                                |
-+======================+================================================================+
-|                      | Block type                                                     |
-| type                 |                                                                |
-+----------------------+----------------------------------------------------------------+
-|                      | Block bounding box coordinates                                 |
-| bbox                 |                                                                |
-+----------------------+----------------------------------------------------------------+
-|                      | list, each element is a dict representing a line, used to      |
-| lines                | describe the composition of a line of information              |
-+----------------------+----------------------------------------------------------------+
-
-Detailed explanation of second-level block types
-
-================== ======================
-type               Description
-================== ======================
-image_body         Main body of the image
-image_caption      Image description text
-table_body         Main body of the table
-table_caption      Table description text
-table_footnote     Table footnote
-text               Text block
-title              Title block
-interline_equation Block formula
-================== ======================
-
-**line**
-
-The field format of a line is as follows:
-
-+---------------------+----------------------------------------------------------------+
-| Field               | Description                                                    |
-| Name                |                                                                |
-+=====================+================================================================+
-|                     | Bounding box coordinates of the line                           |
-| bbox                |                                                                |
-+---------------------+----------------------------------------------------------------+
-| spans               | list, each element is a dict representing a span, used to      |
-|                     | describe the composition of the smallest unit                  |
-+---------------------+----------------------------------------------------------------+
-
-**span**
-
-+---------------------+-----------------------------------------------------------+
-| Field               | Description                                               |
-| Name                |                                                           |
-+=====================+===========================================================+
-| bbox                | Bounding box coordinates of the span                      |
-+---------------------+-----------------------------------------------------------+
-| type                | Type of the span                                          |
-+---------------------+-----------------------------------------------------------+
-| content             | Text spans use content, chart spans use img_path to store |
-| \|                  | the actual text or screenshot path information            |
-| img_path            |                                                           |
-+---------------------+-----------------------------------------------------------+
-
-The types of spans are as follows:
-
-================== ==============
-type               Description
-================== ==============
-image              Image
-table              Table
-text               Text
-inline_equation    Inline formula
-interline_equation Block formula
-================== ==============
-
-**Summary**
-
-A span is the smallest storage unit for all elements.
-
-The elements stored within para_blocks are block information.
-
-The block structure is as follows:
-
-First-level block (if any) -> Second-level block -> Line -> Span
-
-.. _example-1:
-
-example
-^^^^^^^
-
-.. code:: json
-
-   {
-       "pdf_info": [
-           {
-               "preproc_blocks": [
-                   {
-                       "type": "text",
-                       "bbox": [
-                           52,
-                           61.956024169921875,
-                           294,
-                           82.99800872802734
-                       ],
-                       "lines": [
-                           {
-                               "bbox": [
-                                   52,
-                                   61.956024169921875,
-                                   294,
-                                   72.0000228881836
-                               ],
-                               "spans": [
-                                   {
-                                       "bbox": [
-                                           54.0,
-                                           61.956024169921875,
-                                           296.2261657714844,
-                                           72.0000228881836
-                                       ],
-                                       "content": "dependent on the service headway and the reliability of the departure ",
-                                       "type": "text",
-                                       "score": 1.0
-                                   }
-                               ]
-                           }
-                       ]
-                   }
-               ],
-               "layout_bboxes": [
-                   {
-                       "layout_bbox": [
-                           52,
-                           61,
-                           294,
-                           731
-                       ],
-                       "layout_label": "V",
-                       "sub_layout": []
-                   }
-               ],
-               "page_idx": 0,
-               "page_size": [
-                   612.0,
-                   792.0
-               ],
-               "_layout_tree": [],
-               "images": [],
-               "tables": [],
-               "interline_equations": [],
-               "discarded_blocks": [],
-               "para_blocks": [
-                   {
-                       "type": "text",
-                       "bbox": [
-                           52,
-                           61.956024169921875,
-                           294,
-                           82.99800872802734
-                       ],
-                       "lines": [
-                           {
-                               "bbox": [
-                                   52,
-                                   61.956024169921875,
-                                   294,
-                                   72.0000228881836
-                               ],
-                               "spans": [
-                                   {
-                                       "bbox": [
-                                           54.0,
-                                           61.956024169921875,
-                                           296.2261657714844,
-                                           72.0000228881836
-                                       ],
-                                       "content": "dependent on the service headway and the reliability of the departure ",
-                                       "type": "text",
-                                       "score": 1.0
-                                   }
-                               ]
-                           }
-                       ]
-                   }
-               ]
-           }
-       ],
-       "_parse_type": "txt",
-       "_version_name": "0.6.1"
-   }
-
-.. |Poly Coordinate Diagram| image:: ../../_static/image/poly.png

+ 0 - 182
next_docs/en/user_guide/tutorial/pipeline.rst

@@ -1,182 +0,0 @@
-
-
-Pipeline
-==========
-
-
-Minimal Example 
-^^^^^^^^^^^^^^^^^
-
-.. code:: python
-
-    import os
-
-    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
-    from magic_pdf.data.dataset import PymuDocDataset
-    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-
-    # args
-    pdf_file_name = "abc.pdf"  # replace with the real pdf path
-    name_without_suff = pdf_file_name.split(".")[0]
-
-    # prepare env
-    local_image_dir, local_md_dir = "output/images", "output"
-    image_dir = str(os.path.basename(local_image_dir))
-
-    os.makedirs(local_image_dir, exist_ok=True)
-
-    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
-        local_md_dir
-    )
-
-    # read bytes
-    reader1 = FileBasedDataReader("")
-    pdf_bytes = reader1.read(pdf_file_name)  # read the pdf content
-
-    # proc
-    ## Create Dataset Instance
-    ds = PymuDocDataset(pdf_bytes)
-
-    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)
-
-Running the above code will result in the following
-
-
-.. code:: bash 
-
-    output/
-    ├── abc.md
-    └── images
-
-
-Excluding the setup of the environment, such as creating directories and importing dependencies, the actual code snippet for converting pdf to markdown is as follows
-
-
-.. code:: python 
-
-    # read bytes
-    reader1 = FileBasedDataReader("")
-    pdf_bytes = reader1.read(pdf_file_name)  # read the pdf content
-
-    # proc
-    ## Create Dataset Instance
-    ds = PymuDocDataset(pdf_bytes)
-
-    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)
-
-``ds.apply(doc_analyze, ocr=True)`` generates an ``InferenceResult`` object. The ``InferenceResult`` object, when executing the ``pipe_ocr_mode`` method, produces a ``PipeResult`` object.
-The ``PipeResult`` object, upon executing ``dump_md``, generates a ``markdown`` file at the specified location.
-
-
-The pipeline execution process is illustrated in the following diagram
-
-
-.. image:: ../../_static/image/pipeline.drawio.svg 
-
-.. raw:: html
-
-    <br> </br>
-
-Currently, the process is divided into three stages: data, inference, and processing, which correspond to the ``Dataset``, ``InferenceResult``, and ``PipeResult`` entities in the diagram.
-These stages are linked together through methods like ``apply``, ``doc_analyze``, or ``pipe_ocr_mode``
-
-
-.. admonition:: Tip
-    :class: tip
-
-    For more detailed information about ``Dataset``, ``InferenceResult``, and ``PipeResult``, please refer to :doc:`../../api/dataset`, :doc:`../../api/model_operators`, :doc:`../../api/pipe_operators`
-
-
-Pipeline Composition
-^^^^^^^^^^^^^^^^^^^^^
-
-.. code:: python 
-
-    class Dataset(ABC):
-        @abstractmethod
-        def apply(self, proc: Callable, *args, **kwargs):
-            """Apply callable method which.
-
-            Args:
-                proc (Callable): invoke proc as follows:
-                    proc(self, *args, **kwargs)
-
-            Returns:
-                Any: return the result generated by proc
-            """
-            pass
-
-    class InferenceResult(InferenceResultBase):
-
-        def apply(self, proc: Callable, *args, **kwargs):
-            """Apply callable method which.
-
-            Args:
-                proc (Callable): invoke proc as follows:
-                    proc(inference_result, *args, **kwargs)
-
-            Returns:
-                Any: return the result generated by proc
-            """
-            return proc(copy.deepcopy(self._infer_res), *args, **kwargs)
-
-        def pipe_ocr_mode(
-            self,
-            imageWriter: DataWriter,
-            start_page_id=0,
-            end_page_id=None,
-            debug_mode=False,
-            lang=None,
-            ) -> PipeResult:
-            pass
-
-    class PipeResult:
-        def apply(self, proc: Callable, *args, **kwargs):
-            """Apply callable method which.
-
-            Args:
-                proc (Callable): invoke proc as follows:
-                    proc(pipeline_result, *args, **kwargs)
-
-            Returns:
-                Any: return the result generated by proc
-            """
-            return proc(copy.deepcopy(self._pipe_res), *args, **kwargs)
-
-
-The ``Dataset``, ``InferenceResult``, and ``PipeResult`` classes all have an ``apply`` method, which can be used to chain different stages of the computation. 
-As shown below, ``MinerU`` provides a set of methods to compose these classes.
-
-
-.. code:: python 
-
-    # proc
-    ## Create Dataset Instance
-    ds = PymuDocDataset(pdf_bytes)
-
-    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)
-
-
-Users can implement their own functions for chaining as needed. For example, a user could use the ``apply`` method to create a function that counts the number of pages in a ``pdf`` file.
-
-
-.. code:: python
-
-    from magic_pdf.data.data_reader_writer import  FileBasedDataReader
-    from magic_pdf.data.dataset import PymuDocDataset
-
-    # args
-    pdf_file_name = "abc.pdf"  # replace with the real pdf path
-
-    # read bytes
-    reader1 = FileBasedDataReader("")
-    pdf_bytes = reader1.read(pdf_file_name)  # read the pdf content
-
-    # proc
-    ## Create Dataset Instance
-    ds = PymuDocDataset(pdf_bytes)
-
-    def count_page(ds)-> int:
-        return len(ds)
-
-    print("page number: ", ds.apply(count_page)) # will output the page count of `abc.pdf`

+ 0 - 12
next_docs/en/user_guide/usage.rst

@@ -1,12 +0,0 @@
-
-
-Usage
-========
-
-.. toctree::
-   :maxdepth: 1
-
-   usage/command_line
-   usage/api
-   usage/docker
-

+ 0 - 279
next_docs/en/user_guide/usage/api.rst

@@ -1,279 +0,0 @@
-
-Api Usage
-===========
-
-
-PDF
-----
-
-Local File Example
-^^^^^^^^^^^^^^^^^^
-
-.. code:: python
-
-    import os
-
-    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
-    from magic_pdf.data.dataset import PymuDocDataset
-    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-    from magic_pdf.config.enums import SupportedPdfParseMethod
-
-    # args
-    pdf_file_name = "abc.pdf"  # replace with the real pdf path
-    name_without_suff = pdf_file_name.split(".")[0]
-
-    # prepare env
-    local_image_dir, local_md_dir = "output/images", "output"
-    image_dir = str(os.path.basename(local_image_dir))
-
-    os.makedirs(local_image_dir, exist_ok=True)
-
-    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
-        local_md_dir
-    )
-
-    # read bytes
-    reader1 = FileBasedDataReader("")
-    pdf_bytes = reader1.read(pdf_file_name)  # read the pdf content
-
-    # proc
-    ## Create Dataset Instance
-    ds = PymuDocDataset(pdf_bytes)
-
-    ## inference
-    if ds.classify() == SupportedPdfParseMethod.OCR:
-        infer_result = ds.apply(doc_analyze, ocr=True)
-
-        ## pipeline
-        pipe_result = infer_result.pipe_ocr_mode(image_writer)
-
-    else:
-        infer_result = ds.apply(doc_analyze, ocr=False)
-
-        ## pipeline
-        pipe_result = infer_result.pipe_txt_mode(image_writer)
-
-    ### draw model result on each page
-    infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
-
-    ### get model inference result
-    model_inference_result = infer_result.get_infer_res()
-
-    ### draw layout result on each page
-    pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
-
-    ### draw spans result on each page
-    pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
-
-    ### get markdown content
-    md_content = pipe_result.get_markdown(image_dir)
-
-    ### dump markdown
-    pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
-
-    ### get content list content
-    content_list_content = pipe_result.get_content_list(image_dir)
-
-    ### dump content list
-    pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
-
-    ### get middle json
-    middle_json_content = pipe_result.get_middle_json()
-
-    ### dump middle json
-    pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')
-
-
-
-S3 File Example
-^^^^^^^^^^^^^^^^
-
-.. code:: python
-
-    import os
-
-    from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
-    from magic_pdf.data.dataset import PymuDocDataset
-    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-    from magic_pdf.config.enums import SupportedPdfParseMethod
-
-    bucket_name = "{Your S3 Bucket Name}"  # replace with real bucket name
-    ak = "{Your S3 access key}"  # replace with real s3 access key
-    sk = "{Your S3 secret key}"  # replace with real s3 secret key
-    endpoint_url = "{Your S3 endpoint_url}"  # replace with real s3 endpoint_url
-
-    reader = S3DataReader('unittest/tmp/', bucket_name, ak, sk, endpoint_url)  # replace `unittest/tmp` with the real s3 prefix
-    writer = S3DataWriter('unittest/tmp', bucket_name, ak, sk, endpoint_url)
-    image_writer = S3DataWriter('unittest/tmp/images', bucket_name, ak, sk, endpoint_url)
-    md_writer = S3DataWriter('unittest/tmp', bucket_name, ak, sk, endpoint_url)
-
-    local_image_dir, local_md_dir = "output/images", "output"
-    image_dir = str(os.path.basename(local_image_dir))
-
-    # args
-    pdf_file_name = (
-        f"s3://{bucket_name}/unittest/tmp/bug5-11.pdf"  # replace with the real s3 path
-    )
-
-    # prepare env
-    local_dir = "output"
-    name_without_suff = os.path.basename(pdf_file_name).split(".")[0]
-
-    # read bytes
-    pdf_bytes = reader.read(pdf_file_name)  # read the pdf content
-
-    # proc
-    ## Create Dataset Instance
-    ds = PymuDocDataset(pdf_bytes)
-
-    ## inference
-    if ds.classify() == SupportedPdfParseMethod.OCR:
-        infer_result = ds.apply(doc_analyze, ocr=True)
-
-        ## pipeline
-        pipe_result = infer_result.pipe_ocr_mode(image_writer)
-
-    else:
-        infer_result = ds.apply(doc_analyze, ocr=False)
-
-        ## pipeline
-        pipe_result = infer_result.pipe_txt_mode(image_writer)
-
-    ### draw model result on each page
-    infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
-
-    ### get model inference result
-    model_inference_result = infer_result.get_infer_res()
-
-    ### draw layout result on each page
-    pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
-
-    ### draw spans result on each page
-    pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
-
-    ### dump markdown
-    pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
-
-    ### dump content list
-    pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
-
-    ### get markdown content
-    md_content = pipe_result.get_markdown(image_dir)
-
-    ### get content list content
-    content_list_content = pipe_result.get_content_list(image_dir)
-
-    ### get middle json
-    middle_json_content = pipe_result.get_middle_json()
-
-    ### dump middle json
-    pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')
-
-MS-Office
-----------
-
-.. code:: python
-
-    import os
-
-    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
-    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-    from magic_pdf.data.read_api import read_local_office
-
-    # prepare env
-    local_image_dir, local_md_dir = "output/images", "output"
-    image_dir = str(os.path.basename(local_image_dir))
-
-    os.makedirs(local_image_dir, exist_ok=True)
-
-    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
-        local_md_dir
-    )
-
-    # proc
-    ## Create Dataset Instance
-    input_file = "some_ppt.ppt"     # replace with real ms-office file
-
-    input_file_name = input_file.split(".")[0]
-    ds = read_local_office(input_file)[0]
-
-    ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
-        md_writer, f"{input_file_name}.md", image_dir
-    )
-
-This code snippet can be used to manipulate **ppt**, **pptx**, **doc**, **docx** file
-
-
-Image
----------
-
-Single Image File
-^^^^^^^^^^^^^^^^^^^
-
-.. code:: python
-
-    import os
-
-    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
-    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-    from magic_pdf.data.read_api import read_local_images
-
-    # prepare env
-    local_image_dir, local_md_dir = "output/images", "output"
-    image_dir = str(os.path.basename(local_image_dir))
-
-    os.makedirs(local_image_dir, exist_ok=True)
-
-    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
-        local_md_dir
-    )
-
-    # proc
-    ## Create Dataset Instance
-    input_file = "some_image.jpg"       # replace with real image file
-
-    input_file_name = input_file.split(".")[0]
-    ds = read_local_images(input_file)[0]
-
-    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
-        md_writer, f"{input_file_name}.md", image_dir
-    )
-
-
-Directory That Contains Images
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. code:: python
-
-    import os
-
-    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
-    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-    from magic_pdf.data.read_api import read_local_images
-
-    # prepare env
-    local_image_dir, local_md_dir = "output/images", "output"
-    image_dir = str(os.path.basename(local_image_dir))
-
-    os.makedirs(local_image_dir, exist_ok=True)
-
-    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
-        local_md_dir
-    )
-
-    # proc
-    ## Create Dataset Instance
-    input_directory = "some_image_dir/"       # replace with real directory that contains images
-
-
-    dss = read_local_images(input_directory, suffixes=['.png', '.jpg'])
-
-    count = 0
-    for ds in dss:
-        ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
-            md_writer, f"{count}.md", image_dir
-        )
-        count += 1
-
-
-Check :doc:`../data/data_reader_writer` for more [reader | writer] examples and check :doc:`../../api/pipe_operators` or :doc:`../../api/model_operators` for api details

+ 0 - 77
next_docs/en/user_guide/usage/command_line.rst

@@ -1,77 +0,0 @@
-
-
-Command Line
-===================
-
-.. code:: bash
-
-   magic-pdf --help
-   Usage: magic-pdf [OPTIONS]
-
-   Options:
-     -v, --version                display the version and exit
-     -p, --path PATH              local filepath or directory. support PDF, PPT,
-                                  PPTX, DOC, DOCX, PNG, JPG files  [required]
-     -o, --output-dir PATH        output local directory  [required]
-     -m, --method [ocr|txt|auto]  the method for parsing pdf. ocr: using ocr
-                                  technique to extract information from pdf. txt:
-                                  suitable for the text-based pdf only and
-                                  outperform ocr. auto: automatically choose the
-                                  best method for parsing pdf from ocr and txt.
-                                  without method specified, auto will be used by
-                                  default.
-     -l, --lang TEXT              Input the languages in the pdf (if known) to
-                                  improve OCR accuracy.  Optional. You should
-                                  input "Abbreviation" with language form url: ht
-                                  tps://paddlepaddle.github.io/PaddleOCR/en/ppocr
-                                  /blog/multi_languages.html#5-support-languages-
-                                  and-abbreviations
-     -d, --debug BOOLEAN          Enables detailed debugging information during
-                                  the execution of the CLI commands.
-     -s, --start INTEGER          The starting page for PDF parsing, beginning
-                                  from 0.
-     -e, --end INTEGER            The ending page for PDF parsing, beginning from
-                                  0.
-     --help                       Show this message and exit.
-
-
-   ## show version
-   magic-pdf -v
-
-   ## command line example
-   magic-pdf -p {some_pdf} -o {some_output_dir} -m auto
-
-
-.. admonition:: Important
-    :class: tip
-
-    The file must endswith with the following suffix.
-       .pdf 
-       .png
-       .jpg
-       .ppt
-       .pptx
-       .doc
-       .docx
-
-
-``{some_pdf}`` can be a single PDF file or a directory containing
-multiple PDFs. The results will be saved in the ``{some_output_dir}``
-directory. The output file list is as follows:
-
-.. code:: text
-
-   ├── some_pdf.md                          # markdown file
-   ├── images                               # directory for storing images
-   ├── some_pdf_layout.pdf                  # layout diagram
-   ├── some_pdf_middle.json                 # MinerU intermediate processing result
-   ├── some_pdf_model.json                  # model inference result
-   ├── some_pdf_origin.pdf                  # original PDF file
-   ├── some_pdf_spans.pdf                   # smallest granularity bbox position information diagram
-   └── some_pdf_content_list.json           # Rich text JSON arranged in reading order
-
-.. admonition:: Tip
-   :class: tip
-   
-
-   For more information about the output files, please refer to the :doc:`../inference_result` or :doc:`../pipe_result`

+ 0 - 24
next_docs/en/user_guide/usage/docker.rst

@@ -1,24 +0,0 @@
-
-
-Docker 
-=======
-
-.. admonition:: Important
-   :class: tip
-
-   Docker requires a GPU with at least 16GB of VRAM, and all acceleration features are enabled by default.
-
-   Before running this Docker, you can use the following command to check if your device supports CUDA acceleration on Docker. 
-
-   .. code-block:: bash
-
-      bash  docker run --rm --gpus=all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
-
-
-.. code:: sh
-
-   wget https://github.com/opendatalab/MinerU/raw/master/Dockerfile
-   docker build -t mineru:latest .
-   docker run --rm -it --gpus=all mineru:latest /bin/bash
-   magic-pdf --help
-

+ 0 - 17
next_docs/requirements.txt

@@ -1,17 +0,0 @@
-numpy==1.26.4
-click==8.1.7
-fast-langdetect==0.2.2
-Brotli==1.1.0
-boto3>=1.28.43
-loguru>=0.6.0
-myst-parser
-Pillow==8.4.0
-pydantic>=2.7.2,<2.8.0
-PyMuPDF>=1.24.9
-pdfminer.six==20231228
-sphinx
-sphinx-argparse>=0.5.2
-sphinx-book-theme>=1.1.3
-sphinx-copybutton>=0.5.2
-sphinx_rtd_theme>=3.0.1
-autodoc_pydantic>=2.2.0

+ 0 - 16
next_docs/zh_cn/.readthedocs.yaml

@@ -1,16 +0,0 @@
-version: 2
-
-build:
-  os: ubuntu-22.04
-  tools:
-    python: "3.10"
-
-formats:
-  - epub
-
-python:
-  install:
-    - requirements: next_docs/requirements.txt
-
-sphinx:
-  configuration: next_docs/zh_cn/conf.py

+ 0 - 20
next_docs/zh_cn/Makefile

@@ -1,20 +0,0 @@
-# Minimal makefile for Sphinx documentation
-#
-
-# You can set these variables from the command line, and also
-# from the environment for the first two.
-SPHINXOPTS    ?=
-SPHINXBUILD   ?= sphinx-build
-SOURCEDIR     = .
-BUILDDIR      = _build
-
-# Put it first so that "make" without argument is like "make help".
-help:
-	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-
-.PHONY: help Makefile
-
-# Catch-all target: route all unknown targets to Sphinx using the new
-# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
-%: Makefile
-	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

binární
next_docs/zh_cn/_static/image/MinerU-logo-hq.png


binární
next_docs/zh_cn/_static/image/MinerU-logo.png


Rozdílová data souboru nebyla zobrazena, protože soubor je příliš velký
+ 0 - 13
next_docs/zh_cn/_static/image/ReadTheDocs.svg


binární
next_docs/zh_cn/_static/image/datalab_logo.png


binární
next_docs/zh_cn/_static/image/flowchart_en.png


binární
next_docs/zh_cn/_static/image/flowchart_zh_cn.png


binární
next_docs/zh_cn/_static/image/inference_result.png


binární
next_docs/zh_cn/_static/image/layout_example.png


binární
next_docs/zh_cn/_static/image/logo.png


Rozdílová data souboru nebyla zobrazena, protože soubor je příliš velký
+ 0 - 3
next_docs/zh_cn/_static/image/pipeline.drawio.svg


binární
next_docs/zh_cn/_static/image/poly.png


binární
next_docs/zh_cn/_static/image/project_panorama_en.png


binární
next_docs/zh_cn/_static/image/project_panorama_zh_cn.png


Některé soubory nejsou zobrazeny, neboť je v těchto rozdílových datech změněno mnoho souborů