
Merge pull request #18 from myhloli/vlm_2.5

Vlm 2.5
Xiaomeng Zhao 2 months ago
parent
commit
9041f04588
56 changed files with 1512 additions and 3910 deletions
  1. README.md (+42 -18)
  2. README_zh-CN.md (+40 -18)
  3. demo/demo.py (+83 -90)
  4. docker/china/Dockerfile (+4 -8)
  5. docker/compose.yaml (+16 -22)
  6. docker/global/Dockerfile (+2 -4)
  7. docs/en/faq/index.md (+0 -12)
  8. docs/en/quick_start/docker_deployment.md (+13 -16)
  9. docs/en/quick_start/extension_modules.md (+7 -15)
  10. docs/en/quick_start/index.md (+3 -3)
  11. docs/en/reference/output_files.md (+269 -64)
  12. docs/en/usage/advanced_cli_parameters.md (+8 -21)
  13. docs/en/usage/cli_tools.md (+3 -3)
  14. docs/en/usage/quick_usage.md (+12 -12)
  15. docs/zh/faq/index.md (+0 -12)
  16. docs/zh/quick_start/docker_deployment.md (+14 -16)
  17. docs/zh/quick_start/extension_modules.md (+7 -15)
  18. docs/zh/quick_start/index.md (+3 -3)
  19. docs/zh/reference/output_files.md (+364 -64)
  20. docs/zh/usage/advanced_cli_parameters.md (+8 -21)
  21. docs/zh/usage/cli_tools.md (+3 -3)
  22. docs/zh/usage/quick_usage.md (+12 -12)
  23. mineru/backend/vlm/base_predictor.py (+0 -186)
  24. mineru/backend/vlm/hf_predictor.py (+0 -217)
  25. mineru/backend/vlm/model_output_to_middle_json.py (+123 -0)
  26. mineru/backend/vlm/predictor.py (+0 -111)
  27. mineru/backend/vlm/sglang_client_predictor.py (+0 -443)
  28. mineru/backend/vlm/sglang_engine_predictor.py (+0 -246)
  29. mineru/backend/vlm/token_to_middle_json.py (+0 -114)
  30. mineru/backend/vlm/utils.py (+0 -40)
  31. mineru/backend/vlm/vlm_analyze.py (+65 -16)
  32. mineru/backend/vlm/vlm_magic_model.py (+201 -135)
  33. mineru/backend/vlm/vlm_middle_json_mkcontent.py (+48 -5)
  34. mineru/cli/client.py (+6 -5)
  35. mineru/cli/common.py (+11 -16)
  36. mineru/cli/fast_api.py (+9 -7)
  37. mineru/cli/gradio_app.py (+17 -16)
  38. mineru/cli/vlm_sglang_server.py (+0 -4)
  39. mineru/cli/vlm_vllm_server.py (+4 -0)
  40. mineru/model/vlm_hf_model/__init__.py (+0 -9)
  41. mineru/model/vlm_hf_model/configuration_mineru2.py (+0 -38)
  42. mineru/model/vlm_hf_model/image_processing_mineru2.py (+0 -269)
  43. mineru/model/vlm_hf_model/modeling_mineru2.py (+0 -449)
  44. mineru/model/vlm_sglang_model/__init__.py (+0 -14)
  45. mineru/model/vlm_sglang_model/engine.py (+0 -264)
  46. mineru/model/vlm_sglang_model/image_processor.py (+0 -213)
  47. mineru/model/vlm_sglang_model/logit_processor.py (+0 -90)
  48. mineru/model/vlm_sglang_model/model.py (+0 -453)
  49. mineru/model/vlm_sglang_model/server.py (+0 -75)
  50. mineru/model/vlm_vllm_model/__init__.py (+0 -0)
  51. mineru/model/vlm_vllm_model/server.py (+39 -0)
  52. mineru/utils/draw_bbox.py (+30 -9)
  53. mineru/utils/enum_class.py (+16 -2)
  54. mineru/utils/guess_suffix_or_lang.py (+20 -0)
  55. projects/multi_gpu_v2/client.py (+1 -1)
  56. pyproject.toml (+9 -11)

+ 42 - 18
README.md

@@ -44,26 +44,50 @@
 
 # Changelog
 
-- 2025/09/10 2.2.2 Released
-  - Fixed the issue where the new table recognition model would affect the overall parsing task when some table parsing failed
-
-- 2025/09/08 2.2.1 Released  
-  - Fixed the issue where some newly added models were not downloaded when using the model download command.
-
-- 2025/09/05 2.2.0 Released
-  - Major Updates
-    - In this version, we focused on improving table parsing accuracy by introducing a new [wired table recognition model](https://github.com/RapidAI/TableStructureRec) and a brand-new hybrid table structure parsing algorithm, significantly enhancing the table recognition capabilities of the `pipeline` backend.
-    - We also added support for cross-page table merging, which is supported by both `pipeline` and `vlm` backends, further improving the completeness and accuracy of table parsing.
-  - Other Updates
-    - The `pipeline` backend now supports 270-degree rotated table parsing, bringing support for table parsing in 0/90/270-degree orientations
-    - `pipeline` added OCR capability support for Thai and Greek, and updated the English OCR model to the latest version. English recognition accuracy improved by 11%, Thai recognition model accuracy is 82.68%, and Greek recognition model accuracy is 89.28% (by PPOCRv5)
-    - Added `bbox` field (mapped to 0-1000 range) in the output `content_list.json`, making it convenient for users to directly obtain position information for each content block
-    - Removed the `pipeline_old_linux` installation option, no longer supporting legacy Linux systems such as `CentOS 7`, to provide better support for `uv`'s `sync`/`run` commands
+- 2025/09/19 2.5.0 Released
+  - vlm update to 2509-2.5 version
+
 
 <details>
   <summary>History Log</summary>
 
   <details>
+    <summary>2025/09/10 2.2.2 Released</summary>
+    <ul>
+      <li>Fixed the issue where the new table recognition model would affect the overall parsing task when some table parsing failed</li>
+    </ul>
+  </details>  
+
+  <details>
+    <summary>2025/09/08 2.2.1 Released</summary>
+    <ul>
+      <li>Fixed the issue where some newly added models were not downloaded when using the model download command.</li>
+    </ul>
+  </details>  
+
+  <details>
+    <summary>2025/09/05 2.2.0 Released</summary>
+    <ul>
+      <li>
+        Major Updates
+        <ul>
+          <li>In this version, we focused on improving table parsing accuracy by introducing a new <a href="https://github.com/RapidAI/TableStructureRec">wired table recognition model</a> and a brand-new hybrid table structure parsing algorithm, significantly enhancing the table recognition capabilities of the <code>pipeline</code> backend.</li>
+          <li>We also added support for cross-page table merging, which is supported by both <code>pipeline</code> and <code>vlm</code> backends, further improving the completeness and accuracy of table parsing.</li>
+        </ul>
+      </li>
+      <li>
+        Other Updates
+        <ul>
+          <li>The <code>pipeline</code> backend now supports 270-degree rotated table parsing, bringing support for table parsing in 0/90/270-degree orientations</li>
+          <li><code>pipeline</code> added OCR capability support for Thai and Greek, and updated the English OCR model to the latest version. English recognition accuracy improved by 11%, Thai recognition model accuracy is 82.68%, and Greek recognition model accuracy is 89.28% (by PPOCRv5)</li>
+          <li>Added <code>bbox</code> field (mapped to 0-1000 range) in the output <code>content_list.json</code>, making it convenient for users to directly obtain position information for each content block</li>
+          <li>Removed the <code>pipeline_old_linux</code> installation option, no longer supporting legacy Linux systems such as <code>CentOS 7</code>, to provide better support for <code>uv</code>'s <code>sync</code>/<code>run</code> commands</li>
+        </ul>
+      </li>
+    </ul>
+  </details>
+
+  <details>
     <summary>2025/08/01 2.1.10 Released</summary>
     <ul>
       <li>Fixed an issue in the <code>pipeline</code> backend where block overlap caused the parsing results to deviate from expectations #3232</li>
@@ -559,7 +583,7 @@ A WebUI developed based on Gradio, with a simple interface and only core parsing
         <td>Parsing Backend</td>
         <td>pipeline</td>
         <td>vlm-transformers</td>
-        <td>vlm-sglang</td>
+        <td>vlm-vllm</td>
     </tr>
     <tr>
         <td>Operating System</td>
@@ -637,8 +661,8 @@ You can use MinerU for PDF parsing through various methods such as command line,
 - [x] Handwritten Text Recognition  
 - [x] Vertical Text Recognition  
 - [x] Latin Accent Mark Recognition
-- [ ] Code block recognition in the main text
-- [ ] [Chemical formula recognition](docs/chemical_knowledge_introduction/introduction.pdf)
+- [x] Code block recognition in the main text
+- [x] [Chemical formula recognition](docs/chemical_knowledge_introduction/introduction.pdf)(mineru.net)
 - [ ] Geometric shape recognition
 
 # Known Issues

+ 40 - 18
README_zh-CN.md

@@ -44,25 +44,47 @@
 
 # 更新记录
 
-- 2025/09/10 2.2.2 发布
-  - 修复新的表格识别模型在部分表格解析失败时影响整体解析任务的问题
+- 2025/09/19 2.5.0 发布
+  - vlm模型更新2509-2.5版本
 
-- 2025/09/08 2.2.1 发布
-  - 修复使用模型下载命令时,部分新增模型未下载的问题
+<details>
+  <summary>历史日志</summary>
 
-- 2025/09/05 2.2.0 发布
-  - 主要更新
-    - 在这个版本我们重点提升了表格的解析精度,通过引入新的[有线表识别模型](https://github.com/RapidAI/TableStructureRec)和全新的混合表格结构解析算法,显著提升了`pipeline`后端的表格识别能力。
-    - 另外我们增加了对跨页表格合并的支持,这一功能同时支持`pipeline`和`vlm`后端,进一步提升了表格解析的完整性和准确性。
-  - 其他更新
-    - `pipeline`后端增加270度旋转的表格解析能力,现已支持0/90/270度三个方向的表格解析
-    - `pipeline`增加对泰文、希腊文的ocr能力支持,并更新了英文ocr模型至最新,英文识别精度提升11%,泰文识别模型精度 82.68%,希腊文识别模型精度 89.28%(by PPOCRv5)
-    - 在输出的`content_list.json`中增加了`bbox`字段(映射至0-1000范围内),方便用户直接获取每个内容块的位置信息
-    - 移除`pipeline_old_linux`安装可选项,不再支持老版本的Linux系统如`Centos 7`等,以便对`uv`的`sync`/`run`等命令进行更好的支持
+  <details>
+    <summary>2025/09/10 2.2.2 发布</summary>
+    <ul>
+      <li>修复新的表格识别模型在部分表格解析失败时影响整体解析任务的问题</li>
+    </ul>
+  </details>  
 
+  <details>
+    <summary>2025/09/08 2.2.1 发布</summary>
+    <ul>
+      <li>修复使用模型下载命令时,部分新增模型未下载的问题</li>
+    </ul>
+  </details>  
 
-<details>
-  <summary>历史日志</summary>
+  <details>
+    <summary>2025/09/05 2.2.0 发布</summary>
+    <ul>
+      <li>
+        主要更新
+        <ul>
+          <li>在这个版本我们重点提升了表格的解析精度,通过引入新的<a href="https://github.com/RapidAI/TableStructureRec">有线表识别模型</a>和全新的混合表格结构解析算法,显著提升了<code>pipeline</code>后端的表格识别能力。</li>
+          <li>另外我们增加了对跨页表格合并的支持,这一功能同时支持<code>pipeline</code>和<code>vlm</code>后端,进一步提升了表格解析的完整性和准确性。</li>
+        </ul>
+      </li>
+      <li>
+        其他更新
+        <ul>
+          <li><code>pipeline</code>后端增加270度旋转的表格解析能力,现已支持0/90/270度三个方向的表格解析</li>
+          <li><code>pipeline</code>增加对泰文、希腊文的ocr能力支持,并更新了英文ocr模型至最新,英文识别精度提升11%,泰文识别模型精度 82.68%,希腊文识别模型精度 89.28%(by PPOCRv5)</li>
+          <li>在输出的<code>content_list.json</code>中增加了<code>bbox</code>字段(映射至0-1000范围内),方便用户直接获取每个内容块的位置信息</li>
+          <li>移除<code>pipeline_old_linux</code>安装可选项,不再支持老版本的Linux系统如<code>Centos 7</code>等,以便对<code>uv</code>的<code>sync</code>/<code>run</code>等命令进行更好的支持</li>
+        </ul>
+      </li>
+    </ul>
+  </details>
 
   <details>
     <summary>2025/08/01 2.1.10 发布</summary>
@@ -548,7 +570,7 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
         <td>解析后端</td>
         <td>pipeline</td>
         <td>vlm-transformers</td>
-        <td>vlm-sglang</td>
+        <td>vlm-vllm</td>
     </tr>
     <tr>
         <td>操作系统</td>
@@ -626,8 +648,8 @@ mineru -p <input_path> -o <output_path>
 - [x] 手写文本识别
 - [x] 竖排文本识别
 - [x] 拉丁字母重音符号识别
-- [ ] 正文中代码块识别
-- [ ] [化学式识别](docs/chemical_knowledge_introduction/introduction.pdf)
+- [x] 正文中代码块识别
+- [x] [化学式识别](docs/chemical_knowledge_introduction/introduction.pdf)(https://mineru.net)
 - [ ] 图表内容识别
 
 # Known Issues

+ 83 - 90
demo/demo.py

@@ -15,7 +15,6 @@ from mineru.backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc
 from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as pipeline_union_make
 from mineru.backend.pipeline.model_json_to_middle_json import result_to_middle_json as pipeline_result_to_middle_json
 from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
-from mineru.utils.models_download_utils import auto_download_and_get_model_root_path
 
 
 def do_parse(
@@ -27,7 +26,7 @@ def do_parse(
     parse_method="auto",  # The method for parsing PDF, default is 'auto'
     formula_enable=True,  # Enable formula parsing
     table_enable=True,  # Enable table parsing
-    server_url=None,  # Server URL for vlm-sglang-client backend
+    server_url=None,  # Server URL for vlm-http-client backend
     f_draw_layout_bbox=True,  # Whether to draw layout bounding boxes
     f_draw_span_bbox=True,  # Whether to draw span bounding boxes
     f_dump_md=True,  # Whether to dump markdown files
@@ -62,47 +61,12 @@ def do_parse(
             pdf_info = middle_json["pdf_info"]
 
             pdf_bytes = pdf_bytes_list[idx]
-            if f_draw_layout_bbox:
-                draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf")
-
-            if f_draw_span_bbox:
-                draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf")
-
-            if f_dump_orig_pdf:
-                md_writer.write(
-                    f"{pdf_file_name}_origin.pdf",
-                    pdf_bytes,
-                )
-
-            if f_dump_md:
-                image_dir = str(os.path.basename(local_image_dir))
-                md_content_str = pipeline_union_make(pdf_info, f_make_md_mode, image_dir)
-                md_writer.write_string(
-                    f"{pdf_file_name}.md",
-                    md_content_str,
-                )
-
-            if f_dump_content_list:
-                image_dir = str(os.path.basename(local_image_dir))
-                content_list = pipeline_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
-                md_writer.write_string(
-                    f"{pdf_file_name}_content_list.json",
-                    json.dumps(content_list, ensure_ascii=False, indent=4),
-                )
-
-            if f_dump_middle_json:
-                md_writer.write_string(
-                    f"{pdf_file_name}_middle.json",
-                    json.dumps(middle_json, ensure_ascii=False, indent=4),
-                )
-
-            if f_dump_model_output:
-                md_writer.write_string(
-                    f"{pdf_file_name}_model.json",
-                    json.dumps(model_json, ensure_ascii=False, indent=4),
-                )
-
-            logger.info(f"local output dir is {local_md_dir}")
+            _process_output(
+                pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
+                md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
+                f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
+                f_make_md_mode, middle_json, model_json, is_pipeline=True
+            )
     else:
         if backend.startswith("vlm-"):
             backend = backend[4:]
@@ -118,48 +82,77 @@ def do_parse(
 
             pdf_info = middle_json["pdf_info"]
 
-            if f_draw_layout_bbox:
-                draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf")
-
-            if f_draw_span_bbox:
-                draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf")
-
-            if f_dump_orig_pdf:
-                md_writer.write(
-                    f"{pdf_file_name}_origin.pdf",
-                    pdf_bytes,
-                )
-
-            if f_dump_md:
-                image_dir = str(os.path.basename(local_image_dir))
-                md_content_str = vlm_union_make(pdf_info, f_make_md_mode, image_dir)
-                md_writer.write_string(
-                    f"{pdf_file_name}.md",
-                    md_content_str,
-                )
-
-            if f_dump_content_list:
-                image_dir = str(os.path.basename(local_image_dir))
-                content_list = vlm_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
-                md_writer.write_string(
-                    f"{pdf_file_name}_content_list.json",
-                    json.dumps(content_list, ensure_ascii=False, indent=4),
-                )
-
-            if f_dump_middle_json:
-                md_writer.write_string(
-                    f"{pdf_file_name}_middle.json",
-                    json.dumps(middle_json, ensure_ascii=False, indent=4),
-                )
-
-            if f_dump_model_output:
-                model_output = ("\n" + "-" * 50 + "\n").join(infer_result)
-                md_writer.write_string(
-                    f"{pdf_file_name}_model_output.txt",
-                    model_output,
-                )
-
-            logger.info(f"local output dir is {local_md_dir}")
+            _process_output(
+                pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
+                md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
+                f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
+                f_make_md_mode, middle_json, infer_result, is_pipeline=False
+            )
+
+
+def _process_output(
+        pdf_info,
+        pdf_bytes,
+        pdf_file_name,
+        local_md_dir,
+        local_image_dir,
+        md_writer,
+        f_draw_layout_bbox,
+        f_draw_span_bbox,
+        f_dump_orig_pdf,
+        f_dump_md,
+        f_dump_content_list,
+        f_dump_middle_json,
+        f_dump_model_output,
+        f_make_md_mode,
+        middle_json,
+        model_output=None,
+        is_pipeline=True
+):
+    """处理输出文件"""
+    if f_draw_layout_bbox:
+        draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf")
+
+    if f_draw_span_bbox:
+        draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf")
+
+    if f_dump_orig_pdf:
+        md_writer.write(
+            f"{pdf_file_name}_origin.pdf",
+            pdf_bytes,
+        )
+
+    image_dir = str(os.path.basename(local_image_dir))
+
+    if f_dump_md:
+        make_func = pipeline_union_make if is_pipeline else vlm_union_make
+        md_content_str = make_func(pdf_info, f_make_md_mode, image_dir)
+        md_writer.write_string(
+            f"{pdf_file_name}.md",
+            md_content_str,
+        )
+
+    if f_dump_content_list:
+        make_func = pipeline_union_make if is_pipeline else vlm_union_make
+        content_list = make_func(pdf_info, MakeMode.CONTENT_LIST, image_dir)
+        md_writer.write_string(
+            f"{pdf_file_name}_content_list.json",
+            json.dumps(content_list, ensure_ascii=False, indent=4),
+        )
+
+    if f_dump_middle_json:
+        md_writer.write_string(
+            f"{pdf_file_name}_middle.json",
+            json.dumps(middle_json, ensure_ascii=False, indent=4),
+        )
+
+    if f_dump_model_output:
+        md_writer.write_string(
+            f"{pdf_file_name}_model.json",
+            json.dumps(model_output, ensure_ascii=False, indent=4),
+        )
+
+    logger.info(f"local output dir is {local_md_dir}")
 
 
 def parse_doc(
@@ -182,8 +175,8 @@ def parse_doc(
         backend: the backend for parsing pdf:
             pipeline: More general.
             vlm-transformers: More general.
-            vlm-sglang-engine: Faster(engine).
-            vlm-sglang-client: Faster(client).
+            vlm-vllm-engine: Faster(engine).
+            vlm-http-client: Faster(client).
             without method specified, pipeline will be used by default.
         method: the method for parsing pdf:
             auto: Automatically determine the method based on the file type.
@@ -191,7 +184,7 @@ def parse_doc(
             ocr: Use OCR method for image-based PDFs.
             Without method specified, 'auto' will be used by default.
             Adapted only for the case where the backend is set to "pipeline".
-        server_url: When the backend is `sglang-client`, you need to specify the server_url, for example:`http://127.0.0.1:30000`
+        server_url: When the backend is `http-client`, you need to specify the server_url, for example:`http://127.0.0.1:30000`
         start_page_id: Start page ID for parsing, default is 0
         end_page_id: End page ID for parsing, default is None (parse all pages until the end of the document)
     """
@@ -241,5 +234,5 @@ if __name__ == '__main__':
 
     """To enable VLM mode, change the backend to 'vlm-xxx'"""
     # parse_doc(doc_path_list, output_dir, backend="vlm-transformers")  # more general.
-    # parse_doc(doc_path_list, output_dir, backend="vlm-sglang-engine")  # faster(engine).
-    # parse_doc(doc_path_list, output_dir, backend="vlm-sglang-client", server_url="http://127.0.0.1:30000")  # faster(client).
+    # parse_doc(doc_path_list, output_dir, backend="vlm-vllm-engine")  # faster(engine).
+    # parse_doc(doc_path_list, output_dir, backend="vlm-http-client", server_url="http://127.0.0.1:30000")  # faster(client).
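Editorial aside: a minimal sketch of driving the refactored demo with the renamed backends. `parse_doc` is the function shown in the diff above; the input path, output directory, and server URL are illustrative assumptions, not part of this commit.

```python
# Hypothetical usage sketch for the renamed backends; not part of this commit.
from pathlib import Path

from demo.demo import parse_doc  # assumes the repository root is importable

doc_paths = [Path("demo/pdfs/demo1.pdf")]  # illustrative input file
output_dir = "output"

# Local inference through the vllm engine (needs the vllm extension installed):
parse_doc(doc_paths, output_dir, backend="vlm-vllm-engine")

# Or talk to a running vllm server over HTTP, as the commented demo lines do:
parse_doc(doc_paths, output_dir, backend="vlm-http-client",
          server_url="http://127.0.0.1:30000")
```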

+ 4 - 8
docker/china/Dockerfile

@@ -1,12 +1,8 @@
-# Use DaoCloud mirrored sglang image for China region
-FROM docker.m.daocloud.io/lmsysorg/sglang:v0.4.10.post2-cu126
-# For blackwell GPU, use the following line instead:
-# FROM docker.m.daocloud.io/lmsysorg/sglang:v0.4.10.post2-cu128-b200
+# Use DaoCloud mirrored vllm image for China region
+FROM docker.m.daocloud.io/vllm/vllm-openai:v0.10.1.1
 
-# Use the official sglang image
-# FROM lmsysorg/sglang:v0.4.10.post2-cu126
-# For blackwell GPU, use the following line instead:
-# FROM lmsysorg/sglang:v0.4.10.post2-cu128-b200
+# Use the official vllm image
+# FROM vllm/vllm-openai:v0.10.1.1
 
 # Install libgl for opencv support & Noto fonts for Chinese characters
 RUN apt-get update && \

+ 16 - 22
docker/compose.yaml

@@ -1,21 +1,19 @@
 services:
-  mineru-sglang-server:
-    image: mineru-sglang:latest
-    container_name: mineru-sglang-server
+  mineru-vllm-server:
+    image: mineru-vllm:latest
+    container_name: mineru-vllm-server
     restart: always
-    profiles: ["sglang-server"]
+    profiles: ["vllm-server"]
     ports:
       - 30000:30000
     environment:
       MINERU_MODEL_SOURCE: local
-    entrypoint: mineru-sglang-server
+    entrypoint: mineru-vllm-server
     command:
       --host 0.0.0.0
       --port 30000
-      # --enable-torch-compile  # You can also enable torch.compile to accelerate inference speed by approximately 15%
-      # --dp-size 2  # If using multiple GPUs, increase throughput using sglang's multi-GPU parallel mode
-      # --tp-size 2  # If you have more than one GPU, you can expand available VRAM using tensor parallelism (TP) mode.
-      # --mem-fraction-static 0.5  # If running on a single GPU and encountering VRAM shortage, reduce the KV cache size by this parameter, if VRAM issues persist, try lowering it further to `0.4` or below.
+      # --data-parallel-size 2  # If using multiple GPUs, increase throughput using vllm's multi-GPU parallel mode
+      # --gpu-memory-utilization 0.5  # If running on a single GPU and encountering VRAM shortage, reduce the KV cache size by this parameter, if VRAM issues persist, try lowering it further to `0.4` or below.
     ulimits:
       memlock: -1
       stack: 67108864
@@ -31,7 +29,7 @@ services:
               capabilities: [gpu]
 
   mineru-api:
-    image: mineru-sglang:latest
+    image: mineru-vllm:latest
     container_name: mineru-api
     restart: always
     profiles: ["api"]
@@ -43,11 +41,9 @@ services:
     command:
       --host 0.0.0.0
       --port 8000
-      # parameters for sglang-engine
-      # --enable-torch-compile  # You can also enable torch.compile to accelerate inference speed by approximately 15%
-      # --dp-size 2  # If using multiple GPUs, increase throughput using sglang's multi-GPU parallel mode
-      # --tp-size 2  # If you have more than one GPU, you can expand available VRAM using tensor parallelism (TP) mode.
-      # --mem-fraction-static 0.5  # If running on a single GPU and encountering VRAM shortage, reduce the KV cache size by this parameter, if VRAM issues persist, try lowering it further to `0.4` or below.
+      # parameters for vllm-engine
+      # --data-parallel-size 2  # If using multiple GPUs, increase throughput using vllm's multi-GPU parallel mode
+      # --gpu-memory-utilization 0.5  # If running on a single GPU and encountering VRAM shortage, reduce the KV cache size by this parameter, if VRAM issues persist, try lowering it further to `0.4` or below.
     ulimits:
       memlock: -1
       stack: 67108864
@@ -61,7 +57,7 @@ services:
               capabilities: [ gpu ]
 
   mineru-gradio:
-    image: mineru-sglang:latest
+    image: mineru-vllm:latest
     container_name: mineru-gradio
     restart: always
     profiles: ["gradio"]
@@ -73,14 +69,12 @@ services:
     command:
       --server-name 0.0.0.0
       --server-port 7860
-      --enable-sglang-engine true  # Enable the sglang engine for Gradio
+      --enable-vllm-engine true  # Enable the vllm engine for Gradio
       # --enable-api false  # If you want to disable the API, set this to false
       # --max-convert-pages 20  # If you want to limit the number of pages for conversion, set this to a specific number
-      # parameters for sglang-engine
-      # --enable-torch-compile  # You can also enable torch.compile to accelerate inference speed by approximately 15%
-      # --dp-size 2  # If using multiple GPUs, increase throughput using sglang's multi-GPU parallel mode
-      # --tp-size 2  # If you have more than one GPU, you can expand available VRAM using tensor parallelism (TP) mode.
-      # --mem-fraction-static 0.5  # If running on a single GPU and encountering VRAM shortage, reduce the KV cache size by this parameter, if VRAM issues persist, try lowering it further to `0.4` or below.
+      # parameters for vllm-engine
+      # --data-parallel-size 2  # If using multiple GPUs, increase throughput using vllm's multi-GPU parallel mode
+      # --gpu-memory-utilization 0.5  # If running on a single GPU and encountering VRAM shortage, reduce the KV cache size by this parameter, if VRAM issues persist, try lowering it further to `0.4` or below.
     ulimits:
       memlock: -1
       stack: 67108864

+ 2 - 4
docker/global/Dockerfile

@@ -1,7 +1,5 @@
-# Use the official sglang image
-FROM lmsysorg/sglang:v0.4.10.post2-cu126
-# For blackwell GPU, use the following line instead:
-# FROM lmsysorg/sglang:v0.4.10.post2-cu128-b200
+# Use the official vllm image
+FROM vllm/vllm-openai:v0.10.1.1
 
 # Install libgl for opencv support & Noto fonts for Chinese characters
 RUN apt-get update && \

+ 0 - 12
docs/en/faq/index.md

@@ -15,18 +15,6 @@ For unresolved problems, join our [Discord](https://discord.gg/Tdedn9GTXq) or [W
     Reference: [#388](https://github.com/opendatalab/MinerU/issues/388)
 
 
-??? question "Error when installing MinerU on CentOS 7 or Ubuntu 18: `ERROR: Failed building wheel for simsimd`"
-
-    The new version of albumentations (1.4.21) introduces a dependency on simsimd. Since the pre-built package of simsimd for Linux requires a glibc version greater than or equal to 2.28, this causes installation issues on some Linux distributions released before 2019. You can resolve this issue by using the following command:
-    ```
-    conda create -n mineru python=3.11 -y
-    conda activate mineru
-    pip install -U "mineru[pipeline_old_linux]"
-    ```
-    
-    Reference: [#1004](https://github.com/opendatalab/MinerU/issues/1004)
-
-
 ??? question "Missing text information in parsing results when installing and using on Linux systems."
 
     MinerU uses `pypdfium2` instead of `pymupdf` as the PDF page rendering engine in versions >=2.0 to resolve AGPLv3 license issues. On some Linux distributions, due to missing CJK fonts, some text may be lost during the process of rendering PDFs to images.

+ 13 - 16
docs/en/quick_start/docker_deployment.md

@@ -6,25 +6,22 @@ MinerU provides a convenient Docker deployment method, which helps quickly set u
 
 ```bash
 wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/docker/global/Dockerfile
-docker build -t mineru-sglang:latest -f Dockerfile .
+docker build -t mineru-vllm:latest -f Dockerfile .
 ```
 
 > [!TIP]
-> The [Dockerfile](https://github.com/opendatalab/MinerU/blob/master/docker/global/Dockerfile) uses `lmsysorg/sglang:v0.4.10.post2-cu126` as the base image by default, supporting Turing/Ampere/Ada Lovelace/Hopper platforms.
-> If you are using the newer `Blackwell` platform, please modify the base image to `lmsysorg/sglang:v0.4.10.post2-cu128-b200` before executing the build operation.
+> The [Dockerfile](https://github.com/opendatalab/MinerU/blob/master/docker/global/Dockerfile) uses `vllm/vllm-openai:v0.10.1.1` as the base image by default, supporting Turing/Ampere/Ada Lovelace/Hopper/Blackwell platforms.
 
 ## Docker Description
 
-MinerU's Docker uses `lmsysorg/sglang` as the base image, so it includes the `sglang` inference acceleration framework and necessary dependencies by default. Therefore, on compatible devices, you can directly use `sglang` to accelerate VLM model inference.
+MinerU's Docker uses `vllm/vllm-openai` as the base image, so it includes the `vllm` inference acceleration framework and necessary dependencies by default. Therefore, on compatible devices, you can directly use `vllm` to accelerate VLM model inference.
 
 > [!NOTE]
-> Requirements for using `sglang` to accelerate VLM model inference:
+> Requirements for using `vllm` to accelerate VLM model inference:
 > 
 > - Device must have Turing architecture or later graphics cards with 8GB+ available VRAM.
-> - The host machine's graphics driver should support CUDA 12.6 or higher; `Blackwell` platform should support CUDA 12.8 or higher. You can check the driver version using the `nvidia-smi` command.
+> - The host machine's graphics driver should support CUDA 12.8 or higher; You can check the driver version using the `nvidia-smi` command.
 > - Docker container must have access to the host machine's graphics devices.
->
-> If your device doesn't meet the above requirements, you can still use other features of MinerU, but cannot use `sglang` to accelerate VLM model inference, meaning you cannot use the `vlm-sglang-engine` backend or start the `vlm-sglang-server` service.
 
 ## Start Docker Container
 
@@ -33,12 +30,12 @@ docker run --gpus all \
   --shm-size 32g \
   -p 30000:30000 -p 7860:7860 -p 8000:8000 \
   --ipc=host \
-  -it mineru-sglang:latest \
+  -it mineru-vllm:latest \
   /bin/bash
 ```
 
 After executing this command, you will enter the Docker container's interactive terminal with some ports mapped for potential services. You can directly run MinerU-related commands within the container to use MinerU's features.
-You can also directly start MinerU services by replacing `/bin/bash` with service startup commands. For detailed instructions, please refer to the [Start the service via command](https://opendatalab.github.io/MinerU/usage/quick_usage/#advanced-usage-via-api-webui-sglang-clientserver).
+You can also directly start MinerU services by replacing `/bin/bash` with service startup commands. For detailed instructions, please refer to the [Start the service via command](https://opendatalab.github.io/MinerU/usage/quick_usage/#advanced-usage-via-api-webui-http-clientserver).
 
 ## Start Services Directly with Docker Compose
 
@@ -53,19 +50,19 @@ wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/docker/compose.yaml
 >
 >- The `compose.yaml` file contains configurations for multiple services of MinerU, you can choose to start specific services as needed.
 >- Different services might have additional parameter configurations, which you can view and edit in the `compose.yaml` file.
->- Due to the pre-allocation of GPU memory by the `sglang` inference acceleration framework, you may not be able to run multiple `sglang` services simultaneously on the same machine. Therefore, ensure that other services that might use GPU memory have been stopped before starting the `vlm-sglang-server` service or using the `vlm-sglang-engine` backend.
+>- Due to the pre-allocation of GPU memory by the `vllm` inference acceleration framework, you may not be able to run multiple `vllm` services simultaneously on the same machine. Therefore, ensure that other services that might use GPU memory have been stopped before starting the `vlm-vllm-server` service or using the `vlm-vllm-engine` backend.
 
 ---
 
-### Start sglang-server service
-connect to `sglang-server` via `vlm-sglang-client` backend
+### Start vllm-server service
+connect to `vllm-server` via `vlm-http-client` backend
   ```bash
-  docker compose -f compose.yaml --profile sglang-server up -d
+  docker compose -f compose.yaml --profile vllm-server up -d
   ```
   >[!TIP]
-  >In another terminal, connect to sglang server via sglang client (only requires CPU and network, no sglang environment needed)
+  >In another terminal, connect to vllm server via http client (only requires CPU and network, no vllm environment needed)
   > ```bash
-  > mineru -p <input_path> -o <output_path> -b vlm-sglang-client -u http://<server_ip>:30000
+  > mineru -p <input_path> -o <output_path> -b vlm-http-client -u http://<server_ip>:30000
   > ```
 
 ---

+ 7 - 15
docs/en/quick_start/extension_modules.md

@@ -4,34 +4,26 @@ MinerU supports installing extension modules on demand based on different needs
 ## Common Scenarios
 
 ### Core Functionality Installation
-The `core` module is the core dependency of MinerU, containing all functional modules except `sglang`. Installing this module ensures the basic functionality of MinerU works properly.
+The `core` module is the core dependency of MinerU, containing all functional modules except `vllm`. Installing this module ensures the basic functionality of MinerU works properly.
 ```bash
 uv pip install mineru[core]
 ```
 
 ---
 
-### Using `sglang` to Accelerate VLM Model Inference
-The `sglang` module provides acceleration support for VLM model inference, suitable for graphics cards with Turing architecture and later (8GB+ VRAM). Installing this module can significantly improve model inference speed.
-In the configuration, `all` includes both `core` and `sglang` modules, so `mineru[all]` and `mineru[core,sglang]` are equivalent.
+### Using `vllm` to Accelerate VLM Model Inference
+The `vllm` module provides acceleration support for VLM model inference, suitable for graphics cards with Turing architecture and later (8GB+ VRAM). Installing this module can significantly improve model inference speed.
+In the configuration, `all` includes both `core` and `vllm` modules, so `mineru[all]` and `mineru[core,vllm]` are equivalent.
 ```bash
 uv pip install mineru[all]
 ```
 > [!TIP]
-> If exceptions occur during installation of the complete package including sglang, please refer to the [sglang official documentation](https://docs.sglang.ai/start/install.html) to try to resolve the issue, or directly use the [Docker](./docker_deployment.md) deployment method.
+> If exceptions occur during installation of the complete package including vllm, please refer to the [vllm official documentation](https://docs.vllm.ai/en/latest/getting_started/installation/index.html) to try to resolve the issue, or directly use the [Docker](./docker_deployment.md) deployment method.
 
 ---
 
-### Installing Lightweight Client to Connect to sglang-server
-If you need to install a lightweight client on edge devices to connect to `sglang-server`, you can install the basic mineru package, which is very lightweight and suitable for devices with only CPU and network connectivity.
+### Installing Lightweight Client to Connect to vllm-server
+If you need to install a lightweight client on edge devices to connect to `vllm-server`, you can install the basic mineru package, which is very lightweight and suitable for devices with only CPU and network connectivity.
 ```bash
 uv pip install mineru
 ```
-
----
-
-### Using Pipeline Backend on Outdated Linux Systems
-If your system is too outdated to meet the dependency requirements of `mineru[core]`, this option can minimally meet MinerU's runtime requirements, suitable for old systems that cannot be upgraded and only need to use the pipeline backend.
-```bash
-uv pip install mineru[pipeline_old_linux]
-```

+ 3 - 3
docs/en/quick_start/index.md

@@ -31,7 +31,7 @@ A WebUI developed based on Gradio, with a simple interface and only core parsing
         <td>Parsing Backend</td>
         <td>pipeline</td>
         <td>vlm-transformers</td>
-        <td>vlm-sglang</td>
+        <td>vlm-vllm</td>
     </tr>
     <tr>
         <td>Operating System</td>
@@ -80,8 +80,8 @@ uv pip install -e .[core]
 ```
 
 > [!TIP]
-> `mineru[core]` includes all core features except `sglang` acceleration, compatible with Windows / Linux / macOS systems, suitable for most users.
-> If you need to use `sglang` acceleration for VLM model inference or install a lightweight client on edge devices, please refer to the documentation [Extension Modules Installation Guide](./extension_modules.md).
+> `mineru[core]` includes all core features except `vllm` acceleration, compatible with Windows / Linux / macOS systems, suitable for most users.
+> If you need to use `vllm` acceleration for VLM model inference or install a lightweight client on edge devices, please refer to the documentation [Extension Modules Installation Guide](./extension_modules.md).
 
 ---
  

+ 269 - 64
docs/en/reference/output_files.md

@@ -51,14 +51,16 @@ The following sections provide detailed descriptions of each file's purpose and
 
 ## Structured Data Files
 
-### Model Inference Results (model.json)
+> [!IMPORTANT]
+> The VLM backend output has significant changes in version 2.5 and is not backward-compatible with the pipeline backend. If you plan to build secondary development on structured outputs, please read this document carefully.
 
-> [!NOTE]
-> Only applicable to pipeline backend
+### Pipeline Backend Output Results
+
+#### Model Inference Results (model.json)
 
 **File naming format**: `{original_filename}_model.json`
 
-#### Data Structure Definition
+##### Data Structure Definition
 
 ```python
 from pydantic import BaseModel, Field
@@ -103,7 +105,7 @@ class PageInferenceResults(BaseModel):
 inference_result: list[PageInferenceResults] = []
 ```
 
-#### Coordinate System Description
+##### Coordinate System Description
 
 `poly` coordinate format: `[x0, y0, x1, y1, x2, y2, x3, y3]`
 
@@ -112,7 +114,7 @@ inference_result: list[PageInferenceResults] = []
 
 ![poly coordinate diagram](../images/poly.png)
 
-#### Sample Data
+##### Sample Data
 
 ```json
 [
@@ -165,52 +167,11 @@ inference_result: list[PageInferenceResults] = []
 ]
 ```
 
-### VLM Output Results (model_output.txt)
-
-> [!NOTE]
-> Only applicable to VLM backend
-
-**File naming format**: `{original_filename}_model_output.txt`
-
-#### File Format Description
-
-- Uses `----` to separate output results for each page
-- Each page contains multiple text blocks starting with `<|box_start|>` and ending with `<|md_end|>`
-
-#### Field Meanings
-
-| Tag | Format | Description |
-|-----|--------|-------------|
-| Bounding box | `<\|box_start\|>x0 y0 x1 y1<\|box_end\|>` | Quadrilateral coordinates (top-left, bottom-right points), coordinate values after scaling page to 1000×1000 |
-| Type tag | `<\|ref_start\|>type<\|ref_end\|>` | Content block type identifier |
-| Content | `<\|md_start\|>markdown content<\|md_end\|>` | Markdown content of the block |
-
-#### Supported Content Types
-
-```json
-{
-    "text": "Text",
-    "title": "Title", 
-    "image": "Image",
-    "image_caption": "Image caption",
-    "image_footnote": "Image footnote",
-    "table": "Table",
-    "table_caption": "Table caption", 
-    "table_footnote": "Table footnote",
-    "equation": "Interline formula"
-}
-```
-
-#### Special Tags
-
-- `<|txt_contd|>`: Appears at the end of text, indicating that this text block can be connected with subsequent text blocks
-- Table content uses `otsl` format and needs to be converted to HTML for rendering in Markdown
-
-### Intermediate Processing Results (middle.json)
+#### Intermediate Processing Results (middle.json)
 
 **File naming format**: `{original_filename}_middle.json`
 
-#### Top-level Structure
+##### Top-level Structure
 
 | Field Name | Type | Description |
 |------------|------|-------------|
@@ -218,22 +179,20 @@ inference_result: list[PageInferenceResults] = []
 | `_backend` | `string` | Parsing mode: `pipeline` or `vlm` |
 | `_version_name` | `string` | MinerU version number |
 
-#### Page Information Structure (pdf_info)
+##### Page Information Structure (pdf_info)
 
 | Field Name | Description |
 |------------|-------------|
 | `preproc_blocks` | Unsegmented intermediate results after PDF preprocessing |
-| `layout_bboxes` | Layout segmentation results, including layout direction and bounding boxes, sorted by reading order |
 | `page_idx` | Page number, starting from 0 |
 | `page_size` | Page width and height `[width, height]` |
-| `_layout_tree` | Layout tree structure |
 | `images` | Image block information list |
 | `tables` | Table block information list |
 | `interline_equations` | Interline formula block information list |
 | `discarded_blocks` | Block information to be discarded |
 | `para_blocks` | Content block results after segmentation |
 
-#### Block Structure Hierarchy
+##### Block Structure Hierarchy
 
 ```
 Level 1 blocks (table | image)
@@ -242,7 +201,7 @@ Level 1 blocks (table | image)
         └── Spans
 ```
 
-#### Level 1 Block Fields
+##### Level 1 Block Fields
 
 | Field Name | Description |
 |------------|-------------|
@@ -250,7 +209,7 @@ Level 1 blocks (table | image)
 | `bbox` | Rectangular box coordinates of the block `[x0, y0, x1, y1]` |
 | `blocks` | List of contained level 2 blocks |
 
-#### Level 2 Block Fields
+##### Level 2 Block Fields
 
 | Field Name | Description |
 |------------|-------------|
@@ -258,7 +217,7 @@ Level 1 blocks (table | image)
 | `bbox` | Rectangular box coordinates of the block |
 | `lines` | List of contained line information |
 
-#### Level 2 Block Types
+##### Level 2 Block Types
 
 | Type | Description |
 |------|-------------|
@@ -274,7 +233,7 @@ Level 1 blocks (table | image)
 | `list` | List block |
 | `interline_equation` | Interline formula block |
 
-#### Line and Span Structure
+##### Line and Span Structure
 
 **Line fields**:
 - `bbox`: Rectangular box coordinates of the line
@@ -285,7 +244,7 @@ Level 1 blocks (table | image)
 - `type`: Span type (`image`, `table`, `text`, `inline_equation`, `interline_equation`)
 - `content` | `img_path`: Text content or image path
 
-#### Sample Data
+##### Sample Data
 
 ```json
 {
@@ -388,15 +347,15 @@ Level 1 blocks (table | image)
 }
 ```
 
-### Content List (content_list.json)
+#### Content List (content_list.json)
 
 **File naming format**: `{original_filename}_content_list.json`
 
-#### Functionality
+##### Functionality
 
 This is a simplified version of `middle.json` that stores all readable content blocks in reading order as a flat structure, removing complex layout information for easier subsequent processing.
 
-#### Content Types
+##### Content Types
 
 | Type | Description |
 |------|-------------|
@@ -405,7 +364,7 @@ This is a simplified version of `middle.json` that stores all readable content b
 | `text` | Text/Title |
 | `equation` | Interline formula |
 
-#### Text Level Identification
+##### Text Level Identification
 
 Text levels are distinguished through the `text_level` field:
 
@@ -414,12 +373,12 @@ Text levels are distinguished through the `text_level` field:
 - `text_level: 2`: Level 2 heading
 - And so on...
 
-#### Common Fields
+##### Common Fields
 
 - All content blocks include a `page_idx` field indicating the page number (starting from 0).
 - All content blocks include a `bbox` field representing the bounding box coordinates of the content block `[x0, y0, x1, y1]`, mapped to a range of 0-1000.
 
-#### Sample Data
+##### Sample Data
 
 ```json
 [
@@ -484,6 +443,252 @@ Text levels are distinguished through the `text_level` field:
 ]
 ```
 
+### VLM Backend Output Results
+
+#### Model Inference Results (model.json)
+
+**File naming format**: `{original_filename}_model.json`
+
+##### File format description
+- Two-level nested list: outer list = pages; inner list = content blocks of that page
+- Each block is a dict with at least: `type`, `bbox`, `angle`, `content` (some types add extra fields like `score`, `block_tags`, `content_tags`, `format`)
+- Designed for direct, raw model inspection
+
+##### Supported content types (type field values)
+```json
+{
+  "text": "Plain text",
+  "title": "Title",
+  "equation": "Display (interline) formula",
+  "image": "Image",
+  "image_caption": "Image caption",
+  "image_footnote": "Image footnote",
+  "table": "Table",
+  "table_caption": "Table caption",
+  "table_footnote": "Table footnote",
+  "phonetic": "Phonetic annotation",
+  "code": "Code block",
+  "code_caption": "Code caption",
+  "ref_text": "Reference / citation entry",
+  "algorithm": "Algorithm block (treated as code subtype)",
+  "list": "List container",
+  "header": "Page header",
+  "footer": "Page footer",
+  "page_number": "Page number",
+  "aside_text": "Side / margin note",
+  "page_footnote": "Page footnote"
+}
+```
+
+##### Coordinate system
+- `bbox` = `[x0, y0, x1, y1]` (top-left, bottom-right)
+- Origin at top-left of the page
+- All coordinates are normalized percentages in `[0,1]`
+
+##### Sample data
+```json
+[
+  [
+    {
+      "type": "header",
+      "bbox": [0.077, 0.095, 0.18, 0.181],
+      "angle": 0,
+      "score": null,
+      "block_tags": null,
+      "content": "ELSEVIER",
+      "format": null,
+      "content_tags": null
+    },
+    {
+      "type": "title",
+      "bbox": [0.157, 0.228, 0.833, 0.253],
+      "angle": 0,
+      "score": null,
+      "block_tags": null,
+      "content": "The response of flow duration curves to afforestation",
+      "format": null,
+      "content_tags": null
+    }
+  ]
+]
+```
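Editorial aside: a minimal sketch (not part of this commit) of consuming the VLM `model.json` layout documented above. The input path and the page pixel size are assumptions, used only to illustrate de-normalizing the `[0,1]` bboxes.

```python
# Sketch: walk the two-level model.json structure and de-normalize bboxes.
# The file path and page size are assumptions; take the real page size from
# the rendered page (e.g. page_size in middle.json).
import json

with open("example_model.json", encoding="utf-8") as f:
    pages = json.load(f)  # outer list = pages, inner list = blocks per page

page_width, page_height = 1190, 1684  # assumed pixel size of one rendered page

for page_idx, blocks in enumerate(pages):
    for block in blocks:
        x0, y0, x1, y1 = block["bbox"]  # normalized to [0, 1], top-left origin
        abs_bbox = (round(x0 * page_width), round(y0 * page_height),
                    round(x1 * page_width), round(y1 * page_height))
        preview = (block.get("content") or "")[:40]
        print(page_idx, block["type"], abs_bbox, preview)
```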
+
+#### Intermediate Processing Results (middle.json)
+
+**File naming format**: `{original_filename}_middle.json`
+
+Structure is broadly similar to the pipeline backend, but with these differences:
+
+1. `list` becomes a second‑level block; a new field `sub_type` distinguishes list categories:
+   - `text`: ordinary list
+   - `ref_text`: reference / bibliography style list
+2. New `code` block type with `sub_type`:
+   - `code`
+   - `algorithm`
+   A code block always has at least a `code_body`; it may optionally have a `code_caption`.
+3. `discarded_blocks` may contain additional types: `header`, `footer`, `page_number`, `aside_text`, `page_footnote`.
+4. All blocks include an `angle` field indicating rotation (one of `0, 90, 180, 270`).
+
+##### Examples
+- Example: list block
+    ```json
+    {
+      "bbox": [174,155,818,333],
+      "type": "list",
+      "angle": 0,
+      "index": 11,
+      "blocks": [
+        {
+          "bbox": [174,157,311,175],
+          "type": "text",
+          "angle": 0,
+          "lines": [
+            {
+              "bbox": [174,157,311,175],
+                "spans": [
+                  {
+                    "bbox": [174,157,311,175],
+                    "type": "text",
+                    "content": "H.1 Introduction"
+                  }
+                ]
+            }
+          ],
+          "index": 3
+        },
+        {
+          "bbox": [175,182,464,229],
+          "type": "text",
+          "angle": 0,
+          "lines": [
+            {
+              "bbox": [175,182,464,229],
+              "spans": [
+                {
+                  "bbox": [175,182,464,229],
+                  "type": "text",
+                  "content": "H.2 Example: Divide by Zero without Exception Handling"
+                }
+              ]
+            }
+          ],
+          "index": 4
+        }
+      ],
+      "sub_type": "text"
+    }
+    ```
+
+- Example: code block with optional caption:
+    ```json
+    {
+      "type": "code",
+      "bbox": [114,780,885,1231],
+      "blocks": [
+        {
+          "bbox": [114,780,885,1231],
+          "lines": [
+            {
+              "bbox": [114,780,885,1231],
+              "spans": [
+                {
+                  "bbox": [114,780,885,1231],
+                  "type": "text",
+                  "content": "1 // Fig. H.1: DivideByZeroNoExceptionHandling.java  \n2 // Integer division without exception handling.  \n3 import java.util.Scanner;  \n4  \n5 public class DivideByZeroNoExceptionHandling  \n6 {  \n7 // demonstrates throwing an exception when a divide-by-zero occurs  \n8 public static int quotient( int numerator, int denominator )  \n9 {  \n10 return numerator / denominator; // possible division by zero  \n11 } // end method quotient  \n12  \n13 public static void main(String[] args)  \n14 {  \n15 Scanner scanner = new Scanner(System.in); // scanner for input  \n16  \n17 System.out.print(\"Please enter an integer numerator: \");  \n18 int numerator = scanner.nextInt();  \n19 System.out.print(\"Please enter an integer denominator: \");  \n20 int denominator = scanner.nextInt();  \n21"
+                }
+              ]
+            }
+          ],
+          "index": 17,
+          "angle": 0,
+          "type": "code_body"
+        },
+        {
+          "bbox": [867,160,1280,189],
+          "lines": [
+            {
+              "bbox": [867,160,1280,189],
+              "spans": [
+                {
+                  "bbox": [867,160,1280,189],
+                  "type": "text",
+                  "content": "Algorithm 1 Modules for MCTSteg"
+                }
+              ]
+            }
+          ],
+          "index": 19,
+          "angle": 0,
+          "type": "code_caption"
+        }
+      ],
+      "index": 17,
+      "sub_type": "code"
+    }
+    ```
+
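Editorial aside: a small sketch (not part of this commit) of collecting code blocks and their optional captions from a VLM `middle.json`. It assumes the segmented blocks sit under each page's `para_blocks`, as in the pipeline layout described earlier; the file path is illustrative.

```python
# Sketch: extract code blocks (code_body plus optional code_caption) from a
# VLM middle.json. The path is illustrative; field names follow the structure
# documented above, and para_blocks is assumed to hold the segmented blocks.
import json

def span_text(sub_block):
    """Join the text spans of one sub-block in reading order."""
    return "\n".join(
        span.get("content", "")
        for line in sub_block.get("lines", [])
        for span in line.get("spans", [])
    )

with open("example_middle.json", encoding="utf-8") as f:
    middle = json.load(f)

for page in middle["pdf_info"]:
    for blk in page.get("para_blocks", []):
        if blk.get("type") != "code":
            continue
        bodies = [span_text(b) for b in blk["blocks"] if b.get("type") == "code_body"]
        captions = [span_text(b) for b in blk["blocks"] if b.get("type") == "code_caption"]
        print(blk.get("sub_type"), captions, bodies[0][:60] if bodies else "")
```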
+#### Content List (content_list.json)
+
+**File naming format**: `{original_filename}_content_list.json`
+
+Based on the pipeline format, with these VLM-specific extensions:
+
+1. New `code` type with `sub_type` (`code` | `algorithm`):
+   - Fields: `code_body` (string), optional `code_caption` (list of strings)
+2. New `list` type with `sub_type` (`text` | `ref_text`):
+   - Field: `list_items` (array of strings)
+3. All `discarded_blocks` entries are also output (e.g., headers, footers, page numbers, margin notes, page footnotes).
+4. Existing types (`image`, `table`, `text`, `equation`) remain unchanged.
+5. `bbox` still uses the 0–1000 normalized coordinate mapping.
+
+
+##### Examples
+Example: code (algorithm) entry
+```json
+{
+  "type": "code",
+  "sub_type": "algorithm",
+  "code_caption": ["Algorithm 1 Modules for MCTSteg"],
+  "code_body": "1: function GETCOORDINATE(d)  \n2:  $x \\gets d / l$ ,  $y \\gets d$  mod  $l$   \n3: return  $(x, y)$   \n4: end function  \n5: function BESTCHILD(v)  \n6:  $C \\gets$  child set of  $v$   \n7:  $v' \\gets \\arg \\max_{c \\in C} \\mathrm{UCTScore}(c)$   \n8:  $v'.n \\gets v'.n + 1$   \n9: return  $v'$   \n10: end function  \n11: function BACK PROPAGATE(v)  \n12: Calculate  $R$  using Equation 11  \n13: while  $v$  is not a root node do  \n14:  $v.r \\gets v.r + R$ ,  $v \\gets v.p$   \n15: end while  \n16: end function  \n17: function RANDOMSEARCH(v)  \n18: while  $v$  is not a leaf node do  \n19: Randomly select an untried action  $a \\in A(v)$   \n20: Create a new node  $v'$   \n21:  $(x, y) \\gets \\mathrm{GETCOORDINATE}(v'.d)$   \n22:  $v'.p \\gets v$ ,  $v'.d \\gets v.d + 1$ ,  $v'.\\Gamma \\gets v.\\Gamma$   \n23:  $v'.\\gamma_{x,y} \\gets a$   \n24: if  $a = -1$  then  \n25:  $v.lc \\gets v'$   \n26: else if  $a = 0$  then  \n27:  $v.mc \\gets v'$   \n28: else  \n29:  $v.rc \\gets v'$   \n30: end if  \n31:  $v \\gets v'$   \n32: end while  \n33: return  $v$   \n34: end function  \n35: function SEARCH(v)  \n36: while  $v$  is fully expanded do  \n37:  $v \\gets$  BESTCHILD(v)  \n38: end while  \n39: if  $v$  is not a leaf node then  \n40:  $v \\gets$  RANDOMSEARCH(v)  \n41: end if  \n42: return  $v$   \n43: end function",
+  "bbox": [510,87,881,740],
+  "page_idx": 0
+}
+```
+
+Example: list (text) entry
+```json
+{
+  "type": "list",
+  "sub_type": "text",
+  "list_items": [
+    "H.1 Introduction",
+    "H.2 Example: Divide by Zero without Exception Handling",
+    "H.3 Example: Divide by Zero with Exception Handling",
+    "H.4 Summary"
+  ],
+  "bbox": [174,155,818,333],
+  "page_idx": 0
+}
+```
+
+Example: discarded blocks output
+```json
+[
+  {
+    "type": "header",
+    "text": "Journal of Hydrology 310 (2005) 253-265",
+    "bbox": [363,164,623,177],
+    "page_idx": 0
+  },
+  {
+    "type": "page_footnote",
+    "text": "* Corresponding author. Address: Forest Science Centre, Department of Sustainability and Environment, P.O. Box 137, Heidelberg, Vic. 3084, Australia. Tel.: +61 3 9450 8719; fax: +61 3 9450 8644.",
+    "bbox": [71,815,915,841],
+    "page_idx": 0
+  }
+]
+```
+
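Editorial aside: a small sketch (not part of this commit) of grouping VLM `content_list.json` entries by type and reading the new `code` and `list` fields described above; the file path is illustrative.

```python
# Sketch: group VLM content_list.json entries by type; the path is illustrative.
import json
from collections import defaultdict

with open("example_content_list.json", encoding="utf-8") as f:
    content_list = json.load(f)

by_type = defaultdict(list)
for item in content_list:
    by_type[item["type"]].append(item)

# New in 2.5: code entries carry code_body plus an optional code_caption list.
for code in by_type.get("code", []):
    caption = "; ".join(code.get("code_caption") or [])
    print(f"[code/{code.get('sub_type')}] p{code['page_idx']}: {caption or '(no caption)'}")

# New in 2.5: list entries expose their items directly via list_items.
for lst in by_type.get("list", []):
    print(f"[list/{lst.get('sub_type')}] {len(lst.get('list_items', []))} items")
```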
 ## Summary
 
 The above files constitute MinerU's complete output results. Users can choose appropriate files for subsequent processing based on their needs:

+ 8 - 21
docs/en/usage/advanced_cli_parameters.md

@@ -1,25 +1,17 @@
 # Advanced Command Line Parameters
 
-## SGLang Acceleration Parameter Optimization
-
-### Memory Optimization Parameters
-> [!TIP]
-> SGLang acceleration mode currently supports running on Turing architecture graphics cards with a minimum of 8GB VRAM, but graphics cards with <24GB VRAM may encounter insufficient memory issues. You can optimize memory usage with the following parameters:
-> 
-> - If you encounter insufficient VRAM when using a single graphics card, you may need to reduce the KV cache size with `--mem-fraction-static 0.5`. If VRAM issues persist, try reducing it further to `0.4` or lower.
-> - If you have two or more graphics cards, you can try using tensor parallelism (TP) mode to simply expand available VRAM: `--tp-size 2`
+## vllm Acceleration Parameter Optimization
 
 ### Performance Optimization Parameters
 > [!TIP]
-> If you can already use SGLang normally for accelerated VLM model inference but still want to further improve inference speed, you can try the following parameters:
+> If you can already use vllm normally for accelerated VLM model inference but still want to further improve inference speed, you can try the following parameters:
 > 
-> - If you have multiple graphics cards, you can use SGLang's multi-card parallel mode to increase throughput: `--dp-size 2`
-> - You can also enable `torch.compile` to accelerate inference speed by approximately 15%: `--enable-torch-compile`
+> - If you have multiple graphics cards, you can use vllm's multi-card parallel mode to increase throughput: `--data-parallel-size 2`
 
 ### Parameter Passing Instructions
 > [!TIP]
-> - All officially supported SGLang parameters can be passed to MinerU through command line arguments, including the following commands: `mineru`, `mineru-sglang-server`, `mineru-gradio`, `mineru-api`
-> - If you want to learn more about `sglang` parameter usage, please refer to the [SGLang official documentation](https://docs.sglang.ai/backend/server_arguments.html#common-launch-commands)
+> - All officially supported vllm parameters can be passed to MinerU through command line arguments, including the following commands: `mineru`, `mineru-vllm-server`, `mineru-gradio`, `mineru-api`
+> - If you want to learn more about `vllm` parameter usage, please refer to the [vllm official documentation](https://docs.vllm.ai/en/latest/cli/serve.html)
 
 ## GPU Device Selection and Configuration
 
@@ -29,7 +21,7 @@
 >   ```bash
 >   CUDA_VISIBLE_DEVICES=1 mineru -p <input_path> -o <output_path>
 >   ```
-> - This specification method is effective for all command line calls, including `mineru`, `mineru-sglang-server`, `mineru-gradio`, and `mineru-api`, and applies to both `pipeline` and `vlm` backends.
+> - This specification method is effective for all command line calls, including `mineru`, `mineru-vllm-server`, `mineru-gradio`, and `mineru-api`, and applies to both `pipeline` and `vlm` backends.
 
 ### Common Device Configuration Examples
 > [!TIP]
@@ -46,14 +38,9 @@
 > [!TIP]
 > Here are some possible usage scenarios:
 > 
-> - If you have multiple graphics cards and need to specify cards 0 and 1, using multi-card parallelism to start `sglang-server`, you can use the following command:
->   ```bash
->   CUDA_VISIBLE_DEVICES=0,1 mineru-sglang-server --port 30000 --dp-size 2
->   ```
-> 
-> - If you have multiple GPUs and need to specify GPU 0–3, and start the `sglang-server` using multi-GPU data parallelism and tensor parallelism, you can use the following command:
+> - If you have multiple graphics cards and need to specify cards 0 and 1, using multi-card parallelism to start `vllm-server`, you can use the following command:
 >   ```bash
->   CUDA_VISIBLE_DEVICES=0,1,2,3 mineru-sglang-server --port 30000 --dp-size 2 --tp-size 2
+>   CUDA_VISIBLE_DEVICES=0,1 mineru-vllm-server --port 30000 --data-parallel-size 2
 >   ```
 >       
 > - If you have multiple graphics cards and need to start two `fastapi` services on cards 0 and 1, listening on different ports respectively, you can use the following commands:

+ 3 - 3
docs/en/usage/cli_tools.md

@@ -11,11 +11,11 @@ Options:
   -p, --path PATH                 Input file path or directory (required)
   -o, --output PATH               Output directory (required)
   -m, --method [auto|txt|ocr]     Parsing method: auto (default), txt, ocr (pipeline backend only)
-  -b, --backend [pipeline|vlm-transformers|vlm-sglang-engine|vlm-sglang-client]
+  -b, --backend [pipeline|vlm-transformers|vlm-vllm-engine|vlm-http-client]
                                   Parsing backend (default: pipeline)
   -l, --lang [ch|ch_server|ch_lite|en|korean|japan|chinese_cht|ta|te|ka|th|el|latin|arabic|east_slavic|cyrillic|devanagari]
                                   Specify document language (improves OCR accuracy, pipeline backend only)
-  -u, --url TEXT                  Service address when using sglang-client
+  -u, --url TEXT                  Service address when using http-client
   -s, --start INTEGER             Starting page number for parsing (0-based)
   -e, --end INTEGER               Ending page number for parsing (0-based)
   -f, --formula BOOLEAN           Enable formula parsing (default: enabled)
@@ -45,7 +45,7 @@ Options:
                                   files to be input need to be placed in the
                                   `example` folder within the directory where
                                   the command is currently executed.
-  --enable-sglang-engine BOOLEAN  Enable SgLang engine backend for faster
+  --enable-vllm-engine BOOLEAN    Enable vllm engine backend for faster
                                   processing.
   --enable-api BOOLEAN            Enable gradio API for serving the
                                   application.

+ 12 - 12
docs/en/usage/quick_usage.md

@@ -29,11 +29,11 @@ mineru -p <input_path> -o <output_path>
 mineru -p <input_path> -o <output_path> -b vlm-transformers
 ```
 > [!TIP]
-> The vlm backend additionally supports `sglang` acceleration. Compared to the `transformers` backend, `sglang` can achieve 20-30x speedup. You can check the installation method for the complete package supporting `sglang` acceleration in the [Extension Modules Installation Guide](../quick_start/extension_modules.md).
+> The vlm backend additionally supports `vllm` acceleration. Compared to the `transformers` backend, `vllm` can achieve 20-30x speedup. You can check the installation method for the complete package supporting `vllm` acceleration in the [Extension Modules Installation Guide](../quick_start/extension_modules.md).
 
 If you need to adjust parsing options through custom parameters, you can also check the more detailed [Command Line Tools Usage Instructions](./cli_tools.md) in the documentation.
 
-## Advanced Usage via API, WebUI, sglang-client/server
+## Advanced Usage via API, WebUI, http-client/server
 
 - Direct Python API calls: [Python Usage Example](https://github.com/opendatalab/MinerU/blob/master/demo/demo.py)
 - FastAPI calls:
@@ -44,29 +44,29 @@ If you need to adjust parsing options through custom parameters, you can also ch
   >Access `http://127.0.0.1:8000/docs` in your browser to view the API documentation.
 - Start Gradio WebUI visual frontend:
   ```bash
-  # Using pipeline/vlm-transformers/vlm-sglang-client backends
+  # Using pipeline/vlm-transformers/vlm-http-client backends
   mineru-gradio --server-name 0.0.0.0 --server-port 7860
-  # Or using vlm-sglang-engine/pipeline backends (requires sglang environment)
-  mineru-gradio --server-name 0.0.0.0 --server-port 7860 --enable-sglang-engine true
+  # Or using vlm-vllm-engine/pipeline backends (requires vllm environment)
+  mineru-gradio --server-name 0.0.0.0 --server-port 7860 --enable-vllm-engine true
   ```
   >[!TIP]
   >
   >- Access `http://127.0.0.1:7860` in your browser to use the Gradio WebUI.
   >- Access `http://127.0.0.1:7860/?view=api` to use the Gradio API.
-- Using `sglang-client/server` method:
+- Using `http-client/server` method:
   ```bash
-  # Start sglang server (requires sglang environment)
-  mineru-sglang-server --port 30000
+  # Start vllm server (requires vllm environment)
+  mineru-vllm-server --port 30000
   ``` 
   >[!TIP]
-  >In another terminal, connect to sglang server via sglang client (only requires CPU and network, no sglang environment needed)
+  >In another terminal, connect to vllm server via http client (only requires CPU and network, no vllm environment needed)
   > ```bash
-  > mineru -p <input_path> -o <output_path> -b vlm-sglang-client -u http://127.0.0.1:30000
+  > mineru -p <input_path> -o <output_path> -b vlm-http-client -u http://127.0.0.1:30000
   > ```
 
 > [!NOTE]
-> All officially supported sglang parameters can be passed to MinerU through command line arguments, including the following commands: `mineru`, `mineru-sglang-server`, `mineru-gradio`, `mineru-api`.
-> We have compiled some commonly used parameters and usage methods for `sglang`, which can be found in the documentation [Advanced Command Line Parameters](./advanced_cli_parameters.md).
+> All officially supported vllm parameters can be passed to MinerU through command line arguments, including the following commands: `mineru`, `mineru-vllm-server`, `mineru-gradio`, `mineru-api`.
+> We have compiled some commonly used parameters and usage methods for `vllm`, which can be found in the documentation [Advanced Command Line Parameters](./advanced_cli_parameters.md).
 
 ## Extending MinerU Functionality with Configuration Files
 

+ 0 - 12
docs/zh/faq/index.md

@@ -14,18 +14,6 @@
     
     参考:[#388](https://github.com/opendatalab/MinerU/issues/388)
 
-
-??? question "在 CentOS 7 或 Ubuntu 18 系统安装MinerU时报错`ERROR: Failed building wheel for simsimd`"
-
-    新版本albumentations(1.4.21)引入了依赖simsimd,由于simsimd在linux的预编译包要求glibc的版本大于等于2.28,导致部分2019年之前发布的Linux发行版无法正常安装,可通过如下命令安装:
-    ```
-    conda create -n mineru python=3.11 -y
-    conda activate mineru
-    pip install -U "mineru[pipeline_old_linux]"
-    ```
-    
-    参考:[#1004](https://github.com/opendatalab/MinerU/issues/1004)
-
 ??? question "在 Linux 系统安装并使用时,解析结果缺失部份文字信息。"
 
     MinerU在>=2.0的版本中使用`pypdfium2`代替`pymupdf`作为PDF页面的渲染引擎,以解决AGPLv3的许可证问题,在某些Linux发行版,由于缺少CJK字体,可能会在将PDF渲染成图片的过程中丢失部份文字。

+ 14 - 16
docs/zh/quick_start/docker_deployment.md

@@ -6,24 +6,22 @@ MinerU提供了便捷的docker部署方式,这有助于快速搭建环境并
 
 ```bash
 wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/docker/china/Dockerfile
-docker build -t mineru-sglang:latest -f Dockerfile .
+docker build -t mineru-vllm:latest -f Dockerfile .
 ```
 
 > [!TIP]
-> [Dockerfile](https://github.com/opendatalab/MinerU/blob/master/docker/china/Dockerfile)默认使用`lmsysorg/sglang:v0.4.10.post2-cu126`作为基础镜像,支持Turing/Ampere/Ada Lovelace/Hopper平台,
-> 如您使用较新的`Blackwell`平台,请将基础镜像修改为`lmsysorg/sglang:v0.4.10.post2-cu128-b200` 再执行build操作。
+> [Dockerfile](https://github.com/opendatalab/MinerU/blob/master/docker/china/Dockerfile)默认使用`vllm/vllm-openai:v0.10.1.1`作为基础镜像,支持Turing/Ampere/Ada Lovelace/Hopper/Blackwell平台。
 
 ## Docker说明
 
-Mineru的docker使用了`lmsysorg/sglang`作为基础镜像,因此在docker中默认集成了`sglang`推理加速框架和必需的依赖环境。因此在满足条件的设备上,您可以直接使用`sglang`加速VLM模型推理。
+MinerU的docker使用了`vllm/vllm-openai`作为基础镜像,因此在docker中默认集成了`vllm`推理加速框架和必需的依赖环境,在满足条件的设备上,您可以直接使用`vllm`加速VLM模型推理。
 > [!NOTE]
-> 使用`sglang`加速VLM模型推理需要满足的条件是:
+> 使用`vllm`加速VLM模型推理需要满足的条件是:
 > 
 > - 设备包含Turing及以后架构的显卡,且可用显存大于等于8G。
-> - 物理机的显卡驱动应支持CUDA 12.6或更高版本,`Blackwell`平台应支持CUDA 12.8及更高版本,可通过`nvidia-smi`命令检查驱动版本。
+> - 物理机的显卡驱动应支持CUDA 12.8或更高版本,可通过`nvidia-smi`命令检查驱动版本。
 > - docker中能够访问物理机的显卡设备。
->
-> 如果您的设备不满足上述条件,您仍然可以使用MinerU的其他功能,但无法使用`sglang`加速VLM模型推理,即无法使用`vlm-sglang-engine`后端和启动`vlm-sglang-server`服务。
+
 
 ## 启动 Docker 容器
 
@@ -32,12 +30,12 @@ docker run --gpus all \
   --shm-size 32g \
   -p 30000:30000 -p 7860:7860 -p 8000:8000 \
   --ipc=host \
-  -it mineru-sglang:latest \
+  -it mineru-vllm:latest \
   /bin/bash
 ```
 
 执行该命令后,您将进入到Docker容器的交互式终端,并映射了一些端口用于可能会使用的服务,您可以直接在容器内运行MinerU相关命令来使用MinerU的功能。
-您也可以直接通过替换`/bin/bash`为服务启动命令来启动MinerU服务,详细说明请参考[通过命令启动服务](https://opendatalab.github.io/MinerU/zh/usage/quick_usage/#apiwebuisglang-clientserver)。
+您也可以直接通过替换`/bin/bash`为服务启动命令来启动MinerU服务,详细说明请参考[通过命令启动服务](https://opendatalab.github.io/MinerU/zh/usage/quick_usage/#apiwebuihttp-clientserver)。
 
 ## 通过 Docker Compose 直接启动服务
 
@@ -51,19 +49,19 @@ wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/docker/compose.yaml
 >  
 >- `compose.yaml`文件中包含了MinerU的多个服务配置,您可以根据需要选择启动特定的服务。
 >- 不同的服务可能会有额外的参数配置,您可以在`compose.yaml`文件中查看并编辑。
->- 由于`sglang`推理加速框架预分配显存的特性,您可能无法在同一台机器上同时运行多个`sglang`服务,因此请确保在启动`vlm-sglang-server`服务或使用`vlm-sglang-engine`后端时,其他可能使用显存的服务已停止。
+>- 由于`vllm`推理加速框架预分配显存的特性,您可能无法在同一台机器上同时运行多个`vllm`服务,因此请确保在启动`vlm-vllm-server`服务或使用`vlm-vllm-engine`后端时,其他可能使用显存的服务已停止。
 
 ---
 
-### 启动 sglang-server 服务
-并通过`vlm-sglang-client`后端连接`sglang-server`
+### 启动 vllm-server 服务
+并通过`vlm-http-client`后端连接`vllm-server`
   ```bash
-  docker compose -f compose.yaml --profile sglang-server up -d
+  docker compose -f compose.yaml --profile vllm-server up -d
   ```
   >[!TIP]
-  >在另一个终端中通过sglang client连接sglang server(只需cpu与网络,不需要sglang环境)
+  >在另一个终端中通过http client连接vllm server(只需cpu与网络,不需要vllm环境)
   > ```bash
-  > mineru -p <input_path> -o <output_path> -b vlm-sglang-client -u http://<server_ip>:30000
+  > mineru -p <input_path> -o <output_path> -b vlm-http-client -u http://<server_ip>:30000
   > ```
 
 ---

+ 7 - 15
docs/zh/quick_start/extension_modules.md

@@ -4,34 +4,26 @@ MinerU 支持根据不同需求,按需安装扩展模块,以增强功能或
 ## 常见场景
 
 ### 核心功能安装
-`core` 模块是 MinerU 的核心依赖,包含了除`sglang`外的所有功能模块。安装此模块可以确保 MinerU 的基本功能正常运行。
+`core` 模块是 MinerU 的核心依赖,包含了除`vllm`外的所有功能模块。安装此模块可以确保 MinerU 的基本功能正常运行。
 ```bash
 uv pip install mineru[core]
 ```
 
 ---
 
-### 使用`sglang`加速 VLM 模型推理
-`sglang` 模块提供了对 VLM 模型推理的加速支持,适用于具有 Turing 及以后架构的显卡(8G 显存及以上)。安装此模块可以显著提升模型推理速度。
-在配置中,`all`包含了`core`和`sglang`模块,因此`mineru[all]`和`mineru[core,sglang]`是等价的。
+### 使用`vllm`加速 VLM 模型推理
+`vllm` 模块提供了对 VLM 模型推理的加速支持,适用于具有 Turing 及以后架构的显卡(8G 显存及以上)。安装此模块可以显著提升模型推理速度。
+在配置中,`all`包含了`core`和`vllm`模块,因此`mineru[all]`和`mineru[core,vllm]`是等价的。
 ```bash
 uv pip install mineru[all]
 ```
 > [!TIP]
-> 如在安装包含sglang的完整包过程中发生异常,请参考 [sglang 官方文档](https://docs.sglang.ai/start/install.html) 尝试解决,或直接使用 [Docker](./docker_deployment.md) 方式部署镜像。
+> 如在安装包含vllm的完整包过程中发生异常,请参考 [vllm 官方文档](https://docs.vllm.ai/en/latest/getting_started/installation/index.html) 尝试解决,或直接使用 [Docker](./docker_deployment.md) 方式部署镜像。
 
 ---
 
-### 安装轻量版client连接sglang-server使用
-如果您需要在边缘设备上安装轻量版的 client 端以连接 `sglang-server`,可以安装mineru的基础包,非常轻量,适合在只有cpu和网络连接的设备上使用。
+### 安装轻量版client连接vllm-server使用
+如果您需要在边缘设备上安装轻量版的 client 端以连接 `vllm-server`,可以安装mineru的基础包,非常轻量,适合在只有cpu和网络连接的设备上使用。
 ```bash
 uv pip install mineru
 ```
-
----
-
-### 在过时的linux系统上使用pipeline后端
-如果您的系统过于陈旧,无法满足`mineru[core]`的依赖要求,该选项可以最低限度的满足 MinerU 的运行需求,适用于老旧系统无法升级且仅需使用 pipeline 后端的场景。
-```bash
-uv pip install mineru[pipeline_old_linux]
-```

+ 3 - 3
docs/zh/quick_start/index.md

@@ -31,7 +31,7 @@
         <td>解析后端</td>
         <td>pipeline</td>
         <td>vlm-transformers</td>
-        <td>vlm-sglang</td>
+        <td>vlm-vllm</td>
     </tr>
     <tr>
         <td>操作系统</td>
@@ -80,8 +80,8 @@ uv pip install -e .[core] -i https://mirrors.aliyun.com/pypi/simple
 ```
 
 > [!TIP]
-> `mineru[core]`包含除`sglang`加速外的所有核心功能,兼容Windows / Linux / macOS系统,适合绝大多数用户。
-> 如果您有使用`sglang`加速VLM模型推理,或是在边缘设备安装轻量版client端等需求,可以参考文档[扩展模块安装指南](./extension_modules.md)。
+> `mineru[core]`包含除`vllm`加速外的所有核心功能,兼容Windows / Linux / macOS系统,适合绝大多数用户。
+> 如果您有使用`vllm`加速VLM模型推理,或是在边缘设备安装轻量版client端等需求,可以参考文档[扩展模块安装指南](./extension_modules.md)。
 
 ---
  

+ 364 - 64
docs/zh/reference/output_files.md

@@ -51,14 +51,16 @@
 
 ## 结构化数据文件
 
-### 模型推理结果 (model.json)
+> [!IMPORTANT]
+> 2.5版本vlm后端的输出存在较大变化,与pipeline版本存在不兼容情况,如需基于结构化输出进行二次开发,请仔细阅读本文档内容。
 
-> [!NOTE]
-> 仅适用于 pipeline 后端
+### pipeline 后端 输出结果
+
+#### 模型推理结果 (model.json)
 
 **文件命名格式**:`{原文件名}_model.json`
 
-#### 数据结构定义
+##### 数据结构定义
 
 ```python
 from pydantic import BaseModel, Field
@@ -103,7 +105,7 @@ class PageInferenceResults(BaseModel):
 inference_result: list[PageInferenceResults] = []
 ```
 
-#### 坐标系统说明
+##### 坐标系统说明
 
 `poly` 坐标格式:`[x0, y0, x1, y1, x2, y2, x3, y3]`
 
@@ -112,7 +114,7 @@ inference_result: list[PageInferenceResults] = []
 
 ![poly 坐标示意图](../images/poly.png)
 
-#### 示例数据
+##### 示例数据
 
 ```json
 [
@@ -165,52 +167,11 @@ inference_result: list[PageInferenceResults] = []
 ]
 ```
 
-### VLM 输出结果 (model_output.txt)
-
-> [!NOTE]
-> 仅适用于 VLM 后端
-
-**文件命名格式**:`{原文件名}_model_output.txt`
-
-#### 文件格式说明
-
-- 使用 `----` 分割每一页的输出结果
-- 每页包含多个以 `<|box_start|>` 开头、`<|md_end|>` 结尾的文本块
-
-#### 字段含义
-
-| 标记 | 格式 | 说明 |
-|------|---|------|
-| 边界框 | `<\|box_start\|>x0 y0 x1 y1<\|box_end\|>` | 四边形坐标(左上、右下两点),页面缩放至 1000×1000 后的坐标值 |
-| 类型标记 | `<\|ref_start\|>type<\|ref_end\|>` | 内容块类型标识 |
-| 内容 | `<\|md_start\|>markdown内容<\|md_end\|>` | 该块的 Markdown 内容 |
-
-#### 支持的内容类型
-
-```json
-{
-    "text": "文本",
-    "title": "标题", 
-    "image": "图片",
-    "image_caption": "图片描述",
-    "image_footnote": "图片脚注",
-    "table": "表格",
-    "table_caption": "表格描述", 
-    "table_footnote": "表格脚注",
-    "equation": "行间公式"
-}
-```
-
-#### 特殊标记
-
-- `<|txt_contd|>`:出现在文本末尾,表示该文本块可与后续文本块连接
-- 表格内容采用 `otsl` 格式,需转换为 HTML 才能在 Markdown 中渲染
-
-### 中间处理结果 (middle.json)
+#### 中间处理结果 (middle.json)
 
 **文件命名格式**:`{原文件名}_middle.json`
 
-#### 顶层结构
+##### 顶层结构
 
 | 字段名 | 类型 | 说明 |
 |--------|------|------|
@@ -218,22 +179,20 @@ inference_result: list[PageInferenceResults] = []
 | `_backend` | `string` | 解析模式:`pipeline` 或 `vlm` |
 | `_version_name` | `string` | MinerU 版本号 |
 
-#### 页面信息结构 (pdf_info)
+##### 页面信息结构 (pdf_info)
 
 | 字段名 | 说明 |
 |--------|------|
 | `preproc_blocks` | PDF 预处理后的未分段中间结果 |
-| `layout_bboxes` | 布局分割结果,包含布局方向和边界框,按阅读顺序排序 |
 | `page_idx` | 页码,从 0 开始 |
 | `page_size` | 页面的宽度和高度 `[width, height]` |
-| `_layout_tree` | 布局树状结构 |
 | `images` | 图片块信息列表 |
 | `tables` | 表格块信息列表 |
 | `interline_equations` | 行间公式块信息列表 |
 | `discarded_blocks` | 需要丢弃的块信息 |
 | `para_blocks` | 分段后的内容块结果 |
 
-#### 块结构层次
+##### 块结构层次
 
 ```
 一级块 (table | image)
@@ -242,7 +201,7 @@ inference_result: list[PageInferenceResults] = []
         └── 片段 (span)
 ```
 
-#### 一级块字段
+##### 一级块字段
 
 | 字段名 | 说明 |
 |--------|------|
@@ -250,7 +209,7 @@ inference_result: list[PageInferenceResults] = []
 | `bbox` | 块的矩形框坐标 `[x0, y0, x1, y1]` |
 | `blocks` | 包含的二级块列表 |
 
-#### 二级块字段
+##### 二级块字段
 
 | 字段名 | 说明 |
 |--------|------|
@@ -258,7 +217,7 @@ inference_result: list[PageInferenceResults] = []
 | `bbox` | 块的矩形框坐标 |
 | `lines` | 包含的行信息列表 |
 
-#### 二级块类型
+##### 二级块类型
 
 | 类型 | 说明 |
 |------|------|
@@ -274,7 +233,7 @@ inference_result: list[PageInferenceResults] = []
 | `list` | 列表块 |
 | `interline_equation` | 行间公式块 |
 
-#### 行和片段结构
+##### 行和片段结构
 
 **行 (line) 字段**:
 - `bbox`:行的矩形框坐标
@@ -285,7 +244,7 @@ inference_result: list[PageInferenceResults] = []
 - `type`:片段类型(`image`、`table`、`text`、`inline_equation`、`interline_equation`)
 - `content` | `img_path`:文本内容或图片路径
 
-#### 示例数据
+##### 示例数据
 
 ```json
 {
@@ -388,15 +347,15 @@ inference_result: list[PageInferenceResults] = []
 }
 ```
 
-### 内容列表 (content_list.json)
+#### 内容列表 (content_list.json)
 
 **文件命名格式**:`{原文件名}_content_list.json`
 
-#### 功能说明
+##### 功能说明
 
 这是一个简化版的 `middle.json`,按阅读顺序平铺存储所有可读内容块,去除了复杂的布局信息,便于后续处理。
 
-#### 内容类型
+##### 内容类型
 
 | 类型 | 说明 |
 |------|------|
@@ -405,7 +364,7 @@ inference_result: list[PageInferenceResults] = []
 | `text` | 文本/标题 |
 | `equation` | 行间公式 |
 
-#### 文本层级标识
+##### 文本层级标识
 
 通过 `text_level` 字段区分文本层级:
 
@@ -414,12 +373,12 @@ inference_result: list[PageInferenceResults] = []
 - `text_level: 2`:二级标题
 - 以此类推...
 
-#### 通用字段
+##### 通用字段
 
 - 所有内容块都包含 `page_idx` 字段,表示所在页码(从 0 开始)。
 - 所有内容块都包含 `bbox` 字段,表示内容块的边界框坐标 `[x0, y0, x1, y1]` 映射在0-1000范围内的结果。
 
-#### 示例数据
+##### 示例数据
 
 ```json
 [
@@ -484,6 +443,347 @@ inference_result: list[PageInferenceResults] = []
 ]
 ```
 
+### VLM 后端 输出结果
+
+#### 模型推理结果 (model.json)
+
+**文件命名格式**:`{原文件名}_model.json`
+
+##### 文件格式说明
+
+- 该文件为 VLM 模型的原始输出结果,包含两层嵌套list,外层表示页面,内层表示该页的内容块
+- 每个内容块都是一个dict,包含 `type`、`bbox`、`angle`、`content` 等字段
+
+
+##### 支持的内容类型
+
+```json
+{
+    "text": "文本",
+    "title": "标题", 
+    "equation": "行间公式",
+    "image": "图片",
+    "image_caption": "图片描述",
+    "image_footnote": "图片脚注",
+    "table": "表格",
+    "table_caption": "表格描述",
+    "table_footnote": "表格脚注",
+    "phonetic": "拼音",
+    "code": "代码块",
+    "code_caption": "代码描述",
+    "ref_text": "参考文献",
+    "algorithm": "算法块",
+    "list": "列表",
+    "header": "页眉",
+    "footer": "页脚",
+    "page_number": "页码",
+    "aside_text": "装订线旁注", 
+    "page_footnote": "页面脚注"
+}
+```
+
+##### 坐标系统说明
+
+`bbox` 坐标格式:`[x0, y0, x1, y1]`
+
+- 分别表示左上、右下两点的坐标
+- 坐标原点在页面左上角
+- 坐标为相对于原始页面尺寸的百分比,范围在0-1之间
+
+##### 示例数据
+
+```json
+[
+    [
+        {
+            "type": "header",
+            "bbox": [
+                0.077,
+                0.095,
+                0.18,
+                0.181
+            ],
+            "angle": 0,
+            "score": null,
+            "block_tags": null,
+            "content": "ELSEVIER",
+            "format": null,
+            "content_tags": null
+        },
+        {
+            "type": "title",
+            "bbox": [
+                0.157,
+                0.228,
+                0.833,
+                0.253
+            ],
+            "angle": 0,
+            "score": null,
+            "block_tags": null,
+            "content": "The response of flow duration curves to afforestation",
+            "format": null,
+            "content_tags": null
+        }
+    ]
+]
+```
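+
+下面给出一个示意性的 Python 片段(非官方接口,仅作演示;其中的文件路径与页面像素尺寸均为假设值,请替换为实际输出文件与页面尺寸),用于读取上述 model.json,并把 0-1 相对坐标的 `bbox` 换算为像素坐标:
+
+```python
+import json
+
+# 假设值:model.json 路径与目标页面的像素尺寸(请替换为实际值)
+MODEL_JSON_PATH = "demo_model.json"
+PAGE_WIDTH_PX, PAGE_HEIGHT_PX = 1190, 1684
+
+
+def rel_bbox_to_pixels(bbox, page_w, page_h):
+    """将 0-1 相对坐标 [x0, y0, x1, y1] 换算为像素坐标"""
+    x0, y0, x1, y1 = bbox
+    return [round(x0 * page_w), round(y0 * page_h),
+            round(x1 * page_w), round(y1 * page_h)]
+
+
+with open(MODEL_JSON_PATH, "r", encoding="utf-8") as f:
+    pages = json.load(f)  # 外层 list 为页面,内层 list 为该页的内容块
+
+for page_idx, blocks in enumerate(pages):
+    for block in blocks:
+        pixel_bbox = rel_bbox_to_pixels(block["bbox"], PAGE_WIDTH_PX, PAGE_HEIGHT_PX)
+        print(page_idx, block["type"], pixel_bbox, (block.get("content") or "")[:30])
+```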
+
+#### 中间处理结果 (middle.json)
+
+**文件命名格式**:`{原文件名}_middle.json`
+
+##### 文件格式说明
+vlm 后端的 middle.json 文件结构与 pipeline 后端类似,但存在以下差异: 
+1. list变成二级block,增加"sub_type"字段区分list类型,"sub_type"可选"text"(文本类型),"ref_text"(引用类型)
+2. 增加code类型block,code类型包含两种"sub_type",分别是"code"和"algorithm",至少有code_body,可选code_caption
+3. `discarded_blocks`内元素type增加"header"、"footer"、"page_number"、"aside_text"、"page_footnote"类型
+4. 所有block增加`angle`字段,用来表示旋转角度,可选值为0、90、180、270
+
+
+##### 示例数据
+- list block 示例
+    ```json
+    {
+        "bbox": [
+            174,
+            155,
+            818,
+            333
+        ],
+        "type": "list",
+        "angle": 0,
+        "index": 11,
+        "blocks": [
+            {
+                "bbox": [
+                    174,
+                    157,
+                    311,
+                    175
+                ],
+                "type": "text",
+                "angle": 0,
+                "lines": [
+                    {
+                        "bbox": [
+                            174,
+                            157,
+                            311,
+                            175
+                        ],
+                        "spans": [
+                            {
+                                "bbox": [
+                                    174,
+                                    157,
+                                    311,
+                                    175
+                                ],
+                                "type": "text",
+                                "content": "H.1 Introduction"
+                            }
+                        ]
+                    }
+                ],
+                "index": 3
+            },
+            {
+                "bbox": [
+                    175,
+                    182,
+                    464,
+                    229
+                ],
+                "type": "text",
+                "angle": 0,
+                "lines": [
+                    {
+                        "bbox": [
+                            175,
+                            182,
+                            464,
+                            229
+                        ],
+                        "spans": [
+                            {
+                                "bbox": [
+                                    175,
+                                    182,
+                                    464,
+                                    229
+                                ],
+                                "type": "text",
+                                "content": "H.2 Example: Divide by Zero without Exception Handling"
+                            }
+                        ]
+                    }
+                ],
+                "index": 4
+            }
+        ],
+        "sub_type": "text"
+    }
+    ```
+- code block 示例
+    ```json
+    {
+        "type": "code",
+        "bbox": [
+            114,
+            780,
+            885,
+            1231
+        ],
+        "blocks": [
+            {
+                "bbox": [
+                    114,
+                    780,
+                    885,
+                    1231
+                ],
+                "lines": [
+                    {
+                        "bbox": [
+                            114,
+                            780,
+                            885,
+                            1231
+                        ],
+                        "spans": [
+                            {
+                                "bbox": [
+                                    114,
+                                    780,
+                                    885,
+                                    1231
+                                ],
+                                "type": "text",
+                                "content": "1 // Fig. H.1: DivideByZeroNoExceptionHandling.java  \n2 // Integer division without exception handling.  \n3 import java.util.Scanner;  \n4  \n5 public class DivideByZeroNoExceptionHandling  \n6 {  \n7 // demonstrates throwing an exception when a divide-by-zero occurs  \n8 public static int quotient( int numerator, int denominator )  \n9 {  \n10 return numerator / denominator; // possible division by zero  \n11 } // end method quotient  \n12  \n13 public static void main(String[] args)  \n14 {  \n15 Scanner scanner = new Scanner(System.in); // scanner for input  \n16  \n17 System.out.print(\"Please enter an integer numerator: \");  \n18 int numerator = scanner.nextInt();  \n19 System.out.print(\"Please enter an integer denominator: \");  \n20 int denominator = scanner.nextInt();  \n21"
+                            }
+                        ]
+                    }
+                ],
+                "index": 17,
+                "angle": 0,
+                "type": "code_body"
+            },
+            {
+                "bbox": [
+                    867,
+                    160,
+                    1280,
+                    189
+                ],
+                "lines": [
+                    {
+                        "bbox": [
+                            867,
+                            160,
+                            1280,
+                            189
+                        ],
+                        "spans": [
+                            {
+                                "bbox": [
+                                    867,
+                                    160,
+                                    1280,
+                                    189
+                                ],
+                                "type": "text",
+                                "content": "Algorithm 1 Modules for MCTSteg"
+                            }
+                        ]
+                    }
+                ],
+                "index": 19,
+                "angle": 0,
+                "type": "code_caption"
+            }
+        ],
+        "index": 17,
+        "sub_type": "code"
+    }
+    ```
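+
+下面是一个示意性的 Python 片段(非官方接口,仅作演示;文件路径为假设值,请替换为实际输出文件),演示如何遍历 vlm 后端 middle.json 的 `para_blocks`,提取上文示例中这类 code block 的 `code_caption` 与 `code_body` 文本:
+
+```python
+import json
+
+# 假设值:middle.json 路径(请替换为实际值)
+MIDDLE_JSON_PATH = "demo_middle.json"
+
+
+def spans_text(block):
+    """拼接一个二级块内所有 span 的文本内容"""
+    return "\n".join(
+        span.get("content", "")
+        for line in block.get("lines", [])
+        for span in line.get("spans", [])
+    )
+
+
+with open(MIDDLE_JSON_PATH, "r", encoding="utf-8") as f:
+    middle = json.load(f)
+
+for page in middle["pdf_info"]:
+    for block in page["para_blocks"]:
+        if block.get("type") != "code":
+            continue
+        caption = body = ""
+        for sub_block in block.get("blocks", []):
+            if sub_block.get("type") == "code_caption":
+                caption = spans_text(sub_block)
+            elif sub_block.get("type") == "code_body":
+                body = spans_text(sub_block)
+        print(page["page_idx"], block.get("sub_type"), caption, body[:40])
+```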
+
+#### 内容列表 (content_list.json)
+
+**文件命名格式**:`{原文件名}_content_list.json`
+
+##### 文件格式说明
+vlm 后端的 content_list.json 文件结构与 pipeline 后端类似,伴随本次middle.json的变化,做了以下调整:
+1. 新增`code`类型,code类型包含两种"sub_type",分别是"code"和"algorithm",至少有code_body,可选code_caption
+2. 新增`list`类型,list类型包含两种"sub_type",分别是"text"和"ref_text" 
+3. 增加所有`discarded_blocks`的输出内容
+
+##### 示例数据
+- code 类型 content
+    ```json
+    {
+        "type": "code",
+        "sub_type": "algorithm",
+        "code_caption": [
+            "Algorithm 1 Modules for MCTSteg"
+        ],
+        "code_body": "1: function GETCOORDINATE(d)  \n2:  $x \\gets d / l$ ,  $y \\gets d$  mod  $l$   \n3: return  $(x, y)$   \n4: end function  \n5: function BESTCHILD(v)  \n6:  $C \\gets$  child set of  $v$   \n7:  $v' \\gets \\arg \\max_{c \\in C} \\mathrm{UCTScore}(c)$   \n8:  $v'.n \\gets v'.n + 1$   \n9: return  $v'$   \n10: end function  \n11: function BACK PROPAGATE(v)  \n12: Calculate  $R$  using Equation 11  \n13: while  $v$  is not a root node do  \n14:  $v.r \\gets v.r + R$ ,  $v \\gets v.p$   \n15: end while  \n16: end function  \n17: function RANDOMSEARCH(v)  \n18: while  $v$  is not a leaf node do  \n19: Randomly select an untried action  $a \\in A(v)$   \n20: Create a new node  $v'$   \n21:  $(x, y) \\gets \\mathrm{GETCOORDINATE}(v'.d)$   \n22:  $v'.p \\gets v$ ,  $v'.d \\gets v.d + 1$ ,  $v'.\\Gamma \\gets v.\\Gamma$   \n23:  $v'.\\gamma_{x,y} \\gets a$   \n24: if  $a = -1$  then  \n25:  $v.lc \\gets v'$   \n26: else if  $a = 0$  then  \n27:  $v.mc \\gets v'$   \n28: else  \n29:  $v.rc \\gets v'$   \n30: end if  \n31:  $v \\gets v'$   \n32: end while  \n33: return  $v$   \n34: end function  \n35: function SEARCH(v)  \n36: while  $v$  is fully expanded do  \n37:  $v \\gets$  BESTCHILD(v)  \n38: end while  \n39: if  $v$  is not a leaf node then  \n40:  $v \\gets$  RANDOMSEARCH(v)  \n41: end if  \n42: return  $v$   \n43: end function",
+        "bbox": [
+            510,
+            87,
+            881,
+            740
+        ],
+        "page_idx": 0
+    }
+    ```
+- list 类型 content
+    ```json
+    {
+        "type": "list",
+        "sub_type": "text",
+        "list_items": [
+            "H.1 Introduction",
+            "H.2 Example: Divide by Zero without Exception Handling",
+            "H.3 Example: Divide by Zero with Exception Handling",
+            "H.4 Summary"
+        ],
+        "bbox": [
+            174,
+            155,
+            818,
+            333
+        ],
+        "page_idx": 0
+    }
+    ```
+- discarded 类型 content
+  ```json
+  [{
+      "type": "header",
+      "text": "Journal of Hydrology 310 (2005) 253-265",
+      "bbox": [
+          363,
+          164,
+          623,
+          177
+      ],
+      "page_idx": 0
+  },
+  {
+      "type": "page_footnote",
+      "text": "* Corresponding author. Address: Forest Science Centre, Department of Sustainability and Environment, P.O. Box 137, Heidelberg, Vic. 3084, Australia. Tel.: +61 3 9450 8719; fax: +61 3 9450 8644.",
+      "bbox": [
+          71,
+          815,
+          915,
+          841
+      ],
+      "page_idx": 0
+  }]
+  ```
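+
+下面是一个示意性的 Python 片段(非官方接口,仅作演示;文件路径为假设值,请替换为实际输出文件),演示如何按 `type` 与 `sub_type` 过滤 content_list.json 中新增的 code、list 以及 discarded 类内容:
+
+```python
+import json
+from collections import Counter
+
+# 假设值:content_list.json 路径(请替换为实际值)
+CONTENT_LIST_PATH = "demo_content_list.json"
+
+with open(CONTENT_LIST_PATH, "r", encoding="utf-8") as f:
+    content_list = json.load(f)
+
+# 统计各类型内容块的数量
+print(Counter(item["type"] for item in content_list))
+
+for item in content_list:
+    if item["type"] == "code" and item.get("sub_type") == "algorithm":
+        print("算法块标题:", " ".join(item.get("code_caption", [])))
+    elif item["type"] == "list":
+        for list_item in item.get("list_items", []):
+            print("列表项:", list_item)
+    elif item["type"] in ("header", "footer", "page_number", "page_footnote", "aside_text"):
+        print("discarded 内容:", item.get("text", ""))
+```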
+
+
 ## 总结
 
 以上文件为 MinerU 的完整输出结果,用户可根据需要选择合适的文件进行后续处理:

+ 8 - 21
docs/zh/usage/advanced_cli_parameters.md

@@ -1,25 +1,17 @@
 # 命令行参数进阶
 
-## SGLang 加速参数优化
-
-### 显存优化参数
-> [!TIP]
-> sglang加速模式目前支持在最低8G显存的Turing架构显卡上运行,但在显存<24G的显卡上可能会遇到显存不足的问题, 可以通过使用以下参数来优化显存使用:
-> 
-> - 如果您使用单张显卡遇到显存不足的情况时,可能需要调低KV缓存大小,`--mem-fraction-static 0.5`,如仍出现显存不足问题,可尝试进一步降低到`0.4`或更低
-> - 如您有两张以上显卡,可尝试通过张量并行(TP)模式简单扩充可用显存:`--tp-size 2`
+## vllm 加速参数优化
 
 ### 性能优化参数
 > [!TIP]
-> 如果您已经可以正常使用sglang对vlm模型进行加速推理,但仍然希望进一步提升推理速度,可以尝试以下参数:
+> 如果您已经可以正常使用vllm对vlm模型进行加速推理,但仍然希望进一步提升推理速度,可以尝试以下参数:
 > 
-> - 如果您有超过多张显卡,可以使用sglang的多卡并行模式来增加吞吐量:`--dp-size 2`
-> - 同时您可以启用`torch.compile`来将推理速度加速约15%:`--enable-torch-compile`
+> - 如果您有多张显卡,可以使用vllm的多卡并行模式来增加吞吐量:`--data-parallel-size 2`
 
 ### 参数传递说明
 > [!TIP]
-> - 所有sglang官方支持的参数都可用通过命令行参数传递给 MinerU,包括以下命令:`mineru`、`mineru-sglang-server`、`mineru-gradio`、`mineru-api`
-> - 如果您想了解更多有关`sglang`的参数使用方法,请参考 [sglang官方文档](https://docs.sglang.ai/backend/server_arguments.html#common-launch-commands)
+> - 所有vllm官方支持的参数都可以通过命令行参数传递给 MinerU,包括以下命令:`mineru`、`mineru-vllm-server`、`mineru-gradio`、`mineru-api`
+> - 如果您想了解更多有关`vllm`的参数使用方法,请参考 [vllm官方文档](https://docs.vllm.ai/en/latest/cli/serve.html)
 
 ## GPU 设备选择与配置
 
@@ -29,7 +21,7 @@
 >   ```bash
 >   CUDA_VISIBLE_DEVICES=1 mineru -p <input_path> -o <output_path>
 >   ```
-> - 这种指定方式对所有的命令行调用都有效,包括 `mineru`、`mineru-sglang-server`、`mineru-gradio` 和 `mineru-api`,且对`pipeline`、`vlm`后端均适用。
+> - 这种指定方式对所有的命令行调用都有效,包括 `mineru`、`mineru-vllm-server`、`mineru-gradio` 和 `mineru-api`,且对`pipeline`、`vlm`后端均适用。
 
 ### 常见设备配置示例
 > [!TIP]
@@ -47,14 +39,9 @@
 > [!TIP]
 > 以下是一些可能的使用场景:
 > 
-> - 如果您有多张显卡,需要指定卡0和卡1,并使用多卡并行来启动`sglang-server`,可以使用以下命令: 
->   ```bash
->   CUDA_VISIBLE_DEVICES=0,1 mineru-sglang-server --port 30000 --dp-size 2
->   ```
->   
-> - 如果您有多张显卡,需要指定卡0-3,并使用多卡数据并行和张量并行来启动`sglang-server`,可以使用以下命令: 
+> - 如果您有多张显卡,需要指定卡0和卡1,并使用多卡并行来启动`vllm-server`,可以使用以下命令: 
 >   ```bash
->   CUDA_VISIBLE_DEVICES=0,1,2,3 mineru-sglang-server --port 30000 --dp-size 2 --tp-size 2
+>   CUDA_VISIBLE_DEVICES=0,1 mineru-vllm-server --port 30000 --data-parallel-size 2
 >   ```
 >   
 > - 如果您有多张显卡,需要在卡0和卡1上启动两个`fastapi`服务,并分别监听不同的端口,可以使用以下命令: 

+ 3 - 3
docs/zh/usage/cli_tools.md

@@ -11,11 +11,11 @@ Options:
   -p, --path PATH                 输入文件路径或目录(必填)
   -o, --output PATH               输出目录(必填)
   -m, --method [auto|txt|ocr]     解析方法:auto(默认)、txt、ocr(仅用于 pipeline 后端)
-  -b, --backend [pipeline|vlm-transformers|vlm-sglang-engine|vlm-sglang-client]
+  -b, --backend [pipeline|vlm-transformers|vlm-vllm-engine|vlm-http-client]
                                   解析后端(默认为 pipeline)
   -l, --lang [ch|ch_server|ch_lite|en|korean|japan|chinese_cht|ta|te|ka|th|el|latin|arabic|east_slavic|cyrillic|devanagari]
                                   指定文档语言(可提升 OCR 准确率,仅用于 pipeline 后端)
-  -u, --url TEXT                  当使用 sglang-client 时,需指定服务地址
+  -u, --url TEXT                  当使用 http-client 时,需指定服务地址
   -s, --start INTEGER             开始解析的页码(从 0 开始)
   -e, --end INTEGER               结束解析的页码(从 0 开始)
   -f, --formula BOOLEAN           是否启用公式解析(默认开启)
@@ -43,7 +43,7 @@ Usage: mineru-gradio [OPTIONS]
 Options:
   --enable-example BOOLEAN        启用示例文件输入(需要将示例文件放置在当前
                                   执行命令目录下的 `example` 文件夹中)
-  --enable-sglang-engine BOOLEAN  启用 SgLang 引擎后端以提高处理速度
+  --enable-vllm-engine BOOLEAN  启用 vllm 引擎后端以提高处理速度
   --enable-api BOOLEAN            启用 Gradio API 以提供应用程序服务
   --max-convert-pages INTEGER     设置从 PDF 转换为 Markdown 的最大页数
   --server-name TEXT              设置 Gradio 应用程序的服务器主机名

+ 12 - 12
docs/zh/usage/quick_usage.md

@@ -28,11 +28,11 @@ mineru -p <input_path> -o <output_path>
 mineru -p <input_path> -o <output_path> -b vlm-transformers
 ```
 > [!TIP]
-> vlm后端另外支持`sglang`加速,与`transformers`后端相比,`sglang`的加速比可达20~30倍,可以在[扩展模块安装指南](../quick_start/extension_modules.md)中查看支持`sglang`加速的完整包安装方法。
+> vlm后端另外支持`vllm`加速,与`transformers`后端相比,`vllm`的加速比可达20~30倍,可以在[扩展模块安装指南](../quick_start/extension_modules.md)中查看支持`vllm`加速的完整包安装方法。
 
 如果需要通过自定义参数调整解析选项,您也可以在文档中查看更详细的[命令行工具使用说明](./cli_tools.md)。
 
-## 通过api、webui、sglang-client/server进阶使用
+## 通过api、webui、http-client/server进阶使用
 
 - 通过python api直接调用:[Python 调用示例](https://github.com/opendatalab/MinerU/blob/master/demo/demo.py)
 - 通过fast api方式调用:
@@ -43,29 +43,29 @@ mineru -p <input_path> -o <output_path> -b vlm-transformers
   >在浏览器中访问 `http://127.0.0.1:8000/docs` 查看API文档。
 - 启动gradio webui 可视化前端:
   ```bash
-  # 使用 pipeline/vlm-transformers/vlm-sglang-client 后端
+  # 使用 pipeline/vlm-transformers/vlm-http-client 后端
   mineru-gradio --server-name 0.0.0.0 --server-port 7860
-  # 或使用 vlm-sglang-engine/pipeline 后端(需安装sglang环境)
-  mineru-gradio --server-name 0.0.0.0 --server-port 7860 --enable-sglang-engine true
+  # 或使用 vlm-vllm-engine/pipeline 后端(需安装vllm环境)
+  mineru-gradio --server-name 0.0.0.0 --server-port 7860 --enable-vllm-engine true
   ```
   >[!TIP]
   > 
   >- 在浏览器中访问 `http://127.0.0.1:7860` 使用 Gradio WebUI。
   >- 访问 `http://127.0.0.1:7860/?view=api` 使用 Gradio API。
-- 使用`sglang-client/server`方式调用:
+- 使用`http-client/server`方式调用:
   ```bash
-  # 启动sglang server(需要安装sglang环境)
-  mineru-sglang-server --port 30000
+  # 启动vllm server(需要安装vllm环境)
+  mineru-vllm-server --port 30000
   ``` 
   >[!TIP]
-  >在另一个终端中通过sglang client连接sglang server(只需cpu与网络,不需要sglang环境)
+  >在另一个终端中通过http client连接vllm server(只需cpu与网络,不需要vllm环境)
   > ```bash
-  > mineru -p <input_path> -o <output_path> -b vlm-sglang-client -u http://127.0.0.1:30000
+  > mineru -p <input_path> -o <output_path> -b vlm-http-client -u http://127.0.0.1:30000
   > ```
 
 > [!NOTE]
-> 所有sglang官方支持的参数都可用通过命令行参数传递给 MinerU,包括以下命令:`mineru`、`mineru-sglang-server`、`mineru-gradio`、`mineru-api`,
-> 我们整理了一些`sglang`使用中的常用参数和使用方法,可以在文档[命令行进阶参数](./advanced_cli_parameters.md)中获取。
+> 所有vllm官方支持的参数都可以通过命令行参数传递给 MinerU,包括以下命令:`mineru`、`mineru-vllm-server`、`mineru-gradio`、`mineru-api`,
+> 我们整理了一些`vllm`使用中的常用参数和使用方法,可以在文档[命令行进阶参数](./advanced_cli_parameters.md)中获取。
 
 ## 基于配置文件扩展 MinerU 功能
 

+ 0 - 186
mineru/backend/vlm/base_predictor.py

@@ -1,186 +0,0 @@
-import asyncio
-from abc import ABC, abstractmethod
-from typing import AsyncIterable, Iterable, List, Optional, Union
-
-DEFAULT_SYSTEM_PROMPT = (
-    "A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers."
-)
-DEFAULT_USER_PROMPT = "Document Parsing:"
-DEFAULT_TEMPERATURE = 0.0
-DEFAULT_TOP_P = 0.8
-DEFAULT_TOP_K = 20
-DEFAULT_REPETITION_PENALTY = 1.0
-DEFAULT_PRESENCE_PENALTY = 0.0
-DEFAULT_NO_REPEAT_NGRAM_SIZE = 100
-DEFAULT_MAX_NEW_TOKENS = 16384
-
-
-class BasePredictor(ABC):
-    system_prompt = DEFAULT_SYSTEM_PROMPT
-
-    def __init__(
-        self,
-        temperature: float = DEFAULT_TEMPERATURE,
-        top_p: float = DEFAULT_TOP_P,
-        top_k: int = DEFAULT_TOP_K,
-        repetition_penalty: float = DEFAULT_REPETITION_PENALTY,
-        presence_penalty: float = DEFAULT_PRESENCE_PENALTY,
-        no_repeat_ngram_size: int = DEFAULT_NO_REPEAT_NGRAM_SIZE,
-        max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
-    ) -> None:
-        self.temperature = temperature
-        self.top_p = top_p
-        self.top_k = top_k
-        self.repetition_penalty = repetition_penalty
-        self.presence_penalty = presence_penalty
-        self.no_repeat_ngram_size = no_repeat_ngram_size
-        self.max_new_tokens = max_new_tokens
-
-    @abstractmethod
-    def predict(
-        self,
-        image: str | bytes,
-        prompt: str = "",
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        top_k: Optional[int] = None,
-        repetition_penalty: Optional[float] = None,
-        presence_penalty: Optional[float] = None,
-        no_repeat_ngram_size: Optional[int] = None,
-        max_new_tokens: Optional[int] = None,
-    ) -> str: ...
-
-    @abstractmethod
-    def batch_predict(
-        self,
-        images: List[str] | List[bytes],
-        prompts: Union[List[str], str] = "",
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        top_k: Optional[int] = None,
-        repetition_penalty: Optional[float] = None,
-        presence_penalty: Optional[float] = None,
-        no_repeat_ngram_size: Optional[int] = None,
-        max_new_tokens: Optional[int] = None,
-    ) -> List[str]: ...
-
-    @abstractmethod
-    def stream_predict(
-        self,
-        image: str | bytes,
-        prompt: str = "",
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        top_k: Optional[int] = None,
-        repetition_penalty: Optional[float] = None,
-        presence_penalty: Optional[float] = None,
-        no_repeat_ngram_size: Optional[int] = None,
-        max_new_tokens: Optional[int] = None,
-    ) -> Iterable[str]: ...
-
-    async def aio_predict(
-        self,
-        image: str | bytes,
-        prompt: str = "",
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        top_k: Optional[int] = None,
-        repetition_penalty: Optional[float] = None,
-        presence_penalty: Optional[float] = None,
-        no_repeat_ngram_size: Optional[int] = None,
-        max_new_tokens: Optional[int] = None,
-    ) -> str:
-        return await asyncio.to_thread(
-            self.predict,
-            image,
-            prompt,
-            temperature,
-            top_p,
-            top_k,
-            repetition_penalty,
-            presence_penalty,
-            no_repeat_ngram_size,
-            max_new_tokens,
-        )
-
-    async def aio_batch_predict(
-        self,
-        images: List[str] | List[bytes],
-        prompts: Union[List[str], str] = "",
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        top_k: Optional[int] = None,
-        repetition_penalty: Optional[float] = None,
-        presence_penalty: Optional[float] = None,
-        no_repeat_ngram_size: Optional[int] = None,
-        max_new_tokens: Optional[int] = None,
-    ) -> List[str]:
-        return await asyncio.to_thread(
-            self.batch_predict,
-            images,
-            prompts,
-            temperature,
-            top_p,
-            top_k,
-            repetition_penalty,
-            presence_penalty,
-            no_repeat_ngram_size,
-            max_new_tokens,
-        )
-
-    async def aio_stream_predict(
-        self,
-        image: str | bytes,
-        prompt: str = "",
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        top_k: Optional[int] = None,
-        repetition_penalty: Optional[float] = None,
-        presence_penalty: Optional[float] = None,
-        no_repeat_ngram_size: Optional[int] = None,
-        max_new_tokens: Optional[int] = None,
-    ) -> AsyncIterable[str]:
-        queue = asyncio.Queue()
-        loop = asyncio.get_running_loop()
-
-        def synced_predict():
-            for chunk in self.stream_predict(
-                image=image,
-                prompt=prompt,
-                temperature=temperature,
-                top_p=top_p,
-                top_k=top_k,
-                repetition_penalty=repetition_penalty,
-                presence_penalty=presence_penalty,
-                no_repeat_ngram_size=no_repeat_ngram_size,
-                max_new_tokens=max_new_tokens,
-            ):
-                asyncio.run_coroutine_threadsafe(queue.put(chunk), loop)
-            asyncio.run_coroutine_threadsafe(queue.put(None), loop)
-
-        asyncio.create_task(
-            asyncio.to_thread(synced_predict),
-        )
-
-        while True:
-            chunk = await queue.get()
-            if chunk is None:
-                return
-            assert isinstance(chunk, str)
-            yield chunk
-
-    def build_prompt(self, prompt: str) -> str:
-        if prompt.startswith("<|im_start|>"):
-            return prompt
-        if not prompt:
-            prompt = DEFAULT_USER_PROMPT
-
-        return f"<|im_start|>system\n{self.system_prompt}<|im_end|><|im_start|>user\n<image>\n{prompt}<|im_end|><|im_start|>assistant\n"
-        # Modify here. We add <|box_start|> at the end of the prompt to force the model to generate bounding box.
-        # if "Document OCR" in prompt:
-        #     return f"<|im_start|>system\n{self.system_prompt}<|im_end|><|im_start|>user\n<image>\n{prompt}<|im_end|><|im_start|>assistant\n<|box_start|>"
-        # else:
-        #     return f"<|im_start|>system\n{self.system_prompt}<|im_end|><|im_start|>user\n<image>\n{prompt}<|im_end|><|im_start|>assistant\n"
-
-    def close(self):
-        pass

+ 0 - 217
mineru/backend/vlm/hf_predictor.py

@@ -1,217 +0,0 @@
-from io import BytesIO
-from typing import Iterable, List, Optional, Union
-
-import torch
-from PIL import Image
-from tqdm import tqdm
-from transformers import AutoTokenizer, BitsAndBytesConfig, __version__
-
-from ...model.vlm_hf_model import Mineru2QwenForCausalLM
-from ...model.vlm_hf_model.image_processing_mineru2 import process_images
-from .base_predictor import (
-    DEFAULT_MAX_NEW_TOKENS,
-    DEFAULT_NO_REPEAT_NGRAM_SIZE,
-    DEFAULT_PRESENCE_PENALTY,
-    DEFAULT_REPETITION_PENALTY,
-    DEFAULT_TEMPERATURE,
-    DEFAULT_TOP_K,
-    DEFAULT_TOP_P,
-    BasePredictor,
-)
-from .utils import load_resource
-
-
-class HuggingfacePredictor(BasePredictor):
-    def __init__(
-        self,
-        model_path: str,
-        device_map="auto",
-        device="cuda",
-        torch_dtype="auto",
-        load_in_8bit=False,
-        load_in_4bit=False,
-        use_flash_attn=False,
-        temperature: float = DEFAULT_TEMPERATURE,
-        top_p: float = DEFAULT_TOP_P,
-        top_k: int = DEFAULT_TOP_K,
-        repetition_penalty: float = DEFAULT_REPETITION_PENALTY,
-        presence_penalty: float = DEFAULT_PRESENCE_PENALTY,
-        no_repeat_ngram_size: int = DEFAULT_NO_REPEAT_NGRAM_SIZE,
-        max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
-        **kwargs,
-    ):
-        super().__init__(
-            temperature=temperature,
-            top_p=top_p,
-            top_k=top_k,
-            repetition_penalty=repetition_penalty,
-            presence_penalty=presence_penalty,
-            no_repeat_ngram_size=no_repeat_ngram_size,
-            max_new_tokens=max_new_tokens,
-        )
-
-        kwargs = {"device_map": device_map, **kwargs}
-
-        if device != "cuda":
-            kwargs["device_map"] = {"": device}
-
-        if load_in_8bit:
-            kwargs["load_in_8bit"] = True
-        elif load_in_4bit:
-            kwargs["load_in_4bit"] = True
-            kwargs["quantization_config"] = BitsAndBytesConfig(
-                load_in_4bit=True,
-                bnb_4bit_compute_dtype=torch.float16,
-                bnb_4bit_use_double_quant=True,
-                bnb_4bit_quant_type="nf4",
-            )
-        else:
-            from packaging import version
-            if version.parse(__version__) >= version.parse("4.56.0"):
-                kwargs["dtype"] = torch_dtype
-            else:
-                kwargs["torch_dtype"] = torch_dtype
-
-        if use_flash_attn:
-            kwargs["attn_implementation"] = "flash_attention_2"
-
-        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
-        self.model = Mineru2QwenForCausalLM.from_pretrained(
-            model_path,
-            low_cpu_mem_usage=True,
-            **kwargs,
-        )
-        setattr(self.model.config, "_name_or_path", model_path)
-        self.model.eval()
-
-        vision_tower = self.model.get_model().vision_tower
-        if device_map != "auto":
-            vision_tower.to(device=device_map, dtype=self.model.dtype)
-
-        self.image_processor = vision_tower.image_processor
-        self.eos_token_id = self.model.config.eos_token_id
-
-    def predict(
-        self,
-        image: str | bytes,
-        prompt: str = "",
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        top_k: Optional[int] = None,
-        repetition_penalty: Optional[float] = None,
-        presence_penalty: Optional[float] = None,
-        no_repeat_ngram_size: Optional[int] = None,
-        max_new_tokens: Optional[int] = None,
-        **kwargs,
-    ) -> str:
-        prompt = self.build_prompt(prompt)
-
-        if temperature is None:
-            temperature = self.temperature
-        if top_p is None:
-            top_p = self.top_p
-        if top_k is None:
-            top_k = self.top_k
-        if repetition_penalty is None:
-            repetition_penalty = self.repetition_penalty
-        if no_repeat_ngram_size is None:
-            no_repeat_ngram_size = self.no_repeat_ngram_size
-        if max_new_tokens is None:
-            max_new_tokens = self.max_new_tokens
-
-        do_sample = (temperature > 0.0) and (top_k > 1)
-
-        generate_kwargs = {
-            "repetition_penalty": repetition_penalty,
-            "no_repeat_ngram_size": no_repeat_ngram_size,
-            "max_new_tokens": max_new_tokens,
-            "do_sample": do_sample,
-        }
-        if do_sample:
-            generate_kwargs["temperature"] = temperature
-            generate_kwargs["top_p"] = top_p
-            generate_kwargs["top_k"] = top_k
-
-        if isinstance(image, str):
-            image = load_resource(image)
-
-        image_obj = Image.open(BytesIO(image))
-        image_tensor = process_images([image_obj], self.image_processor, self.model.config)
-        image_tensor = image_tensor[0].unsqueeze(0)
-        image_tensor = image_tensor.to(device=self.model.device, dtype=self.model.dtype)
-        image_sizes = [[*image_obj.size]]
-
-        encoded_inputs = self.tokenizer(prompt, return_tensors="pt")
-        input_ids = encoded_inputs.input_ids.to(device=self.model.device)
-        attention_mask = encoded_inputs.attention_mask.to(device=self.model.device)
-
-        with torch.inference_mode():
-            output_ids = self.model.generate(
-                input_ids,
-                attention_mask=attention_mask,
-                images=image_tensor,
-                image_sizes=image_sizes,
-                use_cache=True,
-                **generate_kwargs,
-                **kwargs,
-            )
-
-        # Remove the last token if it is the eos_token_id
-        if len(output_ids[0]) > 0 and output_ids[0, -1] == self.eos_token_id:
-            output_ids = output_ids[:, :-1]
-
-        output = self.tokenizer.batch_decode(
-            output_ids,
-            skip_special_tokens=False,
-        )[0].strip()
-
-        return output
-
-    def batch_predict(
-        self,
-        images: List[str] | List[bytes],
-        prompts: Union[List[str], str] = "",
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        top_k: Optional[int] = None,
-        repetition_penalty: Optional[float] = None,
-        presence_penalty: Optional[float] = None,  # not supported by hf
-        no_repeat_ngram_size: Optional[int] = None,
-        max_new_tokens: Optional[int] = None,
-        **kwargs,
-    ) -> List[str]:
-        if not isinstance(prompts, list):
-            prompts = [prompts] * len(images)
-
-        assert len(prompts) == len(images), "Length of prompts and images must match."
-
-        outputs = []
-        for prompt, image in tqdm(zip(prompts, images), total=len(images), desc="Predict"):
-            output = self.predict(
-                image,
-                prompt,
-                temperature=temperature,
-                top_p=top_p,
-                top_k=top_k,
-                repetition_penalty=repetition_penalty,
-                presence_penalty=presence_penalty,
-                no_repeat_ngram_size=no_repeat_ngram_size,
-                max_new_tokens=max_new_tokens,
-                **kwargs,
-            )
-            outputs.append(output)
-        return outputs
-
-    def stream_predict(
-        self,
-        image: str | bytes,
-        prompt: str = "",
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        top_k: Optional[int] = None,
-        repetition_penalty: Optional[float] = None,
-        presence_penalty: Optional[float] = None,
-        no_repeat_ngram_size: Optional[int] = None,
-        max_new_tokens: Optional[int] = None,
-    ) -> Iterable[str]:
-        raise NotImplementedError("Streaming is not supported yet.")

+ 123 - 0
mineru/backend/vlm/model_output_to_middle_json.py

@@ -0,0 +1,123 @@
+import os
+import time
+
+import cv2
+import numpy as np
+from loguru import logger
+
+from mineru.backend.vlm.vlm_magic_model import MagicModel
+from mineru.utils.config_reader import get_table_enable, get_llm_aided_config
+from mineru.utils.cut_image import cut_image_and_table
+from mineru.utils.enum_class import ContentType
+from mineru.utils.hash_utils import bytes_md5
+from mineru.utils.pdf_image_tools import get_crop_img
+from mineru.utils.table_merge import merge_table
+from mineru.version import __version__
+
+
+heading_level_import_success = False
+llm_aided_config = get_llm_aided_config()
+if llm_aided_config:
+    title_aided_config = llm_aided_config.get('title_aided', {})
+    if title_aided_config.get('enable', False):
+        try:
+            from mineru.utils.llm_aided import llm_aided_title
+            from mineru.backend.pipeline.model_init import AtomModelSingleton
+            heading_level_import_success = True
+        except Exception as e:
+            logger.warning("The heading level feature cannot be used. If you need to use the heading level feature, "
+                            "please execute `pip install mineru[core]` to install the required packages.")
+
+
+def blocks_to_page_info(page_blocks, image_dict, page, image_writer, page_index) -> dict:
+    """将blocks转换为页面信息"""
+
+    scale = image_dict["scale"]
+    page_pil_img = image_dict["img_pil"]
+    page_img_md5 = bytes_md5(page_pil_img.tobytes())
+    width, height = map(int, page.get_size())
+
+    magic_model = MagicModel(page_blocks, width, height)
+    image_blocks = magic_model.get_image_blocks()
+    table_blocks = magic_model.get_table_blocks()
+    title_blocks = magic_model.get_title_blocks()
+    discarded_blocks = magic_model.get_discarded_blocks()
+    code_blocks = magic_model.get_code_blocks()
+    ref_text_blocks = magic_model.get_ref_text_blocks()
+    phonetic_blocks = magic_model.get_phonetic_blocks()
+    list_blocks = magic_model.get_list_blocks()
+
+    # 如果有标题优化需求,则对title_blocks截图det
+    if heading_level_import_success:
+        atom_model_manager = AtomModelSingleton()
+        ocr_model = atom_model_manager.get_atom_model(
+            atom_model_name='ocr',
+            ocr_show_log=False,
+            det_db_box_thresh=0.3,
+            lang='ch_lite'
+        )
+        for title_block in title_blocks:
+            title_pil_img = get_crop_img(title_block['bbox'], page_pil_img, scale)
+            title_np_img = np.array(title_pil_img)
+            # 给title_pil_img添加上下左右各50像素白边padding
+            title_np_img = cv2.copyMakeBorder(
+                title_np_img, 50, 50, 50, 50, cv2.BORDER_CONSTANT, value=[255, 255, 255]
+            )
+            title_img = cv2.cvtColor(title_np_img, cv2.COLOR_RGB2BGR)
+            ocr_det_res = ocr_model.ocr(title_img, rec=False)[0]
+            if len(ocr_det_res) > 0:
+                # 计算所有res的平均高度
+                avg_height = np.mean([box[2][1] - box[0][1] for box in ocr_det_res])
+                title_block['line_avg_height'] = round(avg_height/scale)
+
+    text_blocks = magic_model.get_text_blocks()
+    interline_equation_blocks = magic_model.get_interline_equation_blocks()
+
+    all_spans = magic_model.get_all_spans()
+    # 对image/table/interline_equation的span截图
+    for span in all_spans:
+        if span["type"] in [ContentType.IMAGE, ContentType.TABLE, ContentType.INTERLINE_EQUATION]:
+            span = cut_image_and_table(span, page_pil_img, page_img_md5, page_index, image_writer, scale=scale)
+
+    page_blocks = []
+    page_blocks.extend([
+        *image_blocks,
+        *table_blocks,
+        *code_blocks,
+        *ref_text_blocks,
+        *phonetic_blocks,
+        *title_blocks,
+        *text_blocks,
+        *interline_equation_blocks,
+        *list_blocks,
+    ])
+    # 对page_blocks根据index的值进行排序
+    page_blocks.sort(key=lambda x: x["index"])
+
+    page_info = {"para_blocks": page_blocks, "discarded_blocks": discarded_blocks, "page_size": [width, height], "page_idx": page_index}
+    return page_info
+
+
+def result_to_middle_json(model_output_blocks_list, images_list, pdf_doc, image_writer):
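+    """将按页组织的 VLM 模型输出 blocks 与页面图像组装为 middle.json 结构,并按配置执行跨页表格合并与可选的标题分级优化"""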
+    middle_json = {"pdf_info": [], "_backend":"vlm", "_version_name": __version__}
+    for index, page_blocks in enumerate(model_output_blocks_list):
+        page = pdf_doc[index]
+        image_dict = images_list[index]
+        page_info = blocks_to_page_info(page_blocks, image_dict, page, image_writer, index)
+        middle_json["pdf_info"].append(page_info)
+
+    """表格跨页合并"""
+    table_enable = get_table_enable(os.getenv('MINERU_VLM_TABLE_ENABLE', 'True').lower() == 'true')
+    if table_enable:
+        merge_table(middle_json["pdf_info"])
+
+    """llm优化标题分级"""
+    if heading_level_import_success:
+        llm_aided_title_start_time = time.time()
+        llm_aided_title(middle_json["pdf_info"], title_aided_config)
+        logger.info(f'llm aided title time: {round(time.time() - llm_aided_title_start_time, 2)}')
+
+    # 关闭pdf文档
+    pdf_doc.close()
+    return middle_json

+ 0 - 111
mineru/backend/vlm/predictor.py

@@ -1,111 +0,0 @@
-# Copyright (c) Opendatalab. All rights reserved.
-
-import time
-
-from loguru import logger
-
-from .base_predictor import (
-    DEFAULT_MAX_NEW_TOKENS,
-    DEFAULT_NO_REPEAT_NGRAM_SIZE,
-    DEFAULT_PRESENCE_PENALTY,
-    DEFAULT_REPETITION_PENALTY,
-    DEFAULT_TEMPERATURE,
-    DEFAULT_TOP_K,
-    DEFAULT_TOP_P,
-    BasePredictor,
-)
-from .sglang_client_predictor import SglangClientPredictor
-
-hf_loaded = False
-try:
-    from .hf_predictor import HuggingfacePredictor
-
-    hf_loaded = True
-except ImportError as e:
-    logger.warning("hf is not installed. If you are not using transformers, you can ignore this warning.")
-
-engine_loaded = False
-try:
-    from sglang.srt.server_args import ServerArgs
-
-    from .sglang_engine_predictor import SglangEnginePredictor
-
-    engine_loaded = True
-except Exception as e:
-    logger.warning("sglang is not installed. If you are not using sglang, you can ignore this warning.")
-
-
-def get_predictor(
-    backend: str = "sglang-client",
-    model_path: str | None = None,
-    server_url: str | None = None,
-    temperature: float = DEFAULT_TEMPERATURE,
-    top_p: float = DEFAULT_TOP_P,
-    top_k: int = DEFAULT_TOP_K,
-    repetition_penalty: float = DEFAULT_REPETITION_PENALTY,
-    presence_penalty: float = DEFAULT_PRESENCE_PENALTY,
-    no_repeat_ngram_size: int = DEFAULT_NO_REPEAT_NGRAM_SIZE,
-    max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
-    http_timeout: int = 600,
-    **kwargs,
-) -> BasePredictor:
-    start_time = time.time()
-
-    if backend == "transformers":
-        if not model_path:
-            raise ValueError("model_path must be provided for transformers backend.")
-        if not hf_loaded:
-            raise ImportError(
-                "transformers is not installed, so huggingface backend cannot be used. "
-                "If you need to use huggingface backend, please install transformers first."
-            )
-        predictor = HuggingfacePredictor(
-            model_path=model_path,
-            temperature=temperature,
-            top_p=top_p,
-            top_k=top_k,
-            repetition_penalty=repetition_penalty,
-            presence_penalty=presence_penalty,
-            no_repeat_ngram_size=no_repeat_ngram_size,
-            max_new_tokens=max_new_tokens,
-            **kwargs,
-        )
-    elif backend == "sglang-engine":
-        if not model_path:
-            raise ValueError("model_path must be provided for sglang-engine backend.")
-        if not engine_loaded:
-            raise ImportError(
-                "sglang is not installed, so sglang-engine backend cannot be used. "
-                "If you need to use sglang-engine backend for inference, "
-                "please install sglang[all]==0.4.8 or a newer version."
-            )
-        predictor = SglangEnginePredictor(
-            server_args=ServerArgs(model_path, **kwargs),
-            temperature=temperature,
-            top_p=top_p,
-            top_k=top_k,
-            repetition_penalty=repetition_penalty,
-            presence_penalty=presence_penalty,
-            no_repeat_ngram_size=no_repeat_ngram_size,
-            max_new_tokens=max_new_tokens,
-        )
-    elif backend == "sglang-client":
-        if not server_url:
-            raise ValueError("server_url must be provided for sglang-client backend.")
-        predictor = SglangClientPredictor(
-            server_url=server_url,
-            temperature=temperature,
-            top_p=top_p,
-            top_k=top_k,
-            repetition_penalty=repetition_penalty,
-            presence_penalty=presence_penalty,
-            no_repeat_ngram_size=no_repeat_ngram_size,
-            max_new_tokens=max_new_tokens,
-            http_timeout=http_timeout,
-        )
-    else:
-        raise ValueError(f"Unsupported backend: {backend}. Supports: transformers, sglang-engine, sglang-client.")
-
-    elapsed = round(time.time() - start_time, 2)
-    logger.info(f"get_predictor cost: {elapsed}s")
-    return predictor

+ 0 - 443
mineru/backend/vlm/sglang_client_predictor.py

@@ -1,443 +0,0 @@
-import asyncio
-import json
-import re
-from base64 import b64encode
-from typing import AsyncIterable, Iterable, List, Optional, Set, Tuple, Union
-
-import httpx
-
-from .base_predictor import (
-    DEFAULT_MAX_NEW_TOKENS,
-    DEFAULT_NO_REPEAT_NGRAM_SIZE,
-    DEFAULT_PRESENCE_PENALTY,
-    DEFAULT_REPETITION_PENALTY,
-    DEFAULT_TEMPERATURE,
-    DEFAULT_TOP_K,
-    DEFAULT_TOP_P,
-    BasePredictor,
-)
-from .utils import aio_load_resource, load_resource
-
-
-class SglangClientPredictor(BasePredictor):
-    def __init__(
-        self,
-        server_url: str,
-        temperature: float = DEFAULT_TEMPERATURE,
-        top_p: float = DEFAULT_TOP_P,
-        top_k: int = DEFAULT_TOP_K,
-        repetition_penalty: float = DEFAULT_REPETITION_PENALTY,
-        presence_penalty: float = DEFAULT_PRESENCE_PENALTY,
-        no_repeat_ngram_size: int = DEFAULT_NO_REPEAT_NGRAM_SIZE,
-        max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
-        http_timeout: int = 600,
-    ) -> None:
-        super().__init__(
-            temperature=temperature,
-            top_p=top_p,
-            top_k=top_k,
-            repetition_penalty=repetition_penalty,
-            presence_penalty=presence_penalty,
-            no_repeat_ngram_size=no_repeat_ngram_size,
-            max_new_tokens=max_new_tokens,
-        )
-        self.http_timeout = http_timeout
-
-        base_url = self.get_base_url(server_url)
-        self.check_server_health(base_url)
-        self.model_path = self.get_model_path(base_url)
-        self.server_url = f"{base_url}/generate"
-
-    @staticmethod
-    def get_base_url(server_url: str) -> str:
-        matched = re.match(r"^(https?://[^/]+)", server_url)
-        if not matched:
-            raise ValueError(f"Invalid server URL: {server_url}")
-        return matched.group(1)
-
-    def check_server_health(self, base_url: str):
-        try:
-            response = httpx.get(f"{base_url}/health_generate", timeout=self.http_timeout)
-        except httpx.ConnectError:
-            raise RuntimeError(f"Failed to connect to server {base_url}. Please check if the server is running.")
-        if response.status_code != 200:
-            raise RuntimeError(
-                f"Server {base_url} is not healthy. Status code: {response.status_code}, response body: {response.text}"
-            )
-
-    def get_model_path(self, base_url: str) -> str:
-        try:
-            response = httpx.get(f"{base_url}/get_model_info", timeout=self.http_timeout)
-        except httpx.ConnectError:
-            raise RuntimeError(f"Failed to connect to server {base_url}. Please check if the server is running.")
-        if response.status_code != 200:
-            raise RuntimeError(
-                f"Failed to get model info from {base_url}. Status code: {response.status_code}, response body: {response.text}"
-            )
-        return response.json()["model_path"]
-
-    def build_sampling_params(
-        self,
-        temperature: Optional[float],
-        top_p: Optional[float],
-        top_k: Optional[int],
-        repetition_penalty: Optional[float],
-        presence_penalty: Optional[float],
-        no_repeat_ngram_size: Optional[int],
-        max_new_tokens: Optional[int],
-    ) -> dict:
-        if temperature is None:
-            temperature = self.temperature
-        if top_p is None:
-            top_p = self.top_p
-        if top_k is None:
-            top_k = self.top_k
-        if repetition_penalty is None:
-            repetition_penalty = self.repetition_penalty
-        if presence_penalty is None:
-            presence_penalty = self.presence_penalty
-        if no_repeat_ngram_size is None:
-            no_repeat_ngram_size = self.no_repeat_ngram_size
-        if max_new_tokens is None:
-            max_new_tokens = self.max_new_tokens
-
-        # see SamplingParams for more details
-        return {
-            "temperature": temperature,
-            "top_p": top_p,
-            "top_k": top_k,
-            "repetition_penalty": repetition_penalty,
-            "presence_penalty": presence_penalty,
-            "custom_params": {
-                "no_repeat_ngram_size": no_repeat_ngram_size,
-            },
-            "max_new_tokens": max_new_tokens,
-            "skip_special_tokens": False,
-        }
-
-    def build_request_body(
-        self,
-        image: bytes,
-        prompt: str,
-        sampling_params: dict,
-    ) -> dict:
-        image_base64 = b64encode(image).decode("utf-8")
-        return {
-            "text": prompt,
-            "image_data": image_base64,
-            "sampling_params": sampling_params,
-            "modalities": ["image"],
-        }
-
-    def predict(
-        self,
-        image: str | bytes,
-        prompt: str = "",
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        top_k: Optional[int] = None,
-        repetition_penalty: Optional[float] = None,
-        presence_penalty: Optional[float] = None,
-        no_repeat_ngram_size: Optional[int] = None,
-        max_new_tokens: Optional[int] = None,
-    ) -> str:
-        prompt = self.build_prompt(prompt)
-
-        sampling_params = self.build_sampling_params(
-            temperature=temperature,
-            top_p=top_p,
-            top_k=top_k,
-            repetition_penalty=repetition_penalty,
-            presence_penalty=presence_penalty,
-            no_repeat_ngram_size=no_repeat_ngram_size,
-            max_new_tokens=max_new_tokens,
-        )
-
-        if isinstance(image, str):
-            image = load_resource(image)
-
-        request_body = self.build_request_body(image, prompt, sampling_params)
-        response = httpx.post(self.server_url, json=request_body, timeout=self.http_timeout)
-        response_body = response.json()
-        return response_body["text"]
-
-    def batch_predict(
-        self,
-        images: List[str] | List[bytes],
-        prompts: Union[List[str], str] = "",
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        top_k: Optional[int] = None,
-        repetition_penalty: Optional[float] = None,
-        presence_penalty: Optional[float] = None,
-        no_repeat_ngram_size: Optional[int] = None,
-        max_new_tokens: Optional[int] = None,
-        max_concurrency: int = 100,
-    ) -> List[str]:
-        try:
-            loop = asyncio.get_running_loop()
-        except RuntimeError:
-            loop = None
-
-        task = self.aio_batch_predict(
-            images=images,
-            prompts=prompts,
-            temperature=temperature,
-            top_p=top_p,
-            top_k=top_k,
-            repetition_penalty=repetition_penalty,
-            presence_penalty=presence_penalty,
-            no_repeat_ngram_size=no_repeat_ngram_size,
-            max_new_tokens=max_new_tokens,
-            max_concurrency=max_concurrency,
-        )
-
-        if loop is not None:
-            return loop.run_until_complete(task)
-        else:
-            return asyncio.run(task)
-
-    def stream_predict(
-        self,
-        image: str | bytes,
-        prompt: str = "",
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        top_k: Optional[int] = None,
-        repetition_penalty: Optional[float] = None,
-        presence_penalty: Optional[float] = None,
-        no_repeat_ngram_size: Optional[int] = None,
-        max_new_tokens: Optional[int] = None,
-    ) -> Iterable[str]:
-        prompt = self.build_prompt(prompt)
-
-        sampling_params = self.build_sampling_params(
-            temperature=temperature,
-            top_p=top_p,
-            top_k=top_k,
-            repetition_penalty=repetition_penalty,
-            presence_penalty=presence_penalty,
-            no_repeat_ngram_size=no_repeat_ngram_size,
-            max_new_tokens=max_new_tokens,
-        )
-
-        if isinstance(image, str):
-            image = load_resource(image)
-
-        request_body = self.build_request_body(image, prompt, sampling_params)
-        request_body["stream"] = True
-
-        with httpx.stream(
-            "POST",
-            self.server_url,
-            json=request_body,
-            timeout=self.http_timeout,
-        ) as response:
-            pos = 0
-            for chunk in response.iter_lines():
-                if not (chunk or "").startswith("data:"):
-                    continue
-                if chunk == "data: [DONE]":
-                    break
-                data = json.loads(chunk[5:].strip("\n"))
-                chunk_text = data["text"][pos:]
-                # meta_info = data["meta_info"]
-                pos += len(chunk_text)
-                yield chunk_text
-
-    async def aio_predict(
-        self,
-        image: str | bytes,
-        prompt: str = "",
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        top_k: Optional[int] = None,
-        repetition_penalty: Optional[float] = None,
-        presence_penalty: Optional[float] = None,
-        no_repeat_ngram_size: Optional[int] = None,
-        max_new_tokens: Optional[int] = None,
-        async_client: Optional[httpx.AsyncClient] = None,
-    ) -> str:
-        prompt = self.build_prompt(prompt)
-
-        sampling_params = self.build_sampling_params(
-            temperature=temperature,
-            top_p=top_p,
-            top_k=top_k,
-            repetition_penalty=repetition_penalty,
-            presence_penalty=presence_penalty,
-            no_repeat_ngram_size=no_repeat_ngram_size,
-            max_new_tokens=max_new_tokens,
-        )
-
-        if isinstance(image, str):
-            image = await aio_load_resource(image)
-
-        request_body = self.build_request_body(image, prompt, sampling_params)
-
-        if async_client is None:
-            async with httpx.AsyncClient(timeout=self.http_timeout) as client:
-                response = await client.post(self.server_url, json=request_body)
-                response_body = response.json()
-        else:
-            response = await async_client.post(self.server_url, json=request_body)
-            response_body = response.json()
-
-        return response_body["text"]
-
-    async def aio_batch_predict(
-        self,
-        images: List[str] | List[bytes],
-        prompts: Union[List[str], str] = "",
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        top_k: Optional[int] = None,
-        repetition_penalty: Optional[float] = None,
-        presence_penalty: Optional[float] = None,
-        no_repeat_ngram_size: Optional[int] = None,
-        max_new_tokens: Optional[int] = None,
-        max_concurrency: int = 100,
-    ) -> List[str]:
-        if not isinstance(prompts, list):
-            prompts = [prompts] * len(images)
-
-        assert len(prompts) == len(images), "Length of prompts and images must match."
-
-        semaphore = asyncio.Semaphore(max_concurrency)
-        outputs = [""] * len(images)
-
-        async def predict_with_semaphore(
-            idx: int,
-            image: str | bytes,
-            prompt: str,
-            async_client: httpx.AsyncClient,
-        ):
-            async with semaphore:
-                output = await self.aio_predict(
-                    image=image,
-                    prompt=prompt,
-                    temperature=temperature,
-                    top_p=top_p,
-                    top_k=top_k,
-                    repetition_penalty=repetition_penalty,
-                    presence_penalty=presence_penalty,
-                    no_repeat_ngram_size=no_repeat_ngram_size,
-                    max_new_tokens=max_new_tokens,
-                    async_client=async_client,
-                )
-                outputs[idx] = output
-
-        async with httpx.AsyncClient(timeout=self.http_timeout) as client:
-            tasks = []
-            for idx, (prompt, image) in enumerate(zip(prompts, images)):
-                tasks.append(predict_with_semaphore(idx, image, prompt, client))
-            await asyncio.gather(*tasks)
-
-        return outputs
-
-    async def aio_batch_predict_as_iter(
-        self,
-        images: List[str] | List[bytes],
-        prompts: Union[List[str], str] = "",
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        top_k: Optional[int] = None,
-        repetition_penalty: Optional[float] = None,
-        presence_penalty: Optional[float] = None,
-        no_repeat_ngram_size: Optional[int] = None,
-        max_new_tokens: Optional[int] = None,
-        max_concurrency: int = 100,
-    ) -> AsyncIterable[Tuple[int, str]]:
-        if not isinstance(prompts, list):
-            prompts = [prompts] * len(images)
-
-        assert len(prompts) == len(images), "Length of prompts and images must match."
-
-        semaphore = asyncio.Semaphore(max_concurrency)
-
-        async def predict_with_semaphore(
-            idx: int,
-            image: str | bytes,
-            prompt: str,
-            async_client: httpx.AsyncClient,
-        ):
-            async with semaphore:
-                output = await self.aio_predict(
-                    image=image,
-                    prompt=prompt,
-                    temperature=temperature,
-                    top_p=top_p,
-                    top_k=top_k,
-                    repetition_penalty=repetition_penalty,
-                    presence_penalty=presence_penalty,
-                    no_repeat_ngram_size=no_repeat_ngram_size,
-                    max_new_tokens=max_new_tokens,
-                    async_client=async_client,
-                )
-                return (idx, output)
-
-        async with httpx.AsyncClient(timeout=self.http_timeout) as client:
-            pending: Set[asyncio.Task[Tuple[int, str]]] = set()
-
-            for idx, (prompt, image) in enumerate(zip(prompts, images)):
-                pending.add(
-                    asyncio.create_task(
-                        predict_with_semaphore(idx, image, prompt, client),
-                    )
-                )
-
-            while len(pending) > 0:
-                done, pending = await asyncio.wait(
-                    pending,
-                    return_when=asyncio.FIRST_COMPLETED,
-                )
-                for task in done:
-                    yield task.result()
-
-    async def aio_stream_predict(
-        self,
-        image: str | bytes,
-        prompt: str = "",
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        top_k: Optional[int] = None,
-        repetition_penalty: Optional[float] = None,
-        presence_penalty: Optional[float] = None,
-        no_repeat_ngram_size: Optional[int] = None,
-        max_new_tokens: Optional[int] = None,
-    ) -> AsyncIterable[str]:
-        prompt = self.build_prompt(prompt)
-
-        sampling_params = self.build_sampling_params(
-            temperature=temperature,
-            top_p=top_p,
-            top_k=top_k,
-            repetition_penalty=repetition_penalty,
-            presence_penalty=presence_penalty,
-            no_repeat_ngram_size=no_repeat_ngram_size,
-            max_new_tokens=max_new_tokens,
-        )
-
-        if isinstance(image, str):
-            image = await aio_load_resource(image)
-
-        request_body = self.build_request_body(image, prompt, sampling_params)
-        request_body["stream"] = True
-
-        async with httpx.AsyncClient(timeout=self.http_timeout) as client:
-            async with client.stream(
-                "POST",
-                self.server_url,
-                json=request_body,
-            ) as response:
-                pos = 0
-                async for chunk in response.aiter_lines():
-                    if not (chunk or "").startswith("data:"):
-                        continue
-                    if chunk == "data: [DONE]":
-                        break
-                    data = json.loads(chunk[5:].strip("\n"))
-                    chunk_text = data["text"][pos:]
-                    # meta_info = data["meta_info"]
-                    pos += len(chunk_text)
-                    yield chunk_text

+ 0 - 246
mineru/backend/vlm/sglang_engine_predictor.py

@@ -1,246 +0,0 @@
-from base64 import b64encode
-from typing import AsyncIterable, Iterable, List, Optional, Union
-
-from sglang.srt.server_args import ServerArgs
-
-from ...model.vlm_sglang_model.engine import BatchEngine
-from .base_predictor import (
-    DEFAULT_MAX_NEW_TOKENS,
-    DEFAULT_NO_REPEAT_NGRAM_SIZE,
-    DEFAULT_PRESENCE_PENALTY,
-    DEFAULT_REPETITION_PENALTY,
-    DEFAULT_TEMPERATURE,
-    DEFAULT_TOP_K,
-    DEFAULT_TOP_P,
-    BasePredictor,
-)
-
-
-class SglangEnginePredictor(BasePredictor):
-    def __init__(
-        self,
-        server_args: ServerArgs,
-        temperature: float = DEFAULT_TEMPERATURE,
-        top_p: float = DEFAULT_TOP_P,
-        top_k: int = DEFAULT_TOP_K,
-        repetition_penalty: float = DEFAULT_REPETITION_PENALTY,
-        presence_penalty: float = DEFAULT_PRESENCE_PENALTY,
-        no_repeat_ngram_size: int = DEFAULT_NO_REPEAT_NGRAM_SIZE,
-        max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
-    ) -> None:
-        super().__init__(
-            temperature=temperature,
-            top_p=top_p,
-            top_k=top_k,
-            repetition_penalty=repetition_penalty,
-            presence_penalty=presence_penalty,
-            no_repeat_ngram_size=no_repeat_ngram_size,
-            max_new_tokens=max_new_tokens,
-        )
-        self.engine = BatchEngine(server_args=server_args)
-
-    def load_image_string(self, image: str | bytes) -> str:
-        if not isinstance(image, (str, bytes)):
-            raise ValueError("Image must be a string or bytes.")
-        if isinstance(image, bytes):
-            return b64encode(image).decode("utf-8")
-        if image.startswith("file://"):
-            return image[len("file://") :]
-        return image
-
-    def predict(
-        self,
-        image: str | bytes,
-        prompt: str = "",
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        top_k: Optional[int] = None,
-        repetition_penalty: Optional[float] = None,
-        presence_penalty: Optional[float] = None,
-        no_repeat_ngram_size: Optional[int] = None,
-        max_new_tokens: Optional[int] = None,
-    ) -> str:
-        return self.batch_predict(
-            [image],  # type: ignore
-            [prompt],
-            temperature=temperature,
-            top_p=top_p,
-            top_k=top_k,
-            repetition_penalty=repetition_penalty,
-            presence_penalty=presence_penalty,
-            no_repeat_ngram_size=no_repeat_ngram_size,
-            max_new_tokens=max_new_tokens,
-        )[0]
-
-    def batch_predict(
-        self,
-        images: List[str] | List[bytes],
-        prompts: Union[List[str], str] = "",
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        top_k: Optional[int] = None,
-        repetition_penalty: Optional[float] = None,
-        presence_penalty: Optional[float] = None,
-        no_repeat_ngram_size: Optional[int] = None,
-        max_new_tokens: Optional[int] = None,
-    ) -> List[str]:
-
-        if not isinstance(prompts, list):
-            prompts = [prompts] * len(images)
-
-        assert len(prompts) == len(images), "Length of prompts and images must match."
-        prompts = [self.build_prompt(prompt) for prompt in prompts]
-
-        if temperature is None:
-            temperature = self.temperature
-        if top_p is None:
-            top_p = self.top_p
-        if top_k is None:
-            top_k = self.top_k
-        if repetition_penalty is None:
-            repetition_penalty = self.repetition_penalty
-        if presence_penalty is None:
-            presence_penalty = self.presence_penalty
-        if no_repeat_ngram_size is None:
-            no_repeat_ngram_size = self.no_repeat_ngram_size
-        if max_new_tokens is None:
-            max_new_tokens = self.max_new_tokens
-
-        # see SamplingParams for more details
-        sampling_params = {
-            "temperature": temperature,
-            "top_p": top_p,
-            "top_k": top_k,
-            "repetition_penalty": repetition_penalty,
-            "presence_penalty": presence_penalty,
-            "custom_params": {
-                "no_repeat_ngram_size": no_repeat_ngram_size,
-            },
-            "max_new_tokens": max_new_tokens,
-            "skip_special_tokens": False,
-        }
-
-        image_strings = [self.load_image_string(img) for img in images]
-
-        output = self.engine.generate(
-            prompt=prompts,
-            image_data=image_strings,
-            sampling_params=sampling_params,
-        )
-        return [item["text"] for item in output]
-
-    def stream_predict(
-        self,
-        image: str | bytes,
-        prompt: str = "",
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        top_k: Optional[int] = None,
-        repetition_penalty: Optional[float] = None,
-        presence_penalty: Optional[float] = None,
-        no_repeat_ngram_size: Optional[int] = None,
-        max_new_tokens: Optional[int] = None,
-    ) -> Iterable[str]:
-        raise NotImplementedError("Streaming is not supported yet.")
-
-    async def aio_predict(
-        self,
-        image: str | bytes,
-        prompt: str = "",
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        top_k: Optional[int] = None,
-        repetition_penalty: Optional[float] = None,
-        presence_penalty: Optional[float] = None,
-        no_repeat_ngram_size: Optional[int] = None,
-        max_new_tokens: Optional[int] = None,
-    ) -> str:
-        output = await self.aio_batch_predict(
-            [image],  # type: ignore
-            [prompt],
-            temperature=temperature,
-            top_p=top_p,
-            top_k=top_k,
-            repetition_penalty=repetition_penalty,
-            presence_penalty=presence_penalty,
-            no_repeat_ngram_size=no_repeat_ngram_size,
-            max_new_tokens=max_new_tokens,
-        )
-        return output[0]
-
-    async def aio_batch_predict(
-        self,
-        images: List[str] | List[bytes],
-        prompts: Union[List[str], str] = "",
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        top_k: Optional[int] = None,
-        repetition_penalty: Optional[float] = None,
-        presence_penalty: Optional[float] = None,
-        no_repeat_ngram_size: Optional[int] = None,
-        max_new_tokens: Optional[int] = None,
-    ) -> List[str]:
-
-        if not isinstance(prompts, list):
-            prompts = [prompts] * len(images)
-
-        assert len(prompts) == len(images), "Length of prompts and images must match."
-        prompts = [self.build_prompt(prompt) for prompt in prompts]
-
-        if temperature is None:
-            temperature = self.temperature
-        if top_p is None:
-            top_p = self.top_p
-        if top_k is None:
-            top_k = self.top_k
-        if repetition_penalty is None:
-            repetition_penalty = self.repetition_penalty
-        if presence_penalty is None:
-            presence_penalty = self.presence_penalty
-        if no_repeat_ngram_size is None:
-            no_repeat_ngram_size = self.no_repeat_ngram_size
-        if max_new_tokens is None:
-            max_new_tokens = self.max_new_tokens
-
-        # see SamplingParams for more details
-        sampling_params = {
-            "temperature": temperature,
-            "top_p": top_p,
-            "top_k": top_k,
-            "repetition_penalty": repetition_penalty,
-            "presence_penalty": presence_penalty,
-            "custom_params": {
-                "no_repeat_ngram_size": no_repeat_ngram_size,
-            },
-            "max_new_tokens": max_new_tokens,
-            "skip_special_tokens": False,
-        }
-
-        image_strings = [self.load_image_string(img) for img in images]
-
-        output = await self.engine.async_generate(
-            prompt=prompts,
-            image_data=image_strings,
-            sampling_params=sampling_params,
-        )
-        ret = []
-        for item in output:  # type: ignore
-            ret.append(item["text"])
-        return ret
-
-    async def aio_stream_predict(
-        self,
-        image: str | bytes,
-        prompt: str = "",
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        top_k: Optional[int] = None,
-        repetition_penalty: Optional[float] = None,
-        presence_penalty: Optional[float] = None,
-        no_repeat_ngram_size: Optional[int] = None,
-        max_new_tokens: Optional[int] = None,
-    ) -> AsyncIterable[str]:
-        raise NotImplementedError("Streaming is not supported yet.")
-
-    def close(self):
-        self.engine.shutdown()

Diff file is too large
+ 0 - 114
mineru/backend/vlm/token_to_middle_json.py


+ 0 - 40
mineru/backend/vlm/utils.py

@@ -1,40 +0,0 @@
-import os
-import re
-from base64 import b64decode
-
-import httpx
-
-_timeout = int(os.getenv("REQUEST_TIMEOUT", "3"))
-_file_exts = (".png", ".jpg", ".jpeg", ".webp", ".gif", ".pdf")
-_data_uri_regex = re.compile(r"^data:[^;,]+;base64,")
-
-
-def load_resource(uri: str) -> bytes:
-    if uri.startswith("http://") or uri.startswith("https://"):
-        response = httpx.get(uri, timeout=_timeout)
-        return response.content
-    if uri.startswith("file://"):
-        with open(uri[len("file://") :], "rb") as file:
-            return file.read()
-    if uri.lower().endswith(_file_exts):
-        with open(uri, "rb") as file:
-            return file.read()
-    if re.match(_data_uri_regex, uri):
-        return b64decode(uri.split(",")[1])
-    return b64decode(uri)
-
-
-async def aio_load_resource(uri: str) -> bytes:
-    if uri.startswith("http://") or uri.startswith("https://"):
-        async with httpx.AsyncClient(timeout=_timeout) as client:
-            response = await client.get(uri)
-            return response.content
-    if uri.startswith("file://"):
-        with open(uri[len("file://") :], "rb") as file:
-            return file.read()
-    if uri.lower().endswith(_file_exts):
-        with open(uri, "rb") as file:
-            return file.read()
-    if re.match(_data_uri_regex, uri):
-        return b64decode(uri.split(",")[1])
-    return b64decode(uri)

+ 65 - 16
mineru/backend/vlm/vlm_analyze.py

@@ -3,14 +3,15 @@ import time
 
 from loguru import logger
 
+from .model_output_to_middle_json import result_to_middle_json
 from ...data.data_reader_writer import DataWriter
 from mineru.utils.pdf_image_tools import load_images_from_pdf
-from .base_predictor import BasePredictor
-from .predictor import get_predictor
-from .token_to_middle_json import result_to_middle_json
+
 from ...utils.enum_class import ImageType
 from ...utils.models_download_utils import auto_download_and_get_model_root_path
 
+from mineru_vl_utils import MinerUClient
+
 
 class ModelSingleton:
     _instance = None
@@ -27,24 +28,72 @@ class ModelSingleton:
         model_path: str | None,
         server_url: str | None,
         **kwargs,
-    ) -> BasePredictor:
+    ) -> MinerUClient:
         key = (backend, model_path, server_url)
         if key not in self._models:
-            if backend in ['transformers', 'sglang-engine'] and not model_path:
+            start_time = time.time()
+            model = None
+            processor = None
+            vllm_llm = None
+            if backend in ['transformers', 'vllm-engine'] and not model_path:
                 model_path = auto_download_and_get_model_root_path("/","vlm")
-            self._models[key] = get_predictor(
+                if backend == "transformers":
+                    if not model_path:
+                        raise ValueError("model_path must be provided when model or processor is None.")
+
+                    try:
+                        from transformers import (
+                            AutoProcessor,
+                            Qwen2VLForConditionalGeneration,
+                        )
+                        from transformers import __version__ as transformers_version
+                    except ImportError:
+                        raise ImportError("Please install transformers to use the transformers backend.")
+
+                    from packaging import version
+                    if version.parse(transformers_version) >= version.parse("4.56.0"):
+                        dtype_key = "dtype"
+                    else:
+                        dtype_key = "torch_dtype"
+                    model = Qwen2VLForConditionalGeneration.from_pretrained(
+                        model_path,
+                        device_map="auto",
+                        **{dtype_key: "auto"},  # type: ignore
+                    )
+                    processor = AutoProcessor.from_pretrained(
+                        model_path,
+                        use_fast=True,
+                    )
+                elif backend == "vllm-engine":
+                    if not model_path:
+                        raise ValueError("model_path must be provided when vllm_llm is None.")
+                    try:
+                        import vllm
+                    except ImportError:
+                        raise ImportError("Please install vllm to use the vllm-engine backend.")
+                    # logger.debug(kwargs)
+                    if "gpu_memory_utilization" not in kwargs:
+                        kwargs["gpu_memory_utilization"] = 0.5
+                    if "model" not in kwargs:
+                        kwargs["model"] = model_path
+                    # Use kwargs as the vllm initialization parameters
+                    vllm_llm = vllm.LLM(**kwargs)
+            self._models[key] = MinerUClient(
                 backend=backend,
-                model_path=model_path,
+                model=model,
+                processor=processor,
+                vllm_llm=vllm_llm,
                 server_url=server_url,
-                **kwargs,
             )
+            elapsed = round(time.time() - start_time, 2)
+            logger.info(f"get {backend} predictor cost: {elapsed}s")
         return self._models[key]
 
 
 def doc_analyze(
     pdf_bytes,
     image_writer: DataWriter | None,
-    predictor: BasePredictor | None = None,
+    predictor: MinerUClient | None = None,
     backend="transformers",
     model_path: str | None = None,
     server_url: str | None = None,
@@ -54,13 +103,13 @@ def doc_analyze(
         predictor = ModelSingleton().get_model(backend, model_path, server_url, **kwargs)
 
     # load_images_start = time.time()
-    images_list, pdf_doc = load_images_from_pdf(pdf_bytes, image_type=ImageType.BASE64)
-    images_base64_list = [image_dict["img_base64"] for image_dict in images_list]
+    images_list, pdf_doc = load_images_from_pdf(pdf_bytes, image_type=ImageType.PIL)
+    images_pil_list = [image_dict["img_pil"] for image_dict in images_list]
     # load_images_time = round(time.time() - load_images_start, 2)
     # logger.info(f"load images cost: {load_images_time}, speed: {round(len(images_base64_list)/load_images_time, 3)} images/s")
 
     # infer_start = time.time()
-    results = predictor.batch_predict(images=images_base64_list)
+    results = predictor.batch_two_step_extract(images=images_pil_list)
     # infer_time = round(time.time() - infer_start, 2)
     # logger.info(f"infer finished, cost: {infer_time}, speed: {round(len(results)/infer_time, 3)} page/s")
 
@@ -71,7 +120,7 @@ def doc_analyze(
 async def aio_doc_analyze(
     pdf_bytes,
     image_writer: DataWriter | None,
-    predictor: BasePredictor | None = None,
+    predictor: MinerUClient | None = None,
     backend="transformers",
     model_path: str | None = None,
     server_url: str | None = None,
@@ -81,13 +130,13 @@ async def aio_doc_analyze(
         predictor = ModelSingleton().get_model(backend, model_path, server_url, **kwargs)
 
     # load_images_start = time.time()
-    images_list, pdf_doc = load_images_from_pdf(pdf_bytes, image_type=ImageType.BASE64)
-    images_base64_list = [image_dict["img_base64"] for image_dict in images_list]
+    images_list, pdf_doc = load_images_from_pdf(pdf_bytes, image_type=ImageType.PIL)
+    images_pil_list = [image_dict["img_pil"] for image_dict in images_list]
     # load_images_time = round(time.time() - load_images_start, 2)
     # logger.info(f"load images cost: {load_images_time}, speed: {round(len(images_base64_list)/load_images_time, 3)} images/s")
 
     # infer_start = time.time()
-    results = await predictor.aio_batch_predict(images=images_base64_list)
+    results = await predictor.aio_batch_two_step_extract(images=images_pil_list)
     # infer_time = round(time.time() - infer_start, 2)
     # logger.info(f"infer finished, cost: {infer_time}, speed: {round(len(results)/infer_time, 3)} page/s")
     middle_json = result_to_middle_json(results, images_list, pdf_doc, image_writer)
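
Taken together, the rewritten vlm_analyze path delegates inference to MinerUClient instead of the removed in-house predictors. Below is a minimal usage sketch, not part of the diff: it assumes doc_analyze returns the middle JSON the same way the async variant above does, and that FileBasedDataWriter is importable as in the MinerU demo code; the file paths are placeholders.

# Sketch only, under the assumptions stated above.
from mineru.backend.vlm.vlm_analyze import doc_analyze
from mineru.data.data_reader_writer import FileBasedDataWriter  # assumed writer, as used in the demos

with open("sample.pdf", "rb") as f:  # placeholder input document
    pdf_bytes = f.read()

image_writer = FileBasedDataWriter("output/images")  # directory for extracted images
# Backend names follow the diff ("transformers" or "vllm-engine"); extra kwargs are forwarded to vllm.LLM(...).
middle_json = doc_analyze(pdf_bytes, image_writer, backend="vllm-engine", gpu_memory_utilization=0.5)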

+ 201 - 135
mineru/backend/vlm/vlm_magic_model.py

@@ -3,46 +3,37 @@ from typing import Literal
 
 from loguru import logger
 
-from mineru.utils.enum_class import ContentType, BlockType, SplitFlag
-from mineru.backend.vlm.vlm_middle_json_mkcontent import merge_para_with_text
-from mineru.utils.format_utils import block_content_to_html
+from mineru.utils.boxbase import calculate_overlap_area_in_bbox1_area_ratio
+from mineru.utils.enum_class import ContentType, BlockType
+from mineru.utils.guess_suffix_or_lang import guess_language_by_text
 from mineru.utils.magic_model_utils import reduct_overlap, tie_up_category_by_distance_v3
 
 
 class MagicModel:
-    def __init__(self, token: str, width, height):
-        self.token = token
-
-        # Find all blocks using a regular expression
-        pattern = (
-            r"<\|box_start\|>(.*?)<\|box_end\|><\|ref_start\|>(.*?)<\|ref_end\|><\|md_start\|>(.*?)(?:<\|md_end\|>|<\|im_end\|>)"
-        )
-        block_infos = re.findall(pattern, token, re.DOTALL)
+    def __init__(self, page_blocks: list, width, height):
+        self.page_blocks = page_blocks
 
         blocks = []
         self.all_spans = []
         # Parse each block
-        for index, block_info in enumerate(block_infos):
-            block_bbox = block_info[0].strip()
+        for index, block_info in enumerate(page_blocks):
+            block_bbox = block_info["bbox"]
             try:
-                x1, y1, x2, y2 = map(int, block_bbox.split())
+                x1, y1, x2, y2 = block_bbox
                 x_1, y_1, x_2, y_2 = (
-                    int(x1 * width / 1000),
-                    int(y1 * height / 1000),
-                    int(x2 * width / 1000),
-                    int(y2 * height / 1000),
+                    int(x1 * width),
+                    int(y1 * height),
+                    int(x2 * width),
+                    int(y2 * height),
                 )
                 if x_2 < x_1:
                     x_1, x_2 = x_2, x_1
                 if y_2 < y_1:
                     y_1, y_2 = y_2, y_1
                 block_bbox = (x_1, y_1, x_2, y_2)
-                block_type = block_info[1].strip()
-                block_content = block_info[2].strip()
-
-                # If the bbox is 0,0,999,999 and the type is text, handle it as notes by converting the content to an HTML table
-                if x1 == 0 and y1 == 0 and x2 == 999 and y2 == 999 and block_type == "text":
-                    block_content = block_content_to_html(block_content)
+                block_type = block_info["type"]
+                block_content = block_info["content"]
+                block_angle = block_info["angle"]
 
                 # print(f"坐标: {block_bbox}")
                 # print(f"类型: {block_type}")
@@ -54,6 +45,9 @@ class MagicModel:
                 continue
 
             span_type = "unknown"
+            line_type = None
+            guess_lang = None
+
             if block_type in [
                 "text",
                 "title",
@@ -61,8 +55,15 @@ class MagicModel:
                 "image_footnote",
                 "table_caption",
                 "table_footnote",
-                "list",
-                "index",
+                "code_caption",
+                "ref_text",
+                "phonetic",
+                "header",
+                "footer",
+                "page_number",
+                "aside_text",
+                "page_footnote",
+                "list"
             ]:
                 span_type = ContentType.TEXT
             elif block_type in ["image"]:
@@ -71,6 +72,12 @@ class MagicModel:
             elif block_type in ["table"]:
                 block_type = BlockType.TABLE_BODY
                 span_type = ContentType.TABLE
+            elif block_type in ["code", "algorithm"]:
+                block_content = code_content_clean(block_content)
+                line_type = block_type
+                block_type = BlockType.CODE_BODY
+                span_type = ContentType.TEXT
+                guess_lang = guess_language_by_text(block_content)
             elif block_type in ["equation"]:
                 block_type = BlockType.INTERLINE_EQUATION
                 span_type = ContentType.INTERLINE_EQUATION
@@ -81,7 +88,7 @@ class MagicModel:
                     "type": span_type,
                 }
                 if span_type == ContentType.TABLE:
-                    span["html"] = block_content_to_html(block_content)
+                    span["html"] = block_content
             elif span_type in [ContentType.INTERLINE_EQUATION]:
                 span = {
                     "bbox": block_bbox,
@@ -89,7 +96,12 @@ class MagicModel:
                     "content": isolated_formula_clean(block_content),
                 }
             else:
-                if block_content.count("\\(") == block_content.count("\\)") and block_content.count("\\(") > 0:
+
+                if block_content:
+                    block_content = clean_content(block_content)
+
+                if block_content and block_content.count("\\(") == block_content.count("\\)") and block_content.count("\\(") > 0:
+
                     # Build the span list containing text and formulas
                     spans = []
                     last_end = 0
@@ -136,25 +148,27 @@ class MagicModel:
                         "content": block_content,
                     }
 
+            # Handle the span type and add it to all_spans
             if isinstance(span, dict) and "bbox" in span:
                 self.all_spans.append(span)
-                line = {
-                    "bbox": block_bbox,
-                    "spans": [span],
-                }
+                spans = [span]
             elif isinstance(span, list):
                 self.all_spans.extend(span)
-                line = {
-                    "bbox": block_bbox,
-                    "spans": span,
-                }
+                spans = span
             else:
                 raise ValueError(f"Invalid span type: {span_type}, expected dict or list, got {type(span)}")
 
+            # Build the line object
+            if block_type in [BlockType.CODE_BODY]:
+                line = {"bbox": block_bbox, "spans": spans, "extra": {"type": line_type, "guess_lang": guess_lang}}
+            else:
+                line = {"bbox": block_bbox, "spans": spans}
+
             blocks.append(
                 {
                     "bbox": block_bbox,
                     "type": block_type,
+                    "angle": block_angle,
                     "lines": [line],
                     "index": index,
                 }
@@ -165,35 +179,87 @@ class MagicModel:
         self.interline_equation_blocks = []
         self.text_blocks = []
         self.title_blocks = []
+        self.code_blocks = []
+        self.discarded_blocks = []
+        self.ref_text_blocks = []
+        self.phonetic_blocks = []
+        self.list_blocks = []
         for block in blocks:
             if block["type"] in [BlockType.IMAGE_BODY, BlockType.IMAGE_CAPTION, BlockType.IMAGE_FOOTNOTE]:
                 self.image_blocks.append(block)
             elif block["type"] in [BlockType.TABLE_BODY, BlockType.TABLE_CAPTION, BlockType.TABLE_FOOTNOTE]:
                 self.table_blocks.append(block)
+            elif block["type"] in [BlockType.CODE_BODY, BlockType.CODE_CAPTION]:
+                self.code_blocks.append(block)
             elif block["type"] == BlockType.INTERLINE_EQUATION:
                 self.interline_equation_blocks.append(block)
             elif block["type"] == BlockType.TEXT:
                 self.text_blocks.append(block)
             elif block["type"] == BlockType.TITLE:
                 self.title_blocks.append(block)
+            elif block["type"] in [BlockType.REF_TEXT]:
+                self.ref_text_blocks.append(block)
+            elif block["type"] in [BlockType.PHONETIC]:
+                self.phonetic_blocks.append(block)
+            elif block["type"] in [BlockType.HEADER, BlockType.FOOTER, BlockType.PAGE_NUMBER, BlockType.ASIDE_TEXT, BlockType.PAGE_FOOTNOTE]:
+                self.discarded_blocks.append(block)
+            elif block["type"] == BlockType.LIST:
+                self.list_blocks.append(block)
             else:
                 continue
 
+        self.list_blocks, self.text_blocks, self.ref_text_blocks = fix_list_blocks(self.list_blocks, self.text_blocks, self.ref_text_blocks)
+        self.image_blocks, not_include_image_blocks = fix_two_layer_blocks(self.image_blocks, BlockType.IMAGE)
+        self.table_blocks, not_include_table_blocks = fix_two_layer_blocks(self.table_blocks, BlockType.TABLE)
+        self.code_blocks, not_include_code_blocks = fix_two_layer_blocks(self.code_blocks, BlockType.CODE)
+        for code_block in self.code_blocks:
+            for block in code_block['blocks']:
+                if block['type'] == BlockType.CODE_BODY:
+                    if len(block["lines"]) > 0:
+                        line = block["lines"][0]
+                        code_block["sub_type"] = line["extra"]["type"]
+                        if code_block["sub_type"] in ["code"]:
+                            code_block["guess_lang"] = line["extra"]["guess_lang"]
+                        del line["extra"]
+                    else:
+                        code_block["sub_type"] = "code"
+                        code_block["guess_lang"] = "txt"
+
+        for block in not_include_image_blocks + not_include_table_blocks + not_include_code_blocks:
+            block["type"] = BlockType.TEXT
+            self.text_blocks.append(block)
+
+
+    def get_list_blocks(self):
+        return self.list_blocks
+
     def get_image_blocks(self):
-        return fix_two_layer_blocks(self.image_blocks, BlockType.IMAGE)
+        return self.image_blocks
 
     def get_table_blocks(self):
-        return fix_two_layer_blocks(self.table_blocks, BlockType.TABLE)
+        return self.table_blocks
+
+    def get_code_blocks(self):
+        return self.code_blocks
+
+    def get_ref_text_blocks(self):
+        return self.ref_text_blocks
+
+    def get_phonetic_blocks(self):
+        return self.phonetic_blocks
 
     def get_title_blocks(self):
-        return fix_title_blocks(self.title_blocks)
+        return self.title_blocks
 
     def get_text_blocks(self):
-        return fix_text_blocks(self.text_blocks)
+        return self.text_blocks
 
     def get_interline_equation_blocks(self):
         return self.interline_equation_blocks
 
+    def get_discarded_blocks(self):
+        return self.discarded_blocks
+
     def get_all_spans(self):
         return self.all_spans
 
@@ -202,48 +268,46 @@ def isolated_formula_clean(txt):
     latex = txt[:]
     if latex.startswith("\\["): latex = latex[2:]
     if latex.endswith("\\]"): latex = latex[:-2]
-    latex = latex_fix(latex.strip())
+    latex = latex.strip()
     return latex
 
 
-def latex_fix(latex):
-    # valid pairs:
-    # \left\{ ... \right\}
-    # \left( ... \right)
-    # \left| ... \right|
-    # \left\| ... \right\|
-    # \left[ ... \right]
-
-    LEFT_COUNT_PATTERN = re.compile(r'\\left(?![a-zA-Z])')
-    RIGHT_COUNT_PATTERN = re.compile(r'\\right(?![a-zA-Z])')
-    left_count = len(LEFT_COUNT_PATTERN.findall(latex))  # does not match \lefteqn etc.
-    right_count = len(RIGHT_COUNT_PATTERN.findall(latex))  # does not match \rightarrow
-
-    if left_count != right_count:
-        for _ in range(2):
-            # replace valid pairs
-            latex = re.sub(r'\\left\\\{', "{", latex) # \left\{
-            latex = re.sub(r"\\left\|", "|", latex) # \left|
-            latex = re.sub(r"\\left\\\|", "|", latex) # \left\|
-            latex = re.sub(r"\\left\(", "(", latex) # \left(
-            latex = re.sub(r"\\left\[", "[", latex) # \left[
-
-            latex = re.sub(r"\\right\\\}", "}", latex) # \right\}
-            latex = re.sub(r"\\right\|", "|", latex) # \right|
-            latex = re.sub(r"\\right\\\|", "|", latex) # \right\|
-            latex = re.sub(r"\\right\)", ")", latex) # \right)
-            latex = re.sub(r"\\right\]", "]", latex) # \right]
-            latex = re.sub(r"\\right\.", "", latex) # \right.
-
-            # replace invalid pairs first
-            latex = re.sub(r'\\left\{', "{", latex)
-            latex = re.sub(r'\\right\}', "}", latex) # \left{ ... \right}
-            latex = re.sub(r'\\left\\\(', "(", latex)
-            latex = re.sub(r'\\right\\\)', ")", latex) # \left\( ... \right\)
-            latex = re.sub(r'\\left\\\[', "[", latex)
-            latex = re.sub(r'\\right\\\]', "]", latex) # \left\[ ... \right\]
+def code_content_clean(content):
+    """清理代码内容,移除Markdown代码块的开始和结束标记"""
+    if not content:
+        return ""
+
+    lines = content.splitlines()
+    start_idx = 0
+    end_idx = len(lines)
+
+    # Handle the opening triple backticks
+    if lines and lines[0].startswith("```"):
+        start_idx = 1
+
+    # Handle the closing triple backticks
+    if lines and end_idx > start_idx and lines[end_idx - 1].strip() == "```":
+        end_idx -= 1
+
+    # Only perform the join when there is content left
+    if start_idx < end_idx:
+        return "\n".join(lines[start_idx:end_idx]).strip()
+    return ""
 
-    return latex
+
+def clean_content(content):
+    if content and content.count("\\[") == content.count("\\]") and content.count("\\[") > 0:
+        # Function to handle each match
+        def replace_pattern(match):
+            # Extract content between \[ and \]
+            inner_content = match.group(1)
+            return f"[{inner_content}]"
+
+        # Find all patterns of \[x\] and apply replacement
+        pattern = r'\\\[(.*?)\\\]'
+        content = re.sub(pattern, replace_pattern, content)
+
+    return content
 
 
 def __tie_up_category_by_distance_v3(blocks, subject_block_type, object_block_type):
@@ -252,7 +316,7 @@ def __tie_up_category_by_distance_v3(blocks, subject_block_type, object_block_ty
         return reduct_overlap(
             list(
                 map(
-                    lambda x: {"bbox": x["bbox"], "lines": x["lines"], "index": x["index"]},
+                    lambda x: {"bbox": x["bbox"], "lines": x["lines"], "index": x["index"], "angle":x["angle"]},
                     filter(
                         lambda x: x["type"] == subject_block_type,
                         blocks,
@@ -265,7 +329,7 @@ def __tie_up_category_by_distance_v3(blocks, subject_block_type, object_block_ty
         return reduct_overlap(
             list(
                 map(
-                    lambda x: {"bbox": x["bbox"], "lines": x["lines"], "index": x["index"]},
+                    lambda x: {"bbox": x["bbox"], "lines": x["lines"], "index": x["index"], "angle":x["angle"]},
                     filter(
                         lambda x: x["type"] == object_block_type,
                         blocks,
@@ -281,7 +345,7 @@ def __tie_up_category_by_distance_v3(blocks, subject_block_type, object_block_ty
     )
 
 
-def get_type_blocks(blocks, block_type: Literal["image", "table"]):
+def get_type_blocks(blocks, block_type: Literal["image", "table", "code"]):
     with_captions = __tie_up_category_by_distance_v3(blocks, f"{block_type}_body", f"{block_type}_caption")
     with_footnotes = __tie_up_category_by_distance_v3(blocks, f"{block_type}_body", f"{block_type}_footnote")
     ret = []
@@ -297,9 +361,13 @@ def get_type_blocks(blocks, block_type: Literal["image", "table"]):
     return ret
 
 
-def fix_two_layer_blocks(blocks, fix_type: Literal["image", "table"]):
+def fix_two_layer_blocks(blocks, fix_type: Literal["image", "table", "code"]):
     need_fix_blocks = get_type_blocks(blocks, fix_type)
     fixed_blocks = []
+    not_include_blocks = []
+    processed_indices = set()
+
+    # Handle the blocks that need to be organized into a two-layer structure
     for block in need_fix_blocks:
         body = block[f"{fix_type}_body"]
         caption_list = block[f"{fix_type}_caption_list"]
@@ -308,8 +376,12 @@ def fix_two_layer_blocks(blocks, fix_type: Literal["image", "table"]):
         body["type"] = f"{fix_type}_body"
         for caption in caption_list:
             caption["type"] = f"{fix_type}_caption"
+            processed_indices.add(caption["index"])
         for footnote in footnote_list:
             footnote["type"] = f"{fix_type}_footnote"
+            processed_indices.add(footnote["index"])
+
+        processed_indices.add(body["index"])
 
         two_layer_block = {
             "type": fix_type,
@@ -323,58 +395,52 @@ def fix_two_layer_blocks(blocks, fix_type: Literal["image", "table"]):
 
         fixed_blocks.append(two_layer_block)
 
-    return fixed_blocks
-
-
-def fix_title_blocks(blocks):
+    # Add the unprocessed blocks
     for block in blocks:
-        if block["type"] == BlockType.TITLE:
-            title_content = merge_para_with_text(block)
-            title_level = count_leading_hashes(title_content)
-            block['level'] = title_level
-            for line in block['lines']:
-                for span in line['spans']:
-                    span['content'] = strip_leading_hashes(span['content'])
-                    break
+        if block["index"] not in processed_indices:
+            # Append the unprocessed block as-is
+            not_include_blocks.append(block)
+
+    return fixed_blocks, not_include_blocks
+
+
+def fix_list_blocks(list_blocks, text_blocks, ref_text_blocks):
+    for list_block in list_blocks:
+        list_block["blocks"] = []
+        if "lines" in list_block:
+            del list_block["lines"]
+
+    temp_text_blocks = text_blocks + ref_text_blocks
+    need_remove_blocks = []
+    for block in temp_text_blocks:
+        for list_block in list_blocks:
+            if calculate_overlap_area_in_bbox1_area_ratio(block["bbox"], list_block["bbox"]) >= 0.8:
+                list_block["blocks"].append(block)
+                need_remove_blocks.append(block)
                 break
-    return blocks
-
-
-def count_leading_hashes(text):
-    match = re.match(r'^(#+)', text)
-    return len(match.group(1)) if match else 0
-
-
-def strip_leading_hashes(text):
-    # Strip the leading # characters and any spaces right after them
-    return re.sub(r'^#+\s*', '', text)
-
-
-def fix_text_blocks(blocks):
-    i = 0
-    while i < len(blocks):
-        block = blocks[i]
-        last_line = block["lines"][-1] if block["lines"] else None
-        if last_line:
-            last_span = last_line["spans"][-1] if last_line["spans"] else None
-            if last_span and last_span['content'].endswith('<|txt_contd|>'):
-                last_span['content'] = last_span['content'][:-len('<|txt_contd|>')]
-
-                # Find the next block that has not been emptied
-                next_idx = i + 1
-                while next_idx < len(blocks) and blocks[next_idx].get(SplitFlag.LINES_DELETED, False):
-                    next_idx += 1
-
-                # If a next valid block is found, merge it
-                if next_idx < len(blocks):
-                    next_block = blocks[next_idx]
-                    # Extend the current block's lines with the next block's lines
-                    block["lines"].extend(next_block["lines"])
-                    # Clear the next block's lines
-                    next_block["lines"] = []
-                    # Add a flag to the next block
-                    next_block[SplitFlag.LINES_DELETED] = True
-                    # Do not increment i; keep checking the current block (it now contains the next block's content)
-                    continue
-        i += 1
-    return blocks
+
+    for block in need_remove_blocks:
+        if block in text_blocks:
+            text_blocks.remove(block)
+        elif block in ref_text_blocks:
+            ref_text_blocks.remove(block)
+
+    # Drop list_blocks whose blocks list is empty
+    list_blocks = [lb for lb in list_blocks if lb["blocks"]]
+
+    for list_block in list_blocks:
+        # Count the types of all blocks in list_block["blocks"] and use the most common one as the list_block's sub_type
+        type_count = {}
+        line_content = []
+        for sub_block in list_block["blocks"]:
+            sub_block_type = sub_block["type"]
+            if sub_block_type not in type_count:
+                type_count[sub_block_type] = 0
+            type_count[sub_block_type] += 1
+
+        if type_count:
+            list_block["sub_type"] = max(type_count, key=type_count.get)
+        else:
+            list_block["sub_type"] = "unknown"
+
+    return list_blocks, text_blocks, ref_text_blocks
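
The rewritten MagicModel now consumes structured page_blocks (with bboxes normalized to 0-1) instead of a raw token string, strips Markdown fences from code blocks, and attaches a language guess. A small sketch of the new helpers follows; the inputs are purely illustrative and not taken from the repository.

# Illustrative only: expected behaviour of the helpers added above.
raw = "```python\nprint('hello')\n```"
cleaned = code_content_clean(raw)        # -> "print('hello')"
lang = guess_language_by_text(cleaned)   # helper imported above from mineru.utils.guess_suffix_or_lang

# An illustrative page_blocks entry as consumed by MagicModel(page_blocks, width, height);
# bbox values are fractions of the page size and are scaled by width/height inside __init__.
block = {"bbox": [0.1, 0.2, 0.9, 0.3], "type": "text", "content": "Some paragraph.", "angle": 0}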

+ 48 - 5
mineru/backend/vlm/vlm_middle_json_mkcontent.py

@@ -3,7 +3,6 @@ import os
 from mineru.utils.config_reader import get_latex_delimiter_config, get_formula_enable, get_table_enable
 from mineru.utils.enum_class import MakeMode, BlockType, ContentType
 
-
 latex_delimiters_config = get_latex_delimiter_config()
 
 default_delimiters = {
@@ -50,8 +49,12 @@ def mk_blocks_to_markdown(para_blocks, make_mode, formula_enable, table_enable,
     for para_block in para_blocks:
         para_text = ''
         para_type = para_block['type']
-        if para_type in [BlockType.TEXT, BlockType.LIST, BlockType.INDEX, BlockType.INTERLINE_EQUATION]:
+        if para_type in [BlockType.TEXT, BlockType.INTERLINE_EQUATION, BlockType.PHONETIC, BlockType.REF_TEXT]:
             para_text = merge_para_with_text(para_block, formula_enable=formula_enable, img_buket_path=img_buket_path)
+        elif para_type == BlockType.LIST:
+            for block in para_block['blocks']:
+                item_text = merge_para_with_text(block, formula_enable=formula_enable, img_buket_path=img_buket_path)
+                para_text += f"{item_text}\n"
         elif para_type == BlockType.TITLE:
             title_level = get_title_level(para_block)
             para_text = f'{"#" * title_level} {merge_para_with_text(para_block)}'
@@ -112,6 +115,17 @@ def mk_blocks_to_markdown(para_blocks, make_mode, formula_enable, table_enable,
                 for block in para_block['blocks']:  # 3rd. append table_footnote
                     if block['type'] == BlockType.TABLE_FOOTNOTE:
                         para_text += '\n' + merge_para_with_text(block) + '  '
+        elif para_type == BlockType.CODE:
+            sub_type = para_block["sub_type"]
+            for block in para_block['blocks']:  # 1st. append code_caption
+                if block['type'] == BlockType.CODE_CAPTION:
+                    para_text += merge_para_with_text(block) + '  \n'
+            for block in para_block['blocks']:  # 2nd. append code_body
+                if block['type'] == BlockType.CODE_BODY:
+                    if sub_type == BlockType.CODE:
+                        para_text += f"```{para_block['guess_lang']}\n{merge_para_with_text(block)}\n```"
+                    elif sub_type == BlockType.ALGORITHM:
+                        para_text += merge_para_with_text(block)
 
         if para_text.strip() == '':
             continue
@@ -128,11 +142,30 @@ def mk_blocks_to_markdown(para_blocks, make_mode, formula_enable, table_enable,
 def make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size):
     para_type = para_block['type']
     para_content = {}
-    if para_type in [BlockType.TEXT, BlockType.LIST, BlockType.INDEX]:
+    if para_type in [
+        BlockType.TEXT,
+        BlockType.REF_TEXT,
+        BlockType.PHONETIC,
+        BlockType.HEADER,
+        BlockType.FOOTER,
+        BlockType.PAGE_NUMBER,
+        BlockType.ASIDE_TEXT,
+        BlockType.PAGE_FOOTNOTE,
+    ]:
         para_content = {
-            'type': ContentType.TEXT,
+            'type': para_type,
             'text': merge_para_with_text(para_block),
         }
+    elif para_type == BlockType.LIST:
+        para_content = {
+            'type': para_type,
+            'sub_type': para_block.get('sub_type', ''),
+            'list_items':[],
+        }
+        for block in para_block['blocks']:
+            item_text = merge_para_with_text(block)
+            if item_text.strip():
+                para_content['list_items'].append(item_text)
     elif para_type == BlockType.TITLE:
         title_level = get_title_level(para_block)
         para_content = {
@@ -178,6 +211,15 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)
                 para_content[BlockType.TABLE_CAPTION].append(merge_para_with_text(block))
             if block['type'] == BlockType.TABLE_FOOTNOTE:
                 para_content[BlockType.TABLE_FOOTNOTE].append(merge_para_with_text(block))
+    elif para_type == BlockType.CODE:
+        para_content = {'type': BlockType.CODE, 'sub_type': para_block["sub_type"], BlockType.CODE_CAPTION: []}
+        for block in para_block['blocks']:
+            if block['type'] == BlockType.CODE_BODY:
+                para_content[BlockType.CODE_BODY] = merge_para_with_text(block)
+                if para_block["sub_type"] == BlockType.CODE:
+                    para_content["guess_lang"] = para_block["guess_lang"]
+            if block['type'] == BlockType.CODE_CAPTION:
+                para_content[BlockType.CODE_CAPTION].append(merge_para_with_text(block))
 
     page_weight, page_height = page_size
     para_bbox = para_block.get('bbox')
@@ -205,6 +247,7 @@ def union_make(pdf_info_dict: list,
     output_content = []
     for page_info in pdf_info_dict:
         paras_of_layout = page_info.get('para_blocks')
+        paras_of_discarded = page_info.get('discarded_blocks')
         page_idx = page_info.get('page_idx')
         page_size = page_info.get('page_size')
         if not paras_of_layout:
@@ -213,7 +256,7 @@ def union_make(pdf_info_dict: list,
             page_markdown = mk_blocks_to_markdown(paras_of_layout, make_mode, formula_enable, table_enable, img_buket_path)
             output_content.extend(page_markdown)
         elif make_mode == MakeMode.CONTENT_LIST:
-            for para_block in paras_of_layout:
+            for para_block in paras_of_layout+paras_of_discarded:
                 para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)
                 output_content.append(para_content)
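To make the new content_list entries concrete, a rough sketch of what the two new branches would emit; field names follow the code above, the literal string values of the BlockType members are assumptions inferred from their names, and bbox/page_idx are appended later by the shared tail of make_blocks_to_content_list:

    # list block
    {"type": "list", "sub_type": "...", "list_items": ["first item", "second item"]}

    # code block with sub_type == code
    {"type": "code", "sub_type": "code", "code_caption": ["Listing 1"], "code_body": "print('hi')", "guess_lang": "python"}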
 

+ 6 - 5
mineru/cli/client.py

@@ -6,6 +6,7 @@ from loguru import logger
 
 from mineru.utils.cli_parser import arg_parse
 from mineru.utils.config_reader import get_device
+from mineru.utils.guess_suffix_or_lang import guess_suffix_by_path
 from mineru.utils.model_utils import get_vram
 from ..version import __version__
 from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
@@ -49,12 +50,12 @@ from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
     '-b',
     '--backend',
     'backend',
-    type=click.Choice(['pipeline', 'vlm-transformers', 'vlm-sglang-engine', 'vlm-sglang-client']),
+    type=click.Choice(['pipeline', 'vlm-transformers', 'vlm-vllm-engine', 'vlm-http-client']),
     help="""the backend for parsing pdf:
     pipeline: More general.
     vlm-transformers: More general.
-    vlm-sglang-engine: Faster(engine).
-    vlm-sglang-client: Faster(client).
+    vlm-vllm-engine: Faster(engine).
+    vlm-http-client: Faster(client).
     Without a method specified, pipeline will be used by default.""",
     default='pipeline',
 )
@@ -77,7 +78,7 @@ from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
     'server_url',
     type=str,
     help="""
-    When the backend is `sglang-client`, you need to specify the server_url, for example:`http://127.0.0.1:30000`
+    When the backend is `vlm-http-client`, you need to specify the server_url, for example:`http://127.0.0.1:30000`
     """,
     default=None,
 )
@@ -202,7 +203,7 @@ def main(
     if os.path.isdir(input_path):
         doc_path_list = []
         for doc_path in Path(input_path).glob('*'):
-            if doc_path.suffix in pdf_suffixes + image_suffixes:
+            if guess_suffix_by_path(doc_path) in pdf_suffixes + image_suffixes:
                 doc_path_list.append(doc_path)
         parse_doc(doc_path_list)
     else:
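A typical invocation of the renamed client backend might look like the line below; the -p/-o/-u option names are not visible in this hunk and are assumed from the rest of the CLI, so treat them as illustrative only:

    mineru -p ./demo.pdf -o ./output -b vlm-http-client -u http://127.0.0.1:30000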

+ 11 - 16
mineru/cli/common.py

@@ -11,13 +11,14 @@ from loguru import logger
 from mineru.data.data_reader_writer import FileBasedDataWriter
 from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox, draw_line_sort_bbox
 from mineru.utils.enum_class import MakeMode
+from mineru.utils.guess_suffix_or_lang import guess_suffix_by_bytes
 from mineru.utils.pdf_image_tools import images_bytes_to_pdf_bytes
 from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
 from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
 from mineru.backend.vlm.vlm_analyze import aio_doc_analyze as aio_vlm_doc_analyze
 
-pdf_suffixes = [".pdf"]
-image_suffixes = [".png", ".jpeg", ".jpg", ".webp", ".gif"]
+pdf_suffixes = ["pdf"]
+image_suffixes = ["png", "jpeg", "jp2", "webp", "gif", "bmp", "jpg"]
 
 
 def read_fn(path):
@@ -25,12 +26,13 @@ def read_fn(path):
         path = Path(path)
     with open(str(path), "rb") as input_file:
         file_bytes = input_file.read()
-        if path.suffix in image_suffixes:
+        file_suffix = guess_suffix_by_bytes(file_bytes)
+        if file_suffix in image_suffixes:
             return images_bytes_to_pdf_bytes(file_bytes)
-        elif path.suffix in pdf_suffixes:
+        elif file_suffix in pdf_suffixes:
             return file_bytes
         else:
-            raise Exception(f"Unknown file suffix: {path.suffix}")
+            raise Exception(f"Unknown file suffix: {file_suffix}")
 
 
 def prepare_env(output_dir, pdf_file_name, parse_method):
@@ -145,17 +147,10 @@ def _process_output(
         )
 
     if f_dump_model_output:
-        if is_pipeline:
-            md_writer.write_string(
-                f"{pdf_file_name}_model.json",
-                json.dumps(model_output, ensure_ascii=False, indent=4),
-            )
-        else:
-            output_text = ("\n" + "-" * 50 + "\n").join(model_output)
-            md_writer.write_string(
-                f"{pdf_file_name}_model_output.txt",
-                output_text,
-            )
+        md_writer.write_string(
+            f"{pdf_file_name}_model.json",
+            json.dumps(model_output, ensure_ascii=False, indent=4),
+        )
 
     logger.info(f"local output dir is {local_md_dir}")
 

+ 9 - 7
mineru/cli/fast_api.py

@@ -18,6 +18,7 @@ from base64 import b64encode
 
 from mineru.cli.common import aio_do_parse, read_fn, pdf_suffixes, image_suffixes
 from mineru.utils.cli_parser import arg_parse
+from mineru.utils.guess_suffix_or_lang import guess_suffix_by_path
 from mineru.version import __version__
 
 app = FastAPI()
@@ -95,13 +96,14 @@ async def parse_pdf(
             content = await file.read()
             file_path = Path(file.filename)
 
-            # If the file is an image or a PDF, process it with read_fn
-            if file_path.suffix.lower() in pdf_suffixes + image_suffixes:
-                # Create a temporary file so that read_fn can be used
-                temp_path = Path(unique_dir) / file_path.name
-                with open(temp_path, "wb") as f:
-                    f.write(content)
+            # Create a temporary file
+            temp_path = Path(unique_dir) / file_path.name
+            with open(temp_path, "wb") as f:
+                f.write(content)
 
+            # If the file is an image or a PDF, process it with read_fn
+            file_suffix = guess_suffix_by_path(temp_path)
+            if file_suffix in pdf_suffixes + image_suffixes:
                 try:
                     pdf_bytes = read_fn(temp_path)
                     pdf_bytes_list.append(pdf_bytes)
@@ -115,7 +117,7 @@ async def parse_pdf(
             else:
                 return JSONResponse(
                     status_code=400,
-                    content={"error": f"Unsupported file type: {file_path.suffix}"}
+                    content={"error": f"Unsupported file type: {file_suffix}"}
                 )
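For completeness, a hypothetical client call against this upload handler; the route and multipart field name are declared outside this hunk, so "/file_parse" and "files" below are assumptions to verify against the full fast_api.py:

    import requests

    with open("scan.png", "rb") as f:
        resp = requests.post(
            "http://127.0.0.1:8000/file_parse",
            files=[("files", ("scan.png", f, "image/png"))],
            timeout=600,
        )
    print(resp.status_code)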
 
 

+ 17 - 16
mineru/cli/gradio_app.py

@@ -182,9 +182,9 @@ def to_pdf(file_path):
 
 # Update the interface
 def update_interface(backend_choice):
-    if backend_choice in ["vlm-transformers", "vlm-sglang-engine"]:
+    if backend_choice in ["vlm-transformers", "vlm-vllm-engine"]:
         return gr.update(visible=False), gr.update(visible=False)
-    elif backend_choice in ["vlm-sglang-client"]:
+    elif backend_choice in ["vlm-http-client"]:
         return gr.update(visible=True), gr.update(visible=False)
     elif backend_choice in ["pipeline"]:
         return gr.update(visible=False), gr.update(visible=True)
@@ -203,10 +203,10 @@ def update_interface(backend_choice):
     default=True,
 )
 @click.option(
-    '--enable-sglang-engine',
-    'sglang_engine_enable',
+    '--enable-vllm-engine',
+    'vllm_engine_enable',
     type=bool,
-    help="Enable SgLang engine backend for faster processing.",
+    help="Enable vLLM engine backend for faster processing.",
     default=False,
 )
 @click.option(
@@ -246,7 +246,7 @@ def update_interface(backend_choice):
     default='all',
 )
 def main(ctx,
-        example_enable, sglang_engine_enable, api_enable, max_convert_pages,
+        example_enable, vllm_engine_enable, api_enable, max_convert_pages,
         server_name, server_port, latex_delimiters_type, **kwargs
 ):
 
@@ -261,22 +261,23 @@ def main(ctx,
     else:
         raise ValueError(f"Invalid latex delimiters type: {latex_delimiters_type}.")
 
-    if sglang_engine_enable:
+    if vllm_engine_enable:
         try:
-            print("Start init SgLang engine...")
+            print("Start init vLLM engine...")
             from mineru.backend.vlm.vlm_analyze import ModelSingleton
             model_singleton = ModelSingleton()
             predictor = model_singleton.get_model(
-                "sglang-engine",
+                "vllm-engine",
                 None,
                 None,
                 **kwargs
             )
-            print("SgLang engine init successfully.")
+            print("vLLM engine init successfully.")
         except Exception as e:
             logger.exception(e)
-
-    suffixes = pdf_suffixes + image_suffixes
+    suffixes = []
+    for suffix in pdf_suffixes + image_suffixes:
+        suffixes.append(f".{suffix}")
     with gr.Blocks() as demo:
         gr.HTML(header)
         with gr.Row():
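Since pdf_suffixes and image_suffixes are now stored without leading dots (see the common.py hunk above), the loop re-adds the dots for the Gradio file picker; with the current lists the result would be:

    # ['.pdf', '.png', '.jpeg', '.jp2', '.webp', '.gif', '.bmp', '.jpg']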
@@ -286,11 +287,11 @@ def main(ctx,
                 with gr.Row():
                     max_pages = gr.Slider(1, max_convert_pages, int(max_convert_pages/2), step=1, label='Max convert pages')
                 with gr.Row():
-                    if sglang_engine_enable:
-                        drop_list = ["pipeline", "vlm-sglang-engine"]
-                        preferred_option = "vlm-sglang-engine"
+                    if vllm_engine_enable:
+                        drop_list = ["pipeline", "vlm-vllm-engine"]
+                        preferred_option = "vlm-vllm-engine"
                     else:
-                        drop_list = ["pipeline", "vlm-transformers", "vlm-sglang-client"]
+                        drop_list = ["pipeline", "vlm-transformers", "vlm-http-client"]
                         preferred_option = "pipeline"
                     backend = gr.Dropdown(drop_list, label="Backend", value=preferred_option)
                 with gr.Row(visible=False) as client_options:

+ 0 - 4
mineru/cli/vlm_sglang_server.py

@@ -1,4 +0,0 @@
-from ..model.vlm_sglang_model.server import main
-
-if __name__ == "__main__":
-    main()

+ 4 - 0
mineru/cli/vlm_vllm_server.py

@@ -0,0 +1,4 @@
+from mineru.model.vlm_vllm_model.server import main
+
+if __name__ == "__main__":
+    main()
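The new wrapper can be started directly as a module; the argument handling lives in mineru.model.vlm_vllm_model.server and is not part of this diff, so the flags below (standard vLLM server options) are an assumption:

    python -m mineru.cli.vlm_vllm_server --host 127.0.0.1 --port 30000

The resulting endpoint matches the example server_url used by the vlm-http-client backend in cli/client.py above.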

+ 0 - 9
mineru/model/vlm_hf_model/__init__.py

@@ -1,9 +0,0 @@
-from transformers import AutoConfig, AutoImageProcessor, AutoModelForCausalLM
-
-from .configuration_mineru2 import Mineru2QwenConfig
-from .image_processing_mineru2 import Mineru2ImageProcessor
-from .modeling_mineru2 import Mineru2QwenForCausalLM
-
-AutoConfig.register(Mineru2QwenConfig.model_type, Mineru2QwenConfig)
-AutoModelForCausalLM.register(Mineru2QwenConfig, Mineru2QwenForCausalLM)
-AutoImageProcessor.register(Mineru2QwenConfig, slow_image_processor_class=Mineru2ImageProcessor)

+ 0 - 38
mineru/model/vlm_hf_model/configuration_mineru2.py

@@ -1,38 +0,0 @@
-from transformers import Qwen2Config
-
-
-class Mineru2QwenConfig(Qwen2Config):
-    model_type = "mineru2_qwen"
-
-    def __init__(
-        self,
-        ignore_index=-100,
-        image_aspect_ratio="square_anyres_max_9",
-        image_grid_pinpoints="(1x1),...,(4x4)",
-        image_token_index=151646,
-        mm_hidden_size=1152,
-        mm_patch_merge_type="spatial_unpad",
-        mm_projector_type="mlp2x_gelu",
-        mm_vision_select_feature="full",
-        mm_vision_select_layer=-2,
-        mm_vision_tower="google/siglip-so400m-patch14-384",
-        tie_word_embeddings=False,
-        tokenizer_model_max_length=16384,
-        tokenizer_padding_side="right",
-        unfreeze_mm_vision_tower=True,
-        **kwargs,
-    ):
-        self.ignore_index = ignore_index
-        self.image_aspect_ratio = image_aspect_ratio
-        self.image_grid_pinpoints = image_grid_pinpoints
-        self.image_token_index = image_token_index
-        self.mm_hidden_size = mm_hidden_size
-        self.mm_patch_merge_type = mm_patch_merge_type
-        self.mm_projector_type = mm_projector_type
-        self.mm_vision_select_feature = mm_vision_select_feature
-        self.mm_vision_select_layer = mm_vision_select_layer
-        self.mm_vision_tower = mm_vision_tower
-        self.tokenizer_model_max_length = tokenizer_model_max_length
-        self.tokenizer_padding_side = tokenizer_padding_side
-        self.unfreeze_mm_vision_tower = unfreeze_mm_vision_tower
-        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)

+ 0 - 269
mineru/model/vlm_hf_model/image_processing_mineru2.py

@@ -1,269 +0,0 @@
-import ast
-import math
-import re
-from functools import partial, reduce
-from typing import Dict, Optional, Union
-
-import numpy as np
-import torch
-from PIL import Image
-from transformers.image_processing_utils import (
-    BaseImageProcessor,
-    BatchFeature,
-    get_size_dict,
-)
-from transformers.image_transforms import (
-    convert_to_rgb,
-    normalize,
-    rescale,
-    resize,
-    to_channel_dimension_format,
-)
-from transformers.image_utils import (
-    ChannelDimension,
-    PILImageResampling,
-    to_numpy_array,
-)
-from transformers.utils import TensorType
-
-
-def select_best_resolution(original_size: tuple, possible_resolutions: list) -> tuple:
-    original_width, original_height = original_size
-    best_fit = (0, 0)
-    max_effective_resolution = 0
-    min_wasted_resolution = float("inf")
-
-    for width, height in possible_resolutions:
-        scale = min(width / original_width, height / original_height)
-        downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
-        effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
-        wasted_resolution = (width * height) - effective_resolution
-
-        if effective_resolution > max_effective_resolution or (
-            effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution
-        ):
-            max_effective_resolution = effective_resolution
-            min_wasted_resolution = wasted_resolution
-            best_fit = (width, height)
-
-    return best_fit
-
-
-def divide_to_patches(image, patch_size):
-    patches = []
-    width, height = image.size
-    for i in range(0, height, patch_size):
-        for j in range(0, width, patch_size):
-            box = (j, i, j + patch_size, i + patch_size)
-            patch = image.crop(box)
-            patches.append(patch)
-    return patches
-
-
-def expand2square(pil_img, background_color):
-    width, height = pil_img.size
-    if width == height:
-        return pil_img
-    if pil_img.mode == "L":
-        pil_img = pil_img.convert("RGB")
-    if width > height:
-        result = Image.new(pil_img.mode, (width, width), background_color)
-        result.paste(pil_img, (0, (width - height) // 2))
-        return result
-    else:
-        result = Image.new(pil_img.mode, (height, height), background_color)
-        result.paste(pil_img, ((height - width) // 2, 0))
-        return result
-
-
-def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
-    if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints:
-        assert patch_size in [224, 336, 384, 448, 512], "patch_size should be in [224, 336, 384, 448, 512]"
-        matches = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
-        range_start = tuple(map(int, matches[0]))
-        range_end = tuple(map(int, matches[-1]))
-        grid_pinpoints = [
-            (i, j) for i in range(range_start[0], range_end[0] + 1) for j in range(range_start[1], range_end[1] + 1)
-        ]
-        grid_pinpoints = [[dim * patch_size for dim in pair] for pair in grid_pinpoints]
-    if type(grid_pinpoints) is list:
-        possible_resolutions = grid_pinpoints
-    else:
-        possible_resolutions = ast.literal_eval(grid_pinpoints)  # type: ignore
-    width, height = select_best_resolution(image_size, possible_resolutions)
-    return width // patch_size, height // patch_size
-
-
-# This functions is not used.
-def resize_and_pad_image(image, target_resolution):
-    original_width, original_height = image.size
-    target_width, target_height = target_resolution
-
-    scale_w = target_width / original_width
-    scale_h = target_height / original_height
-
-    if scale_w < scale_h:
-        new_width = target_width
-        new_height = min(math.ceil(original_height * scale_w), target_height)
-    else:
-        new_height = target_height
-        new_width = min(math.ceil(original_width * scale_h), target_width)
-
-    # Resize the image
-    resized_image = image.resize((new_width, new_height))
-
-    new_image = Image.new("RGB", (target_width, target_height), (0, 0, 0))
-    paste_x = (target_width - new_width) // 2
-    paste_y = (target_height - new_height) // 2
-    new_image.paste(resized_image, (paste_x, paste_y))
-
-    return new_image
-
-
-# DIFFERENT from sglang.srt.mm_utils.process_anyres_image
-def process_anyres_image(image, processor, grid_pinpoints):
-    if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints:
-        patch_size = processor.crop_size["height"]
-        assert patch_size in [224, 336, 384, 448, 512], "patch_size should be in [224, 336, 384, 448, 512]"
-        matches = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
-        range_start = tuple(map(int, matches[0]))
-        range_end = tuple(map(int, matches[-1]))
-        grid_pinpoints = [
-            (i, j) for i in range(range_start[0], range_end[0] + 1) for j in range(range_start[1], range_end[1] + 1)
-        ]
-        grid_pinpoints = [[dim * patch_size for dim in pair] for pair in grid_pinpoints]
-
-    if type(grid_pinpoints) is list:
-        possible_resolutions = grid_pinpoints
-    else:
-        possible_resolutions = ast.literal_eval(grid_pinpoints)  # type: ignore
-    best_resolution = select_best_resolution(image.size, possible_resolutions)
-
-    # image_padded = resize_and_pad_image(image, best_resolution)
-    image_padded = image.resize(best_resolution)
-
-    patches = divide_to_patches(image_padded, processor.crop_size["height"])
-
-    image_original_resize = image.resize((processor.crop_size["height"], processor.crop_size["height"]))
-
-    image_patches = [image_original_resize] + patches
-    image_patches = [processor.preprocess(image_patch, return_tensors="pt")["pixel_values"][0] for image_patch in image_patches]
-    return torch.stack(image_patches, dim=0)
-
-
-def process_images(images, image_processor, model_cfg):
-    image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", "")
-    new_images = []
-    if image_aspect_ratio == "pad":
-        for image in images:
-            image = expand2square(image, tuple(int(x * 255) for x in image_processor.image_mean))
-            image = image_processor.preprocess(image, return_tensors="pt")["pixel_values"][0]
-            new_images.append(image)
-    elif image_aspect_ratio == "anyres" or "anyres_max" in image_aspect_ratio:
-        for image in images:
-            image = process_anyres_image(image, image_processor, model_cfg.image_grid_pinpoints)
-            new_images.append(image)
-    else:
-        return image_processor(images, return_tensors="pt")["pixel_values"]
-    if all(x.shape == new_images[0].shape for x in new_images):
-        new_images = torch.stack(new_images, dim=0)
-    return new_images
-
-
-class Mineru2ImageProcessor(BaseImageProcessor):
-    model_input_names = ["pixel_values"]
-
-    def __init__(
-        self,
-        image_mean=(0.5, 0.5, 0.5),
-        image_std=(0.5, 0.5, 0.5),
-        size=(384, 384),
-        crop_size: Optional[Dict[str, int]] = None,
-        resample=PILImageResampling.BICUBIC,
-        rescale_factor=1 / 255,
-        data_format=ChannelDimension.FIRST,
-        image_aspect_ratio: Optional[str] = None,
-        image_grid_pinpoints: Optional[list] = None,
-        **kwargs,
-    ) -> None:
-        super().__init__(**kwargs)
-
-        crop_size = crop_size if crop_size is not None else {"height": 384, "width": 384}
-        crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size")
-
-        self.image_mean = image_mean
-        self.image_std = image_std
-        self.size = size
-        self.resample = resample
-        self.rescale_factor = rescale_factor
-        self.data_format = data_format
-        self.crop_size = crop_size
-        self.image_aspect_ratio = image_aspect_ratio
-        self.image_grid_pinpoints = image_grid_pinpoints
-        self.in_e2e_processing = False
-
-    def _preprocess(self, images):
-        if isinstance(images, Image.Image):
-            images = [images]
-        else:
-            # to adapt video data
-            images = [to_numpy_array(image) for image in images]
-            assert isinstance(images, list)
-
-        transforms = [
-            convert_to_rgb,
-            to_numpy_array,
-            partial(resize, size=self.size, resample=self.resample, data_format=self.data_format),
-            partial(rescale, scale=self.rescale_factor, data_format=self.data_format),
-            partial(normalize, mean=self.image_mean, std=self.image_std, data_format=self.data_format),
-            partial(to_channel_dimension_format, channel_dim=self.data_format, input_channel_dim=self.data_format),
-        ]
-
-        images = reduce(lambda x, f: [*map(f, x)], transforms, images)
-        return {"pixel_values": images}
-
-    def _preprocess_end_to_end(self, images):
-        image_aspect_ratio = self.image_aspect_ratio
-        image_grid_pinpoints = self.image_grid_pinpoints
-        assert image_aspect_ratio is not None
-        assert image_grid_pinpoints is not None
-
-        pixel_values = []
-        if image_aspect_ratio == "pad":
-            for image in images:
-                image = expand2square(image, tuple(int(x * 255) for x in self.image_mean))
-                image = self._preprocess(image)["pixel_values"][0]
-                pixel_values.append(image)
-        elif image_aspect_ratio == "anyres" or "anyres_max" in image_aspect_ratio:
-            for image in images:
-                image = process_anyres_image(image, self, self.image_grid_pinpoints)
-                pixel_values.append(image.numpy())
-        else:
-            pixel_values = self._preprocess(images)["pixel_values"]
-
-        if isinstance(pixel_values, list) and all(x.shape == pixel_values[0].shape for x in pixel_values):
-            pixel_values = np.stack(pixel_values, axis=0)
-
-        # CAUTION: here used (height, width).
-        image_sizes = [(image.height, image.width) for image in images]
-        assert len(pixel_values) == len(image_sizes)
-
-        return {"pixel_values": pixel_values, "image_sizes": image_sizes}
-
-    def preprocess(
-        self,
-        images,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        **kwargs,
-    ):
-        if self.image_aspect_ratio is None or self.in_e2e_processing:
-            data = self._preprocess(images)
-        else:
-            assert self.image_grid_pinpoints is not None
-            self.in_e2e_processing = True
-            try:
-                data = self._preprocess_end_to_end(images)
-            finally:
-                self.in_e2e_processing = False
-
-        return BatchFeature(data=data, tensor_type=return_tensors)

+ 0 - 449
mineru/model/vlm_hf_model/modeling_mineru2.py

@@ -1,449 +0,0 @@
-import math
-import re
-from typing import List, Optional, Tuple, Union
-
-import torch
-import torch.nn as nn
-from transformers import (
-    Qwen2ForCausalLM,
-    Qwen2Model,
-    SiglipVisionConfig,
-    SiglipVisionModel,
-)
-from transformers.generation.utils import GenerateOutput
-from transformers.modeling_outputs import CausalLMOutputWithPast
-
-from .configuration_mineru2 import Mineru2QwenConfig
-from .image_processing_mineru2 import Mineru2ImageProcessor, get_anyres_image_grid_shape
-
-
-class SiglipVisionTower(nn.Module):
-    def __init__(self, vision_tower):
-        super().__init__()
-
-        self.config = SiglipVisionConfig.from_pretrained(vision_tower)
-        assert isinstance(self.config, SiglipVisionConfig)
-        self.config.num_hidden_layers -= 1  # drop the last hidden layer
-        self.config.vision_use_head = False
-
-        self.vision_tower = SiglipVisionModel(self.config)
-        self.vision_tower.requires_grad_(False)
-
-        self.image_processor = Mineru2ImageProcessor()
-
-    def forward(self, images):
-        if type(images) is list:
-            image_features = []
-            for image in images:
-                image_forward_out = self.vision_tower(
-                    image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True
-                )
-                image_feature = image_forward_out.hidden_states[-1].to(image.dtype)
-                image_features.append(image_feature)
-        else:
-            image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
-            image_features = image_forward_outs.hidden_states[-1].to(images.dtype)
-
-        return image_features
-
-    @property
-    def dummy_feature(self):
-        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
-
-    @property
-    def dtype(self):
-        for p in self.vision_tower.parameters():
-            return p.dtype
-
-    @property
-    def device(self):
-        for p in self.vision_tower.parameters():
-            return p.device
-
-    @property
-    def hidden_size(self):
-        return self.config.hidden_size
-
-    @property
-    def num_patches(self):
-        return (self.config.image_size // self.config.patch_size) ** 2
-
-    @property
-    def num_patches_per_side(self):
-        return self.config.image_size // self.config.patch_size
-
-    @property
-    def image_size(self):
-        return self.config.image_size
-
-
-def build_vision_tower(config: Mineru2QwenConfig):
-    vision_tower = getattr(config, "mm_vision_tower", getattr(config, "vision_tower", ""))
-    model_path = getattr(config, "_name_or_path", "")
-    if "siglip" in vision_tower.lower():
-        if model_path:
-            return SiglipVisionTower(f"{model_path}/{vision_tower}")
-        else:
-            return SiglipVisionTower(vision_tower)
-    raise ValueError(f"Unknown vision tower: {vision_tower}")
-
-
-def build_vision_projector(config: Mineru2QwenConfig):
-    projector_type = getattr(config, "mm_projector_type", "linear")
-
-    if projector_type == "linear":
-        return nn.Linear(config.mm_hidden_size, config.hidden_size)
-
-    mlp_gelu_match = re.match(r"^mlp(\d+)x_gelu$", projector_type)
-    if mlp_gelu_match:
-        mlp_depth = int(mlp_gelu_match.group(1))
-        modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
-        for _ in range(1, mlp_depth):
-            modules.append(nn.GELU())  # type: ignore
-            modules.append(nn.Linear(config.hidden_size, config.hidden_size))
-        return nn.Sequential(*modules)
-
-    if projector_type == "identity":
-        return nn.Identity()
-
-    raise ValueError(f"Unknown projector type: {projector_type}")
-
-
-class Mineru2QwenModel(Qwen2Model):
-    config_class = Mineru2QwenConfig
-
-    def __init__(self, config: Mineru2QwenConfig):
-        super(Mineru2QwenModel, self).__init__(config)
-
-        self.vision_tower = build_vision_tower(config)
-        self.mm_projector = build_vision_projector(config)
-
-        if "unpad" in getattr(config, "mm_patch_merge_type", ""):
-            self.image_newline = nn.Parameter(torch.empty(config.hidden_size, dtype=self.dtype))
-
-
-class Mineru2QwenForCausalLM(Qwen2ForCausalLM):
-    config_class = Mineru2QwenConfig
-
-    def __init__(self, config: Mineru2QwenConfig):
-        super(Qwen2ForCausalLM, self).__init__(config)
-        config.rope_scaling = None
-        self.model = Mineru2QwenModel(config)
-        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
-
-        self.ignore_index = config.ignore_index
-        self.image_token_index = config.image_token_index
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
-    def get_model(self):
-        return self.model
-
-    def encode_images(self, images: torch.Tensor):
-        image_features = self.get_model().vision_tower(images)
-        image_features = self.get_model().mm_projector(image_features)
-        return image_features
-
-    def prepare_inputs_labels_for_multimodal(
-        self, input_ids, position_ids, attention_mask, past_key_values, labels, images, image_sizes=None
-    ):
-        vision_tower = self.get_model().vision_tower
-        if vision_tower is None or images is None or input_ids.shape[1] == 1:
-            return input_ids, position_ids, attention_mask, past_key_values, None, labels
-
-        if type(images) is list or images.ndim == 5:
-            if type(images) is list:
-                images = [x.unsqueeze(0) if x.ndim == 3 else x for x in images]
-            concat_images = torch.cat([image for image in images], dim=0)
-            image_features = self.encode_images(concat_images)
-            split_sizes = [image.shape[0] for image in images]
-            image_features = torch.split(image_features, split_sizes, dim=0)
-            mm_patch_merge_type = getattr(self.config, "mm_patch_merge_type", "flat")
-            image_aspect_ratio = getattr(self.config, "image_aspect_ratio", "square")
-            if mm_patch_merge_type == "flat":
-                image_features = [x.flatten(0, 1) for x in image_features]
-            elif mm_patch_merge_type.startswith("spatial"):
-                new_image_features = []
-                for image_idx, image_feature in enumerate(image_features):
-                    if image_feature.shape[0] > 1:
-                        base_image_feature = image_feature[0]
-                        image_feature = image_feature[1:]
-                        height = width = self.get_model().vision_tower.num_patches_per_side
-                        assert height * width == base_image_feature.shape[0]
-
-                        if "anyres_max" in image_aspect_ratio:
-                            matched_anyres_max_num_patches = re.match(r"square_anyres_max_(\d+)", image_aspect_ratio)
-                            if matched_anyres_max_num_patches:
-                                max_num_patches = int(matched_anyres_max_num_patches.group(1))
-
-                        if image_aspect_ratio == "anyres" or "anyres_max" in image_aspect_ratio:
-                            num_patch_width, num_patch_height = get_anyres_image_grid_shape(
-                                image_sizes[image_idx],
-                                self.config.image_grid_pinpoints,
-                                self.get_model().vision_tower.config.image_size,
-                            )
-                            image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1)
-                        else:
-                            raise NotImplementedError
-                        if (
-                            "unpad" in mm_patch_merge_type
-                            and "anyres_max" in image_aspect_ratio
-                            and matched_anyres_max_num_patches
-                        ):
-                            unit = image_feature.shape[2]
-                            image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
-                            image_feature = image_feature.flatten(1, 2).flatten(2, 3)
-                            c, h, w = image_feature.shape
-                            times = math.sqrt(h * w / (max_num_patches * unit**2))
-                            if times > 1.1:
-                                image_feature = image_feature[None]
-                                image_feature = nn.functional.interpolate(
-                                    image_feature, [int(h // times), int(w // times)], mode="bilinear"
-                                )[0]
-                            image_feature = torch.cat(
-                                (
-                                    image_feature,
-                                    self.model.image_newline[:, None, None]
-                                    .expand(*image_feature.shape[:-1], 1)
-                                    .to(image_feature.device),
-                                ),
-                                dim=-1,
-                            )
-                            image_feature = image_feature.flatten(1, 2).transpose(0, 1)
-                        elif "unpad" in mm_patch_merge_type:
-                            image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
-                            image_feature = image_feature.flatten(1, 2).flatten(2, 3)
-                            image_feature = torch.cat(
-                                (
-                                    image_feature,
-                                    self.model.image_newline[:, None, None]
-                                    .expand(*image_feature.shape[:-1], 1)
-                                    .to(image_feature.device),
-                                ),
-                                dim=-1,
-                            )
-                            image_feature = image_feature.flatten(1, 2).transpose(0, 1)
-                        else:
-                            image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous()
-                            image_feature = image_feature.flatten(0, 3)
-                        image_feature = torch.cat((base_image_feature, image_feature), dim=0)
-                    else:
-                        image_feature = image_feature[0]
-                        if "unpad" in mm_patch_merge_type:
-                            image_feature = torch.cat(
-                                (image_feature, self.model.image_newline[None].to(image_feature.device)), dim=0
-                            )
-                    new_image_features.append(image_feature)
-                image_features = new_image_features
-            else:
-                raise ValueError(f"Unexpected mm_patch_merge_type: {self.config.mm_patch_merge_type}")
-        else:
-            image_features = self.encode_images(images)
-
-        _labels = labels
-        _position_ids = position_ids
-        _attention_mask = attention_mask
-        if attention_mask is None:
-            attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
-        else:
-            attention_mask = attention_mask.bool()
-        if position_ids is None:
-            position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
-        if labels is None:
-            labels = torch.full_like(input_ids, self.ignore_index)
-
-        # remove the padding using attention_mask -- FIXME
-        _input_ids = input_ids
-        input_ids = [cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)]
-        labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask)]
-
-        new_input_embeds = []
-        new_labels = []
-        cur_image_idx = 0
-        for batch_idx, cur_input_ids in enumerate(input_ids):
-            num_images = (cur_input_ids == self.image_token_index).sum()
-            if num_images == 0:
-                cur_image_features = image_features[cur_image_idx]
-                cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids)
-                cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0]], dim=0)
-                new_input_embeds.append(cur_input_embeds)
-                new_labels.append(labels[batch_idx])
-                cur_image_idx += 1
-                continue
-
-            image_token_indices = (
-                [-1] + torch.where(cur_input_ids == self.image_token_index)[0].tolist() + [cur_input_ids.shape[0]]
-            )
-            cur_input_ids_noim = []
-            cur_labels = labels[batch_idx]
-            cur_labels_noim = []
-            for i in range(len(image_token_indices) - 1):
-                cur_input_ids_noim.append(cur_input_ids[image_token_indices[i] + 1 : image_token_indices[i + 1]])
-                cur_labels_noim.append(cur_labels[image_token_indices[i] + 1 : image_token_indices[i + 1]])
-            split_sizes = [x.shape[0] for x in cur_labels_noim]
-            cur_input_embeds = self.get_model().embed_tokens(torch.cat(cur_input_ids_noim))
-            cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0)
-            cur_new_input_embeds = []
-            cur_new_labels = []
-
-            for i in range(num_images + 1):
-                cur_new_input_embeds.append(cur_input_embeds_no_im[i])
-                cur_new_labels.append(cur_labels_noim[i])
-                if i < num_images:
-                    cur_image_features = image_features[cur_image_idx]
-                    cur_image_idx += 1
-                    cur_new_input_embeds.append(cur_image_features)
-                    cur_new_labels.append(
-                        torch.full(
-                            (cur_image_features.shape[0],), self.ignore_index, device=cur_labels.device, dtype=cur_labels.dtype
-                        )
-                    )
-
-            cur_new_input_embeds = [x.to(self.device) for x in cur_new_input_embeds]
-
-            cur_new_input_embeds = torch.cat(cur_new_input_embeds)
-            cur_new_labels = torch.cat(cur_new_labels)
-
-            new_input_embeds.append(cur_new_input_embeds)
-            new_labels.append(cur_new_labels)
-
-        # Truncate sequences to max length as image embeddings can make the sequence longer
-        tokenizer_model_max_length = getattr(self.config, "tokenizer_model_max_length", None)
-        if tokenizer_model_max_length is not None:
-            new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds]
-            new_labels = [x[:tokenizer_model_max_length] for x in new_labels]
-
-        # Combine them
-        max_len = max(x.shape[0] for x in new_input_embeds)
-        batch_size = len(new_input_embeds)
-
-        new_input_embeds_padded = []
-        new_labels_padded = torch.full(
-            (batch_size, max_len), self.ignore_index, dtype=new_labels[0].dtype, device=new_labels[0].device
-        )
-        attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device)
-        position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device)
-
-        for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)):
-            cur_len = cur_new_embed.shape[0]
-            if getattr(self.config, "tokenizer_padding_side", "right") == "left":
-                new_input_embeds_padded.append(
-                    torch.cat(
-                        (
-                            torch.zeros(
-                                (max_len - cur_len, cur_new_embed.shape[1]),
-                                dtype=cur_new_embed.dtype,
-                                device=cur_new_embed.device,
-                            ),
-                            cur_new_embed,
-                        ),
-                        dim=0,
-                    )
-                )
-                if cur_len > 0:
-                    new_labels_padded[i, -cur_len:] = cur_new_labels
-                    attention_mask[i, -cur_len:] = True
-                    position_ids[i, -cur_len:] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
-            else:
-                new_input_embeds_padded.append(
-                    torch.cat(
-                        (
-                            cur_new_embed,
-                            torch.zeros(
-                                (max_len - cur_len, cur_new_embed.shape[1]),
-                                dtype=cur_new_embed.dtype,
-                                device=cur_new_embed.device,
-                            ),
-                        ),
-                        dim=0,
-                    )
-                )
-                if cur_len > 0:
-                    new_labels_padded[i, :cur_len] = cur_new_labels
-                    attention_mask[i, :cur_len] = True
-                    position_ids[i, :cur_len] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
-
-        new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)
-
-        if _labels is None:
-            new_labels = None
-        else:
-            new_labels = new_labels_padded
-
-        if _attention_mask is None:
-            attention_mask = None
-        else:
-            attention_mask = attention_mask.to(dtype=_attention_mask.dtype)
-
-        if _position_ids is None:
-            position_ids = None
-
-        return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels
-
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        labels: Optional[torch.LongTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        images: Optional[torch.FloatTensor] = None,
-        image_sizes: Optional[List[List[int]]] = None,
-        return_dict: Optional[bool] = None,
-        cache_position: Optional[torch.LongTensor] = None,
-    ) -> Union[Tuple, CausalLMOutputWithPast]:
-
-        if inputs_embeds is None:
-            (input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels) = (
-                self.prepare_inputs_labels_for_multimodal(
-                    input_ids, position_ids, attention_mask, past_key_values, labels, images, image_sizes
-                )
-            )
-        return super().forward(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            labels=labels,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-
-    @torch.no_grad()
-    def generate(
-        self,
-        inputs: Optional[torch.Tensor] = None,
-        images: Optional[torch.Tensor] = None,
-        image_sizes: Optional[List[List[int]]] = None,
-        **kwargs,
-    ) -> Union[GenerateOutput, torch.LongTensor]:
-        position_ids = kwargs.pop("position_ids", None)
-        attention_mask = kwargs.pop("attention_mask", None)
-        if "inputs_embeds" in kwargs:
-            raise NotImplementedError("`inputs_embeds` is not supported")
-
-        inputs, position_ids, attention_mask, _, inputs_embeds, _ = self.prepare_inputs_labels_for_multimodal(
-            inputs, position_ids, attention_mask, None, None, images, image_sizes=image_sizes
-        )
-
-        return super().generate(position_ids=position_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, **kwargs)
-
-    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
-        images = kwargs.pop("images", None)
-        image_sizes = kwargs.pop("image_sizes", None)
-        inputs = super().prepare_inputs_for_generation(
-            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
-        )
-        if images is not None:
-            inputs["images"] = images
-        if image_sizes is not None:
-            inputs["image_sizes"] = image_sizes
-        return inputs

+ 0 - 14
mineru/model/vlm_sglang_model/__init__.py

@@ -1,14 +0,0 @@
-from sglang.srt.configs.model_config import multimodal_model_archs
-from sglang.srt.models.registry import ModelRegistry
-
-from sglang.srt.managers.multimodal_processor import (
-    PROCESSOR_MAPPING as PROCESSOR_MAPPING,
-)
-
-from .. import vlm_hf_model as _
-from .image_processor import Mineru2ImageProcessor
-from .model import Mineru2QwenForCausalLM
-
-ModelRegistry.models[Mineru2QwenForCausalLM.__name__] = Mineru2QwenForCausalLM
-PROCESSOR_MAPPING[Mineru2QwenForCausalLM] = Mineru2ImageProcessor
-multimodal_model_archs.append(Mineru2QwenForCausalLM.__name__)

+ 0 - 264
mineru/model/vlm_sglang_model/engine.py

@@ -1,264 +0,0 @@
-import asyncio
-import time
-from types import MethodType
-from typing import AsyncIterator, Dict, Iterator, List, Optional, Union
-
-import fastapi
-from sglang.srt.entrypoints.engine import Engine as _Engine
-from sglang.srt.managers.io_struct import EmbeddingReqInput, GenerateReqInput
-from sglang.srt.managers.tokenizer_manager import (
-    TokenizerManager,
-    dataclass_to_string_truncated,
-    logger,
-)
-from sglang.srt.sampling.sampling_params import SamplingParams
-from sglang.srt.server_args import ServerArgs
-
-from ...utils.run_async import run_async
-from .logit_processor import Mineru2LogitProcessor
-
-
-class BatchEngine(_Engine):
-    """
-    The engine is patched to support batch multi-modal generate, and early image preprocessing.
-    """
-
-    def __init__(self, server_args: ServerArgs, **kwargs):
-        server_args.enable_custom_logit_processor = True
-        super().__init__(server_args=server_args, **kwargs)
-        _patch_tokenizer_manager(self.tokenizer_manager)
-
-    def generate(
-        self,
-        # The input prompt. It can be a single prompt or a batch of prompts.
-        prompt: Optional[Union[List[str], str]] = None,
-        sampling_params: Optional[Union[List[Dict], Dict]] = None,
-        # The token ids for text; one can either specify text or input_ids.
-        input_ids: Optional[Union[List[List[int]], List[int]]] = None,
-        # The image input. It can be a file name, a url, or base64 encoded string.
-        # See also python/sglang/srt/utils.py:load_image.
-        image_data: Optional[Union[List[str], str]] = None,
-        return_logprob: Optional[Union[List[bool], bool]] = False,
-        logprob_start_len: Optional[Union[List[int], int]] = None,
-        top_logprobs_num: Optional[Union[List[int], int]] = None,
-        token_ids_logprob: Optional[Union[List[List[int]], List[int]]] = None,
-        lora_path: Optional[List[Optional[str]]] = None,
-        custom_logit_processor: Optional[Union[List[Optional[str]], str]] = None,
-        return_hidden_states: bool = False,
-        stream: bool = False,
-    ) -> Union[Dict, Iterator[Dict]]:
-        """
-        The arguments of this function is the same as `sglang/srt/managers/io_struct.py::GenerateReqInput`.
-        Please refer to `GenerateReqInput` for the documentation.
-        """
-        modalities_list = []
-
-        # EDIT
-        if isinstance(image_data, list):
-            for _ in range(len(image_data)):
-                modalities_list.append(["image"])
-        elif image_data is not None:
-            modalities_list.append("image")
-
-        # ADD
-        if custom_logit_processor is None:
-            custom_logit_processor = Mineru2LogitProcessor().to_str()
-
-        obj = GenerateReqInput(
-            text=prompt,
-            input_ids=input_ids,
-            sampling_params=sampling_params,
-            image_data=image_data,
-            return_logprob=return_logprob,
-            logprob_start_len=logprob_start_len,
-            top_logprobs_num=top_logprobs_num,
-            token_ids_logprob=token_ids_logprob,
-            lora_path=lora_path,
-            modalities=modalities_list,
-            custom_logit_processor=custom_logit_processor,
-            return_hidden_states=return_hidden_states,
-            stream=stream,
-        )
-        generator = _generate_request(self.tokenizer_manager, obj, None)
-
-        if stream:
-
-            def generator_wrapper():
-                while True:
-                    try:
-                        chunk = run_async(generator.__anext__())
-                        yield chunk
-                    except StopAsyncIteration:
-                        break
-
-            return generator_wrapper()
-        else:
-            ret = run_async(generator.__anext__())
-            return ret
-
-    async def async_generate(
-        self,
-        # The input prompt. It can be a single prompt or a batch of prompts.
-        prompt: Optional[Union[List[str], str]] = None,
-        sampling_params: Optional[Union[List[Dict], Dict]] = None,
-        # The token ids for text; one can either specify text or input_ids.
-        input_ids: Optional[Union[List[List[int]], List[int]]] = None,
-        # The image input. It can be a file name, a url, or base64 encoded string.
-        # See also python/sglang/srt/utils.py:load_image.
-        image_data: Optional[Union[List[str], str]] = None,
-        return_logprob: Optional[Union[List[bool], bool]] = False,
-        logprob_start_len: Optional[Union[List[int], int]] = None,
-        top_logprobs_num: Optional[Union[List[int], int]] = None,
-        token_ids_logprob: Optional[Union[List[List[int]], List[int]]] = None,
-        lora_path: Optional[List[Optional[str]]] = None,
-        custom_logit_processor: Optional[Union[List[Optional[str]], str]] = None,
-        return_hidden_states: bool = False,
-        stream: bool = False,
-    ) -> Union[Dict, AsyncIterator[Dict], Iterator[Dict]]:
-        """
-        The arguments of this function is the same as `sglang/srt/managers/io_struct.py::GenerateReqInput`.
-        Please refer to `GenerateReqInput` for the documentation.
-        """
-        modalities_list = []
-
-        # EDIT
-        if isinstance(image_data, list):
-            for _ in range(len(image_data)):
-                modalities_list.append(["image"])
-        elif image_data is not None:
-            modalities_list.append("image")
-
-        # ADD
-        if custom_logit_processor is None:
-            custom_logit_processor = Mineru2LogitProcessor().to_str()
-
-        obj = GenerateReqInput(
-            text=prompt,
-            input_ids=input_ids,
-            sampling_params=sampling_params,
-            image_data=image_data,
-            return_logprob=return_logprob,
-            logprob_start_len=logprob_start_len,
-            top_logprobs_num=top_logprobs_num,
-            token_ids_logprob=token_ids_logprob,
-            lora_path=lora_path,
-            modalities=modalities_list,
-            custom_logit_processor=custom_logit_processor,
-            return_hidden_states=return_hidden_states,
-            stream=stream,
-        )
-        generator = _generate_request(self.tokenizer_manager, obj, None)
-
-        if stream is True:
-            return generator
-        else:
-            return await generator.__anext__()
-
-
-def _auto_create_handle_loop(self: TokenizerManager):
-    """
-    patch the original `auto_create_handle_loop()` method to reset `no_create_loop`
-    when the event loop changes.
-    """
-    try:
-        curr_handle_loop = asyncio.get_running_loop()
-    except RuntimeError:
-        curr_handle_loop = None
-
-    last_handle_loop = getattr(self, "_last_handle_loop", None)
-    if last_handle_loop != curr_handle_loop:
-        self.no_create_loop = False
-        setattr(self, "_last_handle_loop", curr_handle_loop)
-    return TokenizerManager.auto_create_handle_loop(self)
-
-
-def _patch_tokenizer_manager(self: TokenizerManager):
-    self.auto_create_handle_loop = MethodType(_auto_create_handle_loop, self)
-
-
-async def _one_request(
-    self: TokenizerManager,
-    obj: Union[GenerateReqInput, EmbeddingReqInput],
-    request: Optional[fastapi.Request],
-    created_time: Optional[float],
-):
-    tokenized_obj = await self._tokenize_one_request(obj)
-    state = self._send_one_request(obj, tokenized_obj, created_time)
-    async for out in self._wait_one_response(obj, state, request):
-        yield out
-
-
-async def _handle_batch_request(
-    self: TokenizerManager,
-    obj: Union[GenerateReqInput, EmbeddingReqInput],
-    request: Optional[fastapi.Request] = None,
-    created_time: Optional[float] = None,
-):
-    batch_size = obj.batch_size
-
-    generators = []
-    rids = []
-
-    if getattr(obj, "parallel_sample_num", 1) != 1:
-        raise Exception("parallel_sample_num != 1 is not supported in this patched code.")
-
-    # Send all requests
-    for i in range(batch_size):
-        tmp_obj = obj[i]
-        generators.append(_one_request(self, tmp_obj, request, created_time))
-        rids.append(tmp_obj.rid)
-
-    # Wait for all requests
-    is_stream = hasattr(obj, "stream") and obj.stream
-    if not is_stream:
-        outputs = await asyncio.gather(*(gen.__anext__() for gen in generators))
-        yield outputs
-    else:
-        rid_to_index = {rid: i for i, rid in enumerate(rids)}
-        task_map = {asyncio.create_task(gen.__anext__()): gen for gen in generators}
-        while task_map:
-            done, _ = await asyncio.wait(task_map.keys(), return_when=asyncio.FIRST_COMPLETED)
-
-            for task in done:
-                gen = task_map.pop(task)
-                try:
-                    result = task.result()
-                    result["index"] = rid_to_index[result["meta_info"]["id"]]
-                    yield result
-                    new_task = asyncio.create_task(gen.__anext__())
-                    task_map[new_task] = gen
-                except StopAsyncIteration:
-                    pass
-
-
-async def _generate_request(
-    self: TokenizerManager,
-    obj: Union[GenerateReqInput, EmbeddingReqInput],
-    request: Optional[fastapi.Request] = None,
-):
-    created_time = time.time()
-
-    self.auto_create_handle_loop()
-
-    if isinstance(obj, EmbeddingReqInput) and self.is_generation:
-        raise ValueError(
-            "This model does not appear to be an embedding model by default. "
-            "Please add `--is-embedding` when launching the server or try another model."
-        )
-
-    obj.normalize_batch_and_arguments()
-
-    if self.log_requests:
-        max_length, skip_names, _ = self.log_request_metadata
-        logger.info(f"Receive: obj={dataclass_to_string_truncated(obj, max_length, skip_names=skip_names)}")
-
-    async with self.model_update_lock.reader_lock:
-        is_single = obj.is_single
-        if is_single:
-            tokenized_obj = await self._tokenize_one_request(obj)
-            state = self._send_one_request(obj, tokenized_obj, created_time)
-            async for response in self._wait_one_response(obj, state, request):
-                yield response
-        else:
-            async for response in _handle_batch_request(self, obj, request, created_time):
-                yield response

+ 0 - 213
mineru/model/vlm_sglang_model/image_processor.py

@@ -1,213 +0,0 @@
-import ast
-import asyncio
-import re
-from typing import List, Optional, Union
-
-import numpy as np
-
-from sglang.version import __version__ as sglang_version
-from packaging import version
-if version.parse(sglang_version) >= version.parse("0.4.9"):
-    # sglang >= 0.4.9
-    from sglang.srt.multimodal.processors.base_processor import (
-        BaseMultimodalProcessor as BaseProcessor,
-    )
-    from sglang.srt.multimodal.mm_utils import divide_to_patches, expand2square, select_best_resolution
-else:
-    # 0.4.7 <= sglang < 0.4.9
-    from sglang.srt.managers.multimodal_processors.base_processor import (
-        BaseMultimodalProcessor as BaseProcessor,
-    )
-    from sglang.srt.mm_utils import divide_to_patches, expand2square, select_best_resolution
-
-get_global_processor = None
-from sglang.srt.utils import load_image, logger
-from sglang.utils import get_exception_traceback
-
-from .model import Mineru2QwenForCausalLM
-
-
-# image_best_res is only resized (not padded).
-def process_anyres_image(image, processor, grid_pinpoints):
-    if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints:
-        patch_size = processor.crop_size["height"]
-        assert patch_size in [224, 336, 384, 448, 512], "patch_size should be in [224, 336, 384, 448, 512]"
-        matches = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
-        range_start = tuple(map(int, matches[0]))
-        range_end = tuple(map(int, matches[-1]))
-        grid_pinpoints = [
-            (i, j) for i in range(range_start[0], range_end[0] + 1) for j in range(range_start[1], range_end[1] + 1)
-        ]
-        grid_pinpoints = [[dim * patch_size for dim in pair] for pair in grid_pinpoints]
-
-    if type(grid_pinpoints) is list:
-        possible_resolutions = grid_pinpoints
-    else:
-        possible_resolutions = ast.literal_eval(grid_pinpoints)
-    best_resolution = select_best_resolution(image.size, possible_resolutions)
-
-    image_best_res = image.resize(best_resolution)  # <<<<<<< Here changed
-    patches = divide_to_patches(image_best_res, processor.crop_size["height"])
-    image_original_resize = image.resize((processor.crop_size["height"], processor.crop_size["height"]))
-
-    image_patches = [image_original_resize] + patches
-    image_patches = [processor.preprocess(image_patch)["pixel_values"][0] for image_patch in image_patches]
-    return np.stack(image_patches, axis=0)
-
-
-class Mineru2ImageProcessor(BaseProcessor):
-    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
-        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
-
-    @staticmethod
-    def _process_single_image_task(
-        image_data: Union[str, bytes],
-        image_aspect_ratio: Optional[str] = None,
-        image_grid_pinpoints: Optional[str] = None,
-        image_processor=None,
-    ):
-        if image_processor is None:
-            assert get_global_processor is not None
-            image_processor = get_global_processor().image_processor
-
-        try:
-            image, image_size = load_image(image_data)
-            if image_size is not None:
-                # It is a video with multiple images
-                image_hash = hash(image_data)
-                pixel_values = image_processor(image)["pixel_values"]
-                pixel_values = np.stack(pixel_values, axis=0)
-                return pixel_values, image_hash, image_size
-            else:
-                # It is an image
-                image_hash = hash(image_data)
-                if image_aspect_ratio == "pad":
-                    image = expand2square(
-                        image,
-                        tuple(int(x * 255) for x in image_processor.image_mean),
-                    )
-                    pixel_values = image_processor(image.convert("RGB"))["pixel_values"][0]
-                elif image_aspect_ratio == "anyres" or (image_aspect_ratio is not None and "anyres_max" in image_aspect_ratio):
-                    pixel_values = process_anyres_image(image, image_processor, image_grid_pinpoints)
-                else:
-                    pixel_values = image_processor(image)["pixel_values"][0]
-                return pixel_values, image_hash, image.size
-        except Exception:
-            logger.error("Exception in TokenizerManager:\n" + get_exception_traceback())
-
-    async def _process_single_image(self, image_data: Union[bytes, str], aspect_ratio: str, grid_pinpoints: str):
-        if hasattr(self, "cpu_executor"):
-            executor = self.cpu_executor
-        else:
-            executor = self.executor
-
-        if get_global_processor is not None:
-            image_processor = None  # save ipc cost
-        else:
-            image_processor = self._processor.image_processor
-
-        if executor is not None:
-            loop = asyncio.get_running_loop()
-            return await loop.run_in_executor(
-                executor,
-                Mineru2ImageProcessor._process_single_image_task,
-                image_data,
-                aspect_ratio,
-                grid_pinpoints,
-                image_processor,
-            )
-        else:
-            return self._process_single_image_task(
-                image_data,
-                aspect_ratio,
-                grid_pinpoints,
-                image_processor,
-            )
-
-    async def process_mm_data_async(
-        self,
-        image_data: List[Union[str, bytes]],
-        input_text,
-        request_obj,
-        *args,
-        **kwargs,
-    ):
-        from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
-
-        if not image_data:
-            return None
-
-        modalities = request_obj.modalities or ["image"]
-        aspect_ratio = getattr(self.hf_config, "image_aspect_ratio", None)
-        grid_pinpoints = (
-            self.hf_config.image_grid_pinpoints
-            if hasattr(self.hf_config, "image_grid_pinpoints")
-               and "anyres" in aspect_ratio
-            else None
-        )
-
-        if isinstance(image_data, str):
-            image_data = [image_data]
-
-        if isinstance(image_data, list) and len(image_data) > 0:
-            if "multi-images" in modalities or "video" in modalities:
-                # Multiple images
-                aspect_ratio = "pad"  # LLaVA OneVision Handling: more than one image --> interleaved image mode or video mode. We do not use anyres
-                pixel_values, data_hashes, image_sizes = [], [], []
-                res = []
-                for img_data in image_data:
-                    res.append(
-                        self._process_single_image(
-                            img_data, aspect_ratio, grid_pinpoints
-                        )
-                    )
-
-                res = await asyncio.gather(*res)
-                for pixel_v, image_h, image_s in res:
-                    pixel_values.append(pixel_v)
-                    data_hashes.append(image_h)
-                    image_sizes.append(image_s)
-
-                if isinstance(pixel_values[0], np.ndarray):
-                    pixel_values = np.stack(pixel_values, axis=0)
-            else:
-                # A single image
-                pixel_values, image_hash, image_size = await self._process_single_image(
-                    image_data[0], aspect_ratio, grid_pinpoints
-                )
-                image_sizes = [image_size]
-        else:
-            raise ValueError(f"Invalid image data: {image_data}")
-        modality = Modality.IMAGE
-        if isinstance(request_obj.modalities, list):
-            if request_obj.modalities[0] == "multi-images":
-                modality = Modality.MULTI_IMAGES
-            elif request_obj.modalities[0] == "video":
-                modality = Modality.VIDEO
-
-        if version.parse(sglang_version) >= version.parse("0.4.9.post3"):
-            # sglang >= 0.4.9.post3
-            return {
-                "mm_items": [
-                    MultimodalDataItem(
-                        feature=pixel_values,
-                        model_specific_data={
-                            "image_sizes": image_sizes,
-                        },
-                        modality=modality,
-                    )
-                ],
-            }
-        else:
-            # 0.4.7 <= sglang <= 0.4.9.post2
-            return {
-                "mm_items": [
-                    MultimodalDataItem(
-                        pixel_values=pixel_values,
-                        image_sizes=image_sizes,
-                        modality=modality,
-                    )
-                ],
-            }
-
-ImageProcessorMapping = {Mineru2QwenForCausalLM: Mineru2ImageProcessor}

+ 0 - 90
mineru/model/vlm_sglang_model/logit_processor.py

@@ -1,90 +0,0 @@
-from typing import List
-
-from sglang.srt.sampling.custom_logit_processor import CustomLogitProcessor
-
-
-class Mineru2LogitProcessor(CustomLogitProcessor):
-    """
-    Stateless logit processor for Mineru2.
-
-    (base-class: sglang.srt.sampling.custom_logit_processor.CustomLogitProcessor)
-
-    This processor applies token-level constraints to prevent repetition during generation.
-    It supports two main constraints:
-
-    - no_repeat_ngram_size (int):
-        Prevents repeating the same n-gram of specified size in the output.
-        Inspired by Hugging Face's NoRepeatNGramLogitsProcessor.
-        This implementation is slower due to its lack of specialized optimization.
-
-    - no_repeat_token_count (int):
-        (Placeholder for future logic)
-        Intended to prevent repeating the same token multiple times.
-        Not yet implemented in this version.
-    """
-
-    def __init__(self) -> None:
-        super().__init__()
-        self._generated_ngrams = {}  # Cache of generated n-grams by request ID
-        self._time = {}  # Timestamp of the last update for each request
-        self._gen_step = 0  # Global generation step counter
-
-    def __call__(self, logits, batch_info: List[dict]):
-        """
-        Applies repetition constraints to the logits before sampling tokens.
-
-        Args:
-            logits (FloatTensor): A tensor of shape (batch_size, vocab_size) containing raw token logits.
-            batch_info (List[dict]): A list of metadata dicts for each sample in the batch. Each dict must include:
-                - "__req__": Request object containing request ID and output_ids.
-                - "no_repeat_ngram_size": Size of n-gram to avoid repeating.
-
-        Returns:
-            FloatTensor: The modified logits tensor with banned token logits set to -inf.
-        """
-        from sglang.srt.managers.schedule_batch import Req
-
-        self._gen_step += 1  # Update global generation step
-
-        for idx, info in enumerate(batch_info):
-            if not isinstance(info, dict) or "__req__" not in info:
-                continue
-
-            req: Req = info["__req__"]
-            rid = req.rid
-            output_ids = req.output_ids
-            ngram_size = info.get("no_repeat_ngram_size", 0)
-
-            # Skip if there are not enough tokens to form an n-gram
-            if ngram_size <= 0 or len(output_ids) < ngram_size:
-                continue
-
-            # Record the current step for cache cleanup tracking
-            self._time[rid] = self._gen_step
-
-            # Initialize n-gram cache for this request if it doesn't exist
-            if rid not in self._generated_ngrams:
-                self._generated_ngrams[rid] = {}
-
-            # Get the n-gram prefix (all but the last token)
-            prev_ngram = tuple(output_ids[-ngram_size:-1])
-            last_token = output_ids[-1]
-
-            # Store this n-gram occurrence
-            self._generated_ngrams[rid][prev_ngram] = self._generated_ngrams[rid].get(prev_ngram, []) + [last_token]
-
-            # Get the next-token candidates to ban based on current prefix
-            current_prefix = tuple(output_ids[-ngram_size + 1 :])
-            banned_tokens = self._generated_ngrams[rid].get(current_prefix, [])
-
-            # Set the logits of banned tokens to negative infinity
-            for token in banned_tokens:
-                logits[idx][token] = -float("inf")
-
-        # Clean up cache for expired requests
-        expired_rids = [rid for rid, last_used in self._time.items() if last_used < self._gen_step]
-        for rid in expired_rids:
-            self._generated_ngrams.pop(rid, None)
-            self._time.pop(rid, None)
-
-        return logits
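
A framework-free sketch of the no-repeat-ngram constraint described in the docstring above. This is a minimal illustration with assumed names and a full re-scan of the output; the removed class instead cached n-grams incrementally through the sglang Req objects.

def banned_next_tokens(output_ids, ngram_size):
    """Tokens that would complete an n-gram already present in output_ids."""
    if ngram_size <= 0 or len(output_ids) < ngram_size:
        return []
    seen = {}
    for i in range(len(output_ids) - ngram_size + 1):
        prefix = tuple(output_ids[i:i + ngram_size - 1])
        seen.setdefault(prefix, []).append(output_ids[i + ngram_size - 1])
    current_prefix = tuple(output_ids[-(ngram_size - 1):]) if ngram_size > 1 else ()
    return seen.get(current_prefix, [])  # these token logits would be set to -inf before sampling

print(banned_next_tokens([5, 6, 7, 5, 6], 3))  # -> [7]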

+ 0 - 453
mineru/model/vlm_sglang_model/model.py

@@ -1,453 +0,0 @@
-import math
-import re
-from typing import Iterable, List, Optional, Tuple
-
-import numpy as np
-import torch
-from sglang.srt.layers.quantization.base_config import QuantizationConfig
-
-from sglang.version import __version__ as sglang_version
-from packaging import version
-if version.parse(sglang_version) >= version.parse("0.4.9"):
-    # sglang >= 0.4.9
-    from sglang.srt.multimodal.mm_utils import (
-            get_anyres_image_grid_shape,
-        )
-else:
-    # 0.4.7 <= sglang < 0.4.9
-    from sglang.srt.mm_utils import (
-        get_anyres_image_grid_shape,
-    )
-
-from sglang.srt.model_executor.forward_batch_info import ForwardBatch
-from sglang.srt.model_loader.weight_utils import default_weight_loader
-from sglang.srt.models.qwen2 import Qwen2ForCausalLM
-from sglang.srt.utils import add_prefix
-from torch import nn
-from transformers import (
-    CLIPVisionConfig,
-    CLIPVisionModel,
-    SiglipVisionConfig,
-    SiglipVisionModel,
-)
-
-from ..vlm_hf_model.configuration_mineru2 import Mineru2QwenConfig
-from ..vlm_hf_model.modeling_mineru2 import build_vision_projector
-from ...utils.models_download_utils import auto_download_and_get_model_root_path
-
-
-def flatten_nested_list(nested_list):
-    if isinstance(nested_list, list):
-        return [item for sublist in nested_list for item in flatten_nested_list(sublist)]
-    else:
-        return [nested_list]
-
-
-def downgrade_modality(modality):
-    modality_str = str(modality)
-    if "MULTI_IMAGES" in modality_str:
-        return "multi-images"
-    if "IMAGE" in modality_str:
-        return "image"
-    if "VIDEO" in modality_str:
-        return "video"
-    if "AUDIO" in modality_str:
-        return "audio"
-    raise ValueError(f"Unexpected modality: {modality_str}")
-
-
-class Mineru2QwenForCausalLM(nn.Module):
-    def __init__(
-        self,
-        config: Mineru2QwenConfig,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = "",
-    ) -> None:
-        super().__init__()
-        self.config = config
-
-        if getattr(self.config, "projector_hidden_act", None) is None:
-            self.config.projector_hidden_act = "gelu"
-        if getattr(self.config, "image_token_index", None) is None:
-            self.config.image_token_index = 151646
-
-        # load vision tower
-        mm_vision_tower = self.config.mm_vision_tower
-        model_root_path = auto_download_and_get_model_root_path(mm_vision_tower, "vlm")
-        mm_vision_tower = f"{model_root_path}/{mm_vision_tower}"
-
-        if "clip" in mm_vision_tower:
-            vision_config = CLIPVisionConfig.from_pretrained(mm_vision_tower)
-            self.vision_tower = CLIPVisionModel(vision_config)  # type: ignore
-        elif "siglip" in mm_vision_tower:
-            vision_config = SiglipVisionConfig.from_pretrained(mm_vision_tower)
-            self.vision_tower = SiglipVisionModel(vision_config)  # type: ignore
-            # Siglip needs all feature tokens
-            self.config.mm_vision_select_feature = "full"
-        else:
-            raise ValueError(f"Unexpected mm_vision_tower: {mm_vision_tower}")
-
-        ### EDIT: change projector
-        # the name `projector` contains `proj`, a substring commonly used in attention-layer names, which can cause bugs in quantization.
-        self.multi_modal_mlp = build_vision_projector(config)
-
-        self.language_model = Qwen2ForCausalLM(
-            config,
-            quant_config=quant_config,
-            prefix=add_prefix("language_model", prefix),
-        )
-
-        if "unpad" in getattr(config, "mm_patch_merge_type", ""):
-            self.language_model.model.image_newline = nn.Parameter(torch.empty(config.hidden_size))
-
-        language_model_device = next(self.language_model.parameters()).device
-        self.vision_tower = self.vision_tower.to(language_model_device)
-        self.vision_tower.eval()
-
-        self.vision_feature_layer = self.config.mm_vision_select_layer
-        self.vision_feature_select_strategy = self.config.mm_vision_select_feature
-        self.image_size = self.vision_tower.config.image_size
-        self.patch_size = self.vision_tower.config.patch_size
-
-        self.mm_patch_merge_type = getattr(self.config, "mm_patch_merge_type", "flat")
-        self.image_aspect_ratio = getattr(self.config, "image_aspect_ratio", "square")
-        self.image_grid_pinpoints = getattr(self.config, "image_grid_pinpoints", None)
-
-        self.image_feature_len = int((self.image_size // self.patch_size) ** 2)
-        if self.vision_feature_select_strategy in ("patch", "full"):
-            pass
-        elif self.vision_feature_select_strategy == "cls_patch":
-            self.image_feature_len += 1
-        else:
-            raise ValueError(f"Unexpected select feature: {self.select_feature}")
-
-    def pad_input_ids(self, input_ids: List[int], image_inputs):
-
-        image_sizes = flatten_nested_list([item.image_sizes for item in image_inputs.mm_items])
-        pad_values = [item.pad_value for item in image_inputs.mm_items]
-
-        # hardcode for spatial_unpad + anyres
-        # if image_inputs.modalities is not None and (
-        #     "multi-images" in image_inputs.modalities or "video" in image_inputs.modalities
-        # ):
-        #     image_aspect_ratio = "pad"
-        # else:
-        #     image_aspect_ratio = "anyres"
-
-        offset_list = []
-        image_inputs.image_pad_len = []
-        for image_idx, image_s in enumerate(image_sizes):
-            if len(image_sizes) > 16:
-                # 2x2 pooling with stride 2
-                new_image_feature_len = math.ceil(self.image_size / self.patch_size / 2) ** 2
-            else:
-                new_image_feature_len = self.image_feature_len  # multiimage
-
-            height = width = self.num_patches_per_side
-            if "anyres" in self.config.image_aspect_ratio:
-                num_patch_width, num_patch_height = get_anyres_image_grid_shape(
-                    image_s,
-                    self.image_grid_pinpoints,
-                    self.vision_tower.config.image_size,
-                )
-                h = num_patch_height * height
-                w = num_patch_width * width
-
-                ### EDIT: remove `unpad_image_shape`
-                # new_h, new_w = unpad_image_shape(h, w, image_s)
-                new_h, new_w = h, w
-
-                if "anyres_max" in self.config.image_aspect_ratio:
-                    matched_anyres_max_num_patches = re.match(r".*anyres_max_(\d+)", self.config.image_aspect_ratio)
-                    if matched_anyres_max_num_patches:
-                        max_num_patches = int(matched_anyres_max_num_patches.group(1))
-                        times = math.sqrt(new_h * new_w / (max_num_patches * self.image_feature_len))
-                        if times > 1.1:
-                            new_h = int(new_h // times)
-                            new_w = int(new_w // times)
-                new_image_feature_len += new_h * (new_w + 1)
-
-            try:
-                offset = input_ids.index(self.config.image_token_index)
-            except ValueError:
-                offset = 0
-            # old_len + pad_len - 1, because we need to remove image_token_id
-            input_ids = input_ids[:offset] + [pad_values[image_idx]] * new_image_feature_len + input_ids[offset + 1 :]
-            offset_list.append(offset)
-            image_inputs.image_pad_len.append(new_image_feature_len)
-
-        image_inputs.image_offsets = offset_list
-        return input_ids
-
-    def encode_images(self, pixel_values: torch.Tensor) -> torch.Tensor:
-        pixel_values = pixel_values.to(device=self.vision_tower.device, dtype=self.vision_tower.dtype)
-        image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
-        # NOTE: This is not memory efficient. (output_hidden_states=True) will save all the hidden states.
-
-        selected_image_feature = image_outputs.hidden_states[self.vision_feature_layer]
-        if self.vision_feature_select_strategy in ["default", "patch"]:
-            selected_image_feature = selected_image_feature[:, 1:]
-        elif self.vision_feature_select_strategy == "full":
-            selected_image_feature = selected_image_feature
-        else:
-            raise ValueError(f"Unexpected select feature strategy: {self.vision_feature_select_strategy}")
-
-        image_features = self.multi_modal_mlp(selected_image_feature)
-        return image_features
-
-    @torch.no_grad()
-    def forward(
-        self,
-        input_ids: torch.LongTensor,
-        positions: torch.Tensor,
-        forward_batch: ForwardBatch,
-    ) -> torch.Tensor:
-
-        image_inputs = forward_batch.mm_inputs
-
-        if image_inputs is None:
-            image_inputs = []
-
-        if forward_batch.forward_mode.is_extend():
-            # Clamp input ids. This is because the input_ids for the image tokens are
-            # filled with the hash values of the image for the prefix matching in the radix attention.
-            # Their values are useless because their embeddings will be replaced by vision embeddings anyway.
-            input_ids.clamp_(min=0, max=self.config.vocab_size - 1)
-
-            # Embed text inputs
-            input_embeds = self.language_model.model.embed_tokens(input_ids)
-
-            # Got List[List[str]] extend it to List[str]
-            # The length of the List should be equal to batch size
-            modalities_list = []
-            max_image_offset = []
-            for im in image_inputs:
-                if im:
-                    modalities_list.extend([downgrade_modality(item.modality) for item in im.mm_items])
-                if im and im.image_offsets:
-                    max_image_offset.append(np.max(np.array(im.image_offsets) + np.array(im.image_pad_len)))
-                else:
-                    max_image_offset.append(-1)
-
-            start_positions = positions[forward_batch.extend_start_loc].cpu().numpy()
-            need_vision = start_positions <= np.array(max_image_offset)
-
-            if need_vision.any():
-                bs = forward_batch.batch_size
-
-                if version.parse(sglang_version) >= version.parse("0.4.9.post3"):
-                    # sglang >= 0.4.9.post3
-                    pixel_values = flatten_nested_list(
-                        [[item.feature for item in image_inputs[i].mm_items] for i in range(bs) if need_vision[i]]
-                    )  # image_inputs[batch_idx].mm_items[item_idx].pixel_values is Tensor
-                    image_sizes = [
-                        flatten_nested_list([item.model_specific_data["image_sizes"] for item in image_inputs[i].mm_items])
-                        for i in range(bs)
-                        if need_vision[i]
-                    ]  # image_inputs[batch_idx].mm_items[item_idx].image_sizes should be tuple, but is list of tuple for now.
-                else:
-                    # 0.4.7 <= sglang <= 0.4.9.post2
-                    pixel_values = flatten_nested_list(
-                        [[item.pixel_values for item in image_inputs[i].mm_items] for i in range(bs) if need_vision[i]]
-                    )  # image_inputs[batch_idx].mm_items[item_idx].pixel_values is Tensor
-                    image_sizes = [
-                        flatten_nested_list([item.image_sizes for item in image_inputs[i].mm_items])
-                        for i in range(bs)
-                        if need_vision[i]
-                    ]  # image_inputs[batch_idx].mm_items[item_idx].image_sizes should be tuple, but is list of tuple for now.
-
-                ########## Encode Image ########
-
-                if pixel_values[0].ndim == 4:
-                    # llava-hd: BS, num_patch, C=3, H=336, W=336, num_patch obtained from process_images
-                    np.concatenate(pixel_values, axis=0)
-                    # ndim=4
-                    concat_images = torch.tensor(
-                        np.concatenate(pixel_values, axis=0),
-                        device=self.vision_tower.device,
-                    )
-                    image_features = self.encode_images(concat_images)
-                    split_sizes = [image.shape[0] for image in pixel_values]
-                    image_features = torch.split(image_features, split_sizes, dim=0)
-                    # hd image_features: BS, num_patch, 576, 4096
-                else:
-                    # normal pixel: BS, C=3, H=336, W=336
-                    pixel_values = torch.tensor(np.array(pixel_values), device=self.vision_tower.device)
-                    image_features = self.encode_images(pixel_values)
-                    # image_features: BS, 576, 4096
-
-                if self.mm_patch_merge_type.startswith("spatial"):
-                    new_image_features = []
-                    height = width = self.num_patches_per_side
-                    for image_idx, image_feature in enumerate(image_features):
-                        if modalities_list[image_idx] == "image":
-                            image_aspect_ratio = self.config.image_aspect_ratio  # single image
-                        elif modalities_list[image_idx] == "multi-images" or modalities_list[image_idx] == "video":
-                            image_aspect_ratio = "pad"  # multi image
-                        # image_aspect_ratio = (
-                        #     "anyres" if len(image_sizes[image_idx]) == 1 else "pad"
-                        # )
-                        if (
-                            image_feature.shape[0] > 1
-                            and "anyres" in image_aspect_ratio
-                            and modalities_list[image_idx] == "image"
-                        ):
-                            base_image_feature = image_feature[0]
-                            image_feature = image_feature[1:]
-                            assert height * width == base_image_feature.shape[0]
-
-                            if "anyres_max" in image_aspect_ratio:
-                                matched_anyres_max_num_patches = re.match(r".*anyres_max_(\d+)", image_aspect_ratio)
-                                if matched_anyres_max_num_patches:
-                                    max_num_patches = int(matched_anyres_max_num_patches.group(1))
-
-                            if image_aspect_ratio == "anyres" or "anyres_max" in image_aspect_ratio:
-                                vision_tower_image_size = self.image_size
-                                try:
-                                    num_patch_width, num_patch_height = get_anyres_image_grid_shape(
-                                        image_sizes[image_idx][0],
-                                        self.config.image_grid_pinpoints,
-                                        vision_tower_image_size,
-                                    )
-                                except Exception as e:
-                                    print(f"Error: {e}")
-                                    num_patch_width, num_patch_height = 2, 2
-                                image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1)
-                            else:
-                                image_feature = image_feature.view(2, 2, height, width, -1)
-
-                            if "unpad" in self.mm_patch_merge_type:
-                                unit = image_feature.shape[2]
-                                image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
-                                image_feature = image_feature.flatten(1, 2).flatten(2, 3)
-
-                                ### EDIT: remove `unpad_image`
-                                # image_feature = unpad_image(image_feature, image_sizes[image_idx][0])
-
-                                if "anyres_max" in image_aspect_ratio and matched_anyres_max_num_patches:
-                                    c, h, w = image_feature.shape
-                                    times = math.sqrt(h * w / (max_num_patches * unit**2))
-                                    if times > 1.1:
-                                        image_feature = image_feature[None]
-                                        image_feature = nn.functional.interpolate(
-                                            image_feature,
-                                            [int(h // times), int(w // times)],
-                                            mode="bilinear",
-                                        )[0]
-                                image_feature = torch.cat(
-                                    (
-                                        image_feature,
-                                        self.language_model.model.image_newline[:, None, None].expand(
-                                            *image_feature.shape[:-1], 1
-                                        ),
-                                    ),
-                                    dim=-1,
-                                )
-                                image_feature = image_feature.flatten(1, 2).transpose(0, 1)
-                            else:
-                                image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous()
-                                image_feature = image_feature.flatten(0, 3)
-                            image_feature = torch.cat((base_image_feature, image_feature), dim=0)
-                            image_feature = image_feature.unsqueeze(0)
-                        else:
-                            if modalities_list[image_idx] == "video":  # video
-                                # 2x2 pooling
-                                num_of_frames = image_feature.shape[0]
-                                image_feature = image_feature.view(num_of_frames, height, width, -1)
-                                image_feature = image_feature.permute(0, 3, 1, 2).contiguous()  # N, C, H, W
-                                height, weight = image_feature.shape[2:]
-                                scaled_shape = [
-                                    math.ceil(height / 2),
-                                    math.ceil(weight / 2),
-                                ]
-                                image_feature = nn.functional.interpolate(image_feature, size=scaled_shape, mode="bilinear")
-                                image_feature = image_feature.flatten(2).transpose(1, 2).contiguous()  # N, C, H*W
-                            if "unpad" in self.mm_patch_merge_type:
-                                image_feature = torch.cat(
-                                    (
-                                        image_feature,
-                                        # Expand to (bs, 1, hidden_dim) and concat at the end of the image tokens
-                                        self.language_model.model.image_newline[None, None].expand(
-                                            image_feature.shape[0],
-                                            1,
-                                            image_feature.shape[-1],
-                                        ),
-                                    ),
-                                    dim=1,
-                                )
-
-                        new_image_features.append(image_feature)
-                    image_features = new_image_features
-
-                # Fill in the placeholder for the image
-                extend_start_loc_cpu = forward_batch.extend_start_loc.cpu().numpy()
-                extend_seq_lens = forward_batch.extend_seq_lens.cpu().numpy()
-                prefix_lens_cpu = forward_batch.extend_prefix_lens_cpu
-                pt = 0
-                for i in range(bs):
-                    if not need_vision[i]:
-                        continue
-
-                    start_idx = extend_start_loc_cpu[i]
-                    seq_len = extend_seq_lens[i]
-                    prefix_len = prefix_lens_cpu[i]
-
-                    # Multiple images
-                    for image_idx, image_offset in enumerate(image_inputs[i].image_offsets):
-                        if image_offset + image_inputs[i].image_pad_len[image_idx] <= prefix_len:
-                            continue
-                        if image_offset >= prefix_len + seq_len:
-                            break
-
-                        tmp_image_feature = image_features[pt][image_idx]
-                        pad_len = tmp_image_feature.shape[0]
-
-                        input_offset = image_offset - prefix_len
-                        left_idx = start_idx + input_offset
-                        right_idx = left_idx + pad_len
-                        assert right_idx > start_idx
-                        if input_offset < 0:
-                            left_idx = start_idx
-                            tmp_image_feature = tmp_image_feature[-input_offset:]
-                        if right_idx > start_idx + seq_len:
-                            tmp_image_feature = tmp_image_feature[: start_idx + seq_len - right_idx]
-                            right_idx = start_idx + seq_len
-                        try:
-                            input_embeds[left_idx:right_idx] = tmp_image_feature
-                        except RuntimeError as e:
-                            print(f"RuntimeError in image encoding: {e}")
-                            print(f"{input_embeds.shape=}, {tmp_image_feature.shape=}")
-                            print(f"{start_idx=}, {image_offset=}, {prefix_len=}, {pad_len=}")
-                    pt += 1
-
-            return self.language_model(input_ids, positions, forward_batch, input_embeds=input_embeds)
-        elif forward_batch.forward_mode.is_decode():
-            return self.language_model(input_ids, positions, forward_batch)
-        else:
-            raise ValueError(f"Unexpected forward mode: {forward_batch.forward_mode}")
-
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
-        projector_weights = {
-            "model.mm_projector": "multi_modal_mlp",
-            "model.vision_tower.vision_tower": "vision_tower",
-            # Update the vision tower weights if we find them in the checkpoint (it may be finetuned).
-            "model.image_newline": "language_model.model.image_newline",
-        }
-        params_dict = dict(self.named_parameters())
-        for name, loaded_weight in weights:
-            if "projector" in name or "vision_tower" in name or "image_newline" in name:
-                for weight_name, param_name in projector_weights.items():
-                    if weight_name in name:
-                        name = name.replace(weight_name, param_name)
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight)
-            else:
-                self.language_model.load_weights([(name, loaded_weight)])
-
-    @property
-    def num_patches_per_side(self):
-        return self.image_size // self.patch_size
-
-
-EntryClass = [Mineru2QwenForCausalLM]

+ 0 - 75
mineru/model/vlm_sglang_model/server.py

@@ -1,75 +0,0 @@
-import os
-import sys
-
-from fastapi import Request
-from sglang.srt.entrypoints.http_server import app, generate_request, launch_server
-from sglang.srt.managers.io_struct import GenerateReqInput
-from sglang.srt.server_args import prepare_server_args
-from sglang.srt.utils import kill_process_tree
-from sglang.srt.conversation import Conversation
-
-from mineru.utils.models_download_utils import auto_download_and_get_model_root_path
-from .logit_processor import Mineru2LogitProcessor
-
-# mineru2.0's chat_template differs slightly from chatml in how line breaks are handled
-def custom_get_prompt(self) -> str:
-    system_prompt = self.system_template.format(system_message=self.system_message)
-    if self.system_message == "":
-        ret = ""
-    else:
-        ret = system_prompt + self.sep
-
-    for role, message in self.messages:
-        if message:
-            ret += role + "\n" + message + self.sep
-        else:
-            ret += role + "\n"
-    return ret
-
-_custom_logit_processor_str = Mineru2LogitProcessor().to_str()
-
-# remove the existing /generate route
-for route in app.routes[:]:
-    if hasattr(route, "path") and getattr(route, "path") == "/generate":
-        app.routes.remove(route)
-
-
-# add the custom /generate route
-@app.api_route("/generate", methods=["POST", "PUT"])
-async def custom_generate_request(obj: GenerateReqInput, request: Request):
-    if obj.custom_logit_processor is None:
-        obj.custom_logit_processor = _custom_logit_processor_str
-    return await generate_request(obj, request)
-
-
-def main():
-    # Check whether the command-line arguments already include --model-path
-    args = sys.argv[1:]
-    has_model_path_arg = False
-
-    for i, arg in enumerate(args):
-        if arg == "--model-path" or arg.startswith("--model-path="):
-            has_model_path_arg = True
-            break
-
-    # If --model-path was not provided, append it to the argument list
-    if not has_model_path_arg:
-        default_path = auto_download_and_get_model_root_path("/", "vlm")
-        args.extend(["--model-path", default_path])
-
-    server_args = prepare_server_args(args)
-
-    if server_args.chat_template is None:
-        server_args.chat_template = "chatml"
-        Conversation.get_prompt = custom_get_prompt
-
-    server_args.enable_custom_logit_processor = True
-
-    try:
-        launch_server(server_args)
-    finally:
-        kill_process_tree(os.getpid(), include_parent=False)
-
-
-if __name__ == "__main__":
-    main()

+ 0 - 0
mineru/model/vlm_vllm_model/__init__.py


+ 39 - 0
mineru/model/vlm_vllm_model/server.py

@@ -0,0 +1,39 @@
+import sys
+
+from mineru.utils.models_download_utils import auto_download_and_get_model_root_path
+from vllm.entrypoints.cli.main import main as vllm_main
+
+
+def main():
+    args = sys.argv[1:]
+
+    has_port_arg = False
+    has_gpu_memory_utilization_arg = False
+    has_model_arg = False
+
+    for i, arg in enumerate(args):
+        if arg == "--port" or arg.startswith("--port="):
+            has_port_arg = True
+        if arg == "--gpu-memory-utilization" or arg.startswith("--gpu-memory-utilization="):
+            has_gpu_memory_utilization_arg = True
+        if arg == "--model" or arg.startswith("--model="):
+            has_model_arg = True
+
+    if not has_port_arg:
+        args.extend(["--port", "30000"])
+    if not has_gpu_memory_utilization_arg:
+        args.extend(["--gpu-memory-utilization", "0.5"])
+    if not has_model_arg:
+        default_path = auto_download_and_get_model_root_path("/", "vlm")
+        args.extend([default_path])
+
+    # Rebuild sys.argv so that all arguments are passed through to vllm
+    sys.argv = [sys.argv[0]] + ["serve"] + args
+
+    # Start the vllm server
+    print(f"start vllm server: {sys.argv}")
+    vllm_main()
+
+
+if __name__ == "__main__":
+    main()
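
A hedged illustration (not part of the diff) of how the default filling above behaves. The snippet below mirrors the checks in main() rather than calling it, and the model path is a placeholder for whatever auto_download_and_get_model_root_path resolves.

user_args = ["--host", "0.0.0.0", "--port", "8000"]   # e.g. `mineru-vllm-server --host 0.0.0.0 --port 8000`
final_args = list(user_args)
if not any(a == "--port" or a.startswith("--port=") for a in final_args):
    final_args += ["--port", "30000"]
if not any(a == "--gpu-memory-utilization" or a.startswith("--gpu-memory-utilization=") for a in final_args):
    final_args += ["--gpu-memory-utilization", "0.5"]
if not any(a == "--model" or a.startswith("--model=") for a in final_args):
    final_args += ["<auto-downloaded model root>"]     # placeholder for the resolved model path
print(["serve"] + final_args)
# ['serve', '--host', '0.0.0.0', '--port', '8000', '--gpu-memory-utilization', '0.5', '<auto-downloaded model root>']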

+ 30 - 9
mineru/utils/draw_bbox.py

@@ -119,22 +119,26 @@ def draw_bbox_with_number(i, bbox_list, page, c, rgb_config, fill_config, draw_b
 
 def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
     dropped_bbox_list = []
-    tables_list, tables_body_list = [], []
-    tables_caption_list, tables_footnote_list = [], []
-    imgs_list, imgs_body_list, imgs_caption_list, imgs_footnote_list = [], [], [], []
+    tables_body_list, tables_caption_list, tables_footnote_list = [], [], []
+    imgs_body_list, imgs_caption_list, imgs_footnote_list = [], [], []
+    codes_body_list, codes_caption_list = [], []
     titles_list = []
     texts_list = []
     interequations_list = []
     lists_list = []
+    list_items_list = []
     indexs_list = []
+
     for page in pdf_info:
         page_dropped_list = []
-        tables, tables_body, tables_caption, tables_footnote = [], [], [], []
-        imgs, imgs_body, imgs_caption, imgs_footnote = [], [], [], []
+        tables_body, tables_caption, tables_footnote = [], [], []
+        imgs_body, imgs_caption, imgs_footnote = [], [], []
+        codes_body, codes_caption = [], []
         titles = []
         texts = []
         interequations = []
         lists = []
+        list_items = []
         indices = []
 
         for dropped_bbox in page['discarded_blocks']:
@@ -143,7 +147,6 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
         for block in page["para_blocks"]:
             bbox = block["bbox"]
             if block["type"] == BlockType.TABLE:
-                tables.append(bbox)
                 for nested_block in block["blocks"]:
                     bbox = nested_block["bbox"]
                     if nested_block["type"] == BlockType.TABLE_BODY:
@@ -155,7 +158,6 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
                             continue
                         tables_footnote.append(bbox)
             elif block["type"] == BlockType.IMAGE:
-                imgs.append(bbox)
                 for nested_block in block["blocks"]:
                     bbox = nested_block["bbox"]
                     if nested_block["type"] == BlockType.IMAGE_BODY:
@@ -164,6 +166,14 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
                         imgs_caption.append(bbox)
                     elif nested_block["type"] == BlockType.IMAGE_FOOTNOTE:
                         imgs_footnote.append(bbox)
+            elif block["type"] == BlockType.CODE:
+                for nested_block in block["blocks"]:
+                    if nested_block["type"] == BlockType.CODE_BODY:
+                        bbox = nested_block["bbox"]
+                        codes_body.append(bbox)
+                    elif nested_block["type"] == BlockType.CODE_CAPTION:
+                        bbox = nested_block["bbox"]
+                        codes_caption.append(bbox)
             elif block["type"] == BlockType.TITLE:
                 titles.append(bbox)
             elif block["type"] == BlockType.TEXT:
@@ -172,14 +182,15 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
                 interequations.append(bbox)
             elif block["type"] == BlockType.LIST:
                 lists.append(bbox)
+                if "blocks" in block:
+                    for sub_block in block["blocks"]:
+                        list_items.append(sub_block["bbox"])
             elif block["type"] == BlockType.INDEX:
                 indices.append(bbox)
 
-        tables_list.append(tables)
         tables_body_list.append(tables_body)
         tables_caption_list.append(tables_caption)
         tables_footnote_list.append(tables_footnote)
-        imgs_list.append(imgs)
         imgs_body_list.append(imgs_body)
         imgs_caption_list.append(imgs_caption)
         imgs_footnote_list.append(imgs_footnote)
@@ -187,7 +198,10 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
         texts_list.append(texts)
         interequations_list.append(interequations)
         lists_list.append(lists)
+        list_items_list.append(list_items)
         indexs_list.append(indices)
+        codes_body_list.append(codes_body)
+        codes_caption_list.append(codes_caption)
 
     layout_bbox_list = []
 
@@ -215,6 +229,10 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
                         continue
                     bbox = sub_block["bbox"]
                     page_block_list.append(bbox)
+            elif block["type"] in [BlockType.CODE]:
+                for sub_block in block["blocks"]:
+                    bbox = sub_block["bbox"]
+                    page_block_list.append(bbox)
 
         layout_bbox_list.append(page_block_list)
 
@@ -231,6 +249,8 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
         # Create the canvas using the original PDF page dimensions
         c = canvas.Canvas(packet, pagesize=custom_page_size)
 
+        c = draw_bbox_without_number(i, codes_body_list, page, c, [102, 0, 204], True)
+        c = draw_bbox_without_number(i, codes_caption_list, page, c, [204, 153, 255], True)
         c = draw_bbox_without_number(i, dropped_bbox_list, page, c, [158, 158, 158], True)
         c = draw_bbox_without_number(i, tables_body_list, page, c, [204, 204, 0], True)
         c = draw_bbox_without_number(i, tables_caption_list, page, c, [255, 255, 102], True)
@@ -242,6 +262,7 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
         c = draw_bbox_without_number(i, texts_list, page, c, [153, 0, 76], True)
         c = draw_bbox_without_number(i, interequations_list, page, c, [0, 255, 0], True)
         c = draw_bbox_without_number(i, lists_list, page, c, [40, 169, 92], True)
+        c = draw_bbox_without_number(i, list_items_list, page, c, [40, 169, 92], False)
         c = draw_bbox_without_number(i, indexs_list, page, c, [40, 169, 92], True)
         c = draw_bbox_with_number(i, layout_bbox_list, page, c, [255, 0, 0], False, draw_bbox=False)
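
For orientation, a hedged sketch of the block shape the new CODE branches consume. The keys and coordinates below are assumptions inferred from this loop, not an actual middle-JSON dump.

code_block = {
    "type": "code",                       # BlockType.CODE
    "bbox": [90, 120, 520, 340],
    "blocks": [
        {"type": "code_caption", "bbox": [90, 120, 520, 145]},   # BlockType.CODE_CAPTION
        {"type": "code_body",    "bbox": [90, 150, 520, 340]},   # BlockType.CODE_BODY
    ],
}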
 

+ 16 - 2
mineru/utils/enum_class.py

@@ -14,6 +14,19 @@ class BlockType:
     INDEX = 'index'
     DISCARDED = 'discarded'
 
+    # New block types introduced in vlm 2.5
+    CODE = "code"
+    CODE_BODY = "code_body"
+    CODE_CAPTION = "code_caption"
+    ALGORITHM = "algorithm"
+    REF_TEXT = "ref_text"
+    PHONETIC = "phonetic"
+    HEADER = "header"
+    FOOTER = "footer"
+    PAGE_NUMBER = "page_number"
+    ASIDE_TEXT = "aside_text"
+    PAGE_FOOTNOTE = "page_footnote"
+
 
 class ContentType:
     IMAGE = 'image'
@@ -22,6 +35,7 @@ class ContentType:
     INTERLINE_EQUATION = 'interline_equation'
     INLINE_EQUATION = 'inline_equation'
     EQUATION = 'equation'
+    CODE = 'code'
 
 
 class CategoryId:
@@ -49,8 +63,8 @@ class MakeMode:
 
 
 class ModelPath:
-    vlm_root_hf = "opendatalab/MinerU2.0-2505-0.9B"
-    vlm_root_modelscope = "OpenDataLab/MinerU2.0-2505-0.9B"
+    vlm_root_hf = "opendatalab/MinerU2.5-2509-1.2B"
+    vlm_root_modelscope = "OpenDataLab/MinerU2.5-2509-1.2B"
     pipeline_root_modelscope = "OpenDataLab/PDF-Extract-Kit-1.0"
     pipeline_root_hf = "opendatalab/PDF-Extract-Kit-1.0"
     doclayout_yolo = "models/Layout/YOLO/doclayout_yolo_docstructbench_imgsz1280_2501.pt"

+ 20 - 0
mineru/utils/guess_suffix_or_lang.py

@@ -0,0 +1,20 @@
+from magika import Magika
+
+
+DEFAULT_LANG = "txt"
+magika = Magika()
+
+def guess_language_by_text(code):
+    codebytes = code.encode(encoding="utf-8")
+    lang = magika.identify_bytes(codebytes).prediction.output.label
+    return lang if lang != "unknown" else DEFAULT_LANG
+
+
+def guess_suffix_by_bytes(file_bytes) -> str:
+    suffix = magika.identify_bytes(file_bytes).prediction.output.label
+    return suffix
+
+
+def guess_suffix_by_path(file_path) -> str:
+    suffix = magika.identify_path(file_path).prediction.output.label
+    return suffix
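
A minimal usage sketch of the new helpers, assuming magika is installed as declared in pyproject.toml below; the printed labels are examples rather than guaranteed outputs.

from mineru.utils.guess_suffix_or_lang import guess_language_by_text, guess_suffix_by_bytes

print(guess_language_by_text("def add(a, b):\n    return a + b\n"))  # e.g. "python"
print(guess_suffix_by_bytes(b'{"key": 1}'))                          # e.g. "json"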

+ 1 - 1
projects/multi_gpu_v2/client.py

@@ -63,7 +63,7 @@ async def main():
             'backend': 'pipeline', 'lang': 'ch', 'method': 'auto',
             'formula_enable': True, 'table_enable': True
         }
-        # 'backend': 'sglang-engine' requires 24+ GB VRAM per worker
+        # 'backend': 'vlm-vllm-engine' requires 8+ GB VRAM per worker
 
         custom_tasks = [mineru_parse_async(session, file_path, **custom_options) for file_path in existing_files[2:]]
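
A hedged variant of the options above using the backend named in the updated comment; keeping only the 'backend' key is an assumption made for illustration.

vlm_options = {'backend': 'vlm-vllm-engine'}  # per the comment above: needs 8+ GB VRAM per worker
vlm_tasks = [mineru_parse_async(session, file_path, **vlm_options) for file_path in existing_files[2:]]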
 

+ 9 - 11
pyproject.toml

@@ -38,6 +38,8 @@ dependencies = [
     "scikit-image>=0.25.0,<1.0.0",
     "openai>=1.70.0,<2",
     "beautifulsoup4>=4.13.5,<5",
+    "magika>=0.6.2,<0.7.0",
+    "mineru_vl_utils",
 ]
 
 [project.optional-dependencies]
@@ -49,13 +51,12 @@ test = [
     "fuzzywuzzy"
 ]
 vlm = [
-    "transformers>=4.51.1",
-    "torch>=2.6.0",
+    "torch>=2.6.0,<2.8.0",
+    "transformers>=4.51.1,<5.0.0",
     "accelerate>=1.5.1",
-    "pydantic",
 ]
-sglang = [
-    "sglang[all]>=0.4.7,<0.4.11",
+vllm = [
+    "vllm==0.10.1.1",
 ]
 pipeline = [
     "matplotlib>=3.10,<4",
@@ -89,7 +90,7 @@ core = [
 ]
 all = [
     "mineru[core]",
-    "mineru[sglang]",
+    "mineru[vllm]",
 ]
 
 [project.urls]
@@ -100,7 +101,7 @@ issues = "https://github.com/opendatalab/MinerU/issues"
 
 [project.scripts]
 mineru = "mineru.cli:client.main"
-mineru-sglang-server = "mineru.cli.vlm_sglang_server:main"
+mineru-vllm-server = "mineru.cli.vlm_vllm_server:main"
 mineru-models-download = "mineru.cli.models_download:download_models"
 mineru-api = "mineru.cli.fast_api:main"
 mineru-gradio = "mineru.cli.gradio_app:main"
@@ -127,15 +128,12 @@ addopts = "-s --cov=mineru --cov-report html"
 command_line = "-m pytest tests/unittest/test_e2e.py"
 source = ["mineru/"]
 omit = [
-    "*/vlm_sglang_model/*",
     "*/gradio_app.py",
     "*/models_download.py",
     "*/fast_api.py",
     "*/cli/client.py",
-    "*/sglang_engine_predictor.py",
-    "*/vlm_sglang_server.py",
+    "*/vlm_vllm_server.py",
     "*/cli_parser.py",
-    "*/run_async.py"
 ]
 
 [tool.coverage.html]

Some files were not shown because too many files changed in this diff