
Merge remote-tracking branch 'origin/dev' into dev

myhloli, 2 weeks ago
parent
commit
04860456e8

+ 3 - 3
.github/ISSUE_TEMPLATE/bug_report.yml

@@ -122,9 +122,9 @@ body:
       #multiple: false
       options:
         -
-        - "<2.2.0"
-        - "2.2.x"
-        - ">=2.5"
+        - "`<2.2.0`"
+        - "`2.2.x`"
+        - "`>=2.5`"
     validations:
       required: true
 

+ 2 - 2
docs/en/quick_start/extension_modules.md

@@ -6,7 +6,7 @@ MinerU supports installing extension modules on demand based on different needs
 ### Core Functionality Installation
 The `core` module is the core dependency of MinerU, containing all functional modules except `vllm`. Installing this module ensures the basic functionality of MinerU works properly.
 ```bash
-uv pip install mineru[core]
+uv pip install "mineru[core]"
 ```
 
 ---
@@ -15,7 +15,7 @@ uv pip install mineru[core]
 The `vllm` module provides acceleration support for VLM model inference, suitable for graphics cards with Turing architecture and later (8GB+ VRAM). Installing this module can significantly improve model inference speed.
 In the configuration, `all` includes both `core` and `vllm` modules, so `mineru[all]` and `mineru[core,vllm]` are equivalent.
 ```bash
-uv pip install mineru[all]
+uv pip install "mineru[all]"
 ```
 > [!TIP]
 > If exceptions occur during installation of the complete package including vllm, please refer to the [vllm official documentation](https://docs.vllm.ai/en/latest/getting_started/installation/index.html) to try to resolve the issue, or directly use the [Docker](./docker_deployment.md) deployment method.
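The quoting added in the commands above is not cosmetic: in zsh (the default shell on macOS), square brackets are glob characters, so an unquoted `mineru[core]` is expanded as a filename pattern and typically fails with `zsh: no matches found: mineru[core]`. Quoting passes the extras specifier to uv/pip verbatim and is harmless in bash. A quick sketch:

```bash
# Unquoted, zsh tries to glob-match "mineru[core]" against files and errors
# out when nothing matches. Quoted, the string reaches uv/pip unchanged.
uv pip install "mineru[core]"   # double quotes
uv pip install 'mineru[all]'    # single quotes work equally well
```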

+ 2 - 1
docs/en/usage/quick_usage.md

@@ -83,8 +83,9 @@ Here are some available configuration options:
   
 - `llm-aided-config`:
     * Used to configure parameters for LLM-assisted title hierarchy
-    * Compatible with all LLM models supporting `openai protocol`, defaults to using Alibaba Cloud Bailian's `qwen2.5-32b-instruct` model. 
+    * Compatible with all LLM models supporting `openai protocol`, defaults to using Alibaba Cloud Bailian's `qwen3-next-80b-a3b-instruct` model. 
     * You need to configure your own API key and set `enable` to `true` to enable this feature.
+    * If your API provider does not support the enable_thinking parameter, please manually remove it.
   
 - `models-dir`: 
     * Used to specify local model storage directory
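The `llm-aided-config` notes above can be made concrete with a config fragment. This is an illustrative sketch only: the exact key names (`title_aided`, `base_url`, `enable_thinking`) are assumptions and should be checked against the `mineru.json` generated by your own installation.

```json
"llm-aided-config": {
    "title_aided": {
        "api_key": "your_api_key",
        "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
        "model": "qwen3-next-80b-a3b-instruct",
        "enable_thinking": false,
        "enable": true
    }
}
```

If your API provider rejects the `enable_thinking` parameter, delete that line rather than setting it to `false`.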

+ 2 - 2
docs/zh/quick_start/extension_modules.md

@@ -6,7 +6,7 @@ MinerU 支持根据不同需求,按需安装扩展模块,以增强功能或
 ### 核心功能安装
 `core` 模块是 MinerU 的核心依赖,包含了除`vllm`外的所有功能模块。安装此模块可以确保 MinerU 的基本功能正常运行。
 ```bash
-uv pip install mineru[core]
+uv pip install "mineru[core]"
 ```
 
 ---
@@ -15,7 +15,7 @@ uv pip install mineru[core]
 `vllm` 模块提供了对 VLM 模型推理的加速支持,适用于具有 Turing 及以后架构的显卡(8G 显存及以上)。安装此模块可以显著提升模型推理速度。
 在配置中,`all`包含了`core`和`vllm`模块,因此`mineru[all]`和`mineru[core,vllm]`是等价的。
 ```bash
-uv pip install mineru[all]
+uv pip install "mineru[all]"
 ```
 > [!TIP]
 > 如在安装包含vllm的完整包过程中发生异常,请参考 [vllm 官方文档](https://docs.vllm.ai/en/latest/getting_started/installation/index.html) 尝试解决,或直接使用 [Docker](./docker_deployment.md) 方式部署镜像。

+ 3 - 2
docs/zh/usage/quick_usage.md

@@ -82,8 +82,9 @@ MinerU 现已实现开箱即用,但也支持通过配置文件扩展功能。
   
 - `llm-aided-config`:
     * 用于配置 LLM 辅助标题分级的相关参数,兼容所有支持`openai协议`的 LLM 模型
-    * 默认使用`阿里云百炼`的`qwen2.5-32b-instruct`模型
-    * 您需要自行配置 API 密钥并将`enable`设置为`true`来启用此功能。
+    * 默认使用`阿里云百炼`的`qwen3-next-80b-a3b-instruct`模型
+    * 您需要自行配置 API 密钥并将`enable`设置为`true`来启用此功能
+    * 如果您的api供应商不支持`enable_thinking`参数,请手动将该参数删除
   
 - `models-dir`:
     * 用于指定本地模型存储目录,请为`pipeline`和`vlm`后端分别指定模型目录,

+ 20 - 43
mineru/backend/pipeline/batch_analyze.py

@@ -281,28 +281,20 @@ class BatchAnalyze:
 
                 # 按分辨率分组并同时完成padding
                 # RESOLUTION_GROUP_STRIDE = 32
-                RESOLUTION_GROUP_STRIDE = 64  # 定义分辨率分组的步进值
+                RESOLUTION_GROUP_STRIDE = 64
 
                 resolution_groups = defaultdict(list)
                 for crop_info in lang_crop_list:
                     cropped_img = crop_info[0]
                     h, w = cropped_img.shape[:2]
-                    # 使用更大的分组容差,减少分组数量
-                    # 将尺寸标准化到32的倍数
-                    normalized_h = ((h + RESOLUTION_GROUP_STRIDE) // RESOLUTION_GROUP_STRIDE) * RESOLUTION_GROUP_STRIDE  # 向上取整到32的倍数
-                    normalized_w = ((w + RESOLUTION_GROUP_STRIDE) // RESOLUTION_GROUP_STRIDE) * RESOLUTION_GROUP_STRIDE
-                    group_key = (normalized_h, normalized_w)
+                    # 直接计算目标尺寸并用作分组键
+                    target_h = ((h + RESOLUTION_GROUP_STRIDE - 1) // RESOLUTION_GROUP_STRIDE) * RESOLUTION_GROUP_STRIDE
+                    target_w = ((w + RESOLUTION_GROUP_STRIDE - 1) // RESOLUTION_GROUP_STRIDE) * RESOLUTION_GROUP_STRIDE
+                    group_key = (target_h, target_w)
                     resolution_groups[group_key].append(crop_info)
 
                 # 对每个分辨率组进行批处理
-                for group_key, group_crops in tqdm(resolution_groups.items(), desc=f"OCR-det {lang}"):
-
-                    # 计算目标尺寸(组内最大尺寸,向上取整到32的倍数)
-                    max_h = max(crop_info[0].shape[0] for crop_info in group_crops)
-                    max_w = max(crop_info[0].shape[1] for crop_info in group_crops)
-                    target_h = ((max_h + RESOLUTION_GROUP_STRIDE - 1) // RESOLUTION_GROUP_STRIDE) * RESOLUTION_GROUP_STRIDE
-                    target_w = ((max_w + RESOLUTION_GROUP_STRIDE - 1) // RESOLUTION_GROUP_STRIDE) * RESOLUTION_GROUP_STRIDE
-
+                for (target_h, target_w), group_crops in tqdm(resolution_groups.items(), desc=f"OCR-det {lang}"):
                     # 对所有图像进行padding到统一尺寸
                     batch_images = []
                     for crop_info in group_crops:
@@ -310,49 +302,34 @@ class BatchAnalyze:
                         h, w = img.shape[:2]
                         # 创建目标尺寸的白色背景
                         padded_img = np.ones((target_h, target_w, 3), dtype=np.uint8) * 255
-                        # 将原图像粘贴到左上角
                         padded_img[:h, :w] = img
                         batch_images.append(padded_img)
 
                     # 批处理检测
-                    det_batch_size = min(len(batch_images), self.batch_ratio * OCR_DET_BASE_BATCH_SIZE)  # 增加批处理大小
-                    # logger.debug(f"OCR-det batch: {det_batch_size} images, target size: {target_h}x{target_w}")
+                    det_batch_size = min(len(batch_images), self.batch_ratio * OCR_DET_BASE_BATCH_SIZE)
                     batch_results = ocr_model.text_detector.batch_predict(batch_images, det_batch_size)
 
                     # 处理批处理结果
-                    for i, (crop_info, (dt_boxes, elapse)) in enumerate(zip(group_crops, batch_results)):
+                    for crop_info, (dt_boxes, _) in zip(group_crops, batch_results):
                         bgr_image, useful_list, ocr_res_list_dict, res, adjusted_mfdetrec_res, _lang = crop_info
 
                         if dt_boxes is not None and len(dt_boxes) > 0:
-                            # 直接应用原始OCR流程中的关键处理步骤
-
-                            # 1. 排序检测框
-                            if len(dt_boxes) > 0:
-                                dt_boxes_sorted = sorted_boxes(dt_boxes)
-                            else:
-                                dt_boxes_sorted = []
-
-                            # 2. 合并相邻检测框
-                            if dt_boxes_sorted:
-                                dt_boxes_merged = merge_det_boxes(dt_boxes_sorted)
-                            else:
-                                dt_boxes_merged = []
-
-                            # 3. 根据公式位置更新检测框(关键步骤!)
-                            if dt_boxes_merged and adjusted_mfdetrec_res:
-                                dt_boxes_final = update_det_boxes(dt_boxes_merged, adjusted_mfdetrec_res)
-                            else:
-                                dt_boxes_final = dt_boxes_merged
-
-                            # 构造OCR结果格式
-                            ocr_res = [box.tolist() if hasattr(box, 'tolist') else box for box in dt_boxes_final]
-
-                            if ocr_res:
+                            # 处理检测框
+                            dt_boxes_sorted = sorted_boxes(dt_boxes)
+                            dt_boxes_merged = merge_det_boxes(dt_boxes_sorted) if dt_boxes_sorted else []
+
+                            # 根据公式位置更新检测框
+                            dt_boxes_final = (update_det_boxes(dt_boxes_merged, adjusted_mfdetrec_res)
+                                              if dt_boxes_merged and adjusted_mfdetrec_res
+                                              else dt_boxes_merged)
+
+                            if dt_boxes_final:
+                                ocr_res = [box.tolist() if hasattr(box, 'tolist') else box for box in dt_boxes_final]
                                 ocr_result_list = get_ocr_result_list(
                                     ocr_res, useful_list, ocr_res_list_dict['ocr_enable'], bgr_image, _lang
                                 )
-
                                 ocr_res_list_dict['layout_res'].extend(ocr_result_list)
+
         else:
             # 原始单张处理模式
             for ocr_res_list_dict in tqdm(ocr_res_list_all_page, desc="OCR-det Predict"):
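The refactor above folds the old two-pass computation (group by a normalized size, then find each group's max and round it up) into a single ceil-rounding whose result serves as both the bucket key and the padding target. A minimal standalone sketch of that grouping, with names chosen to mirror the diff:

```python
from collections import defaultdict

RESOLUTION_GROUP_STRIDE = 64

def bucket_key(h, w, stride=RESOLUTION_GROUP_STRIDE):
    # Ceil-round each dimension to the next multiple of `stride`;
    # the rounded size doubles as the padding target for the whole group.
    target_h = ((h + stride - 1) // stride) * stride
    target_w = ((w + stride - 1) // stride) * stride
    return target_h, target_w

def group_by_resolution(shapes, stride=RESOLUTION_GROUP_STRIDE):
    # Map each (h, w) crop shape to its resolution bucket.
    groups = defaultdict(list)
    for h, w in shapes:
        groups[bucket_key(h, w, stride)].append((h, w))
    return groups
```

Note the off-by-one the diff also fixes: the old formula `((h + stride) // stride) * stride` bumped exact multiples up a whole step (64 becomes 128), while the ceil form keeps them in place (64 stays 64), giving tighter buckets and less padding.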

+ 1 - 1
mineru/model/ocr/pytorch_paddle.py

@@ -134,7 +134,7 @@ def get_model_params(lang, config):
         raise Exception (f'Language {lang} not supported')
 
 
-root_dir = os.path.join(Path(__file__).resolve().parent.parent.parent, 'utils')
+root_dir = os.path.join(Path(__file__).resolve().parent.parent, 'utils')
 
 
 class PytorchPaddleOCR(TextSystem):
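The one-line fix above changes which directory `root_dir` resolves to: each `.parent` strips one path component. Assuming the file lives at `mineru/model/ocr/pytorch_paddle.py` inside the repo (as the diff path suggests), the change moves `root_dir` from `<repo>/mineru/utils` to `<repo>/mineru/model/utils`. A sketch of the parent chain, using a hypothetical absolute path:

```python
from pathlib import Path

# Hypothetical location mirroring the diff's file path.
p = Path("/repo/mineru/model/ocr/pytorch_paddle.py")

assert p.parent == Path("/repo/mineru/model/ocr")
assert p.parent.parent == Path("/repo/mineru/model")      # new code joins "utils" here
assert p.parent.parent.parent == Path("/repo/mineru")     # old code joined "utils" here
```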

+ 1 - 1
pyproject.toml

@@ -39,7 +39,7 @@ dependencies = [
     "openai>=1.70.0,<3",
     "beautifulsoup4>=4.13.5,<5",
     "magika>=0.6.2,<0.7.0",
-    "mineru-vl-utils>=0.1.14,<1",
+    "mineru-vl-utils>=0.1.15,<1",
 ]
 
 [project.optional-dependencies]