فهرست منبع

Merge pull request #3608 from opendatalab/release-2.5.4

Release 2.5.4
Xiaomeng Zhao 1 ماه پیش
والد
کامیت
61cc6886fe
7 فایلهای تغییر یافته به همراه 28 افزوده شده و 10 حذف شده
  1. 3 0
      README.md
  2. 4 0
      README_zh-CN.md
  3. 1 1
      mineru/cli/common.py
  4. 8 4
      mineru/cli/gradio_app.py
  5. 9 1
      mineru/utils/guess_suffix_or_lang.py
  6. 1 1
      pyproject.toml
  7. 2 3
      tests/unittest/test_e2e.py

+ 3 - 0
README.md

@@ -44,6 +44,9 @@
 
 # Changelog
 
+- 2025/09/26 2.5.4 released  
+  - Fixed an issue where some `PDF` files were mistakenly identified as `AI` files, causing parsing failures
+
 - 2025/09/20 2.5.3 Released
   - Dependency version range adjustment to enable Turing and earlier architecture GPUs to use vLLM acceleration for MinerU2.5 model inference.
   - `pipeline` backend compatibility fixes for torch 2.8.0.

+ 4 - 0
README_zh-CN.md

@@ -43,6 +43,10 @@
 </div>
 
 # 更新记录
+
+- 2025/09/26 2.5.4 发布
+  - 修复部分`pdf`文件被识别成`ai`文件导致无法解析的问题
+
 - 2025/09/20 2.5.3 发布
   - 依赖版本范围调整,使得Turing及更早架构显卡可以使用vLLM加速推理MinerU2.5模型。
   - `pipeline`后端对torch 2.8.0的一些兼容性修复。

+ 1 - 1
mineru/cli/common.py

@@ -26,7 +26,7 @@ def read_fn(path):
         path = Path(path)
     with open(str(path), "rb") as input_file:
         file_bytes = input_file.read()
-        file_suffix = guess_suffix_by_bytes(file_bytes)
+        file_suffix = guess_suffix_by_bytes(file_bytes, path)
         if file_suffix in image_suffixes:
             return images_bytes_to_pdf_bytes(file_bytes)
         elif file_suffix in pdf_suffixes:

+ 8 - 4
mineru/cli/gradio_app.py

@@ -86,10 +86,14 @@ def replace_image_with_base64(markdown_text, image_dir_path):
     # 替换图片链接
     def replace(match):
         relative_path = match.group(1)
-        full_path = os.path.join(image_dir_path, relative_path)
-        base64_image = image_to_base64(full_path)
-        return f'![{relative_path}](data:image/jpeg;base64,{base64_image})'
-
+        # 只处理以.jpg结尾的图片
+        if relative_path.endswith('.jpg'):
+            full_path = os.path.join(image_dir_path, relative_path)
+            base64_image = image_to_base64(full_path)
+            return f'![{relative_path}](data:image/jpeg;base64,{base64_image})'
+        else:
+            # 其他格式的图片保持原样
+            return match.group(0)
     # 应用替换
     return re.sub(pattern, replace, markdown_text)
 

+ 9 - 1
mineru/utils/guess_suffix_or_lang.py

@@ -1,3 +1,5 @@
+from pathlib import Path
+
 from magika import Magika
 
 
@@ -10,11 +12,17 @@ def guess_language_by_text(code):
     return lang if lang != "unknown" else DEFAULT_LANG
 
 
-def guess_suffix_by_bytes(file_bytes) -> str:
+def guess_suffix_by_bytes(file_bytes, file_path=None) -> str:
     suffix = magika.identify_bytes(file_bytes).prediction.output.label
+    if file_path and suffix in ["ai"] and Path(file_path).suffix.lower() in [".pdf"]:
+        suffix = "pdf"
     return suffix
 
 
 def guess_suffix_by_path(file_path) -> str:
+    if not isinstance(file_path, Path):
+        file_path = Path(file_path)
     suffix = magika.identify_path(file_path).prediction.output.label
+    if suffix in ["ai"] and file_path.suffix.lower() in [".pdf"]:
+        suffix = "pdf"
     return suffix

+ 1 - 1
pyproject.toml

@@ -39,7 +39,7 @@ dependencies = [
     "openai>=1.70.0,<2",
     "beautifulsoup4>=4.13.5,<5",
     "magika>=0.6.2,<0.7.0",
-    "mineru-vl-utils>=0.1.8,<1",
+    "mineru-vl-utils>=0.1.11,<1",
 ]
 
 [project.optional-dependencies]

+ 2 - 3
tests/unittest/test_e2e.py

@@ -154,10 +154,9 @@ def test_vlm_transformers_with_default_config():
             json.dumps(middle_json, ensure_ascii=False, indent=4),
         )
 
-        model_output = ("\n" + "-" * 50 + "\n").join(infer_result)
         md_writer.write_string(
-            f"{pdf_file_name}_model_output.txt",
-            model_output,
+            f"{pdf_file_name}_model.json",
+            json.dumps(infer_result, ensure_ascii=False, indent=4),
         )
 
         logger.info(f"local output dir is {local_md_dir}")