1 ماه پیش · 61cc6886fe
--- a/README.md
+++ b/README.md
@@ -44,6 +44,9 @@
 
				 
			
 
				 # Changelog
			
 
				 
			
 
				+- 2025/09/26 2.5.4 Released  
			
 
				+  - Fixed an issue where some `PDF` files were mistakenly identified as `AI` files, causing parsing failures.
			
 
				+
			
 
				 - 2025/09/20 2.5.3 Released
			
 
				   - Dependency version range adjustment to enable Turing and earlier architecture GPUs to use vLLM acceleration for MinerU2.5 model inference.
			
 
				   - `pipeline` backend compatibility fixes for torch 2.8.0.
			
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -43,6 +43,10 @@
 
				 </div>
			
 
				 
			
 
				 # 更新记录
			
 
				+
			
 
				+- 2025/09/26 2.5.4 发布
			
 
				+  - 修复部分`pdf`文件被识别成`ai`文件导致无法解析的问题
			
 
				+
			
 
				 - 2025/09/20 2.5.3 发布
			
 
				   - 依赖版本范围调整，使得Turing及更早架构显卡可以使用vLLM加速推理MinerU2.5模型。
			
 
				   - `pipeline`后端对torch 2.8.0的一些兼容性修复。
			
--- a/mineru/cli/common.py
+++ b/mineru/cli/common.py
@@ -26,7 +26,7 @@ def read_fn(path):
 
				         path = Path(path)
			
 
				     with open(str(path), "rb") as input_file:
			
 
				         file_bytes = input_file.read()
			
 
				-        file_suffix = guess_suffix_by_bytes(file_bytes)
			
 
				+        file_suffix = guess_suffix_by_bytes(file_bytes, path)
			
 
				         if file_suffix in image_suffixes:
			
 
				             return images_bytes_to_pdf_bytes(file_bytes)
			
 
				         elif file_suffix in pdf_suffixes:
			
--- a/mineru/cli/gradio_app.py
+++ b/mineru/cli/gradio_app.py
@@ -86,10 +86,14 @@ def replace_image_with_base64(markdown_text, image_dir_path):
 
				     # 替换图片链接
			
 
				     def replace(match):
			
 
				         relative_path = match.group(1)
			
 
				-        full_path = os.path.join(image_dir_path, relative_path)
			
 
				-        base64_image = image_to_base64(full_path)
			
 
				-        return f'![{relative_path}](data:image/jpeg;base64,{base64_image})'
			
 
				-
			
 
				+        # 只处理以.jpg结尾的图片
			
 
				+        if relative_path.endswith('.jpg'):
			
 
				+            full_path = os.path.join(image_dir_path, relative_path)
			
 
				+            base64_image = image_to_base64(full_path)
			
 
				+            return f'![{relative_path}](data:image/jpeg;base64,{base64_image})'
			
 
				+        else:
			
 
				+            # 其他格式的图片保持原样
			
 
				+            return match.group(0)
			
 
				     # 应用替换
			
 
				     return re.sub(pattern, replace, markdown_text)
			
 
				 
			
--- a/mineru/utils/guess_suffix_or_lang.py
+++ b/mineru/utils/guess_suffix_or_lang.py
@@ -1,3 +1,5 @@
 
				+from pathlib import Path
			
 
				+
			
 
				 from magika import Magika
			
 
				 
			
 
				 
			
@@ -10,11 +12,17 @@ def guess_language_by_text(code):
 
				     return lang if lang != "unknown" else DEFAULT_LANG
			
 
				 
			
 
				 
			
 
				-def guess_suffix_by_bytes(file_bytes) -> str:
			
 
				+def guess_suffix_by_bytes(file_bytes, file_path=None) -> str:
			
 
				     suffix = magika.identify_bytes(file_bytes).prediction.output.label
			
 
				+    if file_path and suffix in ["ai"] and Path(file_path).suffix.lower() in [".pdf"]:
			
 
				+        suffix = "pdf"
			
 
				     return suffix
			
 
				 
			
 
				 
			
 
				 def guess_suffix_by_path(file_path) -> str:
			
 
				+    if not isinstance(file_path, Path):
			
 
				+        file_path = Path(file_path)
			
 
				     suffix = magika.identify_path(file_path).prediction.output.label
			
 
				+    if suffix in ["ai"] and file_path.suffix.lower() in [".pdf"]:
			
 
				+        suffix = "pdf"
			
 
				     return suffix
			
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -39,7 +39,7 @@ dependencies = [
 
				     "openai>=1.70.0,<2",
			
 
				     "beautifulsoup4>=4.13.5,<5",
			
 
				     "magika>=0.6.2,<0.7.0",
			
 
				-    "mineru-vl-utils>=0.1.8,<1",
			
 
				+    "mineru-vl-utils>=0.1.11,<1",
			
 
				 ]
			
 
				 
			
 
				 [project.optional-dependencies]
			
--- a/tests/unittest/test_e2e.py
+++ b/tests/unittest/test_e2e.py
@@ -154,10 +154,9 @@ def test_vlm_transformers_with_default_config():
 
				             json.dumps(middle_json, ensure_ascii=False, indent=4),
			
 
				         )
			
 
				 
			
 
				-        model_output = ("\n" + "-" * 50 + "\n").join(infer_result)
			
 
				         md_writer.write_string(
			
 
				-            f"{pdf_file_name}_model_output.txt",
			
 
				-            model_output,
			
 
				+            f"{pdf_file_name}_model.json",
			
 
				+            json.dumps(infer_result, ensure_ascii=False, indent=4),
			
 
				         )
			
 
				 
			
 
				         logger.info(f"local output dir is {local_md_dir}")