Browse Source

Merge pull request #3606 from myhloli/dev

Dev
Xiaomeng Zhao cách đây 1 tháng
mục cha
commit
5656f1363b
3 tập tin đã thay đổi với 18 bổ sung và 6 xóa
  1. 1 1
      mineru/cli/common.py
  2. 8 4
      mineru/cli/gradio_app.py
  3. 9 1
      mineru/utils/guess_suffix_or_lang.py

+ 1 - 1
mineru/cli/common.py

@@ -26,7 +26,7 @@ def read_fn(path):
         path = Path(path)
     with open(str(path), "rb") as input_file:
         file_bytes = input_file.read()
-        file_suffix = guess_suffix_by_bytes(file_bytes)
+        file_suffix = guess_suffix_by_bytes(file_bytes, path)
         if file_suffix in image_suffixes:
             return images_bytes_to_pdf_bytes(file_bytes)
         elif file_suffix in pdf_suffixes:

+ 8 - 4
mineru/cli/gradio_app.py

@@ -86,10 +86,14 @@ def replace_image_with_base64(markdown_text, image_dir_path):
     # 替换图片链接
     def replace(match):
         relative_path = match.group(1)
-        full_path = os.path.join(image_dir_path, relative_path)
-        base64_image = image_to_base64(full_path)
-        return f'![{relative_path}](data:image/jpeg;base64,{base64_image})'
-
+        # 只处理以.jpg结尾的图片
+        if relative_path.endswith('.jpg'):
+            full_path = os.path.join(image_dir_path, relative_path)
+            base64_image = image_to_base64(full_path)
+            return f'![{relative_path}](data:image/jpeg;base64,{base64_image})'
+        else:
+            # 其他格式的图片保持原样
+            return match.group(0)
     # 应用替换
     return re.sub(pattern, replace, markdown_text)
 

+ 9 - 1
mineru/utils/guess_suffix_or_lang.py

@@ -1,3 +1,5 @@
+from pathlib import Path
+
 from magika import Magika
 
 
@@ -10,11 +12,17 @@ def guess_language_by_text(code):
     return lang if lang != "unknown" else DEFAULT_LANG
 
 
-def guess_suffix_by_bytes(file_bytes) -> str:
+def guess_suffix_by_bytes(file_bytes, file_path=None) -> str:
     suffix = magika.identify_bytes(file_bytes).prediction.output.label
+    if file_path and suffix in ["ai"] and Path(file_path).suffix.lower() in [".pdf"]:
+        suffix = "pdf"
     return suffix
 
 
 def guess_suffix_by_path(file_path) -> str:
+    if not isinstance(file_path, Path):
+        file_path = Path(file_path)
     suffix = magika.identify_path(file_path).prediction.output.label
+    if suffix in ["ai"] and file_path.suffix.lower() in [".pdf"]:
+        suffix = "pdf"
     return suffix