Jelajahi Sumber

Refactor suffix guessing to handle PDF extensions for AI files

myhloli 1 bulan lalu
induk
komit
c9315b8e10
2 file diubah dengan 4 tambahan dan 4 penghapusan
  1. 1 3
      mineru/cli/common.py
  2. 3 1
      mineru/utils/guess_suffix_or_lang.py

+ 1 - 3
mineru/cli/common.py

@@ -26,9 +26,7 @@ def read_fn(path):
         path = Path(path)
     with open(str(path), "rb") as input_file:
         file_bytes = input_file.read()
-        file_suffix = guess_suffix_by_bytes(file_bytes)
-        if file_suffix in ["ai"] and path.suffix.lower() in [".pdf"]:
-            file_suffix = "pdf"
+        file_suffix = guess_suffix_by_bytes(file_bytes, path)
         if file_suffix in image_suffixes:
             return images_bytes_to_pdf_bytes(file_bytes)
         elif file_suffix in pdf_suffixes:

+ 3 - 1
mineru/utils/guess_suffix_or_lang.py

@@ -12,8 +12,10 @@ def guess_language_by_text(code):
     return lang if lang != "unknown" else DEFAULT_LANG
 
 
-def guess_suffix_by_bytes(file_bytes) -> str:
+def guess_suffix_by_bytes(file_bytes, file_path=None) -> str:
     suffix = magika.identify_bytes(file_bytes).prediction.output.label
+    if file_path and suffix in ["ai"] and Path(file_path).suffix.lower() in [".pdf"]:
+        suffix = "pdf"
     return suffix