Browse Source

Normalize PDF suffix handling for AI files to be case-insensitive

myhloli 1 month ago
parent
commit
907099762f
2 changed files with 2 additions and 2 deletions
  1. mineru/cli/common.py (+1 -1)
  2. mineru/utils/guess_suffix_or_lang.py (+1 -1)

+ 1 - 1
mineru/cli/common.py

@@ -27,7 +27,7 @@ def read_fn(path):
     with open(str(path), "rb") as input_file:
         file_bytes = input_file.read()
         file_suffix = guess_suffix_by_bytes(file_bytes)
-        if file_suffix in ["ai"] and path.suffix in [".pdf"]:
+        if file_suffix in ["ai"] and path.suffix.lower() in [".pdf"]:
             file_suffix = "pdf"
         if file_suffix in image_suffixes:
             return images_bytes_to_pdf_bytes(file_bytes)

+ 1 - 1
mineru/utils/guess_suffix_or_lang.py

@@ -21,6 +21,6 @@ def guess_suffix_by_path(file_path) -> str:
     if not isinstance(file_path, Path):
         file_path = Path(file_path)
     suffix = magika.identify_path(file_path).prediction.output.label
-    if suffix in ["ai"] and file_path.suffix in [".pdf"]:
+    if suffix in ["ai"] and file_path.suffix.lower() in [".pdf"]:
         suffix = "pdf"
     return suffix