소스 검색

Fix suffix identification for AI files to correctly handle PDF extensions

myhloli 1 개월 전
부모
커밋
2c356cccee
2개의 변경된 파일, 8개의 추가 그리고 0개의 삭제
  1. 2 0
      mineru/cli/common.py
  2. 6 0
      mineru/utils/guess_suffix_or_lang.py

+ 2 - 0
mineru/cli/common.py

@@ -27,6 +27,8 @@ def read_fn(path):
     with open(str(path), "rb") as input_file:
         file_bytes = input_file.read()
         file_suffix = guess_suffix_by_bytes(file_bytes)
+        if file_suffix in ["ai"] and path.suffix in [".pdf"]:
+            file_suffix = "pdf"
         if file_suffix in image_suffixes:
             return images_bytes_to_pdf_bytes(file_bytes)
         elif file_suffix in pdf_suffixes:

+ 6 - 0
mineru/utils/guess_suffix_or_lang.py

@@ -1,3 +1,5 @@
+from pathlib import Path
+
 from magika import Magika
 
 
@@ -16,5 +18,9 @@ def guess_suffix_by_bytes(file_bytes) -> str:
 
 
 def guess_suffix_by_path(file_path) -> str:
+    if not isinstance(file_path, Path):
+        file_path = Path(file_path)
     suffix = magika.identify_path(file_path).prediction.output.label
+    if suffix in ["ai"] and file_path.suffix in [".pdf"]:
+        suffix = "pdf"
     return suffix