浏览代码

feat: enhance file type detection using Magika for improved suffix guessing

myhloli 2 月之前
父节点
当前提交
e60da65cca

+ 2 - 2
mineru/backend/vlm/vlm_magic_model.py

@@ -6,7 +6,7 @@ from loguru import logger
 from mineru.backend.vlm.vlm_middle_json_mkcontent import merge_para_with_text
 from mineru.utils.boxbase import calculate_overlap_area_in_bbox1_area_ratio
 from mineru.utils.enum_class import ContentType, BlockType
-from mineru.utils.guess_code_lang import guess_language
+from mineru.utils.guess_suffix_or_lang import guess_language_by_text
 from mineru.utils.magic_model_utils import reduct_overlap, tie_up_category_by_distance_v3
 
 
@@ -233,7 +233,7 @@ class MagicModel:
                             code_block["sub_type"] = "code"
                 if code_block["sub_type"] in ["code"]:
                     content_text = merge_para_with_text(block)
-                    code_block["guess_lang"] = guess_language(content_text)
+                    code_block["guess_lang"] = guess_language_by_text(content_text)
         for block in not_include_image_blocks + not_include_table_blocks + not_include_code_blocks:
             block["type"] = BlockType.TEXT
             self.text_blocks.append(block)

+ 2 - 1
mineru/cli/client.py

@@ -6,6 +6,7 @@ from loguru import logger
 
 from mineru.utils.cli_parser import arg_parse
 from mineru.utils.config_reader import get_device
+from mineru.utils.guess_suffix_or_lang import guess_suffix_by_path
 from mineru.utils.model_utils import get_vram
 from ..version import __version__
 from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
@@ -202,7 +203,7 @@ def main(
     if os.path.isdir(input_path):
         doc_path_list = []
         for doc_path in Path(input_path).glob('*'):
-            if doc_path.suffix in pdf_suffixes + image_suffixes:
+            if guess_suffix_by_path(doc_path) in pdf_suffixes + image_suffixes:
                 doc_path_list.append(doc_path)
         parse_doc(doc_path_list)
     else:

+ 7 - 5
mineru/cli/common.py

@@ -11,13 +11,14 @@ from loguru import logger
 from mineru.data.data_reader_writer import FileBasedDataWriter
 from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox, draw_line_sort_bbox
 from mineru.utils.enum_class import MakeMode
+from mineru.utils.guess_suffix_or_lang import guess_suffix_by_bytes
 from mineru.utils.pdf_image_tools import images_bytes_to_pdf_bytes
 from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
 from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
 from mineru.backend.vlm.vlm_analyze import aio_doc_analyze as aio_vlm_doc_analyze
 
-pdf_suffixes = [".pdf"]
-image_suffixes = [".png", ".jpeg", ".jpg", ".webp", ".gif"]
+pdf_suffixes = ["pdf"]
+image_suffixes = ["png", "jpeg", "jp2", "webp", "gif", "bmp", "jpg"]
 
 
 def read_fn(path):
@@ -25,12 +26,13 @@ def read_fn(path):
         path = Path(path)
     with open(str(path), "rb") as input_file:
         file_bytes = input_file.read()
-        if path.suffix in image_suffixes:
+        file_suffix = guess_suffix_by_bytes(file_bytes)
+        if file_suffix in image_suffixes:
             return images_bytes_to_pdf_bytes(file_bytes)
-        elif path.suffix in pdf_suffixes:
+        elif file_suffix in pdf_suffixes:
             return file_bytes
         else:
-            raise Exception(f"Unknown file suffix: {path.suffix}")
+            raise Exception(f"Unknown file suffix: {file_suffix}")
 
 
 def prepare_env(output_dir, pdf_file_name, parse_method):

+ 4 - 2
mineru/cli/fast_api.py

@@ -18,6 +18,7 @@ from base64 import b64encode
 
 from mineru.cli.common import aio_do_parse, read_fn, pdf_suffixes, image_suffixes
 from mineru.utils.cli_parser import arg_parse
+from mineru.utils.guess_suffix_or_lang import guess_suffix_by_path
 from mineru.version import __version__
 
 app = FastAPI()
@@ -96,7 +97,8 @@ async def parse_pdf(
             file_path = Path(file.filename)
 
             # 如果是图像文件或PDF,使用read_fn处理
-            if file_path.suffix.lower() in pdf_suffixes + image_suffixes:
+            file_suffix = guess_suffix_by_path(file_path)
+            if file_suffix in pdf_suffixes + image_suffixes:
                 # 创建临时文件以便使用read_fn
                 temp_path = Path(unique_dir) / file_path.name
                 with open(temp_path, "wb") as f:
@@ -115,7 +117,7 @@ async def parse_pdf(
             else:
                 return JSONResponse(
                     status_code=400,
-                    content={"error": f"Unsupported file type: {file_path.suffix}"}
+                    content={"error": f"Unsupported file type: {file_suffix}"}
                 )
 
 

+ 3 - 2
mineru/cli/gradio_app.py

@@ -275,8 +275,9 @@ def main(ctx,
             print("vLLM engine init successfully.")
         except Exception as e:
             logger.exception(e)
-
-    suffixes = pdf_suffixes + image_suffixes
+    suffixes = []
+    for suffix in pdf_suffixes + image_suffixes:
+        suffixes.append(f".{suffix}")
     with gr.Blocks() as demo:
         gr.HTML(header)
         with gr.Row():

+ 0 - 10
mineru/utils/guess_code_lang.py

@@ -1,10 +0,0 @@
-from magika import Magika
-
-
-DEFAULT_LANG = "txt"
-magika = Magika()
-
-def guess_language(code):
-    codebytes = code.encode(encoding="utf-8")
-    lang = magika.identify_bytes(codebytes).prediction.output.label
-    return lang if lang != "unknown" else DEFAULT_LANG

+ 20 - 0
mineru/utils/guess_suffix_or_lang.py

@@ -0,0 +1,20 @@
+from magika import Magika
+
+
+DEFAULT_LANG = "txt"
+magika = Magika()
+
+def guess_language_by_text(code):
+    codebytes = code.encode(encoding="utf-8")
+    lang = magika.identify_bytes(codebytes).prediction.output.label
+    return lang if lang != "unknown" else DEFAULT_LANG
+
+
+def guess_suffix_by_bytes(file_bytes) -> str:
+    suffix = magika.identify_bytes(file_bytes).prediction.output.label
+    return suffix
+
+
+def guess_suffix_by_path(file_path) -> str:
+    suffix = magika.identify_path(file_path).prediction.output.label
+    return suffix