Quellcode durchsuchen

feat: implement language guessing for code blocks using Magika

myhloli vor 2 Monaten
Ursprung
Commit
f081d36a3a

+ 2 - 3
mineru/backend/vlm/vlm_magic_model.py

@@ -2,11 +2,11 @@ import re
 from typing import Literal
 
 from loguru import logger
-from pygments.lexers import guess_lexer
 
 from mineru.backend.vlm.vlm_middle_json_mkcontent import merge_para_with_text
 from mineru.utils.boxbase import calculate_overlap_area_in_bbox1_area_ratio
 from mineru.utils.enum_class import ContentType, BlockType
+from mineru.utils.guess_code_lang import guess_language
 from mineru.utils.magic_model_utils import reduct_overlap, tie_up_category_by_distance_v3
 
 
@@ -233,8 +233,7 @@ class MagicModel:
                             code_block["sub_type"] = "code"
                 if code_block["sub_type"] in ["code"]:
                     content_text = merge_para_with_text(block)
-                    lexer = guess_lexer(content_text)
-                    code_block["guess_lang"] = lexer.aliases[0]
+                    code_block["guess_lang"] = guess_language(content_text)
         for block in not_include_image_blocks + not_include_table_blocks + not_include_code_blocks:
             block["type"] = BlockType.TEXT
             self.text_blocks.append(block)

+ 1 - 0
mineru/backend/vlm/vlm_middle_json_mkcontent.py

@@ -216,6 +216,7 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)
         for block in para_block['blocks']:
             if block['type'] == BlockType.CODE_BODY:
                 para_content[BlockType.CODE_BODY] = merge_para_with_text(block)
+                para_content["guess_lang"] = para_block["guess_lang"]
             if block['type'] == BlockType.CODE_CAPTION:
                 para_content[BlockType.CODE_CAPTION].append(merge_para_with_text(block))
 

+ 10 - 0
mineru/utils/guess_code_lang.py

@@ -0,0 +1,10 @@
+from magika import Magika
+
+
+DEFAULT_LANG = "txt"
+magika = Magika()
+
+def guess_language(code):
+    """Guess the language label of `code` with Magika; None/empty or unrecognised input gives DEFAULT_LANG."""
+    lang = magika.identify_bytes((code or "").encode("utf-8")).prediction.output.label
+    return DEFAULT_LANG if lang in ("unknown", "empty") else lang

+ 1 - 1
pyproject.toml

@@ -38,7 +38,7 @@ dependencies = [
     "scikit-image>=0.25.0,<1.0.0",
     "openai>=1.70.0,<2",
     "beautifulsoup4>=4.13.5,<5",
-    "Pygments",
+    "magika",
     "mineru_vl_utils",
 ]