Переглянути джерело

fix: enhance language guessing for code blocks in VLM processing

myhloli 2 місяців тому
батько
коміт
f2b944ab06

+ 6 - 0
mineru/backend/vlm/vlm_magic_model.py

@@ -2,7 +2,9 @@ import re
 from typing import Literal
 
 from loguru import logger
+from pygments.lexers import guess_lexer
 
+from mineru.backend.vlm.vlm_middle_json_mkcontent import merge_para_with_text
 from mineru.utils.boxbase import calculate_overlap_area_in_bbox1_area_ratio
 from mineru.utils.enum_class import ContentType, BlockType
 from mineru.utils.magic_model_utils import reduct_overlap, tie_up_category_by_distance_v3
@@ -229,6 +231,10 @@ class MagicModel:
                             del line["type"]
                         else:
                             code_block["sub_type"] = "code"
+                if code_block["sub_type"] in ["code"]:
+                    content_text = merge_para_with_text(block)
+                    lexer = guess_lexer(content_text)
+                    code_block["guess_lang"] = lexer.aliases[0]
         for block in not_include_image_blocks + not_include_table_blocks + not_include_code_blocks:
             block["type"] = BlockType.TEXT
             self.text_blocks.append(block)

+ 1 - 6
mineru/backend/vlm/vlm_middle_json_mkcontent.py

@@ -3,9 +3,6 @@ import os
 from mineru.utils.config_reader import get_latex_delimiter_config, get_formula_enable, get_table_enable
 from mineru.utils.enum_class import MakeMode, BlockType, ContentType
 
-from pygments.lexers import guess_lexer
-
-
 latex_delimiters_config = get_latex_delimiter_config()
 
 default_delimiters = {
@@ -126,9 +123,7 @@ def mk_blocks_to_markdown(para_blocks, make_mode, formula_enable, table_enable,
             for block in para_block['blocks']:  # 2nd.拼code_body
                 if block['type'] == BlockType.CODE_BODY:
                     if sub_type == BlockType.CODE:
-                        content_text = merge_para_with_text(block)
-                        lexer = guess_lexer(content_text)
-                        para_text += f"```{lexer.aliases[0]}\n{merge_para_with_text(block)}\n```"
+                        para_text += f"```{block["guess_lang"]}\n{merge_para_with_text(block)}\n```"
                     elif sub_type == BlockType.ALGORITHM:
                         para_text += merge_para_with_text(block)