Quellcode durchsuchen

feat: enhance language guessing for code blocks by integrating guess_lang into line structure

myhloli vor 2 Monaten
Ursprung
Commit
078099f19d

+ 22 - 32
mineru/backend/vlm/vlm_magic_model.py

@@ -3,7 +3,6 @@ from typing import Literal
 
 from loguru import logger
 
-from mineru.backend.vlm.vlm_middle_json_mkcontent import merge_para_with_text
 from mineru.utils.boxbase import calculate_overlap_area_in_bbox1_area_ratio
 from mineru.utils.enum_class import ContentType, BlockType
 from mineru.utils.guess_suffix_or_lang import guess_language_by_text
@@ -46,6 +45,8 @@ class MagicModel:
                 continue
 
             span_type = "unknown"
+            line_type = None
+            guess_lang = None
 
             if block_type in [
                 "text",
@@ -75,6 +76,7 @@ class MagicModel:
                 line_type = block_type
                 block_type = BlockType.CODE_BODY
                 span_type = ContentType.TEXT
+                guess_lang = guess_language_by_text(block_content)
             elif block_type in ["equation"]:
                 block_type = BlockType.INTERLINE_EQUATION
                 span_type = ContentType.INTERLINE_EQUATION
@@ -145,35 +147,22 @@ class MagicModel:
                         "content": block_content,
                     }
 
+            # 处理span类型并添加到all_spans
             if isinstance(span, dict) and "bbox" in span:
                 self.all_spans.append(span)
-                if block_type == BlockType.CODE_BODY:
-                    line = {
-                        "bbox": block_bbox,
-                        "spans": [span],
-                        "type": line_type
-                    }
-                else:
-                    line = {
-                        "bbox": block_bbox,
-                        "spans": [span],
-                    }
+                spans = [span]
             elif isinstance(span, list):
                 self.all_spans.extend(span)
-                if block_type == BlockType.CODE_BODY:
-                    line = {
-                        "bbox": block_bbox,
-                        "spans": span,
-                        "type": line_type
-                    }
-                else:
-                    line = {
-                        "bbox": block_bbox,
-                        "spans": span,
-                    }
+                spans = span
             else:
                 raise ValueError(f"Invalid span type: {span_type}, expected dict or list, got {type(span)}")
 
+            # 构造line对象
+            if block_type in [BlockType.CODE_BODY]:
+                line = {"bbox": block_bbox, "spans": spans, "extra": {"type": line_type, "guess_lang": guess_lang}}
+            else:
+                line = {"bbox": block_bbox, "spans": spans}
+
             blocks.append(
                 {
                     "bbox": block_bbox,
@@ -225,15 +214,16 @@ class MagicModel:
         for code_block in self.code_blocks:
             for block in code_block['blocks']:
                 if block['type'] == BlockType.CODE_BODY:
-                    for line in block["lines"]:
-                        if "type" in line:
-                            code_block["sub_type"] = line["type"]
-                            del line["type"]
-                        else:
-                            code_block["sub_type"] = "code"
-                if code_block["sub_type"] in ["code"]:
-                    content_text = merge_para_with_text(block)
-                    code_block["guess_lang"] = guess_language_by_text(content_text)
+                    if len(block["lines"]) > 0:
+                        line = block["lines"][0]
+                        code_block["sub_type"] = line["extra"]["type"]
+                        if code_block["sub_type"] in ["code"]:
+                            code_block["guess_lang"] = line["extra"]["guess_lang"]
+                        del line["extra"]
+                    else:
+                        code_block["sub_type"] = "code"
+                        code_block["guess_lang"] = "txt"
+
         for block in not_include_image_blocks + not_include_table_blocks + not_include_code_blocks:
             block["type"] = BlockType.TEXT
             self.text_blocks.append(block)

+ 2 - 1
mineru/backend/vlm/vlm_middle_json_mkcontent.py

@@ -216,7 +216,8 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)
         for block in para_block['blocks']:
             if block['type'] == BlockType.CODE_BODY:
                 para_content[BlockType.CODE_BODY] = merge_para_with_text(block)
-                para_content["guess_lang"] = para_block["guess_lang"]
+                if para_block["sub_type"] == BlockType.CODE:
+                    para_content["guess_lang"] = para_block["guess_lang"]
             if block['type'] == BlockType.CODE_CAPTION:
                 para_content[BlockType.CODE_CAPTION].append(merge_para_with_text(block))