Forráskód Böngészése

refactor: enhance title hierarchy logic and update model configuration

myhloli 3 hete
szülő
commit
a220b8a208

+ 1 - 1
mineru.template.json

@@ -17,7 +17,7 @@
         "title_aided": {
             "api_key": "your_api_key",
             "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
-            "model": "qwen2.5-32b-instruct",
+            "model": "qwen3-next-80b-a3b-instruct",
             "enable": false
         }
     },

+ 4 - 2
mineru/backend/pipeline/model_init.py

@@ -17,8 +17,10 @@ from ...model.table.rec.unet_table.main import UnetTableModel
 from ...utils.enum_class import ModelPath
 from ...utils.models_download_utils import auto_download_and_get_model_root_path
 
-MFR_MODEL = "unimernet_small"
-# MFR_MODEL = "pp_formulanet_plus_m"
+MFR_MODEL = os.getenv('MINERU_MFR_MODEL', None)
+if MFR_MODEL is None:
+    # MFR_MODEL = "unimernet_small"
+    MFR_MODEL = "pp_formulanet_plus_m"
 
 
 def img_orientation_cls_model_init():

+ 9 - 0
mineru/model/mfr/pp_formulanet_plus_m/processors.py

@@ -6,9 +6,13 @@ import re
 
 from PIL import Image, ImageOps
 from typing import List, Optional, Tuple, Union, Dict, Any
+
+from loguru import logger
 from tokenizers import AddedToken
 from tokenizers import Tokenizer as TokenizerFast
 
+from mineru.model.mfr.unimernet.unimernet_hf.modeling_unimernet import fix_latex_left_right
+
 
 class UniMERNetImgDecode(object):
     """Class for decoding images for UniMERNet, including cropping margins, resizing, and padding."""
@@ -589,6 +593,7 @@ class UniMERNetDecode(object):
         replaced_formula = pattern.sub(replacer, formula)
         return replaced_formula.replace('"', "")
 
+    UP_PATTERN = re.compile(r'\\up([a-zA-Z]+)')
     def post_process(self, text: str) -> str:
         """Post-processes a string by fixing text and normalizing it.
 
@@ -602,6 +607,10 @@ class UniMERNetDecode(object):
 
         text = self.remove_chinese_text_wrapping(text)
         text = fix_text(text)
+        text = fix_latex_left_right(text)
+        text = self.UP_PATTERN.sub(
+            lambda m: m.group(0) if m.group(1) in ["arrow", "downarrow", "lus", "silon"] else f"\\{m.group(1)}", text
+        )
         text = self.normalize(text)
         return text
 

+ 4 - 2
mineru/utils/llm_aided.py

@@ -51,7 +51,7 @@ def llm_aided_title(page_info_list, title_aided_config):
 3. 保持字典内key-value的对应关系不变
 
 4. 优化层次结构:
-    - 为每个标题元素添加适当的层次结构
+    - 根据标题内容的语义为每个标题元素添加适当的层次结构
     - 行高较大的标题一般是更高级别的标题
     - 标题从前至后的层级必须是连续的,不能跳过层级
     - 标题层级最多为4级,不要添加过多的层级
@@ -61,7 +61,6 @@ def llm_aided_title(page_info_list, title_aided_config):
     - 在完成初步分级后,仔细检查分级结果的合理性
     - 根据上下文关系和逻辑顺序,对不合理的分级进行微调
     - 确保最终的分级结果符合文档的实际结构和逻辑
-    - 字典中可能包含被误当成标题的正文,你可以通过将其层级标记为 0 来排除它们
 
 IMPORTANT: 
 请直接返回优化过的由标题层级组成的字典,格式为{{标题id:标题层级}},如下:
@@ -78,6 +77,8 @@ Input title list:
 
 Corrected title list:
 """
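+    # 5. (disabled prompt rule: kept for reference only; it sits outside the prompt string, so it is not sent to the model)
+    # - 字典中可能包含被误当成标题的正文,你可以通过将其层级标记为 0 来排除它们 (the dict may contain body text mistaken for titles; mark its level as 0 to exclude it)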
 
     retry_count = 0
     max_retries = 3
@@ -89,6 +90,7 @@ Corrected title list:
                 model=title_aided_config["model"],
                 messages=[
                     {'role': 'user', 'content': title_optimize_prompt}],
+                extra_body={"enable_thinking": False},
                 temperature=0.7,
                 stream=True,
             )