3 hete · a220b8a208
--- a/mineru.template.json
+++ b/mineru.template.json
@@ -17,7 +17,7 @@
 
				         "title_aided": {
			
 
				             "api_key": "your_api_key",
			
 
				             "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
			
 
				-            "model": "qwen2.5-32b-instruct",
			
 
				+            "model": "qwen3-next-80b-a3b-instruct",
			
 
				             "enable": false
			
 
				         }
			
 
				     },
			
--- a/mineru/backend/pipeline/model_init.py
+++ b/mineru/backend/pipeline/model_init.py
@@ -17,8 +17,10 @@ from ...model.table.rec.unet_table.main import UnetTableModel
 
				 from ...utils.enum_class import ModelPath
			
 
				 from ...utils.models_download_utils import auto_download_and_get_model_root_path
			
 
				 
			
 
				-MFR_MODEL = "unimernet_small"
			
 
				-# MFR_MODEL = "pp_formulanet_plus_m"
			
 
				+MFR_MODEL = os.getenv('MINERU_MFR_MODEL', None)
			
 
				+if MFR_MODEL is None:
			
 
				+    # MFR_MODEL = "unimernet_small"
			
 
				+    MFR_MODEL = "pp_formulanet_plus_m"
			
 
				 
			
 
				 
			
 
				 def img_orientation_cls_model_init():
			
--- a/mineru/model/mfr/pp_formulanet_plus_m/processors.py
+++ b/mineru/model/mfr/pp_formulanet_plus_m/processors.py
@@ -6,9 +6,13 @@ import re
 
				 
			
 
				 from PIL import Image, ImageOps
			
 
				 from typing import List, Optional, Tuple, Union, Dict, Any
			
 
				+
			
 
				+from loguru import logger
			
 
				 from tokenizers import AddedToken
			
 
				 from tokenizers import Tokenizer as TokenizerFast
			
 
				 
			
 
				+from mineru.model.mfr.unimernet.unimernet_hf.modeling_unimernet import fix_latex_left_right
			
 
				+
			
 
				 
			
 
				 class UniMERNetImgDecode(object):
			
 
				     """Class for decoding images for UniMERNet, including cropping margins, resizing, and padding."""
			
@@ -589,6 +593,7 @@ class UniMERNetDecode(object):
 
				         replaced_formula = pattern.sub(replacer, formula)
			
 
				         return replaced_formula.replace('"', "")
			
 
				 
			
 
				+    UP_PATTERN = re.compile(r'\\up([a-zA-Z]+)')
			
 
				     def post_process(self, text: str) -> str:
			
 
				         """Post-processes a string by fixing text and normalizing it.
			
 
				 
			
@@ -602,6 +607,10 @@ class UniMERNetDecode(object):
 
				 
			
 
				         text = self.remove_chinese_text_wrapping(text)
			
 
				         text = fix_text(text)
			
 
				+        text = fix_latex_left_right(text)
			
 
				+        text = self.UP_PATTERN.sub(
			
 
				+            lambda m: m.group(0) if m.group(1) in ["arrow", "downarrow", "lus", "silon"] else f"\\{m.group(1)}", text
			
 
				+        )
			
 
				         text = self.normalize(text)
			
 
				         return text
			
 
				 
			
--- a/mineru/utils/llm_aided.py
+++ b/mineru/utils/llm_aided.py
@@ -51,7 +51,7 @@ def llm_aided_title(page_info_list, title_aided_config):
 
				 3. 保持字典内key-value的对应关系不变
			
 
				 
			
 
				 4. 优化层次结构：
			
 
				-    - 为每个标题元素添加适当的层次结构
			
 
				+    - 根据标题内容的语义为每个标题元素添加适当的层次结构
			
 
				     - 行高较大的标题一般是更高级别的标题
			
 
				     - 标题从前至后的层级必须是连续的，不能跳过层级
			
 
				     - 标题层级最多为4级，不要添加过多的层级
			
@@ -61,7 +61,6 @@ def llm_aided_title(page_info_list, title_aided_config):
 
				     - 在完成初步分级后，仔细检查分级结果的合理性
			
 
				     - 根据上下文关系和逻辑顺序，对不合理的分级进行微调
			
 
				     - 确保最终的分级结果符合文档的实际结构和逻辑
			
 
				-    - 字典中可能包含被误当成标题的正文，你可以通过将其层级标记为 0 来排除它们
			
 
				 
			
 
				 IMPORTANT: 
			
 
				 请直接返回优化过的由标题层级组成的字典，格式为{{标题id:标题层级}}，如下：
			
@@ -78,6 +77,8 @@ Input title list:
 
				 
			
 
				 Corrected title list:
			
 
				 """
			
 
				+    #5.
			
 
				+    #- 字典中可能包含被误当成标题的正文，你可以通过将其层级标记为 0 来排除它们
			
 
				 
			
 
				     retry_count = 0
			
 
				     max_retries = 3
			
@@ -89,6 +90,7 @@ Corrected title list:
 
				                 model=title_aided_config["model"],
			
 
				                 messages=[
			
 
				                     {'role': 'user', 'content': title_optimize_prompt}],
			
 
				+                extra_body={"enable_thinking": False},
			
 
				                 temperature=0.7,
			
 
				                 stream=True,
			
 
				             )