Просмотр исходного кода

fix: improve LaTeX delimiter handling by replacing valid and invalid pairs

myhloli 5 месяцев назад
Родитель
Commit
4f6d8d7ca5
1 изменённый файл, 30 добавлений и 23 удаления
  1. 30 23
      mineru/backend/vlm/vlm_magic_model.py

+ 30 - 23
mineru/backend/vlm/vlm_magic_model.py

@@ -205,35 +205,42 @@ def isolated_formula_clean(txt):
 
 
 def latex_fix(latex):
-    # 白名单分隔符
-    valid_delims_list = [r'(', r')', r'[', r']', r'{', r'}', r'/', r'|',
-                         r'\{', r'\}', r'\lceil', r'\rceil', r'\lfloor',
-                         r'\rfloor', r'\backslash', r'\uparrow', r'\downarrow',
-                         r'\Uparrow', r'\Downarrow', r'\|', r'\.']
-
-    # 为\left后缺失有效分隔符的情况添加点
-    def fix_delim(match):
-        cmd = match.group(1)  # \left 或 \right
-        rest = match.group(2) if len(match.groups()) > 1 else ""
-        if not rest or rest not in valid_delims_list:
-            return cmd + "."
-        return match.group(0)
-
-    LEFT_PATTERN = re.compile(r'(\\left)(\S*)')
-    RIGHT_PATTERN = re.compile(r'(\\right)(\S*)')
+    # valid pairs:
+    # \left\{ ... \right\}
+    # \left( ... \right)
+    # \left| ... \right|
+    # \left\| ... \right\|
+    # \left[ ... \right]
+
     LEFT_COUNT_PATTERN = re.compile(r'\\left(?![a-zA-Z])')
     RIGHT_COUNT_PATTERN = re.compile(r'\\right(?![a-zA-Z])')
-    LEFT_RIGHT_REMOVE_PATTERN = re.compile(r'\\left\.?|\\right\.?')
-
-    latex = LEFT_PATTERN.sub(lambda m: fix_delim(m), latex)
-    latex = RIGHT_PATTERN.sub(lambda m: fix_delim(m), latex)
-
-
     left_count = len(LEFT_COUNT_PATTERN.findall(latex))  # 不匹配\lefteqn等
     right_count = len(RIGHT_COUNT_PATTERN.findall(latex))  # 不匹配\rightarrow
 
     if left_count != right_count:
-        return LEFT_RIGHT_REMOVE_PATTERN.sub('', latex)
+        for _ in range(2):
+            # replace valid pairs
+            latex = re.sub(r'\\left\\\{', "{", latex) # \left\{
+            latex = re.sub(r"\\left\|", "|", latex) # \left|
+            latex = re.sub(r"\\left\\\|", "|", latex) # \left\|
+            latex = re.sub(r"\\left\(", "(", latex) # \left(
+            latex = re.sub(r"\\left\[", "[", latex) # \left[
+
+            latex = re.sub(r"\\right\\\}", "}", latex) # \right\}
+            latex = re.sub(r"\\right\|", "|", latex) # \right|
+            latex = re.sub(r"\\right\\\|", "|", latex) # \right\|
+            latex = re.sub(r"\\right\)", ")", latex) # \right)
+            latex = re.sub(r"\\right\]", "]", latex) # \right]
+            latex = re.sub(r"\\right\.", "", latex) # \right.
+
+            # replace invalid pairs first
+            latex = re.sub(r'\\left\{', "{", latex)
+            latex = re.sub(r'\\right\}', "}", latex) # \left{ ... \right}
+            latex = re.sub(r'\\left\\\(', "(", latex)
+            latex = re.sub(r'\\right\\\)', ")", latex) # \left\( ... \right\)
+            latex = re.sub(r'\\left\\\[', "[", latex)
+            latex = re.sub(r'\\right\\\]', "]", latex) # \left\[ ... \right\]
+
     return latex