Просмотр исходного кода

fix(mfr): improve LaTeX formula processing and repair

- Add functions to fix unbalanced LaTeX `\left` and `\right` delimiter commands
- Implement brace matching and repair in LaTeX formulas
- Remove unnecessary whitespace and repair LaTeX code
- Replace specific LaTeX commands with appropriate alternatives
- Add logging for debugging purposes
myhloli 6 месяцев назад
Родитель
Commit
2e91fb3f52

+ 24 - 21
magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/modeling_unimernet.py

@@ -349,27 +349,30 @@ def latex_rm_whitespace(s: str):
     # \qquad后补空格
     s = re.sub(r'\\qquad(?!\s)', r'\\qquad ', s)
 
-    # 先保存 "\ " 序列,防止被错误处理
-    s = re.sub(r'\\ ', r'\\SPACE', s)
-
-    text_reg = r'(\\(operatorname|mathrm|text|mathbf)\s?\*? {.*?})'
-    letter = r'[a-zA-Z]'
-    noletter = r'[\W_^\d]'
-    names = [x[0].replace(' ', '') for x in re.findall(text_reg, s)]
-    s = re.sub(text_reg, lambda _: str(names.pop(0)), s)
-    news = s
-    while True:
-        s = news
-        news = re.sub(r'(?!\\ )(%s)\s+?(%s)' % (noletter, noletter), r'\1\2', s)
-        news = re.sub(r'(?!\\ )(%s)\s+?(%s)' % (noletter, letter), r'\1\2', news)
-        news = re.sub(r'(%s)\s+?(%s)' % (letter, noletter), r'\1\2', news)
-        if news == s:
-            break
-
-    # 恢复 "\ " 序列
-    news = re.sub(r'\\SPACE', r'\\ ', news)
-
-    return news
+    # \slash 换成 /
+    s = re.sub(r'\\slash', r'/', s)
+
+    # # 先保存 "\ " 序列,防止被错误处理
+    # s = re.sub(r'\\ ', r'\\SPACE', s)
+    #
+    # text_reg = r'(\\(operatorname|mathrm|text|mathbf)\s?\*? {.*?})'
+    # letter = r'[a-zA-Z]'
+    # noletter = r'[\W_^\d]'
+    # names = [x[0].replace(' ', '') for x in re.findall(text_reg, s)]
+    # s = re.sub(text_reg, lambda _: str(names.pop(0)), s)
+    # news = s
+    # while True:
+    #     s = news
+    #     news = re.sub(r'(?!\\ )(%s)\s+?(%s)' % (noletter, noletter), r'\1\2', s)
+    #     news = re.sub(r'(?!\\ )(%s)\s+?(%s)' % (noletter, letter), r'\1\2', news)
+    #     news = re.sub(r'(%s)\s+?(%s)' % (letter, noletter), r'\1\2', news)
+    #     if news == s:
+    #         break
+    #
+    # # 恢复 "\ " 序列
+    # news = re.sub(r'\\SPACE', r'\\ ', news)
+
+    return s
 
 
 class UnimernetModel(VisionEncoderDecoderModel):