| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338 |
- import re
- LEFT_PATTERN = re.compile(r'(\\left)(\S*)')
- RIGHT_PATTERN = re.compile(r'(\\right)(\S*)')
- LEFT_COUNT_PATTERN = re.compile(r'\\left(?![a-zA-Z])')
- RIGHT_COUNT_PATTERN = re.compile(r'\\right(?![a-zA-Z])')
- LEFT_RIGHT_REMOVE_PATTERN = re.compile(r'\\left\.?|\\right\.?')
- def fix_latex_left_right(s, fix_delimiter=True):
- """
- 修复LaTeX中的\\left和\\right命令
- 1. 确保它们后面跟有效分隔符
- 2. 平衡\\left和\\right的数量
- """
- # 白名单分隔符
- valid_delims_list = [r'(', r')', r'[', r']', r'{', r'}', r'/', r'|',
- r'\{', r'\}', r'\lceil', r'\rceil', r'\lfloor',
- r'\rfloor', r'\backslash', r'\uparrow', r'\downarrow',
- r'\Uparrow', r'\Downarrow', r'\|', r'\.']
- # 为\left后缺失有效分隔符的情况添加点
- def fix_delim(match, is_left=True):
- cmd = match.group(1) # \left 或 \right
- rest = match.group(2) if len(match.groups()) > 1 else ""
- if not rest or rest not in valid_delims_list:
- return cmd + "."
- return match.group(0)
- # 使用更精确的模式匹配\left和\right命令
- # 确保它们是独立的命令,不是其他命令的一部分
- # 使用预编译正则和统一回调函数
- if fix_delimiter:
- s = LEFT_PATTERN.sub(lambda m: fix_delim(m, True), s)
- s = RIGHT_PATTERN.sub(lambda m: fix_delim(m, False), s)
- # 更精确地计算\left和\right的数量
- left_count = len(LEFT_COUNT_PATTERN.findall(s)) # 不匹配\lefteqn等
- right_count = len(RIGHT_COUNT_PATTERN.findall(s)) # 不匹配\rightarrow等
- if left_count == right_count:
- # 如果数量相等,检查是否在同一组
- return fix_left_right_pairs(s)
- # return s
- else:
- # 如果数量不等,移除所有\left和\right
- # logger.debug(f"latex:{s}")
- # logger.warning(f"left_count: {left_count}, right_count: {right_count}")
- return LEFT_RIGHT_REMOVE_PATTERN.sub('', s)
- def fix_left_right_pairs(latex_formula):
- """
- 检测并修复LaTeX公式中\\left和\\right不在同一组的情况
- Args:
- latex_formula (str): 输入的LaTeX公式
- Returns:
- str: 修复后的LaTeX公式
- """
- # 用于跟踪花括号嵌套层级
- brace_stack = []
- # 用于存储\left信息: (位置, 深度, 分隔符)
- left_stack = []
- # 存储需要调整的\right信息: (开始位置, 结束位置, 目标位置)
- adjustments = []
- i = 0
- while i < len(latex_formula):
- # 检查是否是转义字符
- if i > 0 and latex_formula[i - 1] == '\\':
- backslash_count = 0
- j = i - 1
- while j >= 0 and latex_formula[j] == '\\':
- backslash_count += 1
- j -= 1
- if backslash_count % 2 == 1:
- i += 1
- continue
- # 检测\left命令
- if i + 5 < len(latex_formula) and latex_formula[i:i + 5] == "\\left" and i + 5 < len(latex_formula):
- delimiter = latex_formula[i + 5]
- left_stack.append((i, len(brace_stack), delimiter))
- i += 6 # 跳过\left和分隔符
- continue
- # 检测\right命令
- elif i + 6 < len(latex_formula) and latex_formula[i:i + 6] == "\\right" and i + 6 < len(latex_formula):
- delimiter = latex_formula[i + 6]
- if left_stack:
- left_pos, left_depth, left_delim = left_stack.pop()
- # 如果\left和\right不在同一花括号深度
- if left_depth != len(brace_stack):
- # 找到\left所在花括号组的结束位置
- target_pos = find_group_end(latex_formula, left_pos, left_depth)
- if target_pos != -1:
- # 记录需要移动的\right
- adjustments.append((i, i + 7, target_pos))
- i += 7 # 跳过\right和分隔符
- continue
- # 处理花括号
- if latex_formula[i] == '{':
- brace_stack.append(i)
- elif latex_formula[i] == '}':
- if brace_stack:
- brace_stack.pop()
- i += 1
- # 应用调整,从后向前处理以避免索引变化
- if not adjustments:
- return latex_formula
- result = list(latex_formula)
- adjustments.sort(reverse=True, key=lambda x: x[0])
- for start, end, target in adjustments:
- # 提取\right部分
- right_part = result[start:end]
- # 从原位置删除
- del result[start:end]
- # 在目标位置插入
- result.insert(target, ''.join(right_part))
- return ''.join(result)
- def find_group_end(text, pos, depth):
- """查找特定深度的花括号组的结束位置"""
- current_depth = depth
- i = pos
- while i < len(text):
- if text[i] == '{' and (i == 0 or not is_escaped(text, i)):
- current_depth += 1
- elif text[i] == '}' and (i == 0 or not is_escaped(text, i)):
- current_depth -= 1
- if current_depth < depth:
- return i
- i += 1
- return -1 # 未找到对应结束位置
- def is_escaped(text, pos):
- """检查字符是否被转义"""
- backslash_count = 0
- j = pos - 1
- while j >= 0 and text[j] == '\\':
- backslash_count += 1
- j -= 1
- return backslash_count % 2 == 1
- def fix_unbalanced_braces(latex_formula):
- """
- 检测LaTeX公式中的花括号是否闭合,并删除无法配对的花括号
- Args:
- latex_formula (str): 输入的LaTeX公式
- Returns:
- str: 删除无法配对的花括号后的LaTeX公式
- """
- stack = [] # 存储左括号的索引
- unmatched = set() # 存储不匹配括号的索引
- i = 0
- while i < len(latex_formula):
- # 检查是否是转义的花括号
- if latex_formula[i] in ['{', '}']:
- # 计算前面连续的反斜杠数量
- backslash_count = 0
- j = i - 1
- while j >= 0 and latex_formula[j] == '\\':
- backslash_count += 1
- j -= 1
- # 如果前面有奇数个反斜杠,则该花括号是转义的,不参与匹配
- if backslash_count % 2 == 1:
- i += 1
- continue
- # 否则,该花括号参与匹配
- if latex_formula[i] == '{':
- stack.append(i)
- else: # latex_formula[i] == '}'
- if stack: # 有对应的左括号
- stack.pop()
- else: # 没有对应的左括号
- unmatched.add(i)
- i += 1
- # 所有未匹配的左括号
- unmatched.update(stack)
- # 构建新字符串,删除不匹配的括号
- return ''.join(char for i, char in enumerate(latex_formula) if i not in unmatched)
- def process_latex(input_string):
- """
- 处理LaTeX公式中的反斜杠:
- 1. 如果\后跟特殊字符(#$%&~_^\\{})或空格,保持不变
- 2. 如果\后跟两个小写字母,保持不变
- 3. 其他情况,在\后添加空格
- Args:
- input_string (str): 输入的LaTeX公式
- Returns:
- str: 处理后的LaTeX公式
- """
- def replace_func(match):
- # 获取\后面的字符
- next_char = match.group(1)
- # 如果是特殊字符或空格,保持不变
- if next_char in "#$%&~_^|\\{} \t\n\r\v\f":
- return match.group(0)
- # 如果是字母,检查下一个字符
- if 'a' <= next_char <= 'z' or 'A' <= next_char <= 'Z':
- pos = match.start() + 2 # \x后的位置
- if pos < len(input_string) and ('a' <= input_string[pos] <= 'z' or 'A' <= input_string[pos] <= 'Z'):
- # 下一个字符也是字母,保持不变
- return match.group(0)
- # 其他情况,在\后添加空格
- return '\\' + ' ' + next_char
- # 匹配\后面跟一个字符的情况
- pattern = r'\\(.)'
- return re.sub(pattern, replace_func, input_string)
- # 常见的在KaTeX/MathJax中可用的数学环境
- ENV_TYPES = ['array', 'matrix', 'pmatrix', 'bmatrix', 'vmatrix',
- 'Bmatrix', 'Vmatrix', 'cases', 'aligned', 'gathered', 'align', 'align*',]
- ENV_BEGIN_PATTERNS = {env: re.compile(r'\\begin\{' + env + r'\}') for env in ENV_TYPES}
- ENV_END_PATTERNS = {env: re.compile(r'\\end\{' + env + r'\}') for env in ENV_TYPES}
- ENV_FORMAT_PATTERNS = {env: re.compile(r'\\begin\{' + env + r'\}\{([^}]*)\}') for env in ENV_TYPES}
- def fix_latex_environments(s):
- """
- 检测LaTeX中环境(如array)的\\begin和\\end是否匹配
- 1. 如果缺少\\begin标签则在开头添加
- 2. 如果缺少\\end标签则在末尾添加
- """
- for env in ENV_TYPES:
- begin_count = len(ENV_BEGIN_PATTERNS[env].findall(s))
- end_count = len(ENV_END_PATTERNS[env].findall(s))
- if begin_count != end_count:
- if end_count > begin_count:
- format_match = ENV_FORMAT_PATTERNS[env].search(s)
- default_format = '{c}' if env == 'array' else ''
- format_str = '{' + format_match.group(1) + '}' if format_match else default_format
- missing_count = end_count - begin_count
- begin_command = '\\begin{' + env + '}' + format_str + ' '
- s = begin_command * missing_count + s
- else:
- missing_count = begin_count - end_count
- s = s + (' \\end{' + env + '}') * missing_count
- return s
- REPLACEMENTS_PATTERNS = {
- re.compile(r'\\underbar'): r'\\underline',
- re.compile(r'\\Bar'): r'\\hat',
- re.compile(r'\\Hat'): r'\\hat',
- re.compile(r'\\Tilde'): r'\\tilde',
- re.compile(r'\\slash'): r'/',
- re.compile(r'\\textperthousand'): r'‰',
- re.compile(r'\\sun'): r'☉',
- re.compile(r'\\textunderscore'): r'\\_',
- re.compile(r'\\fint'): r'⨏',
- re.compile(r'\\up '): r'\\ ',
- re.compile(r'\\vline = '): r'\\models ',
- re.compile(r'\\vDash '): r'\\models ',
- re.compile(r'\\sq \\sqcup '): r'\\square ',
- re.compile(r'\\copyright'): r'©',
- }
- QQUAD_PATTERN = re.compile(r'\\qquad(?!\s)')
- def remove_up_commands(s: str):
- """Remove unnecessary up commands from LaTeX code."""
- UP_PATTERN = re.compile(r'\\up([a-zA-Z]+)')
- s = UP_PATTERN.sub(
- lambda m: m.group(0) if m.group(1) in ["arrow", "downarrow", "lus", "silon"] else f"\\{m.group(1)}", s
- )
- return s
- def remove_unsupported_commands(s: str):
- """Remove unsupported LaTeX commands."""
- COMMANDS_TO_REMOVE_PATTERN = re.compile(
- r'\\(?:lefteqn|boldmath|ensuremath|centering|textsubscript|sides|textsl|textcent|emph|protect|null)')
- s = COMMANDS_TO_REMOVE_PATTERN.sub('', s)
- return s
- def latex_rm_whitespace(s: str):
- """Remove unnecessary whitespace from LaTeX code."""
- s = fix_unbalanced_braces(s)
- s = fix_latex_left_right(s)
- s = fix_latex_environments(s)
- s = remove_up_commands(s)
- s = remove_unsupported_commands(s)
- # 应用所有替换
- for pattern, replacement in REPLACEMENTS_PATTERNS.items():
- s = pattern.sub(replacement, s)
- # 处理LaTeX中的反斜杠和空格
- s = process_latex(s)
- # \qquad后补空格
- s = QQUAD_PATTERN.sub(r'\\qquad ', s)
- # 如果字符串以反斜杠结尾,去掉最后的反斜杠
- while s.endswith('\\'):
- s = s[:-1]
- return s
|