import os
import sys
import json
import re

from PIL import Image

from dots_ocr.utils.image_utils import PILimage_to_base64


def has_latex_markdown(text: str) -> bool:
    """
    Checks if a string contains LaTeX markdown patterns.

    Args:
        text (str): The string to check.

    Returns:
        bool: True if LaTeX markdown is found, otherwise False
            (non-string input is always reported as False).
    """
    if not isinstance(text, str):
        return False

    # Regular expression patterns for LaTeX markdown
    latex_patterns = (
        r'\$\$.*?\$\$',                     # Block-level math formula $$...$$
        r'\$[^$\n]+?\$',                    # Inline math formula $...$
        r'\\begin\{.*?\}.*?\\end\{.*?\}',   # LaTeX environment \begin{...}...\end{...}
        r'\\[a-zA-Z]+\{.*?\}',              # LaTeX command \command{...}
        r'\\[a-zA-Z]+',                     # Simple LaTeX command \command
        r'\\\[.*?\\\]',                     # Display math formula \[...\]
        r'\\\(.*?\\\)',                     # Inline math formula \(...\)
    )

    # DOTALL lets the delimited forms span multiple lines.
    return any(re.search(pattern, text, re.DOTALL) for pattern in latex_patterns)


def clean_latex_preamble(latex_text: str) -> str:
    """
    Removes LaTeX preamble commands like document class and package imports.

    The document class and package commands are removed with or without a
    bracketed option list; the document begin/end markers are removed too.
    Matching is case-insensitive. Whitespace left behind by the removed
    commands is not trimmed.

    Args:
        latex_text (str): The original LaTeX text.

    Returns:
        str: The cleaned LaTeX text without preamble commands.
    """
    # Patterns to be removed
    patterns = (
        r'\\documentclass(?:\[[^\]]*\])?\{[^}]+\}',  # \documentclass{...} / \documentclass[options]{...}
        r'\\usepackage\{[^}]+\}',                    # \usepackage{...}
        r'\\usepackage\[[^\]]*\]\{[^}]+\}',          # \usepackage[options]{...}
        r'\\begin\{document\}',                      # \begin{document}
        r'\\end\{document\}',                        # \end{document}
    )

    cleaned_text = latex_text
    for pattern in patterns:
        cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.IGNORECASE)
    return cleaned_text


def get_formula_in_markdown(text: str) -> str:
    """
    Formats a string containing a formula into a standard Markdown block.

    Rules, applied in order to the stripped input:
      1. Text wrapped in double dollars is re-emitted with the delimiters on
         their own lines, unless the inner content still contains a dollar
         sign, in which case the text is returned unchanged.
      2. Text wrapped in backslash-bracket display delimiters is converted
         to a double-dollar block.
      3. Text containing a bracket-delimited or single-dollar formula
         somewhere inside is returned unchanged.
      4. Text without any LaTeX markup is returned unchanged.
      5. Anything else has the preamble removed (only when it mentions
         "usepackage"), one pair of wrapping backticks dropped, and is
         wrapped in a double-dollar block.

    Args:
        text (str): The input string, potentially containing a formula.

    Returns:
        str: The formatted string, ready for Markdown rendering. An empty
            string is returned for empty input or when nothing is left
            after removing the preamble.
    """
    # Mirror clean_text(): a missing/empty formula yields an empty string
    # instead of failing on .strip().
    if not text:
        return ""

    # Remove leading/trailing whitespace
    text = text.strip()

    # Already enclosed in $$ ... $$
    if text.startswith('$$') and text.endswith('$$'):
        text_new = text[2:-2].strip()
        if '$' not in text_new:
            return f"$$\n{text_new}\n$$"
        # More '$' inside: likely several formulas, leave untouched.
        return text

    # Handle \[...\] format, convert to $$...$$
    if text.startswith('\\[') and text.endswith('\\]'):
        inner_content = text[2:-2].strip()
        return f"$$\n{inner_content}\n$$"

    # A \[ ... \] pair somewhere inside a single line: keep as is
    if re.search(r'\\\[.*\\\]', text):
        return text

    # An inline formula ($...$) somewhere inside: keep as is
    if re.search(r'\$[^$]+\$', text):
        return text

    # If no LaTeX markdown syntax is present, return directly
    if not has_latex_markdown(text):
        return text

    # Handle unnecessary LaTeX formatting like \usepackage
    if 'usepackage' in text:
        text = clean_latex_preamble(text).strip()
        # The input may have consisted of preamble commands only; indexing
        # into the empty remainder used to raise IndexError.
        if not text:
            return ""

    # Drop one pair of wrapping backticks
    if len(text) >= 2 and text.startswith('`') and text.endswith('`'):
        text = text[1:-1]

    # Enclose the final text in a $$ block with newlines
    return f"$$\n{text}\n$$"


def clean_text(text: str) -> str:
    """
    Cleans text by trimming whitespace and unwrapping backticked inline math.

    Args:
        text: The original text.

    Returns:
        str: The cleaned text ("" for empty or None input).
    """
    if not text:
        return ""

    # Remove leading and trailing whitespace
    text = text.strip()

    # An inline formula wrapped in backticks (`$...$`) loses the backticks
    if text[:2] == '`$' and text[-2:] == '$`':
        text = text[1:-1]

    return text


def layoutjson2md(image: Image.Image, cells: list, text_key: str = 'text', no_page_hf: bool = False) -> str:
    """
    Converts a layout JSON format to Markdown.

    In the layout JSON, formulas are LaTeX, tables are HTML, and text is Markdown.

    Args:
        image: A PIL Image object.
        cells: A list of dictionaries, each representing a layout cell.
            Every cell needs a 'category'; 'Picture' cells also need a
            'bbox' of four values convertible to int.
        text_key: The key for the text field in the cell dictionary.
        no_page_hf: If True, skips page headers and footers.

    Returns:
        str: The text in Markdown format, one cell per paragraph.
    """
    text_items = []

    for cell in cells:
        category = cell['category']

        if no_page_hf and category in ('Page-header', 'Page-footer'):
            continue

        if category == 'Picture':
            # Only pictures need the bounding box, so it is parsed here
            # rather than for every cell.
            x1, y1, x2, y2 = (int(coord) for coord in cell['bbox'])
            image_crop = image.crop((x1, y1, x2, y2))
            image_base64 = PILimage_to_base64(image_crop)
            text_items.append(f"![]({image_base64})")
        elif category == 'Formula':
            text_items.append(get_formula_in_markdown(cell.get(text_key, "")))
        else:
            text_items.append(clean_text(cell.get(text_key, "")))

    return '\n\n'.join(text_items)


def fix_streamlit_formulas(md: str) -> str:
    """
    Fixes the format of formulas in Markdown to ensure they display correctly in Streamlit.

    Every $$...$$ span is rewritten so that the opening and the closing $$
    are each followed/preceded by exactly one added newline.

    Args:
        md (str): The Markdown text to fix.

    Returns:
        str: The fixed Markdown text.
    """
    # Helper used by re.sub to build the replacement for one formula
    def replace_formula(match):
        content = match.group(1)
        # Drop one existing leading/trailing newline so that the newlines
        # added below do not double up.
        if content.startswith('\n'):
            content = content[1:]
        if content.endswith('\n'):
            content = content[:-1]
        return f'$$\n{content}\n$$'

    # DOTALL so that formulas spanning several lines are matched as well
    return re.sub(r'\$\$(.*?)\$\$', replace_formula, md, flags=re.DOTALL)