| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205 |
- import os
- import sys
- import json
- import re
- from PIL import Image
- from dots_ocr.utils.image_utils import PILimage_to_base64
def has_latex_markdown(text: str) -> bool:
    """
    Report whether a value contains anything that looks like LaTeX markup.

    Args:
        text (str): Candidate string. Any non-string value is treated as
            containing no LaTeX.

    Returns:
        bool: True when at least one LaTeX pattern occurs in ``text``,
        otherwise False.
    """
    if not isinstance(text, str):
        return False

    # Every pattern is searched with re.DOTALL, so '.' also spans newlines
    # and multi-line formulas/environments are detected.
    latex_patterns = (
        r'\$\$.*?\$\$',                     # display math: $$...$$
        r'\$[^$\n]+?\$',                    # inline math on one line: $...$
        r'\\begin\{.*?\}.*?\\end\{.*?\}',   # environment: \begin{...}...\end{...}
        r'\\[a-zA-Z]+\{.*?\}',              # command with argument: \command{...}
        r'\\[a-zA-Z]+',                     # bare command: \command
        r'\\\[.*?\\\]',                     # display math: \[...\]
        r'\\\(.*?\\\)',                     # inline math: \(...\)
    )

    return any(
        re.search(candidate, text, re.DOTALL) is not None
        for candidate in latex_patterns
    )
def clean_latex_preamble(latex_text: str) -> str:
    """
    Removes LaTeX preamble commands like document class and package imports.

    The following constructs are deleted (matching is case-insensitive);
    everything else is returned untouched:

    * ``\\documentclass{...}`` and ``\\documentclass[options]{...}``
    * ``\\usepackage{...}`` and ``\\usepackage[options]{...}``
    * ``\\begin{document}`` and ``\\end{document}``

    Args:
        latex_text (str): The original LaTeX text.

    Returns:
        str: The cleaned LaTeX text without preamble commands. Whitespace
        surrounding the removed commands is left in place.
    """
    # The optional "[...]" group lets the option-carrying form of each command
    # be removed as well; without it "\documentclass[12pt]{article}" would
    # survive the cleaning.
    patterns = [
        r'\\documentclass(?:\[[^\]]*\])?\{[^}]+\}',  # \documentclass[options]{...}
        r'\\usepackage(?:\[[^\]]*\])?\{[^}]+\}',     # \usepackage[options]{...}
        r'\\begin\{document\}',                      # \begin{document}
        r'\\end\{document\}',                        # \end{document}
    ]

    # Apply the patterns one after another, in the order listed above.
    cleaned_text = latex_text
    for pattern in patterns:
        cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.IGNORECASE)

    return cleaned_text
-
def get_formula_in_markdown(text: str) -> str:
    """
    Formats a string containing a formula into a standard Markdown block.

    Decision order (first match wins):

    1. ``$$...$$`` around the whole string: re-emitted with the delimiters on
       their own lines, unless the inner part contains another ``$`` (several
       formulas), in which case the text is returned unchanged.
    2. ``\\[...\\]`` around the whole string: converted to a ``$$`` block.
    3. ``\\[...\\]`` somewhere on a single line, or an inline ``$...$``:
       returned unchanged.
    4. No LaTeX detected by ``has_latex_markdown``: returned unchanged.
    5. Otherwise: preamble commands are removed (only when ``usepackage``
       occurs), one pair of wrapping backticks is dropped, and the rest is
       wrapped in a ``$$`` block.

    Args:
        text (str): The input string, potentially containing a formula.

    Returns:
        str: The formatted string, ready for Markdown rendering. An empty
        string is returned when nothing is left after preamble removal.
    """
    # Remove leading/trailing whitespace
    text = text.strip()

    # Already enclosed in $$ ... $$. The length check keeps the opening and
    # closing delimiters from overlapping ("$$" or "$$$" are not a block).
    if len(text) >= 4 and text.startswith('$$') and text.endswith('$$'):
        inner_content = text[2:-2].strip()
        if '$' not in inner_content:
            return f"$$\n{inner_content}\n$$"
        # More '$' inside means several formulas; leave the text alone.
        return text

    # Handle \[...\] format, convert to $$...$$
    if text.startswith('\\[') and text.endswith('\\]'):
        inner_content = text[2:-2].strip()
        return f"$$\n{inner_content}\n$$"

    # A \[...\] pair embedded in one line of surrounding text: keep as is.
    if re.search(r'\\\[.*\\\]', text):
        return text

    # Inline formulas ($...$) are returned as is.
    if re.search(r'\$([^$]+)\$', text):
        return text

    # If no LaTeX markdown syntax is present, return directly
    if not has_latex_markdown(text):
        return text

    # Handle unnecessary LaTeX formatting like \usepackage
    if 'usepackage' in text:
        text = clean_latex_preamble(text).strip()
        if not text:
            # Only preamble commands were present; there is no formula to
            # render (indexing the empty string used to raise IndexError).
            return text

    # Drop one pair of wrapping backticks (inline-code style formula).
    if len(text) >= 2 and text.startswith('`') and text.endswith('`'):
        text = text[1:-1]

    # Enclose the final text in a $$ block with newlines
    return f"$$\n{text}\n$$"
def clean_text(text: str) -> str:
    """
    Normalise whitespace in a piece of text.

    Leading and trailing whitespace is trimmed, and every run of whitespace
    characters inside the text is collapsed into one space.

    Args:
        text: The original text. Falsy values (``None``, ``""``) yield ``""``.

    Returns:
        str: The cleaned text.
    """
    if text:
        # Trim the ends first, then squeeze interior whitespace runs.
        return re.sub(r'\s+', ' ', text.strip())
    return ""
def layoutjson2md(image: Image.Image, cells: list, text_key: str = 'text', no_page_hf: bool = False) -> str:
    """
    Converts a layout JSON format to Markdown.

    In the layout JSON, formulas are LaTeX, tables are HTML, and text is Markdown.

    Each cell becomes one Markdown item, chosen by its ``category``:

    * ``'Picture'``: the cell's ``bbox`` region is cropped from ``image``,
      encoded with ``PILimage_to_base64`` and embedded as a Markdown image.
    * ``'Formula'``: the cell text is passed through ``get_formula_in_markdown``.
    * anything else: the cell text is whitespace-normalised with ``clean_text``.

    Args:
        image: A PIL Image object; only used to crop ``'Picture'`` cells.
        cells: A list of dictionaries, each representing a layout cell. Every
            cell needs a ``'category'`` key; ``'Picture'`` cells also need a
            ``'bbox'`` of four values convertible to ``int``.
        text_key: The key for the text field in the cell dictionary.
        no_page_hf: If True, skips ``'Page-header'`` and ``'Page-footer'`` cells.

    Returns:
        str: The text in Markdown format, items separated by blank lines.
    """
    text_items = []
    for cell in cells:
        category = cell['category']

        if no_page_hf and category in ('Page-header', 'Page-footer'):
            continue

        if category == 'Picture':
            # The bounding box is only needed for pictures, so it is parsed
            # here instead of for every cell.
            x1, y1, x2, y2 = (int(coord) for coord in cell['bbox'])
            image_crop = image.crop((x1, y1, x2, y2))
            image_base64 = PILimage_to_base64(image_crop)
            # Embed the crop instead of appending an empty string.
            # NOTE(review): assumes PILimage_to_base64 returns a string usable
            # as a Markdown image target (e.g. a data URI) - confirm against
            # dots_ocr.utils.image_utils.
            text_items.append(f"![]({image_base64})")
            continue

        # A missing key or an explicit None both count as "no text".
        text = cell.get(text_key) or ""
        if category == 'Formula':
            text_items.append(get_formula_in_markdown(text))
        else:
            text_items.append(clean_text(text))

    return '\n\n'.join(text_items)
def fix_streamlit_formulas(md: str) -> str:
    """
    Rewrite every ``$$...$$`` span so that each delimiter sits on its own line,
    which is the layout Streamlit needs to display the formula correctly.

    At most one newline directly after the opening ``$$`` and one directly
    before the closing ``$$`` is absorbed, so formulas that already have that
    layout come back unchanged.

    Args:
        md (str): The Markdown text to fix.

    Returns:
        str: The fixed Markdown text.
    """
    # DOTALL lets a formula span several lines; the lazy quantifier stops at
    # the nearest closing $$.
    formula_re = re.compile(r'\$\$(.*?)\$\$', re.DOTALL)

    def _rewrap(found):
        body = found.group(1)
        # Drop one leading newline first, then check what remains for a
        # trailing newline.
        if body[:1] == '\n':
            body = body[1:]
        if body[-1:] == '\n':
            body = body[:-1]
        return f'$$\n{body}\n$$'

    return formula_re.sub(_rewrap, md)
|