format_transformer.py 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205
  1. import os
  2. import sys
  3. import json
  4. import re
  5. from PIL import Image
  6. from dots_ocr.utils.image_utils import PILimage_to_base64
  7. def has_latex_markdown(text: str) -> bool:
  8. """
  9. Checks if a string contains LaTeX markdown patterns.
  10. Args:
  11. text (str): The string to check.
  12. Returns:
  13. bool: True if LaTeX markdown is found, otherwise False.
  14. """
  15. if not isinstance(text, str):
  16. return False
  17. # Define regular expression patterns for LaTeX markdown
  18. latex_patterns = [
  19. r'\$\$.*?\$\$', # Block-level math formula $$...$$
  20. r'\$[^$\n]+?\$', # Inline math formula $...$
  21. r'\\begin\{.*?\}.*?\\end\{.*?\}', # LaTeX environment \begin{...}...\end{...}
  22. r'\\[a-zA-Z]+\{.*?\}', # LaTeX command \command{...}
  23. r'\\[a-zA-Z]+', # Simple LaTeX command \command
  24. r'\\\[.*?\\\]', # Display math formula \[...\]
  25. r'\\\(.*?\\\)', # Inline math formula \(...\)
  26. ]
  27. # Check if any of the patterns match
  28. for pattern in latex_patterns:
  29. if re.search(pattern, text, re.DOTALL):
  30. return True
  31. return False
  32. def clean_latex_preamble(latex_text: str) -> str:
  33. """
  34. Removes LaTeX preamble commands like document class and package imports.
  35. Args:
  36. latex_text (str): The original LaTeX text.
  37. Returns:
  38. str: The cleaned LaTeX text without preamble commands.
  39. """
  40. # Define patterns to be removed
  41. patterns = [
  42. r'\\documentclass\{[^}]+\}', # \documentclass{...}
  43. r'\\usepackage\{[^}]+\}', # \usepackage{...}
  44. r'\\usepackage\[[^\]]*\]\{[^}]+\}', # \usepackage[options]{...}
  45. r'\\begin\{document\}', # \begin{document}
  46. r'\\end\{document\}', # \end{document}
  47. ]
  48. # Apply each pattern to clean the text
  49. cleaned_text = latex_text
  50. for pattern in patterns:
  51. cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.IGNORECASE)
  52. return cleaned_text
  53. def get_formula_in_markdown(text: str) -> str:
  54. """
  55. Formats a string containing a formula into a standard Markdown block.
  56. Args:
  57. text (str): The input string, potentially containing a formula.
  58. Returns:
  59. str: The formatted string, ready for Markdown rendering.
  60. """
  61. # Remove leading/trailing whitespace
  62. text = text.strip()
  63. # Check if it's already enclosed in $$
  64. if text.startswith('$$') and text.endswith('$$'):
  65. text_new = text[2:-2].strip()
  66. if not '$' in text_new:
  67. return f"$$\n{text_new}\n$$"
  68. else:
  69. return text
  70. # Handle \[...\] format, convert to $$...$$
  71. if text.startswith('\\[') and text.endswith('\\]'):
  72. inner_content = text[2:-2].strip()
  73. return f"$$\n{inner_content}\n$$"
  74. # Check if it's enclosed in \[ \]
  75. if len(re.findall(r'.*\\\[.*\\\].*', text)) > 0:
  76. return text
  77. # Handle inline formulas ($...$)
  78. pattern = r'\$([^$]+)\$'
  79. matches = re.findall(pattern, text)
  80. if len(matches) > 0:
  81. # It's an inline formula, return it as is
  82. return text
  83. # If no LaTeX markdown syntax is present, return directly
  84. if not has_latex_markdown(text):
  85. return text
  86. # Handle unnecessary LaTeX formatting like \usepackage
  87. if 'usepackage' in text:
  88. text = clean_latex_preamble(text)
  89. if text[0] == '`' and text[-1] == '`':
  90. text = text[1:-1]
  91. # Enclose the final text in a $$ block with newlines
  92. text = f"$$\n{text}\n$$"
  93. return text
  94. def clean_text(text: str) -> str:
  95. """
  96. Cleans text by removing extra whitespace.
  97. Args:
  98. text: The original text.
  99. Returns:
  100. str: The cleaned text.
  101. """
  102. if not text:
  103. return ""
  104. # Remove leading and trailing whitespace
  105. text = text.strip()
  106. # Replace multiple consecutive whitespace characters with a single space
  107. text = re.sub(r'\s+', ' ', text)
  108. return text
  109. def layoutjson2md(image: Image.Image, cells: list, text_key: str = 'text', no_page_hf: bool = False) -> str:
  110. """
  111. Converts a layout JSON format to Markdown.
  112. In the layout JSON, formulas are LaTeX, tables are HTML, and text is Markdown.
  113. Args:
  114. image: A PIL Image object.
  115. cells: A list of dictionaries, each representing a layout cell.
  116. text_key: The key for the text field in the cell dictionary.
  117. no_page_header_footer: If True, skips page headers and footers.
  118. Returns:
  119. str: The text in Markdown format.
  120. """
  121. text_items = []
  122. for i, cell in enumerate(cells):
  123. x1, y1, x2, y2 = [int(coord) for coord in cell['bbox']]
  124. text = cell.get(text_key, "")
  125. if no_page_hf and cell['category'] in ['Page-header', 'Page-footer']:
  126. continue
  127. if cell['category'] == 'Picture':
  128. image_crop = image.crop((x1, y1, x2, y2))
  129. image_base64 = PILimage_to_base64(image_crop)
  130. text_items.append(f"![]({image_base64})")
  131. elif cell['category'] == 'Formula':
  132. text_items.append(get_formula_in_markdown(text))
  133. else:
  134. text = clean_text(text)
  135. text_items.append(f"{text}")
  136. markdown_text = '\n\n'.join(text_items)
  137. return markdown_text
  138. def fix_streamlit_formulas(md: str) -> str:
  139. """
  140. Fixes the format of formulas in Markdown to ensure they display correctly in Streamlit.
  141. It adds a newline after the opening $$ and before the closing $$ if they don't already exist.
  142. Args:
  143. md_text (str): The Markdown text to fix.
  144. Returns:
  145. str: The fixed Markdown text.
  146. """
  147. # This inner function will be used by re.sub to perform the replacement
  148. def replace_formula(match):
  149. content = match.group(1)
  150. # If the content already has surrounding newlines, don't add more.
  151. if content.startswith('\n'):
  152. content = content[1:]
  153. if content.endswith('\n'):
  154. content = content[:-1]
  155. return f'$$\n{content}\n$$'
  156. # Use regex to find all $$....$$ patterns and replace them using the helper function.
  157. return re.sub(r'\$\$(.*?)\$\$', replace_formula, md, flags=re.DOTALL)