|
@@ -30,6 +30,9 @@ if str(ocr_platform_root) not in sys.path:
|
|
|
from ocr_utils.html_utils import convert_html_table_to_markdown, parse_html_tables
|
|
from ocr_utils.html_utils import convert_html_table_to_markdown, parse_html_tables
|
|
|
from ocr_utils.visualization_utils import VisualizationUtils
|
|
from ocr_utils.visualization_utils import VisualizationUtils
|
|
|
|
|
|
|
|
|
|
+# BeautifulSoup用于精确HTML表格处理
|
|
|
|
|
+from bs4 import BeautifulSoup
|
|
|
|
|
+
|
|
|
# 从本地文件导入 Streamlit 特定函数
|
|
# 从本地文件导入 Streamlit 特定函数
|
|
|
from ocr_validator_file_utils import load_css_styles
|
|
from ocr_validator_file_utils import load_css_styles
|
|
|
|
|
|
|
@@ -69,16 +72,21 @@ class OCRLayoutManager:
|
|
|
def _highlight_text_safely(self, content: str, text_to_highlight: str,
|
|
def _highlight_text_safely(self, content: str, text_to_highlight: str,
|
|
|
highlight_class: str, title: Optional[str] = None) -> str:
|
|
highlight_class: str, title: Optional[str] = None) -> str:
|
|
|
"""
|
|
"""
|
|
|
- 安全地高亮文本,避免替换base64编码中的内容
|
|
|
|
|
|
|
+ 安全地高亮文本,保护Markdown语法(特别是图片)
|
|
|
|
|
+
|
|
|
|
|
+ 策略:
|
|
|
|
|
+ 1. 保护特殊内容(HTML注释、Markdown图片)
|
|
|
|
|
+ 2. 只对HTML表格使用BeautifulSoup精确处理
|
|
|
|
|
+ 3. 其他部分使用简单字符串替换,保持Markdown格式
|
|
|
|
|
|
|
|
Args:
|
|
Args:
|
|
|
- content: 要处理的HTML内容
|
|
|
|
|
|
|
+ content: 要处理的Markdown/HTML混合内容
|
|
|
text_to_highlight: 要高亮的文本
|
|
text_to_highlight: 要高亮的文本
|
|
|
highlight_class: 高亮样式类名
|
|
highlight_class: 高亮样式类名
|
|
|
- title: 鼠标悬停提示文本,默认为text_to_highlight
|
|
|
|
|
|
|
+ title: 鼠标悬停提示文本
|
|
|
|
|
|
|
|
Returns:
|
|
Returns:
|
|
|
- 处理后的HTML内容
|
|
|
|
|
|
|
+ 处理后的内容
|
|
|
"""
|
|
"""
|
|
|
if not text_to_highlight or text_to_highlight not in content:
|
|
if not text_to_highlight or text_to_highlight not in content:
|
|
|
return content
|
|
return content
|
|
@@ -86,48 +94,78 @@ class OCRLayoutManager:
|
|
|
if title is None:
|
|
if title is None:
|
|
|
title = text_to_highlight
|
|
title = text_to_highlight
|
|
|
|
|
|
|
|
- # 转义特殊字符用于正则表达式
|
|
|
|
|
- escaped_text = re.escape(text_to_highlight)
|
|
|
|
|
-
|
|
|
|
|
- # 找出所有base64编码区域的位置
|
|
|
|
|
- # 匹配两种格式:
|
|
|
|
|
- # 1. HTML: src="data:image/...;base64,..." 或 src='data:image/...;base64,...'
|
|
|
|
|
- # 2. Markdown: 
|
|
|
|
|
- # 使用更通用的模式来匹配base64数据
|
|
|
|
|
- base64_pattern = r'data:image/[^;]+;base64,[A-Za-z0-9+/=]+'
|
|
|
|
|
- base64_regions = []
|
|
|
|
|
-
|
|
|
|
|
- for match in re.finditer(base64_pattern, content):
|
|
|
|
|
- base64_regions.append((match.start(), match.end()))
|
|
|
|
|
-
|
|
|
|
|
- # 找出所有要高亮文本的位置
|
|
|
|
|
- text_pattern = re.compile(escaped_text)
|
|
|
|
|
- matches = []
|
|
|
|
|
-
|
|
|
|
|
- for match in text_pattern.finditer(content):
|
|
|
|
|
- start, end = match.start(), match.end()
|
|
|
|
|
-
|
|
|
|
|
- # 检查该位置是否在base64区域内
|
|
|
|
|
- in_base64 = False
|
|
|
|
|
- for base64_start, base64_end in base64_regions:
|
|
|
|
|
- if base64_start <= start < base64_end:
|
|
|
|
|
- in_base64 = True
|
|
|
|
|
- break
|
|
|
|
|
-
|
|
|
|
|
- # 只保留不在base64区域内的匹配
|
|
|
|
|
- if not in_base64:
|
|
|
|
|
- matches.append((start, end))
|
|
|
|
|
-
|
|
|
|
|
- # 从后向前替换,避免位置偏移
|
|
|
|
|
- for start, end in reversed(matches):
|
|
|
|
|
- original_text = content[start:end]
|
|
|
|
|
- # 转义HTML特殊字符
|
|
|
|
|
- escaped_original = html.escape(original_text)
|
|
|
|
|
- escaped_title = html.escape(title)
|
|
|
|
|
- highlighted = f'<span class="{highlight_class}" title="{escaped_title}">{escaped_original}</span>'
|
|
|
|
|
- content = content[:start] + highlighted + content[end:]
|
|
|
|
|
-
|
|
|
|
|
- return content
|
|
|
|
|
|
|
+ try:
|
|
|
|
|
+ import re
|
|
|
|
|
+
|
|
|
|
|
+ # 1. 提取并保护特殊内容
|
|
|
|
|
+ protected_parts = []
|
|
|
|
|
+
|
|
|
|
|
+ # 保护 HTML 注释
|
|
|
|
|
+ def protect_comment(match):
|
|
|
|
|
+ protected_parts.append(match.group(0))
|
|
|
|
|
+ return f"__PROTECTED_{len(protected_parts) - 1}__"
|
|
|
|
|
+
|
|
|
|
|
+ content = re.sub(r'<!--.*?-->', protect_comment, content, flags=re.DOTALL)
|
|
|
|
|
+
|
|
|
|
|
+ # 保护 Markdown 图片(完整语法)
|
|
|
|
|
+ def protect_image(match):
|
|
|
|
|
+ protected_parts.append(match.group(0))
|
|
|
|
|
+ return f"__PROTECTED_{len(protected_parts) - 1}__"
|
|
|
|
|
+
|
|
|
|
|
+ content = re.sub(r'!\[.*?\]\([^)]+\)', protect_image, content)
|
|
|
|
|
+
|
|
|
|
|
+ # 2. 提取表格并单独处理
|
|
|
|
|
+ tables = []
|
|
|
|
|
+ def extract_table(match):
|
|
|
|
|
+ tables.append(match.group(0))
|
|
|
|
|
+ return f"__TABLE_{len(tables) - 1}__"
|
|
|
|
|
+
|
|
|
|
|
+ content = re.sub(r'<table[^>]*>.*?</table>', extract_table, content, flags=re.DOTALL)
|
|
|
|
|
+
|
|
|
|
|
+ # 3. 对表格使用 BeautifulSoup 精确处理
|
|
|
|
|
+ highlighted_tables = []
|
|
|
|
|
+
|
|
|
|
|
+ for table_html in tables:
|
|
|
|
|
+ soup = BeautifulSoup(table_html, 'html.parser')
|
|
|
|
|
+
|
|
|
|
|
+ # 在表格单元格中查找完全匹配
|
|
|
|
|
+ for td in soup.find_all(['td', 'th']):
|
|
|
|
|
+ cell_text = td.get_text(strip=True)
|
|
|
|
|
+ if cell_text == text_to_highlight:
|
|
|
|
|
+ # 给整个单元格添加高亮类
|
|
|
|
|
+ current_classes = td.get('class', [])
|
|
|
|
|
+ td['class'] = current_classes + highlight_class.split()
|
|
|
|
|
+ if title:
|
|
|
|
|
+ td['title'] = title
|
|
|
|
|
+
|
|
|
|
|
+ highlighted_tables.append(str(soup))
|
|
|
|
|
+
|
|
|
|
|
+ # 4. 对普通文本进行简单替换(保持Markdown格式,跳过占位符)
|
|
|
|
|
+ if text_to_highlight in content:
|
|
|
|
|
+ highlight_span = f'<span class="{highlight_class}"'
|
|
|
|
|
+ if title:
|
|
|
|
|
+ highlight_span += f' title="{title}"'
|
|
|
|
|
+ highlight_span += f'>{text_to_highlight}</span>'
|
|
|
|
|
+
|
|
|
|
|
+ # 🎯 安全替换:使用正则表达式,排除占位符内的匹配
|
|
|
|
|
+ # 负向前瞻:确保前面不是占位符的一部分
|
|
|
|
|
+ pattern = f'(?<!__PROTECTED_)(?<!__TABLE_){re.escape(text_to_highlight)}(?!__)'
|
|
|
|
|
+ content = re.sub(pattern, highlight_span, content)
|
|
|
|
|
+
|
|
|
|
|
+ # 5. 恢复表格
|
|
|
|
|
+ for i, table in enumerate(highlighted_tables):
|
|
|
|
|
+ content = content.replace(f"__TABLE_{i}__", table)
|
|
|
|
|
+
|
|
|
|
|
+ # 6. 恢复受保护的内容(图片和注释)
|
|
|
|
|
+ for i, protected in enumerate(protected_parts):
|
|
|
|
|
+ content = content.replace(f"__PROTECTED_{i}__", protected)
|
|
|
|
|
+
|
|
|
|
|
+ return content
|
|
|
|
|
+
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ st.warning(f"文本高亮时出错: {str(e)}")
|
|
|
|
|
+ return content
|
|
|
|
|
+
|
|
|
|
|
|
|
|
def clear_image_cache(self):
|
|
def clear_image_cache(self):
|
|
|
"""清理所有图像缓存"""
|
|
"""清理所有图像缓存"""
|