5 месяцев назад · 73553fa5a5
--- a/ocr_validator/ocr_validator_layout.py
+++ b/ocr_validator/ocr_validator_layout.py
@@ -30,6 +30,9 @@ if str(ocr_platform_root) not in sys.path:
 
															 from ocr_utils.html_utils import convert_html_table_to_markdown, parse_html_tables
														
 
															 from ocr_utils.visualization_utils import VisualizationUtils
														
 
															+# BeautifulSoup用于精确HTML表格处理
														
 
															+from bs4 import BeautifulSoup
														
 
															+
														
 
															 # 从本地文件导入 Streamlit 特定函数
														
 
															 from ocr_validator_file_utils import load_css_styles
														
@@ -69,16 +72,21 @@ class OCRLayoutManager:
 
															     def _highlight_text_safely(self, content: str, text_to_highlight: str, 
														
 
															                                highlight_class: str, title: Optional[str] = None) -> str:
														
 
															         """
														
 
															-        安全地高亮文本，避免替换base64编码中的内容
														
 
															+        安全地高亮文本，保护Markdown语法（特别是图片）
														
 
															+        
														
 
															+        策略：
														
 
															+        1. 保护特殊内容（HTML注释、Markdown图片）
														
 
															+        2. 只对HTML表格使用BeautifulSoup精确处理
														
 
															+        3. 其他部分使用简单字符串替换，保持Markdown格式
														
 
															         Args:
														
 
															-            content: 要处理的HTML内容
														
 
															+            content: 要处理的Markdown/HTML混合内容
														
 
															             text_to_highlight: 要高亮的文本
														
 
															             highlight_class: 高亮样式类名
														
 
															-            title: 鼠标悬停提示文本，默认为text_to_highlight
														
 
															+            title: 鼠标悬停提示文本
														
 
															         Returns:
														
 
															-            处理后的HTML内容
														
 
															+            处理后的内容
														
 
															         """
														
 
															         if not text_to_highlight or text_to_highlight not in content:
														
 
															             return content
														
@@ -86,48 +94,78 @@ class OCRLayoutManager:
 
															         if title is None:
														
 
															             title = text_to_highlight
														
 
															-        # 转义特殊字符用于正则表达式
														
 
															-        escaped_text = re.escape(text_to_highlight)
														
 
															-        
														
 
															-        # 找出所有base64编码区域的位置
														
 
															-        # 匹配两种格式：
														
 
															-        # 1. HTML: src="data:image/...;base64,..." 或 src='data:image/...;base64,...'
														
 
															-        # 2. Markdown: ![...](data:image/...;base64,...)
														
 
															-        # 使用更通用的模式来匹配base64数据
														
 
															-        base64_pattern = r'data:image/[^;]+;base64,[A-Za-z0-9+/=]+'
														
 
															-        base64_regions = []
														
 
															-        
														
 
															-        for match in re.finditer(base64_pattern, content):
														
 
															-            base64_regions.append((match.start(), match.end()))
														
 
															-        
														
 
															-        # 找出所有要高亮文本的位置
														
 
															-        text_pattern = re.compile(escaped_text)
														
 
															-        matches = []
														
 
															-        
														
 
															-        for match in text_pattern.finditer(content):
														
 
															-            start, end = match.start(), match.end()
														
 
															-            
														
 
															-            # 检查该位置是否在base64区域内
														
 
															-            in_base64 = False
														
 
															-            for base64_start, base64_end in base64_regions:
														
 
															-                if base64_start <= start < base64_end:
														
 
															-                    in_base64 = True
														
 
															-                    break
														
 
															-            
														
 
															-            # 只保留不在base64区域内的匹配
														
 
															-            if not in_base64:
														
 
															-                matches.append((start, end))
														
 
															-        
														
 
															-        # 从后向前替换，避免位置偏移
														
 
															-        for start, end in reversed(matches):
														
 
															-            original_text = content[start:end]
														
 
															-            # 转义HTML特殊字符
														
 
															-            escaped_original = html.escape(original_text)
														
 
															-            escaped_title = html.escape(title)
														
 
															-            highlighted = f'<span class="{highlight_class}" title="{escaped_title}">{escaped_original}</span>'
														
 
															-            content = content[:start] + highlighted + content[end:]
														
 
															-        
														
 
															-        return content
														
 
															+        try:
														
 
															+            import re
														
 
															+            
														
 
															+            # 1. 提取并保护特殊内容
														
 
															+            protected_parts = []
														
 
															+            
														
 
															+            # 保护 HTML 注释
														
 
															+            def protect_comment(match):
														
 
															+                protected_parts.append(match.group(0))
														
 
															+                return f"__PROTECTED_{len(protected_parts) - 1}__"
														
 
															+            
														
 
															+            content = re.sub(r'<!--.*?-->', protect_comment, content, flags=re.DOTALL)
														
 
															+            
														
 
															+            # 保护 Markdown 图片（完整语法）
														
 
															+            def protect_image(match):
														
 
															+                protected_parts.append(match.group(0))
														
 
															+                return f"__PROTECTED_{len(protected_parts) - 1}__"
														
 
															+            
														
 
															+            content = re.sub(r'!\[.*?\]\([^)]+\)', protect_image, content)
														
 
															+            
														
 
															+            # 2. 提取表格并单独处理
														
 
															+            tables = []
														
 
															+            def extract_table(match):
														
 
															+                tables.append(match.group(0))
														
 
															+                return f"__TABLE_{len(tables) - 1}__"
														
 
															+            
														
 
															+            content = re.sub(r'<table[^>]*>.*?</table>', extract_table, content, flags=re.DOTALL)
														
 
															+            
														
 
															+            # 3. 对表格使用 BeautifulSoup 精确处理
														
 
															+            highlighted_tables = []
														
 
															+            
														
 
															+            for table_html in tables:
														
 
															+                soup = BeautifulSoup(table_html, 'html.parser')
														
 
															+                
														
 
															+                # 在表格单元格中查找完全匹配
														
 
															+                for td in soup.find_all(['td', 'th']):
														
 
															+                    cell_text = td.get_text(strip=True)
														
 
															+                    if cell_text == text_to_highlight:
														
 
															+                        # 给整个单元格添加高亮类
														
 
															+                        current_classes = td.get('class', [])
														
 
															+                        td['class'] = current_classes + highlight_class.split()
														
 
															+                        if title:
														
 
															+                            td['title'] = title
														
 
															+                
														
 
															+                highlighted_tables.append(str(soup))
														
 
															+            
														
 
															+            # 4. 对普通文本进行简单替换（保持Markdown格式，跳过占位符）
														
 
															+            if text_to_highlight in content:
														
 
															+                highlight_span = f'<span class="{highlight_class}"'
														
 
															+                if title:
														
 
															+                    highlight_span += f' title="{title}"'
														
 
															+                highlight_span += f'>{text_to_highlight}</span>'
														
 
															+                
														
 
															+                # 🎯 安全替换：使用正则表达式，排除占位符内的匹配
														
 
															+                # 负向前瞻：确保前面不是占位符的一部分
														
 
															+                pattern = f'(?<!__PROTECTED_)(?<!__TABLE_){re.escape(text_to_highlight)}(?!__)'
														
 
															+                content = re.sub(pattern, highlight_span, content)
														
 
															+            
														
 
															+            # 5. 恢复表格
														
 
															+            for i, table in enumerate(highlighted_tables):
														
 
															+                content = content.replace(f"__TABLE_{i}__", table)
														
 
															+            
														
 
															+            # 6. 恢复受保护的内容（图片和注释）
														
 
															+            for i, protected in enumerate(protected_parts):
														
 
															+                content = content.replace(f"__PROTECTED_{i}__", protected)
														
 
															+            
														
 
															+            return content
														
 
															+            
														
 
															+        except Exception as e:
														
 
															+            st.warning(f"文本高亮时出错: {str(e)}")
														
 
															+            return content
														
 
															+            
														
 
															     def clear_image_cache(self):
														
 
															         """清理所有图像缓存"""