5 ay önce · 73553fa5a5
--- a/ocr_validator/ocr_validator_layout.py
+++ b/ocr_validator/ocr_validator_layout.py
@@ -30,6 +30,9 @@ if str(ocr_platform_root) not in sys.path:
 
				 from ocr_utils.html_utils import convert_html_table_to_markdown, parse_html_tables
			
 
				 from ocr_utils.visualization_utils import VisualizationUtils
			
 
				 
			
 
				+# BeautifulSoup用于精确HTML表格处理
			
 
				+from bs4 import BeautifulSoup
			
 
				+
			
 
				 # 从本地文件导入 Streamlit 特定函数
			
 
				 from ocr_validator_file_utils import load_css_styles
			
 
				 
			
@@ -69,16 +72,21 @@ class OCRLayoutManager:
 
				     def _highlight_text_safely(self, content: str, text_to_highlight: str, 
			
 
				                                highlight_class: str, title: Optional[str] = None) -> str:
			
 
				         """
			
 
				-        安全地高亮文本，避免替换base64编码中的内容
			
 
				+        安全地高亮文本，保护Markdown语法（特别是图片）
			
 
				+        
			
 
				+        策略：
			
 
				+        1. 保护特殊内容（HTML注释、Markdown图片）
			
 
				+        2. 只对HTML表格使用BeautifulSoup精确处理
			
 
				+        3. 其他部分使用简单字符串替换，保持Markdown格式
			
 
				         
			
 
				         Args:
			
 
				-            content: 要处理的HTML内容
			
 
				+            content: 要处理的Markdown/HTML混合内容
			
 
				             text_to_highlight: 要高亮的文本
			
 
				             highlight_class: 高亮样式类名
			
 
				-            title: 鼠标悬停提示文本，默认为text_to_highlight
			
 
				+            title: 鼠标悬停提示文本
			
 
				         
			
 
				         Returns:
			
 
				-            处理后的HTML内容
			
 
				+            处理后的内容
			
 
				         """
			
 
				         if not text_to_highlight or text_to_highlight not in content:
			
 
				             return content
			
@@ -86,48 +94,78 @@ class OCRLayoutManager:
 
				         if title is None:
			
 
				             title = text_to_highlight
			
 
				         
			
 
				-        # 转义特殊字符用于正则表达式
			
 
				-        escaped_text = re.escape(text_to_highlight)
			
 
				-        
			
 
				-        # 找出所有base64编码区域的位置
			
 
				-        # 匹配两种格式：
			
 
				-        # 1. HTML: src="data:image/...;base64,..." 或 src='data:image/...;base64,...'
			
 
				-        # 2. Markdown: ![...](data:image/...;base64,...)
			
 
				-        # 使用更通用的模式来匹配base64数据
			
 
				-        base64_pattern = r'data:image/[^;]+;base64,[A-Za-z0-9+/=]+'
			
 
				-        base64_regions = []
			
 
				-        
			
 
				-        for match in re.finditer(base64_pattern, content):
			
 
				-            base64_regions.append((match.start(), match.end()))
			
 
				-        
			
 
				-        # 找出所有要高亮文本的位置
			
 
				-        text_pattern = re.compile(escaped_text)
			
 
				-        matches = []
			
 
				-        
			
 
				-        for match in text_pattern.finditer(content):
			
 
				-            start, end = match.start(), match.end()
			
 
				-            
			
 
				-            # 检查该位置是否在base64区域内
			
 
				-            in_base64 = False
			
 
				-            for base64_start, base64_end in base64_regions:
			
 
				-                if base64_start <= start < base64_end:
			
 
				-                    in_base64 = True
			
 
				-                    break
			
 
				-            
			
 
				-            # 只保留不在base64区域内的匹配
			
 
				-            if not in_base64:
			
 
				-                matches.append((start, end))
			
 
				-        
			
 
				-        # 从后向前替换，避免位置偏移
			
 
				-        for start, end in reversed(matches):
			
 
				-            original_text = content[start:end]
			
 
				-            # 转义HTML特殊字符
			
 
				-            escaped_original = html.escape(original_text)
			
 
				-            escaped_title = html.escape(title)
			
 
				-            highlighted = f'<span class="{highlight_class}" title="{escaped_title}">{escaped_original}</span>'
			
 
				-            content = content[:start] + highlighted + content[end:]
			
 
				-        
			
 
				-        return content
			
 
				+        try:
			
 
				+            import re
			
 
				+            
			
 
				+            # 1. 提取并保护特殊内容
			
 
				+            protected_parts = []
			
 
				+            
			
 
				+            # 保护 HTML 注释
			
 
				+            def protect_comment(match):
			
 
				+                protected_parts.append(match.group(0))
			
 
				+                return f"__PROTECTED_{len(protected_parts) - 1}__"
			
 
				+            
			
 
				+            content = re.sub(r'<!--.*?-->', protect_comment, content, flags=re.DOTALL)
			
 
				+            
			
 
				+            # 保护 Markdown 图片（完整语法）
			
 
				+            def protect_image(match):
			
 
				+                protected_parts.append(match.group(0))
			
 
				+                return f"__PROTECTED_{len(protected_parts) - 1}__"
			
 
				+            
			
 
				+            content = re.sub(r'!\[.*?\]\([^)]+\)', protect_image, content)
			
 
				+            
			
 
				+            # 2. 提取表格并单独处理
			
 
				+            tables = []
			
 
				+            def extract_table(match):
			
 
				+                tables.append(match.group(0))
			
 
				+                return f"__TABLE_{len(tables) - 1}__"
			
 
				+            
			
 
				+            content = re.sub(r'<table[^>]*>.*?</table>', extract_table, content, flags=re.DOTALL)
			
 
				+            
			
 
				+            # 3. 对表格使用 BeautifulSoup 精确处理
			
 
				+            highlighted_tables = []
			
 
				+            
			
 
				+            for table_html in tables:
			
 
				+                soup = BeautifulSoup(table_html, 'html.parser')
			
 
				+                
			
 
				+                # 在表格单元格中查找完全匹配
			
 
				+                for td in soup.find_all(['td', 'th']):
			
 
				+                    cell_text = td.get_text(strip=True)
			
 
				+                    if cell_text == text_to_highlight:
			
 
				+                        # 给整个单元格添加高亮类
			
 
				+                        current_classes = td.get('class', [])
			
 
				+                        td['class'] = current_classes + highlight_class.split()
			
 
				+                        if title:
			
 
				+                            td['title'] = title
			
 
				+                
			
 
				+                highlighted_tables.append(str(soup))
			
 
				+            
			
 
				+            # 4. 对普通文本进行简单替换（保持Markdown格式，跳过占位符）
			
 
				+            if text_to_highlight in content:
			
 
				+                highlight_span = f'<span class="{highlight_class}"'
			
 
				+                if title:
			
 
				+                    highlight_span += f' title="{title}"'
			
 
				+                highlight_span += f'>{text_to_highlight}</span>'
			
 
				+                
			
 
				+                # 🎯 安全替换：使用正则表达式，排除占位符内的匹配
			
 
				+                # 负向前瞻：确保前面不是占位符的一部分
			
 
				+                pattern = f'(?<!__PROTECTED_)(?<!__TABLE_){re.escape(text_to_highlight)}(?!__)'
			
 
				+                content = re.sub(pattern, highlight_span, content)
			
 
				+            
			
 
				+            # 5. 恢复表格
			
 
				+            for i, table in enumerate(highlighted_tables):
			
 
				+                content = content.replace(f"__TABLE_{i}__", table)
			
 
				+            
			
 
				+            # 6. 恢复受保护的内容（图片和注释）
			
 
				+            for i, protected in enumerate(protected_parts):
			
 
				+                content = content.replace(f"__PROTECTED_{i}__", protected)
			
 
				+            
			
 
				+            return content
			
 
				+            
			
 
				+        except Exception as e:
			
 
				+            st.warning(f"文本高亮时出错: {str(e)}")
			
 
				+            return content
			
 
				+            
			
 
				     
			
 
				     def clear_image_cache(self):
			
 
				         """清理所有图像缓存"""