hai 5 meses · fd2189a027
--- a/ocr_validator/ocr_validator_layout.py
+++ b/ocr_validator/ocr_validator_layout.py
@@ -10,6 +10,8 @@ from PIL import Image
 
															 from typing import Dict, List, Optional
														
 
															 import plotly.graph_objects as go
														
 
															 from typing import Tuple
														
 
															+import re
														
 
															+import html
														
 
															 from ocr_validator_utils import (
														
 
															     rotate_image_and_coordinates,
														
@@ -64,6 +66,69 @@ class OCRLayoutManager:
 
															         st.session_state[search_key] = ""
														
 
															         st.session_state[quick_select_key] = 0
														
 
															+    def _highlight_text_safely(self, content: str, text_to_highlight: str, 
														
 
															+                               highlight_class: str, title: Optional[str] = None) -> str:
														
 
															+        """
														
 
															+        安全地高亮文本，避免替换base64编码中的内容
														
 
															+        
														
 
															+        Args:
														
 
															+            content: 要处理的HTML内容
														
 
															+            text_to_highlight: 要高亮的文本
														
 
															+            highlight_class: 高亮样式类名
														
 
															+            title: 鼠标悬停提示文本，默认为text_to_highlight
														
 
															+        
														
 
															+        Returns:
														
 
															+            处理后的HTML内容
														
 
															+        """
														
 
															+        if not text_to_highlight or text_to_highlight not in content:
														
 
															+            return content
														
 
															+        
														
 
															+        if title is None:
														
 
															+            title = text_to_highlight
														
 
															+        
														
 
															+        # 转义特殊字符用于正则表达式
														
 
															+        escaped_text = re.escape(text_to_highlight)
														
 
															+        
														
 
															+        # 找出所有base64编码区域的位置
														
 
															+        # 匹配两种格式：
														
 
															+        # 1. HTML: src="data:image/...;base64,..." 或 src='data:image/...;base64,...'
														
 
															+        # 2. Markdown: ![...](data:image/...;base64,...)
														
 
															+        # 使用更通用的模式来匹配base64数据
														
 
															+        base64_pattern = r'data:image/[^;]+;base64,[A-Za-z0-9+/=]+'
														
 
															+        base64_regions = []
														
 
															+        
														
 
															+        for match in re.finditer(base64_pattern, content):
														
 
															+            base64_regions.append((match.start(), match.end()))
														
 
															+        
														
 
															+        # 找出所有要高亮文本的位置
														
 
															+        text_pattern = re.compile(escaped_text)
														
 
															+        matches = []
														
 
															+        
														
 
															+        for match in text_pattern.finditer(content):
														
 
															+            start, end = match.start(), match.end()
														
 
															+            
														
 
															+            # 检查该位置是否在base64区域内
														
 
															+            in_base64 = False
														
 
															+            for base64_start, base64_end in base64_regions:
														
 
															+                if base64_start <= start < base64_end:
														
 
															+                    in_base64 = True
														
 
															+                    break
														
 
															+            
														
 
															+            # 只保留不在base64区域内的匹配
														
 
															+            if not in_base64:
														
 
															+                matches.append((start, end))
														
 
															+        
														
 
															+        # 从后向前替换，避免位置偏移
														
 
															+        for start, end in reversed(matches):
														
 
															+            original_text = content[start:end]
														
 
															+            # 转义HTML特殊字符
														
 
															+            escaped_original = html.escape(original_text)
														
 
															+            escaped_title = html.escape(title)
														
 
															+            highlighted = f'<span class="{highlight_class}" title="{escaped_title}">{escaped_original}</span>'
														
 
															+            content = content[:start] + highlighted + content[end:]
														
 
															+        
														
 
															+        return content
														
 
															+    
														
 
															     def clear_image_cache(self):
														
 
															         """清理所有图像缓存"""
														
 
															         self._rotated_image_cache.clear()
														
@@ -529,7 +594,7 @@ class OCRLayoutManager:
 
															                             match_type = "no_bbox"
														
 
															                     # 🎯 应用高亮
														
 
															-                    if len(selected_text) > 2:
														
 
															+                    if len(selected_text) >= self.config.get('ocr', {}).get('min_text_length', 2):
														
 
															                         # 1. 高亮原始文本
														
 
															                         if selected_text in highlighted_content:
														
 
															                             if match_type == "exact":
														
@@ -539,16 +604,21 @@ class OCRLayoutManager:
 
															                             else:
														
 
															                                 highlight_class = "highlight-text default"
														
 
															-                            highlighted_content = highlighted_content.replace(
														
 
															+                            # 使用正则表达式避免替换base64编码中的内容
														
 
															+                            highlighted_content = self._highlight_text_safely(
														
 
															+                                highlighted_content,
														
 
															                                 selected_text,
														
 
															-                                f'<span class="{highlight_class}" title="{selected_text}">{selected_text}</span>'
														
 
															+                                highlight_class
														
 
															                             )
														
 
															                         # 2. 如果有 matched_text 且不同，也高亮
														
 
															                         if matched_text and matched_text != selected_text and matched_text in highlighted_content:
														
 
															-                            highlighted_content = highlighted_content.replace(
														
 
															+                            # 使用正则表达式避免替换base64编码中的内容
														
 
															+                            highlighted_content = self._highlight_text_safely(
														
 
															+                                highlighted_content,
														
 
															                                 matched_text,
														
 
															-                                f'<span class="highlight-text ocr-match" title="OCR: {matched_text}">{matched_text}</span>'
														
 
															+                                "highlight-text ocr-match",
														
 
															+                                f"OCR: {matched_text}"
														
 
															                             )
														
 
															                 # 🎯 调用渲染方法（样式已内置）