|
@@ -10,6 +10,8 @@ from PIL import Image
|
|
|
from typing import Dict, List, Optional
|
|
from typing import Dict, List, Optional
|
|
|
import plotly.graph_objects as go
|
|
import plotly.graph_objects as go
|
|
|
from typing import Tuple
|
|
from typing import Tuple
|
|
|
|
|
+import re
|
|
|
|
|
+import html
|
|
|
|
|
|
|
|
from ocr_validator_utils import (
|
|
from ocr_validator_utils import (
|
|
|
rotate_image_and_coordinates,
|
|
rotate_image_and_coordinates,
|
|
@@ -64,6 +66,69 @@ class OCRLayoutManager:
|
|
|
st.session_state[search_key] = ""
|
|
st.session_state[search_key] = ""
|
|
|
st.session_state[quick_select_key] = 0
|
|
st.session_state[quick_select_key] = 0
|
|
|
|
|
|
|
|
|
|
+ def _highlight_text_safely(self, content: str, text_to_highlight: str,
|
|
|
|
|
+ highlight_class: str, title: Optional[str] = None) -> str:
|
|
|
|
|
+ """
|
|
|
|
|
+ 安全地高亮文本,避免替换base64编码中的内容
|
|
|
|
|
+
|
|
|
|
|
+ Args:
|
|
|
|
|
+ content: 要处理的HTML内容
|
|
|
|
|
+ text_to_highlight: 要高亮的文本
|
|
|
|
|
+ highlight_class: 高亮样式类名
|
|
|
|
|
+ title: 鼠标悬停提示文本,默认为text_to_highlight
|
|
|
|
|
+
|
|
|
|
|
+ Returns:
|
|
|
|
|
+ 处理后的HTML内容
|
|
|
|
|
+ """
|
|
|
|
|
+ if not text_to_highlight or text_to_highlight not in content:
|
|
|
|
|
+ return content
|
|
|
|
|
+
|
|
|
|
|
+ if title is None:
|
|
|
|
|
+ title = text_to_highlight
|
|
|
|
|
+
|
|
|
|
|
+ # 转义特殊字符用于正则表达式
|
|
|
|
|
+ escaped_text = re.escape(text_to_highlight)
|
|
|
|
|
+
|
|
|
|
|
+ # 找出所有base64编码区域的位置
|
|
|
|
|
+ # 匹配两种格式:
|
|
|
|
|
+ # 1. HTML: src="data:image/...;base64,..." 或 src='data:image/...;base64,...'
|
|
|
|
|
+ # 2. Markdown: 
|
|
|
|
|
+ # 使用更通用的模式来匹配base64数据
|
|
|
|
|
+ base64_pattern = r'data:image/[^;]+;base64,[A-Za-z0-9+/=]+'
|
|
|
|
|
+ base64_regions = []
|
|
|
|
|
+
|
|
|
|
|
+ for match in re.finditer(base64_pattern, content):
|
|
|
|
|
+ base64_regions.append((match.start(), match.end()))
|
|
|
|
|
+
|
|
|
|
|
+ # 找出所有要高亮文本的位置
|
|
|
|
|
+ text_pattern = re.compile(escaped_text)
|
|
|
|
|
+ matches = []
|
|
|
|
|
+
|
|
|
|
|
+ for match in text_pattern.finditer(content):
|
|
|
|
|
+ start, end = match.start(), match.end()
|
|
|
|
|
+
|
|
|
|
|
+ # 检查该位置是否在base64区域内
|
|
|
|
|
+ in_base64 = False
|
|
|
|
|
+ for base64_start, base64_end in base64_regions:
|
|
|
|
|
+ if base64_start <= start < base64_end:
|
|
|
|
|
+ in_base64 = True
|
|
|
|
|
+ break
|
|
|
|
|
+
|
|
|
|
|
+ # 只保留不在base64区域内的匹配
|
|
|
|
|
+ if not in_base64:
|
|
|
|
|
+ matches.append((start, end))
|
|
|
|
|
+
|
|
|
|
|
+ # 从后向前替换,避免位置偏移
|
|
|
|
|
+ for start, end in reversed(matches):
|
|
|
|
|
+ original_text = content[start:end]
|
|
|
|
|
+ # 转义HTML特殊字符
|
|
|
|
|
+ escaped_original = html.escape(original_text)
|
|
|
|
|
+ escaped_title = html.escape(title)
|
|
|
|
|
+ highlighted = f'<span class="{highlight_class}" title="{escaped_title}">{escaped_original}</span>'
|
|
|
|
|
+ content = content[:start] + highlighted + content[end:]
|
|
|
|
|
+
|
|
|
|
|
+ return content
|
|
|
|
|
+
|
|
|
def clear_image_cache(self):
|
|
def clear_image_cache(self):
|
|
|
"""清理所有图像缓存"""
|
|
"""清理所有图像缓存"""
|
|
|
self._rotated_image_cache.clear()
|
|
self._rotated_image_cache.clear()
|
|
@@ -529,7 +594,7 @@ class OCRLayoutManager:
|
|
|
match_type = "no_bbox"
|
|
match_type = "no_bbox"
|
|
|
|
|
|
|
|
# 🎯 应用高亮
|
|
# 🎯 应用高亮
|
|
|
- if len(selected_text) > 2:
|
|
|
|
|
|
|
+ if len(selected_text) >= self.config.get('ocr', {}).get('min_text_length', 2):
|
|
|
# 1. 高亮原始文本
|
|
# 1. 高亮原始文本
|
|
|
if selected_text in highlighted_content:
|
|
if selected_text in highlighted_content:
|
|
|
if match_type == "exact":
|
|
if match_type == "exact":
|
|
@@ -539,16 +604,21 @@ class OCRLayoutManager:
|
|
|
else:
|
|
else:
|
|
|
highlight_class = "highlight-text default"
|
|
highlight_class = "highlight-text default"
|
|
|
|
|
|
|
|
- highlighted_content = highlighted_content.replace(
|
|
|
|
|
|
|
+ # 使用正则表达式避免替换base64编码中的内容
|
|
|
|
|
+ highlighted_content = self._highlight_text_safely(
|
|
|
|
|
+ highlighted_content,
|
|
|
selected_text,
|
|
selected_text,
|
|
|
- f'<span class="{highlight_class}" title="{selected_text}">{selected_text}</span>'
|
|
|
|
|
|
|
+ highlight_class
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
# 2. 如果有 matched_text 且不同,也高亮
|
|
# 2. 如果有 matched_text 且不同,也高亮
|
|
|
if matched_text and matched_text != selected_text and matched_text in highlighted_content:
|
|
if matched_text and matched_text != selected_text and matched_text in highlighted_content:
|
|
|
- highlighted_content = highlighted_content.replace(
|
|
|
|
|
|
|
+ # 使用正则表达式避免替换base64编码中的内容
|
|
|
|
|
+ highlighted_content = self._highlight_text_safely(
|
|
|
|
|
+ highlighted_content,
|
|
|
matched_text,
|
|
matched_text,
|
|
|
- f'<span class="highlight-text ocr-match" title="OCR: {matched_text}">{matched_text}</span>'
|
|
|
|
|
|
|
+ "highlight-text ocr-match",
|
|
|
|
|
+ f"OCR: {matched_text}"
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
# 🎯 调用渲染方法(样式已内置)
|
|
# 🎯 调用渲染方法(样式已内置)
|