Kaynağa Gözat

feat(OCRLayoutManager): 改进文本高亮功能,保护Markdown语法并精确处理HTML表格

zhch158_admin 2 hafta önce
ebeveyn
işleme
73553fa5a5
1 değiştirilmiş dosya ile 84 ekleme ve 46 silme
  1. 84 46
      ocr_validator/ocr_validator_layout.py

+ 84 - 46
ocr_validator/ocr_validator_layout.py

@@ -30,6 +30,9 @@ if str(ocr_platform_root) not in sys.path:
 from ocr_utils.html_utils import convert_html_table_to_markdown, parse_html_tables
 from ocr_utils.visualization_utils import VisualizationUtils
 
+# BeautifulSoup用于精确HTML表格处理
+from bs4 import BeautifulSoup
+
 # 从本地文件导入 Streamlit 特定函数
 from ocr_validator_file_utils import load_css_styles
 
@@ -69,16 +72,21 @@ class OCRLayoutManager:
     def _highlight_text_safely(self, content: str, text_to_highlight: str, 
                                highlight_class: str, title: Optional[str] = None) -> str:
         """
-        安全地高亮文本,避免替换base64编码中的内容
+        安全地高亮文本,保护Markdown语法(特别是图片)
+        
+        策略:
+        1. 保护特殊内容(HTML注释、Markdown图片)
+        2. 只对HTML表格使用BeautifulSoup精确处理
+        3. 其他部分使用简单字符串替换,保持Markdown格式
         
         Args:
-            content: 要处理的HTML内容
+            content: 要处理的Markdown/HTML混合内容
             text_to_highlight: 要高亮的文本
             highlight_class: 高亮样式类名
-            title: 鼠标悬停提示文本,默认为text_to_highlight
+            title: 鼠标悬停提示文本;为None时使用text_to_highlight
         
         Returns:
-            处理后的HTML内容
+            处理后的内容
         """
         if not text_to_highlight or text_to_highlight not in content:
             return content
@@ -86,48 +94,78 @@ class OCRLayoutManager:
         if title is None:
             title = text_to_highlight
         
-        # 转义特殊字符用于正则表达式
-        escaped_text = re.escape(text_to_highlight)
-        
-        # 找出所有base64编码区域的位置
-        # 匹配两种格式:
-        # 1. HTML: src="data:image/...;base64,..." 或 src='data:image/...;base64,...'
-        # 2. Markdown: ![...](data:image/...;base64,...)
-        # 使用更通用的模式来匹配base64数据
-        base64_pattern = r'data:image/[^;]+;base64,[A-Za-z0-9+/=]+'
-        base64_regions = []
-        
-        for match in re.finditer(base64_pattern, content):
-            base64_regions.append((match.start(), match.end()))
-        
-        # 找出所有要高亮文本的位置
-        text_pattern = re.compile(escaped_text)
-        matches = []
-        
-        for match in text_pattern.finditer(content):
-            start, end = match.start(), match.end()
-            
-            # 检查该位置是否在base64区域内
-            in_base64 = False
-            for base64_start, base64_end in base64_regions:
-                if base64_start <= start < base64_end:
-                    in_base64 = True
-                    break
-            
-            # 只保留不在base64区域内的匹配
-            if not in_base64:
-                matches.append((start, end))
-        
-        # 从后向前替换,避免位置偏移
-        for start, end in reversed(matches):
-            original_text = content[start:end]
-            # 转义HTML特殊字符
-            escaped_original = html.escape(original_text)
-            escaped_title = html.escape(title)
-            highlighted = f'<span class="{highlight_class}" title="{escaped_title}">{escaped_original}</span>'
-            content = content[:start] + highlighted + content[end:]
-        
-        return content
+        try:
+            import re
+            
+            # 1. 提取并保护特殊内容
+            protected_parts = []
+            
+            # 保护 HTML 注释
+            def protect_comment(match):
+                protected_parts.append(match.group(0))
+                return f"__PROTECTED_{len(protected_parts) - 1}__"
+            
+            content = re.sub(r'<!--.*?-->', protect_comment, content, flags=re.DOTALL)
+            
+            # 保护 Markdown 图片(完整语法)
+            def protect_image(match):
+                protected_parts.append(match.group(0))
+                return f"__PROTECTED_{len(protected_parts) - 1}__"
+            
+            content = re.sub(r'!\[.*?\]\([^)]+\)', protect_image, content)
+            
+            # 2. 提取表格并单独处理
+            tables = []
+            def extract_table(match):
+                tables.append(match.group(0))
+                return f"__TABLE_{len(tables) - 1}__"
+            
+            content = re.sub(r'<table[^>]*>.*?</table>', extract_table, content, flags=re.DOTALL)
+            
+            # 3. 对表格使用 BeautifulSoup 精确处理
+            highlighted_tables = []
+            
+            for table_html in tables:
+                soup = BeautifulSoup(table_html, 'html.parser')
+                
+                # 在表格单元格中查找完全匹配
+                for td in soup.find_all(['td', 'th']):
+                    cell_text = td.get_text(strip=True)
+                    if cell_text == text_to_highlight:
+                        # 给整个单元格添加高亮类
+                        current_classes = td.get('class', [])
+                        td['class'] = current_classes + highlight_class.split()
+                        if title:
+                            td['title'] = title
+                
+                highlighted_tables.append(str(soup))
+            
+            # 4. 对普通文本进行简单替换(保持Markdown格式,跳过占位符)
+            if text_to_highlight in content:
+                highlight_span = f'<span class="{highlight_class}"'
+                if title:
+                    highlight_span += f' title="{title}"'
+                highlight_span += f'>{text_to_highlight}</span>'
+                
+                # 🎯 安全替换:使用正则表达式,排除占位符内的匹配
+                # 负向后顾:确保匹配前面不是占位符前缀;负向前瞻:确保匹配后面不是占位符结尾的"__"
+                pattern = f'(?<!__PROTECTED_)(?<!__TABLE_){re.escape(text_to_highlight)}(?!__)'
+                content = re.sub(pattern, highlight_span, content)
+            
+            # 5. 恢复表格
+            for i, table in enumerate(highlighted_tables):
+                content = content.replace(f"__TABLE_{i}__", table)
+            
+            # 6. 恢复受保护的内容(图片和注释)
+            for i, protected in enumerate(protected_parts):
+                content = content.replace(f"__PROTECTED_{i}__", protected)
+            
+            return content
+            
+        except Exception as e:
+            st.warning(f"文本高亮时出错: {str(e)}")
+            return content
+            
     
     def clear_image_cache(self):
         """清理所有图像缓存"""