Prechádzať zdrojové kódy

feat(OCRLayoutManager): 优化表格高亮处理,仅高亮文本而非整个单元格

zhch158_admin pred 2 týždňami
rodič
commit
d602b87e58
1 zmenený súbor s 11 pridaniami a 5 odobraniami
  1. 11 5
      ocr_validator/ocr_validator_layout.py

+ 11 - 5
ocr_validator/ocr_validator_layout.py

@@ -122,7 +122,7 @@ class OCRLayoutManager:
             
             content = re.sub(r'<table[^>]*>.*?</table>', extract_table, content, flags=re.DOTALL)
             
-            # 3. 对表格使用 BeautifulSoup 精确处理
+            # 3. 对表格使用 BeautifulSoup 精确处理(只高亮文本,不高亮整个单元格)
             highlighted_tables = []
             
             for table_html in tables:
@@ -132,11 +132,17 @@ class OCRLayoutManager:
                 for td in soup.find_all(['td', 'th']):
                     cell_text = td.get_text(strip=True)
                     if cell_text == text_to_highlight:
-                        # 给整个单元格添加高亮类
-                        current_classes = td.get('class', [])
-                        td['class'] = current_classes + highlight_class.split()
+                        # 🎯 只高亮文本,不高亮整个单元格
+                        # 清空单元格内容
+                        td.clear()
+                        # 创建高亮 span 包裹文本
+                        span = soup.new_tag('span')
+                        span['class'] = highlight_class.split()
                         if title:
-                            td['title'] = title
+                            span['title'] = title
+                        span.string = text_to_highlight
+                        # 将 span 添加到单元格
+                        td.append(span)
                 
                 highlighted_tables.append(str(soup))