SHA1
--- a/.github/copilot-instructions.md
+++ b/.github/copilot-instructions.md
@@ -0,0 +1,121 @@
 
				+# GitHub Copilot Agent 指令
			
 
				+
			
 
				+## Python 环境要求
			
 
				+
			
 
				+**重要：本项目必须在 `mineru2` conda 环境下运行所有代码。**
			
 
				+
			
 
				+### 环境配置
			
 
				+- **Python 解释器**: `/opt/miniconda3/envs/mineru2/bin/python`
			
 
				+- **Conda 环境**: `mineru2`
			
 
				+- **Python 版本**: 3.12+
			
 
				+- **平台**: macOS (Darwin)
			
 
				+
			
 
				+### 执行命令时的要求
			
 
				+
			
 
				+1. **所有 Python 脚本执行前必须激活环境**：
			
 
				+   ```bash
			
 
				+   conda activate mineru2
			
 
				+   ```
			
 
				+
			
 
				+2. **或者直接使用环境解释器的完整路径（无需先激活环境）**：
			
 
				+   ```bash
			
 
				+   /opt/miniconda3/envs/mineru2/bin/python script.py
			
 
				+   ```
			
 
				+
			
 
				+3. **使用 run_in_terminal 工具时**，命令格式：
			
 
				+   ```bash
			
 
				+   conda activate mineru2 && python script.py
			
 
				+   ```
			
 
				+
			
 
				+### 项目模块路径
			
 
				+
			
 
				+确保以下路径在 PYTHONPATH 中：
			
 
				+- 当前项目: `/Users/zhch158/workspace/repository.git/ocr_platform`
			
 
				+- MinerU: `/Users/zhch158/workspace/repository.git/MinerU`
			
 
				+- dots.ocr: `/Users/zhch158/workspace/repository.git/dots.ocr`
			
 
				+- PaddleX: `/Users/zhch158/workspace/repository.git/PaddleX`
			
 
				+
			
 
				+### 常用命令模板
			
 
				+
			
 
				+#### 运行 Streamlit 应用
			
 
				+```bash
			
 
				+cd ocr_validator && conda activate mineru2 && streamlit run streamlit_ocr_validator.py --server.runOnSave=true
			
 
				+```
			
 
				+
			
 
				+#### 运行 Python 脚本
			
 
				+```bash
			
 
				+conda activate mineru2 && python script.py
			
 
				+```
			
 
				+
			
 
				+#### 安装依赖
			
 
				+```bash
			
 
				+conda activate mineru2 && pip install package-name
			
 
				+```
			
 
				+
			
 
				+#### 运行测试
			
 
				+```bash
			
 
				+conda activate mineru2 && pytest tests/
			
 
				+```
			
 
				+
			
 
				+### 禁止的操作
			
 
				+
			
 
				+❌ **不要使用系统 Python**：
			
 
				+```bash
			
 
				+# 错误示例
			
 
				+python3 script.py
			
 
				+/usr/bin/python script.py
			
 
				+```
			
 
				+
			
 
				+❌ **不要在其他虚拟环境中运行**：
			
 
				+```bash
			
 
				+# 错误示例
			
 
				+source venv/bin/activate
			
 
				+```
			
 
				+
			
 
				+### 代码生成规则
			
 
				+
			
 
				+1. 生成的 Python 代码应该兼容 Python 3.12
			
 
				+2. 导入语句要考虑项目模块路径
			
 
				+3. 使用类型注解（遵循 PEP 484）
			
 
				+4. 遵循项目现有的代码风格
			
 
				+
			
 
				+### 依赖包说明
			
 
				+
			
 
				+主要依赖（已安装在 mineru2 环境）：
			
 
				+- streamlit >= 1.30.0
			
 
				+- plotly >= 5.18.0
			
 
				+- pandas >= 2.1.0
			
 
				+- torch >= 2.0.0
			
 
				+- paddlepaddle >= 2.5.0
			
 
				+- opencv-python >= 4.8.0
			
 
				+- pillow >= 10.0.0
			
 
				+
			
 
				+### 文件操作规则
			
 
				+
			
 
				+1. 创建新文件时，确保使用项目的模块导入路径
			
 
				+2. 修改配置文件时，保持与 mineru2 环境的一致性
			
 
				+3. 添加新的脚本时，在文件头部添加 shebang：
			
 
				+   ```python
			
 
				+   #!/opt/miniconda3/envs/mineru2/bin/python
			
 
				+   ```
			
 
				+
			
 
				+### 调试和测试
			
 
				+
			
 
				+执行测试或调试时，始终使用：
			
 
				+```bash
			
 
				+conda activate mineru2 && python -m pytest
			
 
				+conda activate mineru2 && python -m pdb script.py
			
 
				+```
			
 
				+
			
 
				+### 环境验证
			
 
				+
			
 
				+在执行任何 Python 代码前，验证环境：
			
 
				+```bash
			
 
				+conda activate mineru2
			
 
				+python -c "import sys; print(sys.executable)"
			
 
				+# 应输出: /opt/miniconda3/envs/mineru2/bin/python
			
 
				+```
			
 
				+
			
 
				+---
			
 
				+
			
 
				+**记住：任何涉及 Python 代码执行、包安装、测试运行的操作，都必须在 mineru2 环境下进行！**
			
--- a/ocr_tools/universal_doc_parser/models/adapters/mineru_wired_table.py
+++ b/ocr_tools/universal_doc_parser/models/adapters/mineru_wired_table.py
@@ -406,7 +406,6 @@ class MinerUWiredTableRecognizer:
 
				                         unet_cells=bboxes,
			
 
				                         ocr_boxes=ocr_boxes or [],
			
 
				                         pdf_type=pdf_type,
			
 
				-                        upscale=upscale,
			
 
				                         debug_dir=debug_dir,
			
 
				                         debug_prefix=debug_prefix
			
 
				                     )
			
--- a/ocr_tools/universal_doc_parser/models/adapters/wired_table/cell_fusion.py
+++ b/ocr_tools/universal_doc_parser/models/adapters/wired_table/cell_fusion.py
@@ -42,6 +42,7 @@ class CellFusionEngine:
 
				                 - rtdetr_conf_threshold: 0.5 (RT-DETR置信度阈值)
			
 
				                 - enable_ocr_compensation: True (启用OCR补偿)
			
 
				                 - skip_rtdetr_for_txt_pdf: True (文字PDF跳过RT-DETR)
			
 
				+                - enable_boundary_noise_filter: True (启用边界噪声过滤)
			
 
				         """
			
 
				         self.rtdetr_detector = rtdetr_detector
			
 
				         self.config = config or {}
			
@@ -54,10 +55,12 @@ class CellFusionEngine:
 
				         self.rtdetr_conf_threshold = self.config.get('rtdetr_conf_threshold', 0.5)
			
 
				         self.enable_ocr_compensation = self.config.get('enable_ocr_compensation', True)
			
 
				         self.skip_rtdetr_for_txt_pdf = self.config.get('skip_rtdetr_for_txt_pdf', True)
			
 
				+        self.enable_boundary_noise_filter = self.config.get('enable_boundary_noise_filter', True)
			
 
				         
			
 
				         logger.info(f"🔧 CellFusionEngine initialized: "
			
 
				                    f"unet_w={self.unet_weight}, rtdetr_w={self.rtdetr_weight}, "
			
 
				-                   f"iou_merge={self.iou_merge_threshold}, skip_txt_pdf={self.skip_rtdetr_for_txt_pdf}")
			
 
				+                   f"iou_merge={self.iou_merge_threshold}, skip_txt_pdf={self.skip_rtdetr_for_txt_pdf}, "
			
 
				+                   f"boundary_filter={self.enable_boundary_noise_filter}")
			
 
				     
			
 
				     def should_use_rtdetr(
			
 
				         self,
			
@@ -101,7 +104,6 @@ class CellFusionEngine:
 
				         unet_cells: List[List[float]],
			
 
				         ocr_boxes: List[Dict[str, Any]],
			
 
				         pdf_type: str = 'ocr',
			
 
				-        upscale: float = 1.0,
			
 
				         debug_dir: Optional[str] = None,
			
 
				         debug_prefix: str = "fusion"
			
 
				     ) -> Tuple[List[List[float]], Dict[str, Any]]:
			
@@ -113,7 +115,6 @@ class CellFusionEngine:
 
				             unet_cells: UNet检测的单元格列表 [[x1,y1,x2,y2], ...]（原图坐标系）
			
 
				             ocr_boxes: OCR结果列表
			
 
				             pdf_type: PDF类型 ('txt' 或 'ocr')
			
 
				-            upscale: UNet的上采样比例
			
 
				             debug_dir: 调试输出目录（可选）
			
 
				             debug_prefix: 调试文件前缀
			
 
				             
			
@@ -123,6 +124,12 @@ class CellFusionEngine:
 
				             - fusion_stats: 融合统计信息
			
 
				         """
			
 
				         h, w = table_image.shape[:2]
			
 
				+        unet_bbox = [
			
 
				+            min(unet_cells, key=lambda box: box[0])[0], \
			
 
				+            min(unet_cells, key=lambda box: box[1])[1], \
			
 
				+            max(unet_cells, key=lambda box: box[2])[2], \
			
 
				+            max(unet_cells, key=lambda box: box[3])[3]
			
 
				+        ] if unet_cells else [0,0,0,0]
			
 
				         
			
 
				         # 决策：是否使用 RT-DETR
			
 
				         use_rtdetr = self.should_use_rtdetr(pdf_type, len(unet_cells), (w, h))
			
@@ -143,14 +150,6 @@ class CellFusionEngine:
 
				             cell_labels = ['unet_only'] * len(fused_cells)  # 所有都是UNet独有
			
 
				             fusion_stats['fused_count'] = len(fused_cells)
			
 
				             
			
 
				-            # 可选：OCR补偿
			
 
				-            if self.enable_ocr_compensation and ocr_boxes:
			
 
				-                fused_cells, cell_labels, ocr_comp_count = self._compensate_with_ocr(
			
 
				-                    fused_cells, cell_labels, ocr_boxes, (w, h)
			
 
				-                )
			
 
				-                fusion_stats['ocr_compensated_count'] = ocr_comp_count
			
 
				-                fusion_stats['fused_count'] = len(fused_cells)
			
 
				-            
			
 
				             logger.info(f"📊 Fusion (UNet-only): {len(unet_cells)} → {len(fused_cells)} cells")
			
 
				             return fused_cells, fusion_stats
			
 
				         
			
@@ -165,6 +164,12 @@ class CellFusionEngine:
 
				             rtdetr_cells = [res['bbox'] for res in rtdetr_results]
			
 
				             rtdetr_scores = [res['score'] for res in rtdetr_results]
			
 
				             fusion_stats['rtdetr_count'] = len(rtdetr_cells)
			
 
				+            rtdetr_bbox = [
			
 
				+                min(rtdetr_cells, key=lambda box: box[0])[0],
			
 
				+                min(rtdetr_cells, key=lambda box: box[1])[1],
			
 
				+                max(rtdetr_cells, key=lambda box: box[2])[2],
			
 
				+                max(rtdetr_cells, key=lambda box: box[3])[3]
			
 
				+            ] if rtdetr_cells else [0,0,0,0]
			
 
				             
			
 
				             logger.debug(f"RT-DETR detected {len(rtdetr_cells)} cells")
			
 
				         except Exception as e:
			
@@ -175,28 +180,33 @@ class CellFusionEngine:
 
				         
			
 
				         # Phase 2: 智能融合
			
 
				         fused_cells, merge_stats, cell_labels = self._fuse_cells(
			
 
				-            unet_cells, rtdetr_cells, rtdetr_scores
			
 
				+            unet_bbox, unet_cells, rtdetr_cells, rtdetr_scores
			
 
				         )
			
 
				         fusion_stats['merged_count'] = merge_stats['merged']
			
 
				         fusion_stats['merged_cells_count'] = merge_stats['merged_cells']
			
 
				         fusion_stats['added_count'] = merge_stats['added']
			
 
				         
			
 
				         # Phase 3: NMS 去重
			
 
				-        fused_cells = self._nms_filter(fused_cells, self.iou_nms_threshold)
			
 
				-        
			
 
				-        # Phase 4: OCR 补偿（可选）
			
 
				-        # if self.enable_ocr_compensation and ocr_boxes:
			
 
				-        #     fused_cells, cell_labels, ocr_comp_count = self._compensate_with_ocr(
			
 
				-        #         fused_cells, cell_labels, ocr_boxes, (w, h)
			
 
				-        #     )
			
 
				-        #     fusion_stats['ocr_compensated_count'] = ocr_comp_count
			
 
				+        fused_cells, suppressed = self._nms_filter(fused_cells, self.iou_nms_threshold)
			
 
				+        # 同步更新 cell_labels
			
 
				+        cell_labels = [label for label, keep in zip(cell_labels, suppressed) if not keep]
			
 
				+        
			
 
				+        # Phase 4: 边界噪声过滤（过滤掉边界的 unet_only 噪声单元格）
			
 
				+        if self.enable_boundary_noise_filter:
			
 
				+            fused_cells, cell_labels, noise_filtered = self._filter_boundary_noise(
			
 
				+                fused_cells, cell_labels, ocr_boxes, rtdetr_bbox
			
 
				+            )
			
 
				+            fusion_stats['noise_filtered_count'] = noise_filtered
			
 
				+        else:
			
 
				+            fusion_stats['noise_filtered_count'] = 0
			
 
				+            noise_filtered = 0
			
 
				         
			
 
				         fusion_stats['fused_count'] = len(fused_cells)
			
 
				         
			
 
				         logger.info(
			
 
				             f"📊 Fusion (UNet+RT-DETR): UNet={len(unet_cells)}, RT-DETR={len(rtdetr_cells)}, "
			
 
				             f"1:1Merged={merge_stats['merged']}, MergedCells={merge_stats['merged_cells']}, "
			
 
				-            f"Added={merge_stats['added']}, Final={len(fused_cells)}"
			
 
				+            f"Added={merge_stats['added']}, NoiseFiltered={noise_filtered}, Final={len(fused_cells)}"
			
 
				         )
			
 
				         
			
 
				         # 可视化（调试）
			
@@ -210,6 +220,7 @@ class CellFusionEngine:
 
				     
			
 
				     def _fuse_cells(
			
 
				         self,
			
 
				+        unet_bbox: List[float],
			
 
				         unet_cells: List[List[float]],
			
 
				         rtdetr_cells: List[List[float]],
			
 
				         rtdetr_scores: List[float]
			
@@ -232,6 +243,7 @@ class CellFusionEngine:
 
				         - 总覆盖率>40%（所有UNet面积之和 / RT-DETR面积）
			
 
				         
			
 
				         Args:
			
 
				+            unet_bbox: UNet单元格的边界框 [x1, y1, x2, y2]
			
 
				             unet_cells: UNet单元格列表
			
 
				             rtdetr_cells: RT-DETR单元格列表
			
 
				             rtdetr_scores: RT-DETR置信度列表
			
@@ -243,14 +255,6 @@ class CellFusionEngine:
 
				             - cell_labels: 每个单元格的来源标签列表 ['merged_span', 'merged_1to1', 'unet_only', 'rtdetr_only', 'new']
			
 
				         """
			
 
				         
			
 
				-        # 计算unet_cells的边界框bbox[x1,y1,x2,y2]
			
 
				-        unet_bbox = [
			
 
				-                    min(unet_cells, key=lambda box: box[0])[0], \
			
 
				-                    min(unet_cells, key=lambda box: box[1])[1], \
			
 
				-                    max(unet_cells, key=lambda box: box[2])[2], \
			
 
				-                    max(unet_cells, key=lambda box: box[3])[3]
			
 
				-                ] if unet_cells else [0,0,0,0]
			
 
				-                    
			
 
				         fused_cells = []
			
 
				         cell_labels = []  # 记录每个单元格的来源标签
			
 
				         unet_matched = [False] * len(unet_cells)
			
@@ -360,6 +364,7 @@ class CellFusionEngine:
 
				             if best_match_idx >= 0 and best_iou >= self.iou_merge_threshold:
			
 
				                 # 高IoU：加权平均合并
			
 
				                 merged_cell = self._weighted_merge_bbox(
			
 
				+                    unet_bbox,
			
 
				                     unet_cell,
			
 
				                     rtdetr_cells[best_match_idx],
			
 
				                     self.unet_weight,
			
@@ -396,6 +401,7 @@ class CellFusionEngine:
 
				     
			
 
				     def _weighted_merge_bbox(
			
 
				         self,
			
 
				+        table_bbox: List[float],
			
 
				         bbox1: List[float],
			
 
				         bbox2: List[float],
			
 
				         weight1: float,
			
@@ -405,6 +411,7 @@ class CellFusionEngine:
 
				         加权平均合并两个 bbox
			
 
				         
			
 
				         Args:
			
 
				+            table_bbox: 表格整体 bbox（用于限制合并结果）
			
 
				             bbox1: [x1, y1, x2, y2]
			
 
				             bbox2: [x1, y1, x2, y2]
			
 
				             weight1: bbox1 的权重
			
@@ -414,17 +421,17 @@ class CellFusionEngine:
 
				             merged_bbox: [x1, y1, x2, y2]
			
 
				         """
			
 
				         return [
			
 
				-            weight1 * bbox1[0] + weight2 * bbox2[0],
			
 
				-            weight1 * bbox1[1] + weight2 * bbox2[1],
			
 
				-            weight1 * bbox1[2] + weight2 * bbox2[2],
			
 
				-            weight1 * bbox1[3] + weight2 * bbox2[3]
			
 
				+            max(table_bbox[0], weight1 * bbox1[0] + weight2 * bbox2[0]),
			
 
				+            max(table_bbox[1], weight1 * bbox1[1] + weight2 * bbox2[1]),
			
 
				+            min(table_bbox[2], weight1 * bbox1[2] + weight2 * bbox2[2]),
			
 
				+            min(table_bbox[3], weight1 * bbox1[3] + weight2 * bbox2[3])
			
 
				         ]
			
 
				     
			
 
				     def _nms_filter(
			
 
				         self,
			
 
				         cells: List[List[float]],
			
 
				         iou_threshold: float
			
 
				-    ) -> List[List[float]]:
			
 
				+    ) -> Tuple[List[List[float]], List[bool]]:
			
 
				         """
			
 
				         简单 NMS 过滤（去除高度重叠的冗余框）
			
 
				         
			
@@ -436,9 +443,10 @@ class CellFusionEngine:
 
				             
			
 
				         Returns:
			
 
				             过滤后的单元格列表
			
 
				+            抑制标记列表
			
 
				         """
			
 
				         if len(cells) == 0:
			
 
				-            return []
			
 
				+            return [], []
			
 
				         
			
 
				         # 计算面积并排序（大框优先）
			
 
				         areas = [(x2 - x1) * (y2 - y1) for x1, y1, x2, y2 in cells]
			
@@ -463,81 +471,92 @@ class CellFusionEngine:
 
				                     suppressed[other_idx] = True
			
 
				         
			
 
				         logger.debug(f"NMS: {len(cells)} → {len(keep)} cells (threshold={iou_threshold})")
			
 
				-        return keep
			
 
				+        return keep, suppressed
			
 
				     
			
 
				-    def _compensate_with_ocr(
			
 
				+    def _filter_boundary_noise(
			
 
				         self,
			
 
				         cells: List[List[float]],
			
 
				         cell_labels: List[str],
			
 
				         ocr_boxes: List[Dict[str, Any]],
			
 
				-        table_size: Tuple[int, int]
			
 
				+        rtdetr_bbox: List[float]
			
 
				     ) -> Tuple[List[List[float]], List[str], int]:
			
 
				         """
			
 
				-        使用 OCR 补偿遗漏的单元格
			
 
				+        过滤边界噪声单元格
			
 
				         
			
 
				-        策略：如果 OCR 文本没有匹配到任何单元格，创建新单元格
			
 
				+        过滤条件：
			
 
				+        1. 单元格标记为 'unet_only'（只在 UNet 中检测到，RT-DETR 未匹配）——注意：该标签检查目前在代码中已被注释停用，实际对所有标签的单元格生效
			
 
				+        2. 单元格位于表格边界（左边界或右边界）
			
 
				+        3. 单元格内没有任何 OCR 文本框（说明是空白区域）
			
 
				         
			
 
				         Args:
			
 
				-            cells: 现有单元格列表
			
 
				+            cells: 单元格列表
			
 
				             cell_labels: 单元格标签列表
			
 
				             ocr_boxes: OCR结果列表
			
 
				-            table_size: 表格尺寸 (width, height)
			
 
				-            
			
 
				+            rtdetr_bbox: RT-DETR单元格的边界框 [x1, y1, x2, y2]
			
 
				         Returns:
			
 
				-            (compensated_cells, compensated_labels, compensation_count)
			
 
				+            (filtered_cells, filtered_labels, filtered_count)
			
 
				         """
			
 
				-        compensated = cells.copy()
			
 
				-        compensated_labels = cell_labels.copy()
			
 
				-        compensation_count = 0
			
 
				-        w, h = table_size
			
 
				-        
			
 
				-        for ocr in ocr_boxes:
			
 
				-            ocr_bbox = ocr.get('bbox', [])
			
 
				-            if not ocr_bbox or len(ocr_bbox) < 4:
			
 
				+        filtered_cells = []
			
 
				+        filtered_labels = []
			
 
				+        filtered_count = 0
			
 
				+        
			
 
				+        for cell, label in zip(cells, cell_labels):
			
 
				+            # # 只过滤 unet_only 标记的单元格
			
 
				+            # if label != 'unet_only':
			
 
				+            #     filtered_cells.append(cell)
			
 
				+            #     filtered_labels.append(label)
			
 
				+            #     continue
			
 
				+            
			
 
				+            x1, y1, x2, y2 = cell
			
 
				+            
			
 
				+            # 检查是否在边界
			
 
				+            is_left_boundary = x1 <= rtdetr_bbox[0]
			
 
				+            is_right_boundary = x2 >= rtdetr_bbox[2]
			
 
				+            is_on_boundary = is_left_boundary or is_right_boundary
			
 
				+            
			
 
				+            if not is_on_boundary:
			
 
				+                # 不在边界，保留
			
 
				+                filtered_cells.append(cell)
			
 
				+                filtered_labels.append(label)
			
 
				                 continue
			
 
				             
			
 
				-            # 计算 OCR 中心点
			
 
				-            if len(ocr_bbox) == 8:  # poly format
			
 
				-                ocr_cx = (ocr_bbox[0] + ocr_bbox[2] + ocr_bbox[4] + ocr_bbox[6]) / 4
			
 
				-                ocr_cy = (ocr_bbox[1] + ocr_bbox[3] + ocr_bbox[5] + ocr_bbox[7]) / 4
			
 
				-            else:  # bbox format
			
 
				-                ocr_cx = (ocr_bbox[0] + ocr_bbox[2]) / 2
			
 
				-                ocr_cy = (ocr_bbox[1] + ocr_bbox[3]) / 2
			
 
				-            
			
 
				-            # 检查是否在任何单元格内
			
 
				-            is_covered = False
			
 
				-            for cell in compensated:
			
 
				-                x1, y1, x2, y2 = cell
			
 
				+            # 检查单元格内是否有 OCR 文本框
			
 
				+            has_ocr = False
			
 
				+            for ocr in ocr_boxes:
			
 
				+                ocr_bbox = ocr.get('bbox', [])
			
 
				+                if not ocr_bbox or len(ocr_bbox) < 4:
			
 
				+                    continue
			
 
				+                
			
 
				+                # 计算 OCR 中心点
			
 
				+                if len(ocr_bbox) == 8:  # poly format
			
 
				+                    ocr_cx = (ocr_bbox[0] + ocr_bbox[2] + ocr_bbox[4] + ocr_bbox[6]) / 4
			
 
				+                    ocr_cy = (ocr_bbox[1] + ocr_bbox[3] + ocr_bbox[5] + ocr_bbox[7]) / 4
			
 
				+                else:  # bbox format
			
 
				+                    ocr_cx = (ocr_bbox[0] + ocr_bbox[2]) / 2
			
 
				+                    ocr_cy = (ocr_bbox[1] + ocr_bbox[3]) / 2
			
 
				+                
			
 
				+                # 检查 OCR 中心点是否在当前单元格内
			
 
				                 if x1 <= ocr_cx <= x2 and y1 <= ocr_cy <= y2:
			
 
				-                    is_covered = True
			
 
				+                    has_ocr = True
			
 
				                     break
			
 
				             
			
 
				-            # 如果孤立，创建新单元格
			
 
				-            if not is_covered:
			
 
				-                # 扩展 OCR bbox 作为新单元格
			
 
				-                if len(ocr_bbox) == 8:
			
 
				-                    new_cell = [
			
 
				-                        float(max(0, min(ocr_bbox[0], ocr_bbox[6]) - 5)),
			
 
				-                        float(max(0, min(ocr_bbox[1], ocr_bbox[3]) - 5)),
			
 
				-                        float(min(w, max(ocr_bbox[2], ocr_bbox[4]) + 5)),
			
 
				-                        float(min(h, max(ocr_bbox[5], ocr_bbox[7]) + 5))
			
 
				-                    ]
			
 
				-                else:
			
 
				-                    new_cell = [
			
 
				-                        float(max(0, ocr_bbox[0] - 5)),
			
 
				-                        float(max(0, ocr_bbox[1] - 5)),
			
 
				-                        float(min(w, ocr_bbox[2] + 5)),
			
 
				-                        float(min(h, ocr_bbox[3] + 5))
			
 
				-                    ]
			
 
				-                
			
 
				-                compensated.append(new_cell)
			
 
				-                compensated_labels.append('new')  # 标记为新增（OCR补偿）
			
 
				-                compensation_count += 1
			
 
				+            # 如果在边界且没有 OCR 文本，认为是噪声，过滤掉
			
 
				+            if not has_ocr:
			
 
				+                boundary_type = "left" if is_left_boundary else "right"
			
 
				+                logger.debug(
			
 
				+                    f"🗑️ 过滤边界噪声: {boundary_type} boundary cell "
			
 
				+                    f"[{x1:.1f}, {y1:.1f}, {x2:.1f}, {y2:.1f}] (no OCR)"
			
 
				+                )
			
 
				+                filtered_count += 1
			
 
				+            else:
			
 
				+                # 有 OCR 文本，保留
			
 
				+                filtered_cells.append(cell)
			
 
				+                filtered_labels.append(label)
			
 
				         
			
 
				-        if compensation_count > 0:
			
 
				-            logger.debug(f"OCR compensation: added {compensation_count} cells")
			
 
				+        if filtered_count > 0:
			
 
				+            logger.info(f"🗑️ Boundary noise filtering: removed {filtered_count} unet_only cells from boundaries")
			
 
				         
			
 
				-        return compensated, compensated_labels, compensation_count
			
 
				+        return filtered_cells, filtered_labels, filtered_count
			
 
				     
			
 
				     def _visualize_fusion(
			
 
				         self,
			
--- a/ocr_validator/ocr_validator_layout.py
+++ b/ocr_validator/ocr_validator_layout.py
@@ -30,6 +30,9 @@ if str(ocr_platform_root) not in sys.path:
 
				 from ocr_utils.html_utils import convert_html_table_to_markdown, parse_html_tables
			
 
				 from ocr_utils.visualization_utils import VisualizationUtils
			
 
				 
			
 
				+# BeautifulSoup用于精确HTML表格处理
			
 
				+from bs4 import BeautifulSoup
			
 
				+
			
 
				 # 从本地文件导入 Streamlit 特定函数
			
 
				 from ocr_validator_file_utils import load_css_styles
			
 
				 
			
@@ -69,16 +72,21 @@ class OCRLayoutManager:
 
				     def _highlight_text_safely(self, content: str, text_to_highlight: str, 
			
 
				                                highlight_class: str, title: Optional[str] = None) -> str:
			
 
				         """
			
 
				-        安全地高亮文本，避免替换base64编码中的内容
			
 
				+        安全地高亮文本，保护Markdown语法（特别是图片）
			
 
				+        
			
 
				+        策略：
			
 
				+        1. 保护特殊内容（HTML注释、Markdown图片）
			
 
				+        2. 只对HTML表格使用BeautifulSoup精确处理
			
 
				+        3. 其他部分使用简单字符串替换，保持Markdown格式
			
 
				         
			
 
				         Args:
			
 
				-            content: 要处理的HTML内容
			
 
				+            content: 要处理的Markdown/HTML混合内容
			
 
				             text_to_highlight: 要高亮的文本
			
 
				             highlight_class: 高亮样式类名
			
 
				-            title: 鼠标悬停提示文本，默认为text_to_highlight
			
 
				+            title: 鼠标悬停提示文本
			
 
				         
			
 
				         Returns:
			
 
				-            处理后的HTML内容
			
 
				+            处理后的内容
			
 
				         """
			
 
				         if not text_to_highlight or text_to_highlight not in content:
			
 
				             return content
			
@@ -86,48 +94,78 @@ class OCRLayoutManager:
 
				         if title is None:
			
 
				             title = text_to_highlight
			
 
				         
			
 
				-        # 转义特殊字符用于正则表达式
			
 
				-        escaped_text = re.escape(text_to_highlight)
			
 
				-        
			
 
				-        # 找出所有base64编码区域的位置
			
 
				-        # 匹配两种格式：
			
 
				-        # 1. HTML: src="data:image/...;base64,..." 或 src='data:image/...;base64,...'
			
 
				-        # 2. Markdown: ![...](data:image/...;base64,...)
			
 
				-        # 使用更通用的模式来匹配base64数据
			
 
				-        base64_pattern = r'data:image/[^;]+;base64,[A-Za-z0-9+/=]+'
			
 
				-        base64_regions = []
			
 
				-        
			
 
				-        for match in re.finditer(base64_pattern, content):
			
 
				-            base64_regions.append((match.start(), match.end()))
			
 
				-        
			
 
				-        # 找出所有要高亮文本的位置
			
 
				-        text_pattern = re.compile(escaped_text)
			
 
				-        matches = []
			
 
				-        
			
 
				-        for match in text_pattern.finditer(content):
			
 
				-            start, end = match.start(), match.end()
			
 
				-            
			
 
				-            # 检查该位置是否在base64区域内
			
 
				-            in_base64 = False
			
 
				-            for base64_start, base64_end in base64_regions:
			
 
				-                if base64_start <= start < base64_end:
			
 
				-                    in_base64 = True
			
 
				-                    break
			
 
				-            
			
 
				-            # 只保留不在base64区域内的匹配
			
 
				-            if not in_base64:
			
 
				-                matches.append((start, end))
			
 
				-        
			
 
				-        # 从后向前替换，避免位置偏移
			
 
				-        for start, end in reversed(matches):
			
 
				-            original_text = content[start:end]
			
 
				-            # 转义HTML特殊字符
			
 
				-            escaped_original = html.escape(original_text)
			
 
				-            escaped_title = html.escape(title)
			
 
				-            highlighted = f'<span class="{highlight_class}" title="{escaped_title}">{escaped_original}</span>'
			
 
				-            content = content[:start] + highlighted + content[end:]
			
 
				-        
			
 
				-        return content
			
 
				+        try:
			
 
				+            import re
			
 
				+            
			
 
				+            # 1. 提取并保护特殊内容
			
 
				+            protected_parts = []
			
 
				+            
			
 
				+            # 保护 HTML 注释
			
 
				+            def protect_comment(match):
			
 
				+                protected_parts.append(match.group(0))
			
 
				+                return f"__PROTECTED_{len(protected_parts) - 1}__"
			
 
				+            
			
 
				+            content = re.sub(r'<!--.*?-->', protect_comment, content, flags=re.DOTALL)
			
 
				+            
			
 
				+            # 保护 Markdown 图片（完整语法）
			
 
				+            def protect_image(match):
			
 
				+                protected_parts.append(match.group(0))
			
 
				+                return f"__PROTECTED_{len(protected_parts) - 1}__"
			
 
				+            
			
 
				+            content = re.sub(r'!\[.*?\]\([^)]+\)', protect_image, content)
			
 
				+            
			
 
				+            # 2. 提取表格并单独处理
			
 
				+            tables = []
			
 
				+            def extract_table(match):
			
 
				+                tables.append(match.group(0))
			
 
				+                return f"__TABLE_{len(tables) - 1}__"
			
 
				+            
			
 
				+            content = re.sub(r'<table[^>]*>.*?</table>', extract_table, content, flags=re.DOTALL)
			
 
				+            
			
 
				+            # 3. 对表格使用 BeautifulSoup 精确处理
			
 
				+            highlighted_tables = []
			
 
				+            
			
 
				+            for table_html in tables:
			
 
				+                soup = BeautifulSoup(table_html, 'html.parser')
			
 
				+                
			
 
				+                # 在表格单元格中查找完全匹配
			
 
				+                for td in soup.find_all(['td', 'th']):
			
 
				+                    cell_text = td.get_text(strip=True)
			
 
				+                    if cell_text == text_to_highlight:
			
 
				+                        # 给整个单元格添加高亮类
			
 
				+                        current_classes = td.get('class', [])
			
 
				+                        td['class'] = current_classes + highlight_class.split()
			
 
				+                        if title:
			
 
				+                            td['title'] = title
			
 
				+                
			
 
				+                highlighted_tables.append(str(soup))
			
 
				+            
			
 
				+            # 4. 对普通文本进行简单替换（保持Markdown格式，跳过占位符）
			
 
				+            if text_to_highlight in content:
			
 
				+                highlight_span = f'<span class="{highlight_class}"'
			
 
				+                if title:
			
 
				+                    highlight_span += f' title="{title}"'
			
 
				+                highlight_span += f'>{text_to_highlight}</span>'
			
 
				+                
			
 
				+                # 🎯 安全替换：使用正则表达式，排除占位符内的匹配
			
 
				+                # 负向后顾 (?<!...)：确保匹配前面不是占位符前缀；负向前瞻 (?!__)：确保匹配后面不是占位符结尾
			
 
				+                pattern = f'(?<!__PROTECTED_)(?<!__TABLE_){re.escape(text_to_highlight)}(?!__)'
			
 
				+                content = re.sub(pattern, highlight_span, content)
			
 
				+            
			
 
				+            # 5. 恢复表格
			
 
				+            for i, table in enumerate(highlighted_tables):
			
 
				+                content = content.replace(f"__TABLE_{i}__", table)
			
 
				+            
			
 
				+            # 6. 恢复受保护的内容（图片和注释）
			
 
				+            for i, protected in enumerate(protected_parts):
			
 
				+                content = content.replace(f"__PROTECTED_{i}__", protected)
			
 
				+            
			
 
				+            return content
			
 
				+            
			
 
				+        except Exception as e:
			
 
				+            st.warning(f"文本高亮时出错: {str(e)}")
			
 
				+            return content
			
 
				+            
			
 
				     
			
 
				     def clear_image_cache(self):
			
 
				         """清理所有图像缓存"""
Yazar	SHA1 Mesaj	Tarih
zhch158_admin	73553fa5a5 feat(OCRLayoutManager): 改进文本高亮功能，保护Markdown语法并精确处理HTML表格	2 ay önce
zhch158_admin	225c4d2795 fix(mineru_wired_table): 移除不必要的upscale参数以简化方法调用	2 ay önce
zhch158_admin	49c5f418ac feat(cell_fusion): 添加边界噪声过滤功能以提高单元格融合质量	2 ay önce
zhch158_admin	aec2884ced feat(docs): 添加GitHub Copilot Agent指令文档，包含Python环境要求和常用命令模板	2 ay önce