il y a 6 mois · 444025c466
--- a/ocr_validator/README.md
+++ b/ocr_validator/README.md
@@ -0,0 +1,227 @@
 
				+# 🔍 OCR 可视化验证系统
			
 
				+
			
 
				+一个功能强大的 OCR 识别与验证系统，集成了多种 OCR 工具支持、智能交叉验证、可视化校验和表格数据分析功能，专为财务报表、数据表格等复杂文档设计。
			
 
				+
			
 
				+## 📁 目录结构
			
 
				+
			
 
				+```
			
 
				+ocr_validator/
			
 
				+├── __init__.py                      # 包初始化文件
			
 
				+├── streamlit_ocr_validator.py       # 主应用入口
			
 
				+├── streamlit_validator_core.py      # 核心验证器类
			
 
				+├── streamlit_validator_ui.py        # UI 组件
			
 
				+├── streamlit_validator_table.py     # 表格处理
			
 
				+├── streamlit_validator_cross.py     # 交叉验证
			
 
				+├── streamlit_validator_result.py    # 结果展示
			
 
				+├── ocr_validator_layout.py          # 布局管理
			
 
				+├── ocr_validator_utils.py           # OCR 验证工具函数
			
 
				+├── ocr_validator_file_utils.py      # 文件处理工具（Streamlit 特定）
			
 
				+├── config_manager.py                # 配置管理器
			
 
				+├── run_streamlit_validator.py       # 启动脚本
			
 
				+├── styles.css                       # 样式文件
			
 
				+├── config/                          # 配置文件目录
			
 
				+│   ├── global.yaml
			
 
				+│   └── ... (其他配置文件)
			
 
				+└── README.md                        # 本文档
			
 
				+```
			
 
				+
			
 
				+## ✨ 核心功能
			
 
				+
			
 
				+### 🎯 多工具 OCR 支持
			
 
				+
			
 
				+- **Dots OCR**：专业 VLM OCR 引擎
			
 
				+- **PaddleOCR PPStructV3**：结构化文档识别
			
 
				+- **PaddleOCR-VL**：视觉语言模型增强版
			
 
				+- **Table Recognition V2**：专业表格识别
			
 
				+- **MinerU VLM**：多模态文档理解
			
 
				+
			
 
				+### 🔄 智能交叉验证
			
 
				+
			
 
				+- **多数据源对比**：支持不同 OCR 工具结果的交叉验证
			
 
				+- **细粒度差异检测**：精确到单元格级别的差异分析
			
 
				+- **智能表格对比**：
			
 
				+  - 标准表格模式（`standard`）
			
 
				+  - 流水表格模式（`flow_list`）- 支持表头位置智能检测
			
 
				+  - **资产负债表等多层表头识别**：自动识别总表头和分类标题
			
 
				+  - 自动列类型检测（数字、日期、文本、文本型数字）
			
 
				+- **差异分类统计**：
			
 
				+  - 金额差异（`table_amount`）- 严重度：**high**
			
 
				+  - 日期时间差异（`table_datetime`）- 严重度：**medium**
			
 
				+  - 文本差异（`table_text`）- 严重度：**low/medium**（根据相似度）
			
 
				+  - 表头差异（位置、内容、结构）
			
 
				+  - 段落差异
			
 
				+  - **列类型冲突**（自动提升严重度到 **high**）
			
 
				+
			
 
				+### 📊 可视化校验工具
			
 
				+
			
 
				+- **交互式图像标注**：点击文本高亮对应图像位置
			
 
				+- **精确坐标定位**：基于 bbox 的精确位置标示
			
 
				+- **旋转角度处理**：
			
 
				+  - 自动检测文档旋转角度
			
 
				+  - 支持手动调整旋转
			
 
				+  - 智能坐标转换（使用 `ocr_utils.image_utils.rotate_image_and_coordinates`）
			
 
				+- **多渲染模式**：
			
 
				+  - HTML 渲染（支持横向滚动，使用 `ocr_utils.html_utils.process_html_images`）
			
 
				+  - Markdown 渲染（使用 `ocr_utils.html_utils.process_markdown_images`）
			
 
				+  - DataFrame 表格（使用 `ocr_utils.html_utils.parse_html_tables`）
			
 
				+  - 原始文本
			
 
				+
			
 
				+### 📈 表格数据分析
			
 
				+
			
 
				+- **智能表格解析**：自动识别并转换 HTML 表格（使用 `ocr_utils.html_utils.convert_html_table_to_markdown`）
			
 
				+- **交互式操作**：
			
 
				+  - 过滤（按列值、关键词）
			
 
				+  - 排序（升序/降序）
			
 
				+  - 分页显示
			
 
				+  - 列选择显示
			
 
				+- **数据导出**：支持 CSV、Excel 格式
			
 
				+- **统计分析**：自动生成数值列统计信息
			
 
				+
			
 
				+### 🔧 数字标准化
			
 
				+
			
 
				+- 全角/半角字符转换
			
 
				+- 千分位分隔符标准化
			
 
				+- 小数点格式统一
			
 
				+- 支持批量标准化处理（使用 `ocr_utils.number_utils`）
			
 
				+
			
 
				+## 🚀 快速开始
			
 
				+
			
 
				+### 环境配置
			
 
				+
			
 
				+```bash
			
 
				+# 创建 conda 环境
			
 
				+conda create -n py312 python=3.12 -y
			
 
				+conda activate py312
			
 
				+
			
 
				+# 进入 ocr_platform 目录
			
 
				+cd /path/to/ocr_platform
			
 
				+
			
 
				+# 安装依赖
			
 
				+pip install streamlit plotly pandas pillow numpy opencv-python openpyxl \
			
 
				+    beautifulsoup4 pyyaml fuzzywuzzy python-Levenshtein
			
 
				+```
			
 
				+
			
 
				+### 配置文件
			
 
				+
			
 
				+编辑配置文件（详见 `config/README.md`）：
			
 
				+
			
 
				+#### 1. 全局配置 (`config/global.yaml`)
			
 
				+
			
 
				+```yaml
			
 
				+data_sources:
			
 
				+  - 德_内蒙古银行照.yaml
			
 
				+  - 对公_招商银行图.yaml
			
 
				+  - A用户_单元格扫描流水.yaml
			
 
				+  - B用户_扫描流水.yaml
			
 
				+  - 至远彩色_2023年报.yaml
			
 
				+```
			
 
				+
			
 
				+#### 2. 项目配置示例 (`config/B用户_扫描流水.yaml`)
			
 
				+
			
 
				+```yaml
			
 
				+name: "B用户_扫描流水"
			
 
				+description: "扫描流水数据验证"
			
 
				+data_path: "/path/to/data"
			
 
				+ocr:
			
 
				+  tools:
			
 
				+    dots_ocr:
			
 
				+      name: "Dots OCR"
			
 
				+      json_structure: "array"
			
 
				+      text_field: "text"
			
 
				+      bbox_field: "bbox"
			
 
				+      category_field: "category"
			
 
				+      confidence_field: "confidence"
			
 
				+```
			
 
				+
			
 
				+### 启动应用
			
 
				+
			
 
				+```bash
			
 
				+# 方式 1: 使用启动脚本
			
 
				+cd /path/to/ocr_platform/ocr_validator
			
 
				+python run_streamlit_validator.py
			
 
				+
			
 
				+# 方式 2: 直接使用 Streamlit
			
 
				+streamlit run streamlit_ocr_validator.py
			
 
				+```
			
 
				+
			
 
				+## 🔗 依赖关系
			
 
				+
			
 
				+### 内部依赖
			
 
				+
			
 
				+- **`ocr_utils`**：通用工具模块
			
 
				+  - `ocr_utils.html_utils`：HTML/Markdown 处理
			
 
				+  - `ocr_utils.image_utils`：图像处理和旋转
			
 
				+  - `ocr_utils.visualization_utils`：可视化工具
			
 
				+  - `ocr_utils.number_utils`：数字标准化
			
 
				+
			
 
				+### 外部依赖
			
 
				+
			
 
				+- **Streamlit**：Web 应用框架
			
 
				+- **Pandas**：数据处理
			
 
				+- **Plotly**：数据可视化
			
 
				+- **Pillow**：图像处理
			
 
				+- **OpenCV**：图像方向检测
			
 
				+- **PyYAML**：配置文件解析
			
 
				+
			
 
				+## 📝 使用说明
			
 
				+
			
 
				+### 1. 数据源配置
			
 
				+
			
 
				+在 `config/global.yaml` 中配置数据源列表，每个数据源对应一个 YAML 配置文件。
			
 
				+
			
 
				+### 2. OCR 数据加载
			
 
				+
			
 
				+系统会自动扫描配置的数据路径，查找 OCR 结果文件（JSON 格式）。
			
 
				+
			
 
				+### 3. 交叉验证
			
 
				+
			
 
				+选择两个不同的 OCR 数据源进行对比，系统会自动检测差异并生成报告。
			
 
				+
			
 
				+### 4. 可视化校验
			
 
				+
			
 
				+- 点击文本区域，自动高亮对应的图像位置
			
 
				+- 支持旋转角度调整
			
 
				+- 支持多种渲染模式切换
			
 
				+
			
 
				+### 5. 表格分析
			
 
				+
			
 
				+- 自动识别 HTML 表格
			
 
				+- 支持交互式过滤、排序、分页
			
 
				+- 支持数据导出
			
 
				+
			
 
				+## 🛠️ 开发说明
			
 
				+
			
 
				+### 模块结构
			
 
				+
			
 
				+- **`streamlit_ocr_validator.py`**：主入口，负责页面布局和路由
			
 
				+- **`streamlit_validator_core.py`**：核心验证器类，管理 OCR 数据和状态
			
 
				+- **`streamlit_validator_ui.py`**：UI 组件和页面配置
			
 
				+- **`streamlit_validator_table.py`**：表格处理和分析
			
 
				+- **`streamlit_validator_cross.py`**：交叉验证功能
			
 
				+- **`streamlit_validator_result.py`**：结果展示
			
 
				+- **`ocr_validator_layout.py`**：布局管理和图像处理
			
 
				+- **`ocr_validator_utils.py`**：OCR 验证专用工具函数
			
 
				+- **`ocr_validator_file_utils.py`**：Streamlit 特定的文件处理函数
			
 
				+
			
 
				+### 公共工具复用
			
 
				+
			
 
				+通用工具已提取到 `ocr_utils` 模块：
			
 
				+
			
 
				+- **HTML/Markdown 处理**：`ocr_utils.html_utils`
			
 
				+- **图像旋转**：`ocr_utils.image_utils.rotate_image_and_coordinates`
			
 
				+- **可视化**：`ocr_utils.visualization_utils.VisualizationUtils.draw_bbox_on_image`
			
 
				+
			
 
				+### 添加新的 OCR 工具支持
			
 
				+
			
 
				+1. 在 `config/` 目录下创建工具配置文件
			
 
				+2. 在 `ocr_validator_utils.py` 中添加数据解析函数
			
 
				+3. 更新 `config/global.yaml` 添加数据源
			
 
				+
			
 
				+## 📄 许可证
			
 
				+
			
 
				+[根据项目许可证填写]
			
 
				+
			
 
				+## 🤝 贡献
			
 
				+
			
 
				+欢迎提交 Issue 和 Pull Request！
			
 
				+
			
--- a/ocr_validator/__init__.py
+++ b/ocr_validator/__init__.py
@@ -0,0 +1,25 @@
 
				+"""
			
 
				+OCR Validator - 基于 Streamlit 的 OCR 可视化校验工具
			
 
				+
			
 
				+提供 OCR 结果的交互式验证、交叉验证和可视化功能。
			
 
				+"""
			
 
				+
			
 
				+import sys
			
 
				+from pathlib import Path
			
 
				+
			
 
				+# 添加 ocr_platform 根目录到 Python 路径（用于导入 ocr_utils）
			
 
				+# 使用 resolve() 确保路径是绝对路径，避免相对路径导致的 IndexError
			
 
				+_file_path = Path(__file__).resolve()
			
 
				+ocr_platform_root = _file_path.parents[1]  # ocr_validator -> ocr_platform
			
 
				+if str(ocr_platform_root) not in sys.path:
			
 
				+    sys.path.insert(0, str(ocr_platform_root))
			
 
				+
			
 
				+from .streamlit_validator_core import StreamlitOCRValidator
			
 
				+from .config_manager import ConfigManager, load_config
			
 
				+
			
 
				+__all__ = [
			
 
				+    'StreamlitOCRValidator',
			
 
				+    'ConfigManager',
			
 
				+    'load_config',
			
 
				+]
			
 
				+
			
--- a/ocr_validator/config/A用户_单元格扫描流水.yaml
+++ b/ocr_validator/config/A用户_单元格扫描流水.yaml
@@ -0,0 +1,61 @@
 
				+document:
			
 
				+  name: "A用户_单元格扫描流水"
			
 
				+  base_dir: "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水"
			
 
				+
			
 
				+  # 🎯 关键改进：定义该文档使用的 OCR 工具及其结果目录
			
 
				+  ocr_results:
			
 
				+    # bank_statement_yusys_v2
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "bank_statement_yusys_v2"
			
 
				+      image_dir: "mineru_vllm_results/{{name}}"
			
 
				+      description: "YUSYS统一OCR框架"
			
 
				+      enabled: true
			
 
				+
			
 
				+    # MinerU
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "mineru_vllm_results"
			
 
				+      image_dir: "mineru_vllm_results/{{name}}"
			
 
				+      description: "MinerU 图片合成结果"
			
 
				+      enabled: true
			
 
				+    
			
 
				+    # MinerU (带 cell bbox)
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "mineru_vllm_results_cell_bbox"
			
 
				+      image_dir: "mineru_vllm_results/{{name}}"
			
 
				+      description: "MinerU + PaddleOCR 坐标"
			
 
				+      enabled: true
			
 
				+   
			
 
				+    # PaddleOCR-VL
			
 
				+    - tool: "paddleocr_vl"
			
 
				+      result_dir: "paddleocr_vl_results"
			
 
				+      image_dir: "paddleocr_vl_results/{{name}}"
			
 
				+      description: "PaddleOCR VLM 图片合成结果"
			
 
				+      enabled: true
			
 
				+    
			
 
				+    # PaddleOCR-VL (带 cell bbox)
			
 
				+    - tool: "mineru"  # 格式同 MinerU
			
 
				+      result_dir: "paddleocr_vl_results_cell_bbox"
			
 
				+      image_dir: "paddleocr_vl_results/{{name}}"
			
 
				+      description: "PaddleOCR VLM + PaddleOCR 坐标"
			
 
				+      enabled: true
			
 
				+    
			
 
				+    # DotsOCR
			
 
				+    - tool: "dots_ocr"
			
 
				+      result_dir: "dotsocr_vllm_results"
			
 
				+      image_dir: "dotsocr_vllm_results/{{name}}"
			
 
				+      description: "Dots OCR 图片合成结果"
			
 
				+      enabled: true
			
 
				+  
			
 
				+    # DotsOCR (带 cell bbox)
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "dotsocr_vllm_results_cell_bbox"
			
 
				+      image_dir: "dotsocr_vllm_results/{{name}}"
			
 
				+      description: "Dots OCR + PaddleOCR 坐标"
			
 
				+      enabled: true
			
 
				+
			
 
				+    # PPStructV3
			
 
				+    - tool: "ppstructv3"
			
 
				+      result_dir: "ppstructurev3_client_results"
			
 
				+      image_dir: "ppstructurev3_client_results/{{name}}"
			
 
				+      description: "PPStructV3 图片合成结果"
			
 
				+      enabled: true
			
--- a/ocr_validator/config/B用户_扫描流水.yaml
+++ b/ocr_validator/config/B用户_扫描流水.yaml
@@ -0,0 +1,61 @@
 
				+document:
			
 
				+  name: "B用户_扫描流水"
			
 
				+  base_dir: "/Users/zhch158/workspace/data/流水分析/B用户_扫描流水"
			
 
				+  
			
 
				+  # 🎯 关键改进：定义该文档使用的 OCR 工具及其结果目录
			
 
				+  ocr_results:
			
 
				+    # bank_statement_yusys_v2
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "bank_statement_yusys_v2"
			
 
				+      image_dir: "mineru_vllm_results/{{name}}"
			
 
				+      description: "YUSYS统一OCR框架"
			
 
				+      enabled: true
			
 
				+
			
 
				+    # MinerU
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "mineru_vllm_results"
			
 
				+      image_dir: "mineru_vllm_results/{{name}}"
			
 
				+      description: "MinerU 图片合成结果"
			
 
				+      enabled: true
			
 
				+    
			
 
				+    # MinerU (带 cell bbox)
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "mineru_vllm_results_cell_bbox"
			
 
				+      image_dir: "mineru_vllm_results/{{name}}"
			
 
				+      description: "MinerU + PaddleOCR 坐标"
			
 
				+      enabled: true
			
 
				+   
			
 
				+    # PaddleOCR-VL
			
 
				+    - tool: "paddleocr_vl"
			
 
				+      result_dir: "paddleocr_vl_results"
			
 
				+      image_dir: "paddleocr_vl_results/{{name}}"
			
 
				+      description: "PaddleOCR VLM 图片合成结果"
			
 
				+      enabled: true
			
 
				+    
			
 
				+    # PaddleOCR-VL (带 cell bbox)
			
 
				+    - tool: "mineru"  # 格式同 MinerU
			
 
				+      result_dir: "paddleocr_vl_results_cell_bbox"
			
 
				+      image_dir: "paddleocr_vl_results/{{name}}"
			
 
				+      description: "PaddleOCR VLM + PaddleOCR 坐标"
			
 
				+      enabled: true
			
 
				+    
			
 
				+    # DotsOCR
			
 
				+    - tool: "dots_ocr"
			
 
				+      result_dir: "dotsocr_vllm_results"
			
 
				+      image_dir: "dotsocr_vllm_results/{{name}}"
			
 
				+      description: "Dots OCR 图片合成结果"
			
 
				+      enabled: true
			
 
				+  
			
 
				+    # DotsOCR (带 cell bbox)
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "dotsocr_vllm_results_cell_bbox"
			
 
				+      image_dir: "dotsocr_vllm_results/{{name}}"
			
 
				+      description: "Dots OCR + PaddleOCR 坐标"
			
 
				+      enabled: true
			
 
				+
			
 
				+    # PPStructV3
			
 
				+    - tool: "ppstructv3"
			
 
				+      result_dir: "ppstructurev3_client_results"
			
 
				+      image_dir: "ppstructurev3_client_results/{{name}}"
			
 
				+      description: "PPStructV3 图片合成结果"
			
 
				+      enabled: true
			
--- a/ocr_validator/config/README.md
+++ b/ocr_validator/config/README.md
@@ -0,0 +1,479 @@
 
				+# 📋 配置文件说明
			
 
				+
			
 
				+本目录包含 OCR 验证系统的所有配置文件，用于管理数据源、OCR 工具参数和项目特定配置。
			
 
				+
			
 
				+## 📁 文件结构
			
 
				+
			
 
				+```
			
 
				+config/
			
 
				+├── global.yaml                    # 全局配置（必需）
			
 
				+├── 至远彩色_2023年报.yaml         # 年报类项目配置
			
 
				+├── A用户_单元格扫描流水.yaml      # 流水表格配置
			
 
				+├── B用户_扫描流水.yaml            # 流水表格配置
			
 
				+├── 德_内蒙古银行照.yaml           # 银行流水配置
			
 
				+├── 对公_招商银行图.yaml           # 银行流水配置
			
 
				+└── README.md                      # 配置文档（本文件）
			
 
				+```
			
 
				+
			
 
				+## 🔧 配置文件类型
			
 
				+
			
 
				+### 1. 全局配置 (`global.yaml`)
			
 
				+
			
 
				+**用途**：引用所有项目特定配置文件，作为配置入口。
			
 
				+
			
 
				+**示例**：
			
 
				+
			
 
				+```yaml
			
 
				+data_sources:
			
 
				+  - 德_内蒙古银行照.yaml
			
 
				+  - 对公_招商银行图.yaml
			
 
				+  - A用户_单元格扫描流水.yaml
			
 
				+  - B用户_扫描流水.yaml
			
 
				+  - 至远彩色_2023年报.yaml
			
 
				+```
			
 
				+
			
 
				+**说明**：
			
 
				+- 全局配置文件仅列出需要加载的项目配置文件
			
 
				+- 每个项目的具体配置在单独的 YAML 文件中
			
 
				+- 系统启动时会自动加载所有引用的配置文件
			
 
				+
			
 
				+### 2. 项目特定配置
			
 
				+
			
 
				+**用途**：定义每个项目的文档信息和 OCR 工具结果路径。
			
 
				+
			
 
				+#### 配置结构说明
			
 
				+
			
 
				+```yaml
			
 
				+document:
			
 
				+  name: "项目名称"
			
 
				+  base_dir: "/绝对路径/到/项目根目录"
			
 
				+  
			
 
				+  ocr_results:
			
 
				+    - tool: "工具标识"
			
 
				+      result_dir: "结果目录名"
			
 
				+      image_dir: "图像目录路径（支持模板变量）"
			
 
				+      description: "工具描述"
			
 
				+      enabled: true/false
			
 
				+```
			
 
				+
			
 
				+#### 模板变量
			
 
				+
			
 
				+配置文件支持以下模板变量：
			
 
				+
			
 
				+| 变量 | 说明 | 示例 |
			
 
				+|------|------|------|
			
 
				+| `{{name}}` | 项目名称（来自 `document.name`） | `"B用户_扫描流水"` |
			
 
				+
			
 
				+**示例**：
			
 
				+```yaml
			
 
				+image_dir: "paddleocr_vl_results/{{name}}"
			
 
				+# 实际路径：paddleocr_vl_results/B用户_扫描流水
			
 
				+```
			
 
				+
			
 
				+## 📝 配置参数详解
			
 
				+
			
 
				+### 文档配置 (`document`)
			
 
				+
			
 
				+| 参数 | 类型 | 必需 | 说明 | 示例 |
			
 
				+|------|------|------|------|------|
			
 
				+| `name` | string | ✅ | 项目名称 | `"B用户_扫描流水"` |
			
 
				+| `base_dir` | string | ✅ | 项目根目录（绝对路径） | `"/Users/zhch158/workspace/data/流水分析/B用户_扫描流水"` |
			
 
				+
			
 
				+### OCR 结果配置 (`ocr_results`)
			
 
				+
			
 
				+| 参数 | 类型 | 必需 | 说明 | 示例 |
			
 
				+|------|------|------|------|------|
			
 
				+| `tool` | string | ✅ | OCR 工具标识 | `"ppstructv3"` / `"paddleocr_vl"` / `"mineru"` / `"dots_ocr"` |
			
 
				+| `result_dir` | string | ✅ | OCR 结果目录（相对于 `base_dir`） | `"ppstructurev3_client_results"` |
			
 
				+| `image_dir` | string | ✅ | 图像目录路径（支持模板变量 `{{name}}`） | `"ppstructurev3_client_results/{{name}}"` |
			
 
				+| `description` | string | ✅ | 工具描述 | `"PPStructV3 图片合成结果"` |
			
 
				+| `enabled` | boolean | ✅ | 是否启用该数据源 | `true` / `false` |
			
 
				+
			
 
				+### 支持的 OCR 工具标识
			
 
				+
			
 
				+| 工具标识 | 工具名称 | 适用场景 |
			
 
				+|---------|---------|---------|
			
 
				+| `ppstructv3` | PPStructV3 | 结构化文档、表格 |
			
 
				+| `paddleocr_vl` | PaddleOCR-VL | VLM 文档理解 |
			
 
				+| `mineru` | MinerU | VLM 表格识别 |
			
 
				+| `dots_ocr` | Dots OCR | VLM 专业识别 |
			
 
				+
			
 
				+**说明**：
			
 
				+- `mineru` 工具标识也用于 PaddleOCR-VL、Dots OCR 等工具的合并结果
			
 
				+- 当 `result_dir` 包含 `cell_bbox` 时，表示该结果已与 PaddleOCR 坐标合并
			
 
				+
			
 
				+## 🎯 配置示例
			
 
				+
			
 
				+### 示例 1：流水表格配置 (`B用户_扫描流水.yaml`)
			
 
				+
			
 
				+```yaml
			
 
				+document:
			
 
				+  name: "B用户_扫描流水"
			
 
				+  base_dir: "/Users/zhch158/workspace/data/流水分析/B用户_扫描流水"
			
 
				+  
			
 
				+  ocr_results:
			
 
				+    # PPStructV3
			
 
				+    - tool: "ppstructv3"
			
 
				+      result_dir: "ppstructurev3_client_results"
			
 
				+      image_dir: "ppstructurev3_client_results/{{name}}"
			
 
				+      description: "PPStructV3 图片合成结果"
			
 
				+      enabled: true
			
 
				+    
			
 
				+    # PaddleOCR-VL
			
 
				+    - tool: "paddleocr_vl"
			
 
				+      result_dir: "paddleocr_vl_results"
			
 
				+      image_dir: "paddleocr_vl_results/{{name}}"
			
 
				+      description: "PaddleOCR VLM 图片合成结果"
			
 
				+      enabled: true
			
 
				+    
			
 
				+    # PaddleOCR-VL (带 cell bbox)
			
 
				+    - tool: "mineru"  # 格式同 MinerU
			
 
				+      result_dir: "paddleocr_vl_results_cell_bbox"
			
 
				+      image_dir: "paddleocr_vl_results/{{name}}"
			
 
				+      description: "PaddleOCR VLM + PaddleOCR 坐标"
			
 
				+      enabled: true
			
 
				+    
			
 
				+    # MinerU
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "mineru_vllm_results"
			
 
				+      image_dir: "mineru_vllm_results/{{name}}"
			
 
				+      description: "MinerU 图片合成结果"
			
 
				+      enabled: true
			
 
				+    
			
 
				+    # MinerU (带 cell bbox)
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "mineru_vllm_results_cell_bbox"
			
 
				+      image_dir: "mineru_vllm_results/{{name}}"
			
 
				+      description: "MinerU + PaddleOCR 坐标"
			
 
				+      enabled: true
			
 
				+    
			
 
				+    # DotsOCR
			
 
				+    - tool: "dots_ocr"
			
 
				+      result_dir: "dotsocr_vllm_results"
			
 
				+      image_dir: "dotsocr_vllm_results/{{name}}"
			
 
				+      description: "Dots OCR 图片合成结果"
			
 
				+      enabled: true
			
 
				+  
			
 
				+    # DotsOCR (带 cell bbox)
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "dotsocr_vllm_results_cell_bbox"
			
 
				+      image_dir: "dotsocr_vllm_results/{{name}}"
			
 
				+      description: "Dots OCR + PaddleOCR 坐标"
			
 
				+      enabled: true
			
 
				+```
			
 
				+
			
 
				+**目录结构**：
			
 
				+```
			
 
				+/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/
			
 
				+├── ppstructurev3_client_results/
			
 
				+│   └── B用户_扫描流水/              # {{name}} 替换后
			
 
				+│       └── *.jpg
			
 
				+├── paddleocr_vl_results/
			
 
				+│   └── B用户_扫描流水/
			
 
				+│       └── *.jpg
			
 
				+├── paddleocr_vl_results_cell_bbox/
			
 
				+│   └── *.json
			
 
				+├── mineru_vllm_results/
			
 
				+│   └── B用户_扫描流水/
			
 
				+│       └── *.jpg
			
 
				+├── mineru_vllm_results_cell_bbox/
			
 
				+│   └── *.json
			
 
				+├── dotsocr_vllm_results/
			
 
				+│   └── B用户_扫描流水/
			
 
				+│       └── *.jpg
			
 
				+└── dotsocr_vllm_results_cell_bbox/
			
 
				+    └── *.json
			
 
				+```
			
 
				+
			
 
				+## 🛠️ 配置最佳实践
			
 
				+
			
 
				+### 1. 目录结构规范
			
 
				+
			
 
				+```
			
 
				+项目根目录/
			
 
				+├── data/
			
 
				+│   └── 流水分析/
			
 
				+│       └── B用户_扫描流水/                    # base_dir
			
 
				+│           ├── ppstructurev3_client_results/  # result_dir
			
 
				+│           │   ├── B用户_扫描流水/            # image_dir ({{name}})
			
 
				+│           │   │   ├── page_001.jpg
			
 
				+│           │   │   └── page_002.jpg
			
 
				+│           │   └── *.json
			
 
				+│           ├── paddleocr_vl_results/
			
 
				+│           │   ├── B用户_扫描流水/
			
 
				+│           │   │   └── *.jpg
			
 
				+│           │   └── *.json
			
 
				+│           └── mineru_vllm_results_cell_bbox/
			
 
				+│               └── *.json
			
 
				+└── ocr_validator/
			
 
				+    └── config/
			
 
				+        ├── global.yaml
			
 
				+        └── B用户_扫描流水.yaml
			
 
				+```
			
 
				+
			
 
				+### 2. 路径配置建议
			
 
				+
			
 
				+- **使用绝对路径**：`base_dir` 必须是绝对路径
			
 
				+- **result_dir 使用相对路径**：相对于 `base_dir`
			
 
				+- **image_dir 支持模板变量**：使用 `{{name}}` 引用项目名称
			
 
				+- **路径分隔符**：统一使用 `/`
			
 
				+
			
 
				+### 3. 工具选择建议
			
 
				+
			
 
				+| 场景 | 推荐工具组合 | 配置示例 |
			
 
				+|------|-------------|---------|
			
 
				+| 财务报表 | MinerU + MinerU (cell bbox) | `tool: "mineru"` + `result_dir` 包含 `cell_bbox` |
			
 
				+| 银行流水 | Dots OCR + PaddleOCR-VL | `tool: "dots_ocr"` + `tool: "paddleocr_vl"` |
			
 
				+| 复杂表格 | 多工具交叉验证 | 启用所有 `enabled: true` 的工具 |
			
 
				+
			
 
				+### 4. 模板变量使用
			
 
				+
			
 
				+**支持的位置**：
			
 
				+- `image_dir` 路径中
			
 
				+
			
 
				+**变量来源**：
			
 
				+- `{{name}}`：来自 `document.name`
			
 
				+
			
 
				+**示例**：
			
 
				+```yaml
			
 
				+document:
			
 
				+  name: "B用户_扫描流水"
			
 
				+  
			
 
				+  ocr_results:
			
 
				+    - image_dir: "paddleocr_vl_results/{{name}}"
			
 
				+      # 实际路径：paddleocr_vl_results/B用户_扫描流水
			
 
				+```
			
 
				+
			
 
				+### 5. 合并结果配置
			
 
				+
			
 
				+当使用合并工具（如 `merge_mineru_paddle_ocr.py`）生成带坐标的结果时：
			
 
				+
			
 
				+```yaml
			
 
				+ocr_results:
			
 
				+  # 原始 MinerU 结果
			
 
				+  - tool: "mineru"
			
 
				+    result_dir: "mineru_vllm_results"
			
 
				+    image_dir: "mineru_vllm_results/{{name}}"
			
 
				+    description: "MinerU 图片合成结果"
			
 
				+    enabled: true
			
 
				+  
			
 
				+  # 合并后的结果（MinerU + PaddleOCR 坐标）
			
 
				+  - tool: "mineru"
			
 
				+    result_dir: "mineru_vllm_results_cell_bbox"
			
 
				+    image_dir: "mineru_vllm_results/{{name}}"
			
 
				+    description: "MinerU + PaddleOCR 坐标"
			
 
				+    enabled: true
			
 
				+```
			
 
				+
			
 
				+**说明**：
			
 
				+- 合并后的结果统一使用 `tool: "mineru"`（合并输出采用 MinerU 的 JSON 格式）
			
 
				+- `result_dir` 使用不同的目录名（通常包含 `cell_bbox`）
			
 
				+- `image_dir` 可以复用原始工具的图像路径
			
 
				+
			
 
				+## 🔍 添加新项目配置
			
 
				+
			
 
				+### 步骤 1：创建文档配置文件
			
 
				+
			
 
				+在 `config/` 目录下创建新的 YAML 文件，如 `新文档名.yaml`：
			
 
				+
			
 
				+```yaml
			
 
				+document:
			
 
				+  name: "新文档名"
			
 
				+  base_dir: "/绝对路径/到/新项目"
			
 
				+  
			
 
				+  ocr_results:
			
 
				+    - tool: "ppstructv3"
			
 
				+      result_dir: "ppstructv3_results"
			
 
				+      image_dir: "ppstructv3_results/{{name}}"
			
 
				+      description: "PPStructV3 识别结果"
			
 
				+      enabled: true
			
 
				+    
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "mineru_results"
			
 
				+      image_dir: "mineru_results/{{name}}"
			
 
				+      description: "MinerU 识别结果"
			
 
				+      enabled: true
			
 
				+```
			
 
				+
			
 
				+### 步骤 2：在 global.yaml 中引用
			
 
				+
			
 
				+编辑 `global.yaml`，添加新配置文件：
			
 
				+
			
 
				+```yaml
			
 
				+data_sources:
			
 
				+  - 德_内蒙古银行照.yaml
			
 
				+  - 对公_招商银行图.yaml
			
 
				+  - A用户_单元格扫描流水.yaml
			
 
				+  - B用户_扫描流水.yaml
			
 
				+  - 至远彩色_2023年报.yaml
			
 
				+  - 新文档名.yaml          # 新增
			
 
				+```
			
 
				+
			
 
				+### 步骤 3：验证配置
			
 
				+
			
 
				+```bash
			
 
				+# 验证 YAML 语法
			
 
				+python -c "import yaml; yaml.safe_load(open('config/新文档名.yaml', encoding='utf-8'))"
			
 
				+
			
 
				+# 启动应用测试
			
 
				+python -m streamlit run streamlit_ocr_validator.py
			
 
				+```
			
 
				+
			
 
				+## 🔧 调试配置
			
 
				+
			
 
				+### 验证配置文件语法
			
 
				+
			
 
				+```bash
			
 
				+# 验证单个配置文件
			
 
				+python -c "import yaml; yaml.safe_load(open('config/B用户_扫描流水.yaml', encoding='utf-8'))"
			
 
				+
			
 
				+# 验证全局配置
			
 
				+python -c "import yaml; yaml.safe_load(open('config/global.yaml', encoding='utf-8'))"
			
 
				+
			
 
				+# 查看解析后的配置
			
 
				+python -c "
			
 
				+import yaml
			
 
				+with open('config/B用户_扫描流水.yaml', encoding='utf-8') as f:
			
 
				+    config = yaml.safe_load(f)
			
 
				+    print(yaml.dump(config, default_flow_style=False, allow_unicode=True))
			
 
				+"
			
 
				+```
			
 
				+
			
 
				+### 检查路径有效性
			
 
				+
			
 
				+```bash
			
 
				+# macOS/Linux
			
 
				+ls -la /Users/zhch158/workspace/data/流水分析/B用户_扫描流水
			
 
				+
			
 
				+# 检查结果目录
			
 
				+ls -la /Users/zhch158/workspace/data/流水分析/B用户_扫描流水/ppstructurev3_client_results
			
 
				+
			
 
				+# 检查图像目录（替换模板变量后）
			
 
				+ls -la /Users/zhch158/workspace/data/流水分析/B用户_扫描流水/ppstructurev3_client_results/B用户_扫描流水
			
 
				+```
			
 
				+
			
 
				+## 🐛 常见问题
			
 
				+
			
 
				+### Q1: 配置文件不生效？
			
 
				+
			
 
				+**A:** 
			
 
				+1. 检查 `global.yaml` 中是否正确引用了项目配置文件
			
 
				+2. 确认 YAML 语法正确（缩进必须用空格，不能用 Tab）
			
 
				+3. 确保配置文件在 `config/` 目录下
			
 
				+4. 重启 Streamlit 应用
			
 
				+
			
 
				+### Q2: 找不到 OCR 输出文件？
			
 
				+
			
 
				+**A:** 
			
 
				+- 检查 `base_dir` 路径是否正确（必须是绝对路径）
			
 
				+- 检查 `result_dir` 是否存在于 `base_dir` 下
			
 
				+- 确认 `image_dir` 的模板变量 `{{name}}` 是否正确替换
			
 
				+- 确保目录存在且包含 JSON 和图像文件
			
 
				+
			
 
				+### Q3: 模板变量未替换？
			
 
				+
			
 
				+**A:** 
			
 
				+- 确认使用了正确的模板变量语法：`{{name}}`
			
 
				+- 检查 `document.name` 是否已定义
			
 
				+- 查看 Streamlit 控制台的路径解析日志
			
 
				+
			
 
				+### Q4: 合并结果无法加载？
			
 
				+
			
 
				+**A:** 
			
 
				+- 确认合并结果的 `result_dir` 存在
			
 
				+- 检查 `tool` 标识是否正确（合并结果统一使用 `mineru`，而不是原始工具的标识）
			
 
				+- 确保 `enabled: true`
			
 
				+
			
 
				+### Q5: enabled 参数的作用？
			
 
				+
			
 
				+**A:** 
			
 
				+- `enabled: true`：该数据源会显示在 Streamlit 数据源列表中
			
 
				+- `enabled: false`：该数据源不会加载，但配置保留（方便临时禁用）
			
 
				+
			
 
				+## 📚 参考资料
			
 
				+
			
 
				+- [主项目 README](../README.md) - 系统总览
			
 
				+- [Comparator 模块文档](../comparator/README.md) - 对比算法详解
			
 
				+- [Merger 模块文档](../merger/README.md) - OCR 结果合并
			
 
				+- [Batch OCR 模块文档](../batch_ocr/README.md) - 批量处理工具
			
 
				+
			
 
				+## 📋 配置文件模板
			
 
				+
			
 
				+### 基础模板
			
 
				+
			
 
				+```yaml
			
 
				+document:
			
 
				+  name: "项目名称"
			
 
				+  base_dir: "/绝对路径/到/项目根目录"
			
 
				+  
			
 
				+  ocr_results:
			
 
				+    - tool: "工具标识"
			
 
				+      result_dir: "结果目录名"
			
 
				+      image_dir: "图像目录路径"
			
 
				+      description: "工具描述"
			
 
				+      enabled: true
			
 
				+```
			
 
				+
			
 
				+### 多工具模板
			
 
				+
			
 
				+```yaml
			
 
				+document:
			
 
				+  name: "项目名称"
			
 
				+  base_dir: "/绝对路径/到/项目根目录"
			
 
				+  
			
 
				+  ocr_results:
			
 
				+    # PPStructV3
			
 
				+    - tool: "ppstructv3"
			
 
				+      result_dir: "ppstructv3_results"
			
 
				+      image_dir: "ppstructv3_results/{{name}}"
			
 
				+      description: "PPStructV3 识别结果"
			
 
				+      enabled: true
			
 
				+    
			
 
				+    # PaddleOCR-VL
			
 
				+    - tool: "paddleocr_vl"
			
 
				+      result_dir: "paddleocr_vl_results"
			
 
				+      image_dir: "paddleocr_vl_results/{{name}}"
			
 
				+      description: "PaddleOCR-VL 识别结果"
			
 
				+      enabled: true
			
 
				+    
			
 
				+    # MinerU
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "mineru_results"
			
 
				+      image_dir: "mineru_results/{{name}}"
			
 
				+      description: "MinerU 识别结果"
			
 
				+      enabled: true
			
 
				+    
			
 
				+    # Dots OCR
			
 
				+    - tool: "dots_ocr"
			
 
				+      result_dir: "dotsocr_results"
			
 
				+      image_dir: "dotsocr_results/{{name}}"
			
 
				+      description: "Dots OCR 识别结果"
			
 
				+      enabled: true
			
 
				+```
			
 
				+
			
 
				+### 包含合并结果的模板
			
 
				+
			
 
				+```yaml
			
 
				+document:
			
 
				+  name: "项目名称"
			
 
				+  base_dir: "/绝对路径/到/项目根目录"
			
 
				+  
			
 
				+  ocr_results:
			
 
				+    # 原始结果
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "mineru_results"
			
 
				+      image_dir: "mineru_results/{{name}}"
			
 
				+      description: "MinerU 原始结果"
			
 
				+      enabled: true
			
 
				+    
			
 
				+    # 合并结果（带 cell bbox）
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "mineru_results_cell_bbox"
			
 
				+      image_dir: "mineru_results/{{name}}"
			
 
				+      description: "MinerU + PaddleOCR 坐标"
			
 
				+      enabled: true
			
 
				+```
			
 
				+
			
 
				+---
			
 
				+
			
 
				+**最后更新**: 2025年11月7日
			
--- a/ocr_validator/config/global.yaml
+++ b/ocr_validator/config/global.yaml
@@ -0,0 +1,156 @@
 
				+# OCR验证工具配置文件
			
 
				+
			
 
				+# 样式配置
			
 
				+styles:
			
 
				+  font_size: 8
			
 
				+  
			
 
				+  colors:
			
 
				+    primary: "#0288d1"
			
 
				+    secondary: "#ff9800"
			
 
				+    success: "#4caf50"
			
 
				+    error: "#f44336"
			
 
				+    warning: "#ff9800"
			
 
				+    background: "#fafafa"
			
 
				+    text: "#333333"
			
 
				+  
			
 
				+  layout:
			
 
				+    default_zoom: 1.0
			
 
				+    default_height: 800
			
 
				+    sidebar_width: 1
			
 
				+    content_width: 0.65
			
 
				+
			
 
				+# 界面配置
			
 
				+ui:
			
 
				+  page_title: "OCR可视化校验工具"
			
 
				+  page_icon: "🔍"
			
 
				+  layout: "wide"
			
 
				+  sidebar_state: "expanded"
			
 
				+  
			
 
				+# OCR数据配置
			
 
				+ocr:
			
 
				+  min_text_length: 2
			
 
				+  default_confidence: 1.0
			
 
				+  exclude_texts: ["Picture", ""]
			
 
				+  
			
 
				+  # 图片方向检测配置
			
 
				+  orientation_detection:
			
 
				+    enabled: true
			
 
				+    confidence_threshold: 0.3  # 置信度阈值
			
 
				+    methods: ["opencv_analysis"]  # 检测方法
			
 
				+    cache_results: true  # 缓存检测结果
			
 
				+  
			
 
				+  # OCR工具类型配置
			
 
				+  tools:
			
 
				+    dots_ocr:
			
 
				+      name: "Dots OCR"
			
 
				+      description: "专业VLM OCR"
			
 
				+      json_structure: "array"  # JSON为数组格式
			
 
				+      text_field: "text"
			
 
				+      bbox_field: "bbox"
			
 
				+      category_field: "category"
			
 
				+      confidence_field: "confidence"
			
 
				+      # 旋转处理配置
			
 
				+      rotation:
			
 
				+        coordinates_are_pre_rotated: false  # 坐标不是预旋转的
			
 
				+        
			
 
				+    ppstructv3:
			
 
				+      name: "PPStructV3"
			
 
				+      description: "PaddleOCR PP-StructureV3"
			
 
				+      json_structure: "object"  # JSON为对象格式
			
 
				+      parsing_results_field: "parsing_res_list"
			
 
				+      text_field: "block_content"
			
 
				+      bbox_field: "block_bbox"
			
 
				+      rec_texts_field: "overall_ocr_res.rec_texts" # 针对表格中的文字块
			
 
				+      rec_boxes_field: "overall_ocr_res.rec_boxes" # 针对表格中的文字块
			
 
				+      category_field: "block_label"
			
 
				+      confidence_field: "confidence"
			
 
				+      # 旋转处理配置
			
 
				+      rotation:
			
 
				+        coordinates_are_pre_rotated: true  # 坐标已经是预旋转的
			
 
				+      
			
 
				+    table_recognition_v2:
			
 
				+      name: "TableRecognitionV2"
			
 
				+      description: "PaddleOCR Table Recognition V2"
			
 
				+      json_structure: "object"
			
 
				+      parsing_results_field: "table_res_list"
			
 
				+      text_field: "pred_html"
			
 
				+      bbox_field: "cell_box_list"            # 原先的 cell_box_listox 为笔误
			
 
				+      rec_texts_field: "table_ocr_pred.rec_texts" # 针对表格中的文字块
			
 
				+      rec_boxes_field: "table_ocr_pred.rec_boxes" # 针对表格中的文字块
			
 
				+      category_field: "type"
			
 
				+      confidence_field: "confidence"
			
 
				+      rotation:
			
 
				+        coordinates_are_pre_rotated: true
			
 
				+    
			
 
				+    mineru:
			
 
				+      name: "MinerU"
			
 
				+      description: "MinerU OCR"
			
 
				+      json_structure: "array"  # JSON为数组格式
			
 
				+      text_field: "text"
			
 
				+      bbox_field: "bbox"
			
 
				+      category_field: "type"
			
 
				+      confidence_field: "confidence"
			
 
				+      # 表格相关字段
			
 
				+      table_body_field: "table_body"
			
 
				+      table_cells_field: "table_cells"
			
 
				+      img_path_field: "img_path"
			
 
				+      # 旋转处理配置
			
 
				+      rotation:
			
 
				+        coordinates_are_pre_rotated: false
			
 
				+  
			
 
				+  # 自动检测工具类型的规则（注意：下方各规则注释中的优先级说明与 priority 数值 4/2/1/3 不一致，实际匹配顺序以代码对 priority 的处理为准，待核对）
			
 
				+  auto_detection:
			
 
				+    enabled: true
			
 
				+    rules:
			
 
				+      # Table Recognition V2 - 最高优先级
			
 
				+      - tool_type: "table_recognition_v2"
			
 
				+        conditions:
			
 
				+          - type: "field_exists"
			
 
				+            field: "table_res_list"
			
 
				+          - type: "field_not_exists"
			
 
				+            field: "parsing_res_list"
			
 
				+        priority: 4
			
 
				+      
			
 
				+      # PPStructV3 - 第二优先级
			
 
				+      - tool_type: "ppstructv3"
			
 
				+        conditions:
			
 
				+          - type: "field_exists"
			
 
				+            field: "parsing_res_list"
			
 
				+          - type: "field_exists"
			
 
				+            field: "doc_preprocessor_res"
			
 
				+        priority: 2
			
 
				+      
			
 
				+      # MinerU - 第三优先级
			
 
				+      - tool_type: "mineru"
			
 
				+        conditions:
			
 
				+          - type: "field_exists"
			
 
				+            field: "page_idx"
			
 
				+          - type: "field_exists"
			
 
				+            field: "type"
			
 
				+          - type: "json_structure"
			
 
				+            structure: "array"
			
 
				+        priority: 1
			
 
				+      
			
 
				+      # Dots OCR - 最低优先级（默认）
			
 
				+      - tool_type: "dots_ocr"
			
 
				+        conditions:
			
 
				+          - type: "json_structure"
			
 
				+            structure: "array"
			
 
				+          - type: "field_exists"
			
 
				+            field: "category"
			
 
				+        priority: 3
			
 
				+
			
 
				+# 预校验结果文件路径
			
 
				+pre_validation:
			
 
				+  out_dir: "./output/pre_validation/"
			
 
				+
			
 
				+data_sources:
			
 
				+  - 德_内蒙古银行照.yaml
			
 
				+  - 对公_招商银行图.yaml
			
 
				+  - A用户_单元格扫描流水.yaml
			
 
				+  - B用户_扫描流水.yaml
			
 
				+  - 康强_北京农村商业银行.yaml
			
 
				+  - 施博深.yaml
			
 
				+  - 山西云集科技有限公司.yaml
			
 
				+  - 至远彩色_2023年报.yaml
			
 
				+
			
--- a/ocr_validator/config/对公_招商银行图.yaml
+++ b/ocr_validator/config/对公_招商银行图.yaml
@@ -0,0 +1,61 @@
 
				+document:
			
 
				+  name: "对公_招商银行图"
			
 
				+  base_dir: "/Users/zhch158/workspace/data/流水分析/对公_招商银行图"
			
 
				+  
			
 
				+  # 🎯 关键改进：定义该文档使用的 OCR 工具及其结果目录
			
 
				+  ocr_results:
			
 
				+    # bank_statement_yusys_v2
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "bank_statement_yusys_v2"
			
 
				+      image_dir: "mineru_vllm_results/{{name}}"
			
 
				+      description: "YUSYS统一OCR框架"
			
 
				+      enabled: true
			
 
				+
			
 
				+     # MinerU
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "mineru_vllm_results"
			
 
				+      image_dir: "mineru_vllm_results/{{name}}"
			
 
				+      description: "MinerU 图片合成结果"
			
 
				+      enabled: true
			
 
				+    
			
 
				+    # MinerU (带 cell bbox)
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "mineru_vllm_results_cell_bbox"
			
 
				+      image_dir: "mineru_vllm_results/{{name}}"
			
 
				+      description: "MinerU + PaddleOCR 坐标"
			
 
				+      enabled: true
			
 
				+   
			
 
				+    # PaddleOCR-VL
			
 
				+    - tool: "paddleocr_vl"
			
 
				+      result_dir: "paddleocr_vl_results"
			
 
				+      image_dir: "paddleocr_vl_results/{{name}}"
			
 
				+      description: "PaddleOCR VLM 图片合成结果"
			
 
				+      enabled: true
			
 
				+    
			
 
				+    # PaddleOCR-VL (带 cell bbox)
			
 
				+    - tool: "mineru"  # 格式同 MinerU
			
 
				+      result_dir: "paddleocr_vl_results_cell_bbox"
			
 
				+      image_dir: "paddleocr_vl_results/{{name}}"
			
 
				+      description: "PaddleOCR VLM + PaddleOCR 坐标"
			
 
				+      enabled: true
			
 
				+    
			
 
				+    # DotsOCR
			
 
				+    - tool: "dots_ocr"
			
 
				+      result_dir: "dotsocr_vllm_results"
			
 
				+      image_dir: "dotsocr_vllm_results/{{name}}"
			
 
				+      description: "Dots OCR 图片合成结果"
			
 
				+      enabled: true
			
 
				+  
			
 
				+    # DotsOCR (带 cell bbox)
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "dotsocr_vllm_results_cell_bbox"
			
 
				+      image_dir: "dotsocr_vllm_results/{{name}}"
			
 
				+      description: "Dots OCR + PaddleOCR 坐标"
			
 
				+      enabled: true
			
 
				+
			
 
				+    # PPStructV3
			
 
				+    - tool: "ppstructv3"
			
 
				+      result_dir: "ppstructurev3_client_results"
			
 
				+      image_dir: "ppstructurev3_client_results/{{name}}"
			
 
				+      description: "PPStructV3 图片合成结果"
			
 
				+      enabled: true
			
--- a/ocr_validator/config/山西云集科技有限公司.yaml
+++ b/ocr_validator/config/山西云集科技有限公司.yaml
@@ -0,0 +1,61 @@
 
				+document:
			
 
				+  name: "山西云集科技有限公司"
			
 
				+  base_dir: "/Users/zhch158/workspace/data/流水分析/山西云集科技有限公司"
			
 
				+  
			
 
				+  # 🎯 关键改进：定义该文档使用的 OCR 工具及其结果目录
			
 
				+  ocr_results:
			
 
				+    # bank_statement_yusys_v2
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "bank_statement_yusys_v2"
			
 
				+      image_dir: "mineru_vllm_results/{{name}}"
			
 
				+      description: "YUSYS统一OCR框架"
			
 
				+      enabled: true
			
 
				+
			
 
				+     # MinerU
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "mineru_vllm_results"
			
 
				+      image_dir: "mineru_vllm_results/{{name}}"
			
 
				+      description: "MinerU 图片合成结果"
			
 
				+      enabled: true
			
 
				+    
			
 
				+    # MinerU (带 cell bbox)
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "mineru_vllm_results_cell_bbox"
			
 
				+      image_dir: "mineru_vllm_results/{{name}}"
			
 
				+      description: "MinerU + PaddleOCR 坐标"
			
 
				+      enabled: true
			
 
				+   
			
 
				+    # PaddleOCR-VL
			
 
				+    - tool: "paddleocr_vl"
			
 
				+      result_dir: "paddleocr_vl_results"
			
 
				+      image_dir: "paddleocr_vl_results/{{name}}"
			
 
				+      description: "PaddleOCR VLM 图片合成结果"
			
 
				+      enabled: true
			
 
				+    
			
 
				+    # PaddleOCR-VL (带 cell bbox)
			
 
				+    - tool: "mineru"  # 格式同 MinerU
			
 
				+      result_dir: "paddleocr_vl_results_cell_bbox"
			
 
				+      image_dir: "paddleocr_vl_results/{{name}}"
			
 
				+      description: "PaddleOCR VLM + PaddleOCR 坐标"
			
 
				+      enabled: true
			
 
				+    
			
 
				+    # DotsOCR
			
 
				+    - tool: "dots_ocr"
			
 
				+      result_dir: "dotsocr_vllm_results"
			
 
				+      image_dir: "dotsocr_vllm_results/{{name}}"
			
 
				+      description: "Dots OCR 图片合成结果"
			
 
				+      enabled: true
			
 
				+  
			
 
				+    # DotsOCR (带 cell bbox)
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "dotsocr_vllm_results_cell_bbox"
			
 
				+      image_dir: "dotsocr_vllm_results/{{name}}"
			
 
				+      description: "Dots OCR + PaddleOCR 坐标"
			
 
				+      enabled: true
			
 
				+
			
 
				+    # PPStructV3
			
 
				+    - tool: "ppstructv3"
			
 
				+      result_dir: "ppstructurev3_client_results"
			
 
				+      image_dir: "ppstructurev3_client_results/{{name}}"
			
 
				+      description: "PPStructV3 图片合成结果"
			
 
				+      enabled: true
			
--- a/ocr_validator/config/康强_北京农村商业银行.yaml
+++ b/ocr_validator/config/康强_北京农村商业银行.yaml
@@ -0,0 +1,62 @@
 
				+# 文档: 康强_北京农村商业银行
			
 
				+document:
			
 
				+  name: "康强_北京农村商业银行"
			
 
				+  base_dir: "/Users/zhch158/workspace/data/流水分析/康强_北京农村商业银行"
			
 
				+  
			
 
				+  # 🎯 关键改进：定义该文档使用的 OCR 工具及其结果目录
			
 
				+  ocr_results:
			
 
				+    # bank_statement_yusys_v2
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "bank_statement_yusys_v2"
			
 
				+      image_dir: "mineru_vllm_results/{{name}}"
			
 
				+      description: "YUSYS统一OCR框架"
			
 
				+      enabled: true
			
 
				+
			
 
				+     # MinerU
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "mineru_vllm_results"
			
 
				+      image_dir: "mineru_vllm_results/{{name}}"
			
 
				+      description: "MinerU 图片合成结果"
			
 
				+      enabled: true
			
 
				+    
			
 
				+    # MinerU (带 cell bbox)
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "mineru_vllm_results_cell_bbox"
			
 
				+      image_dir: "mineru_vllm_results/{{name}}"
			
 
				+      description: "MinerU + PaddleOCR 坐标"
			
 
				+      enabled: true
			
 
				+   
			
 
				+    # PaddleOCR-VL
			
 
				+    - tool: "paddleocr_vl"
			
 
				+      result_dir: "paddleocr_vl_results"
			
 
				+      image_dir: "paddleocr_vl_results/{{name}}"
			
 
				+      description: "PaddleOCR VLM 图片合成结果"
			
 
				+      enabled: true
			
 
				+    
			
 
				+    # PaddleOCR-VL (带 cell bbox)
			
 
				+    - tool: "mineru"  # 格式同 MinerU
			
 
				+      result_dir: "paddleocr_vl_results_cell_bbox"
			
 
				+      image_dir: "paddleocr_vl_results/{{name}}"
			
 
				+      description: "PaddleOCR VLM + PaddleOCR 坐标"
			
 
				+      enabled: true
			
 
				+    
			
 
				+    # DotsOCR
			
 
				+    - tool: "dots_ocr"
			
 
				+      result_dir: "dotsocr_vllm_results"
			
 
				+      image_dir: "dotsocr_vllm_results/{{name}}"
			
 
				+      description: "Dots OCR 图片合成结果"
			
 
				+      enabled: true
			
 
				+  
			
 
				+    # DotsOCR (带 cell bbox)
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "dotsocr_vllm_results_cell_bbox"
			
 
				+      image_dir: "dotsocr_vllm_results/{{name}}"
			
 
				+      description: "Dots OCR + PaddleOCR 坐标"
			
 
				+      enabled: true
			
 
				+
			
 
				+    # PPStructV3
			
 
				+    - tool: "ppstructv3"
			
 
				+      result_dir: "ppstructurev3_client_results"
			
 
				+      image_dir: "ppstructurev3_client_results/{{name}}"
			
 
				+      description: "PPStructV3 图片合成结果"
			
 
				+      enabled: true
			
--- a/ocr_validator/config/德_内蒙古银行照.yaml
+++ b/ocr_validator/config/德_内蒙古银行照.yaml
@@ -0,0 +1,62 @@
 
				+# 文档: 德_内蒙古银行照
			
 
				+document:
			
 
				+  name: "德_内蒙古银行照"
			
 
				+  base_dir: "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照"
			
 
				+  
			
 
				+  # 🎯 关键改进：定义该文档使用的 OCR 工具及其结果目录
			
 
				+  ocr_results:
			
 
				+    # bank_statement_yusys_v2
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "bank_statement_yusys_v2"
			
 
				+      image_dir: "mineru_vllm_results/{{name}}"
			
 
				+      description: "YUSYS统一OCR框架"
			
 
				+      enabled: true
			
 
				+
			
 
				+     # MinerU
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "mineru_vllm_results"
			
 
				+      image_dir: "mineru_vllm_results/{{name}}"
			
 
				+      description: "MinerU 图片合成结果"
			
 
				+      enabled: true
			
 
				+    
			
 
				+    # MinerU (带 cell bbox)
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "mineru_vllm_results_cell_bbox"
			
 
				+      image_dir: "mineru_vllm_results/{{name}}"
			
 
				+      description: "MinerU + PaddleOCR 坐标"
			
 
				+      enabled: true
			
 
				+   
			
 
				+    # PaddleOCR-VL
			
 
				+    - tool: "paddleocr_vl"
			
 
				+      result_dir: "paddleocr_vl_results"
			
 
				+      image_dir: "paddleocr_vl_results/{{name}}"
			
 
				+      description: "PaddleOCR VLM 图片合成结果"
			
 
				+      enabled: true
			
 
				+    
			
 
				+    # PaddleOCR-VL (带 cell bbox)
			
 
				+    - tool: "mineru"  # 格式同 MinerU
			
 
				+      result_dir: "paddleocr_vl_results_cell_bbox"
			
 
				+      image_dir: "paddleocr_vl_results/{{name}}"
			
 
				+      description: "PaddleOCR VLM + PaddleOCR 坐标"
			
 
				+      enabled: true
			
 
				+    
			
 
				+    # DotsOCR
			
 
				+    - tool: "dots_ocr"
			
 
				+      result_dir: "dotsocr_vllm_results"
			
 
				+      image_dir: "dotsocr_vllm_results/{{name}}"
			
 
				+      description: "Dots OCR 图片合成结果"
			
 
				+      enabled: true
			
 
				+  
			
 
				+    # DotsOCR (带 cell bbox)
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "dotsocr_vllm_results_cell_bbox"
			
 
				+      image_dir: "dotsocr_vllm_results/{{name}}"
			
 
				+      description: "Dots OCR + PaddleOCR 坐标"
			
 
				+      enabled: true
			
 
				+
			
 
				+    # PPStructV3
			
 
				+    - tool: "ppstructv3"
			
 
				+      result_dir: "ppstructurev3_client_results"
			
 
				+      image_dir: "ppstructurev3_client_results/{{name}}"
			
 
				+      description: "PPStructV3 图片合成结果"
			
 
				+      enabled: true
			
--- a/ocr_validator/config/施博深.yaml
+++ b/ocr_validator/config/施博深.yaml
@@ -0,0 +1,68 @@
 
				+document:
			
 
				+  name: "施博深"
			
 
				+  base_dir: "/Users/zhch158/workspace/data/流水分析/施博深"
			
 
				+  
			
 
				+  # 🎯 关键改进：定义该文档使用的 OCR 工具及其结果目录
			
 
				+  ocr_results:
			
 
				+    # bank_statement_wired_unet
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "bank_statement_wired_unet"
			
 
				+      image_dir: "mineru_vllm_results/{{name}}"
			
 
				+      description: "YUSYS统一OCR框架, 数据标注后"
			
 
				+      enabled: true
			
 
				+
			
 
				+    # bank_statement_yusys_v2
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "bank_statement_yusys_v2"
			
 
				+      image_dir: "mineru_vllm_results/{{name}}"
			
 
				+      description: "YUSYS统一OCR框架"
			
 
				+      enabled: true
			
 
				+
			
 
				+     # MinerU
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "mineru_vllm_results"
			
 
				+      image_dir: "mineru_vllm_results/{{name}}"
			
 
				+      description: "MinerU 图片合成结果"
			
 
				+      enabled: true
			
 
				+    
			
 
				+    # MinerU (带 cell bbox)
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "mineru_vllm_results_cell_bbox"
			
 
				+      image_dir: "mineru_vllm_results/{{name}}"
			
 
				+      description: "MinerU + PaddleOCR 坐标"
			
 
				+      enabled: true
			
 
				+   
			
 
				+    # PaddleOCR-VL
			
 
				+    - tool: "paddleocr_vl"
			
 
				+      result_dir: "paddleocr_vl_results"
			
 
				+      image_dir: "paddleocr_vl_results/{{name}}"
			
 
				+      description: "PaddleOCR VLM 图片合成结果"
			
 
				+      enabled: true
			
 
				+    
			
 
				+    # PaddleOCR-VL (带 cell bbox)
			
 
				+    - tool: "mineru"  # 格式同 MinerU
			
 
				+      result_dir: "paddleocr_vl_results_cell_bbox"
			
 
				+      image_dir: "paddleocr_vl_results/{{name}}"
			
 
				+      description: "PaddleOCR VLM + PaddleOCR 坐标"
			
 
				+      enabled: true
			
 
				+        
			
 
				+    # DotsOCR
			
 
				+    - tool: "dots_ocr"
			
 
				+      result_dir: "dotsocr_vllm_results"
			
 
				+      image_dir: "dotsocr_vllm_results/{{name}}"
			
 
				+      description: "Dots OCR 图片合成结果"
			
 
				+      enabled: true
			
 
				+  
			
 
				+    # DotsOCR (带 cell bbox)
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "dotsocr_vllm_results_cell_bbox"
			
 
				+      image_dir: "dotsocr_vllm_results/{{name}}"
			
 
				+      description: "Dots OCR + PaddleOCR 坐标"
			
 
				+      enabled: true
			
 
				+
			
 
				+    # PPStructV3
			
 
				+    - tool: "ppstructv3"
			
 
				+      result_dir: "ppstructurev3_client_results"
			
 
				+      image_dir: "ppstructurev3_client_results/{{name}}"
			
 
				+      description: "PPStructV3 图片合成结果"
			
 
				+      enabled: true
			
--- a/ocr_validator/config/至远彩色_2023年报.yaml
+++ b/ocr_validator/config/至远彩色_2023年报.yaml
@@ -0,0 +1,61 @@
 
				+document:
			
 
				+  name: "2023年度报告母公司"
			
 
				+  base_dir: "/Users/zhch158/workspace/data/流水分析/2023年度报告母公司"
			
 
				+  
			
 
				+  # 🎯 关键改进：定义该文档使用的 OCR 工具及其结果目录
			
 
				+  ocr_results:
			
 
				+    # bank_statement_yusys_v2
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "bank_statement_yusys_v2"
			
 
				+      image_dir: "mineru_vllm_results/{{name}}"
			
 
				+      description: "YUSYS统一OCR框架"
			
 
				+      enabled: true
			
 
				+
			
 
				+     # MinerU
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "mineru_vllm_results"
			
 
				+      image_dir: "mineru_vllm_results/{{name}}"
			
 
				+      description: "MinerU 图片合成结果"
			
 
				+      enabled: true
			
 
				+    
			
 
				+    # MinerU (带 cell bbox)
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "mineru_vllm_results_cell_bbox"
			
 
				+      image_dir: "mineru_vllm_results/{{name}}"
			
 
				+      description: "MinerU + PaddleOCR 坐标"
			
 
				+      enabled: true
			
 
				+   
			
 
				+    # PaddleOCR-VL
			
 
				+    - tool: "paddleocr_vl"
			
 
				+      result_dir: "paddleocr_vl_results"
			
 
				+      image_dir: "paddleocr_vl_results/{{name}}"
			
 
				+      description: "PaddleOCR VLM 图片合成结果"
			
 
				+      enabled: true
			
 
				+    
			
 
				+    # PaddleOCR-VL (带 cell bbox)
			
 
				+    - tool: "mineru"  # 格式同 MinerU
			
 
				+      result_dir: "paddleocr_vl_results_cell_bbox"
			
 
				+      image_dir: "paddleocr_vl_results/{{name}}"
			
 
				+      description: "PaddleOCR VLM + PaddleOCR 坐标"
			
 
				+      enabled: true
			
 
				+    
			
 
				+    # DotsOCR
			
 
				+    - tool: "dots_ocr"
			
 
				+      result_dir: "dotsocr_vllm_results"
			
 
				+      image_dir: "dotsocr_vllm_results/{{name}}"
			
 
				+      description: "Dots OCR 图片合成结果"
			
 
				+      enabled: true
			
 
				+  
			
 
				+    # DotsOCR (带 cell bbox)
			
 
				+    - tool: "mineru"
			
 
				+      result_dir: "dotsocr_vllm_results_cell_bbox"
			
 
				+      image_dir: "dotsocr_vllm_results/{{name}}"
			
 
				+      description: "Dots OCR + PaddleOCR 坐标"
			
 
				+      enabled: true
			
 
				+
			
 
				+    # PPStructV3
			
 
				+    - tool: "ppstructv3"
			
 
				+      result_dir: "ppstructurev3_client_results"
			
 
				+      image_dir: "ppstructurev3_client_results/{{name}}"
			
 
				+      description: "PPStructV3 图片合成结果"
			
 
				+      enabled: true
			
--- a/ocr_validator/config_manager.py
+++ b/ocr_validator/config_manager.py
@@ -0,0 +1,355 @@
 
				+"""
			
 
				+配置管理器
			
 
				+支持分层配置和自动发现数据源
			
 
				+支持 Jinja2 模板变量
			
 
				+"""
			
 
				+
			
 
				+import yaml
			
 
				+from pathlib import Path
			
 
				+from typing import Dict, List, Optional, Any
			
 
				+from dataclasses import dataclass, field
			
 
				+import logging
			
 
				+from jinja2 import Template  # 🎯 新增
			
 
				+
			
 
				+
			
 
				+@dataclass
			
 
				+class OCRToolConfig:
			
 
				+    """OCR 工具配置"""
			
 
				+    name: str
			
 
				+    description: str
			
 
				+    json_structure: str
			
 
				+    text_field: str
			
 
				+    bbox_field: str
			
 
				+    category_field: str
			
 
				+    confidence_field: str = "confidence"
			
 
				+    parsing_results_field: Optional[str] = None
			
 
				+    rec_texts_field: Optional[str] = None
			
 
				+    rec_boxes_field: Optional[str] = None
			
 
				+    table_body_field: Optional[str] = None
			
 
				+    table_cells_field: Optional[str] = None
			
 
				+    img_path_field: Optional[str] = None
			
 
				+    rotation: Dict[str, Any] = field(default_factory=dict)
			
 
				+    
			
 
				+    @classmethod
			
 
				+    def from_dict(cls, tool_id: str, data: Dict) -> 'OCRToolConfig':
			
 
				+        """从字典创建"""
			
 
				+        return cls(
			
 
				+            name=data.get('name', tool_id),
			
 
				+            description=data.get('description', ''),
			
 
				+            json_structure=data.get('json_structure', 'object'),
			
 
				+            text_field=data.get('text_field', 'text'),
			
 
				+            bbox_field=data.get('bbox_field', 'bbox'),
			
 
				+            category_field=data.get('category_field', 'category'),
			
 
				+            confidence_field=data.get('confidence_field', 'confidence'),
			
 
				+            parsing_results_field=data.get('parsing_results_field'),
			
 
				+            rec_texts_field=data.get('rec_texts_field'),
			
 
				+            rec_boxes_field=data.get('rec_boxes_field'),
			
 
				+            table_body_field=data.get('table_body_field'),
			
 
				+            table_cells_field=data.get('table_cells_field'),
			
 
				+            img_path_field=data.get('img_path_field'),
			
 
				+            rotation=data.get('rotation', {})
			
 
				+        )
			
 
				+    
			
 
				+    def to_dict(self) -> Dict:
			
 
				+        """转换为字典（用于 OCRValidator）"""
			
 
				+        config_dict = {
			
 
				+            'name': self.name,
			
 
				+            'description': self.description,
			
 
				+            'json_structure': self.json_structure,
			
 
				+            'text_field': self.text_field,
			
 
				+            'bbox_field': self.bbox_field,
			
 
				+            'category_field': self.category_field,
			
 
				+            'confidence_field': self.confidence_field,
			
 
				+            'rotation': self.rotation
			
 
				+        }
			
 
				+        
			
 
				+        # 添加可选字段
			
 
				+        if self.parsing_results_field:
			
 
				+            config_dict['parsing_results_field'] = self.parsing_results_field
			
 
				+        if self.rec_texts_field:
			
 
				+            config_dict['rec_texts_field'] = self.rec_texts_field
			
 
				+        if self.rec_boxes_field:
			
 
				+            config_dict['rec_boxes_field'] = self.rec_boxes_field
			
 
				+        if self.table_body_field:
			
 
				+            config_dict['table_body_field'] = self.table_body_field
			
 
				+        if self.table_cells_field:
			
 
				+            config_dict['table_cells_field'] = self.table_cells_field
			
 
				+        if self.img_path_field:
			
 
				+            config_dict['img_path_field'] = self.img_path_field
			
 
				+        
			
 
				+        return config_dict
			
 
				+
			
 
				+
			
 
				+@dataclass
			
 
				+class OCRResultConfig:
			
 
				+    """OCR 结果配置"""
			
 
				+    tool: str
			
 
				+    result_dir: str
			
 
				+    image_dir: Optional[str]
			
 
				+    description: str = ""
			
 
				+    enabled: bool = True
			
 
				+    
			
 
				+    @classmethod
			
 
				+    def from_dict(cls, data: Dict, context: Dict = None) -> 'OCRResultConfig':
			
 
				+        """
			
 
				+        🎯 从字典创建（支持 Jinja2 模板）
			
 
				+        
			
 
				+        Args:
			
 
				+            data: 配置数据
			
 
				+            context: 模板上下文（如 {'name': '德_内蒙古银行照'}）
			
 
				+        """
			
 
				+        # 🎯 渲染模板
			
 
				+        if context:
			
 
				+            result_dir = cls._render_template(data['result_dir'], context)
			
 
				+            image_dir = cls._render_template(data.get('image_dir'), context) if data.get('image_dir') else None
			
 
				+            description = cls._render_template(data.get('description', ''), context)
			
 
				+        else:
			
 
				+            result_dir = data['result_dir']
			
 
				+            image_dir = data.get('image_dir')
			
 
				+            description = data.get('description', '')
			
 
				+        
			
 
				+        return cls(
			
 
				+            tool=data['tool'],
			
 
				+            result_dir=result_dir,
			
 
				+            image_dir=image_dir,
			
 
				+            description=description,
			
 
				+            enabled=data.get('enabled', True)
			
 
				+        )
			
 
				+    
			
 
				+    @staticmethod
			
 
				+    def _render_template(template_str: Optional[str], context: Dict) -> Optional[str]:
			
 
				+        """🎯 渲染 Jinja2 模板"""
			
 
				+        if not template_str:
			
 
				+            return None
			
 
				+        
			
 
				+        try:
			
 
				+            template = Template(template_str)
			
 
				+            return template.render(context)
			
 
				+        except Exception as e:
			
 
				+            logging.warning(f"模板渲染失败: {template_str}, 错误: {e}")
			
 
				+            return template_str
			
 
				+
			
 
				+
			
 
				+@dataclass
			
 
				+class DocumentConfig:
			
 
				+    """文档配置"""
			
 
				+    name: str
			
 
				+    base_dir: str
			
 
				+    ocr_results: List[OCRResultConfig] = field(default_factory=list)
			
 
				+    
			
 
				+    @classmethod
			
 
				+    def from_dict(cls, data: Dict) -> 'DocumentConfig':
			
 
				+        """从字典创建（支持 Jinja2 模板）"""
			
 
				+        doc_data = data.get('document', data)
			
 
				+        
			
 
				+        # 🎯 构建模板上下文
			
 
				+        context = {
			
 
				+            'name': doc_data['name'],
			
 
				+            'base_dir': doc_data['base_dir']
			
 
				+        }
			
 
				+        
			
 
				+        return cls(
			
 
				+            name=doc_data['name'],
			
 
				+            base_dir=doc_data['base_dir'],
			
 
				+            ocr_results=[
			
 
				+                OCRResultConfig.from_dict(r, context) 
			
 
				+                for r in doc_data.get('ocr_results', [])
			
 
				+            ]
			
 
				+        )
			
 
				+
			
 
				+
			
 
				+@dataclass
			
 
				+class DataSource:
			
 
				+    """数据源（用于 OCRValidator）"""
			
 
				+    name: str
			
 
				+    ocr_tool: str
			
 
				+    ocr_out_dir: str
			
 
				+    src_img_dir: str
			
 
				+    description: str = ""
			
 
				+
			
 
				+
			
 
				+class ConfigManager:
			
 
				+    """配置管理器"""
			
 
				+    
			
 
				+    def __init__(self, config_dir: str = "config"):
			
 
				+        """
			
 
				+        Args:
			
 
				+            config_dir: 配置文件目录
			
 
				+        """
			
 
				+        self.config_dir = Path(config_dir)
			
 
				+        self.logger = logging.getLogger(__name__)
			
 
				+        
			
 
				+        # 加载配置
			
 
				+        self.global_config = self._load_global_config()
			
 
				+        self.ocr_tools = self._load_ocr_tools()
			
 
				+        self.documents = self._load_documents()
			
 
				+    
			
 
				+    def _load_global_config(self) -> Dict:
			
 
				+        """加载全局配置"""
			
 
				+        config_file = self.config_dir / "global.yaml"
			
 
				+        
			
 
				+        if not config_file.exists():
			
 
				+            self.logger.warning(f"全局配置文件不存在: {config_file}")
			
 
				+            return {}
			
 
				+        
			
 
				+        with open(config_file, 'r', encoding='utf-8') as f:
			
 
				+            return yaml.safe_load(f) or {}
			
 
				+    
			
 
				+    def _load_ocr_tools(self) -> Dict[str, OCRToolConfig]:
			
 
				+        """加载 OCR 工具配置（从 global.yaml）"""
			
 
				+        tools_data = self.global_config.get('ocr', {}).get('tools', {})
			
 
				+        
			
 
				+        tools = {}
			
 
				+        for tool_id, tool_data in tools_data.items():
			
 
				+            tools[tool_id] = OCRToolConfig.from_dict(tool_id, tool_data)
			
 
				+        
			
 
				+        return tools
			
 
				+    
			
 
				+    def _load_documents(self) -> Dict[str, DocumentConfig]:
			
 
				+        """加载文档配置（支持 Jinja2 模板）"""
			
 
				+        documents = {}
			
 
				+        
			
 
				+        # 从 global.yaml 读取文档配置文件列表
			
 
				+        doc_files = self.global_config.get('data_sources', [])
			
 
				+        
			
 
				+        for doc_file in doc_files:
			
 
				+            # 支持相对路径和绝对路径
			
 
				+            if not doc_file.endswith('.yaml'):
			
 
				+                doc_file = f"{doc_file}.yaml"
			
 
				+            
			
 
				+            yaml_path = self.config_dir / doc_file
			
 
				+            
			
 
				+            if not yaml_path.exists():
			
 
				+                self.logger.warning(f"文档配置文件不存在: {yaml_path}")
			
 
				+                continue
			
 
				+            
			
 
				+            try:
			
 
				+                with open(yaml_path, 'r', encoding='utf-8') as f:
			
 
				+                    data = yaml.safe_load(f)
			
 
				+                
			
 
				+                # 🎯 使用支持 Jinja2 的解析方法
			
 
				+                doc_config = DocumentConfig.from_dict(data)
			
 
				+                documents[doc_config.name] = doc_config
			
 
				+                
			
 
				+                self.logger.info(f"✅ 加载文档配置: {doc_config.name} ({len(doc_config.ocr_results)} 个 OCR 结果)")
			
 
				+                
			
 
				+            except Exception as e:
			
 
				+                self.logger.error(f"加载文档配置失败: {yaml_path}, 错误: {e}")
			
 
				+        
			
 
				+        return documents
			
 
				+    
			
 
				+    def get_ocr_tool(self, tool_id: str) -> Optional[OCRToolConfig]:
			
 
				+        """获取 OCR 工具配置"""
			
 
				+        return self.ocr_tools.get(tool_id)
			
 
				+    
			
 
				+    def get_document(self, doc_name: str) -> Optional[DocumentConfig]:
			
 
				+        """获取文档配置"""
			
 
				+        return self.documents.get(doc_name)
			
 
				+    
			
 
				+    def list_documents(self) -> List[str]:
			
 
				+        """列出所有文档"""
			
 
				+        return list(self.documents.keys())
			
 
				+    
			
 
				+    def list_ocr_tools(self) -> List[str]:
			
 
				+        """列出所有 OCR 工具"""
			
 
				+        return list(self.ocr_tools.keys())
			
 
				+    
			
 
				+    def get_data_sources(self) -> List[DataSource]:
			
 
				+        """
			
 
				+        生成数据源列表（供 OCRValidator 使用）
			
 
				+        
			
 
				+        从文档配置自动生成 data_sources
			
 
				+        """
			
 
				+        data_sources = []
			
 
				+        
			
 
				+        for doc_name, doc_config in self.documents.items():
			
 
				+            base_dir = Path(doc_config.base_dir)
			
 
				+            
			
 
				+            for ocr_result in doc_config.ocr_results:
			
 
				+                if not ocr_result.enabled:
			
 
				+                    continue
			
 
				+                
			
 
				+                # 构建完整路径
			
 
				+                ocr_out_dir = str(base_dir / ocr_result.result_dir)
			
 
				+                
			
 
				+                if ocr_result.image_dir:
			
 
				+                    src_img_dir = str(base_dir / ocr_result.image_dir)
			
 
				+                else:
			
 
				+                    # 如果未指定图片目录，使用结果目录
			
 
				+                    src_img_dir = str(base_dir / ocr_result.result_dir / doc_name)
			
 
				+                
			
 
				+                # 🎯 使用 result_dir 生成数据源名称（更唯一、更清晰）
			
 
				+                source_name = f"{doc_name}_{ocr_result.result_dir}"
			
 
				+                
			
 
				+                data_source = DataSource(
			
 
				+                    name=source_name,
			
 
				+                    ocr_tool=ocr_result.tool,
			
 
				+                    ocr_out_dir=ocr_out_dir,
			
 
				+                    src_img_dir=src_img_dir,
			
 
				+                    description=ocr_result.description or ocr_result.result_dir
			
 
				+                )
			
 
				+                
			
 
				+                data_sources.append(data_source)
			
 
				+        
			
 
				+        return data_sources
			
 
				+    
			
 
				+    def get_config_value(self, key_path: str, default=None):
			
 
				+        """
			
 
				+        获取配置值（支持点号路径）
			
 
				+        
			
 
				+        Examples:
			
 
				+            get_config_value('styles.font_size')
			
 
				+            get_config_value('ocr.min_text_length')
			
 
				+        """
			
 
				+        keys = key_path.split('.')
			
 
				+        value = self.global_config
			
 
				+        
			
 
				+        for key in keys:
			
 
				+            if isinstance(value, dict):
			
 
				+                value = value.get(key)
			
 
				+            else:
			
 
				+                return default
			
 
				+        
			
 
				+        return value if value is not None else default
			
 
				+    
			
 
				+    def to_validator_config(self) -> Dict:
			
 
				+        """
			
 
				+        转换为 OCRValidator 所需的配置格式
			
 
				+        
			
 
				+        Returns:
			
 
				+            包含 data_sources 和 ocr.tools 的配置字典
			
 
				+        """
			
 
				+        # 构建 data_sources 列表
			
 
				+        data_sources_list = []
			
 
				+        for ds in self.get_data_sources():
			
 
				+            data_sources_list.append({
			
 
				+                'name': ds.name,
			
 
				+                'ocr_tool': ds.ocr_tool,
			
 
				+                'ocr_out_dir': ds.ocr_out_dir,
			
 
				+                'src_img_dir': ds.src_img_dir
			
 
				+            })
			
 
				+        
			
 
				+        # 构建 ocr.tools 字典
			
 
				+        ocr_tools_dict = {}
			
 
				+        for tool_id, tool_config in self.ocr_tools.items():
			
 
				+            ocr_tools_dict[tool_id] = tool_config.to_dict()
			
 
				+        
			
 
				+        # 返回完整配置
			
 
				+        config = self.global_config.copy()
			
 
				+        config['data_sources'] = data_sources_list
			
 
				+        
			
 
				+        # 确保 ocr.tools 存在
			
 
				+        if 'ocr' not in config:
			
 
				+            config['ocr'] = {}
			
 
				+        config['ocr']['tools'] = ocr_tools_dict
			
 
				+        
			
 
				+        return config
			
 
				+
			
 
				+
			
 
				+# ============================================================================
			
 
				+# 便捷函数
			
 
				+# ============================================================================
			
 
				+
			
 
				+def load_config(config_dir: str = "config") -> ConfigManager:
			
 
				+    """加载配置"""
			
 
				+    return ConfigManager(config_dir)
			
--- a/ocr_validator/ocr_validator_file_utils.py
+++ b/ocr_validator/ocr_validator_file_utils.py
@@ -0,0 +1,111 @@
 
				+"""
			
 
				+OCR Validator 文件处理工具（Streamlit 特定）
			
 
				+
			
 
				+保留 Streamlit 特定的文件处理函数，通用函数已迁移到 ocr_utils
			
 
				+"""
			
 
				+import os
			
 
				+import cv2
			
 
				+import numpy as np
			
 
				+from typing import Optional, Dict
			
 
				+
			
 
				+
			
 
				+def load_css_styles(css_path: str = "styles.css") -> str:
			
 
				+    """
			
 
				+    加载CSS样式文件（Streamlit 特定）
			
 
				+    
			
 
				+    Args:
			
 
				+        css_path: CSS 文件路径
			
 
				+        
			
 
				+    Returns:
			
 
				+        CSS 样式内容
			
 
				+    """
			
 
				+    try:
			
 
				+        with open(css_path, 'r', encoding='utf-8') as f:
			
 
				+            return f.read()
			
 
				+    except Exception:
			
 
				+        # 返回基本样式
			
 
				+        return """
			
 
				+        .main > div { background-color: white !important; color: #333333 !important; }
			
 
				+        .stApp { background-color: white !important; }
			
 
				+        .block-container { background-color: white !important; color: #333333 !important; }
			
 
				+        """
			
 
				+
			
 
				+
			
 
				+def detect_image_orientation_by_opencv(image_path: str) -> Dict:
			
 
				+    """
			
 
				+    使用OpenCV的文本检测来判断图片方向
			
 
				+    
			
 
				+    Args:
			
 
				+        image_path: 图片路径
			
 
				+        
			
 
				+    Returns:
			
 
				+        包含检测结果的字典
			
 
				+    """
			
 
				+    try:
			
 
				+        # 读取图像
			
 
				+        image = cv2.imread(image_path)
			
 
				+        if image is None:
			
 
				+            raise ValueError("无法读取图像文件")
			
 
				+        
			
 
				+        height, width = image.shape[:2]
			
 
				+        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
			
 
				+        
			
 
				+        # 使用EAST文本检测器或其他方法
			
 
				+        # 这里使用简单的边缘检测和轮廓分析
			
 
				+        edges = cv2.Canny(gray, 50, 150, apertureSize=3)
			
 
				+        
			
 
				+        # 检测直线
			
 
				+        lines = cv2.HoughLines(edges, 1, np.pi/180, threshold=100)
			
 
				+        
			
 
				+        if lines is None:
			
 
				+            return {
			
 
				+                'detected_angle': 0.0,
			
 
				+                'confidence': 0.0,
			
 
				+                'method': 'opencv_analysis',
			
 
				+                'message': '未检测到足够的直线特征'
			
 
				+            }
			
 
				+        
			
 
				+        # 分析直线角度
			
 
				+        angles = []
			
 
				+        for rho, theta in lines[:, 0]:
			
 
				+            angle = theta * 180 / np.pi
			
 
				+            # 将角度标准化到0-180度
			
 
				+            if angle > 90:
			
 
				+                angle = angle - 180
			
 
				+            angles.append(angle)
			
 
				+        
			
 
				+        # 统计主要角度
			
 
				+        angle_hist = np.histogram(angles, bins=36, range=(-90, 90))[0]
			
 
				+        dominant_angle_idx = np.argmax(angle_hist)
			
 
				+        dominant_angle = -90 + dominant_angle_idx * 5  # 每个bin 5度
			
 
				+        
			
 
				+        # 将角度映射到标准旋转角度
			
 
				+        if -22.5 <= dominant_angle <= 22.5:
			
 
				+            detected_angle = 0.0
			
 
				+        elif 22.5 < dominant_angle <= 67.5:
			
 
				+            detected_angle = 270.0
			
 
				+        elif 67.5 < dominant_angle <= 90 or -90 <= dominant_angle < -67.5:
			
 
				+            detected_angle = 90.0
			
 
				+        else:
			
 
				+            detected_angle = 180.0
			
 
				+        
			
 
				+        confidence = angle_hist[dominant_angle_idx] / len(lines) if len(lines) > 0 else 0.0
			
 
				+        
			
 
				+        return {
			
 
				+            'detected_angle': detected_angle,
			
 
				+            'confidence': min(1.0, confidence),
			
 
				+            'method': 'opencv_analysis',
			
 
				+            'line_count': len(lines),
			
 
				+            'dominant_angle': dominant_angle,
			
 
				+            'message': f'基于{len(lines)}条直线检测到旋转角度: {detected_angle}°'
			
 
				+        }
			
 
				+        
			
 
				+    except Exception as e:
			
 
				+        return {
			
 
				+            'detected_angle': 0.0,
			
 
				+            'confidence': 0.0,
			
 
				+            'method': 'opencv_analysis',
			
 
				+            'error': str(e),
			
 
				+            'message': f'OpenCV检测过程中发生错误: {str(e)}'
			
 
				+        }
			
 
				+
			
--- a/ocr_validator/ocr_validator_layout.py
+++ b/ocr_validator/ocr_validator_layout.py
@@ -0,0 +1,856 @@
 
				+#!/usr/bin/env python3
			
 
				+"""
			
 
				+OCR验证工具的布局管理模块
			
 
				+包含标准布局、滚动布局、紧凑布局的实现
			
 
				+"""
			
 
				+
			
 
				+import streamlit as st
			
 
				+from pathlib import Path
			
 
				+from PIL import Image
			
 
				+from typing import Dict, List, Optional
			
 
				+import plotly.graph_objects as go
			
 
				+from typing import Tuple
			
 
				+
			
 
				+from ocr_validator_utils import (
			
 
				+    rotate_image_and_coordinates,
			
 
				+    get_ocr_tool_rotation_config,
			
 
				+)
			
 
				+
			
 
				+# 添加 ocr_platform 根目录到 Python 路径（用于导入 ocr_utils）
			
 
				+# 使用 resolve() 确保路径是绝对路径，避免相对路径导致的 IndexError
			
 
				+import sys
			
 
				+_file_path = Path(__file__).resolve()
			
 
				+ocr_platform_root = _file_path.parents[1]  # ocr_validator_layout.py -> ocr_validator -> ocr_platform
			
 
				+if str(ocr_platform_root) not in sys.path:
			
 
				+    sys.path.insert(0, str(ocr_platform_root))
			
 
				+
			
 
				+# 从 ocr_utils 导入通用工具
			
 
				+from ocr_utils.html_utils import convert_html_table_to_markdown, parse_html_tables
			
 
				+from ocr_utils.visualization_utils import VisualizationUtils
			
 
				+
			
 
				+# 从本地文件导入 Streamlit 特定函数
			
 
				+from ocr_validator_file_utils import load_css_styles
			
 
				+
			
 
				+# 为了向后兼容，提供函数别名
			
 
				+draw_bbox_on_image = VisualizationUtils.draw_bbox_on_image
			
 
				+
			
 
				+# detect_image_orientation_by_opencv 保留在 ocr_validator_file_utils
			
 
				+from ocr_validator_file_utils import detect_image_orientation_by_opencv
			
 
				+
			
 
				+class OCRLayoutManager:
			
 
				+    """OCR布局管理器"""
			
 
				+    
			
 
				+    def __init__(self, validator):
			
 
				+        self.validator = validator
			
 
				+        self.config = validator.config
			
 
				+        self._rotated_image_cache = {}
			
 
				+        self._cache_max_size = 10
			
 
				+        self._orientation_cache = {}  # 缓存方向检测结果
			
 
				+        self.rotated_angle = 0.0  # 自动检测的旋转角度缓存
			
 
				+        self.show_all_boxes = False
			
 
				+        self.fit_to_container = False
			
 
				+        self.zoom_level = 1.0
			
 
				+    
			
 
    def clear_image_cache(self):
        """Empty the rotated-image cache in place."""
        rotated_cache = self._rotated_image_cache
        rotated_cache.clear()
			
 
				+        
			
 
    def clear_cache_for_image(self, image_path: str):
        """
        Remove every rotated-image cache entry that belongs to one image.

        Cache keys have the form "<image_path>_<rotation_angle>", so an entry
        belongs to ``image_path`` only when the part before the last
        underscore equals it exactly. A plain prefix test would also evict
        other images whose path merely starts with ``image_path`` and would
        wipe the whole cache for an empty path.

        Args:
            image_path: Path of the image whose cached variants are dropped.
        """
        # Collect first: a dict must not change size while being iterated
        stale_keys = [
            key for key in self._rotated_image_cache
            if key.rpartition('_')[0] == image_path
        ]
        for key in stale_keys:
            del self._rotated_image_cache[key]
			
 
				+    
			
 
    def get_cache_info(self) -> dict:
        """Summarise the rotated-image cache: entry count, keys and size limit."""
        cache = self._rotated_image_cache
        info = {}
        info['cache_size'] = len(cache)
        info['cached_images'] = [key for key in cache]
        info['max_size'] = self._cache_max_size
        return info
			
 
				+    
			
 
				+    def _manage_cache_size(self):
			
 
				+        """管理缓存大小，超出限制时清理最旧的缓存"""
			
 
				+        if len(self._rotated_image_cache) > self._cache_max_size:
			
 
				+            # 删除最旧的缓存项（FIFO策略）
			
 
				+            oldest_key = next(iter(self._rotated_image_cache))
			
 
				+            del self._rotated_image_cache[oldest_key]
			
 
				+    
			
 
    def detect_and_suggest_rotation(self, image_path: str) -> Dict:
        """
        Return the orientation detection result for an image.

        Results are memoised per image path in ``_orientation_cache``; the
        OpenCV-based detection only runs on a cache miss.
        """
        try:
            return self._orientation_cache[image_path]
        except KeyError:
            pass

        # Cache miss: run the detection and remember its result
        result = detect_image_orientation_by_opencv(image_path)
        self._orientation_cache[image_path] = result
        return result
			
 
				+    
			
 
    def get_rotation_angle(self) -> float:
        """
        Resolve the rotation angle to apply to the current image.

        A non-zero ``rotated_angle`` on the manager wins. Otherwise the first
        dict in the validator's OCR data that carries a 'rotation_angle' key
        supplies the value. 0.0 is returned when neither source has one.
        """
        preset_angle = getattr(self, 'rotated_angle', 0)
        if preset_angle != 0:
            return preset_angle

        ocr_items = self.validator.ocr_data
        if not ocr_items:
            return 0.0

        # First OCR item that reports a rotation angle, else 0.0
        return next(
            (
                entry['rotation_angle']
                for entry in ocr_items
                if isinstance(entry, dict) and 'rotation_angle' in entry
            ),
            0.0,
        )
			
 
				+    
			
 
    def load_and_rotate_image(self, image_path: str) -> Optional[Image.Image]:
        """
        Load an image and, when a rotation angle applies, return it rotated.

        Results are cached under "<image_path>_<rotation_angle>" together with
        the validator's ``text_bbox_mapping``; a cache hit restores that
        mapping onto the validator before returning the cached image.

        Args:
            image_path: Path of the image to load.

        Returns:
            The (possibly rotated) PIL image, or None when the path is empty,
            the file does not exist, or loading/rotating raises (the failure
            is reported through ``st.error``).
        """
        if not image_path or not Path(image_path).exists():
            return None

        # Cache lookup
        rotation_angle = self.get_rotation_angle()
        cache_key = f"{image_path}_{rotation_angle}"

        if cache_key in self._rotated_image_cache:
            self.validator.text_bbox_mapping = self._rotated_image_cache[cache_key]['text_bbox_mapping']
            return self._rotated_image_cache[cache_key]['image']

        try:
            image = Image.open(image_path)
            # Image.open is lazy. Decoding here keeps truncated/corrupt files
            # inside this try block (reported via st.error) instead of
            # failing later in the caller, and lets Pillow release the file
            # handle for single-frame images.
            image.load()

            if rotation_angle != 0:
                # Rotation behaviour of the OCR tool that produced the data
                rotation_config = get_ocr_tool_rotation_config(self.validator.ocr_data, self.config)

                if rotation_config['coordinates_are_pre_rotated']:
                    # The image and the coordinates disagree about the angle
                    # (e.g. PPStructV3: image at 0 degrees, coordinates
                    # already rotated), so only the image is rotated here.
                    img_rotation_angle = (rotation_angle + self.rotated_angle) % 360
                    if img_rotation_angle == 270:
                        rotated_image = image.rotate(-90, expand=True)  # 90 degrees clockwise
                    elif img_rotation_angle == 90:
                        rotated_image = image.rotate(90, expand=True)   # 90 degrees counter-clockwise
                    elif img_rotation_angle == 180:
                        rotated_image = image.rotate(180, expand=True)
                    else:
                        rotated_image = image.rotate(-img_rotation_angle, expand=True)

                    if self.rotated_angle == 0:
                        # Coordinates need no transform: the JSON already
                        # holds the correct ones.
                        self._rotated_image_cache[cache_key] = {'image': rotated_image, 'text_bbox_mapping': self.validator.text_bbox_mapping}
                        self._manage_cache_size()
                        return rotated_image

                    image = rotated_image  # continue with the rotated image

                # VLM-style tools: image and coordinates are rotated together.
                # Flatten all bboxes and remember which flat indices belong to
                # which text so the results can be written back.
                all_bboxes = []
                text_to_bbox_map = {}

                bbox_index = 0
                for text, info_list in self.validator.text_bbox_mapping.items():
                    text_to_bbox_map[text] = []
                    for info in info_list:
                        all_bboxes.append(info['bbox'])
                        text_to_bbox_map[text].append(bbox_index)
                        bbox_index += 1

                rotated_image, rotated_bboxes = rotate_image_and_coordinates(
                    image, rotation_angle, all_bboxes,
                    rotate_coordinates=not rotation_config['coordinates_are_pre_rotated']
                )

                # Write the rotated bboxes back through the index mapping.
                # NOTE(review): the mapping is updated in place and the cache
                # stores the same object, so entries cached for other angles
                # that reference it see these coordinates too - confirm that
                # callers rebuild text_bbox_mapping when the angle changes.
                for text, bbox_indices in text_to_bbox_map.items():
                    for i, bbox_idx in enumerate(bbox_indices):
                        if bbox_idx < len(rotated_bboxes) and i < len(self.validator.text_bbox_mapping[text]):
                            self.validator.text_bbox_mapping[text][i]['bbox'] = rotated_bboxes[bbox_idx]

                self._rotated_image_cache[cache_key] = {'image': rotated_image, 'text_bbox_mapping': self.validator.text_bbox_mapping}
                self._manage_cache_size()
                return rotated_image

            else:
                # No rotation needed: cache the original image
                self._rotated_image_cache[cache_key] = {'image': image, 'text_bbox_mapping': self.validator.text_bbox_mapping}
                self._manage_cache_size()
                return image

        except Exception as e:
            st.error(f"❌ 图像加载失败: {e}")
            return None
			
 
				+
			
 
    def render_content_by_mode(self, content: str, render_mode: str, font_size: int, 
                          container_height: int, layout_type: str, 
                          highlight_config: Optional[Dict] = None):
        """
        Render ``content`` in the Streamlit page according to ``render_mode``.

        Args:
            content: Text / HTML to render. Nothing is rendered when None.
            render_mode: One of "HTML渲染", "Markdown渲染", "DataFrame表格";
                any other value shows the raw text in a text area.
            font_size: Font size in px used by the HTML mode stylesheet.
            container_height: Height in px of the HTML mode container.
            layout_type: Layout name; prefixes the CSS class and widget key.
            highlight_config: Highlight settings {'has_bbox': bool,
                'match_type': str}. Not read by this method.
        """
        if content is None or render_mode is None:
            return

        if render_mode == "Markdown渲染":
            # HTML tables are converted to Markdown before rendering
            markdown_text = convert_html_table_to_markdown(content)
            st.markdown(markdown_text, unsafe_allow_html=True)
            return

        if render_mode == "DataFrame表格":
            if '<table' not in content.lower():
                st.info("当前内容中没有检测到HTML表格")
                st.markdown(content, unsafe_allow_html=True)
            else:
                self.validator.display_html_table_as_dataframe(content)
            return

        if render_mode != "HTML渲染":
            # Any other mode: raw text preview
            st.text_area(
                "MD内容预览",
                content,
                height=300,
                key=f"{layout_type}_text_area"
            )
            return

        # HTML mode: a scoped stylesheet (base container, tables, images,
        # responsive rules and highlight classes) followed by the content
        # wrapped in the styled container.
        box = f"{layout_type}-content-display"
        stylesheet = f"""
            <style>
            /* ========== 基础容器样式 ========== */
            .{box} {{
                height: {container_height}px;
                overflow-x: auto;
                overflow-y: auto;
                font-size: {font_size}px !important;
                line-height: 1.4;
                color: #333333 !important;
                background-color: #fafafa !important;
                padding: 10px;
                border-radius: 5px;
                border: 1px solid #ddd;
                max-width: 100%;
            }}
            
            /* ========== 表格样式 ========== */
            .{box} table {{
                width: 100%;
                border-collapse: collapse;
                margin: 10px 0;
                white-space: nowrap;
            }}
            
            .{box} th,
            .{box} td {{
                border: 1px solid #ddd;
                padding: 8px;
                text-align: left;
                max-width: 300px;
                word-wrap: break-word;
                word-break: break-all;
                vertical-align: top;
            }}
            
            .{box} th {{
                background-color: #f5f5f5;
                position: sticky;
                top: 0;
                z-index: 1;
                font-weight: bold;
            }}
            
            /* 数字列右对齐 */
            .{box} td.number {{
                text-align: right;
                white-space: nowrap;
                font-family: 'Monaco', 'Menlo', monospace;
            }}
            
            /* 短文本列不换行 */
            .{box} td.short-text {{
                white-space: nowrap;
                min-width: 80px;
            }}
            
            /* ========== 图片样式 ========== */
            .{box} img {{
                max-width: 100%;
                height: auto;
                border-radius: 4px;
                margin: 10px 0;
            }}
            
            /* ========== 响应式设计 ========== */
            @media (max-width: 768px) {{
                .{box} table {{
                    font-size: {max(font_size-2, 8)}px;
                }}
                .{box} th,
                .{box} td {{
                    padding: 4px;
                    max-width: 150px;
                }}
            }}
            
            /* ========== 高亮文本样式 ========== */
            .{box} .highlight-text {{
                padding: 2px 4px;
                border-radius: 3px;
                cursor: pointer;
                font-weight: 500;
                transition: all 0.2s ease;
            }}
            
            .{box} .highlight-text:hover {{
                opacity: 0.8;
                transform: scale(1.02);
            }}
            
            /* 🎯 精确匹配且有框 - 绿色 */
            .{box} .highlight-text.selected-highlight {{
                background-color: #4caf50 !important;
                color: white !important;
                border: 1px solid #2e7d32 !important;
            }}
            
            /* 🎯 OCR匹配 - 蓝色 */
            .{box} .highlight-text.ocr-match {{
                background-color: #2196f3 !important;
                color: white !important;
                border: 1px solid #1565c0 !important;
            }}
            
            /* 🎯 无边界框 - 橙色虚线 */
            .{box} .highlight-text.no-bbox {{
                background-color: #ff9800 !important;
                color: white !important;
                border: 1px dashed #f57c00 !important;
            }}
            
            /* 🎯 默认高亮 - 黄色 */
            .{box} .highlight-text.default {{
                background-color: #ffeb3b !important;
                color: #333333 !important;
                border: 1px solid #fbc02d !important;
            }}
            </style>
            """

        st.markdown(stylesheet, unsafe_allow_html=True)
        st.markdown(f'<div class="{box}">{content}</div>', 
                   unsafe_allow_html=True)
			
 
				+
			
 
				+
			
 
    def create_compact_layout(self, config: Dict):
        """
        Render the compact comparison layout.

        The left column holds a search box, a quick text selector and the
        Markdown/HTML content with the selected text highlighted; the right
        column holds the annotated image.

        Args:
            config: Application configuration. Reads config['styles']['layout']
                ('content_width', 'sidebar_width', optional 'default_height'
                and 'default_zoom') and the optional config['styles']['font_size'].
        """
        layout = config['styles']['layout']
        font_size = config['styles'].get('font_size', 10)
        container_height = layout.get('default_height', 600)
        zoom_level = layout.get('default_zoom', 1.0)
        layout_type = "compact"

        left_col, right_col = st.columns([layout['content_width'], layout['sidebar_width']],
                                         vertical_alignment='top', border=True)

        with left_col:
            if self.validator.text_bbox_mapping:
                search_col, select_col = st.columns([1, 2])

                if "compact_search_query" not in st.session_state:
                    st.session_state.compact_search_query = ""

                # Search input
                with search_col:
                    search_query = st.text_input(
                        "搜索文本",
                        placeholder="输入关键词...",
                        value=st.session_state.compact_search_query,
                        key=f"{layout_type}_search_input",
                        label_visibility="collapsed"
                    )
                    st.session_state.compact_search_query = search_query

                # Selector options filtered by the query
                text_options, text_display, match_info = self._build_compact_text_options(search_query)

                # Search statistics
                if search_query and search_query.strip():
                    ocr_matches = sum(1 for m in match_info[1:] if m and m['type'] == 'ocr')
                    no_bbox_count = sum(1 for m in match_info[1:] if m and not m['has_bbox'])

                    stat_parts = [f"找到 {len(text_options)-1} 个匹配项"]
                    if ocr_matches > 0:
                        stat_parts.append(f"🔍 {ocr_matches} 个OCR匹配")
                    if no_bbox_count > 0:
                        stat_parts.append(f"⚠️ {no_bbox_count} 个无框")

                    st.caption(" | ".join(stat_parts))

                # Preselect the currently selected text when it is still listed.
                # .get() avoids an AttributeError when 'selected_text' has not
                # been initialised in the session state yet.
                default_index = 0
                current_selection = st.session_state.get('selected_text')
                if current_selection and current_selection in text_options:
                    default_index = text_options.index(current_selection)

                with select_col:
                    selected_index = st.selectbox(
                        "快速定位文本",
                        range(len(text_options)),
                        index=default_index,
                        format_func=lambda x: text_display[x] if x < len(text_display) else "",
                        label_visibility="collapsed",
                        key=f"{layout_type}_quick_text_selector"
                    )

                # Index 0 is the placeholder entry
                if selected_index > 0:
                    st.session_state.selected_text = text_options[selected_index]

                    selected_match_info = match_info[selected_index]
                    if selected_match_info:
                        if selected_match_info['type'] == 'ocr':
                            st.info(f"🔍 **OCR识别文本匹配**: `{selected_match_info['source']}`")
                        elif not selected_match_info['has_bbox']:
                            st.warning(f"⚠️ **未找到边界框**: 文本在MD中存在，但没有对应的坐标信息")

            # Content with the selected text highlighted
            if self.validator.md_content:
                highlighted_content = self.validator.md_content

                selected_text = st.session_state.get('selected_text')
                if selected_text:
                    highlighted_content = self._highlight_selected_text(highlighted_content, selected_text)

                # The highlight CSS classes are defined by render_content_by_mode
                self.render_content_by_mode(
                    highlighted_content,
                    "HTML渲染",
                    font_size,
                    container_height,
                    layout_type
                )

        with right_col:
            self.create_aligned_image_display(zoom_level, "compact")

    def _build_compact_text_options(self, search_query):
        """
        Build the quick-selector entries for the compact layout.

        With a non-blank query only texts are kept whose key, or whose first
        entry's 'matched_text', contains the query (case-insensitive).

        Returns:
            (text_options, text_display, match_info): three parallel lists.
            Index 0 is the placeholder; match_info items are dicts with
            'type' ("exact", "ocr" or None), 'source' and 'has_bbox'.
        """
        text_options = ["请选择文本..."]
        text_display = ["请选择文本..."]
        match_info = [None]

        has_query = bool(search_query and search_query.strip())
        query_lower = search_query.lower() if has_query else ""

        for text, info_list in self.validator.text_bbox_mapping.items():
            first_info = info_list[0] if info_list and isinstance(info_list[0], dict) else None

            if has_query:
                # 1. match against the text itself
                text_match = query_lower in text.lower()

                # 2. match against the OCR-recognised text
                matched_text = first_info.get('matched_text', '') if first_info is not None else None
                matched_text_match = query_lower in matched_text.lower() if matched_text else False

                if not text_match and not matched_text_match:
                    continue

                if text_match:
                    match_type, match_source = "exact", text
                else:
                    match_type, match_source = "ocr", matched_text
            else:
                match_type, match_source = None, text

            # Reset per entry so a non-dict first item can neither reuse the
            # previous entry's value nor hit an unbound local.
            has_bbox = False

            if first_info is not None:
                has_bbox = bool(first_info.get('bbox'))

                # Table cells are prefixed with their row/column
                if 'row' in first_info and 'col' in first_info:
                    display_text = f"[R{first_info['row']},C{first_info['col']}] {text}"
                else:
                    display_text = text

                # Match hints
                if match_type == "ocr":
                    display_text = f"🔍 {display_text} (OCR: {match_source[:20]}...)"
                elif not has_bbox:
                    display_text = f"⚠️ {display_text} (无框)"

                # Truncate long labels
                if len(display_text) > 60:
                    display_text = display_text[:57] + "..."
            else:
                display_text = text[:57] + "..." if len(text) > 60 else text

            text_options.append(text)
            text_display.append(display_text)
            match_info.append({
                'type': match_type,
                'source': match_source,
                'has_bbox': has_bbox
            })

        return text_options, text_display, match_info

    def _highlight_selected_text(self, md_content, selected_text):
        """
        Wrap occurrences of the selected text in highlight <span> elements.

        The CSS class reflects the match type: "selected-highlight" for an
        exact match with a bbox, "no-bbox" when no bbox exists, "default"
        otherwise. A differing 'matched_text' is highlighted as "ocr-match".
        Texts of two characters or fewer are not highlighted.

        Returns:
            ``md_content`` with the highlight markup inserted.
        """
        import html  # local import: only needed to escape the title attribute

        mapping = self.validator.text_bbox_mapping or {}
        info_list = mapping.get(selected_text, [])
        has_bbox = False
        matched_text = None
        match_type = None

        if info_list and isinstance(info_list[0], dict):
            has_bbox = 'bbox' in info_list[0] and info_list[0]['bbox']
            matched_text = info_list[0].get('matched_text', '')

            if matched_text and matched_text != selected_text:
                match_type = "ocr"
            elif has_bbox:
                match_type = "exact"
            else:
                match_type = "no_bbox"

        highlighted_content = md_content
        if len(selected_text) > 2:
            # 1. highlight the selected text itself
            if selected_text in highlighted_content:
                if match_type == "exact":
                    highlight_class = "highlight-text selected-highlight"
                elif match_type == "no_bbox":
                    highlight_class = "highlight-text no-bbox"
                else:
                    highlight_class = "highlight-text default"

                # Escape the tooltip so quotes or '<' in the text cannot
                # break out of the title attribute.
                title = html.escape(selected_text, quote=True)
                highlighted_content = highlighted_content.replace(
                    selected_text,
                    f'<span class="{highlight_class}" title="{title}">{selected_text}</span>'
                )

            # 2. highlight the OCR-recognised text when it differs
            if matched_text and matched_text != selected_text and matched_text in highlighted_content:
                ocr_title = html.escape(matched_text, quote=True)
                highlighted_content = highlighted_content.replace(
                    matched_text,
                    f'<span class="highlight-text ocr-match" title="OCR: {ocr_title}">{matched_text}</span>'
                )

        return highlighted_content
			
 
				+
			
 
    def create_aligned_image_display(self, zoom_level: float = 1.0, layout_type: str = "aligned"):
        """Render the image toolbar and the annotated, zoomable source image.

        A row of five controls is drawn first (show-all-boxes checkbox,
        rotate-by-90 button, current-angle metric, reset-angle button and
        clear-selection button), followed by the Plotly figure built from the
        rotated image.

        Args:
            zoom_level: Accepted for interface compatibility; the figure is
                built from ``self.zoom_level``, not from this argument.
            layout_type: Prefix used for every widget key so that several
                layouts can coexist on one page.
        """
        def widget_key(suffix: str) -> str:
            # Every widget key has the form "<layout_type>_<suffix>".
            return f"{layout_type}_{suffix}"

        boxes_col, rotate_col, angle_col, reset_col, clear_col = st.columns(
            5, vertical_alignment="center", border=False
        )

        with boxes_col:
            checkbox_state = st.checkbox(
                "显示所有框",
                value=self.show_all_boxes,
                key=widget_key("show_all_boxes"),
            )
            # Only write back when the user actually toggled the checkbox.
            if checkbox_state != self.show_all_boxes:
                self.show_all_boxes = checkbox_state

        with rotate_col:
            if st.button("🔄 旋转90度", type="secondary", key=widget_key("manual_angle")):
                self.rotated_angle = (self.rotated_angle + 90) % 360
                # The cached image and the bboxes in text_bbox_mapping belong
                # to the previous angle, so drop the cache and reprocess.
                self.clear_image_cache()
                self.validator.process_data()
                st.rerun()

        with angle_col:
            angle_now = self.get_rotation_angle()
            st.metric("当前角度", f"{angle_now}°", label_visibility="collapsed")

        with reset_col:
            if st.button("↺ 重置角度", key=widget_key("reset_angle")):
                self.rotated_angle = 0.0
                st.success("已重置旋转角度")
                # Same invalidation as for a manual rotation.
                self.clear_image_cache()
                self.validator.process_data()
                st.rerun()

        with clear_col:
            if st.button("🧹 清除选择", key=widget_key("clear_selection")):
                # Forget the selected text and the search box content.
                st.session_state.selected_text = None
                st.session_state.compact_search_query = None
                st.rerun()

        image = self.load_and_rotate_image(self.validator.image_path)

        if not image:
            st.error("未找到对应的图片文件")
            if self.validator.image_path:
                st.write(f"期望路径: {self.validator.image_path}")
            return

        try:
            resized_image, all_boxes, selected_boxes = self.zoom_image(image, self.zoom_level)

            # Build the interactive figure with the selected / all boxes drawn on it.
            fig = self.create_resized_interactive_plot(
                resized_image, selected_boxes, self.zoom_level, all_boxes
            )

            export_options = {
                'format': 'png',
                'filename': 'ocr_image',
                'height': None,  # export at the current height
                'width': None,   # export at the current width
                'scale': 1,
            }
            plot_config = {
                'displayModeBar': True,
                'modeBarButtonsToRemove': ['zoom2d', 'select2d', 'lasso2d', 'autoScale2d'],
                'scrollZoom': True,
                'doubleClick': 'reset',
                'responsive': False,  # fixed figure size instead of responsive resizing
                'toImageButtonOptions': export_options,
            }

            # The chart is asked to stretch to the width of its container.
            st.plotly_chart(
                fig,
                width='stretch',
                config=plot_config,
                key=widget_key("plot"),
            )
        except Exception as exc:
            st.error(f"❌ 图片处理失败: {exc}")
            st.exception(exc)
			
 
				+
			
 
				+    # st.markdown('</div>', unsafe_allow_html=True)
			
 
				+
			
 
    def zoom_image(self, image: Image.Image, current_zoom: float) -> Tuple[Image.Image, List[List[int]], List[List[int]]]:
        """Scale the image and the bounding boxes that will be drawn on it.

        Args:
            image: Image to resize.
            current_zoom: Multiplier applied to the image size and to every
                bbox coordinate.

        Returns:
            A tuple ``(resized_image, all_boxes, selected_boxes)``.
            ``selected_boxes`` holds the integer-scaled boxes registered for
            ``st.session_state.selected_text``. ``all_boxes`` holds the scaled
            boxes of every mapped text and is empty unless
            ``self.show_all_boxes`` is set.
        """
        # Resize the image according to the zoom factor.
        new_width = int(image.width * current_zoom)
        new_height = int(image.height * current_zoom)
        resized_image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)

        mapping = self.validator.text_bbox_mapping

        def usable_bbox(info):
            # Entries without coordinates exist in the mapping, so a missing,
            # empty or too-short bbox is skipped instead of raising.
            bbox = info.get('bbox') if isinstance(info, dict) else None
            if bbox is None or len(bbox) < 4:
                return None
            return bbox

        # Boxes belonging to the currently selected text.
        selected_boxes = []
        selected_text = st.session_state.selected_text
        if selected_text and selected_text in mapping:
            for info in mapping[selected_text]:
                bbox = usable_bbox(info)
                if bbox is not None:
                    selected_boxes.append([int(coord * current_zoom) for coord in bbox])

        # Boxes of every mapped text, collected only when requested.
        all_boxes = []
        if self.show_all_boxes:
            for info_list in mapping.values():
                for info in info_list:
                    bbox = usable_bbox(info)
                    if bbox is not None:
                        all_boxes.append([coord * current_zoom for coord in bbox])

        return resized_image, all_boxes, selected_boxes
			
 
				+
			
 
				+    def _add_bboxes_to_plot_batch(self, fig: go.Figure, bboxes: List[List[int]], 
			
 
				+                                image_height: int, 
			
 
				+                                line_color: str = "blue", 
			
 
				+                                line_width: int = 2, 
			
 
				+                                fill_color: str = "rgba(0, 100, 200, 0.2)"):
			
 
				+        """
			
 
				+        批量添加边界框（性能优化版）
			
 
				+        """
			
 
				+        if not bboxes or len(bboxes) == 0:
			
 
				+            return
			
 
				+        
			
 
				+        # 🎯 关键优化：构建 shapes 列表，一次性添加
			
 
				+        shapes = []
			
 
				+        for bbox in bboxes:
			
 
				+            if len(bbox) < 4:
			
 
				+                continue
			
 
				+            
			
 
				+            x1, y1, x2, y2 = bbox[:4]
			
 
				+            
			
 
				+            # 转换坐标
			
 
				+            plot_x1 = x1
			
 
				+            plot_x2 = x2
			
 
				+            plot_y1 = image_height - y2
			
 
				+            plot_y2 = image_height - y1
			
 
				+            
			
 
				+            shapes.append(dict(
			
 
				+                type="rect",
			
 
				+                x0=plot_x1, y0=plot_y1,
			
 
				+                x1=plot_x2, y1=plot_y2,
			
 
				+                line=dict(color=line_color, width=line_width),
			
 
				+                fillcolor=fill_color,
			
 
				+            ))
			
 
				+        
			
 
				+        # 🎯 一次性更新所有形状
			
 
				+        fig.update_layout(shapes=fig.layout.shapes + tuple(shapes))
			
 
				+
			
 
    def _add_bboxes_as_scatter(self, fig: go.Figure, bboxes: List[List[int]],
                          image_height: int,
                          line_color: str = "blue",
                          line_width: int = 2,
                          name: str = "boxes"):
        """Draw the outlines of all ``bboxes`` as a single line trace.

        Args:
            fig: Figure that receives the trace.
            bboxes: Boxes as ``[x1, y1, x2, y2, ...]``; boxes with fewer than
                four values are ignored.
            image_height: Height used to flip the y coordinates
                (``plot_y = image_height - y``).
            line_color: Colour of the outlines.
            line_width: Width of the outlines.
            name: Name given to the trace.
        """
        if not bboxes:
            return

        xs = []
        ys = []
        for box in bboxes:
            if len(box) >= 4:
                left, top, right, bottom = box[:4]

                # Flip the vertical axis.
                low = image_height - bottom
                high = image_height - top

                # Five points close the rectangle; the trailing None breaks
                # the line so consecutive rectangles are not joined.
                xs += [left, right, right, left, left, None]
                ys += [low, low, high, high, low, None]

        # One trace carries every outline.
        outline = go.Scatter(
            x=xs,
            y=ys,
            mode='lines',
            line=dict(color=line_color, width=line_width),
            name=name,
            showlegend=False,
            hoverinfo='skip',
        )
        fig.add_trace(outline)
			
 
				+
			
 
    def create_resized_interactive_plot(self, image: Image.Image, selected_boxes: List[List[int]],
                                       zoom_level: float, all_boxes: List[List[int]]) -> go.Figure:
        """Build the pannable figure showing the image with its bounding boxes.

        Args:
            image: Already resized image used as the figure background.
            selected_boxes: Boxes drawn as red, filled rectangles.
            zoom_level: Accepted for interface compatibility; not read here.
            all_boxes: Boxes drawn as blue outlines.

        Returns:
            The configured figure, with axes spanning the image in pixels.
        """
        fig = go.Figure()

        # Background image: anchored by its top edge at y = image.height so it
        # fills the [0, width] x [0, height] axis range.
        background = dict(
            source=image,
            xref="x", yref="y",
            x=0, y=image.height,
            sizex=image.width,
            sizey=image.height,
            sizing="stretch",
            opacity=1.0,
            layer="below",
            yanchor="top",
        )
        fig.add_layout_image(background)

        # Every known box, as blue outlines.
        if all_boxes:
            self._add_bboxes_as_scatter(
                fig=fig,
                bboxes=all_boxes,
                image_height=image.height,
                line_color="rgba(0, 100, 200, 0.8)",
                line_width=2,
                name="all_boxes",
            )

        # Boxes of the selected text, highlighted in red.
        if selected_boxes:
            self._add_bboxes_to_plot_batch(
                fig=fig,
                bboxes=selected_boxes,
                image_height=image.height,
                line_color="red",
                line_width=2,
                fill_color="rgba(255, 0, 0, 0.3)",
            )

        # Upper bounds for the displayed figure size.
        max_display_width = 1500
        max_display_height = 1000

        aspect_ratio = image.width / image.height

        if not self.fit_to_container:
            # Fixed-size mode: the image size, capped by the limits above.
            display_width = min(image.width, max_display_width)
            display_height = min(image.height, max_display_height)
        else:
            # Fit mode: cap the dominant side and derive the other from the ratio.
            if aspect_ratio > 1:  # landscape
                display_width = min(max_display_width, image.width)
                display_height = int(display_width / aspect_ratio)
            else:  # portrait or square
                display_height = min(max_display_height, image.height)
                display_width = int(display_height * aspect_ratio)

            # Enforce a minimum figure size.
            display_width = max(display_width, 800)
            display_height = max(display_height, 600)

        # Settings shared by both (hidden) axes.
        axis_defaults = dict(
            visible=False,
            constrain="domain",
            fixedrange=False,
            autorange=False,
            showgrid=False,
            zeroline=False,
        )

        fig.update_layout(
            width=display_width,
            height=display_height,
            margin=dict(l=0, r=0, t=0, b=0),
            showlegend=False,
            plot_bgcolor='white',
            dragmode="pan",
            xaxis=dict(axis_defaults, range=[0, image.width]),
            # The y axis is locked 1:1 to the x axis.
            yaxis=dict(
                axis_defaults,
                range=[0, image.height],
                scaleanchor="x",
                scaleratio=1,
            ),
        )

        return fig
			
--- a/ocr_validator/ocr_validator_utils.py
+++ b/ocr_validator/ocr_validator_utils.py
@@ -0,0 +1,751 @@
 
				+"""
			
 
				+OCR验证工具的工具函数模块
			
 
				+包含数据处理、图像处理、统计分析等功能
			
 
				+"""
			
 
				+
			
 
				+import json
			
 
				+import pandas as pd
			
 
				+import numpy as np
			
 
				+from pathlib import Path
			
 
				+from PIL import Image, ImageDraw
			
 
				+from typing import Dict, List, Optional, Tuple, Union
			
 
				+import re
			
 
				+import yaml
			
 
				+import sys
			
 
				+
			
 
				+# 添加 ocr_platform 根目录到 Python 路径（用于导入 ocr_utils）
			
 
				+# 使用 resolve() 确保路径是绝对路径，避免相对路径导致的 IndexError
			
 
				+_file_path = Path(__file__).resolve()
			
 
				+ocr_platform_root = _file_path.parents[1]  # ocr_validator -> ocr_platform
			
 
				+if str(ocr_platform_root) not in sys.path:
			
 
				+    sys.path.insert(0, str(ocr_platform_root))
			
 
				+
			
 
				+# 从 ocr_utils 导入通用工具
			
 
				+from ocr_utils.html_utils import process_all_images_in_content
			
 
				+from ocr_utils.image_utils import rotate_image_and_coordinates
			
 
				+
			
 
				+# rotate_image_and_coordinates 已从 ocr_utils.image_utils 导入，无需重新定义
			
 
				+
			
 
				+def load_config(config_path: str = "config.yaml") -> Dict:
			
 
				+    """加载配置文件"""
			
 
				+    try:
			
 
				+        with open(config_path, 'r', encoding='utf-8') as f:
			
 
				+            return yaml.safe_load(f)
			
 
				+    except Exception as e:
			
 
				+        print(f"加载配置文件失败: {e}")
			
 
				+        import traceback
			
 
				+        traceback.print_exc()
			
 
				+        # 退出
			
 
				+        sys.exit(1)
			
 
				+
			
 
				+# rotate_image_and_coordinates 已从 ocr_utils.image_utils 导入，无需重新定义
			
 
				+
			
 
				+def parse_dots_ocr_data(data: List, config: Dict, tool_name: str) -> List[Dict]:
			
 
				+    """解析Dots OCR格式的数据"""
			
 
				+    tool_config = config['ocr']['tools'][tool_name]
			
 
				+    parsed_data = []
			
 
				+    
			
 
				+    for item in data:
			
 
				+        if not isinstance(item, dict):
			
 
				+            continue
			
 
				+            
			
 
				+        # 提取字段
			
 
				+        text = item.get(tool_config['text_field'], '')
			
 
				+        bbox = item.get(tool_config['bbox_field'], [])
			
 
				+        category = item.get(tool_config['category_field'], 'Text')
			
 
				+        confidence = item.get(tool_config.get('confidence_field', 'confidence'), 
			
 
				+                            config['ocr']['default_confidence'])
			
 
				+        
			
 
				+        if text and bbox and len(bbox) >= 4:
			
 
				+            parsed_data.append({
			
 
				+                'text': str(text).strip(),
			
 
				+                'bbox': bbox[:4],  # 确保只取前4个坐标
			
 
				+                'category': category,
			
 
				+                'confidence': confidence,
			
 
				+                'source_tool': tool_name
			
 
				+            })
			
 
				+    
			
 
				+    return parsed_data
			
 
				+
			
 
				+
			
 
				+def parse_ppstructv3_data(data: Dict, config: Dict) -> List[Dict]:
			
 
				+    """解析PPStructV3格式的数据"""
			
 
				+    tool_config = config['ocr']['tools']['ppstructv3']
			
 
				+    parsed_data = []
			
 
				+    
			
 
				+    parsing_results = data.get(tool_config['parsing_results_field'], [])
			
 
				+    if not isinstance(parsing_results, list):
			
 
				+        return parsed_data
			
 
				+    
			
 
				+    for item in parsing_results:
			
 
				+        if not isinstance(item, dict):
			
 
				+            continue
			
 
				+            
			
 
				+        text = item.get(tool_config['text_field'], '')
			
 
				+        bbox = item.get(tool_config['bbox_field'], [])
			
 
				+        category = item.get(tool_config['category_field'], 'text')
			
 
				+        confidence = item.get(
			
 
				+            tool_config.get('confidence_field', 'confidence'),
			
 
				+            config['ocr']['default_confidence']
			
 
				+        )
			
 
				+        
			
 
				+        if text and bbox and len(bbox) >= 4:
			
 
				+            parsed_data.append({
			
 
				+                'text': str(text).strip(),
			
 
				+                'bbox': bbox[:4],
			
 
				+                'category': category,
			
 
				+                'confidence': confidence,
			
 
				+                'source_tool': 'ppstructv3'
			
 
				+            })
			
 
				+    
			
 
				+    rec_texts = get_nested_value(data, tool_config.get('rec_texts_field', ''))
			
 
				+    rec_boxes = get_nested_value(data, tool_config.get('rec_boxes_field', ''))
			
 
				+    if isinstance(rec_texts, list) and isinstance(rec_boxes, list):
			
 
				+        for i, (text, box) in enumerate(zip(rec_texts, rec_boxes)):
			
 
				+            if text and isinstance(box, list) and len(box) >= 4:
			
 
				+                parsed_data.append({
			
 
				+                    'text': str(text).strip(),
			
 
				+                    'bbox': box[:4],
			
 
				+                    'category': 'OCR_Text',
			
 
				+                    'source_tool': 'ppstructv3_ocr'
			
 
				+                })
			
 
				+    
			
 
				+    return parsed_data
			
 
				+
			
 
				+def parse_table_recognition_v2_data(data: Dict, config: Dict) -> List[Dict]:
			
 
				+    tool_config = config['ocr']['tools']['table_recognition_v2']
			
 
				+    parsed_data = []
			
 
				+    tables = data.get(tool_config['parsing_results_field'], [])
			
 
				+    if not isinstance(tables, list):
			
 
				+        return parsed_data
			
 
				+
			
 
				+    for item in tables:
			
 
				+        if not isinstance(item, dict):
			
 
				+            continue
			
 
				+
			
 
				+        html_text = item.get(tool_config['text_field'], '')
			
 
				+
			
 
				+        # 计算表格整体bbox
			
 
				+        cell_boxes_raw = item.get(tool_config['bbox_field'], [])
			
 
				+        if cell_boxes_raw:
			
 
				+            x1_list = [box[0] for box in cell_boxes_raw]
			
 
				+            y1_list = [box[1] for box in cell_boxes_raw]
			
 
				+            x2_list = [box[2] for box in cell_boxes_raw]
			
 
				+            y2_list = [box[3] for box in cell_boxes_raw]
			
 
				+            table_bbox = [
			
 
				+                float(min(x1_list)),
			
 
				+                float(min(y1_list)),
			
 
				+                float(max(x2_list)),
			
 
				+                float(max(y2_list))
			
 
				+            ]
			
 
				+        else:
			
 
				+            table_bbox = [0.0, 0.0, 0.0, 0.0]
			
 
				+
			
 
				+        parsed_data.append({
			
 
				+            'text': str(html_text).strip(),
			
 
				+            'bbox': table_bbox,
			
 
				+            'category': item.get(tool_config.get('category_field', ''), 'table'),
			
 
				+            'confidence': item.get(tool_config.get('confidence_field', ''), config['ocr']['default_confidence']),
			
 
				+            'source_tool': 'table_recognition_v2',
			
 
				+        })
			
 
				+
			
 
				+        rec_texts = get_nested_value(item, tool_config.get('rec_texts_field', ''))
			
 
				+        rec_boxes = get_nested_value(item, tool_config.get('rec_boxes_field', ''))
			
 
				+        if isinstance(rec_texts, list) and isinstance(rec_boxes, list):
			
 
				+            for i, (text, box) in enumerate(zip(rec_texts, rec_boxes)):
			
 
				+                if text and isinstance(box, list) and len(box) >= 4:
			
 
				+                    parsed_data.append({
			
 
				+                        'text': str(text).strip(),
			
 
				+                        'bbox': box[:4],
			
 
				+                        'category': 'OCR_Text',
			
 
				+                        'source_tool': 'table_recognition_v2'
			
 
				+                    })
			
 
				+    
			
 
				+    return parsed_data
			
 
				+
			
 
				+def parse_mineru_data(data: List, config: Dict, tool_name="mineru") -> List[Dict]:
			
 
				+    """解析MinerU格式的数据"""
			
 
				+    tool_config = config['ocr']['tools'][tool_name]
			
 
				+    parsed_data = []
			
 
				+    
			
 
				+    if not isinstance(data, list):
			
 
				+        return parsed_data
			
 
				+    
			
 
				+    for item in data:
			
 
				+        if not isinstance(item, dict):
			
 
				+            continue
			
 
				+        
			
 
				+        text = item.get(tool_config['text_field'], '')
			
 
				+        bbox = item.get(tool_config['bbox_field'], [])
			
 
				+        category = item.get(tool_config['category_field'], 'Text')
			
 
				+        confidence = item.get(tool_config.get('confidence_field', 'confidence'), 
			
 
				+                            config['ocr']['default_confidence'])        
			
 
				+        # 处理文本类型
			
 
				+        if category == 'text':
			
 
				+            if text and bbox and len(bbox) >= 4:
			
 
				+                parsed_data.append({
			
 
				+                    'text': str(text).strip(),
			
 
				+                    'bbox': bbox[:4],
			
 
				+                    'category': category,
			
 
				+                    'confidence': confidence,
			
 
				+                    'source_tool': tool_name,
			
 
				+                    'text_level': item.get('text_level', 0)  # 保留文本层级信息
			
 
				+                })
			
 
				+        
			
 
				+        # 处理表格类型
			
 
				+        elif category == 'table':
			
 
				+            table_html = item.get(tool_config.get('table_body_field', 'table_body'), '')
			
 
				+            img_path = item.get(tool_config.get('img_path_field', 'img_path'), '')
			
 
				+            
			
 
				+            if bbox and len(bbox) >= 4:
			
 
				+                parsed_data.append({
			
 
				+                    'text': table_html,
			
 
				+                    'bbox': bbox[:4],
			
 
				+                    'category': 'table',
			
 
				+                    'confidence': confidence,
			
 
				+                    'source_tool': tool_name,
			
 
				+                    'img_path': img_path,
			
 
				+                    'table_body': table_html
			
 
				+                })
			
 
				+            table_cells = item.get(tool_config.get('table_cells_field', 'table_cells'), [])
			
 
				+            for cell in table_cells:
			
 
				+                cell_text = cell.get('text', '')
			
 
				+                cell_bbox = cell.get('bbox', [])
			
 
				+                if cell_text and cell_bbox and len(cell_bbox) >= 4:
			
 
				+                    parsed_data.append({
			
 
				+                        'text': str(cell_text).strip(),
			
 
				+                        'matched_text': cell.get('matched_text', ''),
			
 
				+                        'bbox': cell_bbox[:4],
			
 
				+                        'row': cell.get('row', -1),
			
 
				+                        'col': cell.get('col', -1),
			
 
				+                        'category': 'table_cell',
			
 
				+                        'confidence': cell.get('score', 0.0),
			
 
				+                        'source_tool': tool_name,
			
 
				+                    })
			
 
				+        # 处理图片类型
			
 
				+        elif category == 'image':
			
 
				+            img_path = item.get(tool_config.get('img_path_field', 'img_path'), '')
			
 
				+            if bbox and len(bbox) >= 4:
			
 
				+                parsed_data.append({
			
 
				+                    'text': '[Image]',
			
 
				+                    'bbox': bbox[:4],
			
 
				+                    'category': 'image',
			
 
				+                    'confidence': confidence,
			
 
				+                    'source_tool': tool_name,
			
 
				+                    'img_path': img_path
			
 
				+                })
			
 
				+        elif category in ['list']:
			
 
				+            # 处理列表和标题类型
			
 
				+            list_items = item.get('list_items', [])
			
 
				+            sub_type = item.get('sub_type', 'unordered')  # 有序或无序
			
 
				+            
			
 
				+            for list_item in list_items:
			
 
				+                if list_item and bbox and len(bbox) >= 4:
			
 
				+                    parsed_data.append({
			
 
				+                        'text': str(list_item).strip(),
			
 
				+                        'bbox': bbox[:4],
			
 
				+                        'category': category,
			
 
				+                        'sub_type': sub_type,
			
 
				+                        'confidence': confidence,
			
 
				+                        'source_tool': tool_name
			
 
				+                    })
			
 
				+        else:
			
 
				+            # 其他类型，按文本处理,  header, table_cell, ...
			
 
				+            if text and bbox and len(bbox) >= 4:
			
 
				+                parsed_data.append({
			
 
				+                    'text': str(text).strip(),
			
 
				+                    'bbox': bbox[:4],
			
 
				+                    'category': category,
			
 
				+                    'confidence': confidence,
			
 
				+                    'source_tool': tool_name
			
 
				+                })
			
 
				+        
			
 
				+    return parsed_data
			
 
				+
			
 
				+def detect_mineru_structure(data: Union[List, Dict]) -> bool:
			
 
				+    """检测是否为MinerU数据结构"""
			
 
				+    if not isinstance(data, list) or len(data) == 0:
			
 
				+        return False
			
 
				+    
			
 
				+    # 检查第一个元素是否包含MinerU特征字段
			
 
				+    first_item = data[0] if data else {}
			
 
				+    if not isinstance(first_item, dict):
			
 
				+        return False
			
 
				+    
			
 
				+    # MinerU特征：包含type字段，且值为text/table/image之一
			
 
				+    has_type = 'type' in first_item
			
 
				+    has_bbox = 'bbox' in first_item
			
 
				+    has_text = 'text' in first_item
			
 
				+    
			
 
				+    if has_type and has_bbox and has_text:
			
 
				+        item_type = first_item.get('type', '')
			
 
				+        return item_type in ['text', 'table', 'image']
			
 
				+    
			
 
				+    return False
			
 
				+
			
 
				+def detect_ocr_tool_type(data: Union[List, Dict], config: Dict) -> str:
			
 
				+    """
			
 
				+    自动检测OCR工具类型 - 增强版
			
 
				+    
			
 
				+    Args:
			
 
				+        data: OCR数据（可能是列表或字典）
			
 
				+        config: 配置字典
			
 
				+    
			
 
				+    Returns:
			
 
				+        工具类型字符串
			
 
				+    """
			
 
				+    if not config['ocr']['auto_detection']['enabled']:
			
 
				+        return 'mineru'  # 默认类型
			
 
				+    
			
 
				+    rules = config['ocr']['auto_detection']['rules']
			
 
				+    
			
 
				+    # 按优先级排序
			
 
				+    sorted_rules = sorted(rules, key=lambda x: x.get('priority', 999))
			
 
				+    
			
 
				+    for rule in sorted_rules:
			
 
				+        tool_type = rule['tool_type']
			
 
				+        conditions = rule.get('conditions', [])
			
 
				+        
			
 
				+        # 检查所有条件是否满足
			
 
				+        if _check_all_conditions(data, conditions):
			
 
				+            return tool_type
			
 
				+    
			
 
				+    # 如果所有规则都不匹配，返回默认类型
			
 
				+    return 'dots_ocr'
			
 
				+
			
 
				+
			
 
				+def _check_all_conditions(data: Union[List, Dict], conditions: List[Dict]) -> bool:
			
 
				+    """
			
 
				+    检查所有条件是否满足
			
 
				+    
			
 
				+    Args:
			
 
				+        data: 数据
			
 
				+        conditions: 条件列表
			
 
				+    
			
 
				+    Returns:
			
 
				+        是否所有条件都满足
			
 
				+    """
			
 
				+    for condition in conditions:
			
 
				+        condition_type = condition.get('type', '')
			
 
				+        
			
 
				+        if condition_type == 'field_exists':
			
 
				+            # 检查字段存在
			
 
				+            field = condition.get('field', '')
			
 
				+            if not _check_field_exists(data, field):
			
 
				+                return False
			
 
				+        
			
 
				+        elif condition_type == 'field_not_exists':
			
 
				+            # 检查字段不存在
			
 
				+            field = condition.get('field', '')
			
 
				+            if _check_field_exists(data, field):
			
 
				+                return False
			
 
				+        
			
 
				+        elif condition_type == 'json_structure':
			
 
				+            # 检查JSON结构类型
			
 
				+            expected_structure = condition.get('structure', '')
			
 
				+            if expected_structure == 'array' and not isinstance(data, list):
			
 
				+                return False
			
 
				+            elif expected_structure == 'object' and not isinstance(data, dict):
			
 
				+                return False
			
 
				+        
			
 
				+        elif condition_type == 'field_value':
			
 
				+            # 检查字段值
			
 
				+            field = condition.get('field', '')
			
 
				+            expected_value = condition.get('value')
			
 
				+            actual_value = _get_field_value(data, field)
			
 
				+            if actual_value != expected_value:
			
 
				+                return False
			
 
				+        
			
 
				+        elif condition_type == 'field_contains':
			
 
				+            # 检查字段包含某个值
			
 
				+            field = condition.get('field', '')
			
 
				+            expected_values = condition.get('values', [])
			
 
				+            actual_value = _get_field_value(data, field)
			
 
				+            if actual_value not in expected_values:
			
 
				+                return False
			
 
				+    
			
 
				+    return True
			
 
				+
			
 
				+
			
 
				+def _check_field_exists(data: Union[List, Dict], field_path: str) -> bool:
			
 
				+    """
			
 
				+    检查字段是否存在（支持嵌套路径）
			
 
				+    
			
 
				+    Args:
			
 
				+        data: 数据
			
 
				+        field_path: 字段路径（支持点分隔，如 "doc_preprocessor_res.angle"）
			
 
				+    
			
 
				+    Returns:
			
 
				+        字段是否存在
			
 
				+    """
			
 
				+    if not field_path:
			
 
				+        return False
			
 
				+    
			
 
				+    # 处理数组情况：检查第一个元素
			
 
				+    if isinstance(data, list):
			
 
				+        if not data:
			
 
				+            return False
			
 
				+        data = data[0]
			
 
				+    
			
 
				+    # 处理嵌套字段路径
			
 
				+    fields = field_path.split('.')
			
 
				+    current = data
			
 
				+    
			
 
				+    for field in fields:
			
 
				+        if isinstance(current, dict) and field in current:
			
 
				+            current = current[field]
			
 
				+        else:
			
 
				+            return False
			
 
				+    
			
 
				+    return True
			
 
				+
			
 
				+
			
 
				+def _get_field_value(data: Union[List, Dict], field_path: str):
			
 
				+    """
			
 
				+    获取字段值（支持嵌套路径）
			
 
				+    
			
 
				+    Args:
			
 
				+        data: 数据
			
 
				+        field_path: 字段路径
			
 
				+    
			
 
				+    Returns:
			
 
				+        字段值，如果不存在返回 None
			
 
				+    """
			
 
				+    if not field_path:
			
 
				+        return None
			
 
				+    
			
 
				+    # 处理数组情况：检查第一个元素
			
 
				+    if isinstance(data, list):
			
 
				+        if not data:
			
 
				+            return None
			
 
				+        data = data[0]
			
 
				+    
			
 
				+    # 处理嵌套字段路径
			
 
				+    fields = field_path.split('.')
			
 
				+    current = data
			
 
				+    
			
 
				+    for field in fields:
			
 
				+        if isinstance(current, dict) and field in current:
			
 
				+            current = current[field]
			
 
				+        else:
			
 
				+            return None
			
 
				+    
			
 
				+    return current
			
 
				+
			
 
				+def normalize_ocr_data(raw_data: Union[List, Dict], config: Dict) -> List[Dict]:
			
 
				+    """标准化OCR数据 - 支持多种工具"""
			
 
				+    tool_type = detect_ocr_tool_type(raw_data, config)
			
 
				+    
			
 
				+    if tool_type == 'dots_ocr':
			
 
				+        return parse_dots_ocr_data(raw_data, config, tool_type)
			
 
				+    elif tool_type == 'ppstructv3':
			
 
				+        return parse_ppstructv3_data(raw_data, config)
			
 
				+    elif tool_type == 'table_recognition_v2':
			
 
				+        return parse_table_recognition_v2_data(raw_data, config)
			
 
				+    elif tool_type == 'mineru':
			
 
				+        return parse_mineru_data(raw_data, config, tool_type)
			
 
				+    else:
			
 
				+        raise ValueError(f"不支持的OCR工具类型: {tool_type}")
			
 
				+
			
 
				+
			
 
				+def get_rotation_angle_from_ppstructv3(data: Dict) -> float:
			
 
				+    """从PPStructV3数据中获取旋转角度"""
			
 
				+    if 'doc_preprocessor_res' in data:
			
 
				+        doc_res = data['doc_preprocessor_res']
			
 
				+        if isinstance(doc_res, dict) and 'angle' in doc_res:
			
 
				+            return float(doc_res['angle'])
			
 
				+    return 0.0
			
 
				+
			
 
				+# 修改 load_ocr_data_file 函数
			
 
				+def load_ocr_data_file(json_path: str, config: Dict) -> Tuple[List, str, str]:
			
 
				+    """加载OCR数据文件 - 支持多数据源配置"""
			
 
				+    json_file = Path(json_path)
			
 
				+    
			
 
				+    if not json_file.exists():
			
 
				+        raise FileNotFoundError(f"找不到JSON文件: {json_path}")
			
 
				+    
			
 
				+    # 加载JSON数据
			
 
				+    try:
			
 
				+        with open(json_file, 'r', encoding='utf-8') as f:
			
 
				+            raw_data = json.load(f)
			
 
				+            
			
 
				+        # 统一数据格式
			
 
				+        ocr_data = normalize_ocr_data(raw_data, config)
			
 
				+        
			
 
				+        # 检查是否需要处理图像旋转
			
 
				+        rotation_angle = 0.0
			
 
				+        if isinstance(raw_data, dict):
			
 
				+            rotation_angle = get_rotation_angle_from_ppstructv3(raw_data)
			
 
				+            
			
 
				+        # 如果有旋转角度，记录下来供后续图像处理使用
			
 
				+        if rotation_angle != 0:
			
 
				+            for item in ocr_data:
			
 
				+                item['rotation_angle'] = rotation_angle
			
 
				+        
			
 
				+    except Exception as e:
			
 
				+        raise Exception(f"加载JSON文件失败: {e}")
			
 
				+    
			
 
				+    # 加载MD文件
			
 
				+    md_file = json_file.with_suffix('.md')
			
 
				+    md_content = ""
			
 
				+    if md_file.exists():
			
 
				+        with open(md_file, 'r', encoding='utf-8') as f:
			
 
				+            md_content = f.read()
			
 
				+        
			
 
				+        # ✅ 关键修改：处理MD内容中的所有图片引用
			
 
				+        md_content = process_all_images_in_content(md_content, str(json_file))
			
 
				+    
			
 
				+    # 查找对应的图片文件
			
 
				+    image_path = find_corresponding_image(json_file, config)
			
 
				+    
			
 
				+    return ocr_data, md_content, image_path
			
 
				+
			
 
				+def find_corresponding_image(json_file: Path, config: Dict) -> str:
			
 
				+    """查找对应的图片文件 - 支持多数据源"""
			
 
				+    # 从配置中获取图片目录
			
 
				+    src_img_dir = config.get('paths', {}).get('src_img_dir', '')
			
 
				+    
			
 
				+    if not src_img_dir:
			
 
				+        # 如果没有配置图片目录，尝试在JSON文件同级目录查找
			
 
				+        src_img_dir = json_file.parent
			
 
				+    
			
 
				+    src_img_path = Path(src_img_dir)
			
 
				+    
			
 
				+    # 支持多种图片格式
			
 
				+    image_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif']
			
 
				+    
			
 
				+    for ext in image_extensions:
			
 
				+        image_file = src_img_path / f"{json_file.stem}{ext}"
			
 
				+        if image_file.exists():
			
 
				+            return str(image_file)
			
 
				+    
			
 
				+    # 如果找不到，返回空字符串
			
 
				+    return ""
			
 
				+
			
 
				+def process_ocr_data(ocr_data: List, config: Dict) -> Dict[str, List]:
			
 
				+    """处理OCR数据，建立文本到bbox的映射"""
			
 
				+    text_bbox_mapping = {}
			
 
				+    exclude_texts = config['ocr']['exclude_texts']
			
 
				+    min_text_length = config['ocr']['min_text_length']
			
 
				+    
			
 
				+    if not isinstance(ocr_data, list):
			
 
				+        return text_bbox_mapping
			
 
				+    
			
 
				+    for i, item in enumerate(ocr_data):
			
 
				+        if not isinstance(item, dict):
			
 
				+            continue
			
 
				+            
			
 
				+        text = str(item['text']).strip()
			
 
				+        if text and text not in exclude_texts and len(text) >= min_text_length:
			
 
				+            bbox = item['bbox']
			
 
				+            if isinstance(bbox, list) and len(bbox) == 4:
			
 
				+                if text not in text_bbox_mapping:
			
 
				+                    text_bbox_mapping[text] = []
			
 
				+                text_bbox_mapping[text].append({
			
 
				+                    'matched_text': item.get('matched_text', ''),
			
 
				+                    'bbox': bbox,
			
 
				+                    'category': item.get('category', 'Text'),
			
 
				+                    'index': i,
			
 
				+                    'confidence': item.get('confidence', config['ocr']['default_confidence']),
			
 
				+                    'source_tool': item.get('source_tool', 'unknown'),
			
 
				+                    'rotation_angle': item.get('rotation_angle', 0.0)  # 添加旋转角度信息
			
 
				+                })
			
 
				+    
			
 
				+    return text_bbox_mapping
			
 
				+
			
 
				+
			
 
				+def find_available_ocr_files(ocr_out_dir: str) -> List[str]:
			
 
				+    """查找可用的OCR文件"""
			
 
				+    available_files = []
			
 
				+    
			
 
				+    # 搜索多个可能的目录
			
 
				+    search_dirs = [
			
 
				+        Path(ocr_out_dir),
			
 
				+    ]
			
 
				+    
			
 
				+    for search_dir in search_dirs:
			
 
				+        if search_dir.exists():
			
 
				+            # 递归搜索JSON文件
			
 
				+            for json_file in search_dir.rglob("*.json"):
			
 
				+                if re.match(r'.*_page_\d+\.json$', json_file.name, re.IGNORECASE):
			
 
				+                    available_files.append(str(json_file))
			
 
				+    # 去重并排序
			
 
				+    # available_files = sorted(list(set(available_files)))
			
 
				+    # 解析文件名并提取页码信息
			
 
				+    file_info = []
			
 
				+    for file_path in available_files:
			
 
				+        file_name = Path(file_path).stem
			
 
				+        # 提取页码 (例如从 "2023年度报告母公司_page_001" 中提取 "001")
			
 
				+        if 'page_' in file_name:
			
 
				+            try:
			
 
				+                page_part = file_name.split('page_')[-1]
			
 
				+                page_num = int(page_part)
			
 
				+                file_info.append({
			
 
				+                    'path': file_path,
			
 
				+                    'page': page_num,
			
 
				+                    'display_name': f"第{page_num}页"
			
 
				+                })
			
 
				+            except ValueError:
			
 
				+                # 如果无法解析页码，使用文件名
			
 
				+                file_info.append({
			
 
				+                    'path': file_path,
			
 
				+                    'page': len(file_info) + 1,
			
 
				+                    'display_name': Path(file_path).stem
			
 
				+                })
			
 
				+        else:
			
 
				+            # 对于没有page_的文件，按顺序编号
			
 
				+            file_info.append({
			
 
				+                'path': file_path,
			
 
				+                'page': len(file_info) + 1,
			
 
				+                'display_name': Path(file_path).stem
			
 
				+        })
			
 
				+            
			
 
				+    # 按页码排序
			
 
				+    file_info.sort(key=lambda x: x['page'])
			
 
				+
			
 
				+    return file_info
			
 
				+
			
 
				+
			
 
				+def get_ocr_tool_info(ocr_data: List) -> Dict:
			
 
				+    """获取OCR工具信息统计"""
			
 
				+    tool_counts = {}
			
 
				+    for item in ocr_data:
			
 
				+        if isinstance(item, dict):
			
 
				+            source_tool = item.get('source_tool', 'unknown')
			
 
				+            tool_counts[source_tool] = tool_counts.get(source_tool, 0) + 1
			
 
				+    
			
 
				+    return tool_counts
			
 
				+
			
 
				+
			
 
				+def get_ocr_statistics(ocr_data: List, text_bbox_mapping: Dict, marked_errors: set) -> Dict:
			
 
				+    """获取OCR数据统计信息"""
			
 
				+    if not isinstance(ocr_data, list) or not ocr_data:
			
 
				+        return {
			
 
				+            'total_texts': 0, 'clickable_texts': 0, 'marked_errors': 0,
			
 
				+            'categories': {}, 'accuracy_rate': 0, 'tool_info': {}
			
 
				+        }
			
 
				+    
			
 
				+    total_texts = len(ocr_data)
			
 
				+    clickable_texts = len(text_bbox_mapping)
			
 
				+    marked_errors_count = len(marked_errors)
			
 
				+    
			
 
				+    # 按类别统计
			
 
				+    categories = {}
			
 
				+    for item in ocr_data:
			
 
				+        if isinstance(item, dict):
			
 
				+            category = item.get('category', 'Unknown')
			
 
				+            categories[category] = categories.get(category, 0) + 1
			
 
				+    
			
 
				+    # OCR工具信息统计
			
 
				+    tool_info = get_ocr_tool_info(ocr_data)
			
 
				+    
			
 
				+    accuracy_rate = (clickable_texts - marked_errors_count) / clickable_texts * 100 if clickable_texts > 0 else 0
			
 
				+    
			
 
				+    return {
			
 
				+        'total_texts': total_texts,
			
 
				+        'clickable_texts': clickable_texts,
			
 
				+        'marked_errors': marked_errors_count,
			
 
				+        'categories': categories,
			
 
				+        'accuracy_rate': accuracy_rate,
			
 
				+        'tool_info': tool_info
			
 
				+    }
			
 
				+
			
 
				+def group_texts_by_category(text_bbox_mapping: Dict[str, List]) -> Dict[str, List[str]]:
			
 
				+    """按类别对文本进行分组"""
			
 
				+    categories = {}
			
 
				+    for text, info_list in text_bbox_mapping.items():
			
 
				+        category = info_list[0]['category']
			
 
				+        if category not in categories:
			
 
				+            categories[category] = []
			
 
				+        categories[category].append(text)
			
 
				+    return categories
			
 
				+
			
 
				+
			
 
				+def get_ocr_tool_rotation_config(ocr_data: List, config: Dict) -> Dict:
			
 
				+    """获取OCR工具的旋转配置"""
			
 
				+    if not ocr_data or not isinstance(ocr_data, list):
			
 
				+        # 默认配置
			
 
				+        return {
			
 
				+            'coordinates_are_pre_rotated': False
			
 
				+        }
			
 
				+    
			
 
				+    # 从第一个OCR数据项获取工具类型
			
 
				+    first_item = ocr_data[0] if ocr_data else {}
			
 
				+    source_tool = first_item.get('source_tool', 'dots_ocr')
			
 
				+    
			
 
				+    # 获取工具配置
			
 
				+    tools_config = config.get('ocr', {}).get('tools', {})
			
 
				+    
			
 
				+    if source_tool in tools_config:
			
 
				+        tool_config = tools_config[source_tool]
			
 
				+        return tool_config.get('rotation', {
			
 
				+            'coordinates_are_pre_rotated': False
			
 
				+        })
			
 
				+    else:
			
 
				+        # 默认配置
			
 
				+        return {
			
 
				+            'coordinates_are_pre_rotated': False
			
 
				+        }
			
 
				+
			
 
				+# ocr_validator_utils.py
			
 
				+def find_available_ocr_files_multi_source(config: Dict) -> Dict[str, List[Dict]]:
			
 
				+    """查找多个数据源的OCR文件"""
			
 
				+    all_sources = {}
			
 
				+    
			
 
				+    for source in config.get('data_sources', []):
			
 
				+        source_name = source['name']
			
 
				+        ocr_tool = source['ocr_tool']
			
 
				+        source_key = f"{source_name}"
			
 
				+        
			
 
				+        ocr_out_dir = source['ocr_out_dir']
			
 
				+        
			
 
				+        if Path(ocr_out_dir).exists():
			
 
				+            files = find_available_ocr_files(ocr_out_dir)
			
 
				+            
			
 
				+            # 为每个文件添加数据源信息
			
 
				+            for file_info in files:
			
 
				+                file_info.update({
			
 
				+                    'source_name': source_name,
			
 
				+                    'ocr_tool': ocr_tool,
			
 
				+                    'description': source.get('description', ''),
			
 
				+                    'src_img_dir': source.get('src_img_dir', ''),
			
 
				+                    'ocr_out_dir': ocr_out_dir
			
 
				+                })
			
 
				+            
			
 
				+            all_sources[source_key] = {
			
 
				+                'files': files,
			
 
				+                'config': source
			
 
				+            }
			
 
				+            
			
 
				+            print(f"📁 找到数据源: {source_key} - {len(files)} 个文件")
			
 
				+    
			
 
				+    return all_sources
			
 
				+
			
 
				+def get_data_source_display_name(source_config: Dict) -> str:
			
 
				+    """生成数据源的显示名称"""
			
 
				+    name = source_config['name']
			
 
				+    tool = source_config['ocr_tool']
			
 
				+    description = source_config.get('description', '')
			
 
				+    
			
 
				+    # 获取工具的友好名称
			
 
				+    tool_name_map = {
			
 
				+        'dots_ocr': 'Dots OCR',
			
 
				+        'ppstructv3': 'PPStructV3',
			
 
				+        'table_recognition_v2': 'Table Recognition V2',
			
 
				+        'mineru': 'MinerU VLM-2.5.3'
			
 
				+    }
			
 
				+    
			
 
				+    tool_display = tool_name_map.get(tool, tool)
			
 
				+    return f"{name} ({tool_display})"
			
 
				+
			
 
				+def get_nested_value(data: Dict, path: str, default=None):
			
 
				+    if not path:
			
 
				+        return default
			
 
				+    keys = path.split('.')
			
 
				+    value = data
			
 
				+    for key in keys:
			
 
				+        if isinstance(value, dict) and key in value:
			
 
				+            value = value[key]
			
 
				+        else:
			
 
				+            return default
			
 
				+    return value
			
--- a/ocr_validator/run_streamlit_validator.py
+++ b/ocr_validator/run_streamlit_validator.py
@@ -0,0 +1,93 @@
 
				+#!/usr/bin/env python3
			
 
				+"""
			
 
				+Streamlit OCR校验工具启动脚本
			
 
				+"""
			
 
				+
			
 
				+import subprocess
			
 
				+import sys
			
 
				+import os
			
 
				+from pathlib import Path
			
 
				+
			
 
				+def check_streamlit():
			
 
				+    """检查Streamlit是否已安装"""
			
 
				+    try:
			
 
				+        import streamlit
			
 
				+        return True
			
 
				+    except ImportError:
			
 
				+        return False
			
 
				+
			
 
				+def install_dependencies():
			
 
				+    """安装必要的依赖"""
			
 
				+    print("📦 安装Streamlit依赖...")
			
 
				+    
			
 
				+    dependencies = [
			
 
				+        "streamlit",
			
 
				+        "plotly", 
			
 
				+        "pandas",
			
 
				+        "pillow",
			
 
				+        "opencv-python"
			
 
				+    ]
			
 
				+    
			
 
				+    for dep in dependencies:
			
 
				+        try:
			
 
				+            print(f"   安装 {dep}...")
			
 
				+            subprocess.check_call([sys.executable, "-m", "pip", "install", dep], 
			
 
				+                                stdout=subprocess.DEVNULL, 
			
 
				+                                stderr=subprocess.DEVNULL)
			
 
				+            print(f"   ✅ {dep} 安装成功")
			
 
				+        except subprocess.CalledProcessError:
			
 
				+            print(f"   ❌ {dep} 安装失败")
			
 
				+            return False
			
 
				+    
			
 
				+    return True
			
 
				+
			
 
				+def main():
			
 
				+    """主函数"""
			
 
				+    print("🚀 启动Streamlit OCR校验工具")
			
 
				+    print("=" * 50)
			
 
				+    
			
 
				+    # 检查Streamlit
			
 
				+    if not check_streamlit():
			
 
				+        print("❌ Streamlit未安装")
			
 
				+        choice = input("是否自动安装依赖？(y/n): ")
			
 
				+        if choice.lower() == 'y':
			
 
				+            if not install_dependencies():
			
 
				+                print("❌ 依赖安装失败，请手动安装:")
			
 
				+                print("pip install streamlit plotly pandas pillow opencv-python")
			
 
				+                return
			
 
				+        else:
			
 
				+            print("请手动安装依赖:")
			
 
				+            print("pip install streamlit plotly pandas pillow opencv-python")
			
 
				+            return
			
 
				+    
			
 
				+    # 检查OCR数据
			
 
				+    ocr_out_dir = Path("ocr_out_dir")
			
 
				+    if not ocr_out_dir.exists() or not any(ocr_out_dir.rglob("*.json")):
			
 
				+        print("⚠️  未找到OCR数据文件")
			
 
				+        print("   请先运行OCR处理生成数据文件")
			
 
				+        print("   python3 ocr_by_vlm.py sample_data/your_image.png")
			
 
				+        print("")
			
 
				+    
			
 
				+    # 启动Streamlit
			
 
				+    print("🌐 启动Streamlit应用...")
			
 
				+    print("   浏览器将自动打开 http://localhost:8501")
			
 
				+    print("   按 Ctrl+C 停止应用")
			
 
				+    print("")
			
 
				+    
			
 
				+    try:
			
 
				+        # 启动Streamlit应用
			
 
				+        subprocess.run([
			
 
				+            sys.executable, "-m", "streamlit", "run", 
			
 
				+            "streamlit_ocr_validator.py",
			
 
				+            "--server.headless", "false",
			
 
				+            "--server.port", "8501"
			
 
				+        ])
			
 
				+    except KeyboardInterrupt:
			
 
				+        print("\n👋 应用已停止")
			
 
				+    except FileNotFoundError:
			
 
				+        print("❌ streamlit_ocr_validator.py 文件不存在")
			
 
				+    except Exception as e:
			
 
				+        print(f"❌ 启动失败: {e}")
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    main()
			
--- a/ocr_validator/streamlit_ocr_validator.py
+++ b/ocr_validator/streamlit_ocr_validator.py
@@ -0,0 +1,356 @@
 
				+#!/usr/bin/env python3
			
 
				+"""
			
 
				+基于Streamlit的OCR可视化校验工具（主入口）
			
 
				+"""
			
 
				+
			
 
				+import streamlit as st
			
 
				+from pathlib import Path
			
 
				+import json
			
 
				+import sys
			
 
				+
			
 
				+# Put the ocr_platform root directory on sys.path (needed to import ocr_utils).
				+# resolve() makes the path absolute; with a relative path, parents[1] could raise IndexError.
				+_file_path = Path(__file__).resolve()
				+ocr_platform_root = _file_path.parents[1]  # streamlit_ocr_validator.py -> ocr_validator -> ocr_platform
				+if str(ocr_platform_root) not in sys.path:
				+    sys.path.insert(0, str(ocr_platform_root))
			
 
				+
			
 
				+from streamlit_validator_core import StreamlitOCRValidator
			
 
				+from streamlit_validator_ui import (
			
 
				+    setup_page_config, create_data_source_selector, message_box
			
 
				+)
			
 
				+from streamlit_validator_table import display_html_table_as_dataframe
			
 
				+from streamlit_validator_cross import (
			
 
				+    cross_validation_dialog, show_batch_cross_validation_results_dialog
			
 
				+)
			
 
				+from streamlit_validator_result import display_single_page_cross_validation
			
 
				+from ocr_validator_utils import get_data_source_display_name
			
 
				+from config_manager import load_config  # 🎯 使用新配置管理器
			
 
				+
			
 
				+
			
 
				+def reset_cross_validation_results():
			
 
				+    """重置交叉验证结果"""
			
 
				+    if 'cross_validation_batch_result' in st.session_state:
			
 
				+        st.session_state.cross_validation_batch_result = None
			
 
				+        print("🔄 数据源已变更，交叉验证结果已清空")
			
 
				+
			
 
				+
			
 
				+def main():
			
 
				+    """主应用"""
			
 
				+    # 🎯 初始化配置管理器
			
 
				+    if 'config_manager' not in st.session_state:
			
 
				+        try:
			
 
				+            st.session_state.config_manager = load_config(config_dir="config")
			
 
				+            # 🎯 生成 OCRValidator 所需的配置
			
 
				+            st.session_state.validator_config = st.session_state.config_manager.to_validator_config()
			
 
				+            print("✅ 配置管理器初始化成功")
			
 
				+            print(f"📄 发现 {len(st.session_state.config_manager.list_documents())} 个文档配置")
			
 
				+            print(f"🔧 发现 {len(st.session_state.config_manager.list_ocr_tools())} 个 OCR 工具")
			
 
				+        except Exception as e:
			
 
				+            st.error(f"❌ 配置加载失败: {e}")
			
 
				+            st.stop()
			
 
				+    
			
 
				+    config_manager = st.session_state.config_manager
			
 
				+    validator_config = st.session_state.validator_config
			
 
				+    
			
 
				+    # 初始化应用
			
 
				+    if 'validator' not in st.session_state:
			
 
				+        # 🎯 直接传递配置字典给 OCRValidator
			
 
				+        validator = StreamlitOCRValidator(config_dict=validator_config)
			
 
				+        st.session_state.validator = validator
			
 
				+        setup_page_config(validator_config)
			
 
				+        
			
 
				+        # 页面标题
			
 
				+        st.title(validator_config['ui']['page_title'])
			
 
				+        
			
 
				+        # 初始化数据源追踪
			
 
				+        st.session_state.current_ocr_source = validator.current_source_key
			
 
				+        st.session_state.current_verify_source = validator.verify_source_key
			
 
				+    else:
			
 
				+        validator = st.session_state.validator
			
 
				+    
			
 
				+    if 'selected_text' not in st.session_state:
			
 
				+        st.session_state.selected_text = None
			
 
				+        st.session_state.compact_search_query = None
			
 
				+    if 'marked_errors' not in st.session_state:
			
 
				+        st.session_state.marked_errors = set()
			
 
				+    
			
 
				+    # 数据源选择器
			
 
				+    create_data_source_selector(validator)
			
 
				+    
			
 
				+    # ✅ 检测数据源是否变更
			
 
				+    ocr_source_changed = False
			
 
				+    verify_source_changed = False
			
 
				+    
			
 
				+    if 'current_ocr_source' in st.session_state:
			
 
				+        if st.session_state.current_ocr_source != validator.current_source_key:
			
 
				+            ocr_source_changed = True
			
 
				+            st.session_state.current_ocr_source = validator.current_source_key
			
 
				+            print(f"🔄 OCR数据源已切换到: {validator.current_source_key}")
			
 
				+    
			
 
				+    if 'current_verify_source' in st.session_state:
			
 
				+        if st.session_state.current_verify_source != validator.verify_source_key:
			
 
				+            verify_source_changed = True
			
 
				+            st.session_state.current_verify_source = validator.verify_source_key
			
 
				+            print(f"🔄 验证数据源已切换到: {validator.verify_source_key}")
			
 
				+    
			
 
				+    # ✅ 如果任一数据源变更，清空交叉验证结果
			
 
				+    if ocr_source_changed or verify_source_changed:
			
 
				+        reset_cross_validation_results()
			
 
				+        
			
 
				+        # 显示提示信息
			
 
				+        if ocr_source_changed and verify_source_changed:
			
 
				+            st.info("ℹ️ OCR数据源和验证数据源已变更，请重新运行交叉验证")
			
 
				+        elif ocr_source_changed:
			
 
				+            st.info("ℹ️ OCR数据源已变更，请重新运行交叉验证")
			
 
				+        elif verify_source_changed:
			
 
				+            st.info("ℹ️ 验证数据源已变更，请重新运行交叉验证")
			
 
				+    
			
 
				+    # 如果没有可用的数据源，提前返回
			
 
				+    if not validator.all_sources:
			
 
				+        st.warning("⚠️ 未找到任何数据源，请检查配置文件")
			
 
				+        
			
 
				+        # 🎯 显示配置信息帮助调试
			
 
				+        with st.expander("🔍 配置信息", expanded=True):
			
 
				+            st.write("**已加载的文档:**")
			
 
				+            docs = config_manager.list_documents()
			
 
				+            if docs:
			
 
				+                for doc in docs:
			
 
				+                    doc_config = config_manager.get_document(doc)
			
 
				+                    st.write(f"- **{doc}**")
			
 
				+                    st.write(f"  - 基础目录: `{doc_config.base_dir}`")
			
 
				+                    st.write(f"  - OCR 结果: {len([r for r in doc_config.ocr_results if r.enabled])} 个已启用")
			
 
				+            else:
			
 
				+                st.write("无")
			
 
				+            
			
 
				+            st.write("**已加载的 OCR 工具:**")
			
 
				+            tools = config_manager.list_ocr_tools()
			
 
				+            if tools:
			
 
				+                for tool in tools:
			
 
				+                    tool_config = config_manager.get_ocr_tool(tool)
			
 
				+                    st.write(f"- **{tool_config.name}** (`{tool}`)")
			
 
				+            else:
			
 
				+                st.write("无")
			
 
				+            
			
 
				+            st.write("**配置文件路径:**")
			
 
				+            st.code(str(config_manager.config_dir / "global.yaml"))
			
 
				+            
			
 
				+            st.write("**生成的数据源:**")
			
 
				+            data_sources = config_manager.get_data_sources()
			
 
				+            if data_sources:
			
 
				+                for ds in data_sources:
			
 
				+                    st.write(f"- `{ds.name}`")
			
 
				+                    st.write(f"  - 工具: {ds.ocr_tool}")
			
 
				+                    st.write(f"  - 结果目录: {ds.ocr_out_dir}")
			
 
				+                    st.write(f"  - 图片目录: {ds.src_img_dir}")
			
 
				+            else:
			
 
				+                st.write("无")
			
 
				+        
			
 
				+        st.stop()
			
 
				+    
			
 
				+    # 文件选择区域
			
 
				+    with st.container(height=75, horizontal=True, horizontal_alignment='left', gap="medium"):
			
 
				+        if 'selected_file_index' not in st.session_state:
			
 
				+            st.session_state.selected_file_index = 0
			
 
				+            st.session_state.file_selectbox = 0
			
 
				+            
			
 
				+        if validator.display_options:
			
 
				+            # 确保 selected_file_index 在有效范围内
			
 
				+            if st.session_state.selected_file_index >= len(validator.display_options):
			
 
				+                st.session_state.selected_file_index = 0
			
 
				+                st.session_state.file_selectbox = 0
			
 
				+            
			
 
				+            # 使用独立的 key 给 selectbox，避免 Streamlit 锁定 selected_file_index
			
 
				+            # 在创建 selectbox 之前，同步 file_selectbox 的值到 selected_file_index
			
 
				+            if 'file_selectbox' not in st.session_state:
			
 
				+                st.session_state.file_selectbox = st.session_state.selected_file_index
			
 
				+            elif st.session_state.file_selectbox != st.session_state.selected_file_index:
			
 
				+                # 如果 selected_file_index 被外部更新（如通过页码输入），同步到 file_selectbox
			
 
				+                st.session_state.file_selectbox = st.session_state.selected_file_index
			
 
				+            
			
 
				+            selected_index = st.selectbox(
			
 
				+                "选择OCR结果文件", 
			
 
				+                range(len(validator.display_options)),
			
 
				+                format_func=lambda i: validator.display_options[i],
			
 
				+                index=st.session_state.selected_file_index,
			
 
				+                key="file_selectbox",
			
 
				+                label_visibility="collapsed"
			
 
				+            )
			
 
				+            
			
 
				+            # 手动同步 selectbox 的值到 selected_file_index
			
 
				+            if selected_index != st.session_state.selected_file_index:
			
 
				+                st.session_state.selected_file_index = selected_index
			
 
				+            
			
 
				+            selected_file = validator.file_paths[selected_index]
			
 
				+            current_page = validator.file_info[selected_index]['page']
			
 
				+            
			
 
				+            # 初始化或同步页码值
			
 
				+            if 'page_input_value' not in st.session_state:
			
 
				+                st.session_state.page_input_value = current_page
			
 
				+            
			
 
				+            # 如果当前页码与 session_state 中的值不一致，更新 session_state
			
 
				+            # 这会在下拉框改变或通过其他方式改变文件时同步页码
			
 
				+            if current_page != st.session_state.page_input_value:
			
 
				+                st.session_state.page_input_value = current_page
			
 
				+            
			
 
				+            page_input = st.number_input(
			
 
				+                "输入页码", 
			
 
				+                placeholder="输入页码", 
			
 
				+                label_visibility="collapsed",
			
 
				+                min_value=1, 
			
 
				+                max_value=len(validator.display_options), 
			
 
				+                value=st.session_state.page_input_value, 
			
 
				+                step=1,
			
 
				+                key="page_input"
			
 
				+            )
			
 
				+            
			
 
				+            # 更新 session_state 中的页码值
			
 
				+            if page_input != st.session_state.page_input_value:
			
 
				+                st.session_state.page_input_value = page_input
			
 
				+            
			
 
				+            if page_input != current_page:
			
 
				+                for i, info in enumerate(validator.file_info):
			
 
				+                    if info['page'] == page_input:
			
 
				+                        # 更新 selected_file_index，selectbox 会在下一个运行周期自动同步
			
 
				+                        st.session_state.selected_file_index = i
			
 
				+                        selected_file = validator.file_paths[i]
			
 
				+                        # 同步页码值
			
 
				+                        st.session_state.page_input_value = page_input
			
 
				+                        st.rerun()
			
 
				+                        break
			
 
				+
			
 
				+            if (st.session_state.selected_file_index >= 0
			
 
				+                and validator.selected_file_index != st.session_state.selected_file_index
			
 
				+                and selected_file):
			
 
				+                validator.selected_file_index = st.session_state.selected_file_index
			
 
				+                st.session_state.validator.load_ocr_data(selected_file)
			
 
				+                
			
 
				+                current_source_name = get_data_source_display_name(validator.current_source_config)
			
 
				+                st.success(f"✅ 已加载 {current_source_name} - 第{validator.file_info[st.session_state.selected_file_index]['page']}页")
			
 
				+                st.rerun()
			
 
				+        else:
			
 
				+            st.warning("当前数据源中未找到OCR结果文件")
			
 
				+
			
 
				+        # ✅ 交叉验证按钮 - 添加数据源检查
			
 
				+        cross_validation_enabled = (
			
 
				+            validator.current_source_key != validator.verify_source_key 
			
 
				+            and validator.image_path 
			
 
				+            and validator.md_content
			
 
				+        )
			
 
				+        
			
 
				+        if st.button(
			
 
				+            "交叉验证", 
			
 
				+            type="primary", 
			
 
				+            icon=":material/compare_arrows:",
			
 
				+            disabled=not cross_validation_enabled,
			
 
				+            help="需要选择不同的OCR数据源和验证数据源" if not cross_validation_enabled else "开始批量交叉验证"
			
 
				+        ):
			
 
				+            cross_validation_dialog(validator)
			
 
				+
			
 
				+        # ✅ 查看验证结果按钮 - 检查是否有验证结果
			
 
				+        has_validation_results = (
			
 
				+            'cross_validation_batch_result' in st.session_state 
			
 
				+            and st.session_state.cross_validation_batch_result is not None
			
 
				+        )
			
 
				+        
			
 
				+        if st.button(
			
 
				+            "查看验证结果", 
			
 
				+            type="secondary", 
			
 
				+            icon=":material/quick_reference_all:",
			
 
				+            disabled=not has_validation_results,
			
 
				+            help="暂无验证结果，请先运行交叉验证" if not has_validation_results else "查看批量验证结果"
			
 
				+        ):
			
 
				+            show_batch_cross_validation_results_dialog()
			
 
				+
			
 
				+    # 显示当前数据源统计信息
			
 
				+    with st.expander("OCR工具统计信息", expanded=False):
			
 
				+        stats = validator.get_statistics()
			
 
				+        col1, col2, col3, col4, col5 = st.columns(5)
			
 
				+        
			
 
				+        with col1:
			
 
				+            st.metric("📊 总文本块", stats['total_texts'])
			
 
				+        with col2:
			
 
				+            st.metric("🔗 可点击文本", stats['clickable_texts'])
			
 
				+        with col3:
			
 
				+            st.metric("❌ 标记错误", stats['marked_errors'])
			
 
				+        with col4:
			
 
				+            st.metric("✅ 准确率", f"{stats['accuracy_rate']:.1f}%")
			
 
				+        with col5:
			
 
				+            if validator.current_source_config:
			
 
				+                tool_id = validator.current_source_config['ocr_tool']
			
 
				+                # 🎯 从配置管理器获取工具名称
			
 
				+                tool_config = config_manager.get_ocr_tool(tool_id)
			
 
				+                tool_display = tool_config.name if tool_config else tool_id.upper()
			
 
				+                st.metric("🔧 OCR工具", tool_display)
			
 
				+        
			
 
				+        if stats['tool_info']:
			
 
				+            st.write("**详细信息:**", stats['tool_info'])
			
 
				+        
			
 
				+        # 🎯 显示当前文档和 OCR 结果信息
			
 
				+        if validator.current_source_config:
			
 
				+            source_name = validator.current_source_config['name']
			
 
				+            # 解析数据源名称，提取文档名（更精确的解析）
			
 
				+            parts = source_name.split('_', 1)
			
 
				+            doc_name = parts[0] if parts else source_name
			
 
				+            
			
 
				+            doc_config = config_manager.get_document(doc_name)
			
 
				+            if doc_config:
			
 
				+                st.write("**文档信息:**")
			
 
				+                st.write(f"- 文档名称: {doc_config.name}")
			
 
				+                st.write(f"- 基础目录: {doc_config.base_dir}")
			
 
				+                st.write(f"- 可用 OCR 工具: {len([r for r in doc_config.ocr_results if r.enabled])} 个")
			
 
				+    
			
 
				+    # 🎯 添加配置管理面板
			
 
				+    with st.expander("⚙️ 配置管理", expanded=False):
			
 
				+        col1, col2 = st.columns(2)
			
 
				+        
			
 
				+        with col1:
			
 
				+            st.subheader("📄 已加载文档")
			
 
				+            docs = config_manager.list_documents()
			
 
				+            for doc_name in docs:
			
 
				+                doc_config = config_manager.get_document(doc_name)
			
 
				+                enabled_count = len([r for r in doc_config.ocr_results if r.enabled])
			
 
				+                total_count = len(doc_config.ocr_results)
			
 
				+                
			
 
				+                with st.container():
			
 
				+                    st.write(f"✅ **{doc_name}**")
			
 
				+                    st.caption(f"📊 {enabled_count}/{total_count} 工具已启用")
			
 
				+                    
			
 
				+                    # 显示每个 OCR 工具的状态
			
 
				+                    for ocr_result in doc_config.ocr_results:
			
 
				+                        status_icon = "🟢" if ocr_result.enabled else "⚪"
			
 
				+                        tool_config = config_manager.get_ocr_tool(ocr_result.tool)
			
 
				+                        tool_name = tool_config.name if tool_config else ocr_result.tool
			
 
				+                        st.caption(f"  {status_icon} {tool_name} - {ocr_result.description or ocr_result.result_dir}")
			
 
				+        
			
 
				+        with col2:
			
 
				+            st.subheader("🔧 已加载 OCR 工具")
			
 
				+            tools = config_manager.list_ocr_tools()
			
 
				+            for tool_id in tools:
			
 
				+                tool_config = config_manager.get_ocr_tool(tool_id)
			
 
				+                with st.container():
			
 
				+                    st.write(f"🔧 **{tool_config.name}**")
			
 
				+                    st.caption(f"ID: `{tool_id}`")
			
 
				+                    st.caption(f"描述: {tool_config.description}")
			
 
				+    
			
 
				+    tab1, tab2, tab3 = st.tabs(["📄 内容人工检查", "🔍 交叉验证结果", "📊 表格分析"])
			
 
				+    
			
 
				+    with tab1:
			
 
				+        validator.create_compact_layout(validator_config)
			
 
				+
			
 
				+    with tab2:
			
 
				+        # ✅ 使用封装的函数显示单页交叉验证结果
			
 
				+        display_single_page_cross_validation(validator, validator_config)
			
 
				+
			
 
				+    with tab3:
			
 
				+        st.header("📊 表格数据分析")
			
 
				+        
			
 
				+        if validator.md_content and '<table' in validator.md_content.lower():
			
 
				+            st.subheader("🔍 表格数据预览")
			
 
				+            display_html_table_as_dataframe(validator.md_content)
			
 
				+        else:
			
 
				+            st.info("当前OCR结果中没有检测到表格数据")
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    main()
			
--- a/ocr_validator/streamlit_validator_core.py
+++ b/ocr_validator/streamlit_validator_core.py
@@ -0,0 +1,155 @@
 
				+"""
			
 
				+核心验证器类
			
 
				+"""
			
 
				+import streamlit as st
			
 
				+from pathlib import Path
			
 
				+from typing import Dict, List, Optional
			
 
				+import json
			
 
				+import sys
			
 
				+
			
 
				+# 添加 ocr_platform 根目录到 Python 路径（用于导入 ocr_utils）
			
 
				+# 使用 resolve() 确保路径是绝对路径，避免相对路径导致的 IndexError
			
 
				+_file_path = Path(__file__).resolve()
			
 
				+ocr_platform_root = _file_path.parents[1]  # streamlit_validator_core.py -> ocr_validator -> ocr_platform
			
 
				+if str(ocr_platform_root) not in sys.path:
			
 
				+    sys.path.insert(0, str(ocr_platform_root))
			
 
				+
			
 
				+from ocr_validator_utils import (
			
 
				+    load_ocr_data_file, process_ocr_data,
			
 
				+    get_ocr_statistics, find_available_ocr_files_multi_source, 
			
 
				+    get_data_source_display_name
			
 
				+)
			
 
				+from ocr_validator_layout import OCRLayoutManager
			
 
				+
			
 
				+
			
 
				+class StreamlitOCRValidator:
			
 
				+    """核心验证器类"""
			
 
				+    
			
 
				+    def __init__(self, config_dict: Dict = None):  # 🎯 参数名改为 config_dict
			
 
				+        """
			
 
				+        初始化验证器
			
 
				+        
			
 
				+        Args:
			
 
				+            config_dict: 配置字典（从 ConfigManager.to_validator_config() 生成）
			
 
				+        """
			
 
				+        self.config = config_dict  # 🎯 直接赋值
			
 
				+        self.ocr_data = []
			
 
				+        self.md_content = ""
			
 
				+        self.image_path = ""
			
 
				+        self.text_bbox_mapping = {}
			
 
				+        self.selected_text = None
			
 
				+        self.marked_errors = set()
			
 
				+        
			
 
				+        # 多数据源相关
			
 
				+        self.all_sources = {}
			
 
				+        self.current_source_key = None
			
 
				+        self.current_source_config = None
			
 
				+        self.file_info = []
			
 
				+        self.selected_file_index = -1
			
 
				+        self.display_options = []
			
 
				+        self.file_paths = []
			
 
				+        
			
 
				+        # 交叉验证数据源
			
 
				+        self.verify_source_key = None
			
 
				+        self.verify_source_config = None
			
 
				+        self.verify_file_info = []
			
 
				+        self.verify_display_options = []
			
 
				+        self.verify_file_paths = []
			
 
				+
			
 
				+        # 初始化布局管理器
			
 
				+        self.layout_manager = OCRLayoutManager(self)
			
 
				+
			
 
				+        # 加载多数据源文件信息
			
 
				+        self.load_multi_source_info()
			
 
				+        
			
 
				+    def load_multi_source_info(self):
			
 
				+        """加载多数据源文件信息"""
			
 
				+        self.all_sources = find_available_ocr_files_multi_source(self.config)
			
 
				+        
			
 
				+        if self.all_sources:
			
 
				+            source_keys = list(self.all_sources.keys())
			
 
				+            first_source_key = source_keys[0]
			
 
				+            self.switch_to_source(first_source_key)
			
 
				+            
			
 
				+            if len(source_keys) > 1:
			
 
				+                self.switch_to_verify_source(source_keys[1])
			
 
				+    
			
 
				+    def switch_to_source(self, source_key: str):
			
 
				+        """切换到指定OCR数据源"""
			
 
				+        if source_key in self.all_sources:
			
 
				+            self.current_source_key = source_key
			
 
				+            source_data = self.all_sources[source_key]
			
 
				+            self.current_source_config = source_data['config']
			
 
				+            self.file_info = source_data['files']
			
 
				+            
			
 
				+            if self.file_info:
			
 
				+                self.display_options = [f"{info['display_name']}" for info in self.file_info]
			
 
				+                self.file_paths = [info['path'] for info in self.file_info]
			
 
				+                self.selected_file_index = -1
			
 
				+                print(f"✅ 切换到OCR数据源: {source_key}")
			
 
				+            else:
			
 
				+                print(f"⚠️ 数据源 {source_key} 没有可用文件")
			
 
				+        else:
			
 
				+            raise FileNotFoundError(f"找不到文件路径: {source_key}")
			
 
				+    
			
 
				+    def switch_to_verify_source(self, source_key: str):
			
 
				+        """切换到指定验证数据源"""
			
 
				+        if source_key in self.all_sources:
			
 
				+            self.verify_source_key = source_key
			
 
				+            source_data = self.all_sources[source_key]
			
 
				+            self.verify_source_config = source_data['config']
			
 
				+            self.verify_file_info = source_data['files']
			
 
				+            
			
 
				+            if self.verify_file_info:
			
 
				+                self.verify_display_options = [f"{info['display_name']}" for info in self.verify_file_info]
			
 
				+                self.verify_file_paths = [info['path'] for info in self.verify_file_info]
			
 
				+                print(f"✅ 切换到验证数据源: {source_key}")
			
 
				+            else:
			
 
				+                print(f"⚠️ 验证数据源 {source_key} 没有可用文件")
			
 
				+        else:
			
 
				+            raise FileNotFoundError(f"找不到文件路径: {source_key}")
			
 
				+
			
 
				+    def load_ocr_data(self, json_path: str, md_path: Optional[str] = None, image_path: Optional[str] = None):
			
 
				+        """加载OCR相关数据"""
			
 
				+        try:
			
 
				+            if self.current_source_config:
			
 
				+                temp_config = self.config.copy()
			
 
				+                temp_config['paths'] = {
			
 
				+                    'ocr_out_dir': self.current_source_config['ocr_out_dir'],
			
 
				+                    'src_img_dir': self.current_source_config.get('src_img_dir', ''),
			
 
				+                    'pre_validation_dir': self.config['pre_validation']['out_dir']
			
 
				+                }
			
 
				+                temp_config['current_ocr_tool'] = self.current_source_config['ocr_tool']
			
 
				+                
			
 
				+                self.ocr_data, self.md_content, self.image_path = load_ocr_data_file(json_path, temp_config)
			
 
				+            else:
			
 
				+                self.ocr_data, self.md_content, self.image_path = load_ocr_data_file(json_path, self.config)
			
 
				+                
			
 
				+            self.process_data()
			
 
				+        except Exception as e:
			
 
				+            st.error(f"❌ 加载失败: {e}")
			
 
				+            st.exception(e)
			
 
				+    
			
 
				+    def process_data(self):
			
 
				+        """处理OCR数据"""
			
 
				+        self.text_bbox_mapping = process_ocr_data(self.ocr_data, self.config)
			
 
				+    
			
 
				+    def get_statistics(self) -> Dict:
			
 
				+        """获取统计信息"""
			
 
				+        return get_ocr_statistics(self.ocr_data, self.text_bbox_mapping, self.marked_errors)
			
 
				+    
			
 
				+    def find_verify_md_path(self, selected_file_index: int) -> Optional[Path]:
			
 
				+        """查找当前OCR文件对应的验证文件路径"""
			
 
				+        current_page = self.file_info[selected_file_index]['page']
			
 
				+        verify_md_path = None
			
 
				+
			
 
				+        for i, info in enumerate(self.verify_file_info):
			
 
				+            if info['page'] == current_page:
			
 
				+                verify_md_path = Path(self.verify_file_paths[i]).with_suffix('.md')
			
 
				+                break
			
 
				+
			
 
				+        return verify_md_path
			
 
				+
			
 
				+    def create_compact_layout(self, config):
			
 
				+        """创建紧凑布局"""
			
 
				+        return self.layout_manager.create_compact_layout(config)
			
--- a/ocr_validator/streamlit_validator_cross.py
+++ b/ocr_validator/streamlit_validator_cross.py
@@ -0,0 +1,463 @@
 
				+"""
			
 
				+交叉验证功能模块
			
 
				+"""
			
 
				+import streamlit as st
			
 
				+import pandas as pd
			
 
				+import json
			
 
				+from pathlib import Path
			
 
				+from io import BytesIO
			
 
				+import plotly.express as px
			
 
				+import sys
			
 
				+
			
 
				+# 添加 ocr_platform 根目录到 Python 路径（用于导入 ocr_utils）
			
 
				+# 使用 resolve() 确保路径是绝对路径，避免相对路径导致的 IndexError
			
 
				+_file_path = Path(__file__).resolve()
			
 
				+ocr_platform_root = _file_path.parents[1]  # streamlit_validator_cross.py -> ocr_validator -> ocr_platform
			
 
				+if str(ocr_platform_root) not in sys.path:
			
 
				+    sys.path.insert(0, str(ocr_platform_root))
			
 
				+
			
 
				+# 从 ocr_comparator 导入（已迁移到 ocr_platform）
			
 
				+from ocr_comparator import compare_ocr_results
			
 
				+
			
 
				+from ocr_validator_utils import get_data_source_display_name
			
 
				+
			
 
				+
			
 
				+@st.dialog("交叉验证", width="large", dismissible=True, on_dismiss="rerun")
			
 
				+def cross_validation_dialog(validator):
			
 
				+    """交叉验证对话框"""
			
 
				+    if validator.current_source_key == validator.verify_source_key:
			
 
				+        st.error("❌ OCR数据源和验证数据源不能相同")
			
 
				+        return
			
 
				+    
			
 
				+    if 'cross_validation_batch_result' not in st.session_state:
			
 
				+        st.session_state.cross_validation_batch_result = None
			
 
				+    
			
 
				+    st.header("🔄 批量交叉验证")
			
 
				+    
			
 
				+    col1, col2 = st.columns(2)
			
 
				+    with col1:
			
 
				+        st.info(f"**OCR数据源:** {get_data_source_display_name(validator.current_source_config)}")
			
 
				+        st.write(f"📁 文件数量: {len(validator.file_info)}")
			
 
				+    with col2:
			
 
				+        st.info(f"**验证数据源:** {get_data_source_display_name(validator.verify_source_config)}")
			
 
				+        st.write(f"📁 文件数量: {len(validator.verify_file_info)}")
			
 
				+    
			
 
				+    with st.expander("⚙️ 验证选项", expanded=True):
			
 
				+        col1, col2 = st.columns(2)
			
 
				+        with col1:
			
 
				+            table_mode = st.selectbox(
			
 
				+                "表格比对模式",
			
 
				+                options=['standard', 'flow_list'],
			
 
				+                index=1,
			
 
				+                format_func=lambda x: '流水表格模式' if x == 'flow_list' else '标准模式',
			
 
				+                help="选择表格比对算法"
			
 
				+            )
			
 
				+        with col2:
			
 
				+            similarity_algorithm = st.selectbox(
			
 
				+                "相似度算法",
			
 
				+                options=['ratio', 'partial_ratio', 'token_sort_ratio', 'token_set_ratio'],
			
 
				+                index=0,
			
 
				+                help="选择文本相似度计算算法"
			
 
				+            )
			
 
				+    
			
 
				+    if st.button("🚀 开始批量验证", type="primary", width='stretch'):
			
 
				+        run_batch_cross_validation(validator, table_mode, similarity_algorithm)
			
 
				+    
			
 
				+    if 'cross_validation_batch_result' in st.session_state and st.session_state.cross_validation_batch_result:
			
 
				+        st.markdown("---")
			
 
				+        display_batch_validation_results(st.session_state.cross_validation_batch_result)
			
 
				+
			
 
				+
			
 
				+def run_batch_cross_validation(validator, table_mode: str, similarity_algorithm: str):
			
 
				+    """执行批量交叉验证"""
			
 
				+    pre_validation_dir = Path(validator.config['pre_validation'].get('out_dir', './output/pre_validation/')).resolve()
			
 
				+    pre_validation_dir.mkdir(parents=True, exist_ok=True)
			
 
				+    
			
 
				+    batch_results = _initialize_batch_results(validator, table_mode, similarity_algorithm)
			
 
				+    
			
 
				+    progress_bar = st.progress(0)
			
 
				+    status_text = st.empty()
			
 
				+    
			
 
				+    ocr_page_map = {info['page']: i for i, info in enumerate(validator.file_info)}
			
 
				+    verify_page_map = {info['page']: i for i, info in enumerate(validator.verify_file_info)}
			
 
				+    
			
 
				+    common_pages = sorted(set(ocr_page_map.keys()) & set(verify_page_map.keys()))
			
 
				+    
			
 
				+    if not common_pages:
			
 
				+        st.error("❌ 两个数据源没有共同的页码，无法进行对比")
			
 
				+        return
			
 
				+    
			
 
				+    batch_results['summary']['total_pages'] = len(common_pages)
			
 
				+    
			
 
				+    with st.expander("📋 详细对比日志", expanded=True):
			
 
				+        log_container = st.container()
			
 
				+    
			
 
				+    for idx, page_num in enumerate(common_pages):
			
 
				+        try:
			
 
				+            progress = (idx + 1) / len(common_pages)
			
 
				+            progress_bar.progress(progress)
			
 
				+            status_text.text(f"正在对比第 {page_num} 页... ({idx + 1}/{len(common_pages)})")
			
 
				+            
			
 
				+            ocr_file_index = ocr_page_map[page_num]
			
 
				+            verify_file_index = verify_page_map[page_num]
			
 
				+            
			
 
				+            ocr_md_path = Path(validator.file_paths[ocr_file_index]).with_suffix('.md')
			
 
				+            verify_md_path = Path(validator.verify_file_paths[verify_file_index]).with_suffix('.md')
			
 
				+            
			
 
				+            if not ocr_md_path.exists() or not verify_md_path.exists():
			
 
				+                with log_container:
			
 
				+                    st.warning(f"⚠️ 第 {page_num} 页：文件不存在，跳过")
			
 
				+                batch_results['summary']['failed_pages'] += 1
			
 
				+                continue
			
 
				+            
			
 
				+            comparison_result_path = pre_validation_dir / f"{ocr_md_path.stem}_cross_validation"
			
 
				+            
			
 
				+            import io
			
 
				+            import contextlib
			
 
				+            
			
 
				+            output_buffer = io.StringIO()
			
 
				+            
			
 
				+            with contextlib.redirect_stdout(output_buffer):
			
 
				+                comparison_result = compare_ocr_results(
			
 
				+                    file1_path=str(ocr_md_path),
			
 
				+                    file2_path=str(verify_md_path),
			
 
				+                    output_file=str(comparison_result_path),
			
 
				+                    output_format='both',
			
 
				+                    ignore_images=True,
			
 
				+                    table_mode=table_mode,
			
 
				+                    similarity_algorithm=similarity_algorithm
			
 
				+                )
			
 
				+            
			
 
				+            _process_comparison_result(batch_results, comparison_result, page_num, 
			
 
				+                                      ocr_md_path, verify_md_path, comparison_result_path)
			
 
				+            
			
 
				+            with log_container:
			
 
				+                if comparison_result['statistics']['total_differences'] == 0:
			
 
				+                    st.success(f"✅ 第 {page_num} 页：完全匹配")
			
 
				+                else:
			
 
				+                    st.warning(f"⚠️ 第 {page_num} 页：发现 {comparison_result['statistics']['total_differences']} 个差异")
			
 
				+            
			
 
				+        except Exception as e:
			
 
				+            with log_container:
			
 
				+                st.error(f"❌ 第 {page_num} 页：对比失败 - {str(e)}")
			
 
				+            
			
 
				+            batch_results['pages'].append({
			
 
				+                'page_num': page_num,
			
 
				+                'status': 'failed',
			
 
				+                'error': str(e)
			
 
				+            })
			
 
				+            batch_results['summary']['failed_pages'] += 1
			
 
				+    
			
 
				+    _save_batch_results(validator, batch_results, pre_validation_dir)
			
 
				+    
			
 
				+    progress_bar.progress(1.0)
			
 
				+    status_text.text("✅ 批量验证完成！")
			
 
				+    
			
 
				+    st.success(f"🎉 批量验证完成！成功: {batch_results['summary']['successful_pages']}, 失败: {batch_results['summary']['failed_pages']}")
			
 
				+
			
 
				+
			
 
				+def _initialize_batch_results(validator, table_mode: str, similarity_algorithm: str) -> dict:
			
 
				+    """初始化批量结果存储"""
			
 
				+    return {
			
 
				+        'ocr_source': get_data_source_display_name(validator.current_source_config),
			
 
				+        'verify_source': get_data_source_display_name(validator.verify_source_config),
			
 
				+        'table_mode': table_mode,
			
 
				+        'similarity_algorithm': similarity_algorithm,
			
 
				+        'timestamp': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
			
 
				+        'pages': [],
			
 
				+        'summary': {
			
 
				+            'total_pages': 0,
			
 
				+            'successful_pages': 0,
			
 
				+            'failed_pages': 0,
			
 
				+            'total_differences': 0,
			
 
				+            'total_table_differences': 0,
			
 
				+            'total_amount_differences': 0,
			
 
				+            'total_datetime_differences': 0,
			
 
				+            'total_text_differences': 0,
			
 
				+            'total_paragraph_differences': 0,
			
 
				+            'total_table_pre_header': 0,
			
 
				+            'total_table_header_position': 0,
			
 
				+            'total_table_header_critical': 0,
			
 
				+            'total_table_row_missing': 0,
			
 
				+            'total_high_severity': 0,
			
 
				+            'total_medium_severity': 0,
			
 
				+            'total_low_severity': 0
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+
			
 
				+def _process_comparison_result(batch_results: dict, comparison_result: dict, page_num: int,
			
 
				+                               ocr_md_path: Path, verify_md_path: Path, comparison_result_path: Path):
			
 
				+    """处理对比结果"""
			
 
				+    stats = comparison_result['statistics']
			
 
				+    
			
 
				+    page_result = {
			
 
				+        'page_num': page_num,
			
 
				+        'ocr_file': str(ocr_md_path.name),
			
 
				+        'verify_file': str(verify_md_path.name),
			
 
				+        'total_differences': stats['total_differences'],
			
 
				+        'table_differences': stats['table_differences'],
			
 
				+        'amount_differences': stats.get('amount_differences', 0),
			
 
				+        'datetime_differences': stats.get('datetime_differences', 0),
			
 
				+        'text_differences': stats.get('text_differences', 0),
			
 
				+        'paragraph_differences': stats['paragraph_differences'],
			
 
				+        'table_pre_header': stats.get('table_pre_header', 0),
			
 
				+        'table_header_position': stats.get('table_header_position', 0),
			
 
				+        'table_header_critical': stats.get('table_header_critical', 0),
			
 
				+        'table_row_missing': stats.get('table_row_missing', 0),
			
 
				+        'high_severity': stats.get('high_severity', 0),
			
 
				+        'medium_severity': stats.get('medium_severity', 0),
			
 
				+        'low_severity': stats.get('low_severity', 0),
			
 
				+        'status': 'success',
			
 
				+        'comparison_json': f"{comparison_result_path}.json",
			
 
				+        'comparison_md': f"{comparison_result_path}.md"
			
 
				+    }
			
 
				+    
			
 
				+    batch_results['pages'].append(page_result)
			
 
				+    batch_results['summary']['successful_pages'] += 1
			
 
				+    
			
 
				+    # 更新汇总统计
			
 
				+    for key in stats:
			
 
				+        total_key = f'total_{key}'
			
 
				+        if total_key in batch_results['summary']:
			
 
				+            batch_results['summary'][total_key] += stats.get(key, 0)
			
 
				+
			
 
				+
			
 
				+def _save_batch_results(validator, batch_results: dict, pre_validation_dir: Path):
			
 
				+    """保存批量结果"""
			
 
				+    batch_result_path = pre_validation_dir / f"{validator.current_source_config['name']}_{validator.current_source_config['ocr_tool']}_vs_{validator.verify_source_config['ocr_tool']}_batch_cross_validation"
			
 
				+    
			
 
				+    with open(f"{batch_result_path}.json", "w", encoding="utf-8") as f:
			
 
				+        json.dump(batch_results, f, ensure_ascii=False, indent=2)
			
 
				+    
			
 
				+    generate_batch_validation_markdown(batch_results, f"{batch_result_path}.md")
			
 
				+    
			
 
				+    st.session_state.cross_validation_batch_result = batch_results
			
 
				+
			
 
				+
			
 
				+def generate_batch_validation_markdown(batch_results: dict, output_path: str):
			
 
				+    """生成批量验证的Markdown报告"""
			
 
				+    with open(output_path, "w", encoding="utf-8") as f:
			
 
				+        f.write("# 批量交叉验证报告\n\n")
			
 
				+        
			
 
				+        # 基本信息
			
 
				+        f.write("## 📋 基本信息\n\n")
			
 
				+        f.write(f"- **OCR数据源:** {batch_results['ocr_source']}\n")
			
 
				+        f.write(f"- **验证数据源:** {batch_results['verify_source']}\n")
			
 
				+        f.write(f"- **表格模式:** {batch_results['table_mode']}\n")
			
 
				+        f.write(f"- **相似度算法:** {batch_results['similarity_algorithm']}\n")
			
 
				+        f.write(f"- **验证时间:** {batch_results['timestamp']}\n\n")
			
 
				+        
			
 
				+        # 汇总统计
			
 
				+        summary = batch_results['summary']
			
 
				+        f.write("## 📊 汇总统计\n\n")
			
 
				+        f.write(f"- **总页数:** {summary['total_pages']}\n")
			
 
				+        f.write(f"- **成功页数:** {summary['successful_pages']}\n")
			
 
				+        f.write(f"- **失败页数:** {summary['failed_pages']}\n")
			
 
				+        f.write(f"- **总差异数:** {summary['total_differences']}\n")
			
 
				+        f.write(f"- **表格差异:** {summary['total_table_differences']}\n")
			
 
				+        f.write(f"  - 金额差异: {summary.get('total_amount_differences', 0)}\n")
			
 
				+        f.write(f"  - 日期差异: {summary.get('total_datetime_differences', 0)}\n")
			
 
				+        f.write(f"  - 文本差异: {summary.get('total_text_differences', 0)}\n")
			
 
				+        f.write(f"  - 表头前差异: {summary.get('total_table_pre_header', 0)}\n")
			
 
				+        f.write(f"  - 表头位置差异: {summary.get('total_table_header_position', 0)}\n")
			
 
				+        f.write(f"  - 表头严重错误: {summary.get('total_table_header_critical', 0)}\n")
			
 
				+        f.write(f"  - 行缺失: {summary.get('total_table_row_missing', 0)}\n")
			
 
				+        f.write(f"- **段落差异:** {summary['total_paragraph_differences']}\n")
			
 
				+        f.write(f"- **严重程度统计:**\n")
			
 
				+        f.write(f"  - 高严重度: {summary.get('total_high_severity', 0)}\n")
			
 
				+        f.write(f"  - 中严重度: {summary.get('total_medium_severity', 0)}\n")
			
 
				+        f.write(f"  - 低严重度: {summary.get('total_low_severity', 0)}\n\n")
			
 
				+        
			
 
				+        # 详细结果表格
			
 
				+        f.write("## 📄 各页差异统计\n\n")
			
 
				+        f.write("| 页码 | 状态 | 总差异 | 表格差异 | 金额 | 日期 | 文本 | 段落 | 表头前 | 表头位置 | 表头错误 | 行缺失 | 高 | 中 | 低 |\n")
			
 
				+        f.write("|------|------|--------|----------|------|------|------|------|--------|----------|----------|--------|----|----|----|\n")
			
 
				+        
			
 
				+        for page in batch_results['pages']:
			
 
				+            if page['status'] == 'success':
			
 
				+                status_icon = "✅" if page['total_differences'] == 0 else "⚠️"
			
 
				+                f.write(f"| {page['page_num']} | {status_icon} | ")
			
 
				+                f.write(f"{page['total_differences']} | ")
			
 
				+                f.write(f"{page['table_differences']} | ")
			
 
				+                f.write(f"{page.get('amount_differences', 0)} | ")
			
 
				+                f.write(f"{page.get('datetime_differences', 0)} | ")
			
 
				+                f.write(f"{page.get('text_differences', 0)} | ")
			
 
				+                f.write(f"{page['paragraph_differences']} | ")
			
 
				+                f.write(f"{page.get('table_pre_header', 0)} | ")
			
 
				+                f.write(f"{page.get('table_header_position', 0)} | ")
			
 
				+                f.write(f"{page.get('table_header_critical', 0)} | ")
			
 
				+                f.write(f"{page.get('table_row_missing', 0)} | ")
			
 
				+                f.write(f"{page.get('high_severity', 0)} | ")
			
 
				+                f.write(f"{page.get('medium_severity', 0)} | ")
			
 
				+                f.write(f"{page.get('low_severity', 0)} |\n")
			
 
				+            else:
			
 
				+                f.write(f"| {page['page_num']} | ❌ | - | - | - | - | - | - | - | - | - | - | - | - | - |\n")
			
 
				+        
			
 
				+        f.write("\n")
			
 
				+        
			
 
				+        # 问题汇总
			
 
				+        f.write("## 🔍 问题汇总\n\n")
			
 
				+        
			
 
				+        high_diff_pages = [p for p in batch_results['pages'] 
			
 
				+                            if p['status'] == 'success' and p['total_differences'] > 10]
			
 
				+        if high_diff_pages:
			
 
				+            f.write("### ⚠️ 高差异页面（差异>10）\n\n")
			
 
				+            for page in high_diff_pages:
			
 
				+                f.write(f"- 第 {page['page_num']} 页：{page['total_differences']} 个差异\n")
			
 
				+            f.write("\n")
			
 
				+        
			
 
				+        amount_error_pages = [p for p in batch_results['pages'] 
			
 
				+                            if p['status'] == 'success' and p.get('amount_differences', 0) > 0]
			
 
				+        if amount_error_pages:
			
 
				+            f.write("### 💰 金额差异页面\n\n")
			
 
				+            for page in amount_error_pages:
			
 
				+                f.write(f"- 第 {page['page_num']} 页：{page.get('amount_differences', 0)} 个金额差异\n")
			
 
				+            f.write("\n")
			
 
				+        
			
 
				+        header_error_pages = [p for p in batch_results['pages'] 
			
 
				+                            if p['status'] == 'success' and p.get('table_header_critical', 0) > 0]
			
 
				+        if header_error_pages:
			
 
				+            f.write("### ❌ 表头严重错误页面\n\n")
			
 
				+            for page in header_error_pages:
			
 
				+                f.write(f"- 第 {page['page_num']} 页：{page['table_header_critical']} 个表头错误\n")
			
 
				+            f.write("\n")
			
 
				+        
			
 
				+        failed_pages = [p for p in batch_results['pages'] if p['status'] == 'failed']
			
 
				+        if failed_pages:
			
 
				+            f.write("### 💥 验证失败页面\n\n")
			
 
				+            for page in failed_pages:
			
 
				+                f.write(f"- 第 {page['page_num']} 页：{page.get('error', '未知错误')}\n")
			
 
				+            f.write("\n")
			
 
				+
			
 
				+
			
 
				+
			
 
				+def display_batch_validation_results(batch_results: dict):
			
 
				+    """显示批量验证结果"""
			
 
				+    st.header("📊 批量验证结果")
			
 
				+    
			
 
				+    summary = batch_results['summary']
			
 
				+    
			
 
				+    col1, col2, col3, col4 = st.columns(4)
			
 
				+    with col1:
			
 
				+        st.metric("总页数", summary['total_pages'])
			
 
				+    with col2:
			
 
				+        st.metric("成功页数", summary['successful_pages'], 
			
 
				+                 delta=f"{summary['successful_pages']/summary['total_pages']*100:.1f}%")
			
 
				+    with col3:
			
 
				+        st.metric("失败页数", summary['failed_pages'],
			
 
				+                 delta=f"-{summary['failed_pages']}" if summary['failed_pages'] > 0 else "0")
			
 
				+    with col4:
			
 
				+        st.metric("总差异数", summary['total_differences'])
			
 
				+    
			
 
				+    # ✅ 详细差异类型统计 - 更新展示
			
 
				+    st.subheader("📈 差异类型统计")
			
 
				+    
			
 
				+    col1, col2, col3 = st.columns(3)
			
 
				+    with col1:
			
 
				+        st.metric("表格差异", summary['total_table_differences'])
			
 
				+        st.caption(f"金额: {summary.get('total_amount_differences', 0)} | 日期: {summary.get('total_datetime_differences', 0)} | 文本: {summary.get('total_text_differences', 0)}")
			
 
				+    with col2:
			
 
				+        st.metric("段落差异", summary['total_paragraph_differences'])
			
 
				+    with col3:
			
 
				+        st.metric("严重度", f"高:{summary.get('total_high_severity', 0)} 中:{summary.get('total_medium_severity', 0)} 低:{summary.get('total_low_severity', 0)}")
			
 
				+    
			
 
				+    # 表格结构差异统计
			
 
				+    with st.expander("📋 表格结构差异详情", expanded=False):
			
 
				+        col1, col2, col3, col4 = st.columns(4)
			
 
				+        with col1:
			
 
				+            st.metric("表头前", summary.get('total_table_pre_header', 0))
			
 
				+        with col2:
			
 
				+            st.metric("表头位置", summary.get('total_table_header_position', 0))
			
 
				+        with col3:
			
 
				+            st.metric("表头错误", summary.get('total_table_header_critical', 0))
			
 
				+        with col4:
			
 
				+            st.metric("行缺失", summary.get('total_table_row_missing', 0))
			
 
				+    
			
 
				+    # ✅ 各页详细结果表格 - 更新列
			
 
				+    st.subheader("📄 各页详细结果")
			
 
				+    
			
 
				+    # 准备DataFrame
			
 
				+    page_data = []
			
 
				+    for page in batch_results['pages']:
			
 
				+        if page['status'] == 'success':
			
 
				+            page_data.append({
			
 
				+                '页码': page['page_num'],
			
 
				+                '状态': '✅ 成功' if page['total_differences'] == 0 else '⚠️ 有差异',
			
 
				+                '总差异': page['total_differences'],
			
 
				+                '表格差异': page['table_differences'],
			
 
				+                '金额': page.get('amount_differences', 0),
			
 
				+                '日期': page.get('datetime_differences', 0),
			
 
				+                '文本': page.get('text_differences', 0),
			
 
				+                '段落': page['paragraph_differences'],
			
 
				+                '表头前': page.get('table_pre_header', 0),
			
 
				+                '表头位置': page.get('table_header_position', 0),
			
 
				+                '表头错误': page.get('table_header_critical', 0),
			
 
				+                '行缺失': page.get('table_row_missing', 0),
			
 
				+                '高': page.get('high_severity', 0),
			
 
				+                '中': page.get('medium_severity', 0),
			
 
				+                '低': page.get('low_severity', 0)
			
 
				+            })
			
 
				+        else:
			
 
				+            page_data.append({
			
 
				+                '页码': page['page_num'],
			
 
				+                '状态': '❌ 失败',
			
 
				+                '总差异': '-', '表格差异': '-', '金额': '-', '日期': '-', 
			
 
				+                '文本': '-', '段落': '-', '表头前': '-', '表头位置': '-',
			
 
				+                '表头错误': '-', '行缺失': '-', '高': '-', '中': '-', '低': '-'
			
 
				+            })
			
 
				+    
			
 
				+    df_pages = pd.DataFrame(page_data)
			
 
				+    
			
 
				+    # 显示表格
			
 
				+    st.dataframe(
			
 
				+        df_pages,
			
 
				+        width='stretch',
			
 
				+        hide_index=True,
			
 
				+        column_config={
			
 
				+            "页码": st.column_config.NumberColumn("页码", width="small"),
			
 
				+            "状态": st.column_config.TextColumn("状态", width="small"),
			
 
				+            "总差异": st.column_config.NumberColumn("总差异", width="small"),
			
 
				+            "表格差异": st.column_config.NumberColumn("表格", width="small"),
			
 
				+            "金额": st.column_config.NumberColumn("金额", width="small"),
			
 
				+            "日期": st.column_config.NumberColumn("日期", width="small"),
			
 
				+            "文本": st.column_config.NumberColumn("文本", width="small"),
			
 
				+            "段落": st.column_config.NumberColumn("段落", width="small"),
			
 
				+        }
			
 
				+    )
			
 
				+    
			
 
				+    # 下载选项
			
 
				+    st.subheader("📥 导出报告")
			
 
				+    
			
 
				+    col1, col2 = st.columns(2)
			
 
				+    
			
 
				+    with col1:
			
 
				+        # 导出Excel
			
 
				+        excel_buffer = BytesIO()
			
 
				+        df_pages.to_excel(excel_buffer, index=False, sheet_name='验证结果')
			
 
				+        
			
 
				+        st.download_button(
			
 
				+            label="📊 下载Excel报告",
			
 
				+            data=excel_buffer.getvalue(),
			
 
				+            file_name=f"batch_validation_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.xlsx",
			
 
				+            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
			
 
				+        )
			
 
				+    
			
 
				+    with col2:
			
 
				+        # 导出JSON
			
 
				+        json_data = json.dumps(batch_results, ensure_ascii=False, indent=2)
			
 
				+        
			
 
				+        st.download_button(
			
 
				+            label="📄 下载JSON报告",
			
 
				+            data=json_data,
			
 
				+            file_name=f"batch_validation_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.json",
			
 
				+            mime="application/json"
			
 
				+    )
			
 
				+
			
 
				+@st.dialog("查看交叉验证结果", width="large", dismissible=True, on_dismiss="rerun")
			
 
				+def show_batch_cross_validation_results_dialog():
			
 
				+    """显示批量验证结果对话框"""
			
 
				+    if 'cross_validation_batch_result' in st.session_state and st.session_state.cross_validation_batch_result:
			
 
				+        display_batch_validation_results(st.session_state.cross_validation_batch_result)
			
 
				+    else:
			
 
				+        st.info("暂无交叉验证结果，请先运行交叉验证")
			
--- a/ocr_validator/streamlit_validator_result.py
+++ b/ocr_validator/streamlit_validator_result.py
@@ -0,0 +1,399 @@
 
				+"""
			
 
				+验证结果展示模块
			
 
				+"""
			
 
				+import streamlit as st
			
 
				+import pandas as pd
			
 
				+import plotly.express as px
			
 
				+from io import BytesIO
			
 
				+import json
			
 
				+import sys
			
 
				+from pathlib import Path
			
 
				+
			
 
				+# 添加 ocr_platform 根目录到 Python 路径（用于导入 ocr_utils）
			
 
				+# 使用 resolve() 确保路径是绝对路径，避免相对路径导致的 IndexError
			
 
				+_file_path = Path(__file__).resolve()
			
 
				+ocr_platform_root = _file_path.parents[1]  # streamlit_validator_result.py -> ocr_validator -> ocr_platform
			
 
				+if str(ocr_platform_root) not in sys.path:
			
 
				+    sys.path.insert(0, str(ocr_platform_root))
			
 
				+
			
 
				+# 从 ocr_utils 导入通用工具
			
 
				+from ocr_utils.html_utils import process_all_images_in_content
			
 
				+
			
 
				+def display_single_page_cross_validation(validator, config):
			
 
				+    """显示单页交叉验证结果
			
 
				+    
			
 
				+    Args:
			
 
				+        validator: OCR验证器实例
			
 
				+        config: 配置字典
			
 
				+    """
			
 
				+    current_md_path = Path(validator.file_paths[validator.selected_file_index]).with_suffix('.md')
			
 
				+    pre_validation_dir = Path(validator.config['pre_validation'].get('out_dir', './output/pre_validation/')).resolve()
			
 
				+    comparison_result_path = pre_validation_dir / f"{current_md_path.stem}_cross_validation.json"
			
 
				+    verify_md_path = validator.find_verify_md_path(validator.selected_file_index)
			
 
				+    
			
 
				+    # 检查验证结果是否与当前数据源匹配
			
 
				+    result_is_valid = False
			
 
				+    comparison_result = None
			
 
				+    
			
 
				+    if comparison_result_path.exists():
			
 
				+        try:
			
 
				+            with open(comparison_result_path, "r", encoding="utf-8") as f:
			
 
				+                comparison_result = json.load(f)
			
 
				+            
			
 
				+            # 检查文件路径是否匹配（验证结果是否为当前页面生成）
			
 
				+            if (comparison_result.get('file1_path') == str(current_md_path) and 
			
 
				+                comparison_result.get('file2_path') == str(verify_md_path)):
			
 
				+                result_is_valid = True
			
 
				+        except Exception as e:
			
 
				+            st.error(f"读取验证结果失败: {e}")
			
 
				+    
			
 
				+    if result_is_valid:
			
 
				+        _display_valid_cross_validation_result(
			
 
				+            validator, config, current_md_path, verify_md_path, comparison_result
			
 
				+        )
			
 
				+    else:
			
 
				+        _display_no_validation_result_prompt(validator)
			
 
				+
			
 
				+
			
 
				+def _display_valid_cross_validation_result(validator, config, current_md_path, verify_md_path, comparison_result):
			
 
				+    """显示有效的交叉验证结果
			
 
				+    
			
 
				+    Args:
			
 
				+        validator: OCR验证器实例
			
 
				+        config: 配置字典
			
 
				+        current_md_path: 当前OCR文件路径
			
 
				+        verify_md_path: 验证文件路径
			
 
				+        comparison_result: 对比结果字典
			
 
				+    """
			
 
				+    col1, col2 = st.columns([1, 1])
			
 
				+    
			
 
				+    # 左侧：原OCR识别结果
			
 
				+    with col1:
			
 
				+        st.subheader("🤖 原OCR识别结果")
			
 
				+        if current_md_path.exists():
			
 
				+            with open(current_md_path, "r", encoding="utf-8") as f:
			
 
				+                original_md_content = f.read()
			
 
				+                original_md_content = process_all_images_in_content(original_md_content, current_md_path)            
			
 
				+
			
 
				+            font_size = config['styles'].get('font_size', 10)
			
 
				+            height = config['styles']['layout'].get('default_height', 800)
			
 
				+            validator.layout_manager.render_content_by_mode(
			
 
				+                original_md_content, "HTML渲染", font_size, height, "compact"
			
 
				+            )
			
 
				+        else:
			
 
				+            st.error("原OCR文件不存在")
			
 
				+    
			
 
				+    # 右侧：验证识别结果
			
 
				+    with col2:
			
 
				+        st.subheader("🤖 验证识别结果")
			
 
				+        if verify_md_path and verify_md_path.exists():
			
 
				+            with open(str(verify_md_path), "r", encoding="utf-8") as f:
			
 
				+                verify_md_content = f.read()
			
 
				+                verify_md_content = process_all_images_in_content(verify_md_content, verify_md_path)
			
 
				+            
			
 
				+            font_size = config['styles'].get('font_size', 10)
			
 
				+            height = config['styles']['layout'].get('default_height', 800)
			
 
				+            validator.layout_manager.render_content_by_mode(
			
 
				+                verify_md_content, "HTML渲染", font_size, height, "compact"
			
 
				+            )
			
 
				+        else:
			
 
				+            st.warning("验证文件不存在")
			
 
				+    
			
 
				+    st.markdown("---")
			
 
				+    
			
 
				+    # 显示详细的对比结果
			
 
				+    display_comparison_results(comparison_result, detailed=True)
			
 
				+
			
 
				+
			
 
				+def _display_no_validation_result_prompt(validator):
			
 
				+    """显示无验证结果的提示信息
			
 
				+    
			
 
				+    Args:
			
 
				+        validator: OCR验证器实例
			
 
				+    """
			
 
				+    st.info("💡 暂无当前页面的交叉验证结果，请点击上方「交叉验证」按钮运行验证")
			
 
				+    
			
 
				+    # 显示当前数据源信息
			
 
				+    col1, col2 = st.columns(2)
			
 
				+    
			
 
				+    with col1:
			
 
				+        st.write("**当前OCR数据源:**")
			
 
				+        from ocr_validator_utils import get_data_source_display_name
			
 
				+        
			
 
				+        if validator.current_source_config and validator.file_info:
			
 
				+            current_source_name = get_data_source_display_name(validator.current_source_config)
			
 
				+            current_page = validator.file_info[validator.selected_file_index]['page']
			
 
				+            st.code(f"{current_source_name}\n第 {current_page} 页")
			
 
				+        else:
			
 
				+            st.warning("未选择OCR数据源")
			
 
				+    
			
 
				+    with col2:
			
 
				+        st.write("**当前验证数据源:**")
			
 
				+        if validator.verify_source_config:
			
 
				+            from ocr_validator_utils import get_data_source_display_name
			
 
				+            verify_source_name = get_data_source_display_name(validator.verify_source_config)
			
 
				+            st.code(verify_source_name)
			
 
				+        else:
			
 
				+            st.warning("未选择验证数据源")
			
 
				+    
			
 
				+    # 添加操作提示
			
 
				+    st.markdown("---")
			
 
				+    st.markdown("""
			
 
				+    ### 📝 操作步骤：
			
 
				+    
			
 
				+    1. **选择数据源**: 在页面顶部选择不同的OCR数据源和验证数据源
			
 
				+    2. **运行验证**: 点击「交叉验证」按钮开始批量验证
			
 
				+    3. **查看结果**: 验证完成后，在此处查看详细对比结果
			
 
				+    
			
 
				+    💡 **提示**: 
			
 
				+    - 确保两个数据源包含相同页码的文件
			
 
				+    - 建议选择不同OCR工具的结果进行交叉验证
			
 
				+    - 验证结果会自动保存，可随时查看
			
 
				+    """)
			
 
				+
			
 
				+
			
 
				+def display_comparison_results(comparison_result: dict, detailed: bool = True):
			
 
				+    """显示对比结果
			
 
				+    
			
 
				+    Args:
			
 
				+        comparison_result: 对比结果字典
			
 
				+        detailed: 是否显示详细信息
			
 
				+    """
			
 
				+    st.header("📊 交叉验证结果")
			
 
				+    
			
 
				+    stats = comparison_result['statistics']
			
 
				+    
			
 
				+    # 显示主要指标
			
 
				+    col1, col2, col3, col4 = st.columns(4)
			
 
				+    with col1:
			
 
				+        st.metric("总差异数", stats['total_differences'])
			
 
				+    with col2:
			
 
				+        st.metric("表格差异", stats['table_differences'])
			
 
				+    with col3:
			
 
				+        st.metric("金额差异", stats.get('amount_differences', 0))
			
 
				+    with col4:
			
 
				+        st.metric("段落差异", stats['paragraph_differences'])
			
 
				+    
			
 
				+    # 根据差异数量显示不同的提示
			
 
				+    if stats['total_differences'] == 0:
			
 
				+        st.success("🎉 完美匹配！两个数据源结果完全一致")
			
 
				+    else:
			
 
				+        st.warning(f"⚠️ 发现 {stats['total_differences']} 个差异，建议人工检查")
			
 
				+        
			
 
				+        if comparison_result['differences'] and detailed:
			
 
				+            _display_differences_dataframe(comparison_result)
			
 
				+            _display_difference_details(comparison_result)
			
 
				+            _display_difference_charts(comparison_result)
			
 
				+            _provide_download_options(comparison_result)
			
 
				+
			
 
				+
			
 
				+def _display_differences_dataframe(comparison_result: dict):
			
 
				+    """显示差异DataFrame"""
			
 
				+    st.subheader("🔍 差异详情对比")
			
 
				+    
			
 
				+    diff_data = []
			
 
				+    for i, diff in enumerate(comparison_result['differences'], 1):
			
 
				+        diff_data.append({
			
 
				+            '序号': i,
			
 
				+            '位置': diff['position'],
			
 
				+            '类型': diff['type'],
			
 
				+            '原OCR结果': diff['file1_value'][:100] + ('...' if len(diff['file1_value']) > 100 else ''),
			
 
				+            '验证结果': diff['file2_value'][:100] + ('...' if len(diff['file2_value']) > 100 else ''),
			
 
				+            '描述': diff['description'][:80] + ('...' if len(diff['description']) > 80 else ''),
			
 
				+            '严重程度': _get_severity_level(diff)
			
 
				+        })
			
 
				+    
			
 
				+    df_differences = pd.DataFrame(diff_data)
			
 
				+    
			
 
				+    def highlight_severity(val):
			
 
				+        if val == '高':
			
 
				+            return 'background-color: #ffebee; color: #c62828'
			
 
				+        elif val == '中':
			
 
				+            return 'background-color: #fff3e0; color: #ef6c00'
			
 
				+        elif val == '低':
			
 
				+            return 'background-color: #e8f5e8; color: #2e7d32'
			
 
				+        return ''
			
 
				+    
			
 
				+    styled_df = df_differences.style.map(
			
 
				+        highlight_severity, 
			
 
				+        subset=['严重程度']
			
 
				+    ).format({'序号': '{:d}'})
			
 
				+    
			
 
				+    st.dataframe(styled_df, width='stretch', height=400, hide_index=True)
			
 
				+
			
 
				+
			
 
				+def _display_difference_details(comparison_result: dict):
			
 
				+    """显示详细差异"""
			
 
				+    st.subheader("🔍 详细差异查看")
			
 
				+    
			
 
				+    selected_diff_index = st.selectbox(
			
 
				+        "选择要查看的差异:",
			
 
				+        options=range(len(comparison_result['differences'])),
			
 
				+        format_func=lambda x: f"差异 {x+1}: {comparison_result['differences'][x]['position']} - {comparison_result['differences'][x]['type']}",
			
 
				+        key="selected_diff"
			
 
				+    )
			
 
				+    
			
 
				+    if selected_diff_index is not None:
			
 
				+        diff = comparison_result['differences'][selected_diff_index]
			
 
				+        
			
 
				+        col1, col2 = st.columns(2)
			
 
				+        
			
 
				+        with col1:
			
 
				+            st.write("**原OCR结果:**")
			
 
				+            st.text_area("原OCR结果详情", value=diff['file1_value'], height=200, 
			
 
				+                        key=f"original_{selected_diff_index}", label_visibility="collapsed")
			
 
				+        
			
 
				+        with col2:
			
 
				+            st.write("**验证结果:**")
			
 
				+            st.text_area("验证结果详情", value=diff['file2_value'], height=200, 
			
 
				+                        key=f"verify_{selected_diff_index}", label_visibility="collapsed")
			
 
				+        
			
 
				+        st.info(f"**位置:** {diff['position']}")
			
 
				+        st.info(f"**类型:** {diff['type']}")
			
 
				+        st.info(f"**描述:** {diff['description']}")
			
 
				+        st.info(f"**严重程度:** {_get_severity_level(diff)}")
			
 
				+
			
 
				+
			
 
				+def _display_difference_charts(comparison_result: dict):
			
 
				+    """显示差异统计图表"""
			
 
				+    st.subheader("📈 差异类型分布")
			
 
				+    
			
 
				+    type_counts = {}
			
 
				+    severity_counts = {'高': 0, '中': 0, '低': 0}
			
 
				+    
			
 
				+    for diff in comparison_result['differences']:
			
 
				+        diff_type = diff['type']
			
 
				+        type_counts[diff_type] = type_counts.get(diff_type, 0) + 1
			
 
				+        
			
 
				+        severity = _get_severity_level(diff)
			
 
				+        severity_counts[severity] += 1
			
 
				+    
			
 
				+    col1, col2 = st.columns(2)
			
 
				+    
			
 
				+    with col1:
			
 
				+        if type_counts:
			
 
				+            fig_type = px.pie(
			
 
				+                values=list(type_counts.values()),
			
 
				+                names=list(type_counts.keys()),
			
 
				+                title="差异类型分布"
			
 
				+            )
			
 
				+            st.plotly_chart(fig_type, width='stretch')
			
 
				+    
			
 
				+    with col2:
			
 
				+        fig_severity = px.bar(
			
 
				+            x=list(severity_counts.keys()),
			
 
				+            y=list(severity_counts.values()),
			
 
				+            title="差异严重程度分布",
			
 
				+            color=list(severity_counts.keys()),
			
 
				+            color_discrete_map={'高': '#f44336', '中': '#ff9800', '低': '#4caf50'}
			
 
				+        )
			
 
				+        st.plotly_chart(fig_severity, width='stretch')
			
 
				+
			
 
				+
			
 
				+def _provide_download_options(comparison_result: dict):
			
 
				+    """提供下载选项"""
			
 
				+    st.subheader("📥 导出验证结果")
			
 
				+    
			
 
				+    col1, col2, col3 = st.columns(3)
			
 
				+    
			
 
				+    with col1:
			
 
				+        if comparison_result['differences']:
			
 
				+            diff_data = []
			
 
				+            for i, diff in enumerate(comparison_result['differences'], 1):
			
 
				+                diff_data.append({
			
 
				+                    '序号': i,
			
 
				+                    '位置': diff['position'],
			
 
				+                    '类型': diff['type'],
			
 
				+                    '原OCR结果': diff['file1_value'],
			
 
				+                    '验证结果': diff['file2_value'],
			
 
				+                    '描述': diff['description'],
			
 
				+                    '严重程度': _get_severity_level(diff)
			
 
				+                })
			
 
				+            
			
 
				+            df_export = pd.DataFrame(diff_data)
			
 
				+            excel_buffer = BytesIO()
			
 
				+            df_export.to_excel(excel_buffer, index=False, sheet_name='差异详情')
			
 
				+            
			
 
				+            st.download_button(
			
 
				+                label="📊 下载差异详情(Excel)",
			
 
				+                data=excel_buffer.getvalue(),
			
 
				+                file_name=f"comparison_differences_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.xlsx",
			
 
				+                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
			
 
				+                key="download_differences_excel"
			
 
				+            )
			
 
				+    
			
 
				+    with col2:
			
 
				+        stats_data = {
			
 
				+            '统计项目': ['总差异数', '表格差异', '金额差异', '段落差异'],
			
 
				+            '数量': [
			
 
				+                comparison_result['statistics']['total_differences'],
			
 
				+                comparison_result['statistics']['table_differences'],
			
 
				+                comparison_result['statistics'].get('amount_differences', 0),
			
 
				+                comparison_result['statistics']['paragraph_differences']
			
 
				+            ]
			
 
				+        }
			
 
				+        
			
 
				+        df_stats = pd.DataFrame(stats_data)
			
 
				+        csv_stats = df_stats.to_csv(index=False)
			
 
				+        
			
 
				+        st.download_button(
			
 
				+            label="📈 下载统计报告(CSV)",
			
 
				+            data=csv_stats,
			
 
				+            file_name=f"comparison_stats_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.csv",
			
 
				+            mime="text/csv",
			
 
				+            key="download_stats_csv"
			
 
				+        )
			
 
				+    
			
 
				+    with col3:
			
 
				+        report_json = json.dumps(comparison_result, ensure_ascii=False, indent=2)
			
 
				+        
			
 
				+        st.download_button(
			
 
				+            label="📄 下载完整报告(JSON)",
			
 
				+            data=report_json,
			
 
				+            file_name=f"comparison_full_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.json",
			
 
				+            mime="application/json",
			
 
				+            key="download_full_json"
			
 
				+        )
			
 
				+
			
 
				+
			
 
				+def _get_severity_level(diff: dict) -> str:
			
 
				+    """判断严重程度
			
 
				+    
			
 
				+    Args:
			
 
				+        diff: 差异字典
			
 
				+    
			
 
				+    Returns:
			
 
				+        严重程度: '高', '中', '低'
			
 
				+    """
			
 
				+    if 'severity' in diff:
			
 
				+        severity_map = {'critical': '高', 'high': '高', 'medium': '中', 'low': '低'}
			
 
				+        return severity_map.get(diff['severity'], '中')
			
 
				+    
			
 
				+    diff_type = diff['type'].lower()
			
 
				+    
			
 
				+    # 金额和数字类差异为高严重度
			
 
				+    if 'amount' in diff_type or 'number' in diff_type:
			
 
				+        return '高'
			
 
				+    
			
 
				+    # 表格和结构类差异为中严重度
			
 
				+    if 'table' in diff_type or 'structure' in diff_type:
			
 
				+        return '中'
			
 
				+    
			
 
				+    # 根据相似度判断
			
 
				+    if 'similarity' in diff:
			
 
				+        similarity = diff['similarity']
			
 
				+        if similarity < 50:
			
 
				+            return '高'
			
 
				+        elif similarity < 85:
			
 
				+            return '中'
			
 
				+        else:
			
 
				+            return '低'
			
 
				+    
			
 
				+    # 根据长度差异判断
			
 
				+    len_diff = abs(len(diff['file1_value']) - len(diff['file2_value']))
			
 
				+    if len_diff > 50:
			
 
				+        return '高'
			
 
				+    elif len_diff > 10:
			
 
				+        return '中'
			
 
				+    else:
			
 
				+        return '低'
			
--- a/ocr_validator/streamlit_validator_table.py
+++ b/ocr_validator/streamlit_validator_table.py
@@ -0,0 +1,277 @@
 
				+"""
			
 
				+表格处理和分析功能
			
 
				+"""
			
 
				+import streamlit as st
			
 
				+import pandas as pd
			
 
				+import numpy as np
			
 
				+from io import BytesIO
			
 
				+import sys
			
 
				+from pathlib import Path
			
 
				+
			
 
				+# 添加 ocr_platform 根目录到 Python 路径（用于导入 ocr_utils）
			
 
				+# 使用 resolve() 确保路径是绝对路径，避免相对路径导致的 IndexError
			
 
				+_file_path = Path(__file__).resolve()
			
 
				+ocr_platform_root = _file_path.parents[1]  # streamlit_validator_table.py -> ocr_validator -> ocr_platform
			
 
				+if str(ocr_platform_root) not in sys.path:
			
 
				+    sys.path.insert(0, str(ocr_platform_root))
			
 
				+
			
 
				+# 从 ocr_utils 导入通用工具
			
 
				+from ocr_utils.html_utils import parse_html_tables
			
 
				+
			
 
				+
			
 
				+def display_html_table_as_dataframe(html_content: str, enable_editing: bool = False):
			
 
				+    """将HTML表格解析为DataFrame显示"""
			
 
				+    tables = parse_html_tables(html_content)
			
 
				+    wide_table_threshold = 15
			
 
				+    
			
 
				+    if not tables:
			
 
				+        st.warning("未找到可解析的表格")
			
 
				+        st.markdown("""
			
 
				+        <style>
			
 
				+        .scrollable-table {
			
 
				+            overflow-x: auto;
			
 
				+            white-space: nowrap;
			
 
				+            border: 1px solid #ddd;
			
 
				+            border-radius: 5px;
			
 
				+            margin: 10px 0;
			
 
				+        }
			
 
				+        .scrollable-table table {
			
 
				+            width: 100%;
			
 
				+            border-collapse: collapse;
			
 
				+        }
			
 
				+        .scrollable-table th, .scrollable-table td {
			
 
				+            border: 1px solid #ddd;
			
 
				+            padding: 8px;
			
 
				+            text-align: left;
			
 
				+            min-width: 100px;
			
 
				+        }
			
 
				+        .scrollable-table th {
			
 
				+            background-color: #f5f5f5;
			
 
				+            font-weight: bold;
			
 
				+        }
			
 
				+        </style>
			
 
				+        """, unsafe_allow_html=True)
			
 
				+        
			
 
				+        st.markdown(f'<div class="scrollable-table">{html_content}</div>', unsafe_allow_html=True)
			
 
				+        return
			
 
				+        
			
 
				+    for i, table in enumerate(tables):
			
 
				+        st.subheader(f"📊 表格 {i+1}")
			
 
				+        
			
 
				+        col_info1, col_info2, col_info3, col_info4 = st.columns(4)
			
 
				+        with col_info1:
			
 
				+            st.metric("行数", len(table))
			
 
				+        with col_info2:
			
 
				+            st.metric("列数", len(table.columns))
			
 
				+        with col_info3:
			
 
				+            is_wide_table = len(table.columns) > wide_table_threshold
			
 
				+            st.metric("表格类型", "超宽表格" if is_wide_table else "普通表格")
			
 
				+        with col_info4:
			
 
				+            display_mode = st.selectbox(
			
 
				+                f"显示模式 (表格{i+1})",
			
 
				+                ["完整显示", "分页显示", "筛选列显示"],
			
 
				+                key=f"display_mode_{i}"
			
 
				+            )
			
 
				+        
			
 
				+        col1, col2, col3, col4 = st.columns(4)
			
 
				+        with col1:
			
 
				+            show_info = st.checkbox(f"显示详细信息", key=f"info_{i}")
			
 
				+        with col2:
			
 
				+            show_stats = st.checkbox(f"显示统计信息", key=f"stats_{i}")
			
 
				+        with col3:
			
 
				+            enable_filter = st.checkbox(f"启用过滤", key=f"filter_{i}")
			
 
				+        with col4:
			
 
				+            enable_sort = st.checkbox(f"启用排序", key=f"sort_{i}")
			
 
				+        
			
 
				+        display_table = _process_table_display_mode(table, i, display_mode)
			
 
				+        filtered_table = _apply_table_filters_and_sorts(display_table, i, enable_filter, enable_sort)
			
 
				+        
			
 
				+        _render_table_with_style(filtered_table, table, i, enable_editing, wide_table_threshold)
			
 
				+        _display_table_info_and_stats(table, filtered_table, show_info, show_stats, i)
			
 
				+        
			
 
				+        st.markdown("---")
			
 
				+
			
 
				+
			
 
				+def _process_table_display_mode(table: pd.DataFrame, table_index: int, display_mode: str) -> pd.DataFrame:
			
 
				+    """根据显示模式处理表格"""
			
 
				+    if display_mode == "分页显示":
			
 
				+        page_size = st.selectbox(
			
 
				+            f"每页显示行数 (表格 {table_index+1})",
			
 
				+            [10, 20, 50, 100],
			
 
				+            key=f"page_size_{table_index}"
			
 
				+        )
			
 
				+        
			
 
				+        total_pages = (len(table) - 1) // page_size + 1
			
 
				+        
			
 
				+        if total_pages > 1:
			
 
				+            page_number = st.selectbox(
			
 
				+                f"页码 (表格 {table_index+1})",
			
 
				+                range(1, total_pages + 1),
			
 
				+                key=f"page_number_{table_index}"
			
 
				+            )
			
 
				+            
			
 
				+            start_idx = (page_number - 1) * page_size
			
 
				+            end_idx = start_idx + page_size
			
 
				+            return table.iloc[start_idx:end_idx]
			
 
				+        
			
 
				+        return table
			
 
				+        
			
 
				+    elif display_mode == "筛选列显示":
			
 
				+        if len(table.columns) > 5:
			
 
				+            selected_columns = st.multiselect(
			
 
				+                f"选择要显示的列 (表格 {table_index+1})",
			
 
				+                table.columns.tolist(),
			
 
				+                default=table.columns.tolist()[:5],
			
 
				+                key=f"selected_columns_{table_index}"
			
 
				+            )
			
 
				+            
			
 
				+            if selected_columns:
			
 
				+                return table[selected_columns]
			
 
				+        
			
 
				+        return table
			
 
				+        
			
 
				+    else:
			
 
				+        return table
			
 
				+
			
 
				+
			
 
				+def _apply_table_filters_and_sorts(table: pd.DataFrame, table_index: int, 
			
 
				+                                   enable_filter: bool, enable_sort: bool) -> pd.DataFrame:
			
 
				+    """应用表格过滤和排序"""
			
 
				+    filtered_table = table.copy()
			
 
				+    
			
 
				+    if enable_filter and not table.empty:
			
 
				+        filter_col = st.selectbox(
			
 
				+            f"选择过滤列 (表格 {table_index+1})", 
			
 
				+            options=['无'] + list(table.columns),
			
 
				+            key=f"filter_col_{table_index}"
			
 
				+        )
			
 
				+        
			
 
				+        if filter_col != '无':
			
 
				+            filter_value = st.text_input(f"过滤值 (表格 {table_index+1})", key=f"filter_value_{table_index}")
			
 
				+            if filter_value:
			
 
				+                filtered_table = table[table[filter_col].astype(str).str.contains(filter_value, na=False)]
			
 
				+    
			
 
				+    if enable_sort and not filtered_table.empty:
			
 
				+        sort_col = st.selectbox(
			
 
				+            f"选择排序列 (表格 {table_index+1})", 
			
 
				+            options=['无'] + list(filtered_table.columns),
			
 
				+            key=f"sort_col_{table_index}"
			
 
				+        )
			
 
				+        
			
 
				+        if sort_col != '无':
			
 
				+            sort_order = st.radio(
			
 
				+                f"排序方式 (表格 {table_index+1})",
			
 
				+                options=['升序', '降序'],
			
 
				+                horizontal=True,
			
 
				+                key=f"sort_order_{table_index}"
			
 
				+            )
			
 
				+            ascending = (sort_order == '升序')
			
 
				+            filtered_table = filtered_table.sort_values(sort_col, ascending=ascending)
			
 
				+    
			
 
				+    return filtered_table
			
 
				+
			
 
				+
			
 
				+def _render_table_with_style(filtered_table: pd.DataFrame, original_table: pd.DataFrame,
			
 
				+                             table_index: int, enable_editing: bool, wide_table_threshold: int):
			
 
				+    """渲染表格并应用样式"""
			
 
				+    st.markdown("""
			
 
				+    <style>
			
 
				+    .dataframe-container {
			
 
				+        overflow-x: auto;
			
 
				+        border: 1px solid #ddd;
			
 
				+        border-radius: 5px;
			
 
				+        margin: 10px 0;
			
 
				+    }
			
 
				+    
			
 
				+    .wide-table-container {
			
 
				+        overflow-x: auto;
			
 
				+        max-height: 500px;
			
 
				+        overflow-y: auto;
			
 
				+        border: 2px solid #0288d1;
			
 
				+        border-radius: 8px;
			
 
				+        background: linear-gradient(90deg, #f8f9fa 0%, #ffffff 100%);
			
 
				+    }
			
 
				+    
			
 
				+    .dataframe thead th {
			
 
				+        position: sticky;
			
 
				+        top: 0;
			
 
				+        background-color: #f5f5f5 !important;
			
 
				+        z-index: 10;
			
 
				+        border-bottom: 2px solid #0288d1;
			
 
				+    }
			
 
				+    
			
 
				+    .dataframe tbody td {
			
 
				+        white-space: nowrap;
			
 
				+        min-width: 100px;
			
 
				+        max-width: 300px;
			
 
				+        overflow: hidden;
			
 
				+        text-overflow: ellipsis;
			
 
				+    }
			
 
				+    </style>
			
 
				+    """, unsafe_allow_html=True)
			
 
				+    
			
 
				+    container_class = "wide-table-container" if len(original_table.columns) > wide_table_threshold else "dataframe-container"
			
 
				+    
			
 
				+    if enable_editing:
			
 
				+        st.markdown(f'<div class="{container_class}">', unsafe_allow_html=True)
			
 
				+        edited_table = st.data_editor(
			
 
				+            filtered_table, 
			
 
				+            width='stretch', 
			
 
				+            key=f"editor_{table_index}",
			
 
				+            height=400 if len(original_table.columns) > 8 else None
			
 
				+        )
			
 
				+        st.markdown('</div>', unsafe_allow_html=True)
			
 
				+        
			
 
				+        if not edited_table.equals(filtered_table):
			
 
				+            st.success("✏️ 表格已编辑，可以导出修改后的数据")
			
 
				+    else:
			
 
				+        st.markdown(f'<div class="{container_class}">', unsafe_allow_html=True)
			
 
				+        st.dataframe(
			
 
				+            filtered_table, 
			
 
				+            width=400 if len(original_table.columns) > wide_table_threshold else "stretch"
			
 
				+        )
			
 
				+        st.markdown('</div>', unsafe_allow_html=True)
			
 
				+
			
 
				+
			
 
				+def _display_table_info_and_stats(original_table: pd.DataFrame, filtered_table: pd.DataFrame, 
			
 
				+                                  show_info: bool, show_stats: bool, table_index: int):
			
 
				+    """显示表格信息和统计数据"""
			
 
				+    if show_info:
			
 
				+        st.write("**表格信息:**")
			
 
				+        st.write(f"- 原始行数: {len(original_table)}")
			
 
				+        st.write(f"- 过滤后行数: {len(filtered_table)}")
			
 
				+        st.write(f"- 列数: {len(original_table.columns)}")
			
 
				+        st.write(f"- 列名: {', '.join(original_table.columns)}")
			
 
				+    
			
 
				+    if show_stats:
			
 
				+        st.write("**统计信息:**")
			
 
				+        numeric_cols = filtered_table.select_dtypes(include=[np.number]).columns
			
 
				+        if len(numeric_cols) > 0:
			
 
				+            st.dataframe(filtered_table[numeric_cols].describe())
			
 
				+        else:
			
 
				+            st.info("表格中没有数值列")
			
 
				+    
			
 
				+    if st.button(f"📥 导出表格 {table_index+1}", key=f"export_{table_index}"):
			
 
				+        _create_export_buttons(filtered_table, table_index)
			
 
				+
			
 
				+
			
 
				+def _create_export_buttons(table: pd.DataFrame, table_index: int):
			
 
				+    """创建导出按钮"""
			
 
				+    csv_data = table.to_csv(index=False)
			
 
				+    st.download_button(
			
 
				+        label=f"下载CSV (表格 {table_index+1})",
			
 
				+        data=csv_data,
			
 
				+        file_name=f"table_{table_index+1}.csv",
			
 
				+        mime="text/csv",
			
 
				+        key=f"download_csv_{table_index}"
			
 
				+    )
			
 
				+    
			
 
				+    excel_buffer = BytesIO()
			
 
				+    table.to_excel(excel_buffer, index=False)
			
 
				+    st.download_button(
			
 
				+        label=f"下载Excel (表格 {table_index+1})",
			
 
				+        data=excel_buffer.getvalue(),
			
 
				+        file_name=f"table_{table_index+1}.xlsx",
			
 
				+        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
			
 
				+        key=f"download_excel_{table_index}"
			
 
				+    )
			
--- a/ocr_validator/streamlit_validator_ui.py
+++ b/ocr_validator/streamlit_validator_ui.py
@@ -0,0 +1,297 @@
 
				+"""
			
 
				+UI组件和页面配置
			
 
				+"""
			
 
				+import streamlit as st
				+import sys
				+from pathlib import Path
				+
				+# Put the ocr_platform root directory on the Python path (needed to import ocr_utils).
				+# resolve() makes the path absolute; a relative path could make parents[1] raise IndexError.
				+_file_path = Path(__file__).resolve()
				+ocr_platform_root = _file_path.parents[1]  # streamlit_validator_ui.py -> ocr_validator -> ocr_platform
				+if str(ocr_platform_root) not in sys.path:
				+    sys.path.insert(0, str(ocr_platform_root))
				+
				+# Streamlit-specific helpers imported from local modules.
				+from ocr_validator_file_utils import load_css_styles
				+from ocr_validator_utils import get_data_source_display_name
			
 
				+
			
 
				+
			
 
				+def setup_page_config(config):
			
 
				+    """设置页面配置"""
			
 
				+    ui_config = config['ui']
			
 
				+    st.set_page_config(
			
 
				+        page_title=ui_config['page_title'],
			
 
				+        page_icon=ui_config['page_icon'],
			
 
				+        layout=ui_config['layout'],
			
 
				+        initial_sidebar_state=ui_config['sidebar_state']
			
 
				+    )
			
 
				+    
			
 
				+    css_content = load_css_styles()
			
 
				+    st.markdown(f"<style>{css_content}</style>", unsafe_allow_html=True)
			
 
				+
			
 
				+
			
 
				+def _parse_document_from_source_key(source_key: str, documents: list) -> str:
			
 
				+    """
			
 
				+    🎯 从数据源 key 解析文档名
			
 
				+    
			
 
				+    数据源 key 格式: {文档名}_{result_dir}
			
 
				+    例如: "德_内蒙古银行照_mineru_vllm_results"
			
 
				+    
			
 
				+    Args:
			
 
				+        source_key: 数据源 key
			
 
				+        documents: 可用文档列表
			
 
				+    
			
 
				+    Returns:
			
 
				+        文档名，如果无法解析则返回 None
			
 
				+    """
			
 
				+    # 🎯 按文档名长度降序排序，优先匹配长文档名
			
 
				+    sorted_docs = sorted(documents, key=len, reverse=True)
			
 
				+    
			
 
				+    for doc in sorted_docs:
			
 
				+        # 检查数据源 key 是否以 "文档名_" 开头
			
 
				+        if source_key.startswith(f"{doc}_"):
			
 
				+            return doc
			
 
				+    
			
 
				+    # 如果没有匹配，尝试直接匹配
			
 
				+    if source_key in documents:
			
 
				+        return source_key
			
 
				+    
			
 
				+    return None
			
 
				+
			
 
				+
			
 
				+def create_data_source_selector(validator):
			
 
				+    """
			
 
				+    🎯 新版数据源选择器 - 3 列布局
			
 
				+    1. 选择文档
			
 
				+    2. 选择 OCR 工具
			
 
				+    3. 选择验证工具
			
 
				+    """
			
 
				+    from config_manager import ConfigManager
			
 
				+    
			
 
				+    # 获取配置管理器
			
 
				+    if 'config_manager' not in st.session_state:
			
 
				+        st.error("配置管理器未初始化")
			
 
				+        return
			
 
				+    
			
 
				+    config_manager: ConfigManager = st.session_state.config_manager
			
 
				+    
			
 
				+    # ============================================================
			
 
				+    # 3 列布局
			
 
				+    # ============================================================
			
 
				+    col1, col2, col3 = st.columns(3)
			
 
				+    
			
 
				+    # ============================================================
			
 
				+    # 第 1 列：选择文档
			
 
				+    # ============================================================
			
 
				+    with col1:
			
 
				+        st.markdown("#### 📄 选择文档")
			
 
				+        
			
 
				+        documents = config_manager.list_documents()
			
 
				+        if not documents:
			
 
				+            st.error("未找到任何文档配置")
			
 
				+            return
			
 
				+        
			
 
				+        # 🎯 从当前数据源 key 解析文档名
			
 
				+        current_doc = None
			
 
				+        if validator.current_source_key:
			
 
				+            current_doc = _parse_document_from_source_key(validator.current_source_key, documents)
			
 
				+        
			
 
				+        # 文档下拉框
			
 
				+        selected_doc_index = 0
			
 
				+        if current_doc and current_doc in documents:
			
 
				+            selected_doc_index = documents.index(current_doc)
			
 
				+        
			
 
				+        selected_doc = st.selectbox(
			
 
				+            "文档",
			
 
				+            options=documents,
			
 
				+            index=selected_doc_index,
			
 
				+            key="selected_document",
			
 
				+            label_visibility="collapsed",
			
 
				+            help="选择要处理的文档"
			
 
				+        )
			
 
				+        
			
 
				+        # 显示文档详情
			
 
				+        doc_config = config_manager.get_document(selected_doc)
			
 
				+        if doc_config:
			
 
				+            enabled_count = len([r for r in doc_config.ocr_results if r.enabled])
			
 
				+            with st.expander("📋 文档详情", expanded=False):
			
 
				+                st.caption(f"**基础目录:** `{doc_config.base_dir}`")
			
 
				+                st.caption(f"**可用工具:** {enabled_count} 个")
			
 
				+    
			
 
				+    # ============================================================
			
 
				+    # 第 2 列：选择 OCR 工具
			
 
				+    # ============================================================
			
 
				+    with col2:
			
 
				+        st.markdown("#### 🔧 OCR 数据源")
			
 
				+        
			
 
				+        if not doc_config:
			
 
				+            st.error(f"文档配置不存在: {selected_doc}")
			
 
				+            return
			
 
				+        
			
 
				+        # 获取该文档的所有启用的 OCR 结果
			
 
				+        enabled_ocr_results = [r for r in doc_config.ocr_results if r.enabled]
			
 
				+        
			
 
				+        if not enabled_ocr_results:
			
 
				+            st.warning(f"文档 {selected_doc} 没有可用的 OCR 工具")
			
 
				+            return
			
 
				+        
			
 
				+        # 🎯 构建 OCR 工具选项（使用 result_dir）
			
 
				+        ocr_tool_options = []
			
 
				+        ocr_source_keys = []  # 对应的数据源 key
			
 
				+        
			
 
				+        for ocr_result in enabled_ocr_results:
			
 
				+            # 🎯 显示名称：优先使用 description
			
 
				+            if ocr_result.description:
			
 
				+                display_name = ocr_result.description
			
 
				+            else:
			
 
				+                tool_config = config_manager.get_ocr_tool(ocr_result.tool)
			
 
				+                display_name = tool_config.name if tool_config else ocr_result.tool
			
 
				+            
			
 
				+            # 🎯 构建数据源 key（使用 result_dir）
			
 
				+            source_key = f"{selected_doc}_{ocr_result.result_dir}"
			
 
				+            
			
 
				+            ocr_tool_options.append(display_name)
			
 
				+            ocr_source_keys.append(source_key)
			
 
				+        
			
 
				+        # 获取当前选中的 OCR 工具索引
			
 
				+        current_ocr_index = 0
			
 
				+        if validator.current_source_key:
			
 
				+            # 🎯 检查当前数据源是否属于当前文档
			
 
				+            if validator.current_source_key.startswith(f"{selected_doc}_"):
			
 
				+                if validator.current_source_key in ocr_source_keys:
			
 
				+                    current_ocr_index = ocr_source_keys.index(validator.current_source_key)
			
 
				+        
			
 
				+        # OCR 工具下拉框
			
 
				+        selected_ocr_index = st.selectbox(
			
 
				+            "OCR 工具",
			
 
				+            options=range(len(ocr_tool_options)),
			
 
				+            format_func=lambda i: ocr_tool_options[i],
			
 
				+            index=current_ocr_index,
			
 
				+            key="selected_ocr_tool",
			
 
				+            label_visibility="collapsed",
			
 
				+            help="选择 OCR 识别工具"
			
 
				+        )
			
 
				+        
			
 
				+        selected_ocr_source_key = ocr_source_keys[selected_ocr_index]
			
 
				+        
			
 
				+        # 🎯 切换 OCR 数据源
			
 
				+        if validator.current_source_key != selected_ocr_source_key:
			
 
				+            validator.switch_to_source(selected_ocr_source_key)
			
 
				+            st.success(f"✅ 已切换 OCR 工具")
			
 
				+            # 重置文件选择
			
 
				+            if 'selected_file_index' in st.session_state:
			
 
				+                st.session_state.selected_file_index = 0
			
 
				+            st.rerun()
			
 
				+        
			
 
				+        # 显示 OCR 数据源详情
			
 
				+        if validator.current_source_config:
			
 
				+            with st.expander("📋 OCR 详情", expanded=False):
			
 
				+                st.caption(f"**工具:** {validator.current_source_config['ocr_tool']}")
			
 
				+                st.caption(f"**结果目录:** `{enabled_ocr_results[selected_ocr_index].result_dir}`")
			
 
				+                st.caption(f"**文件数:** {len(validator.file_info)}")
			
 
				+    
			
 
				+    # ============================================================
			
 
				+    # 第 3 列：选择验证工具
			
 
				+    # ============================================================
			
 
				+    with col3:
			
 
				+        st.markdown("#### 🔍 验证数据源")
			
 
				+        
			
 
				+        # 🎯 验证工具选项（排除当前 OCR 工具）
			
 
				+        verify_tool_options = []
			
 
				+        verify_source_keys = []
			
 
				+        verify_results = []  # 保存对应的 ocr_result，用于显示详情
			
 
				+        
			
 
				+        for i, ocr_result in enumerate(enabled_ocr_results):
			
 
				+            # 跳过当前 OCR 工具
			
 
				+            if ocr_source_keys[i] == selected_ocr_source_key:
			
 
				+                continue
			
 
				+            
			
 
				+            # 🎯 显示名称
			
 
				+            if ocr_result.description:
			
 
				+                display_name = ocr_result.description
			
 
				+            else:
			
 
				+                tool_config = config_manager.get_ocr_tool(ocr_result.tool)
			
 
				+                display_name = tool_config.name if tool_config else ocr_result.tool
			
 
				+            
			
 
				+            verify_tool_options.append(display_name)
			
 
				+            verify_source_keys.append(ocr_source_keys[i])
			
 
				+            verify_results.append(ocr_result)
			
 
				+        
			
 
				+        if not verify_tool_options:
			
 
				+            st.warning("⚠️ 没有其他可用的验证工具")
			
 
				+            st.info("💡 可以添加更多 OCR 工具到配置文件")
			
 
				+            return
			
 
				+        
			
 
				+        # 获取当前选中的验证工具索引
			
 
				+        current_verify_index = 0
			
 
				+        if validator.verify_source_key:
			
 
				+            # 🎯 检查验证数据源是否属于当前文档
			
 
				+            if validator.verify_source_key.startswith(f"{selected_doc}_"):
			
 
				+                if validator.verify_source_key in verify_source_keys:
			
 
				+                    current_verify_index = verify_source_keys.index(validator.verify_source_key)
			
 
				+        
			
 
				+        # 验证工具下拉框
			
 
				+        selected_verify_index = st.selectbox(
			
 
				+            "验证工具",
			
 
				+            options=range(len(verify_tool_options)),
			
 
				+            format_func=lambda i: verify_tool_options[i],
			
 
				+            index=current_verify_index,
			
 
				+            key="selected_verify_tool",
			
 
				+            label_visibility="collapsed",
			
 
				+            help="选择用于交叉验证的工具"
			
 
				+        )
			
 
				+        
			
 
				+        selected_verify_source_key = verify_source_keys[selected_verify_index]
			
 
				+        
			
 
				+        # 🎯 切换验证数据源
			
 
				+        if validator.verify_source_key != selected_verify_source_key:
			
 
				+            validator.switch_to_verify_source(selected_verify_source_key)
			
 
				+            st.success(f"✅ 已切换验证工具")
			
 
				+            st.rerun()
			
 
				+        
			
 
				+        # 显示验证数据源详情
			
 
				+        if validator.verify_source_config:
			
 
				+            verify_result = verify_results[selected_verify_index]
			
 
				+            with st.expander("📋 验证详情", expanded=False):
			
 
				+                st.caption(f"**工具:** {validator.verify_source_config['ocr_tool']}")
			
 
				+                st.caption(f"**结果目录:** `{verify_result.result_dir}`")
			
 
				+                st.caption(f"**文件数:** {len(validator.verify_file_info)}")
			
 
				+    
			
 
				+    # ============================================================
			
 
				+    # 状态提示（全宽）
			
 
				+    # ============================================================
			
 
				+    if validator.current_source_key == validator.verify_source_key:
			
 
				+        st.warning("⚠️ OCR数据源和验证数据源相同，建议选择不同的工具进行交叉验证")
			
 
				+    else:
			
 
				+        # 🎯 检查是否有交叉验证结果
			
 
				+        has_results = (
			
 
				+            'cross_validation_batch_result' in st.session_state 
			
 
				+            and st.session_state.cross_validation_batch_result is not None
			
 
				+        )
			
 
				+        
			
 
				+        if has_results:
			
 
				+            # 检查验证结果是否与当前数据源匹配
			
 
				+            result = st.session_state.cross_validation_batch_result
			
 
				+            result_ocr_source = result.get('ocr_source', '')
			
 
				+            result_verify_source = result.get('verify_source', '')
			
 
				+            current_ocr_source = get_data_source_display_name(validator.current_source_config)
			
 
				+            current_verify_source = get_data_source_display_name(validator.verify_source_config)
			
 
				+            
			
 
				+            if result_ocr_source == current_ocr_source and result_verify_source == current_verify_source:
			
 
				+                st.success(f"✅ 已选择 **{selected_doc}** 文档，使用 **{ocr_tool_options[selected_ocr_index]}** 与 **{verify_tool_options[selected_verify_index]}** 进行交叉验证 **（已有验证结果）**")
			
 
				+            else:
			
 
				+                st.info(f"ℹ️ 已选择 **{selected_doc}** 文档，使用 **{ocr_tool_options[selected_ocr_index]}** 与 **{verify_tool_options[selected_verify_index]}** 进行交叉验证 **（验证结果已过期，请重新验证）**")
			
 
				+        else:
			
 
				+            st.success(f"✅ 已选择 **{selected_doc}** 文档，使用 **{ocr_tool_options[selected_ocr_index]}** 与 **{verify_tool_options[selected_verify_index]}** 进行交叉验证")
			
 
				+
			
 
				+
			
 
				+@st.dialog("message", width="small", dismissible=True, on_dismiss="rerun")
			
 
				+def message_box(msg: str, msg_type: str = "info"):
			
 
				+    """消息对话框"""
			
 
				+    if msg_type == "info":
			
 
				+        st.info(msg)
			
 
				+    elif msg_type == "warning":
			
 
				+        st.warning(msg)
			
 
				+    elif msg_type == "error":
			
 
				+        st.error(msg)
			
--- a/ocr_validator/styles.css
+++ b/ocr_validator/styles.css
@@ -0,0 +1,126 @@
 
				+/* Style sheet for the OCR validation tool */
				+
				+/* Main area */
				+.main > div {
				+    padding-top: 2rem;
				+    background-color: white !important;
				+    color: #333333 !important;
				+}
				+
				+.stApp {
				+    background-color: white !important;
				+}
				+
				+.block-container {
				+    background-color: white !important;
				+    color: #333333 !important;
				+}
				+
				+/* Sidebar.
				+   .css-1d391kg is an auto-generated class name that is tied to one Streamlit
				+   build, so the sidebar's data-testid attribute is targeted as well.
				+   NOTE(review): confirm the data-testid value against the Streamlit version in use. */
				+.css-1d391kg,
				+[data-testid="stSidebar"] {
				+    background-color: #f8f9fa !important;
				+}
				+
				+/* Select boxes */
				+.stSelectbox > div > div > div {
				+    background-color: #f0f2f6 !important;
				+    color: #333333 !important;
				+}
				+
				+/* Text */
				+h1, h2, h3, h4, h5, h6, p, div, span, label {
				+    color: #333333 !important;
				+}
				+
				+/* Clickable text */
				+.clickable-text {
				+    background-color: #e1f5fe;
				+    padding: 2px 6px;
				+    border-radius: 4px;
				+    border: 1px solid #0288d1;
				+    cursor: pointer;
				+    margin: 2px;
				+    display: inline-block;
				+    color: #0288d1 !important;
				+}
				+
				+.selected-text {
				+    background-color: #fff3e0;
				+    border-color: #ff9800;
				+    font-weight: bold;
				+    color: #ff9800 !important;
				+}
				+
				+.error-text {
				+    background-color: #ffebee;
				+    border-color: #f44336;
				+    color: #d32f2f !important;
				+}
				+
				+.stats-container {
				+    background-color: #f8f9fa;
				+    padding: 1rem;
				+    border-radius: 8px;
				+    border-left: 4px solid #28a745;
				+    color: #333333 !important;
				+}
				+
				+/* Scrollable content */
				+.scrollable-content {
				+    overflow-y: auto;
				+    overflow-x: hidden;
				+    padding: 10px;
				+    border: 1px solid #ddd;
				+    border-radius: 5px;
				+    background-color: #fafafa !important;
				+    color: #333333 !important;
				+}
				+
				+.scrollable-content::-webkit-scrollbar {
				+    width: 8px;
				+}
				+
				+.scrollable-content::-webkit-scrollbar-track {
				+    background: #f1f1f1;
				+    border-radius: 4px;
				+}
				+
				+.scrollable-content::-webkit-scrollbar-thumb {
				+    background: #888;
				+    border-radius: 4px;
				+}
				+
				+.scrollable-content::-webkit-scrollbar-thumb:hover {
				+    background: #555;
				+}
				+
				+/* Compact content */
				+.compact-content {
				+    overflow-y: auto;
				+    border: 1px solid #ddd;
				+    padding: 10px;
				+    background-color: #fafafa !important;
				+    font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
				+    color: #333333 !important;
				+}
				+
				+/* Highlighted text */
				+.highlight-text {
				+    background-color: #ffeb3b !important;
				+    color: #333333 !important;
				+    padding: 2px 4px;
				+    border-radius: 3px;
				+    cursor: pointer;
				+}
				+
				+.selected-highlight {
				+    background-color: #4caf50 !important;
				+    color: white !important;
				+}
				+
				+/* Standard content */
				+.standard-content {
				+    background-color: #fafafa !important;
				+    color: #333333 !important;
				+    border: 1px solid #ddd !important;
				+}