@@ -55,6 +55,13 @@ class StreamlitOCRValidator:
         self.selected_file_index = -1
         self.display_options = []
         self.file_paths = []
+
+        # ✅ New: cross-validation data source
+        self.verify_source_key = None
+        self.verify_source_config = None
+        self.verify_file_info = []
+        self.verify_display_options = []
+        self.verify_file_paths = []
 
         # Initialize the layout manager
         self.layout_manager = OCRLayoutManager(self)
@@ -66,13 +73,18 @@ class StreamlitOCRValidator:
         """Load file information for all data sources"""
         self.all_sources = find_available_ocr_files_multi_source(self.config)
 
-        # If any data sources exist, select the first one by default
+        # If any data sources exist, select the first one as the OCR source by default
         if self.all_sources:
-            first_source_key = list(self.all_sources.keys())[0]
+            source_keys = list(self.all_sources.keys())
+            first_source_key = source_keys[0]
             self.switch_to_source(first_source_key)
+
+            # If a second data source exists, use it as the verification source by default
+            if len(source_keys) > 1:
+                self.switch_to_verify_source(source_keys[1])
 
     def switch_to_source(self, source_key: str):
-        """Switch to the given data source"""
+        """Switch to the given OCR data source"""
        if source_key in self.all_sources:
            self.current_source_key = source_key
            source_data = self.all_sources[source_key]
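The loader above now applies a simple defaulting rule: the first discovered source drives OCR and the second, when present, drives verification. Expressed as a pure function it is easy to test in isolation (an illustrative sketch, not part of the patch; the example keys are made up):

```python
def default_sources(source_keys: list) -> tuple:
    """Pick (ocr_key, verify_key): first source for OCR, second for verification."""
    ocr_key = source_keys[0] if source_keys else None
    verify_key = source_keys[1] if len(source_keys) > 1 else None
    return ocr_key, verify_key

assert default_sources(['mineru', 'vlm']) == ('mineru', 'vlm')
assert default_sources(['mineru']) == ('mineru', None)
```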
@@ -86,11 +98,25 @@ class StreamlitOCRValidator:
 
             # Reset the file selection
             self.selected_file_index = -1
-
-            print(f"✅ Switched to data source: {source_key}")
+            print(f"✅ Switched to OCR data source: {source_key}")
         else:
             print(f"⚠️ Data source {source_key} has no available files")
 
+    def switch_to_verify_source(self, source_key: str):
+        """Switch to the given verification data source"""
+        if source_key in self.all_sources:
+            self.verify_source_key = source_key
+            source_data = self.all_sources[source_key]
+            self.verify_source_config = source_data['config']
+            self.verify_file_info = source_data['files']
+
+            if self.verify_file_info:
+                self.verify_display_options = [f"{info['display_name']}" for info in self.verify_file_info]
+                self.verify_file_paths = [info['path'] for info in self.verify_file_info]
+                print(f"✅ Switched to verification data source: {source_key}")
+            else:
+                print(f"⚠️ Verification data source {source_key} has no available files")
+
     def setup_page_config(self):
         """Set up the page configuration"""
         ui_config = self.config['ui']
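`switch_to_verify_source` mirrors `switch_to_source` line for line, unpacking the same `{'config': ..., 'files': [...]}` entry into a parallel set of `verify_*` attributes. If a third role ever appears, a shared helper would keep the two paths in sync; a minimal sketch of such a refactor (hypothetical, not in the patch):

```python
def unpack_source(source_data: dict) -> tuple:
    """Split one source entry into (config, file_info, display_options, file_paths)."""
    files = source_data['files']
    display_options = [info['display_name'] for info in files]
    file_paths = [info['path'] for info in files]
    return source_data['config'], files, display_options, file_paths
```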
@@ -106,56 +132,91 @@ class StreamlitOCRValidator:
         st.markdown(f"<style>{css_content}</style>", unsafe_allow_html=True)
 
     def create_data_source_selector(self):
-        """Create the data source selector"""
+        """Create the dual data source selector - supports cross-validation"""
         if not self.all_sources:
             st.warning("❌ No data sources found. Please check the configuration file")
             return
 
-        # Data source selection
+        # Prepare the data source options
         source_options = {}
         for source_key, source_data in self.all_sources.items():
             display_name = get_data_source_display_name(source_data['config'])
             source_options[display_name] = source_key
 
-        # Resolve the display name of the current selection
-        current_display_name = None
-        if self.current_source_key:
-            for display_name, key in source_options.items():
-                if key == self.current_source_key:
-                    current_display_name = display_name
-                    break
-
-        selected_display_name = st.selectbox(
-            "📁 Select data source",
-            options=list(source_options.keys()),
-            index=list(source_options.keys()).index(current_display_name) if current_display_name else 0,
-            key="data_source_selector",
-            help="Select the OCR data source to analyze"
-        )
+        # Create a two-column layout
+        col1, col2 = st.columns(2)
 
-        selected_source_key = source_options[selected_display_name]
+        with col1:
+            st.markdown("#### 📊 OCR data source")
+            # OCR data source selection
+            current_display_name = None
+            if self.current_source_key:
+                for display_name, key in source_options.items():
+                    if key == self.current_source_key:
+                        current_display_name = display_name
+                        break
+
+            selected_ocr_display = st.selectbox(
+                "Select OCR data source",
+                options=list(source_options.keys()),
+                index=list(source_options.keys()).index(current_display_name) if current_display_name else 0,
+                key="ocr_source_selector",
+                label_visibility="collapsed",
+                help="Select the OCR data source to analyze"
+            )
+
+            selected_ocr_key = source_options[selected_ocr_display]
+
+            # If the OCR data source changed, switch to it
+            if selected_ocr_key != self.current_source_key:
+                self.switch_to_source(selected_ocr_key)
+                if 'selected_file_index' in st.session_state:
+                    st.session_state.selected_file_index = 0
+                st.rerun()
+
+            # Show OCR data source info
+            if self.current_source_config:
+                with st.expander("📋 OCR data source details", expanded=False):
+                    st.write(f"**Tool:** {self.current_source_config['ocr_tool']}")
+                    st.write(f"**Files:** {len(self.file_info)}")
 
-        # If the data source changed, switch to it
-        if selected_source_key != self.current_source_key:
-            self.switch_to_source(selected_source_key)
-            # Reset session state
-            if 'selected_file_index' in st.session_state:
-                st.session_state.selected_file_index = 0
-            st.rerun()
+        with col2:
+            st.markdown("#### 🔍 Verification data source")
+            # Verification data source selection
+            verify_display_name = None
+            if self.verify_source_key:
+                for display_name, key in source_options.items():
+                    if key == self.verify_source_key:
+                        verify_display_name = display_name
+                        break
+
+            selected_verify_display = st.selectbox(
+                "Select verification data source",
+                options=list(source_options.keys()),
+                index=list(source_options.keys()).index(verify_display_name) if verify_display_name else (1 if len(source_options) > 1 else 0),
+                key="verify_source_selector",
+                label_visibility="collapsed",
+                help="Select the data source used for cross-validation"
+            )
+
+            selected_verify_key = source_options[selected_verify_display]
+
+            # If the verification data source changed, switch to it
+            if selected_verify_key != self.verify_source_key:
+                self.switch_to_verify_source(selected_verify_key)
+                st.rerun()
+
+            # Show verification data source info
+            if self.verify_source_config:
+                with st.expander("📋 Verification data source details", expanded=False):
+                    st.write(f"**Tool:** {self.verify_source_config['ocr_tool']}")
+                    st.write(f"**Files:** {len(self.verify_file_info)}")
 
-        # Show data source info
-        if self.current_source_config:
-            with st.expander("📋 Data source details", expanded=False):
-                col1, col2, col3 = st.columns(3)
-                with col1:
-                    st.write(f"**Name:** {self.current_source_config['name']}")
-                    st.write(f"**OCR tool:** {self.current_source_config['ocr_tool']}")
-                with col2:
-                    st.write(f"**Output dir:** {self.current_source_config['ocr_out_dir']}")
-                    st.write(f"**Image dir:** {self.current_source_config.get('src_img_dir', 'N/A')}")
-                with col3:
-                    st.write(f"**Description:** {self.current_source_config.get('description', 'N/A')}")
-                    st.write(f"**File count:** {len(self.file_info)}")
+        # Data source comparison hint
+        if self.current_source_key == self.verify_source_key:
+            st.warning("⚠️ The OCR and verification data sources are the same. Choose two different sources for cross-validation")
+        else:
+            st.success(f"✅ Cross-validating {selected_ocr_display} against {selected_verify_display}")
 
     def load_ocr_data(self, json_path: str, md_path: Optional[str] = None, image_path: Optional[str] = None):
         """Load OCR-related data - supports multi-source configuration"""
@@ -456,107 +517,474 @@ class StreamlitOCRValidator:
 
         else:  # full display
             return table
-
-    @st.dialog("VLM Pre-validation", width="large", dismissible=True, on_dismiss="rerun")
-    def vlm_pre_validation(self):
-        """VLM pre-validation - wraps OCR recognition and result comparison"""
+
+    def find_verify_md_path(self, selected_file_index: int) -> Optional[Path]:
+        """Find the verification file path matching the current OCR file"""
+        current_page = self.file_info[selected_file_index]['page']
+        verify_md_path = None
+
+        for i, info in enumerate(self.verify_file_info):
+            if info['page'] == current_page:
+                verify_md_path = Path(self.verify_file_paths[i]).with_suffix('.md')
+                break
+
+        return verify_md_path
+
+    @st.dialog("Cross-validation", width="large", dismissible=True, on_dismiss="rerun")
+    def cross_validation(self):
+        """Cross-validation - batch-compare all OCR results from two data sources"""
 
-        if not self.image_path or not self.md_content:
-            st.error("❌ Please load an OCR data file first")
+        if self.current_source_key == self.verify_source_key:
+            st.error("❌ The OCR and verification data sources must not be the same")
             return
+
         # Initialize comparison result storage
-        if 'comparison_result' not in st.session_state:
-            st.session_state.comparison_result = None
-
-        # Create the progress bar and status display
-        with st.spinner("Running VLM pre-validation...", show_time=True):
-            status_text = st.empty()
+        if 'cross_validation_batch_result' not in st.session_state:
+            st.session_state.cross_validation_batch_result = None
+
+        st.header("🔄 Batch Cross-validation")
+
+        # Show data source info
+        col1, col2 = st.columns(2)
+        with col1:
+            st.info(f"**OCR source:** {get_data_source_display_name(self.current_source_config)}")
+            st.write(f"📁 Files: {len(self.file_info)}")
+        with col2:
+            st.info(f"**Verification source:** {get_data_source_display_name(self.verify_source_config)}")
+            st.write(f"📁 Files: {len(self.verify_file_info)}")
+
+        # Batch validation options
+        with st.expander("⚙️ Validation options", expanded=True):
+            col1, col2 = st.columns(2)
+            with col1:
+                table_mode = st.selectbox(
+                    "Table comparison mode",
+                    options=['standard', 'flow_list'],
+                    index=1,  # flow_list by default
+                    format_func=lambda x: 'Flow-list mode' if x == 'flow_list' else 'Standard mode',
+                    help="Select the table comparison algorithm"
+                )
+            with col2:
+                similarity_algorithm = st.selectbox(
+                    "Similarity algorithm",
+                    options=['ratio', 'partial_ratio', 'token_sort_ratio', 'token_set_ratio'],
+                    index=0,
+                    help="Select the text similarity algorithm"
+                )
+
+        # Start-batch-validation button
+        if st.button("🚀 Start batch validation", type="primary", use_container_width=True):
+            self._run_batch_cross_validation(table_mode, similarity_algorithm)
+
+        # Show previous batch validation results
+        if 'cross_validation_batch_result' in st.session_state and st.session_state.cross_validation_batch_result:
+            st.markdown("---")
+            self._display_batch_validation_results(st.session_state.cross_validation_batch_result)
+
+    def _generate_batch_validation_markdown(self, batch_results: dict, output_path: str):
+        """Generate the Markdown report for a batch validation"""
+
+        with open(output_path, "w", encoding="utf-8") as f:
+            f.write("# Batch Cross-validation Report\n\n")
 
-        try:
-            current_md_path = Path(self.file_paths[self.selected_file_index]).with_suffix('.md')
-            if not current_md_path.exists():
-                st.error("❌ The Markdown file for the current OCR result does not exist; cannot compare")
-                return
-            # Step 1: prepare the working directory
-            pre_validation_dir = Path(self.config['pre_validation'].get('out_dir', './output/pre_validation/')).resolve()
-            pre_validation_dir.mkdir(parents=True, exist_ok=True)
-            status_text.write(f"Working directory: {pre_validation_dir}")
+            # Basic info
+            f.write("## 📋 Basic Info\n\n")
+            f.write(f"- **OCR source:** {batch_results['ocr_source']}\n")
+            f.write(f"- **Verification source:** {batch_results['verify_source']}\n")
+            f.write(f"- **Table mode:** {batch_results['table_mode']}\n")
+            f.write(f"- **Similarity algorithm:** {batch_results['similarity_algorithm']}\n")
+            f.write(f"- **Validated at:** {batch_results['timestamp']}\n\n")
+
+            # Summary statistics
+            summary = batch_results['summary']
+            f.write("## 📊 Summary\n\n")
+            f.write(f"- **Total pages:** {summary['total_pages']}\n")
+            f.write(f"- **Successful pages:** {summary['successful_pages']}\n")
+            f.write(f"- **Failed pages:** {summary['failed_pages']}\n")
+            f.write(f"- **Total differences:** {summary['total_differences']}\n")
+            f.write(f"- **Table differences:** {summary['total_table_differences']}\n")
+            f.write(f"  - Amount differences: {summary.get('total_amount_differences', 0)}\n")
+            f.write(f"  - Date differences: {summary.get('total_datetime_differences', 0)}\n")
+            f.write(f"  - Text differences: {summary.get('total_text_differences', 0)}\n")
+            f.write(f"  - Pre-header differences: {summary.get('total_table_pre_header', 0)}\n")
+            f.write(f"  - Header position differences: {summary.get('total_table_header_position', 0)}\n")
+            f.write(f"  - Critical header errors: {summary.get('total_table_header_critical', 0)}\n")
+            f.write(f"  - Missing rows: {summary.get('total_table_row_missing', 0)}\n")
+            f.write(f"- **Paragraph differences:** {summary['total_paragraph_differences']}\n")
+            f.write(f"- **Severity counts:**\n")
+            f.write(f"  - High: {summary.get('total_high_severity', 0)}\n")
+            f.write(f"  - Medium: {summary.get('total_medium_severity', 0)}\n")
+            f.write(f"  - Low: {summary.get('total_low_severity', 0)}\n\n")
+
+            # Per-page results table
+            f.write("## 📄 Per-page Difference Statistics\n\n")
+            f.write("| Page | Status | Total | Table | Amount | Date | Text | Paragraph | Pre-header | Header pos | Header err | Missing rows | High | Med | Low |\n")
+            f.write("|------|--------|-------|-------|--------|------|------|-----------|------------|------------|------------|--------------|------|-----|-----|\n")
+
+            for page in batch_results['pages']:
+                if page['status'] == 'success':
+                    status_icon = "✅" if page['total_differences'] == 0 else "⚠️"
+                    f.write(f"| {page['page_num']} | {status_icon} | ")
+                    f.write(f"{page['total_differences']} | ")
+                    f.write(f"{page['table_differences']} | ")
+                    f.write(f"{page.get('amount_differences', 0)} | ")
+                    f.write(f"{page.get('datetime_differences', 0)} | ")
+                    f.write(f"{page.get('text_differences', 0)} | ")
+                    f.write(f"{page['paragraph_differences']} | ")
+                    f.write(f"{page.get('table_pre_header', 0)} | ")
+                    f.write(f"{page.get('table_header_position', 0)} | ")
+                    f.write(f"{page.get('table_header_critical', 0)} | ")
+                    f.write(f"{page.get('table_row_missing', 0)} | ")
+                    f.write(f"{page.get('high_severity', 0)} | ")
+                    f.write(f"{page.get('medium_severity', 0)} | ")
+                    f.write(f"{page.get('low_severity', 0)} |\n")
+                else:
+                    f.write(f"| {page['page_num']} | ❌ | - | - | - | - | - | - | - | - | - | - | - | - | - |\n")
+
+            f.write("\n")
+
+            # Issue summary
+            f.write("## 🔍 Issue Summary\n\n")
+
+            high_diff_pages = [p for p in batch_results['pages']
+                               if p['status'] == 'success' and p['total_differences'] > 10]
+            if high_diff_pages:
+                f.write("### ⚠️ High-difference pages (more than 10 differences)\n\n")
+                for page in high_diff_pages:
+                    f.write(f"- Page {page['page_num']}: {page['total_differences']} differences\n")
+                f.write("\n")
+
+            amount_error_pages = [p for p in batch_results['pages']
+                                  if p['status'] == 'success' and p.get('amount_differences', 0) > 0]
+            if amount_error_pages:
+                f.write("### 💰 Pages with amount differences\n\n")
+                for page in amount_error_pages:
+                    f.write(f"- Page {page['page_num']}: {page.get('amount_differences', 0)} amount differences\n")
+                f.write("\n")
+
+            header_error_pages = [p for p in batch_results['pages']
+                                  if p['status'] == 'success' and p.get('table_header_critical', 0) > 0]
+            if header_error_pages:
+                f.write("### ❌ Pages with critical header errors\n\n")
+                for page in header_error_pages:
+                    f.write(f"- Page {page['page_num']}: {page['table_header_critical']} header errors\n")
+                f.write("\n")
+
+            failed_pages = [p for p in batch_results['pages'] if p['status'] == 'failed']
+            if failed_pages:
+                f.write("### 💥 Pages that failed validation\n\n")
+                for page in failed_pages:
+                    f.write(f"- Page {page['page_num']}: {page.get('error', 'unknown error')}\n")
+                f.write("\n")
 
-            # Step 2: run OCR with the VLM
-            status_text.text("🤖 Running VLM OCR...")
+    def _run_batch_cross_validation(self, table_mode: str, similarity_algorithm: str):
+        """Run the batch cross-validation"""
+
+        # Prepare the output directory
+        pre_validation_dir = Path(self.config['pre_validation'].get('out_dir', './output/pre_validation/')).resolve()
+        pre_validation_dir.mkdir(parents=True, exist_ok=True)
+
+        # ✅ Batch result store - updated statistics fields
+        batch_results = {
+            'ocr_source': get_data_source_display_name(self.current_source_config),
+            'verify_source': get_data_source_display_name(self.verify_source_config),
+            'table_mode': table_mode,
+            'similarity_algorithm': similarity_algorithm,
+            'timestamp': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
+            'pages': [],
+            'summary': {
+                'total_pages': 0,
+                'successful_pages': 0,
+                'failed_pages': 0,
+                'total_differences': 0,
+                'total_table_differences': 0,
+                'total_amount_differences': 0,
+                'total_datetime_differences': 0,
+                'total_text_differences': 0,
+                'total_paragraph_differences': 0,
+                'total_table_pre_header': 0,
+                'total_table_header_position': 0,
+                'total_table_header_critical': 0,
+                'total_table_row_missing': 0,
+                'total_high_severity': 0,
+                'total_medium_severity': 0,
+                'total_low_severity': 0
+            }
+        }
+
+        # Create the progress bar
+        progress_bar = st.progress(0)
+        status_text = st.empty()
+
+        # Build the page-number maps
+        ocr_page_map = {info['page']: i for i, info in enumerate(self.file_info)}
+        verify_page_map = {info['page']: i for i, info in enumerate(self.verify_file_info)}
+
+        # Find the page numbers common to both data sources
+        common_pages = sorted(set(ocr_page_map.keys()) & set(verify_page_map.keys()))
+
+        if not common_pages:
+            st.error("❌ The two data sources share no common pages; nothing to compare")
+            return
+
+        batch_results['summary']['total_pages'] = len(common_pages)
+
+        # Create the detailed log area
+        with st.expander("📋 Detailed comparison log", expanded=True):
+            log_container = st.container()
+
+        # Compare page by page
+        for idx, page_num in enumerate(common_pages):
+            try:
+                # Update progress
+                progress = (idx + 1) / len(common_pages)
+                progress_bar.progress(progress)
+                status_text.text(f"Comparing page {page_num}... ({idx + 1}/{len(common_pages)})")
 
-            # Show the OCR process in an expander
-            with st.expander("🔍 VLM OCR process", expanded=True):
-                ocr_output = st.empty()
-
-                # Capture the OCR output
-                import io
-                import contextlib
-
-                # String buffer to capture print output
-                output_buffer = io.StringIO()
-
-                with contextlib.redirect_stdout(output_buffer):
-                    ocr_result = ocr_with_vlm(
-                        image_path=str(self.image_path),
-                        output_dir=str(pre_validation_dir),
-                        normalize_numbers=True
-                    )
-
-                # Show the captured OCR output
-                ocr_output.code(output_buffer.getvalue(), language='text')
-
-            status_text.text("✅ VLM OCR finished")
-
-            # Step 3: locate the file generated by the VLM
-            vlm_md_path = pre_validation_dir / f"{Path(self.image_path).stem}.md"
-
-            if not vlm_md_path.exists():
-                st.error("❌ The VLM OCR result file was not generated")
-                return
-
-            # Step 4: run the comparison
-            status_text.text("📊 Comparing OCR results...")
-
-            # Show the comparison process in an expander
-            comparison_result_path = pre_validation_dir / f"{current_md_path.stem}_comparison_result"
-            with st.expander("🔍 OCR comparison process", expanded=True):
-                compare_output = st.empty()
-
-                # Capture the comparison output
-                output_buffer = io.StringIO()
-
-                with contextlib.redirect_stdout(output_buffer):
-                    comparison_result = compare_ocr_results(
-                        file1_path=str(current_md_path),
-                        file2_path=str(vlm_md_path),
-                        output_file=str(comparison_result_path),
-                        output_format='both',
-                        ignore_images=True
-                    )
-
-                # Show the captured comparison output
-                compare_output.code(output_buffer.getvalue(), language='text')
-
-            status_text.text("✅ VLM pre-validation finished")
-
-            st.session_state.comparison_result = {
-                "image_path": self.image_path,
-                "comparison_result_json": f"{comparison_result_path}.json",
-                "comparison_result_md": f"{comparison_result_path}.md",
-                "comparison_result": comparison_result
-            }
+                # Resolve the file paths
+                ocr_file_index = ocr_page_map[page_num]
+                verify_file_index = verify_page_map[page_num]
+
+                ocr_md_path = Path(self.file_paths[ocr_file_index]).with_suffix('.md')
+                verify_md_path = Path(self.verify_file_paths[verify_file_index]).with_suffix('.md')
+
+                if not ocr_md_path.exists() or not verify_md_path.exists():
+                    with log_container:
+                        st.warning(f"⚠️ Page {page_num}: file missing, skipped")
+                    batch_results['summary']['failed_pages'] += 1
+                    continue
+
+                # Run the comparison
+                comparison_result_path = pre_validation_dir / f"{ocr_md_path.stem}_cross_validation"
+
+                # Capture the comparison output
+                import io
+                import contextlib
 
-            # Step 5: display the comparison results
-            self.display_comparison_results(comparison_result, detailed=False)
+                output_buffer = io.StringIO()
 
-            # Step 6: offer file downloads
-            # self.provide_download_options(pre_validation_dir, vlm_md_path, comparison_result)
+                with contextlib.redirect_stdout(output_buffer):
+                    comparison_result = compare_ocr_results(
+                        file1_path=str(ocr_md_path),
+                        file2_path=str(verify_md_path),
+                        output_file=str(comparison_result_path),
+                        output_format='both',
+                        ignore_images=True,
+                        table_mode=table_mode,
+                        similarity_algorithm=similarity_algorithm
+                    )
+
+                # ✅ Extract the statistics - updated fields
+                stats = comparison_result['statistics']
+
+                page_result = {
+                    'page_num': page_num,
+                    'ocr_file': str(ocr_md_path.name),
+                    'verify_file': str(verify_md_path.name),
+                    'total_differences': stats['total_differences'],
+                    'table_differences': stats['table_differences'],
+                    'amount_differences': stats.get('amount_differences', 0),
+                    'datetime_differences': stats.get('datetime_differences', 0),
+                    'text_differences': stats.get('text_differences', 0),
+                    'paragraph_differences': stats['paragraph_differences'],
+                    'table_pre_header': stats.get('table_pre_header', 0),
+                    'table_header_position': stats.get('table_header_position', 0),
+                    'table_header_critical': stats.get('table_header_critical', 0),
+                    'table_row_missing': stats.get('table_row_missing', 0),
+                    'high_severity': stats.get('high_severity', 0),
+                    'medium_severity': stats.get('medium_severity', 0),
+                    'low_severity': stats.get('low_severity', 0),
+                    'status': 'success',
+                    'comparison_json': f"{comparison_result_path}.json",
+                    'comparison_md': f"{comparison_result_path}.md"
+                }
+
+                batch_results['pages'].append(page_result)
+                batch_results['summary']['successful_pages'] += 1
+                batch_results['summary']['total_differences'] += stats['total_differences']
+                batch_results['summary']['total_table_differences'] += stats['table_differences']
+                batch_results['summary']['total_amount_differences'] += stats.get('amount_differences', 0)
+                batch_results['summary']['total_datetime_differences'] += stats.get('datetime_differences', 0)
+                batch_results['summary']['total_text_differences'] += stats.get('text_differences', 0)
+                batch_results['summary']['total_paragraph_differences'] += stats['paragraph_differences']
+                batch_results['summary']['total_table_pre_header'] += stats.get('table_pre_header', 0)
+                batch_results['summary']['total_table_header_position'] += stats.get('table_header_position', 0)
+                batch_results['summary']['total_table_header_critical'] += stats.get('table_header_critical', 0)
+                batch_results['summary']['total_table_row_missing'] += stats.get('table_row_missing', 0)
+                batch_results['summary']['total_high_severity'] += stats.get('high_severity', 0)
+                batch_results['summary']['total_medium_severity'] += stats.get('medium_severity', 0)
+                batch_results['summary']['total_low_severity'] += stats.get('low_severity', 0)
+
+                # Show the result for the current page
+                with log_container:
+                    if stats['total_differences'] == 0:
+                        st.success(f"✅ Page {page_num}: exact match")
+                    else:
+                        st.warning(f"⚠️ Page {page_num}: {stats['total_differences']} differences found")
 
             except Exception as e:
-            st.error(f"❌ VLM pre-validation failed: {e}")
-            st.exception(e)
-
+                with log_container:
+                    st.error(f"❌ Page {page_num}: comparison failed - {str(e)}")
+
+                page_result = {
+                    'page_num': page_num,
+                    'status': 'failed',
+                    'error': str(e)
+                }
+                batch_results['pages'].append(page_result)
+                batch_results['summary']['failed_pages'] += 1
+
+        # Save the batch results
+        batch_result_path = pre_validation_dir / f"{self.current_source_config['name']}_{self.current_source_config['ocr_tool']}_vs_{self.verify_source_config['ocr_tool']}_batch_cross_validation"
+
+        # Save JSON
+        with open(f"{batch_result_path}.json", "w", encoding="utf-8") as f:
+            json.dump(batch_results, f, ensure_ascii=False, indent=2)
+
+        # Generate the Markdown report
+        self._generate_batch_validation_markdown(batch_results, f"{batch_result_path}.md")
+
+        # Save to session state
+        st.session_state.cross_validation_batch_result = batch_results
+
+        # Completion notice
+        progress_bar.progress(1.0)
+        status_text.text("✅ Batch validation complete!")
+
+        st.success(f"🎉 Batch validation complete! Succeeded: {batch_results['summary']['successful_pages']}, failed: {batch_results['summary']['failed_pages']}")
+
+    def _display_batch_validation_results(self, batch_results: dict):
+        """Display the batch validation results"""
+
+        st.header("📊 Batch Validation Results")
+
+        # Summary statistics
+        summary = batch_results['summary']
+
+        col1, col2, col3, col4 = st.columns(4)
+        with col1:
+            st.metric("Total pages", summary['total_pages'])
+        with col2:
+            st.metric("Successful pages", summary['successful_pages'],
+                      delta=f"{summary['successful_pages']/summary['total_pages']*100:.1f}%")
+        with col3:
+            st.metric("Failed pages", summary['failed_pages'],
+                      delta=f"-{summary['failed_pages']}" if summary['failed_pages'] > 0 else "0")
+        with col4:
+            st.metric("Total differences", summary['total_differences'])
+
+        # ✅ Detailed per-type statistics - updated display
+        st.subheader("📈 Difference Types")
+
+        col1, col2, col3 = st.columns(3)
+        with col1:
+            st.metric("Table differences", summary['total_table_differences'])
+            st.caption(f"Amount: {summary.get('total_amount_differences', 0)} | Date: {summary.get('total_datetime_differences', 0)} | Text: {summary.get('total_text_differences', 0)}")
+        with col2:
+            st.metric("Paragraph differences", summary['total_paragraph_differences'])
+        with col3:
+            st.metric("Severity", f"High: {summary.get('total_high_severity', 0)}  Med: {summary.get('total_medium_severity', 0)}  Low: {summary.get('total_low_severity', 0)}")
+
+        # Table structure difference statistics
+        with st.expander("📋 Table structure differences", expanded=False):
+            col1, col2, col3, col4 = st.columns(4)
+            with col1:
+                st.metric("Pre-header", summary.get('total_table_pre_header', 0))
+            with col2:
+                st.metric("Header position", summary.get('total_table_header_position', 0))
+            with col3:
+                st.metric("Header errors", summary.get('total_table_header_critical', 0))
+            with col4:
+                st.metric("Missing rows", summary.get('total_table_row_missing', 0))
+
+        # ✅ Per-page result table - updated columns
+        st.subheader("📄 Per-page Results")
+
+        # Build the DataFrame
+        page_data = []
+        for page in batch_results['pages']:
+            if page['status'] == 'success':
+                page_data.append({
+                    'Page': page['page_num'],
+                    'Status': '✅ OK' if page['total_differences'] == 0 else '⚠️ Differs',
+                    'Total': page['total_differences'],
+                    'Table': page['table_differences'],
+                    'Amount': page.get('amount_differences', 0),
+                    'Date': page.get('datetime_differences', 0),
+                    'Text': page.get('text_differences', 0),
+                    'Paragraph': page['paragraph_differences'],
+                    'Pre-header': page.get('table_pre_header', 0),
+                    'Header pos': page.get('table_header_position', 0),
+                    'Header err': page.get('table_header_critical', 0),
+                    'Missing rows': page.get('table_row_missing', 0),
+                    'High': page.get('high_severity', 0),
+                    'Med': page.get('medium_severity', 0),
+                    'Low': page.get('low_severity', 0)
+                })
+            else:
+                page_data.append({
+                    'Page': page['page_num'],
+                    'Status': '❌ Failed',
+                    'Total': '-', 'Table': '-', 'Amount': '-', 'Date': '-',
+                    'Text': '-', 'Paragraph': '-', 'Pre-header': '-', 'Header pos': '-',
+                    'Header err': '-', 'Missing rows': '-', 'High': '-', 'Med': '-', 'Low': '-'
+                })
+
+        df_pages = pd.DataFrame(page_data)
+
+        # Render the table
+        st.dataframe(
+            df_pages,
+            use_container_width=True,
+            hide_index=True,
+            column_config={
+                "Page": st.column_config.NumberColumn("Page", width="small"),
+                "Status": st.column_config.TextColumn("Status", width="small"),
+                "Total": st.column_config.NumberColumn("Total", width="small"),
+                "Table": st.column_config.NumberColumn("Table", width="small"),
+                "Amount": st.column_config.NumberColumn("Amount", width="small"),
+                "Date": st.column_config.NumberColumn("Date", width="small"),
+                "Text": st.column_config.NumberColumn("Text", width="small"),
+                "Paragraph": st.column_config.NumberColumn("Paragraph", width="small"),
+            }
+        )
+
+        # Download options
+        st.subheader("📥 Export Reports")
+
+        col1, col2 = st.columns(2)
+
+        with col1:
+            # Export Excel
+            excel_buffer = BytesIO()
+            df_pages.to_excel(excel_buffer, index=False, sheet_name='Validation Results')
+
+            st.download_button(
+                label="📊 Download Excel report",
+                data=excel_buffer.getvalue(),
+                file_name=f"batch_validation_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.xlsx",
+                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+            )
+
+        with col2:
+            # Export JSON
+            json_data = json.dumps(batch_results, ensure_ascii=False, indent=2)
+
+            st.download_button(
+                label="📄 Download JSON report",
+                data=json_data,
+                file_name=f"batch_validation_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.json",
+                mime="application/json"
+            )
+
+    @st.dialog("View Cross-validation Results", width="large", dismissible=True, on_dismiss="rerun")
+    def show_batch_cross_validation_results_dialog(self):
+        if 'cross_validation_batch_result' in st.session_state and st.session_state.cross_validation_batch_result:
+            self._display_batch_validation_results(st.session_state.cross_validation_batch_result)
+        else:
+            st.info("No cross-validation results yet. Run the cross-validation first")
+
     def display_comparison_results(self, comparison_result: dict, detailed: bool = True):
         """Show a summary of the comparison results - rendered as a DataFrame"""
@@ -572,7 +1000,7 @@ class StreamlitOCRValidator:
         with col2:
             st.metric("Table differences", stats['table_differences'])
         with col3:
-            st.metric("Amount differences", stats['amount_differences'])
+            st.metric("Amount differences (in tables)", stats['amount_differences'])
         with col4:
             st.metric("Paragraph differences", stats['paragraph_differences'])
@@ -698,9 +1126,9 @@ class StreamlitOCRValidator:
             )
 
         with col2:
-            st.write("**VLM result:**")
+            st.write("**Verification source result:**")
             st.text_area(
-                "VLM result details",
+                "Verification source result details",
                 value=diff['file2_value'],
                 height=200,
                 key=f"vlm_{selected_diff_index}",
@@ -828,7 +1256,7 @@ class StreamlitOCRValidator:
         with col2:
             # Export the statistics report
             stats_data = {
-                'Item': ['Total differences', 'Table differences', 'Amount differences', 'Paragraph differences'],
+                'Item': ['Total differences', 'Table differences', 'Amount differences (in tables)', 'Paragraph differences'],
                 'Count': [
                     comparison_result['statistics']['total_differences'],
                     comparison_result['statistics']['table_differences'],
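The export block pairs labels and values by position; the same mapping as a generic helper (a sketch, reusing the labels from the hunk above):

```python
import pandas as pd

def stats_frame(statistics: dict, fields: dict) -> pd.DataFrame:
    """Build a two-column summary table from {statistics_key: display_label}."""
    return pd.DataFrame({
        'Item': list(fields.values()),
        'Count': [statistics.get(key, 0) for key in fields],
    })

# stats_frame(comparison_result['statistics'], {
#     'total_differences': 'Total differences',
#     'table_differences': 'Table differences',
#     'amount_differences': 'Amount differences (in tables)',
#     'paragraph_differences': 'Paragraph differences',
# })
```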
@@ -875,31 +1303,6 @@ class StreamlitOCRValidator:
         else:
             st.error("❌ A large number of differences was found; consider re-running OCR or checking the source image quality")
 
-    @st.dialog("View Pre-validation Results", width="large", dismissible=True, on_dismiss="rerun")
-    def show_comparison_results_dialog(self):
-        """Dialog showing the VLM pre-validation results"""
-        current_md_path = Path(self.file_paths[self.selected_file_index]).with_suffix('.md')
-        pre_validation_dir = Path(self.config['pre_validation'].get('out_dir', './output/pre_validation/')).resolve()
-        comparison_result_path = pre_validation_dir / f"{current_md_path.stem}_comparison_result.json"
-        if 'comparison_result' in st.session_state and st.session_state.comparison_result:
-            self.display_comparison_results(st.session_state.comparison_result['comparison_result'])
-        elif comparison_result_path.exists():
-            # If a result file exists under pre_validation_dir, offer to load it
-            if st.button("Load pre-validation results"):
-                with open(comparison_result_path, "r", encoding="utf-8") as f:
-                    comparison_json_result = json.load(f)
-                comparison_result = {
-                    "image_path": self.image_path,
-                    "comparison_result_json": str(comparison_result_path),
-                    "comparison_result_md": str(comparison_result_path.with_suffix('.md')),
-                    "comparison_result": comparison_json_result
-                }
-
-                st.session_state.comparison_result = comparison_result
-                self.display_comparison_results(comparison_json_result)
-        else:
-            st.info("No pre-validation results yet. Run the VLM pre-validation first")
-
     def create_compact_layout(self, config):
         """Create the scrolling compact layout"""
         return self.layout_manager.create_compact_layout(config)
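The removed dialog is superseded by `show_batch_cross_validation_results_dialog`, which parks its payload in `st.session_state` so it survives Streamlit's rerun-per-interaction model. The pattern in isolation (a sketch; the decorator arguments mirror those in the patch):

```python
import streamlit as st

@st.dialog("Results", width="large")
def show_results():
    # Every interaction reruns the script, so the payload must live in session_state.
    result = st.session_state.get("cross_validation_batch_result")
    if result:
        st.json(result["summary"])
    else:
        st.info("No results yet - run the validation first.")
```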
@@ -999,17 +1402,17 @@ def main():
             st.rerun()
         else:
             st.warning("No OCR result files found in the current data source")
-
-        # VLM pre-validation button
-        if st.button("VLM Pre-validation", type="primary", icon=":material/compare_arrows:"):
+
+        # Cross-validation button
+        if st.button("Cross-validation", type="primary", icon=":material/compare_arrows:"):
             if validator.image_path and validator.md_content:
-                validator.vlm_pre_validation()
+                validator.cross_validation()
             else:
                 message_box("❌ Please select an OCR data file first", "error")
 
         # View-results button
-        if st.button("View pre-validation results", type="secondary", icon=":material/quick_reference_all:"):
-            validator.show_comparison_results_dialog()
+        if st.button("View validation results", type="secondary", icon=":material/quick_reference_all:"):
+            validator.show_batch_cross_validation_results_dialog()
 
         # Show statistics for the current data source
         with st.expander("🔧 OCR Tool Statistics", expanded=False):
@@ -1035,7 +1438,7 @@ def main():
             st.write("**Details:**", stats['tool_info'])
 
     # The remaining tabs are unchanged...
-    tab1, tab2, tab3 = st.tabs(["📄 Manual Content Review", "📄 VLM Pre-validation Results", "📊 Table Analysis"])
+    tab1, tab2, tab3 = st.tabs(["📄 Manual Content Review", "🔍 Cross-validation Results", "📊 Table Analysis"])
 
     with tab1:
         validator.create_compact_layout(config)
@@ -1044,9 +1447,15 @@ def main():
         # st.header("📄 VLM Pre-validation Results")
         current_md_path = Path(validator.file_paths[validator.selected_file_index]).with_suffix('.md')
         pre_validation_dir = Path(validator.config['pre_validation'].get('out_dir', './output/pre_validation/')).resolve()
-        comparison_result_path = pre_validation_dir / f"{current_md_path.stem}_comparison_result.json"
-        pre_validation_path = pre_validation_dir / f"{current_md_path.stem}.md"
+        comparison_result_path = pre_validation_dir / f"{current_md_path.stem}_cross_validation.json"
+        # pre_validation_path = pre_validation_dir / f"{current_md_path.stem}.md"
+        verify_md_path = validator.find_verify_md_path(validator.selected_file_index)
+
         if comparison_result_path.exists():
+            # Load and display the validation result
+            with open(comparison_result_path, "r", encoding="utf-8") as f:
+                comparison_result = json.load(f)
+
             # OCR result on the left, VLM result on the right
             col1, col2 = st.columns([1,1])
             with col1:
@@ -1058,13 +1467,17 @@ def main():
                 layout_type = "compact"
                 validator.layout_manager.render_content_by_mode(original_md_content, "HTML渲染", font_size, height, layout_type)
             with col2:
-                st.subheader("🤖 VLM Result")
-                with open(pre_validation_path, "r", encoding="utf-8") as f:
-                    pre_validation_md_content = f.read()
+                st.subheader("🤖 Verification Result")
+                with open(str(verify_md_path), "r", encoding="utf-8") as f:
+                    verify_md_content = f.read()
                 font_size = config['styles'].get('font_size', 10)
                 height = config['styles']['layout'].get('default_height', 800)
                 layout_type = "compact"
-                validator.layout_manager.render_content_by_mode(pre_validation_md_content, "HTML渲染", font_size, height, layout_type)
+                validator.layout_manager.render_content_by_mode(verify_md_content, "HTML渲染", font_size, height, layout_type)
+
+            # Show the difference statistics
+            st.markdown("---")
+            validator.display_comparison_results(comparison_result, detailed=True)
         else:
             st.info("No pre-validation results yet. Run the VLM pre-validation first")
@@ -1079,35 +1492,5 @@ def main():
         else:
             st.info("No table data detected in the current OCR result")
 
-        # with tab4:
-        #     # Data statistics page - keeps the original logic
-        #     st.header("📈 OCR Data Statistics")
-
-        #     # Add data-source-specific statistics
-        #     if validator.current_source_config:
-        #         st.subheader(f"📊 {get_data_source_display_name(validator.current_source_config)} - Statistics")
-
-        #         if stats['categories']:
-        #             st.subheader("📊 Category Distribution")
-        #             fig_pie = px.pie(
-        #                 values=list(stats['categories'].values()),
-        #                 names=list(stats['categories'].keys()),
-        #                 title="Text category distribution"
-        #             )
-        #             st.plotly_chart(fig_pie, use_container_width=True)
-
-        #         # Error rate analysis
-        #         st.subheader("📈 Quality Analysis")
-        #         accuracy_data = {
-        #             'Status': ['Correct', 'Error'],
-        #             'Count': [stats['clickable_texts'] - stats['marked_errors'], stats['marked_errors']]
-        #         }
-
-        #         fig_bar = px.bar(
-        #             accuracy_data, x='Status', y='Count', title="Recognition quality distribution",
-        #             color='Status', color_discrete_map={'Correct': 'green', 'Error': 'red'}
-        #         )
-        #         st.plotly_chart(fig_bar, use_container_width=True)
-
 if __name__ == "__main__":
     main()
|