""" 验证结果展示模块 """ import streamlit as st import pandas as pd import plotly.express as px from io import BytesIO import json from pathlib import Path from ocr_validator_utils import process_all_images_in_content def display_single_page_cross_validation(validator, config): """显示单页交叉验证结果 Args: validator: OCR验证器实例 config: 配置字典 """ current_md_path = Path(validator.file_paths[validator.selected_file_index]).with_suffix('.md') pre_validation_dir = Path(validator.config['pre_validation'].get('out_dir', './output/pre_validation/')).resolve() comparison_result_path = pre_validation_dir / f"{current_md_path.stem}_cross_validation.json" verify_md_path = validator.find_verify_md_path(validator.selected_file_index) # 检查验证结果是否与当前数据源匹配 result_is_valid = False comparison_result = None if comparison_result_path.exists(): try: with open(comparison_result_path, "r", encoding="utf-8") as f: comparison_result = json.load(f) # 检查文件路径是否匹配(验证结果是否为当前页面生成) if (comparison_result.get('file1_path') == str(current_md_path) and comparison_result.get('file2_path') == str(verify_md_path)): result_is_valid = True except Exception as e: st.error(f"读取验证结果失败: {e}") if result_is_valid: _display_valid_cross_validation_result( validator, config, current_md_path, verify_md_path, comparison_result ) else: _display_no_validation_result_prompt(validator) def _display_valid_cross_validation_result(validator, config, current_md_path, verify_md_path, comparison_result): """显示有效的交叉验证结果 Args: validator: OCR验证器实例 config: 配置字典 current_md_path: 当前OCR文件路径 verify_md_path: 验证文件路径 comparison_result: 对比结果字典 """ col1, col2 = st.columns([1, 1]) # 左侧:原OCR识别结果 with col1: st.subheader("🤖 原OCR识别结果") if current_md_path.exists(): with open(current_md_path, "r", encoding="utf-8") as f: original_md_content = f.read() original_md_content = process_all_images_in_content(original_md_content, current_md_path) font_size = config['styles'].get('font_size', 10) height = config['styles']['layout'].get('default_height', 800) validator.layout_manager.render_content_by_mode( original_md_content, "HTML渲染", font_size, height, "compact" ) else: st.error("原OCR文件不存在") # 右侧:验证识别结果 with col2: st.subheader("🤖 验证识别结果") if verify_md_path and verify_md_path.exists(): with open(str(verify_md_path), "r", encoding="utf-8") as f: verify_md_content = f.read() verify_md_content = process_all_images_in_content(verify_md_content, verify_md_path) font_size = config['styles'].get('font_size', 10) height = config['styles']['layout'].get('default_height', 800) validator.layout_manager.render_content_by_mode( verify_md_content, "HTML渲染", font_size, height, "compact" ) else: st.warning("验证文件不存在") st.markdown("---") # 显示详细的对比结果 display_comparison_results(comparison_result, detailed=True) def _display_no_validation_result_prompt(validator): """显示无验证结果的提示信息 Args: validator: OCR验证器实例 """ st.info("💡 暂无当前页面的交叉验证结果,请点击上方「交叉验证」按钮运行验证") # 显示当前数据源信息 col1, col2 = st.columns(2) with col1: st.write("**当前OCR数据源:**") from ocr_validator_utils import get_data_source_display_name if validator.current_source_config and validator.file_info: current_source_name = get_data_source_display_name(validator.current_source_config) current_page = validator.file_info[validator.selected_file_index]['page'] st.code(f"{current_source_name}\n第 {current_page} 页") else: st.warning("未选择OCR数据源") with col2: st.write("**当前验证数据源:**") if validator.verify_source_config: from ocr_validator_utils import get_data_source_display_name verify_source_name = get_data_source_display_name(validator.verify_source_config) st.code(verify_source_name) else: st.warning("未选择验证数据源") # 添加操作提示 st.markdown("---") st.markdown(""" ### 📝 操作步骤: 1. **选择数据源**: 在页面顶部选择不同的OCR数据源和验证数据源 2. **运行验证**: 点击「交叉验证」按钮开始批量验证 3. **查看结果**: 验证完成后,在此处查看详细对比结果 💡 **提示**: - 确保两个数据源包含相同页码的文件 - 建议选择不同OCR工具的结果进行交叉验证 - 验证结果会自动保存,可随时查看 """) def display_comparison_results(comparison_result: dict, detailed: bool = True): """显示对比结果 Args: comparison_result: 对比结果字典 detailed: 是否显示详细信息 """ st.header("📊 交叉验证结果") stats = comparison_result['statistics'] # 显示主要指标 col1, col2, col3, col4 = st.columns(4) with col1: st.metric("总差异数", stats['total_differences']) with col2: st.metric("表格差异", stats['table_differences']) with col3: st.metric("金额差异", stats.get('amount_differences', 0)) with col4: st.metric("段落差异", stats['paragraph_differences']) # 根据差异数量显示不同的提示 if stats['total_differences'] == 0: st.success("🎉 完美匹配!两个数据源结果完全一致") else: st.warning(f"⚠️ 发现 {stats['total_differences']} 个差异,建议人工检查") if comparison_result['differences'] and detailed: _display_differences_dataframe(comparison_result) _display_difference_details(comparison_result) _display_difference_charts(comparison_result) _provide_download_options(comparison_result) def _display_differences_dataframe(comparison_result: dict): """显示差异DataFrame""" st.subheader("🔍 差异详情对比") diff_data = [] for i, diff in enumerate(comparison_result['differences'], 1): diff_data.append({ '序号': i, '位置': diff['position'], '类型': diff['type'], '原OCR结果': diff['file1_value'][:100] + ('...' if len(diff['file1_value']) > 100 else ''), '验证结果': diff['file2_value'][:100] + ('...' if len(diff['file2_value']) > 100 else ''), '描述': diff['description'][:80] + ('...' if len(diff['description']) > 80 else ''), '严重程度': _get_severity_level(diff) }) df_differences = pd.DataFrame(diff_data) def highlight_severity(val): if val == '高': return 'background-color: #ffebee; color: #c62828' elif val == '中': return 'background-color: #fff3e0; color: #ef6c00' elif val == '低': return 'background-color: #e8f5e8; color: #2e7d32' return '' styled_df = df_differences.style.map( highlight_severity, subset=['严重程度'] ).format({'序号': '{:d}'}) st.dataframe(styled_df, width='stretch', height=400, hide_index=True) def _display_difference_details(comparison_result: dict): """显示详细差异""" st.subheader("🔍 详细差异查看") selected_diff_index = st.selectbox( "选择要查看的差异:", options=range(len(comparison_result['differences'])), format_func=lambda x: f"差异 {x+1}: {comparison_result['differences'][x]['position']} - {comparison_result['differences'][x]['type']}", key="selected_diff" ) if selected_diff_index is not None: diff = comparison_result['differences'][selected_diff_index] col1, col2 = st.columns(2) with col1: st.write("**原OCR结果:**") st.text_area("原OCR结果详情", value=diff['file1_value'], height=200, key=f"original_{selected_diff_index}", label_visibility="collapsed") with col2: st.write("**验证结果:**") st.text_area("验证结果详情", value=diff['file2_value'], height=200, key=f"verify_{selected_diff_index}", label_visibility="collapsed") st.info(f"**位置:** {diff['position']}") st.info(f"**类型:** {diff['type']}") st.info(f"**描述:** {diff['description']}") st.info(f"**严重程度:** {_get_severity_level(diff)}") def _display_difference_charts(comparison_result: dict): """显示差异统计图表""" st.subheader("📈 差异类型分布") type_counts = {} severity_counts = {'高': 0, '中': 0, '低': 0} for diff in comparison_result['differences']: diff_type = diff['type'] type_counts[diff_type] = type_counts.get(diff_type, 0) + 1 severity = _get_severity_level(diff) severity_counts[severity] += 1 col1, col2 = st.columns(2) with col1: if type_counts: fig_type = px.pie( values=list(type_counts.values()), names=list(type_counts.keys()), title="差异类型分布" ) st.plotly_chart(fig_type, width='stretch') with col2: fig_severity = px.bar( x=list(severity_counts.keys()), y=list(severity_counts.values()), title="差异严重程度分布", color=list(severity_counts.keys()), color_discrete_map={'高': '#f44336', '中': '#ff9800', '低': '#4caf50'} ) st.plotly_chart(fig_severity, width='stretch') def _provide_download_options(comparison_result: dict): """提供下载选项""" st.subheader("📥 导出验证结果") col1, col2, col3 = st.columns(3) with col1: if comparison_result['differences']: diff_data = [] for i, diff in enumerate(comparison_result['differences'], 1): diff_data.append({ '序号': i, '位置': diff['position'], '类型': diff['type'], '原OCR结果': diff['file1_value'], '验证结果': diff['file2_value'], '描述': diff['description'], '严重程度': _get_severity_level(diff) }) df_export = pd.DataFrame(diff_data) excel_buffer = BytesIO() df_export.to_excel(excel_buffer, index=False, sheet_name='差异详情') st.download_button( label="📊 下载差异详情(Excel)", data=excel_buffer.getvalue(), file_name=f"comparison_differences_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.xlsx", mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", key="download_differences_excel" ) with col2: stats_data = { '统计项目': ['总差异数', '表格差异', '金额差异', '段落差异'], '数量': [ comparison_result['statistics']['total_differences'], comparison_result['statistics']['table_differences'], comparison_result['statistics'].get('amount_differences', 0), comparison_result['statistics']['paragraph_differences'] ] } df_stats = pd.DataFrame(stats_data) csv_stats = df_stats.to_csv(index=False) st.download_button( label="📈 下载统计报告(CSV)", data=csv_stats, file_name=f"comparison_stats_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.csv", mime="text/csv", key="download_stats_csv" ) with col3: report_json = json.dumps(comparison_result, ensure_ascii=False, indent=2) st.download_button( label="📄 下载完整报告(JSON)", data=report_json, file_name=f"comparison_full_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.json", mime="application/json", key="download_full_json" ) def _get_severity_level(diff: dict) -> str: """判断严重程度 Args: diff: 差异字典 Returns: 严重程度: '高', '中', '低' """ if 'severity' in diff: severity_map = {'critical': '高', 'high': '高', 'medium': '中', 'low': '低'} return severity_map.get(diff['severity'], '中') diff_type = diff['type'].lower() # 金额和数字类差异为高严重度 if 'amount' in diff_type or 'number' in diff_type: return '高' # 表格和结构类差异为中严重度 if 'table' in diff_type or 'structure' in diff_type: return '中' # 根据相似度判断 if 'similarity' in diff: similarity = diff['similarity'] if similarity < 50: return '高' elif similarity < 85: return '中' else: return '低' # 根据长度差异判断 len_diff = abs(len(diff['file1_value']) - len(diff['file2_value'])) if len_diff > 50: return '高' elif len_diff > 10: return '中' else: return '低'