| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390 |
- """
- 验证结果展示模块
- """
- import streamlit as st
- import pandas as pd
- import plotly.express as px
- from io import BytesIO
- import json
- from pathlib import Path
- from ocr_validator_utils import process_all_images_in_content
def display_single_page_cross_validation(validator, config):
    """Render the cross-validation view for the currently selected page.

    Looks for ``<stem>_cross_validation.json`` in the pre-validation output
    directory. The stored result is shown only when its ``file1_path`` and
    ``file2_path`` equal the current OCR markdown path and the verify
    markdown path; otherwise the "no result yet" prompt is shown.

    Args:
        validator: OCR validator instance. ``file_paths``,
            ``selected_file_index``, ``config`` and ``find_verify_md_path``
            are used here.
        config: Configuration dict, forwarded to the result renderer.
    """
    page_index = validator.selected_file_index
    current_md_path = Path(validator.file_paths[page_index]).with_suffix('.md')
    out_dir = validator.config['pre_validation'].get('out_dir', './output/pre_validation/')
    result_file = Path(out_dir).resolve() / f"{current_md_path.stem}_cross_validation.json"
    verify_md_path = validator.find_verify_md_path(page_index)

    comparison_result = None
    matches_current_page = False

    if result_file.exists():
        try:
            with open(result_file, "r", encoding="utf-8") as fh:
                comparison_result = json.load(fh)

            # A stored result only counts when it was produced for exactly
            # this pair of files.
            recorded_paths = (
                comparison_result.get('file1_path'),
                comparison_result.get('file2_path'),
            )
            expected_paths = (str(current_md_path), str(verify_md_path))
            matches_current_page = recorded_paths == expected_paths
        except Exception as e:
            st.error(f"读取验证结果失败: {e}")

    if not matches_current_page:
        _display_no_validation_result_prompt(validator)
        return

    _display_valid_cross_validation_result(
        validator, config, current_md_path, verify_md_path, comparison_result
    )
def _display_valid_cross_validation_result(validator, config, current_md_path, verify_md_path, comparison_result):
    """Show both recognition outputs side by side, then the comparison report.

    Args:
        validator: OCR validator instance; its ``layout_manager`` renders
            the markdown content.
        config: Configuration dict; ``config['styles']`` supplies the font
            size and ``config['styles']['layout']`` the default height.
        current_md_path: Path of the original OCR markdown file.
        verify_md_path: Path of the verify markdown file (may be falsy).
        comparison_result: Comparison result dict passed on to
            ``display_comparison_results``.
    """

    def render_markdown_file(md_path):
        # Read the file, let the image helper rewrite image references, and
        # render the outcome as compact HTML.
        with open(md_path, "r", encoding="utf-8") as fh:
            content = fh.read()
        content = process_all_images_in_content(content, md_path)

        styles = config['styles']
        font_size = styles.get('font_size', 10)
        height = styles['layout'].get('default_height', 800)
        validator.layout_manager.render_content_by_mode(
            content, "HTML渲染", font_size, height, "compact"
        )

    left_col, right_col = st.columns([1, 1])

    # Left: original OCR output.
    with left_col:
        st.subheader("🤖 原OCR识别结果")
        if current_md_path.exists():
            render_markdown_file(current_md_path)
        else:
            st.error("原OCR文件不存在")

    # Right: verify output.
    with right_col:
        st.subheader("🤖 验证识别结果")
        if verify_md_path and verify_md_path.exists():
            render_markdown_file(verify_md_path)
        else:
            st.warning("验证文件不存在")

    st.markdown("---")

    # Detailed comparison report below the two panes.
    display_comparison_results(comparison_result, detailed=True)
def _display_no_validation_result_prompt(validator):
    """Tell the user that no cross-validation result exists for this page.

    Shows an info banner, the currently selected OCR and verify data sources
    side by side, and a short how-to for running the cross validation.

    Args:
        validator: OCR validator instance; ``current_source_config``,
            ``file_info``, ``selected_file_index`` and
            ``verify_source_config`` are read.
    """
    st.info("💡 暂无当前页面的交叉验证结果,请点击上方「交叉验证」按钮运行验证")

    # Current data source information.
    ocr_col, verify_col = st.columns(2)

    with ocr_col:
        st.write("**当前OCR数据源:**")
        # Function-scope import; the name stays bound for the second column.
        from ocr_validator_utils import get_data_source_display_name

        if not (validator.current_source_config and validator.file_info):
            st.warning("未选择OCR数据源")
        else:
            source_label = get_data_source_display_name(validator.current_source_config)
            page_number = validator.file_info[validator.selected_file_index]['page']
            st.code(f"{source_label}\n第 {page_number} 页")

    with verify_col:
        st.write("**当前验证数据源:**")
        verify_config = validator.verify_source_config
        if verify_config:
            st.code(get_data_source_display_name(verify_config))
        else:
            st.warning("未选择验证数据源")

    # Usage hints.
    st.markdown("---")
    st.markdown("""
    ### 📝 操作步骤:

    1. **选择数据源**: 在页面顶部选择不同的OCR数据源和验证数据源
    2. **运行验证**: 点击「交叉验证」按钮开始批量验证
    3. **查看结果**: 验证完成后,在此处查看详细对比结果

    💡 **提示**:
    - 确保两个数据源包含相同页码的文件
    - 建议选择不同OCR工具的结果进行交叉验证
    - 验证结果会自动保存,可随时查看
    """)
def display_comparison_results(comparison_result: dict, detailed: bool = True):
    """Render the headline metrics and, optionally, the detailed sections.

    Args:
        comparison_result: Comparison result dict; the ``statistics`` and
            ``differences`` entries are read.
        detailed: When true and at least one difference exists, also render
            the difference table, the per-difference viewer, the charts and
            the download buttons.
    """
    st.header("📊 交叉验证结果")

    stats = comparison_result['statistics']

    # (label, statistics key, required). Only the amount counter falls back
    # to 0 when the key is absent.
    metric_specs = (
        ("总差异数", 'total_differences', True),
        ("表格差异", 'table_differences', True),
        ("金额差异", 'amount_differences', False),
        ("段落差异", 'paragraph_differences', True),
    )
    for column, (label, key, required) in zip(st.columns(4), metric_specs):
        with column:
            value = stats[key] if required else stats.get(key, 0)
            st.metric(label, value)

    # Summary banner depends on whether anything differs at all.
    if stats['total_differences'] == 0:
        st.success("🎉 完美匹配!两个数据源结果完全一致")
    else:
        st.warning(f"⚠️ 发现 {stats['total_differences']} 个差异,建议人工检查")

    if not (comparison_result['differences'] and detailed):
        return

    _display_differences_dataframe(comparison_result)
    _display_difference_details(comparison_result)
    _display_difference_charts(comparison_result)
    _provide_download_options(comparison_result)
def _display_differences_dataframe(comparison_result: dict):
    """Show every difference as one row of a severity-highlighted table.

    Long values are clipped for display: both recognition values to 100
    characters and the description to 80, each followed by ``...`` when
    something was cut off.

    Args:
        comparison_result: Comparison result dict; ``differences`` is read.
    """
    st.subheader("🔍 差异详情对比")

    def clip(text, limit):
        # Keep the first `limit` characters and flag truncation.
        return text[:limit] + ('...' if len(text) > limit else '')

    rows = [
        {
            '序号': seq,
            '位置': diff['position'],
            '类型': diff['type'],
            '原OCR结果': clip(diff['file1_value'], 100),
            '验证结果': clip(diff['file2_value'], 100),
            '描述': clip(diff['description'], 80),
            '严重程度': _get_severity_level(diff),
        }
        for seq, diff in enumerate(comparison_result['differences'], 1)
    ]
    table = pd.DataFrame(rows)

    # Cell CSS per severity label; any other value stays unstyled.
    severity_css = {
        '高': 'background-color: #ffebee; color: #c62828',
        '中': 'background-color: #fff3e0; color: #ef6c00',
        '低': 'background-color: #e8f5e8; color: #2e7d32',
    }

    def highlight_severity(val):
        return severity_css.get(val, '')

    styled_table = table.style.map(highlight_severity, subset=['严重程度'])
    styled_table = styled_table.format({'序号': '{:d}'})

    st.dataframe(styled_table, width='stretch', height=400, hide_index=True)
def _display_difference_details(comparison_result: dict):
    """Let the user pick one difference and inspect it in full.

    Args:
        comparison_result: Comparison result dict; ``differences`` is read.
    """
    st.subheader("🔍 详细差异查看")

    differences = comparison_result['differences']

    def describe_option(idx):
        # Label shown in the select box for the difference at `idx`.
        entry = differences[idx]
        return f"差异 {idx+1}: {entry['position']} - {entry['type']}"

    chosen = st.selectbox(
        "选择要查看的差异:",
        options=range(len(differences)),
        format_func=describe_option,
        key="selected_diff"
    )

    if chosen is None:
        return

    diff = differences[chosen]

    # (heading, hidden widget label, value key, widget key prefix)
    panes = (
        ("**原OCR结果:**", "原OCR结果详情", 'file1_value', "original"),
        ("**验证结果:**", "验证结果详情", 'file2_value', "verify"),
    )
    for column, (heading, label, value_key, key_prefix) in zip(st.columns(2), panes):
        with column:
            st.write(heading)
            st.text_area(
                label,
                value=diff[value_key],
                height=200,
                key=f"{key_prefix}_{chosen}",
                label_visibility="collapsed",
            )

    st.info(f"**位置:** {diff['position']}")
    st.info(f"**类型:** {diff['type']}")
    st.info(f"**描述:** {diff['description']}")
    st.info(f"**严重程度:** {_get_severity_level(diff)}")
def _display_difference_charts(comparison_result: dict):
    """Plot how the differences spread over types and severity levels.

    A pie chart of difference types is drawn when at least one difference
    exists; the severity bar chart is always drawn.

    Args:
        comparison_result: Comparison result dict; ``differences`` is read.
    """
    st.subheader("📈 差异类型分布")

    # All three severity buckets are pre-seeded so each appears in the bar
    # chart even with a zero count.
    severity_counts = dict.fromkeys(('高', '中', '低'), 0)
    type_counts = {}

    for diff in comparison_result['differences']:
        kind = diff['type']
        type_counts.setdefault(kind, 0)
        type_counts[kind] += 1

        severity_counts[_get_severity_level(diff)] += 1

    pie_col, bar_col = st.columns(2)

    with pie_col:
        if type_counts:
            type_names = list(type_counts)
            type_totals = list(type_counts.values())
            fig_type = px.pie(
                values=type_totals,
                names=type_names,
                title="差异类型分布"
            )
            st.plotly_chart(fig_type, width='stretch')

    with bar_col:
        fig_severity = px.bar(
            x=list(severity_counts),
            y=list(severity_counts.values()),
            title="差异严重程度分布",
            color=list(severity_counts),
            color_discrete_map={'高': '#f44336', '中': '#ff9800', '低': '#4caf50'}
        )
        st.plotly_chart(fig_severity, width='stretch')
def _provide_download_options(comparison_result: dict):
    """Offer the comparison as Excel, CSV and JSON downloads.

    The Excel button (untruncated difference rows) appears only when there
    is at least one difference. The CSV statistics and the full JSON report
    are always offered.

    Args:
        comparison_result: Comparison result dict; ``differences`` and
            ``statistics`` are read, and the whole dict is serialized for
            the JSON export.
    """
    st.subheader("📥 导出验证结果")

    def stamped_name(prefix, extension):
        # File name carrying the time at which the button is built.
        return f"{prefix}_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.{extension}"

    excel_col, csv_col, json_col = st.columns(3)

    with excel_col:
        differences = comparison_result['differences']
        if differences:
            export_rows = [
                {
                    '序号': seq,
                    '位置': diff['position'],
                    '类型': diff['type'],
                    '原OCR结果': diff['file1_value'],
                    '验证结果': diff['file2_value'],
                    '描述': diff['description'],
                    '严重程度': _get_severity_level(diff),
                }
                for seq, diff in enumerate(differences, 1)
            ]

            # The workbook is written into memory and handed to the button.
            workbook = BytesIO()
            pd.DataFrame(export_rows).to_excel(workbook, index=False, sheet_name='差异详情')

            st.download_button(
                label="📊 下载差异详情(Excel)",
                data=workbook.getvalue(),
                file_name=stamped_name("comparison_differences", "xlsx"),
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                key="download_differences_excel"
            )

    with csv_col:
        statistics = comparison_result['statistics']
        summary = pd.DataFrame({
            '统计项目': ['总差异数', '表格差异', '金额差异', '段落差异'],
            '数量': [
                statistics['total_differences'],
                statistics['table_differences'],
                statistics.get('amount_differences', 0),
                statistics['paragraph_differences'],
            ],
        })

        st.download_button(
            label="📈 下载统计报告(CSV)",
            data=summary.to_csv(index=False),
            file_name=stamped_name("comparison_stats", "csv"),
            mime="text/csv",
            key="download_stats_csv"
        )

    with json_col:
        st.download_button(
            label="📄 下载完整报告(JSON)",
            data=json.dumps(comparison_result, ensure_ascii=False, indent=2),
            file_name=stamped_name("comparison_full", "json"),
            mime="application/json",
            key="download_full_json"
        )
- def _get_severity_level(diff: dict) -> str:
- """判断严重程度
-
- Args:
- diff: 差异字典
-
- Returns:
- 严重程度: '高', '中', '低'
- """
- if 'severity' in diff:
- severity_map = {'critical': '高', 'high': '高', 'medium': '中', 'low': '低'}
- return severity_map.get(diff['severity'], '中')
-
- diff_type = diff['type'].lower()
-
- # 金额和数字类差异为高严重度
- if 'amount' in diff_type or 'number' in diff_type:
- return '高'
-
- # 表格和结构类差异为中严重度
- if 'table' in diff_type or 'structure' in diff_type:
- return '中'
-
- # 根据相似度判断
- if 'similarity' in diff:
- similarity = diff['similarity']
- if similarity < 50:
- return '高'
- elif similarity < 85:
- return '中'
- else:
- return '低'
-
- # 根据长度差异判断
- len_diff = abs(len(diff['file1_value']) - len(diff['file2_value']))
- if len_diff > 50:
- return '高'
- elif len_diff > 10:
- return '中'
- else:
- return '低'
|