# streamlit_validator_result.py
  1. """
  2. 验证结果展示模块
  3. """
  4. import streamlit as st
  5. import pandas as pd
  6. import plotly.express as px
  7. from io import BytesIO
  8. import json
  9. from pathlib import Path
  10. from ocr_validator_utils import process_all_images_in_content
  11. def display_single_page_cross_validation(validator, config):
  12. """显示单页交叉验证结果
  13. Args:
  14. validator: OCR验证器实例
  15. config: 配置字典
  16. """
  17. current_md_path = Path(validator.file_paths[validator.selected_file_index]).with_suffix('.md')
  18. pre_validation_dir = Path(validator.config['pre_validation'].get('out_dir', './output/pre_validation/')).resolve()
  19. comparison_result_path = pre_validation_dir / f"{current_md_path.stem}_cross_validation.json"
  20. verify_md_path = validator.find_verify_md_path(validator.selected_file_index)
  21. # 检查验证结果是否与当前数据源匹配
  22. result_is_valid = False
  23. comparison_result = None
  24. if comparison_result_path.exists():
  25. try:
  26. with open(comparison_result_path, "r", encoding="utf-8") as f:
  27. comparison_result = json.load(f)
  28. # 检查文件路径是否匹配(验证结果是否为当前页面生成)
  29. if (comparison_result.get('file1_path') == str(current_md_path) and
  30. comparison_result.get('file2_path') == str(verify_md_path)):
  31. result_is_valid = True
  32. except Exception as e:
  33. st.error(f"读取验证结果失败: {e}")
  34. if result_is_valid:
  35. _display_valid_cross_validation_result(
  36. validator, config, current_md_path, verify_md_path, comparison_result
  37. )
  38. else:
  39. _display_no_validation_result_prompt(validator)
  40. def _display_valid_cross_validation_result(validator, config, current_md_path, verify_md_path, comparison_result):
  41. """显示有效的交叉验证结果
  42. Args:
  43. validator: OCR验证器实例
  44. config: 配置字典
  45. current_md_path: 当前OCR文件路径
  46. verify_md_path: 验证文件路径
  47. comparison_result: 对比结果字典
  48. """
  49. col1, col2 = st.columns([1, 1])
  50. # 左侧:原OCR识别结果
  51. with col1:
  52. st.subheader("🤖 原OCR识别结果")
  53. if current_md_path.exists():
  54. with open(current_md_path, "r", encoding="utf-8") as f:
  55. original_md_content = f.read()
  56. original_md_content = process_all_images_in_content(original_md_content, current_md_path)
  57. font_size = config['styles'].get('font_size', 10)
  58. height = config['styles']['layout'].get('default_height', 800)
  59. validator.layout_manager.render_content_by_mode(
  60. original_md_content, "HTML渲染", font_size, height, "compact"
  61. )
  62. else:
  63. st.error("原OCR文件不存在")
  64. # 右侧:验证识别结果
  65. with col2:
  66. st.subheader("🤖 验证识别结果")
  67. if verify_md_path and verify_md_path.exists():
  68. with open(str(verify_md_path), "r", encoding="utf-8") as f:
  69. verify_md_content = f.read()
  70. verify_md_content = process_all_images_in_content(verify_md_content, verify_md_path)
  71. font_size = config['styles'].get('font_size', 10)
  72. height = config['styles']['layout'].get('default_height', 800)
  73. validator.layout_manager.render_content_by_mode(
  74. verify_md_content, "HTML渲染", font_size, height, "compact"
  75. )
  76. else:
  77. st.warning("验证文件不存在")
  78. st.markdown("---")
  79. # 显示详细的对比结果
  80. display_comparison_results(comparison_result, detailed=True)
  81. def _display_no_validation_result_prompt(validator):
  82. """显示无验证结果的提示信息
  83. Args:
  84. validator: OCR验证器实例
  85. """
  86. st.info("💡 暂无当前页面的交叉验证结果,请点击上方「交叉验证」按钮运行验证")
  87. # 显示当前数据源信息
  88. col1, col2 = st.columns(2)
  89. with col1:
  90. st.write("**当前OCR数据源:**")
  91. from ocr_validator_utils import get_data_source_display_name
  92. if validator.current_source_config and validator.file_info:
  93. current_source_name = get_data_source_display_name(validator.current_source_config)
  94. current_page = validator.file_info[validator.selected_file_index]['page']
  95. st.code(f"{current_source_name}\n第 {current_page} 页")
  96. else:
  97. st.warning("未选择OCR数据源")
  98. with col2:
  99. st.write("**当前验证数据源:**")
  100. if validator.verify_source_config:
  101. from ocr_validator_utils import get_data_source_display_name
  102. verify_source_name = get_data_source_display_name(validator.verify_source_config)
  103. st.code(verify_source_name)
  104. else:
  105. st.warning("未选择验证数据源")
  106. # 添加操作提示
  107. st.markdown("---")
  108. st.markdown("""
  109. ### 📝 操作步骤:
  110. 1. **选择数据源**: 在页面顶部选择不同的OCR数据源和验证数据源
  111. 2. **运行验证**: 点击「交叉验证」按钮开始批量验证
  112. 3. **查看结果**: 验证完成后,在此处查看详细对比结果
  113. 💡 **提示**:
  114. - 确保两个数据源包含相同页码的文件
  115. - 建议选择不同OCR工具的结果进行交叉验证
  116. - 验证结果会自动保存,可随时查看
  117. """)
  118. def display_comparison_results(comparison_result: dict, detailed: bool = True):
  119. """显示对比结果
  120. Args:
  121. comparison_result: 对比结果字典
  122. detailed: 是否显示详细信息
  123. """
  124. st.header("📊 交叉验证结果")
  125. stats = comparison_result['statistics']
  126. # 显示主要指标
  127. col1, col2, col3, col4 = st.columns(4)
  128. with col1:
  129. st.metric("总差异数", stats['total_differences'])
  130. with col2:
  131. st.metric("表格差异", stats['table_differences'])
  132. with col3:
  133. st.metric("金额差异", stats.get('amount_differences', 0))
  134. with col4:
  135. st.metric("段落差异", stats['paragraph_differences'])
  136. # 根据差异数量显示不同的提示
  137. if stats['total_differences'] == 0:
  138. st.success("🎉 完美匹配!两个数据源结果完全一致")
  139. else:
  140. st.warning(f"⚠️ 发现 {stats['total_differences']} 个差异,建议人工检查")
  141. if comparison_result['differences'] and detailed:
  142. _display_differences_dataframe(comparison_result)
  143. _display_difference_details(comparison_result)
  144. _display_difference_charts(comparison_result)
  145. _provide_download_options(comparison_result)
  146. def _display_differences_dataframe(comparison_result: dict):
  147. """显示差异DataFrame"""
  148. st.subheader("🔍 差异详情对比")
  149. diff_data = []
  150. for i, diff in enumerate(comparison_result['differences'], 1):
  151. diff_data.append({
  152. '序号': i,
  153. '位置': diff['position'],
  154. '类型': diff['type'],
  155. '原OCR结果': diff['file1_value'][:100] + ('...' if len(diff['file1_value']) > 100 else ''),
  156. '验证结果': diff['file2_value'][:100] + ('...' if len(diff['file2_value']) > 100 else ''),
  157. '描述': diff['description'][:80] + ('...' if len(diff['description']) > 80 else ''),
  158. '严重程度': _get_severity_level(diff)
  159. })
  160. df_differences = pd.DataFrame(diff_data)
  161. def highlight_severity(val):
  162. if val == '高':
  163. return 'background-color: #ffebee; color: #c62828'
  164. elif val == '中':
  165. return 'background-color: #fff3e0; color: #ef6c00'
  166. elif val == '低':
  167. return 'background-color: #e8f5e8; color: #2e7d32'
  168. return ''
  169. styled_df = df_differences.style.applymap(
  170. highlight_severity,
  171. subset=['严重程度']
  172. ).format({'序号': '{:d}'})
  173. st.dataframe(styled_df, width='stretch', height=400, hide_index=True)
  174. def _display_difference_details(comparison_result: dict):
  175. """显示详细差异"""
  176. st.subheader("🔍 详细差异查看")
  177. selected_diff_index = st.selectbox(
  178. "选择要查看的差异:",
  179. options=range(len(comparison_result['differences'])),
  180. format_func=lambda x: f"差异 {x+1}: {comparison_result['differences'][x]['position']} - {comparison_result['differences'][x]['type']}",
  181. key="selected_diff"
  182. )
  183. if selected_diff_index is not None:
  184. diff = comparison_result['differences'][selected_diff_index]
  185. col1, col2 = st.columns(2)
  186. with col1:
  187. st.write("**原OCR结果:**")
  188. st.text_area("原OCR结果详情", value=diff['file1_value'], height=200,
  189. key=f"original_{selected_diff_index}", label_visibility="collapsed")
  190. with col2:
  191. st.write("**验证结果:**")
  192. st.text_area("验证结果详情", value=diff['file2_value'], height=200,
  193. key=f"verify_{selected_diff_index}", label_visibility="collapsed")
  194. st.info(f"**位置:** {diff['position']}")
  195. st.info(f"**类型:** {diff['type']}")
  196. st.info(f"**描述:** {diff['description']}")
  197. st.info(f"**严重程度:** {_get_severity_level(diff)}")
  198. def _display_difference_charts(comparison_result: dict):
  199. """显示差异统计图表"""
  200. st.subheader("📈 差异类型分布")
  201. type_counts = {}
  202. severity_counts = {'高': 0, '中': 0, '低': 0}
  203. for diff in comparison_result['differences']:
  204. diff_type = diff['type']
  205. type_counts[diff_type] = type_counts.get(diff_type, 0) + 1
  206. severity = _get_severity_level(diff)
  207. severity_counts[severity] += 1
  208. col1, col2 = st.columns(2)
  209. with col1:
  210. if type_counts:
  211. fig_type = px.pie(
  212. values=list(type_counts.values()),
  213. names=list(type_counts.keys()),
  214. title="差异类型分布"
  215. )
  216. st.plotly_chart(fig_type, width='stretch')
  217. with col2:
  218. fig_severity = px.bar(
  219. x=list(severity_counts.keys()),
  220. y=list(severity_counts.values()),
  221. title="差异严重程度分布",
  222. color=list(severity_counts.keys()),
  223. color_discrete_map={'高': '#f44336', '中': '#ff9800', '低': '#4caf50'}
  224. )
  225. st.plotly_chart(fig_severity, width='stretch')
  226. def _provide_download_options(comparison_result: dict):
  227. """提供下载选项"""
  228. st.subheader("📥 导出验证结果")
  229. col1, col2, col3 = st.columns(3)
  230. with col1:
  231. if comparison_result['differences']:
  232. diff_data = []
  233. for i, diff in enumerate(comparison_result['differences'], 1):
  234. diff_data.append({
  235. '序号': i,
  236. '位置': diff['position'],
  237. '类型': diff['type'],
  238. '原OCR结果': diff['file1_value'],
  239. '验证结果': diff['file2_value'],
  240. '描述': diff['description'],
  241. '严重程度': _get_severity_level(diff)
  242. })
  243. df_export = pd.DataFrame(diff_data)
  244. excel_buffer = BytesIO()
  245. df_export.to_excel(excel_buffer, index=False, sheet_name='差异详情')
  246. st.download_button(
  247. label="📊 下载差异详情(Excel)",
  248. data=excel_buffer.getvalue(),
  249. file_name=f"comparison_differences_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.xlsx",
  250. mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
  251. key="download_differences_excel"
  252. )
  253. with col2:
  254. stats_data = {
  255. '统计项目': ['总差异数', '表格差异', '金额差异', '段落差异'],
  256. '数量': [
  257. comparison_result['statistics']['total_differences'],
  258. comparison_result['statistics']['table_differences'],
  259. comparison_result['statistics'].get('amount_differences', 0),
  260. comparison_result['statistics']['paragraph_differences']
  261. ]
  262. }
  263. df_stats = pd.DataFrame(stats_data)
  264. csv_stats = df_stats.to_csv(index=False)
  265. st.download_button(
  266. label="📈 下载统计报告(CSV)",
  267. data=csv_stats,
  268. file_name=f"comparison_stats_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.csv",
  269. mime="text/csv",
  270. key="download_stats_csv"
  271. )
  272. with col3:
  273. report_json = json.dumps(comparison_result, ensure_ascii=False, indent=2)
  274. st.download_button(
  275. label="📄 下载完整报告(JSON)",
  276. data=report_json,
  277. file_name=f"comparison_full_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.json",
  278. mime="application/json",
  279. key="download_full_json"
  280. )
  281. def _get_severity_level(diff: dict) -> str:
  282. """判断严重程度
  283. Args:
  284. diff: 差异字典
  285. Returns:
  286. 严重程度: '高', '中', '低'
  287. """
  288. if 'severity' in diff:
  289. severity_map = {'critical': '高', 'high': '高', 'medium': '中', 'low': '低'}
  290. return severity_map.get(diff['severity'], '中')
  291. diff_type = diff['type'].lower()
  292. # 金额和数字类差异为高严重度
  293. if 'amount' in diff_type or 'number' in diff_type:
  294. return '高'
  295. # 表格和结构类差异为中严重度
  296. if 'table' in diff_type or 'structure' in diff_type:
  297. return '中'
  298. # 根据相似度判断
  299. if 'similarity' in diff:
  300. similarity = diff['similarity']
  301. if similarity < 50:
  302. return '高'
  303. elif similarity < 85:
  304. return '中'
  305. else:
  306. return '低'
  307. # 根据长度差异判断
  308. len_diff = abs(len(diff['file1_value']) - len(diff['file2_value']))
  309. if len_diff > 50:
  310. return '高'
  311. elif len_diff > 10:
  312. return '中'
  313. else:
  314. return '低'