#!/usr/bin/env python3 """ 基于Streamlit的OCR可视化校验工具(主入口) """ import streamlit as st from pathlib import Path import json from streamlit_validator_core import StreamlitOCRValidator from streamlit_validator_ui import ( setup_page_config, create_data_source_selector, message_box ) from streamlit_validator_table import display_html_table_as_dataframe from streamlit_validator_cross import ( cross_validation_dialog, show_batch_cross_validation_results_dialog ) from streamlit_validator_result import display_single_page_cross_validation from ocr_validator_utils import get_data_source_display_name from config_manager import load_config # 🎯 使用新配置管理器 def reset_cross_validation_results(): """重置交叉验证结果""" if 'cross_validation_batch_result' in st.session_state: st.session_state.cross_validation_batch_result = None print("🔄 数据源已变更,交叉验证结果已清空") def main(): """主应用""" # 🎯 初始化配置管理器 if 'config_manager' not in st.session_state: try: st.session_state.config_manager = load_config(config_dir="config") # 🎯 生成 OCRValidator 所需的配置 st.session_state.validator_config = st.session_state.config_manager.to_validator_config() print("✅ 配置管理器初始化成功") print(f"📄 发现 {len(st.session_state.config_manager.list_documents())} 个文档配置") print(f"🔧 发现 {len(st.session_state.config_manager.list_ocr_tools())} 个 OCR 工具") except Exception as e: st.error(f"❌ 配置加载失败: {e}") st.stop() config_manager = st.session_state.config_manager validator_config = st.session_state.validator_config # 初始化应用 if 'validator' not in st.session_state: # 🎯 直接传递配置字典给 OCRValidator validator = StreamlitOCRValidator(config_dict=validator_config) st.session_state.validator = validator setup_page_config(validator_config) # 页面标题 st.title(validator_config['ui']['page_title']) # 初始化数据源追踪 st.session_state.current_ocr_source = validator.current_source_key st.session_state.current_verify_source = validator.verify_source_key else: validator = st.session_state.validator if 'selected_text' not in st.session_state: st.session_state.selected_text = None st.session_state.compact_search_query = None if 'marked_errors' not in st.session_state: st.session_state.marked_errors = set() # 数据源选择器 create_data_source_selector(validator) # ✅ 检测数据源是否变更 ocr_source_changed = False verify_source_changed = False if 'current_ocr_source' in st.session_state: if st.session_state.current_ocr_source != validator.current_source_key: ocr_source_changed = True st.session_state.current_ocr_source = validator.current_source_key print(f"🔄 OCR数据源已切换到: {validator.current_source_key}") if 'current_verify_source' in st.session_state: if st.session_state.current_verify_source != validator.verify_source_key: verify_source_changed = True st.session_state.current_verify_source = validator.verify_source_key print(f"🔄 验证数据源已切换到: {validator.verify_source_key}") # ✅ 如果任一数据源变更,清空交叉验证结果 if ocr_source_changed or verify_source_changed: reset_cross_validation_results() # 显示提示信息 if ocr_source_changed and verify_source_changed: st.info("ℹ️ OCR数据源和验证数据源已变更,请重新运行交叉验证") elif ocr_source_changed: st.info("ℹ️ OCR数据源已变更,请重新运行交叉验证") elif verify_source_changed: st.info("ℹ️ 验证数据源已变更,请重新运行交叉验证") # 如果没有可用的数据源,提前返回 if not validator.all_sources: st.warning("⚠️ 未找到任何数据源,请检查配置文件") # 🎯 显示配置信息帮助调试 with st.expander("🔍 配置信息", expanded=True): st.write("**已加载的文档:**") docs = config_manager.list_documents() if docs: for doc in docs: doc_config = config_manager.get_document(doc) st.write(f"- **{doc}**") st.write(f" - 基础目录: `{doc_config.base_dir}`") st.write(f" - OCR 结果: {len([r for r in doc_config.ocr_results if r.enabled])} 个已启用") else: st.write("无") st.write("**已加载的 OCR 工具:**") tools = config_manager.list_ocr_tools() if tools: for tool in tools: tool_config = config_manager.get_ocr_tool(tool) st.write(f"- **{tool_config.name}** (`{tool}`)") else: st.write("无") st.write("**配置文件路径:**") st.code(str(config_manager.config_dir / "global.yaml")) st.write("**生成的数据源:**") data_sources = config_manager.get_data_sources() if data_sources: for ds in data_sources: st.write(f"- `{ds.name}`") st.write(f" - 工具: {ds.ocr_tool}") st.write(f" - 结果目录: {ds.ocr_out_dir}") st.write(f" - 图片目录: {ds.src_img_dir}") else: st.write("无") st.stop() # 文件选择区域 with st.container(height=75, horizontal=True, horizontal_alignment='left', gap="medium"): if 'selected_file_index' not in st.session_state: st.session_state.selected_file_index = 0 if validator.display_options: selected_index = st.selectbox( "选择OCR结果文件", range(len(validator.display_options)), format_func=lambda i: validator.display_options[i], index=st.session_state.selected_file_index, key="selected_selectbox", label_visibility="collapsed" ) if selected_index != st.session_state.selected_file_index: st.session_state.selected_file_index = selected_index selected_file = validator.file_paths[selected_index] current_page = validator.file_info[selected_index]['page'] page_input = st.number_input( "输入页码", placeholder="输入页码", label_visibility="collapsed", min_value=1, max_value=len(validator.display_options), value=current_page, step=1, key="page_input" ) if page_input != current_page: for i, info in enumerate(validator.file_info): if info['page'] == page_input: st.session_state.selected_file_index = i selected_file = validator.file_paths[i] st.rerun() break if (st.session_state.selected_file_index >= 0 and validator.selected_file_index != st.session_state.selected_file_index and selected_file): validator.selected_file_index = st.session_state.selected_file_index st.session_state.validator.load_ocr_data(selected_file) current_source_name = get_data_source_display_name(validator.current_source_config) st.success(f"✅ 已加载 {current_source_name} - 第{validator.file_info[st.session_state.selected_file_index]['page']}页") st.rerun() else: st.warning("当前数据源中未找到OCR结果文件") # ✅ 交叉验证按钮 - 添加数据源检查 cross_validation_enabled = ( validator.current_source_key != validator.verify_source_key and validator.image_path and validator.md_content ) if st.button( "交叉验证", type="primary", icon=":material/compare_arrows:", disabled=not cross_validation_enabled, help="需要选择不同的OCR数据源和验证数据源" if not cross_validation_enabled else "开始批量交叉验证" ): cross_validation_dialog(validator) # ✅ 查看验证结果按钮 - 检查是否有验证结果 has_validation_results = ( 'cross_validation_batch_result' in st.session_state and st.session_state.cross_validation_batch_result is not None ) if st.button( "查看验证结果", type="secondary", icon=":material/quick_reference_all:", disabled=not has_validation_results, help="暂无验证结果,请先运行交叉验证" if not has_validation_results else "查看批量验证结果" ): show_batch_cross_validation_results_dialog() # 显示当前数据源统计信息 with st.expander("OCR工具统计信息", expanded=False): stats = validator.get_statistics() col1, col2, col3, col4, col5 = st.columns(5) with col1: st.metric("📊 总文本块", stats['total_texts']) with col2: st.metric("🔗 可点击文本", stats['clickable_texts']) with col3: st.metric("❌ 标记错误", stats['marked_errors']) with col4: st.metric("✅ 准确率", f"{stats['accuracy_rate']:.1f}%") with col5: if validator.current_source_config: tool_id = validator.current_source_config['ocr_tool'] # 🎯 从配置管理器获取工具名称 tool_config = config_manager.get_ocr_tool(tool_id) tool_display = tool_config.name if tool_config else tool_id.upper() st.metric("🔧 OCR工具", tool_display) if stats['tool_info']: st.write("**详细信息:**", stats['tool_info']) # 🎯 显示当前文档和 OCR 结果信息 if validator.current_source_config: source_name = validator.current_source_config['name'] # 解析数据源名称,提取文档名(更精确的解析) parts = source_name.split('_', 1) doc_name = parts[0] if parts else source_name doc_config = config_manager.get_document(doc_name) if doc_config: st.write("**文档信息:**") st.write(f"- 文档名称: {doc_config.name}") st.write(f"- 基础目录: {doc_config.base_dir}") st.write(f"- 可用 OCR 工具: {len([r for r in doc_config.ocr_results if r.enabled])} 个") # 🎯 添加配置管理面板 with st.expander("⚙️ 配置管理", expanded=False): col1, col2 = st.columns(2) with col1: st.subheader("📄 已加载文档") docs = config_manager.list_documents() for doc_name in docs: doc_config = config_manager.get_document(doc_name) enabled_count = len([r for r in doc_config.ocr_results if r.enabled]) total_count = len(doc_config.ocr_results) with st.container(): st.write(f"✅ **{doc_name}**") st.caption(f"📊 {enabled_count}/{total_count} 工具已启用") # 显示每个 OCR 工具的状态 for ocr_result in doc_config.ocr_results: status_icon = "🟢" if ocr_result.enabled else "⚪" tool_config = config_manager.get_ocr_tool(ocr_result.tool) tool_name = tool_config.name if tool_config else ocr_result.tool st.caption(f" {status_icon} {tool_name} - {ocr_result.description or ocr_result.result_dir}") with col2: st.subheader("🔧 已加载 OCR 工具") tools = config_manager.list_ocr_tools() for tool_id in tools: tool_config = config_manager.get_ocr_tool(tool_id) with st.container(): st.write(f"🔧 **{tool_config.name}**") st.caption(f"ID: `{tool_id}`") st.caption(f"描述: {tool_config.description}") tab1, tab2, tab3 = st.tabs(["📄 内容人工检查", "🔍 交叉验证结果", "📊 表格分析"]) with tab1: validator.create_compact_layout(validator_config) with tab2: # ✅ 使用封装的函数显示单页交叉验证结果 display_single_page_cross_validation(validator, validator_config) with tab3: st.header("📊 表格数据分析") if validator.md_content and '