| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213 |
- #!/usr/bin/env python3
- """
- 基于Streamlit的OCR可视化校验工具(主入口)
- """
- import streamlit as st
- from pathlib import Path
- import json
- from streamlit_validator_core import StreamlitOCRValidator
- from streamlit_validator_ui import (
- setup_page_config, create_data_source_selector, message_box
- )
- from streamlit_validator_table import display_html_table_as_dataframe
- from streamlit_validator_cross import (
- cross_validation_dialog, show_batch_cross_validation_results_dialog
- )
- from streamlit_validator_result import display_single_page_cross_validation
- from ocr_validator_utils import get_data_source_display_name
def reset_cross_validation_results():
    """Clear the stored batch cross-validation result.

    Does nothing when no ``cross_validation_batch_result`` entry exists in
    the Streamlit session state; otherwise the entry is set to ``None`` and
    a notice is printed to stdout.
    """
    session = st.session_state
    if 'cross_validation_batch_result' not in session:
        return

    session.cross_validation_batch_result = None
    print("🔄 数据源已变更,交叉验证结果已清空")
def _get_validator():
    """Return ``(validator, config)`` for this session, creating them once.

    On the first run of a session a ``StreamlitOCRValidator`` is built and
    stored in ``st.session_state``, the page is configured, the title is
    rendered, and the data-source tracking keys are seeded. Later runs
    reuse the stored validator.
    """
    if 'validator' not in st.session_state:
        validator = StreamlitOCRValidator()
        st.session_state.validator = validator
        setup_page_config(validator.config)

        # NOTE(review): the title is only rendered on the first script run
        # of a session -- confirm this is intended.
        config = st.session_state.validator.config
        st.title(config['ui']['page_title'])

        # Seed the tracking keys so later runs can detect a source switch.
        st.session_state.current_ocr_source = validator.current_source_key
        st.session_state.current_verify_source = validator.verify_source_key
        return validator, config

    validator = st.session_state.validator
    return validator, validator.config


def _init_selection_state():
    """Create the text-selection and error-marking session entries if absent."""
    if 'selected_text' not in st.session_state:
        st.session_state.selected_text = None
        st.session_state.compact_search_query = None
    if 'marked_errors' not in st.session_state:
        st.session_state.marked_errors = set()


def _track_source_change(state_key, current_key, label):
    """Record ``current_key`` under ``state_key``; return True if it changed.

    Returns False when ``state_key`` is not tracked yet or the stored value
    already equals ``current_key``. ``label`` is the source name used in the
    console message.
    """
    if state_key not in st.session_state:
        return False
    if st.session_state[state_key] != current_key:
        st.session_state[state_key] = current_key
        print(f"🔄 {label}已切换到: {current_key}")
        return True
    return False


def _handle_source_changes(validator):
    """Drop stale cross-validation results when either data source changed."""
    ocr_source_changed = _track_source_change(
        'current_ocr_source', validator.current_source_key, "OCR数据源"
    )
    verify_source_changed = _track_source_change(
        'current_verify_source', validator.verify_source_key, "验证数据源"
    )

    if not (ocr_source_changed or verify_source_changed):
        return

    reset_cross_validation_results()

    # Tell the user which source changed and that results must be recomputed.
    if ocr_source_changed and verify_source_changed:
        st.info("ℹ️ OCR数据源和验证数据源已变更,请重新运行交叉验证")
    elif ocr_source_changed:
        st.info("ℹ️ OCR数据源已变更,请重新运行交叉验证")
    else:
        st.info("ℹ️ 验证数据源已变更,请重新运行交叉验证")


def _render_file_picker(validator):
    """Render the file selectbox and page input, loading the chosen file."""
    selected_index = st.selectbox(
        "选择OCR结果文件",
        range(len(validator.display_options)),
        format_func=lambda i: validator.display_options[i],
        index=st.session_state.selected_file_index,
        key="selected_selectbox",
        label_visibility="collapsed",
    )

    if selected_index != st.session_state.selected_file_index:
        st.session_state.selected_file_index = selected_index
    selected_file = validator.file_paths[selected_index]
    current_page = validator.file_info[selected_index]['page']

    # The upper bound must cover the largest real page number: Streamlit
    # raises when ``value`` exceeds ``max_value``, which the file count alone
    # cannot rule out if page numbers are not a contiguous 1..N run.
    highest_page = max(info['page'] for info in validator.file_info)
    page_input = st.number_input(
        "输入页码",
        placeholder="输入页码",
        label_visibility="collapsed",
        min_value=1,
        max_value=max(len(validator.display_options), highest_page),
        value=current_page,
        step=1,
        key="page_input",
    )

    if page_input != current_page:
        for i, info in enumerate(validator.file_info):
            if info['page'] == page_input:
                st.session_state.selected_file_index = i
                # st.rerun() halts this run, so nothing after it executes.
                st.rerun()

    # Load the file when the session selection differs from what the
    # validator currently holds.
    if (st.session_state.selected_file_index >= 0
            and validator.selected_file_index != st.session_state.selected_file_index
            and selected_file):
        validator.selected_file_index = st.session_state.selected_file_index
        st.session_state.validator.load_ocr_data(selected_file)

        current_source_name = get_data_source_display_name(validator.current_source_config)
        st.success(f"✅ 已加载 {current_source_name} - 第{validator.file_info[st.session_state.selected_file_index]['page']}页")
        st.rerun()


def _render_cross_validation_buttons(validator):
    """Render the "run cross-validation" and "view results" buttons."""
    # Cross-validation needs two different sources plus loaded page content.
    cross_validation_enabled = (
        validator.current_source_key != validator.verify_source_key
        and validator.image_path
        and validator.md_content
    )

    if st.button(
        "交叉验证",
        type="primary",
        icon=":material/compare_arrows:",
        disabled=not cross_validation_enabled,
        help="需要选择不同的OCR数据源和验证数据源" if not cross_validation_enabled else "开始批量交叉验证",
    ):
        cross_validation_dialog(validator)

    # The results button is only enabled once a batch result has been stored.
    has_validation_results = (
        'cross_validation_batch_result' in st.session_state
        and st.session_state.cross_validation_batch_result is not None
    )

    if st.button(
        "查看验证结果",
        type="secondary",
        icon=":material/quick_reference_all:",
        disabled=not has_validation_results,
        help="暂无验证结果,请先运行交叉验证" if not has_validation_results else "查看批量验证结果",
    ):
        show_batch_cross_validation_results_dialog()


def _render_statistics(validator):
    """Render the collapsible statistics panel for the current data source."""
    with st.expander("🔧 OCR工具统计信息", expanded=False):
        stats = validator.get_statistics()
        col1, col2, col3, col4, col5 = st.columns(5)

        with col1:
            st.metric("📊 总文本块", stats['total_texts'])
        with col2:
            st.metric("🔗 可点击文本", stats['clickable_texts'])
        with col3:
            st.metric("❌ 标记错误", stats['marked_errors'])
        with col4:
            st.metric("✅ 准确率", f"{stats['accuracy_rate']:.1f}%")
        with col5:
            if validator.current_source_config:
                tool_display = validator.current_source_config['ocr_tool'].upper()
                st.metric("🔧 OCR工具", tool_display)

        if stats['tool_info']:
            st.write("**详细信息:**", stats['tool_info'])


def _render_table_tab(validator):
    """Render the table-analysis tab from the loaded markdown content."""
    st.header("📊 表格数据分析")

    if validator.md_content and '<table' in validator.md_content.lower():
        st.subheader("🔍 表格数据预览")
        display_html_table_as_dataframe(validator.md_content)
    else:
        st.info("当前OCR结果中没有检测到表格数据")


def main():
    """Run the Streamlit OCR validation app.

    Sets up the session validator and selection state, renders the data
    source selector, clears cross-validation results when a source changed,
    then renders the file toolbar, the statistics panel and the three
    content tabs. The script stops early when no data source is available.
    """
    validator, config = _get_validator()
    _init_selection_state()

    # Data source selector; it may change the validator's source keys.
    create_data_source_selector(validator)
    _handle_source_changes(validator)

    # Nothing else can be rendered without at least one data source.
    if not validator.all_sources:
        st.stop()

    # File selection row: file picker, page input and the action buttons.
    with st.container(height=75, horizontal=True, horizontal_alignment='left', gap="medium"):
        if 'selected_file_index' not in st.session_state:
            st.session_state.selected_file_index = 0

        if validator.display_options:
            _render_file_picker(validator)
        else:
            st.warning("当前数据源中未找到OCR结果文件")

        _render_cross_validation_buttons(validator)

    _render_statistics(validator)

    tab1, tab2, tab3 = st.tabs(["📄 内容人工检查", "🔍 交叉验证结果", "📊 表格分析"])

    with tab1:
        validator.create_compact_layout(config)
    with tab2:
        # Single-page cross-validation view is delegated to the result module.
        display_single_page_cross_validation(validator, config)
    with tab3:
        _render_table_tab(validator)
# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|