""" 核心验证器类 """ import streamlit as st from pathlib import Path from typing import Dict, List, Optional import json from ocr_validator_utils import ( load_ocr_data_file, process_ocr_data, get_ocr_statistics, find_available_ocr_files_multi_source, get_data_source_display_name ) from ocr_validator_layout import OCRLayoutManager class StreamlitOCRValidator: """核心验证器类""" def __init__(self, config_dict: Dict = None): # 🎯 参数名改为 config_dict """ 初始化验证器 Args: config_dict: 配置字典(从 ConfigManager.to_validator_config() 生成) """ self.config = config_dict # 🎯 直接赋值 self.ocr_data = [] self.md_content = "" self.image_path = "" self.text_bbox_mapping = {} self.selected_text = None self.marked_errors = set() # 多数据源相关 self.all_sources = {} self.current_source_key = None self.current_source_config = None self.file_info = [] self.selected_file_index = -1 self.display_options = [] self.file_paths = [] # 交叉验证数据源 self.verify_source_key = None self.verify_source_config = None self.verify_file_info = [] self.verify_display_options = [] self.verify_file_paths = [] # 初始化布局管理器 self.layout_manager = OCRLayoutManager(self) # 加载多数据源文件信息 self.load_multi_source_info() def load_multi_source_info(self): """加载多数据源文件信息""" self.all_sources = find_available_ocr_files_multi_source(self.config) if self.all_sources: source_keys = list(self.all_sources.keys()) first_source_key = source_keys[0] self.switch_to_source(first_source_key) if len(source_keys) > 1: self.switch_to_verify_source(source_keys[1]) def switch_to_source(self, source_key: str): """切换到指定OCR数据源""" if source_key in self.all_sources: self.current_source_key = source_key source_data = self.all_sources[source_key] self.current_source_config = source_data['config'] self.file_info = source_data['files'] if self.file_info: self.display_options = [f"{info['display_name']}" for info in self.file_info] self.file_paths = [info['path'] for info in self.file_info] self.selected_file_index = -1 print(f"✅ 切换到OCR数据源: {source_key}") else: print(f"⚠️ 数据源 {source_key} 没有可用文件") def switch_to_verify_source(self, source_key: str): """切换到指定验证数据源""" if source_key in self.all_sources: self.verify_source_key = source_key source_data = self.all_sources[source_key] self.verify_source_config = source_data['config'] self.verify_file_info = source_data['files'] if self.verify_file_info: self.verify_display_options = [f"{info['display_name']}" for info in self.verify_file_info] self.verify_file_paths = [info['path'] for info in self.verify_file_info] print(f"✅ 切换到验证数据源: {source_key}") else: print(f"⚠️ 验证数据源 {source_key} 没有可用文件") def load_ocr_data(self, json_path: str, md_path: Optional[str] = None, image_path: Optional[str] = None): """加载OCR相关数据""" try: if self.current_source_config: temp_config = self.config.copy() temp_config['paths'] = { 'ocr_out_dir': self.current_source_config['ocr_out_dir'], 'src_img_dir': self.current_source_config.get('src_img_dir', ''), 'pre_validation_dir': self.config['pre_validation']['out_dir'] } temp_config['current_ocr_tool'] = self.current_source_config['ocr_tool'] self.ocr_data, self.md_content, self.image_path = load_ocr_data_file(json_path, temp_config) else: self.ocr_data, self.md_content, self.image_path = load_ocr_data_file(json_path, self.config) self.process_data() except Exception as e: st.error(f"❌ 加载失败: {e}") st.exception(e) def process_data(self): """处理OCR数据""" self.text_bbox_mapping = process_ocr_data(self.ocr_data, self.config) def get_statistics(self) -> Dict: """获取统计信息""" return get_ocr_statistics(self.ocr_data, self.text_bbox_mapping, self.marked_errors) def find_verify_md_path(self, selected_file_index: int) -> Optional[Path]: """查找当前OCR文件对应的验证文件路径""" current_page = self.file_info[selected_file_index]['page'] verify_md_path = None for i, info in enumerate(self.verify_file_info): if info['page'] == current_page: verify_md_path = Path(self.verify_file_paths[i]).with_suffix('.md') break return verify_md_path def create_compact_layout(self, config): """创建紧凑布局""" return self.layout_manager.create_compact_layout(config)