| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143 |
- """
- 核心验证器类
- """
- import streamlit as st
- from pathlib import Path
- from typing import Dict, List, Optional
- import json
- from ocr_validator_utils import (
- load_ocr_data_file, process_ocr_data,
- get_ocr_statistics, find_available_ocr_files_multi_source,
- get_data_source_display_name
- )
- from ocr_validator_layout import OCRLayoutManager
class StreamlitOCRValidator:
    """Core validator class.

    Holds the currently loaded OCR result (data, markdown, image path), the
    text-to-bbox mapping derived from it, and the selection state for two
    data sources: the primary OCR source and an optional cross-validation
    ("verify") source.
    """

    def __init__(self, config_dict: Optional[Dict] = None):
        """Initialise the validator and discover the available data sources.

        Args:
            config_dict: Configuration dictionary (generated by
                ``ConfigManager.to_validator_config()``). It is stored as-is
                and handed to the helper functions of ``ocr_validator_utils``.
        """
        self.config = config_dict
        self.ocr_data = []
        self.md_content = ""
        self.image_path = ""
        self.text_bbox_mapping = {}
        self.selected_text = None
        self.marked_errors = set()

        # Multi-source state (primary OCR source).
        self.all_sources = {}
        self.current_source_key = None
        self.current_source_config = None
        self.file_info = []
        self.selected_file_index = -1  # -1 means "no file selected"
        self.display_options = []
        self.file_paths = []

        # Cross-validation data source.
        self.verify_source_key = None
        self.verify_source_config = None
        self.verify_file_info = []
        self.verify_display_options = []
        self.verify_file_paths = []

        # Layout manager receives this validator instance.
        self.layout_manager = OCRLayoutManager(self)

        # Load file information for all configured data sources.
        self.load_multi_source_info()

    @staticmethod
    def _names_and_paths(files):
        """Return ``(display_names, paths)`` lists built from file-info dicts.

        ``files`` may be empty or ``None``; both yield two empty lists.
        """
        entries = files or []
        names = [f"{info['display_name']}" for info in entries]
        paths = [info['path'] for info in entries]
        return names, paths

    def load_multi_source_info(self):
        """Discover data sources and pick defaults.

        The first discovered source becomes the OCR source; when more than
        one source exists, the second becomes the verify source.
        """
        self.all_sources = find_available_ocr_files_multi_source(self.config)
        if not self.all_sources:
            return

        source_keys = list(self.all_sources.keys())
        self.switch_to_source(source_keys[0])
        if len(source_keys) > 1:
            self.switch_to_verify_source(source_keys[1])

    def switch_to_source(self, source_key: str):
        """Switch the primary OCR data source.

        Unknown keys are ignored. On a successful switch the file selection
        is reset and the display/path lists are rebuilt from the new source,
        so they are empty (not stale) when the source has no files.
        """
        if source_key not in self.all_sources:
            return

        source_data = self.all_sources[source_key]
        self.current_source_key = source_key
        self.current_source_config = source_data['config']
        self.file_info = source_data['files']

        # Always rebuild the derived lists: keeping the previous source's
        # entries next to an empty file_info would leave the state
        # inconsistent.
        self.display_options, self.file_paths = self._names_and_paths(self.file_info)
        self.selected_file_index = -1

        if self.file_info:
            print(f"✅ 切换到OCR数据源: {source_key}")
        else:
            print(f"⚠️ 数据源 {source_key} 没有可用文件")

    def switch_to_verify_source(self, source_key: str):
        """Switch the cross-validation data source.

        Unknown keys are ignored. The verify display/path lists are always
        rebuilt from the new source, so they are empty when it has no files.
        """
        if source_key not in self.all_sources:
            return

        source_data = self.all_sources[source_key]
        self.verify_source_key = source_key
        self.verify_source_config = source_data['config']
        self.verify_file_info = source_data['files']

        # Same reasoning as in switch_to_source: never keep stale entries.
        self.verify_display_options, self.verify_file_paths = self._names_and_paths(
            self.verify_file_info
        )

        if self.verify_file_info:
            print(f"✅ 切换到验证数据源: {source_key}")
        else:
            print(f"⚠️ 验证数据源 {source_key} 没有可用文件")

    def load_ocr_data(self, json_path: str, md_path: Optional[str] = None, image_path: Optional[str] = None):
        """Load the OCR data for ``json_path`` and rebuild the bbox mapping.

        Args:
            json_path: Path of the OCR JSON result to load.
            md_path: Accepted for interface compatibility; not used here.
            image_path: Accepted for interface compatibility; not used here.

        Any exception raised while loading or processing is reported through
        ``st.error`` / ``st.exception`` instead of propagating.
        """
        try:
            effective_config = self.config
            if self.current_source_config:
                # Shallow copy: only top-level keys are replaced below, so
                # self.config itself is not modified.
                effective_config = self.config.copy()
                effective_config['paths'] = {
                    'ocr_out_dir': self.current_source_config['ocr_out_dir'],
                    'src_img_dir': self.current_source_config.get('src_img_dir', ''),
                    'pre_validation_dir': self.config['pre_validation']['out_dir'],
                }
                effective_config['current_ocr_tool'] = self.current_source_config['ocr_tool']

            self.ocr_data, self.md_content, self.image_path = load_ocr_data_file(
                json_path, effective_config
            )
            self.process_data()
        except Exception as e:
            # UI boundary: surface the failure to the user rather than crash.
            st.error(f"❌ 加载失败: {e}")
            st.exception(e)

    def process_data(self):
        """Rebuild ``text_bbox_mapping`` from the loaded OCR data."""
        self.text_bbox_mapping = process_ocr_data(self.ocr_data, self.config)

    def get_statistics(self) -> Dict:
        """Return statistics for the loaded data, mapping and marked errors."""
        return get_ocr_statistics(self.ocr_data, self.text_bbox_mapping, self.marked_errors)

    def find_verify_md_path(self, selected_file_index: int) -> Optional[Path]:
        """Find the verify-source markdown path matching the selected OCR file.

        Args:
            selected_file_index: Index into ``self.file_info``.

        Returns:
            The path of the first verify file whose ``'page'`` equals the
            selected file's page, with its suffix replaced by ``.md``; or
            ``None`` when the index is out of range (including the ``-1``
            "no selection" sentinel) or no verify file matches.
        """
        files = self.file_info or []
        # Reject negative indices explicitly: -1 is the "nothing selected"
        # sentinel and must not wrap around to the last file.
        if not 0 <= selected_file_index < len(files):
            return None

        current_page = files[selected_file_index]['page']
        for info, verify_path in zip(self.verify_file_info or [], self.verify_file_paths):
            if info['page'] == current_page:
                return Path(verify_path).with_suffix('.md')
        return None

    def create_compact_layout(self, config):
        """Create the compact layout by delegating to the layout manager."""
        return self.layout_manager.create_compact_layout(config)
|