# zhengchun / ocr_verify
"""
Core validator class.
"""
import streamlit as st
from pathlib import Path
from typing import Dict, List, Optional
import json

from ocr_validator_utils import (
    load_config, load_ocr_data_file, process_ocr_data,
    get_ocr_statistics, find_available_ocr_files_multi_source, 
    get_data_source_display_name
)
from ocr_validator_layout import OCRLayoutManager


class StreamlitOCRValidator:
    """Core validator: holds OCR data, data-source selection and layout state.

    The instance tracks two independent source selections taken from
    ``self.all_sources``: the *current* OCR source (the one being reviewed)
    and an optional *verify* source used for cross-validation.
    """

    def __init__(self):
        """Load the configuration, reset all state and discover data sources."""
        self.config = load_config()
        self.ocr_data = []
        self.md_content = ""
        self.image_path = ""
        self.text_bbox_mapping = {}
        self.selected_text = None
        self.marked_errors = set()

        # State for the currently selected OCR data source.
        self.all_sources = {}
        self.current_source_key = None
        self.current_source_config = None
        self.file_info = []
        self.selected_file_index = -1  # -1 means "no file selected"
        self.display_options = []
        self.file_paths = []

        # State for the cross-validation (verify) data source.
        self.verify_source_key = None
        self.verify_source_config = None
        self.verify_file_info = []
        self.verify_display_options = []
        self.verify_file_paths = []

        # The layout manager receives this validator instance.
        self.layout_manager = OCRLayoutManager(self)

        # Populate all_sources and pick the initial current/verify sources.
        self.load_multi_source_info()

    def load_multi_source_info(self):
        """Discover available sources and select the initial ones.

        The first discovered source becomes the current OCR source; when a
        second one exists it becomes the verify source.
        """
        # Normalise a falsy result (e.g. None) to an empty dict so later
        # membership checks in the switch_* methods cannot raise TypeError.
        self.all_sources = find_available_ocr_files_multi_source(self.config) or {}

        if not self.all_sources:
            return

        source_keys = list(self.all_sources)
        self.switch_to_source(source_keys[0])

        if len(source_keys) > 1:
            self.switch_to_verify_source(source_keys[1])

    def switch_to_source(self, source_key: str):
        """Make ``source_key`` the current OCR data source.

        Unknown keys are ignored and leave the state untouched. On a
        successful switch the file lists are always rebuilt and the file
        selection is cleared, even when the new source has no files, so that
        entries from the previously selected source never linger.
        """
        if source_key not in self.all_sources:
            return

        source_data = self.all_sources[source_key]
        self.current_source_key = source_key
        self.current_source_config = source_data['config']
        self.file_info = source_data['files']

        # Rebuild unconditionally: an empty source must yield empty lists.
        self.display_options = [f"{info['display_name']}" for info in self.file_info]
        self.file_paths = [info['path'] for info in self.file_info]
        self.selected_file_index = -1

        if self.file_info:
            print(f"✅ 切换到OCR数据源: {source_key}")
        else:
            print(f"⚠️ 数据源 {source_key} 没有可用文件")

    def switch_to_verify_source(self, source_key: str):
        """Make ``source_key`` the cross-validation (verify) data source.

        Unknown keys are ignored and leave the state untouched. On a
        successful switch the verify file lists are always rebuilt, even when
        the new source has no files, so stale entries never linger.
        """
        if source_key not in self.all_sources:
            return

        source_data = self.all_sources[source_key]
        self.verify_source_key = source_key
        self.verify_source_config = source_data['config']
        self.verify_file_info = source_data['files']

        # Rebuild unconditionally: an empty source must yield empty lists.
        self.verify_display_options = [
            f"{info['display_name']}" for info in self.verify_file_info
        ]
        self.verify_file_paths = [info['path'] for info in self.verify_file_info]

        if self.verify_file_info:
            print(f"✅ 切换到验证数据源: {source_key}")
        else:
            print(f"⚠️ 验证数据源 {source_key} 没有可用文件")

    def load_ocr_data(self, json_path: str, md_path: Optional[str] = None, image_path: Optional[str] = None):
        """Load the OCR data for ``json_path`` and process it.

        When a current source is selected, a shallow copy of the config is
        passed to the loader with ``paths`` and ``current_ocr_tool``
        overridden from that source; otherwise the base config is used.

        Args:
            json_path: Path of the OCR JSON file handed to the loader.
            md_path: Accepted for interface compatibility; not used here.
            image_path: Accepted for interface compatibility; not used here.

        Any exception is reported through Streamlit instead of propagating.
        """
        try:
            if self.current_source_config:
                # Shallow copy is sufficient: only top-level keys are replaced.
                temp_config = self.config.copy()
                temp_config['paths'] = {
                    'ocr_out_dir': self.current_source_config['ocr_out_dir'],
                    'src_img_dir': self.current_source_config.get('src_img_dir', ''),
                    'pre_validation_dir': self.config['pre_validation']['out_dir']
                }
                temp_config['current_ocr_tool'] = self.current_source_config['ocr_tool']
                effective_config = temp_config
            else:
                effective_config = self.config

            self.ocr_data, self.md_content, self.image_path = load_ocr_data_file(
                json_path, effective_config
            )
            self.process_data()
        except Exception as e:
            # UI boundary: show the failure to the user rather than crash.
            st.error(f"❌ 加载失败: {e}")
            st.exception(e)

    def process_data(self):
        """Rebuild ``text_bbox_mapping`` from the loaded OCR data."""
        self.text_bbox_mapping = process_ocr_data(self.ocr_data, self.config)

    def get_statistics(self) -> Dict:
        """Return statistics for the OCR data, bbox mapping and marked errors."""
        return get_ocr_statistics(self.ocr_data, self.text_bbox_mapping, self.marked_errors)

    def find_verify_md_path(self, selected_file_index: int) -> Optional[Path]:
        """Find the verify-source ``.md`` path for the selected file's page.

        Args:
            selected_file_index: Index into ``self.file_info``.

        Returns:
            The path of the first verify file whose ``page`` equals the
            selected file's ``page``, with its suffix replaced by ``.md``.
            ``None`` when the index is out of range (including the ``-1``
            "no selection" sentinel) or no verify file matches.
        """
        # Reject negative indices explicitly: -1 means "nothing selected" and
        # must not silently resolve to the last file.
        if not 0 <= selected_file_index < len(self.file_info):
            return None

        current_page = self.file_info[selected_file_index]['page']

        for info, path in zip(self.verify_file_info, self.verify_file_paths):
            if info['page'] == current_page:
                return Path(path).with_suffix('.md')

        return None

    def create_compact_layout(self, config):
        """Delegate compact-layout creation to the layout manager."""
        return self.layout_manager.create_compact_layout(config)