streamlit_validator_core.py 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143
  1. """
  2. 核心验证器类
  3. """
  4. import streamlit as st
  5. from pathlib import Path
  6. from typing import Dict, List, Optional
  7. import json
  8. from ocr_validator_utils import (
  9. load_ocr_data_file, process_ocr_data,
  10. get_ocr_statistics, find_available_ocr_files_multi_source,
  11. get_data_source_display_name
  12. )
  13. from ocr_validator_layout import OCRLayoutManager
  14. class StreamlitOCRValidator:
  15. """核心验证器类"""
  def __init__(self, config_dict: Dict = None):
      """Create an empty validator and discover the available OCR sources.

      Args:
          config_dict: Validator configuration dictionary. The original note
              says it is produced by ``ConfigManager.to_validator_config()``.
              It is stored on ``self.config`` as-is (no copy, and no
              defaulting when ``None`` is passed).
      """
      self.config = config_dict

      # State of the page that is currently loaded.
      self.ocr_data = []
      self.md_content = self.image_path = ""
      self.text_bbox_mapping = {}
      self.selected_text = None
      self.marked_errors = set()

      # Primary OCR data source and the files it offers.
      self.all_sources = {}
      self.current_source_key = self.current_source_config = None
      self.file_info = []
      self.display_options = []
      self.file_paths = []
      self.selected_file_index = -1  # -1 is the initial "nothing selected" value

      # Secondary data source used for cross validation.
      self.verify_source_key = self.verify_source_config = None
      self.verify_file_info = []
      self.verify_display_options = []
      self.verify_file_paths = []

      # The layout manager receives this validator instance, so it is
      # created only after every attribute above exists.
      self.layout_manager = OCRLayoutManager(self)

      # Populate the source/file attributes from the configuration.
      self.load_multi_source_info()
  def load_multi_source_info(self):
      """Discover all OCR sources and activate the default ones.

      The result of ``find_available_ocr_files_multi_source(self.config)`` is
      stored on ``self.all_sources``. When it is non-empty, its first key
      becomes the active OCR source and, if a second key exists, that one
      becomes the cross-validation source.
      """
      self.all_sources = find_available_ocr_files_multi_source(self.config)
      if not self.all_sources:
          return

      # Keys are taken in the mapping's own iteration order.
      primary_key, *remaining_keys = self.all_sources.keys()
      self.switch_to_source(primary_key)
      if remaining_keys:
          self.switch_to_verify_source(remaining_keys[0])
  56. def switch_to_source(self, source_key: str):
  57. """切换到指定OCR数据源"""
  58. if source_key in self.all_sources:
  59. self.current_source_key = source_key
  60. source_data = self.all_sources[source_key]
  61. self.current_source_config = source_data['config']
  62. self.file_info = source_data['files']
  63. if self.file_info:
  64. self.display_options = [f"{info['display_name']}" for info in self.file_info]
  65. self.file_paths = [info['path'] for info in self.file_info]
  66. self.selected_file_index = -1
  67. print(f"✅ 切换到OCR数据源: {source_key}")
  68. else:
  69. print(f"⚠️ 数据源 {source_key} 没有可用文件")
  70. def switch_to_verify_source(self, source_key: str):
  71. """切换到指定验证数据源"""
  72. if source_key in self.all_sources:
  73. self.verify_source_key = source_key
  74. source_data = self.all_sources[source_key]
  75. self.verify_source_config = source_data['config']
  76. self.verify_file_info = source_data['files']
  77. if self.verify_file_info:
  78. self.verify_display_options = [f"{info['display_name']}" for info in self.verify_file_info]
  79. self.verify_file_paths = [info['path'] for info in self.verify_file_info]
  80. print(f"✅ 切换到验证数据源: {source_key}")
  81. else:
  82. print(f"⚠️ 验证数据源 {source_key} 没有可用文件")
  def load_ocr_data(self, json_path: str, md_path: Optional[str] = None, image_path: Optional[str] = None):
      """Load the OCR result for ``json_path`` and process it.

      When a current source config is set, a shallow copy of ``self.config``
      is made and given source-specific ``paths`` and ``current_ocr_tool``
      entries before it is passed to ``load_ocr_data_file``; otherwise
      ``self.config`` is passed unchanged. The loader's three results are
      stored on ``self.ocr_data``, ``self.md_content`` and
      ``self.image_path``, then ``process_data`` runs.

      Any exception raised along the way is reported through Streamlit
      (``st.error`` plus ``st.exception``) and is not re-raised.

      Args:
          json_path: Passed to ``load_ocr_data_file``.
          md_path: Accepted for interface compatibility; not used here.
          image_path: Accepted for interface compatibility; not used here.
      """
      try:
          source_cfg = self.current_source_config
          if not source_cfg:
              effective_config = self.config
          else:
              # Shallow copy: only top-level keys are replaced below, so the
              # shared configuration object itself is not modified.
              effective_config = self.config.copy()
              effective_config['paths'] = {
                  'ocr_out_dir': source_cfg['ocr_out_dir'],
                  'src_img_dir': source_cfg.get('src_img_dir', ''),
                  'pre_validation_dir': self.config['pre_validation']['out_dir'],
              }
              effective_config['current_ocr_tool'] = source_cfg['ocr_tool']

          loaded = load_ocr_data_file(json_path, effective_config)
          self.ocr_data, self.md_content, self.image_path = loaded
          self.process_data()
      except Exception as e:
          st.error(f"❌ 加载失败: {e}")
          st.exception(e)
  def process_data(self):
      """Run ``process_ocr_data`` on the loaded OCR data.

      The helper receives ``self.ocr_data`` and ``self.config``; whatever it
      returns replaces ``self.text_bbox_mapping``.
      """
      mapping = process_ocr_data(self.ocr_data, self.config)
      self.text_bbox_mapping = mapping
  def get_statistics(self) -> Dict:
      """Return the statistics computed by ``get_ocr_statistics``.

      The helper is given the loaded OCR data, the current text/bbox mapping
      and the set of marked errors; its result is returned unchanged.
      """
      stats = get_ocr_statistics(
          self.ocr_data,
          self.text_bbox_mapping,
          self.marked_errors,
      )
      return stats
  107. def find_verify_md_path(self, selected_file_index: int) -> Optional[Path]:
  108. """查找当前OCR文件对应的验证文件路径"""
  109. current_page = self.file_info[selected_file_index]['page']
  110. verify_md_path = None
  111. for i, info in enumerate(self.verify_file_info):
  112. if info['page'] == current_page:
  113. verify_md_path = Path(self.verify_file_paths[i]).with_suffix('.md')
  114. break
  115. return verify_md_path
  116. def create_compact_layout(self, config):
  117. """创建紧凑布局"""
  118. return self.layout_manager.create_compact_layout(config)