streamlit_validator_core.py 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147
  1. """
  2. 核心验证器类
  3. """
  4. import streamlit as st
  5. from pathlib import Path
  6. from typing import Dict, List, Optional
  7. import json
  8. from ocr_validator_utils import (
  9. load_ocr_data_file, process_ocr_data,
  10. get_ocr_statistics, find_available_ocr_files_multi_source,
  11. get_data_source_display_name
  12. )
  13. from ocr_validator_layout import OCRLayoutManager
  14. class StreamlitOCRValidator:
  15. """核心验证器类"""
  16. def __init__(self, config_dict: Dict = None): # 🎯 参数名改为 config_dict
  17. """
  18. 初始化验证器
  19. Args:
  20. config_dict: 配置字典(从 ConfigManager.to_validator_config() 生成)
  21. """
  22. self.config = config_dict # 🎯 直接赋值
  23. self.ocr_data = []
  24. self.md_content = ""
  25. self.image_path = ""
  26. self.text_bbox_mapping = {}
  27. self.selected_text = None
  28. self.marked_errors = set()
  29. # 多数据源相关
  30. self.all_sources = {}
  31. self.current_source_key = None
  32. self.current_source_config = None
  33. self.file_info = []
  34. self.selected_file_index = -1
  35. self.display_options = []
  36. self.file_paths = []
  37. # 交叉验证数据源
  38. self.verify_source_key = None
  39. self.verify_source_config = None
  40. self.verify_file_info = []
  41. self.verify_display_options = []
  42. self.verify_file_paths = []
  43. # 初始化布局管理器
  44. self.layout_manager = OCRLayoutManager(self)
  45. # 加载多数据源文件信息
  46. self.load_multi_source_info()
  47. def load_multi_source_info(self):
  48. """加载多数据源文件信息"""
  49. self.all_sources = find_available_ocr_files_multi_source(self.config)
  50. if self.all_sources:
  51. source_keys = list(self.all_sources.keys())
  52. first_source_key = source_keys[0]
  53. self.switch_to_source(first_source_key)
  54. if len(source_keys) > 1:
  55. self.switch_to_verify_source(source_keys[1])
  56. def switch_to_source(self, source_key: str):
  57. """切换到指定OCR数据源"""
  58. if source_key in self.all_sources:
  59. self.current_source_key = source_key
  60. source_data = self.all_sources[source_key]
  61. self.current_source_config = source_data['config']
  62. self.file_info = source_data['files']
  63. if self.file_info:
  64. self.display_options = [f"{info['display_name']}" for info in self.file_info]
  65. self.file_paths = [info['path'] for info in self.file_info]
  66. self.selected_file_index = -1
  67. print(f"✅ 切换到OCR数据源: {source_key}")
  68. else:
  69. print(f"⚠️ 数据源 {source_key} 没有可用文件")
  70. else:
  71. raise FileNotFoundError(f"找不到文件路径: {source_key}")
  72. def switch_to_verify_source(self, source_key: str):
  73. """切换到指定验证数据源"""
  74. if source_key in self.all_sources:
  75. self.verify_source_key = source_key
  76. source_data = self.all_sources[source_key]
  77. self.verify_source_config = source_data['config']
  78. self.verify_file_info = source_data['files']
  79. if self.verify_file_info:
  80. self.verify_display_options = [f"{info['display_name']}" for info in self.verify_file_info]
  81. self.verify_file_paths = [info['path'] for info in self.verify_file_info]
  82. print(f"✅ 切换到验证数据源: {source_key}")
  83. else:
  84. print(f"⚠️ 验证数据源 {source_key} 没有可用文件")
  85. else:
  86. raise FileNotFoundError(f"找不到文件路径: {source_key}")
  87. def load_ocr_data(self, json_path: str, md_path: Optional[str] = None, image_path: Optional[str] = None):
  88. """加载OCR相关数据"""
  89. try:
  90. if self.current_source_config:
  91. temp_config = self.config.copy()
  92. temp_config['paths'] = {
  93. 'ocr_out_dir': self.current_source_config['ocr_out_dir'],
  94. 'src_img_dir': self.current_source_config.get('src_img_dir', ''),
  95. 'pre_validation_dir': self.config['pre_validation']['out_dir']
  96. }
  97. temp_config['current_ocr_tool'] = self.current_source_config['ocr_tool']
  98. self.ocr_data, self.md_content, self.image_path = load_ocr_data_file(json_path, temp_config)
  99. else:
  100. self.ocr_data, self.md_content, self.image_path = load_ocr_data_file(json_path, self.config)
  101. self.process_data()
  102. except Exception as e:
  103. st.error(f"❌ 加载失败: {e}")
  104. st.exception(e)
  105. def process_data(self):
  106. """处理OCR数据"""
  107. self.text_bbox_mapping = process_ocr_data(self.ocr_data, self.config)
  108. def get_statistics(self) -> Dict:
  109. """获取统计信息"""
  110. return get_ocr_statistics(self.ocr_data, self.text_bbox_mapping, self.marked_errors)
  111. def find_verify_md_path(self, selected_file_index: int) -> Optional[Path]:
  112. """查找当前OCR文件对应的验证文件路径"""
  113. current_page = self.file_info[selected_file_index]['page']
  114. verify_md_path = None
  115. for i, info in enumerate(self.verify_file_info):
  116. if info['page'] == current_page:
  117. verify_md_path = Path(self.verify_file_paths[i]).with_suffix('.md')
  118. break
  119. return verify_md_path
  120. def create_compact_layout(self, config):
  121. """创建紧凑布局"""
  122. return self.layout_manager.create_compact_layout(config)