streamlit_validator_core.py 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137
  1. """
  2. 核心验证器类
  3. """
  4. import streamlit as st
  5. from pathlib import Path
  6. from typing import Dict, List, Optional
  7. import json
  8. from ocr_validator_utils import (
  9. load_config, load_ocr_data_file, process_ocr_data,
  10. get_ocr_statistics, find_available_ocr_files_multi_source,
  11. get_data_source_display_name
  12. )
  13. from ocr_validator_layout import OCRLayoutManager
  14. class StreamlitOCRValidator:
  15. """核心验证器类"""
  def __init__(self):
      """Create a validator with empty state and discover the OCR sources.

      The configuration is loaded first, every per-page and per-source
      attribute is then set to an empty value, the layout manager is created
      with this validator instance as its argument, and finally
      ``load_multi_source_info`` is called to fill in the source/file lists.
      """
      self.config = load_config()

      # Data belonging to the page that is currently loaded.
      self.ocr_data = []
      self.text_bbox_mapping = {}
      self.marked_errors = set()
      self.md_content = self.image_path = ""
      self.selected_text = None

      # Multi-source bookkeeping for the primary OCR source.
      self.all_sources = {}
      self.current_source_key = self.current_source_config = None
      self.file_info, self.display_options, self.file_paths = [], [], []
      # NOTE(review): -1 presumably means "no file selected" - confirm with callers.
      self.selected_file_index = -1

      # Bookkeeping for the second source used for cross-validation.
      self.verify_source_key = self.verify_source_config = None
      self.verify_file_info = []
      self.verify_display_options = []
      self.verify_file_paths = []

      # The layout manager is built only after all attributes above exist.
      self.layout_manager = OCRLayoutManager(self)

      # Populate the source and file lists from the configuration.
      self.load_multi_source_info()
  def load_multi_source_info(self):
      """Discover the available OCR sources and activate the default ones.

      Stores the result of ``find_available_ocr_files_multi_source`` in
      ``self.all_sources``. When that result is non-empty, the first key
      becomes the active OCR source and, if a second key exists, it becomes
      the verification source.
      """
      self.all_sources = find_available_ocr_files_multi_source(self.config)
      if not self.all_sources:
          return

      available_keys = list(self.all_sources.keys())
      self.switch_to_source(available_keys[0])

      # A verification source is only selected when a second source exists.
      if len(available_keys) > 1:
          self.switch_to_verify_source(available_keys[1])
  def switch_to_source(self, source_key: str):
      """Make ``source_key`` the active OCR data source.

      Unknown keys are ignored and leave the current state untouched. For a
      known key the source config and file list are stored, and the derived
      ``display_options`` / ``file_paths`` lists and ``selected_file_index``
      are always rebuilt, so they never keep entries from the previously
      active source when the new source has no files.

      Args:
          source_key: Key into ``self.all_sources``.
      """
      if source_key not in self.all_sources:
          return

      source_data = self.all_sources[source_key]
      self.current_source_key = source_key
      self.current_source_config = source_data['config']
      self.file_info = source_data['files']

      # Derive the per-file lists from whatever the source provides; an empty
      # (or missing) file list yields empty lists rather than stale ones.
      files = self.file_info or []
      self.display_options = [f"{info['display_name']}" for info in files]
      self.file_paths = [info['path'] for info in files]
      self.selected_file_index = -1

      if files:
          print(f"✅ 切换到OCR数据源: {source_key}")
      else:
          print(f"⚠️ 数据源 {source_key} 没有可用文件")
  def switch_to_verify_source(self, source_key: str):
      """Make ``source_key`` the active verification (cross-check) source.

      Unknown keys are ignored and leave the current state untouched. For a
      known key the source config and file list are stored, and the derived
      ``verify_display_options`` / ``verify_file_paths`` lists are always
      rebuilt, so they never keep entries from the previously selected
      verification source when the new source has no files.

      Args:
          source_key: Key into ``self.all_sources``.
      """
      if source_key not in self.all_sources:
          return

      source_data = self.all_sources[source_key]
      self.verify_source_key = source_key
      self.verify_source_config = source_data['config']
      self.verify_file_info = source_data['files']

      # Keep the derived lists in step with verify_file_info, including the
      # empty case, so index-based lookups cannot hit stale entries.
      files = self.verify_file_info or []
      self.verify_display_options = [f"{info['display_name']}" for info in files]
      self.verify_file_paths = [info['path'] for info in files]

      if files:
          print(f"✅ 切换到验证数据源: {source_key}")
      else:
          print(f"⚠️ 验证数据源 {source_key} 没有可用文件")
  def load_ocr_data(self, json_path: str, md_path: Optional[str] = None, image_path: Optional[str] = None):
      """Load the OCR result at ``json_path`` and rebuild the derived data.

      When a source is active, a shallow copy of the global config is
      extended with that source's paths and OCR tool before being handed to
      ``load_ocr_data_file``; otherwise the global config is used as is.
      Any exception is reported through Streamlit instead of propagating.

      Args:
          json_path: Path of the OCR JSON file to load.
          md_path: Not used by this method.
          image_path: Not used by this method.
      """
      try:
          source_cfg = self.current_source_config
          if not source_cfg:
              effective_config = self.config
          else:
              # Shallow copy: only top-level keys are replaced below, so the
              # shared self.config dict itself is not modified.
              effective_config = self.config.copy()
              effective_config['paths'] = {
                  'ocr_out_dir': source_cfg['ocr_out_dir'],
                  'src_img_dir': source_cfg.get('src_img_dir', ''),
                  'pre_validation_dir': self.config['pre_validation']['out_dir'],
              }
              effective_config['current_ocr_tool'] = source_cfg['ocr_tool']

          loaded = load_ocr_data_file(json_path, effective_config)
          self.ocr_data, self.md_content, self.image_path = loaded
          self.process_data()
      except Exception as e:
          # UI boundary: show the failure and its traceback in the page.
          st.error(f"❌ 加载失败: {e}")
          st.exception(e)
  def process_data(self):
      """Recompute ``self.text_bbox_mapping`` from the loaded OCR data.

      Passes ``self.ocr_data`` and the global config to ``process_ocr_data``
      and stores whatever it returns.
      """
      mapping = process_ocr_data(self.ocr_data, self.config)
      self.text_bbox_mapping = mapping
  def get_statistics(self) -> Dict:
      """Return the result of ``get_ocr_statistics`` for the current state.

      The helper receives the loaded OCR data, the text/bbox mapping and the
      set of marked errors.
      """
      stats = get_ocr_statistics(
          self.ocr_data,
          self.text_bbox_mapping,
          self.marked_errors,
      )
      return stats
  def find_verify_md_path(self, selected_file_index: int) -> Optional[Path]:
      """Find the verification markdown file for the selected OCR file.

      Looks up the ``'page'`` value of ``self.file_info[selected_file_index]``
      and returns the path of the first verification file with the same page,
      with its suffix replaced by ``.md``.

      Args:
          selected_file_index: Index into ``self.file_info``.

      Returns:
          The ``.md`` path of the matching verification file, or ``None``
          when the index is out of range (including the ``-1`` value this
          class starts with) or no verification file has the same page.
      """
      # Reject out-of-range indices explicitly: a negative index would
      # otherwise silently pick a file counted from the end of the list.
      if not 0 <= selected_file_index < len(self.file_info):
          return None

      current_page = self.file_info[selected_file_index]['page']

      # zip pairs each verification entry with its path and stops at the
      # shorter list, so mismatched list lengths cannot raise IndexError.
      for info, verify_path in zip(self.verify_file_info, self.verify_file_paths):
          if info['page'] == current_page:
              return Path(verify_path).with_suffix('.md')
      return None
  def create_compact_layout(self, config):
      """Delegate to the layout manager's ``create_compact_layout``.

      Args:
          config: Passed through unchanged to the layout manager.

      Returns:
          Whatever the layout manager returns.
      """
      manager = self.layout_manager
      return manager.create_compact_layout(config)