config_manager.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360
  1. """
  2. 配置管理器
  3. 支持分层配置和自动发现数据源
  4. 支持 Jinja2 模板变量
  5. """
  6. import yaml
  7. from pathlib import Path
  8. from typing import Dict, List, Optional, Any
  9. from dataclasses import dataclass, field
  10. import logging
  11. from jinja2 import Template # 🎯 新增
  12. @dataclass
  13. class OCRToolConfig:
  14. """OCR 工具配置"""
  15. name: str
  16. description: str
  17. json_structure: str
  18. text_field: str
  19. bbox_field: str
  20. category_field: str
  21. confidence_field: str = "confidence"
  22. parsing_results_field: Optional[str] = None
  23. rec_texts_field: Optional[str] = None
  24. rec_boxes_field: Optional[str] = None
  25. table_body_field: Optional[str] = None
  26. table_cells_field: Optional[str] = None
  27. img_path_field: Optional[str] = None
  28. rotation: Dict[str, Any] = field(default_factory=dict)
  29. @classmethod
  30. def from_dict(cls, tool_id: str, data: Dict) -> 'OCRToolConfig':
  31. """从字典创建"""
  32. return cls(
  33. name=data.get('name', tool_id),
  34. description=data.get('description', ''),
  35. json_structure=data.get('json_structure', 'object'),
  36. text_field=data.get('text_field', 'text'),
  37. bbox_field=data.get('bbox_field', 'bbox'),
  38. category_field=data.get('category_field', 'category'),
  39. confidence_field=data.get('confidence_field', 'confidence'),
  40. parsing_results_field=data.get('parsing_results_field'),
  41. rec_texts_field=data.get('rec_texts_field'),
  42. rec_boxes_field=data.get('rec_boxes_field'),
  43. table_body_field=data.get('table_body_field'),
  44. table_cells_field=data.get('table_cells_field'),
  45. img_path_field=data.get('img_path_field'),
  46. rotation=data.get('rotation', {})
  47. )
  48. def to_dict(self) -> Dict:
  49. """转换为字典(用于 OCRValidator)"""
  50. config_dict = {
  51. 'name': self.name,
  52. 'description': self.description,
  53. 'json_structure': self.json_structure,
  54. 'text_field': self.text_field,
  55. 'bbox_field': self.bbox_field,
  56. 'category_field': self.category_field,
  57. 'confidence_field': self.confidence_field,
  58. 'rotation': self.rotation
  59. }
  60. # 添加可选字段
  61. if self.parsing_results_field:
  62. config_dict['parsing_results_field'] = self.parsing_results_field
  63. if self.rec_texts_field:
  64. config_dict['rec_texts_field'] = self.rec_texts_field
  65. if self.rec_boxes_field:
  66. config_dict['rec_boxes_field'] = self.rec_boxes_field
  67. if self.table_body_field:
  68. config_dict['table_body_field'] = self.table_body_field
  69. if self.table_cells_field:
  70. config_dict['table_cells_field'] = self.table_cells_field
  71. if self.img_path_field:
  72. config_dict['img_path_field'] = self.img_path_field
  73. return config_dict
  74. @dataclass
  75. class OCRResultConfig:
  76. """OCR 结果配置"""
  77. tool: str
  78. result_dir: str
  79. image_dir: Optional[str]
  80. description: str = ""
  81. enabled: bool = True
  82. @classmethod
  83. def from_dict(cls, data: Dict, context: Dict = None) -> 'OCRResultConfig':
  84. """
  85. 🎯 从字典创建(支持 Jinja2 模板)
  86. Args:
  87. data: 配置数据
  88. context: 模板上下文(如 {'name': '德_内蒙古银行照'})
  89. """
  90. # 🎯 渲染模板
  91. if context:
  92. result_dir = cls._render_template(data['result_dir'], context)
  93. image_dir = cls._render_template(data.get('image_dir'), context) if data.get('image_dir') else None
  94. description = cls._render_template(data.get('description', ''), context)
  95. else:
  96. result_dir = data['result_dir']
  97. image_dir = data.get('image_dir')
  98. description = data.get('description', '')
  99. return cls(
  100. tool=data['tool'],
  101. result_dir=result_dir,
  102. image_dir=image_dir,
  103. description=description,
  104. enabled=data.get('enabled', True)
  105. )
  106. @staticmethod
  107. def _render_template(template_str: Optional[str], context: Dict) -> Optional[str]:
  108. """🎯 渲染 Jinja2 模板"""
  109. if not template_str:
  110. return None
  111. try:
  112. template = Template(template_str)
  113. return template.render(context)
  114. except Exception as e:
  115. logging.warning(f"模板渲染失败: {template_str}, 错误: {e}")
  116. return template_str
  117. @dataclass
  118. class DocumentConfig:
  119. """文档配置"""
  120. name: str
  121. base_dir: str
  122. ocr_results: List[OCRResultConfig] = field(default_factory=list)
  123. @classmethod
  124. def from_dict(cls, data: Dict) -> 'DocumentConfig':
  125. """从字典创建(支持 Jinja2 模板)"""
  126. doc_data = data.get('document', data)
  127. # 🎯 构建模板上下文
  128. context = {
  129. 'name': doc_data['name'],
  130. 'base_dir': doc_data['base_dir']
  131. }
  132. return cls(
  133. name=doc_data['name'],
  134. base_dir=doc_data['base_dir'],
  135. ocr_results=[
  136. OCRResultConfig.from_dict(r, context)
  137. for r in doc_data.get('ocr_results', [])
  138. ]
  139. )
  140. @dataclass
  141. class DataSource:
  142. """数据源(用于 OCRValidator)"""
  143. name: str
  144. ocr_tool: str
  145. ocr_out_dir: str
  146. src_img_dir: str
  147. description: str = ""
  148. class ConfigManager:
  149. """配置管理器"""
  150. def __init__(self, config_dir: str = "config"):
  151. """
  152. Args:
  153. config_dir: 配置文件目录
  154. """
  155. self.config_dir = Path(config_dir)
  156. self.logger = logging.getLogger(__name__)
  157. # 加载配置
  158. self.global_config = self._load_global_config()
  159. self.ocr_tools = self._load_ocr_tools()
  160. self.documents = self._load_documents()
  161. def _load_global_config(self) -> Dict:
  162. """加载全局配置"""
  163. config_file = self.config_dir / "global.yaml"
  164. if not config_file.exists():
  165. self.logger.warning(f"全局配置文件不存在: {config_file}")
  166. return {}
  167. with open(config_file, 'r', encoding='utf-8') as f:
  168. return yaml.safe_load(f) or {}
  169. def _load_ocr_tools(self) -> Dict[str, OCRToolConfig]:
  170. """加载 OCR 工具配置(从 global.yaml)"""
  171. tools_data = self.global_config.get('ocr', {}).get('tools', {})
  172. tools = {}
  173. for tool_id, tool_data in tools_data.items():
  174. tools[tool_id] = OCRToolConfig.from_dict(tool_id, tool_data)
  175. return tools
  176. def _load_documents(self) -> Dict[str, DocumentConfig]:
  177. """加载文档配置(支持 Jinja2 模板)"""
  178. documents = {}
  179. # 从 global.yaml 读取文档配置文件列表
  180. doc_files = self.global_config.get('data_sources', [])
  181. for doc_file in doc_files:
  182. # 支持相对路径和绝对路径
  183. if not doc_file.endswith('.yaml'):
  184. doc_file = f"{doc_file}.yaml"
  185. yaml_path = self.config_dir / doc_file
  186. if not yaml_path.exists():
  187. self.logger.warning(f"文档配置文件不存在: {yaml_path}")
  188. continue
  189. try:
  190. with open(yaml_path, 'r', encoding='utf-8') as f:
  191. data = yaml.safe_load(f)
  192. # 🎯 使用支持 Jinja2 的解析方法
  193. doc_config = DocumentConfig.from_dict(data)
  194. documents[doc_config.name] = doc_config
  195. self.logger.info(f"✅ 加载文档配置: {doc_config.name} ({len(doc_config.ocr_results)} 个 OCR 结果)")
  196. except Exception as e:
  197. self.logger.error(f"加载文档配置失败: {yaml_path}, 错误: {e}")
  198. return documents
  199. def get_ocr_tool(self, tool_id: str) -> Optional[OCRToolConfig]:
  200. """获取 OCR 工具配置"""
  201. return self.ocr_tools.get(tool_id)
  202. def get_document(self, doc_name: str) -> Optional[DocumentConfig]:
  203. """获取文档配置"""
  204. return self.documents.get(doc_name)
  205. def list_documents(self) -> List[str]:
  206. """列出所有文档"""
  207. return list(self.documents.keys())
  208. def list_ocr_tools(self) -> List[str]:
  209. """列出所有 OCR 工具"""
  210. return list(self.ocr_tools.keys())
  211. def get_data_sources(self) -> List[DataSource]:
  212. """
  213. 生成数据源列表(供 OCRValidator 使用)
  214. 从文档配置自动生成 data_sources
  215. """
  216. data_sources = []
  217. for doc_name, doc_config in self.documents.items():
  218. base_dir = Path(doc_config.base_dir)
  219. for ocr_result in doc_config.ocr_results:
  220. if not ocr_result.enabled:
  221. continue
  222. # 构建完整路径
  223. ocr_out_dir = str(base_dir / ocr_result.result_dir)
  224. if ocr_result.image_dir:
  225. src_img_dir = str(base_dir / ocr_result.image_dir)
  226. else:
  227. # 如果未指定图片目录,使用结果目录
  228. src_img_dir = str(base_dir / ocr_result.result_dir / doc_name)
  229. # 生成数据源名称
  230. if ocr_result.description:
  231. source_name = f"{doc_name}_{ocr_result.description}"
  232. else:
  233. tool_config = self.get_ocr_tool(ocr_result.tool)
  234. tool_name = tool_config.name if tool_config else ocr_result.tool
  235. source_name = f"{doc_name}_{tool_name}"
  236. data_source = DataSource(
  237. name=source_name,
  238. ocr_tool=ocr_result.tool,
  239. ocr_out_dir=ocr_out_dir,
  240. src_img_dir=src_img_dir,
  241. description=ocr_result.description or f"{doc_name} 使用 {ocr_result.tool}"
  242. )
  243. data_sources.append(data_source)
  244. return data_sources
  245. def get_config_value(self, key_path: str, default=None):
  246. """
  247. 获取配置值(支持点号路径)
  248. Examples:
  249. get_config_value('styles.font_size')
  250. get_config_value('ocr.min_text_length')
  251. """
  252. keys = key_path.split('.')
  253. value = self.global_config
  254. for key in keys:
  255. if isinstance(value, dict):
  256. value = value.get(key)
  257. else:
  258. return default
  259. return value if value is not None else default
  260. def to_validator_config(self) -> Dict:
  261. """
  262. 转换为 OCRValidator 所需的配置格式
  263. Returns:
  264. 包含 data_sources 和 ocr.tools 的配置字典
  265. """
  266. # 构建 data_sources 列表
  267. data_sources_list = []
  268. for ds in self.get_data_sources():
  269. data_sources_list.append({
  270. 'name': ds.name,
  271. 'ocr_tool': ds.ocr_tool,
  272. 'ocr_out_dir': ds.ocr_out_dir,
  273. 'src_img_dir': ds.src_img_dir
  274. })
  275. # 构建 ocr.tools 字典
  276. ocr_tools_dict = {}
  277. for tool_id, tool_config in self.ocr_tools.items():
  278. ocr_tools_dict[tool_id] = tool_config.to_dict()
  279. # 返回完整配置
  280. config = self.global_config.copy()
  281. config['data_sources'] = data_sources_list
  282. # 确保 ocr.tools 存在
  283. if 'ocr' not in config:
  284. config['ocr'] = {}
  285. config['ocr']['tools'] = ocr_tools_dict
  286. return config
  287. # ============================================================================
  288. # 便捷函数
  289. # ============================================================================
  290. def load_config(config_dir: str = "config") -> ConfigManager:
  291. """加载配置"""
  292. return ConfigManager(config_dir)