config_loader.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299
"""
Load and save configuration files.
"""
  4. import argparse
  5. import json
  6. import sys
  7. import yaml
  8. import re
  9. from pathlib import Path
  10. from typing import Dict, List, Optional, Tuple
  11. from jinja2 import Template
  12. TABLE_EDITOR_DEFAULTS = {
  13. "viewport": {"width": 1200, "height": 600},
  14. "display": {
  15. "default_zoom": 1.0,
  16. "zoom_min": 0.25,
  17. "zoom_max": 2.0,
  18. "zoom_step": 0.25,
  19. "default_line_width": 2,
  20. "line_width_min": 1,
  21. "line_width_max": 5,
  22. "show_line_numbers": True,
  23. },
  24. "output": {
  25. "directory": "output/table_structures",
  26. "structure_suffix": "_structure.json",
  27. "image_suffix": "_with_lines.png",
  28. "defaults": {
  29. "save_structure": True,
  30. "save_image": True,
  31. "line_color": "黑色",
  32. },
  33. "line_colors": [
  34. {"name": "黑色", "rgb": [0, 0, 0]},
  35. {"name": "蓝色", "rgb": [0, 0, 255]},
  36. {"name": "红色", "rgb": [255, 0, 0]},
  37. ],
  38. },
  39. }
  40. def parse_table_editor_cli_args(argv: Optional[List[str]] = None):
  41. parser = argparse.ArgumentParser(add_help=False)
  42. parser.add_argument(
  43. "--config",
  44. type=str,
  45. default=None,
  46. help="table_line_generator 配置文件路径",
  47. )
  48. args, _ = parser.parse_known_args(argv if argv is not None else sys.argv[1:])
  49. return args
  50. def load_table_editor_config(config_path: Path) -> Dict:
  51. config_path = Path(config_path)
  52. table_cfg = {}
  53. if config_path.exists():
  54. with open(config_path, "r", encoding="utf-8") as fp:
  55. data = yaml.safe_load(fp) or {}
  56. table_cfg = data.get("table_editor", {})
  57. else:
  58. print(f"[table_editor] config not found: {config_path}, using defaults")
  59. def merge(section):
  60. merged = TABLE_EDITOR_DEFAULTS[section].copy()
  61. merged.update(table_cfg.get(section, {}))
  62. return merged
  63. result = {
  64. "viewport": merge("viewport"),
  65. "display": merge("display"),
  66. "output": merge("output"),
  67. "data_sources": _prepare_data_sources(table_cfg.get("data_sources"), config_path.parent),
  68. }
  69. result["output"]["line_colors"] = table_cfg.get("output", {}).get(
  70. "line_colors", TABLE_EDITOR_DEFAULTS["output"]["line_colors"]
  71. )
  72. result["output"]["defaults"] = {
  73. **TABLE_EDITOR_DEFAULTS["output"]["defaults"],
  74. **table_cfg.get("output", {}).get("defaults", {}),
  75. }
  76. return result
  77. def _compile_pattern(pattern: Optional[str], context: Dict) -> Optional[re.Pattern]:
  78. if not pattern:
  79. return None
  80. rendered = Template(pattern).render(**context)
  81. return re.compile(rendered)
  82. def _render_template(value, context):
  83. if value is None:
  84. return None
  85. if isinstance(value, (int, float, bool)):
  86. return value
  87. return Template(str(value)).render(**context)
  88. def _resolve_path(path_str: str, base_dir: Optional[Path], config_root: Path) -> Path:
  89. path = Path(path_str).expanduser()
  90. if not path.is_absolute():
  91. path = (base_dir or config_root) / path
  92. return path.resolve()
  93. def _prepare_data_sources(raw_sources: Optional[List[Dict]], config_root: Path) -> List[Dict]:
  94. prepared = []
  95. for src in raw_sources or []:
  96. # 🎯 构建模板上下文
  97. ctx = {
  98. 'name': src['name'],
  99. 'base_dir': src['base_dir']
  100. }
  101. base_dir_path = ctx['base_dir']
  102. def resolve_dir(field: str) -> Path:
  103. raw_value = src.get(field)
  104. if raw_value is None:
  105. raise ValueError(f"[table_editor] data source '{src.get('name')}' 缺少 {field}")
  106. rendered = _render_template(raw_value, ctx)
  107. if not rendered:
  108. raise ValueError(f"[table_editor] data source '{src.get('name')}' {field} 为空")
  109. return _resolve_path(rendered, base_dir_path, config_root)
  110. json_dir = resolve_dir("json_dir")
  111. image_dir = resolve_dir("image_dir")
  112. prepared_source = {
  113. **src,
  114. "json_dir": json_dir,
  115. "image_dir": image_dir,
  116. "context": ctx,
  117. }
  118. prepared_source["json_pattern"] = _render_template(src.get("json_pattern"), ctx)
  119. prepared_source["image_pattern"] = _render_template(src.get("image_pattern"), ctx)
  120. if "output" in src:
  121. output_cfg = dict(src["output"])
  122. if "directory" in output_cfg:
  123. rendered = _render_template(output_cfg["directory"], ctx)
  124. if rendered:
  125. output_cfg["directory"] = str(_resolve_path(rendered, base_dir_path, config_root))
  126. for suffix_key in ("structure_suffix", "image_suffix"):
  127. if suffix_key in output_cfg:
  128. output_cfg[suffix_key] = _render_template(output_cfg[suffix_key], ctx)
  129. prepared_source["output"] = output_cfg
  130. prepared.append(prepared_source)
  131. return prepared
  132. def build_data_source_catalog(source_cfg: Dict) -> List[Dict]:
  133. json_dir = Path(source_cfg["json_dir"]).expanduser().resolve()
  134. image_dir = Path(source_cfg["image_dir"]).expanduser().resolve()
  135. json_suffix = source_cfg.get("json_suffix", ".json")
  136. image_suffix = source_cfg.get("image_suffix", ".png")
  137. context = dict(source_cfg.get("context") or {})
  138. if not context:
  139. context = dict(source_cfg.get("variables", {}))
  140. context.setdefault("name", source_cfg.get("name", ""))
  141. json_regex = _compile_pattern(source_cfg.get("json_pattern"), context)
  142. image_regex = _compile_pattern(source_cfg.get("image_pattern"), context)
  143. json_files = []
  144. for file in json_dir.glob("*"):
  145. if not file.is_file():
  146. continue
  147. match = None
  148. if json_regex:
  149. match = json_regex.fullmatch(file.name)
  150. if not match:
  151. continue
  152. elif json_suffix and not file.name.endswith(json_suffix):
  153. continue
  154. page_token = match.group("page") if match and "page" in match.groupdict() else None
  155. json_files.append({
  156. "path": file,
  157. "stem": file.stem,
  158. "page_token": page_token,
  159. "page": int(page_token) if page_token and page_token.isdigit() else None,
  160. "mtime": file.stat().st_mtime,
  161. })
  162. sort_key = source_cfg.get("sort_key", "name")
  163. if sort_key == "page" and any(item["page"] is not None for item in json_files):
  164. json_files.sort(key=lambda x: (x["page"] is None, x["page"] if x["page"] is not None else x["stem"]))
  165. elif sort_key == "mtime":
  166. json_files.sort(key=lambda x: x["mtime"])
  167. else:
  168. json_files.sort(key=lambda x: x["stem"])
  169. image_map: Dict[str, Path] = {}
  170. for img in image_dir.glob("*"):
  171. if not img.is_file():
  172. continue
  173. match = None
  174. if image_regex:
  175. match = image_regex.fullmatch(img.name)
  176. if not match:
  177. continue
  178. elif image_suffix and not img.name.endswith(image_suffix):
  179. continue
  180. page_token = match.group("page") if match and "page" in match.groupdict() else None
  181. key = page_token or img.stem
  182. image_map[key] = img
  183. catalog = []
  184. for idx, item in enumerate(json_files, start=1):
  185. key = item["page_token"] or item["stem"]
  186. catalog.append({
  187. "index": idx,
  188. "display": f"{idx:03d} · {key}",
  189. "json": item["path"],
  190. "image": image_map.get(key),
  191. "page": item["page"],
  192. "page_token": item["page_token"],
  193. })
  194. return catalog
  195. def load_structure_from_config(config_path: Path) -> dict:
  196. """
  197. 从配置文件加载表格结构
  198. Args:
  199. config_path: 配置文件路径
  200. Returns:
  201. 表格结构字典
  202. """
  203. with open(config_path, 'r', encoding='utf-8') as f:
  204. structure = json.load(f)
  205. # 兼容旧版配置(补充缺失字段)
  206. if 'horizontal_lines' not in structure:
  207. # 从 rows 生成横线坐标
  208. horizontal_lines = []
  209. for row in structure.get('rows', []):
  210. horizontal_lines.append(row['y_start'])
  211. if structure.get('rows'):
  212. horizontal_lines.append(structure['rows'][-1]['y_end'])
  213. structure['horizontal_lines'] = horizontal_lines
  214. if 'vertical_lines' not in structure:
  215. # 从 columns 生成竖线坐标
  216. vertical_lines = []
  217. for col in structure.get('columns', []):
  218. vertical_lines.append(col['x_start'])
  219. if structure.get('columns'):
  220. vertical_lines.append(structure['columns'][-1]['x_end'])
  221. structure['vertical_lines'] = vertical_lines
  222. # 转换修改标记(从列表转为集合)
  223. if 'modified_h_lines' in structure:
  224. structure['modified_h_lines'] = set(structure['modified_h_lines'])
  225. else:
  226. structure['modified_h_lines'] = set()
  227. if 'modified_v_lines' in structure:
  228. structure['modified_v_lines'] = set(structure['modified_v_lines'])
  229. else:
  230. structure['modified_v_lines'] = set()
  231. # 转换旧版的 modified_rows/modified_cols(如果存在)
  232. if 'modified_rows' in structure and not structure['modified_h_lines']:
  233. structure['modified_h_lines'] = set(structure.get('modified_rows', []))
  234. if 'modified_cols' in structure and not structure['modified_v_lines']:
  235. structure['modified_v_lines'] = set(structure.get('modified_cols', []))
  236. return structure
  237. def save_structure_to_config(structure: dict, output_path: Path):
  238. """
  239. 保存表格结构到配置文件
  240. Args:
  241. structure: 表格结构字典
  242. output_path: 输出文件路径
  243. """
  244. save_data = {
  245. 'rows': structure['rows'],
  246. 'columns': structure['columns'],
  247. 'horizontal_lines': structure.get('horizontal_lines', []),
  248. 'vertical_lines': structure.get('vertical_lines', []),
  249. 'row_height': structure['row_height'],
  250. 'col_widths': structure['col_widths'],
  251. 'table_bbox': structure['table_bbox'],
  252. 'modified_h_lines': list(structure.get('modified_h_lines', set())),
  253. 'modified_v_lines': list(structure.get('modified_v_lines', set()))
  254. }
  255. with open(output_path, 'w', encoding='utf-8') as f:
  256. json.dump(save_data, f, indent=2, ensure_ascii=False)