""" 配置文件加载/保存 """ import argparse import json import sys import yaml import re from pathlib import Path from typing import Dict, List, Optional, Tuple from jinja2 import Template TABLE_EDITOR_DEFAULTS = { "viewport": {"width": 1200, "height": 600}, "display": { "default_zoom": 1.0, "zoom_min": 0.25, "zoom_max": 2.0, "zoom_step": 0.25, "default_line_width": 2, "line_width_min": 1, "line_width_max": 5, "show_line_numbers": True, }, "output": { "directory": "output/table_structures", "structure_suffix": "_structure.json", "image_suffix": "_with_lines.png", "defaults": { "save_structure": True, "save_image": True, "line_color": "黑色", }, "line_colors": [ {"name": "黑色", "rgb": [0, 0, 0]}, {"name": "蓝色", "rgb": [0, 0, 255]}, {"name": "红色", "rgb": [255, 0, 0]}, ], }, } def parse_table_editor_cli_args(argv: Optional[List[str]] = None): parser = argparse.ArgumentParser(add_help=False) parser.add_argument( "--config", type=str, default=None, help="table_line_generator 配置文件路径", ) args, _ = parser.parse_known_args(argv if argv is not None else sys.argv[1:]) return args def load_table_editor_config(config_path: Path) -> Dict: config_path = Path(config_path) table_cfg = {} if config_path.exists(): with open(config_path, "r", encoding="utf-8") as fp: data = yaml.safe_load(fp) or {} table_cfg = data.get("table_editor", {}) else: print(f"[table_editor] config not found: {config_path}, using defaults") def merge(section): merged = TABLE_EDITOR_DEFAULTS[section].copy() merged.update(table_cfg.get(section, {})) return merged result = { "viewport": merge("viewport"), "display": merge("display"), "output": merge("output"), "data_sources": _prepare_data_sources(table_cfg.get("data_sources"), config_path.parent), } result["output"]["line_colors"] = table_cfg.get("output", {}).get( "line_colors", TABLE_EDITOR_DEFAULTS["output"]["line_colors"] ) result["output"]["defaults"] = { **TABLE_EDITOR_DEFAULTS["output"]["defaults"], **table_cfg.get("output", {}).get("defaults", {}), } return result def _compile_pattern(pattern: Optional[str], context: Dict) -> Optional[re.Pattern]: if not pattern: return None rendered = Template(pattern).render(**context) return re.compile(rendered) def _render_template(value, context): if value is None: return None if isinstance(value, (int, float, bool)): return value return Template(str(value)).render(**context) def _resolve_path(path_str: str, base_dir: Optional[Path], config_root: Path) -> Path: path = Path(path_str).expanduser() if not path.is_absolute(): path = (base_dir or config_root) / path return path.resolve() def _prepare_data_sources(raw_sources: Optional[List[Dict]], config_root: Path) -> List[Dict]: prepared = [] for src in raw_sources or []: # 🎯 构建模板上下文 ctx = { 'name': src['name'], 'base_dir': src['base_dir'] } base_dir_path = ctx['base_dir'] def resolve_dir(field: str) -> Path: raw_value = src.get(field) if raw_value is None: raise ValueError(f"[table_editor] data source '{src.get('name')}' 缺少 {field}") rendered = _render_template(raw_value, ctx) if not rendered: raise ValueError(f"[table_editor] data source '{src.get('name')}' {field} 为空") return _resolve_path(rendered, base_dir_path, config_root) json_dir = resolve_dir("json_dir") image_dir = resolve_dir("image_dir") prepared_source = { **src, "json_dir": json_dir, "image_dir": image_dir, "context": ctx, } prepared_source["json_pattern"] = _render_template(src.get("json_pattern"), ctx) prepared_source["image_pattern"] = _render_template(src.get("image_pattern"), ctx) if "output" in src: output_cfg = dict(src["output"]) if 
"directory" in output_cfg: rendered = _render_template(output_cfg["directory"], ctx) if rendered: output_cfg["directory"] = str(_resolve_path(rendered, base_dir_path, config_root)) for suffix_key in ("structure_suffix", "image_suffix"): if suffix_key in output_cfg: output_cfg[suffix_key] = _render_template(output_cfg[suffix_key], ctx) prepared_source["output"] = output_cfg prepared.append(prepared_source) return prepared def build_data_source_catalog(source_cfg: Dict) -> List[Dict]: json_dir = Path(source_cfg["json_dir"]).expanduser().resolve() image_dir = Path(source_cfg["image_dir"]).expanduser().resolve() json_suffix = source_cfg.get("json_suffix", ".json") image_suffix = source_cfg.get("image_suffix", ".png") context = dict(source_cfg.get("context") or {}) if not context: context = dict(source_cfg.get("variables", {})) context.setdefault("name", source_cfg.get("name", "")) json_regex = _compile_pattern(source_cfg.get("json_pattern"), context) image_regex = _compile_pattern(source_cfg.get("image_pattern"), context) json_files = [] for file in json_dir.glob("*"): if not file.is_file(): continue match = None if json_regex: match = json_regex.fullmatch(file.name) if not match: continue elif json_suffix and not file.name.endswith(json_suffix): continue page_token = match.group("page") if match and "page" in match.groupdict() else None json_files.append({ "path": file, "stem": file.stem, "page_token": page_token, "page": int(page_token) if page_token and page_token.isdigit() else None, "mtime": file.stat().st_mtime, }) sort_key = source_cfg.get("sort_key", "name") if sort_key == "page" and any(item["page"] is not None for item in json_files): json_files.sort(key=lambda x: (x["page"] is None, x["page"] if x["page"] is not None else x["stem"])) elif sort_key == "mtime": json_files.sort(key=lambda x: x["mtime"]) else: json_files.sort(key=lambda x: x["stem"]) image_map: Dict[str, Path] = {} for img in image_dir.glob("*"): if not img.is_file(): continue match = None if image_regex: match = image_regex.fullmatch(img.name) if not match: continue elif image_suffix and not img.name.endswith(image_suffix): continue page_token = match.group("page") if match and "page" in match.groupdict() else None key = page_token or img.stem image_map[key] = img catalog = [] for idx, item in enumerate(json_files, start=1): key = item["page_token"] or item["stem"] catalog.append({ "index": idx, "display": f"{idx:03d} · {key}", "json": item["path"], "image": image_map.get(key), "page": item["page"], "page_token": item["page_token"], }) return catalog def load_structure_from_config(config_path: Path) -> dict: """ 从配置文件加载表格结构 Args: config_path: 配置文件路径 Returns: 表格结构字典 """ with open(config_path, 'r', encoding='utf-8') as f: structure = json.load(f) # 兼容旧版配置(补充缺失字段) if 'horizontal_lines' not in structure: # 从 rows 生成横线坐标 horizontal_lines = [] for row in structure.get('rows', []): horizontal_lines.append(row['y_start']) if structure.get('rows'): horizontal_lines.append(structure['rows'][-1]['y_end']) structure['horizontal_lines'] = horizontal_lines if 'vertical_lines' not in structure: # 从 columns 生成竖线坐标 vertical_lines = [] for col in structure.get('columns', []): vertical_lines.append(col['x_start']) if structure.get('columns'): vertical_lines.append(structure['columns'][-1]['x_end']) structure['vertical_lines'] = vertical_lines # 转换修改标记(从列表转为集合) if 'modified_h_lines' in structure: structure['modified_h_lines'] = set(structure['modified_h_lines']) else: structure['modified_h_lines'] = set() if 'modified_v_lines' in 


def save_structure_to_config(structure: dict, output_path: Path):
    """
    Save a table structure to a config file.

    Args:
        structure: table structure dict
        output_path: output file path
    """
    save_data = {
        'rows': structure['rows'],
        'columns': structure['columns'],
        'horizontal_lines': structure.get('horizontal_lines', []),
        'vertical_lines': structure.get('vertical_lines', []),
        'row_height': structure['row_height'],
        'col_widths': structure['col_widths'],
        'table_bbox': structure['table_bbox'],
        'modified_h_lines': list(structure.get('modified_h_lines', set())),
        'modified_v_lines': list(structure.get('modified_v_lines', set()))
    }
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(save_data, f, indent=2, ensure_ascii=False)
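

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original tool: the "config.yaml"
    # fallback path is an assumption. It loads the table editor config and
    # reports how many json files each configured data source exposes.
    _args = parse_table_editor_cli_args()
    _cfg = load_table_editor_config(Path(_args.config) if _args.config else Path("config.yaml"))
    for _source in _cfg["data_sources"]:
        _catalog = build_data_source_catalog(_source)
        print(f"{_source.get('name', '?')}: {len(_catalog)} entries")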