| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299 |
"""
Loading and saving of table editor configuration files.
"""
- import argparse
- import json
- import sys
- import yaml
- import re
- from pathlib import Path
- from typing import Dict, List, Optional, Tuple
- from jinja2 import Template
# Built-in fallback values for the ``table_editor`` configuration section.
# ``load_table_editor_config`` overlays the user's settings on top of these.

# Defaults for the "viewport" section.
_VIEWPORT_DEFAULTS = {"width": 1200, "height": 600}

# Defaults for the "display" section (zoom and line-width ranges).
_DISPLAY_DEFAULTS = {
    "default_zoom": 1.0,
    "zoom_min": 0.25,
    "zoom_max": 2.0,
    "zoom_step": 0.25,
    "default_line_width": 2,
    "line_width_min": 1,
    "line_width_max": 5,
    "show_line_numbers": True,
}

# Defaults for the "output" section.  ``defaults.line_color`` names one of
# the entries listed under ``line_colors``.
_OUTPUT_DEFAULTS = {
    "directory": "output/table_structures",
    "structure_suffix": "_structure.json",
    "image_suffix": "_with_lines.png",
    "defaults": {
        "save_structure": True,
        "save_image": True,
        "line_color": "黑色",
    },
    "line_colors": [
        {"name": "黑色", "rgb": [0, 0, 0]},
        {"name": "蓝色", "rgb": [0, 0, 255]},
        {"name": "红色", "rgb": [255, 0, 0]},
    ],
}

TABLE_EDITOR_DEFAULTS = {
    "viewport": _VIEWPORT_DEFAULTS,
    "display": _DISPLAY_DEFAULTS,
    "output": _OUTPUT_DEFAULTS,
}
def parse_table_editor_cli_args(argv: Optional[List[str]] = None):
    """Pick the table editor's own options out of a command line.

    Only ``--config`` is recognised.  Any other argument is ignored instead
    of being rejected, and no ``-h``/``--help`` option is registered
    (``add_help=False``).

    Args:
        argv: Argument list to parse; ``sys.argv[1:]`` is used when ``None``.

    Returns:
        The ``argparse.Namespace`` with a ``config`` attribute (``None`` when
        the option was not supplied).
    """
    cli_tokens = sys.argv[1:] if argv is None else argv

    option_parser = argparse.ArgumentParser(add_help=False)
    option_parser.add_argument(
        "--config",
        type=str,
        default=None,
        help="table_line_generator 配置文件路径",
    )

    # parse_known_args returns (namespace, leftovers); leftovers are dropped.
    known_options, _leftovers = option_parser.parse_known_args(cli_tokens)
    return known_options
def load_table_editor_config(config_path: Path) -> Dict:
    """Load the ``table_editor`` section of a YAML config, merged with defaults.

    A missing file is not an error: a notice is printed and the built-in
    defaults are used.  Sections that are absent or explicitly null in the
    file are treated as empty.

    Args:
        config_path: Path of the YAML configuration file.

    Returns:
        A dict with the keys ``viewport``, ``display``, ``output`` and
        ``data_sources``.  The first three are ``TABLE_EDITOR_DEFAULTS``
        overlaid with the user's values; ``data_sources`` is the list built by
        ``_prepare_data_sources``.  The result shares no mutable objects with
        ``TABLE_EDITOR_DEFAULTS``, so callers may modify it freely.
    """
    import copy  # function-scope import: not among the module's top-level imports

    config_path = Path(config_path)
    table_cfg: Dict = {}
    if config_path.exists():
        with open(config_path, "r", encoding="utf-8") as fp:
            data = yaml.safe_load(fp) or {}
        # A bare ``table_editor:`` key parses as None; treat it as empty.
        table_cfg = data.get("table_editor") or {}
    else:
        print(f"[table_editor] config not found: {config_path}, using defaults")

    def user_section(section: str) -> Dict:
        # ``or {}`` also covers a section that is present but null.
        return table_cfg.get(section) or {}

    def merge(section: str) -> Dict:
        # Deep copy so nested defaults (e.g. the ``line_colors`` list) are
        # never handed out by reference.
        merged = copy.deepcopy(TABLE_EDITOR_DEFAULTS[section])
        merged.update(user_section(section))
        return merged

    result = {
        "viewport": merge("viewport"),
        "display": merge("display"),
        "output": merge("output"),
        "data_sources": _prepare_data_sources(table_cfg.get("data_sources"), config_path.parent),
    }

    # ``merge`` replaces nested dicts wholesale, so overlay ``output.defaults``
    # key by key to keep defaults the user did not override.  ``line_colors``
    # needs no extra step: it is either the user's list or a private copy of
    # the default one.
    result["output"]["defaults"] = {
        **TABLE_EDITOR_DEFAULTS["output"]["defaults"],
        **(user_section("output").get("defaults") or {}),
    }
    return result
def _compile_pattern(pattern: Optional[str], context: Dict) -> Optional[re.Pattern]:
    """Render ``pattern`` as a Jinja2 template and compile the result as a regex.

    Args:
        pattern: Template text of a regular expression, or ``None``/empty.
        context: Variables made available to the template.

    Returns:
        The compiled pattern, or ``None`` when ``pattern`` is ``None`` or empty.
    """
    if pattern:
        regex_source = Template(pattern).render(**context)
        return re.compile(regex_source)
    return None
def _render_template(value, context):
    """Render ``value`` as a Jinja2 template unless it is a plain scalar.

    ``None`` and ``int``/``float``/``bool`` values are returned unchanged.
    Anything else is converted with ``str()`` and rendered with ``context``
    as the template variables.
    """
    passthrough = value is None or isinstance(value, (int, float, bool))
    if passthrough:
        return value
    template_text = str(value)
    return Template(template_text).render(**context)
def _resolve_path(path_str: str, base_dir: Optional[Path], config_root: Path) -> Path:
    """Turn a configured path into an absolute, resolved ``Path``.

    Args:
        path_str: The path as written in the configuration; ``~`` is expanded.
        base_dir: Directory that relative paths are anchored to.  It may be a
            ``str`` or a ``Path`` and may itself start with ``~``.  When it is
            ``None`` or empty, ``config_root`` is used instead.
        config_root: Fallback anchor for relative paths.

    Returns:
        The absolute path with ``~`` expanded and symlinks / ``..`` resolved.
    """
    path = Path(path_str).expanduser()
    if not path.is_absolute():
        anchor = Path(base_dir) if base_dir else Path(config_root)
        # Expand again after joining: a ``~`` in ``base_dir`` would otherwise
        # survive as a literal directory name under the working directory.
        path = (anchor / path).expanduser()
    return path.resolve()
def _resolve_source_dir(src: Dict, field: str, ctx: Dict, base_dir, config_root: Path) -> Path:
    """Render and resolve one mandatory directory field of a data source."""
    raw_value = src.get(field)
    if raw_value is None:
        raise ValueError(f"[table_editor] data source '{src.get('name')}' 缺少 {field}")
    rendered = _render_template(raw_value, ctx)
    if not rendered:
        raise ValueError(f"[table_editor] data source '{src.get('name')}' {field} 为空")
    return _resolve_path(rendered, base_dir, config_root)


def _prepare_data_sources(raw_sources: Optional[List[Dict]], config_root: Path) -> List[Dict]:
    """Normalise the ``data_sources`` entries of the table editor config.

    For every source, the template fields are rendered with the source's
    ``name`` and ``base_dir`` as variables, and directory fields are resolved
    to absolute paths.

    Args:
        raw_sources: The ``data_sources`` list from the config; ``None`` or an
            empty list yields an empty result.
        config_root: Directory that relative paths fall back to when a source
            has no ``base_dir``.

    Returns:
        One dict per source: a copy of the original entry with ``json_dir`` and
        ``image_dir`` replaced by resolved ``Path`` objects, ``json_pattern``
        and ``image_pattern`` rendered, ``context`` holding the template
        variables, and, when the source has an ``output`` mapping, a rendered
        copy of it.

    Raises:
        KeyError: If a source has no ``name``.
        ValueError: If ``json_dir`` or ``image_dir`` is missing or renders to
            an empty value.
    """
    prepared = []
    for src in raw_sources or []:
        # Variables available to the templates of this source.
        ctx = {
            'name': src['name'],
            # ``.get``: a missing ``base_dir`` behaves like an explicit null,
            # so relative paths fall back to ``config_root``.
            'base_dir': src.get('base_dir'),
        }
        base_dir = ctx['base_dir']

        json_dir = _resolve_source_dir(src, "json_dir", ctx, base_dir, config_root)
        image_dir = _resolve_source_dir(src, "image_dir", ctx, base_dir, config_root)

        prepared_source = {
            **src,
            "json_dir": json_dir,
            "image_dir": image_dir,
            "context": ctx,
        }
        prepared_source["json_pattern"] = _render_template(src.get("json_pattern"), ctx)
        prepared_source["image_pattern"] = _render_template(src.get("image_pattern"), ctx)

        # A null ``output:`` is left as it is instead of crashing in dict(None).
        raw_output = src.get("output")
        if raw_output is not None:
            output_cfg = dict(raw_output)
            if "directory" in output_cfg:
                rendered = _render_template(output_cfg["directory"], ctx)
                # An empty rendering keeps the original value untouched.
                if rendered:
                    output_cfg["directory"] = str(_resolve_path(rendered, base_dir, config_root))
            for suffix_key in ("structure_suffix", "image_suffix"):
                if suffix_key in output_cfg:
                    output_cfg[suffix_key] = _render_template(output_cfg[suffix_key], ctx)
            prepared_source["output"] = output_cfg

        prepared.append(prepared_source)
    return prepared
def _match_catalog_file(filename: str, regex, suffix):
    """Decide whether ``filename`` belongs to the catalog.

    Returns ``(accepted, page_token)``.  With a regex, the whole name must
    match and the token is the ``page`` named group (``None`` if the pattern
    has no such group).  Without a regex, a non-empty ``suffix`` must end the
    name and the token is always ``None``.
    """
    if regex:
        hit = regex.fullmatch(filename)
        if not hit:
            return False, None
        return True, hit.groupdict().get("page")
    if suffix and not filename.endswith(suffix):
        return False, None
    return True, None


def build_data_source_catalog(source_cfg: Dict) -> List[Dict]:
    """List the JSON files of a data source, each paired with its image.

    Files directly inside ``json_dir`` and ``image_dir`` are filtered by the
    configured regex patterns (rendered with the source's template context)
    or, when no pattern is set, by file-name suffix.  A JSON file and an image
    are paired when they share the same key: the ``page`` group captured by
    the pattern, or else the file stem.

    Args:
        source_cfg: A data source entry.  ``json_dir`` and ``image_dir`` are
            required; ``json_suffix``, ``image_suffix``, ``json_pattern``,
            ``image_pattern``, ``context``/``variables``, ``name`` and
            ``sort_key`` (``"name"``, ``"page"`` or ``"mtime"``) are optional.

    Returns:
        A list of dicts with ``index`` (1-based), ``display``, ``json``,
        ``image`` (``None`` when no image shares the key), ``page`` and
        ``page_token``, in the requested sort order.
    """
    json_root = Path(source_cfg["json_dir"]).expanduser().resolve()
    image_root = Path(source_cfg["image_dir"]).expanduser().resolve()
    json_suffix = source_cfg.get("json_suffix", ".json")
    image_suffix = source_cfg.get("image_suffix", ".png")

    # Prefer the prepared context; fall back to raw "variables".
    context = dict(source_cfg.get("context") or {})
    if not context:
        context = dict(source_cfg.get("variables", {}))
    context.setdefault("name", source_cfg.get("name", ""))

    json_regex = _compile_pattern(source_cfg.get("json_pattern"), context)
    image_regex = _compile_pattern(source_cfg.get("image_pattern"), context)

    # Collect the JSON candidates.
    entries = []
    for candidate in json_root.glob("*"):
        if not candidate.is_file():
            continue
        accepted, token = _match_catalog_file(candidate.name, json_regex, json_suffix)
        if not accepted:
            continue
        numeric_page = int(token) if token and token.isdigit() else None
        entries.append({
            "path": candidate,
            "stem": candidate.stem,
            "page_token": token,
            "page": numeric_page,
            "mtime": candidate.stat().st_mtime,
        })

    def page_order(entry):
        # Entries with a numeric page come first, ordered by page; the rest
        # follow, ordered by stem.  The leading flag keeps ints and strings
        # from ever being compared with each other.
        page = entry["page"]
        if page is None:
            return (True, entry["stem"])
        return (False, page)

    sort_key = source_cfg.get("sort_key", "name")
    if sort_key == "page" and any(entry["page"] is not None for entry in entries):
        entries.sort(key=page_order)
    elif sort_key == "mtime":
        entries.sort(key=lambda entry: entry["mtime"])
    else:
        entries.sort(key=lambda entry: entry["stem"])

    # Index the images by pairing key; a later file with the same key wins.
    image_map: Dict[str, Path] = {}
    for candidate in image_root.glob("*"):
        if not candidate.is_file():
            continue
        accepted, token = _match_catalog_file(candidate.name, image_regex, image_suffix)
        if accepted:
            image_map[token or candidate.stem] = candidate

    catalog = []
    for position, entry in enumerate(entries, start=1):
        pairing_key = entry["page_token"] or entry["stem"]
        catalog.append({
            "index": position,
            "display": f"{position:03d} · {pairing_key}",
            "json": entry["path"],
            "image": image_map.get(pairing_key),
            "page": entry["page"],
            "page_token": entry["page_token"],
        })
    return catalog
def load_structure_from_config(config_path: Path) -> dict:
    """Load a table structure from a JSON file.

    Files written by older versions are upgraded in memory:

    * ``horizontal_lines`` / ``vertical_lines`` are derived from ``rows`` /
      ``columns`` when absent (every start coordinate plus the end coordinate
      of the last row/column);
    * ``modified_h_lines`` / ``modified_v_lines`` are always returned as sets;
    * the legacy ``modified_rows`` / ``modified_cols`` keys are used when the
      corresponding new marker set is empty.

    Args:
        config_path: Path of the JSON structure file.

    Returns:
        The table structure dict.
    """
    with open(config_path, 'r', encoding='utf-8') as f:
        structure = json.load(f)

    # Backward compatibility: rebuild missing line coordinates.
    if 'horizontal_lines' not in structure:
        rows = structure.get('rows', [])
        h_lines = [row['y_start'] for row in rows]
        if rows:
            h_lines.append(rows[-1]['y_end'])
        structure['horizontal_lines'] = h_lines

    if 'vertical_lines' not in structure:
        columns = structure.get('columns', [])
        v_lines = [col['x_start'] for col in columns]
        if columns:
            v_lines.append(columns[-1]['x_end'])
        structure['vertical_lines'] = v_lines

    # JSON stores the modification markers as lists; callers get sets.
    for marker_key in ('modified_h_lines', 'modified_v_lines'):
        structure[marker_key] = set(structure.get(marker_key, ()))

    # Fall back to the legacy marker keys when the new ones are empty.
    legacy_to_current = (
        ('modified_rows', 'modified_h_lines'),
        ('modified_cols', 'modified_v_lines'),
    )
    for legacy_key, current_key in legacy_to_current:
        if legacy_key in structure and not structure[current_key]:
            structure[current_key] = set(structure.get(legacy_key, []))

    return structure
def _ordered_markers(markers) -> list:
    """Return ``markers`` as a list, sorted whenever the values are comparable."""
    try:
        return sorted(markers)
    except TypeError:
        # Values that cannot be ordered: keep the plain conversion.
        return list(markers)


def save_structure_to_config(structure: dict, output_path: Path):
    """Write a table structure to a JSON file.

    Only the known structure fields are written; any other key in
    ``structure`` is dropped.  The modification markers are stored as sorted
    lists so that saving the same structure twice produces the same file.

    Args:
        structure: Table structure dict.  ``rows``, ``columns``,
            ``row_height``, ``col_widths`` and ``table_bbox`` are required;
            the line lists and modification markers default to empty.
        output_path: Destination file path.

    Raises:
        KeyError: If one of the required fields is missing.
    """
    save_data = {
        'rows': structure['rows'],
        'columns': structure['columns'],
        'horizontal_lines': structure.get('horizontal_lines', []),
        'vertical_lines': structure.get('vertical_lines', []),
        'row_height': structure['row_height'],
        'col_widths': structure['col_widths'],
        'table_bbox': structure['table_bbox'],
        # Sets are not JSON-serialisable and iterate in hash order.
        'modified_h_lines': _ordered_markers(structure.get('modified_h_lines', set())),
        'modified_v_lines': _ordered_markers(structure.get('modified_v_lines', set())),
    }

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(save_data, f, indent=2, ensure_ascii=False)
|