zhengchun
/
ocr_verify


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299
							"""
配置文件加载/保存
"""

import argparse
import json
import sys
import yaml
import re
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from jinja2 import Template

# Built-in defaults for the table editor. load_table_editor_config() layers the
# user's `table_editor` config section on top of these, one section at a time.
TABLE_EDITOR_DEFAULTS = {
    # Editor viewport size (presumably pixels — confirm against the UI code).
    "viewport": {"width": 1200, "height": 600},
    # Zoom and line-width settings: an initial value plus min/max bounds
    # (and a step for zoom).
    "display": {
        "default_zoom": 1.0,
        "zoom_min": 0.25,
        "zoom_max": 2.0,
        "zoom_step": 0.25,
        "default_line_width": 2,
        "line_width_min": 1,
        "line_width_max": 5,
        "show_line_numbers": True,
    },
    # Where and how results are written.
    "output": {
        "directory": "output/table_structures",
        # File-name suffixes for the saved structure JSON and the rendered image.
        "structure_suffix": "_structure.json",
        "image_suffix": "_with_lines.png",
        # Initial state of the save options; overridden key by key from the
        # user's `output.defaults`.
        "defaults": {
            "save_structure": True,
            "save_image": True,
            # Must match the "name" of one entry in "line_colors" ("黑色" = black).
            "line_color": "黑色",
        },
        # Selectable line colours: display name plus an [R, G, B] triple.
        # Replaced wholesale (not merged) when the user supplies `output.line_colors`.
        "line_colors": [
            {"name": "黑色", "rgb": [0, 0, 0]},      # black
            {"name": "蓝色", "rgb": [0, 0, 255]},    # blue
            {"name": "红色", "rgb": [255, 0, 0]},    # red
        ],
    },
}


def parse_table_editor_cli_args(argv: Optional[List[str]] = None):
    """Pick the table-editor options out of a command line.

    Only ``--config`` is recognised. The parser is built without ``-h/--help``
    and uses ``parse_known_args``, so any other argument is ignored instead of
    causing an error.

    Args:
        argv: Argument list to parse. When ``None``, ``sys.argv[1:]`` is used.

    Returns:
        The ``argparse.Namespace`` holding ``config`` (a string, or ``None``
        when the option was not given).
    """
    cli_tokens = sys.argv[1:] if argv is None else argv

    option_parser = argparse.ArgumentParser(add_help=False)
    option_parser.add_argument(
        "--config",
        type=str,
        default=None,
        help="table_line_generator 配置文件路径",
    )

    known_options, _unrecognised = option_parser.parse_known_args(cli_tokens)
    return known_options


def load_table_editor_config(config_path: Path) -> Dict:
    """Load the ``table_editor`` section of a YAML file, layered over the defaults.

    The ``viewport``, ``display`` and ``output`` sections each start from a
    deep copy of ``TABLE_EDITOR_DEFAULTS`` and are updated with the keys found
    in the file. ``output.defaults`` is merged key by key, while
    ``output.line_colors`` is taken from the file as a whole when present.
    A section that is present but empty (YAML null) is treated as absent.

    Args:
        config_path: Path of the YAML configuration file. If it does not
            exist, a notice is printed and pure defaults are returned.

    Returns:
        A dict with the keys ``viewport``, ``display``, ``output`` and
        ``data_sources`` (the latter prepared by ``_prepare_data_sources``,
        using the config file's directory as the fallback base directory).
        The result never shares mutable objects with
        ``TABLE_EDITOR_DEFAULTS``, so callers may modify it freely.
    """
    # Function-scope import: ``copy`` is not among this module's top-level imports.
    import copy

    config_path = Path(config_path)
    table_cfg: Dict = {}

    if config_path.exists():
        with open(config_path, "r", encoding="utf-8") as fp:
            data = yaml.safe_load(fp) or {}
        # A bare ``table_editor:`` key parses to None, not to an empty mapping.
        table_cfg = data.get("table_editor") or {}
    else:
        print(f"[table_editor] config not found: {config_path}, using defaults")

    def merge(section):
        # Deep copy so nested lists/dicts of the module-level defaults are
        # never handed out (and later mutated) through the returned config.
        merged = copy.deepcopy(TABLE_EDITOR_DEFAULTS[section])
        merged.update(table_cfg.get(section) or {})
        return merged

    output_overrides = table_cfg.get("output") or {}

    result = {
        "viewport": merge("viewport"),
        "display": merge("display"),
        "output": merge("output"),
        "data_sources": _prepare_data_sources(table_cfg.get("data_sources"), config_path.parent),
    }

    # merge() already carried over a user-supplied list (or a copy of the
    # default one); only an explicit null still needs replacing.
    if result["output"].get("line_colors") is None:
        result["output"]["line_colors"] = copy.deepcopy(
            TABLE_EDITOR_DEFAULTS["output"]["line_colors"]
        )

    # merge() replaced "defaults" wholesale; rebuild it so that keys the user
    # left out keep their default values.
    result["output"]["defaults"] = {
        **TABLE_EDITOR_DEFAULTS["output"]["defaults"],
        **(output_overrides.get("defaults") or {}),
    }
    return result


def _compile_pattern(pattern: Optional[str], context: Dict) -> Optional[re.Pattern]:
    if not pattern:
        return None
    rendered = Template(pattern).render(**context)
    return re.compile(rendered)


def _render_template(value, context):
    if value is None:
        return None
    if isinstance(value, (int, float, bool)):
        return value
    return Template(str(value)).render(**context)


def _resolve_path(path_str: str, base_dir: Optional[Path], config_root: Path) -> Path:
    path = Path(path_str).expanduser()
    if not path.is_absolute():
        path = (base_dir or config_root) / path
    return path.resolve()


def _prepare_data_sources(raw_sources: Optional[List[Dict]], config_root: Path) -> List[Dict]:
    """Normalise the ``data_sources`` entries of a table-editor config.

    For every source the string fields are rendered as Jinja2 templates with
    the variables ``name`` and ``base_dir``:

    * ``json_dir`` / ``image_dir`` (required) are rendered and resolved to
      paths;
    * ``json_pattern`` / ``image_pattern`` are rendered only;
    * inside an optional ``output`` mapping, ``directory`` is rendered and
      resolved (stored as ``str``), and ``structure_suffix`` /
      ``image_suffix`` are rendered.

    Relative paths are anchored at the source's ``base_dir``; when a source
    has no ``base_dir`` they are anchored at *config_root* instead.

    Args:
        raw_sources: List of source mappings from the config, or ``None``.
        config_root: Directory used to anchor relative paths of sources
            without a ``base_dir``.

    Returns:
        One new dict per source: a copy of the original mapping with the
        fields above replaced, plus a ``context`` key holding the template
        variables. The input mappings are not modified.

    Raises:
        KeyError: If a source has no ``name``.
        ValueError: If ``json_dir`` or ``image_dir`` is missing or renders to
            an empty string.
    """
    prepared = []
    for src in raw_sources or []:
        # base_dir is optional: _resolve_path falls back to config_root when it
        # is None. NOTE: a template that references {{ base_dir }} then renders
        # the text "None".
        base_dir_path = src.get("base_dir")

        # Template context shared by every field of this source.
        ctx = {
            "name": src["name"],
            "base_dir": base_dir_path,
        }

        def resolve_dir(field: str) -> Path:
            """Render and resolve a required directory field of ``src``."""
            raw_value = src.get(field)
            if raw_value is None:
                raise ValueError(f"[table_editor] data source '{src.get('name')}' 缺少 {field}")
            rendered = _render_template(raw_value, ctx)
            if not rendered:
                raise ValueError(f"[table_editor] data source '{src.get('name')}' {field} 为空")
            return _resolve_path(rendered, base_dir_path, config_root)

        json_dir = resolve_dir("json_dir")
        image_dir = resolve_dir("image_dir")

        prepared_source = {
            **src,
            "json_dir": json_dir,
            "image_dir": image_dir,
            "context": ctx,
        }
        prepared_source["json_pattern"] = _render_template(src.get("json_pattern"), ctx)
        prepared_source["image_pattern"] = _render_template(src.get("image_pattern"), ctx)

        if "output" in src:
            # Work on a copy so the caller's config mapping stays untouched.
            output_cfg = dict(src["output"])
            if "directory" in output_cfg:
                rendered = _render_template(output_cfg["directory"], ctx)
                # An empty rendering leaves the original value in place.
                if rendered:
                    output_cfg["directory"] = str(_resolve_path(rendered, base_dir_path, config_root))
            for suffix_key in ("structure_suffix", "image_suffix"):
                if suffix_key in output_cfg:
                    output_cfg[suffix_key] = _render_template(output_cfg[suffix_key], ctx)
            prepared_source["output"] = output_cfg

        prepared.append(prepared_source)
    return prepared


def build_data_source_catalog(source_cfg: Dict) -> List[Dict]:
    """List the JSON files of a data source and pair each with its image.

    Files directly inside ``json_dir`` / ``image_dir`` are selected either by
    the regex in ``json_pattern`` / ``image_pattern`` (full match on the file
    name, rendered with the source's template context) or, when no pattern is
    configured, by the ``json_suffix`` / ``image_suffix`` file-name ending
    (defaults ``.json`` / ``.png``). A named group ``page`` in a pattern
    supplies the page token; JSON and image are paired by page token, or by
    file stem when there is no token.

    Args:
        source_cfg: A data-source mapping. ``json_dir`` and ``image_dir`` are
            required; ``sort_key`` may be ``"page"``, ``"mtime"`` or anything
            else (sort by stem).

    Returns:
        One dict per JSON file, in sorted order, with the keys ``index``
        (1-based), ``display``, ``json``, ``image`` (``None`` if unmatched),
        ``page`` and ``page_token``.
    """
    json_dir = Path(source_cfg["json_dir"]).expanduser().resolve()
    image_dir = Path(source_cfg["image_dir"]).expanduser().resolve()
    json_suffix = source_cfg.get("json_suffix", ".json")
    image_suffix = source_cfg.get("image_suffix", ".png")

    # Prefer the prepared template context; otherwise rebuild one from the
    # source's "variables" and "name".
    context = dict(source_cfg.get("context") or {})
    if not context:
        context = dict(source_cfg.get("variables", {}))
        context.setdefault("name", source_cfg.get("name", ""))

    json_regex = _compile_pattern(source_cfg.get("json_pattern"), context)
    image_regex = _compile_pattern(source_cfg.get("image_pattern"), context)

    def iter_selected(directory, regex, suffix):
        """Yield ``(path, page_token)`` for each regular file that is selected."""
        for entry in directory.glob("*"):
            if not entry.is_file():
                continue
            token = None
            if regex:
                found = regex.fullmatch(entry.name)
                if not found:
                    continue
                if "page" in found.groupdict():
                    token = found.group("page")
            elif suffix and not entry.name.endswith(suffix):
                continue
            yield entry, token

    records = [
        {
            "path": path,
            "stem": path.stem,
            "page_token": token,
            "page": int(token) if token and token.isdigit() else None,
            "mtime": path.stat().st_mtime,
        }
        for path, token in iter_selected(json_dir, json_regex, json_suffix)
    ]

    ordering = source_cfg.get("sort_key", "name")
    if ordering == "page" and any(rec["page"] is not None for rec in records):
        # Numbered pages first (numerically), then unnumbered files by stem.
        records.sort(
            key=lambda rec: (
                rec["page"] is None,
                rec["stem"] if rec["page"] is None else rec["page"],
            )
        )
    elif ordering == "mtime":
        records.sort(key=lambda rec: rec["mtime"])
    else:
        records.sort(key=lambda rec: rec["stem"])

    # When several images share a key, the one encountered last wins.
    image_map: Dict[str, Path] = {
        (token or image.stem): image
        for image, token in iter_selected(image_dir, image_regex, image_suffix)
    }

    catalog = []
    for position, rec in enumerate(records, start=1):
        pairing_key = rec["page_token"] or rec["stem"]
        entry = {
            "index": position,
            "display": f"{position:03d} · {pairing_key}",
            "json": rec["path"],
            "image": image_map.get(pairing_key),
            "page": rec["page"],
            "page_token": rec["page_token"],
        }
        catalog.append(entry)
    return catalog


def load_structure_from_config(config_path: Path) -> dict:
    """
    Read a table structure from a JSON file and fill in legacy gaps.

    Args:
        config_path: path of the JSON structure file

    Returns:
        The structure dict. ``horizontal_lines`` / ``vertical_lines`` are
        always present (derived from ``rows`` / ``columns`` when the file
        lacks them), and ``modified_h_lines`` / ``modified_v_lines`` are sets.
    """
    with open(config_path, 'r', encoding='utf-8') as handle:
        structure = json.load(handle)

    # Backward compatibility: rebuild missing line coordinates from the spans.
    # Each span contributes its start; the last span also contributes its end.
    derived_lines = (
        ('horizontal_lines', 'rows', 'y_start', 'y_end'),
        ('vertical_lines', 'columns', 'x_start', 'x_end'),
    )
    for lines_key, spans_key, start_key, end_key in derived_lines:
        if lines_key in structure:
            continue
        coords = [span[start_key] for span in structure.get(spans_key, [])]
        if structure.get(spans_key):
            coords.append(structure[spans_key][-1][end_key])
        structure[lines_key] = coords

    # Modification markers are stored as JSON lists; use sets in memory.
    for marker_key in ('modified_h_lines', 'modified_v_lines'):
        structure[marker_key] = set(structure.get(marker_key, ()))

    # Older files used modified_rows / modified_cols; adopt them only when the
    # newer marker set is empty.
    legacy_markers = (
        ('modified_rows', 'modified_h_lines'),
        ('modified_cols', 'modified_v_lines'),
    )
    for legacy_key, marker_key in legacy_markers:
        if legacy_key in structure and not structure[marker_key]:
            structure[marker_key] = set(structure[legacy_key])

    return structure


def save_structure_to_config(structure: dict, output_path: Path):
    """
    Save a table structure to a JSON file.

    Only the known structure fields are written; any other keys in
    *structure* are left out. The modification markers are held as sets in
    memory and are written as sorted lists, so saving the same structure
    always produces the same file content.

    Args:
        structure: table structure dict. ``rows``, ``columns``,
            ``row_height``, ``col_widths`` and ``table_bbox`` are required;
            the line lists and modification markers default to empty.
        output_path: path of the output file

    Raises:
        KeyError: if one of the required keys is missing from *structure*.
    """
    def as_stable_list(values):
        # Sets have no defined iteration order. Sort for reproducible output,
        # but fall back to plain conversion if the values cannot be compared
        # with each other.
        try:
            return sorted(values)
        except TypeError:
            return list(values)

    save_data = {
        'rows': structure['rows'],
        'columns': structure['columns'],
        'horizontal_lines': structure.get('horizontal_lines', []),
        'vertical_lines': structure.get('vertical_lines', []),
        'row_height': structure['row_height'],
        'col_widths': structure['col_widths'],
        'table_bbox': structure['table_bbox'],
        'modified_h_lines': as_stable_list(structure.get('modified_h_lines', set())),
        'modified_v_lines': as_stable_list(structure.get('modified_v_lines', set())),
    }

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(save_data, f, indent=2, ensure_ascii=False)