1 月之前 · d25c465024
--- a/ocr_tools/cell_preprocess_lab/cell121_sweep.py
+++ b/ocr_tools/cell_preprocess_lab/cell121_sweep.py
@@ -1,194 +0,0 @@
 
				-#!/usr/bin/env python3
			
 
				-"""cell121 参数扫描：去水印方式 / threshold / contrast / upscale / det 阈值 / 整格 rec。"""
			
 
				-from __future__ import annotations
			
 
				-
			
 
				-import json
			
 
				-import os
			
 
				-import sys
			
 
				-from itertools import product
			
 
				-from pathlib import Path
			
 
				-from typing import Any, Dict, List, Optional, Tuple
			
 
				-
			
 
				-import cv2
			
 
				-import numpy as np
			
 
				-
			
 
				-_repo_root = Path(__file__).resolve().parents[2]
			
 
				-if str(_repo_root) not in sys.path:
			
 
				-    sys.path.insert(0, str(_repo_root))
			
 
				-
			
 
				-from ocr_utils.watermark import WatermarkProcessor, merge_watermark_config
			
 
				-from ocr_utils.watermark.contrast import apply_contrast_enhancement_config
			
 
				-
			
 
				-CELL121 = Path(
			
 
				-    "/Users/zhch158/workspace/data/流水分析/彭_广东兴宁农村商业银行/"
			
 
				-    "bank_statement_yusys_local/debug/table_recognition_wired/tablecell_ocr/"
			
 
				-    "彭_广东兴宁农村商业银行_page_002_0/cell121_empty_empty.png"
			
 
				-)
			
 
				-OUT_DIR = Path(__file__).parent / "output/彭_广东兴宁农村商业银行/cell121_sweep"
			
 
				-MODEL_DIR = Path(
			
 
				-    "/Users/zhch158/models/modelscope_cache/models/OpenDataLab/"
			
 
				-    "PDF-Extract-Kit-1___0/models/OCR/paddleocr_torch"
			
 
				-)
			
 
				-
			
 
				-TARGET = "20240927"
			
 
				-
			
 
				-
			
 
				-def _upscale(img: np.ndarray, min_side: int) -> np.ndarray:
			
 
				-    h, w = img.shape[:2]
			
 
				-    if h >= min_side and w >= min_side:
			
 
				-        return img
			
 
				-    s = max(min_side / max(h, 1), min_side / max(w, 1), 1.0)
			
 
				-    return cv2.resize(img, None, fx=s, fy=s, interpolation=cv2.INTER_CUBIC)
			
 
				-
			
 
				-
			
 
				-def _preprocess(
			
 
				-    raw: np.ndarray,
			
 
				-    *,
			
 
				-    method: str,
			
 
				-    thresh: Optional[int],
			
 
				-    contrast: bool,
			
 
				-    upscale: int,
			
 
				-) -> np.ndarray:
			
 
				-    user: Dict[str, Any] = {"enabled": True, "method": method}
			
 
				-    if method == "threshold" and thresh is not None:
			
 
				-        user["threshold"] = thresh
			
 
				-    cfg = merge_watermark_config("cell", user)
			
 
				-    img, _ = WatermarkProcessor(cfg, scope="cell").process(raw, force=True)
			
 
				-    if contrast:
			
 
				-        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
			
 
				-        ce = dict(cfg.get("contrast_enhancement") or {})
			
 
				-        ce["enabled"] = True
			
 
				-        ce["text_black_target"] = 88
			
 
				-        gray = apply_contrast_enhancement_config(gray, ce)
			
 
				-        img = cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR)
			
 
				-    return _upscale(img, upscale)
			
 
				-
			
 
				-
			
 
				-def _ocr(engine: Any, img: np.ndarray, *, det: bool, rec: bool) -> Dict[str, Any]:
			
 
				-    try:
			
 
				-        res = engine.ocr(img, det=det, rec=rec)
			
 
				-        texts: List[str] = []
			
 
				-        if res and res[0]:
			
 
				-            if det:
			
 
				-                for item in res[0]:
			
 
				-                    if item and len(item) >= 2 and item[1]:
			
 
				-                        texts.append(str(item[1][0] or ""))
			
 
				-            else:
			
 
				-                for item in res[0]:
			
 
				-                    if isinstance(item, (list, tuple)) and len(item) >= 1:
			
 
				-                        texts.append(str(item[0] or ""))
			
 
				-        text = "".join(texts).strip()
			
 
				-        return {
			
 
				-            "text": text,
			
 
				-            "det": det,
			
 
				-            "rec": rec,
			
 
				-            "n_boxes": len(res[0]) if res and res[0] else 0,
			
 
				-        }
			
 
				-    except Exception as e:
			
 
				-        return {"text": "", "error": str(e), "det": det, "rec": rec}
			
 
				-
			
 
				-
			
 
				-def _make_engine(det_thresh: float) -> Any:
			
 
				-    from ocr_tools.pytorch_models.pytorch_paddle import PytorchPaddleOCR
			
 
				-
			
 
				-    return PytorchPaddleOCR(
			
 
				-        lang="ch",
			
 
				-        det_model_path=str(MODEL_DIR / "ch_PP-OCRv5_det_infer.pth"),
			
 
				-        rec_model_path=str(MODEL_DIR / "ch_PP-OCRv4_rec_server_doc_infer.pth"),
			
 
				-        det_db_box_thresh=det_thresh,
			
 
				-    )
			
 
				-
			
 
				-
			
 
				-def main() -> None:
			
 
				-    if not CELL121.is_file():
			
 
				-        raise FileNotFoundError(CELL121)
			
 
				-    raw = cv2.imread(str(CELL121))
			
 
				-    OUT_DIR.mkdir(parents=True, exist_ok=True)
			
 
				-
			
 
				-    methods = ["threshold", "masked_adaptive"]
			
 
				-    thresholds = [155, 165, 170, 175, 180, None]
			
 
				-    contrasts = [False, True]
			
 
				-    upscales = [64, 96, 128, 192]
			
 
				-    det_threshs = [0.2, 0.3, 0.4, 0.5]
			
 
				-    ocr_modes = [("det_rec", True, True), ("whole_rec", False, True)]
			
 
				-
			
 
				-    results: List[Dict[str, Any]] = []
			
 
				-    hits: List[Dict[str, Any]] = []
			
 
				-    engines: Dict[float, Any] = {}
			
 
				-
			
 
				-    total = 0
			
 
				-    for method, thresh, contrast, upscale, det_th in product(
			
 
				-        methods, thresholds, contrasts, upscales, det_threshs
			
 
				-    ):
			
 
				-        if method != "threshold" and thresh is not None:
			
 
				-            continue
			
 
				-        if det_th not in engines:
			
 
				-            print(f"加载 OCR det_db_box_thresh={det_th} ...")
			
 
				-            engines[det_th] = _make_engine(det_th)
			
 
				-
			
 
				-        img = _preprocess(
			
 
				-            raw, method=method, thresh=thresh, contrast=contrast, upscale=upscale
			
 
				-        )
			
 
				-        tag = (
			
 
				-            f"{method}_t{thresh or 'd'}_c{int(contrast)}_u{upscale}_det{det_th}"
			
 
				-        )
			
 
				-        cv2.imwrite(str(OUT_DIR / f"{tag}.png"), img)
			
 
				-
			
 
				-        for mode_name, det, rec in ocr_modes:
			
 
				-            total += 1
			
 
				-            ocr = _ocr(engines[det_th], img, det=det, rec=rec)
			
 
				-            row = {
			
 
				-                "tag": tag,
			
 
				-                "method": method,
			
 
				-                "threshold": thresh,
			
 
				-                "contrast": contrast,
			
 
				-                "upscale": upscale,
			
 
				-                "det_db_box_thresh": det_th,
			
 
				-                "ocr_mode": mode_name,
			
 
				-                **ocr,
			
 
				-            }
			
 
				-            results.append(row)
			
 
				-            t = row.get("text", "")
			
 
				-            if TARGET in t or (len(t) >= 6 and t.isdigit()):
			
 
				-                row["match"] = "full" if TARGET in t else "partial"
			
 
				-                hits.append(row)
			
 
				-                print(f"HIT [{row['match']}] {mode_name} {tag} -> {t!r}")
			
 
				-
			
 
				-    # 原图对照
			
 
				-    for det_th in [0.3, 0.5]:
			
 
				-        if det_th not in engines:
			
 
				-            engines[det_th] = _make_engine(det_th)
			
 
				-        for mode_name, det, rec in ocr_modes:
			
 
				-            ocr = _ocr(engines[det_th], _upscale(raw, 128), det=det, rec=rec)
			
 
				-            row = {
			
 
				-                "tag": "raw_upscale128",
			
 
				-                "det_db_box_thresh": det_th,
			
 
				-                "ocr_mode": mode_name,
			
 
				-                **ocr,
			
 
				-            }
			
 
				-            results.append(row)
			
 
				-            if TARGET in (row.get("text") or ""):
			
 
				-                hits.append(row)
			
 
				-
			
 
				-    report = {
			
 
				-        "input": str(CELL121),
			
 
				-        "target": TARGET,
			
 
				-        "total_trials": total,
			
 
				-        "hits": hits,
			
 
				-        "all_results": results,
			
 
				-    }
			
 
				-    out_json = OUT_DIR / "cell121_sweep_report.json"
			
 
				-    out_json.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")
			
 
				-
			
 
				-    print(f"\n完成 {total} 次 OCR 试验，命中 {len(hits)} 条")
			
 
				-    print(f"报告: {out_json}")
			
 
				-    if hits:
			
 
				-        print("\n最佳命中:")
			
 
				-        for h in hits[:10]:
			
 
				-            print(f"  {h.get('ocr_mode')} {h.get('tag')}: {h.get('text')!r}")
			
 
				-    else:
			
 
				-        print("未出现完整 20240927，请查看 cell121_sweep/*.png 与 report 中 partial 结果")
			
 
				-
			
 
				-
			
 
				-if __name__ == "__main__":
			
 
				-    main()
			
--- a/ocr_tools/cell_preprocess_lab/cell_preprocess_lab.py
+++ b/ocr_tools/cell_preprocess_lab/cell_preprocess_lab.py
@@ -8,6 +8,9 @@
 
				     python cell_preprocess_lab.py cell219.png -o /tmp/cell_lab
			
 
				     python cell_preprocess_lab.py /path/to/tablecell_ocr/ -o /tmp/batch --compare-methods
			
 
				     python cell_preprocess_lab.py cell217.png -o /tmp/out --denoise --contrast
			
 
				+
			
 
				+参数网格扫描见 cell_sweep.py:
			
 
				+    python cell_sweep.py cell219_empty_empty_raw.png -o ./out -t "ATM存折取款"
			
 
				 """
			
 
				 from __future__ import annotations
			
 
				 
			
--- a/ocr_tools/cell_preprocess_lab/cell_sweep.py
+++ b/ocr_tools/cell_preprocess_lab/cell_sweep.py
@@ -0,0 +1,554 @@
 
				+#!/usr/bin/env python3
			
 
				+"""
			
 
				+单元格裁剪图预处理参数扫描：去水印 / threshold / contrast / upscale / det 阈值 / OCR 模式。
			
 
				+
			
 
				+默认从 **原图**（`*_raw.png`）出发，与 pipeline 二次 OCR 一致，避免对已预处理 debug 图二次去水印。
			
 
				+
			
 
				+用法:
			
 
				+    python cell_sweep.py cell219_empty_empty_raw.png -o ./out -t "ATM存折取款"
			
 
				+    python cell_sweep.py /path/to/tablecell_ocr/ -o ./out
			
 
				+    python cell_sweep.py cell.png --quick --no-save-images
			
 
				+    OCR_DET_MODEL_PATH=... OCR_REC_MODEL_PATH=... python cell_sweep.py cell.png
			
 
				+"""
			
 
				+from __future__ import annotations
			
 
				+
			
 
				+import argparse
			
 
				+import json
			
 
				+import os
			
 
				+import sys
			
 
				+from itertools import product
			
 
				+from pathlib import Path
			
 
				+from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
			
 
				+
			
 
				+import cv2
			
 
				+import numpy as np
			
 
				+
			
 
				+_repo_root = Path(__file__).resolve().parents[2]
			
 
				+if str(_repo_root) not in sys.path:
			
 
				+    sys.path.insert(0, str(_repo_root))
			
 
				+
			
 
				+from ocr_utils.watermark import WatermarkProcessor, merge_watermark_config
			
 
				+from ocr_utils.watermark.contrast import apply_contrast_enhancement_config
			
 
				+
			
 
				+_IMAGE_SUFFIXES = {".png", ".jpg", ".jpeg", ".bmp", ".tif", ".tiff", ".webp"}
			
 
				+_DEFAULT_MODEL_DIR = Path(
			
 
				+    "/Users/zhch158/models/modelscope_cache/models/OpenDataLab/"
			
 
				+    "PDF-Extract-Kit-1___0/models/OCR/paddleocr_torch"
			
 
				+)
			
 
				+
			
 
				+
			
 
				+def _parse_csv_ints(s: str) -> List[Optional[int]]:
			
 
				+    out: List[Optional[int]] = []
			
 
				+    for part in s.split(","):
			
 
				+        part = part.strip()
			
 
				+        if not part or part.lower() in ("none", "d", "default"):
			
 
				+            out.append(None)
			
 
				+        else:
			
 
				+            out.append(int(part))
			
 
				+    return out
			
 
				+
			
 
				+
			
 
				+def _parse_csv_floats(s: str) -> List[float]:
			
 
				+    return [float(x.strip()) for x in s.split(",") if x.strip()]
			
 
				+
			
 
				+
			
 
				+def _parse_csv_bools(s: str) -> List[bool]:
			
 
				+    out: List[bool] = []
			
 
				+    for part in s.split(","):
			
 
				+        p = part.strip().lower()
			
 
				+        if p in ("1", "true", "yes", "on"):
			
 
				+            out.append(True)
			
 
				+        elif p in ("0", "false", "no", "off"):
			
 
				+            out.append(False)
			
 
				+        else:
			
 
				+            raise ValueError(f"无效的 bool 值: {part!r}")
			
 
				+    return out
			
 
				+
			
 
				+
			
 
				+def _default_model_dir() -> Path:
			
 
				+    det = os.environ.get("OCR_DET_MODEL_PATH")
			
 
				+    if det:
			
 
				+        return Path(det).parent
			
 
				+    return _DEFAULT_MODEL_DIR
			
 
				+
			
 
				+
			
 
				+def _upscale(img: np.ndarray, min_side: int) -> np.ndarray:
			
 
				+    h, w = img.shape[:2]
			
 
				+    if h >= min_side and w >= min_side:
			
 
				+        return img
			
 
				+    s = max(min_side / max(h, 1), min_side / max(w, 1), 1.0)
			
 
				+    return cv2.resize(img, None, fx=s, fy=s, interpolation=cv2.INTER_CUBIC)
			
 
				+
			
 
				+
			
 
				+def _preprocess(
			
 
				+    raw: np.ndarray,
			
 
				+    *,
			
 
				+    method: str,
			
 
				+    thresh: Optional[int],
			
 
				+    contrast: bool,
			
 
				+    upscale: int,
			
 
				+    text_black_target: int,
			
 
				+) -> np.ndarray:
			
 
				+    user: Dict[str, Any] = {"enabled": True, "method": method}
			
 
				+    if method == "threshold" and thresh is not None:
			
 
				+        user["threshold"] = thresh
			
 
				+    cfg = merge_watermark_config("cell", user)
			
 
				+    img, _ = WatermarkProcessor(cfg, scope="cell").process(raw, force=True)
			
 
				+    if contrast:
			
 
				+        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
			
 
				+        ce = dict(cfg.get("contrast_enhancement") or {})
			
 
				+        ce["enabled"] = True
			
 
				+        ce["text_black_target"] = text_black_target
			
 
				+        gray = apply_contrast_enhancement_config(gray, ce)
			
 
				+        img = cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR)
			
 
				+    return _upscale(img, upscale)
			
 
				+
			
 
				+
			
 
				+def _parse_rec_pair(rec_part: Any) -> Tuple[str, float]:
			
 
				+    """从 OCR 返回的 (text, score) 或嵌套结构中解析识别结果。"""
			
 
				+    if rec_part is None:
			
 
				+        return "", 0.0
			
 
				+    if isinstance(rec_part, (list, tuple)) and len(rec_part) >= 2:
			
 
				+        if isinstance(rec_part[0], (list, tuple, dict)):
			
 
				+            return "", 0.0
			
 
				+        txt = str(rec_part[0] or "").strip()
			
 
				+        try:
			
 
				+            sc = float(rec_part[1] or 0.0)
			
 
				+        except (TypeError, ValueError):
			
 
				+            sc = 0.0
			
 
				+        return txt, sc if txt else 0.0
			
 
				+    if isinstance(rec_part, (list, tuple)) and len(rec_part) == 1:
			
 
				+        txt = str(rec_part[0] or "").strip()
			
 
				+        return txt, 0.0
			
 
				+    return "", 0.0
			
 
				+
			
 
				+
			
 
				+def _aggregate_rec_score(boxes: List[Dict[str, Any]]) -> float:
			
 
				+    """按字符数加权平均识别分（与 pipeline aggregate_line_ocr 一致）。"""
			
 
				+    total_len = sum(len(b.get("text") or "") for b in boxes)
			
 
				+    if total_len <= 0:
			
 
				+        return 0.0
			
 
				+    weighted = sum(
			
 
				+        len(b.get("text") or "") * float(b.get("score") or 0.0) for b in boxes
			
 
				+    )
			
 
				+    return weighted / total_len
			
 
				+
			
 
				+
			
 
				+def _ocr(engine: Any, img: np.ndarray, *, det: bool, rec: bool) -> Dict[str, Any]:
			
 
				+    empty: Dict[str, Any] = {
			
 
				+        "text": "",
			
 
				+        "score": 0.0,
			
 
				+        "boxes": [],
			
 
				+        "det": det,
			
 
				+        "rec": rec,
			
 
				+        "n_boxes": 0,
			
 
				+    }
			
 
				+    try:
			
 
				+        res = engine.ocr(img, det=det, rec=rec)
			
 
				+        items = res[0] if res and res[0] is not None else []
			
 
				+        boxes_out: List[Dict[str, Any]] = []
			
 
				+
			
 
				+        if det:
			
 
				+            for item in items:
			
 
				+                if not item or len(item) < 2:
			
 
				+                    continue
			
 
				+                text, score = _parse_rec_pair(item[1])
			
 
				+                bbox = item[0]
			
 
				+                if hasattr(bbox, "tolist"):
			
 
				+                    bbox = bbox.tolist()
			
 
				+                entry: Dict[str, Any] = {
			
 
				+                    "text": text,
			
 
				+                    "score": round(score, 6),
			
 
				+                }
			
 
				+                if bbox is not None:
			
 
				+                    entry["det_bbox"] = bbox
			
 
				+                boxes_out.append(entry)
			
 
				+        else:
			
 
				+            for item in items:
			
 
				+                text, score = _parse_rec_pair(item)
			
 
				+                if not text and isinstance(item, (list, tuple)) and len(item) >= 1:
			
 
				+                    text, score = _parse_rec_pair(item[0])
			
 
				+                boxes_out.append({"text": text, "score": round(score, 6)})
			
 
				+
			
 
				+        text = "".join(b["text"] for b in boxes_out if b.get("text")).strip()
			
 
				+        agg_score = _aggregate_rec_score(boxes_out)
			
 
				+        return {
			
 
				+            "text": text,
			
 
				+            "score": round(agg_score, 6),
			
 
				+            "boxes": boxes_out,
			
 
				+            "det": det,
			
 
				+            "rec": rec,
			
 
				+            "n_boxes": len(boxes_out),
			
 
				+        }
			
 
				+    except Exception as e:
			
 
				+        out = dict(empty)
			
 
				+        out["error"] = str(e)
			
 
				+        return out
			
 
				+
			
 
				+
			
 
				+def _make_engine(det_thresh: float, model_dir: Path) -> Any:
			
 
				+    from ocr_tools.pytorch_models.pytorch_paddle import PytorchPaddleOCR
			
 
				+
			
 
				+    det_path = os.environ.get("OCR_DET_MODEL_PATH") or str(
			
 
				+        model_dir / "ch_PP-OCRv5_det_infer.pth"
			
 
				+    )
			
 
				+    rec_path = os.environ.get("OCR_REC_MODEL_PATH") or str(
			
 
				+        model_dir / "ch_PP-OCRv4_rec_server_doc_infer.pth"
			
 
				+    )
			
 
				+    return PytorchPaddleOCR(
			
 
				+        lang="ch",
			
 
				+        det_model_path=det_path,
			
 
				+        rec_model_path=rec_path,
			
 
				+        det_db_box_thresh=det_thresh,
			
 
				+    )
			
 
				+
			
 
				+
			
 
				+def resolve_input_image(path: Path, *, prefer_raw: bool) -> Path:
			
 
				+    """优先使用与 pipeline debug 配套的 *_raw.png。"""
			
 
				+    if not prefer_raw or path.stem.endswith("_raw"):
			
 
				+        return path
			
 
				+    raw_path = path.parent / f"{path.stem}_raw{path.suffix}"
			
 
				+    if raw_path.is_file():
			
 
				+        print(f"  使用原图: {raw_path.name}（跳过 {path.name}）")
			
 
				+        return raw_path
			
 
				+    return path
			
 
				+
			
 
				+
			
 
				+def collect_inputs(path: Path, *, prefer_raw: bool) -> List[Path]:
			
 
				+    if path.is_file():
			
 
				+        if path.suffix.lower() not in _IMAGE_SUFFIXES:
			
 
				+            raise ValueError(f"不支持的图像格式: {path}")
			
 
				+        return [resolve_input_image(path, prefer_raw=prefer_raw)]
			
 
				+
			
 
				+    if not path.is_dir():
			
 
				+        raise FileNotFoundError(path)
			
 
				+
			
 
				+    all_images = sorted(
			
 
				+        p
			
 
				+        for p in path.iterdir()
			
 
				+        if p.is_file() and p.suffix.lower() in _IMAGE_SUFFIXES
			
 
				+    )
			
 
				+    if not all_images:
			
 
				+        raise FileNotFoundError(f"目录内无图像: {path}")
			
 
				+
			
 
				+    if prefer_raw:
			
 
				+        raws = [p for p in all_images if p.stem.endswith("_raw")]
			
 
				+        if raws:
			
 
				+            return raws
			
 
				+
			
 
				+    chosen: List[Path] = []
			
 
				+    for p in all_images:
			
 
				+        if p.stem.endswith("_raw"):
			
 
				+            continue
			
 
				+        raw_sibling = p.parent / f"{p.stem}_raw{p.suffix}"
			
 
				+        if prefer_raw and raw_sibling.is_file():
			
 
				+            continue
			
 
				+        chosen.append(p)
			
 
				+    return chosen or all_images
			
 
				+
			
 
				+
			
 
				+def _match_hit(text: str, target: Optional[str]) -> Optional[str]:
			
 
				+    if not text:
			
 
				+        return None
			
 
				+    if not target:
			
 
				+        return "nonempty"
			
 
				+    if target in text:
			
 
				+        return "full"
			
 
				+    if len(target) >= 6 and target.isdigit() and len(text) >= 6 and text.isdigit():
			
 
				+        return "partial"
			
 
				+    return None
			
 
				+
			
 
				+
			
 
				+def run_sweep(
			
 
				+    input_path: Path,
			
 
				+    out_dir: Path,
			
 
				+    *,
			
 
				+    prefer_raw: bool,
			
 
				+    target: Optional[str],
			
 
				+    model_dir: Path,
			
 
				+    methods: Sequence[str],
			
 
				+    thresholds: Sequence[Optional[int]],
			
 
				+    contrasts: Sequence[bool],
			
 
				+    upscales: Sequence[int],
			
 
				+    det_threshs: Sequence[float],
			
 
				+    text_black_target: int,
			
 
				+    save_images: bool,
			
 
				+    run_baseline: bool,
			
 
				+    baseline_upscale: int,
			
 
				+) -> Dict[str, Any]:
			
 
				+    resolved = resolve_input_image(input_path, prefer_raw=prefer_raw)
			
 
				+    raw = cv2.imread(str(resolved))
			
 
				+    if raw is None:
			
 
				+        raise RuntimeError(f"无法读取图像: {resolved}")
			
 
				+
			
 
				+    stem = resolved.stem.removesuffix("_raw") if resolved.stem.endswith("_raw") else resolved.stem
			
 
				+    cell_out = out_dir / stem
			
 
				+    cell_out.mkdir(parents=True, exist_ok=True)
			
 
				+
			
 
				+    ocr_modes: List[Tuple[str, bool, bool]] = [
			
 
				+        ("det_rec", True, True),
			
 
				+        ("whole_rec", False, True),
			
 
				+    ]
			
 
				+
			
 
				+    results: List[Dict[str, Any]] = []
			
 
				+    hits: List[Dict[str, Any]] = []
			
 
				+    engines: Dict[float, Any] = {}
			
 
				+    total = 0
			
 
				+
			
 
				+    for method, thresh, contrast, upscale, det_th in product(
			
 
				+        methods, thresholds, contrasts, upscales, det_threshs
			
 
				+    ):
			
 
				+        if method != "threshold" and thresh is not None:
			
 
				+            continue
			
 
				+        if det_th not in engines:
			
 
				+            print(f"  [{stem}] 加载 OCR det_db_box_thresh={det_th} ...")
			
 
				+            engines[det_th] = _make_engine(det_th, model_dir)
			
 
				+
			
 
				+        img = _preprocess(
			
 
				+            raw,
			
 
				+            method=method,
			
 
				+            thresh=thresh,
			
 
				+            contrast=contrast,
			
 
				+            upscale=upscale,
			
 
				+            text_black_target=text_black_target,
			
 
				+        )
			
 
				+        tag = f"{method}_t{thresh or 'd'}_c{int(contrast)}_u{upscale}_det{det_th}"
			
 
				+        if save_images:
			
 
				+            cv2.imwrite(str(cell_out / f"{tag}.png"), img)
			
 
				+
			
 
				+        for mode_name, det, rec in ocr_modes:
			
 
				+            total += 1
			
 
				+            ocr = _ocr(engines[det_th], img, det=det, rec=rec)
			
 
				+            row: Dict[str, Any] = {
			
 
				+                "tag": tag,
			
 
				+                "method": method,
			
 
				+                "threshold": thresh,
			
 
				+                "contrast": contrast,
			
 
				+                "upscale": upscale,
			
 
				+                "det_db_box_thresh": det_th,
			
 
				+                "ocr_mode": mode_name,
			
 
				+                **ocr,
			
 
				+            }
			
 
				+            results.append(row)
			
 
				+            m = _match_hit(row.get("text", ""), target)
			
 
				+            if m:
			
 
				+                row["match"] = m
			
 
				+                hits.append(row)
			
 
				+                print(
			
 
				+                    f"  HIT [{m}] {mode_name} {tag} "
			
 
				+                    f"score={row.get('score')} -> {row.get('text')!r}"
			
 
				+                )
			
 
				+
			
 
				+    if run_baseline:
			
 
				+        for det_th in det_threshs:
			
 
				+            if det_th not in engines:
			
 
				+                engines[det_th] = _make_engine(det_th, model_dir)
			
 
				+            base_img = _upscale(raw, baseline_upscale)
			
 
				+            if save_images:
			
 
				+                cv2.imwrite(str(cell_out / f"baseline_upscale{baseline_upscale}.png"), base_img)
			
 
				+            for mode_name, det, rec in ocr_modes:
			
 
				+                ocr = _ocr(engines[det_th], base_img, det=det, rec=rec)
			
 
				+                row = {
			
 
				+                    "tag": f"baseline_upscale{baseline_upscale}",
			
 
				+                    "det_db_box_thresh": det_th,
			
 
				+                    "ocr_mode": mode_name,
			
 
				+                    **ocr,
			
 
				+                }
			
 
				+                results.append(row)
			
 
				+                m = _match_hit(row.get("text", ""), target)
			
 
				+                if m:
			
 
				+                    row["match"] = m
			
 
				+                    hits.append(row)
			
 
				+
			
 
				+    report = {
			
 
				+        "input": str(resolved),
			
 
				+        "input_requested": str(input_path),
			
 
				+        "output_dir": str(cell_out),
			
 
				+        "target": target,
			
 
				+        "total_trials": total,
			
 
				+        "hits": hits,
			
 
				+        "all_results": results,
			
 
				+    }
			
 
				+    report_path = cell_out / "sweep_report.json"
			
 
				+    report_path.write_text(
			
 
				+        json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8"
			
 
				+    )
			
 
				+    return report
			
 
				+
			
 
				+
			
 
				+def _build_arg_parser() -> argparse.ArgumentParser:
			
 
				+    p = argparse.ArgumentParser(
			
 
				+        description="单元格图预处理 + OCR 参数网格扫描（对齐 pipeline 格级二次 OCR）",
			
 
				+    )
			
 
				+    p.add_argument(
			
 
				+        "input",
			
 
				+        type=Path,
			
 
				+        help="单元格裁剪图路径，或 tablecell_ocr 目录（批量扫描）",
			
 
				+    )
			
 
				+    p.add_argument(
			
 
				+        "-o",
			
 
				+        "--output",
			
 
				+        type=Path,
			
 
				+        default=None,
			
 
				+        help="输出目录，默认 <input_dir|input_parent>/sweep_out/<stem>",
			
 
				+    )
			
 
				+    p.add_argument(
			
 
				+        "-t",
			
 
				+        "--target",
			
 
				+        default=None,
			
 
				+        help="期望 OCR 文本；用于标记 HIT（子串匹配）。省略则任意非空为 HIT",
			
 
				+    )
			
 
				+    p.add_argument(
			
 
				+        "--model-dir",
			
 
				+        type=Path,
			
 
				+        default=None,
			
 
				+        help="PaddleOCR torch 模型目录（含 det/rec .pth），也可用 OCR_*_MODEL_PATH",
			
 
				+    )
			
 
				+    p.add_argument(
			
 
				+        "--no-prefer-raw",
			
 
				+        action="store_true",
			
 
				+        help="不自动选用同名的 *_raw.png",
			
 
				+    )
			
 
				+    p.add_argument(
			
 
				+        "--quick",
			
 
				+        action="store_true",
			
 
				+        help="缩小网格（threshold 170,175 × upscale 128,192 × det 0.3,0.5）",
			
 
				+    )
			
 
				+    p.add_argument(
			
 
				+        "--methods",
			
 
				+        default="threshold,masked_adaptive",
			
 
				+        help="去水印方式，逗号分隔",
			
 
				+    )
			
 
				+    p.add_argument(
			
 
				+        "--thresholds",
			
 
				+        default="155,165,170,175,180,none",
			
 
				+        help="threshold 法的阈值；none=预设默认",
			
 
				+    )
			
 
				+    p.add_argument(
			
 
				+        "--contrasts",
			
 
				+        default="false,true",
			
 
				+        help="是否 contrast，逗号分隔 false,true",
			
 
				+    )
			
 
				+    p.add_argument(
			
 
				+        "--upscales",
			
 
				+        default="64,96,128,192",
			
 
				+        help="最短边放大目标，逗号分隔整数",
			
 
				+    )
			
 
				+    p.add_argument(
			
 
				+        "--det-threshs",
			
 
				+        default="0.2,0.3,0.4,0.5",
			
 
				+        help="det_db_box_thresh，逗号分隔",
			
 
				+    )
			
 
				+    p.add_argument(
			
 
				+        "--text-black-target",
			
 
				+        type=int,
			
 
				+        default=88,
			
 
				+        help="contrast text_restore 目标黑度",
			
 
				+    )
			
 
				+    p.add_argument(
			
 
				+        "--no-save-images",
			
 
				+        action="store_true",
			
 
				+        help="不写出中间预处理 png（仅报告）",
			
 
				+    )
			
 
				+    p.add_argument(
			
 
				+        "--no-baseline",
			
 
				+        action="store_true",
			
 
				+        help="跳过「仅放大、不去水印」对照组",
			
 
				+    )
			
 
				+    p.add_argument(
			
 
				+        "--baseline-upscale",
			
 
				+        type=int,
			
 
				+        default=128,
			
 
				+        help="baseline 对照组的最短边放大",
			
 
				+    )
			
 
				+    return p
			
 
				+
			
 
				+
			
 
				+def main(argv: Optional[Sequence[str]] = None) -> None:
			
 
				+    args = _build_arg_parser().parse_args(argv)
			
 
				+    inputs = collect_inputs(args.input, prefer_raw=not args.no_prefer_raw)
			
 
				+    if not inputs:
			
 
				+        raise SystemExit("未找到可扫描的图像")
			
 
				+
			
 
				+    if args.output is not None:
			
 
				+        out_root = args.output
			
 
				+    elif args.input.is_file():
			
 
				+        out_root = args.input.parent / "sweep_out"
			
 
				+    else:
			
 
				+        out_root = args.input / "sweep_out"
			
 
				+    out_root.mkdir(parents=True, exist_ok=True)
			
 
				+
			
 
				+    model_dir = args.model_dir or _default_model_dir()
			
 
				+    methods = [m.strip() for m in args.methods.split(",") if m.strip()]
			
 
				+
			
 
				+    if args.quick:
			
 
				+        thresholds = [170, 175]
			
 
				+        upscales = [128, 192]
			
 
				+        det_threshs = [0.3, 0.5]
			
 
				+        contrasts = [False, True]
			
 
				+    else:
			
 
				+        thresholds = _parse_csv_ints(args.thresholds)
			
 
				+        upscales = [int(x) for x in args.upscales.split(",") if x.strip()]
			
 
				+        det_threshs = _parse_csv_floats(args.det_threshs)
			
 
				+        contrasts = _parse_csv_bools(args.contrasts)
			
 
				+
			
 
				+    print(f"扫描 {len(inputs)} 张图 -> {out_root}")
			
 
				+    print(f"  methods={methods} thresholds={thresholds} upscales={upscales}")
			
 
				+    if args.target:
			
 
				+        print(f"  target={args.target!r}")
			
 
				+
			
 
				+    summary: List[Dict[str, Any]] = []
			
 
				+    for img_path in inputs:
			
 
				+        print(f"\n=== {img_path.name} ===")
			
 
				+        report = run_sweep(
			
 
				+            img_path,
			
 
				+            out_root,
			
 
				+            prefer_raw=not args.no_prefer_raw,
			
 
				+            target=args.target,
			
 
				+            model_dir=model_dir,
			
 
				+            methods=methods,
			
 
				+            thresholds=thresholds,
			
 
				+            contrasts=contrasts,
			
 
				+            upscales=upscales,
			
 
				+            det_threshs=det_threshs,
			
 
				+            text_black_target=args.text_black_target,
			
 
				+            save_images=not args.no_save_images,
			
 
				+            run_baseline=not args.no_baseline,
			
 
				+            baseline_upscale=args.baseline_upscale,
			
 
				+        )
			
 
				+        summary.append(
			
 
				+            {
			
 
				+                "input": report["input"],
			
 
				+                "hits": len(report["hits"]),
			
 
				+                "report": str(Path(report["output_dir"]) / "sweep_report.json"),
			
 
				+            }
			
 
				+        )
			
 
				+
			
 
				+    index_path = out_root / "sweep_index.json"
			
 
				+    index_path.write_text(
			
 
				+        json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8"
			
 
				+    )
			
 
				+    print(f"\n全部完成，索引: {index_path}")
			
 
				+    for s in summary:
			
 
				+        print(f"  {s['input']}: {s['hits']} hits -> {s['report']}")
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    if len(sys.argv) == 1:
			
 
				+        print("ℹ️  未提供命令行参数，使用默认配置运行...")
			
 
				+        default_config = {
			
 
				+            "input": "/Users/zhch158/workspace/data/流水分析/彭_广东兴宁农村商业银行/bank_statement_yusys_local/debug/table_recognition_wired/tablecell_ocr/彭_广东兴宁农村商业银行_page_002_0/cell219_empty_empty_raw.png",
			
 
				+            "output": "./output/彭_广东兴宁农村商业银行/cell219_sweep",
			
 
				+            "target": "ATM存折取款",
			
 
				+        }
			
 
				+        sys.argv = [sys.argv[0], default_config["input"]]
			
 
				+        for key, value in default_config.items():
			
 
				+            if key == "input":
			
 
				+                continue
			
 
				+            flag = f"--{key.replace('_', '-')}"
			
 
				+            if isinstance(value, bool) and value:
			
 
				+                sys.argv.append(flag)
			
 
				+            elif not isinstance(value, bool):
			
 
				+                sys.argv.extend([flag, str(value)])
			
 
				+
			
 
				+    sys.exit(main())