пре 1 месец · 73e783c91b
--- a/ocr_tools/universal_doc_parser/models/adapters/wired_table/text_filling.py
+++ b/ocr_tools/universal_doc_parser/models/adapters/wired_table/text_filling.py
@@ -5,6 +5,7 @@
 
															 """
														
 
															 from typing import List, Dict, Any, Tuple, Optional
														
 
															 import bisect
														
 
															+import json
														
 
															 import cv2
														
 
															 import numpy as np
														
 
															 import os
														
@@ -12,6 +13,7 @@ import re
 
															 from loguru import logger
														
 
															 from ocr_utils.coordinate_utils import CoordinateUtils
														
 
															+from ocr_utils.watermark import WatermarkProcessor
														
 
															 class TextFiller:
														
@@ -50,6 +52,61 @@ class TextFiller:
 
															         self.second_pass_prefer_whole_on_tie: bool = bool(
														
 
															             sp_cfg.get("prefer_whole_on_tie", True)
														
 
															         )
														
 
															+        self.second_pass_reocr_mode: str = str(sp_cfg.get("reocr_mode", "default"))
														
 
															+        self.second_pass_header_row: int = int(sp_cfg.get("header_row", 0))
														
 
															+        self.second_pass_strip_aspect: float = float(
														
 
															+            sp_cfg.get("strip_fallback_aspect_ratio", 1.8)
														
 
															+        )
														
 
															+        self.second_pass_whole_longer_extra: int = int(
														
 
															+            sp_cfg.get("whole_longer_min_extra_chars", 2)
														
 
															+        )
														
 
															+        self.second_pass_row_peer_min_nonempty: int = int(
														
 
															+            sp_cfg.get("row_peer_min_nonempty", 5)
														
 
															+        )
														
 
															+        cpp = sp_cfg.get("cell_preprocess") or {}
														
 
															+        if not isinstance(cpp, dict):
														
 
															+            cpp = {}
														
 
															+        light = cpp.get("light") or {}
														
 
															+        if not isinstance(light, dict):
														
 
															+            light = {}
														
 
															+        self.second_pass_light_upscale_min: int = int(
														
 
															+            light.get("upscale_min_side", 64)
														
 
															+        )
														
 
															+        er = cpp.get("enhance_retry") or {}
														
 
															+        if not isinstance(er, dict):
														
 
															+            er = {}
														
 
															+        self.second_pass_enhance_retry_enabled: bool = bool(er.get("enabled", True))
														
 
															+        self.second_pass_enhance_score_below: float = float(
														
 
															+            er.get("score_below", 0.90)
														
 
															+        )
														
 
															+        self.second_pass_enhance_min_chars: int = int(er.get("min_chars", 4))
														
 
															+        self.second_pass_enhance_short_tall: bool = bool(
														
 
															+            er.get("short_text_in_tall_cell", True)
														
 
															+        )
														
 
															+        contrast = er.get("contrast") or {}
														
 
															+        if not isinstance(contrast, dict):
														
 
															+            contrast = {}
														
 
															+        self.second_pass_enhance_contrast: Dict[str, Any] = dict(contrast)
														
 
															+        sharpen = er.get("sharpen") or {}
														
 
															+        if not isinstance(sharpen, dict):
														
 
															+            sharpen = {}
														
 
															+        self.second_pass_enhance_sharpen: Dict[str, Any] = dict(sharpen)
														
 
															+
														
 
															+        wm_user = cpp.get("watermark") or {}
														
 
															+        if not isinstance(wm_user, dict):
														
 
															+            wm_user = {}
														
 
															+        self._cell_wm_processor = WatermarkProcessor.from_user_config(
														
 
															+            wm_user, scope="cell"
														
 
															+        )
														
 
															+        denoise = cpp.get("denoise") or {}
														
 
															+        if not isinstance(denoise, dict):
														
 
															+            denoise = {}
														
 
															+        self._cell_denoise_enabled: bool = bool(denoise.get("enabled", True))
														
 
															+        self._cell_denoise_method: str = str(denoise.get("method", "median"))
														
 
															+        cell_contrast = cpp.get("contrast") or {}
														
 
															+        if not isinstance(cell_contrast, dict):
														
 
															+            cell_contrast = {}
														
 
															+        self._cell_contrast_cfg: Dict[str, Any] = dict(cell_contrast)
														
 
															     @staticmethod
														
 
															     def sanitize_debug_filename(text: str, max_length: int = 50) -> str:
														
@@ -232,7 +289,7 @@ class TextFiller:
 
															     def _recognize_whole_cell(self, cell_img: np.ndarray) -> Tuple[str, float]:
														
 
															         try:
														
 
															-            rec_res = self.ocr_engine.ocr(cell_img, det=False, rec=True)
														
 
															+            rec_res = self.ocr_engine.ocr(cell_img, det=True, rec=True)
														
 
															             items = self._extract_ocr_batch_results(rec_res)
														
 
															             if not items:
														
 
															                 return "", 0.0
														
@@ -241,19 +298,22 @@ class TextFiller:
 
															             logger.warning(f"整格 OCR 失败: {e}")
														
 
															             return "", 0.0
														
 
															-    def _recognize_cell_lines(self, cell_img: np.ndarray) -> List[Tuple[str, float]]:
														
 
															-        """det 分行后逐行识别，检测框按阅读顺序（上行下、左到右）排序。"""
														
 
															-        blocks: List[Tuple[str, float]] = []
														
 
															+    def _recognize_cell_lines_detailed(
														
 
															+        self, cell_img: np.ndarray
														
 
															+    ) -> List[Dict[str, Any]]:
														
 
															+        """det 分行后逐行识别，返回含 det_bbox 的行列表。"""
														
 
															+        lines: List[Dict[str, Any]] = []
														
 
															         try:
														
 
															             det_res = self.ocr_engine.ocr(cell_img, det=True, rec=False)
														
 
															             dt_boxes = []
														
 
															             if det_res and len(det_res) > 0:
														
 
															                 dt_boxes = det_res[0] if det_res[0] else []
														
 
															             if not dt_boxes:
														
 
															-                return blocks
														
 
															+                return lines
														
 
															             h, w = cell_img.shape[:2]
														
 
															             sorted_boxes = self.sort_det_boxes_reading_order(dt_boxes, h, w)
														
 
															             rec_img_list: List[np.ndarray] = []
														
 
															+            det_bboxes: List[List[int]] = []
														
 
															             for box in sorted_boxes:
														
 
															                 xyxy = self._det_box_to_xyxy(box, w, h)
														
 
															                 if xyxy is None:
														
@@ -262,17 +322,146 @@ class TextFiller:
 
															                 cropped = cell_img[y1:y2, x1:x2]
														
 
															                 if cropped.size > 0:
														
 
															                     rec_img_list.append(cropped)
														
 
															+                    det_bboxes.append([x1, y1, x2, y2])
														
 
															             if not rec_img_list:
														
 
															-                return blocks
														
 
															+                return lines
														
 
															             rec_res = self.ocr_engine.ocr(rec_img_list, det=False, rec=True)
														
 
															             rec_items = self._extract_ocr_batch_results(rec_res)
														
 
															-            for rec_item in rec_items:
														
 
															+            for idx, rec_item in enumerate(rec_items):
														
 
															                 text, score = self._parse_single_rec_item(rec_item)
														
 
															                 if text:
														
 
															-                    blocks.append((text, score))
														
 
															+                    lines.append(
														
 
															+                        {
														
 
															+                            "index": len(lines),
														
 
															+                            "text": text,
														
 
															+                            "score": score,
														
 
															+                            "det_bbox": det_bboxes[idx] if idx < len(det_bboxes) else [],
														
 
															+                        }
														
 
															+                    )
														
 
															         except Exception as e:
														
 
															             logger.warning(f"分行 OCR 失败: {e}")
														
 
															-        return blocks
														
 
															+        return lines
														
 
															+
														
 
															+    def _recognize_cell_lines(self, cell_img: np.ndarray) -> List[Tuple[str, float]]:
														
 
															+        return [
														
 
															+            (ln["text"], ln["score"])
														
 
															+            for ln in self._recognize_cell_lines_detailed(cell_img)
														
 
															+        ]
														
 
															+
														
 
															+    def _needs_strip_line_fallback(
														
 
															+        self,
														
 
															+        cell_img: np.ndarray,
														
 
															+        line_blocks: List[Tuple[str, float]],
														
 
															+    ) -> bool:
														
 
															+        if cell_img is None or cell_img.size == 0:
														
 
															+            return False
														
 
															+        h, w = cell_img.shape[:2]
														
 
															+        if w <= 0:
														
 
															+            return False
														
 
															+        if h / w < self.second_pass_strip_aspect:
														
 
															+            return False
														
 
															+        return len(line_blocks) <= 1
														
 
															+
														
 
															+    def _recognize_strip_fallback(
														
 
															+        self, cell_img: np.ndarray, n_strips: int = 4
														
 
															+    ) -> Tuple[str, float, List[Dict[str, Any]]]:
														
 
															+        """竖长格水平条带扫描 det+rec。"""
														
 
															+        h, w = cell_img.shape[:2]
														
 
															+        if h < 8 or w < 4:
														
 
															+            return "", 0.0, []
														
 
															+        n_strips = max(2, int(n_strips))
														
 
															+        strip_h = max(1, h // n_strips)
														
 
															+        all_lines: List[Dict[str, Any]] = []
														
 
															+        for si in range(n_strips):
														
 
															+            y1 = si * strip_h
														
 
															+            y2 = h if si == n_strips - 1 else (si + 1) * strip_h
														
 
															+            strip = cell_img[y1:y2, :]
														
 
															+            if strip.size == 0:
														
 
															+                continue
														
 
															+            for ln in self._recognize_cell_lines_detailed(strip):
														
 
															+                bb = ln.get("det_bbox") or []
														
 
															+                if len(bb) >= 4:
														
 
															+                    ln = dict(ln)
														
 
															+                    ln["det_bbox"] = [bb[0], bb[1] + y1, bb[2], bb[3] + y1]
														
 
															+                all_lines.append(ln)
														
 
															+        blocks = [(ln["text"], ln["score"]) for ln in all_lines]
														
 
															+        text, score = self.aggregate_line_ocr(
														
 
															+            blocks,
														
 
															+            line_min_score=self.second_pass_line_min_score,
														
 
															+            drop_low_score_blocks=self.second_pass_drop_low,
														
 
															+        )
														
 
															+        return text, score, all_lines
														
 
															+
														
 
															+    def _upscale_cell_if_small(
														
 
															+        self, cell_img: np.ndarray, min_side: Optional[int] = None
														
 
															+    ) -> np.ndarray:
														
 
															+        min_side = min_side if min_side is not None else self.second_pass_light_upscale_min
														
 
															+        ch, cw = cell_img.shape[:2]
														
 
															+        if ch >= min_side and cw >= min_side:
														
 
															+            return cell_img
														
 
															+        scale = max(min_side / max(ch, 1), min_side / max(cw, 1), 1.0)
														
 
															+        if scale <= 1.0:
														
 
															+            return cell_img
														
 
															+        return cv2.resize(
														
 
															+            cell_img, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC
														
 
															+        )
														
 
															+
														
 
															+    def _denoise_cell(self, cell_img: np.ndarray) -> np.ndarray:
														
 
															+        if not self._cell_denoise_enabled:
														
 
															+            return cell_img
														
 
															+        method = self._cell_denoise_method
														
 
															+        if method == "median":
														
 
															+            k = 3
														
 
															+            if cell_img.ndim == 2:
														
 
															+                return cv2.medianBlur(cell_img, k)
														
 
															+            return cv2.medianBlur(cell_img, k)
														
 
															+        return cell_img
														
 
															+
														
 
															+    def _apply_cell_contrast(
														
 
															+        self, cell_img: np.ndarray, contrast_cfg: Dict[str, Any]
														
 
															+    ) -> np.ndarray:
														
 
															+        from ocr_utils.watermark.contrast import apply_contrast_enhancement_config
														
 
															+
														
 
															+        if not contrast_cfg.get("enabled", False):
														
 
															+            return cell_img
														
 
															+        if len(cell_img.shape) == 3:
														
 
															+            gray = cv2.cvtColor(cell_img, cv2.COLOR_BGR2GRAY)
														
 
															+        else:
														
 
															+            gray = cell_img
														
 
															+        gray = apply_contrast_enhancement_config(gray, contrast_cfg)
														
 
															+        if self.second_pass_enhance_sharpen.get("enabled", False):
														
 
															+            amount = float(self.second_pass_enhance_sharpen.get("amount", 0.3))
														
 
															+            blurred = cv2.GaussianBlur(gray, (0, 0), 1.0)
														
 
															+            gray = cv2.addWeighted(gray, 1.0 + amount, blurred, -amount, 0)
														
 
															+        if cell_img.ndim == 3:
														
 
															+            return cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR)
														
 
															+        return gray
														
 
															+
														
 
															+    def _preprocess_cell_for_ocr(
														
 
															+        self, cell_img: np.ndarray, mode: str = "light"
														
 
															+    ) -> Tuple[np.ndarray, List[str]]:
														
 
															+        stages: List[str] = []
														
 
															+        img = cell_img
														
 
															+
														
 
															+        if self._cell_wm_processor.enabled:
														
 
															+            img, wm_stages = self._cell_wm_processor.process(img, force=True)
														
 
															+            stages.extend(wm_stages)
														
 
															+
														
 
															+        if self._cell_denoise_enabled and "wm" in stages:
														
 
															+            img = self._denoise_cell(img)
														
 
															+            stages.append("denoise")
														
 
															+
														
 
															+        if mode == "enhance":
														
 
															+            contrast_cfg = self.second_pass_enhance_contrast
														
 
															+            if self._cell_contrast_cfg.get("enabled", False):
														
 
															+                contrast_cfg = self._cell_contrast_cfg
														
 
															+            if contrast_cfg.get("enabled", False) and "wm" in stages:
														
 
															+                img = self._apply_cell_contrast(img, contrast_cfg)
														
 
															+                stages.append("contrast")
														
 
															+
														
 
															+        img = self._upscale_cell_if_small(img)
														
 
															+        stages.append("upscale")
														
 
															+        return img, stages
														
 
															     def _pick_line_vs_whole(
														
 
															         self,
														
@@ -280,23 +469,339 @@ class TextFiller:
 
															         line_score: float,
														
 
															         whole_text: str,
														
 
															         whole_score: float,
														
 
															+        strip_text: str = "",
														
 
															+        strip_score: float = 0.0,
														
 
															     ) -> Tuple[str, float, str]:
														
 
															-        """返回 (text, score, strategy) strategy in lines|whole|tie_whole|tie_lines."""
														
 
															-        if not self.second_pass_whole_fallback:
														
 
															-            return line_text, line_score, "lines"
														
 
															-        if not whole_text and line_text:
														
 
															-            return line_text, line_score, "lines"
														
 
															-        if whole_text and not line_text:
														
 
															-            return whole_text, whole_score, "whole"
														
 
															-        if not whole_text and not line_text:
														
 
															+        """返回 (text, score, strategy)。"""
														
 
															+        candidates: List[Tuple[str, float, str]] = []
														
 
															+        if line_text:
														
 
															+            candidates.append((line_text, line_score, "lines"))
														
 
															+        if whole_text and self.second_pass_whole_fallback:
														
 
															+            candidates.append((whole_text, whole_score, "whole"))
														
 
															+        if strip_text:
														
 
															+            candidates.append((strip_text, strip_score, "strip"))
														
 
															+
														
 
															+        if not candidates:
														
 
															             return "", 0.0, "empty"
														
 
															-        if line_score > whole_score:
														
 
															-            return line_text, line_score, "lines"
														
 
															-        if line_score < whole_score:
														
 
															-            return whole_text, whole_score, "whole"
														
 
															-        if self.second_pass_prefer_whole_on_tie and whole_text:
														
 
															-            return whole_text, whole_score, "tie_whole"
														
 
															-        return line_text, line_score, "tie_lines"
														
 
															+
														
 
															+        if (
														
 
															+            whole_text
														
 
															+            and line_text
														
 
															+            and line_score > whole_score
														
 
															+            and len(whole_text) >= len(line_text) + self.second_pass_whole_longer_extra
														
 
															+            and len(whole_text) > len(line_text)
														
 
															+        ):
														
 
															+            return whole_text, whole_score, "whole_longer"
														
 
															+
														
 
															+        if (
														
 
															+            strip_text
														
 
															+            and line_text
														
 
															+            and line_score > strip_score
														
 
															+            and len(strip_text) >= len(line_text) + self.second_pass_whole_longer_extra
														
 
															+            and len(strip_text) > len(line_text)
														
 
															+        ):
														
 
															+            return strip_text, strip_score, "strip_longer"
														
 
															+
														
 
															+        best = max(candidates, key=lambda c: (c[1], len(c[0])))
														
 
															+        if len(candidates) > 1:
														
 
															+            top_score = best[1]
														
 
															+            tied = [c for c in candidates if abs(c[1] - top_score) < 1e-6]
														
 
															+            if len(tied) > 1 and self.second_pass_prefer_whole_on_tie:
														
 
															+                for pref in ("whole", "strip", "lines"):
														
 
															+                    for c in tied:
														
 
															+                        if c[2] == pref or c[2].endswith(pref):
														
 
															+                            if pref == "whole" and c[2] == "whole":
														
 
															+                                return c[0], c[1], "tie_whole"
														
 
															+                            if pref == "strip" and "strip" in c[2]:
														
 
															+                                return c[0], c[1], "tie_strip"
														
 
															+                return best[0], best[1], "tie_lines"
														
 
															+        return best[0], best[1], best[2]
														
 
															+
														
 
															+    @staticmethod
														
 
															+    def _pick_better_ocr_result(
														
 
															+        pass1: Dict[str, Any], pass2: Dict[str, Any]
														
 
															+    ) -> Dict[str, Any]:
														
 
															+        """Pass2 增强重试后择优；拒绝异常分数或覆盖已接受的高分短文本。"""
														
 
															+        t1 = (pass1.get("final_text") or "").strip()
														
 
															+        t2 = (pass2.get("final_text") or "").strip()
														
 
															+        s1 = float(pass1.get("final_score") or 0.0)
														
 
															+        s2 = float(pass2.get("final_score") or 0.0)
														
 
															+        if not t2:
														
 
															+            return pass1
														
 
															+        if not t1:
														
 
															+            return pass2 if 0.0 <= s2 <= 1.0 else pass1
														
 
															+        if s2 > 1.0 or s2 < 0.0:
														
 
															+            return pass1
														
 
															+        if pass1.get("accepted") and not pass2.get("accepted"):
														
 
															+            return pass1
														
 
															+        if s1 >= 0.95 and len(t2) > len(t1) + 2 and s2 < 0.5:
														
 
															+            return pass1
														
 
															+        if len(t2) > len(t1) + 1 and s1 >= 0.9 and s2 <= s1:
														
 
															+            return pass1
														
 
															+        if len(t2) > len(t1) + 1:
														
 
															+            return pass2
														
 
															+        if len(t1) > len(t2) + 1:
														
 
															+            return pass1
														
 
															+        if s2 > s1 + 0.02:
														
 
															+            return pass2
														
 
															+        if s1 > s2 + 0.02:
														
 
															+            return pass1
														
 
															+        return pass2 if len(t2) >= len(t1) else pass1
														
 
															+
														
 
															+    def _should_run_whole_fallback(
														
 
															+        self,
														
 
															+        line_text: str,
														
 
															+        line_score: float,
														
 
															+        cell_img: np.ndarray,
														
 
															+        line_blocks: List[Tuple[str, float]],
														
 
															+        base_conf_th: float,
														
 
															+    ) -> bool:
														
 
															+        if not self.second_pass_whole_fallback:
														
 
															+            return False
														
 
															+        if not line_text:
														
 
															+            return True
														
 
															+        if line_score < base_conf_th:
														
 
															+            return True
														
 
															+        if self._needs_strip_line_fallback(cell_img, line_blocks):
														
 
															+            return True
														
 
															+        if (
														
 
															+            line_text
														
 
															+            and line_score >= base_conf_th
														
 
															+            and len(line_text) < self.second_pass_enhance_min_chars
														
 
															+        ):
														
 
															+            return True
														
 
															+        return False
														
 
															+
														
 
															+    def _needs_enhance_retry(
														
 
															+        self,
														
 
															+        result: Dict[str, Any],
														
 
															+        cell_img: np.ndarray,
														
 
															+        dynamic_conf_th: float,
														
 
															+    ) -> Tuple[bool, List[str]]:
														
 
															+        if not self.second_pass_enhance_retry_enabled:
														
 
															+            return False, []
														
 
															+        reasons: List[str] = []
														
 
															+        text = (result.get("final_text") or "").strip()
														
 
															+        score = float(result.get("final_score") or 0.0)
														
 
															+        if not result.get("accepted", False):
														
 
															+            reasons.append("not_accepted")
														
 
															+        if score < self.second_pass_enhance_score_below:
														
 
															+            reasons.append("score_below_threshold")
														
 
															+        if text and len(text) < self.second_pass_enhance_min_chars:
														
 
															+            reasons.append("suspicious_short_text")
														
 
															+        h, w = cell_img.shape[:2]
														
 
															+        if (
														
 
															+            self.second_pass_enhance_short_tall
														
 
															+            and w > 0
														
 
															+            and h / w >= self.second_pass_strip_aspect
														
 
															+            and len(result.get("lines") or []) <= 1
														
 
															+            and len(text) < self.second_pass_enhance_min_chars + 2
														
 
															+        ):
														
 
															+            reasons.append("tall_cell_single_line")
														
 
															+        return bool(reasons), reasons
														
 
															+
														
 
															+    def _ocr_one_cell(
														
 
															+        self, cell_img: np.ndarray, base_conf_th: float
														
 
															+    ) -> Dict[str, Any]:
														
 
															+        line_entries = self._recognize_cell_lines_detailed(cell_img)
														
 
															+        line_blocks = [(ln["text"], ln["score"]) for ln in line_entries]
														
 
															+        line_text, line_score = self.aggregate_line_ocr(
														
 
															+            line_blocks,
														
 
															+            line_min_score=self.second_pass_line_min_score,
														
 
															+            drop_low_score_blocks=self.second_pass_drop_low,
														
 
															+        )
														
 
															+
														
 
															+        whole_text, whole_score = "", 0.0
														
 
															+        whole_skipped = "line_score_ok"
														
 
															+        run_whole = self._should_run_whole_fallback(
														
 
															+            line_text, line_score, cell_img, line_blocks, base_conf_th
														
 
															+        )
														
 
															+        if run_whole:
														
 
															+            whole_text, whole_score = self._recognize_whole_cell(cell_img)
														
 
															+            whole_skipped = None
														
 
															+        elif line_text and line_score >= base_conf_th:
														
 
															+            if len(line_text) < self.second_pass_enhance_min_chars:
														
 
															+                whole_skipped = "short_text_high_score"
														
 
															+            else:
														
 
															+                whole_skipped = "line_score>=%.2f" % base_conf_th
														
 
															+        else:
														
 
															+            whole_skipped = "line_score>=%.2f" % base_conf_th
														
 
															+
														
 
															+        strip_text, strip_score, strip_lines = "", 0.0, []
														
 
															+        if self._needs_strip_line_fallback(cell_img, line_blocks):
														
 
															+            strip_text, strip_score, strip_lines = self._recognize_strip_fallback(
														
 
															+                cell_img
														
 
															+            )
														
 
															+
														
 
															+        final_text, final_score, strategy = self._pick_line_vs_whole(
														
 
															+            line_text,
														
 
															+            line_score,
														
 
															+            whole_text,
														
 
															+            whole_score,
														
 
															+            strip_text,
														
 
															+            strip_score,
														
 
															+        )
														
 
															+        dynamic_conf_th = self.calculate_dynamic_confidence_threshold(
														
 
															+            final_text, base_conf_th
														
 
															+        )
														
 
															+        accepted = bool(final_text) and final_score >= dynamic_conf_th
														
 
															+        return {
														
 
															+            "lines": line_entries,
														
 
															+            "line_aggregate": {"text": line_text, "score": line_score},
														
 
															+            "whole": {
														
 
															+                "text": whole_text,
														
 
															+                "score": whole_score,
														
 
															+                "skipped": whole_skipped,
														
 
															+            },
														
 
															+            "strip": {
														
 
															+                "text": strip_text,
														
 
															+                "score": strip_score,
														
 
															+                "lines": strip_lines,
														
 
															+            },
														
 
															+            "final_text": final_text,
														
 
															+            "final_score": final_score,
														
 
															+            "strategy": strategy,
														
 
															+            "dynamic_conf_threshold": dynamic_conf_th,
														
 
															+            "accepted": accepted,
														
 
															+        }
														
 
															+
														
 
															+    def _infer_header_row(
														
 
															+        self,
														
 
															+        merged_cells: Optional[List[Dict[str, Any]]],
														
 
															+        texts: List[str],
														
 
															+        scores: List[float],
														
 
															+    ) -> int:
														
 
															+        if self.second_pass_header_row >= 0:
														
 
															+            return self.second_pass_header_row
														
 
															+        if not merged_cells:
														
 
															+            return 0
														
 
															+        row_scores: Dict[int, List[float]] = {}
														
 
															+        for i, cell in enumerate(merged_cells):
														
 
															+            row = int(cell.get("row", 0))
														
 
															+            t = (texts[i] if i < len(texts) else "").strip()
														
 
															+            sc = float(scores[i] if i < len(scores) else 0.0)
														
 
															+            if t:
														
 
															+                row_scores.setdefault(row, []).append(sc)
														
 
															+        if not row_scores:
														
 
															+            return 0
														
 
															+        best_row = 0
														
 
															+        best_avg = -1.0
														
 
															+        for row, scs in row_scores.items():
														
 
															+            avg = sum(scs) / len(scs)
														
 
															+            if avg > best_avg:
														
 
															+                best_avg = avg
														
 
															+                best_row = row
														
 
															+        return best_row
														
 
															+
														
 
															+    def _should_second_pass_cell(
														
 
															+        self,
														
 
															+        i: int,
														
 
															+        texts: List[str],
														
 
															+        scores: List[float],
														
 
															+        need_reocr_indices: List[int],
														
 
															+        merged_cells: Optional[List[Dict[str, Any]]],
														
 
															+        pdf_type: str,
														
 
															+        force_all: bool,
														
 
															+        header_row: int,
														
 
															+    ) -> Tuple[bool, List[str]]:
														
 
															+        reasons: List[str] = []
														
 
															+        t = texts[i] if i < len(texts) else ""
														
 
															+        sc = float(scores[i] if i < len(scores) else 0.0)
														
 
															+        bbox_row = None
														
 
															+        if merged_cells and i < len(merged_cells):
														
 
															+            bbox_row = int(merged_cells[i].get("row", 0))
														
 
															+
														
 
															+        if force_all:
														
 
															+            return True, ["force_all"]
														
 
															+        if i in need_reocr_indices:
														
 
															+            reasons.append("spanning_or_cross_cell")
														
 
															+        if sc < 0.90:
														
 
															+            reasons.append("low_first_pass_score")
														
 
															+        if merged_cells and i < len(merged_cells):
														
 
															+            bb = merged_cells[i].get("bbox") or []
														
 
															+            if len(bb) >= 4:
														
 
															+                w_box = bb[2] - bb[0]
														
 
															+                h_box = bb[3] - bb[1]
														
 
															+                if h_box > w_box * 2.5 and sc < 0.95:
														
 
															+                    reasons.append("tall_cell_low_score")
														
 
															+
														
 
															+        if self.second_pass_reocr_mode == "bank_statement" and merged_cells:
														
 
															+            if bbox_row is not None and bbox_row > header_row and not (t or "").strip():
														
 
															+                if "body_row_empty" not in reasons:
														
 
															+                    reasons.append("body_row_empty")
														
 
															+            if bbox_row is not None and bbox_row > header_row:
														
 
															+                same_row_nonempty = 0
														
 
															+                for j, other in enumerate(merged_cells):
														
 
															+                    if int(other.get("row", -1)) != bbox_row:
														
 
															+                        continue
														
 
															+                    ot = (texts[j] if j < len(texts) else "").strip()
														
 
															+                    if ot:
														
 
															+                        same_row_nonempty += 1
														
 
															+                if (
														
 
															+                    not (t or "").strip()
														
 
															+                    and same_row_nonempty >= self.second_pass_row_peer_min_nonempty
														
 
															+                    and "row_peer_nonempty" not in reasons
														
 
															+                ):
														
 
															+                    reasons.append("row_peer_nonempty")
														
 
															+
														
 
															+        if not reasons:
														
 
															+            if (not t or not t.strip()) and sc < 0.95 and pdf_type != "txt":
														
 
															+                reasons.append("empty_low_score")
														
 
															+
														
 
															+        return bool(reasons), reasons
														
 
															+
														
 
															+    def _save_cell_ocr_debug(
														
 
															+        self,
														
 
															+        cell_ocr_dir: str,
														
 
															+        cell_idx: int,
														
 
															+        debug_img: np.ndarray,
														
 
															+        result: Dict[str, Any],
														
 
															+        *,
														
 
															+        first_pass_text: str = "",
														
 
															+        first_pass_score: float = 0.0,
														
 
															+        trigger_reasons: Optional[List[str]] = None,
														
 
															+        bbox: Optional[List[float]] = None,
														
 
															+        pass_label: str = "",
														
 
															+    ) -> None:
														
 
															+        tag = self.sanitize_debug_filename(result.get("final_text") or "empty")
														
 
															+        strategy = result.get("strategy") or "empty"
														
 
															+        stem = f"cell{cell_idx:03d}"
														
 
															+        if pass_label:
														
 
															+            stem += f"_{pass_label}"
														
 
															+        stem += f"_{strategy}_{tag}"
														
 
															+        png_path = os.path.join(cell_ocr_dir, f"{stem}.png")
														
 
															+        try:
														
 
															+            cv2.imwrite(png_path, debug_img)
														
 
															+        except Exception as e:
														
 
															+            logger.warning(f"保存单元格OCR图片失败 (cell {cell_idx}): {e}")
														
 
															+            return
														
 
															+        payload = {
														
 
															+            "cell_idx": cell_idx,
														
 
															+            "bbox": bbox,
														
 
															+            "first_pass": {"text": first_pass_text, "score": first_pass_score},
														
 
															+            "trigger_reason": trigger_reasons or [],
														
 
															+            "lines": result.get("lines") or [],
														
 
															+            "line_aggregate": result.get("line_aggregate"),
														
 
															+            "whole": result.get("whole"),
														
 
															+            "strip": result.get("strip"),
														
 
															+            "final": {
														
 
															+                "text": result.get("final_text") or "",
														
 
															+                "score": result.get("final_score") or 0.0,
														
 
															+                "strategy": strategy,
														
 
															+                "accepted": result.get("accepted", False),
														
 
															+            },
														
 
															+            "dynamic_conf_threshold": result.get("dynamic_conf_threshold"),
														
 
															+            "pass1": result.get("pass1"),
														
 
															+            "pass2": result.get("pass2"),
														
 
															+            "enhance_retry": result.get("enhance_retry"),
														
 
															+            "preprocess_stages": result.get("preprocess_stages") or [],
														
 
															+        }
														
 
															+        json_path = os.path.join(cell_ocr_dir, f"{stem}.json")
														
 
															+        try:
														
 
															+            with open(json_path, "w", encoding="utf-8") as f:
														
 
															+                json.dump(payload, f, ensure_ascii=False, indent=2)
														
 
															+        except Exception as e:
														
 
															+            logger.warning(f"保存单元格OCR JSON失败 (cell {cell_idx}): {e}")
														
 
															     @staticmethod
														
 
															     def calculate_dynamic_confidence_threshold(text: str, base_threshold: float = 0.9) -> float:
														
@@ -683,34 +1188,6 @@ class TextFiller:
 
															                 processed_ocr_indices.add(ocr_idx)
														
 
															-        # 已匹配到单元格但 OCR box 宽度明显超出单元格（漏检跨格的补充）
														
 
															-        # for cell_idx, cell_bbox in enumerate(bboxes):
														
 
															-        #     if not matched_boxes_list[cell_idx]:
														
 
															-        #         continue
														
 
															-        #     cell_w = cell_bbox[2] - cell_bbox[0]
														
 
															-        #     if cell_w <= 0:
														
 
															-        #         continue
														
 
															-        #     for box in matched_boxes_list[cell_idx]:
														
 
															-        #         ocr_bbox = CoordinateUtils.poly_to_bbox(box.get("bbox", []))
														
 
															-        #         if not ocr_bbox or len(ocr_bbox) < 4:
														
 
															-        #             continue
														
 
															-        #         ocr_w = ocr_bbox[2] - ocr_bbox[0]
														
 
															-        #         if ocr_w <= cell_w * self.ocr_bbox_width_overflow_ratio:
														
 
															-        #             continue
														
 
															-        #         cx = (ocr_bbox[0] + ocr_bbox[2]) / 2
														
 
															-        #         cy = (ocr_bbox[1] + ocr_bbox[3]) / 2
														
 
															-        #         spanning = self.detect_ocr_box_spanning_cells(
														
 
															-        #             ocr_bbox, bboxes, center_point=(cx, cy)
														
 
															-        #         )
														
 
															-        #         targets = spanning if len(spanning) >= 2 else [cell_idx]
														
 
															-        #         for tidx in targets:
														
 
															-        #             if tidx not in need_reocr_indices:
														
 
															-        #                 need_reocr_indices.append(tidx)
														
 
															-        #         logger.debug(
														
 
															-        #             f"OCR box 宽度({ocr_w:.0f})超出单元格{cell_idx}宽度({cell_w:.0f})，"
														
 
															-        #             f"标记重识别: {targets}"
														
 
															-        #         )
														
 
															-        
														
 
															         return texts, scores, matched_boxes_list, need_reocr_indices
														
 
															     @staticmethod
														
@@ -864,10 +1341,11 @@ class TextFiller:
 
															         force_all: bool = False,
														
 
															         output_dir: Optional[str] = None,
														
 
															         debug_prefix: Optional[str] = None,
														
 
															+        merged_cells: Optional[List[Dict[str, Any]]] = None,
														
 
															     ) -> List[str]:
														
 
															         """
														
 
															-        二次OCR：分行 det+rec（低分块丢弃、长度加权置信度）+ 整格 det=False 兜底择优。
														
 
															-        debug 图落盘至 output_dir/{debug_prefix}/cell{idx}_{text}.png
														
 
															+        二次OCR：分行 det+rec + 整格/条带兜底 + 低分笔画增强重试。
														
 
															+        debug: output_dir/{debug_prefix}/cell{idx}_{strategy}_{tag}.png + 同名 .json
														
 
															         """
														
 
															         try:
														
 
															             if not self.ocr_engine:
														
@@ -888,28 +1366,21 @@ class TextFiller:
 
															             h_img, w_img = table_image.shape[:2]
														
 
															             margin = self.cell_crop_margin
														
 
															-            trigger_score_thresh = 0.90
														
 
															-
														
 
															-            crop_list: List[np.ndarray] = []
														
 
															-            crop_indices: List[int] = []
														
 
															-
														
 
															-            for i, t in enumerate(texts):
														
 
															-                bbox = bboxes[i]
														
 
															-                w_box = bbox[2] - bbox[0]
														
 
															-                h_box = bbox[3] - bbox[1]
														
 
															-
														
 
															-                need_reocr = False
														
 
															-                if force_all:
														
 
															-                    need_reocr = True
														
 
															-                elif i in need_reocr_indices:
														
 
															-                    need_reocr = True
														
 
															-                elif (not t or not t.strip()) and scores[i] < 0.95:
														
 
															-                    need_reocr = pdf_type != 'txt'
														
 
															-                elif scores[i] < trigger_score_thresh:
														
 
															-                    need_reocr = True
														
 
															-                elif h_box > w_box * 2.5 and scores[i] < 0.95:
														
 
															-                    need_reocr = True
														
 
															+            header_row = self._infer_header_row(merged_cells, texts, scores)
														
 
															+
														
 
															+            jobs: List[Tuple[int, np.ndarray, List[str], List[float]]] = []
														
 
															+            for i, _t in enumerate(texts):
														
 
															+                need_reocr, trigger_reasons = self._should_second_pass_cell(
														
 
															+                    i,
														
 
															+                    texts,
														
 
															+                    scores,
														
 
															+                    need_reocr_indices,
														
 
															+                    merged_cells,
														
 
															+                    pdf_type,
														
 
															+                    force_all,
														
 
															+                    header_row,
														
 
															+                )
														
 
															                 if not need_reocr or i >= len(bboxes):
														
 
															                     continue
														
@@ -921,61 +1392,92 @@ class TextFiller:
 
															                 if x2 <= x1 or y2 <= y1:
														
 
															                     continue
														
 
															-                cell_img = table_image[y1:y2, x1:x2]
														
 
															-                if cell_img.size == 0:
														
 
															+                raw_crop = table_image[y1:y2, x1:x2]
														
 
															+                if raw_crop.size == 0:
														
 
															                     continue
														
 
															-                ch, cw = cell_img.shape[:2]
														
 
															-                if ch < 64 or cw < 64:
														
 
															-                    cell_img = cv2.resize(
														
 
															-                        cell_img, None, fx=2.0, fy=2.0, interpolation=cv2.INTER_CUBIC
														
 
															-                    )
														
 
															-                    logger.debug(
														
 
															-                        f"单元格 {i} 裁剪过小，放大至 {cell_img.shape[1]}x{cell_img.shape[0]} 像素"
														
 
															-                    )
														
 
															-
														
 
															-                crop_list.append(cell_img)
														
 
															-                crop_indices.append(i)
														
 
															+                fp_text = texts[i] if i < len(texts) else ""
														
 
															+                fp_score = float(scores[i] if i < len(scores) else 0.0)
														
 
															+                jobs.append((i, raw_crop, trigger_reasons, [fp_text, fp_score, bboxes[i]]))
														
 
															-            if not crop_list:
														
 
															+            if not jobs:
														
 
															                 return texts
														
 
															-            logger.info(f"触发二次OCR: {len(crop_list)} 个单元格 (总数 {len(texts)})")
														
 
															+            logger.info(
														
 
															+                f"触发二次OCR: {len(jobs)} 个单元格 (总数 {len(texts)}, "
														
 
															+                f"mode={self.second_pass_reocr_mode}, header_row={header_row})"
														
 
															+            )
														
 
															             base_conf_th = self.ocr_conf_threshold
														
 
															-            line_min = self.second_pass_line_min_score
														
 
															-            drop_low = self.second_pass_drop_low
														
 
															-            for k, cell_img in enumerate(crop_list):
														
 
															-                cell_idx = crop_indices[k]
														
 
															-
														
 
															-                line_blocks = self._recognize_cell_lines(cell_img)
														
 
															-                line_text, line_score = self.aggregate_line_ocr(
														
 
															-                    line_blocks,
														
 
															-                    line_min_score=line_min,
														
 
															-                    drop_low_score_blocks=drop_low,
														
 
															+            for cell_idx, raw_crop, trigger_reasons, meta in jobs:
														
 
															+                fp_text, fp_score, cell_bbox = meta[0], float(meta[1]), meta[2]
														
 
															+                cell_img, preprocess_stages = self._preprocess_cell_for_ocr(
														
 
															+                    raw_crop, mode="light"
														
 
															                 )
														
 
															+                pass1 = self._ocr_one_cell(cell_img, base_conf_th)
														
 
															+                pass1["preprocess_stages"] = list(preprocess_stages)
														
 
															+                pass1["pass1"] = {
														
 
															+                    "text": pass1.get("final_text"),
														
 
															+                    "score": pass1.get("final_score"),
														
 
															+                    "strategy": pass1.get("strategy"),
														
 
															+                }
														
 
															-                whole_text, whole_score = ("", 0.0)
														
 
															-                if self.second_pass_whole_fallback and line_score < base_conf_th:
														
 
															-                    whole_text, whole_score = self._recognize_whole_cell(cell_img)
														
 
															+                result = dict(pass1)
														
 
															+                enhance_info: Dict[str, Any] = {"triggered": False, "reason": []}
														
 
															+                dyn_th = float(pass1.get("dynamic_conf_threshold") or base_conf_th)
														
 
															+                do_retry, retry_reasons = self._needs_enhance_retry(
														
 
															+                    pass1, cell_img, dyn_th
														
 
															+                )
														
 
															+                if do_retry:
														
 
															+                    enhance_info["triggered"] = True
														
 
															+                    enhance_info["reason"] = retry_reasons
														
 
															+                    enhanced_img, enhance_stages = self._preprocess_cell_for_ocr(
														
 
															+                        raw_crop, mode="enhance"
														
 
															+                    )
														
 
															+                    pass2 = self._ocr_one_cell(enhanced_img, base_conf_th)
														
 
															+                    pass2["preprocess_stages"] = list(enhance_stages)
														
 
															+                    pass2["pass2"] = {
														
 
															+                        "text": pass2.get("final_text"),
														
 
															+                        "score": pass2.get("final_score"),
														
 
															+                        "strategy": pass2.get("strategy"),
														
 
															+                    }
														
 
															+                    result = self._pick_better_ocr_result(pass1, pass2)
														
 
															+                    result["pass1"] = pass1.get("pass1")
														
 
															+                    result["pass2"] = pass2.get("pass2")
														
 
															+                    enhance_info["pass2"] = result.get("pass2")
														
 
															+                result["enhance_retry"] = enhance_info
														
 
															-                final_text, final_score, strategy = self._pick_line_vs_whole(
														
 
															-                    line_text, line_score, whole_text, whole_score
														
 
															+                debug_img, _ = self._preprocess_cell_for_ocr(
														
 
															+                    raw_crop, mode="enhance" if enhance_info["triggered"] else "light"
														
 
															                 )
														
 
															+                if cell_ocr_dir:
														
 
															+                    self._save_cell_ocr_debug(
														
 
															+                        cell_ocr_dir,
														
 
															+                        cell_idx,
														
 
															+                        debug_img,
														
 
															+                        result,
														
 
															+                        first_pass_text=fp_text,
														
 
															+                        first_pass_score=fp_score,
														
 
															+                        trigger_reasons=trigger_reasons,
														
 
															+                        bbox=cell_bbox,
														
 
															+                    )
														
 
															-                if cell_ocr_dir and cell_img is not None:
														
 
															-                    try:
														
 
															-                        tag = self.sanitize_debug_filename(final_text or "empty")
														
 
															-                        filename = f"cell{cell_idx:03d}_{strategy}_{tag}.png"
														
 
															-                        cv2.imwrite(os.path.join(cell_ocr_dir, filename), cell_img)
														
 
															-                    except Exception as e:
														
 
															-                        logger.warning(f"保存单元格OCR图片失败 (cell {cell_idx}): {e}")
														
 
															+                final_text = (result.get("final_text") or "").strip()
														
 
															+                final_score = float(result.get("final_score") or 0.0)
														
 
															+                strategy = result.get("strategy") or "empty"
														
 
															                 if not final_text:
														
 
															+                    logger.debug(
														
 
															+                        f"单元格 {cell_idx} 二次OCR({strategy}) 无文本, "
														
 
															+                        f"trigger={trigger_reasons}"
														
 
															+                    )
														
 
															                     continue
														
 
															-                dynamic_conf_th = self.calculate_dynamic_confidence_threshold(
														
 
															-                    final_text, base_conf_th
														
 
															+                dynamic_conf_th = float(
														
 
															+                    result.get("dynamic_conf_threshold")
														
 
															+                    or self.calculate_dynamic_confidence_threshold(
														
 
															+                        final_text, base_conf_th
														
 
															+                    )
														
 
															                 )
														
 
															                 if final_score >= dynamic_conf_th:
														
 
															                     texts[cell_idx] = final_text