|
|
@@ -806,9 +806,11 @@ class TextFiller:
|
|
|
if force_all:
|
|
|
return True, ["force_all"]
|
|
|
if i in need_reocr_indices:
|
|
|
- reasons.append("spanning_or_cross_cell")
|
|
|
+ reasons.append("incomplete_vertical_ocr_box, or spanning_cross_cell")
|
|
|
+ return bool(reasons), reasons
|
|
|
if sc < 0.90:
|
|
|
reasons.append("low_first_pass_score")
|
|
|
+ return bool(reasons), reasons
|
|
|
if merged_cells and i < len(merged_cells):
|
|
|
bb = merged_cells[i].get("bbox") or []
|
|
|
if len(bb) >= 4:
|
|
|
@@ -816,6 +818,7 @@ class TextFiller:
|
|
|
h_box = bb[3] - bb[1]
|
|
|
if h_box > w_box * 2.5 and sc < 0.95:
|
|
|
reasons.append("tall_cell_low_score")
|
|
|
+ return bool(reasons), reasons
|
|
|
|
|
|
# ── bank_statement 空单元格逻辑:仅对无值的格生效 ──
|
|
|
cell_empty = not (t or "").strip()
|
|
|
@@ -823,6 +826,7 @@ class TextFiller:
|
|
|
if bbox_row is not None and bbox_row == header_row:
|
|
|
if "header_row_empty" not in reasons:
|
|
|
reasons.append("header_row_empty")
|
|
|
+ return bool(reasons), reasons
|
|
|
elif bbox_row is not None and bbox_row > header_row:
|
|
|
if bbox_col is not None and bbox_col >= 0:
|
|
|
col_empty_ratio = self._column_empty_ratio(
|
|
|
@@ -832,16 +836,18 @@ class TextFiller:
|
|
|
# 该列表体大部分格子有值 → 本格为空可能是 OCR 遗漏
|
|
|
if "body_row_empty_column_mostly_filled" not in reasons:
|
|
|
reasons.append("body_row_empty_column_mostly_filled")
|
|
|
+ return bool(reasons), reasons
|
|
|
else:
|
|
|
# 该列表体本来就多为空 → 不触发二次 OCR
|
|
|
return False, []
|
|
|
else:
|
|
|
if "body_row_empty" not in reasons:
|
|
|
reasons.append("body_row_empty")
|
|
|
-
|
|
|
+ return bool(reasons), reasons
|
|
|
if not reasons:
|
|
|
if cell_empty and sc < 0.95 and pdf_type != "txt":
|
|
|
reasons.append("empty_low_score")
|
|
|
+ return bool(reasons), reasons
|
|
|
|
|
|
return bool(reasons), reasons
|
|
|
|
|
|
@@ -1376,7 +1382,7 @@ class TextFiller:
|
|
|
scores[idx] = 0.0
|
|
|
matched_boxes_list[idx] = []
|
|
|
if idx not in need_reocr_indices:
|
|
|
- logger.debug(f"单元格[{idx}]检测到 OCR box 纵向不完整,需要二次 OCR: {ocr_item['text'][:20]}...")
|
|
|
+ logger.debug(f"单元格[{idx}]检测到 OCR box 纵向不完整,需要二次 OCR: {matched[0][0][:20]}...")
|
|
|
need_reocr_indices.append(idx)
|
|
|
else:
|
|
|
matched, force_zero_score = self._resolve_cell_matched_boxes(
|