Explorar o código

fix(优化文本填充逻辑): 更新 TextFiller 类中的文本填充逻辑:增加对 OCR 结果不完整的判断;优化返回的原因列表(命中首个原因后立即返回);提升 OCR 处理的准确性与灵活性。

zhch158_admin hai 7 horas
pai
achega
ac90e7c976

+ 9 - 3
ocr_tools/universal_doc_parser/models/adapters/wired_table/text_filling.py

@@ -806,9 +806,11 @@ class TextFiller:
         if force_all:
             return True, ["force_all"]
         if i in need_reocr_indices:
-            reasons.append("spanning_or_cross_cell")
+            reasons.append("incomplete_vertical_ocr_box, or spanning_cross_cell")
+            return bool(reasons), reasons
         if sc < 0.90:
             reasons.append("low_first_pass_score")
+            return bool(reasons), reasons
         if merged_cells and i < len(merged_cells):
             bb = merged_cells[i].get("bbox") or []
             if len(bb) >= 4:
@@ -816,6 +818,7 @@ class TextFiller:
                 h_box = bb[3] - bb[1]
                 if h_box > w_box * 2.5 and sc < 0.95:
                     reasons.append("tall_cell_low_score")
+                    return bool(reasons), reasons
 
         # ── bank_statement 空单元格逻辑:仅对无值的格生效 ──
         cell_empty = not (t or "").strip()
@@ -823,6 +826,7 @@ class TextFiller:
             if bbox_row is not None and bbox_row == header_row:
                 if "header_row_empty" not in reasons:
                     reasons.append("header_row_empty")
+                    return bool(reasons), reasons
             elif bbox_row is not None and bbox_row > header_row:
                 if bbox_col is not None and bbox_col >= 0:
                     col_empty_ratio = self._column_empty_ratio(
@@ -832,16 +836,18 @@ class TextFiller:
                         # 该列表体大部分格子有值 → 本格为空可能是 OCR 遗漏
                         if "body_row_empty_column_mostly_filled" not in reasons:
                             reasons.append("body_row_empty_column_mostly_filled")
+                            return bool(reasons), reasons
                     else:
                         # 该列表体本来就多为空 → 不触发二次 OCR
                         return False, []
                 else:
                     if "body_row_empty" not in reasons:
                         reasons.append("body_row_empty")
-
+                        return bool(reasons), reasons
         if not reasons:
             if cell_empty and sc < 0.95 and pdf_type != "txt":
                 reasons.append("empty_low_score")
+                return bool(reasons), reasons
 
         return bool(reasons), reasons
 
@@ -1376,7 +1382,7 @@ class TextFiller:
                     scores[idx] = 0.0
                     matched_boxes_list[idx] = []
                     if idx not in need_reocr_indices:
-                        logger.debug(f"单元格[{idx}]检测到 OCR box 纵向不完整,需要二次 OCR: {ocr_item['text'][:20]}...")
+                        logger.debug(f"单元格[{idx}]检测到 OCR box 纵向不完整,需要二次 OCR: {matched[0][0][:20]}...")
                         need_reocr_indices.append(idx)
                 else:
                     matched, force_zero_score = self._resolve_cell_matched_boxes(