|
|
@@ -424,26 +424,26 @@ class TextFiller:
|
|
|
need_reocr = True
|
|
|
reocr_reason = "强制全量OCR"
|
|
|
else:
|
|
|
- # 1. 文本为空且置信度不是极高
|
|
|
- if (not t or not t.strip()) and scores[i] < 0.95:
|
|
|
+ # 1. OCR 误合并:OCR box 跨多个单元格或过大, 跨单元格中的一个单元格的文本可能是''
|
|
|
+ if i in need_reocr_indices:
|
|
|
+ need_reocr = True
|
|
|
+ reocr_reason = "OCR误合并"
|
|
|
+ # 2. 文本为空且置信度不是极高
|
|
|
+ elif (not t or not t.strip()) and scores[i] < 0.95:
|
|
|
if pdf_type == 'txt':
|
|
|
# PDF文本模式下,空文本不触发二次OCR
|
|
|
need_reocr = False
|
|
|
else:
|
|
|
need_reocr = True
|
|
|
reocr_reason = "空文本"
|
|
|
- # 2. 置信度过低
|
|
|
+ # 3. 置信度过低
|
|
|
elif scores[i] < trigger_score_thresh:
|
|
|
need_reocr = True
|
|
|
reocr_reason = "低置信度"
|
|
|
- # 3. 竖排单元格 (高宽比 > 2.5) 且置信度不是极高
|
|
|
+ # 4. 竖排单元格 (高宽比 > 2.5) 且置信度不是极高
|
|
|
elif h_box > w_box * 2.5 and scores[i] < 0.95:
|
|
|
need_reocr = True
|
|
|
reocr_reason = "竖排文本"
|
|
|
- # 4. OCR 误合并:OCR box 跨多个单元格或过大
|
|
|
- elif i in need_reocr_indices:
|
|
|
- need_reocr = True
|
|
|
- reocr_reason = "OCR误合并"
|
|
|
|
|
|
if not need_reocr:
|
|
|
continue
|