Просмотр исходного кода

feat: 更新二次OCR填充逻辑,增加OCR误合并检测和文本空值处理

zhch158_admin 3 дней назад
Родитель
Коммит
975ab2f230

+ 8 - 8
ocr_tools/universal_doc_parser/models/adapters/wired_table/text_filling.py

@@ -424,26 +424,26 @@ class TextFiller:
                     need_reocr = True
                     reocr_reason = "强制全量OCR"
                 else:
-                    # 1. 文本为空且置信度不是极高
-                    if (not t or not t.strip()) and scores[i] < 0.95:
+                    # 1. OCR 误合并:OCR box 跨多个单元格或过大, 跨单元格中的一个单元格的文本可能是''
+                    if i in need_reocr_indices:
+                        need_reocr = True
+                        reocr_reason = "OCR误合并"
+                    # 2. 文本为空且置信度不是极高
+                    elif (not t or not t.strip()) and scores[i] < 0.95:
                         if pdf_type == 'txt':
                             # PDF文本模式下,空文本不触发二次OCR
                             need_reocr = False
                         else:
                             need_reocr = True
                             reocr_reason = "空文本"
-                    # 2. 置信度过低
+                    # 3. 置信度过低
                     elif scores[i] < trigger_score_thresh:
                         need_reocr = True
                         reocr_reason = "低置信度"
-                    # 3. 竖排单元格 (高宽比 > 2.5) 且置信度不是极高
+                    # 4. 竖排单元格 (高宽比 > 2.5) 且置信度不是极高
                     elif h_box > w_box * 2.5 and scores[i] < 0.95:
                         need_reocr = True
                         reocr_reason = "竖排文本"
-                    # 4. OCR 误合并:OCR box 跨多个单元格或过大
-                    elif i in need_reocr_indices:
-                        need_reocr = True
-                        reocr_reason = "OCR误合并"
 
                 if not need_reocr:
                     continue