Kaynağa Gözat

feat: introduce OcrConfidence class and update confidence threshold checks in OCR processing

myhloli 5 ay önce
ebeveyn
işleme
59d8f105e5

+ 3 - 3
mineru/backend/pipeline/batch_analyze.py

@@ -5,8 +5,8 @@ from collections import defaultdict
 import numpy as np
 
 from .model_init import AtomModelSingleton
-from ...utils.model_utils import crop_img, get_res_list_from_layout_res, get_coords_and_area
-from ...utils.ocr_utils import get_adjusted_mfdetrec_res, get_ocr_result_list
+from ...utils.model_utils import crop_img, get_res_list_from_layout_res
+from ...utils.ocr_utils import get_adjusted_mfdetrec_res, get_ocr_result_list, OcrConfidence
 
 YOLO_LAYOUT_BASE_BATCH_SIZE = 1
 MFD_BASE_BATCH_SIZE = 1
@@ -315,7 +315,7 @@ class BatchAnalyze:
                         ocr_text, ocr_score = ocr_res_list[index]
                         layout_res_item['text'] = ocr_text
                         layout_res_item['score'] = float(f"{ocr_score:.3f}")
-                        if ocr_score < 0.6:
+                        if ocr_score < OcrConfidence.min_confidence:
                             layout_res_item['category_id'] = 16
 
                     total_processed += len(img_crop_list)

+ 2 - 1
mineru/backend/pipeline/model_json_to_middle_json.py

@@ -14,6 +14,7 @@ from mineru.utils.enum_class import ContentType
 from mineru.utils.llm_aided import llm_aided_title
 from mineru.utils.model_utils import clean_memory
 from mineru.backend.pipeline.pipeline_magic_model import MagicModel
+from mineru.utils.ocr_utils import OcrConfidence
 from mineru.utils.span_block_fix import fill_spans_in_blocks, fix_discarded_block, fix_block_spans
 from mineru.utils.span_pre_proc import remove_outside_spans, remove_overlaps_low_confidence_spans, \
     remove_overlaps_min_spans, txt_spans_extract
@@ -208,7 +209,7 @@ def result_to_middle_json(model_list, images_list, pdf_doc, image_writer, lang=N
             need_ocr_list), f'ocr_res_list: {len(ocr_res_list)}, need_ocr_list: {len(need_ocr_list)}'
         for index, span in enumerate(need_ocr_list):
             ocr_text, ocr_score = ocr_res_list[index]
-            if ocr_score > 0.6:
+            if ocr_score > OcrConfidence.min_confidence:
                 span['content'] = ocr_text
                 span['score'] = float(f"{ocr_score:.3f}")
             else:

+ 11 - 1
mineru/utils/ocr_utils.py

@@ -4,6 +4,11 @@ import cv2
 import numpy as np
 
 
+class OcrConfidence:  # Shared thresholds used when accepting or rejecting OCR results.
+    min_confidence = 0.68  # Scores below this are treated as low confidence: skipped in get_ocr_result_list, reclassified (category_id 16) in batch analysis.
+    min_width = 3  # Boxes whose horizontal extent (p3.x - p1.x) is below this are skipped; presumably pixels - TODO confirm.
+
+
 def merge_spans_to_line(spans, threshold=0.6):
     if len(spans) == 0:
         return []
@@ -304,7 +309,7 @@ def get_ocr_result_list(ocr_res, useful_list, ocr_enable, new_image, lang):
             p1, p2, p3, p4 = box_ocr_res[0]
             text, score = box_ocr_res[1]
             # logger.info(f"text: {text}, score: {score}")
-            if score < 0.6:  # 过滤低置信度的结果
+            if score < OcrConfidence.min_confidence:  # 过滤低置信度的结果
                 continue
         else:
             p1, p2, p3, p4 = box_ocr_res
@@ -317,6 +322,11 @@ def get_ocr_result_list(ocr_res, useful_list, ocr_enable, new_image, lang):
         # average_angle_degrees = calculate_angle_degrees(box_ocr_res[0])
         # if average_angle_degrees > 0.5:
         poly = [p1, p2, p3, p4]
+
+        if (p3[0] - p1[0]) < OcrConfidence.min_width:
+            # logger.info(f"width too small: {p3[0] - p1[0]}, text: {text}")
+            continue
+
         if calculate_is_angle(poly):
             # logger.info(f"average_angle_degrees: {average_angle_degrees}, text: {text}")
             # 与x轴的夹角超过0.5度,对边界做一下矫正