Bläddra i källkod

fix(magic_pdf): prevent removal of low-confidence spans already dropped

myhloli 1 år sedan
förälder
incheckning
5f992de4da
1 ändrade filer med 12 tillägg och 7 borttagningar
  1. 12 7
      magic_pdf/pre_proc/ocr_span_list_modify.py

+ 12 - 7
magic_pdf/pre_proc/ocr_span_list_modify.py

@@ -5,19 +5,24 @@ from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, g
 from magic_pdf.libs.drop_tag import DropTag
 from magic_pdf.libs.ocr_content_type import ContentType, BlockType
 
+
 def remove_overlaps_low_confidence_spans(spans):
     dropped_spans = []
     #  删除重叠spans中置信度低的那些
     for span1 in spans:
         for span2 in spans:
             if span1 != span2:
-                if calculate_iou(span1['bbox'], span2['bbox']) > 0.9:
-                    if span1['score'] < span2['score']:
-                        span_need_remove = span1
-                    else:
-                        span_need_remove = span2
-                    if span_need_remove is not None and span_need_remove not in dropped_spans:
-                        dropped_spans.append(span_need_remove)
+                # span1 或 span2 任何一个都不应该在 dropped_spans 中
+                if span1 in dropped_spans or span2 in dropped_spans:
+                    continue
+                else:
+                    if calculate_iou(span1['bbox'], span2['bbox']) > 0.9:
+                        if span1['score'] < span2['score']:
+                            span_need_remove = span1
+                        else:
+                            span_need_remove = span2
+                        if span_need_remove is not None and span_need_remove not in dropped_spans:
+                            dropped_spans.append(span_need_remove)
 
     if len(dropped_spans) > 0:
         for span_need_remove in dropped_spans: