瀏覽代碼

fix(remove_overlaps_min_spans): optimize overlap detection in OCR span list modification

- Improve logic to skip dropped spans in overlap detection
- Enhance efficiency by avoiding unnecessary comparisons
myhloli 1 年之前
父節點
當前提交
e4810cee17
共有 1 個文件被更改,包括 9 次插入和 5 次刪除
  1. 9 5
      magic_pdf/pre_proc/ocr_span_list_modify.py

+ 9 - 5
magic_pdf/pre_proc/ocr_span_list_modify.py

@@ -38,11 +38,15 @@ def remove_overlaps_min_spans(spans):
     for span1 in spans:
         for span2 in spans:
             if span1 != span2:
-                overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65)
-                if overlap_box is not None:
-                    span_need_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
-                    if span_need_remove is not None and span_need_remove not in dropped_spans:
-                        dropped_spans.append(span_need_remove)
+                # span1 或 span2 任何一个都不应该在 dropped_spans 中
+                if span1 in dropped_spans or span2 in dropped_spans:
+                    continue
+                else:
+                    overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65)
+                    if overlap_box is not None:
+                        span_need_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
+                        if span_need_remove is not None and span_need_remove not in dropped_spans:
+                            dropped_spans.append(span_need_remove)
 
     if len(dropped_spans) > 0:
         for span_need_remove in dropped_spans: