Sfoglia il codice sorgente

Merge pull request #1048 from myhloli/dev

fix(remove_overlaps_min_spans): optimize overlap detection in OCR span list modification
Xiaomeng Zhao 1 anno fa
parent
commit
a703e527d9
1 file modificato con 9 aggiunte e 13 eliminazioni
  1. 9 13
      magic_pdf/pre_proc/ocr_span_list_modify.py

+ 9 - 13
magic_pdf/pre_proc/ocr_span_list_modify.py

@@ -42,19 +42,15 @@ def remove_overlaps_min_spans(spans):
     for span1 in spans:
         for span2 in spans:
             if span1 != span2:
-                overlap_box = get_minbox_if_overlap_by_ratio(
-                    span1['bbox'], span2['bbox'], 0.65
-                )
-                if overlap_box is not None:
-                    span_need_remove = next(
-                        (span for span in spans if span['bbox'] == overlap_box), None
-                    )
-                    if (
-                        span_need_remove is not None
-                        and span_need_remove not in dropped_spans
-                    ):
-                        dropped_spans.append(span_need_remove)
-
+                # span1 或 span2 任何一个都不应该在 dropped_spans 中
+                if span1 in dropped_spans or span2 in dropped_spans:
+                    continue
+                else:
+                    overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65)
+                    if overlap_box is not None:
+                        span_need_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
+                        if span_need_remove is not None and span_need_remove not in dropped_spans:
+                            dropped_spans.append(span_need_remove)
     if len(dropped_spans) > 0:
         for span_need_remove in dropped_spans:
             spans.remove(span_need_remove)