Ver código fonte

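fix remove error: skip blocks/spans already queued for removal, so the same item is not appended to a removal list twice (a duplicate entry would presumably make the later list.remove() call fail)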

赵小蒙 1 ano atrás
pai
commit
f70289f99e

+ 4 - 2
magic_pdf/pre_proc/ocr_detect_all_bboxes.py

@@ -71,7 +71,9 @@ def remove_need_drop_blocks(all_bboxes, discarded_blocks):
         for discarded_block in discarded_blocks:
             block_bbox = block[:4]
             if calculate_overlap_area_in_bbox1_area_ratio(block_bbox, discarded_block['bbox']) > 0.6:
-                need_remove.append(block)
+                if block not in need_remove:
+                    need_remove.append(block)
+                    break
 
     if len(need_remove) > 0:
         for block in need_remove:
@@ -90,7 +92,7 @@ def remove_overlaps_min_blocks(all_bboxes):
                 overlap_box = get_minbox_if_overlap_by_ratio(block1_bbox, block2_bbox, 0.8)
                 if overlap_box is not None:
                     bbox_to_remove = next((block for block in all_bboxes if block[:4] == overlap_box), None)
-                    if bbox_to_remove is not None:
+                    if bbox_to_remove is not None and bbox_to_remove not in need_remove:
                         need_remove.append(bbox_to_remove)
 
     if len(need_remove) > 0:

+ 9 - 8
magic_pdf/pre_proc/ocr_span_list_modify.py

@@ -14,14 +14,14 @@ def remove_overlaps_min_spans(spans):
             if span1 != span2:
                 overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65)
                 if overlap_box is not None:
-                    bbox_to_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
-                    if bbox_to_remove is not None:
-                        dropped_spans.append(bbox_to_remove)
+                    span_need_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
+                    if span_need_remove is not None and span_need_remove not in dropped_spans:
+                        dropped_spans.append(span_need_remove)
 
     if len(dropped_spans) > 0:
-        for dropped_span in dropped_spans:
-            spans.remove(dropped_span)
-            dropped_span['tag'] = DropTag.SPAN_OVERLAP
+        for span_need_remove in dropped_spans:
+            spans.remove(span_need_remove)
+            span_need_remove['tag'] = DropTag.SPAN_OVERLAP
 
     return spans, dropped_spans
 
@@ -33,8 +33,9 @@ def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
     for span in spans:
         for removed_bbox in need_remove_spans_bboxes:
             if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5:
-                need_remove_spans.append(span)
-                break
+                if span not in need_remove_spans:
+                    need_remove_spans.append(span)
+                    break
 
     if len(need_remove_spans) > 0:
         for span in need_remove_spans: