ocr_remove_spans.py 607 B

1234567891011121314151617
  1. from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio
  2. def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
  3. # 遍历spans, 判断是否在removed_span_block_bboxes中
  4. # 如果是, 则删除该span 否则, 保留该span
  5. need_remove_spans = []
  6. for span in spans:
  7. for removed_bbox in need_remove_spans_bboxes:
  8. if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5:
  9. need_remove_spans.append(span)
  10. break
  11. for span in need_remove_spans:
  12. spans.remove(span)
  13. return spans