ocr_remove_spans.py 551 B

123456789101112131415161718
  1. from magic_pdf.libs.boxbase import _is_in_or_part_overlap
  2. def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
  3. # 遍历spans, 判断是否在removed_span_block_bboxes中
  4. # 如果是, 则删除该span
  5. # 否则, 保留该span
  6. need_remove_spans = []
  7. for span in spans:
  8. for bbox in need_remove_spans_bboxes:
  9. if _is_in_or_part_overlap(span['bbox'], bbox):
  10. need_remove_spans.append(span)
  11. break
  12. for span in need_remove_spans:
  13. spans.remove(span)
  14. return spans