# ocr_span_list_modify.py (2.0 KB)
  1. from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, get_minbox_if_overlap_by_ratio
  2. def remove_overlaps_min_spans(spans):
  3. # 删除重叠spans中较小的那些
  4. for span1 in spans.copy():
  5. for span2 in spans.copy():
  6. if span1 != span2:
  7. overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65)
  8. if overlap_box is not None:
  9. bbox_to_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
  10. if bbox_to_remove is not None:
  11. spans.remove(bbox_to_remove)
  12. return spans
  13. def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
  14. # 遍历spans, 判断是否在removed_span_block_bboxes中
  15. # 如果是, 则删除该span 否则, 保留该span
  16. need_remove_spans = []
  17. for span in spans:
  18. for removed_bbox in need_remove_spans_bboxes:
  19. if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5:
  20. need_remove_spans.append(span)
  21. break
  22. for span in need_remove_spans:
  23. spans.remove(span)
  24. return spans
  25. def adjust_bbox_for_standalone_block(spans):
  26. # 对tpye=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
  27. for sb_span in spans:
  28. if sb_span['type'] in ["displayed_equation", "image", "table"]:
  29. for text_span in spans:
  30. if text_span['type'] in ['text', 'inline_equation']:
  31. # 判断span2的纵向高度是否被span所覆盖
  32. if sb_span['bbox'][1] < text_span['bbox'][1] and sb_span['bbox'][3] > text_span['bbox'][3]:
  33. # 判断span2是否在span左边
  34. if text_span['bbox'][0] < sb_span['bbox'][0]:
  35. # 调整span的y0和span2的y0一致
  36. sb_span['bbox'][1] = text_span['bbox'][1]
  37. return spans