| 123456789101112131415161718192021222324252627282930313233343536373839404142434445 |
- from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, get_minbox_if_overlap_by_ratio
def remove_overlaps_min_spans(spans):
    """Drop one span out of every heavily overlapping pair, in place.

    Every pair of unequal, still-live spans is handed to
    ``get_minbox_if_overlap_by_ratio`` with a ratio of 0.65. When the helper
    returns a bbox, the first live span whose ``'bbox'`` equals it is marked
    as dropped. A span that has been dropped takes no further part in the
    comparisons, so it can never cause another span to be dropped.

    Args:
        spans: list of span dicts, each carrying a ``'bbox'`` entry. The list
            is modified in place.

    Returns:
        The same ``spans`` list object, with the dropped spans removed.
    """
    # Spans are tracked by identity so that two spans with equal content are
    # still treated as separate entries.
    dropped_ids = set()

    for span1 in spans:
        if id(span1) in dropped_ids:
            continue
        for span2 in spans:
            # Skip dropped spans and pairs that compare equal (this also
            # covers comparing a span with itself).
            if id(span2) in dropped_ids or span1 == span2:
                continue
            overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65)
            if overlap_box is None:
                continue
            # NOTE(review): presumably the helper returns the bbox of the
            # smaller of the two spans - confirm against its definition.
            victim = next(
                (
                    span
                    for span in spans
                    if id(span) not in dropped_ids and span['bbox'] == overlap_box
                ),
                None,
            )
            if victim is None:
                continue
            dropped_ids.add(id(victim))
            if victim is span1:
                # span1 is no longer live; stop comparing it with the rest.
                break

    if dropped_ids:
        # Slice assignment keeps the caller's list object and the span order.
        spans[:] = [span for span in spans if id(span) not in dropped_ids]
    return spans
def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
    """Remove, in place, every span that matches one of the given bboxes.

    A span is removed when ``calculate_overlap_area_in_bbox1_area_ratio``
    called with the span's bbox and one of ``need_remove_spans_bboxes``
    yields a value greater than 0.5; all other spans are kept.

    Args:
        spans: list of span dicts, each carrying a ``'bbox'`` entry. The list
            is modified in place.
        need_remove_spans_bboxes: bboxes marking the regions whose spans
            should be removed.

    Returns:
        The same ``spans`` list object after the removals.
    """
    # Collect first, remove afterwards, so the list is never mutated while
    # it is being scanned. ``any`` stops at the first matching bbox.
    doomed = [
        candidate
        for candidate in spans
        if any(
            calculate_overlap_area_in_bbox1_area_ratio(candidate['bbox'], target_bbox) > 0.5
            for target_bbox in need_remove_spans_bboxes
        )
    ]
    for candidate in doomed:
        spans.remove(candidate)
    return spans
def adjust_bbox_for_standalone_block(spans):
    """Move the top edge of standalone spans down to text that sits beside them.

    For every span of type "displayed_equation", "image" or "table", each
    span of type 'text' or 'inline_equation' is examined. When the text
    span's vertical extent lies strictly inside the standalone span's
    (bbox index 1 greater, bbox index 3 smaller) and the text span's
    bbox index 0 is smaller than the standalone span's, the standalone
    span's bbox index 1 is overwritten with the text span's.

    Args:
        spans: list of span dicts with ``'type'`` and ``'bbox'`` entries.
            The bboxes of standalone spans are modified in place.

    Returns:
        The same ``spans`` list object.
    """
    standalone_types = ("displayed_equation", "image", "table")
    textual_types = ('text', 'inline_equation')

    for block in spans:
        if block['type'] not in standalone_types:
            continue
        for neighbour in spans:
            if neighbour['type'] not in textual_types:
                continue
            # Is the neighbour's vertical extent strictly covered by the block?
            covered_vertically = (
                block['bbox'][1] < neighbour['bbox'][1]
                and block['bbox'][3] > neighbour['bbox'][3]
            )
            # Does the neighbour start to the left of the block?
            if covered_vertically and neighbour['bbox'][0] < block['bbox'][0]:
                # Align the block's y0 with the neighbour's y0.
                block['bbox'][1] = neighbour['bbox'][1]
    return spans
|