|
|
@@ -0,0 +1,45 @@
|
|
|
+from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, get_minbox_if_overlap_by_ratio
|
|
|
+
|
|
|
+
|
|
|
+def remove_overlaps_min_spans(spans):
|
|
|
+ # 删除重叠spans中较小的那些
|
|
|
+ for span1 in spans.copy():
|
|
|
+ for span2 in spans.copy():
|
|
|
+ if span1 != span2:
|
|
|
+ overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65)
|
|
|
+ if overlap_box is not None:
|
|
|
+ bbox_to_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
|
|
|
+ if bbox_to_remove is not None:
|
|
|
+ spans.remove(bbox_to_remove)
|
|
|
+ return spans
|
|
|
+
|
|
|
+
|
|
|
+def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
|
|
|
+ # 遍历spans, 判断是否在removed_span_block_bboxes中
|
|
|
+ # 如果是, 则删除该span 否则, 保留该span
|
|
|
+ need_remove_spans = []
|
|
|
+ for span in spans:
|
|
|
+ for removed_bbox in need_remove_spans_bboxes:
|
|
|
+ if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5:
|
|
|
+ need_remove_spans.append(span)
|
|
|
+ break
|
|
|
+
|
|
|
+ for span in need_remove_spans:
|
|
|
+ spans.remove(span)
|
|
|
+
|
|
|
+ return spans
|
|
|
+
|
|
|
+
|
|
|
+def adjust_bbox_for_standalone_block(spans):
|
|
|
+ # 对tpye=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
|
|
|
+ for sb_span in spans:
|
|
|
+ if sb_span['type'] in ["displayed_equation", "image", "table"]:
|
|
|
+ for text_span in spans:
|
|
|
+ if text_span['type'] in ['text', 'inline_equation']:
|
|
|
+ # 判断span2的纵向高度是否被span所覆盖
|
|
|
+ if sb_span['bbox'][1] < text_span['bbox'][1] and sb_span['bbox'][3] > text_span['bbox'][3]:
|
|
|
+ # 判断span2是否在span左边
|
|
|
+ if text_span['bbox'][0] < sb_span['bbox'][0]:
|
|
|
+ # 调整span的y0和span2的y0一致
|
|
|
+ sb_span['bbox'][1] = text_span['bbox'][1]
|
|
|
+ return spans
|