赵小蒙 1 год назад
Родитель
Коммит
154eed1ade
1 изменённый файл: 5 добавлено и 0 удалено
  1. 5 0
      magic_pdf/pre_proc/ocr_span_list_modify.py

+ 5 - 0
magic_pdf/pre_proc/ocr_span_list_modify.py

@@ -44,10 +44,15 @@ def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
         # logger.info(f"remove spans by bbox dict, drop_tag: {drop_tag}, removed_bboxes: {removed_bboxes}")
         need_remove_spans = []
         for span in spans:
+            # 通过判断span的bbox是否在removed_bboxes中, 判断是否需要删除该span
             for removed_bbox in removed_bboxes:
                 if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5:
                     need_remove_spans.append(span)
                     break
+                # 当drop_tag为DropTag.FOOTNOTE时, 判断span是否在removed_bboxes中任意一个的下方,如果是,则删除该span
+                elif drop_tag == DropTag.FOOTNOTE and (span['bbox'][1]+span['bbox'][3])/2 > removed_bbox[3] and removed_bbox[0] < (span['bbox'][0]+span['bbox'][2])/2 < removed_bbox[2]:
+                    need_remove_spans.append(span)
+                    break
 
         for span in need_remove_spans:
             spans.remove(span)