浏览代码

更新remove_spans_by_bboxes中选择被删除的span的逻辑

赵小蒙 1 年之前
父节点
当前提交
0c279ffccd
共有 2 个文件被更改，包括 25 次插入和 5 次删除
  1. 21 0
      magic_pdf/libs/boxbase.py
  2. 4 5
      magic_pdf/pre_proc/ocr_remove_spans.py

+ 21 - 0
magic_pdf/libs/boxbase.py

@@ -177,6 +177,27 @@ def calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2):
     else:
         return intersection_area / min_box_area
 
+def calculate_overlap_area_in_bbox1_area_ratio(bbox1, bbox2):
+    """
+    Return the overlap area of bbox1 and bbox2 as a fraction of bbox1's area; boxes are read as [0]=left, [1]=top, [2]=right, [3]=bottom.
+    """
+    # Determine the coordinates of the intersection rectangle
+    x_left = max(bbox1[0], bbox2[0])
+    y_top = max(bbox1[1], bbox2[1])
+    x_right = min(bbox1[2], bbox2[2])
+    y_bottom = min(bbox1[3], bbox2[3])
+
+    if x_right < x_left or y_bottom < y_top:  # the boxes do not intersect
+        return 0.0
+
+    # Area of the intersection rectangle, then the area of bbox1 used as the denominator
+    intersection_area = (x_right - x_left) * (y_bottom - y_top)
+    bbox1_area = (bbox1[2]-bbox1[0])*(bbox1[3]-bbox1[1])
+    if bbox1_area == 0:  # zero-area bbox1: return 0 rather than divide by zero
+        return 0
+    else:
+        return intersection_area / bbox1_area
+
 
 def get_minbox_if_overlap_by_ratio(bbox1, bbox2, ratio):
     """

+ 4 - 5
magic_pdf/pre_proc/ocr_remove_spans.py

@@ -1,14 +1,13 @@
-from magic_pdf.libs.boxbase import _is_in_or_part_overlap
+from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio
 
 
 def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
     # 遍历spans, 判断是否在removed_span_block_bboxes中
-    # 如果是, 则删除该span
-    # 否则, 保留该span
+    # 如果是, 则删除该span 否则, 保留该span
     need_remove_spans = []
     for span in spans:
-        for bbox in need_remove_spans_bboxes:
-            if _is_in_or_part_overlap(span['bbox'], bbox):
+        for removed_bbox in need_remove_spans_bboxes:
+            if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5:
                 need_remove_spans.append(span)
                 break