ソースを参照

fix: table and footnote relations

许瑞 1 年間 前
コミット
bd1ca92a5f
1 ファイル変更23 行追加2 行削除
  1. 23 2
      magic_pdf/model/magic_model.py

+ 23 - 2
magic_pdf/model/magic_model.py

@@ -89,6 +89,25 @@ class MagicModel:
         ret = []
         MAX_DIS_OF_POINT = 10**9 + 7
 
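+        # The subject bbox and the object bbox are merged into one enclosing bbox (the "merged bbox"). Select every other subject that overlaps, or lies inside, the merged bbox and whose own area is at least the object's area.
+        # Then take the shortest distance between those selected subjects and the object.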
+        def may_find_other_nearest_bbox(subject_idx, object_idx):
+            ret = float("inf")
+            x0 = min(all_bboxes[subject_idx]["bbox"][0], all_bboxes[object_idx]["bbox"][0])
+            y0 = min(all_bboxes[subject_idx]["bbox"][1], all_bboxes[object_idx]["bbox"][1])
+            x1 = max(all_bboxes[subject_idx]["bbox"][2], all_bboxes[object_idx]["bbox"][2])
+            y1 = max(all_bboxes[subject_idx]["bbox"][3], all_bboxes[object_idx]["bbox"][3])
+
+            object_area = abs(all_bboxes[object_idx]["bbox"][2] - all_bboxes[object_idx]["bbox"][0]) * abs(all_bboxes[object_idx]["bbox"][3] - all_bboxes[object_idx]["bbox"][1])
+            for i in range(len(all_bboxes)):
+                if i == subject_idx or all_bboxes[i]["category_id"] != subject_category_id:
+                    continue
+                if _is_part_overlap([x0, y0, x1, y1], all_bboxes[i]["bbox"]) or _is_in(all_bboxes[i]["bbox"], [x0, y0, x1, y1]):
+                    i_area = abs(all_bboxes[i]["bbox"][2] - all_bboxes[i]["bbox"][0]) * abs(all_bboxes[i]["bbox"][3] - all_bboxes[i]["bbox"][1])
+                    if i_area >= object_area:
+                        ret = min(float("inf"), dis[i][object_idx]) 
+            return ret
+
         subjects = self.__reduct_overlap(
             list(
                 map(
@@ -170,8 +189,10 @@ class MagicModel:
 
             arr.sort(key=lambda x: x[0])
             if len(arr) > 0:
-                candidates.append(arr[0][1])
-                seen.add(arr[0][1])
+                # bug: the object nearest to this subject may lie across another subject, e.g. [this subject] [some subject] [the nearest object of this subject]
+                if may_find_other_nearest_bbox(i, j) >= arr[0][0]:
+                    candidates.append(arr[0][1])
+                    seen.add(arr[0][1])
 
             # 已经获取初始种子
             for j in set(candidates):