ソースを参照

feat: update remove overlap

许瑞 1 年前
コミット
6a3d1f2dfb
2 ファイル変更、14 行追加、5 行削除
  1. + 0 - 3
      magic_pdf/model/magic_model.py
  2. + 14 - 2
      magic_pdf/pre_proc/remove_bbox_overlap.py

+ 0 - 3
magic_pdf/model/magic_model.py

@@ -461,9 +461,6 @@ class MagicModel:
                     blocks.append(block)
         return blocks
 
-    def get_model_list(self, page_no):
-        return self.__model_list[page_no]
-
 
 if __name__ == "__main__":
     drw = DiskReaderWriter(r"D:/project/20231108code-clean")

+ 14 - 2
magic_pdf/pre_proc/remove_bbox_overlap.py

@@ -3,9 +3,21 @@ from magic_pdf.libs.boxbase import _is_in_or_part_overlap, _is_in
 
 def _remove_overlap_between_bbox(spans):
     res = []
-    for v in spans:
+
+    keeps = [True] * len(spans)
+    for i in range(len(spans)):
+        for j in range(len(spans)):
+            if i == j:
+                continue
+            if _is_in(spans[i]["bbox"], spans[j]["bbox"]):
+                keeps[i] = False
+
+    for idx, v in enumerate(spans):
+        if not keeps[idx]:
+            continue
+
         for i in range(len(res)):
-            if _is_in(res[i]["bbox"], v["bbox"]) or _is_in(v["bbox"], res[i]["bbox"]):
+            if  _is_in(v["bbox"], res[i]["bbox"]):
                 continue
             if _is_in_or_part_overlap(res[i]["bbox"], v["bbox"]):
                 ix0, iy0, ix1, iy1 = res[i]["bbox"]