remove_bbox_overlap.py 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109
  1. from magic_pdf.libs.boxbase import _is_in_or_part_overlap, _is_in
  def _remove_overlap_between_bbox_for_span(spans):
      """Drop spans nested in another span and pull apart overlapping ones.

      The work is done in two passes:

      1. A span is discarded when ``_is_in(span["bbox"], other["bbox"])`` is
         true for any *other* span in ``spans``.
      2. The surviving spans are visited in input order.  Each one is compared
         with every span accepted so far; when ``_is_in_or_part_overlap``
         reports an overlap, the two boxes are separated along one axis so
         that one ends no later than ``mid - 0.25`` and the other starts no
         earlier than ``mid + 0.25``, where ``mid`` is the floored midpoint
         of the two facing edges.

      Args:
          spans: Sequence of dict-like objects whose ``"bbox"`` entry unpacks
              to ``(x0, y0, x1, y1)``.

      Returns:
          A new list holding the surviving span objects, in input order.  The
          objects are the caller's own: whenever a box is trimmed, its
          ``"bbox"`` entry is replaced with a fresh four-item list in place.
      """
      # NOTE(review): `_is_in` is imported from magic_pdf.libs.boxbase and is
      # not visible here.  If it treats identical boxes as contained in each
      # other, exact duplicates all get dropped by this pass -- confirm that
      # this is intended.
      keep_flags = [
          not any(
              _is_in(span["bbox"], other["bbox"])
              for j, other in enumerate(spans)
              if j != i
          )
          for i, span in enumerate(spans)
      ]

      res = []
      for span, keep in zip(spans, keep_flags):
          if not keep:
              continue

          for placed in res:
              if _is_in(span["bbox"], placed["bbox"]):
                  continue
              if not _is_in_or_part_overlap(placed["bbox"], span["bbox"]):
                  continue

              ix0, iy0, ix1, iy1 = placed["bbox"]
              x0, y0, x1, y1 = span["bbox"]

              # Extent of the intersection along each axis.
              diff_x = min(x1, ix1) - max(x0, ix0)
              diff_y = min(y1, iy1) - max(y0, iy0)

              if diff_y > diff_x:
                  # The intersection is taller than it is wide, so only the
                  # x coordinates are adjusted.
                  if x1 >= ix1:
                      # `span` reaches at least as far right as `placed`:
                      # trim placed's right edge and span's left edge.
                      mid = (x0 + ix1) // 2
                      ix1 = min(mid - 0.25, ix1)
                      x0 = max(mid + 0.25, x0)
                  else:
                      mid = (ix0 + x1) // 2
                      ix0 = max(mid + 0.25, ix0)
                      x1 = min(mid - 0.25, x1)
              else:
                  # Otherwise only the y coordinates are adjusted.
                  if y1 >= iy1:
                      mid = (y0 + iy1) // 2
                      y0 = max(mid + 0.25, y0)
                      iy1 = min(iy1, mid - 0.25)
                  else:
                      mid = (iy0 + y1) // 2
                      y1 = min(y1, mid - 0.25)
                      iy0 = max(mid + 0.25, iy0)

              # min/max above guarantee a box is never enlarged, only trimmed.
              placed["bbox"] = [ix0, iy0, ix1, iy1]
              span["bbox"] = [x0, y0, x1, y1]

          res.append(span)
      return res
  def _remove_overlap_between_bbox_for_block(all_bboxes):
      """Drop blocks nested in another block and pull apart overlapping ones.

      Only the first four items of each entry are treated as the box; they
      unpack to ``(x0, y0, x1, y1)``.  Any further items are left untouched.

      The work is done in two passes:

      1. An entry is discarded when ``_is_in(entry[:4], other[:4])`` is true
         for any *other* entry in ``all_bboxes``.
      2. The surviving entries are visited in input order.  Each one is
         compared with every entry accepted so far; when
         ``_is_in_or_part_overlap`` reports an overlap, the two boxes are
         separated along one axis so that one ends no later than
         ``mid - 0.25`` and the other starts no earlier than ``mid + 0.25``,
         where ``mid`` is the floored midpoint of the two facing edges.

      Args:
          all_bboxes: Sequence of sliceable entries.  An entry that gets
              trimmed must support slice assignment (e.g. a ``list``),
              because its first four items are overwritten in place.

      Returns:
          A new list holding the surviving entries, in input order.  The
          entries are the caller's own objects, so any trimming is visible
          to the caller as well.
      """
      # NOTE(review): `_is_in` is imported from magic_pdf.libs.boxbase and is
      # not visible here.  If it treats identical boxes as contained in each
      # other, exact duplicates all get dropped by this pass -- confirm that
      # this is intended.
      keep_flags = [
          not any(
              _is_in(entry[:4], other[:4])
              for j, other in enumerate(all_bboxes)
              if j != i
          )
          for i, entry in enumerate(all_bboxes)
      ]

      res = []
      for entry, keep in zip(all_bboxes, keep_flags):
          if not keep:
              continue

          for placed in res:
              if _is_in(entry[:4], placed[:4]):
                  continue
              if not _is_in_or_part_overlap(placed[:4], entry[:4]):
                  continue

              ix0, iy0, ix1, iy1 = placed[:4]
              x0, y0, x1, y1 = entry[:4]

              # Extent of the intersection along each axis.
              diff_x = min(x1, ix1) - max(x0, ix0)
              diff_y = min(y1, iy1) - max(y0, iy0)

              if diff_y > diff_x:
                  # The intersection is taller than it is wide, so only the
                  # x coordinates are adjusted.
                  if x1 >= ix1:
                      # `entry` reaches at least as far right as `placed`:
                      # trim placed's right edge and entry's left edge.
                      mid = (x0 + ix1) // 2
                      ix1 = min(mid - 0.25, ix1)
                      x0 = max(mid + 0.25, x0)
                  else:
                      mid = (ix0 + x1) // 2
                      ix0 = max(mid + 0.25, ix0)
                      x1 = min(mid - 0.25, x1)
              else:
                  # Otherwise only the y coordinates are adjusted.
                  if y1 >= iy1:
                      mid = (y0 + iy1) // 2
                      y0 = max(mid + 0.25, y0)
                      iy1 = min(iy1, mid - 0.25)
                  else:
                      mid = (iy0 + y1) // 2
                      y1 = min(y1, mid - 0.25)
                      iy0 = max(mid + 0.25, iy0)

              # Slice assignment overwrites the coordinates in place and keeps
              # whatever follows them in the entry.
              placed[:4] = [ix0, iy0, ix1, iy1]
              entry[:4] = [x0, y0, x1, y1]

          res.append(entry)
      return res
  def remove_overlap_between_bbox_for_span(spans):
      """Public entry point for span de-overlapping.

      Passes ``spans`` unchanged to ``_remove_overlap_between_bbox_for_span``
      and returns whatever that function returns.
      """
      deoverlapped = _remove_overlap_between_bbox_for_span(spans)
      return deoverlapped
  def remove_overlap_between_bbox_for_block(all_bboxes):
      """Public entry point for block de-overlapping.

      Passes ``all_bboxes`` unchanged to
      ``_remove_overlap_between_bbox_for_block`` and returns whatever that
      function returns.
      """
      deoverlapped = _remove_overlap_between_bbox_for_block(all_bboxes)
      return deoverlapped