resolve_bbox_conflict.py 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161
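"""
Take the bboxes that the API extracted from the PDF and decide which ones to keep based on how they overlap.
1. First drop the bboxes that lie on top of an image; "image" here covers both tables and figures.
2. Then drop the image bboxes that lie on top of a text block.
"""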
  6. from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_left_overlap
  7. from magic_pdf.libs.drop_tag import ON_IMAGE_TEXT, ON_TABLE_TEXT
  8. def resolve_bbox_overlap_conflict(images:list, tables:list, interline_equations:list, inline_equations:list, text_raw_blocks:list):
  9. """
  10. text_raw_blocks结构是从pymupdf里直接取到的结构,具体样例参考test/assets/papre/pymu_textblocks.json
  11. 当下采用一种粗暴的方式:
  12. 1. 去掉图片上的公式
  13. 2. 去掉table上的公式
  14. 2. 图片和文字block部分重叠,首先丢弃图片
  15. 3. 图片和图片重叠,修改图片的bbox,使得图片不重叠(暂时没这么做,先把图片都扔掉)
  16. 4. 去掉文字bbox里位于图片、表格上的文字(一定要完全在图、表内部)
  17. 5. 去掉表格上的文字
  18. """
  19. text_block_removed = []
  20. images_backup = []
  21. # 去掉位于图片上的文字block
  22. for image_box in images:
  23. for text_block in text_raw_blocks:
  24. text_bbox = text_block["bbox"]
  25. if _is_in(text_bbox, image_box):
  26. text_block['tag'] = ON_IMAGE_TEXT
  27. text_block_removed.append(text_block)
  28. # 去掉table上的文字block
  29. for table_box in tables:
  30. for text_block in text_raw_blocks:
  31. text_bbox = text_block["bbox"]
  32. if _is_in(text_bbox, table_box):
  33. text_block['tag'] = ON_TABLE_TEXT
  34. text_block_removed.append(text_block)
  35. for text_block in text_block_removed:
  36. if text_block in text_raw_blocks:
  37. text_raw_blocks.remove(text_block)
  38. # 第一步去掉在图片上出现的公式box
  39. temp = []
  40. for image_box in images:
  41. for eq1 in interline_equations:
  42. if _is_in_or_part_overlap(image_box, eq1[:4]):
  43. temp.append(eq1)
  44. for eq2 in inline_equations:
  45. if _is_in_or_part_overlap(image_box, eq2[:4]):
  46. temp.append(eq2)
  47. for eq in temp:
  48. if eq in interline_equations:
  49. interline_equations.remove(eq)
  50. if eq in inline_equations:
  51. inline_equations.remove(eq)
  52. # 第二步去掉在表格上出现的公式box
  53. temp = []
  54. for table_box in tables:
  55. for eq1 in interline_equations:
  56. if _is_in_or_part_overlap(table_box, eq1[:4]):
  57. temp.append(eq1)
  58. for eq2 in inline_equations:
  59. if _is_in_or_part_overlap(table_box, eq2[:4]):
  60. temp.append(eq2)
  61. for eq in temp:
  62. if eq in interline_equations:
  63. interline_equations.remove(eq)
  64. if eq in inline_equations:
  65. inline_equations.remove(eq)
  66. # 图片和文字重叠,丢掉图片
  67. for image_box in images:
  68. for text_block in text_raw_blocks:
  69. text_bbox = text_block["bbox"]
  70. if _is_in_or_part_overlap(image_box, text_bbox):
  71. images_backup.append(image_box)
  72. break
  73. for image_box in images_backup:
  74. images.remove(image_box)
  75. # 图片和图片重叠,两张都暂时不参与版面计算
  76. images_dup_index = []
  77. for i in range(len(images)):
  78. for j in range(i+1, len(images)):
  79. if _is_in_or_part_overlap(images[i], images[j]):
  80. images_dup_index.append(i)
  81. images_dup_index.append(j)
  82. dup_idx = set(images_dup_index)
  83. for img_id in dup_idx:
  84. images_backup.append(images[img_id])
  85. images[img_id] = None
  86. images = [img for img in images if img is not None]
  87. # 如果行间公式和文字block重叠,放到临时的数据里,防止这些文字box影响到layout计算。通过计算IOU合并行间公式和文字block
  88. # 对于这样的文本块删除,然后保留行间公式的大小不变。
  89. # 当计算完毕layout,这部分再合并回来
  90. text_block_removed_2 = []
  91. # for text_block in text_raw_blocks:
  92. # text_bbox = text_block["bbox"]
  93. # for eq in interline_equations:
  94. # ratio = calculate_overlap_area_2_minbox_area_ratio(text_bbox, eq[:4])
  95. # if ratio>0.05:
  96. # text_block['tag'] = "belong-to-interline-equation"
  97. # text_block_removed_2.append(text_block)
  98. # break
  99. # for tb in text_block_removed_2:
  100. # if tb in text_raw_blocks:
  101. # text_raw_blocks.remove(tb)
  102. # text_block_removed = text_block_removed + text_block_removed_2
  103. return images, tables, interline_equations, inline_equations, text_raw_blocks, text_block_removed, images_backup, text_block_removed_2
  104. def check_text_block_horizontal_overlap(text_blocks:list, header, footer) -> bool:
  105. """
  106. 检查文本block之间的水平重叠情况,这种情况如果发生,那么这个pdf就不再继续处理了。
  107. 因为这种情况大概率发生了公式没有被检测出来。
  108. """
  109. if len(text_blocks)==0:
  110. return False
  111. page_min_y = 0
  112. page_max_y = max(yy['bbox'][3] for yy in text_blocks)
  113. def __max_y(lst:list):
  114. if len(lst)>0:
  115. return max([item[1] for item in lst])
  116. return page_min_y
  117. def __min_y(lst:list):
  118. if len(lst)>0:
  119. return min([item[3] for item in lst])
  120. return page_max_y
  121. clip_y0 = __max_y(header)
  122. clip_y1 = __min_y(footer)
  123. txt_bboxes = []
  124. for text_block in text_blocks:
  125. bbox = text_block["bbox"]
  126. if bbox[1]>=clip_y0 and bbox[3]<=clip_y1:
  127. txt_bboxes.append(bbox)
  128. for i in range(len(txt_bboxes)):
  129. for j in range(i+1, len(txt_bboxes)):
  130. if _is_left_overlap(txt_bboxes[i], txt_bboxes[j]) or _is_left_overlap(txt_bboxes[j], txt_bboxes[i]):
  131. return True
  132. return False