block_pre_proc.py 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224
  1. # Copyright (c) Opendatalab. All rights reserved.
  2. from mineru.utils.boxbase import (
  3. calculate_iou,
  4. calculate_overlap_area_in_bbox1_area_ratio,
  5. calculate_vertical_projection_overlap_ratio,
  6. get_minbox_if_overlap_by_ratio
  7. )
  8. from mineru.utils.enum_class import BlockType
  9. def prepare_block_bboxes(
  10. img_body_blocks,
  11. img_caption_blocks,
  12. img_footnote_blocks,
  13. table_body_blocks,
  14. table_caption_blocks,
  15. table_footnote_blocks,
  16. discarded_blocks,
  17. text_blocks,
  18. title_blocks,
  19. interline_equation_blocks,
  20. page_w,
  21. page_h,
  22. ):
  23. all_bboxes = []
  24. add_bboxes(img_body_blocks, BlockType.IMAGE_BODY, all_bboxes)
  25. add_bboxes(img_caption_blocks, BlockType.IMAGE_CAPTION, all_bboxes)
  26. add_bboxes(img_footnote_blocks, BlockType.IMAGE_CAPTION, all_bboxes)
  27. add_bboxes(table_body_blocks, BlockType.TABLE_BODY, all_bboxes)
  28. add_bboxes(table_caption_blocks, BlockType.TABLE_CAPTION, all_bboxes)
  29. add_bboxes(table_footnote_blocks, BlockType.TABLE_FOOTNOTE, all_bboxes)
  30. add_bboxes(text_blocks, BlockType.TEXT, all_bboxes)
  31. add_bboxes(title_blocks, BlockType.TITLE, all_bboxes)
  32. add_bboxes(interline_equation_blocks, BlockType.INTERLINE_EQUATION, all_bboxes)
  33. """block嵌套问题解决"""
  34. """文本框与标题框重叠,优先信任文本框"""
  35. all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
  36. """任何框体与舍弃框重叠,优先信任舍弃框"""
  37. all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)
  38. # interline_equation 与title或text框冲突的情况,分两种情况处理
  39. """interline_equation框与文本类型框iou比较接近1的时候,信任行间公式框"""
  40. all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
  41. """interline_equation框被包含在文本类型框内,且interline_equation比文本区块小很多时信任文本框,这时需要舍弃公式框"""
  42. # 通过后续大框套小框逻辑删除
  43. """discarded_blocks"""
  44. all_discarded_blocks = []
  45. add_bboxes(discarded_blocks, BlockType.DISCARDED, all_discarded_blocks)
  46. """footnote识别:宽度超过1/3页面宽度的,高度超过10的,处于页面下半30%区域的"""
  47. footnote_blocks = []
  48. for discarded in discarded_blocks:
  49. x0, y0, x1, y1 = discarded['bbox']
  50. if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h * 0.7):
  51. footnote_blocks.append([x0, y0, x1, y1])
  52. """移除在footnote下面的任何框"""
  53. need_remove_blocks = find_blocks_under_footnote(all_bboxes, footnote_blocks)
  54. if len(need_remove_blocks) > 0:
  55. for block in need_remove_blocks:
  56. all_bboxes.remove(block)
  57. all_discarded_blocks.append(block)
  58. """经过以上处理后,还存在大框套小框的情况,则删除小框"""
  59. all_bboxes = remove_overlaps_min_blocks(all_bboxes)
  60. all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
  61. """将剩余的bbox做分离处理,防止后面分layout时出错"""
  62. # all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
  63. all_bboxes.sort(key=lambda x: x[0]+x[1])
  64. return all_bboxes, all_discarded_blocks, footnote_blocks
  65. def add_bboxes(blocks, block_type, bboxes):
  66. for block in blocks:
  67. x0, y0, x1, y1 = block['bbox']
  68. if block_type in [
  69. BlockType.IMAGE_BODY,
  70. BlockType.IMAGE_CAPTION,
  71. BlockType.IMAGE_FOOTNOTE,
  72. BlockType.TABLE_BODY,
  73. BlockType.TABLE_CAPTION,
  74. BlockType.TABLE_FOOTNOTE,
  75. ]:
  76. bboxes.append([x0, y0, x1, y1, None, None, None, block_type, None, None, None, None, block['score'], block['group_id']])
  77. else:
  78. bboxes.append([x0, y0, x1, y1, None, None, None, block_type, None, None, None, None, block['score']])
  79. def fix_text_overlap_title_blocks(all_bboxes):
  80. # 先提取所有text和title block
  81. text_blocks = []
  82. for block in all_bboxes:
  83. if block[7] == BlockType.TEXT:
  84. text_blocks.append(block)
  85. title_blocks = []
  86. for block in all_bboxes:
  87. if block[7] == BlockType.TITLE:
  88. title_blocks.append(block)
  89. need_remove = []
  90. for text_block in text_blocks:
  91. for title_block in title_blocks:
  92. text_block_bbox = text_block[:4]
  93. title_block_bbox = title_block[:4]
  94. if calculate_iou(text_block_bbox, title_block_bbox) > 0.8:
  95. if title_block not in need_remove:
  96. need_remove.append(title_block)
  97. if len(need_remove) > 0:
  98. for block in need_remove:
  99. all_bboxes.remove(block)
  100. return all_bboxes
  101. def remove_need_drop_blocks(all_bboxes, discarded_blocks):
  102. need_remove = []
  103. for block in all_bboxes:
  104. for discarded_block in discarded_blocks:
  105. block_bbox = block[:4]
  106. if (
  107. calculate_overlap_area_in_bbox1_area_ratio(
  108. block_bbox, discarded_block['bbox']
  109. )
  110. > 0.6
  111. ):
  112. if block not in need_remove:
  113. need_remove.append(block)
  114. break
  115. if len(need_remove) > 0:
  116. for block in need_remove:
  117. all_bboxes.remove(block)
  118. return all_bboxes
  119. def fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes):
  120. # 先提取所有text和interline block
  121. text_blocks = []
  122. for block in all_bboxes:
  123. if block[7] == BlockType.TEXT:
  124. text_blocks.append(block)
  125. interline_equation_blocks = []
  126. for block in all_bboxes:
  127. if block[7] == BlockType.INTERLINE_EQUATION:
  128. interline_equation_blocks.append(block)
  129. need_remove = []
  130. for interline_equation_block in interline_equation_blocks:
  131. for text_block in text_blocks:
  132. interline_equation_block_bbox = interline_equation_block[:4]
  133. text_block_bbox = text_block[:4]
  134. if calculate_iou(interline_equation_block_bbox, text_block_bbox) > 0.8:
  135. if text_block not in need_remove:
  136. need_remove.append(text_block)
  137. if len(need_remove) > 0:
  138. for block in need_remove:
  139. all_bboxes.remove(block)
  140. return all_bboxes
  141. def find_blocks_under_footnote(all_bboxes, footnote_blocks):
  142. need_remove_blocks = []
  143. for block in all_bboxes:
  144. block_x0, block_y0, block_x1, block_y1 = block[:4]
  145. for footnote_bbox in footnote_blocks:
  146. footnote_x0, footnote_y0, footnote_x1, footnote_y1 = footnote_bbox
  147. # 如果footnote的纵向投影覆盖了block的纵向投影的80%且block的y0大于等于footnote的y1
  148. if (
  149. block_y0 >= footnote_y1
  150. and calculate_vertical_projection_overlap_ratio(
  151. (block_x0, block_y0, block_x1, block_y1), footnote_bbox
  152. )
  153. >= 0.8
  154. ):
  155. if block not in need_remove_blocks:
  156. need_remove_blocks.append(block)
  157. break
  158. return need_remove_blocks
def remove_overlaps_min_blocks(all_bboxes):
    """Merge heavily overlapping records into the larger one and drop the smaller.

    Args:
        all_bboxes: list of flat records whose first four items are the bbox.
            The list and its records are mutated in place.

    Returns:
        The same ``all_bboxes`` list, with absorbed records removed and the
        absorbing records' bboxes grown to the union of both boxes.
    """
    # For overlapping blocks the smaller one must not simply be deleted; it is
    # first merged into the larger one, producing a bigger block.
    # Afterwards the smaller blocks of each overlapping pair are removed.
    need_remove = []
    for block1 in all_bboxes:
        for block2 in all_bboxes:
            # List value-equality: this skips the record itself and also any
            # other record with identical contents.
            if block1 != block2:
                block1_bbox = block1[:4]
                block2_bbox = block2[:4]
                # NOTE(review): presumably returns the smaller of the two
                # boxes when the overlap ratio exceeds 0.8, else None —
                # confirm against mineru.utils.boxbase.
                overlap_box = get_minbox_if_overlap_by_ratio(
                    block1_bbox, block2_bbox, 0.8
                )
                if overlap_box is not None:
                    # First record in the whole list whose bbox equals the
                    # returned box (matched by value, not identity).
                    block_to_remove = next(
                        (block for block in all_bboxes if block[:4] == overlap_box),
                        None,
                    )
                    if (
                        block_to_remove is not None
                        and block_to_remove not in need_remove
                    ):
                        large_block = block1 if block1 != block_to_remove else block2
                        x1, y1, x2, y2 = large_block[:4]
                        sx1, sy1, sx2, sy2 = block_to_remove[:4]
                        # Grow the surviving record to the union of both boxes.
                        x1 = min(x1, sx1)
                        y1 = min(y1, sy1)
                        x2 = max(x2, sx2)
                        y2 = max(y2, sy2)
                        # In-place update: later iterations of both loops see
                        # the enlarged bbox.
                        large_block[:4] = [x1, y1, x2, y2]
                        need_remove.append(block_to_remove)

    # Removal is deferred until after the scan so the list is not resized
    # while it is being iterated.
    if len(need_remove) > 0:
        for block in need_remove:
            all_bboxes.remove(block)

    return all_bboxes