block_pre_proc.py 9.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251
  1. # Copyright (c) Opendatalab. All rights reserved.
  2. from mineru.utils.boxbase import (
  3. calculate_iou,
  4. calculate_overlap_area_in_bbox1_area_ratio,
  5. calculate_vertical_projection_overlap_ratio,
  6. get_minbox_if_overlap_by_ratio
  7. )
  8. from mineru.utils.enum_class import BlockType
  9. def process_groups(groups, body_key, caption_key, footnote_key):
  10. body_blocks = []
  11. caption_blocks = []
  12. footnote_blocks = []
  13. maybe_text_image_blocks = []
  14. for i, group in enumerate(groups):
  15. if body_key == 'image_body' and len(group[caption_key]) == 0 and len(group[footnote_key]) == 0:
  16. # 如果没有caption和footnote,则不需要将group_id添加到image_body中
  17. group[body_key]['group_id'] = i
  18. maybe_text_image_blocks.append(group[body_key])
  19. continue
  20. else:
  21. group[body_key]['group_id'] = i
  22. body_blocks.append(group[body_key])
  23. for caption_block in group[caption_key]:
  24. caption_block['group_id'] = i
  25. caption_blocks.append(caption_block)
  26. for footnote_block in group[footnote_key]:
  27. footnote_block['group_id'] = i
  28. footnote_blocks.append(footnote_block)
  29. return body_blocks, caption_blocks, footnote_blocks, maybe_text_image_blocks
  30. def prepare_block_bboxes(
  31. img_body_blocks,
  32. img_caption_blocks,
  33. img_footnote_blocks,
  34. table_body_blocks,
  35. table_caption_blocks,
  36. table_footnote_blocks,
  37. discarded_blocks,
  38. text_blocks,
  39. title_blocks,
  40. interline_equation_blocks,
  41. page_w,
  42. page_h,
  43. ):
  44. all_bboxes = []
  45. add_bboxes(img_body_blocks, BlockType.IMAGE_BODY, all_bboxes)
  46. add_bboxes(img_caption_blocks, BlockType.IMAGE_CAPTION, all_bboxes)
  47. add_bboxes(img_footnote_blocks, BlockType.IMAGE_CAPTION, all_bboxes)
  48. add_bboxes(table_body_blocks, BlockType.TABLE_BODY, all_bboxes)
  49. add_bboxes(table_caption_blocks, BlockType.TABLE_CAPTION, all_bboxes)
  50. add_bboxes(table_footnote_blocks, BlockType.TABLE_FOOTNOTE, all_bboxes)
  51. add_bboxes(text_blocks, BlockType.TEXT, all_bboxes)
  52. add_bboxes(title_blocks, BlockType.TITLE, all_bboxes)
  53. add_bboxes(interline_equation_blocks, BlockType.INTERLINE_EQUATION, all_bboxes)
  54. """block嵌套问题解决"""
  55. """文本框与标题框重叠,优先信任文本框"""
  56. all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
  57. """任何框体与舍弃框重叠,优先信任舍弃框"""
  58. all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)
  59. # interline_equation 与title或text框冲突的情况,分两种情况处理
  60. """interline_equation框与文本类型框iou比较接近1的时候,信任行间公式框"""
  61. all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
  62. """interline_equation框被包含在文本类型框内,且interline_equation比文本区块小很多时信任文本框,这时需要舍弃公式框"""
  63. # 通过后续大框套小框逻辑删除
  64. """discarded_blocks"""
  65. all_discarded_blocks = []
  66. add_bboxes(discarded_blocks, BlockType.DISCARDED, all_discarded_blocks)
  67. """footnote识别:宽度超过1/3页面宽度的,高度超过10的,处于页面下半30%区域的"""
  68. footnote_blocks = []
  69. for discarded in discarded_blocks:
  70. x0, y0, x1, y1 = discarded['bbox']
  71. if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h * 0.7):
  72. footnote_blocks.append([x0, y0, x1, y1])
  73. """移除在footnote下面的任何框"""
  74. need_remove_blocks = find_blocks_under_footnote(all_bboxes, footnote_blocks)
  75. if len(need_remove_blocks) > 0:
  76. for block in need_remove_blocks:
  77. all_bboxes.remove(block)
  78. all_discarded_blocks.append(block)
  79. """经过以上处理后,还存在大框套小框的情况,则删除小框"""
  80. all_bboxes = remove_overlaps_min_blocks(all_bboxes)
  81. all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
  82. """粗排序后返回"""
  83. all_bboxes.sort(key=lambda x: x[0]+x[1])
  84. return all_bboxes, all_discarded_blocks, footnote_blocks
  85. def add_bboxes(blocks, block_type, bboxes):
  86. for block in blocks:
  87. x0, y0, x1, y1 = block['bbox']
  88. if block_type in [
  89. BlockType.IMAGE_BODY,
  90. BlockType.IMAGE_CAPTION,
  91. BlockType.IMAGE_FOOTNOTE,
  92. BlockType.TABLE_BODY,
  93. BlockType.TABLE_CAPTION,
  94. BlockType.TABLE_FOOTNOTE,
  95. ]:
  96. bboxes.append([x0, y0, x1, y1, None, None, None, block_type, None, None, None, None, block['score'], block['group_id']])
  97. else:
  98. bboxes.append([x0, y0, x1, y1, None, None, None, block_type, None, None, None, None, block['score']])
  99. def fix_text_overlap_title_blocks(all_bboxes):
  100. # 先提取所有text和title block
  101. text_blocks = []
  102. for block in all_bboxes:
  103. if block[7] == BlockType.TEXT:
  104. text_blocks.append(block)
  105. title_blocks = []
  106. for block in all_bboxes:
  107. if block[7] == BlockType.TITLE:
  108. title_blocks.append(block)
  109. need_remove = []
  110. for text_block in text_blocks:
  111. for title_block in title_blocks:
  112. text_block_bbox = text_block[:4]
  113. title_block_bbox = title_block[:4]
  114. if calculate_iou(text_block_bbox, title_block_bbox) > 0.8:
  115. if title_block not in need_remove:
  116. need_remove.append(title_block)
  117. if len(need_remove) > 0:
  118. for block in need_remove:
  119. all_bboxes.remove(block)
  120. return all_bboxes
  121. def remove_need_drop_blocks(all_bboxes, discarded_blocks):
  122. need_remove = []
  123. for block in all_bboxes:
  124. for discarded_block in discarded_blocks:
  125. block_bbox = block[:4]
  126. if (
  127. calculate_overlap_area_in_bbox1_area_ratio(
  128. block_bbox, discarded_block['bbox']
  129. )
  130. > 0.6
  131. ):
  132. if block not in need_remove:
  133. need_remove.append(block)
  134. break
  135. if len(need_remove) > 0:
  136. for block in need_remove:
  137. all_bboxes.remove(block)
  138. return all_bboxes
  139. def fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes):
  140. # 先提取所有text和interline block
  141. text_blocks = []
  142. for block in all_bboxes:
  143. if block[7] == BlockType.TEXT:
  144. text_blocks.append(block)
  145. interline_equation_blocks = []
  146. for block in all_bboxes:
  147. if block[7] == BlockType.INTERLINE_EQUATION:
  148. interline_equation_blocks.append(block)
  149. need_remove = []
  150. for interline_equation_block in interline_equation_blocks:
  151. for text_block in text_blocks:
  152. interline_equation_block_bbox = interline_equation_block[:4]
  153. text_block_bbox = text_block[:4]
  154. if calculate_iou(interline_equation_block_bbox, text_block_bbox) > 0.8:
  155. if text_block not in need_remove:
  156. need_remove.append(text_block)
  157. if len(need_remove) > 0:
  158. for block in need_remove:
  159. all_bboxes.remove(block)
  160. return all_bboxes
  161. def find_blocks_under_footnote(all_bboxes, footnote_blocks):
  162. need_remove_blocks = []
  163. for block in all_bboxes:
  164. block_x0, block_y0, block_x1, block_y1 = block[:4]
  165. for footnote_bbox in footnote_blocks:
  166. footnote_x0, footnote_y0, footnote_x1, footnote_y1 = footnote_bbox
  167. # 如果footnote的纵向投影覆盖了block的纵向投影的80%且block的y0大于等于footnote的y1
  168. if (
  169. block_y0 >= footnote_y1
  170. and calculate_vertical_projection_overlap_ratio(
  171. (block_x0, block_y0, block_x1, block_y1), footnote_bbox
  172. )
  173. >= 0.8
  174. ):
  175. if block not in need_remove_blocks:
  176. need_remove_blocks.append(block)
  177. break
  178. return need_remove_blocks
  179. def remove_overlaps_min_blocks(all_bboxes):
  180. # 重叠block,小的不能直接删除,需要和大的那个合并成一个更大的。
  181. # 删除重叠blocks中较小的那些
  182. need_remove = []
  183. for i in range(len(all_bboxes)):
  184. for j in range(i + 1, len(all_bboxes)):
  185. block1 = all_bboxes[i]
  186. block2 = all_bboxes[j]
  187. block1_bbox = block1[:4]
  188. block2_bbox = block2[:4]
  189. overlap_box = get_minbox_if_overlap_by_ratio(
  190. block1_bbox, block2_bbox, 0.8
  191. )
  192. if overlap_box is not None:
  193. # 判断哪个区块的面积更小,移除较小的区块
  194. area1 = (block1[2] - block1[0]) * (block1[3] - block1[1])
  195. area2 = (block2[2] - block2[0]) * (block2[3] - block2[1])
  196. if area1 <= area2:
  197. block_to_remove = block1
  198. large_block = block2
  199. else:
  200. block_to_remove = block2
  201. large_block = block1
  202. if block_to_remove not in need_remove:
  203. x1, y1, x2, y2 = large_block[:4]
  204. sx1, sy1, sx2, sy2 = block_to_remove[:4]
  205. x1 = min(x1, sx1)
  206. y1 = min(y1, sy1)
  207. x2 = max(x2, sx2)
  208. y2 = max(y2, sy2)
  209. large_block[:4] = [x1, y1, x2, y2]
  210. need_remove.append(block_to_remove)
  211. for block in need_remove:
  212. if block in all_bboxes:
  213. all_bboxes.remove(block)
  214. return all_bboxes