# ocr_detect_all_bboxes.py
  1. from loguru import logger
  2. from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio, calculate_overlap_area_in_bbox1_area_ratio, \
  3. calculate_iou, calculate_vertical_projection_overlap_ratio
  4. from magic_pdf.libs.drop_tag import DropTag
  5. from magic_pdf.libs.ocr_content_type import BlockType
  6. from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_block
  7. def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_blocks, text_blocks,
  8. title_blocks, interline_equation_blocks, page_w, page_h):
  9. all_bboxes = []
  10. all_discarded_blocks = []
  11. for image in img_blocks:
  12. x0, y0, x1, y1 = image['bbox']
  13. all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Image, None, None, None, None, image["score"]])
  14. for table in table_blocks:
  15. x0, y0, x1, y1 = table['bbox']
  16. all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Table, None, None, None, None, table["score"]])
  17. for text in text_blocks:
  18. x0, y0, x1, y1 = text['bbox']
  19. all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Text, None, None, None, None, text["score"]])
  20. for title in title_blocks:
  21. x0, y0, x1, y1 = title['bbox']
  22. all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Title, None, None, None, None, title["score"]])
  23. for interline_equation in interline_equation_blocks:
  24. x0, y0, x1, y1 = interline_equation['bbox']
  25. all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.InterlineEquation, None, None, None, None, interline_equation["score"]])
  26. '''block嵌套问题解决'''
  27. '''文本框与标题框重叠,优先信任文本框'''
  28. all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
  29. '''任何框体与舍弃框重叠,优先信任舍弃框'''
  30. all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)
  31. # interline_equation 与title或text框冲突的情况,分两种情况处理
  32. '''interline_equation框与文本类型框iou比较接近1的时候,信任行间公式框'''
  33. all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
  34. '''interline_equation框被包含在文本类型框内,且interline_equation比文本区块小很多时信任文本框,这时需要舍弃公式框'''
  35. # 通过后续大框套小框逻辑删除
  36. '''discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的(限定footnote)'''
  37. for discarded in discarded_blocks:
  38. x0, y0, x1, y1 = discarded['bbox']
  39. all_discarded_blocks.append([x0, y0, x1, y1, None, None, None, BlockType.Discarded, None, None, None, None, discarded["score"]])
  40. # 将footnote加入到all_bboxes中,用来计算layout
  41. if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
  42. all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Footnote, None, None, None, None, discarded["score"]])
  43. '''经过以上处理后,还存在大框套小框的情况,则删除小框'''
  44. all_bboxes = remove_overlaps_min_blocks(all_bboxes)
  45. all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
  46. '''将剩余的bbox做分离处理,防止后面分layout时出错'''
  47. all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
  48. return all_bboxes, all_discarded_blocks, drop_reasons
  49. def add_bboxes(blocks, block_type, bboxes):
  50. for block in blocks:
  51. x0, y0, x1, y1 = block['bbox']
  52. if block_type in [
  53. BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote,
  54. BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote
  55. ]:
  56. bboxes.append([x0, y0, x1, y1, None, None, None, block_type, None, None, None, None, block["score"], block["group_id"]])
  57. else:
  58. bboxes.append([x0, y0, x1, y1, None, None, None, block_type, None, None, None, None, block["score"]])
  59. def ocr_prepare_bboxes_for_layout_split_v2(
  60. img_body_blocks, img_caption_blocks, img_footnote_blocks,
  61. table_body_blocks, table_caption_blocks, table_footnote_blocks,
  62. discarded_blocks, text_blocks, title_blocks, interline_equation_blocks, page_w, page_h
  63. ):
  64. all_bboxes = []
  65. add_bboxes(img_body_blocks, BlockType.ImageBody, all_bboxes)
  66. add_bboxes(img_caption_blocks, BlockType.ImageCaption, all_bboxes)
  67. add_bboxes(img_footnote_blocks, BlockType.ImageFootnote, all_bboxes)
  68. add_bboxes(table_body_blocks, BlockType.TableBody, all_bboxes)
  69. add_bboxes(table_caption_blocks, BlockType.TableCaption, all_bboxes)
  70. add_bboxes(table_footnote_blocks, BlockType.TableFootnote, all_bboxes)
  71. add_bboxes(text_blocks, BlockType.Text, all_bboxes)
  72. add_bboxes(title_blocks, BlockType.Title, all_bboxes)
  73. add_bboxes(interline_equation_blocks, BlockType.InterlineEquation, all_bboxes)
  74. '''block嵌套问题解决'''
  75. '''文本框与标题框重叠,优先信任文本框'''
  76. all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
  77. '''任何框体与舍弃框重叠,优先信任舍弃框'''
  78. all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)
  79. # interline_equation 与title或text框冲突的情况,分两种情况处理
  80. '''interline_equation框与文本类型框iou比较接近1的时候,信任行间公式框'''
  81. all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
  82. '''interline_equation框被包含在文本类型框内,且interline_equation比文本区块小很多时信任文本框,这时需要舍弃公式框'''
  83. # 通过后续大框套小框逻辑删除
  84. '''discarded_blocks'''
  85. all_discarded_blocks = []
  86. add_bboxes(discarded_blocks, BlockType.Discarded, all_discarded_blocks)
  87. '''footnote识别:宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的'''
  88. footnote_blocks = []
  89. for discarded in discarded_blocks:
  90. x0, y0, x1, y1 = discarded['bbox']
  91. if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
  92. footnote_blocks.append([x0, y0, x1, y1])
  93. '''移除在footnote下面的任何框'''
  94. need_remove_blocks = find_blocks_under_footnote(all_bboxes, footnote_blocks)
  95. if len(need_remove_blocks) > 0:
  96. for block in need_remove_blocks:
  97. all_bboxes.remove(block)
  98. all_discarded_blocks.append(block)
  99. '''经过以上处理后,还存在大框套小框的情况,则删除小框'''
  100. all_bboxes = remove_overlaps_min_blocks(all_bboxes)
  101. all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
  102. '''将剩余的bbox做分离处理,防止后面分layout时出错'''
  103. all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
  104. return all_bboxes, all_discarded_blocks
  105. def find_blocks_under_footnote(all_bboxes, footnote_blocks):
  106. need_remove_blocks = []
  107. for block in all_bboxes:
  108. block_x0, block_y0, block_x1, block_y1 = block[:4]
  109. for footnote_bbox in footnote_blocks:
  110. footnote_x0, footnote_y0, footnote_x1, footnote_y1 = footnote_bbox
  111. # 如果footnote的纵向投影覆盖了block的纵向投影的80%且block的y0大于等于footnote的y1
  112. if block_y0 >= footnote_y1 and calculate_vertical_projection_overlap_ratio((block_x0, block_y0, block_x1, block_y1), footnote_bbox) >= 0.8:
  113. if block not in need_remove_blocks:
  114. need_remove_blocks.append(block)
  115. break
  116. return need_remove_blocks
  117. def fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes):
  118. # 先提取所有text和interline block
  119. text_blocks = []
  120. for block in all_bboxes:
  121. if block[7] == BlockType.Text:
  122. text_blocks.append(block)
  123. interline_equation_blocks = []
  124. for block in all_bboxes:
  125. if block[7] == BlockType.InterlineEquation:
  126. interline_equation_blocks.append(block)
  127. need_remove = []
  128. for interline_equation_block in interline_equation_blocks:
  129. for text_block in text_blocks:
  130. interline_equation_block_bbox = interline_equation_block[:4]
  131. text_block_bbox = text_block[:4]
  132. if calculate_iou(interline_equation_block_bbox, text_block_bbox) > 0.8:
  133. if text_block not in need_remove:
  134. need_remove.append(text_block)
  135. if len(need_remove) > 0:
  136. for block in need_remove:
  137. all_bboxes.remove(block)
  138. return all_bboxes
  139. def fix_text_overlap_title_blocks(all_bboxes):
  140. # 先提取所有text和title block
  141. text_blocks = []
  142. for block in all_bboxes:
  143. if block[7] == BlockType.Text:
  144. text_blocks.append(block)
  145. title_blocks = []
  146. for block in all_bboxes:
  147. if block[7] == BlockType.Title:
  148. title_blocks.append(block)
  149. need_remove = []
  150. for text_block in text_blocks:
  151. for title_block in title_blocks:
  152. text_block_bbox = text_block[:4]
  153. title_block_bbox = title_block[:4]
  154. if calculate_iou(text_block_bbox, title_block_bbox) > 0.8:
  155. if title_block not in need_remove:
  156. need_remove.append(title_block)
  157. if len(need_remove) > 0:
  158. for block in need_remove:
  159. all_bboxes.remove(block)
  160. return all_bboxes
  161. def remove_need_drop_blocks(all_bboxes, discarded_blocks):
  162. need_remove = []
  163. for block in all_bboxes:
  164. for discarded_block in discarded_blocks:
  165. block_bbox = block[:4]
  166. if calculate_overlap_area_in_bbox1_area_ratio(block_bbox, discarded_block['bbox']) > 0.6:
  167. if block not in need_remove:
  168. need_remove.append(block)
  169. break
  170. if len(need_remove) > 0:
  171. for block in need_remove:
  172. all_bboxes.remove(block)
  173. return all_bboxes
  174. def remove_overlaps_min_blocks(all_bboxes):
  175. # 重叠block,小的不能直接删除,需要和大的那个合并成一个更大的。
  176. # 删除重叠blocks中较小的那些
  177. need_remove = []
  178. for block1 in all_bboxes:
  179. for block2 in all_bboxes:
  180. if block1 != block2:
  181. block1_bbox = block1[:4]
  182. block2_bbox = block2[:4]
  183. overlap_box = get_minbox_if_overlap_by_ratio(block1_bbox, block2_bbox, 0.8)
  184. if overlap_box is not None:
  185. block_to_remove = next((block for block in all_bboxes if block[:4] == overlap_box), None)
  186. if block_to_remove is not None and block_to_remove not in need_remove:
  187. large_block = block1 if block1 != block_to_remove else block2
  188. x1, y1, x2, y2 = large_block[:4]
  189. sx1, sy1, sx2, sy2 = block_to_remove[:4]
  190. x1 = min(x1, sx1)
  191. y1 = min(y1, sy1)
  192. x2 = max(x2, sx2)
  193. y2 = max(y2, sy2)
  194. large_block[:4] = [x1, y1, x2, y2]
  195. need_remove.append(block_to_remove)
  196. if len(need_remove) > 0:
  197. for block in need_remove:
  198. all_bboxes.remove(block)
  199. return all_bboxes