ocr_detect_all_bboxes.py 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257
  1. from magic_pdf.config.ocr_content_type import BlockType
  2. from magic_pdf.libs.boxbase import (
  3. calculate_iou,
  4. calculate_overlap_area_in_bbox1_area_ratio,
  5. calculate_vertical_projection_overlap_ratio,
  6. get_minbox_if_overlap_by_ratio
  7. )
  8. from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_block
  9. def add_bboxes(blocks, block_type, bboxes):
  10. for block in blocks:
  11. x0, y0, x1, y1 = block['bbox']
  12. if block_type in [
  13. BlockType.ImageBody,
  14. BlockType.ImageCaption,
  15. BlockType.ImageFootnote,
  16. BlockType.TableBody,
  17. BlockType.TableCaption,
  18. BlockType.TableFootnote,
  19. ]:
  20. bboxes.append(
  21. [
  22. x0,
  23. y0,
  24. x1,
  25. y1,
  26. None,
  27. None,
  28. None,
  29. block_type,
  30. None,
  31. None,
  32. None,
  33. None,
  34. block['score'],
  35. block['group_id'],
  36. ]
  37. )
  38. else:
  39. bboxes.append(
  40. [
  41. x0,
  42. y0,
  43. x1,
  44. y1,
  45. None,
  46. None,
  47. None,
  48. block_type,
  49. None,
  50. None,
  51. None,
  52. None,
  53. block['score'],
  54. ]
  55. )
  56. def ocr_prepare_bboxes_for_layout_split_v2(
  57. img_body_blocks,
  58. img_caption_blocks,
  59. img_footnote_blocks,
  60. table_body_blocks,
  61. table_caption_blocks,
  62. table_footnote_blocks,
  63. discarded_blocks,
  64. text_blocks,
  65. title_blocks,
  66. interline_equation_blocks,
  67. page_w,
  68. page_h,
  69. ):
  70. all_bboxes = []
  71. add_bboxes(img_body_blocks, BlockType.ImageBody, all_bboxes)
  72. add_bboxes(img_caption_blocks, BlockType.ImageCaption, all_bboxes)
  73. add_bboxes(img_footnote_blocks, BlockType.ImageFootnote, all_bboxes)
  74. add_bboxes(table_body_blocks, BlockType.TableBody, all_bboxes)
  75. add_bboxes(table_caption_blocks, BlockType.TableCaption, all_bboxes)
  76. add_bboxes(table_footnote_blocks, BlockType.TableFootnote, all_bboxes)
  77. add_bboxes(text_blocks, BlockType.Text, all_bboxes)
  78. add_bboxes(title_blocks, BlockType.Title, all_bboxes)
  79. add_bboxes(interline_equation_blocks, BlockType.InterlineEquation, all_bboxes)
  80. """block嵌套问题解决"""
  81. """文本框与标题框重叠,优先信任文本框"""
  82. all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
  83. """任何框体与舍弃框重叠,优先信任舍弃框"""
  84. all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)
  85. # interline_equation 与title或text框冲突的情况,分两种情况处理
  86. """interline_equation框与文本类型框iou比较接近1的时候,信任行间公式框"""
  87. all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
  88. """interline_equation框被包含在文本类型框内,且interline_equation比文本区块小很多时信任文本框,这时需要舍弃公式框"""
  89. # 通过后续大框套小框逻辑删除
  90. """discarded_blocks"""
  91. all_discarded_blocks = []
  92. add_bboxes(discarded_blocks, BlockType.Discarded, all_discarded_blocks)
  93. """footnote识别:宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的"""
  94. footnote_blocks = []
  95. for discarded in discarded_blocks:
  96. x0, y0, x1, y1 = discarded['bbox']
  97. if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
  98. footnote_blocks.append([x0, y0, x1, y1])
  99. """移除在footnote下面的任何框"""
  100. need_remove_blocks = find_blocks_under_footnote(all_bboxes, footnote_blocks)
  101. if len(need_remove_blocks) > 0:
  102. for block in need_remove_blocks:
  103. all_bboxes.remove(block)
  104. all_discarded_blocks.append(block)
  105. """经过以上处理后,还存在大框套小框的情况,则删除小框"""
  106. all_bboxes = remove_overlaps_min_blocks(all_bboxes)
  107. all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
  108. """将剩余的bbox做分离处理,防止后面分layout时出错"""
  109. all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
  110. return all_bboxes, all_discarded_blocks
  111. def find_blocks_under_footnote(all_bboxes, footnote_blocks):
  112. need_remove_blocks = []
  113. for block in all_bboxes:
  114. block_x0, block_y0, block_x1, block_y1 = block[:4]
  115. for footnote_bbox in footnote_blocks:
  116. footnote_x0, footnote_y0, footnote_x1, footnote_y1 = footnote_bbox
  117. # 如果footnote的纵向投影覆盖了block的纵向投影的80%且block的y0大于等于footnote的y1
  118. if (
  119. block_y0 >= footnote_y1
  120. and calculate_vertical_projection_overlap_ratio(
  121. (block_x0, block_y0, block_x1, block_y1), footnote_bbox
  122. )
  123. >= 0.8
  124. ):
  125. if block not in need_remove_blocks:
  126. need_remove_blocks.append(block)
  127. break
  128. return need_remove_blocks
  129. def fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes):
  130. # 先提取所有text和interline block
  131. text_blocks = []
  132. for block in all_bboxes:
  133. if block[7] == BlockType.Text:
  134. text_blocks.append(block)
  135. interline_equation_blocks = []
  136. for block in all_bboxes:
  137. if block[7] == BlockType.InterlineEquation:
  138. interline_equation_blocks.append(block)
  139. need_remove = []
  140. for interline_equation_block in interline_equation_blocks:
  141. for text_block in text_blocks:
  142. interline_equation_block_bbox = interline_equation_block[:4]
  143. text_block_bbox = text_block[:4]
  144. if calculate_iou(interline_equation_block_bbox, text_block_bbox) > 0.8:
  145. if text_block not in need_remove:
  146. need_remove.append(text_block)
  147. if len(need_remove) > 0:
  148. for block in need_remove:
  149. all_bboxes.remove(block)
  150. return all_bboxes
  151. def fix_text_overlap_title_blocks(all_bboxes):
  152. # 先提取所有text和title block
  153. text_blocks = []
  154. for block in all_bboxes:
  155. if block[7] == BlockType.Text:
  156. text_blocks.append(block)
  157. title_blocks = []
  158. for block in all_bboxes:
  159. if block[7] == BlockType.Title:
  160. title_blocks.append(block)
  161. need_remove = []
  162. for text_block in text_blocks:
  163. for title_block in title_blocks:
  164. text_block_bbox = text_block[:4]
  165. title_block_bbox = title_block[:4]
  166. if calculate_iou(text_block_bbox, title_block_bbox) > 0.8:
  167. if title_block not in need_remove:
  168. need_remove.append(title_block)
  169. if len(need_remove) > 0:
  170. for block in need_remove:
  171. all_bboxes.remove(block)
  172. return all_bboxes
  173. def remove_need_drop_blocks(all_bboxes, discarded_blocks):
  174. need_remove = []
  175. for block in all_bboxes:
  176. for discarded_block in discarded_blocks:
  177. block_bbox = block[:4]
  178. if (
  179. calculate_overlap_area_in_bbox1_area_ratio(
  180. block_bbox, discarded_block['bbox']
  181. )
  182. > 0.6
  183. ):
  184. if block not in need_remove:
  185. need_remove.append(block)
  186. break
  187. if len(need_remove) > 0:
  188. for block in need_remove:
  189. all_bboxes.remove(block)
  190. return all_bboxes
  191. def remove_overlaps_min_blocks(all_bboxes):
  192. # 重叠block,小的不能直接删除,需要和大的那个合并成一个更大的。
  193. # 删除重叠blocks中较小的那些
  194. need_remove = []
  195. for block1 in all_bboxes:
  196. for block2 in all_bboxes:
  197. if block1 != block2:
  198. block1_bbox = block1[:4]
  199. block2_bbox = block2[:4]
  200. overlap_box = get_minbox_if_overlap_by_ratio(
  201. block1_bbox, block2_bbox, 0.8
  202. )
  203. if overlap_box is not None:
  204. block_to_remove = next(
  205. (block for block in all_bboxes if block[:4] == overlap_box),
  206. None,
  207. )
  208. if (
  209. block_to_remove is not None
  210. and block_to_remove not in need_remove
  211. ):
  212. large_block = block1 if block1 != block_to_remove else block2
  213. x1, y1, x2, y2 = large_block[:4]
  214. sx1, sy1, sx2, sy2 = block_to_remove[:4]
  215. x1 = min(x1, sx1)
  216. y1 = min(y1, sy1)
  217. x2 = max(x2, sx2)
  218. y2 = max(y2, sy2)
  219. large_block[:4] = [x1, y1, x2, y2]
  220. need_remove.append(block_to_remove)
  221. if len(need_remove) > 0:
  222. for block in need_remove:
  223. all_bboxes.remove(block)
  224. return all_bboxes