ocr_detect_all_bboxes.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430
  1. from magic_pdf.config.ocr_content_type import BlockType
  2. from magic_pdf.libs.boxbase import (
  3. calculate_iou, calculate_overlap_area_in_bbox1_area_ratio,
  4. calculate_vertical_projection_overlap_ratio,
  5. get_minbox_if_overlap_by_ratio)
  6. from magic_pdf.pre_proc.remove_bbox_overlap import \
  7. remove_overlap_between_bbox_for_block
  def ocr_prepare_bboxes_for_layout_split(
      img_blocks,
      table_blocks,
      discarded_blocks,
      text_blocks,
      title_blocks,
      interline_equation_blocks,
      page_w,
      page_h,
  ):
      """Flatten detected blocks into bbox rows and resolve overlaps before layout split.

      Every row produced here has the form
      ``[x0, y0, x1, y1, None, None, None, block_type, None, None, None, None, score]``.

      Args:
          img_blocks: Dicts with ``'bbox'`` (unpacked as ``x0, y0, x1, y1``) and
              ``'score'``; emitted as ``BlockType.Image`` rows.
          table_blocks: Same shape; emitted as ``BlockType.Table`` rows.
          discarded_blocks: Same shape; emitted as ``BlockType.Discarded`` rows in
              the separate discarded list, and used to drop overlapping blocks.
          text_blocks: Same shape; emitted as ``BlockType.Text`` rows.
          title_blocks: Same shape; emitted as ``BlockType.Title`` rows.
          interline_equation_blocks: Same shape; emitted as
              ``BlockType.InterlineEquation`` rows.
          page_w: Page width, used for the footnote width threshold.
          page_h: Page height, used for the footnote position threshold.

      Returns:
          ``(all_bboxes, all_discarded_blocks, drop_reasons)`` where
          ``drop_reasons`` is the second value returned by
          ``remove_overlap_between_bbox_for_block``.
      """
      all_bboxes = []
      # add_bboxes builds exactly the 13-field row used here for every block type
      # that is not an image/table body, caption or footnote.
      for blocks, block_type in (
          (img_blocks, BlockType.Image),
          (table_blocks, BlockType.Table),
          (text_blocks, BlockType.Text),
          (title_blocks, BlockType.Title),
          (interline_equation_blocks, BlockType.InterlineEquation),
      ):
          add_bboxes(blocks, block_type, all_bboxes)

      # --- Resolve nested / conflicting blocks ---
      # Text block overlapping a title block: trust the text block.
      all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
      # Any block overlapping a discarded block: trust the discarded block.
      all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)

      # Interline equations conflicting with title/text blocks come in two flavours.
      # 1) IoU with a text block is close to 1: trust the interline equation.
      all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
      # 2) The equation sits inside a much larger text block: trust the text block.
      #    The equation is dropped later by the "large box contains small box" pass.

      all_discarded_blocks = []
      add_bboxes(discarded_blocks, BlockType.Discarded, all_discarded_blocks)

      # Discarded blocks that are wider than a third of the page, taller than 10
      # and start in the lower half of the page are treated as footnotes; they are
      # added to all_bboxes so they take part in the layout computation.
      for discarded in discarded_blocks:
          x0, y0, x1, y1 = discarded['bbox']
          if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
              all_bboxes.append(
                  [
                      x0,
                      y0,
                      x1,
                      y1,
                      None,
                      None,
                      None,
                      BlockType.Footnote,
                      None,
                      None,
                      None,
                      None,
                      discarded['score'],
                  ]
              )

      # After the steps above a large box may still contain a small one:
      # remove the small box.
      all_bboxes = remove_overlaps_min_blocks(all_bboxes)
      all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
      # Separate the remaining boxes so the later layout split does not fail.
      all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
      return all_bboxes, all_discarded_blocks, drop_reasons
  def add_bboxes(blocks, block_type, bboxes):
      """Append one flat bbox row per block to ``bboxes`` (mutated in place).

      Each row is
      ``[x0, y0, x1, y1, None, None, None, block_type, None, None, None, None, score]``.
      For image/table body, caption and footnote types the block's ``'group_id'``
      is appended as an extra 14th field.

      Args:
          blocks: Iterable of dicts with ``'bbox'`` (unpacked as ``x0, y0, x1, y1``)
              and ``'score'``; grouped types must also carry ``'group_id'``.
          block_type: The ``BlockType`` value stored at index 7 of every row.
          bboxes: List that receives the new rows.

      Returns:
          None. ``bboxes`` is extended in place.
      """
      # The block type is the same for the whole call, so decide once whether
      # rows carry a group id instead of re-testing it for every block.
      carries_group_id = block_type in (
          BlockType.ImageBody,
          BlockType.ImageCaption,
          BlockType.ImageFootnote,
          BlockType.TableBody,
          BlockType.TableCaption,
          BlockType.TableFootnote,
      )
      for block in blocks:
          x0, y0, x1, y1 = block['bbox']
          row = [
              x0,
              y0,
              x1,
              y1,
              None,
              None,
              None,
              block_type,
              None,
              None,
              None,
              None,
              block['score'],
          ]
          if carries_group_id:
              row.append(block['group_id'])
          bboxes.append(row)
  def ocr_prepare_bboxes_for_layout_split_v2(
      img_body_blocks,
      img_caption_blocks,
      img_footnote_blocks,
      table_body_blocks,
      table_caption_blocks,
      table_footnote_blocks,
      discarded_blocks,
      text_blocks,
      title_blocks,
      interline_equation_blocks,
      page_w,
      page_h,
  ):
      """Flatten detected blocks into bbox rows and resolve overlaps (v2).

      Rows are built by ``add_bboxes``; image/table body, caption and footnote
      rows therefore carry an extra trailing ``group_id`` field.

      Args:
          img_body_blocks, img_caption_blocks, img_footnote_blocks,
          table_body_blocks, table_caption_blocks, table_footnote_blocks:
              Dicts with ``'bbox'``, ``'score'`` and ``'group_id'``.
          discarded_blocks: Dicts with ``'bbox'`` and ``'score'``; used to drop
              overlapping blocks and to locate footnote regions.
          text_blocks, title_blocks, interline_equation_blocks: Dicts with
              ``'bbox'`` and ``'score'``.
          page_w: Page width, used for the footnote width threshold.
          page_h: Page height, used for the footnote position threshold.

      Returns:
          ``(all_bboxes, all_discarded_blocks)``. Unlike the non-v2 variant, the
          drop reasons reported by ``remove_overlap_between_bbox_for_block`` are
          not returned.
      """
      all_bboxes = []
      add_bboxes(img_body_blocks, BlockType.ImageBody, all_bboxes)
      add_bboxes(img_caption_blocks, BlockType.ImageCaption, all_bboxes)
      add_bboxes(img_footnote_blocks, BlockType.ImageFootnote, all_bboxes)
      add_bboxes(table_body_blocks, BlockType.TableBody, all_bboxes)
      add_bboxes(table_caption_blocks, BlockType.TableCaption, all_bboxes)
      add_bboxes(table_footnote_blocks, BlockType.TableFootnote, all_bboxes)
      add_bboxes(text_blocks, BlockType.Text, all_bboxes)
      add_bboxes(title_blocks, BlockType.Title, all_bboxes)
      add_bboxes(interline_equation_blocks, BlockType.InterlineEquation, all_bboxes)

      # --- Resolve nested / conflicting blocks ---
      # Text block overlapping a title block: trust the text block.
      all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
      # Any block overlapping a discarded block: trust the discarded block.
      all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)

      # Interline equations conflicting with title/text blocks come in two flavours.
      # 1) IoU with a text block is close to 1: trust the interline equation.
      all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
      # 2) The equation sits inside a much larger text block: trust the text block.
      #    The equation is dropped later by the "large box contains small box" pass.

      all_discarded_blocks = []
      add_bboxes(discarded_blocks, BlockType.Discarded, all_discarded_blocks)

      # Footnote detection: discarded blocks wider than a third of the page,
      # taller than 10 and starting in the lower half of the page.
      footnote_blocks = []
      for discarded in discarded_blocks:
          x0, y0, x1, y1 = discarded['bbox']
          if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
              footnote_blocks.append([x0, y0, x1, y1])

      # Anything located below a footnote is moved to the discarded list.
      for block in find_blocks_under_footnote(all_bboxes, footnote_blocks):
          all_bboxes.remove(block)
          all_discarded_blocks.append(block)

      # After the steps above a large box may still contain a small one:
      # remove the small box.
      all_bboxes = remove_overlaps_min_blocks(all_bboxes)
      all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
      # Separate the remaining boxes so the later layout split does not fail.
      # The drop reasons are not part of this function's return value.
      all_bboxes, _ = remove_overlap_between_bbox_for_block(all_bboxes)
      return all_bboxes, all_discarded_blocks
  def find_blocks_under_footnote(all_bboxes, footnote_blocks):
      """Return the blocks that lie below a footnote region.

      A block qualifies when, for at least one footnote, its ``y0`` is greater
      than or equal to the footnote's ``y1`` and
      ``calculate_vertical_projection_overlap_ratio(block_bbox, footnote_bbox)``
      is at least 0.8.

      Args:
          all_bboxes: Rows whose first four fields are ``x0, y0, x1, y1``.
          footnote_blocks: Footnote boxes given as ``[x0, y0, x1, y1]``.

      Returns:
          A new list with the qualifying rows in input order, without
          equal-valued duplicates. ``all_bboxes`` itself is not modified.
      """
      need_remove_blocks = []
      for block in all_bboxes:
          block_bbox = tuple(block[:4])
          block_y0 = block_bbox[1]
          # The cheap position check comes first so the projection ratio is only
          # computed for blocks that really start at or below the footnote's end.
          is_under_footnote = any(
              block_y0 >= footnote_bbox[3]
              and calculate_vertical_projection_overlap_ratio(block_bbox, footnote_bbox)
              >= 0.8
              for footnote_bbox in footnote_blocks
          )
          if is_under_footnote and block not in need_remove_blocks:
              need_remove_blocks.append(block)
      return need_remove_blocks
  def fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes):
      """Drop text blocks that almost coincide with an interline equation block.

      When ``calculate_iou`` between an interline-equation box and a text box
      exceeds 0.8, the equation is trusted and the text row is removed.

      Args:
          all_bboxes: Rows with ``x0, y0, x1, y1`` in the first four fields and
              the block type at index 7. Modified in place.

      Returns:
          The same ``all_bboxes`` list, after removal.
      """
      text_blocks = [block for block in all_bboxes if block[7] == BlockType.Text]
      # Only the coordinates of the equations are needed; slice them once.
      equation_bboxes = [
          block[:4] for block in all_bboxes if block[7] == BlockType.InterlineEquation
      ]

      need_remove = []
      for equation_bbox in equation_bboxes:
          for text_block in text_blocks:
              if (
                  calculate_iou(equation_bbox, text_block[:4]) > 0.8
                  and text_block not in need_remove
              ):
                  need_remove.append(text_block)

      for block in need_remove:
          all_bboxes.remove(block)
      return all_bboxes
  def fix_text_overlap_title_blocks(all_bboxes):
      """Drop title blocks that almost coincide with a text block.

      When ``calculate_iou`` between a text box and a title box exceeds 0.8, the
      text block is trusted and the title row is removed.

      Args:
          all_bboxes: Rows with ``x0, y0, x1, y1`` in the first four fields and
              the block type at index 7. Modified in place.

      Returns:
          The same ``all_bboxes`` list, after removal.
      """
      # Only the coordinates of the text blocks are needed; slice them once.
      text_bboxes = [block[:4] for block in all_bboxes if block[7] == BlockType.Text]
      title_blocks = [block for block in all_bboxes if block[7] == BlockType.Title]

      need_remove = []
      for text_bbox in text_bboxes:
          for title_block in title_blocks:
              if (
                  calculate_iou(text_bbox, title_block[:4]) > 0.8
                  and title_block not in need_remove
              ):
                  need_remove.append(title_block)

      for block in need_remove:
          all_bboxes.remove(block)
      return all_bboxes
  def remove_need_drop_blocks(all_bboxes, discarded_blocks):
      """Remove every block that overlaps a discarded block too much.

      A block is removed when
      ``calculate_overlap_area_in_bbox1_area_ratio(block_bbox, discarded_bbox)``
      exceeds 0.6 for at least one discarded block (presumably the share of the
      block's own area covered by the discarded box; confirm against the helper).

      Args:
          all_bboxes: Rows whose first four fields are ``x0, y0, x1, y1``.
              Modified in place.
          discarded_blocks: Dicts providing a ``'bbox'`` entry.

      Returns:
          The same ``all_bboxes`` list, after removal.
      """
      need_remove = []
      for block in all_bboxes:
          block_bbox = block[:4]
          overlaps_discarded = any(
              calculate_overlap_area_in_bbox1_area_ratio(
                  block_bbox, discarded_block['bbox']
              )
              > 0.6
              for discarded_block in discarded_blocks
          )
          if overlaps_discarded and block not in need_remove:
              need_remove.append(block)

      for block in need_remove:
          all_bboxes.remove(block)
      return all_bboxes
  def remove_overlaps_min_blocks(all_bboxes):
      """Merge each heavily overlapped small block into its larger partner.

      For every ordered pair of unequal rows,
      ``get_minbox_if_overlap_by_ratio(bbox1, bbox2, 0.8)`` is asked for the
      smaller box. If it returns one, the first row in ``all_bboxes`` with those
      coordinates is scheduled for removal and the other row of the pair is
      grown in place to the union of both boxes, so the small block is merged
      rather than simply deleted.

      Args:
          all_bboxes: Rows whose first four fields are ``x0, y0, x1, y1``.
              Rows are modified in place and the list itself is shrunk.

      Returns:
          The same ``all_bboxes`` list, after merging and removal.
      """
      scheduled = []
      for first in all_bboxes:
          for second in all_bboxes:
              if first == second:
                  continue
              # Slices are taken on every iteration on purpose: an earlier merge
              # may already have enlarged either row.
              smaller_bbox = get_minbox_if_overlap_by_ratio(first[:4], second[:4], 0.8)
              if smaller_bbox is None:
                  continue
              victim = next(
                  (row for row in all_bboxes if row[:4] == smaller_bbox),
                  None,
              )
              if victim is None or victim in scheduled:
                  continue
              keeper = first if first != victim else second
              kx0, ky0, kx1, ky1 = keeper[:4]
              vx0, vy0, vx1, vy1 = victim[:4]
              # Grow the surviving row to the union of both boxes.
              keeper[:4] = [
                  min(kx0, vx0),
                  min(ky0, vy0),
                  max(kx1, vx1),
                  max(ky1, vy1),
              ]
              scheduled.append(victim)

      for row in scheduled:
          all_bboxes.remove(row)
      return all_bboxes
  385. return all_bboxes