ocr_detect_all_bboxes.py 4.1 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192
  1. from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio, calculate_overlap_area_in_bbox1_area_ratio, \
  2. calculate_iou
  3. from magic_pdf.libs.drop_tag import DropTag
  4. from magic_pdf.libs.ocr_content_type import BlockType
  5. def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_blocks, text_blocks,
  6. title_blocks, interline_equation_blocks, page_w, page_h):
  7. all_bboxes = []
  8. for image in img_blocks:
  9. x0, y0, x1, y1 = image['bbox']
  10. all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Image, None, None, None, None])
  11. for table in table_blocks:
  12. x0, y0, x1, y1 = table['bbox']
  13. all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Table, None, None, None, None])
  14. for text in text_blocks:
  15. x0, y0, x1, y1 = text['bbox']
  16. all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Text, None, None, None, None])
  17. for title in title_blocks:
  18. x0, y0, x1, y1 = title['bbox']
  19. all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Title, None, None, None, None])
  20. for interline_equation in interline_equation_blocks:
  21. x0, y0, x1, y1 = interline_equation['bbox']
  22. all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.InterlineEquation, None, None, None, None])
  23. '''block嵌套问题解决'''
  24. '''文本框与标题框重叠,优先信任标题框'''
  25. all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
  26. '''任何框体与舍弃框重叠,优先信任舍弃框'''
  27. all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)
  28. '''经过以上处理后,还存在大框套小框的情况,则删除小框'''
  29. all_bboxes = remove_overlaps_min_blocks(all_bboxes)
  30. '''discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的(限定footnote)'''
  31. for discarded in discarded_blocks:
  32. x0, y0, x1, y1 = discarded['bbox']
  33. if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
  34. all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Footnote, None, None, None, None])
  35. return all_bboxes
  36. def fix_text_overlap_title_blocks(all_bboxes):
  37. # 先提取所有text和title block
  38. text_blocks = []
  39. for block in all_bboxes:
  40. if block[7] == BlockType.Text:
  41. text_blocks.append(block)
  42. title_blocks = []
  43. for block in all_bboxes:
  44. if block[7] == BlockType.Title:
  45. title_blocks.append(block)
  46. for text_block in text_blocks:
  47. for title_block in title_blocks:
  48. text_block_bbox = text_block[0], text_block[1], text_block[2], text_block[3]
  49. title_block_bbox = title_block[0], title_block[1], title_block[2], title_block[3]
  50. if calculate_iou(text_block_bbox, title_block_bbox) > 0.8:
  51. all_bboxes.remove(text_block)
  52. return all_bboxes
  53. def remove_need_drop_blocks(all_bboxes, discarded_blocks):
  54. for block in all_bboxes.copy():
  55. for discarded_block in discarded_blocks:
  56. block_bbox = block[0], block[1], block[2], block[3]
  57. if calculate_overlap_area_in_bbox1_area_ratio(block_bbox, discarded_block['bbox']) > 0.6:
  58. all_bboxes.remove(block)
  59. return all_bboxes
  60. def remove_overlaps_min_blocks(all_bboxes):
  61. # 删除重叠blocks中较小的那些
  62. for block1 in all_bboxes.copy():
  63. for block2 in all_bboxes.copy():
  64. if block1 != block2:
  65. block1_bbox = [block1[0], block1[1], block1[2], block1[3]]
  66. block2_bbox = [block2[0], block2[1], block2[2], block2[3]]
  67. overlap_box = get_minbox_if_overlap_by_ratio(block1_bbox, block2_bbox, 0.8)
  68. if overlap_box is not None:
  69. bbox_to_remove = next(
  70. (block for block in all_bboxes if [block[0], block[1], block[2], block[3]] == overlap_box),
  71. None)
  72. if bbox_to_remove is not None:
  73. all_bboxes.remove(bbox_to_remove)
  74. return all_bboxes