ocr_detect_all_bboxes.py 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108
  1. from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio, calculate_overlap_area_in_bbox1_area_ratio, \
  2. calculate_iou
  3. from magic_pdf.libs.drop_tag import DropTag
  4. from magic_pdf.libs.ocr_content_type import BlockType
  5. from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_block
  6. def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_blocks, text_blocks,
  7. title_blocks, interline_equation_blocks, page_w, page_h):
  8. all_bboxes = []
  9. all_discarded_blocks = []
  10. for image in img_blocks:
  11. x0, y0, x1, y1 = image['bbox']
  12. all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Image, None, None, None, None])
  13. for table in table_blocks:
  14. x0, y0, x1, y1 = table['bbox']
  15. all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Table, None, None, None, None])
  16. for text in text_blocks:
  17. x0, y0, x1, y1 = text['bbox']
  18. all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Text, None, None, None, None])
  19. for title in title_blocks:
  20. x0, y0, x1, y1 = title['bbox']
  21. all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Title, None, None, None, None])
  22. for interline_equation in interline_equation_blocks:
  23. x0, y0, x1, y1 = interline_equation['bbox']
  24. all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.InterlineEquation, None, None, None, None])
  25. '''block嵌套问题解决'''
  26. '''文本框与标题框重叠,优先信任文本框'''
  27. all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
  28. '''任何框体与舍弃框重叠,优先信任舍弃框'''
  29. all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)
  30. '''经过以上处理后,还存在大框套小框的情况,则删除小框'''
  31. all_bboxes = remove_overlaps_min_blocks(all_bboxes)
  32. '''将剩余的bbox做分离处理,防止后面分layout时出错'''
  33. all_bboxes = remove_overlap_between_bbox_for_block(all_bboxes)
  34. '''discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的(限定footnote)'''
  35. for discarded in discarded_blocks:
  36. x0, y0, x1, y1 = discarded['bbox']
  37. all_discarded_blocks.append([x0, y0, x1, y1, None, None, None, BlockType.Discarded, None, None, None, None])
  38. # 将footnote加入到all_bboxes中,用来计算layout
  39. if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
  40. all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Footnote, None, None, None, None])
  41. return all_bboxes, all_discarded_blocks
  42. def fix_text_overlap_title_blocks(all_bboxes):
  43. # 先提取所有text和title block
  44. text_blocks = []
  45. for block in all_bboxes:
  46. if block[7] == BlockType.Text:
  47. text_blocks.append(block)
  48. title_blocks = []
  49. for block in all_bboxes:
  50. if block[7] == BlockType.Title:
  51. title_blocks.append(block)
  52. for text_block in text_blocks:
  53. for title_block in title_blocks:
  54. text_block_bbox = text_block[:4]
  55. title_block_bbox = title_block[:4]
  56. if calculate_iou(text_block_bbox, title_block_bbox) > 0.8:
  57. all_bboxes.remove(title_block)
  58. return all_bboxes
  59. def remove_need_drop_blocks(all_bboxes, discarded_blocks):
  60. need_remove = []
  61. for block in all_bboxes:
  62. for discarded_block in discarded_blocks:
  63. block_bbox = block[:4]
  64. if calculate_overlap_area_in_bbox1_area_ratio(block_bbox, discarded_block['bbox']) > 0.6:
  65. if block not in need_remove:
  66. need_remove.append(block)
  67. break
  68. if len(need_remove) > 0:
  69. for block in need_remove:
  70. all_bboxes.remove(block)
  71. return all_bboxes
  72. def remove_overlaps_min_blocks(all_bboxes):
  73. # 删除重叠blocks中较小的那些
  74. need_remove = []
  75. for block1 in all_bboxes:
  76. for block2 in all_bboxes:
  77. if block1 != block2:
  78. block1_bbox = block1[:4]
  79. block2_bbox = block2[:4]
  80. overlap_box = get_minbox_if_overlap_by_ratio(block1_bbox, block2_bbox, 0.8)
  81. if overlap_box is not None:
  82. bbox_to_remove = next((block for block in all_bboxes if block[:4] == overlap_box), None)
  83. if bbox_to_remove is not None and bbox_to_remove not in need_remove:
  84. need_remove.append(bbox_to_remove)
  85. if len(need_remove) > 0:
  86. for block in need_remove:
  87. all_bboxes.remove(block)
  88. return all_bboxes