# ocr_fix_block_logic.py
  1. from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio
  2. from magic_pdf.libs.ocr_content_type import ContentType, BlockType
  3. from magic_pdf.pre_proc.ocr_dict_merge import merge_spans_to_line, line_sort_spans_by_left_to_right
  4. def merge_spans_to_block(spans: list, block_bbox: list, block_type: str):
  5. block_spans = []
  6. # 如果有img_caption,则将img_block中的text_spans放入img_caption_block中
  7. for span in spans:
  8. if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.8:
  9. block_spans.append(span)
  10. block_lines = merge_spans_to_line(block_spans)
  11. # 对line中的span进行排序
  12. sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
  13. block = {
  14. 'bbox': block_bbox,
  15. 'block_type': block_type,
  16. 'lines': sort_block_lines
  17. }
  18. return block, block_spans
  19. def make_body_block(span: dict, block_bbox: list, block_type: str):
  20. # 创建body_block
  21. body_line = {
  22. 'bbox': block_bbox,
  23. 'spans': [span],
  24. }
  25. body_block = {
  26. 'bbox': block_bbox,
  27. 'block_type': block_type,
  28. 'lines': [body_line]
  29. }
  30. return body_block
  31. def fix_image_block(block, img_blocks):
  32. block['blocks'] = []
  33. # 遍历img_blocks,找到与当前block匹配的img_block
  34. for img_block in img_blocks:
  35. if img_block['bbox'] == block['bbox']:
  36. # 创建img_body_block
  37. for span in block['spans']:
  38. if span['type'] == ContentType.Image and span['bbox'] == img_block['img_body_bbox']:
  39. # 创建img_body_block
  40. img_body_block = make_body_block(span, img_block['img_body_bbox'], BlockType.ImageBody)
  41. block['blocks'].append(img_body_block)
  42. # 从spans中移除img_body_block中已经放入的span
  43. block['spans'].remove(span)
  44. break
  45. # 根据list长度,判断img_block中是否有img_caption
  46. if len(img_block['img_caption_bbox']) > 0:
  47. img_caption_block, img_caption_spans = merge_spans_to_block(
  48. block['spans'], img_block['img_caption_bbox'], BlockType.ImageCaption
  49. )
  50. block['blocks'].append(img_caption_block)
  51. break
  52. del block['spans']
  53. return block
  54. def fix_table_block(block, table_blocks):
  55. block['blocks'] = []
  56. # 遍历table_blocks,找到与当前block匹配的table_block
  57. for table_block in table_blocks:
  58. if table_block['bbox'] == block['bbox']:
  59. # 创建table_body_block
  60. for span in block['spans']:
  61. if span['type'] == ContentType.Table and span['bbox'] == table_block['table_body_bbox']:
  62. # 创建table_body_block
  63. table_body_block = make_body_block(span, table_block['table_body_bbox'], BlockType.TableBody)
  64. block['blocks'].append(table_body_block)
  65. # 从spans中移除img_body_block中已经放入的span
  66. block['spans'].remove(span)
  67. break
  68. # 根据list长度,判断table_block中是否有caption
  69. if len(table_block['table_caption_bbox']) > 0:
  70. table_caption_block, table_caption_spans = merge_spans_to_block(
  71. block['spans'], table_block['table_caption_bbox'], BlockType.TableCaption
  72. )
  73. block['blocks'].append(table_caption_block)
  74. # 如果table_caption_block_spans不为空
  75. if len(table_caption_spans) > 0:
  76. # 一些span已经放入了caption_block中,需要从block['spans']中删除
  77. for span in table_caption_spans:
  78. block['spans'].remove(span)
  79. # 根据list长度,判断table_block中是否有table_note
  80. if len(table_block['table_footnote_bbox']) > 0:
  81. table_footnote_block, table_footnote_spans = merge_spans_to_block(
  82. block['spans'], table_block['table_footnote_bbox'], BlockType.TableFootnote
  83. )
  84. block['blocks'].append(table_footnote_block)
  85. break
  86. del block['spans']
  87. return block
  88. def fix_text_block(block):
  89. block_lines = merge_spans_to_line(block['spans'])
  90. sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
  91. block['lines'] = sort_block_lines
  92. del block['spans']
  93. return block