# ocr_dict_merge.py (5.6 KB)
  1. from magic_pdf.config.ocr_content_type import BlockType, ContentType
  2. from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, calculate_overlap_area_in_bbox1_area_ratio
  3. # 将每一个line中的span从左到右排序
  4. def line_sort_spans_by_left_to_right(lines):
  5. line_objects = []
  6. for line in lines:
  7. # 按照x0坐标排序
  8. line.sort(key=lambda span: span['bbox'][0])
  9. line_bbox = [
  10. min(span['bbox'][0] for span in line), # x0
  11. min(span['bbox'][1] for span in line), # y0
  12. max(span['bbox'][2] for span in line), # x1
  13. max(span['bbox'][3] for span in line), # y1
  14. ]
  15. line_objects.append({
  16. 'bbox': line_bbox,
  17. 'spans': line,
  18. })
  19. return line_objects
  20. def merge_spans_to_line(spans, threshold=0.6):
  21. if len(spans) == 0:
  22. return []
  23. else:
  24. # 按照y0坐标排序
  25. spans.sort(key=lambda span: span['bbox'][1])
  26. lines = []
  27. current_line = [spans[0]]
  28. for span in spans[1:]:
  29. # 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
  30. # image和table类型,同上
  31. if span['type'] in [
  32. ContentType.InterlineEquation, ContentType.Image,
  33. ContentType.Table
  34. ] or any(s['type'] in [
  35. ContentType.InterlineEquation, ContentType.Image,
  36. ContentType.Table
  37. ] for s in current_line):
  38. # 则开始新行
  39. lines.append(current_line)
  40. current_line = [span]
  41. continue
  42. # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
  43. if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox'], threshold):
  44. current_line.append(span)
  45. else:
  46. # 否则,开始新行
  47. lines.append(current_line)
  48. current_line = [span]
  49. # 添加最后一行
  50. if current_line:
  51. lines.append(current_line)
  52. return lines
  53. def fill_spans_in_blocks(blocks, spans, radio):
  54. """将allspans中的span按位置关系,放入blocks中."""
  55. block_with_spans = []
  56. for block in blocks:
  57. block_type = block[7]
  58. block_bbox = block[0:4]
  59. block_dict = {
  60. 'type': block_type,
  61. 'bbox': block_bbox,
  62. }
  63. if block_type in [
  64. BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote,
  65. BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote
  66. ]:
  67. block_dict['group_id'] = block[-1]
  68. block_spans = []
  69. for span in spans:
  70. span_bbox = span['bbox']
  71. if calculate_overlap_area_in_bbox1_area_ratio(
  72. span_bbox, block_bbox) > radio:
  73. block_spans.append(span)
  74. '''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
  75. # displayed_list = []
  76. # text_inline_lines = []
  77. # modify_y_axis(block_spans, displayed_list, text_inline_lines)
  78. '''模型识别错误的行间公式, type类型转换成行内公式'''
  79. # block_spans = modify_inline_equation(block_spans, displayed_list, text_inline_lines)
  80. '''bbox去除粘连''' # 去粘连会影响span的bbox,导致后续fill的时候出错
  81. # block_spans = remove_overlap_between_bbox_for_span(block_spans)
  82. block_dict['spans'] = block_spans
  83. block_with_spans.append(block_dict)
  84. # 从spans删除已经放入block_spans中的span
  85. if len(block_spans) > 0:
  86. for span in block_spans:
  87. spans.remove(span)
  88. return block_with_spans, spans
  89. def fix_block_spans_v2(block_with_spans):
  90. """1、img_block和table_block因为包含caption和footnote的关系,存在block的嵌套关系
  91. 需要将caption和footnote的text_span放入相应img_block和table_block内的
  92. caption_block和footnote_block中 2、同时需要删除block中的spans字段."""
  93. fix_blocks = []
  94. for block in block_with_spans:
  95. block_type = block['type']
  96. if block_type in [BlockType.Text, BlockType.Title,
  97. BlockType.ImageCaption, BlockType.ImageFootnote,
  98. BlockType.TableCaption, BlockType.TableFootnote
  99. ]:
  100. block = fix_text_block(block)
  101. elif block_type in [BlockType.InterlineEquation, BlockType.ImageBody, BlockType.TableBody]:
  102. block = fix_interline_block(block)
  103. else:
  104. continue
  105. fix_blocks.append(block)
  106. return fix_blocks
  107. def fix_discarded_block(discarded_block_with_spans):
  108. fix_discarded_blocks = []
  109. for block in discarded_block_with_spans:
  110. block = fix_text_block(block)
  111. fix_discarded_blocks.append(block)
  112. return fix_discarded_blocks
  113. def fix_text_block(block):
  114. # 文本block中的公式span都应该转换成行内type
  115. for span in block['spans']:
  116. if span['type'] == ContentType.InterlineEquation:
  117. span['type'] = ContentType.InlineEquation
  118. block_lines = merge_spans_to_line(block['spans'])
  119. sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
  120. block['lines'] = sort_block_lines
  121. del block['spans']
  122. return block
  123. def fix_interline_block(block):
  124. block_lines = merge_spans_to_line(block['spans'])
  125. sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
  126. block['lines'] = sort_block_lines
  127. del block['spans']
  128. return block