ocr_dict_merge.py 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186
  1. from loguru import logger
  2. from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \
  3. calculate_overlap_area_in_bbox1_area_ratio
  4. from magic_pdf.libs.drop_tag import DropTag
  5. from magic_pdf.libs.ocr_content_type import ContentType
  6. from magic_pdf.pre_proc.ocr_fix_block_logic import fix_image_block, fix_table_block
# Sort the spans within each line from left to right.
  8. def line_sort_spans_by_left_to_right(lines):
  9. line_objects = []
  10. for line in lines:
  11. # 按照x0坐标排序
  12. line.sort(key=lambda span: span['bbox'][0])
  13. line_bbox = [
  14. min(span['bbox'][0] for span in line), # x0
  15. min(span['bbox'][1] for span in line), # y0
  16. max(span['bbox'][2] for span in line), # x1
  17. max(span['bbox'][3] for span in line), # y1
  18. ]
  19. line_objects.append({
  20. "bbox": line_bbox,
  21. "spans": line,
  22. })
  23. return line_objects
  24. def merge_spans_to_line(spans):
  25. if len(spans) == 0:
  26. return []
  27. else:
  28. # 按照y0坐标排序
  29. spans.sort(key=lambda span: span['bbox'][1])
  30. lines = []
  31. current_line = [spans[0]]
  32. for span in spans[1:]:
  33. # 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
  34. # image和table类型,同上
  35. if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
  36. s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in
  37. current_line):
  38. # 则开始新行
  39. lines.append(current_line)
  40. current_line = [span]
  41. continue
  42. # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
  43. if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
  44. current_line.append(span)
  45. else:
  46. # 否则,开始新行
  47. lines.append(current_line)
  48. current_line = [span]
  49. # 添加最后一行
  50. if current_line:
  51. lines.append(current_line)
  52. return lines
  53. def merge_spans_to_line_by_layout(spans, layout_bboxes):
  54. lines = []
  55. new_spans = []
  56. dropped_spans = []
  57. for item in layout_bboxes:
  58. layout_bbox = item['layout_bbox']
  59. # 遍历spans,将每个span放入对应的layout中
  60. layout_sapns = []
  61. for span in spans:
  62. if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], layout_bbox) > 0.6:
  63. layout_sapns.append(span)
  64. # 如果layout_sapns不为空,则放入new_spans中
  65. if len(layout_sapns) > 0:
  66. new_spans.append(layout_sapns)
  67. # 从spans删除已经放入layout_sapns中的span
  68. for layout_sapn in layout_sapns:
  69. spans.remove(layout_sapn)
  70. if len(new_spans) > 0:
  71. for layout_sapns in new_spans:
  72. layout_lines = merge_spans_to_line(layout_sapns)
  73. lines.extend(layout_lines)
  74. # 对line中的span进行排序
  75. lines = line_sort_spans_by_left_to_right(lines)
  76. for span in spans:
  77. span['tag'] = DropTag.NOT_IN_LAYOUT
  78. dropped_spans.append(span)
  79. return lines, dropped_spans
  80. def merge_lines_to_block(lines):
  81. # 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
  82. blocks = []
  83. for line in lines:
  84. blocks.append(
  85. {
  86. "bbox": line["bbox"],
  87. "lines": [line],
  88. }
  89. )
  90. return blocks
  91. def sort_blocks_by_layout(all_bboxes, layout_bboxes):
  92. new_blocks = []
  93. sort_blocks = []
  94. for item in layout_bboxes:
  95. layout_bbox = item['layout_bbox']
  96. # 遍历blocks,将每个blocks放入对应的layout中
  97. layout_blocks = []
  98. for block in all_bboxes:
  99. # 如果是footnote则跳过
  100. if block[7] == 'footnote':
  101. continue
  102. block_bbox = [block[0], block[1], block[2], block[3]]
  103. if calculate_overlap_area_in_bbox1_area_ratio(block_bbox, layout_bbox) > 0.8:
  104. layout_blocks.append(block)
  105. # 如果layout_blocks不为空,则放入new_blocks中
  106. if len(layout_blocks) > 0:
  107. new_blocks.append(layout_blocks)
  108. # 从spans删除已经放入layout_sapns中的span
  109. for layout_block in layout_blocks:
  110. all_bboxes.remove(layout_block)
  111. # 如果new_blocks不为空,则对new_blocks中每个block进行排序
  112. if len(new_blocks) > 0:
  113. for bboxes_in_layout_block in new_blocks:
  114. bboxes_in_layout_block.sort(key=lambda x: x[1]) # 一个layout内部的box,按照y0自上而下排序
  115. sort_blocks.extend(bboxes_in_layout_block)
  116. # sort_blocks中已经包含了当前页面所有最终留下的block,且已经排好了顺序
  117. return sort_blocks
  118. def fill_spans_in_blocks(blocks, spans):
  119. block_with_spans = []
  120. for block in blocks:
  121. block_type = block[7]
  122. block_bbox = block[0:4]
  123. block_dict = {
  124. 'block_type': block_type,
  125. 'bbox': block_bbox,
  126. }
  127. block_spans = []
  128. for span in spans:
  129. span_bbox = span['bbox']
  130. if calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.8:
  131. block_spans.append(span)
  132. block_dict['spans'] = block_spans
  133. block_with_spans.append(block_dict)
  134. # 从spans删除已经放入block_spans中的span
  135. if len(block_spans) > 0:
  136. for span in block_spans:
  137. spans.remove(span)
  138. return block_with_spans
  139. def fix_block_spans(block_with_spans, img_blocks, table_blocks):
  140. fix_blocks = []
  141. for block in block_with_spans:
  142. block_type = block['block_type']
  143. # 只有type为image_block和table_block才需要处理
  144. if block_type == 'image_block':
  145. block = fix_image_block(block, img_blocks)
  146. elif block_type == 'table_block':
  147. block = fix_table_block(block, table_blocks)
  148. elif block_type == 'text_block':
  149. pass
  150. elif block_type == 'title_block':
  151. pass
  152. elif block_type == 'interline_equation_block':
  153. pass
  154. else:
  155. continue
  156. fix_blocks.append(block)
  157. return fix_blocks