ocr_dict_merge.py 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109
  1. from loguru import logger
  2. from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \
  3. calculate_overlap_area_in_bbox1_area_ratio
  4. from magic_pdf.libs.drop_tag import DropTag
  5. from magic_pdf.libs.ocr_content_type import ContentType
  6. # 将每一个line中的span从左到右排序
  7. def line_sort_spans_by_left_to_right(lines):
  8. line_objects = []
  9. for line in lines:
  10. # 按照x0坐标排序
  11. line.sort(key=lambda span: span['bbox'][0])
  12. line_bbox = [
  13. min(span['bbox'][0] for span in line), # x0
  14. min(span['bbox'][1] for span in line), # y0
  15. max(span['bbox'][2] for span in line), # x1
  16. max(span['bbox'][3] for span in line), # y1
  17. ]
  18. line_objects.append({
  19. "bbox": line_bbox,
  20. "spans": line,
  21. })
  22. return line_objects
  23. def merge_spans_to_line(spans):
  24. if len(spans) == 0:
  25. return []
  26. else:
  27. # 按照y0坐标排序
  28. spans.sort(key=lambda span: span['bbox'][1])
  29. lines = []
  30. current_line = [spans[0]]
  31. for span in spans[1:]:
  32. # 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
  33. # image和table类型,同上
  34. if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
  35. s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in current_line):
  36. # 则开始新行
  37. lines.append(current_line)
  38. current_line = [span]
  39. continue
  40. # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
  41. if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
  42. current_line.append(span)
  43. else:
  44. # 否则,开始新行
  45. lines.append(current_line)
  46. current_line = [span]
  47. # 添加最后一行
  48. if current_line:
  49. lines.append(current_line)
  50. return lines
  51. def merge_spans_to_line_by_layout(spans, layout_bboxes):
  52. lines = []
  53. new_spans = []
  54. dropped_spans = []
  55. for item in layout_bboxes:
  56. layout_bbox = item['layout_bbox']
  57. # 遍历spans,将每个span放入对应的layout中
  58. layout_sapns = []
  59. for span in spans:
  60. if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], layout_bbox) > 0.6:
  61. layout_sapns.append(span)
  62. # 如果layout_sapns不为空,则放入new_spans中
  63. if len(layout_sapns) > 0:
  64. new_spans.append(layout_sapns)
  65. # 从spans删除已经放入layout_sapns中的span
  66. for layout_sapn in layout_sapns:
  67. spans.remove(layout_sapn)
  68. if len(new_spans) > 0:
  69. for layout_sapns in new_spans:
  70. layout_lines = merge_spans_to_line(layout_sapns)
  71. lines.extend(layout_lines)
  72. # 对line中的span进行排序
  73. lines = line_sort_spans_by_left_to_right(lines)
  74. for span in spans:
  75. span['tag'] = DropTag.NOT_IN_LAYOUT
  76. dropped_spans.append(span)
  77. return lines, dropped_spans
  78. def merge_lines_to_block(lines):
  79. # 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
  80. blocks = []
  81. for line in lines:
  82. blocks.append(
  83. {
  84. "bbox": line["bbox"],
  85. "lines": [line],
  86. }
  87. )
  88. return blocks