ocr_dict_merge.py 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154
  1. from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio
  2. # 删除重叠spans中较小的那些
  3. def remove_overlaps_min_spans(spans):
  4. for span1 in spans.copy():
  5. for span2 in spans.copy():
  6. if span1 != span2:
  7. overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.8)
  8. if overlap_box is not None:
  9. bbox_to_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
  10. if bbox_to_remove is not None:
  11. spans.remove(bbox_to_remove)
  12. return spans
  13. def merge_spans_to_line(spans):
  14. # 按照y0坐标排序
  15. spans.sort(key=lambda span: span['bbox'][1])
  16. lines = []
  17. current_line = [spans[0]]
  18. for span in spans[1:]:
  19. # 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
  20. # image和table类型,同上
  21. if span['type'] in ["displayed_equation", "image", "table"] or any(s['type'] in ["displayed_equation", "image", "table"] for s in current_line):
  22. # 则开始新行
  23. lines.append(current_line)
  24. current_line = [span]
  25. continue
  26. # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
  27. if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
  28. current_line.append(span)
  29. else:
  30. # 否则,开始新行
  31. lines.append(current_line)
  32. current_line = [span]
  33. # 添加最后一行
  34. if current_line:
  35. lines.append(current_line)
  36. # 计算每行的边界框,并对每行中的span按照x0进行排序
  37. line_objects = []
  38. for line in lines:
  39. # 按照x0坐标排序
  40. line.sort(key=lambda span: span['bbox'][0])
  41. line_bbox = [
  42. min(span['bbox'][0] for span in line), # x0
  43. min(span['bbox'][1] for span in line), # y0
  44. max(span['bbox'][2] for span in line), # x1
  45. max(span['bbox'][3] for span in line), # y1
  46. ]
  47. line_objects.append({
  48. "bbox": line_bbox,
  49. "spans": line,
  50. })
  51. return line_objects
  52. def modify_y_axis(spans: list):
  53. inline_list = []
  54. displayed_list = []
  55. text_list = []
  56. image_list = []
  57. table_list = []
  58. spans.sort(key=lambda span: span['bbox'][1])
  59. lines = []
  60. current_line = [spans[0]]
  61. if spans[0]["type"] in ["displayed_equation", "image", "table"]:
  62. displayed_list.append(spans[0])
  63. line_first_y0 = spans[0]["bbox"][1]
  64. line_first_y = spans[0]["bbox"][3]
  65. #用于给行间公式搜索
  66. text_inline_lines = []
  67. for span in spans[1:]:
  68. # 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
  69. # image和table类型,同上
  70. if span['type'] in ["displayed_equation", "image", "table"] or any(
  71. s['type'] in ["displayed_equation", "image", "table"] for s in current_line):
  72. #传入
  73. if spans[0]["type"] in ["displayed_equation", "image", "table"]:
  74. displayed_list.append(span)
  75. # 则开始新行
  76. lines.append(current_line)
  77. current_line = [span]
  78. line_first_y0 = span["bbox"][1]
  79. line_first_y = span["bbox"][3]
  80. continue
  81. # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
  82. if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
  83. if span["bbox"][1] < line_first_y0:
  84. line_first_y0 = span["bbox"][1]
  85. if span["bbox"][3] > line_first_y:
  86. line_first_y = span["bbox"][3]
  87. current_line.append(span)
  88. else:
  89. # 否则,开始新行
  90. lines.append(current_line)
  91. text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
  92. current_line = [span]
  93. line_first_y0 = spans[0]["bbox"][1]
  94. line_first_y = spans[0]["bbox"][3]
  95. # 添加最后一行
  96. if current_line:
  97. lines.append(current_line)
  98. if len(current_line)>1 or current_line[0]["type"] in ["text", "inline_equation"]:
  99. text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
  100. for line in text_inline_lines:
  101. # 按照x0坐标排序
  102. current_line = line[0]
  103. current_line.sort(key=lambda span: span['bbox'][0])
  104. #调整每一个文字行内bbox统一
  105. for line in text_inline_lines:
  106. current_line, (line_first_y0, line_first_y) = line
  107. for span in current_line:
  108. span["bbox"][1] = line_first_y0
  109. span["bbox"][3] = line_first_y
  110. #错误行间公式转行内公式
  111. j = 0
  112. for i in range(len(displayed_list)):
  113. span = displayed_list[i]
  114. span_y0, span_y = span["bbox"][1], span["bbox"][3]
  115. while j < len(text_inline_lines):
  116. text_line = text_inline_lines[j]
  117. y0, y1 = text_line[1]
  118. if span_y0 < y0 and span_y > y0 or span_y0 < y1 and span_y > y1 or span_y0 < y0 and span_y > y1 and __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
  119. span["bbox"][1] = y0
  120. span["bbox"][3] = y1
  121. if span["type"] == "displayed_equation":
  122. span["type"] = "inline_equation"
  123. break
  124. elif span_y < y0 or span_y0 < y0 and span_y > y0 and not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
  125. break
  126. else:
  127. j += 1
  128. return spans