ocr_dict_merge.py 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185
  1. from loguru import logger
  2. from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \
  3. calculate_overlap_area_in_bbox1_area_ratio
  def line_sort_spans_by_left_to_right(lines):
      """Sort each line's spans left to right and attach the line's bounding box.

      Args:
          lines: List of lines, each a list of span dicts whose ``'bbox'`` is
              indexed as ``[x0, y0, x1, y1]``. Every inner list is sorted in
              place by ``x0``.

      Returns:
          One ``{"bbox": [x0, y0, x1, y1], "spans": line}`` dict per non-empty
          line, in input order. ``bbox`` is the union of the line's span boxes
          and ``spans`` is the very list object that was passed in.
      """
      line_objects = []
      for line in lines:
          if not line:
              # min()/max() raise ValueError on an empty sequence, and a line
              # without spans has no bounding box to report, so leave it out.
              continue
          # Order the spans by their x0 coordinate.
          line.sort(key=lambda span: span['bbox'][0])
          boxes = [span['bbox'] for span in line]
          line_objects.append({
              "bbox": [
                  min(box[0] for box in boxes),  # x0
                  min(box[1] for box in boxes),  # y0
                  max(box[2] for box in boxes),  # x1
                  max(box[3] for box in boxes),  # y1
              ],
              "spans": line,
          })
      return line_objects
  def merge_spans_to_line(spans):
      """Group spans into lines, scanning them from top to bottom.

      ``spans`` is sorted in place by its top edge (``bbox[1]``). A span joins
      the current line only when neither it nor anything already on that line
      is a displayed equation, image or table, and the imported
      ``__is_overlaps_y_exceeds_threshold`` helper accepts it against the last
      span of the line (the helper's exact threshold is defined in
      ``magic_pdf.libs.boxbase`` and is not visible here).

      Args:
          spans: List of span dicts with ``'bbox'`` (``[x0, y0, x1, y1]``) and
              ``'type'`` entries.

      Returns:
          A list of lines, each a non-empty list of spans. An empty ``spans``
          yields an empty list.
      """
      if not spans:
          # Nothing to seed the first line with; indexing spans[0] would raise.
          return []

      # Span types that always occupy a line of their own.
      standalone_types = ("displayed_equation", "image", "table")

      # Sort by the y0 coordinate.
      spans.sort(key=lambda span: span['bbox'][1])

      lines = []
      current_line = [spans[0]]
      for span in spans[1:]:
          # The overlap helper is only consulted when neither side is a
          # standalone element (short-circuit order matters here).
          starts_new_line = (
              span['type'] in standalone_types
              or any(s['type'] in standalone_types for s in current_line)
              or not __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox'])
          )
          if starts_new_line:
              lines.append(current_line)
              current_line = [span]
          else:
              current_line.append(span)

      # Flush the last line; it always holds at least one span.
      lines.append(current_line)
      return lines
  def merge_spans_to_line_by_layout(spans, layout_bboxes):
      """Assign spans to layout boxes, then build sorted lines inside each box.

      Layout boxes are processed in the given order. A span is claimed by the
      first layout for which ``calculate_overlap_area_in_bbox1_area_ratio(
      span_bbox, layout_bbox)`` exceeds 0.8 (presumably the share of the span's
      own area covered by the layout; confirm in ``magic_pdf.libs.boxbase``).

      Args:
          spans: List of span dicts. Mutated in place: every claimed span is
              removed, so afterwards it holds only the spans that matched no
              layout.
          layout_bboxes: Iterable of dicts carrying a ``'layout_bbox'`` entry.

      Returns:
          The line objects produced by ``line_sort_spans_by_left_to_right`` for
          all lines of all layouts, in layout order.
      """
      lines = []
      for item in layout_bboxes:
          layout_bbox = item['layout_bbox']

          # Split the still-unassigned spans in a single pass instead of
          # collecting matches and then calling list.remove() once per match.
          layout_spans = []
          remaining = []
          for span in spans:
              if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], layout_bbox) > 0.8:
                  layout_spans.append(span)
              else:
                  remaining.append(span)

          if layout_spans:
              # Slice assignment keeps the caller's list object and shrinks it.
              spans[:] = remaining
              lines.extend(merge_spans_to_line(layout_spans))

      # Sort the spans inside every line and attach the line bbox.
      return line_sort_spans_by_left_to_right(lines)
  def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
      """Split spans into lines and give every text line one shared vertical extent.

      ``spans`` is sorted in place by its top edge and walked once with the same
      line-breaking rule as ``merge_spans_to_line``: displayed equations, images
      and tables always sit alone, other spans join the current line when
      ``__is_overlaps_y_exceeds_threshold`` accepts them against its last span.

      Args:
          spans: List of span dicts with ``'bbox'`` (``[x0, y0, x1, y1]``) and
              ``'type'`` entries. Sorted in place; the ``bbox`` of spans that
              end up in a text line is overwritten.
          displayed_list: Output list; every displayed-equation, image or table
              span is appended to it in scan order.
          text_inline_lines: Output list; receives one
              ``(line_spans, (y0, y1))`` tuple per text line. The extent is
              taken from the span that opened the line and replaced by each
              later ``"text"`` span that joins it.

      Returns:
          None. All results are delivered through the mutated arguments. An
          empty ``spans`` leaves every argument untouched.
      """
      if not spans:
          # There is no first span to seed the scan with.
          return

      standalone_types = ("displayed_equation", "image", "table")
      text_line_types = ("text", "inline_equation")

      spans.sort(key=lambda span: span['bbox'][1])

      first = spans[0]
      current_line = [first]
      if first["type"] in standalone_types:
          displayed_list.append(first)
      line_y0, line_y1 = first["bbox"][1], first["bbox"][3]

      for span in spans[1:]:
          span_is_standalone = span['type'] in standalone_types
          if span_is_standalone or any(s['type'] in standalone_types for s in current_line):
              # A standalone element is involved on either side: close the line.
              if span_is_standalone:
                  displayed_list.append(span)
              # Record the closed line only if it is a text line, i.e. not a
              # lone standalone element.
              if len(current_line) > 1 or current_line[0]["type"] in text_line_types:
                  text_inline_lines.append((current_line, (line_y0, line_y1)))
              current_line = [span]
              line_y0, line_y1 = span["bbox"][1], span["bbox"][3]
              continue

          if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
              # Same line. Plain text spans redefine the line's vertical extent.
              if span["type"] == "text":
                  line_y0, line_y1 = span["bbox"][1], span["bbox"][3]
              current_line.append(span)
          else:
              # No sufficient overlap: the current line (which cannot contain a
              # standalone element on this path) is complete.
              text_inline_lines.append((current_line, (line_y0, line_y1)))
              current_line = [span]
              line_y0, line_y1 = span["bbox"][1], span["bbox"][3]

      # Flush the last line under the same text-line condition as above.
      if len(current_line) > 1 or current_line[0]["type"] in text_line_types:
          text_inline_lines.append((current_line, (line_y0, line_y1)))

      # Every entry of text_inline_lines is processed, including any the caller
      # passed in: order the spans by x0 and unify their vertical extent.
      for line_spans, (y0, y1) in text_inline_lines:
          line_spans.sort(key=lambda span: span['bbox'][0])
          for span in line_spans:
              span["bbox"][1] = y0
              span["bbox"][3] = y1
  def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines: list):
      """Turn displayed equations that sit on a text line into inline equations.

      Both lists are walked together with one cursor into ``text_inline_lines``
      that only moves forward, so they are presumably expected in top-to-bottom
      order (as ``modify_y_axis`` fills them).

      For each span of ``displayed_list`` the cursor advances until a text line
      is found whose top or bottom edge lies strictly inside the span's vertical
      range and which ``__is_overlaps_y_exceeds_threshold`` accepts. On a match
      the span's top edge is moved to the line's ``y0``. If the span is a
      ``"displayed_equation"`` it also becomes an ``"inline_equation"``, unless
      the following text line is accepted by the overlap helper too.

      Args:
          spans: Returned unchanged as an object; not read here.
          displayed_list: Displayed-equation, image and table spans. Their
              ``bbox[1]`` and ``type`` may be modified in place.
          text_inline_lines: ``(line_spans, (y0, y1))`` tuples; only the extent
              is read.

      Returns:
          The ``spans`` argument.
      """
      line_count = len(text_inline_lines)
      j = 0  # cursor into text_inline_lines, shared by all displayed spans
      for span in displayed_list:
          # Extent captured once; later edits to span["bbox"] do not change it.
          span_y0, span_y1 = span["bbox"][1], span["bbox"][3]
          while j < line_count:
              y0, y1 = text_inline_lines[j][1]
              # The span straddles the line's top edge, its bottom edge, or both.
              straddles_edge = (
                  (span_y0 < y0 and span_y1 > y0)
                  or (span_y0 < y1 and span_y1 > y1)
                  or (span_y0 < y0 and span_y1 > y1)
              )
              if straddles_edge and __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
                  # Align the span's top with the text line before the
                  # next-line check below, which reads the updated bbox.
                  span["bbox"][1] = y0
                  # Adjust the equation type.
                  if span["type"] == "displayed_equation":
                      if j + 1 >= line_count:
                          span["type"] = "inline_equation"
                      else:
                          y0_next, y1_next = text_inline_lines[j + 1][1]
                          if not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0_next, 0, y1_next)):
                              span["type"] = "inline_equation"
                  # Matched: keep the cursor on this line for the next span.
                  break
              if span_y1 < y0 or (
                  span_y0 < y0
                  and span_y1 > y0
                  and not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1))
              ):
                  # The span ends above this line, or crosses its top edge
                  # without enough overlap: later lines are not tried.
                  break
              # Otherwise move on to the next text line.
              j += 1
      return spans