ocr_dict_merge.py 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181
  1. from loguru import logger
  2. from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \
  3. calculate_overlap_area_in_bbox1_area_ratio
  4. # 将每一个line中的span从左到右排序
  5. def line_sort_spans_by_left_to_right(lines):
  6. line_objects = []
  7. for line in lines:
  8. # 按照x0坐标排序
  9. line.sort(key=lambda span: span['bbox'][0])
  10. line_bbox = [
  11. min(span['bbox'][0] for span in line), # x0
  12. min(span['bbox'][1] for span in line), # y0
  13. max(span['bbox'][2] for span in line), # x1
  14. max(span['bbox'][3] for span in line), # y1
  15. ]
  16. line_objects.append({
  17. "bbox": line_bbox,
  18. "spans": line,
  19. })
  20. return line_objects
  21. def merge_spans_to_line(spans):
  22. # 按照y0坐标排序
  23. spans.sort(key=lambda span: span['bbox'][1])
  24. lines = []
  25. current_line = [spans[0]]
  26. for span in spans[1:]:
  27. # 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
  28. # image和table类型,同上
  29. if span['type'] in ["displayed_equation", "image", "table"] or any(
  30. s['type'] in ["displayed_equation", "image", "table"] for s in current_line):
  31. # 则开始新行
  32. lines.append(current_line)
  33. current_line = [span]
  34. continue
  35. # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
  36. if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
  37. current_line.append(span)
  38. else:
  39. # 否则,开始新行
  40. lines.append(current_line)
  41. current_line = [span]
  42. # 添加最后一行
  43. if current_line:
  44. lines.append(current_line)
  45. return lines
  46. def merge_spans_to_line_by_layout(spans, layout_bboxes):
  47. lines = []
  48. new_spans = []
  49. for item in layout_bboxes:
  50. layout_bbox = item['layout_bbox']
  51. # 遍历spans,将每个span放入对应的layout中
  52. layout_sapns = []
  53. for span in spans:
  54. if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], layout_bbox) > 0.8:
  55. layout_sapns.append(span)
  56. # 如果layout_sapns不为空,则放入new_spans中
  57. if len(layout_sapns) > 0:
  58. new_spans.append(layout_sapns)
  59. # 从spans删除已经放入layout_sapns中的span
  60. for layout_sapn in layout_sapns:
  61. spans.remove(layout_sapn)
  62. if len(new_spans) > 0:
  63. for layout_sapns in new_spans:
  64. layout_lines = merge_spans_to_line(layout_sapns)
  65. lines.extend(layout_lines)
  66. #对line中的span进行排序
  67. lines = line_sort_spans_by_left_to_right(lines)
  68. return lines
  69. def modify_y_axis(spans: list):
  70. inline_list = []
  71. displayed_list = []
  72. text_list = []
  73. image_list = []
  74. table_list = []
  75. spans.sort(key=lambda span: span['bbox'][1])
  76. lines = []
  77. current_line = [spans[0]]
  78. if spans[0]["type"] in ["displayed_equation", "image", "table"]:
  79. displayed_list.append(spans[0])
  80. line_first_y0 = spans[0]["bbox"][1]
  81. line_first_y = spans[0]["bbox"][3]
  82. #用于给行间公式搜索
  83. text_inline_lines = []
  84. for span in spans[1:]:
  85. # if span.get("content","") == "78.":
  86. # print("debug")
  87. # 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
  88. # image和table类型,同上
  89. if span['type'] in ["displayed_equation", "image", "table"] or any(
  90. s['type'] in ["displayed_equation", "image", "table"] for s in current_line):
  91. #传入
  92. if span["type"] in ["displayed_equation", "image", "table"]:
  93. displayed_list.append(span)
  94. # 则开始新行
  95. lines.append(current_line)
  96. if len(current_line) > 1 or current_line[0]["type"] in ["text", "inline_equation"]:
  97. text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
  98. current_line = [span]
  99. line_first_y0 = span["bbox"][1]
  100. line_first_y = span["bbox"][3]
  101. continue
  102. # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
  103. if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
  104. if span["bbox"][1] < line_first_y0:
  105. line_first_y0 = span["bbox"][1]
  106. if span["bbox"][3] > line_first_y:
  107. line_first_y = span["bbox"][3]
  108. current_line.append(span)
  109. else:
  110. # 否则,开始新行
  111. lines.append(current_line)
  112. text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
  113. current_line = [span]
  114. line_first_y0 = span["bbox"][1]
  115. line_first_y = span["bbox"][3]
  116. # 添加最后一行
  117. if current_line:
  118. lines.append(current_line)
  119. if len(current_line) > 1 or current_line[0]["type"] in ["text", "inline_equation"]:
  120. text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
  121. for line in text_inline_lines:
  122. # 按照x0坐标排序
  123. current_line = line[0]
  124. current_line.sort(key=lambda span: span['bbox'][0])
  125. #调整每一个文字行内bbox统一
  126. for line in text_inline_lines:
  127. current_line, (line_first_y0, line_first_y) = line
  128. for span in current_line:
  129. span["bbox"][1] = line_first_y0
  130. span["bbox"][3] = line_first_y
  131. #错误行间公式转行内公式
  132. j = 0
  133. for i in range(len(displayed_list)):
  134. # if i == 8:
  135. # print("debug")
  136. span = displayed_list[i]
  137. span_y0, span_y = span["bbox"][1], span["bbox"][3]
  138. while j < len(text_inline_lines):
  139. text_line = text_inline_lines[j]
  140. y0, y1 = text_line[1]
  141. if (span_y0 < y0 and span_y > y0 or span_y0 < y1 and span_y > y1 or span_y0 < y0 and span_y > y1) and __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
  142. span["bbox"][1] = y0
  143. # span["bbox"][3] = y1
  144. #调整公式类型
  145. if span["type"] == "displayed_equation":
  146. span["type"] = "inline_equation"
  147. break
  148. elif span_y < y0 or span_y0 < y0 and span_y > y0 and not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
  149. break
  150. else:
  151. j += 1
  152. return spans