ocr_dict_merge.py 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194
  1. from loguru import logger
  2. from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \
  3. calculate_overlap_area_in_bbox1_area_ratio
  4. # 删除重叠spans中较小的那些
  5. def remove_overlaps_min_spans(spans):
  6. for span1 in spans.copy():
  7. for span2 in spans.copy():
  8. if span1 != span2:
  9. overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.5)
  10. if overlap_box is not None:
  11. bbox_to_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
  12. if bbox_to_remove is not None:
  13. spans.remove(bbox_to_remove)
  14. return spans
  15. # 将每一个line中的span从左到右排序
  16. def line_sort_spans_by_left_to_right(lines):
  17. line_objects = []
  18. for line in lines:
  19. # 按照x0坐标排序
  20. line.sort(key=lambda span: span['bbox'][0])
  21. line_bbox = [
  22. min(span['bbox'][0] for span in line), # x0
  23. min(span['bbox'][1] for span in line), # y0
  24. max(span['bbox'][2] for span in line), # x1
  25. max(span['bbox'][3] for span in line), # y1
  26. ]
  27. line_objects.append({
  28. "bbox": line_bbox,
  29. "spans": line,
  30. })
  31. return line_objects
  32. def merge_spans_to_line(spans):
  33. # 按照y0坐标排序
  34. spans.sort(key=lambda span: span['bbox'][1])
  35. lines = []
  36. current_line = [spans[0]]
  37. for span in spans[1:]:
  38. # 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
  39. # image和table类型,同上
  40. if span['type'] in ["displayed_equation", "image", "table"] or any(
  41. s['type'] in ["displayed_equation", "image", "table"] for s in current_line):
  42. # 则开始新行
  43. lines.append(current_line)
  44. current_line = [span]
  45. continue
  46. # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
  47. if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
  48. current_line.append(span)
  49. else:
  50. # 否则,开始新行
  51. lines.append(current_line)
  52. current_line = [span]
  53. # 添加最后一行
  54. if current_line:
  55. lines.append(current_line)
  56. return lines
  57. def merge_spans_to_line_by_layout(spans, layout_bboxes):
  58. lines = []
  59. new_spans = []
  60. for item in layout_bboxes:
  61. layout_bbox = item['layout_bbox']
  62. # 遍历spans,将每个span放入对应的layout中
  63. layout_sapns = []
  64. for span in spans:
  65. if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], layout_bbox) > 0.8:
  66. layout_sapns.append(span)
  67. # 如果layout_sapns不为空,则放入new_spans中
  68. if len(layout_sapns) > 0:
  69. new_spans.append(layout_sapns)
  70. # 从spans删除已经放入layout_sapns中的span
  71. for layout_sapn in layout_sapns:
  72. spans.remove(layout_sapn)
  73. if len(new_spans) > 0:
  74. for layout_sapns in new_spans:
  75. layout_lines = merge_spans_to_line(layout_sapns)
  76. lines.extend(layout_lines)
  77. #对line中的span进行排序
  78. lines = line_sort_spans_by_left_to_right(lines)
  79. return lines
  80. def modify_y_axis(spans: list):
  81. inline_list = []
  82. displayed_list = []
  83. text_list = []
  84. image_list = []
  85. table_list = []
  86. spans.sort(key=lambda span: span['bbox'][1])
  87. lines = []
  88. current_line = [spans[0]]
  89. if spans[0]["type"] in ["displayed_equation", "image", "table"]:
  90. displayed_list.append(spans[0])
  91. line_first_y0 = spans[0]["bbox"][1]
  92. line_first_y = spans[0]["bbox"][3]
  93. #用于给行间公式搜索
  94. text_inline_lines = []
  95. for span in spans[1:]:
  96. if span.get("content","") == "78.":
  97. print("debug")
  98. # 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
  99. # image和table类型,同上
  100. if span['type'] in ["displayed_equation", "image", "table"] or any(
  101. s['type'] in ["displayed_equation", "image", "table"] for s in current_line):
  102. #传入
  103. if span["type"] in ["displayed_equation", "image", "table"]:
  104. displayed_list.append(span)
  105. # 则开始新行
  106. lines.append(current_line)
  107. if len(current_line) > 1 or current_line[0]["type"] in ["text", "inline_equation"]:
  108. text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
  109. current_line = [span]
  110. line_first_y0 = span["bbox"][1]
  111. line_first_y = span["bbox"][3]
  112. continue
  113. # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
  114. if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
  115. if span["bbox"][1] < line_first_y0:
  116. line_first_y0 = span["bbox"][1]
  117. if span["bbox"][3] > line_first_y:
  118. line_first_y = span["bbox"][3]
  119. current_line.append(span)
  120. else:
  121. # 否则,开始新行
  122. lines.append(current_line)
  123. text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
  124. current_line = [span]
  125. line_first_y0 = span["bbox"][1]
  126. line_first_y = span["bbox"][3]
  127. # 添加最后一行
  128. if current_line:
  129. lines.append(current_line)
  130. if len(current_line) > 1 or current_line[0]["type"] in ["text", "inline_equation"]:
  131. text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
  132. for line in text_inline_lines:
  133. # 按照x0坐标排序
  134. current_line = line[0]
  135. current_line.sort(key=lambda span: span['bbox'][0])
  136. #调整每一个文字行内bbox统一
  137. for line in text_inline_lines:
  138. current_line, (line_first_y0, line_first_y) = line
  139. for span in current_line:
  140. span["bbox"][1] = line_first_y0
  141. span["bbox"][3] = line_first_y
  142. #错误行间公式转行内公式
  143. j = 0
  144. for i in range(len(displayed_list)):
  145. if i == 8:
  146. print("debug")
  147. span = displayed_list[i]
  148. span_y0, span_y = span["bbox"][1], span["bbox"][3]
  149. while j < len(text_inline_lines):
  150. text_line = text_inline_lines[j]
  151. y0, y1 = text_line[1]
  152. if (span_y0 < y0 and span_y > y0 or span_y0 < y1 and span_y > y1 or span_y0 < y0 and span_y > y1) and __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
  153. span["bbox"][1] = y0
  154. # span["bbox"][3] = y1
  155. #调整公式类型
  156. if span["type"] == "displayed_equation":
  157. span["type"] = "inline_equation"
  158. break
  159. elif span_y < y0 or span_y0 < y0 and span_y > y0 and not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
  160. break
  161. else:
  162. j += 1
  163. return spans