ocr_span_list_modify.py 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147
  1. from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, get_minbox_if_overlap_by_ratio, \
  2. __is_overlaps_y_exceeds_threshold
  3. def remove_overlaps_min_spans(spans):
  4. # 删除重叠spans中较小的那些
  5. for span1 in spans.copy():
  6. for span2 in spans.copy():
  7. if span1 != span2:
  8. overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65)
  9. if overlap_box is not None:
  10. bbox_to_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
  11. if bbox_to_remove is not None:
  12. spans.remove(bbox_to_remove)
  13. return spans
  14. def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
  15. # 遍历spans, 判断是否在removed_span_block_bboxes中
  16. # 如果是, 则删除该span 否则, 保留该span
  17. need_remove_spans = []
  18. for span in spans:
  19. for removed_bbox in need_remove_spans_bboxes:
  20. if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5:
  21. need_remove_spans.append(span)
  22. break
  23. for span in need_remove_spans:
  24. spans.remove(span)
  25. return spans
  26. def adjust_bbox_for_standalone_block(spans):
  27. # 对tpye=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
  28. for sb_span in spans:
  29. if sb_span['type'] in ["displayed_equation", "image", "table"]:
  30. for text_span in spans:
  31. if text_span['type'] in ['text', 'inline_equation']:
  32. # 判断span2的纵向高度是否被span所覆盖
  33. if sb_span['bbox'][1] < text_span['bbox'][1] and sb_span['bbox'][3] > text_span['bbox'][3]:
  34. # 判断span2是否在span左边
  35. if text_span['bbox'][0] < sb_span['bbox'][0]:
  36. # 调整span的y0和span2的y0一致
  37. sb_span['bbox'][1] = text_span['bbox'][1]
  38. return spans
  39. def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
  40. # displayed_list = []
  41. spans.sort(key=lambda span: span['bbox'][1])
  42. lines = []
  43. current_line = [spans[0]]
  44. if spans[0]["type"] in ["displayed_equation", "image", "table"]:
  45. displayed_list.append(spans[0])
  46. line_first_y0 = spans[0]["bbox"][1]
  47. line_first_y = spans[0]["bbox"][3]
  48. # 用于给行间公式搜索
  49. # text_inline_lines = []
  50. for span in spans[1:]:
  51. # if span.get("content","") == "78.":
  52. # print("debug")
  53. # 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
  54. # image和table类型,同上
  55. if span['type'] in ["displayed_equation", "image", "table"] or any(
  56. s['type'] in ["displayed_equation", "image", "table"] for s in current_line):
  57. # 传入
  58. if span["type"] in ["displayed_equation", "image", "table"]:
  59. displayed_list.append(span)
  60. # 则开始新行
  61. lines.append(current_line)
  62. if len(current_line) > 1 or current_line[0]["type"] in ["text", "inline_equation"]:
  63. text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
  64. current_line = [span]
  65. line_first_y0 = span["bbox"][1]
  66. line_first_y = span["bbox"][3]
  67. continue
  68. # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
  69. if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
  70. if span["type"] == "text":
  71. line_first_y0 = span["bbox"][1]
  72. line_first_y = span["bbox"][3]
  73. current_line.append(span)
  74. else:
  75. # 否则,开始新行
  76. lines.append(current_line)
  77. text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
  78. current_line = [span]
  79. line_first_y0 = span["bbox"][1]
  80. line_first_y = span["bbox"][3]
  81. # 添加最后一行
  82. if current_line:
  83. lines.append(current_line)
  84. if len(current_line) > 1 or current_line[0]["type"] in ["text", "inline_equation"]:
  85. text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
  86. for line in text_inline_lines:
  87. # 按照x0坐标排序
  88. current_line = line[0]
  89. current_line.sort(key=lambda span: span['bbox'][0])
  90. #调整每一个文字行内bbox统一
  91. for line in text_inline_lines:
  92. current_line, (line_first_y0, line_first_y) = line
  93. for span in current_line:
  94. span["bbox"][1] = line_first_y0
  95. span["bbox"][3] = line_first_y
  96. # return spans, displayed_list, text_inline_lines
  97. def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines: list):
  98. #错误行间公式转行内公式
  99. j = 0
  100. for i in range(len(displayed_list)):
  101. # if i == 8:
  102. # print("debug")
  103. span = displayed_list[i]
  104. span_y0, span_y = span["bbox"][1], span["bbox"][3]
  105. while j < len(text_inline_lines):
  106. text_line = text_inline_lines[j]
  107. y0, y1 = text_line[1]
  108. if (span_y0 < y0 and span_y > y0 or span_y0 < y1 and span_y > y1 or span_y0 < y0 and span_y > y1) and __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
  109. span["bbox"][1] = y0
  110. # span["bbox"][3] = y1
  111. # 调整公式类型
  112. if span["type"] == "displayed_equation":
  113. if j+1 >= len(text_inline_lines):
  114. span["type"] = "inline_equation"
  115. else:
  116. y0_next, y1_next = text_inline_lines[j + 1][1]
  117. if not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0_next, 0, y1_next)):
  118. span["type"] = "inline_equation"
  119. break
  120. elif span_y < y0 or span_y0 < y0 and span_y > y0 and not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
  121. break
  122. else:
  123. j += 1
  124. return spans