ocr_span_list_modify.py 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206
  1. from loguru import logger
  2. from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, get_minbox_if_overlap_by_ratio, \
  3. __is_overlaps_y_exceeds_threshold
  4. def remove_overlaps_min_spans(spans):
  5. # 删除重叠spans中较小的那些
  6. for span1 in spans.copy():
  7. for span2 in spans.copy():
  8. if span1 != span2:
  9. overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65)
  10. if overlap_box is not None:
  11. bbox_to_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
  12. if bbox_to_remove is not None:
  13. spans.remove(bbox_to_remove)
  14. return spans
  15. def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
  16. # 遍历spans, 判断是否在removed_span_block_bboxes中
  17. # 如果是, 则删除该span 否则, 保留该span
  18. need_remove_spans = []
  19. for span in spans:
  20. for removed_bbox in need_remove_spans_bboxes:
  21. if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5:
  22. need_remove_spans.append(span)
  23. break
  24. for span in need_remove_spans:
  25. spans.remove(span)
  26. return spans
  27. def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
  28. dropped_text_block = []
  29. dropped_image_block = []
  30. dropped_table_block = []
  31. for drop_tag, removed_bboxes in need_remove_spans_bboxes_dict.items():
  32. # logger.info(f"remove spans by bbox dict, drop_tag: {drop_tag}, removed_bboxes: {removed_bboxes}")
  33. need_remove_spans = []
  34. for span in spans:
  35. for removed_bbox in removed_bboxes:
  36. if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5:
  37. need_remove_spans.append(span)
  38. break
  39. for span in need_remove_spans:
  40. spans.remove(span)
  41. span['tag'] = drop_tag
  42. if span['type'] in ['text', 'inline_equation', 'displayed_equation']:
  43. dropped_text_block.append(span)
  44. elif span['type'] == 'image':
  45. dropped_image_block.append(span)
  46. elif span['type'] == 'table':
  47. dropped_table_block.append(span)
  48. return spans, dropped_text_block, dropped_image_block, dropped_table_block
  49. def adjust_bbox_for_standalone_block(spans):
  50. # 对tpye=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
  51. for sb_span in spans:
  52. if sb_span['type'] in ["displayed_equation", "image", "table"]:
  53. for text_span in spans:
  54. if text_span['type'] in ['text', 'inline_equation']:
  55. # 判断span2的纵向高度是否被span所覆盖
  56. if sb_span['bbox'][1] < text_span['bbox'][1] and sb_span['bbox'][3] > text_span['bbox'][3]:
  57. # 判断span2是否在span左边
  58. if text_span['bbox'][0] < sb_span['bbox'][0]:
  59. # 调整span的y0和span2的y0一致
  60. sb_span['bbox'][1] = text_span['bbox'][1]
  61. return spans
  62. def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
  63. # displayed_list = []
  64. spans.sort(key=lambda span: span['bbox'][1])
  65. lines = []
  66. current_line = [spans[0]]
  67. if spans[0]["type"] in ["displayed_equation", "image", "table"]:
  68. displayed_list.append(spans[0])
  69. line_first_y0 = spans[0]["bbox"][1]
  70. line_first_y = spans[0]["bbox"][3]
  71. # 用于给行间公式搜索
  72. # text_inline_lines = []
  73. for span in spans[1:]:
  74. # if span.get("content","") == "78.":
  75. # print("debug")
  76. # 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
  77. # image和table类型,同上
  78. if span['type'] in ["displayed_equation", "image", "table"] or any(
  79. s['type'] in ["displayed_equation", "image", "table"] for s in current_line):
  80. # 传入
  81. if span["type"] in ["displayed_equation", "image", "table"]:
  82. displayed_list.append(span)
  83. # 则开始新行
  84. lines.append(current_line)
  85. if len(current_line) > 1 or current_line[0]["type"] in ["text", "inline_equation"]:
  86. text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
  87. current_line = [span]
  88. line_first_y0 = span["bbox"][1]
  89. line_first_y = span["bbox"][3]
  90. continue
  91. # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
  92. if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
  93. if span["type"] == "text":
  94. line_first_y0 = span["bbox"][1]
  95. line_first_y = span["bbox"][3]
  96. current_line.append(span)
  97. else:
  98. # 否则,开始新行
  99. lines.append(current_line)
  100. text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
  101. current_line = [span]
  102. line_first_y0 = span["bbox"][1]
  103. line_first_y = span["bbox"][3]
  104. # 添加最后一行
  105. if current_line:
  106. lines.append(current_line)
  107. if len(current_line) > 1 or current_line[0]["type"] in ["text", "inline_equation"]:
  108. text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
  109. for line in text_inline_lines:
  110. # 按照x0坐标排序
  111. current_line = line[0]
  112. current_line.sort(key=lambda span: span['bbox'][0])
  113. # 调整每一个文字行内bbox统一
  114. for line in text_inline_lines:
  115. current_line, (line_first_y0, line_first_y) = line
  116. for span in current_line:
  117. span["bbox"][1] = line_first_y0
  118. span["bbox"][3] = line_first_y
  119. # return spans, displayed_list, text_inline_lines
def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines: list):
    """Re-label displayed equations that actually sit inside a text line.

    Walks ``displayed_list`` and, for each span, advances a cursor ``j``
    through ``text_inline_lines`` until it finds a text line whose y-range
    the span overlaps (or passes the span). A matching span of type
    ``displayed_equation`` is turned into an ``inline_equation`` and its
    ``bbox[1]`` / ``bbox[3]`` are set to the line's ``(y0, y1)`` when either
    the matched line is the last one, or the span does not overlap the next
    line and is less than three times as tall as the matched line.

    The cursor ``j`` is shared by all spans and never rewinds, so text lines
    skipped for one span are not revisited for later spans -- this presumably
    relies on both lists being ordered top to bottom (TODO confirm with the
    caller).

    Args:
        spans: returned as is; this function never reads it. Its elements are
            only affected if they are the same objects as those in
            ``displayed_list``.
        displayed_list: span dicts with ``'type'`` and a mutable ``'bbox'``;
            matching entries are modified in place.
        text_inline_lines: sequence of ``(line_spans, (y0, y1))`` entries;
            only the y-range at index 1 is read.

    Returns:
        The ``spans`` argument.
    """
    # Turn wrongly detected displayed (interline) equations into inline ones.
    j = 0
    for i in range(len(displayed_list)):
        span = displayed_list[i]
        span_y0, span_y = span["bbox"][1], span["bbox"][3]
        while j < len(text_inline_lines):
            text_line = text_inline_lines[j]
            y0, y1 = text_line[1]
            # Match when the span straddles the line's top edge, straddles its
            # bottom edge, or strictly contains the line, AND the helper
            # accepts the y-overlap. (0, y0, 0, y1) is a dummy bbox carrying
            # only the line's y-range.
            # NOTE(review): a span lying entirely within [y0, y1] satisfies
            # none of the three clauses and falls through to `j += 1` --
            # confirm this is intended.
            if (
                    span_y0 < y0 and span_y > y0 or span_y0 < y1 and span_y > y1 or span_y0 < y0 and span_y > y1) and __is_overlaps_y_exceeds_threshold(
                    span['bbox'], (0, y0, 0, y1)):
                # Adjust the equation type.
                if span["type"] == "displayed_equation":
                    # The matched text line is the last one: always convert.
                    if j + 1 >= len(text_inline_lines):
                        span["type"] = "inline_equation"
                        span["bbox"][1] = y0
                        span["bbox"][3] = y1
                    else:
                        # Do not convert when the equation also overlaps the
                        # next text line (several lines beside it) or is at
                        # least three times as tall as the text line.
                        y0_next, y1_next = text_inline_lines[j + 1][1]
                        if not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0_next, 0, y1_next)) and 3 * (
                                y1 - y0) > span_y - span_y0:
                            span["type"] = "inline_equation"
                            span["bbox"][1] = y0
                            span["bbox"][3] = y1
                # Matched this span (converted or not); keep j for the next span.
                break
            elif span_y < y0 or span_y0 < y0 and span_y > y0 and not __is_overlaps_y_exceeds_threshold(span['bbox'],
                                                                                                         (0, y0, 0, y1)):
                # The span ends above this line, or only grazes its top edge
                # without enough overlap: no later line can match, stop here.
                break
            else:
                # This line does not match the span yet; try the next line.
                j += 1
    return spans
  156. def get_qa_need_list(blocks):
  157. # 创建 images, tables, interline_equations, inline_equations 的副本
  158. images = []
  159. tables = []
  160. interline_equations = []
  161. inline_equations = []
  162. for block in blocks:
  163. for line in block["lines"]:
  164. for span in line["spans"]:
  165. if span["type"] == "image":
  166. images.append(span)
  167. elif span["type"] == "table":
  168. tables.append(span)
  169. elif span["type"] == "inline_equation":
  170. inline_equations.append(span)
  171. elif span["type"] == "displayed_equation":
  172. interline_equations.append(span)
  173. else:
  174. continue
  175. return images, tables, interline_equations, inline_equations