ocr_span_list_modify.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267
  1. from loguru import logger
  2. from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, get_minbox_if_overlap_by_ratio, \
  3. __is_overlaps_y_exceeds_threshold, calculate_iou
  4. from magic_pdf.libs.drop_tag import DropTag
  5. from magic_pdf.libs.ocr_content_type import ContentType, BlockType
  6. def remove_overlaps_low_confidence_spans(spans):
  7. dropped_spans = []
  8. # 删除重叠spans中置信度低的的那些
  9. for span1 in spans:
  10. for span2 in spans:
  11. if span1 != span2:
  12. # span1 或 span2 任何一个都不应该在 dropped_spans 中
  13. if span1 in dropped_spans or span2 in dropped_spans:
  14. continue
  15. else:
  16. if calculate_iou(span1['bbox'], span2['bbox']) > 0.9:
  17. if span1['score'] < span2['score']:
  18. span_need_remove = span1
  19. else:
  20. span_need_remove = span2
  21. if span_need_remove is not None and span_need_remove not in dropped_spans:
  22. dropped_spans.append(span_need_remove)
  23. if len(dropped_spans) > 0:
  24. for span_need_remove in dropped_spans:
  25. spans.remove(span_need_remove)
  26. span_need_remove['tag'] = DropTag.SPAN_OVERLAP
  27. return spans, dropped_spans
  28. def remove_overlaps_min_spans(spans):
  29. dropped_spans = []
  30. # 删除重叠spans中较小的那些
  31. for span1 in spans:
  32. for span2 in spans:
  33. if span1 != span2:
  34. # span1 或 span2 任何一个都不应该在 dropped_spans 中
  35. if span1 in dropped_spans or span2 in dropped_spans:
  36. continue
  37. else:
  38. overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65)
  39. if overlap_box is not None:
  40. span_need_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
  41. if span_need_remove is not None and span_need_remove not in dropped_spans:
  42. dropped_spans.append(span_need_remove)
  43. if len(dropped_spans) > 0:
  44. for span_need_remove in dropped_spans:
  45. spans.remove(span_need_remove)
  46. span_need_remove['tag'] = DropTag.SPAN_OVERLAP
  47. return spans, dropped_spans
  48. def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
  49. # 遍历spans, 判断是否在removed_span_block_bboxes中
  50. # 如果是, 则删除该span 否则, 保留该span
  51. need_remove_spans = []
  52. for span in spans:
  53. for removed_bbox in need_remove_spans_bboxes:
  54. if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5:
  55. if span not in need_remove_spans:
  56. need_remove_spans.append(span)
  57. break
  58. if len(need_remove_spans) > 0:
  59. for span in need_remove_spans:
  60. spans.remove(span)
  61. return spans
  62. def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
  63. dropped_spans = []
  64. for drop_tag, removed_bboxes in need_remove_spans_bboxes_dict.items():
  65. # logger.info(f"remove spans by bbox dict, drop_tag: {drop_tag}, removed_bboxes: {removed_bboxes}")
  66. need_remove_spans = []
  67. for span in spans:
  68. # 通过判断span的bbox是否在removed_bboxes中, 判断是否需要删除该span
  69. for removed_bbox in removed_bboxes:
  70. if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5:
  71. need_remove_spans.append(span)
  72. break
  73. # 当drop_tag为DropTag.FOOTNOTE时, 判断span是否在removed_bboxes中任意一个的下方,如果是,则删除该span
  74. elif drop_tag == DropTag.FOOTNOTE and (span['bbox'][1] + span['bbox'][3]) / 2 > removed_bbox[3] and \
  75. removed_bbox[0] < (span['bbox'][0] + span['bbox'][2]) / 2 < removed_bbox[2]:
  76. need_remove_spans.append(span)
  77. break
  78. for span in need_remove_spans:
  79. spans.remove(span)
  80. span['tag'] = drop_tag
  81. dropped_spans.append(span)
  82. return spans, dropped_spans
  83. def adjust_bbox_for_standalone_block(spans):
  84. # 对tpye=["interline_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
  85. for sb_span in spans:
  86. if sb_span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
  87. for text_span in spans:
  88. if text_span['type'] in [ContentType.Text, ContentType.InlineEquation]:
  89. # 判断span2的纵向高度是否被span所覆盖
  90. if sb_span['bbox'][1] < text_span['bbox'][1] and sb_span['bbox'][3] > text_span['bbox'][3]:
  91. # 判断span2是否在span左边
  92. if text_span['bbox'][0] < sb_span['bbox'][0]:
  93. # 调整span的y0和span2的y0一致
  94. sb_span['bbox'][1] = text_span['bbox'][1]
  95. return spans
  96. def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
  97. # displayed_list = []
  98. # 如果spans为空,则不处理
  99. if len(spans) == 0:
  100. pass
  101. else:
  102. spans.sort(key=lambda span: span['bbox'][1])
  103. lines = []
  104. current_line = [spans[0]]
  105. if spans[0]["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
  106. displayed_list.append(spans[0])
  107. line_first_y0 = spans[0]["bbox"][1]
  108. line_first_y = spans[0]["bbox"][3]
  109. # 用于给行间公式搜索
  110. # text_inline_lines = []
  111. for span in spans[1:]:
  112. # if span.get("content","") == "78.":
  113. # print("debug")
  114. # 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
  115. # image和table类型,同上
  116. if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
  117. s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in
  118. current_line):
  119. # 传入
  120. if span["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
  121. displayed_list.append(span)
  122. # 则开始新行
  123. lines.append(current_line)
  124. if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]:
  125. text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
  126. current_line = [span]
  127. line_first_y0 = span["bbox"][1]
  128. line_first_y = span["bbox"][3]
  129. continue
  130. # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
  131. if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
  132. if span["type"] == "text":
  133. line_first_y0 = span["bbox"][1]
  134. line_first_y = span["bbox"][3]
  135. current_line.append(span)
  136. else:
  137. # 否则,开始新行
  138. lines.append(current_line)
  139. text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
  140. current_line = [span]
  141. line_first_y0 = span["bbox"][1]
  142. line_first_y = span["bbox"][3]
  143. # 添加最后一行
  144. if current_line:
  145. lines.append(current_line)
  146. if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]:
  147. text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
  148. for line in text_inline_lines:
  149. # 按照x0坐标排序
  150. current_line = line[0]
  151. current_line.sort(key=lambda span: span['bbox'][0])
  152. # 调整每一个文字行内bbox统一
  153. for line in text_inline_lines:
  154. current_line, (line_first_y0, line_first_y) = line
  155. for span in current_line:
  156. span["bbox"][1] = line_first_y0
  157. span["bbox"][3] = line_first_y
  158. # return spans, displayed_list, text_inline_lines
  159. def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines: list):
  160. # 错误行间公式转行内公式
  161. j = 0
  162. for i in range(len(displayed_list)):
  163. # if i == 8:
  164. # print("debug")
  165. span = displayed_list[i]
  166. span_y0, span_y = span["bbox"][1], span["bbox"][3]
  167. while j < len(text_inline_lines):
  168. text_line = text_inline_lines[j]
  169. y0, y1 = text_line[1]
  170. if (
  171. span_y0 < y0 < span_y or span_y0 < y1 < span_y or span_y0 < y0 and span_y > y1
  172. ) and __is_overlaps_y_exceeds_threshold(
  173. span['bbox'], (0, y0, 0, y1)
  174. ):
  175. # 调整公式类型
  176. if span["type"] == ContentType.InterlineEquation:
  177. # 最后一行是行间公式
  178. if j + 1 >= len(text_inline_lines):
  179. span["type"] = ContentType.InlineEquation
  180. span["bbox"][1] = y0
  181. span["bbox"][3] = y1
  182. else:
  183. # 行间公式旁边有多行文字或者行间公式比文字高3倍则不转换
  184. y0_next, y1_next = text_inline_lines[j + 1][1]
  185. if not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0_next, 0, y1_next)) and 3 * (
  186. y1 - y0) > span_y - span_y0:
  187. span["type"] = ContentType.InlineEquation
  188. span["bbox"][1] = y0
  189. span["bbox"][3] = y1
  190. break
  191. elif span_y < y0 or span_y0 < y0 < span_y and not __is_overlaps_y_exceeds_threshold(span['bbox'],
  192. (0, y0, 0, y1)):
  193. break
  194. else:
  195. j += 1
  196. return spans
  197. def get_qa_need_list(blocks):
  198. # 创建 images, tables, interline_equations, inline_equations 的副本
  199. images = []
  200. tables = []
  201. interline_equations = []
  202. inline_equations = []
  203. for block in blocks:
  204. for line in block["lines"]:
  205. for span in line["spans"]:
  206. if span["type"] == ContentType.Image:
  207. images.append(span)
  208. elif span["type"] == ContentType.Table:
  209. tables.append(span)
  210. elif span["type"] == ContentType.InlineEquation:
  211. inline_equations.append(span)
  212. elif span["type"] == ContentType.InterlineEquation:
  213. interline_equations.append(span)
  214. else:
  215. continue
  216. return images, tables, interline_equations, inline_equations
  217. def get_qa_need_list_v2(blocks):
  218. # 创建 images, tables, interline_equations, inline_equations 的副本
  219. images = []
  220. tables = []
  221. interline_equations = []
  222. for block in blocks:
  223. if block["type"] == BlockType.Image:
  224. images.append(block)
  225. elif block["type"] == BlockType.Table:
  226. tables.append(block)
  227. elif block["type"] == BlockType.InterlineEquation:
  228. interline_equations.append(block)
  229. return images, tables, interline_equations