# ocr_span_list_modify.py
  1. from magic_pdf.config.drop_tag import DropTag
  2. from magic_pdf.config.ocr_content_type import BlockType, ContentType
  3. from magic_pdf.libs.boxbase import (__is_overlaps_y_exceeds_threshold,
  4. calculate_iou,
  5. calculate_overlap_area_in_bbox1_area_ratio,
  6. get_minbox_if_overlap_by_ratio)
  7. def remove_overlaps_low_confidence_spans(spans):
  8. dropped_spans = []
  9. # 删除重叠spans中置信度低的的那些
  10. for span1 in spans:
  11. for span2 in spans:
  12. if span1 != span2:
  13. # span1 或 span2 任何一个都不应该在 dropped_spans 中
  14. if span1 in dropped_spans or span2 in dropped_spans:
  15. continue
  16. else:
  17. if calculate_iou(span1['bbox'], span2['bbox']) > 0.9:
  18. if span1['score'] < span2['score']:
  19. span_need_remove = span1
  20. else:
  21. span_need_remove = span2
  22. if (
  23. span_need_remove is not None
  24. and span_need_remove not in dropped_spans
  25. ):
  26. dropped_spans.append(span_need_remove)
  27. if len(dropped_spans) > 0:
  28. for span_need_remove in dropped_spans:
  29. spans.remove(span_need_remove)
  30. span_need_remove['tag'] = DropTag.SPAN_OVERLAP
  31. return spans, dropped_spans
  32. def remove_overlaps_min_spans(spans):
  33. dropped_spans = []
  34. # 删除重叠spans中较小的那些
  35. for span1 in spans:
  36. for span2 in spans:
  37. if span1 != span2:
  38. # span1 或 span2 任何一个都不应该在 dropped_spans 中
  39. if span1 in dropped_spans or span2 in dropped_spans:
  40. continue
  41. else:
  42. overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65)
  43. if overlap_box is not None:
  44. span_need_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
  45. if span_need_remove is not None and span_need_remove not in dropped_spans:
  46. dropped_spans.append(span_need_remove)
  47. if len(dropped_spans) > 0:
  48. for span_need_remove in dropped_spans:
  49. spans.remove(span_need_remove)
  50. span_need_remove['tag'] = DropTag.SPAN_OVERLAP
  51. return spans, dropped_spans
  52. def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
  53. # 遍历spans, 判断是否在removed_span_block_bboxes中
  54. # 如果是, 则删除该span 否则, 保留该span
  55. need_remove_spans = []
  56. for span in spans:
  57. for removed_bbox in need_remove_spans_bboxes:
  58. if (
  59. calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox)
  60. > 0.5
  61. ):
  62. if span not in need_remove_spans:
  63. need_remove_spans.append(span)
  64. break
  65. if len(need_remove_spans) > 0:
  66. for span in need_remove_spans:
  67. spans.remove(span)
  68. return spans
  69. def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
  70. dropped_spans = []
  71. for drop_tag, removed_bboxes in need_remove_spans_bboxes_dict.items():
  72. # logger.info(f"remove spans by bbox dict, drop_tag: {drop_tag}, removed_bboxes: {removed_bboxes}")
  73. need_remove_spans = []
  74. for span in spans:
  75. # 通过判断span的bbox是否在removed_bboxes中, 判断是否需要删除该span
  76. for removed_bbox in removed_bboxes:
  77. if (
  78. calculate_overlap_area_in_bbox1_area_ratio(
  79. span['bbox'], removed_bbox
  80. )
  81. > 0.5
  82. ):
  83. need_remove_spans.append(span)
  84. break
  85. # 当drop_tag为DropTag.FOOTNOTE时, 判断span是否在removed_bboxes中任意一个的下方,如果是,则删除该span
  86. elif (
  87. drop_tag == DropTag.FOOTNOTE
  88. and (span['bbox'][1] + span['bbox'][3]) / 2 > removed_bbox[3]
  89. and removed_bbox[0]
  90. < (span['bbox'][0] + span['bbox'][2]) / 2
  91. < removed_bbox[2]
  92. ):
  93. need_remove_spans.append(span)
  94. break
  95. for span in need_remove_spans:
  96. spans.remove(span)
  97. span['tag'] = drop_tag
  98. dropped_spans.append(span)
  99. return spans, dropped_spans
  100. def adjust_bbox_for_standalone_block(spans):
  101. # 对tpye=["interline_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
  102. for sb_span in spans:
  103. if sb_span['type'] in [
  104. ContentType.InterlineEquation,
  105. ContentType.Image,
  106. ContentType.Table,
  107. ]:
  108. for text_span in spans:
  109. if text_span['type'] in [ContentType.Text, ContentType.InlineEquation]:
  110. # 判断span2的纵向高度是否被span所覆盖
  111. if (
  112. sb_span['bbox'][1] < text_span['bbox'][1]
  113. and sb_span['bbox'][3] > text_span['bbox'][3]
  114. ):
  115. # 判断span2是否在span左边
  116. if text_span['bbox'][0] < sb_span['bbox'][0]:
  117. # 调整span的y0和span2的y0一致
  118. sb_span['bbox'][1] = text_span['bbox'][1]
  119. return spans
  120. def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
  121. # displayed_list = []
  122. # 如果spans为空,则不处理
  123. if len(spans) == 0:
  124. pass
  125. else:
  126. spans.sort(key=lambda span: span['bbox'][1])
  127. lines = []
  128. current_line = [spans[0]]
  129. if spans[0]['type'] in [
  130. ContentType.InterlineEquation,
  131. ContentType.Image,
  132. ContentType.Table,
  133. ]:
  134. displayed_list.append(spans[0])
  135. line_first_y0 = spans[0]['bbox'][1]
  136. line_first_y = spans[0]['bbox'][3]
  137. # 用于给行间公式搜索
  138. # text_inline_lines = []
  139. for span in spans[1:]:
  140. # if span.get("content","") == "78.":
  141. # print("debug")
  142. # 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
  143. # image和table类型,同上
  144. if span['type'] in [
  145. ContentType.InterlineEquation,
  146. ContentType.Image,
  147. ContentType.Table,
  148. ] or any(
  149. s['type']
  150. in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]
  151. for s in current_line
  152. ):
  153. # 传入
  154. if span['type'] in [
  155. ContentType.InterlineEquation,
  156. ContentType.Image,
  157. ContentType.Table,
  158. ]:
  159. displayed_list.append(span)
  160. # 则开始新行
  161. lines.append(current_line)
  162. if len(current_line) > 1 or current_line[0]['type'] in [
  163. ContentType.Text,
  164. ContentType.InlineEquation,
  165. ]:
  166. text_inline_lines.append(
  167. (current_line, (line_first_y0, line_first_y))
  168. )
  169. current_line = [span]
  170. line_first_y0 = span['bbox'][1]
  171. line_first_y = span['bbox'][3]
  172. continue
  173. # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
  174. if __is_overlaps_y_exceeds_threshold(
  175. span['bbox'], current_line[-1]['bbox']
  176. ):
  177. if span['type'] == 'text':
  178. line_first_y0 = span['bbox'][1]
  179. line_first_y = span['bbox'][3]
  180. current_line.append(span)
  181. else:
  182. # 否则,开始新行
  183. lines.append(current_line)
  184. text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
  185. current_line = [span]
  186. line_first_y0 = span['bbox'][1]
  187. line_first_y = span['bbox'][3]
  188. # 添加最后一行
  189. if current_line:
  190. lines.append(current_line)
  191. if len(current_line) > 1 or current_line[0]['type'] in [
  192. ContentType.Text,
  193. ContentType.InlineEquation,
  194. ]:
  195. text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
  196. for line in text_inline_lines:
  197. # 按照x0坐标排序
  198. current_line = line[0]
  199. current_line.sort(key=lambda span: span['bbox'][0])
  200. # 调整每一个文字行内bbox统一
  201. for line in text_inline_lines:
  202. current_line, (line_first_y0, line_first_y) = line
  203. for span in current_line:
  204. span['bbox'][1] = line_first_y0
  205. span['bbox'][3] = line_first_y
  206. # return spans, displayed_list, text_inline_lines
  207. def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines: list):
  208. # 错误行间公式转行内公式
  209. j = 0
  210. for i in range(len(displayed_list)):
  211. # if i == 8:
  212. # print("debug")
  213. span = displayed_list[i]
  214. span_y0, span_y = span['bbox'][1], span['bbox'][3]
  215. while j < len(text_inline_lines):
  216. text_line = text_inline_lines[j]
  217. y0, y1 = text_line[1]
  218. if (
  219. span_y0 < y0 < span_y
  220. or span_y0 < y1 < span_y
  221. or span_y0 < y0
  222. and span_y > y1
  223. ) and __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
  224. # 调整公式类型
  225. if span['type'] == ContentType.InterlineEquation:
  226. # 最后一行是行间公式
  227. if j + 1 >= len(text_inline_lines):
  228. span['type'] = ContentType.InlineEquation
  229. span['bbox'][1] = y0
  230. span['bbox'][3] = y1
  231. else:
  232. # 行间公式旁边有多行文字或者行间公式比文字高3倍则不转换
  233. y0_next, y1_next = text_inline_lines[j + 1][1]
  234. if (
  235. not __is_overlaps_y_exceeds_threshold(
  236. span['bbox'], (0, y0_next, 0, y1_next)
  237. )
  238. and 3 * (y1 - y0) > span_y - span_y0
  239. ):
  240. span['type'] = ContentType.InlineEquation
  241. span['bbox'][1] = y0
  242. span['bbox'][3] = y1
  243. break
  244. elif (
  245. span_y < y0
  246. or span_y0 < y0 < span_y
  247. and not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1))
  248. ):
  249. break
  250. else:
  251. j += 1
  252. return spans
  253. def get_qa_need_list(blocks):
  254. # 创建 images, tables, interline_equations, inline_equations 的副本
  255. images = []
  256. tables = []
  257. interline_equations = []
  258. inline_equations = []
  259. for block in blocks:
  260. for line in block['lines']:
  261. for span in line['spans']:
  262. if span['type'] == ContentType.Image:
  263. images.append(span)
  264. elif span['type'] == ContentType.Table:
  265. tables.append(span)
  266. elif span['type'] == ContentType.InlineEquation:
  267. inline_equations.append(span)
  268. elif span['type'] == ContentType.InterlineEquation:
  269. interline_equations.append(span)
  270. else:
  271. continue
  272. return images, tables, interline_equations, inline_equations
  273. def get_qa_need_list_v2(blocks):
  274. # 创建 images, tables, interline_equations, inline_equations 的副本
  275. images = []
  276. tables = []
  277. interline_equations = []
  278. for block in blocks:
  279. if block['type'] == BlockType.Image:
  280. images.append(block)
  281. elif block['type'] == BlockType.Table:
  282. tables.append(block)
  283. elif block['type'] == BlockType.InterlineEquation:
  284. interline_equations.append(block)
  285. return images, tables, interline_equations