ocr_dict_merge.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337
  1. from magic_pdf.libs.boxbase import (__is_overlaps_y_exceeds_threshold,
  2. _is_in_or_part_overlap_with_area_ratio,
  3. calculate_overlap_area_in_bbox1_area_ratio)
  4. from magic_pdf.libs.drop_tag import DropTag
  5. from magic_pdf.libs.ocr_content_type import BlockType, ContentType
  6. # 将每一个line中的span从左到右排序
  7. def line_sort_spans_by_left_to_right(lines):
  8. line_objects = []
  9. for line in lines:
  10. # 按照x0坐标排序
  11. line.sort(key=lambda span: span['bbox'][0])
  12. line_bbox = [
  13. min(span['bbox'][0] for span in line), # x0
  14. min(span['bbox'][1] for span in line), # y0
  15. max(span['bbox'][2] for span in line), # x1
  16. max(span['bbox'][3] for span in line), # y1
  17. ]
  18. line_objects.append({
  19. 'bbox': line_bbox,
  20. 'spans': line,
  21. })
  22. return line_objects
  23. def merge_spans_to_line(spans):
  24. if len(spans) == 0:
  25. return []
  26. else:
  27. # 按照y0坐标排序
  28. spans.sort(key=lambda span: span['bbox'][1])
  29. lines = []
  30. current_line = [spans[0]]
  31. for span in spans[1:]:
  32. # 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
  33. # image和table类型,同上
  34. if span['type'] in [
  35. ContentType.InterlineEquation, ContentType.Image,
  36. ContentType.Table
  37. ] or any(s['type'] in [
  38. ContentType.InterlineEquation, ContentType.Image,
  39. ContentType.Table
  40. ] for s in current_line):
  41. # 则开始新行
  42. lines.append(current_line)
  43. current_line = [span]
  44. continue
  45. # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
  46. if __is_overlaps_y_exceeds_threshold(span['bbox'],
  47. current_line[-1]['bbox']):
  48. current_line.append(span)
  49. else:
  50. # 否则,开始新行
  51. lines.append(current_line)
  52. current_line = [span]
  53. # 添加最后一行
  54. if current_line:
  55. lines.append(current_line)
  56. return lines
  57. def merge_spans_to_line_by_layout(spans, layout_bboxes):
  58. lines = []
  59. new_spans = []
  60. dropped_spans = []
  61. for item in layout_bboxes:
  62. layout_bbox = item['layout_bbox']
  63. # 遍历spans,将每个span放入对应的layout中
  64. layout_sapns = []
  65. for span in spans:
  66. if calculate_overlap_area_in_bbox1_area_ratio(
  67. span['bbox'], layout_bbox) > 0.6:
  68. layout_sapns.append(span)
  69. # 如果layout_sapns不为空,则放入new_spans中
  70. if len(layout_sapns) > 0:
  71. new_spans.append(layout_sapns)
  72. # 从spans删除已经放入layout_sapns中的span
  73. for layout_sapn in layout_sapns:
  74. spans.remove(layout_sapn)
  75. if len(new_spans) > 0:
  76. for layout_sapns in new_spans:
  77. layout_lines = merge_spans_to_line(layout_sapns)
  78. lines.extend(layout_lines)
  79. # 对line中的span进行排序
  80. lines = line_sort_spans_by_left_to_right(lines)
  81. for span in spans:
  82. span['tag'] = DropTag.NOT_IN_LAYOUT
  83. dropped_spans.append(span)
  84. return lines, dropped_spans
  85. def merge_lines_to_block(lines):
  86. # 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
  87. blocks = []
  88. for line in lines:
  89. blocks.append({
  90. 'bbox': line['bbox'],
  91. 'lines': [line],
  92. })
  93. return blocks
  94. def sort_blocks_by_layout(all_bboxes, layout_bboxes):
  95. new_blocks = []
  96. sort_blocks = []
  97. for item in layout_bboxes:
  98. layout_bbox = item['layout_bbox']
  99. # 遍历blocks,将每个blocks放入对应的layout中
  100. layout_blocks = []
  101. for block in all_bboxes:
  102. # 如果是footnote则跳过
  103. if block[7] == BlockType.Footnote:
  104. continue
  105. block_bbox = block[:4]
  106. if calculate_overlap_area_in_bbox1_area_ratio(
  107. block_bbox, layout_bbox) > 0.8:
  108. layout_blocks.append(block)
  109. # 如果layout_blocks不为空,则放入new_blocks中
  110. if len(layout_blocks) > 0:
  111. new_blocks.append(layout_blocks)
  112. # 从all_bboxes删除已经放入layout_blocks中的block
  113. for layout_block in layout_blocks:
  114. all_bboxes.remove(layout_block)
  115. # 如果new_blocks不为空,则对new_blocks中每个block进行排序
  116. if len(new_blocks) > 0:
  117. for bboxes_in_layout_block in new_blocks:
  118. bboxes_in_layout_block.sort(
  119. key=lambda x: x[1]) # 一个layout内部的box,按照y0自上而下排序
  120. sort_blocks.extend(bboxes_in_layout_block)
  121. # sort_blocks中已经包含了当前页面所有最终留下的block,且已经排好了顺序
  122. return sort_blocks
  123. def fill_spans_in_blocks(blocks, spans, radio):
  124. """将allspans中的span按位置关系,放入blocks中."""
  125. block_with_spans = []
  126. for block in blocks:
  127. block_type = block[7]
  128. block_bbox = block[0:4]
  129. block_dict = {
  130. 'type': block_type,
  131. 'bbox': block_bbox,
  132. }
  133. block_spans = []
  134. for span in spans:
  135. span_bbox = span['bbox']
  136. if calculate_overlap_area_in_bbox1_area_ratio(
  137. span_bbox, block_bbox) > radio:
  138. block_spans.append(span)
  139. '''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
  140. # displayed_list = []
  141. # text_inline_lines = []
  142. # modify_y_axis(block_spans, displayed_list, text_inline_lines)
  143. '''模型识别错误的行间公式, type类型转换成行内公式'''
  144. # block_spans = modify_inline_equation(block_spans, displayed_list, text_inline_lines)
  145. '''bbox去除粘连''' # 去粘连会影响span的bbox,导致后续fill的时候出错
  146. # block_spans = remove_overlap_between_bbox_for_span(block_spans)
  147. block_dict['spans'] = block_spans
  148. block_with_spans.append(block_dict)
  149. # 从spans删除已经放入block_spans中的span
  150. if len(block_spans) > 0:
  151. for span in block_spans:
  152. spans.remove(span)
  153. return block_with_spans, spans
  154. def fix_block_spans(block_with_spans, img_blocks, table_blocks):
  155. """1、img_block和table_block因为包含caption和footnote的关系,存在block的嵌套关系
  156. 需要将caption和footnote的text_span放入相应img_block和table_block内的
  157. caption_block和footnote_block中 2、同时需要删除block中的spans字段."""
  158. fix_blocks = []
  159. for block in block_with_spans:
  160. block_type = block['type']
  161. if block_type == BlockType.Image:
  162. block = fix_image_block(block, img_blocks)
  163. elif block_type == BlockType.Table:
  164. block = fix_table_block(block, table_blocks)
  165. elif block_type in [BlockType.Text, BlockType.Title]:
  166. block = fix_text_block(block)
  167. elif block_type == BlockType.InterlineEquation:
  168. block = fix_interline_block(block)
  169. else:
  170. continue
  171. fix_blocks.append(block)
  172. return fix_blocks
  173. def fix_discarded_block(discarded_block_with_spans):
  174. fix_discarded_blocks = []
  175. for block in discarded_block_with_spans:
  176. block = fix_text_block(block)
  177. fix_discarded_blocks.append(block)
  178. return fix_discarded_blocks
  179. def merge_spans_to_block(spans: list, block_bbox: list, block_type: str):
  180. block_spans = []
  181. # 如果有img_caption,则将img_block中的text_spans放入img_caption_block中
  182. for span in spans:
  183. if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'],
  184. block_bbox) > 0.6:
  185. block_spans.append(span)
  186. block_lines = merge_spans_to_line(block_spans)
  187. # 对line中的span进行排序
  188. sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
  189. block = {'bbox': block_bbox, 'type': block_type, 'lines': sort_block_lines}
  190. return block, block_spans
  191. def make_body_block(span: dict, block_bbox: list, block_type: str):
  192. # 创建body_block
  193. body_line = {
  194. 'bbox': block_bbox,
  195. 'spans': [span],
  196. }
  197. body_block = {'bbox': block_bbox, 'type': block_type, 'lines': [body_line]}
  198. return body_block
  199. def fix_image_block(block, img_blocks):
  200. block['blocks'] = []
  201. # 遍历img_blocks,找到与当前block匹配的img_block
  202. for img_block in img_blocks:
  203. if _is_in_or_part_overlap_with_area_ratio(block['bbox'],
  204. img_block['bbox'], 0.95):
  205. # 创建img_body_block
  206. for span in block['spans']:
  207. if span['type'] == ContentType.Image and img_block[
  208. 'img_body_bbox'] == span['bbox']:
  209. # 创建img_body_block
  210. img_body_block = make_body_block(
  211. span, img_block['img_body_bbox'], BlockType.ImageBody)
  212. block['blocks'].append(img_body_block)
  213. # 从spans中移除img_body_block中已经放入的span
  214. block['spans'].remove(span)
  215. break
  216. # 根据list长度,判断img_block中是否有img_caption
  217. if img_block['img_caption_bbox'] is not None:
  218. img_caption_block, img_caption_spans = merge_spans_to_block(
  219. block['spans'], img_block['img_caption_bbox'],
  220. BlockType.ImageCaption)
  221. block['blocks'].append(img_caption_block)
  222. if img_block['img_footnote_bbox'] is not None:
  223. img_footnote_block, img_footnote_spans = merge_spans_to_block(
  224. block['spans'], img_block['img_footnote_bbox'],
  225. BlockType.ImageFootnote)
  226. block['blocks'].append(img_footnote_block)
  227. break
  228. del block['spans']
  229. return block
  230. def fix_table_block(block, table_blocks):
  231. block['blocks'] = []
  232. # 遍历table_blocks,找到与当前block匹配的table_block
  233. for table_block in table_blocks:
  234. if _is_in_or_part_overlap_with_area_ratio(block['bbox'],
  235. table_block['bbox'], 0.95):
  236. # 创建table_body_block
  237. for span in block['spans']:
  238. if span['type'] == ContentType.Table and table_block[
  239. 'table_body_bbox'] == span['bbox']:
  240. # 创建table_body_block
  241. table_body_block = make_body_block(
  242. span, table_block['table_body_bbox'],
  243. BlockType.TableBody)
  244. block['blocks'].append(table_body_block)
  245. # 从spans中移除img_body_block中已经放入的span
  246. block['spans'].remove(span)
  247. break
  248. # 根据list长度,判断table_block中是否有caption
  249. if table_block['table_caption_bbox'] is not None:
  250. table_caption_block, table_caption_spans = merge_spans_to_block(
  251. block['spans'], table_block['table_caption_bbox'],
  252. BlockType.TableCaption)
  253. block['blocks'].append(table_caption_block)
  254. # 如果table_caption_block_spans不为空
  255. if len(table_caption_spans) > 0:
  256. # 一些span已经放入了caption_block中,需要从block['spans']中删除
  257. for span in table_caption_spans:
  258. block['spans'].remove(span)
  259. # 根据list长度,判断table_block中是否有table_note
  260. if table_block['table_footnote_bbox'] is not None:
  261. table_footnote_block, table_footnote_spans = merge_spans_to_block(
  262. block['spans'], table_block['table_footnote_bbox'],
  263. BlockType.TableFootnote)
  264. block['blocks'].append(table_footnote_block)
  265. break
  266. del block['spans']
  267. return block
  268. def fix_text_block(block):
  269. # 文本block中的公式span都应该转换成行内type
  270. for span in block['spans']:
  271. if span['type'] == ContentType.InterlineEquation:
  272. span['type'] = ContentType.InlineEquation
  273. block_lines = merge_spans_to_line(block['spans'])
  274. sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
  275. block['lines'] = sort_block_lines
  276. del block['spans']
  277. return block
  278. def fix_interline_block(block):
  279. block_lines = merge_spans_to_line(block['spans'])
  280. sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
  281. block['lines'] = sort_block_lines
  282. del block['spans']
  283. return block