ocr_dict_merge.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362
  1. from magic_pdf.config.drop_tag import DropTag
  2. from magic_pdf.config.ocr_content_type import BlockType, ContentType
  3. from magic_pdf.libs.boxbase import (__is_overlaps_y_exceeds_threshold,
  4. _is_in_or_part_overlap_with_area_ratio,
  5. calculate_overlap_area_in_bbox1_area_ratio)
  6. # 将每一个line中的span从左到右排序
  7. def line_sort_spans_by_left_to_right(lines):
  8. line_objects = []
  9. for line in lines:
  10. # 按照x0坐标排序
  11. line.sort(key=lambda span: span['bbox'][0])
  12. line_bbox = [
  13. min(span['bbox'][0] for span in line), # x0
  14. min(span['bbox'][1] for span in line), # y0
  15. max(span['bbox'][2] for span in line), # x1
  16. max(span['bbox'][3] for span in line), # y1
  17. ]
  18. line_objects.append({
  19. 'bbox': line_bbox,
  20. 'spans': line,
  21. })
  22. return line_objects
  23. def merge_spans_to_line(spans):
  24. if len(spans) == 0:
  25. return []
  26. else:
  27. # 按照y0坐标排序
  28. spans.sort(key=lambda span: span['bbox'][1])
  29. lines = []
  30. current_line = [spans[0]]
  31. for span in spans[1:]:
  32. # 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
  33. # image和table类型,同上
  34. if span['type'] in [
  35. ContentType.InterlineEquation, ContentType.Image,
  36. ContentType.Table
  37. ] or any(s['type'] in [
  38. ContentType.InterlineEquation, ContentType.Image,
  39. ContentType.Table
  40. ] for s in current_line):
  41. # 则开始新行
  42. lines.append(current_line)
  43. current_line = [span]
  44. continue
  45. # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
  46. if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox'], 0.5):
  47. current_line.append(span)
  48. else:
  49. # 否则,开始新行
  50. lines.append(current_line)
  51. current_line = [span]
  52. # 添加最后一行
  53. if current_line:
  54. lines.append(current_line)
  55. return lines
  56. def merge_spans_to_line_by_layout(spans, layout_bboxes):
  57. lines = []
  58. new_spans = []
  59. dropped_spans = []
  60. for item in layout_bboxes:
  61. layout_bbox = item['layout_bbox']
  62. # 遍历spans,将每个span放入对应的layout中
  63. layout_sapns = []
  64. for span in spans:
  65. if calculate_overlap_area_in_bbox1_area_ratio(
  66. span['bbox'], layout_bbox) > 0.6:
  67. layout_sapns.append(span)
  68. # 如果layout_sapns不为空,则放入new_spans中
  69. if len(layout_sapns) > 0:
  70. new_spans.append(layout_sapns)
  71. # 从spans删除已经放入layout_sapns中的span
  72. for layout_sapn in layout_sapns:
  73. spans.remove(layout_sapn)
  74. if len(new_spans) > 0:
  75. for layout_sapns in new_spans:
  76. layout_lines = merge_spans_to_line(layout_sapns)
  77. lines.extend(layout_lines)
  78. # 对line中的span进行排序
  79. lines = line_sort_spans_by_left_to_right(lines)
  80. for span in spans:
  81. span['tag'] = DropTag.NOT_IN_LAYOUT
  82. dropped_spans.append(span)
  83. return lines, dropped_spans
  84. def merge_lines_to_block(lines):
  85. # 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
  86. blocks = []
  87. for line in lines:
  88. blocks.append({
  89. 'bbox': line['bbox'],
  90. 'lines': [line],
  91. })
  92. return blocks
  93. def sort_blocks_by_layout(all_bboxes, layout_bboxes):
  94. new_blocks = []
  95. sort_blocks = []
  96. for item in layout_bboxes:
  97. layout_bbox = item['layout_bbox']
  98. # 遍历blocks,将每个blocks放入对应的layout中
  99. layout_blocks = []
  100. for block in all_bboxes:
  101. # 如果是footnote则跳过
  102. if block[7] == BlockType.Footnote:
  103. continue
  104. block_bbox = block[:4]
  105. if calculate_overlap_area_in_bbox1_area_ratio(
  106. block_bbox, layout_bbox) > 0.8:
  107. layout_blocks.append(block)
  108. # 如果layout_blocks不为空,则放入new_blocks中
  109. if len(layout_blocks) > 0:
  110. new_blocks.append(layout_blocks)
  111. # 从all_bboxes删除已经放入layout_blocks中的block
  112. for layout_block in layout_blocks:
  113. all_bboxes.remove(layout_block)
  114. # 如果new_blocks不为空,则对new_blocks中每个block进行排序
  115. if len(new_blocks) > 0:
  116. for bboxes_in_layout_block in new_blocks:
  117. bboxes_in_layout_block.sort(
  118. key=lambda x: x[1]) # 一个layout内部的box,按照y0自上而下排序
  119. sort_blocks.extend(bboxes_in_layout_block)
  120. # sort_blocks中已经包含了当前页面所有最终留下的block,且已经排好了顺序
  121. return sort_blocks
  122. def fill_spans_in_blocks(blocks, spans, radio):
  123. """将allspans中的span按位置关系,放入blocks中."""
  124. block_with_spans = []
  125. for block in blocks:
  126. block_type = block[7]
  127. block_bbox = block[0:4]
  128. block_dict = {
  129. 'type': block_type,
  130. 'bbox': block_bbox,
  131. }
  132. if block_type in [
  133. BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote,
  134. BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote
  135. ]:
  136. block_dict['group_id'] = block[-1]
  137. block_spans = []
  138. for span in spans:
  139. span_bbox = span['bbox']
  140. if calculate_overlap_area_in_bbox1_area_ratio(
  141. span_bbox, block_bbox) > radio:
  142. block_spans.append(span)
  143. '''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
  144. # displayed_list = []
  145. # text_inline_lines = []
  146. # modify_y_axis(block_spans, displayed_list, text_inline_lines)
  147. '''模型识别错误的行间公式, type类型转换成行内公式'''
  148. # block_spans = modify_inline_equation(block_spans, displayed_list, text_inline_lines)
  149. '''bbox去除粘连''' # 去粘连会影响span的bbox,导致后续fill的时候出错
  150. # block_spans = remove_overlap_between_bbox_for_span(block_spans)
  151. block_dict['spans'] = block_spans
  152. block_with_spans.append(block_dict)
  153. # 从spans删除已经放入block_spans中的span
  154. if len(block_spans) > 0:
  155. for span in block_spans:
  156. spans.remove(span)
  157. return block_with_spans, spans
  158. def fix_block_spans(block_with_spans, img_blocks, table_blocks):
  159. """1、img_block和table_block因为包含caption和footnote的关系,存在block的嵌套关系
  160. 需要将caption和footnote的text_span放入相应img_block和table_block内的
  161. caption_block和footnote_block中 2、同时需要删除block中的spans字段."""
  162. fix_blocks = []
  163. for block in block_with_spans:
  164. block_type = block['type']
  165. if block_type == BlockType.Image:
  166. block = fix_image_block(block, img_blocks)
  167. elif block_type == BlockType.Table:
  168. block = fix_table_block(block, table_blocks)
  169. elif block_type in [BlockType.Text, BlockType.Title]:
  170. block = fix_text_block(block)
  171. elif block_type == BlockType.InterlineEquation:
  172. block = fix_interline_block(block)
  173. else:
  174. continue
  175. fix_blocks.append(block)
  176. return fix_blocks
  177. def fix_block_spans_v2(block_with_spans):
  178. """1、img_block和table_block因为包含caption和footnote的关系,存在block的嵌套关系
  179. 需要将caption和footnote的text_span放入相应img_block和table_block内的
  180. caption_block和footnote_block中 2、同时需要删除block中的spans字段."""
  181. fix_blocks = []
  182. for block in block_with_spans:
  183. block_type = block['type']
  184. if block_type in [BlockType.Text, BlockType.Title,
  185. BlockType.ImageCaption, BlockType.ImageFootnote,
  186. BlockType.TableCaption, BlockType.TableFootnote
  187. ]:
  188. block = fix_text_block(block)
  189. elif block_type in [BlockType.InterlineEquation, BlockType.ImageBody, BlockType.TableBody]:
  190. block = fix_interline_block(block)
  191. else:
  192. continue
  193. fix_blocks.append(block)
  194. return fix_blocks
  195. def fix_discarded_block(discarded_block_with_spans):
  196. fix_discarded_blocks = []
  197. for block in discarded_block_with_spans:
  198. block = fix_text_block(block)
  199. fix_discarded_blocks.append(block)
  200. return fix_discarded_blocks
  201. def merge_spans_to_block(spans: list, block_bbox: list, block_type: str):
  202. block_spans = []
  203. # 如果有img_caption,则将img_block中的text_spans放入img_caption_block中
  204. for span in spans:
  205. if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'],
  206. block_bbox) > 0.6:
  207. block_spans.append(span)
  208. block_lines = merge_spans_to_line(block_spans)
  209. # 对line中的span进行排序
  210. sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
  211. block = {'bbox': block_bbox, 'type': block_type, 'lines': sort_block_lines}
  212. return block, block_spans
  213. def make_body_block(span: dict, block_bbox: list, block_type: str):
  214. # 创建body_block
  215. body_line = {
  216. 'bbox': block_bbox,
  217. 'spans': [span],
  218. }
  219. body_block = {'bbox': block_bbox, 'type': block_type, 'lines': [body_line]}
  220. return body_block
  221. def fix_image_block(block, img_blocks):
  222. block['blocks'] = []
  223. # 遍历img_blocks,找到与当前block匹配的img_block
  224. for img_block in img_blocks:
  225. if _is_in_or_part_overlap_with_area_ratio(block['bbox'],
  226. img_block['bbox'], 0.95):
  227. # 创建img_body_block
  228. for span in block['spans']:
  229. if span['type'] == ContentType.Image and img_block[
  230. 'img_body_bbox'] == span['bbox']:
  231. # 创建img_body_block
  232. img_body_block = make_body_block(
  233. span, img_block['img_body_bbox'], BlockType.ImageBody)
  234. block['blocks'].append(img_body_block)
  235. # 从spans中移除img_body_block中已经放入的span
  236. block['spans'].remove(span)
  237. break
  238. # 根据list长度,判断img_block中是否有img_caption
  239. if img_block['img_caption_bbox'] is not None:
  240. img_caption_block, img_caption_spans = merge_spans_to_block(
  241. block['spans'], img_block['img_caption_bbox'],
  242. BlockType.ImageCaption)
  243. block['blocks'].append(img_caption_block)
  244. if img_block['img_footnote_bbox'] is not None:
  245. img_footnote_block, img_footnote_spans = merge_spans_to_block(
  246. block['spans'], img_block['img_footnote_bbox'],
  247. BlockType.ImageFootnote)
  248. block['blocks'].append(img_footnote_block)
  249. break
  250. del block['spans']
  251. return block
  252. def fix_table_block(block, table_blocks):
  253. block['blocks'] = []
  254. # 遍历table_blocks,找到与当前block匹配的table_block
  255. for table_block in table_blocks:
  256. if _is_in_or_part_overlap_with_area_ratio(block['bbox'],
  257. table_block['bbox'], 0.95):
  258. # 创建table_body_block
  259. for span in block['spans']:
  260. if span['type'] == ContentType.Table and table_block[
  261. 'table_body_bbox'] == span['bbox']:
  262. # 创建table_body_block
  263. table_body_block = make_body_block(
  264. span, table_block['table_body_bbox'],
  265. BlockType.TableBody)
  266. block['blocks'].append(table_body_block)
  267. # 从spans中移除img_body_block中已经放入的span
  268. block['spans'].remove(span)
  269. break
  270. # 根据list长度,判断table_block中是否有caption
  271. if table_block['table_caption_bbox'] is not None:
  272. table_caption_block, table_caption_spans = merge_spans_to_block(
  273. block['spans'], table_block['table_caption_bbox'],
  274. BlockType.TableCaption)
  275. block['blocks'].append(table_caption_block)
  276. # 如果table_caption_block_spans不为空
  277. if len(table_caption_spans) > 0:
  278. # 一些span已经放入了caption_block中,需要从block['spans']中删除
  279. for span in table_caption_spans:
  280. block['spans'].remove(span)
  281. # 根据list长度,判断table_block中是否有table_note
  282. if table_block['table_footnote_bbox'] is not None:
  283. table_footnote_block, table_footnote_spans = merge_spans_to_block(
  284. block['spans'], table_block['table_footnote_bbox'],
  285. BlockType.TableFootnote)
  286. block['blocks'].append(table_footnote_block)
  287. break
  288. del block['spans']
  289. return block
  290. def fix_text_block(block):
  291. # 文本block中的公式span都应该转换成行内type
  292. for span in block['spans']:
  293. if span['type'] == ContentType.InterlineEquation:
  294. span['type'] = ContentType.InlineEquation
  295. block_lines = merge_spans_to_line(block['spans'])
  296. sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
  297. block['lines'] = sort_block_lines
  298. del block['spans']
  299. return block
  300. def fix_interline_block(block):
  301. block_lines = merge_spans_to_line(block['spans'])
  302. sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
  303. block['lines'] = sort_block_lines
  304. del block['spans']
  305. return block