ocr_dict_merge.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336
  1. from loguru import logger
  2. from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \
  3. calculate_overlap_area_in_bbox1_area_ratio, _is_in_or_part_overlap_with_area_ratio
  4. from magic_pdf.libs.drop_tag import DropTag
  5. from magic_pdf.libs.ocr_content_type import ContentType, BlockType
  6. from magic_pdf.pre_proc.ocr_span_list_modify import modify_y_axis, modify_inline_equation
  7. from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_span
  8. # 将每一个line中的span从左到右排序
  9. def line_sort_spans_by_left_to_right(lines):
  10. line_objects = []
  11. for line in lines:
  12. # 按照x0坐标排序
  13. line.sort(key=lambda span: span['bbox'][0])
  14. line_bbox = [
  15. min(span['bbox'][0] for span in line), # x0
  16. min(span['bbox'][1] for span in line), # y0
  17. max(span['bbox'][2] for span in line), # x1
  18. max(span['bbox'][3] for span in line), # y1
  19. ]
  20. line_objects.append({
  21. "bbox": line_bbox,
  22. "spans": line,
  23. })
  24. return line_objects
  25. def merge_spans_to_line(spans):
  26. if len(spans) == 0:
  27. return []
  28. else:
  29. # 按照y0坐标排序
  30. spans.sort(key=lambda span: span['bbox'][1])
  31. lines = []
  32. current_line = [spans[0]]
  33. for span in spans[1:]:
  34. # 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
  35. # image和table类型,同上
  36. if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
  37. s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in
  38. current_line):
  39. # 则开始新行
  40. lines.append(current_line)
  41. current_line = [span]
  42. continue
  43. # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
  44. if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
  45. current_line.append(span)
  46. else:
  47. # 否则,开始新行
  48. lines.append(current_line)
  49. current_line = [span]
  50. # 添加最后一行
  51. if current_line:
  52. lines.append(current_line)
  53. return lines
  54. def merge_spans_to_line_by_layout(spans, layout_bboxes):
  55. lines = []
  56. new_spans = []
  57. dropped_spans = []
  58. for item in layout_bboxes:
  59. layout_bbox = item['layout_bbox']
  60. # 遍历spans,将每个span放入对应的layout中
  61. layout_sapns = []
  62. for span in spans:
  63. if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], layout_bbox) > 0.6:
  64. layout_sapns.append(span)
  65. # 如果layout_sapns不为空,则放入new_spans中
  66. if len(layout_sapns) > 0:
  67. new_spans.append(layout_sapns)
  68. # 从spans删除已经放入layout_sapns中的span
  69. for layout_sapn in layout_sapns:
  70. spans.remove(layout_sapn)
  71. if len(new_spans) > 0:
  72. for layout_sapns in new_spans:
  73. layout_lines = merge_spans_to_line(layout_sapns)
  74. lines.extend(layout_lines)
  75. # 对line中的span进行排序
  76. lines = line_sort_spans_by_left_to_right(lines)
  77. for span in spans:
  78. span['tag'] = DropTag.NOT_IN_LAYOUT
  79. dropped_spans.append(span)
  80. return lines, dropped_spans
  81. def merge_lines_to_block(lines):
  82. # 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
  83. blocks = []
  84. for line in lines:
  85. blocks.append(
  86. {
  87. "bbox": line["bbox"],
  88. "lines": [line],
  89. }
  90. )
  91. return blocks
  92. def sort_blocks_by_layout(all_bboxes, layout_bboxes):
  93. new_blocks = []
  94. sort_blocks = []
  95. for item in layout_bboxes:
  96. layout_bbox = item['layout_bbox']
  97. # 遍历blocks,将每个blocks放入对应的layout中
  98. layout_blocks = []
  99. for block in all_bboxes:
  100. # 如果是footnote则跳过
  101. if block[7] == BlockType.Footnote:
  102. continue
  103. block_bbox = block[:4]
  104. if calculate_overlap_area_in_bbox1_area_ratio(block_bbox, layout_bbox) > 0.8:
  105. layout_blocks.append(block)
  106. # 如果layout_blocks不为空,则放入new_blocks中
  107. if len(layout_blocks) > 0:
  108. new_blocks.append(layout_blocks)
  109. # 从all_bboxes删除已经放入layout_blocks中的block
  110. for layout_block in layout_blocks:
  111. all_bboxes.remove(layout_block)
  112. # 如果new_blocks不为空,则对new_blocks中每个block进行排序
  113. if len(new_blocks) > 0:
  114. for bboxes_in_layout_block in new_blocks:
  115. bboxes_in_layout_block.sort(key=lambda x: x[1]) # 一个layout内部的box,按照y0自上而下排序
  116. sort_blocks.extend(bboxes_in_layout_block)
  117. # sort_blocks中已经包含了当前页面所有最终留下的block,且已经排好了顺序
  118. return sort_blocks
  119. def fill_spans_in_blocks(blocks, spans, radio):
  120. '''
  121. 将allspans中的span按位置关系,放入blocks中
  122. '''
  123. block_with_spans = []
  124. for block in blocks:
  125. block_type = block[7]
  126. block_bbox = block[0:4]
  127. block_dict = {
  128. 'type': block_type,
  129. 'bbox': block_bbox,
  130. }
  131. block_spans = []
  132. for span in spans:
  133. span_bbox = span['bbox']
  134. if calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > radio:
  135. block_spans.append(span)
  136. '''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
  137. # displayed_list = []
  138. # text_inline_lines = []
  139. # modify_y_axis(block_spans, displayed_list, text_inline_lines)
  140. '''模型识别错误的行间公式, type类型转换成行内公式'''
  141. # block_spans = modify_inline_equation(block_spans, displayed_list, text_inline_lines)
  142. '''bbox去除粘连''' # 去粘连会影响span的bbox,导致后续fill的时候出错
  143. # block_spans = remove_overlap_between_bbox_for_span(block_spans)
  144. block_dict['spans'] = block_spans
  145. block_with_spans.append(block_dict)
  146. # 从spans删除已经放入block_spans中的span
  147. if len(block_spans) > 0:
  148. for span in block_spans:
  149. spans.remove(span)
  150. return block_with_spans, spans
  151. def fix_block_spans(block_with_spans, img_blocks, table_blocks):
  152. '''
  153. 1、img_block和table_block因为包含caption和footnote的关系,存在block的嵌套关系
  154. 需要将caption和footnote的text_span放入相应img_block和table_block内的
  155. caption_block和footnote_block中
  156. 2、同时需要删除block中的spans字段
  157. '''
  158. fix_blocks = []
  159. for block in block_with_spans:
  160. block_type = block['type']
  161. if block_type == BlockType.Image:
  162. block = fix_image_block(block, img_blocks)
  163. elif block_type == BlockType.Table:
  164. block = fix_table_block(block, table_blocks)
  165. elif block_type in [BlockType.Text, BlockType.Title]:
  166. block = fix_text_block(block)
  167. elif block_type == BlockType.InterlineEquation:
  168. block = fix_interline_block(block)
  169. else:
  170. continue
  171. fix_blocks.append(block)
  172. return fix_blocks
  173. def fix_discarded_block(discarded_block_with_spans):
  174. fix_discarded_blocks = []
  175. for block in discarded_block_with_spans:
  176. block = fix_text_block(block)
  177. fix_discarded_blocks.append(block)
  178. return fix_discarded_blocks
  179. def merge_spans_to_block(spans: list, block_bbox: list, block_type: str):
  180. block_spans = []
  181. # 如果有img_caption,则将img_block中的text_spans放入img_caption_block中
  182. for span in spans:
  183. if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.6:
  184. block_spans.append(span)
  185. block_lines = merge_spans_to_line(block_spans)
  186. # 对line中的span进行排序
  187. sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
  188. block = {
  189. 'bbox': block_bbox,
  190. 'type': block_type,
  191. 'lines': sort_block_lines
  192. }
  193. return block, block_spans
  194. def make_body_block(span: dict, block_bbox: list, block_type: str):
  195. # 创建body_block
  196. body_line = {
  197. 'bbox': block_bbox,
  198. 'spans': [span],
  199. }
  200. body_block = {
  201. 'bbox': block_bbox,
  202. 'type': block_type,
  203. 'lines': [body_line]
  204. }
  205. return body_block
  206. def fix_image_block(block, img_blocks):
  207. block['blocks'] = []
  208. # 遍历img_blocks,找到与当前block匹配的img_block
  209. for img_block in img_blocks:
  210. if _is_in_or_part_overlap_with_area_ratio(block['bbox'], img_block['bbox'], 0.95):
  211. # 创建img_body_block
  212. for span in block['spans']:
  213. if span['type'] == ContentType.Image and img_block['img_body_bbox'] == span['bbox']:
  214. # 创建img_body_block
  215. img_body_block = make_body_block(span, img_block['img_body_bbox'], BlockType.ImageBody)
  216. block['blocks'].append(img_body_block)
  217. # 从spans中移除img_body_block中已经放入的span
  218. block['spans'].remove(span)
  219. break
  220. # 根据list长度,判断img_block中是否有img_caption
  221. if img_block['img_caption_bbox'] is not None:
  222. img_caption_block, img_caption_spans = merge_spans_to_block(
  223. block['spans'], img_block['img_caption_bbox'], BlockType.ImageCaption
  224. )
  225. block['blocks'].append(img_caption_block)
  226. break
  227. del block['spans']
  228. return block
  229. def fix_table_block(block, table_blocks):
  230. block['blocks'] = []
  231. # 遍历table_blocks,找到与当前block匹配的table_block
  232. for table_block in table_blocks:
  233. if _is_in_or_part_overlap_with_area_ratio(block['bbox'], table_block['bbox'], 0.95):
  234. # 创建table_body_block
  235. for span in block['spans']:
  236. if span['type'] == ContentType.Table and table_block['table_body_bbox'] == span['bbox']:
  237. # 创建table_body_block
  238. table_body_block = make_body_block(span, table_block['table_body_bbox'], BlockType.TableBody)
  239. block['blocks'].append(table_body_block)
  240. # 从spans中移除img_body_block中已经放入的span
  241. block['spans'].remove(span)
  242. break
  243. # 根据list长度,判断table_block中是否有caption
  244. if table_block['table_caption_bbox'] is not None:
  245. table_caption_block, table_caption_spans = merge_spans_to_block(
  246. block['spans'], table_block['table_caption_bbox'], BlockType.TableCaption
  247. )
  248. block['blocks'].append(table_caption_block)
  249. # 如果table_caption_block_spans不为空
  250. if len(table_caption_spans) > 0:
  251. # 一些span已经放入了caption_block中,需要从block['spans']中删除
  252. for span in table_caption_spans:
  253. block['spans'].remove(span)
  254. # 根据list长度,判断table_block中是否有table_note
  255. if table_block['table_footnote_bbox'] is not None:
  256. table_footnote_block, table_footnote_spans = merge_spans_to_block(
  257. block['spans'], table_block['table_footnote_bbox'], BlockType.TableFootnote
  258. )
  259. block['blocks'].append(table_footnote_block)
  260. break
  261. del block['spans']
  262. return block
  263. def fix_text_block(block):
  264. # 文本block中的公式span都应该转换成行内type
  265. for span in block['spans']:
  266. if span['type'] == ContentType.InterlineEquation:
  267. span['type'] = ContentType.InlineEquation
  268. block_lines = merge_spans_to_line(block['spans'])
  269. sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
  270. block['lines'] = sort_block_lines
  271. del block['spans']
  272. return block
  273. def fix_interline_block(block):
  274. block_lines = merge_spans_to_line(block['spans'])
  275. sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
  276. block['lines'] = sort_block_lines
  277. del block['spans']
  278. return block