# remove_footnote.py
  1. from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap
  2. import collections # 统计库
  3. def is_below(bbox1, bbox2):
  4. # 如果block1的上边y坐标大于block2的下边y坐标,那么block1在block2下面
  5. return bbox1[1] > bbox2[3]
  6. def merge_bboxes(bboxes):
  7. # 找出所有blocks的最小x0,最大y1,最大x1,最小y0,这就是合并后的bbox
  8. x0 = min(bbox[0] for bbox in bboxes)
  9. y0 = min(bbox[1] for bbox in bboxes)
  10. x1 = max(bbox[2] for bbox in bboxes)
  11. y1 = max(bbox[3] for bbox in bboxes)
  12. return [x0, y0, x1, y1]
  13. def merge_footnote_blocks(page_info, main_text_font):
  14. page_info['merged_bboxes'] = []
  15. for layout in page_info['layout_bboxes']:
  16. # 找出layout中的所有footnote blocks和preproc_blocks
  17. footnote_bboxes = [block for block in page_info['footnote_bboxes_tmp'] if _is_in(block, layout['layout_bbox'])]
  18. # 如果没有footnote_blocks,就跳过这个layout
  19. if not footnote_bboxes:
  20. continue
  21. preproc_blocks = [block for block in page_info['preproc_blocks'] if _is_in(block['bbox'], layout['layout_bbox'])]
  22. # preproc_bboxes = [block['bbox'] for block in preproc_blocks]
  23. font_names = collections.Counter()
  24. if len(preproc_blocks) > 0:
  25. # 存储每一行的文本块大小的列表
  26. line_sizes = []
  27. # 存储每个文本块的平均行大小
  28. block_sizes = []
  29. for block in preproc_blocks:
  30. block_line_sizes = []
  31. block_fonts = collections.Counter()
  32. for line in block['lines']:
  33. # 提取每个span的size属性,并计算行大小
  34. span_sizes = [span['size'] for span in line['spans'] if 'size' in span]
  35. if span_sizes:
  36. line_size = sum(span_sizes) / len(span_sizes)
  37. line_sizes.append(line_size)
  38. block_line_sizes.append(line_size)
  39. span_font = [(span['font'], len(span['text'])) for span in line['spans'] if
  40. 'font' in span and len(span['text']) > 0]
  41. if span_font:
  42. # # todo main_text_font应该用基于字数最多的字体而不是span级别的统计
  43. # font_names.append(font_name for font_name in span_font)
  44. # block_fonts.append(font_name for font_name in span_font)
  45. for font, count in span_font:
  46. # font_names.extend([font] * count)
  47. # block_fonts.extend([font] * count)
  48. font_names[font] += count
  49. block_fonts[font] += count
  50. if block_line_sizes:
  51. # 计算文本块的平均行大小
  52. block_size = sum(block_line_sizes) / len(block_line_sizes)
  53. block_font = block_fonts.most_common(1)[0][0]
  54. block_sizes.append((block, block_size, block_font))
  55. # 计算main_text_size
  56. # main_text_font = font_names.most_common(1)[0][0]
  57. main_text_size = collections.Counter(line_sizes).most_common(1)[0][0]
  58. else:
  59. continue
  60. need_merge_bboxes = []
  61. # 任何一个下面有正文block的footnote bbox都是假footnote
  62. for footnote_bbox in footnote_bboxes:
  63. # 检测footnote下面是否有正文block(正文block需满足,block平均size大于等于main_text_size,且block行数大于等于5)
  64. main_text_bboxes_below = [block['bbox'] for block, size, block_font in block_sizes if
  65. is_below(block['bbox'], footnote_bbox) and
  66. sum([size >= main_text_size,
  67. len(block['lines']) >= 5,
  68. block_font == main_text_font]) >= 2]
  69. # 如果main_text_bboxes_below不为空,说明footnote下面有正文block,这个footnote不成立,跳过
  70. if len(main_text_bboxes_below) > 0:
  71. continue
  72. else:
  73. # 否则,说明footnote下面没有正文block,这个footnote成立,添加到待merge的footnote_bboxes中
  74. need_merge_bboxes.append(footnote_bbox)
  75. if len(need_merge_bboxes) == 0:
  76. continue
  77. # 找出最靠上的footnote block
  78. top_footnote_bbox = min(need_merge_bboxes, key=lambda bbox: bbox[1])
  79. # 找出所有在top_footnote_block下面的preproc_blocks,并确保这些preproc_blocks的平均行大小小于main_text_size
  80. bboxes_below = [block['bbox'] for block, size, block_font in block_sizes if is_below(block['bbox'], top_footnote_bbox)]
  81. # # 找出所有在top_footnote_block下面的preproc_blocks
  82. # bboxes_below = [bbox for bbox in preproc_bboxes if is_below(bbox, top_footnote_bbox)]
  83. # 合并top_footnote_block和blocks_below
  84. merged_bbox = merge_bboxes([top_footnote_bbox] + bboxes_below)
  85. # 添加到新的footnote_bboxes_tmp中
  86. page_info['merged_bboxes'].append(merged_bbox)
  87. return page_info
  88. def remove_footnote_blocks(page_info):
  89. if page_info.get('merged_bboxes'):
  90. # 从文字中去掉footnote
  91. remain_text_blocks, removed_footnote_text_blocks = remove_footnote_text(page_info['preproc_blocks'], page_info['merged_bboxes'])
  92. # 从图片中去掉footnote
  93. image_blocks, removed_footnote_imgs_blocks = remove_footnote_image(page_info['images'], page_info['merged_bboxes'])
  94. # 更新page_info
  95. page_info['preproc_blocks'] = remain_text_blocks
  96. page_info['images'] = image_blocks
  97. page_info['droped_text_block'].extend(removed_footnote_text_blocks)
  98. page_info['droped_image_block'].extend(removed_footnote_imgs_blocks)
  99. # 删除footnote_bboxes_tmp和merged_bboxes
  100. del page_info['merged_bboxes']
  101. del page_info['footnote_bboxes_tmp']
  102. return page_info
  103. def remove_footnote_text(raw_text_block, footnote_bboxes):
  104. """
  105. :param raw_text_block: str类型,是当前页的文本内容
  106. :param footnoteBboxes: list类型,是当前页的脚注bbox
  107. """
  108. footnote_text_blocks = []
  109. for block in raw_text_block:
  110. text_bbox = block['bbox']
  111. # TODO 更严谨点在line级别做
  112. if any([_is_in_or_part_overlap(text_bbox, footnote_bbox) for footnote_bbox in footnote_bboxes]):
  113. # if any([text_bbox[3]>=footnote_bbox[1] for footnote_bbox in footnote_bboxes]):
  114. block['tag'] = 'footnote'
  115. footnote_text_blocks.append(block)
  116. # raw_text_block.remove(block)
  117. # 移除,不能再内部移除,否则会出错
  118. for block in footnote_text_blocks:
  119. raw_text_block.remove(block)
  120. return raw_text_block, footnote_text_blocks
  121. def remove_footnote_image(image_blocks, footnote_bboxes):
  122. """
  123. :param image_bboxes: list类型,是当前页的图片bbox(结构体)
  124. :param footnoteBboxes: list类型,是当前页的脚注bbox
  125. """
  126. footnote_imgs_blocks = []
  127. for image_block in image_blocks:
  128. if any([_is_in(image_block['bbox'], footnote_bbox) for footnote_bbox in footnote_bboxes]):
  129. footnote_imgs_blocks.append(image_block)
  130. for footnote_imgs_block in footnote_imgs_blocks:
  131. image_blocks.remove(footnote_imgs_block)
  132. return image_blocks, footnote_imgs_blocks