# remove_footer_header.py

import re

from magic_pdf.libs.boxbase import _is_in_or_part_overlap
from magic_pdf.libs.drop_tag import CONTENT_IN_FOOT_OR_HEADER, PAGE_NO
  4. def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes, header_bboxs, footer_bboxs,
  5. page_no_bboxs, page_w, page_h):
  6. """
  7. 删除页眉页脚,页码
  8. 从line级别进行删除,删除之后观察这个text-block是否是空的,如果是空的,则移动到remove_list中
  9. """
  10. header = []
  11. footer = []
  12. if len(header) == 0:
  13. model_header = header_bboxs
  14. if model_header:
  15. x0 = min([x for x, _, _, _ in model_header])
  16. y0 = min([y for _, y, _, _ in model_header])
  17. x1 = max([x1 for _, _, x1, _ in model_header])
  18. y1 = max([y1 for _, _, _, y1 in model_header])
  19. header = [x0, y0, x1, y1]
  20. if len(footer) == 0:
  21. model_footer = footer_bboxs
  22. if model_footer:
  23. x0 = min([x for x, _, _, _ in model_footer])
  24. y0 = min([y for _, y, _, _ in model_footer])
  25. x1 = max([x1 for _, _, x1, _ in model_footer])
  26. y1 = max([y1 for _, _, _, y1 in model_footer])
  27. footer = [x0, y0, x1, y1]
  28. header_y0 = 0 if len(header) == 0 else header[3]
  29. footer_y0 = page_h if len(footer) == 0 else footer[1]
  30. if page_no_bboxs:
  31. top_part = [b for b in page_no_bboxs if b[3] < page_h / 2]
  32. btn_part = [b for b in page_no_bboxs if b[1] > page_h / 2]
  33. top_max_y0 = max([b[1] for b in top_part]) if top_part else 0
  34. btn_min_y1 = min([b[3] for b in btn_part]) if btn_part else page_h
  35. header_y0 = max(header_y0, top_max_y0)
  36. footer_y0 = min(footer_y0, btn_min_y1)
  37. content_boundry = [0, header_y0, page_w, footer_y0]
  38. header = [0, 0, page_w, header_y0]
  39. footer = [0, footer_y0, page_w, page_h]
  40. """以上计算出来了页眉页脚的边界,下面开始进行删除"""
  41. text_block_to_remove = []
  42. # 首先检查每个textblock
  43. for blk in text_raw_blocks:
  44. if len(blk['lines']) > 0:
  45. for line in blk['lines']:
  46. line_del = []
  47. for span in line['spans']:
  48. span_del = []
  49. if span['bbox'][3] < header_y0:
  50. span_del.append(span)
  51. elif _is_in_or_part_overlap(span['bbox'], header) or _is_in_or_part_overlap(span['bbox'], footer):
  52. span_del.append(span)
  53. for span in span_del:
  54. line['spans'].remove(span)
  55. if not line['spans']:
  56. line_del.append(line)
  57. for line in line_del:
  58. blk['lines'].remove(line)
  59. else:
  60. # if not blk['lines']:
  61. blk['tag'] = CONTENT_IN_FOOT_OR_HEADER
  62. text_block_to_remove.append(blk)
  63. """有的时候由于pageNo太小了,总是会有一点和content_boundry重叠一点,被放入正文,因此对于pageNo,进行span粒度的删除"""
  64. page_no_block_2_remove = []
  65. if page_no_bboxs:
  66. for pagenobox in page_no_bboxs:
  67. for block in text_raw_blocks:
  68. if _is_in_or_part_overlap(pagenobox, block['bbox']): # 在span级别删除页码
  69. for line in block['lines']:
  70. for span in line['spans']:
  71. if _is_in_or_part_overlap(pagenobox, span['bbox']):
  72. # span['text'] = ''
  73. span['tag'] = PAGE_NO
  74. # 检查这个block是否只有这一个span,如果是,那么就把这个block也删除
  75. if len(line['spans']) == 1 and len(block['lines']) == 1:
  76. page_no_block_2_remove.append(block)
  77. else:
  78. # 测试最后一个是不是页码:规则是,最后一个block仅有1个line,一个span,且text是数字,空格,符号组成,不含字母,并且包含数字
  79. if len(text_raw_blocks) > 0:
  80. text_raw_blocks.sort(key=lambda x: x['bbox'][1], reverse=True)
  81. last_block = text_raw_blocks[0]
  82. if len(last_block['lines']) == 1:
  83. last_line = last_block['lines'][0]
  84. if len(last_line['spans']) == 1:
  85. last_span = last_line['spans'][0]
  86. if last_span['text'].strip() and not re.search('[a-zA-Z]', last_span['text']) and re.search('[0-9]',
  87. last_span[
  88. 'text']):
  89. last_span['tag'] = PAGE_NO
  90. page_no_block_2_remove.append(last_block)
  91. for b in page_no_block_2_remove:
  92. text_block_to_remove.append(b)
  93. for blk in text_block_to_remove:
  94. if blk in text_raw_blocks:
  95. text_raw_blocks.remove(blk)
  96. text_block_remain = text_raw_blocks
  97. image_bbox_to_remove = [bbox for bbox in image_bboxes if not _is_in_or_part_overlap(bbox, content_boundry)]
  98. image_bbox_remain = [bbox for bbox in image_bboxes if _is_in_or_part_overlap(bbox, content_boundry)]
  99. table_bbox_to_remove = [bbox for bbox in table_bboxes if not _is_in_or_part_overlap(bbox, content_boundry)]
  100. table_bbox_remain = [bbox for bbox in table_bboxes if _is_in_or_part_overlap(bbox, content_boundry)]
  101. return image_bbox_remain, table_bbox_remain, text_block_remain, text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove