remove_footer_header.py 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159
  1. import re
  2. from magic_pdf.libs.boxbase import _is_in_or_part_overlap
  3. from magic_pdf.libs.drop_tag import CONTENT_IN_FOOT_OR_HEADER, PAGE_NO
  4. """
  5. Copied from pre_proc/remove_footer_header.py.
  6. """
  7. def remove_headder_footer_one_page(
  8. text_raw_blocks,
  9. image_bboxes,
  10. table_bboxes,
  11. header_bboxs,
  12. footer_bboxs,
  13. page_no_bboxs,
  14. page_w,
  15. page_h,
  16. ):
  17. """
  18. 删除页眉页脚,页码
  19. 从line级别进行删除,删除之后观察这个text-block是否是空的,如果是空的,则移动到remove_list中
  20. """
  21. if 1:
  22. return image_bboxes, table_bboxes, text_raw_blocks, [], [], []
  23. header = []
  24. footer = []
  25. if len(header) == 0:
  26. model_header = header_bboxs
  27. if model_header:
  28. x0 = min([x for x, _, _, _ in model_header])
  29. y0 = min([y for _, y, _, _ in model_header])
  30. x1 = max([x1 for _, _, x1, _ in model_header])
  31. y1 = max([y1 for _, _, _, y1 in model_header])
  32. header = [x0, y0, x1, y1]
  33. if len(footer) == 0:
  34. model_footer = footer_bboxs
  35. if model_footer:
  36. x0 = min([x for x, _, _, _ in model_footer])
  37. y0 = min([y for _, y, _, _ in model_footer])
  38. x1 = max([x1 for _, _, x1, _ in model_footer])
  39. y1 = max([y1 for _, _, _, y1 in model_footer])
  40. footer = [x0, y0, x1, y1]
  41. header_y0 = 0 if len(header) == 0 else header[3]
  42. footer_y0 = page_h if len(footer) == 0 else footer[1]
  43. if page_no_bboxs:
  44. top_part = [b for b in page_no_bboxs if b[3] < page_h / 2]
  45. btn_part = [b for b in page_no_bboxs if b[1] > page_h / 2]
  46. top_max_y0 = max([b[1] for b in top_part]) if top_part else 0
  47. btn_min_y1 = min([b[3] for b in btn_part]) if btn_part else page_h
  48. header_y0 = max(header_y0, top_max_y0)
  49. footer_y0 = min(footer_y0, btn_min_y1)
  50. content_boundry = [0, header_y0, page_w, footer_y0]
  51. header = [0, 0, page_w, header_y0]
  52. footer = [0, footer_y0, page_w, page_h]
  53. """以上计算出来了页眉页脚的边界,下面开始进行删除"""
  54. text_block_to_remove = []
  55. # 首先检查每个textblock
  56. for blk in text_raw_blocks:
  57. if len(blk["lines"]) > 0:
  58. for line in blk["lines"]:
  59. line_del = []
  60. for span in line["spans"]:
  61. span_del = []
  62. if span["bbox"][3] < header_y0:
  63. span_del.append(span)
  64. elif _is_in_or_part_overlap(
  65. span["bbox"], header
  66. ) or _is_in_or_part_overlap(span["bbox"], footer):
  67. span_del.append(span)
  68. for span in span_del:
  69. line["spans"].remove(span)
  70. if not line["spans"]:
  71. line_del.append(line)
  72. for line in line_del:
  73. blk["lines"].remove(line)
  74. else:
  75. # if not blk['lines']:
  76. blk["tag"] = CONTENT_IN_FOOT_OR_HEADER
  77. text_block_to_remove.append(blk)
  78. """有的时候由于pageNo太小了,总是会有一点和content_boundry重叠一点,被放入正文,因此对于pageNo,进行span粒度的删除"""
  79. page_no_block_2_remove = []
  80. if page_no_bboxs:
  81. for pagenobox in page_no_bboxs:
  82. for block in text_raw_blocks:
  83. if _is_in_or_part_overlap(
  84. pagenobox, block["bbox"]
  85. ): # 在span级别删除页码
  86. for line in block["lines"]:
  87. for span in line["spans"]:
  88. if _is_in_or_part_overlap(pagenobox, span["bbox"]):
  89. # span['text'] = ''
  90. span["tag"] = PAGE_NO
  91. # 检查这个block是否只有这一个span,如果是,那么就把这个block也删除
  92. if len(line["spans"]) == 1 and len(block["lines"]) == 1:
  93. page_no_block_2_remove.append(block)
  94. else:
  95. # 测试最后一个是不是页码:规则是,最后一个block仅有1个line,一个span,且text是数字,空格,符号组成,不含字母,并且包含数字
  96. if len(text_raw_blocks) > 0:
  97. text_raw_blocks.sort(key=lambda x: x["bbox"][1], reverse=True)
  98. last_block = text_raw_blocks[0]
  99. if len(last_block["lines"]) == 1:
  100. last_line = last_block["lines"][0]
  101. if len(last_line["spans"]) == 1:
  102. last_span = last_line["spans"][0]
  103. if (
  104. last_span["text"].strip()
  105. and not re.search("[a-zA-Z]", last_span["text"])
  106. and re.search("[0-9]", last_span["text"])
  107. ):
  108. last_span["tag"] = PAGE_NO
  109. page_no_block_2_remove.append(last_block)
  110. for b in page_no_block_2_remove:
  111. text_block_to_remove.append(b)
  112. for blk in text_block_to_remove:
  113. if blk in text_raw_blocks:
  114. text_raw_blocks.remove(blk)
  115. text_block_remain = text_raw_blocks
  116. image_bbox_to_remove = [
  117. bbox
  118. for bbox in image_bboxes
  119. if not _is_in_or_part_overlap(bbox, content_boundry)
  120. ]
  121. image_bbox_remain = [
  122. bbox for bbox in image_bboxes if _is_in_or_part_overlap(bbox, content_boundry)
  123. ]
  124. table_bbox_to_remove = [
  125. bbox
  126. for bbox in table_bboxes
  127. if not _is_in_or_part_overlap(bbox, content_boundry)
  128. ]
  129. table_bbox_remain = [
  130. bbox for bbox in table_bboxes if _is_in_or_part_overlap(bbox, content_boundry)
  131. ]
  132. # 1, 2, 3
  133. return (
  134. image_bbox_remain,
  135. table_bbox_remain,
  136. text_block_remain,
  137. text_block_to_remove,
  138. image_bbox_to_remove,
  139. table_bbox_to_remove,
  140. )