detect_footer_header_by_statistics.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286
  1. from collections import defaultdict
  2. from loguru import logger
  3. from libs.boxbase import _is_in, calculate_iou
  4. def compare_bbox_with_list(bbox, bbox_list, tolerance=1):
  5. return any(all(abs(a - b) < tolerance for a, b in zip(bbox, common_bbox)) for common_bbox in bbox_list)
  6. def is_single_line_block(block):
  7. # Determine based on the width and height of the block
  8. block_width = block["X1"] - block["X0"]
  9. block_height = block["bbox"][3] - block["bbox"][1]
  10. # If the height of the block is close to the average character height and the width is large, it is considered a single line
  11. return block_height <= block["avg_char_height"] * 3 and block_width > block["avg_char_width"] * 3
  12. def get_most_common_bboxes(bboxes, page_height, position="top", threshold=0.25, num_bboxes=3, min_frequency=2):
  13. """
  14. This function gets the most common bboxes from the bboxes
  15. Parameters
  16. ----------
  17. bboxes : list
  18. bboxes
  19. page_height : float
  20. height of the page
  21. position : str, optional
  22. "top" or "bottom", by default "top"
  23. threshold : float, optional
  24. threshold, by default 0.25
  25. num_bboxes : int, optional
  26. number of bboxes to return, by default 3
  27. min_frequency : int, optional
  28. minimum frequency of the bbox, by default 2
  29. Returns
  30. -------
  31. common_bboxes : list
  32. common bboxes
  33. """
  34. # Filter bbox by position
  35. if position == "top":
  36. filtered_bboxes = [bbox for bbox in bboxes if bbox[1] < page_height * threshold]
  37. else:
  38. filtered_bboxes = [bbox for bbox in bboxes if bbox[3] > page_height * (1 - threshold)]
  39. # Find the most common bbox
  40. bbox_count = defaultdict(int)
  41. for bbox in filtered_bboxes:
  42. bbox_count[tuple(bbox)] += 1
  43. # Get the most frequently occurring bbox, but only consider it when the frequency exceeds min_frequency
  44. common_bboxes = [
  45. bbox for bbox, count in sorted(bbox_count.items(), key=lambda item: item[1], reverse=True) if count >= min_frequency
  46. ][:num_bboxes]
  47. return common_bboxes
  48. def detect_footer_header2(result_dict, similarity_threshold=0.5):
  49. """
  50. This function detects the header and footer of the document.
  51. Parameters
  52. ----------
  53. result_dict : dict
  54. result dictionary
  55. Returns
  56. -------
  57. result_dict : dict
  58. result dictionary
  59. """
  60. # Traverse all blocks in the document
  61. single_line_blocks = 0
  62. total_blocks = 0
  63. single_line_blocks = 0
  64. for page_id, blocks in result_dict.items():
  65. if page_id.startswith("page_"):
  66. for block_key, block in blocks.items():
  67. if block_key.startswith("block_"):
  68. total_blocks += 1
  69. if is_single_line_block(block):
  70. single_line_blocks += 1
  71. # If there are no blocks, skip the header and footer detection
  72. if total_blocks == 0:
  73. print("No blocks found. Skipping header/footer detection.")
  74. return result_dict
  75. # If most of the blocks are single-line, skip the header and footer detection
  76. if single_line_blocks / total_blocks > 0.5: # 50% of the blocks are single-line
  77. # print("Skipping header/footer detection for text-dense document.")
  78. return result_dict
  79. # Collect the bounding boxes of all blocks
  80. all_bboxes = []
  81. all_texts = []
  82. for page_id, blocks in result_dict.items():
  83. if page_id.startswith("page_"):
  84. for block_key, block in blocks.items():
  85. if block_key.startswith("block_"):
  86. all_bboxes.append(block["bbox"])
  87. # Get the height of the page
  88. page_height = max(bbox[3] for bbox in all_bboxes)
  89. # Get the most common bbox lists for headers and footers
  90. common_header_bboxes = get_most_common_bboxes(all_bboxes, page_height, position="top") if all_bboxes else []
  91. common_footer_bboxes = get_most_common_bboxes(all_bboxes, page_height, position="bottom") if all_bboxes else []
  92. # Detect and mark headers and footers
  93. for page_id, blocks in result_dict.items():
  94. if page_id.startswith("page_"):
  95. for block_key, block in blocks.items():
  96. if block_key.startswith("block_"):
  97. bbox = block["bbox"]
  98. text = block["text"]
  99. is_header = compare_bbox_with_list(bbox, common_header_bboxes)
  100. is_footer = compare_bbox_with_list(bbox, common_footer_bboxes)
  101. block["is_header"] = int(is_header)
  102. block["is_footer"] = int(is_footer)
  103. return result_dict
  104. def __get_page_size(page_sizes:list):
  105. """
  106. 页面大小可能不一样
  107. """
  108. w = sum([w for w,h in page_sizes])/len(page_sizes)
  109. h = sum([h for w,h in page_sizes])/len(page_sizes)
  110. return w, h
  111. def __calculate_iou(bbox1, bbox2):
  112. iou = calculate_iou(bbox1, bbox2)
  113. return iou
  114. def __is_same_pos(box1, box2, iou_threshold):
  115. iou = __calculate_iou(box1, box2)
  116. return iou >= iou_threshold
  117. def get_most_common_bbox(bboxes:list, page_size:list, page_cnt:int, page_range_threshold=0.2, iou_threshold=0.9):
  118. """
  119. common bbox必须大于page_cnt的1/3
  120. """
  121. min_occurance_cnt = max(3, page_cnt//4)
  122. header_det_bbox = []
  123. footer_det_bbox = []
  124. hdr_same_pos_group = []
  125. btn_same_pos_group = []
  126. page_w, page_h = __get_page_size(page_size)
  127. top_y, bottom_y = page_w*page_range_threshold, page_h*(1-page_range_threshold)
  128. top_bbox = [b for b in bboxes if b[3]<top_y]
  129. bottom_bbox = [b for b in bboxes if b[1]>bottom_y]
  130. # 然后开始排序,寻找最经常出现的bbox, 寻找的时候如果IOU>iou_threshold就算是一个
  131. for i in range(0, len(top_bbox)):
  132. hdr_same_pos_group.append([top_bbox[i]])
  133. for j in range(i+1, len(top_bbox)):
  134. if __is_same_pos(top_bbox[i], top_bbox[j], iou_threshold):
  135. #header_det_bbox = [min(top_bbox[i][0], top_bbox[j][0]), min(top_bbox[i][1], top_bbox[j][1]), max(top_bbox[i][2], top_bbox[j][2]), max(top_bbox[i][3],top_bbox[j][3])]
  136. hdr_same_pos_group[i].append(top_bbox[j])
  137. for i in range(0, len(bottom_bbox)):
  138. btn_same_pos_group.append([bottom_bbox[i]])
  139. for j in range(i+1, len(bottom_bbox)):
  140. if __is_same_pos(bottom_bbox[i], bottom_bbox[j], iou_threshold):
  141. #footer_det_bbox = [min(bottom_bbox[i][0], bottom_bbox[j][0]), min(bottom_bbox[i][1], bottom_bbox[j][1]), max(bottom_bbox[i][2], bottom_bbox[j][2]), max(bottom_bbox[i][3],bottom_bbox[j][3])]
  142. btn_same_pos_group[i].append(bottom_bbox[j])
  143. # 然后看下每一组的bbox,是否符合大于page_cnt一定比例
  144. hdr_same_pos_group = [g for g in hdr_same_pos_group if len(g)>=min_occurance_cnt]
  145. btn_same_pos_group = [g for g in btn_same_pos_group if len(g)>=min_occurance_cnt]
  146. # 平铺2个list[list]
  147. hdr_same_pos_group = [bbox for g in hdr_same_pos_group for bbox in g]
  148. btn_same_pos_group = [bbox for g in btn_same_pos_group for bbox in g]
  149. # 寻找hdr_same_pos_group中的box[3]最大值,btn_same_pos_group中的box[1]最小值
  150. hdr_same_pos_group.sort(key=lambda b:b[3])
  151. btn_same_pos_group.sort(key=lambda b:b[1])
  152. hdr_y = hdr_same_pos_group[-1][3] if hdr_same_pos_group else 0
  153. btn_y = btn_same_pos_group[0][1] if btn_same_pos_group else page_h
  154. header_det_bbox = [0, 0, page_w, hdr_y]
  155. footer_det_bbox = [0, btn_y, page_w, page_h]
  156. # logger.warning(f"header: {header_det_bbox}, footer: {footer_det_bbox}")
  157. return header_det_bbox, footer_det_bbox, page_w, page_h
  158. def drop_footer_header(pdf_info_dict:dict):
  159. """
  160. 启用规则探测,在全局的视角上通过统计的方法。
  161. """
  162. header = []
  163. footer = []
  164. all_text_bboxes = [blk['bbox'] for _, val in pdf_info_dict.items() for blk in val['preproc_blocks']]
  165. image_bboxes = [img['bbox'] for _, val in pdf_info_dict.items() for img in val['images']] + [img['bbox'] for _, val in pdf_info_dict.items() for img in val['image_backup']]
  166. page_size = [val['page_size'] for _, val in pdf_info_dict.items()]
  167. page_cnt = len(pdf_info_dict.keys()) # 一共多少页
  168. header, footer, page_w, page_h = get_most_common_bbox(all_text_bboxes+image_bboxes, page_size, page_cnt)
  169. """"
  170. 把范围扩展到页面水平的整个方向上
  171. """
  172. if header:
  173. header = [0, 0, page_w, header[3]+1]
  174. if footer:
  175. footer = [0, footer[1]-1, page_w, page_h]
  176. # 找到footer, header范围之后,针对每一页pdf,从text、图片中删除这些范围内的内容
  177. # 移除text block
  178. for _, page_info in pdf_info_dict.items():
  179. header_text_blk = []
  180. footer_text_blk = []
  181. for blk in page_info['preproc_blocks']:
  182. blk_bbox = blk['bbox']
  183. if header and blk_bbox[3]<=header[3]:
  184. blk['tag'] = "header"
  185. header_text_blk.append(blk)
  186. elif footer and blk_bbox[1]>=footer[1]:
  187. blk['tag'] = "footer"
  188. footer_text_blk.append(blk)
  189. # 放入text_block_droped中
  190. page_info['droped_text_block'].extend(header_text_blk)
  191. page_info['droped_text_block'].extend(footer_text_blk)
  192. for blk in header_text_blk:
  193. page_info['preproc_blocks'].remove(blk)
  194. for blk in footer_text_blk:
  195. page_info['preproc_blocks'].remove(blk)
  196. """接下来把footer、header上的图片也删除掉。图片包括正常的和backup的"""
  197. header_image = []
  198. footer_image = []
  199. for image_info in page_info['images']:
  200. img_bbox = image_info['bbox']
  201. if header and img_bbox[3]<=header[3]:
  202. image_info['tag'] = "header"
  203. header_image.append(image_info)
  204. elif footer and img_bbox[1]>=footer[1]:
  205. image_info['tag'] = "footer"
  206. footer_image.append(image_info)
  207. page_info['droped_image_block'].extend(header_image)
  208. page_info['droped_image_block'].extend(footer_image)
  209. for img in header_image:
  210. page_info['images'].remove(img)
  211. for img in footer_image:
  212. page_info['images'].remove(img)
  213. """接下来吧backup的图片也删除掉"""
  214. header_image = []
  215. footer_image = []
  216. for image_info in page_info['image_backup']:
  217. img_bbox = image_info['bbox']
  218. if header and img_bbox[3]<=header[3]:
  219. image_info['tag'] = "header"
  220. header_image.append(image_info)
  221. elif footer and img_bbox[1]>=footer[1]:
  222. image_info['tag'] = "footer"
  223. footer_image.append(image_info)
  224. page_info['droped_image_block'].extend(header_image)
  225. page_info['droped_image_block'].extend(footer_image)
  226. for img in header_image:
  227. page_info['image_backup'].remove(img)
  228. for img in footer_image:
  229. page_info['image_backup'].remove(img)
  230. return header, footer