remove_rotate_bbox.py 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188
  1. import math
  2. from magic_pdf.libs.boxbase import is_vbox_on_side
  3. from magic_pdf.libs.drop_tag import EMPTY_SIDE_BLOCK, ROTATE_TEXT, VERTICAL_TEXT
  def detect_non_horizontal_texts(result_dict):
      """
      Flag repeated non-horizontal text blocks as watermarks or vertical margin notes.

      A block is identified by its ``(bbox, text)`` pair. Only blocks that carry a
      ``"dir"`` vector are counted; the direction angle is ``abs(degrees(atan2(dy, dx)))``:

      * ``5 < angle < 85``   -> watermark candidate (slanted text)
      * ``85 < angle < 105`` -> vertical margin note candidate

      A candidate is confirmed when the same ``(bbox, text)`` pair is counted on more
      than half of the pages, i.e. ``count > number_of_pages // 2``. Only keys starting
      with ``"page_"`` count as pages, so document-level metadata stored next to the
      pages does not inflate the threshold.

      Every ``"block_*"`` entry of every page then receives two integer flags,
      ``"is_watermark"`` and ``"is_vertical_margin_note"`` (``1`` or ``0``). Page entries
      whose key does not start with ``"block_"`` are left untouched.

      NOTE(review): an angle of exactly 85 degrees, and angles from 105 up to 180
      degrees, fall into neither band -- confirm this is intended.

      Parameters
      ----------
      result_dict : dict
          Mapping of ``"page_*"`` keys to page dictionaries, which in turn map
          ``"block_*"`` keys to block dictionaries. Other keys are ignored.

      Returns
      -------
      result_dict : dict
          The same dictionary object, with the flags added in place.
      """

      def block_signature(block_data):
          # bbox is converted to a tuple so that list-typed boxes are hashable;
          # blocks without a bbox or text can never match a candidate.
          bbox = block_data.get("bbox")
          text = block_data.get("text")
          if bbox is None or text is None:
              return None
          return (tuple(bbox), text)

      page_ids = [page_id for page_id in result_dict if page_id.startswith("page_")]

      # Occurrence counts of (bbox, text) signatures, split by direction band.
      potential_watermarks = {}
      potential_margin_notes = {}

      for page_id in page_ids:
          for block_id, block_data in result_dict[page_id].items():
              if not block_id.startswith("block_") or "dir" not in block_data:
                  continue
              signature = block_signature(block_data)
              if signature is None:
                  continue
              direction = block_data["dir"]
              angle = abs(math.degrees(math.atan2(direction[1], direction[0])))
              if 5 < angle < 85:
                  potential_watermarks[signature] = potential_watermarks.get(signature, 0) + 1
              elif 85 < angle < 105:
                  potential_margin_notes[signature] = potential_margin_notes.get(signature, 0) + 1

      # "More than half of the pages": count real pages only, not every top-level key.
      threshold = len(page_ids) // 2
      watermarks = {sig for sig, count in potential_watermarks.items() if count > threshold}
      margin_notes = {sig for sig, count in potential_margin_notes.items() if count > threshold}

      for page_id in page_ids:
          for block_id, block_data in result_dict[page_id].items():
              # Same filter as the counting pass, so page-level entries that are not
              # blocks are never indexed with "bbox"/"text".
              if not block_id.startswith("block_"):
                  continue
              signature = block_signature(block_data)
              block_data["is_watermark"] = int(signature in watermarks)
              block_data["is_vertical_margin_note"] = int(signature in margin_notes)

      return result_dict
  63. """
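  64. 1. If any line in a block has a direction other than dir=(1,0), the whole block is removed.
  65. 2. If a block's lines each hold only a single character (text stacked vertically), the whole block is removed even when every line has dir=(1,0). In both cases the block must lie at the edge of the page; otherwise it is kept.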
  66. """
  67. import re
  def __is_a_word(sentence):
      """
      Tell whether *sentence* is a single short token.

      Returns True for exactly one character in the CJK range U+4E00..U+9FA5, or for
      a run of one or two ASCII letters/digits. Anything else (empty string, longer
      text, punctuation, whitespace) yields False.
      """
      single_cjk = re.fullmatch(r'[\u4e00-\u9fa5]', sentence) is not None
      if single_cjk:
          return True
      ascii_alnum = re.fullmatch(r'[a-zA-Z0-9]+', sentence) is not None
      return ascii_alnum and len(sentence) <= 2
  def __get_text_color(num):
      """
      Split a packed integer colour into its (red, green, blue) components.

      Red is taken from bits 16-23, green from bits 8-15 and blue from bits 0-7;
      any higher bits are ignored.
      """
      return tuple((num >> shift) & 255 for shift in (16, 8, 0))
  def __is_empty_side_box(text_block):
      """
      Tell whether a text block shows no visible content.

      A span counts as visible when its text is non-empty after stripping whitespace
      and its colour is not pure white (255, 255, 255). The block is "empty" when no
      span in any of its lines is visible; a block without spans is therefore empty.
      """

      def has_visible_text(span):
          # Colour is decoded before the text is inspected, as in the per-span check
          # this replaces.
          red, green, blue = __get_text_color(span['color'])
          return len(span['text'].strip()) > 0 and (red, green, blue) != (255, 255, 255)

      return not any(
          has_visible_text(span)
          for line in text_block['lines']
          for span in line['spans']
      )
  def _is_vertical_char_stack(lines):
      """Return True when *lines* look like text written one short token per line."""
      # At least two lines are needed, and each must consist of exactly one span.
      if len(lines) < 2 or any(len(line['spans']) != 1 for line in lines):
          return False
      spans = [line['spans'][0] for line in lines]
      if not all(__is_a_word(span['text']) for span in spans):
          return False
      # Identical integer x0 for every span means the tokens are stacked in a column.
      return len({int(span['bbox'][0]) for span in spans}) == 1


  def remove_rotate_side_textblock(pymu_text_block, page_width, page_height):
      """
      Strip vertical and rotated text blocks that sit at the side of the page.

      Only blocks for which ``is_vbox_on_side(bbox, page_width, page_height, 0.2)`` is
      true are candidates (0.2 is passed through unchanged; presumably a side-margin
      ratio -- confirm in ``boxbase``). A candidate is removed and tagged as:

      * ``VERTICAL_TEXT`` when it has two or more lines, each line is a single span
        holding one short token, and all spans share the same integer ``x0``;
      * ``ROTATE_TEXT`` when any of its lines has a ``dir`` other than ``(1, 0)``.

      ``dir`` is compared as a tuple, so blocks whose direction is stored as a list
      (for example after a JSON round trip) are not mistaken for rotated text.

      Parameters
      ----------
      pymu_text_block : list
          Text block dictionaries with ``'bbox'`` and ``'lines'`` keys. The list is
          modified in place.
      page_width, page_height : number
          Page dimensions forwarded to ``is_vbox_on_side``.

      Returns
      -------
      tuple
          ``(pymu_text_block, removed_text_block)``: the same list object with the
          removed blocks taken out, and the list of removed blocks, each carrying a
          ``'tag'`` entry.
      """
      kept_blocks = []
      removed_text_block = []

      for block in pymu_text_block:
          if not is_vbox_on_side(block['bbox'], page_width, page_height, 0.2):
              kept_blocks.append(block)
              continue

          lines = block['lines']
          if _is_vertical_char_stack(lines):
              tag = VERTICAL_TEXT
          elif any(tuple(line['dir']) != (1, 0) for line in lines):
              # A single non-horizontal line is enough to drop the whole block.
              tag = ROTATE_TEXT
          else:
              kept_blocks.append(block)
              continue

          block['tag'] = tag
          removed_text_block.append(block)

      # Slice assignment keeps the caller's list object while avoiding the
      # quadratic, equality-based list.remove() per removed block.
      pymu_text_block[:] = kept_blocks
      return pymu_text_block, removed_text_block
  def get_side_boundry(rotate_bbox, page_width, page_height):
      """
      Derive the left and right limits of the body text from the given side boxes.

      Each item's ``'bbox'`` is sorted by its right edge ``bbox[2]``: boxes ending
      left of the page's vertical centre push the left limit out to their right
      edge, all others pull the right limit in to their left edge ``bbox[0]``.
      The limits start at ``0`` and ``page_width`` and are returned shifted one unit
      inwards as ``(left + 1, right - 1)``. ``page_height`` is not used.
      """
      centre_x = page_width / 2
      left_candidates = [0]
      right_candidates = [page_width]
      for item in rotate_bbox:
          bbox = item['bbox']
          if bbox[2] < centre_x:
              left_candidates.append(bbox[2])
          else:
              right_candidates.append(bbox[0])
      return max(left_candidates) + 1, min(right_candidates) - 1
  def remove_side_blank_block(pymu_text_block, page_width, page_height):
      """
      Strip blank text blocks that sit at the side of the page.

      A block is removed when ``is_vbox_on_side(bbox, page_width, page_height, 0.2)``
      accepts it and ``__is_empty_side_box`` reports no visible content. Removed
      blocks are tagged with ``EMPTY_SIDE_BLOCK`` and taken out of the input list in
      place.

      Returns ``(pymu_text_block, removed_blocks)``: the same, now shortened, list
      object and the list of tagged blocks that were removed.
      """
      blank_blocks = []
      for candidate in pymu_text_block:
          on_side = is_vbox_on_side(candidate['bbox'], page_width, page_height, 0.2)
          if on_side and __is_empty_side_box(candidate):
              candidate['tag'] = EMPTY_SIDE_BLOCK
              blank_blocks.append(candidate)

      # Removal happens only after the scan so the list is never mutated mid-iteration.
      for blank in blank_blocks:
          pymu_text_block.remove(blank)
      return pymu_text_block, blank_blocks
  147. return pymu_text_block, removed_text_block