fix_table.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274
  1. import os
  2. import collections # 统计库
  3. import re # 正则
  4. from libs.commons import fitz # pyMuPDF库
  5. import json
  6. import re
  7. from libs.boxbase import _is_in_or_part_overlap, _is_part_overlap, find_bottom_nearest_text_bbox, find_left_nearest_text_bbox, find_right_nearest_text_bbox, find_top_nearest_text_bbox # json
  8. ## version 2
  9. def get_merged_line(page):
  10. """
  11. 这个函数是为了从pymuPDF中提取出的矢量里筛出水平的横线,并且将断开的线段进行了合并。
  12. :param page :fitz读取的当前页的内容
  13. """
  14. drawings_bbox = []
  15. drawings_line = []
  16. drawings = page.get_drawings() # 提取所有的矢量
  17. for p in drawings:
  18. drawings_bbox.append(p["rect"].irect) # (L, U, R, D)
  19. lines = []
  20. for L, U, R, D in drawings_bbox:
  21. if abs(D - U) <= 3: # 筛出水平的横线
  22. lines.append((L, U, R, D))
  23. U_groups = []
  24. visited = [False for _ in range(len(lines))]
  25. for i, (L1, U1, R1, D1) in enumerate(lines):
  26. if visited[i] == True:
  27. continue
  28. tmp_g = [(L1, U1, R1, D1)]
  29. for j, (L2, U2, R2, D2) in enumerate(lines):
  30. if i == j:
  31. continue
  32. if visited[j] == True:
  33. continue
  34. if max(U1, D1, U2, D2) - min(U1, D1, U2, D2) <= 5: # 把高度一致的线放进一个group
  35. tmp_g.append((L2, U2, R2, D2))
  36. visited[j] = True
  37. U_groups.append(tmp_g)
  38. res = []
  39. for group in U_groups:
  40. group.sort(key = lambda LURD: (LURD[0], LURD[2]))
  41. LL, UU, RR, DD = group[0]
  42. for i, (L1, U1, R1, D1) in enumerate(group):
  43. if (L1 - RR) >= 5:
  44. cur_line = (LL, UU, RR, DD)
  45. res.append(cur_line)
  46. LL = L1
  47. else:
  48. RR = max(RR, R1)
  49. cur_line = (LL, UU, RR, DD)
  50. res.append(cur_line)
  51. return res
  52. def fix_tables(page: fitz.Page, table_bboxes: list, include_table_title: bool, scan_line_num: int):
  53. """
  54. :param page :fitz读取的当前页的内容
  55. :param table_bboxes: list类型,每一个元素是一个元祖 (L, U, R, D)
  56. :param include_table_title: 是否将表格的标题也圈进来
  57. :param scan_line_num: 在与表格框临近的上下几个文本框里扫描搜索标题
  58. """
  59. drawings_lines = get_merged_line(page)
  60. fix_table_bboxes = []
  61. for table in table_bboxes:
  62. (L, U, R, D) = table
  63. fix_table_L = []
  64. fix_table_U = []
  65. fix_table_R = []
  66. fix_table_D = []
  67. width = R - L
  68. width_range = width * 0.1 # 只看距离表格整体宽度10%之内偏差的线
  69. height = D - U
  70. height_range = height * 0.1 # 只看距离表格整体高度10%之内偏差的线
  71. for line in drawings_lines:
  72. if (L - width_range) <= line[0] <= (L + width_range) and (R - width_range) <= line[2] <= (R + width_range): # 相近的宽度
  73. if (U - height_range) < line[1] < (U + height_range): # 上边界,在一定的高度范围内
  74. fix_table_U.append(line[1])
  75. fix_table_L.append(line[0])
  76. fix_table_R.append(line[2])
  77. elif (D - height_range) < line[1] < (D + height_range): # 下边界,在一定的高度范围内
  78. fix_table_D.append(line[1])
  79. fix_table_L.append(line[0])
  80. fix_table_R.append(line[2])
  81. if fix_table_U:
  82. U = min(fix_table_U)
  83. if fix_table_D:
  84. D = max(fix_table_D)
  85. if fix_table_L:
  86. L = min(fix_table_L)
  87. if fix_table_R:
  88. R = max(fix_table_R)
  89. if include_table_title: # 需要将表格标题包括
  90. text_blocks = page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"] # 所有的text的block
  91. incolumn_text_blocks = [block for block in text_blocks if not ((block['bbox'][0] < L and block['bbox'][2] < L) or (block['bbox'][0] > R and block['bbox'][2] > R))] # 将与表格完全没有任何遮挡的文字筛除掉(比如另一栏的文字)
  92. upper_text_blocks = [block for block in incolumn_text_blocks if (U - block['bbox'][3]) > 0] # 将在表格线以上的text block筛选出来
  93. sorted_filtered_text_blocks = sorted(upper_text_blocks, key=lambda x: (U - x['bbox'][3], x['bbox'][0])) # 按照text block的下边界距离表格上边界的距离升序排序,如果是同一个高度,则先左再右
  94. for idx in range(scan_line_num):
  95. if idx+1 <= len(sorted_filtered_text_blocks):
  96. line_temp = sorted_filtered_text_blocks[idx]['lines']
  97. if line_temp:
  98. text = line_temp[0]['spans'][0]['text'] # 提取出第一个span里的text内容
  99. check_en = re.match('Table', text) # 检查是否有Table开头的(英文)
  100. check_ch = re.match('表', text) # 检查是否有Table开头的(中文)
  101. if check_en or check_ch:
  102. if sorted_filtered_text_blocks[idx]['bbox'][1] < D: # 以防出现负的bbox
  103. U = sorted_filtered_text_blocks[idx]['bbox'][1]
  104. fix_table_bboxes.append([L-2, U-2, R+2, D+2])
  105. return fix_table_bboxes
  106. def __check_table_title_pattern(text):
  107. """
  108. 检查文本段是否是表格的标题
  109. """
  110. patterns = [r'^table\s\d+']
  111. for pattern in patterns:
  112. match = re.match(pattern, text, re.IGNORECASE)
  113. if match:
  114. return True
  115. else:
  116. return False
  117. def fix_table_text_block(pymu_blocks, table_bboxes: list):
  118. """
  119. 调整table, 如果table和上下的text block有相交区域,则将table的上下边界调整到text block的上下边界
  120. 例如 tmp/unittest/unittest_pdf/纯2列_ViLT_6_文字 表格.pdf
  121. """
  122. for tb in table_bboxes:
  123. (L, U, R, D) = tb
  124. for block in pymu_blocks:
  125. if _is_in_or_part_overlap((L, U, R, D), block['bbox']):
  126. txt = " ".join(span['text'] for line in block['lines'] for span in line['spans'])
  127. if not __check_table_title_pattern(txt) and block.get("_table", False) is False: # 如果是table的title,那么不调整。因为下一步会统一调整,如果这里进行了调整,后面的调整会造成调整到其他table的title上(在连续出现2个table的情况下)。
  128. tb[0] = min(tb[0], block['bbox'][0])
  129. tb[1] = min(tb[1], block['bbox'][1])
  130. tb[2] = max(tb[2], block['bbox'][2])
  131. tb[3] = max(tb[3], block['bbox'][3])
  132. block['_table'] = True # 占位,防止其他table再次占用
  133. """如果是个table的title,但是有部分重叠,那么修正这个title,使得和table不重叠"""
  134. if _is_part_overlap(tb, block['bbox']) and __check_table_title_pattern(txt):
  135. block['bbox'] = list(block['bbox'])
  136. if block['bbox'][3] > U:
  137. block['bbox'][3] = U-1
  138. if block['bbox'][1] < D:
  139. block['bbox'][1] = D+1
  140. return table_bboxes
  141. def __get_table_caption_text(text_block):
  142. txt = " ".join(span['text'] for line in text_block['lines'] for span in line['spans'])
  143. line_cnt = len(text_block['lines'])
  144. txt = txt.replace("Ž . ", '')
  145. return txt, line_cnt
  146. def include_table_title(pymu_blocks, table_bboxes: list):
  147. """
  148. 把表格的title也包含进来,扩展到table_bbox上
  149. """
  150. for tb in table_bboxes:
  151. max_find_cnt = 3 # 上上最多找3次
  152. temp_box = tb.copy()
  153. while max_find_cnt>0:
  154. text_block_top = find_top_nearest_text_bbox(pymu_blocks, temp_box)
  155. if text_block_top:
  156. txt, line_cnt = __get_table_caption_text(text_block_top)
  157. if len(txt.strip())>0:
  158. if not __check_table_title_pattern(txt) and max_find_cnt>0 and line_cnt<3:
  159. max_find_cnt = max_find_cnt -1
  160. temp_box[1] = text_block_top['bbox'][1]
  161. continue
  162. else:
  163. break
  164. else:
  165. temp_box[1] = text_block_top['bbox'][1] # 宽度不变,扩大
  166. max_find_cnt = max_find_cnt - 1
  167. else:
  168. break
  169. max_find_cnt = 3 # 向下找
  170. temp_box = tb.copy()
  171. while max_find_cnt>0:
  172. text_block_bottom = find_bottom_nearest_text_bbox(pymu_blocks, temp_box)
  173. if text_block_bottom:
  174. txt, line_cnt = __get_table_caption_text(text_block_bottom)
  175. if len(txt.strip())>0:
  176. if not __check_table_title_pattern(txt) and max_find_cnt>0 and line_cnt<3:
  177. max_find_cnt = max_find_cnt - 1
  178. temp_box[3] = text_block_bottom['bbox'][3]
  179. continue
  180. else:
  181. break
  182. else:
  183. temp_box[3] = text_block_bottom['bbox'][3]
  184. max_find_cnt = max_find_cnt - 1
  185. else:
  186. break
  187. if text_block_top and text_block_bottom and text_block_top.get("_table_caption", False) is False and text_block_bottom.get("_table_caption", False) is False :
  188. btn_text, _ = __get_table_caption_text(text_block_bottom)
  189. top_text, _ = __get_table_caption_text(text_block_top)
  190. if __check_table_title_pattern(btn_text) and __check_table_title_pattern(top_text): # 上下都有一个tbale的caption
  191. # 取距离最近的
  192. btn_text_distance = text_block_bottom['bbox'][1] - tb[3]
  193. top_text_distance = tb[1] - text_block_top['bbox'][3]
  194. text_block = text_block_bottom if btn_text_distance<top_text_distance else text_block_top
  195. tb[0] = min(tb[0], text_block['bbox'][0])
  196. tb[1] = min(tb[1], text_block['bbox'][1])
  197. tb[2] = max(tb[2], text_block['bbox'][2])
  198. tb[3] = max(tb[3], text_block['bbox'][3])
  199. text_block_bottom['_table_caption'] = True
  200. continue
  201. # 如果以上条件都不满足,那么就向下找
  202. text_block = text_block_top
  203. if text_block and text_block.get("_table_caption", False) is False:
  204. first_text_line = " ".join(span['text'] for line in text_block['lines'] for span in line['spans'])
  205. if __check_table_title_pattern(first_text_line) and text_block.get("_table", False) is False:
  206. tb[0] = min(tb[0], text_block['bbox'][0])
  207. tb[1] = min(tb[1], text_block['bbox'][1])
  208. tb[2] = max(tb[2], text_block['bbox'][2])
  209. tb[3] = max(tb[3], text_block['bbox'][3])
  210. text_block['_table_caption'] = True
  211. continue
  212. text_block = text_block_bottom
  213. if text_block and text_block.get("_table_caption", False) is False:
  214. first_text_line, _ = __get_table_caption_text(text_block)
  215. if __check_table_title_pattern(first_text_line) and text_block.get("_table", False) is False:
  216. tb[0] = min(tb[0], text_block['bbox'][0])
  217. tb[1] = min(tb[1], text_block['bbox'][1])
  218. tb[2] = max(tb[2], text_block['bbox'][2])
  219. tb[3] = max(tb[3], text_block['bbox'][3])
  220. text_block['_table_caption'] = True
  221. continue
  222. """向左、向右寻找,暂时只寻找一次"""
  223. left_text_block = find_left_nearest_text_bbox(pymu_blocks, tb)
  224. if left_text_block and left_text_block.get("_image_caption", False) is False:
  225. first_text_line, _ = __get_table_caption_text(left_text_block)
  226. if __check_table_title_pattern(first_text_line):
  227. tb[0] = min(tb[0], left_text_block['bbox'][0])
  228. tb[1] = min(tb[1], left_text_block['bbox'][1])
  229. tb[2] = max(tb[2], left_text_block['bbox'][2])
  230. tb[3] = max(tb[3], left_text_block['bbox'][3])
  231. left_text_block['_image_caption'] = True
  232. continue
  233. right_text_block = find_right_nearest_text_bbox(pymu_blocks, tb)
  234. if right_text_block and right_text_block.get("_image_caption", False) is False:
  235. first_text_line, _ = __get_table_caption_text(right_text_block)
  236. if __check_table_title_pattern(first_text_line):
  237. tb[0] = min(tb[0], right_text_block['bbox'][0])
  238. tb[1] = min(tb[1], right_text_block['bbox'][1])
  239. tb[2] = max(tb[2], right_text_block['bbox'][2])
  240. tb[3] = max(tb[3], right_text_block['bbox'][3])
  241. right_text_block['_image_caption'] = True
  242. continue
  243. return table_bboxes