pdf2text_recogFootnoteLine.py 31 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673
  1. import io
  2. import re
  3. import os
  4. import json
  5. from libs.boxbase import _is_in_or_part_overlap, calculate_overlap_area_2_minbox_area_ratio
  6. from libs.commons import fitz
  7. from fitz import Point
  8. from pprint import pprint
  9. import pickle
  10. import collections
  11. from typing import List
  12. def calculate_overlapRatio_between_rect1_and_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> (float, float):
  13. # 计算两个rect,重叠面积各占2个rect面积的比例
  14. if min(R1, R2) < max(L1, L2) or min(D1, D2) < max(U1, U2):
  15. return 0, 0
  16. square_1 = (R1 - L1) * (D1 - U1)
  17. square_2 = (R2 - L2) * (D2 - U2)
  18. if square_1 == 0 or square_2 == 0:
  19. return 0, 0
  20. square_overlap = (min(R1, R2) - max(L1, L2)) * (min(D1, D2) - max(U1, U2))
  21. return square_overlap / square_1, square_overlap / square_2
  22. def calculate_overlapRatio_between_line1_and_line2(L1: float, R1: float, L2: float, R2: float) -> (float, float):
  23. # 计算两个line,重叠区间各占2个line长度的比例
  24. if max(L1, L2) > min(R1, R2):
  25. return 0, 0
  26. if L1 == R1 or L2 == R2:
  27. return 0, 0
  28. overlap_line = min(R1, R2) - max(L1, L2)
  29. return overlap_line / (R1 - L1), overlap_line / (R2 - L2)
  30. def parse_footnoteLine(page_ID: int, page: fitz.Page, json_from_DocXchain_obj, exclude_bboxes):
  31. """
  32. :param page_ID: int类型,当前page在当前pdf文档中是第page_D页。
  33. :param page :fitz读取的当前页的内容
  34. :param res_dir_path: str类型,是每一个pdf文档,在当前.py文件的目录下生成一个与pdf文档同名的文件夹,res_dir_path就是文件夹的dir
  35. :param json_from_DocXchain_obj: dict类型,把pdf文档送入DocXChain模型中后,提取bbox,结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict
  36. """
  37. DPI = 72 # use this resolution
  38. pix = page.get_pixmap(dpi=DPI)
  39. pageL = 0
  40. pageR = int(pix.w)
  41. pageU = 0
  42. pageD = int(pix.h)
  43. #---------------------- PyMuPDF解析text --------------------#
  44. textSize_freq = collections.defaultdict(float) # text块中,textSize的频率
  45. textBlock_bboxs = []
  46. textLine_bboxs = []
  47. text_blocks = page.get_text(
  48. "dict",
  49. flags=fitz.TEXTFLAGS_TEXT,
  50. #clip=clip,
  51. )["blocks"]
  52. totText_list = []
  53. for i in range(len(text_blocks)):
  54. # print(blocks[i]) #### print
  55. bbox = text_blocks[i]['bbox']
  56. textBlock_bboxs.append(bbox)
  57. # print(bbox)
  58. cur_block_text_list = []
  59. for tt in text_blocks[i]['lines']:
  60. # 当前line
  61. cur_line_text_list = []
  62. cur_line_bbox = None # 当前line,最右侧的section的bbox
  63. for xf in tt['spans']:
  64. L, U, R, D = xf['bbox']
  65. L, R = min(L, R), max(L, R)
  66. U, D = min(U, D), max(U, D)
  67. textLine_bboxs.append((L, U, R, D))
  68. cur_line_text_list.append(xf['text'])
  69. textSize_freq[xf['size']] += len(xf['text'])
  70. cur_lines_text = ' '.join(cur_line_text_list)
  71. cur_block_text_list.append(cur_lines_text)
  72. totText_list.append('\n'.join(cur_block_text_list))
  73. totText = '\n'.join(totText_list)
  74. # print(totText) # 打印Text
  75. textLine_bboxs.sort(key = lambda LURD: (LURD[0], LURD[1]))
  76. textBlock_bboxs.sort(key = lambda LURD: (LURD[0], LURD[1]))
  77. # print('------------ textSize_freq -----------')
  78. max_sizeFreq = 0 # 出现频率最高的textSize
  79. textSize_withMaxFreq = 0
  80. for x, f in textSize_freq.items():
  81. # print(x, f)
  82. if f > max_sizeFreq:
  83. max_sizeFreq = f
  84. textSize_withMaxFreq = x
  85. #**********************************************************#
  86. #------------------ PyMuPDF读取drawings -----------------#
  87. horizon_lines = []
  88. drawings = page.get_cdrawings()
  89. for drawing in drawings:
  90. try:
  91. rect = drawing['rect']
  92. L, U, R, D = rect
  93. # if (L, U, R, D) in exclude_bboxes:
  94. # continue # 如果是Fiugre, Table, Equation。注释掉是因为,可以暂时先不消,先自我对消。最后再判读需不需要排除。
  95. # 如果是水平线
  96. if U <= D and D - U <= 3:
  97. # 如果长度够
  98. if (pageR - pageL) / 15 <= R - L:
  99. if not(80/800 * pageD <= U <= 750/800 * pageD):
  100. continue # 很可能是页眉和页脚的线
  101. horizon_lines.append((L, U, R, D))
  102. # print((L, U, R, D))
  103. except:
  104. pass
  105. horizon_lines.sort(key = lambda LURD: (LURD[1]))
  106. #********************************************************#
  107. #----------------- 两条线可能是在表格中 ------------------#
  108. def has_text_below_line(L: float, U: float, R: float, D: float, inLowerArea: bool) -> bool:
  109. """
  110. 检查线下是否紧挨着text
  111. """
  112. Uu, Du = U - textSize_withMaxFreq, U # 线上的一个矩形
  113. Lu, Ru = L, R
  114. Ud, Dd = U, U + textSize_withMaxFreq # 线下的一个矩形
  115. Ld, Rd = L, R
  116. find = 0 # 在线下的文字。统计面积。
  117. leftTextCnt = 0 # 不在线底下的文字(整体在线左侧的文字),说明不是个脚注线。统计面积。
  118. English_alpha_cnt = 0 # 英文字母个数
  119. nonEnglish_alpha_cnt = 0 # 非英文字母个数
  120. punctuation_mark_cnt = 0 # 常见标点符号个数
  121. digit_cnt = 0 # 数字个数
  122. distance_nearest_up_line = None
  123. distance_nearest_down_line = None
  124. for i in range(len(text_blocks)):
  125. # print(blocks[i]) #### print
  126. bbox = text_blocks[i]['bbox']
  127. L0, U0, R0, D0 = bbox
  128. if 0< (R0 - L0) < pageR / 6 and (D0 - U0) / (R0 - L0) > 10 :
  129. continue # 一个很窄的,竖直的长条。比如,arXiv预印本,左侧的arXiv标志信息。
  130. textBlock_bboxs.append(bbox)
  131. # print(bbox)
  132. cur_block_text_list = []
  133. for tt in text_blocks[i]['lines']:
  134. # 当前line
  135. cur_line_text_list = []
  136. cur_line_bbox = None # 当前line,最右侧的section的bbox
  137. for xf in tt['spans']:
  138. L2, U2, R2, D2 = xf['bbox']
  139. L2, R2 = min(L2, R2), max(L2, R2)
  140. U2, D2 = min(U2, D2), max(U2, D2)
  141. textLine = xf['text']
  142. if L>0 and L2 < L and (L - L2) / L > 0.2:
  143. leftTextCnt += abs(R2 - L2) * abs(D2 - U2)
  144. else:
  145. ## 线下的部分
  146. ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(Ud, Dd, U2, D2)
  147. ratio_3, ratio_4 = calculate_overlapRatio_between_line1_and_line2(Ld, Rd, L2, R2)
  148. if U < (U2 + D2) / 2 and ratio_1 > 0 and ratio_2 > 0:
  149. if max(ratio_3, ratio_4) > 0.8:
  150. # if 444 <= U1 < 445 and 55 <= L2 < 56:
  151. # print('匹配的框', L2, U2, R2, D2)
  152. # if xf['size'] > 1.2 * textSize_withMaxFreq:
  153. # return False # 可能是个标题。不能这样卡
  154. find += abs(R2 - L2) * abs(D2 - U2)
  155. distance_nearest_down_line = (U2 + D2) / 2 - U
  156. for c in textLine:
  157. if c == ' ':
  158. continue
  159. elif c.isdigit() == True:
  160. digit_cnt += 1
  161. elif c in ',.:!?[]()%,。、!?:【】()《》-':
  162. punctuation_mark_cnt += 1
  163. elif c.isalpha() == True:
  164. English_alpha_cnt += 1
  165. else:
  166. nonEnglish_alpha_cnt += 1
  167. ## 线上的部分
  168. ratio_5, ratio_6 = calculate_overlapRatio_between_line1_and_line2(Uu, Du, U2, D2)
  169. ratio_7, ratio_8 = calculate_overlapRatio_between_line1_and_line2(Lu, Ru, L2, R2)
  170. if (U2 + D2) / 2 < U and ratio_5 > 0 and ratio_6 > 0:
  171. if max(ratio_7, ratio_8) > 0.8:
  172. distance_nearest_up_line = U - (U2 + D2) / 2
  173. # if distance_nearest_up_line < 0:
  174. # print(Lu, Uu, Ru, Du, L2, U2, R2, D2)
  175. # print(distance_nearest_up_line, distance_nearest_down_line)
  176. if distance_nearest_up_line != None and distance_nearest_down_line != None:
  177. if distance_nearest_up_line * 1.5 < distance_nearest_down_line:
  178. return False # 如果,一根线。距离上面的文字line更近。说明是个下划线,而不是footnoteLine
  179. ## 在上面的线条,要考虑左侧的text块儿。在很靠下的线条,就暂时不考虑左侧text块儿了。
  180. if inLowerArea == False:
  181. if leftTextCnt >= 2000/500000 * pageR * pageD:
  182. return False
  183. return find >= 0 and (English_alpha_cnt + nonEnglish_alpha_cnt + digit_cnt) >= 10
  184. ## 最下面区域的线条,判断时。
  185. # print(English_alpha_cnt, nonEnglish_alpha_cnt, digit_cnt)
  186. if (English_alpha_cnt + nonEnglish_alpha_cnt + digit_cnt) == 0:
  187. return False
  188. if (English_alpha_cnt + digit_cnt) / (English_alpha_cnt + nonEnglish_alpha_cnt + digit_cnt) > 0.5:
  189. if nonEnglish_alpha_cnt / (English_alpha_cnt + nonEnglish_alpha_cnt + digit_cnt) > 0.4:
  190. return False
  191. else:
  192. return True
  193. return True
  194. visited = [False for _ in range(len(horizon_lines))]
  195. for i, b1 in enumerate(horizon_lines):
  196. for j in range(i + 1, len(horizon_lines)):
  197. L1, U1, R1, D1 = horizon_lines[i]
  198. L2, U2, R2, D2 = horizon_lines[j]
  199. ## 在一条水平线,且挨着
  200. if L1 > L2:
  201. L1, U1, R1, D1, L2, U2, R2, D2 = L2, U2, R2, D2, L1, U1, R1, D1
  202. in_horizontal_line_flag = (max(U1, D1, U2, D2) - min(U1, D1, U2, D2) <= 5) and (L2 - R1 <= pageR/10)
  203. if in_horizontal_line_flag == True:
  204. visited[i] = True
  205. visited[j] = True
  206. ## 在竖直方向上是一致的。(表格,或者有的文章就是喜欢划线)
  207. L1, U1, R1, D1 = horizon_lines[i]
  208. L2, U2, R2, D2 = horizon_lines[j]
  209. ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R1, L2, R2)
  210. # print(L1, U1, R1, D1, L2, U2, R2, D2, ratio_1, ratio_2)
  211. in_vertical_line_flag = (ratio_1 > 0.9 and ratio_2 > 0.9) or (max(ratio_1, ratio_2) > 0.95)
  212. if in_vertical_line_flag == True:
  213. visited[i] = True
  214. # if (U2 < pageD * 0.8 or (U2 - U1) < pageD * 0.3) and has_text_below_line(L2, U2, R2, D2, False) == False:
  215. # visited[j] = True # 最最底下的线先不要动
  216. else:
  217. if ratio_1 > 0 and (R2 - L2) / (R1 - L1) > 1:
  218. visited[i] = True
  219. # print(horizon_lines)
  220. horizon_lines = [horizon_lines[i] for i in range(len(horizon_lines)) if visited[i] == False]
  221. # print(horizon_lines)
  222. #*****************************************************************#
  223. #------- 靠上的,就不是脚注。用一个THRESHOLD直接卡掉位于上半页的 -------#
  224. visited = [False for _ in range(len(horizon_lines))]
  225. THRESHOLD = (pageD - pageU) * 0.5
  226. for i, (L, U, R, D) in enumerate(horizon_lines):
  227. if U < THRESHOLD:
  228. visited[i] = True
  229. horizon_lines = [horizon_lines[i] for i in range(len(horizon_lines)) if visited[i] == False]
  230. #******************************************************#
  231. #--------------- 此时,还有遮挡的,上面的丢弃 ---------------#
  232. visited = [False for _ in range(len(horizon_lines))]
  233. for i, (L1, U1, R1, D1) in enumerate(horizon_lines):
  234. for j in range(i + 1, len(horizon_lines)):
  235. L2, U2, R2, D2 = horizon_lines[j]
  236. ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R1, L2, R2)
  237. if (ratio_1 > 0.2 and ratio_2 > 0.2) or max(ratio_1, ratio_2) > 0.7:
  238. visited[i] = True
  239. horizon_lines = [horizon_lines[i] for i in range(len(horizon_lines)) if visited[i] == False]
  240. #********************************************************#
  241. # print(horizon_lines)
  242. ## 检查,线下面有没有紧挨着的text
  243. horizon_lines = [LURD for LURD in horizon_lines if has_text_below_line(*(LURD), True) == True]
  244. # print(horizon_lines)
  245. ## 卡一下长度
  246. # horizon_lines = [LURD for LURD in horizon_lines if (LURD[2] - LURD[0] >= pageR / 10)]
  247. ## 上面最多保留2条
  248. horizon_lines = horizon_lines[max(-2, -len(horizon_lines)) :]
  249. #----------------------------------------------------- 第2段 -----------------------------------------------------------#
  250. #----------------------------------- 最下面的情形,用距离硬卡。还有在右侧的情形就被包含了 -----------------------------------#
  251. #------------------ PyMuPDF读取drawings -----------------#
  252. down_horizon_lines = []
  253. drawings = page.get_cdrawings()
  254. for drawing in drawings:
  255. try:
  256. rect = drawing['rect']
  257. L, U, R, D = rect
  258. # if (L, U, R, D) in exclude_bboxes:
  259. # continue # 如果是Fiugre, Table, Equation。目前是Figure识别的比较好。但是Table和Equation识别的不好
  260. # 如果是水平线
  261. if U <= D and D - U <= 3 and U > pageD * 0.85:
  262. # 如果长度够
  263. if (pageR - pageL) / 15 <= R - L:
  264. down_horizon_lines.append((L, U, R, D))
  265. # print((L, U, R, D))
  266. except:
  267. pass
  268. down_horizon_lines.sort(key = lambda LURD: (LURD[0], LURD[2], LURD[1]))
  269. visited = [False for _ in range(len(down_horizon_lines))]
  270. for i in range(len(down_horizon_lines) - 1):
  271. L1, U1, R1, D1 = down_horizon_lines[i]
  272. L2, U2, R2, D2 = down_horizon_lines[i + 1]
  273. ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R1, L2, R2)
  274. if ratio_1 <= 0.1 and ratio_2 <= 0.1:
  275. if L2 - R1 <= pageR / 3:
  276. visited[i] = True
  277. visited[i + 1] = True
  278. down_horizon_lines = [down_horizon_lines[i] for i in range(len(down_horizon_lines)) if visited[i] == False]
  279. down_horizon_lines = [LURD for LURD in down_horizon_lines if has_text_below_line(*(LURD), True) == True]
  280. # for LURD in down_horizon_lines:
  281. # print('第2阶段,LURD是: ', LURD)
  282. # print(has_text_below_line(*(LURD), True))
  283. footnoteLines = horizon_lines + down_horizon_lines
  284. footnoteLines = list(set(footnoteLines))
  285. footnoteLines = footnoteLines[max(-2, -len(footnoteLines)) : ]
  286. #-------------------------- 最后再检查一遍。是否在图片、表格、公式中。 ------------------------------#
  287. def line_in_specialBboxes(L: float, U: float, R: float, D: float, specialBboxes) -> bool:
  288. L2, U2, R2, D2 = L, U, R, D # 当前这根线
  289. for L1, U1, R1, D1 in specialBboxes:
  290. if U1 <= U2 <= D2 < D1:
  291. ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R1, L2, R2)
  292. if ratio_1 > 0 and ratio_2 > 0.6:
  293. return True
  294. # else:
  295. # U1 -= min(textSize_withMaxFreq * 2, 20)
  296. # D1 += min(textSize_withMaxFreq * 2, 20)
  297. # if U1 <= U2 <= D2 < D1:
  298. # ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R1, L2, R2)
  299. # if ratio_1 > 0 and ratio_2 > 0.8:
  300. # return True
  301. return False
  302. footnoteLines = [LURD for LURD in footnoteLines if line_in_specialBboxes(*(LURD), exclude_bboxes) == False]
  303. #-------------------------- 检查,线,是否在当前column的左侧,而不是在一段文字的中间 (通过DocXChain识别的column或者徐超老师写的Layout识别)------------------------------#
  304. # #--------- 通过json_from_DocXchain来获取 column ---------#
  305. # column_bbox_from_DocXChain = []
  306. # xf_json = json_from_DocXchain_obj
  307. # width_from_json = xf_json['page_info']['width']
  308. # height_from_json = xf_json['page_info']['height']
  309. # LR_scaleRatio = width_from_json / (pageR - pageL)
  310. # UD_scaleRatio = height_from_json / (pageD - pageU)
  311. # # {0: 'title', # 标题
  312. # # 1: 'figure', # 图片
  313. # # 2: 'plain text', # 文本
  314. # # 3: 'header', # 页眉
  315. # # 4: 'page number', # 页码
  316. # # 5: 'footnote', # 脚注
  317. # # 6: 'footer', # 页脚
  318. # # 7: 'table', # 表格
  319. # # 8: 'table caption', # 表格描述
  320. # # 9: 'figure caption', # 图片描述
  321. # # 10: 'equation', # 公式
  322. # # 11: 'full column', # 单栏
  323. # # 12: 'sub column', # 多栏
  324. # # 13: 'embedding', # 嵌入公式
  325. # # 14: 'isolated'} # 单行公式
  326. # for xf in xf_json['layout_dets']:
  327. # L = xf['poly'][0] / LR_scaleRatio
  328. # U = xf['poly'][1] / UD_scaleRatio
  329. # R = xf['poly'][2] / LR_scaleRatio
  330. # D = xf['poly'][5] / UD_scaleRatio
  331. # # L += pageL # 有的页面,artBox偏移了。不在(0,0)
  332. # # R += pageL
  333. # # U += pageU
  334. # # D += pageU
  335. # L, R = min(L, R), max(L, R)
  336. # U, D = min(U, D), max(U, D)
  337. # if (xf['category_id'] == 11 or xf['category_id'] == 12) and xf['score'] >= 0.3:
  338. # column_bbox_from_DocXChain.append((L, U, R, D))
  339. #---------------手写,检查,线是否是与某个column的左端对齐 ------------------#
  340. def check_isOnTheLeftOfColumn(L: float, U: float, R: float, D: float) -> bool:
  341. LL = L - textSize_withMaxFreq
  342. RR = LL
  343. UU = max(pageD * 0.02, U - 100/800 * pageD)
  344. DD = min(U + 50/800 * pageD, pageD * 0.98)
  345. # print(LL, UU, RR, DD)
  346. cnt = 0
  347. for bbox in textLine_bboxs:
  348. L2, U2, R2, D2 = bbox
  349. ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(UU, DD, U2, D2)
  350. ratio_3, ratio_4 = calculate_overlapRatio_between_line1_and_line2(L, R, L2, R2)
  351. if ratio_1 > 0 and ratio_2 > 0:
  352. if max(ratio_3, ratio_4) > 0.8:
  353. if abs(LL - L2) <= 20/700 * pageR:
  354. cnt += 1
  355. # else:
  356. # if (R2 - L2) >= 30/700 * pageR:
  357. # print(LL, UU, RR, DD, L2, U2, R2, D2)
  358. # return False # 不能这样卡。有些注释里面,单独的特殊符号就是一个textLineBbox
  359. # print('cnt: ', cnt)
  360. return cnt >= 4
  361. # def check_isOnTheLeftOfColumn_considerLayout(L0: float, U0: float, R0: float, D0: float) -> bool:
  362. # LL = L0 - textSize_withMaxFreq * 1.5
  363. # RR = LL
  364. # UU = 100/800 * pageD
  365. # DD = 700/800 * pageD
  366. # STEP = textSize_withMaxFreq / 2
  367. # def check_ok(L: float, U: float, R: float, D: float) -> bool:
  368. # for bbox in textBlock_bboxs:
  369. # L2, U2, R2, D2 = bbox
  370. # ratio_3, ratio_4 = calculate_overlapRatio_between_line1_and_line2(L, R, L2, R2)
  371. # if max(ratio_3, ratio_4) > 0.8:
  372. # if (R2 - L2) > 1/4 * pageR and L2 < LL <= RR < R2:
  373. # if abs(LL - L2) < 50/700 * pageR or abs(RR - R2) < 50/700 * pageR:
  374. # continue
  375. # else:
  376. # return False
  377. # return True
  378. # ## 先探上面
  379. # u = UU
  380. # d = U0
  381. # while u + STEP/2 < d:
  382. # mid = (u + d) / 2
  383. # if check_ok(L0, mid, R0, U0) == True:
  384. # d = mid
  385. # else:
  386. # u = mid + STEP
  387. # print(mid)
  388. # dist_up = U0 - u
  389. # print(u)
  390. # ## 再探下面
  391. # u = D0
  392. # d = DD
  393. # while u + STEP/2 < d:
  394. # mid = (u + d) / 2
  395. # if check_ok(L0, mid, R0, D0) == True:
  396. # u = mid
  397. # else:
  398. # d = mid - STEP
  399. # print(u)
  400. # print('^^^^^^^^^^^^^^')
  401. # dist_down = u - D0
  402. # if dist_up + dist_down < textSize_withMaxFreq * 10:
  403. # return False
  404. # return True
  405. footnoteLines = [LURD for LURD in footnoteLines if check_isOnTheLeftOfColumn(*(LURD)) == True]
  406. # footnoteLines = [LURD for LURD in footnoteLines if check_isOnTheLeftOfColumn_considerLayout(*(LURD)) == True] # 不具有泛化性。不用了。
  407. #--------------------------------- 通过footnoteLine获取bbox -------------------------------#
  408. def get_footnoteBbox(L: float, U: float, R: float, D: float) -> (float, float, float, float):
  409. """
  410. 检查线下是否紧挨着text
  411. """
  412. L1, U1, R1, D1 = L, U, R, D
  413. raw_bboxes = []
  414. for i in range(len(text_blocks)):
  415. bbox = text_blocks[i]['bbox']
  416. L2, U2, R2, D2 = bbox
  417. if (D2 - U2) / (R2 - L2) > 10 and (R2 - L2) < pageR / 6:
  418. continue # 一个很窄的,竖直的长条。比如,arXiv预印本,左侧的arXiv标志信息。
  419. if U2 < D2 < U1:
  420. continue # 在线上面
  421. under_THRESHOLD = min(D1 + textSize_withMaxFreq * 20, pageD * 0.98)
  422. if U2 < under_THRESHOLD:
  423. ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R1, L2, R2)
  424. if max(ratio_1, ratio_2) > 0.8:
  425. raw_bboxes.append((L2, U2, R2, D2))
  426. # print(L1, U1, R1, D1)
  427. # print(raw_bboxes)
  428. if len(raw_bboxes) == 0:
  429. return []
  430. raw_bboxes.sort(key = lambda LURD: (LURD[1], LURD[0]))
  431. raw_bboxes = [LURD for LURD in raw_bboxes if (abs(LURD[0] - L1) < textSize_withMaxFreq * 6 or L1 < LURD[0])] # footnote的bbox,应该都是左端对齐的
  432. if len(raw_bboxes) == 0:
  433. return []
  434. #------------------ full column和sub column混合,肯定也不行 ------------------#
  435. LL, UU, RR, DD = raw_bboxes[0]
  436. for L, U, R, D in raw_bboxes:
  437. LL, UU, RR, DD = min(LL, L), min(UU, U), max(RR, R), max(DD, D)
  438. for L, U, R, D in raw_bboxes:
  439. if (RR - LL) > pageR*0.8 and (R - L) > pageR * 0.15 and (RR - LL) / (R - L) > 2:
  440. return []
  441. if abs(LL - L) > textSize_withMaxFreq * 3:
  442. return []
  443. #-------------------- 太高了的,full column的框。不行 ----------------------#
  444. if UU < 650/800 * pageD and (RR - LL) > 0.5 * pageR:
  445. return []
  446. #-------------- 第一段字数很少。后面的段字数很多,也不行 ----------------#
  447. if len(raw_bboxes) > 1:
  448. bbox_square = []
  449. for L, U, R, D in raw_bboxes:
  450. cur_s = abs(R - L) * abs(D - U)
  451. bbox_square.append(cur_s)
  452. s0 = bbox_square[0]
  453. s1n = sum(bbox_square[1: ]) / len(bbox_square[1: ])
  454. if s1n / s0 > 10 or max(bbox_square) / s0 > 15:
  455. return []
  456. raw_bboxes += [(LL, UU, RR, DD)]
  457. return raw_bboxes
  458. # print(footnoteLines)
  459. footnoteBboxes = []
  460. for L, U, R, D in footnoteLines:
  461. cur = get_footnoteBbox(L, U, R, D)
  462. if len(cur) > 0:
  463. footnoteBboxes.append((L, U, R, D))
  464. footnoteBboxes += cur
  465. footnoteBboxes = list(set(footnoteBboxes))
  466. return footnoteBboxes
  467. def __bbox_in(box1, box2):
  468. """
  469. box1是否在box2中
  470. """
  471. L1, U1, R1, D1 = box1
  472. L2, U2, R2, D2 = box2
  473. if int(L2) <= int(L1) and int(U2) <= int(U1) and int(R1) <= int(R2) and int(D1) <= int(D2):
  474. return True
  475. return False
  476. def remove_footnote_text(raw_text_block, footnote_bboxes):
  477. """
  478. :param raw_text_block: str类型,是当前页的文本内容
  479. :param footnoteBboxes: list类型,是当前页的脚注bbox
  480. """
  481. footnote_text_blocks = []
  482. for block in raw_text_block:
  483. text_bbox = block['bbox']
  484. # TODO 更严谨点在line级别做
  485. if any([_is_in_or_part_overlap(text_bbox, footnote_bbox) for footnote_bbox in footnote_bboxes]):
  486. #if any([text_bbox[3]>=footnote_bbox[1] for footnote_bbox in footnote_bboxes]):
  487. block['tag'] = 'footnote'
  488. footnote_text_blocks.append(block)
  489. #raw_text_block.remove(block)
  490. # 移除,不能再内部移除,否则会出错
  491. for block in footnote_text_blocks:
  492. raw_text_block.remove(block)
  493. return raw_text_block, footnote_text_blocks
  494. def remove_footnote_image(image_blocks, footnote_bboxes):
  495. """
  496. :param image_bboxes: list类型,是当前页的图片bbox(结构体)
  497. :param footnoteBboxes: list类型,是当前页的脚注bbox
  498. """
  499. footnote_imgs_blocks = []
  500. for image_block in image_blocks:
  501. if any([__bbox_in(image_block['bbox'], footnote_bbox) for footnote_bbox in footnote_bboxes]):
  502. footnote_imgs_blocks.append(image_block)
  503. for footnote_imgs_block in footnote_imgs_blocks:
  504. image_blocks.remove(footnote_imgs_block)
  505. return image_blocks, footnote_imgs_blocks
  506. def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes, header_bboxs, footer_bboxs, page_no_bboxs, page_w, page_h):
  507. """
  508. 删除页眉页脚,页码
  509. 从line级别进行删除,删除之后观察这个text-block是否是空的,如果是空的,则移动到remove_list中
  510. """
  511. header = []
  512. footer = []
  513. if len(header)==0:
  514. model_header = header_bboxs
  515. if model_header:
  516. x0 = min([x for x,_,_,_ in model_header])
  517. y0 = min([y for _,y,_,_ in model_header])
  518. x1 = max([x1 for _,_,x1,_ in model_header])
  519. y1 = max([y1 for _,_,_,y1 in model_header])
  520. header = [x0, y0, x1, y1]
  521. if len(footer)==0:
  522. model_footer = footer_bboxs
  523. if model_footer:
  524. x0 = min([x for x,_,_,_ in model_footer])
  525. y0 = min([y for _,y,_,_ in model_footer])
  526. x1 = max([x1 for _,_,x1,_ in model_footer])
  527. y1 = max([y1 for _,_,_,y1 in model_footer])
  528. footer = [x0, y0, x1, y1]
  529. header_y0 = 0 if len(header) == 0 else header[3]
  530. footer_y0 = page_h if len(footer) == 0 else footer[1]
  531. if page_no_bboxs:
  532. top_part = [b for b in page_no_bboxs if b[3] < page_h/2]
  533. btn_part = [b for b in page_no_bboxs if b[1] > page_h/2]
  534. top_max_y0 = max([b[1] for b in top_part]) if top_part else 0
  535. btn_min_y1 = min([b[3] for b in btn_part]) if btn_part else page_h
  536. header_y0 = max(header_y0, top_max_y0)
  537. footer_y0 = min(footer_y0, btn_min_y1)
  538. content_boundry = [0, header_y0, page_w, footer_y0]
  539. header = [0,0, page_w, header_y0]
  540. footer = [0, footer_y0, page_w, page_h]
  541. """以上计算出来了页眉页脚的边界,下面开始进行删除"""
  542. text_block_to_remove = []
  543. # 首先检查每个textblock
  544. for blk in text_raw_blocks:
  545. if len(blk['lines']) > 0:
  546. for line in blk['lines']:
  547. line_del = []
  548. for span in line['spans']:
  549. span_del = []
  550. if span['bbox'][3] < header_y0:
  551. span_del.append(span)
  552. elif _is_in_or_part_overlap(span['bbox'], header) or _is_in_or_part_overlap(span['bbox'], footer):
  553. span_del.append(span)
  554. for span in span_del:
  555. line['spans'].remove(span)
  556. if not line['spans']:
  557. line_del.append(line)
  558. for line in line_del:
  559. blk['lines'].remove(line)
  560. else:
  561. # if not blk['lines']:
  562. blk['tag'] = 'in-foot-header-area'
  563. text_block_to_remove.append(blk)
  564. """有的时候由于pageNo太小了,总是会有一点和content_boundry重叠一点,被放入正文,因此对于pageNo,进行span粒度的删除"""
  565. page_no_block_2_remove = []
  566. if page_no_bboxs:
  567. for pagenobox in page_no_bboxs:
  568. for block in text_raw_blocks:
  569. if _is_in_or_part_overlap(pagenobox, block['bbox']): # 在span级别删除页码
  570. for line in block['lines']:
  571. for span in line['spans']:
  572. if _is_in_or_part_overlap(pagenobox, span['bbox']):
  573. #span['text'] = ''
  574. span['tag'] = "page-no"
  575. # 检查这个block是否只有这一个span,如果是,那么就把这个block也删除
  576. if len(line['spans']) == 1 and len(block['lines'])==1:
  577. page_no_block_2_remove.append(block)
  578. else:
  579. # 测试最后一个是不是页码:规则是,最后一个block仅有1个line,一个span,且text是数字,空格,符号组成,不含字母,并且包含数字
  580. if len(text_raw_blocks) > 0:
  581. text_raw_blocks.sort(key=lambda x: x['bbox'][1], reverse=True)
  582. last_block = text_raw_blocks[0]
  583. if len(last_block['lines']) == 1:
  584. last_line = last_block['lines'][0]
  585. if len(last_line['spans']) == 1:
  586. last_span = last_line['spans'][0]
  587. if last_span['text'].strip() and not re.search('[a-zA-Z]', last_span['text']) and re.search('[0-9]', last_span['text']):
  588. last_span['tag'] = "page-no"
  589. page_no_block_2_remove.append(last_block)
  590. for b in page_no_block_2_remove:
  591. text_block_to_remove.append(b)
  592. for blk in text_block_to_remove:
  593. if blk in text_raw_blocks:
  594. text_raw_blocks.remove(blk)
  595. text_block_remain = text_raw_blocks
  596. image_bbox_to_remove = [bbox for bbox in image_bboxes if not _is_in_or_part_overlap(bbox, content_boundry)]
  597. image_bbox_remain = [bbox for bbox in image_bboxes if _is_in_or_part_overlap(bbox, content_boundry)]
  598. table_bbox_to_remove = [bbox for bbox in table_bboxes if not _is_in_or_part_overlap(bbox, content_boundry)]
  599. table_bbox_remain = [bbox for bbox in table_bboxes if _is_in_or_part_overlap(bbox, content_boundry)]
  600. return image_bbox_remain, table_bbox_remain, text_block_remain, text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove