# pdf2text_getNumberOfColumn.py
from typing import Dict, List, Tuple

from magic_pdf.libs import fitz
  3. def show_image(item, title=""):
  4. """Display a pixmap.
  5. Just to display Pixmap image of "item" - ignore the man behind the curtain.
  6. Args:
  7. item: any PyMuPDF object having a "get_pixmap" method.
  8. title: a string to be used as image title
  9. Generates an RGB Pixmap from item using a constant DPI and using matplotlib
  10. to show it inline of the notebook.
  11. """
  12. DPI = 150 # use this resolution
  13. import numpy as np
  14. import matplotlib.pyplot as plt
  15. # %matplotlib inline
  16. pix = item.get_pixmap(dpi=DPI)
  17. img = np.ndarray([pix.h, pix.w, 3], dtype=np.uint8, buffer=pix.samples_mv)
  18. plt.figure(dpi=DPI) # set the figure's DPI
  19. plt.title(title) # set title of image
  20. _ = plt.imshow(img, extent=(0, pix.w * 72 / DPI, pix.h * 72 / DPI, 0))
  21. def calculate_overlapRatio_between_line1_and_line2(L1: float, R1: float, L2: float, R2: float) -> (float, float):
  22. # 计算两个line,重叠line各占2个line长度的比例
  23. if max(L1, L2) > min(R1, R2):
  24. return 0, 0
  25. if L1 == R1 or L2 == R2:
  26. return 0, 0
  27. overlap_line = min(R1, R2) - max(L1, L2)
  28. return overlap_line / (R1 - L1), overlap_line / (R2 - L2)
  29. def get_targetAxis_and_splitAxis(page_ID: int, page: fitz.Page, columnNumber: int, textBboxs: List[(float, float, float, float)]) -> (List[float], List[float]):
  30. """
  31. param: page: fitz解析出来的格式
  32. param: columnNumber: Text的列数
  33. param: textBboxs: 文本块list。 [(L, U, R, D), ... ]
  34. return:
  35. """
  36. INF = 10 ** 9
  37. pageL, pageU, pageR, pageD = INF, INF, 0, 0
  38. for L, U, R, D in textBboxs:
  39. assert L <= R and U <= D
  40. pageL = min(pageL, L)
  41. pageR = max(pageR, R)
  42. pageU = min(pageU, U)
  43. pageD = max(pageD, D)
  44. pageWidth = pageR - pageL
  45. pageHeight = pageD - pageU
  46. pageL -= pageWidth / 10 # 10是经验值
  47. pageR += pageWidth / 10
  48. pageU -= pageHeight / 10
  49. pageD += pageHeight / 10
  50. pageWidth = pageR - pageL
  51. pageHeight = pageD - pageU
  52. x_targetAxis = []
  53. x_splitAxis = []
  54. for i in range(0, columnNumber * 2 + 1):
  55. if i & 1:
  56. x_targetAxis.append(pageL + pageWidth / (2 * columnNumber) * i)
  57. else:
  58. x_splitAxis.append(pageL + pageWidth / (2 * columnNumber) * i)
  59. # # 可视化:分列的外框
  60. # path_bbox = []
  61. # N = len(x_targetAxis)
  62. # for i in range(N):
  63. # L, R = x_splitAxis[i], x_splitAxis[i + 1]
  64. # path_bbox.append((L, pageU, R, pageD))
  65. # shape = page.new_shape()
  66. # # iterate over the bboxes
  67. # color_map = [fitz.pdfcolor["red"], fitz.pdfcolor["blue"], fitz.pdfcolor["yellow"], fitz.pdfcolor["black"], fitz.pdfcolor["green"], fitz.pdfcolor["brown"]]
  68. # for i, rect in enumerate(path_bbox):
  69. # # if i < 20:
  70. # # continue
  71. # shape.draw_rect(rect) # draw a border
  72. # shape.insert_text(Point(rect[0], rect[1])+(5, 15), str(i), color=fitz.pdfcolor["blue"])
  73. # shape.finish(color=color_map[i%len(color_map)])
  74. # # shape.finish(color=fitz.pdfcolor["blue"])
  75. # shape.commit() # store to the page
  76. # # if i == 3:
  77. # # print(rect)
  78. # # break
  79. # # print(rect)
  80. # show_image(page, f"Table & Header BBoxes")
  81. return x_targetAxis, x_splitAxis
  82. def calculate_loss(page_ID: int, x_targetAxis: List[float], x_splitAxis: List[float], textBboxs: List[(float, float, float, float)]) -> (float, bool):
  83. INF = 10 ** 9
  84. # page_artbox = page.artbox
  85. # pageL, pageU, pageR, pageD = page_artbox[0], page_artbox[1], page_artbox[2], page_artbox[3]
  86. pageL, pageU, pageR, pageD = INF, INF, 0, 0
  87. for L, U, R, D in textBboxs:
  88. assert L <= R and U <= D
  89. pageL = min(pageL, L)
  90. pageR = max(pageR, R)
  91. pageU = min(pageU, U)
  92. pageD = max(pageD, D)
  93. pageWidth = pageR - pageL
  94. pageHeight = pageD - pageU
  95. pageL -= pageWidth / 10
  96. pageR += pageWidth / 10
  97. pageU -= pageHeight / 10
  98. pageD += pageHeight / 10
  99. pageWidth = pageR - pageL
  100. pageHeight = pageD - pageU
  101. col_N = len(x_targetAxis) # 列数
  102. col_texts_mid = [[] for _ in range(col_N)]
  103. col_texts_LR = [[] for _ in range(col_N)]
  104. oneLocateLoss_mid = 0
  105. oneLocateLoss_LR = 0
  106. oneLocateCnt_mid = 0 # 完美在一列中的个数
  107. oneLocateCnt_LR = 0
  108. oneLocateSquare_mid = 0.0 # 完美在一列的面积
  109. oneLocateSquare_LR = 0.0
  110. multiLocateLoss_mid = 0
  111. multiLocateLoss_LR = 0
  112. multiLocateCnt_mid = 0 # 在多列中的个数
  113. multiLocateCnt_LR = 0
  114. multiLocateSquare_mid = 0.0 # 在多列中的面积
  115. multiLocateSquare_LR = 0.0
  116. allLocateLoss_mid = 0
  117. allLocateLoss_LR = 0
  118. allLocateCnt_mid = 0 # 横跨页面的大框的个数
  119. allLocateCnt_LR = 0
  120. allLocateSquare_mid = 0.0 # 横跨整个页面的个数
  121. allLocateSquare_LR = 0.0
  122. isSimpleCondition = True # 就1个。2种方式,只要有一种情况不规整,就是不规整。
  123. colID_Textcnt_mid = [0 for _ in range(col_N)] # 每一列中有多少个Text块,根据mid判断的
  124. colID_Textcnt_LR = [0 for _ in range(col_N)] # 每一列中有多少个Text块,根据区间边界判断
  125. allLocateBboxs_mid = [] # 跨整页的,bbox
  126. allLocateBboxs_LR = []
  127. non_allLocateBboxs_mid = []
  128. non_allLocateBboxs_LR = [] # 不在单独某一列,但又不是全列
  129. for L, U, R, D in textBboxs:
  130. if D - U < 40: # 现在还没拼接好。先简单这样过滤页眉。也会牺牲一些很窄的长条
  131. continue
  132. if R - L < 40:
  133. continue
  134. located_cols_mid = []
  135. located_cols_LR = []
  136. for col_ID in range(col_N):
  137. if col_N == 1:
  138. located_cols_mid.append(col_ID)
  139. located_cols_LR.append(col_ID)
  140. else:
  141. if L <= x_targetAxis[col_ID] <= R:
  142. located_cols_mid.append(col_ID)
  143. if calculate_overlapRatio_between_line1_and_line2(x_splitAxis[col_ID], x_splitAxis[col_ID + 1], L, R)[0] >= 0.2:
  144. located_cols_LR.append(col_ID)
  145. if len(located_cols_mid) == col_N:
  146. allLocateBboxs_mid.append((L, U, R, D))
  147. else:
  148. non_allLocateBboxs_mid.append((L, U, R, D))
  149. if len(located_cols_LR) == col_N:
  150. allLocateBboxs_LR.append((L, U, R, D))
  151. else:
  152. non_allLocateBboxs_LR.append((L, U, R, D))
  153. allLocateBboxs_mid.sort(key=lambda LURD: (LURD[1], LURD[0]))
  154. non_allLocateBboxs_mid.sort(key=lambda LURD: (LURD[1], LURD[0]))
  155. allLocateBboxs_LR.sort(key=lambda LURD: (LURD[1], LURD[0]))
  156. non_allLocateBboxs_LR.sort(key=lambda LURD: (LURD[1], LURD[0]))
  157. # --------------------判断,是不是有标题类的小块,掺杂在一列的pdf页面里。-------------#
  158. isOneClumn = False
  159. under_cnt = 0
  160. under_square = 0.0
  161. before_cnt = 0
  162. before_square = 0.0
  163. for nL, nU, nR, nD in non_allLocateBboxs_mid:
  164. cnt = 0
  165. for L, U, R, D in allLocateBboxs_mid:
  166. if nD <= U:
  167. cnt += 1
  168. if cnt >= 1:
  169. before_cnt += cnt
  170. before_square += (R - L) * (D - U) * cnt
  171. else:
  172. under_cnt += 1
  173. under_square += (R - L) * (D - U) * cnt
  174. if (before_square + under_square) != 0 and before_square / (before_square + under_square) >= 0.2:
  175. isOneClumn = True
  176. if isOneClumn == True and col_N != 1:
  177. return INF, False
  178. if isOneClumn == True and col_N == 1:
  179. return 0, True
  180. #### 根据边界的统计情况,再判断一次
  181. isOneClumn = False
  182. under_cnt = 0
  183. under_square = 0.0
  184. before_cnt = 0
  185. before_square = 0.0
  186. for nL, nU, nR, nD in non_allLocateBboxs_LR:
  187. cnt = 0
  188. for L, U, R, D in allLocateBboxs_LR:
  189. if nD <= U:
  190. cnt += 1
  191. if cnt >= 1:
  192. before_cnt += cnt
  193. before_square += (R - L) * (D - U) * cnt
  194. else:
  195. under_cnt += 1
  196. under_square += (R - L) * (D - U) * cnt
  197. if (before_square + under_square) != 0 and before_square / (before_square + under_square) >= 0.2:
  198. isOneClumn = True
  199. if isOneClumn == True and col_N != 1:
  200. return INF, False
  201. if isOneClumn == True and col_N == 1:
  202. return 0, True
  203. for L, U, R, D in textBboxs:
  204. assert L < R and U < D, 'There is an error on bbox of text when calculate loss!'
  205. # 简单排除页眉、迷你小块
  206. # if (D - U) < pageHeight / 15 < 40 or (R - L) < pageWidth / 8:
  207. if (D - U) < 40:
  208. continue
  209. if (R - L) < 40:
  210. continue
  211. mid = (L + R) / 2
  212. located_cols_mid = [] # 在哪一列里,根据中点来判断
  213. located_cols_LR = [] # 在哪一列里,根据边界判断
  214. for col_ID in range(col_N):
  215. if col_N == 1:
  216. located_cols_mid.append(col_ID)
  217. else:
  218. # 根据中点判断
  219. if L <= x_targetAxis[col_ID] <= R:
  220. located_cols_mid.append(col_ID)
  221. # 根据边界判断
  222. if calculate_overlapRatio_between_line1_and_line2(x_splitAxis[col_ID], x_splitAxis[col_ID + 1], L, R)[0] >= 0.2:
  223. located_cols_LR.append(col_ID)
  224. ## 1列的情形
  225. if col_N == 1:
  226. oneLocateLoss_mid += abs(mid - x_targetAxis[located_cols_mid[0]]) * (D - U) * (R - L)
  227. # oneLocateLoss_mid += abs(L - x_splitAxis[located_cols[0]]) * (D - U) * (R - L)
  228. oneLocateLoss_LR += abs(L - x_splitAxis[located_cols_mid[0]]) * (D - U) * (R - L)
  229. oneLocateCnt_mid += 1
  230. oneLocateSquare_mid += (D - U) * (R - L)
  231. ## 多列的情形
  232. else:
  233. ######## 根据mid判断
  234. if len(located_cols_mid) == 1:
  235. oneLocateLoss_mid += abs(mid - x_targetAxis[located_cols_mid[0]]) * (D - U) * (R - L)
  236. # oneLocateLoss_mid += abs(L - x_splitAxis[located_cols[0]]) * (D - U) * (R - L)
  237. oneLocateCnt_mid += 1
  238. oneLocateSquare_mid += (D - U) * (R - L)
  239. elif 1 <= len(located_cols_mid) < col_N:
  240. ll, rr = located_cols_mid[0], located_cols_mid[-1]
  241. # multiLocateLoss_mid += abs(mid - (x_targetAxis[ll] + x_targetAxis[rr]) / 2) * (D - U) * (R - L)
  242. multiLocateLoss_mid += abs(mid - x_targetAxis[ll]) * (D - U) * (R - L)
  243. # multiLocateLoss_mid += abs(mid - (pageL + pageR) / 2) * (D - U) * (R - L)
  244. multiLocateCnt_mid += 1
  245. multiLocateSquare_mid += (D - U) * (R - L)
  246. isSimpleCondition = False
  247. else:
  248. allLocateLoss_mid += abs(mid - (pageR + pageL) / 2) * (D - U) * (R - L)
  249. allLocateCnt_mid += 1
  250. allLocateSquare_mid += (D - U) * (R - L)
  251. isSimpleCondition = False
  252. ######## 根据区间的边界判断
  253. if len(located_cols_LR) == 1:
  254. oneLocateLoss_LR += abs(mid - x_targetAxis[located_cols_LR[0]]) * (D - U) * (R - L)
  255. # oneLocateLoss_LR += abs(L - x_splitAxis[located_cols_LR[0]]) * (D - U) * (R - L)
  256. oneLocateCnt_LR += 1
  257. oneLocateSquare_LR += (D - U) * (R - L)
  258. elif 1 <= len(located_cols_LR) < col_N:
  259. ll, rr = located_cols_LR[0], located_cols_LR[-1]
  260. # multiLocateLoss_LR += abs(mid - (x_targetAxis[ll] + x_targetAxis[rr]) / 2) * (D - U) * (R - L)
  261. multiLocateLoss_LR += abs(mid - x_targetAxis[ll]) * (D - U) * (R - L)
  262. # multiLocateLoss_LR += abs(mid - (pageL + pageR) / 2) * (D - U) * (R - L)
  263. multiLocateCnt_LR += 1
  264. multiLocateSquare_LR += (D - U) * (R - L)
  265. isSimpleCondition = False
  266. else:
  267. allLocateLoss_LR += abs(mid - (pageR + pageL) / 2) * (D - U) * (R - L)
  268. allLocateCnt_LR += 1
  269. allLocateSquare_LR += (D - U) * (R - L)
  270. isSimpleCondition = False
  271. tot_TextCnt = oneLocateCnt_mid + multiLocateCnt_mid + allLocateCnt_mid
  272. tot_TextSquare = oneLocateSquare_mid + multiLocateSquare_mid + allLocateSquare_mid
  273. # 1列的情形
  274. if tot_TextSquare != 0 and allLocateSquare_mid / tot_TextSquare >= 0.85 and col_N == 1:
  275. return 0, True
  276. # 多列的情形
  277. # if col_N >= 2:
  278. # if allLocateCnt >= 1:
  279. # oneLocateLoss_mid += ((pageR - pageL)) * oneLocateCnt_mid
  280. # multiLocateLoss_mid += ((pageR - pageL) ) * multiLocateCnt_mid
  281. # else:
  282. # if multiLocateCnt_mid >= 1:
  283. # oneLocateLoss_mid += ((pageR - pageL)) * oneLocateCnt_mid
  284. totLoss_mid = oneLocateLoss_mid + multiLocateLoss_mid + allLocateLoss_mid
  285. totLoss_LR = oneLocateCnt_LR + multiLocateCnt_LR + allLocateLoss_LR
  286. return totLoss_mid + totLoss_LR, isSimpleCondition
  287. def get_columnNumber(page_ID: int, page: fitz.Page, textBboxs) -> (int, float):
  288. columnNumber_loss = dict()
  289. columnNumber_isSimpleCondition = dict()
  290. #### 枚举列数
  291. for columnNumber in range(1, 5):
  292. # print('---------{}--------'.format(columnNumber))
  293. x_targetAxis, x_splitAxis = get_targetAxis_and_splitAxis(page_ID, page, columnNumber, textBboxs)
  294. loss, isSimpleCondition = calculate_loss(page_ID, x_targetAxis, x_splitAxis, textBboxs)
  295. columnNumber_loss[columnNumber] = loss
  296. columnNumber_isSimpleCondition[columnNumber] = isSimpleCondition
  297. col_idxs = [i for i in range(1, len(columnNumber_loss) + 1)]
  298. col_idxs.sort(key=lambda i: (columnNumber_loss[i], i))
  299. return col_idxs, columnNumber_loss, columnNumber_isSimpleCondition