| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345 |
from typing import Dict, List, Tuple

from magic_pdf.libs import fitz
def show_image(item, title=""):
    """Render ``item`` to an RGB pixmap and draw it with matplotlib.

    Args:
        item: any PyMuPDF object having a "get_pixmap" method.
        title: a string to be used as the image title.

    The pixmap is produced at a fixed resolution and handed to
    ``plt.imshow``; the function itself never calls ``plt.show()``, so it
    relies on the caller's matplotlib backend (e.g. a notebook running
    ``%matplotlib inline``) to actually display the figure.
    """
    # Imported lazily so that importing this module does not require the
    # plotting stack.
    import numpy as np
    import matplotlib.pyplot as plt

    resolution = 150  # DPI used both for rasterising and for the figure

    pixmap = item.get_pixmap(dpi=resolution)
    height, width = pixmap.h, pixmap.w
    # View the pixmap's sample buffer as a (height, width, 3) uint8 array.
    pixels = np.ndarray([height, width, 3], dtype=np.uint8, buffer=pixmap.samples_mv)

    # Axis extent: pixel sizes scaled by 72 / DPI.
    extent = (0, width * 72 / resolution, height * 72 / resolution, 0)

    plt.figure(dpi=resolution)  # set the figure's DPI
    plt.title(title)  # set title of image
    plt.imshow(pixels, extent=extent)
def calculate_overlapRatio_between_line1_and_line2(L1: float, R1: float, L2: float, R2: float) -> Tuple[float, float]:
    """Return how much of each 1-D segment is covered by their overlap.

    Args:
        L1, R1: left and right end of the first segment.
        L2, R2: left and right end of the second segment.

    Returns:
        ``(overlap / (R1 - L1), overlap / (R2 - L2))``, i.e. the overlap
        length as a fraction of the first and of the second segment.
        ``(0.0, 0.0)`` is returned when the segments do not intersect or when
        either segment has zero length (which also avoids a division by zero).
    """
    overlap_left = max(L1, L2)
    overlap_right = min(R1, R2)

    # Disjoint segments.
    if overlap_left > overlap_right:
        return 0.0, 0.0
    # Zero-length segment: a ratio is undefined, report "no overlap".
    if L1 == R1 or L2 == R2:
        return 0.0, 0.0

    overlap_line = overlap_right - overlap_left
    return overlap_line / (R1 - L1), overlap_line / (R2 - L2)
def get_targetAxis_and_splitAxis(page_ID: int, page: "fitz.Page", columnNumber: int,
                                 textBboxs: List[Tuple[float, float, float, float]]
                                 ) -> Tuple[List[float], List[float]]:
    """Compute the x positions of column centres and column boundaries.

    The horizontal extent of all text boxes is widened by 10% of its width on
    each side and then cut into ``columnNumber`` columns of equal width.

    Args:
        page_ID: not used by this function.
        page: not used by this function (the annotation is a string so that
            defining the function does not have to resolve ``fitz.Page``).
        columnNumber: number of text columns to lay out.
        textBboxs: text blocks as ``[(L, U, R, D), ...]``.

    Returns:
        ``(x_targetAxis, x_splitAxis)``: the ``columnNumber`` column centre
        x-coordinates and the ``columnNumber + 1`` column boundary
        x-coordinates, both in ascending order.

    Raises:
        AssertionError: if a box has ``L > R`` or ``U > D``.
        ZeroDivisionError: if ``columnNumber`` is 0.
    """
    INF = 10 ** 9

    # Horizontal extent of the text. With an empty ``textBboxs`` the sentinels
    # (INF, 0) are used as-is, exactly like the original implementation.
    pageL, pageR = INF, 0
    for L, U, R, D in textBboxs:
        assert L <= R and U <= D
        pageL = min(pageL, L)
        pageR = max(pageR, R)

    pageWidth = pageR - pageL
    pageL -= pageWidth / 10  # 10 is an empirical value
    pageR += pageWidth / 10
    pageWidth = pageR - pageL

    # The padded width is divided into 2 * columnNumber half-columns:
    # odd multiples are column centres, even multiples are column boundaries.
    half_column = pageWidth / (2 * columnNumber)
    x_targetAxis = [pageL + half_column * i for i in range(1, columnNumber * 2 + 1, 2)]
    x_splitAxis = [pageL + half_column * i for i in range(0, columnNumber * 2 + 1, 2)]

    return x_targetAxis, x_splitAxis
def _locate_columns(L, R, x_targetAxis, x_splitAxis):
    """Return the columns a box spanning [L, R] falls into, by two criteria.

    Returns ``(cols_mid, cols_LR)``:
      * ``cols_mid``: columns whose centre ``x_targetAxis[c]`` lies in [L, R];
      * ``cols_LR``: columns of which the box covers at least 20%, using the
        boundaries ``x_splitAxis[c]`` .. ``x_splitAxis[c + 1]``.
    With a single column every box is assigned to column 0 by both criteria.
    """
    col_N = len(x_targetAxis)
    if col_N == 1:
        return [0], [0]
    cols_mid = [c for c in range(col_N) if L <= x_targetAxis[c] <= R]
    cols_LR = [
        c for c in range(col_N)
        if calculate_overlapRatio_between_line1_and_line2(x_splitAxis[c], x_splitAxis[c + 1], L, R)[0] >= 0.2
    ]
    return cols_mid, cols_LR


def _has_block_above_full_width_block(partial_bboxs, full_bboxs):
    """Tell whether any box in ``partial_bboxs`` ends at or before the top of
    some box in ``full_bboxs`` (its ``D`` is <= that box's ``U``).

    NOTE(review): the original code accumulated a "before" and an "under" area
    and tested ``before / (before + under) >= 0.2``. The "under" term was
    multiplied by a count that is always 0 in that branch, so the ratio was
    1.0 whenever "before" was non-zero and the test reduced to this existence
    check. The effective behaviour is kept; presumably a real area ratio was
    intended - confirm before changing.
    """
    return any(
        nD <= U
        for _, _, _, nD in partial_bboxs
        for _, U, _, _ in full_bboxs
    )


def calculate_loss(page_ID: int, x_targetAxis: List[float], x_splitAxis: List[float],
                   textBboxs: List[Tuple[float, float, float, float]]) -> Tuple[float, bool]:
    """Score how well the text boxes fit a given column layout (lower is better).

    Args:
        page_ID: not used by this function.
        x_targetAxis: x-coordinate of each column centre; its length is the
            number of columns.
        x_splitAxis: x-coordinates of the column boundaries; entry ``c`` and
            ``c + 1`` delimit column ``c``.
        textBboxs: text blocks as ``[(L, U, R, D), ...]``. Boxes lower than 40
            or narrower than 40 are ignored (a crude header / tiny-block filter
            that also drops some narrow strips).

    Returns:
        ``(loss, isSimpleCondition)``.
        * ``(10 ** 9, False)`` when the layout has several columns but a box
          that does not span all columns sits above a box that does.
        * ``(0, True)`` in the same situation for a one-column layout.
        * Otherwise the summed loss, and ``isSimpleCondition`` is ``True`` only
          if every kept box was assigned to exactly one column by both the
          centre criterion and the boundary criterion.

    Raises:
        AssertionError: if a box has ``L > R`` or ``U > D``; or, when no early
            return happened, if a box has ``L >= R`` or ``U >= D``.
    """
    INF = 10 ** 9
    MIN_SIDE = 40  # boxes with a side shorter than this are skipped

    # Horizontal text extent, padded by 10% of its width on both sides. Only
    # its centre is needed below.
    pageL, pageR = INF, 0
    for L, U, R, D in textBboxs:
        assert L <= R and U <= D
        pageL = min(pageL, L)
        pageR = max(pageR, R)
    pageWidth = pageR - pageL
    pageL -= pageWidth / 10
    pageR += pageWidth / 10
    page_center = (pageR + pageL) / 2

    col_N = len(x_targetAxis)  # number of columns

    # ---- Pass 1: split boxes into "spans every column" vs. "does not", once
    # per criterion (centre-based "mid" and boundary-based "LR").
    allLocateBboxs_mid, non_allLocateBboxs_mid = [], []
    allLocateBboxs_LR, non_allLocateBboxs_LR = [], []
    for L, U, R, D in textBboxs:
        if D - U < MIN_SIDE or R - L < MIN_SIDE:
            continue
        cols_mid, cols_LR = _locate_columns(L, R, x_targetAxis, x_splitAxis)
        if len(cols_mid) == col_N:
            allLocateBboxs_mid.append((L, U, R, D))
        else:
            non_allLocateBboxs_mid.append((L, U, R, D))
        if len(cols_LR) == col_N:
            allLocateBboxs_LR.append((L, U, R, D))
        else:
            non_allLocateBboxs_LR.append((L, U, R, D))

    # ---- Check whether small (title-like) blocks are mixed into what is
    # really a one-column page; first by centres, then again by boundaries.
    for partial_bboxs, full_bboxs in (
        (non_allLocateBboxs_mid, allLocateBboxs_mid),
        (non_allLocateBboxs_LR, allLocateBboxs_LR),
    ):
        if _has_block_above_full_width_block(partial_bboxs, full_bboxs):
            if col_N != 1:
                return INF, False
            return 0, True

    # ---- Pass 2: accumulate the loss.
    # Every term is |box centre - reference x| * height * width.
    oneLocateLoss_mid = 0    # boxes lying in exactly one column
    multiLocateLoss_mid = 0  # boxes lying in several, but not all, columns
    allLocateLoss_mid = 0    # remaining boxes (see NOTE below)
    oneLocateCnt_LR = 0
    multiLocateCnt_LR = 0
    allLocateLoss_LR = 0
    isSimpleCondition = True  # becomes False as soon as either criterion is irregular

    for L, U, R, D in textBboxs:
        assert L < R and U < D, 'There is an error on bbox of text when calculate loss!'
        # Crude removal of headers and tiny blocks.
        if (D - U) < MIN_SIDE or (R - L) < MIN_SIDE:
            continue

        mid = (L + R) / 2
        cols_mid, cols_LR = _locate_columns(L, R, x_targetAxis, x_splitAxis)

        # One-column layout: only the distance to the single column centre counts.
        if col_N == 1:
            oneLocateLoss_mid += abs(mid - x_targetAxis[cols_mid[0]]) * (D - U) * (R - L)
            continue

        # Multi-column layout, centre criterion.
        # NOTE(review): a box matching *no* column ends up in the last branch,
        # i.e. it is scored like a box spanning the whole page.
        if len(cols_mid) == 1:
            oneLocateLoss_mid += abs(mid - x_targetAxis[cols_mid[0]]) * (D - U) * (R - L)
        elif 1 < len(cols_mid) < col_N:
            # Measured against the left-most column the box touches.
            multiLocateLoss_mid += abs(mid - x_targetAxis[cols_mid[0]]) * (D - U) * (R - L)
            isSimpleCondition = False
        else:
            allLocateLoss_mid += abs(mid - page_center) * (D - U) * (R - L)
            isSimpleCondition = False

        # Multi-column layout, boundary criterion.
        if len(cols_LR) == 1:
            oneLocateCnt_LR += 1
        elif 1 < len(cols_LR) < col_N:
            multiLocateCnt_LR += 1
            isSimpleCondition = False
        else:
            allLocateLoss_LR += abs(mid - page_center) * (D - U) * (R - L)
            isSimpleCondition = False

    totLoss_mid = oneLocateLoss_mid + multiLocateLoss_mid + allLocateLoss_mid
    # NOTE(review): for the boundary criterion the original summed the *counts*
    # of one-/multi-column boxes, not their area-weighted losses (which it
    # accumulated but never read). This looks like a typo, but "fixing" it
    # changes the ranking between column counts (e.g. a one-column layout would
    # no longer score 0 for a single full-width box), so the behaviour is kept.
    # The original also had a "one column and >= 85% of the area spans the
    # page" shortcut here; it could never trigger because the page-spanning
    # area is only accumulated for multi-column layouts, so it was removed.
    totLoss_LR = oneLocateCnt_LR + multiLocateCnt_LR + allLocateLoss_LR
    return totLoss_mid + totLoss_LR, isSimpleCondition
def get_columnNumber(page_ID: int, page: "fitz.Page", textBboxs,
                     maxColumnNumber: int = 4) -> Tuple[List[int], Dict[int, float], Dict[int, bool]]:
    """Rank candidate column counts for a page by their layout loss.

    Every column count from 1 to ``maxColumnNumber`` is tried: the column axes
    are built with ``get_targetAxis_and_splitAxis`` and scored with
    ``calculate_loss``.

    Args:
        page_ID: forwarded to the helper functions.
        page: forwarded to ``get_targetAxis_and_splitAxis``.
        textBboxs: text blocks as ``[(L, U, R, D), ...]``.
        maxColumnNumber: largest column count to try (default 4, the value
            that used to be hard-coded).

    Returns:
        ``(col_idxs, columnNumber_loss, columnNumber_isSimpleCondition)``:
        * ``col_idxs``: the tried column counts sorted by ascending loss,
          ties broken in favour of the smaller count;
        * ``columnNumber_loss``: column count -> loss;
        * ``columnNumber_isSimpleCondition``: column count -> the flag
          returned by ``calculate_loss``.
    """
    columnNumber_loss = {}
    columnNumber_isSimpleCondition = {}

    # Enumerate the candidate column counts.
    for columnNumber in range(1, maxColumnNumber + 1):
        x_targetAxis, x_splitAxis = get_targetAxis_and_splitAxis(page_ID, page, columnNumber, textBboxs)
        loss, isSimpleCondition = calculate_loss(page_ID, x_targetAxis, x_splitAxis, textBboxs)
        columnNumber_loss[columnNumber] = loss
        columnNumber_isSimpleCondition[columnNumber] = isSimpleCondition

    col_idxs = sorted(columnNumber_loss, key=lambda n: (columnNumber_loss[n], n))
    return col_idxs, columnNumber_loss, columnNumber_isSimpleCondition
|