| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647 |
- import collections # 统计库
- import re
- from magic_pdf.libs.commons import fitz # pyMuPDF库
- #--------------------------------------- Tool Functions --------------------------------------#
def remove_special_chars(s: str) -> str:
    """Normalise text by dropping every character that is not an ASCII letter or digit.

    :param s: input text.
    :return: ``s`` with only ``a-z``, ``A-Z`` and ``0-9`` kept, in their original order.
    """
    return re.sub(r"[^a-zA-Z0-9]", "", s)
def check_rect1_sameWith_rect2(
    L1: float, U1: float, R1: float, D1: float,
    L2: float, U2: float, R2: float, D2: float,
) -> bool:
    """Return True when rect1 and rect2 have exactly the same four edges.

    Each rectangle is given as left (L), upper (U), right (R), down (D).
    """
    edges_1 = (L1, U1, R1, D1)
    edges_2 = (L2, U2, R2, D2)
    # Compare edge by edge with ``==`` (no tolerance).
    return all(a == b for a, b in zip(edges_1, edges_2))
def check_rect1_contains_rect2(
    L1: float, U1: float, R1: float, D1: float,
    L2: float, U2: float, R2: float, D2: float,
) -> bool:
    """Return True when rect2 lies inside rect1 (shared edges count as inside).

    Each rectangle is given as left (L), upper (U), right (R), down (D).
    The test also requires rect2 itself to be well ordered (L2 <= R2, U2 <= D2).
    """
    inside_horizontally = L1 <= L2 and L2 <= R2 and R2 <= R1
    inside_vertically = U1 <= U2 and U2 <= D2 and D2 <= D1
    return inside_horizontally and inside_vertically
def check_rect1_overlaps_rect2(
    L1: float, U1: float, R1: float, D1: float,
    L2: float, U2: float, R2: float, D2: float,
) -> bool:
    """Return True when rect1 and rect2 intersect.

    Rectangles that merely touch along an edge are also reported as overlapping.
    Each rectangle is given as left (L), upper (U), right (R), down (D).
    """
    inter_left = max(L1, L2)
    inter_right = min(R1, R2)
    inter_top = max(U1, U2)
    inter_bottom = min(D1, D2)
    return inter_left <= inter_right and inter_top <= inter_bottom
def calculate_overlapRatio_between_rect1_and_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> (float, float):
    """Compute how much of each rectangle is covered by their intersection.

    Each rectangle is given as left (L), upper (U), right (R), down (D).

    :return: ``(overlap_area / area_of_rect1, overlap_area / area_of_rect2)``;
             ``(0, 0)`` when the rectangles are disjoint or either has zero area.
    """
    inter_left = max(L1, L2)
    inter_right = min(R1, R2)
    inter_top = max(U1, U2)
    inter_bottom = min(D1, D2)
    # No intersection on at least one axis.
    if inter_right < inter_left or inter_bottom < inter_top:
        return 0, 0

    area_1 = (R1 - L1) * (D1 - U1)
    area_2 = (R2 - L2) * (D2 - U2)
    # Degenerate rectangles would make the ratios undefined.
    if area_1 == 0 or area_2 == 0:
        return 0, 0

    inter_area = (inter_right - inter_left) * (inter_bottom - inter_top)
    return inter_area / area_1, inter_area / area_2
def calculate_overlapRatio_between_line1_and_line2(L1: float, R1: float, L2: float, R2: float) -> (float, float):
    """Compute how much of each 1-D interval is covered by their intersection.

    :param L1: start of the first interval.
    :param R1: end of the first interval.
    :param L2: start of the second interval.
    :param R2: end of the second interval.
    :return: ``(overlap / length_of_line1, overlap / length_of_line2)``;
             ``(0, 0)`` when the intervals are disjoint or either has zero length.
    """
    inter_start = max(L1, L2)
    inter_end = min(R1, R2)
    if inter_start > inter_end:
        return 0, 0
    # A zero-length interval would make its ratio undefined.
    if L1 == R1 or L2 == R2:
        return 0, 0
    shared = inter_end - inter_start
    return shared / (R1 - L1), shared / (R2 - L2)
def check_rect_isLine(L: float, U: float, R: float, D: float) -> bool:
    """Decide whether a rectangle is really just a line.

    A rectangle counts as a line when either side is at most 3 units long,
    or when one side is at least 30 times longer than the other.

    :param L: left edge.
    :param U: upper edge.
    :param R: right edge.
    :param D: lower edge.
    :return: True for line-like rectangles, False otherwise.
    """
    width = R - L
    height = D - U
    # Very thin (or inverted/degenerate) rectangles are lines.
    if width <= 3 or height <= 3:
        return True
    # Both sides are > 3 here, so the divisions below cannot divide by zero.
    if width / height >= 30 or height / width >= 30:
        return True
    # Return an explicit bool instead of implicitly falling through to None.
    return False
def _figure_bboxes_should_merge(box_1, box_2) -> bool:
    """Decide whether two (L, U, R, D) figure boxes should be merged into one."""
    L1, U1, R1, D1 = box_1
    L2, U2, R2, D2 = box_2
    s1 = abs(R1 - L1) * abs(D1 - U1)
    s2 = abs(R2 - L2) * abs(D2 - U2)
    ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
    # box_1 mostly covered by box_2, or both substantially covering each other.
    if (ratio_1 >= 0.8 and ratio_2 <= 0.6) or (ratio_1 > 0.6 and ratio_2 > 0.6):
        return True
    # box_2 is much larger than box_1 and they overlap noticeably.
    # ratio_1 > 0.2 implies box_1 has non-zero area, so s1 != 0 here.
    if ratio_1 > 0.2 and s2 / s1 > 5:
        return True
    # The centre of one box falls inside the other.
    if (L1 <= (L2 + R2) / 2 <= R1) and (U1 <= (U2 + D2) / 2 <= D1):
        return True
    if (L2 <= (L1 + R1) / 2 <= R2) and (U2 <= (U1 + D1) / 2 <= D2):
        return True
    return False


def parse_images(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, junk_img_bojids=None):
    """Collect the bounding boxes of all figures on one PDF page.

    Figure candidates come from three sources and are then de-duplicated and merged:
    raster images reported by the page, groups of vector drawings ("svg"), and
    ``figure`` detections from the DocXChain layout JSON.

    NOTE(review): this function was reconstructed from a source whose indentation
    had been stripped; the nesting of the svg-grouping branches in particular
    should be confirmed against the canonical file.

    :param page_ID: index of this page inside the PDF. Kept for interface
        compatibility; it does not influence the returned boxes.
    :param page: the page as read by fitz (PyMuPDF).
    :param json_from_DocXchain_obj: DocXChain result for this page. The code reads
        ``['page_info']['width']``, ``['page_info']['height']`` and, for every entry
        of ``['layout_dets']``, the keys ``'poly'``, ``'category_id'`` and ``'score'``.
    :param junk_img_bojids: identifiers of images to ignore (compared against the
        first item of each ``page.get_images()`` entry). Defaults to no junk images.
    :return: list of ``[L, U, R, D]`` lists, one per detected figure.
    """
    if junk_img_bojids is None:
        junk_img_bojids = []

    # ------------------------------ page geometry ------------------------------ #
    DPI = 72  # render resolution used to obtain the page size in pixels
    pix = page.get_pixmap(dpi=DPI)
    pageL, pageU = 0, 0
    pageR, pageD = int(pix.w), int(pix.h)

    # ------------------- LURD of every text span on the page ------------------- #
    textLine_blocks = []
    blocks = page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]
    for block in blocks:
        for line in block['lines']:
            for span in line['spans']:
                L, U, R, D = span['bbox']
                L, R = min(L, R), max(L, R)
                U, D = min(U, D), max(U, D)
                textLine_blocks.append((L, U, R, D))
    textLine_blocks.sort(key=lambda LURD: (LURD[1], LURD[0]))

    # ------------------------------ raster images ------------------------------ #
    img_bboxs = []
    for raw_img in page.get_images():
        img_id = raw_img[0]
        if img_id in junk_img_bojids:
            continue
        try:
            rec = page.get_image_rects(img_id, transform=True)[0][0]
            L, U, R, D = int(rec[0]), int(rec[1]), int(rec[2]), int(rec[3])
        except Exception:
            # Best effort: an image whose placement cannot be resolved is skipped.
            continue
        L, R = min(L, R), max(L, R)
        U, D = min(U, D), max(U, D)
        # Must be a non-empty rectangle fully inside the page.
        if not (pageL <= L < R <= pageR and pageU <= U < D <= pageD):
            continue
        # Skip images spanning the full page width or the full page height.
        if pageL == L and R == pageR:
            continue
        if pageU == U and D == pageD:
            continue
        img_bboxs.append((L, U, R, D))

    # Overlapping images suggest the reported size/position is unreliable: drop them.
    imgs_ok = [True] * len(img_bboxs)
    for i in range(len(img_bboxs)):
        L1, U1, R1, D1 = img_bboxs[i]
        s1 = abs(R1 - L1) * abs(D1 - U1)
        for j in range(i + 1, len(img_bboxs)):
            L2, U2, R2, D2 = img_bboxs[j]
            s2 = abs(R2 - L2) * abs(D2 - U2)
            ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
            if ratio_1 > 0 and ratio_2 > 0:
                if ratio_1 == 1 and ratio_2 > 0.8:
                    imgs_ok[i] = False
                elif ratio_1 > 0.8 and ratio_2 == 1:
                    imgs_ok[j] = False
                elif s1 > 20000 and s2 > 20000 and ratio_1 > 0.4 and ratio_2 > 0.4:
                    imgs_ok[i] = False
                    imgs_ok[j] = False
                elif s1 / s2 > 5 and ratio_2 > 0.5:
                    imgs_ok[j] = False
                elif s2 / s1 > 5 and ratio_1 > 0.5:
                    imgs_ok[i] = False
    # Filter against the full-length mask (filtering one list first and then
    # reusing its shortened length would truncate the remaining boxes).
    img_bboxs = [bbox for bbox, ok in zip(img_bboxs, imgs_ok) if ok]

    # ---------------------------- vector drawings ("svg") ---------------------------- #
    # De-duplicate drawings that share exactly the same (normalised) integer rectangle.
    seen_rects = set()
    svgs = []
    for drawing in page.get_drawings():
        L, U, R, D = drawing['rect'].irect
        L, R = min(L, R), max(L, R)
        U, D = min(U, D), max(U, D)
        if (L, U, R, D) not in seen_rects:
            seen_rects.add((L, U, R, D))
            svgs.append(drawing)

    # Integer rectangles as reported (not normalised), indexed like ``svgs``.
    svg_rects = []
    for drawing in svgs:
        L, U, R, D = drawing['rect'].irect
        svg_rects.append((L, U, R, D))

    svg_childs = [[] for _ in svg_rects]    # svg_childs[i]: indices of rects contained in rect i
    svg_parents = [[] for _ in svg_rects]   # svg_parents[i]: indices of rects containing rect i
    svg_overlaps = [[] for _ in svg_rects]  # svg_overlaps[i]: indices of rects crossing rect i
    svg_visited = [False for _ in svg_rects]
    # Score describing how a rect relates to the page border: +1 per page edge it
    # reaches, or 4 when it sits inside a 20-unit margin yet covers >= 70% of the page.
    svg_exceedPage = [0 for _ in svg_rects]

    for i, (L, U, R, D) in enumerate(svg_rects):
        _, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L, U, R, D, pageL, pageU, pageR, pageD)
        if (pageL + 20 < L <= R < pageR - 20) and (pageU + 20 < U <= D < pageD - 20):
            if ratio_2 >= 0.7:
                svg_exceedPage[i] += 4
        else:
            if L <= pageL:
                svg_exceedPage[i] += 1
            if pageR <= R:
                svg_exceedPage[i] += 1
            if U <= pageU:
                svg_exceedPage[i] += 1
            if pageD <= D:
                svg_exceedPage[i] += 1

    # With two or more border-reaching rects the hand-written rules below are
    # not trusted: give up on vector drawings for this page.
    if sum(1 for x in svg_exceedPage if x >= 1) >= 2:
        svg_rects = []
        svg_childs = []
        svg_parents = []
        svg_overlaps = []
        svg_visited = []
        svg_exceedPage = []

    # Build the containment / overlap graph.
    for i, (L1, U1, R1, D1) in enumerate(svg_rects):
        for j, (L2, U2, R2, D2) in enumerate(svg_rects):
            if i == j:
                continue
            if check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                svg_childs[i].append(j)
                svg_parents[j].append(i)
            elif check_rect1_overlaps_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                svg_overlaps[i].append(j)

    # Determine the final svg boxes: the outline of each connected group.
    eps_ERROR = 5  # margin added around detected svg boxes (PyMuPDF rects can be imprecise)
    svg_final_bboxs = []
    svg_final_visited = []  # True once a box has been merged into another one

    # Visit rects from the largest area to the smallest.
    svg_idxs = sorted(
        range(len(svg_rects)),
        key=lambda i: -(svg_rects[i][2] - svg_rects[i][0]) * (svg_rects[i][3] - svg_rects[i][1]),
    )

    for i in svg_idxs:
        if svg_visited[i]:
            continue
        svg_visited[i] = True
        L, U, R, D = svg_rects[i]
        if check_rect_isLine(L, U, R, D):
            # A line may still join some other group later.
            svg_visited[i] = False
            continue

        # Number of elements in the region about to be judged as one svg
        # (the outermost frame itself is not counted).
        cur_block_element_cnt = 0
        if not svg_parents[i]:
            # Top-level rect (not contained in any other rect).
            cur_block_element_cnt += len(svg_childs[i])
            if svg_exceedPage[i] == 0:
                # It may already lie inside a previously accepted box.
                if any(pL <= L <= R <= pR and pU <= U <= D <= pD for pL, pU, pR, pD in svg_final_bboxs):
                    continue

                # Breadth-first search over overlapping rects, growing the outline.
                q = collections.deque(svg_overlaps[i])
                while q:
                    j = q.popleft()
                    svg_visited[j] = True
                    L2, U2, R2, D2 = svg_rects[j]
                    L = min(L, L2)
                    R = max(R, R2)
                    U = min(U, U2)
                    D = max(D, D2)
                    cur_block_element_cnt += 1
                    cur_block_element_cnt += len(svg_childs[j])
                    for k in svg_overlaps[j]:
                        if not svg_visited[k] and svg_exceedPage[k] == 0:
                            svg_visited[k] = True
                            q.append(k)
            elif svg_exceedPage[i] <= 2:
                # NOTE(review): pairing of this branch with the ``== 0`` test above is
                # reconstructed from de-indented source — confirm.
                # It may already lie inside a previously accepted box.
                if any(sL <= L <= R <= sR and sU <= U <= D <= sD for sL, sU, sR, sD in svg_final_bboxs):
                    continue

                # The rect reaches the page border: use the outline of its
                # not-yet-used, non-border children instead of the rect itself.
                L, U, R, D = pageR, pageD, pageL, pageU
                for j in svg_childs[i]:
                    if svg_visited[j]:
                        continue
                    if svg_exceedPage[j] >= 1:
                        continue
                    svg_visited[j] = True
                    L2, U2, R2, D2 = svg_rects[j]
                    L = min(L, L2)
                    R = max(R, R2)
                    U = min(U, U2)
                    D = max(D, D2)
                    cur_block_element_cnt += 1

        # A line-like outline is not worth keeping.
        if check_rect_isLine(L, U, R, D):
            continue
        # Fewer than 3 elements: not considered a figure.
        if cur_block_element_cnt < 3:
            continue

        # A box enclosing many text spans is probably not a figure.
        contain_textLineBlock_cnt = 0
        for L2, U2, R2, D2 in textLine_blocks:
            if check_rect1_contains_rect2(L, U, R, D, L2, U2, R2, D2):
                contain_textLineBlock_cnt += 1
        if contain_textLineBlock_cnt >= 10:
            continue

        svg_final_bboxs.append((L, U, R, D))
        svg_final_visited.append(False)

    # Accepted svg boxes may still contain or neighbour each other: merge further.
    for i in range(len(svg_final_bboxs)):
        L1, U1, R1, D1 = svg_final_bboxs[i]
        for j in range(i + 1, len(svg_final_bboxs)):
            L2, U2, R2, D2 = svg_final_bboxs[j]
            if check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                svg_final_visited[j] = True
                continue

            merge = False
            # Side by side: similar vertical extent and a horizontal gap below 20.
            ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(U1, D1, U2, D2)
            if ratio_1 >= 0.7 and ratio_2 >= 0.7:
                if abs(L2 - R1) >= 20:
                    continue
                merge = True
            else:
                # Stacked: similar horizontal extent and a vertical gap below 20.
                # (Compares [L1, R1] with [L2, R2]; passing R2 for both was a typo.)
                ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R1, L2, R2)
                if ratio_1 >= 0.7 and ratio_2 >= 0.7:
                    if abs(U2 - D1) >= 20:
                        continue
                    merge = True

            if merge:
                # Keep working with the grown box so that a later merge does not
                # discard the extent absorbed here.
                L1, U1, R1, D1 = min(L1, L2), min(U1, U2), max(R1, R2), max(D1, D2)
                svg_final_bboxs[i] = (L1, U1, R1, D1)
                svg_final_visited[j] = True

    # Surviving svg boxes, each stored both as-is and padded by eps_ERROR.
    # NOTE(review): storing both variants looks accidental — confirm intent.
    svg_final_bboxs_2 = []
    for (L, U, R, D), merged_away in zip(svg_final_bboxs, svg_final_visited):
        if merged_away:
            continue
        svg_final_bboxs_2.append((L, U, R, D))
        svg_final_bboxs_2.append((L - eps_ERROR * 2, U - eps_ERROR, R + eps_ERROR * 2, D + eps_ERROR))

    # ------------------- figure boxes from the DocXChain layout JSON ------------------- #
    figure_bbox_from_DocXChain = []
    figure_from_DocXChain_visited = []               # True once matched by an img / svg box
    figure_bbox_from_DocXChain_overlappedRatio = []  # accumulated partial coverage

    xf_json = json_from_DocXchain_obj
    width_from_json = xf_json['page_info']['width']
    height_from_json = xf_json['page_info']['height']
    # Scale factors from DocXChain page coordinates to fitz page coordinates.
    LR_scaleRatio = width_from_json / (pageR - pageL)
    UD_scaleRatio = height_from_json / (pageD - pageU)

    for xf in xf_json['layout_dets']:
        # Category ids: {0: 'title', 1: 'figure', 2: 'plain text', 3: 'header', 4: 'page number',
        # 5: 'footnote', 6: 'footer', 7: 'table', 8: 'table caption', 9: 'figure caption',
        # 10: 'equation', 11: 'full column', 12: 'sub column'}
        # presumably 'poly' is a flattened 4-point polygon (x0, y0, x1, y1, x2, y2, ...) — verify
        L = xf['poly'][0] / LR_scaleRatio
        U = xf['poly'][1] / UD_scaleRatio
        R = xf['poly'][2] / LR_scaleRatio
        D = xf['poly'][5] / UD_scaleRatio
        L, R = min(L, R), max(L, R)
        U, D = min(U, D), max(U, D)
        if xf["category_id"] == 1 and xf['score'] >= 0.3:
            figure_bbox_from_DocXChain.append((L, U, R, D))
            figure_from_DocXChain_visited.append(False)
            figure_bbox_from_DocXChain_overlappedRatio.append(0.0)

    # --------- match DocXChain figures against the img / svg boxes found above --------- #
    # Against raster images.
    for i, (L1, U1, R1, D1) in enumerate(figure_bbox_from_DocXChain):
        s1 = abs(R1 - L1) * abs(D1 - U1)
        for L2, U2, R2, D2 in img_bboxs:
            s2 = abs(R2 - L2) * abs(D2 - U2)
            if check_rect1_sameWith_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                figure_from_DocXChain_visited[i] = True
            elif check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                if s2 / s1 > 0.8:
                    figure_from_DocXChain_visited[i] = True
            elif check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1):
                if s1 / s2 > 0.8:
                    figure_from_DocXChain_visited[i] = True
            else:
                # Neither contains the other: look at the overlap proportion.
                ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
                if (ratio_1 >= 0.6 and ratio_2 >= 0.6) or (ratio_1 >= 0.8 and s1 / s2 > 0.8) or (ratio_2 >= 0.8 and s2 / s1 > 0.8):
                    figure_from_DocXChain_visited[i] = True
                else:
                    figure_bbox_from_DocXChain_overlappedRatio[i] += ratio_1

    # Against svg boxes.
    svg_final_bboxs_2_badIdxs = set()
    for i, (L1, U1, R1, D1) in enumerate(figure_bbox_from_DocXChain):
        s1 = abs(R1 - L1) * abs(D1 - U1)
        for j, (L2, U2, R2, D2) in enumerate(svg_final_bboxs_2):
            s2 = abs(R2 - L2) * abs(D2 - U2)
            if check_rect1_sameWith_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                figure_from_DocXChain_visited[i] = True
            elif check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                figure_from_DocXChain_visited[i] = True
            elif check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1):
                if s1 / s2 > 0.7:
                    figure_from_DocXChain_visited[i] = True
                else:
                    # The svg box is much larger than the figure: distrust the svg box.
                    svg_final_bboxs_2_badIdxs.add(j)
            else:
                ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
                if (ratio_1 >= 0.5 and ratio_2 >= 0.5) or (min(ratio_1, ratio_2) >= 0.4 and max(ratio_1, ratio_2) >= 0.6):
                    figure_from_DocXChain_visited[i] = True
                else:
                    figure_bbox_from_DocXChain_overlappedRatio[i] += ratio_1

    # Drop the distrusted svg boxes.
    svg_final_bboxs_2 = [b for j, b in enumerate(svg_final_bboxs_2) if j not in svg_final_bboxs_2_badIdxs]

    # A figure mostly covered by several partial matches counts as matched too.
    for i, overlapped in enumerate(figure_bbox_from_DocXChain_overlappedRatio):
        if overlapped >= 0.7:
            figure_from_DocXChain_visited[i] = True

    # Figures found only by DocXChain.
    figure_only_from_DocXChain_bboxs = [
        bbox
        for bbox, matched in zip(figure_bbox_from_DocXChain, figure_from_DocXChain_visited)
        if not matched
    ]

    # NOTE(review): the unpadded, unfiltered ``svg_final_bboxs`` is what has always been
    # combined here, which leaves the padded/filtered ``svg_final_bboxs_2`` unused beyond
    # the matching step above. This looks like a typo, but switching lists changes the
    # output, so the existing behaviour is kept — confirm the intended list.
    curPage_all_fig_bboxs = img_bboxs + svg_final_bboxs + figure_only_from_DocXChain_bboxs

    # ------------------------------- final de-duplication ------------------------------- #
    curPage_all_fig_bboxs.sort(key=lambda LURD: ((LURD[2] - LURD[0]) * (LURD[3] - LURD[1]), LURD[0], LURD[1]))
    # Collapse exact duplicates first; otherwise two identical boxes "contain"
    # each other below and both copies would be discarded.
    curPage_all_fig_bboxs = list(dict.fromkeys(curPage_all_fig_bboxs))

    # Pass 1: drop boxes that are contained in (or mostly covered by) another box.
    final_duplicate = set()
    for i, box_1 in enumerate(curPage_all_fig_bboxs):
        L1, U1, R1, D1 = box_1
        for j, box_2 in enumerate(curPage_all_fig_bboxs):
            if i == j:
                continue
            L2, U2, R2, D2 = box_2
            if check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1):
                final_duplicate.add(box_1)
            else:
                ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
                if ratio_1 >= 0.8 and ratio_2 <= 0.6:
                    final_duplicate.add(box_1)
    curPage_all_fig_bboxs = [LURD for LURD in curPage_all_fig_bboxs if LURD not in final_duplicate]

    # Pass 2: replace pairs of overlapping boxes by their union.
    final_duplicate = set()
    final_synthetic_bboxs = []
    for i, box_1 in enumerate(curPage_all_fig_bboxs):
        L1, U1, R1, D1 = box_1
        for j, box_2 in enumerate(curPage_all_fig_bboxs):
            if i == j:
                continue
            if _figure_bboxes_should_merge(box_1, box_2):
                L2, U2, R2, D2 = box_2
                final_duplicate.add(box_1)
                final_duplicate.add(box_2)
                final_synthetic_bboxs.append((min(L1, L2), min(U1, U2), max(R1, R2), max(D1, D2)))
    curPage_all_fig_bboxs = [b for b in curPage_all_fig_bboxs if b not in final_duplicate]
    final_synthetic_bboxs = list(set(final_synthetic_bboxs))

    # Pass 3: the unions may overlap again; fold them into each other pairwise.
    image_bboxes = [[b[0], b[1], b[2], b[3]] for b in final_synthetic_bboxs]
    droped_img_idx = set()
    for i in range(len(image_bboxes)):
        if i in droped_img_idx:
            # Already folded into an earlier box; anything merged here would be lost.
            continue
        for j in range(i + 1, len(image_bboxes)):
            if j in droped_img_idx:
                continue
            # Always compare against the current (possibly grown) box i.
            if _figure_bboxes_should_merge(image_bboxes[i], image_bboxes[j]):
                image_bboxes[i] = [
                    min(image_bboxes[i][0], image_bboxes[j][0]),
                    min(image_bboxes[i][1], image_bboxes[j][1]),
                    max(image_bboxes[i][2], image_bboxes[j][2]),
                    max(image_bboxes[i][3], image_bboxes[j][3]),
                ]
                droped_img_idx.add(j)
    new_images = [bbox for i, bbox in enumerate(image_bboxes) if i not in droped_img_idx]

    images1 = [[img[0], img[1], img[2], img[3]] for img in curPage_all_fig_bboxs]
    images = images1 + new_images
    return images
|