detect_images.py 30 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650
  1. import os
  2. import collections # 统计库
  3. import re
  4. from libs.boxbase import _is_in_or_part_overlap # 正则
  5. from libs.commons import fitz # pyMuPDF库
  6. import json # json
  7. #--------------------------------------- Tool Functions --------------------------------------#
  8. # 正则化,输入文本,输出只保留a-z,A-Z,0-9
  9. def remove_special_chars(s: str) -> str:
  10. pattern = r"[^a-zA-Z0-9]"
  11. res = re.sub(pattern, "", s)
  12. return res
  13. def check_rect1_sameWith_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> bool:
  14. # 判断rect1和rect2是否一模一样
  15. return L1 == L2 and U1 == U2 and R1 == R2 and D1 == D2
  16. def check_rect1_contains_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> bool:
  17. # 判断rect1包含了rect2
  18. return (L1 <= L2 <= R2 <= R1) and (U1 <= U2 <= D2 <= D1)
  19. def check_rect1_overlaps_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> bool:
  20. # 判断rect1与rect2是否存在重叠(只有一条边重叠,也算重叠)
  21. return max(L1, L2) <= min(R1, R2) and max(U1, U2) <= min(D1, D2)
  22. def calculate_overlapRatio_between_rect1_and_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> (float, float):
  23. # 计算两个rect,重叠面积各占2个rect面积的比例
  24. if min(R1, R2) < max(L1, L2) or min(D1, D2) < max(U1, U2):
  25. return 0, 0
  26. square_1 = (R1 - L1) * (D1 - U1)
  27. square_2 = (R2 - L2) * (D2 - U2)
  28. if square_1 == 0 or square_2 == 0:
  29. return 0, 0
  30. square_overlap = (min(R1, R2) - max(L1, L2)) * (min(D1, D2) - max(U1, U2))
  31. return square_overlap / square_1, square_overlap / square_2
  32. def calculate_overlapRatio_between_line1_and_line2(L1: float, R1: float, L2: float, R2: float) -> (float, float):
  33. # 计算两个line,重叠区间各占2个line长度的比例
  34. if max(L1, L2) > min(R1, R2):
  35. return 0, 0
  36. if L1 == R1 or L2 == R2:
  37. return 0, 0
  38. overlap_line = min(R1, R2) - max(L1, L2)
  39. return overlap_line / (R1 - L1), overlap_line / (R2 - L2)
  40. # 判断rect其实是一条line
  41. def check_rect_isLine(L: float, U: float, R: float, D: float) -> bool:
  42. width = R - L
  43. height = D - U
  44. if width <= 3 or height <= 3:
  45. return True
  46. if width / height >= 30 or height / width >= 30:
  47. return True
  48. def parse_images(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, junk_img_bojids=[]):
  49. """
  50. :param page_ID: int类型,当前page在当前pdf文档中是第page_D页。
  51. :param page :fitz读取的当前页的内容
  52. :param res_dir_path: str类型,是每一个pdf文档,在当前.py文件的目录下生成一个与pdf文档同名的文件夹,res_dir_path就是文件夹的dir
  53. :param json_from_DocXchain_obj: dict类型,把pdf文档送入DocXChain模型中后,提取bbox,结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict
  54. """
  55. #### 通过fitz获取page信息
  56. ## 超越边界
  57. DPI = 72 # use this resolution
  58. pix = page.get_pixmap(dpi=DPI)
  59. pageL = 0
  60. pageR = int(pix.w)
  61. pageU = 0
  62. pageD = int(pix.h)
  63. #----------------- 保存每一个文本块的LURD ------------------#
  64. textLine_blocks = []
  65. blocks = page.get_text(
  66. "dict",
  67. flags=fitz.TEXTFLAGS_TEXT,
  68. #clip=clip,
  69. )["blocks"]
  70. for i in range(len(blocks)):
  71. bbox = blocks[i]['bbox']
  72. # print(bbox)
  73. for tt in blocks[i]['lines']:
  74. # 当前line
  75. cur_line_bbox = None # 当前line,最右侧的section的bbox
  76. for xf in tt['spans']:
  77. L, U, R, D = xf['bbox']
  78. L, R = min(L, R), max(L, R)
  79. U, D = min(U, D), max(U, D)
  80. textLine_blocks.append((L, U, R, D))
  81. textLine_blocks.sort(key = lambda LURD: (LURD[1], LURD[0]))
  82. #---------------------------------------------- 保存img --------------------------------------------------#
  83. raw_imgs = page.get_images() # 获取所有的图片
  84. imgs = []
  85. img_names = [] # 保存图片的名字,方便在md中插入引用
  86. img_bboxs = [] # 保存图片的location信息。
  87. img_visited = [] # 记忆化,记录该图片是否在md中已经插入过了
  88. img_ID = 0
  89. ## 获取、保存每张img的location信息(x1, y1, x2, y2, UL, DR坐标)
  90. for i in range(len(raw_imgs)):
  91. # 如果图片在junklist中则跳过
  92. if raw_imgs[i][0] in junk_img_bojids:
  93. continue
  94. else:
  95. try:
  96. tt = page.get_image_rects(raw_imgs[i][0], transform = True)
  97. rec = tt[0][0]
  98. L, U, R, D = int(rec[0]), int(rec[1]), int(rec[2]), int(rec[3])
  99. L, R = min(L, R), max(L, R)
  100. U, D = min(U, D), max(U, D)
  101. if not(pageL <= L < R <= pageR and pageU <= U < D <= pageD):
  102. continue
  103. if pageL == L and R == pageR:
  104. continue
  105. if pageU == U and D == pageD:
  106. continue
  107. # pix1 = page.get_Pixmap(clip=(L,U,R,D))
  108. new_img_name = "{}_{}.png".format(page_ID, i) # 图片name
  109. # pix1.save(res_dir_path + '/' + new_img_name) # 把图片存出在新建的文件夹,并命名
  110. img_names.append(new_img_name)
  111. img_bboxs.append((L, U, R, D))
  112. img_visited.append(False)
  113. imgs.append(raw_imgs[i])
  114. except:
  115. continue
  116. #-------- 如果img之间有重叠。说明获取的img大小有问题,位置也不一定对。就扔掉--------#
  117. imgs_ok = [True for _ in range(len(imgs))]
  118. for i in range(len(imgs)):
  119. L1, U1, R1, D1 = img_bboxs[i]
  120. for j in range(i + 1, len(imgs)):
  121. L2, U2, R2, D2 = img_bboxs[j]
  122. ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
  123. s1 = abs(R1 - L1) * abs(D1 - U1)
  124. s2 = abs(R2 - L2) * abs(D2 - U2)
  125. if ratio_1 > 0 and ratio_2 > 0:
  126. if ratio_1 == 1 and ratio_2 > 0.8:
  127. imgs_ok[i] = False
  128. elif ratio_1 > 0.8 and ratio_2 == 1:
  129. imgs_ok[j] = False
  130. elif s1 > 20000 and s2 > 20000 and ratio_1 > 0.4 and ratio_2 > 0.4:
  131. imgs_ok[i] = False
  132. imgs_ok[j] = False
  133. elif s1 / s2 > 5 and ratio_2 > 0.5:
  134. imgs_ok[j] = False
  135. elif s2 / s1 > 5 and ratio_1 > 0.5:
  136. imgs_ok[i] = False
  137. imgs = [imgs[i] for i in range(len(imgs)) if imgs_ok[i] == True]
  138. img_names = [img_names[i] for i in range(len(imgs)) if imgs_ok[i] == True]
  139. img_bboxs = [img_bboxs[i] for i in range(len(imgs)) if imgs_ok[i] == True]
  140. img_visited = [img_visited[i] for i in range(len(imgs)) if imgs_ok[i] == True]
  141. #*******************************************************************************#
  142. #---------------------------------------- 通过fitz提取svg的信息 -----------------------------------------#
  143. #
  144. svgs = page.get_drawings()
  145. #------------ preprocess, check一些大框,看是否是合理的 ----------#
  146. ## 去重。有时候会遇到rect1和rect2是完全一样的情形。
  147. svg_rect_visited = set()
  148. available_svgIdx = []
  149. for i in range(len(svgs)):
  150. L, U, R, D = svgs[i]['rect'].irect
  151. L, R = min(L, R), max(L, R)
  152. U, D = min(U, D), max(U, D)
  153. tt = (L, U, R, D)
  154. if tt not in svg_rect_visited:
  155. svg_rect_visited.add(tt)
  156. available_svgIdx.append(i)
  157. svgs = [svgs[i] for i in available_svgIdx] # 去重后,有效的svgs
  158. svg_childs = [[] for _ in range(len(svgs))]
  159. svg_parents = [[] for _ in range(len(svgs))]
  160. svg_overlaps = [[] for _ in range(len(svgs))] #svg_overlaps[i]是一个list,存的是与svg_i有重叠的svg的index。e.g., svg_overlaps[0] = [1, 2, 7, 9]
  161. svg_visited = [False for _ in range(len(svgs))]
  162. svg_exceedPage = [0 for _ in range(len(svgs))] # 是否超越边界(artbox),很大,但一般是一个svg的底。
  163. for i in range(len(svgs)):
  164. L, U, R, D = svgs[i]['rect'].irect
  165. ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L, U, R, D, pageL, pageU, pageR, pageD)
  166. if (pageL + 20 < L <= R < pageR - 20) and (pageU + 20 < U <= D < pageD - 20):
  167. if ratio_2 >= 0.7:
  168. svg_exceedPage[i] += 4
  169. else:
  170. if L <= pageL:
  171. svg_exceedPage[i] += 1
  172. if pageR <= R:
  173. svg_exceedPage[i] += 1
  174. if U <= pageU:
  175. svg_exceedPage[i] += 1
  176. if pageD <= D:
  177. svg_exceedPage[i] += 1
  178. #### 如果有≥2个的超边界的框,就不要手写规则判断svg了。很难写对。
  179. if len([x for x in svg_exceedPage if x >= 1]) >= 2:
  180. svgs = []
  181. svg_childs = []
  182. svg_parents = []
  183. svg_overlaps = []
  184. svg_visited = []
  185. svg_exceedPage = []
  186. #---------------------------- build graph ----------------------------#
  187. for i, p in enumerate(svgs):
  188. L1, U1, R1, D1 = svgs[i]["rect"].irect
  189. for j in range(len(svgs)):
  190. if i == j:
  191. continue
  192. L2, U2, R2, D2 = svgs[j]["rect"].irect
  193. ## 包含
  194. if check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True:
  195. svg_childs[i].append(j)
  196. svg_parents[j].append(i)
  197. else:
  198. ## 交叉
  199. if check_rect1_overlaps_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True:
  200. svg_overlaps[i].append(j)
  201. #---------------- 确定最终的svg。连通块儿的外围 -------------------#
  202. eps_ERROR = 5 # 给识别出的svg,四周留白(为了防止pyMuPDF的rect不准)
  203. svg_ID = 0
  204. svg_final_names = []
  205. svg_final_bboxs = []
  206. svg_final_visited = [] # 为下面,text识别左准备。作用同img_visited
  207. svg_idxs = [i for i in range(len(svgs))]
  208. svg_idxs.sort(key = lambda i: -(svgs[i]['rect'].irect[2] - svgs[i]['rect'].irect[0]) * (svgs[i]['rect'].irect[3] - svgs[i]['rect'].irect[1])) # 按照面积,从大到小排序
  209. for i in svg_idxs:
  210. if svg_visited[i] == True:
  211. continue
  212. svg_visited[i] = True
  213. L, U, R, D = svgs[i]['rect'].irect
  214. width = R - L
  215. height = D - U
  216. if check_rect_isLine(L, U, R, D) == True:
  217. svg_visited[i] = False
  218. continue
  219. # if i == 4:
  220. # print(i, L, U, R, D)
  221. # print(svg_parents[i])
  222. cur_block_element_cnt = 0 # 当前要判定为svg的区域中,有多少elements,最外围的最大svg框除外。
  223. if len(svg_parents[i]) == 0:
  224. ## 是个普通框的情形
  225. cur_block_element_cnt += len(svg_childs[i])
  226. if svg_exceedPage[i] == 0:
  227. ## 误差。可能已经包含在某个框里面了
  228. neglect_flag = False
  229. for pL, pU, pR, pD in svg_final_bboxs:
  230. if pL <= L <= R <= pR and pU <= U <= D <= pD:
  231. neglect_flag = True
  232. break
  233. if neglect_flag == True:
  234. continue
  235. ## 搜索连通域, bfs+记忆化
  236. q = collections.deque()
  237. for j in svg_overlaps[i]:
  238. q.append(j)
  239. while q:
  240. j = q.popleft()
  241. svg_visited[j] = True
  242. L2, U2, R2, D2 = svgs[j]['rect'].irect
  243. # width2 = R2 - L2
  244. # height2 = D2 - U2
  245. # if width2 <= 2 or height2 <= 2 or (height2 / width2) >= 30 or (width2 / height2) >= 30:
  246. # continue
  247. L = min(L, L2)
  248. R = max(R, R2)
  249. U = min(U, U2)
  250. D = max(D, D2)
  251. cur_block_element_cnt += 1
  252. cur_block_element_cnt += len(svg_childs[j])
  253. for k in svg_overlaps[j]:
  254. if svg_visited[k] == False and svg_exceedPage[k] == 0:
  255. svg_visited[k] = True
  256. q.append(k)
  257. elif svg_exceedPage[i] <= 2:
  258. ## 误差。可能已经包含在某个svg_final_bbox框里面了
  259. neglect_flag = False
  260. for sL, sU, sR, sD in svg_final_bboxs:
  261. if sL <= L <= R <= sR and sU <= U <= D <= sD:
  262. neglect_flag = True
  263. break
  264. if neglect_flag == True:
  265. continue
  266. L, U, R, D = pageR, pageD, pageL, pageU
  267. ## 所有孩子元素的最大边界
  268. for j in svg_childs[i]:
  269. if svg_visited[j] == True:
  270. continue
  271. if svg_exceedPage[j] >= 1:
  272. continue
  273. svg_visited[j] = True #### 这个位置考虑一下
  274. L2, U2, R2, D2 = svgs[j]['rect'].irect
  275. L = min(L, L2)
  276. R = max(R, R2)
  277. U = min(U, U2)
  278. D = max(D, D2)
  279. cur_block_element_cnt += 1
  280. # 如果是条line,就不用保存了
  281. if check_rect_isLine(L, U, R, D) == True:
  282. continue
  283. # 如果当前的svg,连2个elements都没有,就不用保存了
  284. if cur_block_element_cnt < 3:
  285. continue
  286. ## 当前svg,框住了多少文本框。如果框多了,可能就是错了
  287. contain_textLineBlock_cnt = 0
  288. for L2, U2, R2, D2 in textLine_blocks:
  289. if check_rect1_contains_rect2(L, U, R, D, L2, U2, R2, D2) == True:
  290. contain_textLineBlock_cnt += 1
  291. if contain_textLineBlock_cnt >= 10:
  292. continue
  293. # L -= eps_ERROR * 2
  294. # U -= eps_ERROR
  295. # R += eps_ERROR * 2
  296. # D += eps_ERROR
  297. # # cur_svg = page.get_pixmap(matrix=fitz.Identity, dpi=None, colorspace=fitz.csRGB, clip=(U,L,R,D), alpha=False, annots=True)
  298. # cur_svg = page.get_pixmap(clip=(L,U,R,D))
  299. new_svg_name = "svg_{}_{}.png".format(page_ID, svg_ID) # 图片name
  300. # cur_svg.save(res_dir_path + '/' + new_svg_name) # 把图片存出在新建的文件夹,并命名
  301. svg_final_names.append(new_svg_name) # 把图片的名字存在list中,方便在md中插入引用
  302. svg_final_bboxs.append((L, U, R, D))
  303. svg_final_visited.append(False)
  304. svg_ID += 1
  305. ## 识别出的svg,可能有 包含,相邻的情形。需要进一步合并
  306. svg_idxs = [i for i in range(len(svg_final_bboxs))]
  307. svg_idxs.sort(key = lambda i: (svg_final_bboxs[i][1], svg_final_bboxs[i][0])) # (U, L)
  308. svg_final_names_2 = []
  309. svg_final_bboxs_2 = []
  310. svg_final_visited_2 = [] # 为下面,text识别左准备。作用同img_visited
  311. svg_ID_2 = 0
  312. for i in range(len(svg_final_bboxs)):
  313. L1, U1, R1, D1 = svg_final_bboxs[i]
  314. for j in range(i + 1, len(svg_final_bboxs)):
  315. L2, U2, R2, D2 = svg_final_bboxs[j]
  316. # 如果 rect1包含了rect2
  317. if check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True:
  318. svg_final_visited[j] = True
  319. continue
  320. # 水平并列
  321. ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(U1, D1, U2, D2)
  322. if ratio_1 >= 0.7 and ratio_2 >= 0.7:
  323. if abs(L2 - R1) >= 20:
  324. continue
  325. LL = min(L1, L2)
  326. UU = min(U1, U2)
  327. RR = max(R1, R2)
  328. DD = max(D1, D2)
  329. svg_final_bboxs[i] = (LL, UU, RR, DD)
  330. svg_final_visited[j] = True
  331. continue
  332. # 竖直并列
  333. ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R2, L2, R2)
  334. if ratio_1 >= 0.7 and ratio_2 >= 0.7:
  335. if abs(U2 - D1) >= 20:
  336. continue
  337. LL = min(L1, L2)
  338. UU = min(U1, U2)
  339. RR = max(R1, R2)
  340. DD = max(D1, D2)
  341. svg_final_bboxs[i] = (LL, UU, RR, DD)
  342. svg_final_visited[j] = True
  343. for i in range(len(svg_final_bboxs)):
  344. if svg_final_visited[i] == False:
  345. L, U, R, D = svg_final_bboxs[i]
  346. svg_final_bboxs_2.append((L, U, R, D))
  347. L -= eps_ERROR * 2
  348. U -= eps_ERROR
  349. R += eps_ERROR * 2
  350. D += eps_ERROR
  351. # cur_svg = page.get_pixmap(clip=(L,U,R,D))
  352. new_svg_name = "svg_{}_{}.png".format(page_ID, svg_ID_2) # 图片name
  353. # cur_svg.save(res_dir_path + '/' + new_svg_name) # 把图片存出在新建的文件夹,并命名
  354. svg_final_names_2.append(new_svg_name) # 把图片的名字存在list中,方便在md中插入引用
  355. svg_final_bboxs_2.append((L, U, R, D))
  356. svg_final_visited_2.append(False)
  357. svg_ID_2 += 1
  358. ## svg收尾。识别为drawing,但是在上面没有拼成一张图的。
  359. # 有收尾才comprehensive
  360. # xxxx
  361. # xxxx
  362. # xxxx
  363. # xxxx
  364. #--------- 通过json_from_DocXchain来获取,figure, table, equation的bbox ---------#
  365. figure_bbox_from_DocXChain = []
  366. figure_from_DocXChain_visited = [] # 记忆化
  367. figure_bbox_from_DocXChain_overlappedRatio = []
  368. figure_only_from_DocXChain_bboxs = [] # 存储
  369. figure_only_from_DocXChain_names = []
  370. figure_only_from_DocXChain_visited = []
  371. figure_only_ID = 0
  372. xf_json = json_from_DocXchain_obj
  373. width_from_json = xf_json['page_info']['width']
  374. height_from_json = xf_json['page_info']['height']
  375. LR_scaleRatio = width_from_json / (pageR - pageL)
  376. UD_scaleRatio = height_from_json / (pageD - pageU)
  377. for xf in xf_json['layout_dets']:
  378. # {0: 'title', 1: 'figure', 2: 'plain text', 3: 'header', 4: 'page number', 5: 'footnote', 6: 'footer', 7: 'table', 8: 'table caption', 9: 'figure caption', 10: 'equation', 11: 'full column', 12: 'sub column'}
  379. L = xf['poly'][0] / LR_scaleRatio
  380. U = xf['poly'][1] / UD_scaleRatio
  381. R = xf['poly'][2] / LR_scaleRatio
  382. D = xf['poly'][5] / UD_scaleRatio
  383. # L += pageL # 有的页面,artBox偏移了。不在(0,0)
  384. # R += pageL
  385. # U += pageU
  386. # D += pageU
  387. L, R = min(L, R), max(L, R)
  388. U, D = min(U, D), max(U, D)
  389. # figure
  390. if xf["category_id"] == 1 and xf['score'] >= 0.3:
  391. figure_bbox_from_DocXChain.append((L, U, R, D))
  392. figure_from_DocXChain_visited.append(False)
  393. figure_bbox_from_DocXChain_overlappedRatio.append(0.0)
  394. #---------------------- 比对上面识别出来的img,svg 与DocXChain给的figure -----------------------#
  395. ## 比对imgs
  396. for i, b1 in enumerate(figure_bbox_from_DocXChain):
  397. # print('--------- DocXChain的图片', b1)
  398. L1, U1, R1, D1 = b1
  399. for b2 in img_bboxs:
  400. # print('-------- igms得到的图', b2)
  401. L2, U2, R2, D2 = b2
  402. s1 = abs(R1 - L1) * abs(D1 - U1)
  403. s2 = abs(R2 - L2) * abs(D2 - U2)
  404. # 相同
  405. if check_rect1_sameWith_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True:
  406. figure_from_DocXChain_visited[i] = True
  407. # 包含
  408. elif check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True:
  409. if s2 / s1 > 0.8:
  410. figure_from_DocXChain_visited[i] = True
  411. elif check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1) == True:
  412. if s1 / s2 > 0.8:
  413. figure_from_DocXChain_visited[i] = True
  414. else:
  415. # 重叠了相当一部分
  416. # print('进入第3部分')
  417. ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
  418. if (ratio_1 >= 0.6 and ratio_2 >= 0.6) or (ratio_1 >= 0.8 and s1/s2>0.8) or (ratio_2 >= 0.8 and s2/s1>0.8):
  419. figure_from_DocXChain_visited[i] = True
  420. else:
  421. figure_bbox_from_DocXChain_overlappedRatio[i] += ratio_1
  422. # print('图片的重叠率是{}'.format(ratio_1))
  423. ## 比对svgs
  424. svg_final_bboxs_2_badIdxs = []
  425. for i, b1 in enumerate(figure_bbox_from_DocXChain):
  426. L1, U1, R1, D1 = b1
  427. for j, b2 in enumerate(svg_final_bboxs_2):
  428. L2, U2, R2, D2 = b2
  429. s1 = abs(R1 - L1) * abs(D1 - U1)
  430. s2 = abs(R2 - L2) * abs(D2 - U2)
  431. # 相同
  432. if check_rect1_sameWith_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True:
  433. figure_from_DocXChain_visited[i] = True
  434. # 包含
  435. elif check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True:
  436. figure_from_DocXChain_visited[i] = True
  437. elif check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1) == True:
  438. if s1 / s2 > 0.7:
  439. figure_from_DocXChain_visited[i] = True
  440. else:
  441. svg_final_bboxs_2_badIdxs.append(j) # svg丢弃。用DocXChain的结果。
  442. else:
  443. # 重叠了相当一部分
  444. ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
  445. if (ratio_1 >= 0.5 and ratio_2 >= 0.5) or (min(ratio_1, ratio_2) >= 0.4 and max(ratio_1, ratio_2) >= 0.6):
  446. figure_from_DocXChain_visited[i] = True
  447. else:
  448. figure_bbox_from_DocXChain_overlappedRatio[i] += ratio_1
  449. # 丢掉错误的svg
  450. svg_final_bboxs_2 = [svg_final_bboxs_2[i] for i in range(len(svg_final_bboxs_2)) if i not in set(svg_final_bboxs_2_badIdxs)]
  451. for i in range(len(figure_from_DocXChain_visited)):
  452. if figure_bbox_from_DocXChain_overlappedRatio[i] >= 0.7:
  453. figure_from_DocXChain_visited[i] = True
  454. # DocXChain识别出来的figure,但是没被保存的。
  455. for i in range(len(figure_from_DocXChain_visited)):
  456. if figure_from_DocXChain_visited[i] == False:
  457. figure_from_DocXChain_visited[i] = True
  458. cur_bbox = figure_bbox_from_DocXChain[i]
  459. # cur_figure = page.get_pixmap(clip=cur_bbox)
  460. new_figure_name = "figure_only_{}_{}.png".format(page_ID, figure_only_ID) # 图片name
  461. # cur_figure.save(res_dir_path + '/' + new_figure_name) # 把图片存出在新建的文件夹,并命名
  462. figure_only_from_DocXChain_names.append(new_figure_name) # 把图片的名字存在list中,方便在md中插入引用
  463. figure_only_from_DocXChain_bboxs.append(cur_bbox)
  464. figure_only_from_DocXChain_visited.append(False)
  465. figure_only_ID += 1
  466. img_bboxs.sort(key = lambda LURD: (LURD[1], LURD[0]))
  467. svg_final_bboxs_2.sort(key = lambda LURD: (LURD[1], LURD[0]))
  468. figure_only_from_DocXChain_bboxs.sort(key = lambda LURD: (LURD[1], LURD[0]))
  469. curPage_all_fig_bboxs = img_bboxs + svg_final_bboxs + figure_only_from_DocXChain_bboxs
  470. #--------------------------- 最后统一去重 -----------------------------------#
  471. curPage_all_fig_bboxs.sort(key = lambda LURD: ( (LURD[2]-LURD[0])*(LURD[3]-LURD[1]) , LURD[0], LURD[1]) )
  472. #### 先考虑包含关系的小块
  473. final_duplicate = set()
  474. for i in range(len(curPage_all_fig_bboxs)):
  475. L1, U1, R1, D1 = curPage_all_fig_bboxs[i]
  476. for j in range(len(curPage_all_fig_bboxs)):
  477. if i == j:
  478. continue
  479. L2, U2, R2, D2 = curPage_all_fig_bboxs[j]
  480. s1 = abs(R1 - L1) * abs(D1 - U1)
  481. s2 = abs(R2 - L2) * abs(D2 - U2)
  482. if check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1) == True:
  483. final_duplicate.add((L1, U1, R1, D1))
  484. else:
  485. ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
  486. if ratio_1 >= 0.8 and ratio_2 <= 0.6:
  487. final_duplicate.add((L1, U1, R1, D1))
  488. curPage_all_fig_bboxs = [LURD for LURD in curPage_all_fig_bboxs if LURD not in final_duplicate]
  489. #### 再考虑重叠关系的块
  490. final_duplicate = set()
  491. final_synthetic_bboxs = []
  492. for i in range(len(curPage_all_fig_bboxs)):
  493. L1, U1, R1, D1 = curPage_all_fig_bboxs[i]
  494. for j in range(len(curPage_all_fig_bboxs)):
  495. if i == j:
  496. continue
  497. L2, U2, R2, D2 = curPage_all_fig_bboxs[j]
  498. s1 = abs(R1 - L1) * abs(D1 - U1)
  499. s2 = abs(R2 - L2) * abs(D2 - U2)
  500. ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
  501. union_ok = False
  502. if (ratio_1 >= 0.8 and ratio_2 <= 0.6) or (ratio_1 > 0.6 and ratio_2 > 0.6):
  503. union_ok = True
  504. if (ratio_1 > 0.2 and s2 / s1 > 5):
  505. union_ok = True
  506. if (L1 <= (L2+R2)/2 <= R1) and (U1 <= (U2+D2)/2 <= D1):
  507. union_ok = True
  508. if (L2 <= (L1+R1)/2 <= R2) and (U2 <= (U1+D1)/2 <= D2):
  509. union_ok = True
  510. if union_ok == True:
  511. final_duplicate.add((L1, U1, R1, D1))
  512. final_duplicate.add((L2, U2, R2, D2))
  513. L3, U3, R3, D3 = min(L1, L2), min(U1, U2), max(R1, R2), max(D1, D2)
  514. final_synthetic_bboxs.append((L3, U3, R3, D3))
  515. # print('---------- curPage_all_fig_bboxs ---------')
  516. # print(curPage_all_fig_bboxs)
  517. curPage_all_fig_bboxs = [b for b in curPage_all_fig_bboxs if b not in final_duplicate]
  518. final_synthetic_bboxs = list(set(final_synthetic_bboxs))
  519. ## 再再考虑重叠关系。极端情况下会迭代式地2进1
  520. new_images = []
  521. droped_img_idx = []
  522. image_bboxes = [[b[0], b[1], b[2], b[3]] for b in final_synthetic_bboxs]
  523. for i in range(0, len(image_bboxes)):
  524. for j in range(i+1, len(image_bboxes)):
  525. if j not in droped_img_idx:
  526. L2, U2, R2, D2 = image_bboxes[j]
  527. s1 = abs(R1 - L1) * abs(D1 - U1)
  528. s2 = abs(R2 - L2) * abs(D2 - U2)
  529. ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
  530. union_ok = False
  531. if (ratio_1 >= 0.8 and ratio_2 <= 0.6) or (ratio_1 > 0.6 and ratio_2 > 0.6):
  532. union_ok = True
  533. if (ratio_1 > 0.2 and s2 / s1 > 5):
  534. union_ok = True
  535. if (L1 <= (L2+R2)/2 <= R1) and (U1 <= (U2+D2)/2 <= D1):
  536. union_ok = True
  537. if (L2 <= (L1+R1)/2 <= R2) and (U2 <= (U1+D1)/2 <= D2):
  538. union_ok = True
  539. if union_ok == True:
  540. # 合并
  541. image_bboxes[i][0], image_bboxes[i][1],image_bboxes[i][2],image_bboxes[i][3] = min(image_bboxes[i][0], image_bboxes[j][0]), min(image_bboxes[i][1], image_bboxes[j][1]), max(image_bboxes[i][2], image_bboxes[j][2]), max(image_bboxes[i][3], image_bboxes[j][3])
  542. droped_img_idx.append(j)
  543. for i in range(0, len(image_bboxes)):
  544. if i not in droped_img_idx:
  545. new_images.append(image_bboxes[i])
  546. # find_union_FLAG = True
  547. # while find_union_FLAG == True:
  548. # find_union_FLAG = False
  549. # final_duplicate = set()
  550. # tmp = []
  551. # for i in range(len(final_synthetic_bboxs)):
  552. # L1, U1, R1, D1 = final_synthetic_bboxs[i]
  553. # for j in range(len(final_synthetic_bboxs)):
  554. # if i == j:
  555. # continue
  556. # L2, U2, R2, D2 = final_synthetic_bboxs[j]
  557. # s1 = abs(R1 - L1) * abs(D1 - U1)
  558. # s2 = abs(R2 - L2) * abs(D2 - U2)
  559. # ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
  560. # union_ok = False
  561. # if (ratio_1 >= 0.8 and ratio_2 <= 0.6) or (ratio_1 > 0.6 and ratio_2 > 0.6):
  562. # union_ok = True
  563. # if (ratio_1 > 0.2 and s2 / s1 > 5):
  564. # union_ok = True
  565. # if (L1 <= (L2+R2)/2 <= R1) and (U1 <= (U2+D2)/2 <= D1):
  566. # union_ok = True
  567. # if (L2 <= (L1+R1)/2 <= R2) and (U2 <= (U1+D1)/2 <= D2):
  568. # union_ok = True
  569. # if union_ok == True:
  570. # find_union_FLAG = True
  571. # final_duplicate.add((L1, U1, R1, D1))
  572. # final_duplicate.add((L2, U2, R2, D2))
  573. # L3, U3, R3, D3 = min(L1, L2), min(U1, U2), max(R1, R2), max(D1, D2)
  574. # tmp.append((L3, U3, R3, D3))
  575. # if find_union_FLAG == True:
  576. # tmp = list(set(tmp))
  577. # final_synthetic_bboxs = tmp[:]
  578. # curPage_all_fig_bboxs += final_synthetic_bboxs
  579. # print('--------- final synthetic')
  580. # print(final_synthetic_bboxs)
  581. #**************************************************************************#
  582. images1 = [[img[0], img[1], img[2], img[3]] for img in curPage_all_fig_bboxs]
  583. images = images1 + new_images
  584. return images