detect_images.py 30 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647
  1. import collections # 统计库
  2. import re
  3. from magic_pdf.libs.commons import fitz # pyMuPDF库
  4. #--------------------------------------- Tool Functions --------------------------------------#
  def remove_special_chars(s: str) -> str:
      """Return *s* with every character outside ``a-z``, ``A-Z`` and ``0-9`` removed."""
      return re.sub(r"[^a-zA-Z0-9]", "", s)
  def check_rect1_sameWith_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> bool:
      """Return True when rect1 and rect2 have exactly the same four coordinates.

      Each rectangle is given as left, upper, right, down (L, U, R, D).
      """
      return (L1, U1, R1, D1) == (L2, U2, R2, D2)
  def check_rect1_contains_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> bool:
      """Return True when rect2 lies inside rect1 (shared edges count as inside).

      Each rectangle is given as left, upper, right, down (L, U, R, D). The
      chained comparison also requires ``L2 <= R2`` and ``U2 <= D2``, so an
      un-normalised rect2 is never reported as contained.
      """
      horizontally_inside = L1 <= L2 <= R2 <= R1
      vertically_inside = U1 <= U2 <= D2 <= D1
      return horizontally_inside and vertically_inside
  def check_rect1_overlaps_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> bool:
      """Return True when rect1 and rect2 intersect.

      Each rectangle is given as left, upper, right, down (L, U, R, D).
      Rectangles that merely share an edge or a corner count as overlapping.
      """
      overlaps_horizontally = max(L1, L2) <= min(R1, R2)
      overlaps_vertically = max(U1, U2) <= min(D1, D2)
      return overlaps_horizontally and overlaps_vertically
  def calculate_overlapRatio_between_rect1_and_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> "tuple[float, float]":
      """Return the overlap area as a fraction of each rectangle's own area.

      Each rectangle is given as left, upper, right, down (L, U, R, D).

      Returns:
          ``(overlap / area(rect1), overlap / area(rect2))``. ``(0, 0)`` is
          returned when the rectangles do not intersect or when either
          rectangle has zero area.
      """
      overlap_L, overlap_R = max(L1, L2), min(R1, R2)
      overlap_U, overlap_D = max(U1, U2), min(D1, D2)
      if overlap_R < overlap_L or overlap_D < overlap_U:
          return 0, 0

      square_1 = (R1 - L1) * (D1 - U1)
      square_2 = (R2 - L2) * (D2 - U2)
      # Degenerate rectangles would otherwise divide by zero below.
      if square_1 == 0 or square_2 == 0:
          return 0, 0

      square_overlap = (overlap_R - overlap_L) * (overlap_D - overlap_U)
      return square_overlap / square_1, square_overlap / square_2
  def calculate_overlapRatio_between_line1_and_line2(L1: float, R1: float, L2: float, R2: float) -> "tuple[float, float]":
      """Return the overlap length as a fraction of each 1-D interval's own length.

      Each interval is given by its two end points (L, R).

      Returns:
          ``(overlap / len(line1), overlap / len(line2))``. ``(0, 0)`` is
          returned when the intervals do not intersect or when either interval
          has zero length.
      """
      overlap_start = max(L1, L2)
      overlap_end = min(R1, R2)
      if overlap_start > overlap_end:
          return 0, 0
      # Zero-length intervals would otherwise divide by zero below.
      if L1 == R1 or L2 == R2:
          return 0, 0
      overlap_line = overlap_end - overlap_start
      return overlap_line / (R1 - L1), overlap_line / (R2 - L2)
  def check_rect_isLine(L: float, U: float, R: float, D: float) -> bool:
      """Return True when the rectangle is really just a line.

      A rectangle (left, upper, right, down) is treated as a line when either
      side is at most 3 units long, or when one side is at least 30 times the
      other. Otherwise False is returned.
      """
      width = R - L
      height = D - U
      # Checked first: this also keeps both divisions below away from zero.
      if width <= 3 or height <= 3:
          return True
      if width / height >= 30 or height / width >= 30:
          return True
      return False
  def _should_union(box1, box2) -> bool:
      """Return True when two (L, U, R, D) figure boxes overlap enough to be merged."""
      L1, U1, R1, D1 = box1
      L2, U2, R2, D2 = box2
      s1 = abs(R1 - L1) * abs(D1 - U1)
      s2 = abs(R2 - L2) * abs(D2 - U2)
      ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
      if (ratio_1 >= 0.8 and ratio_2 <= 0.6) or (ratio_1 > 0.6 and ratio_2 > 0.6):
          return True
      # ratio_1 > 0.2 is only possible when box1 has a non-zero area, so the
      # division is safe thanks to short-circuit evaluation.
      if ratio_1 > 0.2 and s2 / s1 > 5:
          return True
      # The centre of one box lies inside the other box.
      if (L1 <= (L2 + R2) / 2 <= R1) and (U1 <= (U2 + D2) / 2 <= D1):
          return True
      if (L2 <= (L1 + R1) / 2 <= R2) and (U2 <= (U1 + D1) / 2 <= D2):
          return True
      return False


  def _collect_text_span_bboxes(page):
      """Return the normalised (L, U, R, D) box of every text span on the page, sorted by (U, L)."""
      blocks = page.get_text(
          "dict",
          flags=fitz.TEXTFLAGS_TEXT,
      )["blocks"]
      span_bboxes = []
      for block in blocks:
          for line in block['lines']:
              for span in line['spans']:
                  L, U, R, D = span['bbox']
                  span_bboxes.append((min(L, R), min(U, D), max(L, R), max(U, D)))
      span_bboxes.sort(key=lambda LURD: (LURD[1], LURD[0]))
      return span_bboxes


  def _collect_image_bboxes(page, junk_img_bojids, page_rect):
      """Return the integer (L, U, R, D) boxes of embedded images that survive the sanity filters."""
      pageL, pageU, pageR, pageD = page_rect
      img_bboxs = []
      for raw_img in page.get_images():
          img_key = raw_img[0]
          # Images listed as junk by the caller are ignored.
          if img_key in junk_img_bojids:
              continue
          try:
              rec = page.get_image_rects(img_key, transform=True)[0][0]
              L, U, R, D = int(rec[0]), int(rec[1]), int(rec[2]), int(rec[3])
          except Exception:
              # Best effort: an image whose placement cannot be resolved is skipped.
              continue
          L, R = min(L, R), max(L, R)
          U, D = min(U, D), max(U, D)
          # Must be a non-empty box that lies inside the page ...
          if not (pageL <= L < R <= pageR and pageU <= U < D <= pageD):
              continue
          # ... and must not span the full page width or the full page height.
          if pageL == L and R == pageR:
              continue
          if pageU == U and D == pageD:
              continue
          img_bboxs.append((L, U, R, D))

      # Overlapping images indicate unreliable sizes/positions: drop the suspicious ones.
      keep = [True] * len(img_bboxs)
      for i in range(len(img_bboxs)):
          L1, U1, R1, D1 = img_bboxs[i]
          for j in range(i + 1, len(img_bboxs)):
              L2, U2, R2, D2 = img_bboxs[j]
              ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
              if not (ratio_1 > 0 and ratio_2 > 0):
                  continue
              # Both areas are non-zero here because both ratios are positive.
              s1 = abs(R1 - L1) * abs(D1 - U1)
              s2 = abs(R2 - L2) * abs(D2 - U2)
              if ratio_1 == 1 and ratio_2 > 0.8:
                  keep[i] = False
              elif ratio_1 > 0.8 and ratio_2 == 1:
                  keep[j] = False
              elif s1 > 20000 and s2 > 20000 and ratio_1 > 0.4 and ratio_2 > 0.4:
                  keep[i] = False
                  keep[j] = False
              elif s1 / s2 > 5 and ratio_2 > 0.5:
                  keep[j] = False
              elif s2 / s1 > 5 and ratio_1 > 0.5:
                  keep[i] = False
      # Filter against the full-length flag list so no surviving box is lost.
      return [bbox for bbox, ok in zip(img_bboxs, keep) if ok]


  def _collect_svg_bboxes(page, page_rect, textLine_blocks):
      """Group the page's vector drawings into figure boxes.

      Returns ``(svg_final_bboxs, svg_final_bboxs_2)``. The first list holds
      every grouped box (boxes that absorbed a neighbour are enlarged in
      place). The second holds, for each box that was not absorbed by another
      one, the box itself followed by a copy padded by ``eps_ERROR``.
      """
      pageL, pageU, pageR, pageD = page_rect
      eps_ERROR = 5  # margin added around a detected box, in case the reported rect is imprecise

      # Drop drawings whose normalised rectangle repeats an earlier one.
      seen = set()
      rects = []
      for drawing in page.get_drawings():
          L, U, R, D = drawing['rect'].irect
          key = (min(L, R), min(U, D), max(L, R), max(U, D))
          if key not in seen:
              seen.add(key)
              rects.append((L, U, R, D))

      # exceed[i]: +1 per page edge that rect i reaches, or +4 when the rect stays
      # 20 units clear of every edge yet still overlaps at least 70% of the page.
      exceed = [0] * len(rects)
      for i, (L, U, R, D) in enumerate(rects):
          _, page_ratio = calculate_overlapRatio_between_rect1_and_rect2(L, U, R, D, pageL, pageU, pageR, pageD)
          if (pageL + 20 < L <= R < pageR - 20) and (pageU + 20 < U <= D < pageD - 20):
              if page_ratio >= 0.7:
                  exceed[i] += 4
          else:
              if L <= pageL:
                  exceed[i] += 1
              if pageR <= R:
                  exceed[i] += 1
              if U <= pageU:
                  exceed[i] += 1
              if pageD <= D:
                  exceed[i] += 1

      # With two or more border-reaching frames the hand-written rules below
      # are not trusted: ignore the vector drawings of this page altogether.
      if len([x for x in exceed if x >= 1]) >= 2:
          rects = []
          exceed = []

      # Containment / overlap graph between the drawings.
      count = len(rects)
      childs = [[] for _ in range(count)]
      parents = [[] for _ in range(count)]
      overlaps = [[] for _ in range(count)]
      for i in range(count):
          L1, U1, R1, D1 = rects[i]
          for j in range(count):
              if i == j:
                  continue
              L2, U2, R2, D2 = rects[j]
              if check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                  childs[i].append(j)
                  parents[j].append(i)
              elif check_rect1_overlaps_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                  overlaps[i].append(j)

      # Grow one figure box per connected group, visiting the largest drawings first.
      visited = [False] * count
      svg_final_bboxs = []
      order = sorted(range(count), key=lambda i: -(rects[i][2] - rects[i][0]) * (rects[i][3] - rects[i][1]))
      for i in order:
          if visited[i]:
              continue
          visited[i] = True
          L, U, R, D = rects[i]
          if check_rect_isLine(L, U, R, D):
              visited[i] = False
              continue

          # Number of elements inside the candidate region, not counting the outermost frame.
          element_cnt = 0
          if len(parents[i]) == 0:
              element_cnt += len(childs[i])
              # NOTE(review): the source this was rebuilt from had lost its
              # indentation; the `elif` below is paired with this inner `if`
              # (both under "no parents") - confirm against the canonical file.
              if exceed[i] == 0:
                  # Already covered by a box accepted earlier.
                  if any(pL <= L <= R <= pR and pU <= U <= D <= pD for pL, pU, pR, pD in svg_final_bboxs):
                      continue
                  # Breadth-first search over the overlap graph.
                  queue = collections.deque(overlaps[i])
                  while queue:
                      j = queue.popleft()
                      visited[j] = True
                      L2, U2, R2, D2 = rects[j]
                      L, U, R, D = min(L, L2), min(U, U2), max(R, R2), max(D, D2)
                      element_cnt += 1
                      element_cnt += len(childs[j])
                      for k in overlaps[j]:
                          if not visited[k] and exceed[k] == 0:
                              visited[k] = True
                              queue.append(k)
              elif exceed[i] <= 2:
                  # Already covered by a box accepted earlier.
                  if any(sL <= L <= R <= sR and sU <= U <= D <= sD for sL, sU, sR, sD in svg_final_bboxs):
                      continue
                  # The frame itself reaches the page border, so use the bounding
                  # box of its not-yet-used children instead. Starting from an
                  # inverted box makes the first child define the extent.
                  L, U, R, D = pageR, pageD, pageL, pageU
                  for j in childs[i]:
                      if visited[j]:
                          continue
                      if exceed[j] >= 1:
                          continue
                      visited[j] = True
                      L2, U2, R2, D2 = rects[j]
                      L, U, R, D = min(L, L2), min(U, U2), max(R, R2), max(D, D2)
                      element_cnt += 1

          # A line is not a figure.
          if check_rect_isLine(L, U, R, D):
              continue
          # Too few elements to be a drawing worth keeping.
          if element_cnt < 3:
              continue
          # A box that swallows many text spans is most likely a false positive.
          contained_text_cnt = 0
          for L2, U2, R2, D2 in textLine_blocks:
              if check_rect1_contains_rect2(L, U, R, D, L2, U2, R2, D2):
                  contained_text_cnt += 1
          if contained_text_cnt >= 10:
              continue
          svg_final_bboxs.append((L, U, R, D))

      # Detected boxes may contain or sit right next to each other: merge them.
      absorbed = [False] * len(svg_final_bboxs)
      for i in range(len(svg_final_bboxs)):
          # NOTE(review): these coordinates are not refreshed after box i has been
          # enlarged below, so later comparisons still use the initial box i.
          L1, U1, R1, D1 = svg_final_bboxs[i]
          for j in range(i + 1, len(svg_final_bboxs)):
              L2, U2, R2, D2 = svg_final_bboxs[j]
              if check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                  absorbed[j] = True
                  continue
              merged = (min(L1, L2), min(U1, U2), max(R1, R2), max(D1, D2))
              # Side by side: similar vertical extent and a horizontal gap below 20.
              ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(U1, D1, U2, D2)
              if ratio_1 >= 0.7 and ratio_2 >= 0.7:
                  if abs(L2 - R1) >= 20:
                      continue
                  svg_final_bboxs[i] = merged
                  absorbed[j] = True
                  continue
              # Stacked: similar horizontal extent and a vertical gap below 20.
              ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R1, L2, R2)
              if ratio_1 >= 0.7 and ratio_2 >= 0.7:
                  if abs(U2 - D1) >= 20:
                      continue
                  svg_final_bboxs[i] = merged
                  absorbed[j] = True

      svg_final_bboxs_2 = []
      for i, (L, U, R, D) in enumerate(svg_final_bboxs):
          if absorbed[i]:
              continue
          # NOTE(review): both the raw and the padded box are stored - confirm
          # that the raw entry is intended and not a leftover.
          svg_final_bboxs_2.append((L, U, R, D))
          svg_final_bboxs_2.append((L - eps_ERROR * 2, U - eps_ERROR, R + eps_ERROR * 2, D + eps_ERROR))
      return svg_final_bboxs, svg_final_bboxs_2


  def _collect_docxchain_figure_bboxes(json_from_DocXchain_obj, page_rect):
      """Return the figure boxes of the layout result, rescaled to page coordinates."""
      pageL, pageU, pageR, pageD = page_rect
      LR_scaleRatio = json_from_DocXchain_obj['page_info']['width'] / (pageR - pageL)
      UD_scaleRatio = json_from_DocXchain_obj['page_info']['height'] / (pageD - pageU)
      figure_bboxes = []
      for det in json_from_DocXchain_obj['layout_dets']:
          poly = det['poly']
          L = poly[0] / LR_scaleRatio
          U = poly[1] / UD_scaleRatio
          R = poly[2] / LR_scaleRatio
          D = poly[5] / UD_scaleRatio
          L, R = min(L, R), max(L, R)
          U, D = min(U, D), max(U, D)
          # category_id 1 is 'figure' in the layout model's label map.
          if det["category_id"] == 1 and det['score'] >= 0.3:
              figure_bboxes.append((L, U, R, D))
      return figure_bboxes


  def _unmatched_docxchain_figures(figure_bboxes, img_bboxs, svg_bboxs):
      """Return the layout-model figure boxes not already explained by an image or drawing box."""
      matched = [False] * len(figure_bboxes)
      overlapped_ratio = [0.0] * len(figure_bboxes)

      for i, (L1, U1, R1, D1) in enumerate(figure_bboxes):
          s1 = abs(R1 - L1) * abs(D1 - U1)
          # Compare against the embedded images.
          for L2, U2, R2, D2 in img_bboxs:
              s2 = abs(R2 - L2) * abs(D2 - U2)
              if check_rect1_sameWith_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                  matched[i] = True
              elif check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                  if s2 / s1 > 0.8:
                      matched[i] = True
              elif check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1):
                  if s1 / s2 > 0.8:
                      matched[i] = True
              else:
                  ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
                  if (ratio_1 >= 0.6 and ratio_2 >= 0.6) or (ratio_1 >= 0.8 and s1 / s2 > 0.8) or (ratio_2 >= 0.8 and s2 / s1 > 0.8):
                      matched[i] = True
                  else:
                      overlapped_ratio[i] += ratio_1
          # Compare against the vector-drawing boxes.
          for L2, U2, R2, D2 in svg_bboxs:
              s2 = abs(R2 - L2) * abs(D2 - U2)
              if check_rect1_sameWith_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                  matched[i] = True
              elif check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2):
                  matched[i] = True
              elif check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1):
                  if s1 / s2 > 0.7:
                      matched[i] = True
              else:
                  ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
                  if (ratio_1 >= 0.5 and ratio_2 >= 0.5) or (min(ratio_1, ratio_2) >= 0.4 and max(ratio_1, ratio_2) >= 0.6):
                      matched[i] = True
                  else:
                      overlapped_ratio[i] += ratio_1

      # A figure whose partial overlaps add up to 70% of its area also counts as matched.
      return [
          bbox
          for i, bbox in enumerate(figure_bboxes)
          if not matched[i] and overlapped_ratio[i] < 0.7
      ]


  def _dedupe_and_merge_figure_bboxes(all_fig_bboxs):
      """Drop contained boxes, union overlapping ones and return the result as [L, U, R, D] lists."""
      ordered = sorted(all_fig_bboxs, key=lambda LURD: ((LURD[2] - LURD[0]) * (LURD[3] - LURD[1]), LURD[0], LURD[1]))

      # Collapse exact duplicates first: two identical boxes contain each other,
      # so the containment pass below would otherwise discard both copies.
      seen = set()
      boxes = []
      for box in ordered:
          if box not in seen:
              seen.add(box)
              boxes.append(box)

      # Pass 1: drop boxes that are (mostly) covered by another box.
      covered = set()
      for i, (L1, U1, R1, D1) in enumerate(boxes):
          for j, (L2, U2, R2, D2) in enumerate(boxes):
              if i == j:
                  continue
              if check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1):
                  covered.add((L1, U1, R1, D1))
              else:
                  ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2)
                  if ratio_1 >= 0.8 and ratio_2 <= 0.6:
                      covered.add((L1, U1, R1, D1))
      boxes = [box for box in boxes if box not in covered]

      # Pass 2: replace every pair of overlapping boxes by their bounding box.
      merged_away = set()
      synthetic = []
      for i, box1 in enumerate(boxes):
          for j, box2 in enumerate(boxes):
              if i == j:
                  continue
              if _should_union(box1, box2):
                  merged_away.add(box1)
                  merged_away.add(box2)
                  synthetic.append((min(box1[0], box2[0]), min(box1[1], box2[1]), max(box1[2], box2[2]), max(box1[3], box2[3])))
      boxes = [box for box in boxes if box not in merged_away]
      synthetic = list(set(synthetic))

      # Pass 3: the pairwise unions may overlap each other in turn; fold them together.
      image_bboxes = [[b[0], b[1], b[2], b[3]] for b in synthetic]
      dropped = set()
      for i in range(len(image_bboxes)):
          # A box that was merged into an earlier one must not absorb further
          # boxes, otherwise their extent would disappear together with it.
          if i in dropped:
              continue
          for j in range(i + 1, len(image_bboxes)):
              if j in dropped:
                  continue
              # Box i is read inside the loop because it grows with every merge.
              box_i, box_j = image_bboxes[i], image_bboxes[j]
              if _should_union(box_i, box_j):
                  image_bboxes[i] = [min(box_i[0], box_j[0]), min(box_i[1], box_j[1]), max(box_i[2], box_j[2]), max(box_i[3], box_j[3])]
                  dropped.add(j)
      new_images = [box for i, box in enumerate(image_bboxes) if i not in dropped]

      return [[b[0], b[1], b[2], b[3]] for b in boxes] + new_images


  def parse_images(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, junk_img_bojids=None):
      """Detect the figure regions of one PDF page.

      Three sources are combined: embedded raster images, groups of vector
      drawings, and the 'figure' boxes of the layout-model result. The
      combined boxes are then de-duplicated and overlapping boxes are merged.

      :param page_ID: index of the page in the document. It does not influence
          the returned boxes.
      :param page: the page as read by fitz (PyMuPDF).
      :param json_from_DocXchain_obj: layout-model result for this page. Read
          keys: ``['page_info']['width']``, ``['page_info']['height']`` and
          ``['layout_dets']``, whose items provide ``'poly'``,
          ``'category_id'`` and ``'score'``.
      :param junk_img_bojids: identifiers (first field of the entries returned
          by ``page.get_images()``) of images to ignore. ``None`` means no
          image is ignored.
      :return: list of ``[L, U, R, D]`` boxes (left, upper, right, down).
      """
      if junk_img_bojids is None:
          junk_img_bojids = []

      # The page size is taken from a 72 dpi rendering of the page.
      DPI = 72
      pix = page.get_pixmap(dpi=DPI)
      page_rect = (0, 0, int(pix.w), int(pix.h))

      textLine_blocks = _collect_text_span_bboxes(page)
      img_bboxs = _collect_image_bboxes(page, junk_img_bojids, page_rect)
      svg_final_bboxs, svg_final_bboxs_2 = _collect_svg_bboxes(page, page_rect, textLine_blocks)

      figure_bbox_from_DocXChain = _collect_docxchain_figure_bboxes(json_from_DocXchain_obj, page_rect)
      figure_only_from_DocXChain_bboxs = _unmatched_docxchain_figures(
          figure_bbox_from_DocXChain, img_bboxs, svg_final_bboxs_2
      )

      # NOTE(review): layout figures are matched against svg_final_bboxs_2, but
      # the combined list is built from svg_final_bboxs (every grouped box,
      # including the ones absorbed by a neighbour) - confirm which list is
      # intended here.
      curPage_all_fig_bboxs = img_bboxs + svg_final_bboxs + figure_only_from_DocXChain_bboxs
      return _dedupe_and_merge_figure_bboxes(curPage_all_fig_bboxs)