bbox_sort.py 30 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681
  1. # 定义这里的bbox是一个list [x0, y0, x1, y1, block_content, idx_x, idx_y, content_type, ext_x0, ext_y0, ext_x1, ext_y1], 初始时候idx_x, idx_y都是None
  2. # 其中x0, y0代表左上角坐标,x1, y1代表右下角坐标,坐标原点在左上角。
  3. from magic_pdf.layout.layout_spiler_recog import get_spilter_of_page
  4. from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_vertical_full_overlap
  5. from magic_pdf.libs.commons import mymax
  6. X0_IDX = 0
  7. Y0_IDX = 1
  8. X1_IDX = 2
  9. Y1_IDX = 3
  10. CONTENT_IDX = 4
  11. IDX_X = 5
  12. IDX_Y = 6
  13. CONTENT_TYPE_IDX = 7
  14. X0_EXT_IDX = 8
  15. Y0_EXT_IDX = 9
  16. X1_EXT_IDX = 10
  17. Y1_EXT_IDX = 11
  18. def prepare_bboxes_for_layout_split(image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info, text_raw_blocks: dict, page_boundry, page):
  19. """
  20. text_raw_blocks:结构参考test/assets/papre/pymu_textblocks.json
  21. 把bbox重新组装成一个list,每个元素[x0, y0, x1, y1, block_content, idx_x, idx_y, content_type, ext_x0, ext_y0, ext_x1, ext_y1], 初始时候idx_x, idx_y都是None. 对于图片、公式来说,block_content是图片的地址, 对于段落来说,block_content是pymupdf里的block结构
  22. """
  23. all_bboxes = []
  24. for image in image_info:
  25. box = image['bbox']
  26. # 由于没有实现横向的栏切分,因此在这里先过滤掉一些小的图片。这些图片有可能影响layout,造成没有横向栏切分的情况下,layout切分不准确。例如 scihub_76500000/libgen.scimag76570000-76570999.zip_10.1186/s13287-019-1355-1
  27. # 把长宽都小于50的去掉
  28. if abs(box[0]-box[2]) < 50 and abs(box[1]-box[3]) < 50:
  29. continue
  30. all_bboxes.append([box[0], box[1], box[2], box[3], None, None, None, 'image', None, None, None, None])
  31. for table in table_info:
  32. box = table['bbox']
  33. all_bboxes.append([box[0], box[1], box[2], box[3], None, None, None, 'table', None, None, None, None])
  34. """由于公式与段落混合,因此公式不再参与layout划分,无需加入all_bboxes"""
  35. # 加入文本block
  36. text_block_temp = []
  37. for block in text_raw_blocks:
  38. bbox = block['bbox']
  39. text_block_temp.append([bbox[0], bbox[1], bbox[2], bbox[3], None, None, None, 'text', None, None, None, None])
  40. text_block_new = resolve_bbox_overlap_for_layout_det(text_block_temp)
  41. text_block_new = filter_lines_bbox(text_block_new) # 去掉线条bbox,有可能让layout探测陷入无限循环
  42. """找出会影响layout的色块、横向分割线"""
  43. spilter_bboxes = get_spilter_of_page(page, [b['bbox'] for b in image_info]+[b['bbox'] for b in image_backup_info], [b['bbox'] for b in table_info], )
  44. # 还要去掉存在于spilter_bboxes里的text_block
  45. if len(spilter_bboxes) > 0:
  46. text_block_new = [box for box in text_block_new if not any([_is_in_or_part_overlap(box[:4], spilter_bbox) for spilter_bbox in spilter_bboxes])]
  47. for bbox in text_block_new:
  48. all_bboxes.append([bbox[0], bbox[1], bbox[2], bbox[3], None, None, None, 'text', None, None, None, None])
  49. for bbox in spilter_bboxes:
  50. all_bboxes.append([bbox[0], bbox[1], bbox[2], bbox[3], None, None, None, 'spilter', None, None, None, None])
  51. return all_bboxes
  52. def resolve_bbox_overlap_for_layout_det(bboxes:list):
  53. """
  54. 1. 去掉bbox互相包含的,去掉被包含的
  55. 2. 上下方向上如果有重叠,就扩大大box范围,直到覆盖小box
  56. """
  57. def _is_in_other_bbox(i:int):
  58. """
  59. 判断i个box是否被其他box有所包含
  60. """
  61. for j in range(0, len(bboxes)):
  62. if j!=i and _is_in(bboxes[i][:4], bboxes[j][:4]):
  63. return True
  64. # elif j!=i and _is_bottom_full_overlap(bboxes[i][:4], bboxes[j][:4]):
  65. # return True
  66. return False
  67. # 首先去掉被包含的bbox
  68. new_bbox_1 = []
  69. for i in range(0, len(bboxes)):
  70. if not _is_in_other_bbox(i):
  71. new_bbox_1.append(bboxes[i])
  72. # 其次扩展大的box
  73. new_box = []
  74. new_bbox_2 = []
  75. len_1 = len(new_bbox_2)
  76. while True:
  77. merged_idx = []
  78. for i in range(0, len(new_bbox_1)):
  79. if i in merged_idx:
  80. continue
  81. for j in range(i+1, len(new_bbox_1)):
  82. if j in merged_idx:
  83. continue
  84. bx1 = new_bbox_1[i]
  85. bx2 = new_bbox_1[j]
  86. if i!=j and _is_vertical_full_overlap(bx1[:4], bx2[:4]):
  87. merged_box = min([bx1[0], bx2[0]]), min([bx1[1], bx2[1]]), max([bx1[2], bx2[2]]), max([bx1[3], bx2[3]])
  88. new_bbox_2.append(merged_box)
  89. merged_idx.append(i)
  90. merged_idx.append(j)
  91. for i in range(0, len(new_bbox_1)): # 没有合并的加入进来
  92. if i not in merged_idx:
  93. new_bbox_2.append(new_bbox_1[i])
  94. if len(new_bbox_2)==0 or len_1==len(new_bbox_2):
  95. break
  96. else:
  97. len_1 = len(new_bbox_2)
  98. new_box = new_bbox_2
  99. new_bbox_1, new_bbox_2 = new_bbox_2, []
  100. return new_box
  101. def filter_lines_bbox(bboxes: list):
  102. """
  103. 过滤掉bbox为空的行
  104. """
  105. new_box = []
  106. for box in bboxes:
  107. x0, y0, x1, y1 = box[0], box[1], box[2], box[3]
  108. if abs(x0-x1)<=1 or abs(y0-y1)<=1:
  109. continue
  110. else:
  111. new_box.append(box)
  112. return new_box
  113. ################################################################################
  114. # 第一种排序算法
  115. # 以下是基于延长线遮挡做的一个算法
  116. #
  117. ################################################################################
  118. def find_all_left_bbox(this_bbox, all_bboxes) -> list:
  119. """
  120. 寻找this_bbox左边的所有bbox
  121. """
  122. left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX]]
  123. return left_boxes
  124. def find_all_top_bbox(this_bbox, all_bboxes) -> list:
  125. """
  126. 寻找this_bbox上面的所有bbox
  127. """
  128. top_boxes = [box for box in all_bboxes if box[Y1_IDX] <= this_bbox[Y0_IDX]]
  129. return top_boxes
  130. def get_and_set_idx_x(this_bbox, all_bboxes) -> int:
  131. """
  132. 寻找this_bbox在all_bboxes中的遮挡深度 idx_x
  133. """
  134. if this_bbox[IDX_X] is not None:
  135. return this_bbox[IDX_X]
  136. else:
  137. all_left_bboxes = find_all_left_bbox(this_bbox, all_bboxes)
  138. if len(all_left_bboxes) == 0:
  139. this_bbox[IDX_X] = 0
  140. else:
  141. all_left_bboxes_idx = [get_and_set_idx_x(bbox, all_bboxes) for bbox in all_left_bboxes]
  142. max_idx_x = mymax(all_left_bboxes_idx)
  143. this_bbox[IDX_X] = max_idx_x + 1
  144. return this_bbox[IDX_X]
  145. def get_and_set_idx_y(this_bbox, all_bboxes) -> int:
  146. """
  147. 寻找this_bbox在all_bboxes中y方向的遮挡深度 idx_y
  148. """
  149. if this_bbox[IDX_Y] is not None:
  150. return this_bbox[IDX_Y]
  151. else:
  152. all_top_bboxes = find_all_top_bbox(this_bbox, all_bboxes)
  153. if len(all_top_bboxes) == 0:
  154. this_bbox[IDX_Y] = 0
  155. else:
  156. all_top_bboxes_idx = [get_and_set_idx_y(bbox, all_bboxes) for bbox in all_top_bboxes]
  157. max_idx_y = mymax(all_top_bboxes_idx)
  158. this_bbox[IDX_Y] = max_idx_y + 1
  159. return this_bbox[IDX_Y]
  160. def bbox_sort(all_bboxes: list):
  161. """
  162. 排序
  163. """
  164. all_bboxes_idx_x = [get_and_set_idx_x(bbox, all_bboxes) for bbox in all_bboxes]
  165. all_bboxes_idx_y = [get_and_set_idx_y(bbox, all_bboxes) for bbox in all_bboxes]
  166. all_bboxes_idx = [(idx_x, idx_y) for idx_x, idx_y in zip(all_bboxes_idx_x, all_bboxes_idx_y)]
  167. all_bboxes_idx = [idx_x_y[0] * 100000 + idx_x_y[1] for idx_x_y in all_bboxes_idx] # 变换成一个点,保证能够先X,X相同时按Y排序
  168. all_bboxes_idx = list(zip(all_bboxes_idx, all_bboxes))
  169. all_bboxes_idx.sort(key=lambda x: x[0])
  170. sorted_bboxes = [bbox for idx, bbox in all_bboxes_idx]
  171. return sorted_bboxes
  172. ################################################################################
  173. # 第二种排序算法
  174. # 下面的算法在计算idx_x和idx_y的时候不考虑延长线,而只考虑实际的长或者宽被遮挡的情况
  175. #
  176. ################################################################################
  177. def find_left_nearest_bbox(this_bbox, all_bboxes) -> list:
  178. """
  179. 在all_bboxes里找到所有右侧高度和this_bbox有重叠的bbox
  180. """
  181. left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX] and any([
  182. box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX], box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX],
  183. this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX], this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX],
  184. box[Y0_IDX]==this_bbox[Y0_IDX] and box[Y1_IDX]==this_bbox[Y1_IDX]])]
  185. # 然后再过滤一下,找到水平上距离this_bbox最近的那个
  186. if len(left_boxes) > 0:
  187. left_boxes.sort(key=lambda x: x[X1_IDX], reverse=True)
  188. left_boxes = [left_boxes[0]]
  189. else:
  190. left_boxes = []
  191. return left_boxes
  192. def get_and_set_idx_x_2(this_bbox, all_bboxes):
  193. """
  194. 寻找this_bbox在all_bboxes中的被直接遮挡的深度 idx_x
  195. 这个遮挡深度不考虑延长线,而是被实际的长或者宽遮挡的情况
  196. """
  197. if this_bbox[IDX_X] is not None:
  198. return this_bbox[IDX_X]
  199. else:
  200. left_nearest_bbox = find_left_nearest_bbox(this_bbox, all_bboxes)
  201. if len(left_nearest_bbox) == 0:
  202. this_bbox[IDX_X] = 0
  203. else:
  204. left_idx_x = get_and_set_idx_x_2(left_nearest_bbox[0], all_bboxes)
  205. this_bbox[IDX_X] = left_idx_x + 1
  206. return this_bbox[IDX_X]
  207. def find_top_nearest_bbox(this_bbox, all_bboxes) -> list:
  208. """
  209. 在all_bboxes里找到所有下侧宽度和this_bbox有重叠的bbox
  210. """
  211. top_boxes = [box for box in all_bboxes if box[Y1_IDX] <= this_bbox[Y0_IDX] and any([
  212. box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
  213. this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
  214. box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]
  215. # 然后再过滤一下,找到水平上距离this_bbox最近的那个
  216. if len(top_boxes) > 0:
  217. top_boxes.sort(key=lambda x: x[Y1_IDX], reverse=True)
  218. top_boxes = [top_boxes[0]]
  219. else:
  220. top_boxes = []
  221. return top_boxes
  222. def get_and_set_idx_y_2(this_bbox, all_bboxes):
  223. """
  224. 寻找this_bbox在all_bboxes中的被直接遮挡的深度 idx_y
  225. 这个遮挡深度不考虑延长线,而是被实际的长或者宽遮挡的情况
  226. """
  227. if this_bbox[IDX_Y] is not None:
  228. return this_bbox[IDX_Y]
  229. else:
  230. top_nearest_bbox = find_top_nearest_bbox(this_bbox, all_bboxes)
  231. if len(top_nearest_bbox) == 0:
  232. this_bbox[IDX_Y] = 0
  233. else:
  234. top_idx_y = get_and_set_idx_y_2(top_nearest_bbox[0], all_bboxes)
  235. this_bbox[IDX_Y] = top_idx_y + 1
  236. return this_bbox[IDX_Y]
  237. def paper_bbox_sort(all_bboxes: list, page_width, page_height):
  238. all_bboxes_idx_x = [get_and_set_idx_x_2(bbox, all_bboxes) for bbox in all_bboxes]
  239. all_bboxes_idx_y = [get_and_set_idx_y_2(bbox, all_bboxes) for bbox in all_bboxes]
  240. all_bboxes_idx = [(idx_x, idx_y) for idx_x, idx_y in zip(all_bboxes_idx_x, all_bboxes_idx_y)]
  241. all_bboxes_idx = [idx_x_y[0] * 100000 + idx_x_y[1] for idx_x_y in all_bboxes_idx] # 变换成一个点,保证能够先X,X相同时按Y排序
  242. all_bboxes_idx = list(zip(all_bboxes_idx, all_bboxes))
  243. all_bboxes_idx.sort(key=lambda x: x[0])
  244. sorted_bboxes = [bbox for idx, bbox in all_bboxes_idx]
  245. return sorted_bboxes
  246. ################################################################################
  247. """
  248. 第三种排序算法, 假设page的最左侧为X0,最右侧为X1,最上侧为Y0,最下侧为Y1
  249. 这个排序算法在第二种算法基础上增加对bbox的预处理步骤。预处理思路如下:
  250. 1. 首先在水平方向上对bbox进行扩展。扩展方法是:
  251. - 对每个bbox,找到其左边最近的bbox(也就是y方向有重叠),然后将其左边界扩展到左边最近bbox的右边界(x1+1),这里加1是为了避免重叠。如果没有左边的bbox,那么就将其左边界扩展到page的最左侧X0。
  252. - 对每个bbox,找到其右边最近的bbox(也就是y方向有重叠),然后将其右边界扩展到右边最近bbox的左边界(x0-1),这里减1是为了避免重叠。如果没有右边的bbox,那么就将其右边界扩展到page的最右侧X1。
  253. - 经过上面2个步骤,bbox扩展到了水平方向的最大范围。[左最近bbox.x1+1, 右最近bbox.x0-1]
  254. 2. 合并所有的连续水平方向的bbox, 合并方法是:
  255. - 对bbox进行y方向排序,然后从上到下遍历所有bbox,如果当前bbox和下一个bbox的x0, x1等于X0, X1,那么就合并这两个bbox。
  256. 3. 然后在垂直方向上对bbox进行扩展。扩展方法是:
  257. - 首先从page上切割掉合并后的水平bbox, 得到几个新的block
  258. 针对每个block
  259. - x0: 扎到位于左侧x=x0延长线的左侧所有的bboxes, 找到最大的x1,让x0=x1+1。如果没有,则x0=X0
  260. - x1: 找到位于右侧x=x1延长线右侧所有的bboxes, 找到最小的x0, 让x1=x0-1。如果没有,则x1=X1
  261. 随后在垂直方向上合并所有的连续的block,方法如下:
  262. - 对block进行x方向排序,然后从左到右遍历所有block,如果当前block和下一个block的x0, x1相等,那么就合并这两个block。
  263. 如果垂直切分后所有小bbox都被分配到了一个block, 那么分割就完成了。这些合并后的block打上标签'GOOD_LAYOUT’
  264. 如果在某个垂直方向上无法被完全分割到一个block,那么就将这个block打上标签'BAD_LAYOUT'。
  265. 至此完成,一个页面的预处理,天然的block要么属于'GOOD_LAYOUT',要么属于'BAD_LAYOUT'。针对含有'BAD_LAYOUT'的页面,可以先按照自上而下,自左到右进行天然排序,也可以先过滤掉这种书籍。
  266. (完成条件下次加强:进行水平方向切分,把混乱的layout部分尽可能切割出去)
  267. """
  268. ################################################################################
  269. def find_left_neighbor_bboxes(this_bbox, all_bboxes) -> list:
  270. """
  271. 在all_bboxes里找到所有右侧高度和this_bbox有重叠的bbox
  272. 这里使用扩展之后的bbox
  273. """
  274. left_boxes = [box for box in all_bboxes if box[X1_EXT_IDX] <= this_bbox[X0_EXT_IDX] and any([
  275. box[Y0_EXT_IDX] < this_bbox[Y0_EXT_IDX] < box[Y1_EXT_IDX], box[Y0_EXT_IDX] < this_bbox[Y1_EXT_IDX] < box[Y1_EXT_IDX],
  276. this_bbox[Y0_EXT_IDX] < box[Y0_EXT_IDX] < this_bbox[Y1_EXT_IDX], this_bbox[Y0_EXT_IDX] < box[Y1_EXT_IDX] < this_bbox[Y1_EXT_IDX],
  277. box[Y0_EXT_IDX]==this_bbox[Y0_EXT_IDX] and box[Y1_EXT_IDX]==this_bbox[Y1_EXT_IDX]])]
  278. # 然后再过滤一下,找到水平上距离this_bbox最近的那个
  279. if len(left_boxes) > 0:
  280. left_boxes.sort(key=lambda x: x[X1_EXT_IDX], reverse=True)
  281. left_boxes = left_boxes
  282. else:
  283. left_boxes = []
  284. return left_boxes
  285. def find_top_neighbor_bboxes(this_bbox, all_bboxes) -> list:
  286. """
  287. 在all_bboxes里找到所有下侧宽度和this_bbox有重叠的bbox
  288. 这里使用扩展之后的bbox
  289. """
  290. top_boxes = [box for box in all_bboxes if box[Y1_EXT_IDX] <= this_bbox[Y0_EXT_IDX] and any([
  291. box[X0_EXT_IDX] < this_bbox[X0_EXT_IDX] < box[X1_EXT_IDX], box[X0_EXT_IDX] < this_bbox[X1_EXT_IDX] < box[X1_EXT_IDX],
  292. this_bbox[X0_EXT_IDX] < box[X0_EXT_IDX] < this_bbox[X1_EXT_IDX], this_bbox[X0_EXT_IDX] < box[X1_EXT_IDX] < this_bbox[X1_EXT_IDX],
  293. box[X0_EXT_IDX]==this_bbox[X0_EXT_IDX] and box[X1_EXT_IDX]==this_bbox[X1_EXT_IDX]])]
  294. # 然后再过滤一下,找到水平上距离this_bbox最近的那个
  295. if len(top_boxes) > 0:
  296. top_boxes.sort(key=lambda x: x[Y1_EXT_IDX], reverse=True)
  297. top_boxes = top_boxes
  298. else:
  299. top_boxes = []
  300. return top_boxes
  301. def get_and_set_idx_x_2_ext(this_bbox, all_bboxes):
  302. """
  303. 寻找this_bbox在all_bboxes中的被直接遮挡的深度 idx_x
  304. 这个遮挡深度不考虑延长线,而是被实际的长或者宽遮挡的情况
  305. """
  306. if this_bbox[IDX_X] is not None:
  307. return this_bbox[IDX_X]
  308. else:
  309. left_nearest_bbox = find_left_neighbor_bboxes(this_bbox, all_bboxes)
  310. if len(left_nearest_bbox) == 0:
  311. this_bbox[IDX_X] = 0
  312. else:
  313. left_idx_x = [get_and_set_idx_x_2(b, all_bboxes) for b in left_nearest_bbox]
  314. this_bbox[IDX_X] = mymax(left_idx_x) + 1
  315. return this_bbox[IDX_X]
  316. def get_and_set_idx_y_2_ext(this_bbox, all_bboxes):
  317. """
  318. 寻找this_bbox在all_bboxes中的被直接遮挡的深度 idx_y
  319. 这个遮挡深度不考虑延长线,而是被实际的长或者宽遮挡的情况
  320. """
  321. if this_bbox[IDX_Y] is not None:
  322. return this_bbox[IDX_Y]
  323. else:
  324. top_nearest_bbox = find_top_neighbor_bboxes(this_bbox, all_bboxes)
  325. if len(top_nearest_bbox) == 0:
  326. this_bbox[IDX_Y] = 0
  327. else:
  328. top_idx_y = [get_and_set_idx_y_2_ext(b, all_bboxes) for b in top_nearest_bbox]
  329. this_bbox[IDX_Y] = mymax(top_idx_y) + 1
  330. return this_bbox[IDX_Y]
  331. def _paper_bbox_sort_ext(all_bboxes: list):
  332. all_bboxes_idx_x = [get_and_set_idx_x_2_ext(bbox, all_bboxes) for bbox in all_bboxes]
  333. all_bboxes_idx_y = [get_and_set_idx_y_2_ext(bbox, all_bboxes) for bbox in all_bboxes]
  334. all_bboxes_idx = [(idx_x, idx_y) for idx_x, idx_y in zip(all_bboxes_idx_x, all_bboxes_idx_y)]
  335. all_bboxes_idx = [idx_x_y[0] * 100000 + idx_x_y[1] for idx_x_y in all_bboxes_idx] # 变换成一个点,保证能够先X,X相同时按Y排序
  336. all_bboxes_idx = list(zip(all_bboxes_idx, all_bboxes))
  337. all_bboxes_idx.sort(key=lambda x: x[0])
  338. sorted_bboxes = [bbox for idx, bbox in all_bboxes_idx]
  339. return sorted_bboxes
  340. # ===============================================================================================
  341. def find_left_bbox_ext_line(this_bbox, all_bboxes) -> list:
  342. """
  343. 寻找this_bbox左边的所有bbox, 使用延长线
  344. """
  345. left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX]]
  346. if len(left_boxes):
  347. left_boxes.sort(key=lambda x: x[X1_IDX], reverse=True)
  348. left_boxes = left_boxes[0]
  349. else:
  350. left_boxes = None
  351. return left_boxes
  352. def find_right_bbox_ext_line(this_bbox, all_bboxes) -> list:
  353. """
  354. 寻找this_bbox右边的所有bbox, 使用延长线
  355. """
  356. right_boxes = [box for box in all_bboxes if box[X0_IDX] >= this_bbox[X1_IDX]]
  357. if len(right_boxes):
  358. right_boxes.sort(key=lambda x: x[X0_IDX])
  359. right_boxes = right_boxes[0]
  360. else:
  361. right_boxes = None
  362. return right_boxes
  363. # =============================================================================================
  364. def find_left_nearest_bbox_direct(this_bbox, all_bboxes) -> list:
  365. """
  366. 在all_bboxes里找到所有右侧高度和this_bbox有重叠的bbox, 不用延长线并且不能像
  367. """
  368. left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX] and any([
  369. box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX], box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX],
  370. this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX], this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX],
  371. box[Y0_IDX]==this_bbox[Y0_IDX] and box[Y1_IDX]==this_bbox[Y1_IDX]])]
  372. # 然后再过滤一下,找到水平上距离this_bbox最近的那个——x1最大的那个
  373. if len(left_boxes) > 0:
  374. left_boxes.sort(key=lambda x: x[X1_EXT_IDX] if x[X1_EXT_IDX] else x[X1_IDX], reverse=True)
  375. left_boxes = left_boxes[0]
  376. else:
  377. left_boxes = None
  378. return left_boxes
  379. def find_right_nearst_bbox_direct(this_bbox, all_bboxes) -> list:
  380. """
  381. 找到在this_bbox右侧且距离this_bbox距离最近的bbox.必须是直接遮挡的那种
  382. """
  383. right_bboxes = [box for box in all_bboxes if box[X0_IDX] >= this_bbox[X1_IDX] and any([
  384. this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX], this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX],
  385. box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX], box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX],
  386. box[Y0_IDX]==this_bbox[Y0_IDX] and box[Y1_IDX]==this_bbox[Y1_IDX]])]
  387. if len(right_bboxes)>0:
  388. right_bboxes.sort(key=lambda x: x[X0_EXT_IDX] if x[X0_EXT_IDX] else x[X0_IDX])
  389. right_bboxes = right_bboxes[0]
  390. else:
  391. right_bboxes = None
  392. return right_bboxes
  393. def reset_idx_x_y(all_boxes:list)->list:
  394. for box in all_boxes:
  395. box[IDX_X] = None
  396. box[IDX_Y] = None
  397. return all_boxes
  398. # ===================================================================================================
  399. def find_top_nearest_bbox_direct(this_bbox, bboxes_collection) -> list:
  400. """
  401. 找到在this_bbox上方且距离this_bbox距离最近的bbox.必须是直接遮挡的那种
  402. """
  403. top_bboxes = [box for box in bboxes_collection if box[Y1_IDX] <= this_bbox[Y0_IDX] and any([
  404. box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
  405. this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
  406. box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]
  407. # 然后再过滤一下,找到上方距离this_bbox最近的那个
  408. if len(top_bboxes) > 0:
  409. top_bboxes.sort(key=lambda x: x[Y1_IDX], reverse=True)
  410. top_bboxes = top_bboxes[0]
  411. else:
  412. top_bboxes = None
  413. return top_bboxes
  414. def find_bottom_nearest_bbox_direct(this_bbox, bboxes_collection) -> list:
  415. """
  416. 找到在this_bbox下方且距离this_bbox距离最近的bbox.必须是直接遮挡的那种
  417. """
  418. bottom_bboxes = [box for box in bboxes_collection if box[Y0_IDX] >= this_bbox[Y1_IDX] and any([
  419. box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
  420. this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
  421. box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]
  422. # 然后再过滤一下,找到水平上距离this_bbox最近的那个
  423. if len(bottom_bboxes) > 0:
  424. bottom_bboxes.sort(key=lambda x: x[Y0_IDX])
  425. bottom_bboxes = bottom_bboxes[0]
  426. else:
  427. bottom_bboxes = None
  428. return bottom_bboxes
  429. def find_boundry_bboxes(bboxes:list) -> tuple:
  430. """
  431. 找到bboxes的边界——找到所有bbox里最小的(x0, y0), 最大的(x1, y1)
  432. """
  433. x0, y0, x1, y1 = bboxes[0][X0_IDX], bboxes[0][Y0_IDX], bboxes[0][X1_IDX], bboxes[0][Y1_IDX]
  434. for box in bboxes:
  435. x0 = min(box[X0_IDX], x0)
  436. y0 = min(box[Y0_IDX], y0)
  437. x1 = max(box[X1_IDX], x1)
  438. y1 = max(box[Y1_IDX], y1)
  439. return x0, y0, x1, y1
  440. def extend_bbox_vertical(bboxes:list, boundry_x0, boundry_y0, boundry_x1, boundry_y1) -> list:
  441. """
  442. 在垂直方向上扩展能够直接垂直打通的bbox,也就是那些上下都没有其他box的bbox
  443. """
  444. for box in bboxes:
  445. top_nearest_bbox = find_top_nearest_bbox_direct(box, bboxes)
  446. bottom_nearest_bbox = find_bottom_nearest_bbox_direct(box, bboxes)
  447. if top_nearest_bbox is None and bottom_nearest_bbox is None: # 独占一列
  448. box[X0_EXT_IDX] = box[X0_IDX]
  449. box[Y0_EXT_IDX] = boundry_y0
  450. box[X1_EXT_IDX] = box[X1_IDX]
  451. box[Y1_EXT_IDX] = boundry_y1
  452. # else:
  453. # if top_nearest_bbox is None:
  454. # box[Y0_EXT_IDX] = boundry_y0
  455. # else:
  456. # box[Y0_EXT_IDX] = top_nearest_bbox[Y1_IDX] + 1
  457. # if bottom_nearest_bbox is None:
  458. # box[Y1_EXT_IDX] = boundry_y1
  459. # else:
  460. # box[Y1_EXT_IDX] = bottom_nearest_bbox[Y0_IDX] - 1
  461. # box[X0_EXT_IDX] = box[X0_IDX]
  462. # box[X1_EXT_IDX] = box[X1_IDX]
  463. return bboxes
  464. # ===================================================================================================
  465. def paper_bbox_sort_v2(all_bboxes: list, page_width:int, page_height:int):
  466. """
  467. 增加预处理行为的排序:
  468. return:
  469. [
  470. {
  471. "layout_bbox": [x0, y0, x1, y1],
  472. "layout_label":"GOOD_LAYOUT/BAD_LAYOUT",
  473. "content_bboxes": [] #每个元素都是[x0, y0, x1, y1, block_content, idx_x, idx_y, content_type, ext_x0, ext_y0, ext_x1, ext_y1], 并且顺序就是阅读顺序
  474. }
  475. ]
  476. """
  477. sorted_layouts = [] # 最后的返回结果
  478. page_x0, page_y0, page_x1, page_y1 = 1, 1, page_width-1, page_height-1
  479. all_bboxes = paper_bbox_sort(all_bboxes) # 大致拍下序
  480. # 首先在水平方向上扩展独占一行的bbox
  481. for bbox in all_bboxes:
  482. left_nearest_bbox = find_left_nearest_bbox_direct(bbox, all_bboxes) # 非扩展线
  483. right_nearest_bbox = find_right_nearst_bbox_direct(bbox, all_bboxes)
  484. if left_nearest_bbox is None and right_nearest_bbox is None: # 独占一行
  485. bbox[X0_EXT_IDX] = page_x0
  486. bbox[Y0_EXT_IDX] = bbox[Y0_IDX]
  487. bbox[X1_EXT_IDX] = page_x1
  488. bbox[Y1_EXT_IDX] = bbox[Y1_IDX]
  489. # 此时独占一行的被成功扩展到指定的边界上,这个时候利用边界条件合并连续的bbox,成为一个group
  490. if len(all_bboxes)==1:
  491. return [{"layout_bbox": [page_x0, page_y0, page_x1, page_y1], "layout_label":"GOOD_LAYOUT", "content_bboxes": all_bboxes}]
  492. if len(all_bboxes)==0:
  493. return []
  494. """
  495. 然后合并所有连续水平方向的bbox.
  496. """
  497. all_bboxes.sort(key=lambda x: x[Y0_IDX])
  498. h_bboxes = []
  499. h_bbox_group = []
  500. v_boxes = []
  501. for bbox in all_bboxes:
  502. if bbox[X0_IDX] == page_x0 and bbox[X1_IDX] == page_x1:
  503. h_bbox_group.append(bbox)
  504. else:
  505. if len(h_bbox_group)>0:
  506. h_bboxes.append(h_bbox_group)
  507. h_bbox_group = []
  508. # 最后一个group
  509. if len(h_bbox_group)>0:
  510. h_bboxes.append(h_bbox_group)
  511. """
  512. 现在h_bboxes里面是所有的group了,每个group都是一个list
  513. 对h_bboxes里的每个group进行计算放回到sorted_layouts里
  514. """
  515. for gp in h_bboxes:
  516. gp.sort(key=lambda x: x[Y0_IDX])
  517. block_info = {"layout_label":"GOOD_LAYOUT", "content_bboxes": gp}
  518. # 然后计算这个group的layout_bbox,也就是最小的x0,y0, 最大的x1,y1
  519. x0, y0, x1, y1 = gp[0][X0_EXT_IDX], gp[0][Y0_EXT_IDX], gp[-1][X1_EXT_IDX], gp[-1][Y1_EXT_IDX]
  520. block_info["layout_bbox"] = [x0, y0, x1, y1]
  521. sorted_layouts.append(block_info)
  522. # 接下来利用这些连续的水平bbox的layout_bbox的y0, y1,从水平上切分开其余的为几个部分
  523. h_split_lines = [page_y0]
  524. for gp in h_bboxes:
  525. layout_bbox = gp['layout_bbox']
  526. y0, y1 = layout_bbox[1], layout_bbox[3]
  527. h_split_lines.append(y0)
  528. h_split_lines.append(y1)
  529. h_split_lines.append(page_y1)
  530. unsplited_bboxes = []
  531. for i in range(0, len(h_split_lines), 2):
  532. start_y0, start_y1 = h_split_lines[i:i+2]
  533. # 然后找出[start_y0, start_y1]之间的其他bbox,这些组成一个未分割板块
  534. bboxes_in_block = [bbox for bbox in all_bboxes if bbox[Y0_IDX]>=start_y0 and bbox[Y1_IDX]<=start_y1]
  535. unsplited_bboxes.append(bboxes_in_block)
  536. # ================== 至此,水平方向的 已经切分排序完毕====================================
  537. """
  538. 接下来针对每个非水平的部分切分垂直方向的
  539. 此时,只剩下了无法被完全水平打通的bbox了。对这些box,优先进行垂直扩展,然后进行垂直切分.
  540. 分3步:
  541. 1. 先把能完全垂直打通的隔离出去当做一个layout
  542. 2. 其余的先垂直切分
  543. 3. 垂直切分之后的部分再尝试水平切分
  544. 4. 剩下的不能被切分的各个部分当成一个layout
  545. """
  546. # 对每部分进行垂直切分
  547. for bboxes_in_block in unsplited_bboxes:
  548. # 首先对这个block的bbox进行垂直方向上的扩展
  549. boundry_x0, boundry_y0, boundry_x1, boundry_y1 = find_boundry_bboxes(bboxes_in_block)
  550. # 进行垂直方向上的扩展
  551. extended_vertical_bboxes = extend_bbox_vertical(bboxes_in_block, boundry_x0, boundry_y0, boundry_x1, boundry_y1)
  552. # 然后对这个block进行垂直方向上的切分
  553. extend_bbox_vertical.sort(key=lambda x: x[X0_IDX]) # x方向上从小到大,代表了从左到右读取
  554. v_boxes_group = []
  555. for bbox in extended_vertical_bboxes:
  556. if bbox[Y0_IDX]==boundry_y0 and bbox[Y1_IDX]==boundry_y1:
  557. v_boxes_group.append(bbox)
  558. else:
  559. if len(v_boxes_group)>0:
  560. v_boxes.append(v_boxes_group)
  561. v_boxes_group = []
  562. if len(v_boxes_group)>0:
  563. v_boxes.append(v_boxes_group)
  564. # 把连续的垂直部分加入到sorted_layouts里。注意这个时候已经是连续的垂直部分了,因为上面已经做了
  565. for gp in v_boxes:
  566. gp.sort(key=lambda x: x[X0_IDX])
  567. block_info = {"layout_label":"GOOD_LAYOUT", "content_bboxes": gp}
  568. # 然后计算这个group的layout_bbox,也就是最小的x0,y0, 最大的x1,y1
  569. x0, y0, x1, y1 = gp[0][X0_EXT_IDX], gp[0][Y0_EXT_IDX], gp[-1][X1_EXT_IDX], gp[-1][Y1_EXT_IDX]
  570. block_info["layout_bbox"] = [x0, y0, x1, y1]
  571. sorted_layouts.append(block_info)
  572. # 在垂直方向上,划分子块,也就是用贯通的垂直线进行切分。这些被切分出来的块,极大可能是可被垂直切分的,如果不能完全的垂直切分,那么尝试水平切分。都不能的则当成一个layout
  573. v_split_lines = [boundry_x0]
  574. for gp in v_boxes:
  575. layout_bbox = gp['layout_bbox']
  576. x0, x1 = layout_bbox[0], layout_bbox[2]
  577. v_split_lines.append(x0)
  578. v_split_lines.append(x1)
  579. v_split_lines.append(boundry_x1)
  580. reset_idx_x_y(all_bboxes)
  581. all_boxes = _paper_bbox_sort_ext(all_bboxes)
  582. return all_boxes