layout_sort.py 36 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921
  1. """对pdf上的box进行layout识别,并对内部组成的box进行排序."""
  2. from loguru import logger
  3. from magic_pdf.layout.bbox_sort import (CONTENT_IDX, CONTENT_TYPE_IDX,
  4. X0_EXT_IDX, X0_IDX, X1_EXT_IDX, X1_IDX,
  5. Y0_EXT_IDX, Y0_IDX, Y1_EXT_IDX, Y1_IDX,
  6. paper_bbox_sort)
  7. from magic_pdf.layout.layout_det_utils import (
  8. find_all_bottom_bbox_direct, find_all_left_bbox_direct,
  9. find_all_right_bbox_direct, find_all_top_bbox_direct,
  10. find_bottom_bbox_direct_from_left_edge,
  11. find_bottom_bbox_direct_from_right_edge,
  12. find_top_bbox_direct_from_left_edge, find_top_bbox_direct_from_right_edge,
  13. get_left_edge_bboxes, get_right_edge_bboxes)
  14. from magic_pdf.libs.boxbase import get_bbox_in_boundary
  15. LAYOUT_V = 'V'
  16. LAYOUT_H = 'H'
  17. LAYOUT_UNPROC = 'U'
  18. LAYOUT_BAD = 'B'
  19. def _is_single_line_text(bbox):
  20. """检查bbox里面的文字是否只有一行."""
  21. return True # TODO
  22. box_type = bbox[CONTENT_TYPE_IDX]
  23. if box_type != 'text':
  24. return False
  25. paras = bbox[CONTENT_IDX]['paras']
  26. text_content = ''
  27. for para_id, para in paras.items(): # 拼装内部的段落文本
  28. is_title = para['is_title']
  29. if is_title != 0:
  30. text_content += f"## {para['text']}"
  31. else:
  32. text_content += para['text']
  33. text_content += '\n\n'
  34. return bbox[CONTENT_TYPE_IDX] == 'text' and len(text_content.split('\n\n')) <= 1
  35. def _horizontal_split(bboxes: list, boundary: tuple, avg_font_size=20) -> list:
  36. """
  37. 对bboxes进行水平切割
  38. 方法是:找到左侧和右侧都没有被直接遮挡的box,然后进行扩展,之后进行切割
  39. return:
  40. 返回几个大的Layout区域 [[x0, y0, x1, y1, "h|u|v"], ], h代表水平,u代表未探测的,v代表垂直布局
  41. """
  42. sorted_layout_blocks = [] # 这是要最终返回的值
  43. bound_x0, bound_y0, bound_x1, bound_y1 = boundary
  44. all_bboxes = get_bbox_in_boundary(bboxes, boundary)
  45. # all_bboxes = paper_bbox_sort(all_bboxes, abs(bound_x1-bound_x0), abs(bound_y1-bound_x0)) # 大致拍下序, 这个是基于直接遮挡的。
  46. """
  47. 首先在水平方向上扩展独占一行的bbox
  48. """
  49. last_h_split_line_y1 = bound_y0 # 记录下上次的水平分割线
  50. for i, bbox in enumerate(all_bboxes):
  51. left_nearest_bbox = find_all_left_bbox_direct(bbox, all_bboxes) # 非扩展线
  52. right_nearest_bbox = find_all_right_bbox_direct(bbox, all_bboxes)
  53. if left_nearest_bbox is None and right_nearest_bbox is None: # 独占一行
  54. """
  55. 然而,如果只是孤立的一行文字,那么就还要满足以下几个条件才可以:
  56. 1. bbox和中心线相交。或者
  57. 2. 上方或者下方也存在同类水平的独占一行的bbox。 或者
  58. 3. TODO 加强条件:这个bbox上方和下方是同一列column,那么就不能算作独占一行
  59. """
  60. # 先检查这个bbox里是否只包含一行文字
  61. # is_single_line = _is_single_line_text(bbox)
  62. """
  63. 这里有个点需要注意,当页面内容不是居中的时候,第一次调用传递的是page的boundary,这个时候mid_x就不是中心线了.
  64. 所以这里计算出最紧致的boundary,然后再计算mid_x
  65. """
  66. boundary_real_x0, boundary_real_x1 = min(
  67. [bbox[X0_IDX] for bbox in all_bboxes]
  68. ), max([bbox[X1_IDX] for bbox in all_bboxes])
  69. mid_x = (boundary_real_x0 + boundary_real_x1) / 2
  70. # 检查这个box是否内容在中心线有交
  71. # 必须跨过去2个字符的宽度
  72. is_cross_boundary_mid_line = (
  73. min(mid_x - bbox[X0_IDX], bbox[X1_IDX] - mid_x) > avg_font_size * 2
  74. )
  75. """
  76. 检查条件2
  77. """
  78. is_belong_to_col = False
  79. """
  80. 检查是否能被上方col吸收,方法是:
  81. 1. 上方非空且不是独占一行的,并且
  82. 2. 从上个水平分割的最大y=y1开始到当前bbox,最左侧的bbox的[min_x0, max_x1],能够覆盖当前box的[x0, x1]
  83. """
  84. """
  85. 以迭代的方式向上找,查找范围是[bound_x0, last_h_sp, bound_x1, bbox[Y0_IDX]]
  86. """
  87. # 先确定上方的y0, y0
  88. b_y0, b_y1 = last_h_split_line_y1, bbox[Y0_IDX]
  89. # 然后从box开始逐个向上找到所有与box在x上有交集的box
  90. box_to_check = [bound_x0, b_y0, bound_x1, b_y1]
  91. bbox_in_bound_check = get_bbox_in_boundary(all_bboxes, box_to_check)
  92. bboxes_on_top = []
  93. virtual_box = bbox
  94. while True:
  95. b_on_top = find_all_top_bbox_direct(virtual_box, bbox_in_bound_check)
  96. if b_on_top is not None:
  97. bboxes_on_top.append(b_on_top)
  98. virtual_box = [
  99. min([virtual_box[X0_IDX], b_on_top[X0_IDX]]),
  100. min(virtual_box[Y0_IDX], b_on_top[Y0_IDX]),
  101. max([virtual_box[X1_IDX], b_on_top[X1_IDX]]),
  102. b_y1,
  103. ]
  104. else:
  105. break
  106. # 随后确定这些box的最小x0, 最大x1
  107. if len(bboxes_on_top) > 0 and len(bboxes_on_top) != len(
  108. bbox_in_bound_check
  109. ): # virtual_box可能会膨胀到占满整个区域,这实际上就不能属于一个col了。
  110. min_x0, max_x1 = virtual_box[X0_IDX], virtual_box[X1_IDX]
  111. # 然后采用一种比较粗糙的方法,看min_x0,max_x1是否与位于[bound_x0, last_h_sp, bound_x1, bbox[Y0_IDX]]之间的box有相交
  112. if not any(
  113. [
  114. b[X0_IDX] <= min_x0 - 1 <= b[X1_IDX]
  115. or b[X0_IDX] <= max_x1 + 1 <= b[X1_IDX]
  116. for b in bbox_in_bound_check
  117. ]
  118. ):
  119. # 其上,下都不能被扩展成行,暂时只检查一下上方 TODO
  120. top_nearest_bbox = find_all_top_bbox_direct(bbox, bboxes)
  121. bottom_nearest_bbox = find_all_bottom_bbox_direct(bbox, bboxes)
  122. if not any(
  123. [
  124. top_nearest_bbox is not None
  125. and (
  126. find_all_left_bbox_direct(top_nearest_bbox, bboxes)
  127. is None
  128. and find_all_right_bbox_direct(top_nearest_bbox, bboxes)
  129. is None
  130. ),
  131. bottom_nearest_bbox is not None
  132. and (
  133. find_all_left_bbox_direct(bottom_nearest_bbox, bboxes)
  134. is None
  135. and find_all_right_bbox_direct(
  136. bottom_nearest_bbox, bboxes
  137. )
  138. is None
  139. ),
  140. top_nearest_bbox is None or bottom_nearest_bbox is None,
  141. ]
  142. ):
  143. is_belong_to_col = True
  144. # 检查是否能被下方col吸收 TODO
  145. """
  146. 这里为什么没有is_cross_boundary_mid_line的条件呢?
  147. 确实有些杂志左右两栏宽度不是对称的。
  148. """
  149. if not is_belong_to_col or is_cross_boundary_mid_line:
  150. bbox[X0_EXT_IDX] = bound_x0
  151. bbox[Y0_EXT_IDX] = bbox[Y0_IDX]
  152. bbox[X1_EXT_IDX] = bound_x1
  153. bbox[Y1_EXT_IDX] = bbox[Y1_IDX]
  154. last_h_split_line_y1 = bbox[Y1_IDX] # 更新这条线
  155. else:
  156. continue
  157. """
  158. 此时独占一行的被成功扩展到指定的边界上,这个时候利用边界条件合并连续的bbox,成为一个group
  159. 然后合并所有连续水平方向的bbox.
  160. """
  161. all_bboxes.sort(key=lambda x: x[Y0_IDX])
  162. h_bboxes = []
  163. h_bbox_group = []
  164. for bbox in all_bboxes:
  165. if bbox[X0_EXT_IDX] == bound_x0 and bbox[X1_EXT_IDX] == bound_x1:
  166. h_bbox_group.append(bbox)
  167. else:
  168. if len(h_bbox_group) > 0:
  169. h_bboxes.append(h_bbox_group)
  170. h_bbox_group = []
  171. # 最后一个group
  172. if len(h_bbox_group) > 0:
  173. h_bboxes.append(h_bbox_group)
  174. """
  175. 现在h_bboxes里面是所有的group了,每个group都是一个list
  176. 对h_bboxes里的每个group进行计算放回到sorted_layouts里
  177. """
  178. h_layouts = []
  179. for gp in h_bboxes:
  180. gp.sort(key=lambda x: x[Y0_IDX])
  181. # 然后计算这个group的layout_bbox,也就是最小的x0,y0, 最大的x1,y1
  182. x0, y0, x1, y1 = (
  183. gp[0][X0_EXT_IDX],
  184. gp[0][Y0_EXT_IDX],
  185. gp[-1][X1_EXT_IDX],
  186. gp[-1][Y1_EXT_IDX],
  187. )
  188. h_layouts.append([x0, y0, x1, y1, LAYOUT_H]) # 水平的布局
  189. """
  190. 接下来利用这些连续的水平bbox的layout_bbox的y0, y1,从水平上切分开其余的为几个部分
  191. """
  192. h_split_lines = [bound_y0]
  193. for gp in h_bboxes: # gp是一个list[bbox_list]
  194. y0, y1 = gp[0][1], gp[-1][3]
  195. h_split_lines.append(y0)
  196. h_split_lines.append(y1)
  197. h_split_lines.append(bound_y1)
  198. unsplited_bboxes = []
  199. for i in range(0, len(h_split_lines), 2):
  200. start_y0, start_y1 = h_split_lines[i : i + 2]
  201. # 然后找出[start_y0, start_y1]之间的其他bbox,这些组成一个未分割板块
  202. bboxes_in_block = [
  203. bbox
  204. for bbox in all_bboxes
  205. if bbox[Y0_IDX] >= start_y0 and bbox[Y1_IDX] <= start_y1
  206. ]
  207. unsplited_bboxes.append(bboxes_in_block)
  208. # 接着把未处理的加入到h_layouts里
  209. for bboxes_in_block in unsplited_bboxes:
  210. if len(bboxes_in_block) == 0:
  211. continue
  212. x0, y0, x1, y1 = (
  213. bound_x0,
  214. min([bbox[Y0_IDX] for bbox in bboxes_in_block]),
  215. bound_x1,
  216. max([bbox[Y1_IDX] for bbox in bboxes_in_block]),
  217. )
  218. h_layouts.append([x0, y0, x1, y1, LAYOUT_UNPROC])
  219. h_layouts.sort(key=lambda x: x[1]) # 按照y0排序, 也就是从上到下的顺序
  220. """
  221. 转换成如下格式返回
  222. """
  223. for layout in h_layouts:
  224. sorted_layout_blocks.append(
  225. {
  226. 'layout_bbox': layout[:4],
  227. 'layout_label': layout[4],
  228. 'sub_layout': [],
  229. }
  230. )
  231. return sorted_layout_blocks
  232. ###############################################################################################
  233. #
  234. # 垂直方向的处理
  235. #
  236. #
  237. ###############################################################################################
  238. def _vertical_align_split_v1(bboxes: list, boundary: tuple) -> list:
  239. """
  240. 计算垂直方向上的对齐, 并分割bboxes成layout。负责对一列多行的进行列维度分割。
  241. 如果不能完全分割,剩余部分作为layout_lable为u的layout返回
  242. -----------------------
  243. | | |
  244. | | |
  245. | | |
  246. | | |
  247. -------------------------
  248. 此函数会将:以上布局将会切分出来2列
  249. """
  250. sorted_layout_blocks = [] # 这是要最终返回的值
  251. new_boundary = [boundary[0], boundary[1], boundary[2], boundary[3]]
  252. v_blocks = []
  253. """
  254. 先从左到右切分
  255. """
  256. while True:
  257. all_bboxes = get_bbox_in_boundary(bboxes, new_boundary)
  258. left_edge_bboxes = get_left_edge_bboxes(all_bboxes)
  259. if len(left_edge_bboxes) == 0:
  260. break
  261. right_split_line_x1 = max([bbox[X1_IDX] for bbox in left_edge_bboxes]) + 1
  262. # 然后检查这条线能不与其他bbox的左边界相交或者重合
  263. if any(
  264. [bbox[X0_IDX] <= right_split_line_x1 <= bbox[X1_IDX] for bbox in all_bboxes]
  265. ):
  266. # 垂直切分线与某些box发生相交,说明无法完全垂直方向切分。
  267. break
  268. else: # 说明成功分割出一列
  269. # 找到左侧边界最靠左的bbox作为layout的x0
  270. layout_x0 = min(
  271. [bbox[X0_IDX] for bbox in left_edge_bboxes]
  272. ) # 这里主要是为了画出来有一定间距
  273. v_blocks.append(
  274. [
  275. layout_x0,
  276. new_boundary[1],
  277. right_split_line_x1,
  278. new_boundary[3],
  279. LAYOUT_V,
  280. ]
  281. )
  282. new_boundary[0] = right_split_line_x1 # 更新边界
  283. """
  284. 再从右到左切, 此时如果还是无法完全切分,那么剩余部分作为layout_lable为u的layout返回
  285. """
  286. unsplited_block = []
  287. while True:
  288. all_bboxes = get_bbox_in_boundary(bboxes, new_boundary)
  289. right_edge_bboxes = get_right_edge_bboxes(all_bboxes)
  290. if len(right_edge_bboxes) == 0:
  291. break
  292. left_split_line_x0 = min([bbox[X0_IDX] for bbox in right_edge_bboxes]) - 1
  293. # 然后检查这条线能不与其他bbox的左边界相交或者重合
  294. if any(
  295. [bbox[X0_IDX] <= left_split_line_x0 <= bbox[X1_IDX] for bbox in all_bboxes]
  296. ):
  297. # 这里是余下的
  298. unsplited_block.append(
  299. [
  300. new_boundary[0],
  301. new_boundary[1],
  302. new_boundary[2],
  303. new_boundary[3],
  304. LAYOUT_UNPROC,
  305. ]
  306. )
  307. break
  308. else:
  309. # 找到右侧边界最靠右的bbox作为layout的x1
  310. layout_x1 = max([bbox[X1_IDX] for bbox in right_edge_bboxes])
  311. v_blocks.append(
  312. [
  313. left_split_line_x0,
  314. new_boundary[1],
  315. layout_x1,
  316. new_boundary[3],
  317. LAYOUT_V,
  318. ]
  319. )
  320. new_boundary[2] = left_split_line_x0 # 更新右边界
  321. """
  322. 最后拼装成layout格式返回
  323. """
  324. for block in v_blocks:
  325. sorted_layout_blocks.append(
  326. {
  327. 'layout_bbox': block[:4],
  328. 'layout_label': block[4],
  329. 'sub_layout': [],
  330. }
  331. )
  332. for block in unsplited_block:
  333. sorted_layout_blocks.append(
  334. {
  335. 'layout_bbox': block[:4],
  336. 'layout_label': block[4],
  337. 'sub_layout': [],
  338. }
  339. )
  340. # 按照x0排序
  341. sorted_layout_blocks.sort(key=lambda x: x['layout_bbox'][0])
  342. return sorted_layout_blocks
  343. def _vertical_align_split_v2(bboxes: list, boundary: tuple) -> list:
  344. """改进的
  345. _vertical_align_split算法,原算法会因为第二列的box由于左侧没有遮挡被认为是左侧的一部分,导致整个layout多列被识别为一列。
  346. 利用从左上角的box开始向下看的方法,不断扩展w_x0, w_x1,直到不能继续向下扩展,或者到达边界下边界。"""
  347. sorted_layout_blocks = [] # 这是要最终返回的值
  348. new_boundary = [boundary[0], boundary[1], boundary[2], boundary[3]]
  349. bad_boxes = [] # 被割中的box
  350. v_blocks = []
  351. while True:
  352. all_bboxes = get_bbox_in_boundary(bboxes, new_boundary)
  353. if len(all_bboxes) == 0:
  354. break
  355. left_top_box = min(
  356. all_bboxes, key=lambda x: (x[X0_IDX], x[Y0_IDX])
  357. ) # 这里应该加强,检查一下必须是在第一列的 TODO
  358. start_box = [
  359. left_top_box[X0_IDX],
  360. left_top_box[Y0_IDX],
  361. left_top_box[X1_IDX],
  362. left_top_box[Y1_IDX],
  363. ]
  364. w_x0, w_x1 = left_top_box[X0_IDX], left_top_box[X1_IDX]
  365. """
  366. 然后沿着这个box线向下找最近的那个box, 然后扩展w_x0, w_x1
  367. 扩展之后,宽度会增加,随后用x=w_x1来检测在边界内是否有box与相交,如果相交,那么就说明不能再扩展了。
  368. 当不能扩展的时候就要看是否到达下边界:
  369. 1. 达到,那么更新左边界继续分下一个列
  370. 2. 没有达到,那么此时开始从右侧切分进入下面的循环里
  371. """
  372. while left_top_box is not None: # 向下去找
  373. virtual_box = [w_x0, left_top_box[Y0_IDX], w_x1, left_top_box[Y1_IDX]]
  374. left_top_box = find_bottom_bbox_direct_from_left_edge(
  375. virtual_box, all_bboxes
  376. )
  377. if left_top_box:
  378. w_x0, w_x1 = min(virtual_box[X0_IDX], left_top_box[X0_IDX]), max(
  379. [virtual_box[X1_IDX], left_top_box[X1_IDX]]
  380. )
  381. # 万一这个初始的box在column中间,那么还要向上看
  382. start_box = [
  383. w_x0,
  384. start_box[Y0_IDX],
  385. w_x1,
  386. start_box[Y1_IDX],
  387. ] # 扩展一下宽度更鲁棒
  388. left_top_box = find_top_bbox_direct_from_left_edge(start_box, all_bboxes)
  389. while left_top_box is not None: # 向上去找
  390. virtual_box = [w_x0, left_top_box[Y0_IDX], w_x1, left_top_box[Y1_IDX]]
  391. left_top_box = find_top_bbox_direct_from_left_edge(virtual_box, all_bboxes)
  392. if left_top_box:
  393. w_x0, w_x1 = min(virtual_box[X0_IDX], left_top_box[X0_IDX]), max(
  394. [virtual_box[X1_IDX], left_top_box[X1_IDX]]
  395. )
  396. # 检查相交
  397. if any([bbox[X0_IDX] <= w_x1 + 1 <= bbox[X1_IDX] for bbox in all_bboxes]):
  398. for b in all_bboxes:
  399. if b[X0_IDX] <= w_x1 + 1 <= b[X1_IDX]:
  400. bad_boxes.append([b[X0_IDX], b[Y0_IDX], b[X1_IDX], b[Y1_IDX]])
  401. break
  402. else: # 说明成功分割出一列
  403. v_blocks.append([w_x0, new_boundary[1], w_x1, new_boundary[3], LAYOUT_V])
  404. new_boundary[0] = w_x1 # 更新边界
  405. """
  406. 接着开始从右上角的box扫描
  407. """
  408. w_x0, w_x1 = 0, 0
  409. unsplited_block = []
  410. while True:
  411. all_bboxes = get_bbox_in_boundary(bboxes, new_boundary)
  412. if len(all_bboxes) == 0:
  413. break
  414. # 先找到X1最大的
  415. bbox_list_sorted = sorted(
  416. all_bboxes, key=lambda bbox: bbox[X1_IDX], reverse=True
  417. )
  418. # Then, find the boxes with the smallest Y0 value
  419. bigest_x1 = bbox_list_sorted[0][X1_IDX]
  420. boxes_with_bigest_x1 = [
  421. bbox for bbox in bbox_list_sorted if bbox[X1_IDX] == bigest_x1
  422. ] # 也就是最靠右的那些
  423. right_top_box = min(
  424. boxes_with_bigest_x1, key=lambda bbox: bbox[Y0_IDX]
  425. ) # y0最小的那个
  426. start_box = [
  427. right_top_box[X0_IDX],
  428. right_top_box[Y0_IDX],
  429. right_top_box[X1_IDX],
  430. right_top_box[Y1_IDX],
  431. ]
  432. w_x0, w_x1 = right_top_box[X0_IDX], right_top_box[X1_IDX]
  433. while right_top_box is not None:
  434. virtual_box = [w_x0, right_top_box[Y0_IDX], w_x1, right_top_box[Y1_IDX]]
  435. right_top_box = find_bottom_bbox_direct_from_right_edge(
  436. virtual_box, all_bboxes
  437. )
  438. if right_top_box:
  439. w_x0, w_x1 = min([w_x0, right_top_box[X0_IDX]]), max(
  440. [w_x1, right_top_box[X1_IDX]]
  441. )
  442. # 在向上扫描
  443. start_box = [
  444. w_x0,
  445. start_box[Y0_IDX],
  446. w_x1,
  447. start_box[Y1_IDX],
  448. ] # 扩展一下宽度更鲁棒
  449. right_top_box = find_top_bbox_direct_from_right_edge(start_box, all_bboxes)
  450. while right_top_box is not None:
  451. virtual_box = [w_x0, right_top_box[Y0_IDX], w_x1, right_top_box[Y1_IDX]]
  452. right_top_box = find_top_bbox_direct_from_right_edge(
  453. virtual_box, all_bboxes
  454. )
  455. if right_top_box:
  456. w_x0, w_x1 = min([w_x0, right_top_box[X0_IDX]]), max(
  457. [w_x1, right_top_box[X1_IDX]]
  458. )
  459. # 检查是否与其他box相交, 垂直切分线与某些box发生相交,说明无法完全垂直方向切分。
  460. if any([bbox[X0_IDX] <= w_x0 - 1 <= bbox[X1_IDX] for bbox in all_bboxes]):
  461. unsplited_block.append(
  462. [
  463. new_boundary[0],
  464. new_boundary[1],
  465. new_boundary[2],
  466. new_boundary[3],
  467. LAYOUT_UNPROC,
  468. ]
  469. )
  470. for b in all_bboxes:
  471. if b[X0_IDX] <= w_x0 - 1 <= b[X1_IDX]:
  472. bad_boxes.append([b[X0_IDX], b[Y0_IDX], b[X1_IDX], b[Y1_IDX]])
  473. break
  474. else: # 说明成功分割出一列
  475. v_blocks.append([w_x0, new_boundary[1], w_x1, new_boundary[3], LAYOUT_V])
  476. new_boundary[2] = w_x0
  477. """转换数据结构"""
  478. for block in v_blocks:
  479. sorted_layout_blocks.append(
  480. {
  481. 'layout_bbox': block[:4],
  482. 'layout_label': block[4],
  483. 'sub_layout': [],
  484. }
  485. )
  486. for block in unsplited_block:
  487. sorted_layout_blocks.append(
  488. {
  489. 'layout_bbox': block[:4],
  490. 'layout_label': block[4],
  491. 'sub_layout': [],
  492. 'bad_boxes': bad_boxes, # 记录下来,这个box是被割中的
  493. }
  494. )
  495. # 按照x0排序
  496. sorted_layout_blocks.sort(key=lambda x: x['layout_bbox'][0])
  497. return sorted_layout_blocks
  498. def _try_horizontal_mult_column_split(bboxes: list, boundary: tuple) -> list:
  499. """
  500. 尝试水平切分,如果切分不动,那就当一个BAD_LAYOUT返回
  501. ------------------
  502. | | |
  503. ------------------
  504. | | | | <- 这里是此函数要切分的场景
  505. ------------------
  506. | | |
  507. | | |
  508. """
  509. pass
  510. def _vertical_split(bboxes: list, boundary: tuple) -> list:
  511. """
  512. 从垂直方向进行切割,分block
  513. 这个版本里,如果垂直切分不动,那就当一个BAD_LAYOUT返回
  514. --------------------------
  515. | | |
  516. | | |
  517. | |
  518. 这种列是此函数要切分的 -> | |
  519. | |
  520. | | |
  521. | | |
  522. -------------------------
  523. """
  524. sorted_layout_blocks = [] # 这是要最终返回的值
  525. bound_x0, bound_y0, bound_x1, bound_y1 = boundary
  526. all_bboxes = get_bbox_in_boundary(bboxes, boundary)
  527. """
  528. all_bboxes = fix_vertical_bbox_pos(all_bboxes) # 垂直方向解覆盖
  529. all_bboxes = fix_hor_bbox_pos(all_bboxes) # 水平解覆盖
  530. 这两行代码目前先不执行,因为公式检测,表格检测还不是很成熟,导致非常多的textblock参与了运算,时间消耗太大。
  531. 这两行代码的作用是:
  532. 如果遇到互相重叠的bbox, 那么会把面积较小的box进行压缩,从而避免重叠。对布局切分来说带来正反馈。
  533. """
  534. # all_bboxes = paper_bbox_sort(all_bboxes, abs(bound_x1-bound_x0), abs(bound_y1-bound_x0)) # 大致拍下序, 这个是基于直接遮挡的。
  535. """
  536. 首先在垂直方向上扩展独占一行的bbox
  537. """
  538. for bbox in all_bboxes:
  539. top_nearest_bbox = find_all_top_bbox_direct(bbox, all_bboxes) # 非扩展线
  540. bottom_nearest_bbox = find_all_bottom_bbox_direct(bbox, all_bboxes)
  541. if (
  542. top_nearest_bbox is None
  543. and bottom_nearest_bbox is None
  544. and not any(
  545. [
  546. b[X0_IDX] < bbox[X1_IDX] < b[X1_IDX]
  547. or b[X0_IDX] < bbox[X0_IDX] < b[X1_IDX]
  548. for b in all_bboxes
  549. ]
  550. )
  551. ): # 独占一列, 且不和其他重叠
  552. bbox[X0_EXT_IDX] = bbox[X0_IDX]
  553. bbox[Y0_EXT_IDX] = bound_y0
  554. bbox[X1_EXT_IDX] = bbox[X1_IDX]
  555. bbox[Y1_EXT_IDX] = bound_y1
  556. """
  557. 此时独占一列的被成功扩展到指定的边界上,这个时候利用边界条件合并连续的bbox,成为一个group
  558. 然后合并所有连续垂直方向的bbox.
  559. """
  560. all_bboxes.sort(key=lambda x: x[X0_IDX])
  561. # fix: 这里水平方向的列不要合并成一个行,因为需要保证返回给下游的最小block,总是可以无脑从上到下阅读文字。
  562. v_bboxes = []
  563. for box in all_bboxes:
  564. if box[Y0_EXT_IDX] == bound_y0 and box[Y1_EXT_IDX] == bound_y1:
  565. v_bboxes.append(box)
  566. """
  567. 现在v_bboxes里面是所有的group了,每个group都是一个list
  568. 对v_bboxes里的每个group进行计算放回到sorted_layouts里
  569. """
  570. v_layouts = []
  571. for vbox in v_bboxes:
  572. # gp.sort(key=lambda x: x[X0_IDX])
  573. # 然后计算这个group的layout_bbox,也就是最小的x0,y0, 最大的x1,y1
  574. x0, y0, x1, y1 = (
  575. vbox[X0_EXT_IDX],
  576. vbox[Y0_EXT_IDX],
  577. vbox[X1_EXT_IDX],
  578. vbox[Y1_EXT_IDX],
  579. )
  580. v_layouts.append([x0, y0, x1, y1, LAYOUT_V]) # 垂直的布局
  581. """
  582. 接下来利用这些连续的垂直bbox的layout_bbox的x0, x1,从垂直上切分开其余的为几个部分
  583. """
  584. v_split_lines = [bound_x0]
  585. for gp in v_bboxes:
  586. x0, x1 = gp[X0_IDX], gp[X1_IDX]
  587. v_split_lines.append(x0)
  588. v_split_lines.append(x1)
  589. v_split_lines.append(bound_x1)
  590. unsplited_bboxes = []
  591. for i in range(0, len(v_split_lines), 2):
  592. start_x0, start_x1 = v_split_lines[i : i + 2]
  593. # 然后找出[start_x0, start_x1]之间的其他bbox,这些组成一个未分割板块
  594. bboxes_in_block = [
  595. bbox
  596. for bbox in all_bboxes
  597. if bbox[X0_IDX] >= start_x0 and bbox[X1_IDX] <= start_x1
  598. ]
  599. unsplited_bboxes.append(bboxes_in_block)
  600. # 接着把未处理的加入到v_layouts里
  601. for bboxes_in_block in unsplited_bboxes:
  602. if len(bboxes_in_block) == 0:
  603. continue
  604. x0, y0, x1, y1 = (
  605. min([bbox[X0_IDX] for bbox in bboxes_in_block]),
  606. bound_y0,
  607. max([bbox[X1_IDX] for bbox in bboxes_in_block]),
  608. bound_y1,
  609. )
  610. v_layouts.append(
  611. [x0, y0, x1, y1, LAYOUT_UNPROC]
  612. ) # 说明这篇区域未能够分析出可靠的版面
  613. v_layouts.sort(key=lambda x: x[0]) # 按照x0排序, 也就是从左到右的顺序
  614. for layout in v_layouts:
  615. sorted_layout_blocks.append(
  616. {
  617. 'layout_bbox': layout[:4],
  618. 'layout_label': layout[4],
  619. 'sub_layout': [],
  620. }
  621. )
  622. """
  623. 至此,垂直方向切成了2种类型,其一是独占一列的,其二是未处理的。
  624. 下面对这些未处理的进行垂直方向切分,这个切分要切出来类似“吕”这种类型的垂直方向的布局
  625. """
  626. for i, layout in enumerate(sorted_layout_blocks):
  627. if layout['layout_label'] == LAYOUT_UNPROC:
  628. x0, y0, x1, y1 = layout['layout_bbox']
  629. v_split_layouts = _vertical_align_split_v2(bboxes, [x0, y0, x1, y1])
  630. sorted_layout_blocks[i] = {
  631. 'layout_bbox': [x0, y0, x1, y1],
  632. 'layout_label': LAYOUT_H,
  633. 'sub_layout': v_split_layouts,
  634. }
  635. layout['layout_label'] = LAYOUT_H # 被垂线切分成了水平布局
  636. return sorted_layout_blocks
  637. def split_layout(bboxes: list, boundary: tuple, page_num: int) -> list:
  638. """
  639. 把bboxes切割成layout
  640. return:
  641. [
  642. {
  643. "layout_bbox": [x0,y0,x1,y1],
  644. "layout_label":"u|v|h|b", 未处理|垂直|水平|BAD_LAYOUT
  645. "sub_layout":[] #每个元素都是[
  646. x0,y0,
  647. x1,y1,
  648. block_content,
  649. idx_x,idx_y,
  650. content_type,
  651. ext_x0,ext_y0,
  652. ext_x1,ext_y1
  653. ], 并且顺序就是阅读顺序
  654. }
  655. ]
  656. example:
  657. [
  658. {
  659. "layout_bbox": [0, 0, 100, 100],
  660. "layout_label":"u|v|h|b",
  661. "sub_layout":[
  662. ]
  663. },
  664. {
  665. "layout_bbox": [0, 0, 100, 100],
  666. "layout_label":"u|v|h|b",
  667. "sub_layout":[
  668. {
  669. "layout_bbox": [0, 0, 100, 100],
  670. "layout_label":"u|v|h|b",
  671. "content_bboxes":[
  672. [],
  673. [],
  674. []
  675. ]
  676. },
  677. {
  678. "layout_bbox": [0, 0, 100, 100],
  679. "layout_label":"u|v|h|b",
  680. "sub_layout":[
  681. ]
  682. }
  683. }
  684. ]
  685. """
  686. sorted_layouts = [] # 最终返回的结果
  687. boundary_x0, boundary_y0, boundary_x1, boundary_y1 = boundary
  688. if len(bboxes) <= 1:
  689. return [
  690. {
  691. 'layout_bbox': [boundary_x0, boundary_y0, boundary_x1, boundary_y1],
  692. 'layout_label': LAYOUT_V,
  693. 'sub_layout': [],
  694. }
  695. ]
  696. """
  697. 接下来按照先水平后垂直的顺序进行切分
  698. """
  699. bboxes = paper_bbox_sort(
  700. bboxes, boundary_x1 - boundary_x0, boundary_y1 - boundary_y0
  701. )
  702. sorted_layouts = _horizontal_split(bboxes, boundary) # 通过水平分割出来的layout
  703. for i, layout in enumerate(sorted_layouts):
  704. x0, y0, x1, y1 = layout['layout_bbox']
  705. layout_type = layout['layout_label']
  706. if layout_type == LAYOUT_UNPROC: # 说明是非独占单行的,这些需要垂直切分
  707. v_split_layouts = _vertical_split(bboxes, [x0, y0, x1, y1])
  708. """
  709. 最后这里有个逻辑问题:如果这个函数只分离出来了一个column layout,那么这个layout分割肯定超出了算法能力范围。因为我们假定的是传进来的
  710. box已经把行全部剥离了,所以这里必须十多个列才可以。如果只剥离出来一个layout,并且是多个box,那么就说明这个layout是无法分割的,标记为LAYOUT_UNPROC
  711. """
  712. layout_label = LAYOUT_V
  713. if len(v_split_layouts) == 1:
  714. if len(v_split_layouts[0]['sub_layout']) == 0:
  715. layout_label = LAYOUT_UNPROC
  716. # logger.warning(f"WARNING: pageno={page_num}, 无法分割的layout: ", v_split_layouts)
  717. """
  718. 组合起来最终的layout
  719. """
  720. sorted_layouts[i] = {
  721. 'layout_bbox': [x0, y0, x1, y1],
  722. 'layout_label': layout_label,
  723. 'sub_layout': v_split_layouts,
  724. }
  725. layout['layout_label'] = LAYOUT_H
  726. """
  727. 水平和垂直方向都切分完毕了。此时还有一些未处理的,这些未处理的可能是因为水平和垂直方向都无法切分。
  728. 这些最后调用_try_horizontal_mult_block_split做一次水平多个block的联合切分,如果也不能切分最终就当做BAD_LAYOUT返回
  729. """
  730. # TODO
  731. return sorted_layouts
  732. def get_bboxes_layout(all_boxes: list, boundary: tuple, page_id: int):
  733. """
  734. 对利用layout排序之后的box,进行排序
  735. return:
  736. [
  737. {
  738. "layout_bbox": [x0, y0, x1, y1],
  739. "layout_label":"u|v|h|b", 未处理|垂直|水平|BAD_LAYOUT
  740. },
  741. ]
  742. """
  743. def _preorder_traversal(layout):
  744. """对sorted_layouts的叶子节点,也就是len(sub_layout)==0的节点进行排序。排序按照前序遍历的顺序,也就是从上到
  745. 下,从左到右的顺序."""
  746. sorted_layout_blocks = []
  747. for layout in layout:
  748. sub_layout = layout['sub_layout']
  749. if len(sub_layout) == 0:
  750. sorted_layout_blocks.append(layout)
  751. else:
  752. s = _preorder_traversal(sub_layout)
  753. sorted_layout_blocks.extend(s)
  754. return sorted_layout_blocks
  755. # -------------------------------------------------------------------------------------------------------------------------
  756. sorted_layouts = split_layout(
  757. all_boxes, boundary, page_id
  758. ) # 先切分成layout,得到一个Tree
  759. total_sorted_layout_blocks = _preorder_traversal(sorted_layouts)
  760. return total_sorted_layout_blocks, sorted_layouts
  761. def get_columns_cnt_of_layout(layout_tree):
  762. """获取一个layout的宽度."""
  763. max_width_list = [0] # 初始化一个元素,防止max,min函数报错
  764. for items in layout_tree: # 针对每一层(横切)计算列数,横着的算一列
  765. layout_type = items['layout_label']
  766. sub_layouts = items['sub_layout']
  767. if len(sub_layouts) == 0:
  768. max_width_list.append(1)
  769. else:
  770. if layout_type == LAYOUT_H:
  771. max_width_list.append(1)
  772. else:
  773. width = 0
  774. for sub_layout in sub_layouts:
  775. if len(sub_layout['sub_layout']) == 0:
  776. width += 1
  777. else:
  778. for lay in sub_layout['sub_layout']:
  779. width += get_columns_cnt_of_layout([lay])
  780. max_width_list.append(width)
  781. return max(max_width_list)
  782. def sort_with_layout(bboxes: list, page_width, page_height) -> (list, list):
  783. """输入是一个bbox的list.
  784. 获取到输入之后,先进行layout切分,然后对这些bbox进行排序。返回排序后的bboxes
  785. """
  786. new_bboxes = []
  787. for box in bboxes:
  788. # new_bboxes.append([box[0], box[1], box[2], box[3], None, None, None, 'text', None, None, None, None])
  789. new_bboxes.append(
  790. [
  791. box[0],
  792. box[1],
  793. box[2],
  794. box[3],
  795. None,
  796. None,
  797. None,
  798. 'text',
  799. None,
  800. None,
  801. None,
  802. None,
  803. box[4],
  804. ]
  805. )
  806. layout_bboxes, _ = get_bboxes_layout(
  807. new_bboxes, tuple([0, 0, page_width, page_height]), 0
  808. )
  809. if any([lay['layout_label'] == LAYOUT_UNPROC for lay in layout_bboxes]):
  810. logger.warning('drop this pdf, reason: 复杂版面')
  811. return None, None
  812. sorted_bboxes = []
  813. # 利用layout bbox每次框定一些box,然后排序
  814. for layout in layout_bboxes:
  815. lbox = layout['layout_bbox']
  816. bbox_in_layout = get_bbox_in_boundary(new_bboxes, lbox)
  817. sorted_bbox = paper_bbox_sort(
  818. bbox_in_layout, lbox[2] - lbox[0], lbox[3] - lbox[1]
  819. )
  820. sorted_bboxes.extend(sorted_bbox)
  821. return sorted_bboxes, layout_bboxes
  822. def sort_text_block(text_block, layout_bboxes):
  823. """对一页的text_block进行排序."""
  824. sorted_text_bbox = []
  825. all_text_bbox = []
  826. # 做一个box=>text的映射
  827. box_to_text = {}
  828. for blk in text_block:
  829. box = blk['bbox']
  830. box_to_text[(box[0], box[1], box[2], box[3])] = blk
  831. all_text_bbox.append(box)
  832. # text_blocks_to_sort = []
  833. # for box in box_to_text.keys():
  834. # text_blocks_to_sort.append([box[0], box[1], box[2], box[3], None, None, None, 'text', None, None, None, None])
  835. # 按照layout_bboxes的顺序,对text_block进行排序
  836. for layout in layout_bboxes:
  837. layout_box = layout['layout_bbox']
  838. text_bbox_in_layout = get_bbox_in_boundary(
  839. all_text_bbox,
  840. [
  841. layout_box[0] - 1,
  842. layout_box[1] - 1,
  843. layout_box[2] + 1,
  844. layout_box[3] + 1,
  845. ],
  846. )
  847. # sorted_bbox = paper_bbox_sort(text_bbox_in_layout, layout_box[2]-layout_box[0], layout_box[3]-layout_box[1])
  848. text_bbox_in_layout.sort(
  849. key=lambda x: x[1]
  850. ) # 一个layout内部的box,按照y0自上而下排序
  851. # sorted_bbox = [[b] for b in text_blocks_to_sort]
  852. for sb in text_bbox_in_layout:
  853. sorted_text_bbox.append(box_to_text[(sb[0], sb[1], sb[2], sb[3])])
  854. return sorted_text_bbox