xycuts.py 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615
  1. # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. from copy import deepcopy
  15. from typing import Dict, List, Tuple
  16. import numpy as np
  17. from ..layout_objects import LayoutBlock, LayoutRegion
  18. from ..setting import BLOCK_LABEL_MAP, XYCUT_SETTINGS
  19. from ..utils import calculate_overlap_ratio, calculate_projection_overlap_ratio
  20. from .utils import (
  21. calculate_discontinuous_projection,
  22. euclidean_insert,
  23. find_local_minima_flat_regions,
  24. get_blocks_by_direction_interval,
  25. get_cut_blocks,
  26. insert_child_blocks,
  27. manhattan_insert,
  28. projection_by_bboxes,
  29. recursive_xy_cut,
  30. recursive_yx_cut,
  31. reference_insert,
  32. shrink_overlapping_boxes,
  33. sort_normal_blocks,
  34. update_doc_title_child_blocks,
  35. update_paragraph_title_child_blocks,
  36. update_region_child_blocks,
  37. update_vision_child_blocks,
  38. weighted_distance_insert,
  39. )
  40. def pre_process(
  41. region: LayoutRegion,
  42. ) -> List:
  43. """
  44. Preprocess the layout for sorting purposes.
  45. This function performs two main tasks:
  46. 1. Pre-cuts the layout to ensure the document is correctly partitioned and sorted.
  47. 2. Match the blocks with their children.
  48. Args:
  49. region: LayoutParsingRegion, the layout region to be pre-processed.
  50. Returns:
  51. List: A list of pre-cutted layout blocks list.
  52. """
  53. mask_labels = [
  54. "header",
  55. "unordered",
  56. "footer",
  57. "vision_footnote",
  58. "sub_paragraph_title",
  59. "doc_title_text",
  60. "vision_title",
  61. "sub_region",
  62. ]
  63. pre_cut_block_idxes = []
  64. block_map = region.block_map
  65. blocks: List[LayoutBlock] = list(block_map.values())
  66. for block in blocks:
  67. if block.order_label not in mask_labels:
  68. update_region_label(block, region)
  69. block_direction = block.direction
  70. if block_direction == "horizontal":
  71. tolerance_len = block.long_side_length // 5
  72. else:
  73. tolerance_len = block.short_side_length // 10
  74. block_center = (
  75. block.bbox[region.direction_start_index]
  76. + block.bbox[region.direction_end_index]
  77. ) / 2
  78. center_offset = abs(block_center - region.direction_center_coordinate)
  79. is_centered = center_offset <= tolerance_len
  80. if is_centered:
  81. pre_cut_block_idxes.append(block.index)
  82. pre_cut_list = []
  83. cut_direction = region.secondary_direction
  84. cut_coordinates = []
  85. discontinuous = []
  86. all_boxes = np.array(
  87. [block.bbox for block in blocks if block.order_label not in mask_labels]
  88. )
  89. if len(all_boxes) == 0:
  90. return pre_cut_list
  91. if pre_cut_block_idxes:
  92. discontinuous, num_list = calculate_discontinuous_projection(
  93. all_boxes, direction=cut_direction, return_num=True
  94. )
  95. for idx in pre_cut_block_idxes:
  96. block = block_map[idx]
  97. if (
  98. block.order_label not in mask_labels
  99. and block.secondary_direction == cut_direction
  100. ):
  101. if (
  102. block.secondary_direction_start_coordinate,
  103. block.secondary_direction_end_coordinate,
  104. ) in discontinuous:
  105. idx = discontinuous.index(
  106. (
  107. block.secondary_direction_start_coordinate,
  108. block.secondary_direction_end_coordinate,
  109. )
  110. )
  111. if num_list[idx] == 1:
  112. cut_coordinates.append(
  113. block.secondary_direction_start_coordinate
  114. )
  115. cut_coordinates.append(block.secondary_direction_end_coordinate)
  116. secondary_check_bboxes = np.array(
  117. [
  118. block.bbox
  119. for block in blocks
  120. if block.order_label not in mask_labels + ["vision"]
  121. ]
  122. )
  123. if len(secondary_check_bboxes) > 0 or blocks[0].label == "region":
  124. secondary_discontinuous = calculate_discontinuous_projection(
  125. secondary_check_bboxes, direction=region.direction
  126. )
  127. if len(secondary_discontinuous) == 1 or blocks[0].label == "region":
  128. if not discontinuous:
  129. discontinuous = calculate_discontinuous_projection(
  130. all_boxes, direction=cut_direction
  131. )
  132. current_interval = discontinuous[0]
  133. pre_cut_coordinates = [
  134. cood for cood in cut_coordinates if cood < current_interval[1]
  135. ]
  136. if not pre_cut_coordinates:
  137. pre_cut_coordinate = 0
  138. else:
  139. pre_cut_coordinate = max(pre_cut_coordinates)
  140. pre_cut_coordinate = max(current_interval[0], pre_cut_coordinate)
  141. for interval in discontinuous[1:]:
  142. gap_len = interval[0] - current_interval[1]
  143. if (
  144. gap_len >= region.text_line_height * 3
  145. or blocks[0].label == "region"
  146. ):
  147. cut_coordinates.append(current_interval[1])
  148. elif gap_len > region.text_line_height * 1.2:
  149. pre_blocks = get_blocks_by_direction_interval(
  150. list(block_map.values()),
  151. pre_cut_coordinate,
  152. current_interval[1],
  153. cut_direction,
  154. )
  155. post_blocks = get_blocks_by_direction_interval(
  156. list(block_map.values()),
  157. current_interval[1],
  158. interval[1],
  159. cut_direction,
  160. )
  161. pre_bboxes = np.array([block.bbox for block in pre_blocks])
  162. post_bboxes = np.array([block.bbox for block in post_blocks])
  163. projection_index = 1 if cut_direction == "horizontal" else 0
  164. pre_projection = projection_by_bboxes(pre_bboxes, projection_index)
  165. post_projection = projection_by_bboxes(
  166. post_bboxes, projection_index
  167. )
  168. pre_intervals = find_local_minima_flat_regions(pre_projection)
  169. post_intervals = find_local_minima_flat_regions(post_projection)
  170. pre_gap_boxes = []
  171. if pre_intervals is not None:
  172. for start, end in pre_intervals:
  173. bbox = [0] * 4
  174. bbox[projection_index] = start
  175. bbox[projection_index + 2] = end
  176. pre_gap_boxes.append(bbox)
  177. post_gap_boxes = []
  178. if post_intervals is not None:
  179. for start, end in post_intervals:
  180. bbox = [0] * 4
  181. bbox[projection_index] = start
  182. bbox[projection_index + 2] = end
  183. post_gap_boxes.append(bbox)
  184. max_gap_boxes_num = max(len(pre_gap_boxes), len(post_gap_boxes))
  185. if max_gap_boxes_num > 0:
  186. discontinuous_intervals = calculate_discontinuous_projection(
  187. pre_gap_boxes + post_gap_boxes, direction=region.direction
  188. )
  189. if len(discontinuous_intervals) != max_gap_boxes_num:
  190. pre_cut_coordinate = current_interval[1]
  191. cut_coordinates.append(current_interval[1])
  192. current_interval = interval
  193. cut_list = get_cut_blocks(blocks, cut_direction, cut_coordinates, mask_labels)
  194. pre_cut_list.extend(cut_list)
  195. if region.direction == "vertical":
  196. pre_cut_list = pre_cut_list[::-1]
  197. return pre_cut_list
  198. def update_region_label(
  199. block: LayoutBlock,
  200. region: LayoutRegion,
  201. ) -> None:
  202. """
  203. Update the region label of a block based on its label and match the block with its children.
  204. Args:
  205. blocks (List[LayoutBlock]): The list of blocks to process.
  206. config (Dict[str, Any]): The configuration dictionary containing the necessary information.
  207. block_idx (int): The index of the current block being processed.
  208. Returns:
  209. None
  210. """
  211. if block.label in BLOCK_LABEL_MAP["header_labels"]:
  212. block.order_label = "header"
  213. elif block.label in BLOCK_LABEL_MAP["doc_title_labels"]:
  214. block.order_label = "doc_title"
  215. elif (
  216. block.label in BLOCK_LABEL_MAP["paragraph_title_labels"]
  217. and block.order_label is None
  218. ):
  219. block.order_label = "paragraph_title"
  220. elif block.label in BLOCK_LABEL_MAP["vision_labels"]:
  221. block.order_label = "vision"
  222. block.num_of_lines = 1
  223. block.update_direction(region.direction)
  224. elif block.label in BLOCK_LABEL_MAP["footer_labels"]:
  225. block.order_label = "footer"
  226. elif block.label in BLOCK_LABEL_MAP["unordered_labels"]:
  227. block.order_label = "unordered"
  228. elif block.label == "region":
  229. block.order_label = "region"
  230. else:
  231. block.order_label = "normal_text"
  232. # only vision and doc title block can have child block
  233. if block.order_label not in ["vision", "doc_title", "paragraph_title", "region"]:
  234. return
  235. # match doc title text block
  236. if block.order_label == "doc_title":
  237. update_doc_title_child_blocks(block, region)
  238. # match sub title block
  239. elif block.order_label == "paragraph_title":
  240. update_paragraph_title_child_blocks(block, region)
  241. # match vision title block and vision footnote block
  242. elif block.order_label == "vision":
  243. update_vision_child_blocks(block, region)
  244. elif block.order_label == "region":
  245. update_region_child_blocks(block, region)
  246. def get_layout_structure(
  247. blocks: List[LayoutBlock],
  248. region_direction: str,
  249. region_secondary_direction: str,
  250. ) -> Tuple[List[Dict[str, any]], bool]:
  251. """
  252. Determine the layout cross column of blocks.
  253. Args:
  254. blocks (List[Dict[str, any]]): List of block dictionaries containing 'label' and 'block_bbox'.
  255. Returns:
  256. Tuple[List[Dict[str, any]], bool]: Updated list of blocks with layout information and a boolean
  257. indicating if the cross layout area is greater than the single layout area.
  258. """
  259. blocks.sort(
  260. key=lambda x: (x.bbox[0], x.width),
  261. )
  262. mask_labels = ["doc_title", "cross_layout", "cross_reference"]
  263. for block_idx, block in enumerate(blocks):
  264. if block.order_label in mask_labels:
  265. continue
  266. for ref_idx, ref_block in enumerate(blocks):
  267. if block_idx == ref_idx or ref_block.order_label in mask_labels:
  268. continue
  269. bbox_iou = calculate_overlap_ratio(block.bbox, ref_block.bbox)
  270. if bbox_iou:
  271. if ref_block.order_label == "vision":
  272. ref_block.order_label = "cross_layout"
  273. break
  274. if bbox_iou > 0.1 and block.area < ref_block.area:
  275. block.order_label = "cross_layout"
  276. break
  277. match_projection_iou = calculate_projection_overlap_ratio(
  278. block.bbox,
  279. ref_block.bbox,
  280. region_direction,
  281. )
  282. if match_projection_iou > 0:
  283. for second_ref_idx, second_ref_block in enumerate(blocks):
  284. if (
  285. second_ref_idx in [block_idx, ref_idx]
  286. or second_ref_block.order_label in mask_labels
  287. ):
  288. continue
  289. bbox_iou = calculate_overlap_ratio(
  290. block.bbox, second_ref_block.bbox
  291. )
  292. if bbox_iou > 0.1:
  293. if second_ref_block.order_label == "vision":
  294. second_ref_block.order_label = "cross_layout"
  295. break
  296. if (
  297. block.order_label == "vision"
  298. or block.area < second_ref_block.area
  299. ):
  300. block.order_label = "cross_layout"
  301. break
  302. second_match_projection_iou = calculate_projection_overlap_ratio(
  303. block.bbox,
  304. second_ref_block.bbox,
  305. region_direction,
  306. )
  307. ref_match_projection_iou = calculate_projection_overlap_ratio(
  308. ref_block.bbox,
  309. second_ref_block.bbox,
  310. region_direction,
  311. )
  312. secondary_direction_ref_match_projection_overlap_ratio = (
  313. calculate_projection_overlap_ratio(
  314. ref_block.bbox,
  315. second_ref_block.bbox,
  316. region_secondary_direction,
  317. )
  318. )
  319. if (
  320. second_match_projection_iou > 0
  321. and ref_match_projection_iou == 0
  322. and secondary_direction_ref_match_projection_overlap_ratio > 0
  323. ):
  324. if block.order_label in ["vision", "region"] or (
  325. ref_block.order_label == "normal_text"
  326. and second_ref_block.order_label == "normal_text"
  327. and ref_block.long_side_length
  328. > ref_block.text_line_height
  329. * XYCUT_SETTINGS.get(
  330. "cross_layout_ref_text_block_words_num_threshold", 8
  331. )
  332. and second_ref_block.long_side_length
  333. > second_ref_block.text_line_height
  334. * XYCUT_SETTINGS.get(
  335. "cross_layout_ref_text_block_words_num_threshold", 8
  336. )
  337. ):
  338. block.order_label = (
  339. "cross_reference"
  340. if block.label == "reference"
  341. else "cross_layout"
  342. )
  343. def sort_by_xycut(
  344. block_bboxes: List,
  345. direction: str = "vertical",
  346. min_gap: int = 1,
  347. ) -> List[int]:
  348. """
  349. Sort bounding boxes using recursive XY cut method based on the specified direction.
  350. Args:
  351. block_bboxes (Union[np.ndarray, List[List[int]]]): An array or list of bounding boxes,
  352. where each box is represented as
  353. [x_min, y_min, x_max, y_max].
  354. direction (int): direction for the initial cut. Use 1 for Y-axis first and 0 for X-axis first.
  355. Defaults to 0.
  356. min_gap (int): Minimum gap width to consider a separation between segments. Defaults to 1.
  357. Returns:
  358. List[int]: A list of indices representing the order of sorted bounding boxes.
  359. """
  360. block_bboxes = np.asarray(block_bboxes).astype(int)
  361. res = []
  362. if direction == "vertical":
  363. recursive_yx_cut(
  364. block_bboxes,
  365. np.arange(len(block_bboxes)).tolist(),
  366. res,
  367. min_gap,
  368. )
  369. else:
  370. recursive_xy_cut(
  371. block_bboxes,
  372. np.arange(len(block_bboxes)).tolist(),
  373. res,
  374. min_gap,
  375. )
  376. return res
  377. def match_unsorted_blocks(
  378. sorted_blocks: List[LayoutBlock],
  379. unsorted_blocks: List[LayoutBlock],
  380. region: LayoutRegion,
  381. ) -> List[LayoutBlock]:
  382. """
  383. Match special blocks with the sorted blocks based on their region labels.
  384. Args:
  385. sorted_blocks (List[LayoutBlock]): Sorted blocks to be matched.
  386. unsorted_blocks (List[LayoutBlock]): Unsorted blocks to be matched.
  387. config (Dict): Configuration dictionary containing various parameters.
  388. median_width (int): Median width value used for calculations.
  389. Returns:
  390. List[LayoutBlock]: The updated sorted blocks after matching special blocks.
  391. """
  392. distance_type_map = {
  393. "cross_layout": weighted_distance_insert,
  394. "paragraph_title": weighted_distance_insert,
  395. "doc_title": weighted_distance_insert,
  396. "vision_title": weighted_distance_insert,
  397. "vision": weighted_distance_insert,
  398. "cross_reference": reference_insert,
  399. "unordered": manhattan_insert,
  400. "other": manhattan_insert,
  401. "region": euclidean_insert,
  402. }
  403. unsorted_blocks = sort_normal_blocks(
  404. unsorted_blocks,
  405. region.text_line_height,
  406. region.text_line_width,
  407. region.direction,
  408. )
  409. for idx, block in enumerate(unsorted_blocks):
  410. order_label = block.order_label if block.label != "region" else "region"
  411. if idx == 0 and order_label == "doc_title":
  412. sorted_blocks.insert(0, block)
  413. continue
  414. sorted_blocks = distance_type_map[order_label](
  415. block=block, sorted_blocks=sorted_blocks, region=region
  416. )
  417. return sorted_blocks
  418. def xycut_enhanced(
  419. region: LayoutRegion,
  420. ) -> LayoutRegion:
  421. """
  422. xycut_enhance function performs the following steps:
  423. 1. Preprocess the input blocks by extracting headers, footers, and pre-cut blocks.
  424. 2. Mask blocks that are crossing different blocks.
  425. 3. Perform xycut_enhanced algorithm on the remaining blocks.
  426. 4. Match unsorted blocks with the sorted blocks based on their order labels.
  427. 5. Update child blocks of the sorted blocks based on their parent blocks.
  428. 6. Return the ordered result list.
  429. Args:
  430. blocks (List[LayoutBlock]): Input blocks to be processed.
  431. Returns:
  432. List[LayoutBlock]: Ordered result list after processing.
  433. """
  434. if len(region.block_map) == 0:
  435. return []
  436. pre_cut_list: List[List[LayoutBlock]] = pre_process(region)
  437. final_order_res_list: List[LayoutBlock] = []
  438. header_blocks: List[LayoutBlock] = [
  439. region.block_map[idx] for idx in region.header_block_idxes
  440. ]
  441. unordered_blocks: List[LayoutBlock] = [
  442. region.block_map[idx] for idx in region.unordered_block_idxes
  443. ]
  444. footer_blocks: List[LayoutBlock] = [
  445. region.block_map[idx] for idx in region.footer_block_idxes
  446. ]
  447. header_blocks: List[LayoutBlock] = sort_normal_blocks(
  448. header_blocks, region.text_line_height, region.text_line_width, region.direction
  449. )
  450. footer_blocks: List[LayoutBlock] = sort_normal_blocks(
  451. footer_blocks, region.text_line_height, region.text_line_width, region.direction
  452. )
  453. unordered_blocks: List[LayoutBlock] = sort_normal_blocks(
  454. unordered_blocks,
  455. region.text_line_height,
  456. region.text_line_width,
  457. region.direction,
  458. )
  459. final_order_res_list.extend(header_blocks)
  460. unsorted_blocks: List[LayoutBlock] = []
  461. sorted_blocks_by_pre_cuts: List[LayoutBlock] = []
  462. for pre_cut_blocks in pre_cut_list:
  463. sorted_blocks: List[LayoutBlock] = []
  464. doc_title_blocks: List[LayoutBlock] = []
  465. xy_cut_blocks: List[LayoutBlock] = []
  466. if pre_cut_blocks and pre_cut_blocks[0].label == "region":
  467. block_bboxes = np.array([block.bbox for block in pre_cut_blocks])
  468. discontinuous = calculate_discontinuous_projection(
  469. block_bboxes, direction=region.direction
  470. )
  471. if len(discontinuous) == 1:
  472. get_layout_structure(
  473. pre_cut_blocks, region.direction, region.secondary_direction
  474. )
  475. else:
  476. get_layout_structure(
  477. pre_cut_blocks, region.direction, region.secondary_direction
  478. )
  479. # Get xy cut blocks and add other blocks in special_block_map
  480. for block in pre_cut_blocks:
  481. if block.order_label not in [
  482. "cross_layout",
  483. "cross_reference",
  484. "doc_title",
  485. "unordered",
  486. ]:
  487. xy_cut_blocks.append(block)
  488. elif block.label == "doc_title":
  489. doc_title_blocks.append(block)
  490. else:
  491. unsorted_blocks.append(block)
  492. if len(xy_cut_blocks) > 0:
  493. block_bboxes = np.array([block.bbox for block in xy_cut_blocks])
  494. block_text_lines = [block.num_of_lines for block in xy_cut_blocks]
  495. discontinuous = calculate_discontinuous_projection(
  496. block_bboxes, direction=region.direction
  497. )
  498. blocks_to_sort = deepcopy(xy_cut_blocks)
  499. if region.direction == "vertical":
  500. for block in blocks_to_sort:
  501. block.bbox = np.array(
  502. [-block.bbox[0], block.bbox[1], -block.bbox[2], block.bbox[3]]
  503. )
  504. if len(discontinuous) == 1 or max(block_text_lines) == 1:
  505. blocks_to_sort.sort(
  506. key=lambda x: (
  507. x.bbox[region.secondary_direction_start_index]
  508. // (region.text_line_height // 2),
  509. x.bbox[region.direction_start_index],
  510. )
  511. )
  512. blocks_to_sort = shrink_overlapping_boxes(
  513. blocks_to_sort, region.secondary_direction
  514. )
  515. block_bboxes = np.array([block.bbox for block in blocks_to_sort])
  516. sorted_indexes = sort_by_xycut(
  517. block_bboxes, direction=region.secondary_direction, min_gap=1
  518. )
  519. else:
  520. blocks_to_sort.sort(
  521. key=lambda x: (
  522. x.bbox[region.direction_start_index]
  523. // (region.text_line_width // 2),
  524. x.bbox[region.secondary_direction_start_index],
  525. )
  526. )
  527. blocks_to_sort = shrink_overlapping_boxes(
  528. blocks_to_sort, region.secondary_direction
  529. )
  530. block_bboxes = np.array([block.bbox for block in blocks_to_sort])
  531. sorted_indexes = sort_by_xycut(
  532. block_bboxes, direction=region.direction, min_gap=1
  533. )
  534. sorted_blocks = [
  535. region.block_map[blocks_to_sort[i].index] for i in sorted_indexes
  536. ]
  537. sorted_blocks = match_unsorted_blocks(
  538. sorted_blocks,
  539. doc_title_blocks,
  540. region=region,
  541. )
  542. if unsorted_blocks and unsorted_blocks[0].label == "region":
  543. sorted_blocks = match_unsorted_blocks(
  544. sorted_blocks,
  545. unsorted_blocks,
  546. region=region,
  547. )
  548. unsorted_blocks = []
  549. sorted_blocks_by_pre_cuts.extend(sorted_blocks)
  550. final_sorted_blocks = match_unsorted_blocks(
  551. sorted_blocks_by_pre_cuts,
  552. unsorted_blocks,
  553. region=region,
  554. )
  555. final_order_res_list.extend(final_sorted_blocks)
  556. final_order_res_list.extend(footer_blocks)
  557. final_order_res_list.extend(unordered_blocks)
  558. for block_idx, block in enumerate(final_order_res_list):
  559. final_order_res_list = insert_child_blocks(
  560. block, block_idx, final_order_res_list
  561. )
  562. block = final_order_res_list[block_idx]
  563. return final_order_res_list