utils.py 35 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961
  1. # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. from typing import Dict, List, Tuple
  15. import numpy as np
  16. from ..result_v2 import LayoutParsingBlock
  17. from ..utils import calculate_projection_overlap_ratio
  18. def get_nearest_edge_distance(
  19. bbox1: List[int],
  20. bbox2: List[int],
  21. weight: List[float] = [1.0, 1.0, 1.0, 1.0],
  22. ) -> Tuple[float]:
  23. """
  24. Calculate the nearest edge distance between two bounding boxes, considering orientational weights.
  25. Args:
  26. bbox1 (list): The bounding box coordinates [x1, y1, x2, y2] of the input object.
  27. bbox2 (list): The bounding box coordinates [x1', y1', x2', y2'] of the object to match against.
  28. weight (list, optional): orientational weights for the edge distances [left, right, up, down]. Defaults to [1, 1, 1, 1].
  29. Returns:
  30. float: The calculated minimum edge distance between the bounding boxes.
  31. """
  32. x1, y1, x2, y2 = bbox1
  33. x1_prime, y1_prime, x2_prime, y2_prime = bbox2
  34. min_x_distance, min_y_distance = 0, 0
  35. horizontal_iou = calculate_projection_overlap_ratio(bbox1, bbox2, "horizontal")
  36. vertical_iou = calculate_projection_overlap_ratio(bbox1, bbox2, "vertical")
  37. if horizontal_iou > 0 and vertical_iou > 0:
  38. return 0.0
  39. if horizontal_iou == 0:
  40. min_x_distance = min(abs(x1 - x2_prime), abs(x2 - x1_prime)) * (
  41. weight[0] if x2 < x1_prime else weight[1]
  42. )
  43. if vertical_iou == 0:
  44. min_y_distance = min(abs(y1 - y2_prime), abs(y2 - y1_prime)) * (
  45. weight[2] if y2 < y1_prime else weight[3]
  46. )
  47. return min_x_distance + min_y_distance
  48. def _projection_by_bboxes(boxes: np.ndarray, axis: int) -> np.ndarray:
  49. """
  50. Generate a 1D projection histogram from bounding boxes along a specified axis.
  51. Args:
  52. boxes: A (N, 4) array of bounding boxes defined by [x_min, y_min, x_max, y_max].
  53. axis: Axis for projection; 0 for horizontal (x-axis), 1 for vertical (y-axis).
  54. Returns:
  55. A 1D numpy array representing the projection histogram based on bounding box intervals.
  56. """
  57. assert axis in [0, 1]
  58. max_length = np.max(boxes[:, axis::2])
  59. projection = np.zeros(max_length, dtype=int)
  60. # Increment projection histogram over the interval defined by each bounding box
  61. for start, end in boxes[:, axis::2]:
  62. projection[start:end] += 1
  63. return projection
  64. def _split_projection_profile(arr_values: np.ndarray, min_value: float, min_gap: float):
  65. """
  66. Split the projection profile into segments based on specified thresholds.
  67. Args:
  68. arr_values: 1D array representing the projection profile.
  69. min_value: Minimum value threshold to consider a profile segment significant.
  70. min_gap: Minimum gap width to consider a separation between segments.
  71. Returns:
  72. A tuple of start and end indices for each segment that meets the criteria.
  73. """
  74. # Identify indices where the projection exceeds the minimum value
  75. significant_indices = np.where(arr_values > min_value)[0]
  76. if not len(significant_indices):
  77. return
  78. # Calculate gaps between significant indices
  79. index_diffs = significant_indices[1:] - significant_indices[:-1]
  80. gap_indices = np.where(index_diffs > min_gap)[0]
  81. # Determine start and end indices of segments
  82. segment_starts = np.insert(
  83. significant_indices[gap_indices + 1],
  84. 0,
  85. significant_indices[0],
  86. )
  87. segment_ends = np.append(
  88. significant_indices[gap_indices],
  89. significant_indices[-1] + 1,
  90. )
  91. return segment_starts, segment_ends
  92. def recursive_yx_cut(
  93. boxes: np.ndarray, indices: List[int], res: List[int], min_gap: int = 1
  94. ):
  95. """
  96. Recursively project and segment bounding boxes, starting with Y-axis and followed by X-axis.
  97. Args:
  98. boxes: A (N, 4) array representing bounding boxes.
  99. indices: List of indices indicating the original position of boxes.
  100. res: List to store indices of the final segmented bounding boxes.
  101. min_gap (int): Minimum gap width to consider a separation between segments on the X-axis. Defaults to 1.
  102. Returns:
  103. None: This function modifies the `res` list in place.
  104. """
  105. assert len(boxes) == len(
  106. indices
  107. ), "The length of boxes and indices must be the same."
  108. # Sort by y_min for Y-axis projection
  109. y_sorted_indices = boxes[:, 1].argsort()
  110. y_sorted_boxes = boxes[y_sorted_indices]
  111. y_sorted_indices = np.array(indices)[y_sorted_indices]
  112. # Perform Y-axis projection
  113. y_projection = _projection_by_bboxes(boxes=y_sorted_boxes, axis=1)
  114. y_intervals = _split_projection_profile(y_projection, 0, 1)
  115. if not y_intervals:
  116. return
  117. # Process each segment defined by Y-axis projection
  118. for y_start, y_end in zip(*y_intervals):
  119. # Select boxes within the current y interval
  120. y_interval_indices = (y_start <= y_sorted_boxes[:, 1]) & (
  121. y_sorted_boxes[:, 1] < y_end
  122. )
  123. y_boxes_chunk = y_sorted_boxes[y_interval_indices]
  124. y_indices_chunk = y_sorted_indices[y_interval_indices]
  125. # Sort by x_min for X-axis projection
  126. x_sorted_indices = y_boxes_chunk[:, 0].argsort()
  127. x_sorted_boxes_chunk = y_boxes_chunk[x_sorted_indices]
  128. x_sorted_indices_chunk = y_indices_chunk[x_sorted_indices]
  129. # Perform X-axis projection
  130. x_projection = _projection_by_bboxes(boxes=x_sorted_boxes_chunk, axis=0)
  131. x_intervals = _split_projection_profile(x_projection, 0, min_gap)
  132. if not x_intervals:
  133. continue
  134. # If X-axis cannot be further segmented, add current indices to results
  135. if len(x_intervals[0]) == 1:
  136. res.extend(x_sorted_indices_chunk)
  137. continue
  138. # Recursively process each segment defined by X-axis projection
  139. for x_start, x_end in zip(*x_intervals):
  140. x_interval_indices = (x_start <= x_sorted_boxes_chunk[:, 0]) & (
  141. x_sorted_boxes_chunk[:, 0] < x_end
  142. )
  143. recursive_yx_cut(
  144. x_sorted_boxes_chunk[x_interval_indices],
  145. x_sorted_indices_chunk[x_interval_indices],
  146. res,
  147. )
  148. def recursive_xy_cut(
  149. boxes: np.ndarray, indices: List[int], res: List[int], min_gap: int = 1
  150. ):
  151. """
  152. Recursively performs X-axis projection followed by Y-axis projection to segment bounding boxes.
  153. Args:
  154. boxes: A (N, 4) array representing bounding boxes with [x_min, y_min, x_max, y_max].
  155. indices: A list of indices representing the position of boxes in the original data.
  156. res: A list to store indices of bounding boxes that meet the criteria.
  157. min_gap (int): Minimum gap width to consider a separation between segments on the X-axis. Defaults to 1.
  158. Returns:
  159. None: This function modifies the `res` list in place.
  160. """
  161. # Ensure boxes and indices have the same length
  162. assert len(boxes) == len(
  163. indices
  164. ), "The length of boxes and indices must be the same."
  165. # Sort by x_min to prepare for X-axis projection
  166. x_sorted_indices = boxes[:, 0].argsort()
  167. x_sorted_boxes = boxes[x_sorted_indices]
  168. x_sorted_indices = np.array(indices)[x_sorted_indices]
  169. # Perform X-axis projection
  170. x_projection = _projection_by_bboxes(boxes=x_sorted_boxes, axis=0)
  171. x_intervals = _split_projection_profile(x_projection, 0, 1)
  172. if not x_intervals:
  173. return
  174. # Process each segment defined by X-axis projection
  175. for x_start, x_end in zip(*x_intervals):
  176. # Select boxes within the current x interval
  177. x_interval_indices = (x_start <= x_sorted_boxes[:, 0]) & (
  178. x_sorted_boxes[:, 0] < x_end
  179. )
  180. x_boxes_chunk = x_sorted_boxes[x_interval_indices]
  181. x_indices_chunk = x_sorted_indices[x_interval_indices]
  182. # Sort selected boxes by y_min to prepare for Y-axis projection
  183. y_sorted_indices = x_boxes_chunk[:, 1].argsort()
  184. y_sorted_boxes_chunk = x_boxes_chunk[y_sorted_indices]
  185. y_sorted_indices_chunk = x_indices_chunk[y_sorted_indices]
  186. # Perform Y-axis projection
  187. y_projection = _projection_by_bboxes(boxes=y_sorted_boxes_chunk, axis=1)
  188. y_intervals = _split_projection_profile(y_projection, 0, min_gap)
  189. if not y_intervals:
  190. continue
  191. # If Y-axis cannot be further segmented, add current indices to results
  192. if len(y_intervals[0]) == 1:
  193. res.extend(y_sorted_indices_chunk)
  194. continue
  195. # Recursively process each segment defined by Y-axis projection
  196. for y_start, y_end in zip(*y_intervals):
  197. y_interval_indices = (y_start <= y_sorted_boxes_chunk[:, 1]) & (
  198. y_sorted_boxes_chunk[:, 1] < y_end
  199. )
  200. recursive_xy_cut(
  201. y_sorted_boxes_chunk[y_interval_indices],
  202. y_sorted_indices_chunk[y_interval_indices],
  203. res,
  204. )
  205. def reference_insert(
  206. block: LayoutParsingBlock,
  207. sorted_blocks: List[LayoutParsingBlock],
  208. config: Dict,
  209. median_width: float = 0.0,
  210. ):
  211. """
  212. Insert reference block into sorted blocks based on the distance between the block and the nearest sorted block.
  213. Args:
  214. block: The block to insert into the sorted blocks.
  215. sorted_blocks: The sorted blocks where the new block will be inserted.
  216. config: Configuration dictionary containing parameters related to the layout parsing.
  217. median_width: Median width of the document. Defaults to 0.0.
  218. Returns:
  219. sorted_blocks: The updated sorted blocks after insertion.
  220. """
  221. min_distance = float("inf")
  222. nearest_sorted_block_index = 0
  223. for sorted_block_idx, sorted_block in enumerate(sorted_blocks):
  224. if sorted_block.bbox[3] <= block.bbox[1]:
  225. distance = -(sorted_block.bbox[2] * 10 + sorted_block.bbox[3])
  226. if distance < min_distance:
  227. min_distance = distance
  228. nearest_sorted_block_index = sorted_block_idx
  229. sorted_blocks.insert(nearest_sorted_block_index + 1, block)
  230. return sorted_blocks
  231. def manhattan_insert(
  232. block: LayoutParsingBlock,
  233. sorted_blocks: List[LayoutParsingBlock],
  234. config: Dict,
  235. median_width: float = 0.0,
  236. ):
  237. """
  238. Insert a block into a sorted list of blocks based on the Manhattan distance between the block and the nearest sorted block.
  239. Args:
  240. block: The block to insert into the sorted blocks.
  241. sorted_blocks: The sorted blocks where the new block will be inserted.
  242. config: Configuration dictionary containing parameters related to the layout parsing.
  243. median_width: Median width of the document. Defaults to 0.0.
  244. Returns:
  245. sorted_blocks: The updated sorted blocks after insertion.
  246. """
  247. min_distance = float("inf")
  248. nearest_sorted_block_index = 0
  249. for sorted_block_idx, sorted_block in enumerate(sorted_blocks):
  250. distance = _manhattan_distance(block.bbox, sorted_block.bbox)
  251. if distance < min_distance:
  252. min_distance = distance
  253. nearest_sorted_block_index = sorted_block_idx
  254. sorted_blocks.insert(nearest_sorted_block_index + 1, block)
  255. return sorted_blocks
  256. def weighted_distance_insert(
  257. block: LayoutParsingBlock,
  258. sorted_blocks: List[LayoutParsingBlock],
  259. config: Dict,
  260. median_width: float = 0.0,
  261. ):
  262. """
  263. Insert a block into a sorted list of blocks based on the weighted distance between the block and the nearest sorted block.
  264. Args:
  265. block: The block to insert into the sorted blocks.
  266. sorted_blocks: The sorted blocks where the new block will be inserted.
  267. config: Configuration dictionary containing parameters related to the layout parsing.
  268. median_width: Median width of the document. Defaults to 0.0.
  269. Returns:
  270. sorted_blocks: The updated sorted blocks after insertion.
  271. """
  272. doc_title_labels = config.get("doc_title_labels", [])
  273. paragraph_title_labels = config.get("paragraph_title_labels", [])
  274. vision_labels = config.get("vision_labels", [])
  275. xy_cut_block_labels = config.get("xy_cut_block_labels", [])
  276. tolerance_len = config.get("tolerance_len", 2)
  277. x1, y1, x2, y2 = block.bbox
  278. min_weighted_distance, min_edge_distance, min_up_edge_distance = (
  279. float("inf"),
  280. float("inf"),
  281. float("inf"),
  282. )
  283. nearest_sorted_block_index = 0
  284. for sorted_block_idx, sorted_block in enumerate(sorted_blocks):
  285. x1_prime, y1_prime, x2_prime, y2_prime = sorted_block.bbox
  286. # Calculate edge distance
  287. weight = _get_weights(block.order_label, block.orientation)
  288. edge_distance = get_nearest_edge_distance(block.bbox, sorted_block.bbox, weight)
  289. if block.label in doc_title_labels:
  290. disperse = max(1, median_width)
  291. tolerance_len = max(tolerance_len, disperse)
  292. if block.label == "abstract":
  293. tolerance_len *= 2
  294. edge_distance = max(0.1, edge_distance) * 10
  295. # Calculate up edge distances
  296. up_edge_distance = y1_prime
  297. left_edge_distance = x1_prime
  298. if (
  299. block.label in xy_cut_block_labels
  300. or block.label in doc_title_labels
  301. or block.label in paragraph_title_labels
  302. or block.label in vision_labels
  303. ) and y1 > y2_prime:
  304. up_edge_distance = -y2_prime
  305. left_edge_distance = -x2_prime
  306. if abs(min_up_edge_distance - up_edge_distance) <= tolerance_len:
  307. up_edge_distance = min_up_edge_distance
  308. # Calculate weighted distance
  309. weighted_distance = (
  310. +edge_distance * config.get("edge_weight", 10**4)
  311. + up_edge_distance * config.get("up_edge_weight", 1)
  312. + left_edge_distance * config.get("left_edge_weight", 0.0001)
  313. )
  314. min_edge_distance = min(edge_distance, min_edge_distance)
  315. min_up_edge_distance = min(up_edge_distance, min_up_edge_distance)
  316. if weighted_distance < min_weighted_distance:
  317. nearest_sorted_block_index = sorted_block_idx
  318. min_weighted_distance = weighted_distance
  319. if y1 > y1_prime or (y1 == y1_prime and x1 > x1_prime):
  320. nearest_sorted_block_index = sorted_block_idx + 1
  321. sorted_blocks.insert(nearest_sorted_block_index, block)
  322. return sorted_blocks
  323. def insert_child_blocks(
  324. block: LayoutParsingBlock,
  325. block_idx: int,
  326. sorted_blocks: List[LayoutParsingBlock],
  327. ) -> List[LayoutParsingBlock]:
  328. """
  329. Insert child blocks of a block into the sorted blocks list.
  330. Args:
  331. block: The parent block whose child blocks need to be inserted.
  332. block_idx: Index at which the parent block exists in the sorted blocks list.
  333. sorted_blocks: Sorted blocks list where the child blocks are to be inserted.
  334. Returns:
  335. sorted_blocks: Updated sorted blocks list after inserting child blocks.
  336. """
  337. if block.child_blocks:
  338. sub_blocks = block.get_child_blocks()
  339. sub_blocks.append(block)
  340. sub_blocks = sort_child_blocks(sub_blocks, block.orientation)
  341. sorted_blocks[block_idx] = sub_blocks[0]
  342. for block in sub_blocks[1:]:
  343. block_idx += 1
  344. sorted_blocks.insert(block_idx, block)
  345. return sorted_blocks
  346. def sort_child_blocks(blocks, orientation="horizontal") -> List[LayoutParsingBlock]:
  347. """
  348. Sort child blocks based on their bounding box coordinates.
  349. Args:
  350. blocks: A list of LayoutParsingBlock objects representing the child blocks.
  351. orientation: Orientation of the blocks ('horizontal' or 'vertical'). Default is 'horizontal'.
  352. Returns:
  353. sorted_blocks: A sorted list of LayoutParsingBlock objects.
  354. """
  355. if orientation == "horizontal":
  356. # from top to bottom
  357. blocks.sort(
  358. key=lambda x: (
  359. x.bbox[1], # y_min
  360. x.bbox[0], # x_min
  361. x.bbox[1] ** 2 + x.bbox[0] ** 2, # distance with (0,0)
  362. ),
  363. reverse=False,
  364. )
  365. else:
  366. # from right to left
  367. blocks.sort(
  368. key=lambda x: (
  369. x.bbox[0], # x_min
  370. x.bbox[1], # y_min
  371. x.bbox[1] ** 2 + x.bbox[0] ** 2, # distance with (0,0)
  372. ),
  373. reverse=True,
  374. )
  375. return blocks
  376. def _get_weights(label, dircetion="horizontal"):
  377. """Define weights based on the label and orientation."""
  378. if label == "doc_title":
  379. return (
  380. [1, 0.1, 0.1, 1] if dircetion == "horizontal" else [0.2, 0.1, 1, 1]
  381. ) # left-down , right-left
  382. elif label in [
  383. "paragraph_title",
  384. "table_title",
  385. "abstract",
  386. "image",
  387. "seal",
  388. "chart",
  389. "figure",
  390. ]:
  391. return [1, 1, 0.1, 1] # down
  392. else:
  393. return [1, 1, 1, 0.1] # up
  394. def _manhattan_distance(
  395. point1: Tuple[float, float],
  396. point2: Tuple[float, float],
  397. weight_x: float = 1.0,
  398. weight_y: float = 1.0,
  399. ) -> float:
  400. """
  401. Calculate the weighted Manhattan distance between two points.
  402. Args:
  403. point1 (Tuple[float, float]): The first point as (x, y).
  404. point2 (Tuple[float, float]): The second point as (x, y).
  405. weight_x (float): The weight for the x-axis distance. Default is 1.0.
  406. weight_y (float): The weight for the y-axis distance. Default is 1.0.
  407. Returns:
  408. float: The weighted Manhattan distance between the two points.
  409. """
  410. return weight_x * abs(point1[0] - point2[0]) + weight_y * abs(point1[1] - point2[1])
  411. def sort_blocks(blocks, median_width=None, reverse=False):
  412. """
  413. Sort blocks based on their y_min, x_min and distance with (0,0).
  414. Args:
  415. blocks (list): list of blocks to be sorted.
  416. median_width (int): the median width of the text blocks.
  417. reverse (bool, optional): whether to sort in descending order. Default is False.
  418. Returns:
  419. list: a list of sorted blocks.
  420. """
  421. if median_width is None:
  422. median_width = 1
  423. blocks.sort(
  424. key=lambda x: (
  425. x.bbox[1] // 10, # y_min
  426. x.bbox[0] // median_width, # x_min
  427. x.bbox[1] ** 2 + x.bbox[0] ** 2, # distance with (0,0)
  428. ),
  429. reverse=reverse,
  430. )
  431. return blocks
  432. def get_cut_blocks(
  433. blocks, cut_orientation, cut_coordinates, overall_region_box, mask_labels=[]
  434. ):
  435. """
  436. Cut blocks based on the given cut orientation and coordinates.
  437. Args:
  438. blocks (list): list of blocks to be cut.
  439. cut_orientation (str): cut orientation, either "horizontal" or "vertical".
  440. cut_coordinates (list): list of cut coordinates.
  441. overall_region_box (list): the overall region box that contains all blocks.
  442. Returns:
  443. list: a list of tuples containing the cutted blocks and their corresponding mean width。
  444. """
  445. cuted_list = []
  446. # filter out mask blocks,including header, footer, unordered and child_blocks
  447. # 0: horizontal, 1: vertical
  448. cut_aixis = 0 if cut_orientation == "horizontal" else 1
  449. blocks.sort(key=lambda x: x.bbox[cut_aixis + 2])
  450. cut_coordinates.append(float("inf"))
  451. cut_coordinates = list(set(cut_coordinates))
  452. cut_coordinates.sort()
  453. cut_idx = 0
  454. for cut_coordinate in cut_coordinates:
  455. group_blocks = []
  456. block_idx = cut_idx
  457. while block_idx < len(blocks):
  458. block = blocks[block_idx]
  459. if block.bbox[cut_aixis + 2] > cut_coordinate:
  460. break
  461. elif block.order_label not in mask_labels:
  462. group_blocks.append(block)
  463. block_idx += 1
  464. cut_idx = block_idx
  465. if group_blocks:
  466. cuted_list.append(group_blocks)
  467. return cuted_list
  468. def add_split_block(
  469. blocks: List[LayoutParsingBlock], region_bbox: List[int]
  470. ) -> List[LayoutParsingBlock]:
  471. block_bboxes = np.array([block.bbox for block in blocks])
  472. discontinuous = calculate_discontinuous_projection(
  473. block_bboxes, orientation="vertical"
  474. )
  475. current_interval = discontinuous[0]
  476. for interval in discontinuous[1:]:
  477. gap_len = interval[0] - current_interval[1]
  478. if gap_len > 40:
  479. x1, _, x2, __ = region_bbox
  480. y1 = current_interval[1] + 5
  481. y2 = interval[0] - 5
  482. bbox = [x1, y1, x2, y2]
  483. split_block = LayoutParsingBlock(label="split", bbox=bbox)
  484. blocks.append(split_block)
  485. current_interval = interval
  486. def get_adjacent_blocks_by_orientation(
  487. blocks: List[LayoutParsingBlock],
  488. block_idx: int,
  489. ref_block_idxes: List[int],
  490. iou_threshold,
  491. ) -> List:
  492. """
  493. Get the adjacent blocks with the same orientation as the current block.
  494. Args:
  495. block (LayoutParsingBlock): The current block.
  496. blocks (List[LayoutParsingBlock]): A list of all blocks.
  497. ref_block_idxes (List[int]): A list of indices of reference blocks.
  498. iou_threshold (float): The IOU threshold to determine if two blocks are considered adjacent.
  499. Returns:
  500. Int: The index of the previous block with same orientation.
  501. Int: The index of the following block with same orientation.
  502. """
  503. min_prev_block_distance = float("inf")
  504. prev_block_index = None
  505. min_post_block_distance = float("inf")
  506. post_block_index = None
  507. block = blocks[block_idx]
  508. child_labels = [
  509. "vision_footnote",
  510. "sub_paragraph_title",
  511. "doc_title_text",
  512. "vision_title",
  513. ]
  514. # find the nearest text block with same orientation to the current block
  515. for ref_block_idx in ref_block_idxes:
  516. ref_block = blocks[ref_block_idx]
  517. ref_block_orientation = ref_block.orientation
  518. if ref_block.order_label in child_labels:
  519. continue
  520. match_block_iou = calculate_projection_overlap_ratio(
  521. block.bbox,
  522. ref_block.bbox,
  523. ref_block_orientation,
  524. )
  525. child_match_distance_tolerance_len = block.short_side_length / 10
  526. if block.order_label == "vision":
  527. if ref_block.num_of_lines == 1:
  528. gap_tolerance_len = ref_block.short_side_length * 2
  529. else:
  530. gap_tolerance_len = block.short_side_length / 10
  531. else:
  532. gap_tolerance_len = block.short_side_length * 2
  533. if match_block_iou >= iou_threshold:
  534. prev_distance = (
  535. block.secondary_orientation_start_coordinate
  536. - ref_block.secondary_orientation_end_coordinate
  537. + child_match_distance_tolerance_len
  538. ) // 5 + ref_block.start_coordinate / 5000
  539. next_distance = (
  540. ref_block.secondary_orientation_start_coordinate
  541. - block.secondary_orientation_end_coordinate
  542. + child_match_distance_tolerance_len
  543. ) // 5 + ref_block.start_coordinate / 5000
  544. if (
  545. ref_block.secondary_orientation_end_coordinate
  546. <= block.secondary_orientation_start_coordinate
  547. + child_match_distance_tolerance_len
  548. and prev_distance < min_prev_block_distance
  549. ):
  550. min_prev_block_distance = prev_distance
  551. if (
  552. block.secondary_orientation_start_coordinate
  553. - ref_block.secondary_orientation_end_coordinate
  554. < gap_tolerance_len
  555. ):
  556. prev_block_index = ref_block_idx
  557. elif (
  558. ref_block.secondary_orientation_start_coordinate
  559. > block.secondary_orientation_end_coordinate
  560. - child_match_distance_tolerance_len
  561. and next_distance < min_post_block_distance
  562. ):
  563. min_post_block_distance = next_distance
  564. if (
  565. ref_block.secondary_orientation_start_coordinate
  566. - block.secondary_orientation_end_coordinate
  567. < gap_tolerance_len
  568. ):
  569. post_block_index = ref_block_idx
  570. diff_dist = abs(min_prev_block_distance - min_post_block_distance)
  571. # if the difference in distance is too large, only consider the nearest one
  572. if diff_dist * 5 > block.short_side_length:
  573. if min_prev_block_distance < min_post_block_distance:
  574. post_block_index = None
  575. else:
  576. prev_block_index = None
  577. return prev_block_index, post_block_index
  578. def update_doc_title_child_blocks(
  579. blocks: List[LayoutParsingBlock],
  580. block: LayoutParsingBlock,
  581. prev_idx: int,
  582. post_idx: int,
  583. config: dict,
  584. ) -> None:
  585. """
  586. Update the child blocks of a document title block.
  587. The child blocks need to meet the following conditions:
  588. 1. They must be adjacent
  589. 2. They must have the same orientation as the parent block.
  590. 3. Their short side length should be less than 80% of the parent's short side length.
  591. 4. Their long side length should be less than 150% of the parent's long side length.
  592. 5. The child block must be text block.
  593. Args:
  594. blocks (List[LayoutParsingBlock]): overall blocks.
  595. block (LayoutParsingBlock): document title block.
  596. prev_idx (int): previous block index, None if not exist.
  597. post_idx (int): post block index, None if not exist.
  598. config (dict): configurations.
  599. Returns:
  600. None
  601. """
  602. for idx in [prev_idx, post_idx]:
  603. if idx is None:
  604. continue
  605. ref_block = blocks[idx]
  606. with_seem_orientation = ref_block.orientation == block.orientation
  607. short_side_length_condition = (
  608. ref_block.short_side_length < block.short_side_length * 0.8
  609. )
  610. long_side_length_condition = (
  611. ref_block.long_side_length < block.long_side_length
  612. or ref_block.long_side_length > 1.5 * block.long_side_length
  613. )
  614. if (
  615. with_seem_orientation
  616. and short_side_length_condition
  617. and long_side_length_condition
  618. and ref_block.num_of_lines < 3
  619. ):
  620. ref_block.order_label = "doc_title_text"
  621. block.append_child_block(ref_block)
  622. config["text_block_idxes"].remove(idx)
  623. def update_paragraph_title_child_blocks(
  624. blocks: List[LayoutParsingBlock],
  625. block: LayoutParsingBlock,
  626. prev_idx: int,
  627. post_idx: int,
  628. config: dict,
  629. ) -> None:
  630. """
  631. Update the child blocks of a paragraph title block.
  632. The child blocks need to meet the following conditions:
  633. 1. They must be adjacent
  634. 2. They must have the same orientation as the parent block.
  635. 3. The child block must be paragraph title block.
  636. Args:
  637. blocks (List[LayoutParsingBlock]): overall blocks.
  638. block (LayoutParsingBlock): document title block.
  639. prev_idx (int): previous block index, None if not exist.
  640. post_idx (int): post block index, None if not exist.
  641. config (dict): configurations.
  642. Returns:
  643. None
  644. """
  645. paragraph_title_labels = config.get("paragraph_title_labels", [])
  646. for idx in [prev_idx, post_idx]:
  647. if idx is None:
  648. continue
  649. ref_block = blocks[idx]
  650. min_height = min(block.height, ref_block.height)
  651. nearest_edge_distance = get_nearest_edge_distance(block.bbox, ref_block.bbox)
  652. with_seem_orientation = ref_block.orientation == block.orientation
  653. if (
  654. with_seem_orientation
  655. and ref_block.label in paragraph_title_labels
  656. and nearest_edge_distance <= min_height * 2
  657. ):
  658. ref_block.order_label = "sub_paragraph_title"
  659. block.append_child_block(ref_block)
  660. config["paragraph_title_block_idxes"].remove(idx)
  661. def update_vision_child_blocks(
  662. blocks: List[LayoutParsingBlock],
  663. block: LayoutParsingBlock,
  664. ref_block_idxes: List[int],
  665. prev_idx: int,
  666. post_idx: int,
  667. config: dict,
  668. ) -> None:
  669. """
  670. Update the child blocks of a paragraph title block.
  671. The child blocks need to meet the following conditions:
  672. - For Both:
  673. 1. They must be adjacent
  674. 2. The child block must be vision_title or text block.
  675. - For vision_title:
  676. 1. The distance between the child block and the parent block should be less than 1/2 of the parent's height.
  677. - For text block:
  678. 1. The distance between the child block and the parent block should be less than 15.
  679. 2. The child short_side_length should be less than the parent's short side length.
  680. 3. The child long_side_length should be less than 50% of the parent's long side length.
  681. 4. The difference between their centers is very small.
  682. Args:
  683. blocks (List[LayoutParsingBlock]): overall blocks.
  684. block (LayoutParsingBlock): document title block.
  685. ref_block_idxes (List[int]): A list of indices of reference blocks.
  686. prev_idx (int): previous block index, None if not exist.
  687. post_idx (int): post block index, None if not exist.
  688. config (dict): configurations.
  689. Returns:
  690. None
  691. """
  692. vision_title_labels = config.get("vision_title_labels", [])
  693. text_labels = config.get("text_labels", [])
  694. for idx in [prev_idx, post_idx]:
  695. if idx is None:
  696. continue
  697. ref_block = blocks[idx]
  698. nearest_edge_distance = get_nearest_edge_distance(block.bbox, ref_block.bbox)
  699. block_center = block.get_centroid()
  700. ref_block_center = ref_block.get_centroid()
  701. if ref_block.label in vision_title_labels and nearest_edge_distance <= min(
  702. block.height * 0.5, ref_block.height * 2
  703. ):
  704. ref_block.order_label = "vision_title"
  705. block.append_child_block(ref_block)
  706. config["vision_title_block_idxes"].remove(idx)
  707. elif (
  708. nearest_edge_distance <= 15
  709. and ref_block.short_side_length < block.short_side_length
  710. and ref_block.long_side_length < 0.5 * block.long_side_length
  711. and ref_block.orientation == block.orientation
  712. and (
  713. abs(block_center[0] - ref_block_center[0]) < 10
  714. or (
  715. block.bbox[0] - ref_block.bbox[0] < 10
  716. and ref_block.num_of_lines == 1
  717. )
  718. or (
  719. block.bbox[2] - ref_block.bbox[2] < 10
  720. and ref_block.num_of_lines == 1
  721. )
  722. )
  723. ):
  724. has_vision_footnote = False
  725. if len(block.child_blocks) > 0:
  726. for child_block in block.child_blocks:
  727. if child_block.label in text_labels:
  728. has_vision_footnote = True
  729. if not has_vision_footnote:
  730. ref_block.order_label = "vision_footnote"
  731. block.append_child_block(ref_block)
  732. config["text_block_idxes"].remove(idx)
  733. def calculate_discontinuous_projection(
  734. boxes, orientation="horizontal", return_num=False
  735. ) -> List:
  736. """
  737. Calculate the discontinuous projection of boxes along the specified orientation.
  738. Args:
  739. boxes (ndarray): Array of bounding boxes represented by [[x_min, y_min, x_max, y_max]].
  740. orientation (str): orientation along which to perform the projection ('horizontal' or 'vertical').
  741. Returns:
  742. list: List of tuples representing the merged intervals.
  743. """
  744. boxes = np.array(boxes)
  745. if orientation == "horizontal":
  746. intervals = boxes[:, [0, 2]]
  747. elif orientation == "vertical":
  748. intervals = boxes[:, [1, 3]]
  749. else:
  750. raise ValueError("orientation must be 'horizontal' or 'vertical'")
  751. intervals = intervals[np.argsort(intervals[:, 0])]
  752. merged_intervals = []
  753. num = 1
  754. current_start, current_end = intervals[0]
  755. num_list = []
  756. for start, end in intervals[1:]:
  757. if start <= current_end:
  758. num += 1
  759. current_end = max(current_end, end)
  760. else:
  761. num_list.append(num)
  762. merged_intervals.append((current_start, current_end))
  763. num = 1
  764. current_start, current_end = start, end
  765. num_list.append(num)
  766. merged_intervals.append((current_start, current_end))
  767. if return_num:
  768. return merged_intervals, num_list
  769. return merged_intervals
  770. def shrink_overlapping_boxes(
  771. boxes, orientation="horizontal", min_threshold=0, max_threshold=0.1
  772. ) -> List:
  773. """
  774. Shrink overlapping boxes along the specified orientation.
  775. Args:
  776. boxes (ndarray): Array of bounding boxes represented by [[x_min, y_min, x_max, y_max]].
  777. orientation (str): orientation along which to perform the shrinking ('horizontal' or 'vertical').
  778. min_threshold (float): Minimum threshold for shrinking. Default is 0.
  779. max_threshold (float): Maximum threshold for shrinking. Default is 0.2.
  780. Returns:
  781. list: List of tuples representing the merged intervals.
  782. """
  783. current_block = boxes[0]
  784. for block in boxes[1:]:
  785. x1, y1, x2, y2 = current_block.bbox
  786. x1_prime, y1_prime, x2_prime, y2_prime = block.bbox
  787. cut_iou = calculate_projection_overlap_ratio(
  788. current_block.bbox, block.bbox, orientation=orientation
  789. )
  790. match_iou = calculate_projection_overlap_ratio(
  791. current_block.bbox,
  792. block.bbox,
  793. orientation="horizontal" if orientation == "vertical" else "vertical",
  794. )
  795. if orientation == "vertical":
  796. if (
  797. (match_iou > 0 and cut_iou > min_threshold and cut_iou < max_threshold)
  798. or y2 == y1_prime
  799. or abs(y2 - y1_prime) <= 3
  800. ):
  801. overlap_y_min = max(y1, y1_prime)
  802. overlap_y_max = min(y2, y2_prime)
  803. split_y = int((overlap_y_min + overlap_y_max) / 2)
  804. overlap_y_min = split_y - 1
  805. overlap_y_max = split_y + 1
  806. current_block.bbox = [x1, y1, x2, overlap_y_min]
  807. block.bbox = [x1_prime, overlap_y_max, x2_prime, y2_prime]
  808. else:
  809. if (
  810. (match_iou > 0 and cut_iou > min_threshold and cut_iou < max_threshold)
  811. or x2 == x1_prime
  812. or abs(x2 - x1_prime) <= 3
  813. ):
  814. overlap_x_min = max(x1, x1_prime)
  815. overlap_x_max = min(x2, x2_prime)
  816. split_x = int((overlap_x_min + overlap_x_max) / 2)
  817. overlap_x_min = split_x - 1
  818. overlap_x_max = split_x + 1
  819. current_block.bbox = [x1, y1, overlap_x_min, y2]
  820. block.bbox = [overlap_x_max, y1_prime, x2_prime, y2_prime]
  821. current_block = block
  822. return boxes