table_recognition_post_processing.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362
  1. # copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. from typing import Any, Dict, Optional
  15. import numpy as np
  16. from ..layout_parsing.utils import get_sub_regions_ocr_res
  17. from ..components import convert_points_to_boxes
  18. from .result import SingleTableRecognitionResult
  19. from ..ocr.result import OCRResult
  20. def get_ori_image_coordinate(x: int, y: int, box_list: list) -> list:
  21. """
  22. get the original coordinate from Cropped image to Original image.
  23. Args:
  24. x (int): x coordinate of cropped image
  25. y (int): y coordinate of cropped image
  26. box_list (list): list of table bounding boxes, eg. [[x1, y1, x2, y2, x3, y3, x4, y4]]
  27. Returns:
  28. list: list of original coordinates, eg. [[x1, y1, x2, y2, x3, y3, x4, y4]]
  29. """
  30. if not box_list:
  31. return box_list
  32. offset = np.array([x, y] * 4)
  33. box_list = np.array(box_list)
  34. if box_list.shape[-1] == 2:
  35. offset = offset.reshape(4, 2)
  36. ori_box_list = offset + box_list
  37. return ori_box_list
  38. def convert_table_structure_pred_bbox(
  39. table_structure_pred: Dict, crop_start_point: list, img_shape: tuple
  40. ) -> None:
  41. """
  42. Convert the predicted table structure bounding boxes to the original image coordinate system.
  43. Args:
  44. table_structure_pred (Dict): A dictionary containing the predicted table structure, including bounding boxes ('bbox').
  45. crop_start_point (list): A list of two integers representing the starting point (x, y) of the cropped image region.
  46. img_shape (tuple): A tuple of two integers representing the shape (height, width) of the original image.
  47. Returns:
  48. None: The function modifies the 'table_structure_pred' dictionary in place by adding the 'cell_box_list' key.
  49. """
  50. cell_points_list = table_structure_pred["bbox"]
  51. ori_cell_points_list = get_ori_image_coordinate(
  52. crop_start_point[0], crop_start_point[1], cell_points_list
  53. )
  54. ori_cell_points_list = np.reshape(ori_cell_points_list, (-1, 4, 2))
  55. cell_box_list = convert_points_to_boxes(ori_cell_points_list)
  56. img_height, img_width = img_shape
  57. cell_box_list = np.clip(
  58. cell_box_list, 0, [img_width, img_height, img_width, img_height]
  59. )
  60. table_structure_pred["cell_box_list"] = cell_box_list
  61. return
  62. def distance(box_1: list, box_2: list) -> float:
  63. """
  64. compute the distance between two boxes
  65. Args:
  66. box_1 (list): first rectangle box,eg.(x1, y1, x2, y2)
  67. box_2 (list): second rectangle box,eg.(x1, y1, x2, y2)
  68. Returns:
  69. float: the distance between two boxes
  70. """
  71. x1, y1, x2, y2 = box_1
  72. x3, y3, x4, y4 = box_2
  73. dis = abs(x3 - x1) + abs(y3 - y1) + abs(x4 - x2) + abs(y4 - y2)
  74. dis_2 = abs(x3 - x1) + abs(y3 - y1)
  75. dis_3 = abs(x4 - x2) + abs(y4 - y2)
  76. return dis + min(dis_2, dis_3)
  77. def compute_iou(rec1: list, rec2: list) -> float:
  78. """
  79. computing IoU
  80. Args:
  81. rec1 (list): (x1, y1, x2, y2)
  82. rec2 (list): (x1, y1, x2, y2)
  83. Returns:
  84. float: Intersection over Union
  85. """
  86. # computing area of each rectangles
  87. S_rec1 = (rec1[2] - rec1[0]) * (rec1[3] - rec1[1])
  88. S_rec2 = (rec2[2] - rec2[0]) * (rec2[3] - rec2[1])
  89. # computing the sum_area
  90. sum_area = S_rec1 + S_rec2
  91. # find the each edge of intersect rectangle
  92. left_line = max(rec1[0], rec2[0])
  93. right_line = min(rec1[2], rec2[2])
  94. top_line = max(rec1[1], rec2[1])
  95. bottom_line = min(rec1[3], rec2[3])
  96. # judge if there is an intersect
  97. if left_line >= right_line or top_line >= bottom_line:
  98. return 0.0
  99. else:
  100. intersect = (right_line - left_line) * (bottom_line - top_line)
  101. return (intersect / (sum_area - intersect)) * 1.0
  102. def _whether_y_overlap_exceeds_threshold(bbox1, bbox2, overlap_ratio_threshold=0.6):
  103. """
  104. Determines whether the vertical overlap between two bounding boxes exceeds a given threshold.
  105. Args:
  106. bbox1 (tuple): The first bounding box defined as (left, top, right, bottom).
  107. bbox2 (tuple): The second bounding box defined as (left, top, right, bottom).
  108. overlap_ratio_threshold (float): The threshold ratio to determine if the overlap is significant.
  109. Defaults to 0.6.
  110. Returns:
  111. bool: True if the vertical overlap divided by the minimum height of the two bounding boxes
  112. exceeds the overlap_ratio_threshold, otherwise False.
  113. """
  114. _, y1_0, _, y1_1 = bbox1
  115. _, y2_0, _, y2_1 = bbox2
  116. overlap = max(0, min(y1_1, y2_1) - max(y1_0, y2_0))
  117. min_height = min(y1_1 - y1_0, y2_1 - y2_0)
  118. return (overlap / min_height) > overlap_ratio_threshold
  119. def _sort_box_by_y_projection(boxes, line_height_iou_threshold=0.6):
  120. """
  121. Sorts a list of bounding boxes based on their spatial arrangement.
  122. The function first sorts the boxes by their top y-coordinate to group them into lines.
  123. Within each line, the boxes are then sorted by their x-coordinate.
  124. Args:
  125. boxes (list): A list of bounding boxes, where each box is defined as [left, top, right, bottom].
  126. line_height_iou_threshold (float): The Intersection over Union (IoU) threshold for grouping boxes into the same line.
  127. Returns:
  128. list: A list of indices representing the order of the boxes after sorting by their spatial arrangement.
  129. """
  130. if not boxes:
  131. return []
  132. indexed_boxes = list(enumerate(boxes))
  133. indexed_boxes.sort(key=lambda item: item[1][1])
  134. lines = []
  135. first_index, first_box = indexed_boxes[0]
  136. current_line = [(first_index, first_box)]
  137. current_y0, current_y1 = first_box[1], first_box[3]
  138. for index, box in indexed_boxes[1:]:
  139. y0, y1 = box[1], box[3]
  140. if _whether_y_overlap_exceeds_threshold(
  141. (0, current_y0, 0, current_y1),
  142. (0, y0, 0, y1),
  143. line_height_iou_threshold,
  144. ):
  145. current_line.append((index, box))
  146. current_y0 = min(current_y0, y0)
  147. current_y1 = max(current_y1, y1)
  148. else:
  149. lines.append(current_line)
  150. current_line = [(index, box)]
  151. current_y0, current_y1 = y0, y1
  152. if current_line:
  153. lines.append(current_line)
  154. for line in lines:
  155. line.sort(key=lambda item: item[1][0])
  156. sorted_indices = [index for line in lines for index, _ in line]
  157. return sorted_indices
  158. def match_table_and_ocr(
  159. cell_box_list: list, ocr_dt_boxes: list, cell_sort_by_y_projection: bool = False
  160. ) -> dict:
  161. """
  162. match table and ocr
  163. Args:
  164. cell_box_list (list): bbox for table cell, 2 points, [left, top, right, bottom]
  165. ocr_dt_boxes (list): bbox for ocr, 2 points, [left, top, right, bottom]
  166. cell_sort_by_y_projection (bool): Whether to sort the matched OCR boxes by y-projection.
  167. Returns:
  168. dict: matched dict, key is table index, value is ocr index
  169. """
  170. matched = {}
  171. for i, ocr_box in enumerate(np.array(ocr_dt_boxes)):
  172. ocr_box = ocr_box.astype(np.float32)
  173. distances = []
  174. for j, table_box in enumerate(cell_box_list):
  175. distances.append(
  176. (distance(table_box, ocr_box), 1.0 - compute_iou(table_box, ocr_box))
  177. ) # compute iou and l1 distance
  178. sorted_distances = distances.copy()
  179. # select det box by iou and l1 distance
  180. sorted_distances = sorted(sorted_distances, key=lambda item: (item[1], item[0]))
  181. if distances.index(sorted_distances[0]) not in matched.keys():
  182. matched[distances.index(sorted_distances[0])] = [i]
  183. else:
  184. matched[distances.index(sorted_distances[0])].append(i)
  185. if cell_sort_by_y_projection:
  186. for cell_index in matched:
  187. input_boxes = [ocr_dt_boxes[i] for i in matched[cell_index]]
  188. sorted_indices = _sort_box_by_y_projection(input_boxes, 0.7)
  189. sorted_indices = [matched[cell_index][i] for i in sorted_indices]
  190. matched[cell_index] = sorted_indices
  191. return matched
  192. def get_html_result(
  193. matched_index: dict, ocr_contents: dict, pred_structures: list
  194. ) -> str:
  195. """
  196. Generates HTML content based on the matched index, OCR contents, and predicted structures.
  197. Args:
  198. matched_index (dict): A dictionary containing matched indices.
  199. ocr_contents (dict): A dictionary of OCR contents.
  200. pred_structures (list): A list of predicted HTML structures.
  201. Returns:
  202. str: Generated HTML content as a string.
  203. """
  204. pred_html = []
  205. td_index = 0
  206. head_structure = pred_structures[0:3]
  207. html = "".join(head_structure)
  208. table_structure = pred_structures[3:-3]
  209. for tag in table_structure:
  210. if "</td>" in tag:
  211. if "<td></td>" == tag:
  212. pred_html.extend("<td>")
  213. if td_index in matched_index.keys():
  214. b_with = False
  215. if (
  216. "<b>" in ocr_contents[matched_index[td_index][0]]
  217. and len(matched_index[td_index]) > 1
  218. ):
  219. b_with = True
  220. pred_html.extend("<b>")
  221. for i, td_index_index in enumerate(matched_index[td_index]):
  222. content = ocr_contents[td_index_index]
  223. if len(matched_index[td_index]) > 1:
  224. if len(content) == 0:
  225. continue
  226. if content[0] == " ":
  227. content = content[1:]
  228. if "<b>" in content:
  229. content = content[3:]
  230. if "</b>" in content:
  231. content = content[:-4]
  232. if len(content) == 0:
  233. continue
  234. if i != len(matched_index[td_index]) - 1 and " " != content[-1]:
  235. content += " "
  236. pred_html.extend(content)
  237. if b_with:
  238. pred_html.extend("</b>")
  239. if "<td></td>" == tag:
  240. pred_html.append("</td>")
  241. else:
  242. pred_html.append(tag)
  243. td_index += 1
  244. else:
  245. pred_html.append(tag)
  246. html += "".join(pred_html)
  247. end_structure = pred_structures[-3:]
  248. html += "".join(end_structure)
  249. return html
  250. def get_table_recognition_res(
  251. table_box: list,
  252. table_structure_pred: dict,
  253. overall_ocr_res: OCRResult,
  254. cells_texts_list: list,
  255. use_table_cells_ocr_results: bool,
  256. cell_sort_by_y_projection: bool = False,
  257. ) -> SingleTableRecognitionResult:
  258. """
  259. Retrieve table recognition result from cropped image info, table structure prediction, and overall OCR result.
  260. Args:
  261. table_box (list): Information about the location of cropped image, including the bounding box.
  262. table_structure_pred (dict): Predicted table structure.
  263. overall_ocr_res (OCRResult): Overall OCR result from the input image.
  264. cells_texts_list (list): OCR results with cells.
  265. use_table_cells_ocr_results (bool): whether to use OCR results with cells.
  266. cell_sort_by_y_projection (bool): Whether to sort the matched OCR boxes by y-projection.
  267. Returns:
  268. SingleTableRecognitionResult: An object containing the single table recognition result.
  269. """
  270. table_box = np.array([table_box])
  271. table_ocr_pred = get_sub_regions_ocr_res(overall_ocr_res, table_box)
  272. crop_start_point = [table_box[0][0], table_box[0][1]]
  273. img_shape = overall_ocr_res["doc_preprocessor_res"]["output_img"].shape[0:2]
  274. if len(table_structure_pred['bbox']) == 0 or len(table_ocr_pred["rec_boxes"]) == 0:
  275. pred_html = ' '.join(list(table_structure_pred["structure"]))
  276. if len(table_structure_pred['bbox']) != 0:
  277. convert_table_structure_pred_bbox(table_structure_pred, crop_start_point, img_shape)
  278. table_cells_result = table_structure_pred["cell_box_list"]
  279. else:
  280. table_cells_result = []
  281. single_img_res = {
  282. "cell_box_list": table_cells_result,
  283. "table_ocr_pred": table_ocr_pred,
  284. "pred_html": pred_html,
  285. }
  286. return SingleTableRecognitionResult(single_img_res)
  287. convert_table_structure_pred_bbox(table_structure_pred, crop_start_point, img_shape)
  288. structures = table_structure_pred["structure"]
  289. cell_box_list = table_structure_pred["cell_box_list"]
  290. if use_table_cells_ocr_results == True:
  291. ocr_dt_boxes = cell_box_list
  292. ocr_texts_res = cells_texts_list
  293. else:
  294. ocr_dt_boxes = table_ocr_pred["rec_boxes"]
  295. ocr_texts_res = table_ocr_pred["rec_texts"]
  296. matched_index = match_table_and_ocr(
  297. cell_box_list, ocr_dt_boxes, cell_sort_by_y_projection=cell_sort_by_y_projection
  298. )
  299. pred_html = get_html_result(matched_index, ocr_texts_res, structures)
  300. single_img_res = {
  301. "cell_box_list": cell_box_list,
  302. "table_ocr_pred": table_ocr_pred,
  303. "pred_html": pred_html,
  304. }
  305. return SingleTableRecognitionResult(single_img_res)