layout_objects.py 31 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859
  1. # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. from typing import Any, List, Union
  15. import numpy as np
  16. from .setting import BLOCK_LABEL_MAP, LINE_SETTINGS
  17. from .utils import (
  18. caculate_euclidean_dist,
  19. calculate_projection_overlap_ratio,
  20. is_english_letter,
  21. is_non_breaking_punctuation,
  22. is_numeric,
  23. )
  24. __all__ = [
  25. "TextSpan",
  26. "TextLine",
  27. "LayoutBlock",
  28. "LayoutRegion",
  29. ]
  30. class TextSpan(object):
  31. """Text span class"""
  32. def __init__(self, box, text, label):
  33. """
  34. Initialize a TextSpan object.
  35. Args:
  36. box (list): The bounding box of the text span.
  37. text (str): The text content of the text span.
  38. label (int): The label of the text span.
  39. """
  40. self.box = box
  41. self.text = text
  42. self.label = label
  43. def __str__(self) -> str:
  44. return f"{self.text}"
  45. def __repr__(self) -> str:
  46. return f"{self.text}"
  47. class TextLine(object):
  48. """Text line class"""
  49. def __init__(self, spans: List[TextSpan] = [], direction="horizontal"):
  50. """
  51. Initialize a TextLine object.
  52. Args:
  53. spans (List[TextSpan]): A list of TextSpan objects. Defaults to [].
  54. direction (str): The direction of the text line. Defaults to "horizontal".
  55. """
  56. self.spans = spans
  57. self.direction = direction
  58. self.region_box = self.get_region_box()
  59. self.need_new_line = False
  60. @property
  61. def labels(self):
  62. return [span.label for span in self.spans]
  63. @property
  64. def boxes(self):
  65. return [span.box for span in self.spans]
  66. @property
  67. def height(self):
  68. start_idx = 1 if self.direction == "horizontal" else 0
  69. end_idx = 3 if self.direction == "horizontal" else 2
  70. return abs(self.region_box[end_idx] - self.region_box[start_idx])
  71. @property
  72. def width(self):
  73. start_idx = 0 if self.direction == "horizontal" else 1
  74. end_idx = 2 if self.direction == "horizontal" else 3
  75. return abs(self.region_box[end_idx] - self.region_box[start_idx])
  76. def __str__(self) -> str:
  77. return f"{' '.join([str(span.text) for span in self.spans])}\n"
  78. def __repr__(self) -> str:
  79. return f"{' '.join([str(span.text) for span in self.spans])}\n"
  80. def add_span(self, span: Union[TextSpan, List[TextSpan]]):
  81. """
  82. Add a span to the text line.
  83. Args:
  84. span (Union[TextSpan, List[TextSpan]]): A single TextSpan object or a list of TextSpan objects.
  85. """
  86. if isinstance(span, list):
  87. self.spans.extend(span)
  88. else:
  89. self.spans.append(span)
  90. self.region_box = self.get_region_box()
  91. def get_region_box(self):
  92. """
  93. Get the region box of the text line.
  94. Returns:
  95. list: The region box of the text line.
  96. """
  97. if not self.spans:
  98. return None # or an empty list, or however you want to handle no spans
  99. # Initialize min and max values with the first span's box
  100. x_min, y_min, x_max, y_max = self.spans[0].box
  101. for span in self.spans:
  102. x_min = min(x_min, span.box[0])
  103. y_min = min(y_min, span.box[1])
  104. x_max = max(x_max, span.box[2])
  105. y_max = max(y_max, span.box[3])
  106. return [x_min, y_min, x_max, y_max]
  107. def get_texts(
  108. self,
  109. block_label: str,
  110. block_text_width: int,
  111. block_start_coordinate: int,
  112. block_stop_coordinate: int,
  113. ori_image,
  114. text_rec_model=None,
  115. text_rec_score_thresh=None,
  116. ):
  117. """
  118. Get the text of the text line.
  119. Args:
  120. block_label (str): The label of the block.
  121. block_text_width (int): The width of the block.
  122. block_start_coordinate (int): The starting coordinate of the block.
  123. block_stop_coordinate (int): The stopping coordinate of the block.
  124. ori_image (np.ndarray): The original image.
  125. text_rec_model (Any): The text recognition model.
  126. text_rec_score_thresh (float): The text recognition score threshold.
  127. Returns:
  128. str: The text of the text line.
  129. """
  130. span_box_start_index = 0 if self.direction == "horizontal" else 1
  131. lines_start_index = 1 if self.direction == "horizontal" else 3
  132. self.spans.sort(
  133. key=lambda span: (
  134. span.box[span_box_start_index] // 2,
  135. (
  136. span.box[lines_start_index]
  137. if self.direction == "horizontal"
  138. else -span.box[lines_start_index]
  139. ),
  140. )
  141. )
  142. if "formula" in self.labels:
  143. sort_index = 0 if self.direction == "horizontal" else 1
  144. splited_spans = self.split_boxes_by_projection()
  145. if len(self.spans) != len(splited_spans):
  146. splited_spans.sort(key=lambda span: span.box[sort_index])
  147. new_spans = []
  148. for span in splited_spans:
  149. bbox = span.box
  150. if span.label == "text":
  151. crop_img = ori_image[
  152. int(bbox[1]) : int(bbox[3]),
  153. int(bbox[0]) : int(bbox[2]),
  154. ]
  155. crop_img_rec_res = list(text_rec_model([crop_img]))[0]
  156. crop_img_rec_score = crop_img_rec_res["rec_score"]
  157. crop_img_rec_text = crop_img_rec_res["rec_text"]
  158. span.text = crop_img_rec_text
  159. if crop_img_rec_score < text_rec_score_thresh:
  160. continue
  161. new_spans.append(span)
  162. self.spans = new_spans
  163. line_text = self.format_line(
  164. block_text_width,
  165. block_start_coordinate,
  166. block_stop_coordinate,
  167. line_gap_limit=self.height * 1.5,
  168. block_label=block_label,
  169. )
  170. return line_text
  171. def is_projection_contained(self, box_a, box_b, start_idx, end_idx):
  172. """Check if box_a completely contains box_b in the x-direction."""
  173. return box_a[start_idx] <= box_b[start_idx] and box_a[end_idx] >= box_b[end_idx]
  174. def split_boxes_by_projection(self, offset=1e-5):
  175. """
  176. Check if there is any complete containment in the x-direction
  177. between the bounding boxes and split the containing box accordingly.
  178. Args:
  179. offset (float): A small offset value to ensure that the split boxes are not too close to the original boxes.
  180. Returns:
  181. A new list of boxes, including split boxes, with the same `rec_text` and `label` attributes.
  182. """
  183. new_spans = []
  184. if self.direction == "horizontal":
  185. projection_start_index, projection_end_index = 0, 2
  186. else:
  187. projection_start_index, projection_end_index = 1, 3
  188. for i in range(len(self.spans)):
  189. span = self.spans[i]
  190. is_split = False
  191. for j in range(i, len(self.spans)):
  192. box_b = self.spans[j].box
  193. box_a, text, label = span.box, span.text, span.label
  194. if self.is_projection_contained(
  195. box_a, box_b, projection_start_index, projection_end_index
  196. ):
  197. is_split = True
  198. # Split box_a based on the x-coordinates of box_b
  199. if box_a[projection_start_index] < box_b[projection_start_index]:
  200. w = (
  201. box_b[projection_start_index]
  202. - offset
  203. - box_a[projection_start_index]
  204. )
  205. if w > 1:
  206. new_bbox = box_a.copy()
  207. new_bbox[projection_end_index] = (
  208. box_b[projection_start_index] - offset
  209. )
  210. new_spans.append(
  211. TextSpan(
  212. box=np.array(new_bbox),
  213. text=text,
  214. label=label,
  215. )
  216. )
  217. if box_a[projection_end_index] > box_b[projection_end_index]:
  218. w = (
  219. box_a[projection_end_index]
  220. - box_b[projection_end_index]
  221. + offset
  222. )
  223. if w > 1:
  224. box_a[projection_start_index] = (
  225. box_b[projection_end_index] + offset
  226. )
  227. span = TextSpan(
  228. box=np.array(box_a),
  229. text=text,
  230. label=label,
  231. )
  232. if j == len(self.spans) - 1 and is_split:
  233. new_spans.append(span)
  234. if not is_split:
  235. new_spans.append(span)
  236. return new_spans
  237. def format_line(
  238. self,
  239. block_text_width: int,
  240. block_start_coordinate: int,
  241. block_stop_coordinate: int,
  242. line_gap_limit: int = 10,
  243. block_label: str = "text",
  244. ) -> str:
  245. """
  246. Format a line of text spans based on layout constraints.
  247. Args:
  248. block_text_width (int): The width of the block.
  249. block_start_coordinate (int): The starting coordinate of the block.
  250. block_stop_coordinate (int): The stopping coordinate of the block.
  251. line_gap_limit (int): The limit for the number of pixels after the last span that should be considered part of the last line. Default is 10.
  252. block_label (str): The label associated with the entire block. Default is 'text'.
  253. Returns:
  254. str: Formatted line of text.
  255. """
  256. first_span_box = self.spans[0].box
  257. last_span_box = self.spans[-1].box
  258. line_text = ""
  259. for span in self.spans:
  260. if span.label == "formula" and block_label != "formula":
  261. formula_rec = span.text
  262. if not formula_rec.startswith("$") and not formula_rec.endswith("$"):
  263. if len(self.spans) > 1:
  264. span.text = f"${span.text}$"
  265. else:
  266. span.text = f"\n${span.text}$"
  267. line_text += span.text
  268. if (
  269. len(span.text) > 0
  270. and is_english_letter(line_text[-1])
  271. or span.label == "formula"
  272. ):
  273. line_text += " "
  274. if self.direction == "horizontal":
  275. text_stop_index = 2
  276. else:
  277. text_stop_index = 3
  278. if line_text.endswith(" "):
  279. line_text = line_text[:-1]
  280. if len(line_text) == 0:
  281. return ""
  282. last_char = line_text[-1]
  283. if (
  284. not is_english_letter(last_char)
  285. and not is_non_breaking_punctuation(last_char)
  286. and not is_numeric(last_char)
  287. ) or (
  288. block_stop_coordinate - last_span_box[text_stop_index]
  289. > block_text_width * 0.3
  290. ):
  291. if (
  292. self.direction == "horizontal"
  293. and block_stop_coordinate - last_span_box[text_stop_index]
  294. > line_gap_limit
  295. ) or (
  296. self.direction == "vertical"
  297. and (
  298. block_stop_coordinate - last_span_box[text_stop_index]
  299. > line_gap_limit
  300. or first_span_box[1] - block_start_coordinate > line_gap_limit
  301. )
  302. ):
  303. self.need_new_line = True
  304. if line_text.endswith("-"):
  305. line_text = line_text[:-1]
  306. return line_text
  307. if (len(line_text) > 0 and is_english_letter(last_char)) or line_text.endswith(
  308. "$"
  309. ):
  310. line_text += " "
  311. if (
  312. len(line_text) > 0
  313. and not is_english_letter(last_char)
  314. and not is_numeric(last_char)
  315. ) or self.direction == "vertical":
  316. if (
  317. block_stop_coordinate - last_span_box[text_stop_index]
  318. > block_text_width * 0.3
  319. and len(line_text) > 0
  320. and not is_non_breaking_punctuation(last_char)
  321. ):
  322. line_text += "\n"
  323. self.need_new_line = True
  324. elif (
  325. block_stop_coordinate - last_span_box[text_stop_index]
  326. > (block_stop_coordinate - block_start_coordinate) * 0.5
  327. ):
  328. line_text += "\n"
  329. self.need_new_line = True
  330. return line_text
  331. class LayoutBlock(object):
  332. """Layout Block Class"""
  333. def __init__(self, label, bbox, content="") -> None:
  334. """
  335. Initialize a LayoutBlock object.
  336. Args:
  337. label (str): Label assigned to the block.
  338. bbox (list): Bounding box coordinates of the block.
  339. content (str, optional): Content of the block. Defaults to an empty string.
  340. """
  341. self.label = label
  342. self.order_label = None
  343. self.bbox = list(map(int, bbox))
  344. self.content = content
  345. self.seg_start_coordinate = float("inf")
  346. self.seg_end_coordinate = float("-inf")
  347. self.width = bbox[2] - bbox[0]
  348. self.height = bbox[3] - bbox[1]
  349. self.area = float(self.width) * float(self.height)
  350. self.num_of_lines = 1
  351. self.image = None
  352. self.index = None
  353. self.order_index = None
  354. self.text_line_width = 1
  355. self.text_line_height = 1
  356. self.child_blocks = []
  357. self.update_direction()
  358. def __str__(self) -> str:
  359. _str = f"\n\n#################\nindex:\t{self.index}\nlabel:\t{self.label}\nregion_label:\t{self.order_label}\nbbox:\t{self.bbox}\ncontent:\t{self.content}\n#################"
  360. return _str
  361. def __repr__(self) -> str:
  362. _str = f"\n\n#################\nindex:\t{self.index}\nlabel:\t{self.label}\nregion_label:\t{self.order_label}\nbbox:\t{self.bbox}\ncontent:\t{self.content}\n#################"
  363. return _str
  364. def to_dict(self) -> dict:
  365. return self.__dict__
  366. def update_direction(self, direction=None) -> None:
  367. """
  368. Update the direction of the block based on its bounding box.
  369. Args:
  370. direction (str, optional): Direction of the block. If not provided, it will be determined automatically using the bounding box. Defaults to None.
  371. """
  372. if not direction:
  373. direction = self.get_bbox_direction()
  374. self.direction = direction
  375. self.update_direction_info()
  376. def update_direction_info(self) -> None:
  377. """Update the direction information of the block based on its direction."""
  378. if self.direction == "horizontal":
  379. self.secondary_direction = "vertical"
  380. self.short_side_length = self.height
  381. self.long_side_length = self.width
  382. self.start_coordinate = self.bbox[0]
  383. self.end_coordinate = self.bbox[2]
  384. self.secondary_direction_start_coordinate = self.bbox[1]
  385. self.secondary_direction_end_coordinate = self.bbox[3]
  386. else:
  387. self.secondary_direction = "horizontal"
  388. self.short_side_length = self.width
  389. self.long_side_length = self.height
  390. self.start_coordinate = self.bbox[1]
  391. self.end_coordinate = self.bbox[3]
  392. self.secondary_direction_start_coordinate = self.bbox[0]
  393. self.secondary_direction_end_coordinate = self.bbox[2]
  394. def append_child_block(self, child_block) -> None:
  395. """
  396. Append a child block to the current block.
  397. Args:
  398. child_block (LayoutBlock): Child block to be added.
  399. Returns:
  400. None
  401. """
  402. if not self.child_blocks:
  403. self.ori_bbox = self.bbox.copy()
  404. x1, y1, x2, y2 = self.bbox
  405. x1_child, y1_child, x2_child, y2_child = child_block.bbox
  406. union_bbox = (
  407. min(x1, x1_child),
  408. min(y1, y1_child),
  409. max(x2, x2_child),
  410. max(y2, y2_child),
  411. )
  412. self.bbox = union_bbox
  413. self.update_direction_info()
  414. child_blocks = [child_block]
  415. if child_block.child_blocks:
  416. child_blocks.extend(child_block.get_child_blocks())
  417. self.child_blocks.extend(child_blocks)
  418. def get_child_blocks(self) -> list:
  419. """Get all child blocks of the current block."""
  420. self.bbox = self.ori_bbox
  421. child_blocks = self.child_blocks.copy()
  422. self.child_blocks = []
  423. return child_blocks
  424. def get_centroid(self) -> tuple:
  425. """Get the centroid of the bounding box of the block."""
  426. x1, y1, x2, y2 = self.bbox
  427. centroid = ((x1 + x2) / 2, (y1 + y2) / 2)
  428. return centroid
  429. def get_bbox_direction(self, direction_ratio: float = 1.0) -> str:
  430. """
  431. Determine if a bounding box is horizontal or vertical.
  432. Args:
  433. direction_ratio (float): Ratio for determining direction. Default is 1.0.
  434. Returns:
  435. str: "horizontal" or "vertical".
  436. """
  437. return (
  438. "horizontal" if self.width * direction_ratio >= self.height else "vertical"
  439. )
  440. def calculate_text_line_direction(
  441. self, bboxes: List[List[int]], direction_ratio: float = 1.5
  442. ) -> bool:
  443. """
  444. Calculate the direction of the text based on the bounding boxes.
  445. Args:
  446. bboxes (list): A list of bounding boxes.
  447. direction_ratio (float): Ratio for determining direction. Default is 1.5.
  448. Returns:
  449. str: "horizontal" or "vertical".
  450. """
  451. horizontal_box_num = 0
  452. for bbox in bboxes:
  453. if len(bbox) != 4:
  454. raise ValueError(
  455. "Invalid bounding box format. Expected a list of length 4."
  456. )
  457. x1, y1, x2, y2 = bbox
  458. width = x2 - x1
  459. height = y2 - y1
  460. horizontal_box_num += 1 if width * direction_ratio >= height else 0
  461. return "horizontal" if horizontal_box_num >= len(bboxes) * 0.5 else "vertical"
  462. def group_boxes_into_lines(
  463. self, ocr_rec_res, line_height_iou_threshold
  464. ) -> List[TextLine]:
  465. """
  466. Group the bounding boxes into lines based on their direction.
  467. Args:
  468. ocr_rec_res (dict): The result of OCR recognition.
  469. line_height_iou_threshold (float): The minimum IOU value required for two spans to belong to the same line.
  470. Returns:
  471. list: A list of TextLines.
  472. """
  473. rec_boxes = ocr_rec_res["boxes"]
  474. rec_texts = ocr_rec_res["rec_texts"]
  475. rec_labels = ocr_rec_res["rec_labels"]
  476. text_boxes = [
  477. rec_boxes[i] for i in range(len(rec_boxes)) if rec_labels[i] == "text"
  478. ]
  479. direction = self.calculate_text_line_direction(text_boxes)
  480. self.update_direction(direction)
  481. spans = [TextSpan(*span) for span in zip(rec_boxes, rec_texts, rec_labels)]
  482. if not spans:
  483. return []
  484. # sort spans by direction
  485. if self.direction == "vertical":
  486. spans.sort(
  487. key=lambda span: span.box[0], reverse=True
  488. ) # sort by x coordinate
  489. match_direction = "horizontal"
  490. else:
  491. spans.sort(
  492. key=lambda span: span.box[1], reverse=False
  493. ) # sort by y coordinate
  494. match_direction = "vertical"
  495. lines = []
  496. current_line = TextLine([spans[0]], direction=self.direction)
  497. for span in spans[1:]:
  498. overlap_ratio = calculate_projection_overlap_ratio(
  499. current_line.region_box, span.box, match_direction, mode="small"
  500. )
  501. if overlap_ratio >= line_height_iou_threshold:
  502. current_line.add_span(span)
  503. else:
  504. lines.append(current_line)
  505. current_line = TextLine([span], direction=self.direction)
  506. lines.append(current_line)
  507. if lines and self.direction == "vertical":
  508. line_heights = np.array([line.height for line in lines])
  509. min_height = np.min(line_heights)
  510. max_height = np.max(line_heights)
  511. # if height is too large, filter out the line
  512. if max_height > min_height * 2:
  513. normal_height_threshold = min_height * 1.1
  514. normal_height_count = np.sum(line_heights < normal_height_threshold)
  515. # if the number of lines with height less than the threshold is less than 40%, then filter out the line
  516. if normal_height_count < len(lines) * 0.4:
  517. keep_condition = line_heights <= normal_height_threshold
  518. lines = [line for line, keep in zip(lines, keep_condition) if keep]
  519. # calculate the average height of the text line
  520. if lines:
  521. line_heights = [line.height for line in lines]
  522. line_widths = [line.width for line in lines]
  523. self.text_line_height = np.mean(line_heights)
  524. self.text_line_width = np.mean(line_widths)
  525. else:
  526. self.text_line_height = 0
  527. self.text_line_width = 0
  528. return lines
  529. def update_text_content(
  530. self,
  531. image: list,
  532. ocr_rec_res: dict,
  533. text_rec_model: Any,
  534. text_rec_score_thresh: Union[float, None] = None,
  535. ) -> None:
  536. """
  537. Update the text content of the block based on the OCR result.
  538. Args:
  539. image (list): The input image.
  540. ocr_rec_res (dict): The result of OCR recognition.
  541. text_rec_model (Any): The model used for text recognition.
  542. text_rec_score_thresh (Union[float, None]): The score threshold for text recognition. If None, use the default setting.
  543. Returns:
  544. None
  545. """
  546. if len(ocr_rec_res["rec_texts"]) == 0:
  547. self.content = ""
  548. return
  549. lines = self.group_boxes_into_lines(
  550. ocr_rec_res,
  551. LINE_SETTINGS.get("line_height_iou_threshold", 0.8),
  552. )
  553. # words start coordinate and stop coordinate in the line
  554. coord_start_idx = 0 if self.direction == "horizontal" else 1
  555. coord_end_idx = coord_start_idx + 2
  556. if self.label == "reference":
  557. rec_boxes = ocr_rec_res["boxes"]
  558. block_start = min([box[coord_start_idx] for box in rec_boxes])
  559. block_stop = max([box[coord_end_idx] for box in rec_boxes])
  560. else:
  561. block_start = self.bbox[coord_start_idx]
  562. block_stop = self.bbox[coord_end_idx]
  563. text_lines = []
  564. text_width_list = []
  565. need_new_line_num = 0
  566. for line_idx, line in enumerate(lines):
  567. line: TextLine = line
  568. text_width_list.append(line.width)
  569. # get text from line
  570. line_text = line.get_texts(
  571. block_label=self.label,
  572. block_text_width=max(text_width_list),
  573. block_start_coordinate=block_start,
  574. block_stop_coordinate=block_stop,
  575. ori_image=image,
  576. text_rec_model=text_rec_model,
  577. text_rec_score_thresh=text_rec_score_thresh,
  578. )
  579. if line.need_new_line:
  580. need_new_line_num += 1
  581. # set segment start and end coordinate
  582. if line_idx == 0:
  583. self.seg_start_coordinate = line.spans[0].box[0]
  584. elif line_idx == len(lines) - 1:
  585. self.seg_end_coordinate = line.spans[-1].box[2]
  586. text_lines.append(line_text)
  587. delim = LINE_SETTINGS["delimiter_map"].get(self.label, "")
  588. if delim == "":
  589. content = ""
  590. pre_line_end = False
  591. last_char = ""
  592. for idx, line_text in enumerate(text_lines):
  593. if len(line_text) == 0:
  594. continue
  595. line: TextLine = lines[idx]
  596. if pre_line_end:
  597. start_gep_len = line.region_box[coord_start_idx] - block_start
  598. if (
  599. (
  600. start_gep_len > line.height * 1.5
  601. and not is_english_letter(last_char)
  602. and not is_numeric(last_char)
  603. )
  604. or start_gep_len > (block_stop - block_start) * 0.4
  605. ) and not content.endswith("\n"):
  606. line_text = "\n" + line_text
  607. content += f"{line_text}"
  608. if len(line_text) > 2 and line_text.endswith(" "):
  609. last_char = line_text[-2]
  610. else:
  611. last_char = line_text[-1]
  612. if (
  613. len(line_text) > 0
  614. and not line_text.endswith("\n")
  615. and not is_english_letter(last_char)
  616. and not is_non_breaking_punctuation(last_char)
  617. and not is_numeric(last_char)
  618. and need_new_line_num > len(text_lines) * 0.5
  619. ) or need_new_line_num > len(text_lines) * 0.6:
  620. content += f"\n"
  621. if (
  622. block_stop - line.region_box[coord_end_idx]
  623. > (block_stop - block_start) * 0.3
  624. ):
  625. pre_line_end = True
  626. else:
  627. content = delim.join(text_lines)
  628. self.content = content
  629. self.num_of_lines = len(text_lines)
  630. class LayoutRegion(LayoutBlock):
  631. """LayoutRegion class"""
  632. def __init__(
  633. self,
  634. bbox,
  635. blocks: List[LayoutBlock] = [],
  636. ) -> None:
  637. """
  638. Initialize a LayoutRegion object.
  639. Args:
  640. bbox (List[int]): The bounding box of the region.
  641. blocks (List[LayoutBlock]): A list of blocks that belong to this region.
  642. """
  643. super().__init__("region", bbox, content="")
  644. self.bbox = bbox
  645. self.block_map = {}
  646. self.direction = "horizontal"
  647. self.doc_title_block_idxes = []
  648. self.paragraph_title_block_idxes = []
  649. self.vision_block_idxes = []
  650. self.unordered_block_idxes = []
  651. self.vision_title_block_idxes = []
  652. self.normal_text_block_idxes = []
  653. self.euclidean_distance = float(np.inf)
  654. self.header_block_idxes = []
  655. self.footer_block_idxes = []
  656. self.text_line_width = 20
  657. self.text_line_height = 10
  658. self.num_of_lines = 10
  659. self.init_region_info_from_layout(blocks)
  660. self.update_euclidean_distance()
  661. def init_region_info_from_layout(self, blocks: List[LayoutBlock]) -> None:
  662. """Initialize the information about the layout region from the given blocks.
  663. Args:
  664. blocks (List[LayoutBlock]): A list of blocks that belong to this region.
  665. Returns:
  666. None
  667. """
  668. horizontal_normal_text_block_num = 0
  669. text_line_height_list = []
  670. text_line_width_list = []
  671. for idx, block in enumerate(blocks):
  672. self.block_map[idx] = block
  673. block.index = idx
  674. if block.label in BLOCK_LABEL_MAP["header_labels"]:
  675. self.header_block_idxes.append(idx)
  676. elif block.label in BLOCK_LABEL_MAP["doc_title_labels"]:
  677. self.doc_title_block_idxes.append(idx)
  678. elif block.label in BLOCK_LABEL_MAP["paragraph_title_labels"]:
  679. self.paragraph_title_block_idxes.append(idx)
  680. elif block.label in BLOCK_LABEL_MAP["vision_labels"]:
  681. self.vision_block_idxes.append(idx)
  682. elif block.label in BLOCK_LABEL_MAP["vision_title_labels"]:
  683. self.vision_title_block_idxes.append(idx)
  684. elif block.label in BLOCK_LABEL_MAP["footer_labels"]:
  685. self.footer_block_idxes.append(idx)
  686. elif block.label in BLOCK_LABEL_MAP["unordered_labels"]:
  687. self.unordered_block_idxes.append(idx)
  688. else:
  689. self.normal_text_block_idxes.append(idx)
  690. text_line_height_list.append(block.text_line_height)
  691. text_line_width_list.append(block.text_line_width)
  692. if block.direction == "horizontal":
  693. horizontal_normal_text_block_num += 1
  694. direction = (
  695. "horizontal"
  696. if horizontal_normal_text_block_num
  697. >= len(self.normal_text_block_idxes) * 0.5
  698. else "vertical"
  699. )
  700. self.update_direction(direction)
  701. self.text_line_width = (
  702. np.mean(text_line_width_list) if text_line_width_list else 20
  703. )
  704. self.text_line_height = (
  705. np.mean(text_line_height_list) if text_line_height_list else 10
  706. )
  707. def update_euclidean_distance(self):
  708. """Update euclidean distance between each block and the reference point"""
  709. blocks: List[LayoutBlock] = list(self.block_map.values())
  710. if self.direction == "horizontal":
  711. ref_point = (0, 0)
  712. block_distance = [
  713. caculate_euclidean_dist((block.bbox[0], block.bbox[1]), ref_point)
  714. for block in blocks
  715. ]
  716. else:
  717. ref_point = (self.bbox[2], 0)
  718. block_distance = [
  719. caculate_euclidean_dist((block.bbox[2], block.bbox[1]), ref_point)
  720. for block in blocks
  721. ]
  722. self.euclidean_distance = min(block_distance) if len(block_distance) > 0 else 0
  723. def update_direction(self, direction=None):
  724. """
  725. Update the direction of the layout region.
  726. Args:
  727. direction (str): The new direction of the layout region.
  728. """
  729. super().update_direction(direction=direction)
  730. if self.direction == "horizontal":
  731. self.direction_start_index = 0
  732. self.direction_end_index = 2
  733. self.secondary_direction_start_index = 1
  734. self.secondary_direction_end_index = 3
  735. self.secondary_direction = "vertical"
  736. else:
  737. self.direction_start_index = 1
  738. self.direction_end_index = 3
  739. self.secondary_direction_start_index = 0
  740. self.secondary_direction_end_index = 2
  741. self.secondary_direction = "horizontal"
  742. self.direction_center_coordinate = (
  743. self.bbox[self.direction_start_index] + self.bbox[self.direction_end_index]
  744. ) / 2
  745. self.secondary_direction_center_coordinate = (
  746. self.bbox[self.secondary_direction_start_index]
  747. + self.bbox[self.secondary_direction_end_index]
  748. ) / 2