uilts.py

# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import html
import itertools
import math
import re
from collections import Counter
from copy import deepcopy
from typing import Any, Dict, List, Tuple, Union

import numpy as np
from PIL import Image
from pydantic import BaseModel, computed_field, model_validator

from ..layout_parsing.utils import (
    calculate_bbox_area,
    calculate_overlap_ratio,
    calculate_projection_overlap_ratio,
)

def filter_overlap_boxes(
    layout_det_res: Dict[str, List[Dict]]
) -> Dict[str, List[Dict]]:
    """
    Remove overlapping boxes from layout detection results based on an overlap-ratio threshold.

    Args:
        layout_det_res (Dict[str, List[Dict]]): Layout detection result dict containing a 'boxes' list.

    Returns:
        Dict[str, List[Dict]]: Filtered dict with overlapping boxes removed.
    """
    layout_det_res_filtered = deepcopy(layout_det_res)
    boxes = [
        box for box in layout_det_res_filtered["boxes"] if box["label"] != "reference"
    ]
    dropped_indexes = set()
    for i in range(len(boxes)):
        for j in range(i + 1, len(boxes)):
            if i in dropped_indexes or j in dropped_indexes:
                continue
            overlap_ratio = calculate_overlap_ratio(
                boxes[i]["coordinate"], boxes[j]["coordinate"], "small"
            )
            if overlap_ratio > 0.7:
                box_area_i = calculate_bbox_area(boxes[i]["coordinate"])
                box_area_j = calculate_bbox_area(boxes[j]["coordinate"])
                if (
                    boxes[i]["label"] == "image" or boxes[j]["label"] == "image"
                ) and boxes[i]["label"] != boxes[j]["label"]:
                    continue
                if box_area_i >= box_area_j:
                    dropped_indexes.add(j)
                else:
                    dropped_indexes.add(i)
    layout_det_res_filtered["boxes"] = [
        box for idx, box in enumerate(boxes) if idx not in dropped_indexes
    ]
    return layout_det_res_filtered

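# Illustrative sketch (hypothetical boxes, not real pipeline output): the
# nested text box overlaps the larger one by more than 0.7 of its own area,
# so the smaller one is dropped; "reference" boxes are excluded up front.
#   res = {"boxes": [
#       {"label": "text", "coordinate": [0, 0, 100, 100]},
#       {"label": "text", "coordinate": [10, 10, 90, 90]},
#   ]}
#   filter_overlap_boxes(res)["boxes"]
#   # -> [{"label": "text", "coordinate": [0, 0, 100, 100]}]
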
def to_pil_image(img):
    """
    Convert the input to a PIL Image.

    Args:
        img (PIL.Image or numpy.ndarray): Input image.

    Returns:
        PIL.Image: PIL Image object.
    """
    if isinstance(img, Image.Image):
        return img
    return Image.fromarray(img)


def to_np_array(img):
    """
    Convert the input to a numpy array.

    Args:
        img (PIL.Image or numpy.ndarray): Input image.

    Returns:
        numpy.ndarray: Numpy array image.
    """
    if isinstance(img, Image.Image):
        return np.array(img)
    return img

def calc_merged_wh(images):
    """
    Calculate width (max of all) and height (sum) for a vertical merge of images.

    Args:
        images (List[PIL.Image or np.ndarray]): List of images.

    Returns:
        Tuple[int, int]: (width, height) of merged image.
    """
    widths = [to_pil_image(img).width for img in images]
    heights = [to_pil_image(img).height for img in images]
    w = max(widths)
    h = sum(heights)
    return w, h

def merge_images(images, aligns="center"):
    """
    Merge images vertically with given alignment.

    Args:
        images (List[PIL.Image or np.ndarray]): List of images to merge.
        aligns (str or List[str]): Alignment(s) for each merge step ('center', 'right', 'left').

    Returns:
        np.ndarray: Merged image as numpy array.
    """
    if not images:
        return None
    if len(images) == 1:
        return to_np_array(images[0])
    if isinstance(aligns, str):
        aligns = [aligns] * (len(images) - 1)
    if len(aligns) != len(images) - 1:
        raise ValueError("The length of aligns must be len(images) - 1")
    merged = to_pil_image(images[0])
    for i in range(1, len(images)):
        img2 = to_pil_image(images[i])
        align = aligns[i - 1]
        w = max(merged.width, img2.width)
        h = merged.height + img2.height
        new_img = Image.new("RGB", (w, h), (255, 255, 255))
        if align == "center":
            x1 = (w - merged.width) // 2
            x2 = (w - img2.width) // 2
        elif align == "right":
            x1 = w - merged.width
            x2 = w - img2.width
        else:  # left
            x1 = x2 = 0
        new_img.paste(merged, (x1, 0))
        new_img.paste(img2, (x2, merged.height))
        merged = new_img
    return to_np_array(merged)

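# Minimal sketch of a vertical merge (hypothetical uint8 RGB arrays): the
# result is as wide as the widest input and as tall as the sum of heights.
#   a = np.zeros((2, 4, 3), dtype=np.uint8)
#   b = np.zeros((3, 2, 3), dtype=np.uint8)
#   merge_images([a, b], aligns="left").shape  # -> (5, 4, 3)
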
def merge_blocks(blocks, non_merge_labels):
    """
    Merge blocks based on alignment and overlap logic, except for those with labels in non_merge_labels.

    Args:
        blocks (List[Dict]): List of block dicts.
        non_merge_labels (List[str]): Block labels that should not be merged.

    Returns:
        List[Dict]: List of processed (and possibly merged) blocks.
    """
    blocks_to_merge = []
    non_merge_blocks = {}
    for idx, block in enumerate(blocks):
        if block["label"] in non_merge_labels:
            non_merge_blocks[idx] = block
        else:
            blocks_to_merge.append((idx, block))

    merged_groups = []
    current_group = []
    current_indices = []
    current_aligns = []

    def is_aligned(a1, a2):
        return abs(a1 - a2) <= 5

    def get_alignment(block_bbox, prev_bbox):
        if is_aligned(block_bbox[0], prev_bbox[0]):
            return "left"
        elif is_aligned(block_bbox[2], prev_bbox[2]):
            return "right"
        else:
            return "center"

    def overlapwith_other_box(block_idx, prev_idx, blocks):
        prev_bbox = blocks[prev_idx]["box"]
        block_bbox = blocks[block_idx]["box"]
        x1 = min(prev_bbox[0], block_bbox[0])
        y1 = min(prev_bbox[1], block_bbox[1])
        x2 = max(prev_bbox[2], block_bbox[2])
        y2 = max(prev_bbox[3], block_bbox[3])
        min_box = [x1, y1, x2, y2]
        for idx, other_block in enumerate(blocks):
            if idx in [block_idx, prev_idx]:
                continue
            other_bbox = other_block["box"]
            if calculate_overlap_ratio(min_box, other_bbox) > 0:
                return True
        return False

    for i, (idx, block) in enumerate(blocks_to_merge):
        if not current_group:
            current_group = [block]
            current_indices = [idx]
            current_aligns = []
            continue
        prev_idx, prev_block = blocks_to_merge[i - 1]
        prev_bbox = prev_block["box"]
        prev_label = prev_block["label"]
        block_bbox = block["box"]
        block_label = block["label"]
        iou_h = calculate_projection_overlap_ratio(block_bbox, prev_bbox, "horizontal")
        is_cross = (
            iou_h == 0
            and block_label == "text"
            and block_label == prev_label
            and block_bbox[0] > prev_bbox[2]
            and block_bbox[1] < prev_bbox[3]
            and block_bbox[0] - prev_bbox[2]
            < max(prev_bbox[2] - prev_bbox[0], block_bbox[2] - block_bbox[0]) * 0.3
        )
        is_updown_align = (
            iou_h > 0
            and block_label in ["text"]
            and block_label == prev_label
            and block_bbox[3] >= prev_bbox[1]
            and abs(block_bbox[1] - prev_bbox[3])
            < max(prev_bbox[3] - prev_bbox[1], block_bbox[3] - block_bbox[1]) * 0.5
            and (
                is_aligned(block_bbox[0], prev_bbox[0])
                ^ is_aligned(block_bbox[2], prev_bbox[2])
            )
            and overlapwith_other_box(idx, prev_idx, blocks)
        )
        if is_cross:
            align_mode = "center"
        elif is_updown_align:
            align_mode = get_alignment(block_bbox, prev_bbox)
        else:
            align_mode = None
        if is_cross or is_updown_align:
            current_group.append(block)
            current_indices.append(idx)
            current_aligns.append(align_mode)
        else:
            merged_groups.append((current_indices, current_group, current_aligns))
            current_group = [block]
            current_indices = [idx]
            current_aligns = []
    if current_group:
        merged_groups.append((current_indices, current_group, current_aligns))

    group_ranges = []
    for group_indices, group, aligns in merged_groups:
        start, end = min(group_indices), max(group_indices)
        group_ranges.append((start, end, group_indices, aligns))

    result_blocks = []
    used_indices = set()
    idx = 0
    while idx < len(blocks):
        group_found = False
        for (start, end, group_indices, aligns), (g_indices, g_blocks, g_aligns) in zip(
            group_ranges, merged_groups
        ):
            if idx == start and all(i not in used_indices for i in group_indices):
                group_found = True
                imgs = [blocks[i]["img"] for i in group_indices]
                merge_aligns = aligns if aligns else []
                w, h = calc_merged_wh(imgs)
                aspect_ratio = h / w if w != 0 else float("inf")
                if aspect_ratio >= 3:
                    for j, block_idx in enumerate(group_indices):
                        block = blocks[block_idx].copy()
                        block["img"] = blocks[block_idx]["img"]
                        block["merge_aligns"] = None
                        result_blocks.append(block)
                        used_indices.add(block_idx)
                else:
                    merged_img = merge_images(imgs, merge_aligns)
                    for j, block_idx in enumerate(group_indices):
                        block = blocks[block_idx].copy()
                        block["img"] = merged_img if j == 0 else None
                        block["merge_aligns"] = merge_aligns if j == 0 else None
                        result_blocks.append(block)
                        used_indices.add(block_idx)
                insert_list = []
                for n_idx in range(start + 1, end):
                    if n_idx in non_merge_blocks:
                        insert_list.append(n_idx)
                for n_idx in insert_list:
                    result_blocks.append(non_merge_blocks[n_idx])
                    used_indices.add(n_idx)
                idx = end + 1
                break
        if group_found:
            continue
        if idx in non_merge_blocks and idx not in used_indices:
            result_blocks.append(non_merge_blocks[idx])
            used_indices.add(idx)
        idx += 1
    return result_blocks

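# Illustrative call (hypothetical blocks; img_a/img_b/img_c stand for numpy
# images). Adjacent "text" blocks may be grouped and rendered into one merged
# image, while blocks whose label is in non_merge_labels pass through as-is.
#   blocks = [
#       {"label": "text", "box": [0, 0, 100, 20], "img": img_a},
#       {"label": "text", "box": [0, 22, 100, 42], "img": img_b},
#       {"label": "table", "box": [0, 50, 100, 90], "img": img_c},
#   ]
#   merged = merge_blocks(blocks, non_merge_labels=["table"])
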
def paint_token(image, box, token_str):
    """
    Fill a rectangular area in the image with a white background and write the given token string.

    Args:
        image (np.ndarray): Image to paint on.
        box (tuple): (x1, y1, x2, y2) coordinates of rectangle.
        token_str (str): Token string to write.

    Returns:
        np.ndarray: Modified image.
    """
    import cv2

    def get_optimal_font_scale(text, fontFace, square_size, fill_ratio=0.9):
        # The scale stays between 0.2 and 10, which is suitable for a
        # square_size between roughly 30 and 1000 pixels.
        left, right = 0.2, 10
        optimal_scale = left
        # Binary-search for the largest scale whose rendered text still fits.
        while right - left > 1e-2:
            mid = (left + right) / 2
            (w, h), _ = cv2.getTextSize(text, fontFace, mid, thickness=1)
            if w < square_size * fill_ratio and h < square_size * fill_ratio:
                optimal_scale = mid
                left = mid
            else:
                right = mid
        return optimal_scale, w, h

    x1, y1, x2, y2 = [int(v) for v in box]
    box_w = x2 - x1
    box_h = y2 - y1
    img = image.copy()
    cv2.rectangle(img, (x1, y1), (x2, y2), color=(255, 255, 255), thickness=-1)
    # Automatically set scale and thickness according to the length of the
    # shortest side.
    font = cv2.FONT_HERSHEY_SIMPLEX
    thickness_scale_ratio = 4
    font_scale, text_w, text_h = get_optimal_font_scale(
        token_str, font, min(box_w, box_h), fill_ratio=0.9
    )
    font_thickness = max(1, math.floor(font_scale * thickness_scale_ratio))
    # Calculate the coordinates that center the painted text in the box.
    text_x = x1 + (box_w - text_w) // 2
    text_y = y1 + (box_h + text_h) // 2
    cv2.putText(
        img,
        token_str,
        (text_x, text_y),
        font,
        font_scale,
        (0, 0, 0),
        font_thickness,
        lineType=cv2.LINE_AA,
    )
    return img

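# Illustrative usage (hypothetical white canvas): blank a 100x100 region and
# draw the token centered inside it at the largest scale that fits.
#   canvas = np.full((200, 200, 3), 255, dtype=np.uint8)
#   out = paint_token(canvas, (50, 50, 150, 150), "[F2]")
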
def tokenize_figure_of_table(table_block_img, table_box, figures):
    """
    Replace figures in a table area with tokens, return new image and token map.

    Args:
        table_block_img (np.ndarray): Table image.
        table_box (list): Table bounding box [x_min, y_min, x_max, y_max].
        figures (List[Dict]): List of figure dicts (must contain 'coordinate', 'path').

    Returns:
        Tuple[np.ndarray, Dict[str, str], List[str]]:
            - New table image,
            - Token-to-img HTML map,
            - List of figure paths dropped.
    """
    import random

    def gen_random_map(num):
        exclude_digits = {"0", "1", "9"}
        seq = []
        i = 0
        while len(seq) < num:
            if not (set(str(i)) & exclude_digits):
                seq.append(i)
            i += 1
        return seq

    random.seed(1024)
    token_map = {}
    table_x_min, table_y_min, table_x_max, table_y_max = table_box
    drop_idxes = []
    random_map = gen_random_map(len(figures))
    random.shuffle(random_map)
    for figure_id, figure in enumerate(figures):
        figure_x_min, figure_y_min, figure_x_max, figure_y_max = figure["coordinate"]
        if (
            figure_x_min >= table_x_min
            and figure_y_min >= table_y_min
            and figure_x_max <= table_x_max
            and figure_y_max <= table_y_max
        ):
            drop_idxes.append(figure_id)
            # Skip figures whose shortest side is less than 25 px; they are too
            # small to be tokenized and recognized reliably.
            if min(figure_x_max - figure_x_min, figure_y_max - figure_y_min) < 25:
                continue
            draw_box = [
                figure_x_min - table_x_min,
                figure_y_min - table_y_min,
                figure_x_max - table_x_min,
                figure_y_max - table_y_min,
            ]
            token_str = "[F" + str(random_map[figure_id]) + "]"
            table_block_img = paint_token(table_block_img, draw_box, token_str)
            token_map[token_str] = f'<img src="{figure["path"]}" >'
    drop_figures = [f["path"] for i, f in enumerate(figures) if i in drop_idxes]
    return table_block_img, token_map, drop_figures

def untokenize_figure_of_table(table_res_str, figure_token_map):
    """
    Replace tokens in a string with their HTML image equivalents.

    Args:
        table_res_str (str): Table string with tokens.
        figure_token_map (dict): Mapping from tokens to HTML img tags.

    Returns:
        str: Untokenized string.
    """

    def repl(match):
        token_id = match.group(1)
        token = f"[F{token_id}]"
        return figure_token_map.get(token, match.group(0))

    pattern = r"\[F(\d+)\]"
    return re.sub(pattern, repl, table_res_str)

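# Round-trip sketch: tokenize_figure_of_table paints "[Fn]" tokens over
# in-table figures and returns the token map consumed here (the map below is
# hypothetical).
#   token_map = {"[F2]": '<img src="figures/fig_0.png" >'}
#   untokenize_figure_of_table("<fcel>see [F2]<nl>", token_map)
#   # -> '<fcel>see <img src="figures/fig_0.png" ><nl>'
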
class TableCell(BaseModel):
    """
    TableCell represents a single cell in a table.

    Attributes:
        row_span (int): Number of rows spanned.
        col_span (int): Number of columns spanned.
        start_row_offset_idx (int): Start row index.
        end_row_offset_idx (int): End row index (exclusive).
        start_col_offset_idx (int): Start column index.
        end_col_offset_idx (int): End column index (exclusive).
        text (str): Cell text content.
        column_header (bool): Whether this cell is a column header.
        row_header (bool): Whether this cell is a row header.
        row_section (bool): Whether this cell is a row section.
    """

    row_span: int = 1
    col_span: int = 1
    start_row_offset_idx: int
    end_row_offset_idx: int
    start_col_offset_idx: int
    end_col_offset_idx: int
    text: str
    column_header: bool = False
    row_header: bool = False
    row_section: bool = False

    @model_validator(mode="before")
    @classmethod
    def from_dict_format(cls, data: Any) -> Any:
        """
        Create TableCell from dict, extracting 'text' property correctly.

        Args:
            data (Any): Input data.

        Returns:
            Any: TableCell-compatible dict.
        """
        if isinstance(data, Dict):
            if "text" in data:
                return data
            text = data["bbox"].get("token", "")
            if not len(text):
                text_cells = data.pop("text_cell_bboxes", None)
                if text_cells:
                    for el in text_cells:
                        text += el["token"] + " "
                text = text.strip()
            data["text"] = text
        return data

class TableData(BaseModel):
    """
    TableData holds a table's cells, row and column counts, and provides a grid property.

    Attributes:
        table_cells (List[TableCell]): List of table cells.
        num_rows (int): Number of rows.
        num_cols (int): Number of columns.
    """

    table_cells: List[TableCell] = []
    num_rows: int = 0
    num_cols: int = 0

    @computed_field
    @property
    def grid(self) -> List[List[TableCell]]:
        """
        Returns a 2D grid of TableCell objects for the table.

        Returns:
            List[List[TableCell]]: Table as 2D grid.
        """
        table_data = [
            [
                TableCell(
                    text="",
                    start_row_offset_idx=i,
                    end_row_offset_idx=i + 1,
                    start_col_offset_idx=j,
                    end_col_offset_idx=j + 1,
                )
                for j in range(self.num_cols)
            ]
            for i in range(self.num_rows)
        ]
        for cell in self.table_cells:
            for i in range(
                min(cell.start_row_offset_idx, self.num_rows),
                min(cell.end_row_offset_idx, self.num_rows),
            ):
                for j in range(
                    min(cell.start_col_offset_idx, self.num_cols),
                    min(cell.end_col_offset_idx, self.num_cols),
                ):
                    table_data[i][j] = cell
        return table_data

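# Minimal sketch: a 1x2 table whose single cell spans both columns, so it
# occupies both grid slots.
#   cell = TableCell(text="header", col_span=2, start_row_offset_idx=0,
#                    end_row_offset_idx=1, start_col_offset_idx=0,
#                    end_col_offset_idx=2)
#   data = TableData(table_cells=[cell], num_rows=1, num_cols=2)
#   data.grid  # -> [[cell, cell]]
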
# OTSL tag constants
OTSL_NL = "<nl>"
OTSL_FCEL = "<fcel>"
OTSL_ECEL = "<ecel>"
OTSL_LCEL = "<lcel>"
OTSL_UCEL = "<ucel>"
OTSL_XCEL = "<xcel>"
NON_CAPTURING_TAG_GROUP = "(?:<fcel>|<ecel>|<nl>|<lcel>|<ucel>|<xcel>)"
OTSL_FIND_PATTERN = re.compile(
    f"{NON_CAPTURING_TAG_GROUP}.*?(?={NON_CAPTURING_TAG_GROUP}|$)", flags=re.DOTALL
)

def otsl_extract_tokens_and_text(s: str):
    """
    Extract OTSL tags and text parts from the input string.

    Args:
        s (str): OTSL string.

    Returns:
        Tuple[List[str], List[str]]: (tokens, text_parts)
    """
    pattern = (
        r"("
        + r"|".join([OTSL_NL, OTSL_FCEL, OTSL_ECEL, OTSL_LCEL, OTSL_UCEL, OTSL_XCEL])
        + r")"
    )
    tokens = re.findall(pattern, s)
    text_parts = re.split(pattern, s)
    text_parts = [token for token in text_parts if token.strip()]
    return tokens, text_parts

def otsl_parse_texts(texts, tokens):
    """
    Parse OTSL text and tags into TableCell objects and tag structure.

    Args:
        texts (List[str]): List of tokens and text.
        tokens (List[str]): List of OTSL tags.

    Returns:
        Tuple[List[TableCell], List[List[str]]]: (table_cells, split_row_tokens)
    """
    split_word = OTSL_NL
    split_row_tokens = [
        list(y)
        for x, y in itertools.groupby(tokens, lambda z: z == split_word)
        if not x
    ]
    table_cells = []
    r_idx = 0
    c_idx = 0

    # Ensure matrix completeness
    if split_row_tokens:
        max_cols = max(len(row) for row in split_row_tokens)
        for row in split_row_tokens:
            while len(row) < max_cols:
                row.append(OTSL_ECEL)
        new_texts = []
        text_idx = 0
        for row in split_row_tokens:
            for token in row:
                new_texts.append(token)
                if text_idx < len(texts) and texts[text_idx] == token:
                    text_idx += 1
                if text_idx < len(texts) and texts[text_idx] not in [
                    OTSL_NL,
                    OTSL_FCEL,
                    OTSL_ECEL,
                    OTSL_LCEL,
                    OTSL_UCEL,
                    OTSL_XCEL,
                ]:
                    new_texts.append(texts[text_idx])
                    text_idx += 1
            new_texts.append(OTSL_NL)
            if text_idx < len(texts) and texts[text_idx] == OTSL_NL:
                text_idx += 1
        texts = new_texts

    def count_right(tokens, c_idx, r_idx, which_tokens):
        span = 0
        c_idx_iter = c_idx
        while tokens[r_idx][c_idx_iter] in which_tokens:
            c_idx_iter += 1
            span += 1
            if c_idx_iter >= len(tokens[r_idx]):
                return span
        return span

    def count_down(tokens, c_idx, r_idx, which_tokens):
        span = 0
        r_idx_iter = r_idx
        while tokens[r_idx_iter][c_idx] in which_tokens:
            r_idx_iter += 1
            span += 1
            if r_idx_iter >= len(tokens):
                return span
        return span

    for i, text in enumerate(texts):
        cell_text = ""
        if text in [OTSL_FCEL, OTSL_ECEL]:
            row_span = 1
            col_span = 1
            right_offset = 1
            if text != OTSL_ECEL:
                cell_text = texts[i + 1]
                right_offset = 2
            next_right_cell = (
                texts[i + right_offset] if i + right_offset < len(texts) else ""
            )
            next_bottom_cell = ""
            if r_idx + 1 < len(split_row_tokens):
                if c_idx < len(split_row_tokens[r_idx + 1]):
                    next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]
            if next_right_cell in [OTSL_LCEL, OTSL_XCEL]:
                col_span += count_right(
                    split_row_tokens, c_idx + 1, r_idx, [OTSL_LCEL, OTSL_XCEL]
                )
            if next_bottom_cell in [OTSL_UCEL, OTSL_XCEL]:
                row_span += count_down(
                    split_row_tokens, c_idx, r_idx + 1, [OTSL_UCEL, OTSL_XCEL]
                )
            table_cells.append(
                TableCell(
                    text=cell_text.strip(),
                    row_span=row_span,
                    col_span=col_span,
                    start_row_offset_idx=r_idx,
                    end_row_offset_idx=r_idx + row_span,
                    start_col_offset_idx=c_idx,
                    end_col_offset_idx=c_idx + col_span,
                )
            )
        if text in [OTSL_FCEL, OTSL_ECEL, OTSL_LCEL, OTSL_UCEL, OTSL_XCEL]:
            c_idx += 1
        if text == OTSL_NL:
            r_idx += 1
            c_idx = 0
    return table_cells, split_row_tokens

def export_to_html(table_data: TableData):
    """
    Export TableData to HTML table.

    Args:
        table_data (TableData): TableData object.

    Returns:
        str: HTML string.
    """
    nrows = table_data.num_rows
    ncols = table_data.num_cols
    if len(table_data.table_cells) == 0:
        return ""
    body = ""
    grid = table_data.grid
    for i in range(nrows):
        body += "<tr>"
        for j in range(ncols):
            cell: TableCell = grid[i][j]
            rowspan, rowstart = (cell.row_span, cell.start_row_offset_idx)
            colspan, colstart = (cell.col_span, cell.start_col_offset_idx)
            if rowstart != i or colstart != j:
                continue
            content = html.escape(cell.text.strip())
            celltag = "th" if cell.column_header else "td"
            opening_tag = f"{celltag}"
            if rowspan > 1:
                opening_tag += f' rowspan="{rowspan}"'
            if colspan > 1:
                opening_tag += f' colspan="{colspan}"'
            body += f"<{opening_tag}>{content}</{celltag}>"
        body += "</tr>"
    body = f"<table>{body}</table>"
    return body

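# Continuing the TableData sketch above: spans are emitted only at a cell's
# top-left grid position, so the spanning cell renders once.
#   export_to_html(data)
#   # -> '<table><tr><td colspan="2">header</td></tr></table>'
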
def otsl_pad_to_sqr_v2(otsl_str: str) -> str:
    """
    Pad an OTSL string to a rectangular grid, ensuring each row has an equal number of cells.

    Args:
        otsl_str (str): OTSL string.

    Returns:
        str: Padded OTSL string.
    """
    assert isinstance(otsl_str, str)
    otsl_str = otsl_str.strip()
    if OTSL_NL not in otsl_str:
        return otsl_str + OTSL_NL
    lines = otsl_str.split(OTSL_NL)
    row_data = []
    for line in lines:
        if not line:
            continue
        raw_cells = OTSL_FIND_PATTERN.findall(line)
        if not raw_cells:
            continue
        total_len = len(raw_cells)
        min_len = 0
        for i, cell_str in enumerate(raw_cells):
            if cell_str.startswith(OTSL_FCEL):
                min_len = i + 1
        row_data.append(
            {"raw_cells": raw_cells, "total_len": total_len, "min_len": min_len}
        )
    if not row_data:
        return OTSL_NL
    global_min_width = max(row["min_len"] for row in row_data) if row_data else 0
    max_total_len = max(row["total_len"] for row in row_data) if row_data else 0
    search_start = global_min_width
    search_end = max(global_min_width, max_total_len)
    min_total_cost = float("inf")
    optimal_width = search_end
    for width in range(search_start, search_end + 1):
        current_total_cost = sum(abs(row["total_len"] - width) for row in row_data)
        if current_total_cost < min_total_cost:
            min_total_cost = current_total_cost
            optimal_width = width
    repaired_lines = []
    for row in row_data:
        cells = row["raw_cells"]
        current_len = len(cells)
        if current_len > optimal_width:
            new_cells = cells[:optimal_width]
        else:
            padding = [OTSL_ECEL] * (optimal_width - current_len)
            new_cells = cells + padding
        repaired_lines.append("".join(new_cells))
    return OTSL_NL.join(repaired_lines) + OTSL_NL

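# Padding sketch: the ragged second row is padded with <ecel> up to the width
# that minimizes the total edit cost across rows (2 here).
#   otsl_pad_to_sqr_v2("<fcel>a<fcel>b<nl><fcel>c<nl>")
#   # -> '<fcel>a<fcel>b<nl><fcel>c<ecel><nl>'
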
def convert_otsl_to_html(otsl_content: str):
    """
    Convert OTSL-v1.0 string to HTML. Only 6 tags allowed: <fcel>, <ecel>, <nl>, <lcel>, <ucel>, <xcel>.

    Args:
        otsl_content (str): OTSL string.

    Returns:
        str: HTML table.
    """
    otsl_content = otsl_pad_to_sqr_v2(otsl_content)
    tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content)
    table_cells, split_row_tokens = otsl_parse_texts(mixed_texts, tokens)
    table_data = TableData(
        num_rows=len(split_row_tokens),
        num_cols=(max(len(row) for row in split_row_tokens) if split_row_tokens else 0),
        table_cells=table_cells,
    )
    return export_to_html(table_data)

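# End-to-end sketch: <lcel> extends the cell to its left, so the second row
# becomes a single colspan=2 cell.
#   convert_otsl_to_html("<fcel>a<fcel>b<nl><fcel>c<lcel><nl>")
#   # -> '<table><tr><td>a</td><td>b</td></tr><tr><td colspan="2">c</td></tr></table>'
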
def find_shortest_repeating_substring(s: str) -> Union[str, None]:
    """
    Find the shortest substring that repeats to form the entire string.

    Args:
        s (str): Input string.

    Returns:
        str or None: Shortest repeating substring, or None if not found.
    """
    n = len(s)
    for i in range(1, n // 2 + 1):
        if n % i == 0:
            substring = s[:i]
            if substring * (n // i) == s:
                return substring
    return None

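# For example: the whole string must be an exact repetition of the unit.
#   find_shortest_repeating_substring("ababab")  # -> "ab"
#   find_shortest_repeating_substring("abcab")   # -> None
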
def find_repeating_suffix(
    s: str, min_len: int = 8, min_repeats: int = 5
) -> Union[Tuple[str, str, int], None]:
    """
    Detect if string ends with a repeating phrase.

    Args:
        s (str): Input string.
        min_len (int): Minimum length of unit.
        min_repeats (int): Minimum repeat count.

    Returns:
        Tuple[str, str, int] or None: (prefix, unit, count) if found, else None.
    """
    for i in range(len(s) // min_repeats, min_len - 1, -1):
        unit = s[-i:]
        if s.endswith(unit * min_repeats):
            count = 0
            temp_s = s
            while temp_s.endswith(unit):
                temp_s = temp_s[:-i]
                count += 1
            start_index = len(s) - (count * i)
            return s[:start_index], unit, count
    return None

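# Illustrative detection (the unit "again and " is 10 chars, repeated 6 times):
#   s = "intro " + "again and " * 6
#   find_repeating_suffix(s)  # -> ("intro ", "again and ", 6)
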
def truncate_repetitive_content(
    content: str, line_threshold: int = 10, char_threshold: int = 10, min_len: int = 10
) -> str:
    """
    Detect and truncate character-level, phrase-level, or line-level repetition in content.

    Args:
        content (str): Input text.
        line_threshold (int): Min lines for line-level truncation.
        char_threshold (int): Min repeats for char-level truncation.
        min_len (int): Min length for char-level check.

    Returns:
        str: Content with the detected repetition truncated, or the original content.
    """
    stripped_content = content.strip()
    if not stripped_content:
        return content
    # Priority 1: Phrase-level suffix repetition in long single lines.
    if "\n" not in stripped_content and len(stripped_content) > 100:
        suffix_match = find_repeating_suffix(stripped_content, min_len=8, min_repeats=5)
        if suffix_match:
            prefix, repeating_unit, count = suffix_match
            if len(repeating_unit) * count > len(stripped_content) * 0.5:
                return prefix
    # Priority 2: Full-string character-level repetition (e.g., 'ababab')
    if "\n" not in stripped_content and len(stripped_content) > min_len:
        repeating_unit = find_shortest_repeating_substring(stripped_content)
        if repeating_unit:
            count = len(stripped_content) // len(repeating_unit)
            if count >= char_threshold:
                return repeating_unit
    # Priority 3: Line-level repetition (e.g., same line repeated many times)
    lines = [line.strip() for line in content.split("\n") if line.strip()]
    if not lines:
        return content
    total_lines = len(lines)
    if total_lines < line_threshold:
        return content
    line_counts = Counter(lines)
    most_common_line, count = line_counts.most_common(1)[0]
    if count >= line_threshold and (count / total_lines) >= 0.8:
        return most_common_line
    return content

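# Illustrative truncations (hypothetical degenerate model output):
#   truncate_repetitive_content("ab" * 20)      # -> "ab"   (char-level repetition)
#   truncate_repetitive_content("line\n" * 12)  # -> "line" (line-level repetition)
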
def crop_margin(img):
    """
    Crop the light margin around the non-background content of an image.

    Args:
        img (np.ndarray): Input image (grayscale or BGR).

    Returns:
        np.ndarray: Cropped image, or the original image if nothing to crop.
    """
    import cv2

    if len(img.shape) == 3:
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    else:
        gray = img.copy()
    if gray.dtype != np.uint8:
        gray = gray.astype(np.uint8)
    max_val = gray.max()
    min_val = gray.min()
    if max_val == min_val:
        return img
    # Normalize to the full 0-255 range, then binarize: pixels darker than 200
    # after normalization are treated as content.
    data = (gray - min_val) / (max_val - min_val) * 255
    data = data.astype(np.uint8)
    _, binary = cv2.threshold(data, 200, 255, cv2.THRESH_BINARY_INV)
    coords = cv2.findNonZero(binary)
    if coords is None:
        return img
    x, y, w, h = cv2.boundingRect(coords)
    cropped = img[y : y + h, x : x + w]
    return cropped
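# Illustrative crop (hypothetical grayscale canvas): a dark 20x20 patch on a
# white background is all that survives.
#   canvas = np.full((100, 100), 255, dtype=np.uint8)
#   canvas[40:60, 40:60] = 0
#   crop_margin(canvas).shape  # -> (20, 20)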