# format_utils.py (10.0 KB)


import html
import itertools
import re
from typing import Any, Dict, List, Tuple

from pydantic import (
    BaseModel,
    computed_field,
    model_validator,
)
  10. class TableCell(BaseModel):
  11. """TableCell."""
  12. row_span: int = 1
  13. col_span: int = 1
  14. start_row_offset_idx: int
  15. end_row_offset_idx: int
  16. start_col_offset_idx: int
  17. end_col_offset_idx: int
  18. text: str
  19. column_header: bool = False
  20. row_header: bool = False
  21. row_section: bool = False
  22. @model_validator(mode="before")
  23. @classmethod
  24. def from_dict_format(cls, data: Any) -> Any:
  25. """from_dict_format."""
  26. if isinstance(data, Dict):
  27. # Check if this is a native BoundingBox or a bbox from docling-ibm-models
  28. if (
  29. # "bbox" not in data
  30. # or data["bbox"] is None
  31. # or isinstance(data["bbox"], BoundingBox)
  32. "text"
  33. in data
  34. ):
  35. return data
  36. text = data["bbox"].get("token", "")
  37. if not len(text):
  38. text_cells = data.pop("text_cell_bboxes", None)
  39. if text_cells:
  40. for el in text_cells:
  41. text += el["token"] + " "
  42. text = text.strip()
  43. data["text"] = text
  44. return data
  45. class TableData(BaseModel): # TBD
  46. """BaseTableData."""
  47. table_cells: List[TableCell] = []
  48. num_rows: int = 0
  49. num_cols: int = 0
  50. @computed_field # type: ignore
  51. @property
  52. def grid(
  53. self,
  54. ) -> List[List[TableCell]]:
  55. """grid."""
  56. # Initialise empty table data grid (only empty cells)
  57. table_data = [
  58. [
  59. TableCell(
  60. text="",
  61. start_row_offset_idx=i,
  62. end_row_offset_idx=i + 1,
  63. start_col_offset_idx=j,
  64. end_col_offset_idx=j + 1,
  65. )
  66. for j in range(self.num_cols)
  67. ]
  68. for i in range(self.num_rows)
  69. ]
  70. # Overwrite cells in table data for which there is actual cell content.
  71. for cell in self.table_cells:
  72. for i in range(
  73. min(cell.start_row_offset_idx, self.num_rows),
  74. min(cell.end_row_offset_idx, self.num_rows),
  75. ):
  76. for j in range(
  77. min(cell.start_col_offset_idx, self.num_cols),
  78. min(cell.end_col_offset_idx, self.num_cols),
  79. ):
  80. table_data[i][j] = cell
  81. return table_data
  82. """
  83. OTSL
  84. """
  85. OTSL_NL = "<nl>"
  86. OTSL_FCEL = "<fcel>"
  87. OTSL_ECEL = "<ecel>"
  88. OTSL_LCEL = "<lcel>"
  89. OTSL_UCEL = "<ucel>"
  90. OTSL_XCEL = "<xcel>"
  91. def otsl_extract_tokens_and_text(s: str):
  92. # Pattern to match anything enclosed by < >
  93. # (including the angle brackets themselves)
  94. # pattern = r"(<[^>]+>)"
  95. pattern = r"(" + r"|".join([OTSL_NL, OTSL_FCEL, OTSL_ECEL, OTSL_LCEL, OTSL_UCEL, OTSL_XCEL]) + r")"
  96. # Find all tokens (e.g. "<otsl>", "<loc_140>", etc.)
  97. tokens = re.findall(pattern, s)
  98. # Remove any tokens that start with "<loc_"
  99. tokens = [token for token in tokens]
  100. # Split the string by those tokens to get the in-between text
  101. text_parts = re.split(pattern, s)
  102. text_parts = [token for token in text_parts]
  103. # Remove any empty or purely whitespace strings from text_parts
  104. text_parts = [part for part in text_parts if part.strip()]
  105. return tokens, text_parts
  106. def otsl_parse_texts(texts, tokens):
  107. split_word = OTSL_NL
  108. split_row_tokens = [
  109. list(y)
  110. for x, y in itertools.groupby(tokens, lambda z: z == split_word)
  111. if not x
  112. ]
  113. table_cells = []
  114. r_idx = 0
  115. c_idx = 0
  116. # Check and complete the matrix
  117. if split_row_tokens:
  118. max_cols = max(len(row) for row in split_row_tokens)
  119. # Insert additional <ecel> to tags
  120. for row_idx, row in enumerate(split_row_tokens):
  121. while len(row) < max_cols:
  122. row.append(OTSL_ECEL)
  123. # Insert additional <ecel> to texts
  124. new_texts = []
  125. text_idx = 0
  126. for row_idx, row in enumerate(split_row_tokens):
  127. for col_idx, token in enumerate(row):
  128. new_texts.append(token)
  129. if text_idx < len(texts) and texts[text_idx] == token:
  130. text_idx += 1
  131. if (text_idx < len(texts) and
  132. texts[text_idx] not in [OTSL_NL, OTSL_FCEL, OTSL_ECEL, OTSL_LCEL, OTSL_UCEL, OTSL_XCEL]):
  133. new_texts.append(texts[text_idx])
  134. text_idx += 1
  135. new_texts.append(OTSL_NL)
  136. if text_idx < len(texts) and texts[text_idx] == OTSL_NL:
  137. text_idx += 1
  138. texts = new_texts
  139. def count_right(tokens, c_idx, r_idx, which_tokens):
  140. span = 0
  141. c_idx_iter = c_idx
  142. while tokens[r_idx][c_idx_iter] in which_tokens:
  143. c_idx_iter += 1
  144. span += 1
  145. if c_idx_iter >= len(tokens[r_idx]):
  146. return span
  147. return span
  148. def count_down(tokens, c_idx, r_idx, which_tokens):
  149. span = 0
  150. r_idx_iter = r_idx
  151. while tokens[r_idx_iter][c_idx] in which_tokens:
  152. r_idx_iter += 1
  153. span += 1
  154. if r_idx_iter >= len(tokens):
  155. return span
  156. return span
  157. for i, text in enumerate(texts):
  158. cell_text = ""
  159. if text in [
  160. OTSL_FCEL,
  161. OTSL_ECEL,
  162. ]:
  163. row_span = 1
  164. col_span = 1
  165. right_offset = 1
  166. if text != OTSL_ECEL:
  167. cell_text = texts[i + 1]
  168. right_offset = 2
  169. # Check next element(s) for lcel / ucel / xcel,
  170. # set properly row_span, col_span
  171. next_right_cell = ""
  172. if i + right_offset < len(texts):
  173. next_right_cell = texts[i + right_offset]
  174. next_bottom_cell = ""
  175. if r_idx + 1 < len(split_row_tokens):
  176. if c_idx < len(split_row_tokens[r_idx + 1]):
  177. next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]
  178. if next_right_cell in [
  179. OTSL_LCEL,
  180. OTSL_XCEL,
  181. ]:
  182. # we have horisontal spanning cell or 2d spanning cell
  183. col_span += count_right(
  184. split_row_tokens,
  185. c_idx + 1,
  186. r_idx,
  187. [OTSL_LCEL, OTSL_XCEL],
  188. )
  189. if next_bottom_cell in [
  190. OTSL_UCEL,
  191. OTSL_XCEL,
  192. ]:
  193. # we have a vertical spanning cell or 2d spanning cell
  194. row_span += count_down(
  195. split_row_tokens,
  196. c_idx,
  197. r_idx + 1,
  198. [OTSL_UCEL, OTSL_XCEL],
  199. )
  200. table_cells.append(
  201. TableCell(
  202. text=cell_text.strip(),
  203. row_span=row_span,
  204. col_span=col_span,
  205. start_row_offset_idx=r_idx,
  206. end_row_offset_idx=r_idx + row_span,
  207. start_col_offset_idx=c_idx,
  208. end_col_offset_idx=c_idx + col_span,
  209. )
  210. )
  211. if text in [
  212. OTSL_FCEL,
  213. OTSL_ECEL,
  214. OTSL_LCEL,
  215. OTSL_UCEL,
  216. OTSL_XCEL,
  217. ]:
  218. c_idx += 1
  219. if text == OTSL_NL:
  220. r_idx += 1
  221. c_idx = 0
  222. return table_cells, split_row_tokens
  223. def export_to_html(table_data: TableData):
  224. nrows = table_data.num_rows
  225. ncols = table_data.num_cols
  226. text = ""
  227. if len(table_data.table_cells) == 0:
  228. return ""
  229. body = ""
  230. grid = table_data.grid
  231. for i in range(nrows):
  232. body += "<tr>"
  233. for j in range(ncols):
  234. cell: TableCell = grid[i][j]
  235. rowspan, rowstart = (
  236. cell.row_span,
  237. cell.start_row_offset_idx,
  238. )
  239. colspan, colstart = (
  240. cell.col_span,
  241. cell.start_col_offset_idx,
  242. )
  243. if rowstart != i:
  244. continue
  245. if colstart != j:
  246. continue
  247. content = html.escape(cell.text.strip())
  248. celltag = "td"
  249. if cell.column_header:
  250. celltag = "th"
  251. opening_tag = f"{celltag}"
  252. if rowspan > 1:
  253. opening_tag += f' rowspan="{rowspan}"'
  254. if colspan > 1:
  255. opening_tag += f' colspan="{colspan}"'
  256. body += f"<{opening_tag}>{content}</{celltag}>"
  257. body += "</tr>"
  258. # dir = get_text_direction(text)
  259. body = f"<table>{body}</table>"
  260. return body
  261. def convert_otsl_to_html(otsl_content: str):
  262. tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content)
  263. table_cells, split_row_tokens = otsl_parse_texts(mixed_texts, tokens)
  264. table_data = TableData(
  265. num_rows=len(split_row_tokens),
  266. num_cols=(
  267. max(len(row) for row in split_row_tokens) if split_row_tokens else 0
  268. ),
  269. table_cells=table_cells,
  270. )
  271. return export_to_html(table_data)
  272. def block_content_to_html(block_content: str) -> str:
  273. """
  274. Converts block content containing OTSL (Open Table Structure Language) tags into HTML.
  275. This function processes a block of text, splitting it into lines and converting any lines
  276. containing OTSL table tags (e.g., <fcel>, <ecel>) into HTML tables. Lines without these
  277. tags are left unchanged.
  278. Parameters:
  279. block_content (str): A string containing block content with potential OTSL tags.
  280. Returns:
  281. str: The processed block content with OTSL tags converted to HTML tables.
  282. """
  283. lines = block_content.split("\n\n")
  284. new_lines = []
  285. for line in lines:
  286. if "<fcel>" in line or "<ecel>" in line:
  287. line = convert_otsl_to_html(line)
  288. new_lines.append(line)
  289. return "\n\n".join(new_lines)