"""Convert OTSL table markup into an HTML ``<table>`` string."""

import html
import itertools
import re
from typing import Any, Dict, List

from pydantic import (
    BaseModel,
    computed_field,
    model_validator,
)


class TableCell(BaseModel):
    """A table cell: its text, its spans and the grid area it occupies.

    The ``*_offset_idx`` fields are consumed as half-open ranges: a cell
    occupies rows ``[start_row_offset_idx, end_row_offset_idx)`` and columns
    ``[start_col_offset_idx, end_col_offset_idx)``.
    """

    row_span: int = 1
    col_span: int = 1
    start_row_offset_idx: int
    end_row_offset_idx: int
    start_col_offset_idx: int
    end_col_offset_idx: int
    text: str
    # ``column_header`` selects ``<th>`` over ``<td>`` in ``export_to_html``.
    column_header: bool = False
    row_header: bool = False
    row_section: bool = False

    @model_validator(mode="before")
    @classmethod
    def from_dict_format(cls, data: Any) -> Any:
        """Derive ``text`` for dict input that does not carry it.

        Dicts that already contain ``"text"`` and non-dict inputs are
        returned untouched. Otherwise the text is read from
        ``data["bbox"]["token"]``; when that is empty, the ``"token"`` values
        of ``data["text_cell_bboxes"]`` are joined with single spaces.

        Note that the input dict is modified in place: ``"text"`` is added
        and ``"text_cell_bboxes"`` is popped when it is consulted.

        Raises:
            KeyError: if the dict has neither ``"text"`` nor ``"bbox"``.
        """
        if isinstance(data, dict):
            if "text" in data:
                return data
            # ``or ""`` also covers an explicit ``"token": None``.
            text = data["bbox"].get("token") or ""
            if not text:
                text_cells = data.pop("text_cell_bboxes", None)
                if text_cells:
                    text = " ".join(el["token"] for el in text_cells)
                text = text.strip()
            data["text"] = text
        return data


class TableData(BaseModel):  # TBD
    """A table described by a flat list of cells plus its dimensions."""

    table_cells: List[TableCell] = []
    num_rows: int = 0
    num_cols: int = 0

    @computed_field  # type: ignore
    @property
    def grid(
        self,
    ) -> List[List[TableCell]]:
        """Return a ``num_rows`` x ``num_cols`` matrix of cells.

        Every position starts out as an empty 1x1 cell. Each cell in
        ``table_cells`` then overwrites all positions inside its offset
        range, so a spanning cell appears at several positions. Offsets are
        clamped to the table dimensions.
        """
        table_data = [
            [
                TableCell(
                    text="",
                    start_row_offset_idx=i,
                    end_row_offset_idx=i + 1,
                    start_col_offset_idx=j,
                    end_col_offset_idx=j + 1,
                )
                for j in range(self.num_cols)
            ]
            for i in range(self.num_rows)
        ]

        # Overwrite the placeholders wherever there is actual cell content.
        for cell in self.table_cells:
            row_range = range(
                min(cell.start_row_offset_idx, self.num_rows),
                min(cell.end_row_offset_idx, self.num_rows),
            )
            col_range = range(
                min(cell.start_col_offset_idx, self.num_cols),
                min(cell.end_col_offset_idx, self.num_cols),
            )
            for i in row_range:
                for j in col_range:
                    table_data[i][j] = cell

        return table_data


# OTSL tokens
OTSL_NL = "<nl>"  # end of a table row
OTSL_FCEL = "<fcel>"  # cell with content
OTSL_ECEL = "<ecel>"  # empty cell
OTSL_LCEL = "<lcel>"  # continuation of the cell to the left (column span)
OTSL_UCEL = "<ucel>"  # continuation of the cell above (row span)
OTSL_XCEL = "<xcel>"  # continuation in both directions (2D span)

_OTSL_TOKENS = (
    OTSL_NL,
    OTSL_FCEL,
    OTSL_ECEL,
    OTSL_LCEL,
    OTSL_UCEL,
    OTSL_XCEL,
)
_OTSL_CELL_TOKENS = (OTSL_FCEL, OTSL_ECEL, OTSL_LCEL, OTSL_UCEL, OTSL_XCEL)
_OTSL_HSPAN_TOKENS = [OTSL_LCEL, OTSL_XCEL]
_OTSL_VSPAN_TOKENS = [OTSL_UCEL, OTSL_XCEL]

# The capturing group makes ``split`` keep the tokens in its result.
_OTSL_TOKEN_PATTERN = re.compile(
    "(" + "|".join(re.escape(token) for token in _OTSL_TOKENS) + ")"
)


def otsl_extract_tokens_and_text(s: str):
    """Split an OTSL string into its tokens and its token/text sequence.

    Args:
        s: OTSL markup, e.g. ``"<fcel>A<fcel>B<nl>"``.

    Returns:
        A ``(tokens, text_parts)`` tuple. ``tokens`` holds only the OTSL
        tokens in order of appearance. ``text_parts`` holds the tokens
        interleaved with the text found between them, with empty and
        whitespace-only pieces removed.
    """
    tokens = _OTSL_TOKEN_PATTERN.findall(s)
    # NOTE(review): pieces starting with "<loc_" are dropped on the
    # assumption that they are location tags rather than cell text --
    # confirm against the upstream implementation.
    text_parts = [
        part
        for part in _OTSL_TOKEN_PATTERN.split(s)
        if part.strip() and not part.startswith("<loc_")
    ]
    return tokens, text_parts


def count_right(tokens, c_idx, r_idx, which_tokens):
    """Count consecutive ``which_tokens`` in row ``r_idx`` from ``c_idx`` on."""
    row = tokens[r_idx]
    span = 0
    while c_idx + span < len(row) and row[c_idx + span] in which_tokens:
        span += 1
    return span


def count_down(tokens, c_idx, r_idx, which_tokens):
    """Count consecutive ``which_tokens`` in column ``c_idx`` from ``r_idx`` on."""
    span = 0
    while (
        r_idx + span < len(tokens)
        and c_idx < len(tokens[r_idx + span])
        and tokens[r_idx + span][c_idx] in which_tokens
    ):
        span += 1
    return span


def _token_at(rows, r_idx, c_idx):
    """Return the token at ``(r_idx, c_idx)`` or ``""`` when out of range."""
    if r_idx < len(rows) and c_idx < len(rows[r_idx]):
        return rows[r_idx][c_idx]
    return ""


def _align_texts_with_rows(texts, rows):
    """Rebuild ``texts`` so its tokens mirror ``rows`` exactly.

    Each row contributes its tokens followed by ``OTSL_NL``. A token that
    matches the next unread entry of ``texts`` also pulls in the text piece
    following it, if any. Padding cells that have no counterpart in
    ``texts`` are emitted without text.
    """
    # NOTE(review): the text lookup is nested under the token match, as in
    # the upstream implementation -- the nesting was not recoverable here.
    aligned = []
    text_idx = 0
    for row in rows:
        for token in row:
            aligned.append(token)
            if text_idx < len(texts) and texts[text_idx] == token:
                text_idx += 1
                if (
                    text_idx < len(texts)
                    and texts[text_idx] not in _OTSL_TOKENS
                ):
                    aligned.append(texts[text_idx])
                    text_idx += 1
        aligned.append(OTSL_NL)
        if text_idx < len(texts) and texts[text_idx] == OTSL_NL:
            text_idx += 1
    return aligned


def otsl_parse_texts(texts, tokens):
    """Build table cells from the output of ``otsl_extract_tokens_and_text``.

    Args:
        texts: tokens interleaved with cell text.
        tokens: the OTSL tokens only.

    Returns:
        A ``(table_cells, split_row_tokens)`` tuple: the list of
        ``TableCell`` objects (one per ``<fcel>``/``<ecel>``) and the token
        grid as a list of rows, padded with ``<ecel>`` to equal length.
    """
    # Group the tokens into rows, dropping the row separators.
    split_row_tokens = [
        list(group)
        for is_separator, group in itertools.groupby(
            tokens, lambda token: token == OTSL_NL
        )
        if not is_separator
    ]
    table_cells = []
    r_idx = 0
    c_idx = 0

    # Complete the matrix: pad short rows with empty cells, then bring the
    # text sequence in line with the padded grid.
    if split_row_tokens:
        max_cols = max(len(row) for row in split_row_tokens)
        for row in split_row_tokens:
            row.extend([OTSL_ECEL] * (max_cols - len(row)))
        texts = _align_texts_with_rows(texts, split_row_tokens)

    for i, text in enumerate(texts):
        if text in (OTSL_FCEL, OTSL_ECEL):
            row_span = 1
            col_span = 1
            cell_text = ""
            # A filled cell owns the next entry only if that entry is text;
            # otherwise the following token would be taken as the content.
            if (
                text == OTSL_FCEL
                and i + 1 < len(texts)
                and texts[i + 1] not in _OTSL_TOKENS
            ):
                cell_text = texts[i + 1]

            # Spans are read from the token grid rather than from ``texts``
            # so that stray text cannot shift the neighbour lookup.
            if _token_at(split_row_tokens, r_idx, c_idx + 1) in _OTSL_HSPAN_TOKENS:
                # Horizontally spanning or 2D spanning cell.
                col_span += count_right(
                    split_row_tokens, c_idx + 1, r_idx, _OTSL_HSPAN_TOKENS
                )
            if _token_at(split_row_tokens, r_idx + 1, c_idx) in _OTSL_VSPAN_TOKENS:
                # Vertically spanning or 2D spanning cell.
                row_span += count_down(
                    split_row_tokens, c_idx, r_idx + 1, _OTSL_VSPAN_TOKENS
                )

            table_cells.append(
                TableCell(
                    text=cell_text.strip(),
                    row_span=row_span,
                    col_span=col_span,
                    start_row_offset_idx=r_idx,
                    end_row_offset_idx=r_idx + row_span,
                    start_col_offset_idx=c_idx,
                    end_col_offset_idx=c_idx + col_span,
                )
            )

        if text in _OTSL_CELL_TOKENS:
            c_idx += 1
        elif text == OTSL_NL:
            r_idx += 1
            c_idx = 0

    return table_cells, split_row_tokens


def export_to_html(table_data: TableData) -> str:
    """Render ``table_data`` as an HTML ``<table>`` string.

    Returns ``""`` when the table has no cells. Cell text is HTML-escaped.
    A spanning cell is emitted once, at its top-left grid position, with
    ``rowspan``/``colspan`` attributes; cells flagged ``column_header`` use
    ``<th>``, all others ``<td>``.
    """
    if not table_data.table_cells:
        return ""

    grid = table_data.grid
    parts = ["<table>"]
    for i in range(table_data.num_rows):
        parts.append("<tr>")
        for j in range(table_data.num_cols):
            cell: TableCell = grid[i][j]
            # The grid repeats spanning cells; emit only at their origin.
            if cell.start_row_offset_idx != i or cell.start_col_offset_idx != j:
                continue

            celltag = "th" if cell.column_header else "td"
            opening_tag = celltag
            if cell.row_span > 1:
                opening_tag += f' rowspan="{cell.row_span}"'
            if cell.col_span > 1:
                opening_tag += f' colspan="{cell.col_span}"'

            content = html.escape(cell.text.strip())
            parts.append(f"<{opening_tag}>{content}</{celltag}>")
        parts.append("</tr>")
    parts.append("</table>")

    return "".join(parts)


def convert_otsl_to_html(otsl_content: str):
    """Convert an OTSL string to HTML; returns ``""`` if it has no cells."""
    tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content)
    table_cells, split_row_tokens = otsl_parse_texts(mixed_texts, tokens)

    table_data = TableData(
        num_rows=len(split_row_tokens),
        num_cols=(
            max(len(row) for row in split_row_tokens) if split_row_tokens else 0
        ),
        table_cells=table_cells,
    )

    return export_to_html(table_data)