tablepyxl.py 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157
  1. # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. from __future__ import absolute_import
  15. from ....utils.deps import function_requires_deps, is_dep_available
  16. from .style import Table
  17. if is_dep_available("lxml"):
  18. from lxml import html
  19. if is_dep_available("openpyxl"):
  20. from openpyxl import Workbook
  21. from openpyxl.utils import get_column_letter
  22. if is_dep_available("premailer"):
  23. from premailer import Premailer
  24. def string_to_int(s):
  25. """
  26. Convert a string to an integer
  27. """
  28. if s.isdigit():
  29. return int(s)
  30. return 0
  31. @function_requires_deps("lxml")
  32. def get_Tables(doc):
  33. """
  34. Find all the tables in the doc
  35. """
  36. tree = html.fromstring(doc)
  37. comments = tree.xpath("//comment()")
  38. for comment in comments:
  39. comment.drop_tag()
  40. return [Table(table) for table in tree.xpath("//table")]
  41. @function_requires_deps("openpyxl")
  42. def write_rows(worksheet, elem, row, column=1):
  43. """
  44. Writes every tr child element of elem to a row in the worksheet
  45. returns the next row after all rows are written
  46. """
  47. from openpyxl.cell.cell import MergedCell
  48. initial_column = column
  49. for table_row in elem.rows:
  50. for table_cell in table_row.cells:
  51. cell = worksheet.cell(row=row, column=column)
  52. while isinstance(cell, MergedCell):
  53. column += 1
  54. cell = worksheet.cell(row=row, column=column)
  55. colspan = string_to_int(table_cell.element.get("colspan", "1"))
  56. rowspan = string_to_int(table_cell.element.get("rowspan", "1"))
  57. if rowspan > 1 or colspan > 1:
  58. worksheet.merge_cells(
  59. start_row=row,
  60. start_column=column,
  61. end_row=row + rowspan - 1,
  62. end_column=column + colspan - 1,
  63. )
  64. cell.value = table_cell.value
  65. table_cell.format(cell)
  66. min_width = table_cell.get_dimension("min-width")
  67. max_width = table_cell.get_dimension("max-width")
  68. if colspan == 1:
  69. # Initially, when iterating for the first time through the loop, the width of all the cells is None.
  70. # As we start filling in contents, the initial width of the cell (which can be retrieved by:
  71. # worksheet.column_dimensions[get_column_letter(column)].width) is equal to the width of the previous
  72. # cell in the same column (i.e. width of A2 = width of A1)
  73. width = max(
  74. worksheet.column_dimensions[get_column_letter(column)].width or 0,
  75. len(table_cell.value) + 2,
  76. )
  77. if max_width and width > max_width:
  78. width = max_width
  79. elif min_width and width < min_width:
  80. width = min_width
  81. worksheet.column_dimensions[get_column_letter(column)].width = width
  82. column += colspan
  83. row += 1
  84. column = initial_column
  85. return row
  86. def table_to_sheet(table, wb):
  87. """
  88. Takes a table and workbook and writes the table to a new sheet.
  89. The sheet title will be the same as the table attribute name.
  90. """
  91. ws = wb.create_sheet(title=table.element.get("name"))
  92. insert_table(table, ws, 1, 1)
  93. @function_requires_deps("openpyxl", "premailer")
  94. def document_to_workbook(doc, wb=None, base_url=None):
  95. """
  96. Takes a string representation of an html document and writes one sheet for
  97. every table in the document.
  98. The workbook is returned
  99. """
  100. if not wb:
  101. wb = Workbook()
  102. wb.remove(wb.active)
  103. inline_styles_doc = Premailer(
  104. doc, base_url=base_url, remove_classes=False
  105. ).transform()
  106. tables = get_Tables(inline_styles_doc)
  107. for table in tables:
  108. table_to_sheet(table, wb)
  109. return wb
  110. def document_to_xl(doc, filename, base_url=None):
  111. """
  112. Takes a string representation of an html document and writes one sheet for
  113. every table in the document. The workbook is written out to a file called filename
  114. """
  115. wb = document_to_workbook(doc, base_url=base_url)
  116. wb.save(filename)
  117. def insert_table(table, worksheet, column, row):
  118. """
  119. Inserts a table into the worksheet at the specified column and row
  120. """
  121. if table.head:
  122. row = write_rows(worksheet, table.head, row, column)
  123. if table.body:
  124. row = write_rows(worksheet, table.body, row, column)
  125. def insert_table_at_cell(table, cell):
  126. """
  127. Inserts a table at the location of an openpyxl Cell object.
  128. """
  129. ws = cell.parent
  130. column, row = cell.column, cell.row
  131. insert_table(table, ws, column, row)