tablepyxl.py 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149
  1. # copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. from __future__ import absolute_import
  15. from lxml import html
  16. from openpyxl import Workbook
  17. from openpyxl.utils import get_column_letter
  18. from premailer import Premailer
  19. from .style import Table
  20. def string_to_int(s):
  21. """
  22. Convert a string to an integer
  23. """
  24. if s.isdigit():
  25. return int(s)
  26. return 0
  27. def get_Tables(doc):
  28. """
  29. Find all the tables in the doc
  30. """
  31. tree = html.fromstring(doc)
  32. comments = tree.xpath("//comment()")
  33. for comment in comments:
  34. comment.drop_tag()
  35. return [Table(table) for table in tree.xpath("//table")]
  36. def write_rows(worksheet, elem, row, column=1):
  37. """
  38. Writes every tr child element of elem to a row in the worksheet
  39. returns the next row after all rows are written
  40. """
  41. from openpyxl.cell.cell import MergedCell
  42. initial_column = column
  43. for table_row in elem.rows:
  44. for table_cell in table_row.cells:
  45. cell = worksheet.cell(row=row, column=column)
  46. while isinstance(cell, MergedCell):
  47. column += 1
  48. cell = worksheet.cell(row=row, column=column)
  49. colspan = string_to_int(table_cell.element.get("colspan", "1"))
  50. rowspan = string_to_int(table_cell.element.get("rowspan", "1"))
  51. if rowspan > 1 or colspan > 1:
  52. worksheet.merge_cells(
  53. start_row=row,
  54. start_column=column,
  55. end_row=row + rowspan - 1,
  56. end_column=column + colspan - 1,
  57. )
  58. cell.value = table_cell.value
  59. table_cell.format(cell)
  60. min_width = table_cell.get_dimension("min-width")
  61. max_width = table_cell.get_dimension("max-width")
  62. if colspan == 1:
  63. # Initially, when iterating for the first time through the loop, the width of all the cells is None.
  64. # As we start filling in contents, the initial width of the cell (which can be retrieved by:
  65. # worksheet.column_dimensions[get_column_letter(column)].width) is equal to the width of the previous
  66. # cell in the same column (i.e. width of A2 = width of A1)
  67. width = max(
  68. worksheet.column_dimensions[get_column_letter(column)].width or 0,
  69. len(table_cell.value) + 2,
  70. )
  71. if max_width and width > max_width:
  72. width = max_width
  73. elif min_width and width < min_width:
  74. width = min_width
  75. worksheet.column_dimensions[get_column_letter(column)].width = width
  76. column += colspan
  77. row += 1
  78. column = initial_column
  79. return row
  80. def table_to_sheet(table, wb):
  81. """
  82. Takes a table and workbook and writes the table to a new sheet.
  83. The sheet title will be the same as the table attribute name.
  84. """
  85. ws = wb.create_sheet(title=table.element.get("name"))
  86. insert_table(table, ws, 1, 1)
  87. def document_to_workbook(doc, wb=None, base_url=None):
  88. """
  89. Takes a string representation of an html document and writes one sheet for
  90. every table in the document.
  91. The workbook is returned
  92. """
  93. if not wb:
  94. wb = Workbook()
  95. wb.remove(wb.active)
  96. inline_styles_doc = Premailer(
  97. doc, base_url=base_url, remove_classes=False
  98. ).transform()
  99. tables = get_Tables(inline_styles_doc)
  100. for table in tables:
  101. table_to_sheet(table, wb)
  102. return wb
  103. def document_to_xl(doc, filename, base_url=None):
  104. """
  105. Takes a string representation of an html document and writes one sheet for
  106. every table in the document. The workbook is written out to a file called filename
  107. """
  108. wb = document_to_workbook(doc, base_url=base_url)
  109. wb.save(filename)
  110. def insert_table(table, worksheet, column, row):
  111. """
  112. Inserts a table into the worksheet at the specified column and row
  113. """
  114. if table.head:
  115. row = write_rows(worksheet, table.head, row, column)
  116. if table.body:
  117. row = write_rows(worksheet, table.body, row, column)
  118. def insert_table_at_cell(table, cell):
  119. """
  120. Inserts a table at the location of an openpyxl Cell object.
  121. """
  122. ws = cell.parent
  123. column, row = cell.column, cell.row
  124. insert_table(table, ws, column, row)