style.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381
  1. # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. from ....utils.deps import class_requires_deps, function_requires_deps, is_dep_available
  15. if is_dep_available("openpyxl"):
  16. from openpyxl.cell import cell
  17. from openpyxl.styles import (
  18. Alignment,
  19. Border,
  20. Color,
  21. Font,
  22. NamedStyle,
  23. PatternFill,
  24. Side,
  25. )
  26. from openpyxl.styles.colors import BLACK
  27. from openpyxl.styles.fills import FILL_SOLID
  28. from openpyxl.styles.numbers import FORMAT_CURRENCY_USD_SIMPLE, FORMAT_PERCENTAGE
  29. FORMAT_DATE_MMDDYYYY = "mm/dd/yyyy"
  30. @function_requires_deps("openpyxl")
  31. def colormap(color):
  32. """
  33. Convenience for looking up known colors
  34. """
  35. cmap = {"black": BLACK}
  36. return cmap.get(color, color)
  37. def style_string_to_dict(style):
  38. """
  39. Convert css style string to a python dictionary
  40. """
  41. def clean_split(string, delim):
  42. """
  43. Clean up a string by removing all spaces and splitting on delim.
  44. """
  45. return (s.strip() for s in string.split(delim))
  46. styles = [clean_split(s, ":") for s in style.split(";") if ":" in s]
  47. return dict(styles)
  48. def get_side(style, name):
  49. """
  50. get side
  51. """
  52. return {
  53. "border_style": style.get("border-{}-style".format(name)),
  54. "color": colormap(style.get("border-{}-color".format(name))),
  55. }
  56. known_styles = {}
  57. @function_requires_deps("openpyxl")
  58. def style_dict_to_named_style(style_dict, number_format=None):
  59. """
  60. Change css style (stored in a python dictionary) to openpyxl NamedStyle
  61. """
  62. style_and_format_string = str(
  63. {
  64. "style_dict": style_dict,
  65. "parent": style_dict.parent,
  66. "number_format": number_format,
  67. }
  68. )
  69. if style_and_format_string not in known_styles:
  70. # Font
  71. font = Font(
  72. bold=style_dict.get("font-weight") == "bold",
  73. color=style_dict.get_color("color", None),
  74. size=style_dict.get("font-size"),
  75. )
  76. # Alignment
  77. alignment = Alignment(
  78. horizontal=style_dict.get("text-align", "general"),
  79. vertical=style_dict.get("vertical-align"),
  80. wrap_text=style_dict.get("white-space", "nowrap") == "normal",
  81. )
  82. # Fill
  83. bg_color = style_dict.get_color("background-color")
  84. fg_color = style_dict.get_color("foreground-color", Color())
  85. fill_type = style_dict.get("fill-type")
  86. if bg_color and bg_color != "transparent":
  87. fill = PatternFill(
  88. fill_type=fill_type or FILL_SOLID,
  89. start_color=bg_color,
  90. end_color=fg_color,
  91. )
  92. else:
  93. fill = PatternFill()
  94. # Border
  95. border = Border(
  96. left=Side(**get_side(style_dict, "left")),
  97. right=Side(**get_side(style_dict, "right")),
  98. top=Side(**get_side(style_dict, "top")),
  99. bottom=Side(**get_side(style_dict, "bottom")),
  100. diagonal=Side(**get_side(style_dict, "diagonal")),
  101. diagonal_direction=None,
  102. outline=Side(**get_side(style_dict, "outline")),
  103. vertical=None,
  104. horizontal=None,
  105. )
  106. name = "Style {}".format(len(known_styles) + 1)
  107. pyxl_style = NamedStyle(
  108. name=name,
  109. font=font,
  110. fill=fill,
  111. alignment=alignment,
  112. border=border,
  113. number_format=number_format,
  114. )
  115. known_styles[style_and_format_string] = pyxl_style
  116. return known_styles[style_and_format_string]
  117. class StyleDict(dict):
  118. """
  119. It's like a dictionary, but it looks for items in the parent dictionary
  120. """
  121. def __init__(self, *args, **kwargs):
  122. self.parent = kwargs.pop("parent", None)
  123. super(StyleDict, self).__init__(*args, **kwargs)
  124. def __getitem__(self, item):
  125. if item in self:
  126. return super(StyleDict, self).__getitem__(item)
  127. elif self.parent:
  128. return self.parent[item]
  129. else:
  130. raise KeyError("{} not found".format(item))
  131. def __hash__(self):
  132. return hash(tuple([(k, self.get(k)) for k in self._keys()]))
  133. # Yielding the keys avoids creating unnecessary data structures
  134. # and happily works with both python2 and python3 where the
  135. # .keys() method is a dictionary_view in python3 and a list in python2.
  136. def _keys(self):
  137. yielded = set()
  138. for k in self.keys():
  139. yielded.add(k)
  140. yield k
  141. if self.parent:
  142. for k in self.parent._keys():
  143. if k not in yielded:
  144. yielded.add(k)
  145. yield k
  146. def get(self, k, d=None):
  147. try:
  148. return self[k]
  149. except KeyError:
  150. return d
  151. def get_color(self, k, d=None):
  152. """
  153. Strip leading # off colors if necessary
  154. """
  155. color = self.get(k, d)
  156. if hasattr(color, "startswith") and color.startswith("#"):
  157. color = color[1:]
  158. if (
  159. len(color) == 3
  160. ): # Premailers reduces colors like #00ff00 to #0f0, openpyxl doesn't like that
  161. color = "".join(2 * c for c in color)
  162. return color
  163. class Element(object):
  164. """
  165. Our base class for representing an html element along with a cascading style.
  166. The element is created along with a parent so that the StyleDict that we store
  167. can point to the parent's StyleDict.
  168. """
  169. def __init__(self, element, parent=None):
  170. self.element = element
  171. self.number_format = None
  172. parent_style = parent.style_dict if parent else None
  173. self.style_dict = StyleDict(
  174. style_string_to_dict(element.get("style", "")), parent=parent_style
  175. )
  176. self._style_cache = None
  177. def style(self):
  178. """
  179. Turn the css styles for this element into an openpyxl NamedStyle.
  180. """
  181. if not self._style_cache:
  182. self._style_cache = style_dict_to_named_style(
  183. self.style_dict, number_format=self.number_format
  184. )
  185. return self._style_cache
  186. def get_dimension(self, dimension_key):
  187. """
  188. Extracts the dimension from the style dict of the Element and returns it as a float.
  189. """
  190. dimension = self.style_dict.get(dimension_key)
  191. if dimension:
  192. if dimension[-2:] in ["px", "em", "pt", "in", "cm"]:
  193. dimension = dimension[:-2]
  194. dimension = float(dimension)
  195. return dimension
  196. class Table(Element):
  197. """
  198. The concrete implementations of Elements are semantically named for the types of elements we are interested in.
  199. This defines a very concrete tree structure for html tables that we expect to deal with. I prefer this compared to
  200. allowing Element to have an arbitrary number of children and dealing with an abstract element tree.
  201. """
  202. def __init__(self, table):
  203. """
  204. takes an html table object (from lxml)
  205. """
  206. super(Table, self).__init__(table)
  207. table_head = table.find("thead")
  208. self.head = (
  209. TableHead(table_head, parent=self) if table_head is not None else None
  210. )
  211. table_body = table.find("tbody")
  212. self.body = TableBody(
  213. table_body if table_body is not None else table, parent=self
  214. )
  215. class TableHead(Element):
  216. """
  217. This class maps to the `<th>` element of the html table.
  218. """
  219. def __init__(self, head, parent=None):
  220. super(TableHead, self).__init__(head, parent=parent)
  221. self.rows = [TableRow(tr, parent=self) for tr in head.findall("tr")]
  222. class TableBody(Element):
  223. """
  224. This class maps to the `<tbody>` element of the html table.
  225. """
  226. def __init__(self, body, parent=None):
  227. super(TableBody, self).__init__(body, parent=parent)
  228. self.rows = [TableRow(tr, parent=self) for tr in body.findall("tr")]
  229. class TableRow(Element):
  230. """
  231. This class maps to the `<tr>` element of the html table.
  232. """
  233. def __init__(self, tr, parent=None):
  234. super(TableRow, self).__init__(tr, parent=parent)
  235. self.cells = [
  236. TableCell(cell, parent=self) for cell in tr.findall("th") + tr.findall("td")
  237. ]
  238. def element_to_string(el):
  239. """
  240. element to string
  241. """
  242. return _element_to_string(el).strip()
  243. def _element_to_string(el):
  244. """
  245. element to string
  246. """
  247. string = ""
  248. for x in el.iterchildren():
  249. string += "\n" + _element_to_string(x)
  250. text = el.text.strip() if el.text else ""
  251. tail = el.tail.strip() if el.tail else ""
  252. return text + string + "\n" + tail
  253. @class_requires_deps("openpyxl")
  254. class TableCell(Element):
  255. """
  256. This class maps to the `<td>` element of the html table.
  257. """
  258. CELL_TYPES = {
  259. "TYPE_STRING",
  260. "TYPE_FORMULA",
  261. "TYPE_NUMERIC",
  262. "TYPE_BOOL",
  263. "TYPE_CURRENCY",
  264. "TYPE_PERCENTAGE",
  265. "TYPE_NULL",
  266. "TYPE_INLINE",
  267. "TYPE_ERROR",
  268. "TYPE_FORMULA_CACHE_STRING",
  269. "TYPE_INTEGER",
  270. }
  271. def __init__(self, cell, parent=None):
  272. super(TableCell, self).__init__(cell, parent=parent)
  273. self.value = element_to_string(cell)
  274. self.number_format = self.get_number_format()
  275. def data_type(self):
  276. """
  277. get data type
  278. """
  279. cell_types = self.CELL_TYPES & set(self.element.get("class", "").split())
  280. if cell_types:
  281. if "TYPE_FORMULA" in cell_types:
  282. # Make sure TYPE_FORMULA takes precedence over the other classes in the set.
  283. cell_type = "TYPE_FORMULA"
  284. elif cell_types & {"TYPE_CURRENCY", "TYPE_INTEGER", "TYPE_PERCENTAGE"}:
  285. cell_type = "TYPE_NUMERIC"
  286. else:
  287. cell_type = cell_types.pop()
  288. else:
  289. cell_type = "TYPE_STRING"
  290. return getattr(cell, cell_type)
  291. def get_number_format(self):
  292. """
  293. get number format
  294. """
  295. if "TYPE_CURRENCY" in self.element.get("class", "").split():
  296. return FORMAT_CURRENCY_USD_SIMPLE
  297. if "TYPE_INTEGER" in self.element.get("class", "").split():
  298. return "#,##0"
  299. if "TYPE_PERCENTAGE" in self.element.get("class", "").split():
  300. return FORMAT_PERCENTAGE
  301. if "TYPE_DATE" in self.element.get("class", "").split():
  302. return FORMAT_DATE_MMDDYYYY
  303. if self.data_type() == cell.TYPE_NUMERIC:
  304. try:
  305. int(self.value)
  306. except ValueError:
  307. return "#,##0.##"
  308. else:
  309. return "#,##0"
  310. def format(self, cell):
  311. """
  312. format
  313. """
  314. cell.style = self.style()
  315. data_type = self.data_type()
  316. if data_type:
  317. cell.data_type = data_type