optimizer.py

# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from paddle import optimizer as optim

from paddlex.ppcls.utils import logger


class Momentum(object):
    """
    Simple Momentum optimizer with velocity state.

    Args:
        learning_rate (float|Variable) - The learning rate used to update parameters.
            Can be a float value or a Variable with one float value as data element.
        momentum (float) - Momentum factor.
        weight_decay (float|WeightDecayRegularizer, optional) - The weight decay (regularization) strategy.
        grad_clip (optional) - The gradient clipping strategy, e.g. ClipGradByGlobalNorm.
        multi_precision (bool) - Whether to keep float32 master weights for mixed-precision training.
    """

    def __init__(self,
                 learning_rate,
                 momentum,
                 weight_decay=None,
                 grad_clip=None,
                 multi_precision=False):
        super().__init__()
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.weight_decay = weight_decay
        self.grad_clip = grad_clip
        self.multi_precision = multi_precision

    def __call__(self, model_list):
        # model_list is None in static graph
        parameters = sum([m.parameters() for m in model_list],
                         []) if model_list else None
        opt = optim.Momentum(
            learning_rate=self.learning_rate,
            momentum=self.momentum,
            weight_decay=self.weight_decay,
            grad_clip=self.grad_clip,
            multi_precision=self.multi_precision,
            parameters=parameters)
        return opt
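
# Usage sketch (illustrative, not part of the original module): each wrapper is
# configured first and then called with a list of models; it flattens their
# parameters and returns a ready-to-use paddle.optimizer instance. The
# `nn.Linear` model below is only a stand-in for a real network.
#
#     import paddle.nn as nn
#     model = nn.Linear(10, 2)
#     opt = Momentum(learning_rate=0.1, momentum=0.9)([model])  # dynamic graph
#     # In static graph mode the wrapper is called with None and the
#     # parameters are resolved by the training program instead.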


class Adam(object):
    def __init__(self,
                 learning_rate=0.001,
                 beta1=0.9,
                 beta2=0.999,
                 epsilon=1e-08,
                 parameter_list=None,
                 weight_decay=None,
                 grad_clip=None,
                 name=None,
                 lazy_mode=False,
                 multi_precision=False):
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.parameter_list = parameter_list
        self.weight_decay = weight_decay
        self.grad_clip = grad_clip
        self.name = name
        self.lazy_mode = lazy_mode
        self.multi_precision = multi_precision

    def __call__(self, model_list):
        # model_list is None in static graph
        parameters = sum([m.parameters() for m in model_list],
                         []) if model_list else None
        opt = optim.Adam(
            learning_rate=self.learning_rate,
            beta1=self.beta1,
            beta2=self.beta2,
            epsilon=self.epsilon,
            weight_decay=self.weight_decay,
            grad_clip=self.grad_clip,
            name=self.name,
            lazy_mode=self.lazy_mode,
            multi_precision=self.multi_precision,
            parameters=parameters)
        return opt


class RMSProp(object):
    """
    Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning rate method.

    Args:
        learning_rate (float|Variable) - The learning rate used to update parameters.
            Can be a float value or a Variable with one float value as data element.
        momentum (float) - Momentum factor.
        rho (float) - The decay factor for the moving average of squared gradients.
        epsilon (float) - A small value added to avoid division by zero, default is 1e-6.
        weight_decay (float|WeightDecayRegularizer, optional) - The weight decay (regularization) strategy.
        grad_clip (optional) - The gradient clipping strategy, e.g. ClipGradByGlobalNorm.
    """

    def __init__(self,
                 learning_rate,
                 momentum=0.0,
                 rho=0.95,
                 epsilon=1e-6,
                 weight_decay=None,
                 grad_clip=None,
                 multi_precision=False):
        super().__init__()
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.rho = rho
        self.epsilon = epsilon
        self.weight_decay = weight_decay
        self.grad_clip = grad_clip
        # multi_precision is accepted for interface consistency with the other
        # wrappers but is not forwarded to paddle.optimizer.RMSProp.
        self.multi_precision = multi_precision

    def __call__(self, model_list):
        # model_list is None in static graph
        parameters = sum([m.parameters() for m in model_list],
                         []) if model_list else None
        opt = optim.RMSProp(
            learning_rate=self.learning_rate,
            momentum=self.momentum,
            rho=self.rho,
            epsilon=self.epsilon,
            weight_decay=self.weight_decay,
            grad_clip=self.grad_clip,
            parameters=parameters)
        return opt


class AdamW(object):
    def __init__(self,
                 learning_rate=0.001,
                 beta1=0.9,
                 beta2=0.999,
                 epsilon=1e-8,
                 weight_decay=None,
                 multi_precision=False,
                 grad_clip=None,
                 no_weight_decay_name=None,
                 one_dim_param_no_weight_decay=False,
                 **args):
        super().__init__()
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.grad_clip = grad_clip
        self.weight_decay = weight_decay
        self.multi_precision = multi_precision
        # Whitespace-separated substrings; any parameter whose name contains
        # one of them is excluded from weight decay.
        self.no_weight_decay_name_list = no_weight_decay_name.split(
        ) if no_weight_decay_name else []
        self.one_dim_param_no_weight_decay = one_dim_param_no_weight_decay

    def __call__(self, model_list):
        # model_list is None in static graph
        parameters = sum([m.parameters() for m in model_list],
                         []) if model_list else None

        # TODO(gaotingquan): model_list is None in static graph, so "no_weight_decay" does not work there.
        if model_list is None:
            if self.one_dim_param_no_weight_decay or len(
                    self.no_weight_decay_name_list) != 0:
                msg = "\"AdamW\" does not support setting \"no_weight_decay\" in static graph. Please use dynamic graph."
                logger.error(Exception(msg))
                raise Exception(msg)

        # Collect the names of parameters that should not be decayed: those
        # matching no_weight_decay_name and, optionally, all 1-D parameters
        # (biases, norm scales).
        self.no_weight_decay_param_name_list = [
            p.name for model in model_list for n, p in model.named_parameters()
            if any(nd in n for nd in self.no_weight_decay_name_list)
        ] if model_list else []

        if self.one_dim_param_no_weight_decay:
            self.no_weight_decay_param_name_list += [
                p.name
                for model in model_list for n, p in model.named_parameters()
                if len(p.shape) == 1
            ] if model_list else []

        opt = optim.AdamW(
            learning_rate=self.learning_rate,
            beta1=self.beta1,
            beta2=self.beta2,
            epsilon=self.epsilon,
            parameters=parameters,
            weight_decay=self.weight_decay,
            multi_precision=self.multi_precision,
            grad_clip=self.grad_clip,
            apply_decay_param_fun=self._apply_decay_param_fun)
        return opt

    def _apply_decay_param_fun(self, name):
        # Apply weight decay only to parameters whose names are not in the
        # no-weight-decay list.
        return name not in self.no_weight_decay_param_name_list
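

# Minimal self-check (illustrative, not part of the original module): shows how
# AdamW excludes parameters from weight decay. `TinyNet` is a hypothetical
# network defined here only so the example is self-contained.
if __name__ == "__main__":
    import paddle.nn as nn

    class TinyNet(nn.Layer):
        def __init__(self):
            super().__init__()
            self.fc = nn.Linear(8, 8)
            self.norm = nn.LayerNorm(8)

        def forward(self, x):
            return self.norm(self.fc(x))

    model = TinyNet()
    builder = AdamW(
        learning_rate=1e-3,
        weight_decay=0.05,
        no_weight_decay_name="norm",
        one_dim_param_no_weight_decay=True)
    opt = builder([model])

    # Only `fc.weight` should keep weight decay: `norm.*` matches the name
    # filter and all 1-D parameters (biases, norm scales) are excluded.
    for n, p in model.named_parameters():
        print(n, "decayed:", builder._apply_decay_param_fun(p.name))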