utils.py

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
import time
import os
import os.path as osp
import math

import numpy as np
import six
import yaml

from . import logging


def seconds_to_hms(seconds):
    h = math.floor(seconds / 3600)
    m = math.floor((seconds - h * 3600) / 60)
    s = int(seconds - h * 3600 - m * 60)
    hms_str = "{}:{}:{}".format(h, m, s)
    return hms_str
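# Example: seconds_to_hms(3725) -> '1:2:5' (fields are not zero-padded).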


def get_environ_info():
    import paddle.fluid as fluid
    info = dict()
    info['place'] = 'cpu'
    info['num'] = int(os.environ.get('CPU_NUM', 1))
    if os.environ.get('CUDA_VISIBLE_DEVICES', None) != "":
        if hasattr(fluid.core, 'get_cuda_device_count'):
            gpu_num = 0
            try:
                gpu_num = fluid.core.get_cuda_device_count()
            except Exception:
                # No usable CUDA runtime; hide all devices and fall back to CPU.
                os.environ['CUDA_VISIBLE_DEVICES'] = ''
            if gpu_num > 0:
                info['place'] = 'cuda'
                info['num'] = gpu_num
    return info
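# Example return values: {'place': 'cpu', 'num': 1} on a CPU-only machine,
# or {'place': 'cuda', 'num': 2} when two GPUs are visible.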


def parse_param_file(param_file, return_shape=True):
    from paddle.fluid.proto.framework_pb2 import VarType
    f = open(param_file, 'rb')
    version = np.frombuffer(f.read(4), dtype='int32')
    lod_level = np.frombuffer(f.read(8), dtype='int64')
    for i in range(int(lod_level)):
        _size = np.frombuffer(f.read(8), dtype='int64')
        _ = f.read(int(_size))
    version = np.frombuffer(f.read(4), dtype='int32')
    tensor_desc = VarType.TensorDesc()
    tensor_desc_size = np.frombuffer(f.read(4), dtype='int32')
    tensor_desc.ParseFromString(f.read(int(tensor_desc_size)))
    tensor_shape = tuple(tensor_desc.dims)
    if return_shape:
        f.close()
        return tensor_shape
    if tensor_desc.data_type != 5:  # 5 is VarType.FP32
        raise Exception(
            "Unexpected data type while parsing {}".format(param_file))
    data_size = 4
    for dim in tensor_shape:
        data_size *= dim
    weight = np.frombuffer(f.read(data_size), dtype='float32')
    f.close()
    return np.reshape(weight, tensor_shape)
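# File layout consumed above (Paddle's LoDTensor serialization): a 4-byte
# version, an 8-byte LoD level, one (8-byte length + payload) pair per LoD
# level, a second 4-byte version, a 4-byte TensorDesc length, the serialized
# TensorDesc protobuf, and finally the raw float32 tensor data.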


def fuse_bn_weights(exe, main_prog, weights_dir):
    import paddle.fluid as fluid
    logging.info("Try to fuse weights of batch_norm...")
    bn_vars = list()
    for block in main_prog.blocks:
        ops = list(block.ops)
        for op in ops:
            if op.type == 'affine_channel':
                scale_name = op.input('Scale')[0]
                bias_name = op.input('Bias')[0]
                prefix = scale_name[:-5]
                mean_name = prefix + 'mean'
                variance_name = prefix + 'variance'
                if not osp.exists(osp.join(weights_dir, mean_name)) \
                        or not osp.exists(osp.join(weights_dir, variance_name)):
                    logging.info(
                        "There's no batch_norm weight found to fuse, skip fuse_bn.")
                    return
                bias = block.var(bias_name)
                pretrained_shape = parse_param_file(
                    osp.join(weights_dir, bias_name))
                actual_shape = tuple(bias.shape)
                if pretrained_shape != actual_shape:
                    continue
                bn_vars.append(
                    [scale_name, bias_name, mean_name, variance_name])
    eps = 1e-5
    for names in bn_vars:
        scale_name, bias_name, mean_name, variance_name = names
        scale = parse_param_file(
            osp.join(weights_dir, scale_name), return_shape=False)
        bias = parse_param_file(
            osp.join(weights_dir, bias_name), return_shape=False)
        mean = parse_param_file(
            osp.join(weights_dir, mean_name), return_shape=False)
        variance = parse_param_file(
            osp.join(weights_dir, variance_name), return_shape=False)
        bn_std = np.sqrt(np.add(variance, eps))
        new_scale = np.float32(np.divide(scale, bn_std))
        new_bias = bias - mean * new_scale
        scale_tensor = fluid.global_scope().find_var(scale_name).get_tensor()
        bias_tensor = fluid.global_scope().find_var(bias_name).get_tensor()
        scale_tensor.set(new_scale, exe.place)
        bias_tensor.set(new_bias, exe.place)
    if len(bn_vars) == 0:
        logging.info(
            "There's no batch_norm weight found to fuse, skip fuse_bn.")
    else:
        logging.info("{} batch_norm ops have been fused.".format(len(bn_vars)))
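# The fusion math: batch_norm computes scale * (x - mean) / sqrt(variance + eps)
# + bias, which folds into a single affine transform with
#   new_scale = scale / sqrt(variance + eps)
#   new_bias  = bias - mean * new_scale
# exactly the values written into the affine_channel tensors above.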


def load_pdparams(exe, main_prog, model_dir):
    import paddle.fluid as fluid
    import pickle
    vars_to_load = list()
    with open(osp.join(model_dir, 'model.pdparams'), 'rb') as f:
        params_dict = pickle.load(f) if six.PY2 else pickle.load(
            f, encoding='latin1')
    unused_vars = list()
    for var in main_prog.list_vars():
        if not isinstance(var, fluid.framework.Parameter):
            continue
        if var.name not in params_dict:
            raise Exception("{} is not in the saved paddlex model".format(
                var.name))
        if var.shape != params_dict[var.name].shape:
            unused_vars.append(var.name)
            logging.warning(
                "[SKIP] Shape of pretrained weight {} doesn't match. (Pretrained: {}, Actual: {})"
                .format(var.name, params_dict[var.name].shape, var.shape))
            continue
        vars_to_load.append(var)
        logging.debug("Weight {} will be loaded".format(var.name))
    for var_name in unused_vars:
        del params_dict[var_name]
    fluid.io.set_program_state(main_prog, params_dict)
    if len(vars_to_load) == 0:
        logging.warning(
            "No pretrained weights were loaded, please check your pretrained model!")
    else:
        logging.info("{} variables were loaded from {}.".format(
            len(vars_to_load), model_dir))
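# model.pdparams (and model.pdopt below) is expected to be a pickled dict
# mapping variable names to numpy arrays; shape-mismatched entries are
# dropped rather than loaded.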


def is_persistable(var):
    import paddle.fluid as fluid
    if var.desc.type() == fluid.core.VarDesc.VarType.FEED_MINIBATCH or \
            var.desc.type() == fluid.core.VarDesc.VarType.FETCH_LIST or \
            var.desc.type() == fluid.core.VarDesc.VarType.READER:
        return False
    return var.persistable


def is_belong_to_optimizer(var):
    import paddle.fluid as fluid
    if not (isinstance(var, fluid.framework.Parameter)
            or var.desc.need_check_feed()):
        return is_persistable(var)
    return False
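# Optimizer state (e.g. momentum or moving-average accumulators) is
# persistable but is neither a Parameter nor a feed variable, which is the
# combination the two predicates above select for.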


def load_pdopt(exe, main_prog, model_dir):
    import paddle.fluid as fluid
    import pickle
    with open(osp.join(model_dir, 'model.pdopt'), 'rb') as f:
        opt_dict = pickle.load(f) if six.PY2 else pickle.load(
            f, encoding='latin1')
    optimizer_var_list = list(
        filter(is_belong_to_optimizer, main_prog.list_vars()))
    exception_message = "the training process cannot be resumed because the optimizer settings now and last time are different. It is recommended to use `pretrain_weights` instead of `resume_checkpoint`"
    if len(optimizer_var_list) > 0:
        for var in optimizer_var_list:
            if var.name not in opt_dict:
                raise Exception(
                    "{} is not in the saved paddlex optimizer, {}".format(
                        var.name, exception_message))
            if var.shape != opt_dict[var.name].shape:
                raise Exception(
                    "Shape of optimizer variable {} doesn't match. (Last: {}, Now: {}), {}"
                    .format(var.name, opt_dict[var.name].shape, var.shape,
                            exception_message))
        optimizer_varname_list = [var.name for var in optimizer_var_list]
        for k, v in opt_dict.items():
            if k not in optimizer_varname_list:
                raise Exception(
                    "{} in the saved paddlex optimizer is not in the model, {}".
                    format(k, exception_message))
        fluid.io.set_program_state(main_prog, opt_dict)
    if len(optimizer_var_list) == 0:
        raise Exception(
            "There are no optimizer parameters in the model, please set the optimizer!")
    else:
        logging.info("{} optimizer parameters were loaded from {}.".format(
            len(optimizer_var_list), model_dir))


def load_pretrain_weights(exe,
                          main_prog,
                          weights_dir,
                          fuse_bn=False,
                          resume=False):
    if not osp.exists(weights_dir):
        raise Exception("Path {} does not exist.".format(weights_dir))
    if osp.exists(osp.join(weights_dir, "model.pdparams")):
        load_pdparams(exe, main_prog, weights_dir)
        if resume:
            if osp.exists(osp.join(weights_dir, "model.pdopt")):
                load_pdopt(exe, main_prog, weights_dir)
            else:
                raise Exception(
                    "Optimizer file {} does not exist. Stop resuming training. It is recommended to use `pretrain_weights` instead of `resume_checkpoint`"
                    .format(osp.join(weights_dir, "model.pdopt")))
        return
    import paddle.fluid as fluid
    vars_to_load = list()
    for var in main_prog.list_vars():
        if not isinstance(var, fluid.framework.Parameter):
            continue
        if not osp.exists(osp.join(weights_dir, var.name)):
            logging.debug(
                "[SKIP] Pretrained weight {}/{} doesn't exist".format(
                    weights_dir, var.name))
            continue
        pretrained_shape = parse_param_file(osp.join(weights_dir, var.name))
        actual_shape = tuple(var.shape)
        if pretrained_shape != actual_shape:
            logging.warning(
                "[SKIP] Shape of pretrained weight {}/{} doesn't match. (Pretrained: {}, Actual: {})"
                .format(weights_dir, var.name, pretrained_shape, actual_shape))
            continue
        vars_to_load.append(var)
        logging.debug("Weight {} will be loaded".format(var.name))
    params_dict = fluid.io.load_program_state(
        weights_dir, var_list=vars_to_load)
    fluid.io.set_program_state(main_prog, params_dict)
    if len(vars_to_load) == 0:
        logging.warning(
            "No pretrained weights were loaded, please check your pretrained model!")
    else:
        logging.info("{} variables were loaded from {}.".format(
            len(vars_to_load), weights_dir))
    if fuse_bn:
        fuse_bn_weights(exe, main_prog, weights_dir)
    if resume:
        exception_message = "the training process cannot be resumed because the optimizer settings now and last time are different. It is recommended to use `pretrain_weights` instead of `resume_checkpoint`"
        optimizer_var_list = list(
            filter(is_belong_to_optimizer, main_prog.list_vars()))
        if len(optimizer_var_list) > 0:
            for var in optimizer_var_list:
                if not osp.exists(osp.join(weights_dir, var.name)):
                    raise Exception(
                        "Optimizer parameter {} doesn't exist, {}".format(
                            osp.join(weights_dir, var.name),
                            exception_message))
                pretrained_shape = parse_param_file(
                    osp.join(weights_dir, var.name))
                actual_shape = tuple(var.shape)
                if pretrained_shape != actual_shape:
                    raise Exception(
                        "Shape of optimizer variable {} doesn't match. (Last: {}, Now: {}), {}"
                        .format(var.name, pretrained_shape, actual_shape,
                                exception_message))
            optimizer_varname_list = [var.name for var in optimizer_var_list]
            if osp.exists(osp.join(weights_dir, 'learning_rate')) \
                    and 'learning_rate' not in optimizer_varname_list:
                raise Exception(
                    "Optimizer parameter {}/learning_rate is not in the model, {}"
                    .format(weights_dir, exception_message))
            fluid.io.load_vars(
                executor=exe,
                dirname=weights_dir,
                main_program=main_prog,
                vars=optimizer_var_list)
        if len(optimizer_var_list) == 0:
            raise Exception(
                "There are no optimizer parameters in the model, please set the optimizer!")
        else:
            logging.info("{} optimizer parameters were loaded from {}.".format(
                len(optimizer_var_list), weights_dir))
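# Typical call (executor/program names are illustrative):
#   exe = fluid.Executor(fluid.CUDAPlace(0))
#   load_pretrain_weights(exe, train_prog, './pretrain_weights', fuse_bn=False)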


class EarlyStop:
    def __init__(self, patience, thresh):
        self.patience = patience
        self.counter = 0
        self.score = None
        self.max = 0
        self.thresh = thresh
        if patience < 1:
            raise Exception("Argument patience should be a positive integer.")

    def __call__(self, current_score):
        if self.score is None:
            self.score = current_score
            return False
        elif current_score > self.max:
            self.counter = 0
            self.score = current_score
            self.max = current_score
            return False
        else:
            if (abs(self.score - current_score) < self.thresh
                    or current_score < self.score):
                self.counter += 1
                self.score = current_score
                logging.debug(
                    "EarlyStopping: %i / %i" % (self.counter, self.patience))
                if self.counter >= self.patience:
                    logging.info("EarlyStopping: Stop training")
                    return True
                return False
            else:
                self.counter = 0
                self.score = current_score
                return False
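# Minimal usage sketch (evaluate() is a hypothetical metric callback):
#   early_stop = EarlyStop(patience=5, thresh=0.001)
#   for epoch in range(num_epochs):
#       score = evaluate(model)
#       if early_stop(score):
#           break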