utils.py

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
import time
import os
import os.path as osp
import numpy as np
import six
import yaml
import math
import platform
from . import logging


def seconds_to_hms(seconds):
    """Convert a duration in seconds to an "H:M:S" string."""
    h = math.floor(seconds / 3600)
    m = math.floor((seconds - h * 3600) / 60)
    s = int(seconds - h * 3600 - m * 60)
    hms_str = "{}:{}:{}".format(h, m, s)
    return hms_str


def get_environ_info():
    """Return the running place ('cpu' or 'cuda') and the device count."""
    import paddle.fluid as fluid
    info = dict()
    info['place'] = 'cpu'
    info['num'] = int(os.environ.get('CPU_NUM', 1))
    if os.environ.get('CUDA_VISIBLE_DEVICES', None) != "":
        if hasattr(fluid.core, 'get_cuda_device_count'):
            gpu_num = 0
            try:
                gpu_num = fluid.core.get_cuda_device_count()
            except Exception:
                os.environ['CUDA_VISIBLE_DEVICES'] = ''
            if gpu_num > 0:
                info['place'] = 'cuda'
                info['num'] = gpu_num
    return info


def path_normalization(path):
    win_sep = "\\"
    other_sep = "/"
    if platform.system() == "Windows":
        path = win_sep.join(path.split(other_sep))
    else:
        path = other_sep.join(path.split(win_sep))
    return path


def parse_param_file(param_file, return_shape=True):
    """Parse a single Paddle parameter file and return its shape, or the
    weight itself as a numpy array when return_shape is False."""
    from paddle.fluid.proto.framework_pb2 import VarType
    f = open(param_file, 'rb')
    version = np.frombuffer(f.read(4), dtype='int32')
    lod_level = np.frombuffer(f.read(8), dtype='int64')
    for i in range(int(lod_level)):
        _size = np.frombuffer(f.read(8), dtype='int64')
        _ = f.read(int(_size))
    version = np.frombuffer(f.read(4), dtype='int32')
    tensor_desc = VarType.TensorDesc()
    tensor_desc_size = np.frombuffer(f.read(4), dtype='int32')
    tensor_desc.ParseFromString(f.read(int(tensor_desc_size)))
    tensor_shape = tuple(tensor_desc.dims)
    if return_shape:
        f.close()
        return tensor_shape
    if tensor_desc.data_type != 5:
        raise Exception("Unexpected data type while parsing {}".format(
            param_file))
    data_size = 4
    for i in range(len(tensor_shape)):
        data_size *= tensor_shape[i]
    weight = np.frombuffer(f.read(data_size), dtype='float32')
    f.close()
    return np.reshape(weight, tensor_shape)


def fuse_bn_weights(exe, main_prog, weights_dir):
    """Fold batch_norm mean/variance into the scale/bias of affine_channel ops."""
    import paddle.fluid as fluid
    logging.info("Try to fuse weights of batch_norm...")
    bn_vars = list()
    for block in main_prog.blocks:
        ops = list(block.ops)
        for op in ops:
            if op.type == 'affine_channel':
                scale_name = op.input('Scale')[0]
                bias_name = op.input('Bias')[0]
                prefix = scale_name[:-5]
                mean_name = prefix + 'mean'
                variance_name = prefix + 'variance'
                if not osp.exists(osp.join(
                        weights_dir, mean_name)) or not osp.exists(
                            osp.join(weights_dir, variance_name)):
                    logging.info(
                        "There's no batch_norm weight found to fuse, skip fuse_bn."
                    )
                    return
                bias = block.var(bias_name)
                pretrained_shape = parse_param_file(
                    osp.join(weights_dir, bias_name))
                actual_shape = tuple(bias.shape)
                if pretrained_shape != actual_shape:
                    continue
                bn_vars.append(
                    [scale_name, bias_name, mean_name, variance_name])
    eps = 1e-5
    for names in bn_vars:
        scale_name, bias_name, mean_name, variance_name = names
        scale = parse_param_file(
            osp.join(weights_dir, scale_name), return_shape=False)
        bias = parse_param_file(
            osp.join(weights_dir, bias_name), return_shape=False)
        mean = parse_param_file(
            osp.join(weights_dir, mean_name), return_shape=False)
        variance = parse_param_file(
            osp.join(weights_dir, variance_name), return_shape=False)
        bn_std = np.sqrt(np.add(variance, eps))
        new_scale = np.float32(np.divide(scale, bn_std))
        new_bias = bias - mean * new_scale
        scale_tensor = fluid.global_scope().find_var(scale_name).get_tensor()
        bias_tensor = fluid.global_scope().find_var(bias_name).get_tensor()
        scale_tensor.set(new_scale, exe.place)
        bias_tensor.set(new_bias, exe.place)
    if len(bn_vars) == 0:
        logging.info(
            "There's no batch_norm weight found to fuse, skip fuse_bn.")
    else:
        logging.info("{} batch_norm ops have been fused.".format(len(bn_vars)))


def load_pdparams(exe, main_prog, model_dir):
    """Load parameters from a `model.pdparams` file into main_prog."""
    import paddle.fluid as fluid
    import pickle
    vars_to_load = list()
    if osp.isfile(model_dir):
        params_file = model_dir
    else:
        params_file = osp.join(model_dir, 'model.pdparams')
    with open(params_file, 'rb') as f:
        params_dict = pickle.load(f) if six.PY2 else pickle.load(
            f, encoding='latin1')
    unused_vars = list()
    for var in main_prog.list_vars():
        if not isinstance(var, fluid.framework.Parameter):
            continue
        if var.name not in params_dict:
            raise Exception("{} is not in the saved paddlex model".format(
                var.name))
        if var.shape != params_dict[var.name].shape:
            unused_vars.append(var.name)
            logging.warning(
                "[SKIP] Shape of pretrained weight {} doesn't match. (Pretrained: {}, Actual: {})"
                .format(var.name, params_dict[var.name].shape, var.shape))
            continue
        vars_to_load.append(var)
        logging.debug("Weight {} will be loaded".format(var.name))
    for var_name in unused_vars:
        del params_dict[var_name]
    fluid.io.set_program_state(main_prog, params_dict)
    if len(vars_to_load) == 0:
        logging.warning(
            "No pretrained weights were loaded; please check your pretrained model."
        )
    else:
        logging.info("{} variables were loaded from {}.".format(
            len(vars_to_load), model_dir))


def is_persistable(var):
    import paddle.fluid as fluid
    if var.desc.type() == fluid.core.VarDesc.VarType.FEED_MINIBATCH or \
            var.desc.type() == fluid.core.VarDesc.VarType.FETCH_LIST or \
            var.desc.type() == fluid.core.VarDesc.VarType.READER:
        return False
    return var.persistable


def is_belong_to_optimizer(var):
    import paddle.fluid as fluid
    if not (isinstance(var, fluid.framework.Parameter) or
            var.desc.need_check_feed()):
        return is_persistable(var)
    return False


def load_pdopt(exe, main_prog, model_dir):
    """Load optimizer state from `model.pdopt` so that training can resume."""
    import paddle.fluid as fluid
    import pickle
    with open(osp.join(model_dir, 'model.pdopt'), 'rb') as f:
        opt_dict = pickle.load(f) if six.PY2 else pickle.load(
            f, encoding='latin1')
    optimizer_var_list = list(
        filter(is_belong_to_optimizer, main_prog.list_vars()))
    exception_message = "the training process cannot be resumed because the current optimizer differs from the one used last time. Consider using `pretrain_weights` instead of `resume_checkpoint`"
    if len(optimizer_var_list) > 0:
        for var in optimizer_var_list:
            if var.name not in opt_dict:
                raise Exception("{} is not in the saved paddlex optimizer, {}".
                                format(var.name, exception_message))
            if var.shape != opt_dict[var.name].shape:
                raise Exception(
                    "Shape of optimizer variable {} doesn't match. (Last: {}, Now: {}), {}"
                    .format(var.name, opt_dict[var.name].shape, var.shape,
                            exception_message))
        optimizer_varname_list = [var.name for var in optimizer_var_list]
        for k, v in opt_dict.items():
            if k not in optimizer_varname_list:
                raise Exception(
                    "{} in the saved paddlex optimizer is not in the model, {}".
                    format(k, exception_message))
        fluid.io.set_program_state(main_prog, opt_dict)
    if len(optimizer_var_list) == 0:
        raise Exception(
            "There are no optimizer parameters in the model, please set the optimizer!"
        )
    else:
        logging.info("{} optimizer parameters were loaded from {}.".format(
            len(optimizer_var_list), model_dir))


def load_pretrain_weights(exe,
                          main_prog,
                          weights_dir,
                          fuse_bn=False,
                          resume=False):
    """Load pretrained weights (and, when resume=True, optimizer state) from
    weights_dir into main_prog."""
    if not osp.exists(weights_dir):
        raise Exception("Path {} does not exist.".format(weights_dir))
    if osp.isfile(weights_dir):
        if not weights_dir.endswith('.pdparams'):
            raise Exception("File {} is not a paddle parameter file".format(
                weights_dir))
        load_pdparams(exe, main_prog, weights_dir)
        return
    if osp.exists(osp.join(weights_dir, "model.pdparams")):
        load_pdparams(exe, main_prog, weights_dir)
        if resume:
            if osp.exists(osp.join(weights_dir, "model.pdopt")):
                load_pdopt(exe, main_prog, weights_dir)
            else:
                raise Exception(
                    "Optimizer file {} does not exist. Stop resuming training. Consider using `pretrain_weights` instead of `resume_checkpoint`"
                    .format(osp.join(weights_dir, "model.pdopt")))
        return
    import paddle.fluid as fluid
    vars_to_load = list()
    for var in main_prog.list_vars():
        if not isinstance(var, fluid.framework.Parameter):
            continue
        if not osp.exists(osp.join(weights_dir, var.name)):
            logging.debug("[SKIP] Pretrained weight {}/{} doesn't exist".
                          format(weights_dir, var.name))
            continue
        pretrained_shape = parse_param_file(osp.join(weights_dir, var.name))
        actual_shape = tuple(var.shape)
        if pretrained_shape != actual_shape:
            logging.warning(
                "[SKIP] Shape of pretrained weight {}/{} doesn't match. (Pretrained: {}, Actual: {})"
                .format(weights_dir, var.name, pretrained_shape, actual_shape))
            continue
        vars_to_load.append(var)
        logging.debug("Weight {} will be loaded".format(var.name))
    params_dict = fluid.io.load_program_state(
        weights_dir, var_list=vars_to_load)
    fluid.io.set_program_state(main_prog, params_dict)
    if len(vars_to_load) == 0:
        logging.warning(
            "No pretrained weights were loaded; please check your pretrained model."
        )
    else:
        logging.info("{} variables were loaded from {}.".format(
            len(vars_to_load), weights_dir))
    if fuse_bn:
        fuse_bn_weights(exe, main_prog, weights_dir)
    if resume:
        exception_message = "the training process cannot be resumed because the current optimizer differs from the one used last time. Consider using `pretrain_weights` instead of `resume_checkpoint`"
        optimizer_var_list = list(
            filter(is_belong_to_optimizer, main_prog.list_vars()))
        if len(optimizer_var_list) > 0:
            for var in optimizer_var_list:
                if not osp.exists(osp.join(weights_dir, var.name)):
                    raise Exception(
                        "Optimizer parameter {} doesn't exist, {}".format(
                            osp.join(weights_dir, var.name),
                            exception_message))
                pretrained_shape = parse_param_file(
                    osp.join(weights_dir, var.name))
                actual_shape = tuple(var.shape)
                if pretrained_shape != actual_shape:
                    raise Exception(
                        "Shape of optimizer variable {} doesn't match. (Last: {}, Now: {}), {}"
                        .format(var.name, pretrained_shape, actual_shape,
                                exception_message))
            optimizer_varname_list = [var.name for var in optimizer_var_list]
            if osp.exists(osp.join(weights_dir, 'learning_rate')
                          ) and 'learning_rate' not in optimizer_varname_list:
                raise Exception(
                    "Optimizer parameter {}/learning_rate is not in the model, {}"
                    .format(weights_dir, exception_message))
            fluid.io.load_vars(
                executor=exe,
                dirname=weights_dir,
                main_program=main_prog,
                vars=optimizer_var_list)
        if len(optimizer_var_list) == 0:
            raise Exception(
                "There are no optimizer parameters in the model, please set the optimizer!"
            )
        else:
            logging.info("{} optimizer parameters were loaded from {}.".format(
                len(optimizer_var_list), weights_dir))
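
# A minimal usage sketch of load_pretrain_weights (assumptions: `exe` is an
# already-created fluid.Executor, the static program has been built, and
# './pretrained_model' is a hypothetical directory holding model.pdparams or
# per-variable parameter files; not part of this module):
#
#     import paddle.fluid as fluid
#     exe = fluid.Executor(fluid.CPUPlace())
#     exe.run(fluid.default_startup_program())
#     load_pretrain_weights(exe, fluid.default_main_program(),
#                           './pretrained_model', fuse_bn=False, resume=False)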


class EarlyStop:
    """Early-stopping helper: __call__ returns True once the monitored score
    has failed to improve by more than `thresh` for `patience` consecutive
    evaluations."""

    def __init__(self, patience, thresh):
        self.patience = patience
        self.counter = 0
        self.score = None
        self.max = 0
        self.thresh = thresh
        if patience < 1:
            raise Exception("Argument patience should be a positive integer.")

    def __call__(self, current_score):
        if self.score is None:
            self.score = current_score
            return False
        elif current_score > self.max:
            self.counter = 0
            self.score = current_score
            self.max = current_score
            return False
        else:
            if (abs(self.score - current_score) < self.thresh or
                    current_score < self.score):
                self.counter += 1
                self.score = current_score
                logging.debug("EarlyStopping: %i / %i" %
                              (self.counter, self.patience))
                if self.counter >= self.patience:
                    logging.info("EarlyStopping: Stop training")
                    return True
                return False
            else:
                self.counter = 0
                self.score = current_score
                return False
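

# A minimal usage sketch of EarlyStop (hypothetical training loop; `evaluate`
# and `num_epochs` are placeholders, not part of this module):
#
#     early_stop = EarlyStop(patience=5, thresh=1e-4)
#     for epoch in range(num_epochs):
#         miou = evaluate(epoch)  # hypothetical validation metric
#         if early_stop(miou):
#             break  # metric stopped improving; halt training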