| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591 |
- # copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- from __future__ import absolute_import
- import paddle.fluid as fluid
- import os
- import sys
- import numpy as np
- import time
- import math
- import yaml
- import copy
- import json
- import functools
- import multiprocessing as mp
- import paddlex.utils.logging as logging
- from paddlex.utils import seconds_to_hms
- from paddlex.utils.utils import EarlyStop
- from paddlex.cv.transforms import arrange_transforms
- import paddlex
- from collections import OrderedDict
- from os import path as osp
- from paddle.fluid.framework import Program
- from .utils.pretrain_weights import get_pretrain_weights
- def dict2str(dict_input):
- out = ''
- for k, v in dict_input.items():
- try:
- v = round(float(v), 6)
- except:
- pass
- out = out + '{}={}, '.format(k, v)
- return out.strip(', ')
- class BaseAPI:
- def __init__(self, model_type):
- self.model_type = model_type
- # 现有的CV模型都有这个属性,而这个属且也需要在eval时用到
- self.num_classes = None
- self.labels = None
- self.version = paddlex.__version__
- if paddlex.env_info['place'] == 'cpu':
- self.places = fluid.cpu_places()
- else:
- self.places = fluid.cuda_places()
- self.exe = fluid.Executor(self.places[0])
- self.train_prog = None
- self.test_prog = None
- self.parallel_train_prog = None
- self.train_inputs = None
- self.test_inputs = None
- self.train_outputs = None
- self.test_outputs = None
- self.train_data_loader = None
- self.eval_metrics = None
- # 若模型是从inference model加载进来的,无法调用训练接口进行训练
- self.trainable = True
- # 是否使用多卡间同步BatchNorm均值和方差
- self.sync_bn = False
- # 当前模型状态
- self.status = 'Normal'
- # 已完成迭代轮数,为恢复训练时的起始轮数
- self.completed_epochs = 0
- self.scope = fluid.global_scope()
- # 线程池,在模型在预测时用于对输入数据以图片为单位进行并行处理
- # 主要用于batch_predict接口
- thread_num = mp.cpu_count() if mp.cpu_count() < 8 else 8
- self.thread_pool = mp.pool.ThreadPool(thread_num)
- def reset_thread_pool(self, thread_num):
- self.thread_pool.close()
- self.thread_pool.join()
- self.thread_pool = mp.pool.ThreadPool(thread_num)
- def _get_single_card_bs(self, batch_size):
- if batch_size % len(self.places) == 0:
- return int(batch_size // len(self.places))
- else:
- raise Exception("Please support correct batch_size, \
- which can be divided by available cards({}) in {}"
- .format(paddlex.env_info['num'], paddlex.env_info[
- 'place']))
- def build_program(self):
- if hasattr(paddlex, 'model_built') and paddlex.model_built:
- logging.error(
- "Function model.train() only can be called once in your code.")
- paddlex.model_built = True
- # 构建训练网络
- self.train_inputs, self.train_outputs = self.build_net(mode='train')
- self.train_prog = fluid.default_main_program()
- startup_prog = fluid.default_startup_program()
- # 构建预测网络
- self.test_prog = fluid.Program()
- with fluid.program_guard(self.test_prog, startup_prog):
- with fluid.unique_name.guard():
- self.test_inputs, self.test_outputs = self.build_net(
- mode='test')
- self.test_prog = self.test_prog.clone(for_test=True)
- def build_train_data_loader(self, dataset, batch_size):
- # 初始化data_loader
- if self.train_data_loader is None:
- self.train_data_loader = fluid.io.DataLoader.from_generator(
- feed_list=list(self.train_inputs.values()),
- capacity=64,
- use_double_buffer=True,
- iterable=True)
- batch_size_each_gpu = self._get_single_card_bs(batch_size)
- generator = dataset.generator(
- batch_size=batch_size_each_gpu, drop_last=True)
- self.train_data_loader.set_sample_list_generator(
- dataset.generator(batch_size=batch_size_each_gpu),
- places=self.places)
- def export_quant_model(self,
- dataset,
- save_dir,
- batch_size=1,
- batch_num=10,
- cache_dir="./temp"):
- arrange_transforms(
- model_type=self.model_type,
- class_name=self.__class__.__name__,
- transforms=dataset.transforms,
- mode='quant')
- dataset.num_samples = batch_size * batch_num
- try:
- from .slim.post_quantization import PaddleXPostTrainingQuantization
- PaddleXPostTrainingQuantization._collect_target_varnames
- except:
- raise Exception(
- "Model Quantization is not available, try to upgrade your paddlepaddle>=1.8.0"
- )
- is_use_cache_file = True
- if cache_dir is None:
- is_use_cache_file = False
- post_training_quantization = PaddleXPostTrainingQuantization(
- executor=self.exe,
- dataset=dataset,
- program=self.test_prog,
- inputs=self.test_inputs,
- outputs=self.test_outputs,
- batch_size=batch_size,
- batch_nums=batch_num,
- scope=self.scope,
- algo='KL',
- quantizable_op_type=["conv2d", "depthwise_conv2d", "mul"],
- is_full_quantize=False,
- is_use_cache_file=is_use_cache_file,
- cache_dir=cache_dir)
- post_training_quantization.quantize()
- post_training_quantization.save_quantized_model(save_dir)
- model_info = self.get_model_info()
- model_info['status'] = 'Quant'
- # 保存模型输出的变量描述
- model_info['_ModelInputsOutputs'] = dict()
- model_info['_ModelInputsOutputs']['test_inputs'] = [
- [k, v.name] for k, v in self.test_inputs.items()
- ]
- model_info['_ModelInputsOutputs']['test_outputs'] = [
- [k, v.name] for k, v in self.test_outputs.items()
- ]
- with open(
- osp.join(save_dir, 'model.yml'), encoding='utf-8',
- mode='w') as f:
- yaml.dump(model_info, f)
- def net_initialize(self,
- startup_prog=None,
- pretrain_weights=None,
- fuse_bn=False,
- save_dir='.',
- sensitivities_file=None,
- eval_metric_loss=0.05,
- resume_checkpoint=None):
- if not resume_checkpoint:
- pretrain_dir = osp.join(save_dir, 'pretrain')
- if not os.path.isdir(pretrain_dir):
- if os.path.exists(pretrain_dir):
- os.remove(pretrain_dir)
- os.makedirs(pretrain_dir)
- if pretrain_weights is not None and not os.path.exists(
- pretrain_weights):
- if self.model_type == 'classifier':
- if pretrain_weights not in ['IMAGENET']:
- logging.warning(
- "Path of pretrain_weights('{}') is not exists!".
- format(pretrain_weights))
- logging.warning(
- "Pretrain_weights will be forced to set as 'IMAGENET', if you don't want to use pretrain weights, set pretrain_weights=None."
- )
- pretrain_weights = 'IMAGENET'
- elif self.model_type == 'detector':
- if pretrain_weights not in ['IMAGENET', 'COCO']:
- logging.warning(
- "Path of pretrain_weights('{}') is not exists!".
- format(pretrain_weights))
- logging.warning(
- "Pretrain_weights will be forced to set as 'IMAGENET', if you don't want to use pretrain weights, set pretrain_weights=None."
- )
- pretrain_weights = 'IMAGENET'
- elif self.model_type == 'segmenter':
- if pretrain_weights not in [
- 'IMAGENET', 'COCO', 'CITYSCAPES'
- ]:
- logging.warning(
- "Path of pretrain_weights('{}') is not exists!".
- format(pretrain_weights))
- logging.warning(
- "Pretrain_weights will be forced to set as 'IMAGENET', if you don't want to use pretrain weights, set pretrain_weights=None."
- )
- pretrain_weights = 'IMAGENET'
- if hasattr(self, 'backbone'):
- backbone = self.backbone
- else:
- backbone = self.__class__.__name__
- if backbone == "HRNet":
- backbone = backbone + "_W{}".format(self.width)
- class_name = self.__class__.__name__
- pretrain_weights = get_pretrain_weights(
- pretrain_weights, class_name, backbone, pretrain_dir)
- if startup_prog is None:
- startup_prog = fluid.default_startup_program()
- self.exe.run(startup_prog)
- if resume_checkpoint:
- logging.info(
- "Resume checkpoint from {}.".format(resume_checkpoint),
- use_color=True)
- paddlex.utils.utils.load_pretrain_weights(
- self.exe, self.train_prog, resume_checkpoint, resume=True)
- if not osp.exists(osp.join(resume_checkpoint, "model.yml")):
- raise Exception("There's not model.yml in {}".format(
- resume_checkpoint))
- with open(osp.join(resume_checkpoint, "model.yml")) as f:
- info = yaml.load(f.read(), Loader=yaml.Loader)
- self.completed_epochs = info['completed_epochs']
- elif pretrain_weights is not None:
- logging.info(
- "Load pretrain weights from {}.".format(pretrain_weights),
- use_color=True)
- paddlex.utils.utils.load_pretrain_weights(
- self.exe, self.train_prog, pretrain_weights, fuse_bn)
- # 进行裁剪
- if sensitivities_file is not None:
- import paddleslim
- from .slim.prune_config import get_sensitivities
- sensitivities_file = get_sensitivities(sensitivities_file, self,
- save_dir)
- from .slim.prune import get_params_ratios, prune_program
- logging.info(
- "Start to prune program with eval_metric_loss = {}".format(
- eval_metric_loss),
- use_color=True)
- origin_flops = paddleslim.analysis.flops(self.test_prog)
- prune_params_ratios = get_params_ratios(
- sensitivities_file, eval_metric_loss=eval_metric_loss)
- prune_program(self, prune_params_ratios)
- current_flops = paddleslim.analysis.flops(self.test_prog)
- remaining_ratio = current_flops / origin_flops
- logging.info(
- "Finish prune program, before FLOPs:{}, after prune FLOPs:{}, remaining ratio:{}"
- .format(origin_flops, current_flops, remaining_ratio),
- use_color=True)
- self.status = 'Prune'
- def get_model_info(self):
- info = dict()
- info['version'] = paddlex.__version__
- info['Model'] = self.__class__.__name__
- info['_Attributes'] = {'model_type': self.model_type}
- if 'self' in self.init_params:
- del self.init_params['self']
- if '__class__' in self.init_params:
- del self.init_params['__class__']
- if 'model_name' in self.init_params:
- del self.init_params['model_name']
- info['_init_params'] = self.init_params
- info['_Attributes']['num_classes'] = self.num_classes
- info['_Attributes']['labels'] = self.labels
- info['_Attributes']['fixed_input_shape'] = self.fixed_input_shape
- try:
- primary_metric_key = list(self.eval_metrics.keys())[0]
- primary_metric_value = float(self.eval_metrics[primary_metric_key])
- info['_Attributes']['eval_metrics'] = {
- primary_metric_key: primary_metric_value
- }
- except:
- pass
- if hasattr(self, 'test_transforms'):
- if hasattr(self.test_transforms, 'to_rgb'):
- if self.test_transforms.to_rgb:
- info['TransformsMode'] = 'RGB'
- else:
- info['TransformsMode'] = 'BGR'
- if self.test_transforms is not None:
- info['Transforms'] = list()
- for op in self.test_transforms.transforms:
- name = op.__class__.__name__
- attr = op.__dict__
- info['Transforms'].append({name: attr})
- info['completed_epochs'] = self.completed_epochs
- return info
- def save_model(self, save_dir):
- if not osp.isdir(save_dir):
- if osp.exists(save_dir):
- os.remove(save_dir)
- os.makedirs(save_dir)
- if self.train_prog is not None:
- fluid.save(self.train_prog, osp.join(save_dir, 'model'))
- else:
- fluid.save(self.test_prog, osp.join(save_dir, 'model'))
- model_info = self.get_model_info()
- model_info['status'] = self.status
- with open(
- osp.join(save_dir, 'model.yml'), encoding='utf-8',
- mode='w') as f:
- yaml.dump(model_info, f)
- # 评估结果保存
- if hasattr(self, 'eval_details'):
- with open(osp.join(save_dir, 'eval_details.json'), 'w') as f:
- json.dump(self.eval_details, f)
- if self.status == 'Prune':
- # 保存裁剪的shape
- shapes = {}
- for block in self.train_prog.blocks:
- for param in block.all_parameters():
- pd_var = fluid.global_scope().find_var(param.name)
- pd_param = pd_var.get_tensor()
- shapes[param.name] = np.array(pd_param).shape
- with open(
- osp.join(save_dir, 'prune.yml'), encoding='utf-8',
- mode='w') as f:
- yaml.dump(shapes, f)
- # 模型保存成功的标志
- open(osp.join(save_dir, '.success'), 'w').close()
- logging.info("Model saved in {}.".format(save_dir))
- def export_inference_model(self, save_dir):
- test_input_names = [
- var.name for var in list(self.test_inputs.values())
- ]
- test_outputs = list(self.test_outputs.values())
- with fluid.scope_guard(self.scope):
- fluid.io.save_inference_model(
- dirname=save_dir,
- executor=self.exe,
- params_filename='__params__',
- feeded_var_names=test_input_names,
- target_vars=test_outputs,
- main_program=self.test_prog)
- model_info = self.get_model_info()
- model_info['status'] = 'Infer'
- # 保存模型输出的变量描述
- model_info['_ModelInputsOutputs'] = dict()
- model_info['_ModelInputsOutputs']['test_inputs'] = [
- [k, v.name] for k, v in self.test_inputs.items()
- ]
- model_info['_ModelInputsOutputs']['test_outputs'] = [
- [k, v.name] for k, v in self.test_outputs.items()
- ]
- with open(
- osp.join(save_dir, 'model.yml'), encoding='utf-8',
- mode='w') as f:
- yaml.dump(model_info, f)
- # 模型保存成功的标志
- open(osp.join(save_dir, '.success'), 'w').close()
- logging.info("Model for inference deploy saved in {}.".format(
- save_dir))
- def train_loop(self,
- num_epochs,
- train_dataset,
- train_batch_size,
- eval_dataset=None,
- save_interval_epochs=1,
- log_interval_steps=10,
- save_dir='output',
- use_vdl=False,
- early_stop=False,
- early_stop_patience=5):
- if train_dataset.num_samples < train_batch_size:
- raise Exception(
- 'The amount of training datset must be larger than batch size.')
- if not osp.isdir(save_dir):
- if osp.exists(save_dir):
- os.remove(save_dir)
- os.makedirs(save_dir)
- if use_vdl:
- from visualdl import LogWriter
- vdl_logdir = osp.join(save_dir, 'vdl_log')
- # 给transform添加arrange操作
- arrange_transforms(
- model_type=self.model_type,
- class_name=self.__class__.__name__,
- transforms=train_dataset.transforms,
- mode='train')
- # 构建train_data_loader
- self.build_train_data_loader(
- dataset=train_dataset, batch_size=train_batch_size)
- if eval_dataset is not None:
- self.eval_transforms = eval_dataset.transforms
- self.test_transforms = copy.deepcopy(eval_dataset.transforms)
- # 获取实时变化的learning rate
- lr = self.optimizer._learning_rate
- if isinstance(lr, fluid.framework.Variable):
- self.train_outputs['lr'] = lr
- # 在多卡上跑训练
- if self.parallel_train_prog is None:
- build_strategy = fluid.compiler.BuildStrategy()
- build_strategy.fuse_all_optimizer_ops = False
- if paddlex.env_info['place'] != 'cpu' and len(self.places) > 1:
- build_strategy.sync_batch_norm = self.sync_bn
- exec_strategy = fluid.ExecutionStrategy()
- exec_strategy.num_iteration_per_drop_scope = 1
- self.parallel_train_prog = fluid.CompiledProgram(
- self.train_prog).with_data_parallel(
- loss_name=self.train_outputs['loss'].name,
- build_strategy=build_strategy,
- exec_strategy=exec_strategy)
- total_num_steps = math.floor(train_dataset.num_samples /
- train_batch_size)
- num_steps = 0
- time_stat = list()
- time_train_one_epoch = None
- time_eval_one_epoch = None
- total_num_steps_eval = 0
- # 模型总共的评估次数
- total_eval_times = math.ceil(num_epochs / save_interval_epochs)
- # 检测目前仅支持单卡评估,训练数据batch大小与显卡数量之商为验证数据batch大小。
- eval_batch_size = train_batch_size
- if self.model_type == 'detector':
- eval_batch_size = self._get_single_card_bs(train_batch_size)
- if eval_dataset is not None:
- total_num_steps_eval = math.ceil(eval_dataset.num_samples /
- eval_batch_size)
- if use_vdl:
- # VisualDL component
- log_writer = LogWriter(vdl_logdir)
- thresh = 0.0001
- if early_stop:
- earlystop = EarlyStop(early_stop_patience, thresh)
- best_accuracy_key = ""
- best_accuracy = -1.0
- best_model_epoch = -1
- start_epoch = self.completed_epochs
- for i in range(start_epoch, num_epochs):
- records = list()
- step_start_time = time.time()
- epoch_start_time = time.time()
- for step, data in enumerate(self.train_data_loader()):
- outputs = self.exe.run(
- self.parallel_train_prog,
- feed=data,
- fetch_list=list(self.train_outputs.values()))
- outputs_avg = np.mean(np.array(outputs), axis=1)
- records.append(outputs_avg)
- # 训练完成剩余时间预估
- current_time = time.time()
- step_cost_time = current_time - step_start_time
- step_start_time = current_time
- if len(time_stat) < 20:
- time_stat.append(step_cost_time)
- else:
- time_stat[num_steps % 20] = step_cost_time
- # 每间隔log_interval_steps,输出loss信息
- num_steps += 1
- if num_steps % log_interval_steps == 0:
- step_metrics = OrderedDict(
- zip(list(self.train_outputs.keys()), outputs_avg))
- if use_vdl:
- for k, v in step_metrics.items():
- log_writer.add_scalar(
- 'Metrics/Training(Step): {}'.format(k), v,
- num_steps)
- # 估算剩余时间
- avg_step_time = np.mean(time_stat)
- if time_train_one_epoch is not None:
- eta = (num_epochs - i - 1) * time_train_one_epoch + (
- total_num_steps - step - 1) * avg_step_time
- else:
- eta = ((num_epochs - i) * total_num_steps - step - 1
- ) * avg_step_time
- if time_eval_one_epoch is not None:
- eval_eta = (
- total_eval_times - i // save_interval_epochs
- ) * time_eval_one_epoch
- else:
- eval_eta = (
- total_eval_times - i // save_interval_epochs
- ) * total_num_steps_eval * avg_step_time
- eta_str = seconds_to_hms(eta + eval_eta)
- logging.info(
- "[TRAIN] Epoch={}/{}, Step={}/{}, {}, time_each_step={}s, eta={}"
- .format(i + 1, num_epochs, step + 1, total_num_steps,
- dict2str(step_metrics),
- round(avg_step_time, 2), eta_str))
- train_metrics = OrderedDict(
- zip(list(self.train_outputs.keys()), np.mean(
- records, axis=0)))
- logging.info('[TRAIN] Epoch {} finished, {} .'.format(
- i + 1, dict2str(train_metrics)))
- time_train_one_epoch = time.time() - epoch_start_time
- epoch_start_time = time.time()
- # 每间隔save_interval_epochs, 在验证集上评估和对模型进行保存
- self.completed_epochs += 1
- eval_epoch_start_time = time.time()
- if (i + 1) % save_interval_epochs == 0 or i == num_epochs - 1:
- current_save_dir = osp.join(save_dir, "epoch_{}".format(i + 1))
- if not osp.isdir(current_save_dir):
- os.makedirs(current_save_dir)
- if getattr(self, 'use_ema', False):
- self.exe.run(self.ema.apply_program)
- if eval_dataset is not None and eval_dataset.num_samples > 0:
- self.eval_metrics, self.eval_details = self.evaluate(
- eval_dataset=eval_dataset,
- batch_size=eval_batch_size,
- epoch_id=i + 1,
- return_details=True)
- logging.info('[EVAL] Finished, Epoch={}, {} .'.format(
- i + 1, dict2str(self.eval_metrics)))
- # 保存最优模型
- best_accuracy_key = list(self.eval_metrics.keys())[0]
- current_accuracy = self.eval_metrics[best_accuracy_key]
- if current_accuracy > best_accuracy:
- best_accuracy = current_accuracy
- best_model_epoch = i + 1
- best_model_dir = osp.join(save_dir, "best_model")
- self.save_model(save_dir=best_model_dir)
- if use_vdl:
- for k, v in self.eval_metrics.items():
- if isinstance(v, list):
- continue
- if isinstance(v, np.ndarray):
- if v.size > 1:
- continue
- log_writer.add_scalar(
- "Metrics/Eval(Epoch): {}".format(k), v, i + 1)
- self.save_model(save_dir=current_save_dir)
- if getattr(self, 'use_ema', False):
- self.exe.run(self.ema.restore_program)
- time_eval_one_epoch = time.time() - eval_epoch_start_time
- eval_epoch_start_time = time.time()
- if best_model_epoch > 0:
- logging.info(
- 'Current evaluated best model in eval_dataset is epoch_{}, {}={}'
- .format(best_model_epoch, best_accuracy_key,
- best_accuracy))
- if eval_dataset is not None and early_stop:
- if earlystop(current_accuracy):
- break
|