model.py 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271
  1. # copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import os
  15. from ...base import BaseModel
  16. from ...base.utils.arg import CLIArgument
  17. from ...base.utils.subprocess import CompletedProcess
  18. from ....utils.device import parse_device
  19. from ....utils.misc import abspath
  20. from ....utils.errors import raise_unsupported_api_error
  21. class TSModel(BaseModel):
  22. """TS Model"""
  23. def train(
  24. self,
  25. batch_size: int = None,
  26. learning_rate: float = None,
  27. epochs_iters: int = None,
  28. ips: str = None,
  29. device: str = "gpu",
  30. resume_path: str = None,
  31. dy2st: bool = False,
  32. amp: str = "OFF",
  33. num_workers: int = None,
  34. use_vdl: bool = False,
  35. save_dir: str = None,
  36. **kwargs,
  37. ) -> CompletedProcess:
  38. """train self
  39. Args:
  40. batch_size (int, optional): the train batch size value. Defaults to None.
  41. learning_rate (float, optional): the train learning rate value. Defaults to None.
  42. epochs_iters (int, optional): the train epochs value. Defaults to None.
  43. ips (str, optional): the ip addresses of nodes when using distribution. Defaults to None.
  44. device (str, optional): the running device. Defaults to 'gpu'.
  45. resume_path (str, optional): the checkpoint file path to resume training. Train from scratch if it is set
  46. to None. Defaults to None.
  47. dy2st (bool, optional): Enable dynamic to static. Defaults to False.
  48. amp (str, optional): the amp settings. Defaults to 'OFF'.
  49. num_workers (int, optional): the workers number. Defaults to None.
  50. use_vdl (bool, optional): enable VisualDL. Defaults to False.
  51. save_dir (str, optional): the directory path to save train output. Defaults to None.
  52. Returns:
  53. CompletedProcess: the result of training subprocess execution.
  54. """
  55. config = self.config.copy()
  56. cli_args = []
  57. if batch_size is not None:
  58. cli_args.append(CLIArgument("--batch_size", batch_size))
  59. if learning_rate is not None:
  60. cli_args.append(CLIArgument("--learning_rate", learning_rate))
  61. if epochs_iters is not None:
  62. cli_args.append(CLIArgument("--epoch", epochs_iters))
  63. if resume_path:
  64. raise ValueError("`resume_path` is not supported.")
  65. # No need to handle `ips`
  66. benchmark = kwargs.pop("benchmark", None)
  67. if benchmark is not None:
  68. amp = benchmark.get("amp", None)
  69. if amp in ["O1", "O2"]:
  70. config.update_amp(amp)
  71. if use_vdl:
  72. raise ValueError(f"`use_vdl`={use_vdl} is not supported.")
  73. if device is not None:
  74. device_type, _ = parse_device(device)
  75. cli_args.append(CLIArgument("--device", device_type))
  76. if save_dir is not None:
  77. save_dir = abspath(save_dir)
  78. else:
  79. # `save_dir` is None
  80. save_dir = abspath(os.path.join("output", "train"))
  81. cli_args.append(CLIArgument("--save_dir", save_dir))
  82. # Benchmarking mode settings
  83. benchmark = kwargs.pop("benchmark", None)
  84. if benchmark is not None:
  85. envs = benchmark.get("env", None)
  86. num_workers = benchmark.get("num_workers", None)
  87. config.update_log_ranks(device)
  88. config.update_print_mem_info(benchmark.get("print_mem_info", True))
  89. if num_workers is not None:
  90. assert isinstance(num_workers, int), "num_workers must be an integer"
  91. cli_args.append(CLIArgument("--num_workers", num_workers))
  92. if envs is not None:
  93. for env_name, env_value in envs.items():
  94. os.environ[env_name] = str(env_value)
  95. else:
  96. if num_workers is not None:
  97. cli_args.append(CLIArgument("--num_workers", num_workers))
  98. # PDX related settings
  99. uniform_output_enabled = kwargs.pop("uniform_output_enabled", True)
  100. config.update({"uniform_output_enabled": uniform_output_enabled})
  101. config.update({"pdx_model_name": self.name})
  102. self._assert_empty_kwargs(kwargs)
  103. with self._create_new_config_file() as config_path:
  104. config.dump(config_path)
  105. return self.runner.train(config_path, cli_args, device, ips, save_dir)
  106. def evaluate(
  107. self,
  108. weight_path: str,
  109. batch_size: int = None,
  110. ips: str = None,
  111. device: str = "gpu",
  112. amp: str = "OFF",
  113. num_workers: int = None,
  114. **kwargs,
  115. ) -> CompletedProcess:
  116. """evaluate self using specified weight
  117. Args:
  118. weight_path (str): the path of model weight file to be evaluated.
  119. batch_size (int, optional): the batch size value in evaluating. Defaults to None.
  120. ips (str, optional): the ip addresses of nodes when using distribution. Defaults to None.
  121. device (str, optional): the running device. Defaults to 'gpu'.
  122. amp (str, optional): the AMP setting. Defaults to 'OFF'.
  123. num_workers (int, optional): the workers number in evaluating. Defaults to None.
  124. Returns:
  125. CompletedProcess: the result of evaluating subprocess execution.
  126. """
  127. config = self.config.copy()
  128. cli_args = []
  129. weight_path = abspath(weight_path)
  130. cli_args.append(CLIArgument("--checkpoints", weight_path))
  131. if batch_size is not None:
  132. if batch_size != 1:
  133. raise ValueError("Batch size other than 1 is not supported.")
  134. # No need to handle `ips`
  135. if device is not None:
  136. device_type, _ = parse_device(device)
  137. cli_args.append(CLIArgument("--device", device_type))
  138. if amp is not None:
  139. if amp != "OFF":
  140. raise ValueError(f"`amp`={amp} is not supported.")
  141. if num_workers is not None:
  142. cli_args.append(CLIArgument("--num_workers", num_workers))
  143. self._assert_empty_kwargs(kwargs)
  144. with self._create_new_config_file() as config_path:
  145. config.dump(config_path)
  146. cp = self.runner.evaluate(config_path, cli_args, device, ips)
  147. return cp
  148. def predict(
  149. self,
  150. weight_path: str,
  151. input_path: str,
  152. device: str = "gpu",
  153. save_dir: str = None,
  154. **kwargs,
  155. ) -> CompletedProcess:
  156. """predict using specified weight
  157. Args:
  158. weight_path (str): the path of model weight file used to predict.
  159. input_path (str): the path of image file to be predicted.
  160. device (str, optional): the running device. Defaults to 'gpu'.
  161. save_dir (str, optional): the directory path to save predict output. Defaults to None.
  162. Returns:
  163. CompletedProcess: the result of predicting subprocess execution.
  164. """
  165. config = self.config.copy()
  166. cli_args = []
  167. weight_path = abspath(weight_path)
  168. cli_args.append(CLIArgument("--checkpoints", weight_path))
  169. input_path = abspath(input_path)
  170. cli_args.append(CLIArgument("--csv_path", input_path))
  171. if device is not None:
  172. device_type, _ = parse_device(device)
  173. cli_args.append(CLIArgument("--device", device_type))
  174. if save_dir is not None:
  175. save_dir = abspath(save_dir)
  176. else:
  177. # `save_dir` is None
  178. save_dir = abspath(os.path.join("output", "predict"))
  179. cli_args.append(CLIArgument("--save_dir", save_dir))
  180. self._assert_empty_kwargs(kwargs)
  181. with self._create_new_config_file() as config_path:
  182. config.dump(config_path)
  183. return self.runner.predict(config_path, cli_args, device)
  184. def export(
  185. self, weight_path: str, save_dir: str = None, device: str = "gpu", **kwargs
  186. ):
  187. """export"""
  188. if not weight_path.startswith(("http://", "https://")):
  189. weight_path = abspath(weight_path)
  190. save_dir = abspath(save_dir)
  191. cli_args = []
  192. cli_args.append(CLIArgument("--checkpoints", weight_path))
  193. if save_dir is not None:
  194. save_dir = abspath(save_dir)
  195. else:
  196. save_dir = abspath(os.path.join("output", "inference"))
  197. cli_args.append(CLIArgument("--save_dir", save_dir))
  198. if device is not None:
  199. device_type, _ = parse_device(device)
  200. cli_args.append(CLIArgument("--device", device_type))
  201. self._assert_empty_kwargs(kwargs)
  202. with self._create_new_config_file() as config_path:
  203. # Update YAML config file
  204. config = self.config.copy()
  205. config.update_pretrained_weights(weight_path)
  206. config.update({"pdx_model_name": self.name})
  207. config.dump(config_path)
  208. return self.runner.export(config_path, cli_args, device)
  209. def infer(
  210. self,
  211. model_dir: str,
  212. input_path: str,
  213. device: str = "gpu",
  214. save_dir: str = None,
  215. **kwargs,
  216. ):
  217. """infer"""
  218. raise_unsupported_api_error("infer", self.__class__)
  219. def compression(
  220. self,
  221. weight_path: str,
  222. batch_size=None,
  223. learning_rate=None,
  224. epochs_iters=None,
  225. device: str = "gpu",
  226. use_vdl=True,
  227. save_dir=None,
  228. **kwargs,
  229. ):
  230. """compression"""
  231. raise_unsupported_api_error("compression", self.__class__)