device.py 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163
  1. # copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import os
  15. import GPUtil
  16. import lazy_paddle as paddle
  17. from . import logging
  18. from .flags import DISABLE_DEV_MODEL_WL
  19. from .errors import raise_unsupported_device_error
  20. from .custom_device_whitelist import (
  21. DCU_WHITELIST,
  22. MLU_WHITELIST,
  23. NPU_WHITELIST,
  24. XPU_WHITELIST,
  25. GCU_WHITELIST,
  26. )
  27. SUPPORTED_DEVICE_TYPE = ["cpu", "gpu", "xpu", "npu", "mlu", "gcu", "dcu"]
  28. def constr_device(device_type, device_ids):
  29. if device_type == "cpu" and device_ids is not None:
  30. raise ValueError("`device_ids` must be None for CPUs")
  31. if device_ids:
  32. device_ids = ",".join(map(str, device_ids))
  33. return f"{device_type}:{device_ids}"
  34. else:
  35. return f"{device_type}"
  36. def get_default_device():
  37. avail_gpus = GPUtil.getAvailable()
  38. if not avail_gpus:
  39. # maybe edge devices like Jetson
  40. if os.path.exists("/etc/nv_tegra_release"):
  41. avail_gpus = [0]
  42. logging.info(
  43. "Detected that the current device is a Jetson edge device. The default behavior will be to use GPU: 0"
  44. )
  45. if not avail_gpus:
  46. return "cpu"
  47. else:
  48. return constr_device("gpu", [avail_gpus[0]])
  49. def parse_device(device):
  50. """parse_device"""
  51. # According to https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/device/set_device_cn.html
  52. parts = device.split(":")
  53. if len(parts) > 2:
  54. raise ValueError(f"Invalid device: {device}")
  55. if len(parts) == 1:
  56. device_type, device_ids = parts[0], None
  57. else:
  58. device_type, device_ids = parts
  59. device_ids = device_ids.split(",")
  60. for device_id in device_ids:
  61. if not device_id.isdigit():
  62. raise ValueError(
  63. f"Device ID must be an integer. Invalid device ID: {device_id}"
  64. )
  65. device_ids = list(map(int, device_ids))
  66. device_type = device_type.lower()
  67. # raise_unsupported_device_error(device_type, SUPPORTED_DEVICE_TYPE)
  68. assert device_type.lower() in SUPPORTED_DEVICE_TYPE
  69. if device_type == "cpu" and device_ids is not None:
  70. raise ValueError("No Device ID should be specified for CPUs")
  71. return device_type, device_ids
  72. def update_device_num(device, num):
  73. device_type, device_ids = parse_device(device)
  74. if device_ids:
  75. assert len(device_ids) >= num
  76. return constr_device(device_type, device_ids[:num])
  77. else:
  78. return constr_device(device_type, device_ids)
  79. def set_env_for_device(device):
  80. device_type, _ = parse_device(device)
  81. return set_env_for_device_type(device_type)
  82. def set_env_for_device_type(device_type):
  83. def _set(envs):
  84. for key, val in envs.items():
  85. os.environ[key] = val
  86. logging.debug(f"{key} has been set to {val}.")
  87. # XXX: is_compiled_with_rocm() must be True on dcu platform ?
  88. if device_type.lower() == "dcu" and paddle.is_compiled_with_rocm():
  89. envs = {"FLAGS_conv_workspace_size_limit": "2000"}
  90. _set(envs)
  91. if device_type.lower() == "npu":
  92. envs = {
  93. "FLAGS_npu_jit_compile": "0",
  94. "FLAGS_use_stride_kernel": "0",
  95. "FLAGS_allocator_strategy": "auto_growth",
  96. "CUSTOM_DEVICE_BLACK_LIST": "pad3d,pad3d_grad,set_value,set_value_with_tensor",
  97. "FLAGS_npu_scale_aclnn": "True",
  98. "FLAGS_npu_split_aclnn": "True",
  99. }
  100. _set(envs)
  101. if device_type.lower() == "xpu":
  102. envs = {
  103. "BKCL_FORCE_SYNC": "1",
  104. "BKCL_TIMEOUT": "1800",
  105. "FLAGS_use_stride_kernel": "0",
  106. "XPU_BLACK_LIST": "pad3d",
  107. }
  108. _set(envs)
  109. if device_type.lower() == "mlu":
  110. envs = {"FLAGS_use_stride_kernel": "0"}
  111. _set(envs)
  112. if device_type.lower() == "gcu":
  113. envs = {"FLAGS_use_stride_kernel": "0"}
  114. _set(envs)
  115. def check_supported_device_type(device_type, model_name):
  116. if DISABLE_DEV_MODEL_WL:
  117. logging.warning(
  118. "Skip checking if model is supported on device because the flag `PADDLE_PDX_DISABLE_DEV_MODEL_WL` has been set."
  119. )
  120. return
  121. if device_type == "dcu":
  122. assert (
  123. model_name in DCU_WHITELIST
  124. ), f"The DCU device does not yet support `{model_name}` model!"
  125. elif device_type == "mlu":
  126. assert (
  127. model_name in MLU_WHITELIST
  128. ), f"The MLU device does not yet support `{model_name}` model!"
  129. elif device_type == "npu":
  130. assert (
  131. model_name in NPU_WHITELIST
  132. ), f"The NPU device does not yet support `{model_name}` model!"
  133. elif device_type == "xpu":
  134. assert (
  135. model_name in XPU_WHITELIST
  136. ), f"The XPU device does not yet support `{model_name}` model!"
  137. elif device_type == "gcu":
  138. assert (
  139. model_name in GCU_WHITELIST
  140. ), f"The GCU device does not yet support `{model_name}` model!"
  141. def check_supported_device(device, model_name):
  142. device_type, _ = parse_device(device)
  143. return check_supported_device_type(device_type, model_name)