# cuda_utils.py
from typing import List, Optional

import paddle
  3. def detect_available_gpus() -> List[int]:
  4. """检测可用的GPU"""
  5. try:
  6. gpu_count = paddle.device.cuda.device_count()
  7. available_gpus = list(range(gpu_count))
  8. print(f"检测到 {gpu_count} 个可用GPU: {available_gpus}")
  9. return available_gpus
  10. except Exception as e:
  11. print(f"GPU检测失败: {e}")
  12. return []
  13. def monitor_gpu_memory(gpu_ids: List[int] = [0, 1]):
  14. """监控GPU内存使用情况"""
  15. try:
  16. for gpu_id in gpu_ids:
  17. paddle.device.set_device(f"gpu:{gpu_id}")
  18. total = paddle.device.cuda.get_device_properties(gpu_id).total_memory / 1024**3
  19. allocated = paddle.device.cuda.memory_allocated() / 1024**3
  20. reserved = paddle.device.cuda.memory_reserved() / 1024**3
  21. print(f"GPU {gpu_id} - 显存: {total:.2f}GB, 已分配: {allocated:.2f}GB, 已预留: {reserved:.2f}GB")
  22. except Exception as e:
  23. print(f"GPU内存监控失败: {e}")