| 123456789101112131415161718192021222324 |
- from typing import List, Optional
- import torch
def detect_available_gpus() -> List[int]:
    """Detect CUDA devices visible to torch.

    Returns:
        A list of device indices ``[0, 1, ...]`` for every GPU torch can
        see. On any failure the error is printed and ``[]`` is returned.
    """
    try:
        count = torch.cuda.device_count()
        gpus = [idx for idx in range(count)]
        print(f"检测到 {count} 个可用GPU: {gpus}")
        return gpus
    except Exception as e:
        # Best-effort probe: never propagate, just report and fall back.
        print(f"GPU检测失败: {e}")
        return []
def monitor_gpu_memory(gpu_ids: Optional[List[int]] = None):
    """Print total / allocated / reserved memory for the given GPUs.

    Args:
        gpu_ids: GPU device indices to report on. Defaults to ``[0, 1]``
            (the original hard-coded default) when ``None``.

    Any failure (e.g. no CUDA runtime, invalid device id) is printed and
    swallowed — this is a best-effort diagnostic helper.
    """
    # Fix: the original used a mutable default argument ([0, 1]), which is
    # shared across calls. A None sentinel preserves the same default
    # behavior without the shared-state pitfall.
    if gpu_ids is None:
        gpu_ids = [0, 1]
    try:
        for gpu_id in gpu_ids:
            # NOTE(review): set_device changes the process-wide current
            # device as a side effect; the queries below already take
            # gpu_id explicitly, so this call is kept only to preserve
            # the original behavior.
            torch.cuda.set_device(gpu_id)
            total = torch.cuda.get_device_properties(gpu_id).total_memory / 1024**3
            allocated = torch.cuda.memory_allocated(gpu_id) / 1024**3
            reserved = torch.cuda.memory_reserved(gpu_id) / 1024**3
            print(f"GPU {gpu_id} - 显存: {total:.2f}GB, 已分配: {allocated:.2f}GB, 已预留: {reserved:.2f}GB")
    except Exception as e:
        print(f"GPU内存监控失败: {e}")
|