|
@@ -0,0 +1,24 @@
|
|
|
|
|
from typing import List, Optional

import paddle
|
|
|
|
|
def detect_available_gpus() -> List[int]:
    """Detect the GPUs visible to Paddle.

    Returns:
        A list of consecutive GPU indices ``[0, 1, ..., count - 1]`` as
        reported by ``paddle.device.cuda.device_count()``, or an empty
        list if detection raises for any reason (best-effort probe).
    """
    try:
        count = paddle.device.cuda.device_count()
        gpus = [idx for idx in range(count)]
        print(f"检测到 {count} 个可用GPU: {gpus}")
        return gpus
    except Exception as err:
        # Best-effort: any failure (no CUDA, driver issues, ...) degrades
        # to "no GPUs" instead of crashing the caller.
        print(f"GPU检测失败: {err}")
        return []
|
|
|
|
|
+
|
|
|
|
|
def monitor_gpu_memory(gpu_ids: Optional[List[int]] = None) -> None:
    """Print per-GPU memory usage (total / allocated / reserved, in GiB).

    Args:
        gpu_ids: GPU indices to report on. Defaults to ``[0, 1]`` when
            omitted.

    Note: ``memory_allocated``/``memory_reserved`` report on the current
    device, so each GPU is activated via ``set_device`` first. Any failure
    is printed and swallowed (best-effort monitoring, no exception
    propagates to the caller).
    """
    # Fix: avoid a mutable default argument ([0, 1] was shared across
    # calls); use a None sentinel with the same effective default.
    if gpu_ids is None:
        gpu_ids = [0, 1]
    try:
        for gpu_id in gpu_ids:
            # Select the device so the memory queries below apply to it.
            paddle.device.set_device(f"gpu:{gpu_id}")
            total = paddle.device.cuda.get_device_properties(gpu_id).total_memory / 1024**3
            allocated = paddle.device.cuda.memory_allocated() / 1024**3
            reserved = paddle.device.cuda.memory_reserved() / 1024**3
            print(f"GPU {gpu_id} - 显存: {total:.2f}GB, 已分配: {allocated:.2f}GB, 已预留: {reserved:.2f}GB")
    except Exception as e:
        print(f"GPU内存监控失败: {e}")
|