Browse Source

feat(zhch): 添加NVIDIA环境检查脚本,验证nvidia-smi、NVML和PaddlePaddle配置

zhch158_admin 3 tháng trước
mục cha
commit
657c41ae11
1 tập tin đã thay đổi với 54 bổ sung và 0 xóa
  1. 54 0
      zhch/check_cuda_env.py

+ 54 - 0
zhch/check_cuda_env.py

@@ -0,0 +1,54 @@
+import subprocess
+import sys
+
+def check_nvidia_environment():
+    print("=== NVIDIA环境检查 ===")
+    
+    # 检查nvidia-smi
+    try:
+        result = subprocess.run(['nvidia-smi'], capture_output=True, text=True, timeout=10)
+        if result.returncode == 0:
+            print("✓ nvidia-smi 正常运行")
+            # 提取驱动版本
+            lines = result.stdout.split('\n')
+            for line in lines:
+                if 'Driver Version:' in line:
+                    print(f"  {line.strip()}")
+                    break
+        else:
+            print("✗ nvidia-smi 失败:")
+            print(result.stderr)
+    except Exception as e:
+        print(f"✗ nvidia-smi 错误: {e}")
+    
+    # 检查NVML
+    try:
+        import pynvml
+        pynvml.nvmlInit()
+        driver_version = pynvml.nvmlSystemGetDriverVersion()
+        print(f"✓ NVML初始化成功,驱动版本: {driver_version}")
+        
+        device_count = pynvml.nvmlDeviceGetCount()
+        print(f"✓ 检测到 {device_count} 个GPU设备")
+        
+        for i in range(device_count):
+            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
+            name = pynvml.nvmlDeviceGetName(handle)
+            print(f"  GPU {i}: {name}")
+            
+    except Exception as e:
+        print(f"✗ NVML错误: {e}")
+    
+    # 检查PaddlePaddle
+    try:
+        import paddle
+        print(f"✓ PaddlePaddle版本: {paddle.__version__}")
+        print(f"✓ CUDA编译支持: {paddle.device.is_compiled_with_cuda()}")
+        if paddle.device.is_compiled_with_cuda():
+            gpu_count = paddle.device.cuda.device_count()
+            print(f"✓ PaddlePaddle检测到 {gpu_count} 个GPU")
+    except Exception as e:
+        print(f"✗ PaddlePaddle错误: {e}")
+
+if __name__ == "__main__":  # run the checks only when executed directly as a script
+    check_nvidia_environment()