Эх сурвалжийг харах

fix(zhch): 优化多GPU进程模型初始化和错误处理

- 在主进程中初始化Paddle,避免子进程CUDA初始化冲突
- 添加模型初始化错误处理,将错误信息发送到结果队列
- 修改默认设备设置,主进程使用CPU
- 优化代码结构,添加异常捕获和错误日志
zhch158_admin 3 сар өмнө
parent
commit
2de26d87b4

+ 38 - 6
zhch/ppstructurev3_multi_gpu_multiprocess_official.py

@@ -16,6 +16,7 @@ from paddlex import create_pipeline
 from paddlex.utils.device import constr_device, parse_device
 from tqdm import tqdm
 import paddle
+from cuda_utils import detect_available_gpus, monitor_gpu_memory
 
 from dotenv import load_dotenv
 load_dotenv(override=True)
@@ -41,9 +42,26 @@ def worker(pipeline_name_or_config_path: str,
     """
     try:
         # 创建pipeline实例
+        from dotenv import load_dotenv
+        load_dotenv(override=True)
+        print(f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES')}")
+        import paddle
+        paddle.set_device(device)
         pipeline = create_pipeline(pipeline_name_or_config_path, device=device)
         print(f"Worker {worker_id} initialized with device {device}")
+    except Exception as e:
+        print(f"Worker {worker_id} ({device}) initialization failed: {e}", file=sys.stderr)
+        traceback.print_exc()
+        # 发送错误信息到结果队列
+        result_queue.put([{
+            "error": f"Worker initialization failed: {str(e)}",
+            "worker_id": worker_id,
+            "device": device,
+            "success": False
+        }])
+        return
         
+    try:
         should_end = False
         batch = []
         processed_count = 0
@@ -61,7 +79,7 @@ def worker(pipeline_name_or_config_path: str,
                     start_time = time.time()
                     
                     # 使用pipeline预测
-                    results = list(pipeline.predict(
+                    results = pipeline.predict(
                         batch,
                         use_doc_orientation_classify=True,
                         use_doc_unwarping=False,
@@ -69,7 +87,7 @@ def worker(pipeline_name_or_config_path: str,
                         use_chart_recognition=True,
                         use_table_recognition=True,
                         use_formula_recognition=True,
-                    ))
+                    )
                     
                     batch_processing_time = time.time() - start_time
                     batch_results = []
@@ -193,6 +211,14 @@ def parallel_process_with_official_approach(image_paths: List[str],
     print(f"  Batch size: {batch_size}")
     print(f"  Total images: {len(image_paths)}")
     
+    # 在主进程中初始化paddle,防止子进程CUDA初始化冲突
+    try:
+        import paddle
+        # 只在主进程中设置一个默认设备
+        paddle.set_device("cpu")  # 主进程使用CPU
+    except Exception as e:
+        print(f"Warning: Failed to initialize paddle in main process: {e}")
+
     # 使用Manager创建队列
     with Manager() as manager:
         task_queue = manager.Queue()
@@ -492,6 +518,12 @@ def main():
         return 1
 
 if __name__ == "__main__":
+    print(f"🚀 启动OCR程序...")
+    print(f"CUDA 版本: {paddle.device.cuda.get_device_name()}")
+    print(f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES')}")
+    available_gpus = detect_available_gpus()
+    monitor_gpu_memory(available_gpus)
+
     if len(sys.argv) == 1:
         # 如果没有命令行参数,使用默认配置运行
         print("No command line arguments provided. Running with default configuration...")
@@ -501,10 +533,10 @@ if __name__ == "__main__":
             "input_dir": "../../OmniDocBench/OpenDataLab___OmniDocBench/images",
             "output_dir": "./OmniDocBench_Results_Official",
             "pipeline": "PP-StructureV3",
-            "device": "gpu:0,1",
+            "device": "gpu:0",
             "instances_per_device": 1,
-            "batch_size": 1,
-            "test_mode": False
+            "batch_size": 4,
+            # "test_mode": False
         }
         
         # 构造参数
@@ -513,6 +545,6 @@ if __name__ == "__main__":
             sys.argv.extend([f"--{key}", str(value)])
         
         # 测试模式
-        # sys.argv.append("--test_mode")
+        sys.argv.append("--test_mode")
     
     sys.exit(main())