#!/bin/bash
# paddle_vllm_daemon.sh
# filepath: ocr_platform/ocr_tools/daemons/paddle_vllm_daemon.sh
# Companion client tool: ocr_tools/paddle_vl_tool/main.py
# Daemon control script for the PaddleOCR-VL vLLM service.

# Directory that holds the PID file and the server log.
LOGDIR="/home/ubuntu/zhch/logs"
# Best effort: a failure is reported but does not abort the script, so
# read-only subcommands keep working.
mkdir -p "$LOGDIR" || echo "Warning: cannot create log directory: $LOGDIR" >&2
PIDFILE="$LOGDIR/paddleocr_vl_vllm.pid"
LOGFILE="$LOGDIR/paddleocr_vl_vllm.log"

# Service settings
CONDA_ENV="paddle" # adjust to match your environment
PORT="8110"
HOST="0.0.0.0"
MODEL_NAME="PaddleOCR-VL-0.9B"
BACKEND="vllm"

# GPU settings
GPU_MEMORY_UTILIZATION="0.3"
CUDA_VISIBLE_DEVICES="3" # use GPU number 3
MAX_MODEL_LEN="16384"
MAX_NUM_BATCHED_TOKENS="8192"
MAX_NUM_SEQS="8"

# PaddleX environment variables
export PADDLE_PDX_MODEL_SOURCE="bos"
export PYTHONWARNINGS="ignore::UserWarning"
  # Initialise conda and activate the configured environment.
  # The known conda.sh locations are tried in order and the first one that
  # exists is used. If none exists, or sourcing/activation fails, the
  # environment's bin directory is prepended to PATH instead.
  conda_activated=0
  for conda_sh in \
    "/home/ubuntu/anaconda3/etc/profile.d/conda.sh" \
    "/opt/conda/etc/profile.d/conda.sh"; do
    if [ -f "$conda_sh" ]; then
      # shellcheck disable=SC1090
      if source "$conda_sh" && conda activate "$CONDA_ENV"; then
        conda_activated=1
      else
        echo "Warning: could not activate conda environment '$CONDA_ENV' via $conda_sh" >&2
      fi
      break
    fi
  done
  if [ "$conda_activated" -ne 1 ]; then
    echo "Warning: Using direct conda path activation"
    export PATH="/home/ubuntu/anaconda3/envs/$CONDA_ENV/bin:$PATH"
  fi
  # Do not leak the helper variables into the rest of the script.
  unset conda_sh conda_activated
  # Launch the PaddleOCR-VL vLLM server in the background and record its PID.
  # Globals:  PIDFILE, LOGFILE, HOST, PORT, MODEL_NAME, BACKEND,
  #           GPU_MEMORY_UTILIZATION, CUDA_VISIBLE_DEVICES (all read)
  # Outputs:  progress messages on stdout; server output is written to LOGFILE
  # Returns:  1 if the service is already running or a required command is
  #           missing; otherwise the exit status of the final `status` call
  start() {
    local pid=""

    # Only a strictly positive integer is accepted as a PID, so an empty or
    # corrupt PID file can never make `kill` target a process group.
    if [ -f "$PIDFILE" ]; then
      pid=$(cat "$PIDFILE" 2>/dev/null)
    fi
    if [[ "$pid" =~ ^[1-9][0-9]*$ ]] && kill -0 "$pid" 2>/dev/null; then
      echo "PaddleOCR-VL vLLM is already running"
      return 1
    fi

    echo "Starting PaddleOCR-VL vLLM daemon..."
    echo "Host: $HOST, Port: $PORT"
    echo "Model: $MODEL_NAME, Backend: $BACKEND"
    echo "GPU memory utilization: $GPU_MEMORY_UTILIZATION"
    echo "CUDA devices: $CUDA_VISIBLE_DEVICES"

    # Verify the environment provides the required commands.
    if ! command -v python >/dev/null 2>&1; then
      echo "❌ Python not found. Check conda environment activation."
      return 1
    fi
    if ! command -v paddlex_genai_server >/dev/null 2>&1; then
      echo "❌ paddlex_genai_server not found. Please install vllm-server plugin:"
      echo " paddlex --install genai-vllm-server"
      return 1
    fi
    echo "🔧 Using Python: $(command -v python)"
    echo "🔧 Using paddlex_genai_server: $(command -v paddlex_genai_server)"

    # Show the state of the selected GPU(s). The device list is matched as a
    # set of indices, so comma-separated values such as "0,1" work too.
    echo "📊 GPU 状态检查:"
    if command -v nvidia-smi >/dev/null 2>&1; then
      nvidia-smi --query-gpu=index,name,memory.used,memory.total \
        --format=csv,noheader,nounits \
        | awk -F',' -v devs="$CUDA_VISIBLE_DEVICES" '
            BEGIN {
              gsub(/[ \t]/, "", devs)
              n = split(devs, ids, ",")
              for (i = 1; i <= n; i++) want[ids[i]] = 1
            }
            ($1 in want) { printf " GPU %s: %s - 内存: %sMB/%sMB\n", $1, $2, $3, $4 }'
    else
      echo "⚠️ nvidia-smi not available"
    fi

    # Start the server detached from the terminal. The backend config is
    # passed as a one-line YAML document through process substitution.
    # NOTE(review): only gpu-memory-utilization is forwarded here; confirm
    # whether further backend options are meant to be passed as well.
    CUDA_VISIBLE_DEVICES="$CUDA_VISIBLE_DEVICES" nohup paddlex_genai_server \
      --model_name "$MODEL_NAME" \
      --backend "$BACKEND" \
      --host "$HOST" \
      --port "$PORT" \
      --backend_config <(printf 'gpu-memory-utilization: %s\n' "$GPU_MEMORY_UTILIZATION") \
      >"$LOGFILE" 2>&1 &
    echo $! >"$PIDFILE"

    echo "✅ PaddleOCR-VL vLLM started with PID: $(cat "$PIDFILE")"
    echo "📋 Log file: $LOGFILE"
    echo "🌐 Service URL: http://$HOST:$PORT"
    echo "📖 API Documentation: http://localhost:$PORT/docs"
    echo ""
    echo "Waiting for service to start..."
    sleep 5
    status
  }
  # Stop the running service: send TERM, wait up to 10 seconds, then KILL.
  # Globals:  PIDFILE (read, removed)
  # Outputs:  progress messages on stdout
  # Returns:  1 if no PID file exists; 0 once the process is gone and the PID
  #           file has been removed (including the stale-PID-file case)
  stop() {
    local pid attempt

    if [ ! -f "$PIDFILE" ]; then
      echo "PaddleOCR-VL vLLM is not running"
      return 1
    fi

    pid=$(cat "$PIDFILE" 2>/dev/null)
    # Accept only a positive integer: `kill 0` would signal the whole process
    # group and an empty value would make `kill` print a usage error.
    if [[ ! "$pid" =~ ^[1-9][0-9]*$ ]] || ! kill -0 "$pid" 2>/dev/null; then
      echo "PaddleOCR-VL vLLM is not running (removing stale PID file)"
      rm -f "$PIDFILE"
      return 0
    fi

    echo "Stopping PaddleOCR-VL vLLM (PID: $pid)..."
    # Graceful stop first.
    kill "$pid"

    # Wait for the process to exit.
    for attempt in {1..10}; do
      kill -0 "$pid" 2>/dev/null || break
      echo "Waiting for process to stop... ($attempt/10)"
      sleep 1
    done

    # Still alive after the grace period: force it.
    if kill -0 "$pid" 2>/dev/null; then
      echo "Force killing process..."
      kill -9 "$pid"
    fi

    rm -f "$PIDFILE"
    echo "✅ PaddleOCR-VL vLLM stopped"
  }
  # Report whether the service is running, plus port, API, GPU and log info.
  # Globals:  PIDFILE (read, removed when stale), LOGFILE, HOST, PORT,
  #           CUDA_VISIBLE_DEVICES (read)
  # Outputs:  human-readable report on stdout
  # Returns:  0 if the recorded process is alive, 1 otherwise
  status() {
    local pid="" sock_tool=""

    if [ -f "$PIDFILE" ]; then
      pid=$(cat "$PIDFILE" 2>/dev/null)
    fi
    # Only a positive integer counts as a PID, so an empty or corrupt PID file
    # is handled as "not running".
    if [[ ! "$pid" =~ ^[1-9][0-9]*$ ]] || ! kill -0 "$pid" 2>/dev/null; then
      echo "❌ PaddleOCR-VL vLLM is not running"
      if [ -f "$PIDFILE" ]; then
        echo "Removing stale PID file..."
        rm -f "$PIDFILE"
      fi
      return 1
    fi

    echo "✅ PaddleOCR-VL vLLM is running (PID: $pid)"
    echo "🌐 Service URL: http://$HOST:$PORT"
    echo "📋 Log file: $LOGFILE"

    # Check whether the port is being listened on, using ss or else netstat.
    if command -v ss >/dev/null 2>&1; then
      sock_tool="ss"
    elif command -v netstat >/dev/null 2>&1; then
      sock_tool="netstat"
    fi
    if [ -n "$sock_tool" ]; then
      if "$sock_tool" -tuln | grep -q ":$PORT "; then
        echo "🔗 Port $PORT is being listened"
      else
        echo "⚠️ Port $PORT is not being listened (service may be starting up)"
      fi
    fi

    # Probe the API. -f makes HTTP error statuses count as failures and
    # --max-time keeps a hung server from blocking this command forever.
    if command -v curl >/dev/null 2>&1; then
      if curl -sf --connect-timeout 2 --max-time 5 \
        "http://127.0.0.1:$PORT/v1/models" >/dev/null 2>&1; then
        echo "🎯 API 响应正常"
      else
        echo "⚠️ API 无响应 (service may be starting up)"
      fi
    fi

    # Show utilisation of the selected GPU(s); the device list is matched as
    # a set of indices, so comma-separated values work.
    if command -v nvidia-smi >/dev/null 2>&1; then
      echo "📊 GPU 使用情况:"
      nvidia-smi \
        --query-gpu=index,utilization.gpu,utilization.memory,memory.used,memory.total \
        --format=csv,noheader,nounits \
        | awk -F',' -v devs="$CUDA_VISIBLE_DEVICES" '
            BEGIN {
              gsub(/[ \t]/, "", devs)
              n = split(devs, ids, ",")
              for (i = 1; i <= n; i++) want[ids[i]] = 1
            }
            ($1 in want) {
              printf " GPU %s: GPU利用率 %s%%, 内存利用率 %s%%, 显存 %sMB/%sMB\n", $1, $2, $3, $4, $5
            }'
    fi

    # Show the most recent log lines.
    if [ -f "$LOGFILE" ]; then
      echo "📄 Latest logs (last 3 lines):"
      tail -n 3 "$LOGFILE" | sed 's/^/ /'
    fi
    return 0
  }
  # Follow the service log file.
  # Globals:  LOGFILE (read)
  # Outputs:  a short header followed by the live `tail -f` output
  # Returns:  1 if the log file does not exist; otherwise the status of tail
  logs() {
    if [ ! -f "$LOGFILE" ]; then
      echo "❌ Log file not found: $LOGFILE"
      return 1
    fi
    echo "📄 PaddleOCR-VL vLLM logs:"
    echo "=================="
    tail -f "$LOGFILE"
  }
  # Print the current configuration, tool locations and GPU information.
  # Globals:  CONDA_ENV, HOST, PORT, MODEL_NAME, BACKEND,
  #           GPU_MEMORY_UTILIZATION, CUDA_VISIBLE_DEVICES, MAX_MODEL_LEN,
  #           MAX_NUM_BATCHED_TOKENS, MAX_NUM_SEQS, PIDFILE, LOGFILE,
  #           PADDLE_PDX_MODEL_SOURCE (all read)
  # Outputs:  human-readable report on stdout
  config() {
    echo "📋 Current configuration:"
    echo " Conda Environment: $CONDA_ENV"
    echo " Host: $HOST"
    echo " Port: $PORT"
    echo " Model Name: $MODEL_NAME"
    echo " Backend: $BACKEND"
    echo " GPU Memory Utilization: $GPU_MEMORY_UTILIZATION"
    echo " CUDA Visible Devices: $CUDA_VISIBLE_DEVICES"
    echo " Max Model Length: $MAX_MODEL_LEN"
    echo " Max Num Batched Tokens: $MAX_NUM_BATCHED_TOKENS"
    echo " Max Num Seqs: $MAX_NUM_SEQS"
    echo " PID File: $PIDFILE"
    echo " Log File: $LOGFILE"
    echo ""
    echo " Model Source: ${PADDLE_PDX_MODEL_SOURCE:-default}"

    # Tool locations. `type -P` forces a PATH lookup, so the executable path
    # is printed even when a shell function of the same name exists.
    echo ""
    echo "🔧 Environment:"
    echo " Python: $(type -P python || echo 'Not found')"
    echo " paddlex_genai_server: $(type -P paddlex_genai_server || echo 'Not found')"
    echo " Conda: $(type -P conda || echo 'Not found')"
    echo " CUDA: $(type -P nvcc || echo 'Not found')"

    # GPU information for the selected device(s); the device list is matched
    # as a set of indices, so comma-separated values work.
    if command -v nvidia-smi >/dev/null 2>&1; then
      echo ""
      echo "🔥 GPU Information:"
      nvidia-smi --query-gpu=index,name,driver_version,memory.total \
        --format=csv,noheader,nounits \
        | awk -F',' -v devs="$CUDA_VISIBLE_DEVICES" '
            BEGIN {
              gsub(/[ \t]/, "", devs)
              n = split(devs, ids, ",")
              for (i = 1; i <= n; i++) want[ids[i]] = 1
            }
            ($1 in want) { printf " GPU %s: %s (Driver: %s, Memory: %sMB)\n", $1, $2, $3, $4 }'
    fi
  }
  # Probe the /v1/models and /health HTTP endpoints of the local service.
  # Globals:  PIDFILE, PORT (read)
  # Outputs:  per-endpoint result and the response bodies on stdout
  # Returns:  1 if the service is not running, curl is missing, or any
  #           endpoint check fails; 0 when both endpoints respond
  test_api() {
    local pid="" response health_response failures=0

    echo "🧪 Testing PaddleOCR-VL vLLM API..."
    if [ -f "$PIDFILE" ]; then
      pid=$(cat "$PIDFILE" 2>/dev/null)
    fi
    if [[ ! "$pid" =~ ^[1-9][0-9]*$ ]] || ! kill -0 "$pid" 2>/dev/null; then
      echo "❌ PaddleOCR-VL vLLM service is not running"
      return 1
    fi
    if ! command -v curl >/dev/null 2>&1; then
      echo "❌ curl command not found"
      return 1
    fi

    # -f turns HTTP error statuses into a non-zero exit; --max-time bounds the
    # whole request so a hung server cannot block the test.
    echo "📡 Testing /v1/models endpoint..."
    if response=$(curl -sf --connect-timeout 5 --max-time 15 \
      "http://127.0.0.1:$PORT/v1/models"); then
      echo "✅ Models endpoint accessible"
      # Pretty-print as JSON when possible, otherwise show the raw body.
      printf '%s\n' "$response" | python -m json.tool 2>/dev/null \
        || printf '%s\n' "$response"
    else
      echo "❌ Models endpoint not accessible"
      failures=$((failures + 1))
    fi

    echo ""
    echo "📡 Testing health endpoint..."
    if health_response=$(curl -sf --connect-timeout 5 --max-time 15 \
      "http://127.0.0.1:$PORT/health"); then
      echo "✅ Health endpoint accessible"
      printf '%s\n' "$health_response"
    else
      echo "❌ Health endpoint not accessible"
      failures=$((failures + 1))
    fi

    [ "$failures" -eq 0 ]
  }
  # Run the PaddleX CLI against the running vLLM server on one test image.
  # Globals:  PIDFILE (read); TEST_IMAGE, TEST_OUTPUT, PIPELINE_CONFIG
  #           (optional overrides; the built-in defaults are used when unset)
  # Outputs:  progress messages and the list of generated files on stdout
  # Returns:  1 if the service is not running, an input file or the paddlex
  #           command is missing, or the CLI run fails; 0 on success
  test_client() {
    local pid="" entry size
    # Test fixture paths; each can be overridden through the variable of the
    # same upper-case name.
    local test_image="${TEST_IMAGE:-/home/ubuntu/zhch/data/至远彩色印刷工业有限公司/2023年度报告母公司.img/2023年度报告母公司_page_006.png}"
    local test_output="${TEST_OUTPUT:-/tmp/paddleocr_vl_vllm_test_output}"
    local pipeline_config="${PIPELINE_CONFIG:-/home/ubuntu/zhch/PaddleX/zhch/my_config/PaddleOCR-VL-Client.yaml}"

    echo "🧪 Testing PaddleOCR-VL client with vLLM server..."
    if [ -f "$PIDFILE" ]; then
      pid=$(cat "$PIDFILE" 2>/dev/null)
    fi
    if [[ ! "$pid" =~ ^[1-9][0-9]*$ ]] || ! kill -0 "$pid" 2>/dev/null; then
      echo "❌ PaddleOCR-VL vLLM service is not running. Start it first with: $0 start"
      return 1
    fi

    if [ ! -f "$test_image" ]; then
      echo "⚠️ Test image not found: $test_image"
      echo "Please provide a test image or update the TEST_IMAGE path in the script"
      return 1
    fi
    if [ ! -f "$pipeline_config" ]; then
      echo "⚠️ Pipeline config not found: $pipeline_config"
      echo "Please update the PIPELINE_CONFIG path in the script"
      return 1
    fi
    if ! command -v paddlex >/dev/null 2>&1; then
      echo "❌ paddlex command not found. Check conda environment activation."
      return 1
    fi

    echo "📄 Testing with image: $test_image"
    echo "⚙️ Using pipeline config: $pipeline_config"
    echo "📁 Output directory: $test_output"
    echo ""

    # Method 1: the paddlex command line (recommended).
    echo "🔧 Using paddlex CLI..."
    mkdir -p "$test_output"
    if ! paddlex --pipeline "$pipeline_config" \
      --input "$test_image" \
      --save_path "$test_output" \
      --use_doc_orientation_classify False \
      --use_doc_unwarping False; then
      echo "❌ CLI test failed"
      return 1
    fi

    echo "✅ CLI test completed successfully"
    echo "📁 Results saved to: $test_output"

    # List the generated files. Names come from the glob, so spaces are safe;
    # ls is used only for its human-readable size column.
    if [ -d "$test_output" ]; then
      echo ""
      echo "📂 Generated files:"
      for entry in "$test_output"/*; do
        [ -e "$entry" ] || continue
        size=$(ls -lhd -- "$entry" | awk '{print $5}')
        echo " ${entry##*/} ($size)"
      done
    fi
  }
  274. # 显示使用帮助
  275. usage() {
  276. echo "PaddleOCR-VL vLLM Service Daemon"
  277. echo "================================="
  278. echo "Usage: $0 {start|stop|restart|status|logs|config|test|test-client}"
  279. echo ""
  280. echo "Commands:"
  281. echo " start - Start the PaddleOCR-VL vLLM service"
  282. echo " stop - Stop the PaddleOCR-VL vLLM service"
  283. echo " restart - Restart the PaddleOCR-VL vLLM service"
  284. echo " status - Show service status and resource usage"
  285. echo " logs - Show service logs (follow mode)"
  286. echo " config - Show current configuration"
  287. echo " test - Test API endpoints"
  288. echo " test-client - Test PaddleX client with vLLM server"
  289. echo ""
  290. echo "Configuration (edit script to modify):"
  291. echo " Host: $HOST"
  292. echo " Port: $PORT"
  293. echo " Model: $MODEL_NAME"
  294. echo " Backend: $BACKEND"
  295. echo " GPU Memory: $GPU_MEMORY_UTILIZATION"
  296. echo " CUDA Devices: $CUDA_VISIBLE_DEVICES"
  297. echo ""
  298. echo "Examples:"
  299. echo " ./paddle_vllm_daemon.sh start"
  300. echo " ./paddle_vllm_daemon.sh status"
  301. echo " ./paddle_vllm_daemon.sh logs"
  302. echo " ./paddle_vllm_daemon.sh test"
  303. echo " ./paddle_vllm_daemon.sh test-client"
  304. }
  # Command dispatch: run the function that matches the first argument.
  # An explicit help request prints the usage text and exits 0; a missing or
  # unknown command prints it and exits 1.
  case "${1:-}" in
    start)
      start
      ;;
    stop)
      stop
      ;;
    restart)
      # The result of stop is deliberately ignored so that restart also works
      # when the service was not running.
      stop
      sleep 3
      start
      ;;
    status)
      status
      ;;
    logs)
      logs
      ;;
    config)
      config
      ;;
    test)
      test_api
      ;;
    test-client)
      test_client
      ;;
    help | -h | --help)
      usage
      ;;
    *)
      usage
      exit 1
      ;;
  esac