dotsocr_vllm_daemon.sh

#!/bin/bash
# filepath: ocr_platform/ocr_tools/daemons/dotsocr_vllm_daemon.sh
# Companion client tool: ocr_tools/dots.ocr_vl_tool/main.py
# Daemon script for the DotsOCR vLLM service

LOGDIR="/home/ubuntu/zhch/logs"
mkdir -p "$LOGDIR"
PIDFILE="$LOGDIR/vllm.pid"
LOGFILE="$LOGDIR/vllm.log"

# Configuration
CONDA_ENV="dots.ocr"
PORT="8101"
HOST="0.0.0.0"
HF_MODEL_PATH="/home/ubuntu/zhch/dots.ocr/weights/DotsOCR"
MODEL_NAME="DotsOCR"

# GPU configuration
GPU_MEMORY_UTILIZATION="0.70"
CUDA_VISIBLE_DEVICES="1,2"
DATA_PARALLEL_SIZE="2"  # 2 GPUs
MAX_MODEL_LEN="32768"
MAX_NUM_BATCHED_TOKENS="32768"
MAX_NUM_SEQS="16"
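# Note: DATA_PARALLEL_SIZE is intended to match the number of GPUs listed in
# CUDA_VISIBLE_DEVICES (one engine replica per GPU), and GPU_MEMORY_UTILIZATION
# is the fraction of each GPU's memory that vLLM is allowed to reserve.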

# Initialize and activate the conda environment
if [ -f "/home/ubuntu/anaconda3/etc/profile.d/conda.sh" ]; then
    source /home/ubuntu/anaconda3/etc/profile.d/conda.sh
    conda activate $CONDA_ENV
elif [ -f "/opt/conda/etc/profile.d/conda.sh" ]; then
    source /opt/conda/etc/profile.d/conda.sh
    conda activate $CONDA_ENV
else
    # Fallback: prepend the environment's bin directory to PATH directly
    echo "Warning: Using direct conda path activation"
    export PATH="/home/ubuntu/anaconda3/envs/$CONDA_ENV/bin:$PATH"
fi

# Environment variables
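# PYTHONPATH points at the parent of the weights directory so the DotsOCR
# folder is importable as a Python package (register_model below injects
# "from DotsOCR import modeling_dots_ocr_vllm" into the vllm entrypoint).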
export PYTHONPATH=$(dirname "$HF_MODEL_PATH"):$PYTHONPATH
export LD_LIBRARY_PATH="/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH"
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

# Register the DotsOCR model with vLLM
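# The stock vllm console script does not know the DotsOCR architecture, so this
# patches it in place: an import of the model's vLLM integration module is
# appended right after vllm's own entrypoint import, so the module is loaded
# (and can register the model) on every "vllm serve" invocation. This requires
# write permission on the vllm script.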
register_model() {
    echo "🔧 Registering the DotsOCR model with vLLM..."
    vllm_path=$(which vllm)
    if [ -z "$vllm_path" ]; then
        echo "❌ vLLM not found; check the installation and environment activation"
        return 1
    fi
    if ! grep -q "from DotsOCR import modeling_dots_ocr_vllm" "$vllm_path"; then
        # The continuation line is kept flush left so the injected import
        # carries no stray indentation.
        sed -i '/^from vllm\.entrypoints\.cli\.main import main$/a\
from DotsOCR import modeling_dots_ocr_vllm' "$vllm_path"
        echo "✅ DotsOCR model registered with vLLM"
    else
        echo "✅ DotsOCR model is already registered"
    fi
}

start() {
    if [ -f "$PIDFILE" ] && kill -0 $(cat "$PIDFILE") 2>/dev/null; then
        echo "vLLM DotsOCR is already running"
        return 1
    fi
    echo "Starting vLLM DotsOCR daemon..."
    echo "Host: $HOST, Port: $PORT"
    echo "Model path: $HF_MODEL_PATH"
    echo "GPU memory utilization: $GPU_MEMORY_UTILIZATION"
    echo "Data parallel size: $DATA_PARALLEL_SIZE"

    # Check that the model directory exists
    if [ ! -d "$HF_MODEL_PATH" ]; then
        echo "❌ Model path not found: $HF_MODEL_PATH"
        return 1
    fi

    # Check the conda environment
    if ! command -v python >/dev/null 2>&1; then
        echo "❌ Python not found. Check conda environment activation."
        return 1
    fi

    # Check the vllm command
    if ! command -v vllm >/dev/null 2>&1; then
        echo "❌ vLLM not found. Check installation and environment."
        return 1
    fi
    echo "🔧 Using Python: $(which python)"
    echo "🔧 Using vLLM: $(which vllm)"

    # Register the model
    register_model
    if [ $? -ne 0 ]; then
        echo "❌ Model registration failed"
        return 1
    fi

    # Show GPU status
    echo "📊 GPU status check:"
    if command -v nvidia-smi >/dev/null 2>&1; then
        nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv,noheader,nounits | \
            awk -F',' '{printf " GPU %s: %s - memory: %sMB/%sMB\n", $1, $2, $3, $4}'
    else
        echo "⚠️ nvidia-smi not available"
    fi
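
    # Key serve flags, roughly (see "vllm serve --help" for details):
    # --gpu-memory-utilization caps the fraction of each GPU's memory vLLM may
    # use; --max-model-len and --max-num-batched-tokens bound context length
    # and per-step batch size; --limit-mm-per-prompt '{"image": 1}' allows at
    # most one image per request; --served-model-name sets the model id that
    # clients must pass to the OpenAI-compatible API.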
    # Launch the vLLM server in the background
    CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES nohup vllm serve $HF_MODEL_PATH \
        --host $HOST \
        --port $PORT \
        --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
        --max-log-len 1000 \
        --trust-remote-code \
        --max-model-len $MAX_MODEL_LEN \
        --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
        --uvicorn-log-level info \
        --limit-mm-per-prompt '{"image": 1}' \
        --chat-template-content-format string \
        --data-parallel-size $DATA_PARALLEL_SIZE \
        --max-num-seqs $MAX_NUM_SEQS \
        --enable-prefix-caching \
        --served-model-name $MODEL_NAME \
        > "$LOGFILE" 2>&1 &
    echo $! > "$PIDFILE"
    echo "✅ vLLM DotsOCR started with PID: $(cat $PIDFILE)"
    echo "📋 Log file: $LOGFILE"
    echo "🌐 Service URL: http://$HOST:$PORT"
    echo "📖 API Documentation: http://localhost:$PORT/docs"
}
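
# Optional readiness check (not wired into the commands below): a caller could
# poll the OpenAI-compatible models endpoint until the server answers, e.g.
#   until curl -sf "http://127.0.0.1:$PORT/v1/models" >/dev/null; do sleep 2; done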

stop() {
    if [ ! -f "$PIDFILE" ]; then
        echo "vLLM DotsOCR is not running"
        return 1
    fi
    PID=$(cat "$PIDFILE")
    echo "Stopping vLLM DotsOCR (PID: $PID)..."

    # Graceful stop first
    kill $PID

    # Wait for the process to exit
    for i in {1..10}; do
        if ! kill -0 $PID 2>/dev/null; then
            break
        fi
        echo "Waiting for process to stop... ($i/10)"
        sleep 1
    done

    # Force kill if it is still running
    if kill -0 $PID 2>/dev/null; then
        echo "Force killing process..."
        kill -9 $PID
    fi
    rm -f "$PIDFILE"
    echo "✅ vLLM DotsOCR stopped"
}

status() {
    if [ -f "$PIDFILE" ] && kill -0 $(cat "$PIDFILE") 2>/dev/null; then
        PID=$(cat "$PIDFILE")
        echo "✅ vLLM DotsOCR is running (PID: $PID)"
        echo "🌐 Service URL: http://$HOST:$PORT"
        echo "📋 Log file: $LOGFILE"

        # Check whether the port is being listened on
        if command -v ss >/dev/null 2>&1; then
            if ss -tuln | grep -q ":$PORT "; then
                echo "🔗 Listening on port $PORT"
            else
                echo "⚠️ Not listening on port $PORT yet (service may be starting up)"
            fi
        elif command -v netstat >/dev/null 2>&1; then
            if netstat -tuln | grep -q ":$PORT "; then
                echo "🔗 Listening on port $PORT"
            else
                echo "⚠️ Not listening on port $PORT yet (service may be starting up)"
            fi
        fi

        # Check API responsiveness
        if command -v curl >/dev/null 2>&1; then
            if curl -s --connect-timeout 2 http://127.0.0.1:$PORT/v1/models > /dev/null 2>&1; then
                echo "🎯 API is responding"
            else
                echo "⚠️ API is not responding (service may be starting up)"
            fi
        fi

        # Show GPU usage
        if command -v nvidia-smi >/dev/null 2>&1; then
            echo "📊 GPU usage:"
            nvidia-smi --query-gpu=index,utilization.gpu,utilization.memory,memory.used,memory.total --format=csv,noheader,nounits | \
                awk -F',' '{printf " GPU %s: GPU util %s%%, memory util %s%%, VRAM %sMB/%sMB\n", $1, $2, $3, $4, $5}'
        fi

        # Show the latest log lines
        if [ -f "$LOGFILE" ]; then
            echo "📄 Latest logs (last 3 lines):"
            tail -3 "$LOGFILE" | sed 's/^/ /'
        fi
    else
        echo "❌ vLLM DotsOCR is not running"
        if [ -f "$PIDFILE" ]; then
            echo "Removing stale PID file..."
            rm -f "$PIDFILE"
        fi
    fi
}

logs() {
    if [ -f "$LOGFILE" ]; then
        echo "📄 vLLM DotsOCR logs:"
        echo "=================="
        tail -f "$LOGFILE"
    else
        echo "❌ Log file not found: $LOGFILE"
    fi
}

config() {
    echo "📋 Current configuration:"
    echo " Conda Environment: $CONDA_ENV"
    echo " Host: $HOST"
    echo " Port: $PORT"
    echo " Model Path: $HF_MODEL_PATH"
    echo " Model Name: $MODEL_NAME"
    echo " GPU Memory Utilization: $GPU_MEMORY_UTILIZATION"
    echo " Data Parallel Size: $DATA_PARALLEL_SIZE"
    echo " Max Model Length: $MAX_MODEL_LEN"
    echo " Max Num Seqs: $MAX_NUM_SEQS"
    echo " PID File: $PIDFILE"
    echo " Log File: $LOGFILE"
    if [ -d "$HF_MODEL_PATH" ]; then
        echo "✅ Model path exists"
        echo " Model files:"
        ls -la "$HF_MODEL_PATH" | head -10 | sed 's/^/ /'
        if [ $(ls -1 "$HF_MODEL_PATH" | wc -l) -gt 10 ]; then
            echo " ... and more files"
        fi
    else
        echo "❌ Model path not found"
    fi

    # Show environment info
    echo ""
    echo "🔧 Environment:"
    echo " Python: $(which python 2>/dev/null || echo 'Not found')"
    echo " vLLM: $(which vllm 2>/dev/null || echo 'Not found')"
    echo " Conda: $(which conda 2>/dev/null || echo 'Not found')"
    echo " CUDA: $(which nvcc 2>/dev/null || echo 'Not found')"

    # Show GPU info
    if command -v nvidia-smi >/dev/null 2>&1; then
        echo ""
        echo "🔥 GPU Information:"
        nvidia-smi --query-gpu=index,name,driver_version,memory.total --format=csv,noheader,nounits | \
            awk -F',' '{printf " GPU %s: %s (Driver: %s, Memory: %sMB)\n", $1, $2, $3, $4}'
    fi
}

test_api() {
    echo "🧪 Testing vLLM DotsOCR API..."
    if [ ! -f "$PIDFILE" ] || ! kill -0 $(cat "$PIDFILE") 2>/dev/null; then
        echo "❌ vLLM service is not running"
        return 1
    fi
    if ! command -v curl >/dev/null 2>&1; then
        echo "❌ curl command not found"
        return 1
    fi
    echo "📡 Testing /v1/models endpoint..."
    response=$(curl -s --connect-timeout 5 http://127.0.0.1:$PORT/v1/models)
    if [ $? -eq 0 ]; then
        echo "✅ Models endpoint accessible"
        echo "$response" | python -m json.tool 2>/dev/null || echo "$response"
    else
        echo "❌ Models endpoint not accessible"
    fi
}
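
# Illustrative only (not executed by this script): the actual prompt format is
# defined by the client tool (ocr_tools/dots.ocr_vl_tool/main.py), but a raw
# OpenAI-compatible multimodal request to this server would look roughly like:
#   curl -s http://127.0.0.1:8101/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{
#           "model": "DotsOCR",
#           "messages": [{"role": "user", "content": [
#             {"type": "image_url",
#              "image_url": {"url": "data:image/png;base64,<BASE64_IMAGE>"}},
#             {"type": "text", "text": "Extract the text in this image."}
#           ]}]
#         }'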

# Usage help
usage() {
    echo "vLLM DotsOCR Service Daemon"
    echo "============================"
    echo "Usage: $0 {start|stop|restart|status|logs|config|test}"
    echo ""
    echo "Commands:"
    echo " start   - Start the vLLM DotsOCR service"
    echo " stop    - Stop the vLLM DotsOCR service"
    echo " restart - Restart the vLLM DotsOCR service"
    echo " status  - Show service status and resource usage"
    echo " logs    - Show service logs (follow mode)"
    echo " config  - Show current configuration"
    echo " test    - Test API endpoints"
    echo ""
    echo "Configuration (edit script to modify):"
    echo " Host: $HOST"
    echo " Port: $PORT"
    echo " Model: $HF_MODEL_PATH"
    echo " GPU Memory: $GPU_MEMORY_UTILIZATION"
    echo " Parallel Size: $DATA_PARALLEL_SIZE"
    echo ""
    echo "Examples:"
    echo " ./dotsocr_vllm_daemon.sh start"
    echo " ./dotsocr_vllm_daemon.sh status"
    echo " ./dotsocr_vllm_daemon.sh logs"
    echo " ./dotsocr_vllm_daemon.sh test"
}

case "$1" in
    start)
        start
        ;;
    stop)
        stop
        ;;
    restart)
        stop
        sleep 3
        start
        ;;
    status)
        status
        ;;
    logs)
        logs
        ;;
    config)
        config
        ;;
    test)
        test_api
        ;;
    *)
        usage
        exit 1
        ;;
esac