#!/bin/bash
# filepath: ocr_platform/ocr_tools/daemons/glmocr_vllm_daemon.sh
# Purpose: self-deployed GLM-OCR vLLM service for the SDK's self-hosted mode
#          (config: maas.enabled=false, ocr_api → this service)
# Keep transformers compatible with vllm. Install order matters: reinstall
# transformers after installing vllm.
#   uv pip install -U vllm --torch-backend=auto --extra-index-url https://wheels.vllm.ai/nightly
#   uv pip install -U transformers
# GLM-OCR vLLM service daemon script

LOGDIR="/home/ubuntu/zhch/logs"
mkdir -p "$LOGDIR"
PIDFILE="$LOGDIR/glmocr_vllm.pid"
LOGFILE="$LOGDIR/glmocr_vllm.log"

# Configuration
CONDA_ENV="mineru_2_7_1"
PORT="20036"
HOST="0.0.0.0"
# Local model directory (self-hosted mode; ocr_api.api_port in config-zhch.yaml must match PORT above)
MODEL_PATH="/home/ubuntu/models/modelscope_cache/models/ZhipuAI/GLM-OCR"
# Alternatively, a HuggingFace model id can be used: zai-org/GLM-OCR
SERVED_MODEL_NAME="glm-ocr"
ALLOWED_LOCAL_MEDIA_PATH="/"

# GPU configuration
GPU_MEMORY_UTILIZATION="0.7"
CUDA_VISIBLE_DEVICES="7"

# Optional: enable MTP speculative decoding to improve inference performance
ENABLE_MTP="0"
MTP_NUM_SPECULATIVE_TOKENS="1"

# Environment variables (uncomment as needed)
# export HF_HOME="/home/ubuntu/models/hf_home"
# export HF_ENDPOINT="https://hf-mirror.com"
# export MODELSCOPE_CACHE="/home/ubuntu/models/modelscope_cache"
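# ------------------------------------------------------------------------------
# Reference only — a sketch of the GLM-OCR self-hosted config this daemon pairs
# with. Only maas.enabled, ocr_api and ocr_api.api_port are referenced elsewhere
# in this script; every other key name below is an assumption — check the actual
# config-zhch.yaml shipped with the GLM-OCR SDK.
#
#   maas:
#     enabled: false      # do not call the hosted MaaS API
#   ocr_api:
#     api_port: 20036     # must match PORT above
#     # host / model-name keys (exact names unknown) should point at 127.0.0.1
#     # and the SERVED_MODEL_NAME ("glm-ocr") configured in this script
# ------------------------------------------------------------------------------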
# Initialize and activate the conda environment
if [ -f "/home/ubuntu/anaconda3/etc/profile.d/conda.sh" ]; then
    source /home/ubuntu/anaconda3/etc/profile.d/conda.sh
    conda activate "$CONDA_ENV"
elif [ -f "/opt/conda/etc/profile.d/conda.sh" ]; then
    source /opt/conda/etc/profile.d/conda.sh
    conda activate "$CONDA_ENV"
else
    echo "Warning: Using direct conda path activation"
    export PATH="/home/ubuntu/anaconda3/envs/$CONDA_ENV/bin:$PATH"
fi

start() {
    if [ -f "$PIDFILE" ] && kill -0 "$(cat "$PIDFILE")" 2>/dev/null; then
        echo "GLM-OCR vLLM is already running"
        return 1
    fi

    echo "Starting GLM-OCR vLLM daemon..."
    echo "Host: $HOST, Port: $PORT"
    echo "Model path: $MODEL_PATH"
    echo "Served model name: $SERVED_MODEL_NAME"
    echo "GPU memory utilization: $GPU_MEMORY_UTILIZATION"
    echo "CUDA devices: $CUDA_VISIBLE_DEVICES"

    # Check the model: a local path must be an existing directory;
    # a HuggingFace id (contains "/" but is not an absolute path) is not checked.
    if [[ "$MODEL_PATH" == /* ]]; then
        if [ ! -d "$MODEL_PATH" ]; then
            echo "❌ Model path not found: $MODEL_PATH"
            echo "Use a local path or HuggingFace id (e.g. zai-org/GLM-OCR). Edit MODEL_PATH in this script."
            return 1
        fi
    fi

    # Check Python / vLLM
    if ! command -v python >/dev/null 2>&1; then
        echo "❌ Python not found. Check conda environment activation."
        return 1
    fi
    if ! python -c "import vllm" 2>/dev/null; then
        echo "❌ vllm not found. Install: uv pip install -U vllm --torch-backend=auto --extra-index-url https://wheels.vllm.ai/nightly"
        return 1
    fi

    echo "🔧 Using Python: $(which python)"
    echo "🔧 vLLM: $(python -c 'import vllm; print(vllm.__file__)' 2>/dev/null || true)"

    echo "📊 GPU status check:"
    if command -v nvidia-smi >/dev/null 2>&1; then
        nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv,noheader,nounits | \
            awk -F',' '{printf "  GPU %s: %s - memory: %sMB/%sMB\n", $1, $2, $3, $4}'
    else
        echo "⚠️ nvidia-smi not available"
    fi

    # Build the vllm serve command
    if [ "$ENABLE_MTP" = "1" ]; then
        CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES TRANSFORMERS_USE_FAST=false nohup vllm serve "$MODEL_PATH" \
            --host "$HOST" \
            --port "$PORT" \
            --allowed-local-media-path "$ALLOWED_LOCAL_MEDIA_PATH" \
            --served-model-name "$SERVED_MODEL_NAME" \
            --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION" \
            --speculative-config "{\"method\": \"mtp\", \"num_speculative_tokens\": $MTP_NUM_SPECULATIVE_TOKENS}" \
            > "$LOGFILE" 2>&1 &
    else
        CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES TRANSFORMERS_USE_FAST=false nohup vllm serve "$MODEL_PATH" \
            --host "$HOST" \
            --port "$PORT" \
            --allowed-local-media-path "$ALLOWED_LOCAL_MEDIA_PATH" \
            --served-model-name "$SERVED_MODEL_NAME" \
            --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION" \
            > "$LOGFILE" 2>&1 &
    fi

    echo $! > "$PIDFILE"
    echo "✅ GLM-OCR vLLM started with PID: $(cat "$PIDFILE")"
    echo "📋 Log file: $LOGFILE"
    echo "🌐 Service URL: http://$HOST:$PORT"
    echo "📖 OpenAI-compatible API: http://localhost:$PORT/v1 (chat/completions, models)"
    echo ""
    echo "Waiting for service to start..."
    sleep 5
    status
}
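# Optional readiness wait (commented-out sketch). The fixed `sleep 5` in start()
# may not be enough while the model weights load; this helper polls the
# /v1/models endpoint until the server answers. The 120 × 2 s timeout is an
# assumption — tune it for your model size and hardware.
# wait_ready() {
#     for i in {1..120}; do
#         if curl -s --connect-timeout 2 "http://127.0.0.1:$PORT/v1/models" >/dev/null 2>&1; then
#             echo "✅ Service is ready"
#             return 0
#         fi
#         sleep 2
#     done
#     echo "⚠️ Service did not become ready in time; check $LOGFILE"
#     return 1
# }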
stop() {
    if [ ! -f "$PIDFILE" ]; then
        echo "GLM-OCR vLLM is not running"
        return 1
    fi

    PID=$(cat "$PIDFILE")
    echo "Stopping GLM-OCR vLLM (PID: $PID)..."
    kill "$PID"

    for i in {1..30}; do
        if ! kill -0 "$PID" 2>/dev/null; then
            break
        fi
        echo "Waiting for process to stop... ($i/30)"
        sleep 1
    done

    if kill -0 "$PID" 2>/dev/null; then
        echo "Force killing process..."
        kill -9 "$PID"
    fi

    rm -f "$PIDFILE"
    echo "✅ GLM-OCR vLLM stopped"
}

status() {
    if [ -f "$PIDFILE" ] && kill -0 "$(cat "$PIDFILE")" 2>/dev/null; then
        PID=$(cat "$PIDFILE")
        echo "✅ GLM-OCR vLLM is running (PID: $PID)"
        echo "🌐 Service URL: http://$HOST:$PORT"
        echo "📋 Log file: $LOGFILE"

        if command -v ss >/dev/null 2>&1; then
            if ss -tuln | grep -q ":$PORT "; then
                echo "🔗 Port $PORT is being listened on"
            else
                echo "⚠️ Port $PORT is not being listened on (service may be starting up)"
            fi
        elif command -v netstat >/dev/null 2>&1; then
            if netstat -tuln | grep -q ":$PORT "; then
                echo "🔗 Port $PORT is being listened on"
            else
                echo "⚠️ Port $PORT is not being listened on (service may be starting up)"
            fi
        fi

        if command -v curl >/dev/null 2>&1; then
            if curl -s --connect-timeout 2 http://127.0.0.1:$PORT/v1/models > /dev/null 2>&1; then
                echo "🎯 API is responding"
            else
                echo "⚠️ API not responding (service may be starting up)"
            fi
        fi

        if command -v nvidia-smi >/dev/null 2>&1; then
            echo "📊 GPU usage:"
            nvidia-smi --query-gpu=index,utilization.gpu,utilization.memory,memory.used,memory.total --format=csv,noheader,nounits | \
                awk -F',' '{printf "  GPU %s: GPU util %s%%, memory util %s%%, VRAM %sMB/%sMB\n", $1, $2, $3, $4, $5}'
        fi

        if [ -f "$LOGFILE" ]; then
            echo "📄 Latest logs (last 3 lines):"
            tail -3 "$LOGFILE" | sed 's/^/  /'
        fi
    else
        echo "❌ GLM-OCR vLLM is not running"
        if [ -f "$PIDFILE" ]; then
            echo "Removing stale PID file..."
            rm -f "$PIDFILE"
        fi
    fi
}

logs() {
    if [ -f "$LOGFILE" ]; then
        echo "📄 GLM-OCR vLLM logs:"
        echo "====================="
        tail -f "$LOGFILE"
    else
        echo "❌ Log file not found: $LOGFILE"
    fi
}

config() {
    echo "📋 Current configuration:"
    echo "  Conda Environment: $CONDA_ENV"
    echo "  Host: $HOST"
    echo "  Port: $PORT"
    echo "  Model Path: $MODEL_PATH"
    echo "  Served Model Name: $SERVED_MODEL_NAME"
    echo "  Allowed Local Media Path: $ALLOWED_LOCAL_MEDIA_PATH"
    echo "  GPU Memory Utilization: $GPU_MEMORY_UTILIZATION"
    echo "  CUDA Visible Devices: $CUDA_VISIBLE_DEVICES"
    echo "  Enable MTP: $ENABLE_MTP"
    echo "  PID File: $PIDFILE"
    echo "  Log File: $LOGFILE"

    if [ -d "$MODEL_PATH" ]; then
        echo "✅ Model path exists"
        echo "  Model files:"
        ls -la "$MODEL_PATH" | head -10 | sed 's/^/    /'
        if [ "$(ls -1 "$MODEL_PATH" 2>/dev/null | wc -l)" -gt 10 ]; then
            echo "    ... and more files"
        fi
    else
        echo "❌ Model path not found (use a HuggingFace id like zai-org/GLM-OCR by setting MODEL_PATH)"
    fi

    echo ""
    echo "🔧 Environment:"
    echo "  Python: $(which python 2>/dev/null || echo 'Not found')"
    echo "  vLLM: $(python -c 'import vllm; print(vllm.__file__)' 2>/dev/null || echo 'Not found')"
    echo "  Conda: $(which conda 2>/dev/null || echo 'Not found')"
    echo "  CUDA: $(which nvcc 2>/dev/null || echo 'Not found')"

    if command -v nvidia-smi >/dev/null 2>&1; then
        echo ""
        echo "🔥 GPU Information:"
        nvidia-smi --query-gpu=index,name,driver_version,memory.total --format=csv,noheader,nounits | \
            awk -F',' '{printf "  GPU %s: %s (Driver: %s, Memory: %sMB)\n", $1, $2, $3, $4}'
    fi
}

test_api() {
    echo "🧪 Testing GLM-OCR vLLM API..."
    if [ ! -f "$PIDFILE" ] || ! kill -0 "$(cat "$PIDFILE")" 2>/dev/null; then
        echo "❌ GLM-OCR vLLM service is not running"
        return 1
    fi
    if ! command -v curl >/dev/null 2>&1; then
        echo "❌ curl command not found"
        return 1
    fi

    echo "📡 Testing /v1/models endpoint..."
    response=$(curl -s --connect-timeout 10 http://127.0.0.1:$PORT/v1/models)
    if [ $? -eq 0 ]; then
        echo "✅ Models endpoint accessible"
        echo "$response" | python -m json.tool 2>/dev/null || echo "$response"
    else
        echo "❌ Models endpoint not accessible"
    fi
}
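# Reference only — what a raw OpenAI-compatible OCR request against this backend
# looks like. vLLM exposes /v1/chat/completions, and file:// image URLs are
# accepted because the server runs with --allowed-local-media-path. The prompt
# text below is a placeholder: the GLM-OCR SDK constructs its own prompts, so
# treat this as a connectivity check rather than the SDK's exact request.
#
#   curl -s http://127.0.0.1:20036/v1/chat/completions \
#     -H 'Content-Type: application/json' \
#     -d '{
#           "model": "glm-ocr",
#           "messages": [{
#             "role": "user",
#             "content": [
#               {"type": "image_url", "image_url": {"url": "file:///home/ubuntu/zhch/data/test/sample.png"}},
#               {"type": "text", "text": "Recognize the text in this image."}
#             ]
#           }]
#         }'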
test_client() {
    echo "🧪 Testing GLM-OCR SDK with vLLM server (self-hosted mode)..."
    if [ ! -f "$PIDFILE" ] || ! kill -0 "$(cat "$PIDFILE")" 2>/dev/null; then
        echo "❌ GLM-OCR vLLM service is not running. Start it first: $0 start"
        return 1
    fi

    TEST_IMAGE="/home/ubuntu/zhch/data/test/sample.png"
    if [ ! -f "$TEST_IMAGE" ]; then
        echo "⚠️ Test image not found: $TEST_IMAGE"
        echo "Update TEST_IMAGE in this script or create the file."
        return 1
    fi

    echo "📄 Test image: $TEST_IMAGE"
    echo "Run GLM-OCR with a config that has maas.enabled=false and ocr_api pointing to 127.0.0.1:$PORT"
    echo "Example: glmocr parse $TEST_IMAGE --config /path/to/config.yaml"
    echo ""
    echo "Or start the GLM-OCR Flask server (layout+OCR) that uses this vLLM backend:"
    echo "  glmocr server --config /path/to/config-zhch.yaml  # with maas.enabled=false, ocr_api.api_port=$PORT"
    echo "Then: curl -X POST http://localhost:5002/glmocr/parse -H 'Content-Type: application/json' -d '{\"images\": [\"file://$TEST_IMAGE\"]}'"
}

usage() {
    echo "GLM-OCR vLLM Service Daemon"
    echo "==========================="
    echo "Usage: $0 {start|stop|restart|status|logs|config|test|test-client}"
    echo ""
    echo "Commands:"
    echo "  start       - Start the GLM-OCR vLLM service"
    echo "  stop        - Stop the GLM-OCR vLLM service"
    echo "  restart     - Restart the GLM-OCR vLLM service"
    echo "  status      - Show service status and resource usage"
    echo "  logs        - Show service logs (follow mode)"
    echo "  config      - Show current configuration"
    echo "  test        - Test the /v1/models API endpoint"
    echo "  test-client - Show how to test the SDK/Flask server with this vLLM backend"
    echo ""
    echo "Configuration (edit script to modify):"
    echo "  Host: $HOST"
    echo "  Port: $PORT"
    echo "  Model Path: $MODEL_PATH"
    echo "  Served Model Name: $SERVED_MODEL_NAME"
    echo "  GPU Memory: $GPU_MEMORY_UTILIZATION"
    echo "  CUDA Devices: $CUDA_VISIBLE_DEVICES"
    echo "  Enable MTP: $ENABLE_MTP"
    echo ""
    echo "Examples:"
    echo "  ./glmocr_vllm_daemon.sh start"
    echo "  ./glmocr_vllm_daemon.sh status"
    echo "  ./glmocr_vllm_daemon.sh logs"
    echo "  ./glmocr_vllm_daemon.sh test"
}

case "$1" in
    start)
        start
        ;;
    stop)
        stop
        ;;
    restart)
        stop
        sleep 3
        start
        ;;
    status)
        status
        ;;
    logs)
        logs
        ;;
    config)
        config
        ;;
    test)
        test_api
        ;;
    test-client)
        test_client
        ;;
    *)
        usage
        exit 1
        ;;
esac