@@ -14,6 +14,10 @@
# If there is only one GPU or the model is small
# --tensor-parallel-size 1
+
+# data_parallel_size=2, # data parallelism: each GPU serves requests independently
+# distributed_executor_backend="mp" # use the multiprocessing backend
+# enable_prefix_caching=True, # enable prefix caching (see the offline-API sketch after this hunk)
######################################################
# download model to /path/to/model
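The download step can also be scripted. A minimal sketch using huggingface_hub; the repo id below is an assumption, not taken from this script (substitute the actual DotsOCR checkpoint you use):

from huggingface_hub import snapshot_download

# NOTE: repo id is an assumption; adjust to the checkpoint you actually serve.
snapshot_download(repo_id="rednote-hilab/dots.ocr", local_dir="/path/to/model")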
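For reference, the commented-out kwargs in the first hunk map onto vLLM's offline API roughly as in this minimal sketch, assuming a recent vLLM release whose engine args accept data_parallel_size (offline data-parallel behavior may differ from the server path across versions):

from vllm import LLM, SamplingParams

llm = LLM(
    model="/path/to/model",             # same checkpoint the server script points at
    trust_remote_code=True,
    data_parallel_size=2,               # data parallelism: each GPU serves requests independently
    distributed_executor_backend="mp",  # multiprocessing backend
    enable_prefix_caching=True,         # enable prefix caching
)
outputs = llm.generate(["Hello"], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)

Design-wise, the diff replaces a single-GPU setup (--tensor-parallel-size 1) with two full model replicas (--data-parallel-size 2): each GPU serves requests independently, which raises request throughput for many small, independent OCR jobs.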
@@ -32,18 +36,22 @@ export PYTHONPATH=$(dirname "$hf_model_path"):$PYTHONPATH
# launch vllm server
model_name="DotsOCR"
# CUDA_VISIBLE_DEVICES=0 vllm serve ${hf_model_path} --tensor-parallel-size 1 --gpu-memory-utilization 0.95 --chat-template-content-format string --served-model-name ${model_name} --trust-remote-code
+# --tensor-parallel-size 1 \
vllm serve ${hf_model_path} \
--host 0.0.0.0 \
--port 8101 \
- --gpu-memory-utilization 0.95 \
+ --gpu-memory-utilization 0.90 \
--max-log-len 1000 \
--trust-remote-code \
--max-model-len 65536 \
--max-num-batched-tokens 65536 \
--uvicorn-log-level info \
--limit-mm-per-prompt image=10 \
- --tensor-parallel-size 1 \
--chat-template-content-format string \
+ --data-parallel-size 2 \
+ --max-num-seqs 32 \
+ --distributed-executor-backend mp \
+ --enable-prefix-caching \
--served-model-name ${model_name}
# # run the python demo after launching the vllm server
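As the final comment notes, the server can then be exercised from Python. A minimal client sketch against the OpenAI-compatible endpoint configured above (port 8101, served model name DotsOCR); the image URL and prompt are placeholders:

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8101/v1", api_key="EMPTY")  # vLLM does not check the key by default

response = client.chat.completions.create(
    model="DotsOCR",
    messages=[{
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": "https://example.com/page.png"}},  # placeholder
            {"type": "text", "text": "Extract all text from this image."},
        ],
    }],
)
print(response.choices[0].message.content)

Note that --chat-template-content-format string only controls how vLLM renders message content into the chat template; the client still sends the standard list-of-parts format shown here.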