Merged
8 changes: 2 additions & 6 deletions .buildkite/nightly-benchmarks/scripts/launch-server.sh
@@ -181,18 +181,14 @@ launch_vllm_server() {
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
server_command="vllm serve $model \
-tp $tp \
--model $model \
--port $port \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
server_command="vllm serve $model \
-tp $tp \
--model $model \
--port $port \
$server_args"
fi
@@ -365,8 +365,7 @@ run_serving_tests() {
continue
fi

server_command="$server_envs python3 \
-m vllm.entrypoints.openai.api_server \
server_command="$server_envs vllm serve \
$server_args"

# run the server
2 changes: 1 addition & 1 deletion .buildkite/scripts/run-benchmarks.sh
@@ -18,7 +18,7 @@ vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_
bench_throughput_exit_code=$?

# run server-based benchmarks and upload the result to buildkite
python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
vllm serve meta-llama/Llama-2-7b-chat-hf &
server_pid=$!
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

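Since the server is backgrounded here before the dataset download, benchmark scripts generally need a readiness wait before sending traffic. A minimal sketch, assuming the default port 8000 and vLLM's `/health` endpoint (neither is spelled out in this hunk):

```bash
# Poll the (assumed) health endpoint until the backgrounded server is ready.
timeout 600 bash -c 'until curl -sf http://localhost:8000/health; do sleep 1; done'
```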
8 changes: 2 additions & 6 deletions benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
@@ -55,19 +55,15 @@ benchmark() {
output_len=$2


CUDA_VISIBLE_DEVICES=0 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
CUDA_VISIBLE_DEVICES=0 vllm serve $model \
--port 8100 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &


CUDA_VISIBLE_DEVICES=1 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
CUDA_VISIBLE_DEVICES=1 vllm serve $model \
--port 8200 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
16 changes: 4 additions & 12 deletions benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
@@ -38,16 +38,12 @@ wait_for_server() {
launch_chunked_prefill() {
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
# disagg prefill
CUDA_VISIBLE_DEVICES=0 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
CUDA_VISIBLE_DEVICES=0 vllm serve $model \
--port 8100 \
--max-model-len 10000 \
--enable-chunked-prefill \
--gpu-memory-utilization 0.6 &
CUDA_VISIBLE_DEVICES=1 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
CUDA_VISIBLE_DEVICES=1 vllm serve $model \
--port 8200 \
--max-model-len 10000 \
--enable-chunked-prefill \
@@ -62,18 +58,14 @@ launch_chunked_prefill() {
launch_disagg_prefill() {
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
# disagg prefill
CUDA_VISIBLE_DEVICES=0 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
CUDA_VISIBLE_DEVICES=0 vllm serve $model \
--port 8100 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

CUDA_VISIBLE_DEVICES=1 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
CUDA_VISIBLE_DEVICES=1 vllm serve $model \
--port 8200 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
2 changes: 1 addition & 1 deletion docker/Dockerfile
@@ -565,5 +565,5 @@ ENTRYPOINT ["./sagemaker-entrypoint.sh"]

FROM vllm-openai-base AS vllm-openai

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
ENTRYPOINT ["vllm", "serve"]
#################### OPENAI API SERVER ####################
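With `ENTRYPOINT ["vllm", "serve"]`, whatever follows the image name on `docker run` is appended to `vllm serve`, so the model is now passed positionally. A minimal usage sketch, assuming the published `vllm/vllm-openai` image tag:

```bash
# Arguments after the image name become `vllm serve <model> <flags...>` inside the container.
docker run --gpus all -p 8000:8000 \
    -e HUGGING_FACE_HUB_TOKEN=<token> \
    vllm/vllm-openai:latest \
    meta-llama/Llama-2-7b-chat-hf --max-model-len 4096
```

Existing deployments that still pass `--model <model>` keep working through the back-compatibility handling added to `vllm/utils/__init__.py` in this PR.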
2 changes: 1 addition & 1 deletion docker/Dockerfile.cpu
@@ -177,4 +177,4 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
uv pip install dist/*.whl

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
ENTRYPOINT ["vllm", "serve"]
2 changes: 1 addition & 1 deletion docker/Dockerfile.ppc64le
@@ -314,4 +314,4 @@ WORKDIR /workspace/

RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks

ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]
ENTRYPOINT ["vllm", "serve"]
2 changes: 1 addition & 1 deletion docker/Dockerfile.s390x
@@ -309,4 +309,4 @@ USER 2000
WORKDIR /home/vllm

# Set the default entrypoint
ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]
ENTRYPOINT ["vllm", "serve"]
2 changes: 1 addition & 1 deletion docker/Dockerfile.xpu
@@ -69,4 +69,4 @@ RUN --mount=type=cache,target=/root/.cache/pip \

# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
ENTRYPOINT ["vllm", "serve"]
6 changes: 2 additions & 4 deletions docs/contributing/benchmarks.md
@@ -661,8 +661,7 @@ Benchmark the performance of multi-modal requests in vLLM.
Start vLLM:

```bash
python -m vllm.entrypoints.openai.api_server \
--model Qwen/Qwen2.5-VL-7B-Instruct \
vllm serve Qwen/Qwen2.5-VL-7B-Instruct \
--dtype bfloat16 \
--limit-mm-per-prompt '{"image": 1}' \
--allowed-local-media-path /path/to/sharegpt4v/images
@@ -688,8 +687,7 @@ vllm bench serve \
Start vLLM:

```bash
python -m vllm.entrypoints.openai.api_server \
--model Qwen/Qwen2.5-VL-7B-Instruct \
vllm serve Qwen/Qwen2.5-VL-7B-Instruct \
--dtype bfloat16 \
--limit-mm-per-prompt '{"video": 1}' \
--allowed-local-media-path /path/to/sharegpt4video/videos
3 changes: 1 addition & 2 deletions docs/contributing/profiling.md
@@ -39,8 +39,7 @@ Refer to <gh-file:examples/offline_inference/simple_profiling.py> for an example

```bash
VLLM_TORCH_PROFILER_DIR=./vllm_profile \
python -m vllm.entrypoints.openai.api_server \
--model meta-llama/Meta-Llama-3-70B
vllm serve meta-llama/Meta-Llama-3-70B
```

vllm bench command:
3 changes: 1 addition & 2 deletions docs/deployment/frameworks/autogen.md
@@ -19,8 +19,7 @@ pip install -U "autogen-agentchat" "autogen-ext[openai]"
1. Start the vLLM server with the supported chat completion model, e.g.

```bash
python -m vllm.entrypoints.openai.api_server \
--model mistralai/Mistral-7B-Instruct-v0.2
vllm serve mistralai/Mistral-7B-Instruct-v0.2
```

1. Call it with AutoGen:
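Before wiring up AutoGen, the server started above can be sanity-checked with any OpenAI-compatible client; the curl below is a generic sketch against the default port, not taken from the AutoGen example itself:

```bash
curl http://localhost:8000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "mistralai/Mistral-7B-Instruct-v0.2",
        "messages": [{"role": "user", "content": "Hello!"}]
    }'
```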
2 changes: 1 addition & 1 deletion docs/deployment/frameworks/open-webui.md
@@ -20,7 +20,7 @@ To get started with Open WebUI using vLLM, follow these steps:
For example:

```console
python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000
vllm serve <model> --host 0.0.0.0 --port 8000
```

3. Start the Open WebUI Docker container:
12 changes: 6 additions & 6 deletions docs/deployment/frameworks/skypilot.md
@@ -32,6 +32,7 @@ See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypil
ports: 8081 # Expose to internet traffic.

envs:
PYTHONUNBUFFERED: 1
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.

Expand All @@ -47,9 +48,8 @@ See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypil
run: |
conda activate vllm
echo 'Starting vllm api server...'
python -u -m vllm.entrypoints.openai.api_server \
vllm serve $MODEL_NAME \
--port 8081 \
--model $MODEL_NAME \
--trust-remote-code \
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
2>&1 | tee api_server.log &
@@ -131,6 +131,7 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut
ports: 8081 # Expose to internet traffic.

envs:
PYTHONUNBUFFERED: 1
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.

Expand All @@ -146,9 +147,8 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut
run: |
conda activate vllm
echo 'Starting vllm api server...'
python -u -m vllm.entrypoints.openai.api_server \
vllm serve $MODEL_NAME \
--port 8081 \
--model $MODEL_NAME \
--trust-remote-code \
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
2>&1 | tee api_server.log
@@ -243,6 +243,7 @@ This will scale the service up to when the QPS exceeds 2 for each replica.
ports: 8081 # Expose to internet traffic.

envs:
PYTHONUNBUFFERED: 1
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.

Expand All @@ -258,9 +259,8 @@ This will scale the service up to when the QPS exceeds 2 for each replica.
run: |
conda activate vllm
echo 'Starting vllm api server...'
python -u -m vllm.entrypoints.openai.api_server \
vllm serve $MODEL_NAME \
--port 8081 \
--model $MODEL_NAME \
--trust-remote-code \
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
2>&1 | tee api_server.log
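A plausible way to deploy the YAML above (the exact SkyPilot commands are assumptions, not shown in this diff), passing the Hugging Face token via `--env` as the comment suggests:

```bash
# Single instance:
sky launch serving.yaml --env HF_TOKEN=<your-huggingface-token>

# Autoscaled SkyServe deployment:
sky serve up serving.yaml --env HF_TOKEN=<your-huggingface-token>
```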
5 changes: 5 additions & 0 deletions docs/design/arch_overview.md
@@ -69,6 +69,11 @@ Sometimes you may see the API server entrypoint used directly instead of via the
python -m vllm.entrypoints.openai.api_server --model <model>
```

!!! warning

`python -m vllm.entrypoints.openai.api_server` is deprecated
and may become unsupported in a future release.

That code can be found in <gh-file:vllm/entrypoints/openai/api_server.py>.

More details on the API server can be found in the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) document.
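For reference, the deprecated module invocation and its recommended replacement map one-to-one, with the model moving from a flag to a positional argument:

```bash
# Deprecated entrypoint:
python -m vllm.entrypoints.openai.api_server --model <model>

# Recommended equivalent:
vllm serve <model>
```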
3 changes: 1 addition & 2 deletions docs/features/sleep_mode.md
@@ -64,8 +64,7 @@ To enable sleep mode in a vLLM server you need to initialize it with the flag `V
When using the flag `VLLM_SERVER_DEV_MODE=1` you enable development endpoints, and these endpoints should not be exposed to users.

```bash
VLLM_SERVER_DEV_MODE=1 python -m vllm.entrypoints.openai.api_server \
--model Qwen/Qwen3-0.6B \
VLLM_SERVER_DEV_MODE=1 vllm serve Qwen/Qwen3-0.6B \
--enable-sleep-mode \
--port 8000
```
3 changes: 1 addition & 2 deletions docs/features/spec_decode.md
@@ -48,10 +48,9 @@ The following code configures vLLM in an offline mode to use speculative decodin
To perform the same with an online mode launch the server:

```bash
python -m vllm.entrypoints.openai.api_server \
vllm serve facebook/opt-6.7b \
--host 0.0.0.0 \
--port 8000 \
--model facebook/opt-6.7b \
--seed 42 \
-tp 1 \
--gpu_memory_utilization 0.8 \
3 changes: 1 addition & 2 deletions docs/getting_started/installation/gpu/xpu.inc.md
@@ -67,8 +67,7 @@ docker run -it \
XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. For **pipeline parallel**, we support it on single node with mp as the backend. For example, a reference execution like following:

```bash
python -m vllm.entrypoints.openai.api_server \
--model=facebook/opt-13b \
vllm serve facebook/opt-13b \
--dtype=bfloat16 \
--max_model_len=1024 \
--distributed-executor-backend=mp \
2 changes: 1 addition & 1 deletion examples/online_serving/sagemaker-entrypoint.sh
@@ -21,4 +21,4 @@ while IFS='=' read -r key value; do
done < <(env | grep "^${PREFIX}")

# Pass the collected arguments to the main entrypoint
exec python3 -m vllm.entrypoints.openai.api_server "${ARGS[@]}"
exec vllm serve "${ARGS[@]}"
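For illustration, a hypothetical invocation of this entrypoint, assuming the script's `PREFIX` is something like `SM_VLLM_` and that env keys are lowercased with underscores turned into dashes (neither detail is visible in this hunk):

```bash
# Hypothetical: SM_VLLM_* env vars become CLI flags appended to `vllm serve`.
SM_VLLM_MODEL=meta-llama/Llama-2-7b-chat-hf \
SM_VLLM_MAX_MODEL_LEN=4096 \
./sagemaker-entrypoint.sh
# effectively execs:
#   vllm serve --model meta-llama/Llama-2-7b-chat-hf --max-model-len 4096
# which would rely on the `--model` back-compatibility handling added in vllm/utils/__init__.py below.
```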
44 changes: 37 additions & 7 deletions tests/utils_/test_utils.py
@@ -786,13 +786,43 @@ def test_model_specification(parser_with_config, cli_config_file,
parser_with_config.parse_args(['serve', '--config', cli_config_file])

# Test using --model option raises error
with pytest.raises(
ValueError,
match=
("With `vllm serve`, you should provide the model as a positional "
"argument or in a config file instead of via the `--model` option."),
):
parser_with_config.parse_args(['serve', '--model', 'my-model'])
# with pytest.raises(
# ValueError,
# match=
# ("With `vllm serve`, you should provide the model as a positional "
# "argument or in a config file instead of via the `--model` option."),
# ):
# parser_with_config.parse_args(['serve', '--model', 'my-model'])

# Test using --model option back-compatibility
# (when back-compatibility ends, the above test should be uncommented
# and the below test should be removed)
args = parser_with_config.parse_args([
'serve',
'--tensor-parallel-size',
'2',
'--model',
'my-model',
'--trust-remote-code',
'--port',
'8001',
])
assert args.model is None
assert args.tensor_parallel_size == 2
assert args.trust_remote_code is True
assert args.port == 8001

args = parser_with_config.parse_args([
'serve',
'--tensor-parallel-size=2',
'--model=my-model',
'--trust-remote-code',
'--port=8001',
])
assert args.model is None
assert args.tensor_parallel_size == 2
assert args.trust_remote_code is True
assert args.port == 8001

# Test other config values are preserved
args = parser_with_config.parse_args([
34 changes: 29 additions & 5 deletions vllm/utils/__init__.py
@@ -1855,13 +1855,37 @@ def parse_args( # type: ignore[override]

# Check for --model in command line arguments first
if args and args[0] == "serve":
model_in_cli_args = any(arg == '--model' for arg in args)

if model_in_cli_args:
raise ValueError(
try:
model_idx = next(
i for i, arg in enumerate(args)
if arg == "--model" or arg.startswith("--model="))
logger.warning(
"With `vllm serve`, you should provide the model as a "
"positional argument or in a config file instead of via "
"the `--model` option.")
"the `--model` option. "
"The `--model` option will be removed in v0.13.")
Comment on lines 1862 to +1866
Reviewer (Member): Is there any harm in allowing both indefinitely?
Author (Member): I am not sure who removed the ability to use --model in the first place. Maybe @mgoin ?


if args[model_idx] == "--model":
model_tag = args[model_idx + 1]
rest_start_idx = model_idx + 2
else:
model_tag = args[model_idx].removeprefix("--model=")
rest_start_idx = model_idx + 1

# Move <model> to the front, e.g.:
# [Before]
# vllm serve -tp 2 --model <model> --enforce-eager --port 8001
# [After]
# vllm serve <model> -tp 2 --enforce-eager --port 8001
args = [
"serve",
model_tag,
*args[1:model_idx],
*args[rest_start_idx:],
]
print("args", args)
except StopIteration:
pass

if '--config' in args:
args = self._pull_args_from_config(args)
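Taken together with the comments in the hunk above, the observable effect of this change is a CLI-level rewrite: a legacy `--model` flag (in either `--model <model>` or `--model=<model>` form) is hoisted into the positional slot before parsing, accompanied by a deprecation warning:

```bash
# What the user types (still accepted, with a deprecation warning):
vllm serve -tp 2 --model my-model --enforce-eager --port 8001

# What the parser effectively sees after the reordering:
vllm serve my-model -tp 2 --enforce-eager --port 8001
```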