diff --git a/.buildkite/nightly-benchmarks/scripts/launch-server.sh b/.buildkite/nightly-benchmarks/scripts/launch-server.sh
index fb5063db8694..ebacdcbd6821 100644
--- a/.buildkite/nightly-benchmarks/scripts/launch-server.sh
+++ b/.buildkite/nightly-benchmarks/scripts/launch-server.sh
@@ -181,18 +181,14 @@ launch_vllm_server() {
   if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
     echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
     model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
-    server_command="python3 \
-        -m vllm.entrypoints.openai.api_server \
+    server_command="vllm serve $model \
         -tp $tp \
-        --model $model \
         --port $port \
         $server_args"
   else
     echo "Key 'fp8' does not exist in common params."
-    server_command="python3 \
-        -m vllm.entrypoints.openai.api_server \
+    server_command="vllm serve $model \
         -tp $tp \
-        --model $model \
         --port $port \
         $server_args"
   fi
diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
index b1b7d2d77a44..34effbb6073d 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@@ -365,8 +365,7 @@ run_serving_tests() {
       continue
     fi
 
-    server_command="$server_envs python3 \
-        -m vllm.entrypoints.openai.api_server \
+    server_command="$server_envs vllm serve \
         $server_args"
 
     # run the server
diff --git a/.buildkite/scripts/run-benchmarks.sh b/.buildkite/scripts/run-benchmarks.sh
index 72812218cb66..51536b36b808 100644
--- a/.buildkite/scripts/run-benchmarks.sh
+++ b/.buildkite/scripts/run-benchmarks.sh
@@ -18,7 +18,7 @@ vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_
 bench_throughput_exit_code=$?
 
 # run server-based benchmarks and upload the result to buildkite
-python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
+vllm serve meta-llama/Llama-2-7b-chat-hf &
 server_pid=$!
 wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
diff --git a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
index 2c72941cf7e5..d683835db96a 100644
--- a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
@@ -55,9 +55,7 @@ benchmark() {
   output_len=$2
 
-  CUDA_VISIBLE_DEVICES=0 python3 \
-    -m vllm.entrypoints.openai.api_server \
-    --model $model \
+  CUDA_VISIBLE_DEVICES=0 vllm serve $model \
     --port 8100 \
     --max-model-len 10000 \
     --gpu-memory-utilization 0.6 \
@@ -65,9 +63,7 @@ benchmark() {
     '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
 
-  CUDA_VISIBLE_DEVICES=1 python3 \
-    -m vllm.entrypoints.openai.api_server \
-    --model $model \
+  CUDA_VISIBLE_DEVICES=1 vllm serve $model \
    --port 8200 \
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
diff --git a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
index 0bbf7cd2b1c8..35c86cc84522 100644
--- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
@@ -38,16 +38,12 @@ wait_for_server() {
 launch_chunked_prefill() {
   model="meta-llama/Meta-Llama-3.1-8B-Instruct"
   # disagg prefill
-  CUDA_VISIBLE_DEVICES=0 python3 \
-    -m vllm.entrypoints.openai.api_server \
-    --model $model \
+  CUDA_VISIBLE_DEVICES=0 vllm serve $model \
     --port 8100 \
     --max-model-len 10000 \
     --enable-chunked-prefill \
     --gpu-memory-utilization 0.6 &
-  CUDA_VISIBLE_DEVICES=1 python3 \
-    -m vllm.entrypoints.openai.api_server \
-    --model $model \
+  CUDA_VISIBLE_DEVICES=1 vllm serve $model \
    --port 8200 \
    --max-model-len 10000 \
    --enable-chunked-prefill \
@@ -62,18 +58,14 @@ launch_chunked_prefill() {
 launch_disagg_prefill() {
   model="meta-llama/Meta-Llama-3.1-8B-Instruct"
   # disagg prefill
-  CUDA_VISIBLE_DEVICES=0 python3 \
-    -m vllm.entrypoints.openai.api_server \
-    --model $model \
+  CUDA_VISIBLE_DEVICES=0 vllm serve $model \
     --port 8100 \
     --max-model-len 10000 \
     --gpu-memory-utilization 0.6 \
     --kv-transfer-config \
     '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
-  CUDA_VISIBLE_DEVICES=1 python3 \
-    -m vllm.entrypoints.openai.api_server \
-    --model $model \
+  CUDA_VISIBLE_DEVICES=1 vllm serve $model \
    --port 8200 \
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
diff --git a/docker/Dockerfile b/docker/Dockerfile
index ccc1b024316a..56bbc3d5f1a3 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -565,5 +565,5 @@ ENTRYPOINT ["./sagemaker-entrypoint.sh"]
 
 FROM vllm-openai-base AS vllm-openai
 
-ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+ENTRYPOINT ["vllm", "serve"]
 #################### OPENAI API SERVER ####################
diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu
index 388596efd21c..2ed02ff9e3ac 100644
--- a/docker/Dockerfile.cpu
+++ b/docker/Dockerfile.cpu
@@ -177,4 +177,4 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
     uv pip install dist/*.whl
 
-ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+ENTRYPOINT ["vllm", "serve"]
diff --git a/docker/Dockerfile.ppc64le b/docker/Dockerfile.ppc64le
index aaff240388f2..5eaef4ea980d 100644
--- a/docker/Dockerfile.ppc64le
+++ b/docker/Dockerfile.ppc64le
@@ -314,4 +314,4 @@ WORKDIR /workspace/
 
 RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
 
-ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]
+ENTRYPOINT ["vllm", "serve"]
diff --git a/docker/Dockerfile.s390x b/docker/Dockerfile.s390x
index 9942b7626f81..7fd7598b8bd9 100644
--- a/docker/Dockerfile.s390x
+++ b/docker/Dockerfile.s390x
@@ -309,4 +309,4 @@ USER 2000
 WORKDIR /home/vllm
 
 # Set the default entrypoint
-ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]
+ENTRYPOINT ["vllm", "serve"]
diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu
index ef422352509a..ffc3abd38965 100644
--- a/docker/Dockerfile.xpu
+++ b/docker/Dockerfile.xpu
@@ -69,4 +69,4 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 # install development dependencies (for testing)
 RUN python3 -m pip install -e tests/vllm_test_utils
 
-ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+ENTRYPOINT ["vllm", "serve"]
diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md
index cf14770c01a6..d53a680c9182 100644
--- a/docs/contributing/benchmarks.md
+++ b/docs/contributing/benchmarks.md
@@ -661,8 +661,7 @@ Benchmark the performance of multi-modal requests in vLLM.
 Start vLLM:
 
 ```bash
-python -m vllm.entrypoints.openai.api_server \
-    --model Qwen/Qwen2.5-VL-7B-Instruct \
+vllm serve Qwen/Qwen2.5-VL-7B-Instruct \
     --dtype bfloat16 \
     --limit-mm-per-prompt '{"image": 1}' \
     --allowed-local-media-path /path/to/sharegpt4v/images
@@ -688,8 +687,7 @@ vllm bench serve \
 Start vLLM:
 
 ```bash
-python -m vllm.entrypoints.openai.api_server \
-    --model Qwen/Qwen2.5-VL-7B-Instruct \
+vllm serve Qwen/Qwen2.5-VL-7B-Instruct \
     --dtype bfloat16 \
     --limit-mm-per-prompt '{"video": 1}' \
     --allowed-local-media-path /path/to/sharegpt4video/videos
diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md
index b62560a58748..f6a73e99546e 100644
--- a/docs/contributing/profiling.md
+++ b/docs/contributing/profiling.md
@@ -39,8 +39,7 @@ Refer to for an example
 
 ```bash
 VLLM_TORCH_PROFILER_DIR=./vllm_profile \
-    python -m vllm.entrypoints.openai.api_server \
-    --model meta-llama/Meta-Llama-3-70B
+    vllm serve meta-llama/Meta-Llama-3-70B
 ```
 
 vllm bench command:
diff --git a/docs/deployment/frameworks/autogen.md b/docs/deployment/frameworks/autogen.md
index 7517ee771c09..5790087ed5c2 100644
--- a/docs/deployment/frameworks/autogen.md
+++ b/docs/deployment/frameworks/autogen.md
@@ -19,8 +19,7 @@ pip install -U "autogen-agentchat" "autogen-ext[openai]"
 1. Start the vLLM server with the supported chat completion model, e.g.
 
    ```bash
-   python -m vllm.entrypoints.openai.api_server \
-       --model mistralai/Mistral-7B-Instruct-v0.2
+   vllm serve mistralai/Mistral-7B-Instruct-v0.2
    ```
 
 1. Call it with AutoGen:
diff --git a/docs/deployment/frameworks/open-webui.md b/docs/deployment/frameworks/open-webui.md
index eaa51bb61328..505c129613de 100644
--- a/docs/deployment/frameworks/open-webui.md
+++ b/docs/deployment/frameworks/open-webui.md
@@ -20,7 +20,7 @@ To get started with Open WebUI using vLLM, follow these steps:
    For example:
 
    ```console
-   python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000
+   vllm serve --host 0.0.0.0 --port 8000
    ```
 
3. Start the Open WebUI Docker container:
diff --git a/docs/deployment/frameworks/skypilot.md b/docs/deployment/frameworks/skypilot.md
index 06e2fed38f05..f4a984a6433e 100644
--- a/docs/deployment/frameworks/skypilot.md
+++ b/docs/deployment/frameworks/skypilot.md
@@ -32,6 +32,7 @@ See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypil
 ports: 8081  # Expose to internet traffic.
 
 envs:
+  PYTHONUNBUFFERED: 1
   MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
   HF_TOKEN: # Change to your own huggingface token, or use --env to pass.
 
@@ -47,9 +48,8 @@ See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypil
 run: |
   conda activate vllm
   echo 'Starting vllm api server...'
-  python -u -m vllm.entrypoints.openai.api_server \
+  vllm serve $MODEL_NAME \
     --port 8081 \
-    --model $MODEL_NAME \
     --trust-remote-code \
     --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
     2>&1 | tee api_server.log &
@@ -131,6 +131,7 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut
 ports: 8081  # Expose to internet traffic.
 
 envs:
+  PYTHONUNBUFFERED: 1
   MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
   HF_TOKEN: # Change to your own huggingface token, or use --env to pass.
 
@@ -146,9 +147,8 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut
 run: |
   conda activate vllm
   echo 'Starting vllm api server...'
-  python -u -m vllm.entrypoints.openai.api_server \
+  vllm serve $MODEL_NAME \
     --port 8081 \
-    --model $MODEL_NAME \
     --trust-remote-code \
     --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
     2>&1 | tee api_server.log
@@ -243,6 +243,7 @@ This will scale the service up to when the QPS exceeds 2 for each replica.
 ports: 8081  # Expose to internet traffic.
 
 envs:
+  PYTHONUNBUFFERED: 1
   MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
   HF_TOKEN: # Change to your own huggingface token, or use --env to pass.
 
@@ -258,9 +259,8 @@ This will scale the service up to when the QPS exceeds 2 for each replica.
 run: |
   conda activate vllm
   echo 'Starting vllm api server...'
-  python -u -m vllm.entrypoints.openai.api_server \
+  vllm serve $MODEL_NAME \
     --port 8081 \
-    --model $MODEL_NAME \
     --trust-remote-code \
     --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
     2>&1 | tee api_server.log
diff --git a/docs/design/arch_overview.md b/docs/design/arch_overview.md
index 6b7086776025..f1300a73c26c 100644
--- a/docs/design/arch_overview.md
+++ b/docs/design/arch_overview.md
@@ -69,6 +69,11 @@ Sometimes you may see the API server entrypoint used directly instead of via the
 python -m vllm.entrypoints.openai.api_server --model <model>
 ```
 
+!!! warning
+
+    `python -m vllm.entrypoints.openai.api_server` is deprecated
+    and may become unsupported in a future release.
+
 That code can be found in .
 
 More details on the API server can be found in the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) document.
diff --git a/docs/features/sleep_mode.md b/docs/features/sleep_mode.md
index 5749b02d26f4..e7dd9fee12d3 100644
--- a/docs/features/sleep_mode.md
+++ b/docs/features/sleep_mode.md
@@ -64,8 +64,7 @@ To enable sleep mode in a vLLM server you need to initialize it with the flag `V
 When using the flag `VLLM_SERVER_DEV_MODE=1` you enable development endpoints, and these endpoints should not be exposed to users.
 ```bash
-VLLM_SERVER_DEV_MODE=1 python -m vllm.entrypoints.openai.api_server \
-    --model Qwen/Qwen3-0.6B \
+VLLM_SERVER_DEV_MODE=1 vllm serve Qwen/Qwen3-0.6B \
     --enable-sleep-mode \
     --port 8000
 ```
diff --git a/docs/features/spec_decode.md b/docs/features/spec_decode.md
index 597a8e864427..25c308a6ff20 100644
--- a/docs/features/spec_decode.md
+++ b/docs/features/spec_decode.md
@@ -48,10 +48,9 @@ The following code configures vLLM in an offline mode to use speculative decodin
 To perform the same with an online mode launch the server:
 
 ```bash
-python -m vllm.entrypoints.openai.api_server \
+vllm serve facebook/opt-6.7b \
     --host 0.0.0.0 \
     --port 8000 \
-    --model facebook/opt-6.7b \
     --seed 42 \
     -tp 1 \
     --gpu_memory_utilization 0.8 \
diff --git a/docs/getting_started/installation/gpu/xpu.inc.md b/docs/getting_started/installation/gpu/xpu.inc.md
index ed1dc0418cf7..2e73ac182569 100644
--- a/docs/getting_started/installation/gpu/xpu.inc.md
+++ b/docs/getting_started/installation/gpu/xpu.inc.md
@@ -67,8 +67,7 @@ docker run -it \
 XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. For **pipeline parallel**, we support it on single node with mp as the backend. For example, a reference execution like following:
 
 ```bash
-python -m vllm.entrypoints.openai.api_server \
-    --model=facebook/opt-13b \
+vllm serve facebook/opt-13b \
     --dtype=bfloat16 \
     --max_model_len=1024 \
     --distributed-executor-backend=mp \
diff --git a/examples/online_serving/sagemaker-entrypoint.sh b/examples/online_serving/sagemaker-entrypoint.sh
index 75a99ffc1f15..1a6b6780ef2a 100644
--- a/examples/online_serving/sagemaker-entrypoint.sh
+++ b/examples/online_serving/sagemaker-entrypoint.sh
@@ -21,4 +21,4 @@ while IFS='=' read -r key value; do
 done < <(env | grep "^${PREFIX}")
 
 # Pass the collected arguments to the main entrypoint
-exec python3 -m vllm.entrypoints.openai.api_server "${ARGS[@]}"
\ No newline at end of file
+exec vllm serve "${ARGS[@]}"
\ No newline at end of file
diff --git a/tests/utils_/test_utils.py b/tests/utils_/test_utils.py
index 658ae7e7451a..bdd92cc8e35e 100644
--- a/tests/utils_/test_utils.py
+++ b/tests/utils_/test_utils.py
@@ -786,13 +786,43 @@ def test_model_specification(parser_with_config, cli_config_file,
     parser_with_config.parse_args(['serve', '--config', cli_config_file])
 
     # Test using --model option raises error
-    with pytest.raises(
-            ValueError,
-            match=
-        ("With `vllm serve`, you should provide the model as a positional "
-         "argument or in a config file instead of via the `--model` option."),
-    ):
-        parser_with_config.parse_args(['serve', '--model', 'my-model'])
+    # with pytest.raises(
+    #         ValueError,
+    #         match=
+    #     ("With `vllm serve`, you should provide the model as a positional "
+    #      "argument or in a config file instead of via the `--model` option."),
+    # ):
+    #     parser_with_config.parse_args(['serve', '--model', 'my-model'])
+
+    # Test using --model option back-compatibility
+    # (when back-compatibility ends, the above test should be uncommented
+    # and the below test should be removed)
+    args = parser_with_config.parse_args([
+        'serve',
+        '--tensor-parallel-size',
+        '2',
+        '--model',
+        'my-model',
+        '--trust-remote-code',
+        '--port',
+        '8001',
+    ])
+    assert args.model is None
+    assert args.tensor_parallel_size == 2
+    assert args.trust_remote_code is True
+    assert args.port == 8001
+
+    args = parser_with_config.parse_args([
+        'serve',
+        '--tensor-parallel-size=2',
+        '--model=my-model',
+        '--trust-remote-code',
+        '--port=8001',
+    ])
+    assert args.model is None
+    assert args.tensor_parallel_size == 2
+    assert args.trust_remote_code is True
+    assert args.port == 8001
 
     # Test other config values are preserved
     args = parser_with_config.parse_args([
diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py
index 11d6686009b2..8c69870b2bc3 100644
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -1855,13 +1855,36 @@ def parse_args(  # type: ignore[override]
 
         # Check for --model in command line arguments first
         if args and args[0] == "serve":
-            model_in_cli_args = any(arg == '--model' for arg in args)
-
-            if model_in_cli_args:
-                raise ValueError(
+            try:
+                model_idx = next(
+                    i for i, arg in enumerate(args)
+                    if arg == "--model" or arg.startswith("--model="))
+                logger.warning(
                     "With `vllm serve`, you should provide the model as a "
                     "positional argument or in a config file instead of via "
-                    "the `--model` option.")
+                    "the `--model` option. "
+                    "The `--model` option will be removed in v0.13.")
+
+                if args[model_idx] == "--model":
+                    model_tag = args[model_idx + 1]
+                    rest_start_idx = model_idx + 2
+                else:
+                    model_tag = args[model_idx].removeprefix("--model=")
+                    rest_start_idx = model_idx + 1
+
+                # Move the model to the front, e.g.:
+                # [Before]
+                # vllm serve -tp 2 --model <model-tag> --enforce-eager --port 8001
+                # [After]
+                # vllm serve <model-tag> -tp 2 --enforce-eager --port 8001
+                args = [
+                    "serve",
+                    model_tag,
+                    *args[1:model_idx],
+                    *args[rest_start_idx:],
+                ]
+            except StopIteration:
+                pass
 
         if '--config' in args:
             args = self._pull_args_from_config(args)
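
For reference, the back-compatibility behaviour patched into `vllm/utils/__init__.py` above can be exercised in isolation. The following is a minimal standalone sketch of that relocation logic (the helper name `relocate_model_flag` and the example argument values are illustrative, not part of the patch): a deprecated `--model`/`--model=` flag passed after `serve` is moved into the positional model slot while every other option keeps its original order.

```python
# Standalone sketch of the --model relocation performed in
# FlexibleArgumentParser.parse_args above (helper name is hypothetical).
def relocate_model_flag(args: list[str]) -> list[str]:
    if not args or args[0] != "serve":
        return args
    try:
        # Find either "--model <tag>" or "--model=<tag>".
        model_idx = next(i for i, arg in enumerate(args)
                         if arg == "--model" or arg.startswith("--model="))
    except StopIteration:
        return args  # no deprecated flag, nothing to do
    if args[model_idx] == "--model":
        model_tag = args[model_idx + 1]
        rest_start_idx = model_idx + 2
    else:
        model_tag = args[model_idx].removeprefix("--model=")
        rest_start_idx = model_idx + 1
    # Re-assemble with the model as the positional argument,
    # keeping the remaining options in their original order.
    return ["serve", model_tag, *args[1:model_idx], *args[rest_start_idx:]]


if __name__ == "__main__":
    before = ["serve", "-tp", "2", "--model", "my-model", "--port", "8001"]
    after = relocate_model_flag(before)
    assert after == ["serve", "my-model", "-tp", "2", "--port", "8001"]
    print(after)
```

This mirrors what the new tests in `tests/utils_/test_utils.py` assert: the deprecated option still works, but the parser ends up seeing the model as the positional `model_tag` rather than via `--model`.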