From c866ce240a457351f1a02007197c1a44acd1dbfb Mon Sep 17 00:00:00 2001 From: kevin Date: Wed, 25 Sep 2024 08:34:42 +0000 Subject: [PATCH 01/20] p Signed-off-by: kevin --- scripts/ci_aws_bootstrap.sh | 96 ++++---- .../pipeline_generator/pipeline_generator.py | 215 ++++++++++++++++++ 2 files changed, 270 insertions(+), 41 deletions(-) create mode 100644 scripts/pipeline_generator/pipeline_generator.py diff --git a/scripts/ci_aws_bootstrap.sh b/scripts/ci_aws_bootstrap.sh index f3b4093..c22958e 100644 --- a/scripts/ci_aws_bootstrap.sh +++ b/scripts/ci_aws_bootstrap.sh @@ -2,76 +2,90 @@ set -euo pipefail -if [[ -z "${RUN_ALL:-}" ]]; then - RUN_ALL=0 -fi +RUN_ALL=${RUN_ALL:-0} +VLLM_BUILDKITE_BRANCH=${VLLM_BUILDKITE_BRANCH:-main} + +generate_pipeline() { + python -m pip install click pydantic + + # Download necessary files + for FILE in pipeline_generator.py plugin.py step.py utils.py; do + curl -o ".buildkite/$FILE" "https://raw.githubusercontent.com/vllm-project/buildkite-ci/$VLLM_BUILDKITE_BRANCH/scripts/pipeline_generator/$FILE" + done + + # Generate and upload pipeline + python .buildkite/pipeline_generator.py --run_all=$RUN_ALL --list_file_diff="$LIST_FILE_DIFF" + cat .buildkite/pipeline.yaml + buildkite-agent pipeline upload .buildkite/pipeline.yaml + exit 0 +} upload_pipeline() { echo "Uploading pipeline..." ls .buildkite || buildkite-agent annotate --style error 'Please merge upstream main branch for buildkite CI' + + # Install minijinja-cli curl -sSfL https://github.com/mitsuhiko/minijinja/releases/latest/download/minijinja-cli-installer.sh | sh source /var/lib/buildkite-agent/.cargo/env - if [ $BUILDKITE_PIPELINE_SLUG == "fastcheck" ]; then - if [ ! 
-e ".buildkite/test-template-fastcheck.j2" ]; then - curl -o .buildkite/test-template-fastcheck.j2 https://raw.githubusercontent.com/vllm-project/buildkite-ci/main/scripts/test-template-fastcheck.j2 - fi - cd .buildkite && minijinja-cli test-template-fastcheck.j2 test-pipeline.yaml > pipeline.yml - cat pipeline.yml - buildkite-agent pipeline upload pipeline.yml - exit 0 + + if [[ $BUILDKITE_PIPELINE_SLUG == "fastcheck" ]]; then + handle_fastcheck + else + handle_regular_pipeline fi - if [ ! -e ".buildkite/test-template.j2" ]; then +} + +handle_fastcheck() { + [ ! -e ".buildkite/test-template-fastcheck.j2" ] && \ + curl -o .buildkite/test-template-fastcheck.j2 https://raw.githubusercontent.com/vllm-project/buildkite-ci/main/scripts/test-template-fastcheck.j2 + + cd .buildkite && minijinja-cli test-template-fastcheck.j2 test-pipeline.yaml > pipeline.yml + cat pipeline.yml + buildkite-agent pipeline upload pipeline.yml + exit 0 +} + +handle_regular_pipeline() { + [ ! -e ".buildkite/test-template.j2" ] && \ curl -o .buildkite/test-template.j2 https://raw.githubusercontent.com/vllm-project/buildkite-ci/main/scripts/test-template-aws.j2 - fi + if [ -e ".buildkite/pipeline_generator/pipeline_generator.py" ]; then python -m pip install click pydantic python .buildkite/pipeline_generator/pipeline_generator.py --run_all=$RUN_ALL --list_file_diff="$LIST_FILE_DIFF" buildkite-agent pipeline upload .buildkite/pipeline.yaml - exit 0 + else + cd .buildkite + echo "List file diff: $LIST_FILE_DIFF" + echo "Run all: $RUN_ALL" + minijinja-cli test-template.j2 test-pipeline.yaml -D list_file_diff="$LIST_FILE_DIFF" -D run_all="$RUN_ALL" > pipeline.yml + buildkite-agent pipeline upload pipeline.yml fi - cd .buildkite - echo "List file diff: $LIST_FILE_DIFF" - echo "Run all: $RUN_ALL" - minijinja-cli test-template.j2 test-pipeline.yaml -D list_file_diff="$LIST_FILE_DIFF" -D run_all="$RUN_ALL" > pipeline.yml - buildkite-agent pipeline upload pipeline.yml exit 0 } get_diff() { - $(git add 
.) - echo $(git diff --name-only --diff-filter=ACMDR $(git merge-base origin/main HEAD)) + git add . + git diff --name-only --diff-filter=ACMDR $(git merge-base origin/main HEAD) } get_diff_main() { - $(git add .) - echo $(git diff --name-only --diff-filter=ACMDR HEAD~1) + git add . + git diff --name-only --diff-filter=ACMDR HEAD~1 } -file_diff=$(get_diff) -if [[ $BUILDKITE_BRANCH == "main" ]]; then - file_diff=$(get_diff_main) -fi +# Determine if we need to run all tests +file_diff=$([ $BUILDKITE_BRANCH == "main" ] && get_diff_main || get_diff) +patterns=(".buildkite/test-pipeline" "Dockerfile" "CMakeLists.txt" "requirements*" "setup.py" "csrc/") -patterns=( - ".buildkite/test-pipeline" - "Dockerfile" - "CMakeLists.txt" - "requirements*" - "setup.py" - "csrc/" -) for file in $file_diff; do for pattern in "${patterns[@]}"; do if [[ $file == $pattern* ]] || [[ $file == $pattern ]]; then RUN_ALL=1 echo "Found changes: $file. Run all tests" - break + break 2 fi done done -LIST_FILE_DIFF=$(get_diff | tr ' ' '|') -if [[ $BUILDKITE_BRANCH == "main" ]]; then - LIST_FILE_DIFF=$(get_diff_main | tr ' ' '|') -fi -upload_pipeline +LIST_FILE_DIFF=$(echo "$file_diff" | tr ' ' '|') +generate_pipeline \ No newline at end of file diff --git a/scripts/pipeline_generator/pipeline_generator.py b/scripts/pipeline_generator/pipeline_generator.py new file mode 100644 index 0000000..4407bcb --- /dev/null +++ b/scripts/pipeline_generator/pipeline_generator.py @@ -0,0 +1,215 @@ +import yaml +import click +from typing import List, Dict, Union +import os + +from plugin import ( + get_kubernetes_plugin_config, + get_docker_plugin_config, +) +from utils import ( + AgentQueue, + AMD_REPO, + A100_GPU, + TEST_PATH, + EXTERNAL_HARDWARE_TEST_PATH, + PIPELINE_FILE_PATH, + STEPS_TO_BLOCK, + VLLM_ECR_URL, + VLLM_ECR_REPO, + get_agent_queue, + get_full_test_command, + get_multi_node_test_command, +) +from step import ( + TestStep, + BuildkiteStep, + BuildkiteBlockStep, + get_block_step, + get_step_key 
+) + +class PipelineGenerator: + def __init__(self, run_all: bool, list_file_diff: List[str]): + self.run_all = run_all + self.list_file_diff = list_file_diff + self.commit = os.getenv("BUILDKITE_COMMIT") + + def read_test_steps(self, file_path: str) -> List[TestStep]: + """Read test steps from test pipeline yaml and parse them into Step objects.""" + with open(file_path, "r") as f: + content = yaml.safe_load(f) + return [TestStep(**step) for step in content["steps"]] + + def step_should_run(self, step: TestStep) -> bool: + """Determine whether the step should automatically run or not.""" + if step.optional: + return False + if not step.source_file_dependencies or self.run_all: + return True + return any(source_file in diff_file + for source_file in step.source_file_dependencies + for diff_file in self.list_file_diff) + + def process_step(self, step: TestStep) -> List[Union[BuildkiteStep, BuildkiteBlockStep]]: + """Process test step and return corresponding BuildkiteStep.""" + steps = [] + current_step = self._create_buildkite_step(step) + + if step.num_nodes > 1: + self._configure_multi_node_step(current_step, step) + + if not self.step_should_run(step): + block_step = get_block_step(step.label) + steps.append(block_step) + current_step.depends_on = block_step.key + + steps.append(current_step) + return steps + + def generate_build_step(self) -> BuildkiteStep: + """Build the Docker image and push it to ECR.""" + docker_image = f"{VLLM_ECR_REPO}:{self.commit}" + build_commands = self._get_build_commands(docker_image) + + return BuildkiteStep( + label=":docker: build image", + key="build", + agents={"queue": AgentQueue.AWS_CPU.value}, + env={"DOCKER_BUILDKIT": "1"}, + retry={ + "automatic": [ + {"exit_status": -1, "limit": 2}, + {"exit_status": -10, "limit": 2} + ] + }, + commands=build_commands, + depends_on=None, + ) + + def get_external_hardware_tests(self, test_steps: List[TestStep]) -> List[Union[BuildkiteStep, BuildkiteBlockStep]]: + """Process the external 
hardware tests from the yaml file and convert to Buildkite steps.""" + buildkite_steps = self._process_external_hardware_steps() + buildkite_steps.extend(self._mirror_amd_test_steps(test_steps)) + return buildkite_steps + + def get_plugin_config(self, step: TestStep) -> Dict: + """Returns the plugin configuration for the step.""" + test_commands = [step.command] if step.command else step.commands + test_bash_command = [ + "bash", + "-c", + get_full_test_command(test_commands, step.working_dir) + ] + docker_image_path = f"{VLLM_ECR_REPO}:{self.commit}" + + if step.gpu == A100_GPU: + test_bash_command[-1] = f"'{test_bash_command[-1]}'" + return get_kubernetes_plugin_config( + docker_image_path, + test_bash_command, + step.num_gpus + ) + return get_docker_plugin_config( + docker_image_path, + test_bash_command, + step.no_gpu + ) + + def _create_buildkite_step(self, step: TestStep) -> BuildkiteStep: + return BuildkiteStep( + label=step.label, + key=get_step_key(step.label), + parallelism=step.parallelism, + soft_fail=step.soft_fail, + plugins=[self.get_plugin_config(step)], + agents={"queue": get_agent_queue(step.no_gpu, step.gpu, step.num_gpus).value} + ) + + def _configure_multi_node_step(self, current_step: BuildkiteStep, step: TestStep): + current_step.commands = get_multi_node_test_command( + step.commands, + step.working_dir, + step.num_nodes, + step.num_gpus, + f"{VLLM_ECR_REPO}:{self.commit}" + ) + current_step.plugins = None + + def _get_build_commands(self, docker_image: str) -> List[str]: + ecr_login_command = ( + "aws ecr-public get-login-password --region us-east-1 | " + f"docker login --username AWS --password-stdin {VLLM_ECR_URL}" + ) + docker_build_command = ( + f"docker build " + f"--build-arg max_jobs=64 " + f"--build-arg buildkite_commit={self.commit} " + f"--build-arg USE_SCCACHE=1 " + f"--tag {docker_image} " + f"--target test " + f"--progress plain ." 
+ ) + docker_push_command = f"docker push {docker_image}" + return [ecr_login_command, docker_build_command, docker_push_command] + + def _process_external_hardware_steps(self) -> List[Union[BuildkiteStep, BuildkiteBlockStep]]: + with open(EXTERNAL_HARDWARE_TEST_PATH, "r") as f: + content = yaml.safe_load(f) + buildkite_steps = [] + amd_docker_image = f"{AMD_REPO}:{self.commit}" + for step in content["steps"]: + step["commands"] = [cmd.replace("DOCKER_IMAGE_AMD", amd_docker_image) for cmd in step["commands"]] + buildkite_step = BuildkiteStep(**step) + buildkite_step.depends_on = "bootstrap" + if buildkite_step.key in STEPS_TO_BLOCK: + block_step = get_block_step(buildkite_step.label) + buildkite_steps.append(block_step) + buildkite_step.depends_on = block_step.key + buildkite_steps.append(buildkite_step) + return buildkite_steps + + def _mirror_amd_test_steps(self, test_steps: List[TestStep]) -> List[BuildkiteStep]: + mirrored_steps = [] + for test_step in test_steps: + if "amd" in test_step.mirror_hardwares: + test_commands = [test_step.command] if test_step.command else test_step.commands + amd_test_command = [ + "bash", + ".buildkite/run-amd-test.sh", + f"'{get_full_test_command(test_commands, test_step.working_dir)}'", + ] + mirrored_buildkite_step = BuildkiteStep( + label = f"AMD: {test_step.label}", + key = f"amd_{get_step_key(test_step.label)}", + depends_on = "amd-build", + agents = {"queue": AgentQueue.AMD_GPU.value}, + soft_fail = test_step.soft_fail, + env = {"DOCKER_BUILDKIT": "1"}, + commands = [" ".join(amd_test_command)], + ) + mirrored_steps.append(mirrored_buildkite_step) + return mirrored_steps + +@click.command() +@click.option("--run_all", type=str) +@click.option("--list_file_diff", type=str) +def main(run_all: str = "-1", list_file_diff: str = None): + list_file_diff = list_file_diff.split("|") if list_file_diff else [] + pipeline_generator = PipelineGenerator(run_all == "1", list_file_diff) + + test_steps = 
pipeline_generator.read_test_steps(TEST_PATH) + + buildkite_steps = [ + pipeline_generator.generate_build_step(), + *[step for test_step in test_steps for step in pipeline_generator.process_step(test_step)], + *pipeline_generator.get_external_hardware_tests(test_steps) + ] + + buildkite_steps_dict = {"steps": [step.dict(exclude_none=True) for step in buildkite_steps]} + + with open(PIPELINE_FILE_PATH, "w") as f: + yaml.dump(buildkite_steps_dict, f, sort_keys=False) + +if __name__ == "__main__": + main() From c8c1cde7ef18dcd63f13622fd3e7baa2ebf3ff82 Mon Sep 17 00:00:00 2001 From: kevin Date: Wed, 25 Sep 2024 08:53:25 +0000 Subject: [PATCH 02/20] p Signed-off-by: kevin --- scripts/pipeline_generator/__init__.py | 0 scripts/pipeline_generator/step.py | 32 +++++++++++++++++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) delete mode 100644 scripts/pipeline_generator/__init__.py diff --git a/scripts/pipeline_generator/__init__.py b/scripts/pipeline_generator/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/scripts/pipeline_generator/step.py b/scripts/pipeline_generator/step.py index 1e1c462..420f6fe 100644 --- a/scripts/pipeline_generator/step.py +++ b/scripts/pipeline_generator/step.py @@ -1,10 +1,40 @@ from pydantic import BaseModel, Field from typing import List, Dict, Any, Optional -from .utils import AgentQueue +from utils import AgentQueue BUILD_STEP_KEY = "build" +class TestStep(BaseModel): + """This class represents a test step defined in the test configuration file.""" + label: str + fast_check: bool = False + mirror_hardwares: List[str] = Field(default_factory=list) + gpu: str = "" + num_gpus: int = 1 + num_nodes: int = 1 + working_dir: str = "/vllm-workspace/tests" + source_file_dependencies: List[str] = Field(default_factory=list) + no_gpu: bool = False + soft_fail: bool = False + parallelism: int = 1 + optional: bool = False + command: Optional[str] = None + commands: Optional[List[str]] = None + +class 
BuildkiteStep(BaseModel): + """This class represents a step in Buildkite format.""" + label: str + key: str + agents: Dict[str, Any] = {"queue": AgentQueue.AWS_CPU} + commands: Optional[List[str]] = None + plugins: Optional[List[Dict]] = None + parallelism: Optional[int] = None + soft_fail: Optional[bool] = None + depends_on: Optional[str] = "build" + env: Optional[Dict[str, str]] = None + retry: Optional[Dict[str, Any]] = None + class BuildkiteBlockStep(BaseModel): """This class represents a block step in Buildkite format.""" block: str From 4f3aa6988b0879f792e7c309d426e09170db3c9b Mon Sep 17 00:00:00 2001 From: kevin Date: Wed, 25 Sep 2024 08:54:53 +0000 Subject: [PATCH 03/20] p Signed-off-by: kevin --- scripts/ci_aws_bootstrap.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/ci_aws_bootstrap.sh b/scripts/ci_aws_bootstrap.sh index c22958e..5be3f1f 100644 --- a/scripts/ci_aws_bootstrap.sh +++ b/scripts/ci_aws_bootstrap.sh @@ -9,6 +9,8 @@ generate_pipeline() { python -m pip install click pydantic # Download necessary files + echo "Downloading pipeline generator scripts..." 
+ echo $VLLM_BUILDKITE_BRANCH for FILE in pipeline_generator.py plugin.py step.py utils.py; do curl -o ".buildkite/$FILE" "https://raw.githubusercontent.com/vllm-project/buildkite-ci/$VLLM_BUILDKITE_BRANCH/scripts/pipeline_generator/$FILE" done From 2f50078312e0ea967aa9da19d5c1dcc741170eb7 Mon Sep 17 00:00:00 2001 From: kevin Date: Wed, 25 Sep 2024 09:02:11 +0000 Subject: [PATCH 04/20] p Signed-off-by: kevin --- scripts/ci_aws_bootstrap.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/ci_aws_bootstrap.sh b/scripts/ci_aws_bootstrap.sh index 5be3f1f..01fbc94 100644 --- a/scripts/ci_aws_bootstrap.sh +++ b/scripts/ci_aws_bootstrap.sh @@ -3,16 +3,16 @@ set -euo pipefail RUN_ALL=${RUN_ALL:-0} -VLLM_BUILDKITE_BRANCH=${VLLM_BUILDKITE_BRANCH:-main} +VLLM_CI_BRANCH=${VLLM_CI_BRANCH:-main} generate_pipeline() { python -m pip install click pydantic # Download necessary files echo "Downloading pipeline generator scripts..." - echo $VLLM_BUILDKITE_BRANCH + echo $VLLM_CI_BRANCH for FILE in pipeline_generator.py plugin.py step.py utils.py; do - curl -o ".buildkite/$FILE" "https://raw.githubusercontent.com/vllm-project/buildkite-ci/$VLLM_BUILDKITE_BRANCH/scripts/pipeline_generator/$FILE" + curl -o ".buildkite/$FILE" "https://raw.githubusercontent.com/vllm-project/buildkite-ci/$VLLM_CI_BRANCH/scripts/pipeline_generator/$FILE" done # Generate and upload pipeline From d3bf36e2e7b8015adb1ccd6359ebdee80bcc1fe8 Mon Sep 17 00:00:00 2001 From: kevin Date: Wed, 25 Sep 2024 09:09:21 +0000 Subject: [PATCH 05/20] p Signed-off-by: kevin --- scripts/ci_aws_bootstrap.sh | 2 +- scripts/pipeline_generator/plugin.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/ci_aws_bootstrap.sh b/scripts/ci_aws_bootstrap.sh index 01fbc94..afedf59 100644 --- a/scripts/ci_aws_bootstrap.sh +++ b/scripts/ci_aws_bootstrap.sh @@ -10,7 +10,7 @@ generate_pipeline() { # Download necessary files echo "Downloading pipeline generator scripts..." 
- echo $VLLM_CI_BRANCH + echo "VLLM CI Branch: $VLLM_CI_BRANCH" for FILE in pipeline_generator.py plugin.py step.py utils.py; do curl -o ".buildkite/$FILE" "https://raw.githubusercontent.com/vllm-project/buildkite-ci/$VLLM_CI_BRANCH/scripts/pipeline_generator/$FILE" done diff --git a/scripts/pipeline_generator/plugin.py b/scripts/pipeline_generator/plugin.py index db093bc..133f9cd 100644 --- a/scripts/pipeline_generator/plugin.py +++ b/scripts/pipeline_generator/plugin.py @@ -1,7 +1,7 @@ from pydantic import BaseModel, Field from typing import List, Dict, Any, Optional -from .utils import HF_HOME +from utils import HF_HOME DOCKER_PLUGIN_NAME = "docker#v5.2.0" KUBERNETES_PLUGIN_NAME = "kubernetes" From 021b042fc1c8af744ea6ad7e84633158c21a065c Mon Sep 17 00:00:00 2001 From: kevin Date: Wed, 25 Sep 2024 21:32:22 +0000 Subject: [PATCH 06/20] p Signed-off-by: kevin --- .../pipeline_generator/pipeline_generator.py | 32 +++++---- scripts/pipeline_generator/plugin.py | 4 +- scripts/pipeline_generator/step.py | 2 +- scripts/pipeline_generator/utils.py | 10 ++- .../test_pipeline_generator.py | 70 +++++++++++++++++++ .../tests/pipeline_generator/test_utils.py | 6 +- scripts/tests/pipeline_generator/tests.yaml | 25 +++++++ 7 files changed, 128 insertions(+), 21 deletions(-) create mode 100644 scripts/tests/pipeline_generator/test_pipeline_generator.py create mode 100644 scripts/tests/pipeline_generator/tests.yaml diff --git a/scripts/pipeline_generator/pipeline_generator.py b/scripts/pipeline_generator/pipeline_generator.py index 4407bcb..3aa2ff5 100644 --- a/scripts/pipeline_generator/pipeline_generator.py +++ b/scripts/pipeline_generator/pipeline_generator.py @@ -3,11 +3,11 @@ from typing import List, Dict, Union import os -from plugin import ( +from .plugin import ( get_kubernetes_plugin_config, get_docker_plugin_config, ) -from utils import ( +from .utils import ( AgentQueue, AMD_REPO, A100_GPU, @@ -21,7 +21,7 @@ get_full_test_command, get_multi_node_test_command, ) -from 
step import ( +from .step import ( TestStep, BuildkiteStep, BuildkiteBlockStep, @@ -54,7 +54,7 @@ def step_should_run(self, step: TestStep) -> bool: def process_step(self, step: TestStep) -> List[Union[BuildkiteStep, BuildkiteBlockStep]]: """Process test step and return corresponding BuildkiteStep.""" steps = [] - current_step = self._create_buildkite_step(step) + current_step = self.create_buildkite_step(step) if step.num_nodes > 1: self._configure_multi_node_step(current_step, step) @@ -86,6 +86,12 @@ def generate_build_step(self) -> BuildkiteStep: commands=build_commands, depends_on=None, ) + + def write_buildkite_steps(self, buildkite_steps: List[Union[BuildkiteStep, BuildkiteBlockStep]], output_file_path: str) -> None: + """Output the buildkite steps to the Buildkite pipeline yaml file.""" + buildkite_steps_dict = {"steps": [step.dict(exclude_none=True) for step in buildkite_steps]} + with open(output_file_path, "w") as f: + yaml.dump(buildkite_steps_dict, f, sort_keys=False) def get_external_hardware_tests(self, test_steps: List[TestStep]) -> List[Union[BuildkiteStep, BuildkiteBlockStep]]: """Process the external hardware tests from the yaml file and convert to Buildkite steps.""" @@ -95,28 +101,28 @@ def get_external_hardware_tests(self, test_steps: List[TestStep]) -> List[Union[ def get_plugin_config(self, step: TestStep) -> Dict: """Returns the plugin configuration for the step.""" - test_commands = [step.command] if step.command else step.commands + test_step_commands = [step.command] if step.command else step.commands test_bash_command = [ "bash", "-c", - get_full_test_command(test_commands, step.working_dir) + get_full_test_command(test_step_commands, step.working_dir) ] - docker_image_path = f"{VLLM_ECR_REPO}:{self.commit}" + test_bash_command[-1] = f"'{test_bash_command[-1]}'" + container_image = f"{VLLM_ECR_REPO}:{self.commit}" if step.gpu == A100_GPU: - test_bash_command[-1] = f"'{test_bash_command[-1]}'" return get_kubernetes_plugin_config( - 
docker_image_path, + container_image, test_bash_command, step.num_gpus ) return get_docker_plugin_config( - docker_image_path, + container_image, test_bash_command, step.no_gpu ) - def _create_buildkite_step(self, step: TestStep) -> BuildkiteStep: + def create_buildkite_step(self, step: TestStep) -> BuildkiteStep: return BuildkiteStep( label=step.label, key=get_step_key(step.label), @@ -206,10 +212,8 @@ def main(run_all: str = "-1", list_file_diff: str = None): *pipeline_generator.get_external_hardware_tests(test_steps) ] - buildkite_steps_dict = {"steps": [step.dict(exclude_none=True) for step in buildkite_steps]} + pipeline_generator.write_buildkite_steps(buildkite_steps, PIPELINE_FILE_PATH) - with open(PIPELINE_FILE_PATH, "w") as f: - yaml.dump(buildkite_steps_dict, f, sort_keys=False) if __name__ == "__main__": main() diff --git a/scripts/pipeline_generator/plugin.py b/scripts/pipeline_generator/plugin.py index 133f9cd..269835d 100644 --- a/scripts/pipeline_generator/plugin.py +++ b/scripts/pipeline_generator/plugin.py @@ -1,7 +1,7 @@ from pydantic import BaseModel, Field from typing import List, Dict, Any, Optional -from utils import HF_HOME +from .utils import HF_HOME DOCKER_PLUGIN_NAME = "docker#v5.2.0" KUBERNETES_PLUGIN_NAME = "kubernetes" @@ -103,7 +103,7 @@ def get_kubernetes_plugin_config(container_image: str, test_bash_command: List[s def get_docker_plugin_config(docker_image_path: str, test_bash_command: List[str], no_gpu: bool) -> Dict: docker_plugin_config = DockerPluginConfig( image=docker_image_path, - command=test_bash_command + command=[" ".join(test_bash_command)] ) if no_gpu: docker_plugin_config.gpus = None diff --git a/scripts/pipeline_generator/step.py b/scripts/pipeline_generator/step.py index 420f6fe..7dc1cda 100644 --- a/scripts/pipeline_generator/step.py +++ b/scripts/pipeline_generator/step.py @@ -1,7 +1,7 @@ from pydantic import BaseModel, Field from typing import List, Dict, Any, Optional -from utils import AgentQueue +from .utils 
import AgentQueue BUILD_STEP_KEY = "build" diff --git a/scripts/pipeline_generator/utils.py b/scripts/pipeline_generator/utils.py index 51dd6eb..12a50f7 100644 --- a/scripts/pipeline_generator/utils.py +++ b/scripts/pipeline_generator/utils.py @@ -40,7 +40,15 @@ def get_full_test_command(test_commands: List[str], step_working_dir: str) -> st """Convert test commands into one-line command with the right directory.""" working_dir = step_working_dir or DEFAULT_WORKING_DIR test_commands_str = ";\n".join(test_commands) - return f"cd {working_dir};\n{test_commands_str}" + # Always add these commands before running the tests + commands = [ + "(command nvidia-smi || true)", + "export VLLM_LOGGING_LEVEL=DEBUG", + "export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1", + f"cd {working_dir}", + test_commands_str + ] + return ";\n".join(commands) def get_multi_node_test_command( diff --git a/scripts/tests/pipeline_generator/test_pipeline_generator.py b/scripts/tests/pipeline_generator/test_pipeline_generator.py new file mode 100644 index 0000000..1d932f8 --- /dev/null +++ b/scripts/tests/pipeline_generator/test_pipeline_generator.py @@ -0,0 +1,70 @@ +import pytest + +from scripts.pipeline_generator.pipeline_generator import PipelineGenerator +from scripts.pipeline_generator.step import TestStep, BuildkiteStep, BuildkiteBlockStep + +TEST_COMMIT = "123456789abcdef123456789abcdef123456789a" +TEST_FILE_PATH = "scripts/tests/pipeline_generator/tests.yaml" + +def get_test_pipeline_generator(): + pipeline_generator = PipelineGenerator(run_all=False, list_file_diff=[]) + pipeline_generator.commit = TEST_COMMIT + return pipeline_generator + +def test_read_test_steps(): + pipeline_generator = get_test_pipeline_generator() + steps = pipeline_generator.read_test_steps(TEST_FILE_PATH) + assert len(steps) == 4 + for i in range(4): + assert steps[i].label == f"Test {i}" + assert steps[0].source_file_dependencies == ["dir1/", "dir2/file1"] + assert steps[0].commands == ["pytest -v -s a", "pytest -v -s 
b.py"] + assert steps[1].working_dir == "/tests" + assert steps[2].num_gpus == 2 + assert steps[2].num_nodes == 2 + assert steps[3].gpu == "a100" + assert steps[3].optional == True + +@pytest.mark.parametrize( + ("test_step", "expected_plugin_config"), + [ + ( + TestStep( + label="Test 0", + source_file_dependencies=["dir1/", "dir2/file1"], + commands=["test command 1", "test command 2"] + ), + { + "plugin": "docker" + } + ), + ( + TestStep( + label="Test 1", + commands=["test command 1", "test command 2"] + gpu="a100" + ), + { + "plugin": "kubernetes" + } + ) + ] +) +@mock.patch("scripts.pipeline_generator.pipeline_generator.get_docker_plugin_config") +@mock.patch("scripts.pipeline_generator.pipeline_generator.get_kubernetes_plugin_config") +@mock.patch("scripts.pipeline_generator.utils.get_full_test_command") +def test_get_plugin_config(mock_get_full_test_command, mock_get_kubernetes_plugin_config, mock_get_docker_plugin_config, test_step, expected_plugin_config): + pipeline_generator = get_test_pipeline_generator() + mock_get_full_test_command.return_value = "test command 1;\ntest command 2" + mock_get_docker_plugin_config.return_value = {"plugin": "docker"} + mock_get_kubernetes_plugin_config.return_value = {"plugin": "kubernetes"} + container_image_path = f"{VLLM_ECR_REPO}:{TEST_COMMIT}" + + plugin_config = pipeline_generator.get_plugin_config(test_step) + assert plugin_config == expected_plugin_config + if test_step.gpu == "a100": + assert mock_get_kubernetes_plugin_config.called_once_with(container_image_path, ) + + +if __name__ == "__main__": + sys.exit(pytest.main(["-v", __file__])) \ No newline at end of file diff --git a/scripts/tests/pipeline_generator/test_utils.py b/scripts/tests/pipeline_generator/test_utils.py index 5281a93..f089b2e 100644 --- a/scripts/tests/pipeline_generator/test_utils.py +++ b/scripts/tests/pipeline_generator/test_utils.py @@ -27,9 +27,9 @@ def test_get_agent_queue(no_gpu: bool, gpu_type: str, num_gpus: int, expected_re 
@pytest.mark.parametrize( ("test_commands", "step_working_dir", "expected_result"), [ - (["echo 'hello'"], None, "cd /vllm-workspace/tests;\necho 'hello'"), - (["echo 'hello'"], "/vllm-workspace/tests", "cd /vllm-workspace/tests;\necho 'hello'"), - (["echo 'hello1'", "echo 'hello2'"], None, "cd /vllm-workspace/tests;\necho 'hello1';\necho 'hello2'"), + (["echo 'hello'"], None, "(command nvidia-smi || true);\nexport VLLM_LOGGING_LEVEL=DEBUG;\nexport VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1;\ncd /vllm-workspace/tests;\necho 'hello'"), + (["echo 'hello'"], "/vllm-workspace/tests", "(command nvidia-smi || true);\nexport VLLM_LOGGING_LEVEL=DEBUG;\nexport VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1;\ncd /vllm-workspace/tests;\necho 'hello'"), + (["echo 'hello1'", "echo 'hello2'"], None, "(command nvidia-smi || true);\nexport VLLM_LOGGING_LEVEL=DEBUG;\nexport VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1;\ncd /vllm-workspace/tests;\necho 'hello1';\necho 'hello2'"), ], ) def test_get_full_test_command(test_commands: List[str], step_working_dir: str, expected_result: str): diff --git a/scripts/tests/pipeline_generator/tests.yaml b/scripts/tests/pipeline_generator/tests.yaml new file mode 100644 index 0000000..a98b1f6 --- /dev/null +++ b/scripts/tests/pipeline_generator/tests.yaml @@ -0,0 +1,25 @@ +steps: +- label: Test 0 + source_file_dependencies: + - dir1/ + - dir2/file1 + commands: + - pytest -v -s a + - pytest -v -s b.py +- label: Test 1 + working_dir: "/tests" + commands: + - pytest -v -s d +- label: Test 2 + num_gpus: 2 + num_nodes: 2 + commands: + - pytest -v -s e && pytest -v -s f + - pytest -v -s g +- label: Test 3 + working_dir: "/tests" + gpu: a100 + num_gpus: 4 + optional: true + commands: + - pytest -v -s d From 8dfa7d72d913a59a7444f933509d4781da570710 Mon Sep 17 00:00:00 2001 From: kevin Date: Wed, 25 Sep 2024 21:37:41 +0000 Subject: [PATCH 07/20] p Signed-off-by: kevin --- scripts/pipeline_generator/pipeline_generator.py | 6 +++--- scripts/pipeline_generator/plugin.py | 2 +- 
scripts/pipeline_generator/step.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/pipeline_generator/pipeline_generator.py b/scripts/pipeline_generator/pipeline_generator.py index 3aa2ff5..acd23cd 100644 --- a/scripts/pipeline_generator/pipeline_generator.py +++ b/scripts/pipeline_generator/pipeline_generator.py @@ -3,11 +3,11 @@ from typing import List, Dict, Union import os -from .plugin import ( +from plugin import ( get_kubernetes_plugin_config, get_docker_plugin_config, ) -from .utils import ( +from utils import ( AgentQueue, AMD_REPO, A100_GPU, @@ -21,7 +21,7 @@ get_full_test_command, get_multi_node_test_command, ) -from .step import ( +from step import ( TestStep, BuildkiteStep, BuildkiteBlockStep, diff --git a/scripts/pipeline_generator/plugin.py b/scripts/pipeline_generator/plugin.py index 269835d..925c11e 100644 --- a/scripts/pipeline_generator/plugin.py +++ b/scripts/pipeline_generator/plugin.py @@ -1,7 +1,7 @@ from pydantic import BaseModel, Field from typing import List, Dict, Any, Optional -from .utils import HF_HOME +from utils import HF_HOME DOCKER_PLUGIN_NAME = "docker#v5.2.0" KUBERNETES_PLUGIN_NAME = "kubernetes" diff --git a/scripts/pipeline_generator/step.py b/scripts/pipeline_generator/step.py index 7dc1cda..420f6fe 100644 --- a/scripts/pipeline_generator/step.py +++ b/scripts/pipeline_generator/step.py @@ -1,7 +1,7 @@ from pydantic import BaseModel, Field from typing import List, Dict, Any, Optional -from .utils import AgentQueue +from utils import AgentQueue BUILD_STEP_KEY = "build" From cc233582a7f94e7c0d8bc7e91d9600e068b0d2db Mon Sep 17 00:00:00 2001 From: kevin Date: Wed, 25 Sep 2024 22:51:06 +0000 Subject: [PATCH 08/20] p Signed-off-by: kevin --- scripts/pipeline_generator/pipeline_generator.py | 12 ++++++++++-- scripts/pipeline_generator/plugin.py | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/scripts/pipeline_generator/pipeline_generator.py 
b/scripts/pipeline_generator/pipeline_generator.py index acd23cd..4de7be5 100644 --- a/scripts/pipeline_generator/pipeline_generator.py +++ b/scripts/pipeline_generator/pipeline_generator.py @@ -126,7 +126,7 @@ def create_buildkite_step(self, step: TestStep) -> BuildkiteStep: return BuildkiteStep( label=step.label, key=get_step_key(step.label), - parallelism=step.parallelism, + parallelism=step.parallelism, soft_fail=step.soft_fail, plugins=[self.get_plugin_config(step)], agents={"queue": get_agent_queue(step.no_gpu, step.gpu, step.num_gpus).value} @@ -147,6 +147,14 @@ def _get_build_commands(self, docker_image: str) -> List[str]: "aws ecr-public get-login-password --region us-east-1 | " f"docker login --username AWS --password-stdin {VLLM_ECR_URL}" ) + image_check_command = f"""#!/bin/bash +if [[ -z $(docker manifest inspect {docker_image}) ]]; then + echo "Image not found, proceeding with build..." +else + echo "Image found" + exit 0 +fi +""" docker_build_command = ( f"docker build " f"--build-arg max_jobs=64 " @@ -157,7 +165,7 @@ def _get_build_commands(self, docker_image: str) -> List[str]: f"--progress plain ." 
) docker_push_command = f"docker push {docker_image}" - return [ecr_login_command, docker_build_command, docker_push_command] + return [ecr_login_command, image_check_command, docker_build_command, docker_push_command] def _process_external_hardware_steps(self) -> List[Union[BuildkiteStep, BuildkiteBlockStep]]: with open(EXTERNAL_HARDWARE_TEST_PATH, "r") as f: diff --git a/scripts/pipeline_generator/plugin.py b/scripts/pipeline_generator/plugin.py index 925c11e..133f9cd 100644 --- a/scripts/pipeline_generator/plugin.py +++ b/scripts/pipeline_generator/plugin.py @@ -103,7 +103,7 @@ def get_kubernetes_plugin_config(container_image: str, test_bash_command: List[s def get_docker_plugin_config(docker_image_path: str, test_bash_command: List[str], no_gpu: bool) -> Dict: docker_plugin_config = DockerPluginConfig( image=docker_image_path, - command=[" ".join(test_bash_command)] + command=test_bash_command ) if no_gpu: docker_plugin_config.gpus = None From cf79847bd5f0c8cbf0f5741770d26da39dbb81bb Mon Sep 17 00:00:00 2001 From: kevin Date: Wed, 25 Sep 2024 23:03:46 +0000 Subject: [PATCH 09/20] p Signed-off-by: kevin --- scripts/pipeline_generator/pipeline_generator.py | 1 - scripts/pipeline_generator/plugin.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/pipeline_generator/pipeline_generator.py b/scripts/pipeline_generator/pipeline_generator.py index 4de7be5..0658fc0 100644 --- a/scripts/pipeline_generator/pipeline_generator.py +++ b/scripts/pipeline_generator/pipeline_generator.py @@ -107,7 +107,6 @@ def get_plugin_config(self, step: TestStep) -> Dict: "-c", get_full_test_command(test_step_commands, step.working_dir) ] - test_bash_command[-1] = f"'{test_bash_command[-1]}'" container_image = f"{VLLM_ECR_REPO}:{self.commit}" if step.gpu == A100_GPU: diff --git a/scripts/pipeline_generator/plugin.py b/scripts/pipeline_generator/plugin.py index 133f9cd..2a4a591 100644 --- a/scripts/pipeline_generator/plugin.py +++ 
b/scripts/pipeline_generator/plugin.py @@ -92,7 +92,7 @@ def get_kubernetes_plugin_config(container_image: str, test_bash_command: List[s containers=[ KubernetesPodContainerConfig( image=container_image, - command=[" ".join(test_bash_command)], + command=test_bash_command, resources={"limits": {"nvidia.com/gpu": num_gpus}} ) ] From aeb85b6e2c578e24e58b7f09d8bf66e3322c7e41 Mon Sep 17 00:00:00 2001 From: kevin Date: Wed, 25 Sep 2024 23:34:56 +0000 Subject: [PATCH 10/20] p Signed-off-by: kevin --- scripts/pipeline_generator/pipeline_generator.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/pipeline_generator/pipeline_generator.py b/scripts/pipeline_generator/pipeline_generator.py index 0658fc0..9b32400 100644 --- a/scripts/pipeline_generator/pipeline_generator.py +++ b/scripts/pipeline_generator/pipeline_generator.py @@ -43,6 +43,8 @@ def read_test_steps(self, file_path: str) -> List[TestStep]: def step_should_run(self, step: TestStep) -> bool: """Determine whether the step should automatically run or not.""" + if step.gpu != A100_GPU: + return False if step.optional: return False if not step.source_file_dependencies or self.run_all: From e05d0ebc46939c75174e821d0c2b5ac142991450 Mon Sep 17 00:00:00 2001 From: kevin Date: Thu, 26 Sep 2024 09:21:29 +0000 Subject: [PATCH 11/20] p Signed-off-by: kevin --- scripts/ci_aws_bootstrap.sh | 12 +- scripts/pipeline_generator/__init__.py | 0 .../pipeline_generator/pipeline_generator.py | 48 +-- scripts/pipeline_generator/plugin.py | 75 +++-- scripts/pipeline_generator/step.py | 20 +- scripts/pipeline_generator/utils.py | 2 +- .../test_pipeline_generator.py | 315 ++++++++++++++++-- 7 files changed, 379 insertions(+), 93 deletions(-) create mode 100644 scripts/pipeline_generator/__init__.py diff --git a/scripts/ci_aws_bootstrap.sh b/scripts/ci_aws_bootstrap.sh index afedf59..73898cd 100644 --- a/scripts/ci_aws_bootstrap.sh +++ b/scripts/ci_aws_bootstrap.sh @@ -11,14 +11,16 @@ generate_pipeline() { # Download necessary 
files echo "Downloading pipeline generator scripts..." echo "VLLM CI Branch: $VLLM_CI_BRANCH" - for FILE in pipeline_generator.py plugin.py step.py utils.py; do - curl -o ".buildkite/$FILE" "https://raw.githubusercontent.com/vllm-project/buildkite-ci/$VLLM_CI_BRANCH/scripts/pipeline_generator/$FILE" + mkdir -p .buildkite/pipeline_generator + for FILE in pipeline_generator.py plugin.py step.py utils.py __init__.py; do + curl -o ".buildkite/pipeline_generator/$FILE" "https://raw.githubusercontent.com/vllm-project/buildkite-ci/$VLLM_CI_BRANCH/scripts/pipeline_generator/$FILE" done # Generate and upload pipeline - python .buildkite/pipeline_generator.py --run_all=$RUN_ALL --list_file_diff="$LIST_FILE_DIFF" - cat .buildkite/pipeline.yaml - buildkite-agent pipeline upload .buildkite/pipeline.yaml + cd .buildkite + python -m pipeline_generator.pipeline_generator --run_all=$RUN_ALL --list_file_diff="$LIST_FILE_DIFF" + cat pipeline.yaml + buildkite-agent pipeline upload pipeline.yaml exit 0 } diff --git a/scripts/pipeline_generator/__init__.py b/scripts/pipeline_generator/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scripts/pipeline_generator/pipeline_generator.py b/scripts/pipeline_generator/pipeline_generator.py index 9b32400..768b6b0 100644 --- a/scripts/pipeline_generator/pipeline_generator.py +++ b/scripts/pipeline_generator/pipeline_generator.py @@ -3,11 +3,11 @@ from typing import List, Dict, Union import os -from plugin import ( +from .plugin import ( get_kubernetes_plugin_config, get_docker_plugin_config, ) -from utils import ( +from .utils import ( AgentQueue, AMD_REPO, A100_GPU, @@ -21,7 +21,7 @@ get_full_test_command, get_multi_node_test_command, ) -from step import ( +from .step import ( TestStep, BuildkiteStep, BuildkiteBlockStep, @@ -43,8 +43,6 @@ def read_test_steps(self, file_path: str) -> List[TestStep]: def step_should_run(self, step: TestStep) -> bool: """Determine whether the step should automatically run or not.""" - if step.gpu 
!= A100_GPU: - return False if step.optional: return False if not step.source_file_dependencies or self.run_all: @@ -58,9 +56,6 @@ def process_step(self, step: TestStep) -> List[Union[BuildkiteStep, BuildkiteBlo steps = [] current_step = self.create_buildkite_step(step) - if step.num_nodes > 1: - self._configure_multi_node_step(current_step, step) - if not self.step_should_run(step): block_step = get_block_step(step.label) steps.append(block_step) @@ -72,7 +67,7 @@ def process_step(self, step: TestStep) -> List[Union[BuildkiteStep, BuildkiteBlo def generate_build_step(self) -> BuildkiteStep: """Build the Docker image and push it to ECR.""" docker_image = f"{VLLM_ECR_REPO}:{self.commit}" - build_commands = self._get_build_commands(docker_image) + build_commands = self.get_build_commands(docker_image) return BuildkiteStep( label=":docker: build image", @@ -110,7 +105,6 @@ def get_plugin_config(self, step: TestStep) -> Dict: get_full_test_command(test_step_commands, step.working_dir) ] container_image = f"{VLLM_ECR_REPO}:{self.commit}" - if step.gpu == A100_GPU: return get_kubernetes_plugin_config( container_image, @@ -124,26 +118,30 @@ def get_plugin_config(self, step: TestStep) -> Dict: ) def create_buildkite_step(self, step: TestStep) -> BuildkiteStep: - return BuildkiteStep( + buildkite_step = BuildkiteStep( label=step.label, key=get_step_key(step.label), parallelism=step.parallelism, - soft_fail=step.soft_fail, + soft_fail=step.soft_fail, plugins=[self.get_plugin_config(step)], agents={"queue": get_agent_queue(step.no_gpu, step.gpu, step.num_gpus).value} ) + if step.num_nodes and step.num_nodes > 1: + self._configure_multi_node_step(buildkite_step, step) + return buildkite_step def _configure_multi_node_step(self, current_step: BuildkiteStep, step: TestStep): - current_step.commands = get_multi_node_test_command( - step.commands, - step.working_dir, - step.num_nodes, - step.num_gpus, - f"{VLLM_ECR_REPO}:{self.commit}" - ) + current_step.commands = 
[get_multi_node_test_command( + step.commands, + step.working_dir, + step.num_nodes, + step.num_gpus, + f"{VLLM_ECR_REPO}:{self.commit}" + ) + ] current_step.plugins = None - def _get_build_commands(self, docker_image: str) -> List[str]: + def get_build_commands(self, docker_image: str) -> List[str]: ecr_login_command = ( "aws ecr-public get-login-password --region us-east-1 | " f"docker login --username AWS --password-stdin {VLLM_ECR_URL}" @@ -177,6 +175,8 @@ def _process_external_hardware_steps(self) -> List[Union[BuildkiteStep, Buildkit step["commands"] = [cmd.replace("DOCKER_IMAGE_AMD", amd_docker_image) for cmd in step["commands"]] buildkite_step = BuildkiteStep(**step) buildkite_step.depends_on = "bootstrap" + + # Add block step if step is in blocklist if buildkite_step.key in STEPS_TO_BLOCK: block_step = get_block_step(buildkite_step.label) buildkite_steps.append(block_step) @@ -185,9 +185,9 @@ def _process_external_hardware_steps(self) -> List[Union[BuildkiteStep, Buildkit return buildkite_steps def _mirror_amd_test_steps(self, test_steps: List[TestStep]) -> List[BuildkiteStep]: - mirrored_steps = [] + mirrored_buildkite_steps = [] for test_step in test_steps: - if "amd" in test_step.mirror_hardwares: + if test_step.mirror_hardwares and "amd" in test_step.mirror_hardwares: test_commands = [test_step.command] if test_step.command else test_step.commands amd_test_command = [ "bash", @@ -203,8 +203,8 @@ def _mirror_amd_test_steps(self, test_steps: List[TestStep]) -> List[BuildkiteSt env = {"DOCKER_BUILDKIT": "1"}, commands = [" ".join(amd_test_command)], ) - mirrored_steps.append(mirrored_buildkite_step) - return mirrored_steps + mirrored_buildkite_steps.append(mirrored_buildkite_step) + return mirrored_buildkite_steps @click.command() @click.option("--run_all", type=str) diff --git a/scripts/pipeline_generator/plugin.py b/scripts/pipeline_generator/plugin.py index 2a4a591..d36c0a2 100644 --- a/scripts/pipeline_generator/plugin.py +++ 
b/scripts/pipeline_generator/plugin.py @@ -1,11 +1,43 @@ from pydantic import BaseModel, Field from typing import List, Dict, Any, Optional -from utils import HF_HOME +from .utils import HF_HOME DOCKER_PLUGIN_NAME = "docker#v5.2.0" KUBERNETES_PLUGIN_NAME = "kubernetes" +DEFAULT_DOCKER_ENVIRONMENT_VARIBLES = [ + f"HF_HOME={HF_HOME}", + "VLLM_USAGE_SOURCE=ci-test", + "HF_TOKEN", + "BUILDKITE_ANALYTICS_TOKEN" +] +DEFAULT_DOCKER_VOLUMES = [ + "/dev/shm:/dev/shm", + f"{HF_HOME}:{HF_HOME}" +] +DEFAULT_KUBERNETES_CONTAINER_VOLUME_MOUNTS = [ + {"name": "devshm", "mountPath": "/dev/shm"}, + {"name": "hf-cache", "mountPath": HF_HOME} +] +DEFAULT_KUBERNETES_CONTAINER_ENVIRONMENT_VARIABLES = [ + {"name": "HF_HOME", "value": HF_HOME}, + {"name": "VLLM_USAGE_SOURCE", "value": "ci-test"}, + { + "name": "HF_TOKEN", + "valueFrom": { + "secretKeyRef": { + "name": "hf-token-secret", + "key": "token" + } + } + }, +] +DEFAULT_KUBERNETES_POD_VOLUMES = [ + {"name": "devshm", "emptyDir": {"medium": "Memory"}}, + {"name": "hf-cache", "hostPath": {"path": HF_HOME, "type": "Directory"}} +] +DEFAULT_KUBERNETES_NODE_SELECTOR = {"nvidia.com/gpu.product": "NVIDIA-A100-SXM4-80GB"} class DockerPluginConfig(BaseModel): """ @@ -19,16 +51,8 @@ class DockerPluginConfig(BaseModel): gpus: Optional[str] = "all" mount_buildkite_agent: Optional[bool] = Field(default=False, alias="mount-buildkite-agent") command: List[str] = Field(default_factory=list) - environment: List[str] = [ - f"HF_HOME={HF_HOME}", - "VLLM_USAGE_SOURCE=ci-test", - "HF_TOKEN", - "BUILDKITE_ANALYTICS_TOKEN" - ] - volumes: List[str] = [ - "/dev/shm:/dev/shm", - f"{HF_HOME}:{HF_HOME}" - ] + environment: List[str] = DEFAULT_DOCKER_ENVIRONMENT_VARIBLES + volumes: List[str] = DEFAULT_DOCKER_VOLUMES class KubernetesPodContainerConfig(BaseModel): @@ -40,25 +64,10 @@ class KubernetesPodContainerConfig(BaseModel): resources: Dict[str, Dict[str, int]] volume_mounts: List[Dict[str, str]] = Field( alias="volumeMounts", - default=[ - {"name": 
"devshm", "mountPath": "/dev/shm"}, - {"name": "hf-cache", "mountPath": HF_HOME} - ] + default=DEFAULT_KUBERNETES_CONTAINER_VOLUME_MOUNTS ) env: List[Dict[str, str]] = Field( - default=[ - {"name": "HF_HOME", "value": HF_HOME}, - {"name": "VLLM_USAGE_SOURCE", "value": "ci-test"}, - { - "name": "HF_TOKEN", - "valueFrom": { - "secretKeyRef": { - "name": "hf-token-secret", - "key": "token" - } - } - }, - ], + default=DEFAULT_KUBERNETES_CONTAINER_ENVIRONMENT_VARIABLES, ) @@ -69,14 +78,11 @@ class KubernetesPodSpec(BaseModel): containers: List[KubernetesPodContainerConfig] priority_class_name: str = Field(default="ci", alias="priorityClassName") node_selector: Dict[str, Any] = Field( - default={"nvidia.com/gpu.product": "NVIDIA-A100-SXM4-80GB"}, + default=DEFAULT_KUBERNETES_NODE_SELECTOR, alias="nodeSelector" ) volumes: List[Dict[str, Any]] = Field( - default=[ - {"name": "devshm", "emptyDir": {"medium": "Memory"}}, - {"name": "hf-cache", "hostPath": {"path": HF_HOME, "type": "Directory"}} - ] + default=DEFAULT_KUBERNETES_POD_VOLUMES ) @@ -88,11 +94,12 @@ class KubernetesPluginConfig(BaseModel): def get_kubernetes_plugin_config(container_image: str, test_bash_command: List[str], num_gpus: int) -> Dict: + test_bash_command[-1] = f'"{test_bash_command[-1]}"' pod_spec = KubernetesPodSpec( containers=[ KubernetesPodContainerConfig( image=container_image, - command=test_bash_command, + command=[" ".join(test_bash_command)], resources={"limits": {"nvidia.com/gpu": num_gpus}} ) ] diff --git a/scripts/pipeline_generator/step.py b/scripts/pipeline_generator/step.py index 420f6fe..07b0d29 100644 --- a/scripts/pipeline_generator/step.py +++ b/scripts/pipeline_generator/step.py @@ -1,23 +1,23 @@ from pydantic import BaseModel, Field from typing import List, Dict, Any, Optional -from utils import AgentQueue +from .utils import AgentQueue BUILD_STEP_KEY = "build" class TestStep(BaseModel): """This class represents a test step defined in the test configuration file.""" label: str - 
fast_check: bool = False - mirror_hardwares: List[str] = Field(default_factory=list) - gpu: str = "" - num_gpus: int = 1 - num_nodes: int = 1 + fast_check: Optional[bool] = None + mirror_hardwares: Optional[List[str]] = None + gpu: Optional[str] = None + num_gpus: Optional[int] = None + num_nodes: Optional[int] = None working_dir: str = "/vllm-workspace/tests" - source_file_dependencies: List[str] = Field(default_factory=list) - no_gpu: bool = False - soft_fail: bool = False - parallelism: int = 1 + source_file_dependencies: Optional[List[str]] = None + no_gpu: Optional[bool] = None + soft_fail: Optional[bool] = None + parallelism: Optional[int] = None optional: bool = False command: Optional[str] = None commands: Optional[List[str]] = None diff --git a/scripts/pipeline_generator/utils.py b/scripts/pipeline_generator/utils.py index 12a50f7..56cebf1 100644 --- a/scripts/pipeline_generator/utils.py +++ b/scripts/pipeline_generator/utils.py @@ -33,7 +33,7 @@ def get_agent_queue(no_gpu: Optional[bool], gpu_type: Optional[str], num_gpus: O return AgentQueue.AWS_SMALL_CPU if gpu_type == A100_GPU: return AgentQueue.A100 - return AgentQueue.AWS_1xL4 if num_gpus == 1 else AgentQueue.AWS_4xL4 + return AgentQueue.AWS_1xL4 if not num_gpus or num_gpus == 1 else AgentQueue.AWS_4xL4 def get_full_test_command(test_commands: List[str], step_working_dir: str) -> str: diff --git a/scripts/tests/pipeline_generator/test_pipeline_generator.py b/scripts/tests/pipeline_generator/test_pipeline_generator.py index 1d932f8..a405bb6 100644 --- a/scripts/tests/pipeline_generator/test_pipeline_generator.py +++ b/scripts/tests/pipeline_generator/test_pipeline_generator.py @@ -1,16 +1,28 @@ import pytest +import sys +from unittest import mock + from scripts.pipeline_generator.pipeline_generator import PipelineGenerator from scripts.pipeline_generator.step import TestStep, BuildkiteStep, BuildkiteBlockStep +from scripts.pipeline_generator.utils import ( + AgentQueue, + get_full_test_command, + 
VLLM_ECR_REPO, + MULTI_NODE_TEST_SCRIPT, +) +from scripts.pipeline_generator.plugin import DEFAULT_DOCKER_ENVIRONMENT_VARIBLES, DEFAULT_DOCKER_VOLUMES, DEFAULT_KUBERNETES_CONTAINER_VOLUME_MOUNTS, DEFAULT_KUBERNETES_CONTAINER_ENVIRONMENT_VARIABLES, DEFAULT_KUBERNETES_NODE_SELECTOR, DEFAULT_KUBERNETES_POD_VOLUMES TEST_COMMIT = "123456789abcdef123456789abcdef123456789a" TEST_FILE_PATH = "scripts/tests/pipeline_generator/tests.yaml" + def get_test_pipeline_generator(): pipeline_generator = PipelineGenerator(run_all=False, list_file_diff=[]) pipeline_generator.commit = TEST_COMMIT return pipeline_generator + def test_read_test_steps(): pipeline_generator = get_test_pipeline_generator() steps = pipeline_generator.read_test_steps(TEST_FILE_PATH) @@ -25,6 +37,7 @@ def test_read_test_steps(): assert steps[3].gpu == "a100" assert steps[3].optional == True + @pytest.mark.parametrize( ("test_step", "expected_plugin_config"), [ @@ -32,39 +45,303 @@ def test_read_test_steps(): TestStep( label="Test 0", source_file_dependencies=["dir1/", "dir2/file1"], - commands=["test command 1", "test command 2"] + commands=["test command 1", "test command 2"], ), { - "plugin": "docker" - } + "docker#v5.2.0": { + "image": f"{VLLM_ECR_REPO}:{TEST_COMMIT}", + "always-pull": True, + "propagate-environment": True, + "gpus": "all", + "mount-buildkite-agent": False, + "command": [ + "bash", + "-c", + "(command nvidia-smi || true);\nexport VLLM_LOGGING_LEVEL=DEBUG;\nexport VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1;\ncd /vllm-workspace/tests;\ntest command 1;\ntest command 2", + ], + "environment": DEFAULT_DOCKER_ENVIRONMENT_VARIBLES, + "volumes": DEFAULT_DOCKER_VOLUMES, + } + }, ), ( TestStep( label="Test 1", - commands=["test command 1", "test command 2"] - gpu="a100" + commands=["test command 1", "test command 2"], + gpu="a100", + num_gpus=4, ), { - "plugin": "kubernetes" - } - ) - ] + "kubernetes": { + "podSpec": { + "containers": [ + { + "image": f"{VLLM_ECR_REPO}:{TEST_COMMIT}", + "command": [ + 'bash 
-c "(command nvidia-smi || true);\nexport VLLM_LOGGING_LEVEL=DEBUG;\nexport VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1;\ncd /vllm-workspace/tests;\ntest command 1;\ntest command 2"' + ], + "resources": {"limits": {"nvidia.com/gpu": 4}}, + "volumeMounts": DEFAULT_KUBERNETES_CONTAINER_VOLUME_MOUNTS, + "env": DEFAULT_KUBERNETES_CONTAINER_ENVIRONMENT_VARIABLES, + } + ], + "priorityClassName": "ci", + "nodeSelector": DEFAULT_KUBERNETES_NODE_SELECTOR, + "volumes": DEFAULT_KUBERNETES_POD_VOLUMES, + } + } + }, + ), + ], ) -@mock.patch("scripts.pipeline_generator.pipeline_generator.get_docker_plugin_config") -@mock.patch("scripts.pipeline_generator.pipeline_generator.get_kubernetes_plugin_config") -@mock.patch("scripts.pipeline_generator.utils.get_full_test_command") -def test_get_plugin_config(mock_get_full_test_command, mock_get_kubernetes_plugin_config, mock_get_docker_plugin_config, test_step, expected_plugin_config): +def test_get_plugin_config(test_step, expected_plugin_config): pipeline_generator = get_test_pipeline_generator() - mock_get_full_test_command.return_value = "test command 1;\ntest command 2" - mock_get_docker_plugin_config.return_value = {"plugin": "docker"} - mock_get_kubernetes_plugin_config.return_value = {"plugin": "kubernetes"} container_image_path = f"{VLLM_ECR_REPO}:{TEST_COMMIT}" plugin_config = pipeline_generator.get_plugin_config(test_step) assert plugin_config == expected_plugin_config - if test_step.gpu == "a100": - assert mock_get_kubernetes_plugin_config.called_once_with(container_image_path, ) +@pytest.mark.parametrize( + ("test_step", "expected_buildkite_step"), + [ + ( + TestStep( + label="Test 0", + source_file_dependencies=["dir1/", "dir2/file1"], + commands=["test command 1", "test command 2"], + ), + BuildkiteStep( + label="Test 0", + key="test-0", + agents={"queue": AgentQueue.AWS_1xL4.value}, + plugins=[ + { + "docker#v5.2.0": { + "image": "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123456789abcdef123456789abcdef123456789a", + 
"always-pull": True, + "propagate-environment": True, + "gpus": "all", + "mount-buildkite-agent": False, + "command": [ + "bash", + "-c", + "(command nvidia-smi || true);\nexport VLLM_LOGGING_LEVEL=DEBUG;\nexport VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1;\ncd /vllm-workspace/tests;\ntest command 1;\ntest command 2", + ], + "environment": DEFAULT_DOCKER_ENVIRONMENT_VARIBLES, + "volumes": DEFAULT_DOCKER_VOLUMES, + } + } + ], + ), + ), + # A100 test + ( + TestStep( + label="Test 1", + commands=["test command 1", "test command 2"], + gpu="a100", + num_gpus=4, + ), + BuildkiteStep( + label="Test 1", + key="test-1", + agents={"queue": AgentQueue.A100.value}, + plugins=[ + { + "kubernetes": { + "podSpec": { + "containers": [ + { + "image": "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123456789abcdef123456789abcdef123456789a", + "command": [ + 'bash -c "(command nvidia-smi || true);\nexport VLLM_LOGGING_LEVEL=DEBUG;\nexport VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1;\ncd /vllm-workspace/tests;\ntest command 1;\ntest command 2"' + ], + "resources": {"limits": {"nvidia.com/gpu": 4}}, + "volumeMounts": DEFAULT_KUBERNETES_CONTAINER_VOLUME_MOUNTS, + "env": DEFAULT_KUBERNETES_CONTAINER_ENVIRONMENT_VARIABLES, + } + ], + "priorityClassName": "ci", + "nodeSelector": DEFAULT_KUBERNETES_NODE_SELECTOR, + "volumes": DEFAULT_KUBERNETES_POD_VOLUMES, + } + } + }, + ], + ), + ), + # Multi node test + ( + TestStep( + label="Test 2", + num_gpus=2, + num_nodes=2, + commands=["test command 1", "test command 2"], + working_dir="/tests", + ), + BuildkiteStep( + label="Test 2", + key="test-2", + agents={"queue": AgentQueue.AWS_4xL4.value}, + commands=[f"{MULTI_NODE_TEST_SCRIPT} /tests 2 2 {VLLM_ECR_REPO}:{TEST_COMMIT} 'test command 1' 'test command 2'"], + ), + ) + ], +) +def test_create_buildkite_step(test_step, expected_buildkite_step): + pipeline_generator = get_test_pipeline_generator() + + buildkite_step = pipeline_generator.create_buildkite_step(test_step) + assert buildkite_step == 
expected_buildkite_step + +@pytest.mark.parametrize( + ("test_step", "expected_value_without_runall", "expected_value_with_runall"), + [ + ( + TestStep( + label="Test 0", + source_file_dependencies=["dir1/", "dir2/file1"], + commands=["test command 1", "test command 2"], + ), + True, + True + ), + ( + TestStep( + label="Test 0", + commands=["test command 1", "test command 2"], + ), + True, + True + ), + ( + TestStep( + label="Test 0", + source_file_dependencies=["dir2/", "dir3/file1"], + commands=["test command 1", "test command 2"], + ), + False, + True + ), + ( + TestStep( + label="Test 1", + commands=["test command 1", "test command 2"], + gpu="a100", + optional=True, + num_gpus=4, + ), + False, + False + ), + ], +) +def test_step_should_run(test_step, expected_value_without_runall, expected_value_with_runall): + pipeline_generator = get_test_pipeline_generator() + pipeline_generator.list_file_diff = ["dir1/a.py", "dir3/file2.py"] + assert pipeline_generator.step_should_run(test_step) == expected_value_without_runall + + # With run_all + pipeline_generator.run_all = True + assert pipeline_generator.step_should_run(test_step) == expected_value_with_runall + +@pytest.mark.parametrize( + ("test_step", "expected_buildkite_steps"), + [ + # Test always run so no block step + ( + TestStep( + label="Test 0", + commands=["test command 1", "test command 2"], + ), + [ + BuildkiteStep( + label="Test 0", + key="test-0", + agents={"queue": AgentQueue.AWS_1xL4.value}, + plugins=[ + { + "docker#v5.2.0": { + "image": "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123456789abcdef123456789abcdef123456789a", + "always-pull": True, + "propagate-environment": True, + "gpus": "all", + "mount-buildkite-agent": False, + "command": [ + "bash", + "-c", + "(command nvidia-smi || true);\nexport VLLM_LOGGING_LEVEL=DEBUG;\nexport VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1;\ncd /vllm-workspace/tests;\ntest command 1;\ntest command 2", + ], + "environment": DEFAULT_DOCKER_ENVIRONMENT_VARIBLES, + 
"volumes": DEFAULT_DOCKER_VOLUMES, + } + } + ], + ), + ] + ), + # Test doesn't automatically run because dependencies are not matched -> with block step + ( + TestStep( + label="Test 0", + source_file_dependencies=["dir1/", "dir2/file1"], + commands=["test command 1", "test command 2"], + ), + [ + BuildkiteBlockStep(block="Run Test 0", key="block-test-0"), + BuildkiteStep( + label="Test 0", + key="test-0", + agents={"queue": AgentQueue.AWS_1xL4.value}, + depends_on="block-test-0", + plugins=[ + { + "docker#v5.2.0": { + "image": "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123456789abcdef123456789abcdef123456789a", + "always-pull": True, + "propagate-environment": True, + "gpus": "all", + "mount-buildkite-agent": False, + "command": [ + "bash", + "-c", + "(command nvidia-smi || true);\nexport VLLM_LOGGING_LEVEL=DEBUG;\nexport VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1;\ncd /vllm-workspace/tests;\ntest command 1;\ntest command 2", + ], + "environment": DEFAULT_DOCKER_ENVIRONMENT_VARIBLES, + "volumes": DEFAULT_DOCKER_VOLUMES, + } + } + ], + ), + ] + ) + ] +) +def test_process_step(test_step, expected_buildkite_steps): + pipeline_generator = get_test_pipeline_generator() + buildkite_steps = pipeline_generator.process_step(test_step) + assert buildkite_steps == expected_buildkite_steps + +def test_generate_build_step(): + pipeline_generator = get_test_pipeline_generator() + pipeline_generator.get_build_commands = mock.MagicMock(return_value=["build command 1", "build command 2"]) + build_step = pipeline_generator.generate_build_step() + expected_build_step = BuildkiteStep( + label=":docker: build image", + key="build", + agents={"queue": AgentQueue.AWS_CPU.value}, + env={"DOCKER_BUILDKIT": "1"}, + retry={ + "automatic": [ + {"exit_status": -1, "limit": 2}, + {"exit_status": -10, "limit": 2} + ] + }, + commands=["build command 1", "build command 2"], + depends_on=None, + ) + assert build_step == expected_build_step + if __name__ == "__main__": - sys.exit(pytest.main(["-v", 
__file__])) \ No newline at end of file + sys.exit(pytest.main(["-v", __file__])) From 1412ca5aaffc6c176336c0e9d43c5839770fac6a Mon Sep 17 00:00:00 2001 From: kevin Date: Thu, 26 Sep 2024 09:23:47 +0000 Subject: [PATCH 12/20] p Signed-off-by: kevin --- scripts/pipeline_generator/utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/pipeline_generator/utils.py b/scripts/pipeline_generator/utils.py index 56cebf1..9df6e43 100644 --- a/scripts/pipeline_generator/utils.py +++ b/scripts/pipeline_generator/utils.py @@ -10,10 +10,10 @@ A100_GPU = "a100" # File paths -TEST_PATH = ".buildkite/test-pipeline.yaml" -EXTERNAL_HARDWARE_TEST_PATH = ".buildkite/external-tests.yaml" -PIPELINE_FILE_PATH = ".buildkite/pipeline.yaml" -MULTI_NODE_TEST_SCRIPT = ".buildkite/run-multi-node-test.sh" +TEST_PATH = "./test-pipeline.yaml" +EXTERNAL_HARDWARE_TEST_PATH = "./external-tests.yaml" +PIPELINE_FILE_PATH = "./pipeline.yaml" +MULTI_NODE_TEST_SCRIPT = "./run-multi-node-test.sh" STEPS_TO_BLOCK = [] From 51e6fca79236237ddc7a74c9ee158f5110d5a44c Mon Sep 17 00:00:00 2001 From: kevin Date: Thu, 26 Sep 2024 09:40:20 +0000 Subject: [PATCH 13/20] p Signed-off-by: kevin --- scripts/ci_aws_bootstrap.sh | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/scripts/ci_aws_bootstrap.sh b/scripts/ci_aws_bootstrap.sh index 73898cd..34a1ab4 100644 --- a/scripts/ci_aws_bootstrap.sh +++ b/scripts/ci_aws_bootstrap.sh @@ -6,11 +6,9 @@ RUN_ALL=${RUN_ALL:-0} VLLM_CI_BRANCH=${VLLM_CI_BRANCH:-main} generate_pipeline() { - python -m pip install click pydantic + python -m pip install "click==8.1.7" "pydantic==2.9.2" # Download necessary files - echo "Downloading pipeline generator scripts..." 
- echo "VLLM CI Branch: $VLLM_CI_BRANCH" mkdir -p .buildkite/pipeline_generator for FILE in pipeline_generator.py plugin.py step.py utils.py __init__.py; do curl -o ".buildkite/pipeline_generator/$FILE" "https://raw.githubusercontent.com/vllm-project/buildkite-ci/$VLLM_CI_BRANCH/scripts/pipeline_generator/$FILE" @@ -18,10 +16,9 @@ generate_pipeline() { # Generate and upload pipeline cd .buildkite - python -m pipeline_generator.pipeline_generator --run_all=$RUN_ALL --list_file_diff="$LIST_FILE_DIFF" + python -m pipeline_generator.pipeline_generator --run_all="$RUN_ALL" --list_file_diff="$LIST_FILE_DIFF" cat pipeline.yaml buildkite-agent pipeline upload pipeline.yaml - exit 0 } upload_pipeline() { @@ -29,7 +26,7 @@ upload_pipeline() { ls .buildkite || buildkite-agent annotate --style error 'Please merge upstream main branch for buildkite CI' # Install minijinja-cli - curl -sSfL https://github.com/mitsuhiko/minijinja/releases/latest/download/minijinja-cli-installer.sh | sh + curl -sSfL https://github.com/mitsuhiko/minijinja/releases/download/2.3.1/minijinja-cli-installer.sh | sh source /var/lib/buildkite-agent/.cargo/env if [[ $BUILDKITE_PIPELINE_SLUG == "fastcheck" ]]; then @@ -40,9 +37,9 @@ upload_pipeline() { } handle_fastcheck() { - [ ! -e ".buildkite/test-template-fastcheck.j2" ] && \ + if [[ ! -e ".buildkite/test-template-fastcheck.j2" ]]; then curl -o .buildkite/test-template-fastcheck.j2 https://raw.githubusercontent.com/vllm-project/buildkite-ci/main/scripts/test-template-fastcheck.j2 - + fi cd .buildkite && minijinja-cli test-template-fastcheck.j2 test-pipeline.yaml > pipeline.yml cat pipeline.yml buildkite-agent pipeline upload pipeline.yml @@ -68,8 +65,8 @@ handle_regular_pipeline() { } get_diff() { - git add . - git diff --name-only --diff-filter=ACMDR $(git merge-base origin/main HEAD) + $(git add .) 
+ echo $(git diff --name-only --diff-filter=ACMDR $(git merge-base origin/main HEAD)) } get_diff_main() { From 6272ba4e444a7be8283673379d5502c1d62a8b7e Mon Sep 17 00:00:00 2001 From: kevin Date: Thu, 26 Sep 2024 09:40:45 +0000 Subject: [PATCH 14/20] p Signed-off-by: kevin --- scripts/pipeline_generator/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/pipeline_generator/utils.py b/scripts/pipeline_generator/utils.py index 9df6e43..6472b1f 100644 --- a/scripts/pipeline_generator/utils.py +++ b/scripts/pipeline_generator/utils.py @@ -13,7 +13,7 @@ TEST_PATH = "./test-pipeline.yaml" EXTERNAL_HARDWARE_TEST_PATH = "./external-tests.yaml" PIPELINE_FILE_PATH = "./pipeline.yaml" -MULTI_NODE_TEST_SCRIPT = "./run-multi-node-test.sh" +MULTI_NODE_TEST_SCRIPT = "./buildkite/run-multi-node-test.sh" STEPS_TO_BLOCK = [] From 081017ac1f665f3753f6dfbe7f820bda7b25e8c7 Mon Sep 17 00:00:00 2001 From: kevin Date: Thu, 26 Sep 2024 09:52:44 +0000 Subject: [PATCH 15/20] p Signed-off-by: kevin --- scripts/ci_aws_bootstrap.sh | 83 +++++++++++++++-------------- scripts/pipeline_generator/utils.py | 2 +- 2 files changed, 45 insertions(+), 40 deletions(-) diff --git a/scripts/ci_aws_bootstrap.sh b/scripts/ci_aws_bootstrap.sh index 34a1ab4..dbead3d 100644 --- a/scripts/ci_aws_bootstrap.sh +++ b/scripts/ci_aws_bootstrap.sh @@ -7,13 +7,13 @@ VLLM_CI_BRANCH=${VLLM_CI_BRANCH:-main} generate_pipeline() { python -m pip install "click==8.1.7" "pydantic==2.9.2" - + # Download necessary files mkdir -p .buildkite/pipeline_generator for FILE in pipeline_generator.py plugin.py step.py utils.py __init__.py; do curl -o ".buildkite/pipeline_generator/$FILE" "https://raw.githubusercontent.com/vllm-project/buildkite-ci/$VLLM_CI_BRANCH/scripts/pipeline_generator/$FILE" done - + # Generate and upload pipeline cd .buildkite python -m pipeline_generator.pipeline_generator --run_all="$RUN_ALL" --list_file_diff="$LIST_FILE_DIFF" @@ -24,43 +24,31 @@ generate_pipeline() { 
upload_pipeline() { echo "Uploading pipeline..." ls .buildkite || buildkite-agent annotate --style error 'Please merge upstream main branch for buildkite CI' - - # Install minijinja-cli - curl -sSfL https://github.com/mitsuhiko/minijinja/releases/download/2.3.1/minijinja-cli-installer.sh | sh + curl -sSfL https://github.com/mitsuhiko/minijinja/releases/latest/download/minijinja-cli-installer.sh | sh source /var/lib/buildkite-agent/.cargo/env - - if [[ $BUILDKITE_PIPELINE_SLUG == "fastcheck" ]]; then - handle_fastcheck - else - handle_regular_pipeline - fi -} - -handle_fastcheck() { - if [[ ! -e ".buildkite/test-template-fastcheck.j2" ]]; then - curl -o .buildkite/test-template-fastcheck.j2 https://raw.githubusercontent.com/vllm-project/buildkite-ci/main/scripts/test-template-fastcheck.j2 + if [ $BUILDKITE_PIPELINE_SLUG == "fastcheck" ]; then + if [ ! -e ".buildkite/test-template-fastcheck.j2" ]; then + curl -o .buildkite/test-template-fastcheck.j2 https://raw.githubusercontent.com/vllm-project/buildkite-ci/main/scripts/test-template-fastcheck.j2 + fi + cd .buildkite && minijinja-cli test-template-fastcheck.j2 test-pipeline.yaml > pipeline.yml + cat pipeline.yml + buildkite-agent pipeline upload pipeline.yml + exit 0 fi - cd .buildkite && minijinja-cli test-template-fastcheck.j2 test-pipeline.yaml > pipeline.yml - cat pipeline.yml - buildkite-agent pipeline upload pipeline.yml - exit 0 -} - -handle_regular_pipeline() { - [ ! -e ".buildkite/test-template.j2" ] && \ + if [ ! 
-e ".buildkite/test-template.j2" ]; then curl -o .buildkite/test-template.j2 https://raw.githubusercontent.com/vllm-project/buildkite-ci/main/scripts/test-template-aws.j2 - + fi if [ -e ".buildkite/pipeline_generator/pipeline_generator.py" ]; then python -m pip install click pydantic python .buildkite/pipeline_generator/pipeline_generator.py --run_all=$RUN_ALL --list_file_diff="$LIST_FILE_DIFF" buildkite-agent pipeline upload .buildkite/pipeline.yaml - else - cd .buildkite - echo "List file diff: $LIST_FILE_DIFF" - echo "Run all: $RUN_ALL" - minijinja-cli test-template.j2 test-pipeline.yaml -D list_file_diff="$LIST_FILE_DIFF" -D run_all="$RUN_ALL" > pipeline.yml - buildkite-agent pipeline upload pipeline.yml + exit 0 fi + cd .buildkite + echo "List file diff: $LIST_FILE_DIFF" + echo "Run all: $RUN_ALL" + minijinja-cli test-template.j2 test-pipeline.yaml -D list_file_diff="$LIST_FILE_DIFF" -D run_all="$RUN_ALL" > pipeline.yml + buildkite-agent pipeline upload pipeline.yml exit 0 } @@ -70,23 +58,40 @@ get_diff() { } get_diff_main() { - git add . - git diff --name-only --diff-filter=ACMDR HEAD~1 + $(git add .) + echo $(git diff --name-only --diff-filter=ACMDR HEAD~1) } -# Determine if we need to run all tests -file_diff=$([ $BUILDKITE_BRANCH == "main" ] && get_diff_main || get_diff) -patterns=(".buildkite/test-pipeline" "Dockerfile" "CMakeLists.txt" "requirements*" "setup.py" "csrc/") +file_diff=$(get_diff) +if [[ $BUILDKITE_BRANCH == "main" ]]; then + file_diff=$(get_diff_main) +fi +patterns=( + ".buildkite/test-pipeline" + "Dockerfile" + "CMakeLists.txt" + "requirements*" + "setup.py" + "csrc/" +) for file in $file_diff; do for pattern in "${patterns[@]}"; do if [[ $file == $pattern* ]] || [[ $file == $pattern ]]; then RUN_ALL=1 echo "Found changes: $file. 
Run all tests" - break 2 + break fi done done -LIST_FILE_DIFF=$(echo "$file_diff" | tr ' ' '|') -generate_pipeline \ No newline at end of file +LIST_FILE_DIFF=$(get_diff | tr ' ' '|') +if [[ $BUILDKITE_BRANCH == "main" ]]; then + LIST_FILE_DIFF=$(get_diff_main | tr ' ' '|') +fi + +if [[ $BUILDKITE_PIPELINE_SLUG == "fastcheck" ]]; then + upload_pipeline +else + generate_pipeline +fi diff --git a/scripts/pipeline_generator/utils.py b/scripts/pipeline_generator/utils.py index 6472b1f..81818bd 100644 --- a/scripts/pipeline_generator/utils.py +++ b/scripts/pipeline_generator/utils.py @@ -13,7 +13,7 @@ TEST_PATH = "./test-pipeline.yaml" EXTERNAL_HARDWARE_TEST_PATH = "./external-tests.yaml" PIPELINE_FILE_PATH = "./pipeline.yaml" -MULTI_NODE_TEST_SCRIPT = "./buildkite/run-multi-node-test.sh" +MULTI_NODE_TEST_SCRIPT = ".buildkite/run-multi-node-test.sh" STEPS_TO_BLOCK = [] From a1e30813761f460d07ed838cd8a65b37a8adf74a Mon Sep 17 00:00:00 2001 From: kevin Date: Thu, 26 Sep 2024 10:06:10 +0000 Subject: [PATCH 16/20] p Signed-off-by: kevin --- .../pipeline_generator/pipeline_generator.py | 64 +++++++++-------- scripts/pipeline_generator/plugin.py | 1 + scripts/pipeline_generator/step.py | 5 +- .../test_pipeline_generator.py | 69 ++++++++++++------- 4 files changed, 83 insertions(+), 56 deletions(-) diff --git a/scripts/pipeline_generator/pipeline_generator.py b/scripts/pipeline_generator/pipeline_generator.py index 768b6b0..573f6ff 100644 --- a/scripts/pipeline_generator/pipeline_generator.py +++ b/scripts/pipeline_generator/pipeline_generator.py @@ -22,13 +22,14 @@ get_multi_node_test_command, ) from .step import ( - TestStep, - BuildkiteStep, - BuildkiteBlockStep, - get_block_step, + TestStep, + BuildkiteStep, + BuildkiteBlockStep, + get_block_step, get_step_key ) + class PipelineGenerator: def __init__(self, run_all: bool, list_file_diff: List[str]): self.run_all = run_all @@ -47,8 +48,8 @@ def step_should_run(self, step: TestStep) -> bool: return False if not 
step.source_file_dependencies or self.run_all: return True - return any(source_file in diff_file - for source_file in step.source_file_dependencies + return any(source_file in diff_file + for source_file in step.source_file_dependencies for diff_file in self.list_file_diff) def process_step(self, step: TestStep) -> List[Union[BuildkiteStep, BuildkiteBlockStep]]: @@ -70,21 +71,25 @@ def generate_build_step(self) -> BuildkiteStep: build_commands = self.get_build_commands(docker_image) return BuildkiteStep( - label=":docker: build image", - key="build", - agents={"queue": AgentQueue.AWS_CPU.value}, - env={"DOCKER_BUILDKIT": "1"}, + label=":docker: build image", + key="build", + agents={"queue": AgentQueue.AWS_CPU.value}, + env={"DOCKER_BUILDKIT": "1"}, retry={ "automatic": [ - {"exit_status": -1, "limit": 2}, + {"exit_status": -1, "limit": 2}, {"exit_status": -10, "limit": 2} ] - }, + }, commands=build_commands, depends_on=None, ) - - def write_buildkite_steps(self, buildkite_steps: List[Union[BuildkiteStep, BuildkiteBlockStep]], output_file_path: str) -> None: + + def write_buildkite_steps( + self, + buildkite_steps: List[Union[BuildkiteStep, BuildkiteBlockStep]], + output_file_path: str + ) -> None: """Output the buildkite steps to the Buildkite pipeline yaml file.""" buildkite_steps_dict = {"steps": [step.dict(exclude_none=True) for step in buildkite_steps]} with open(output_file_path, "w") as f: @@ -119,8 +124,8 @@ def get_plugin_config(self, step: TestStep) -> Dict: def create_buildkite_step(self, step: TestStep) -> BuildkiteStep: buildkite_step = BuildkiteStep( - label=step.label, - key=get_step_key(step.label), + label=step.label, + key=get_step_key(step.label), parallelism=step.parallelism, soft_fail=step.soft_fail, plugins=[self.get_plugin_config(step)], @@ -132,10 +137,10 @@ def create_buildkite_step(self, step: TestStep) -> BuildkiteStep: def _configure_multi_node_step(self, current_step: BuildkiteStep, step: TestStep): current_step.commands = 
[get_multi_node_test_command( - step.commands, - step.working_dir, - step.num_nodes, - step.num_gpus, + step.commands, + step.working_dir, + step.num_nodes, + step.num_gpus, f"{VLLM_ECR_REPO}:{self.commit}" ) ] @@ -190,22 +195,23 @@ def _mirror_amd_test_steps(self, test_steps: List[TestStep]) -> List[BuildkiteSt if test_step.mirror_hardwares and "amd" in test_step.mirror_hardwares: test_commands = [test_step.command] if test_step.command else test_step.commands amd_test_command = [ - "bash", - ".buildkite/run-amd-test.sh", + "bash", + ".buildkite/run-amd-test.sh", f"'{get_full_test_command(test_commands, test_step.working_dir)}'", ] mirrored_buildkite_step = BuildkiteStep( - label = f"AMD: {test_step.label}", - key = f"amd_{get_step_key(test_step.label)}", - depends_on = "amd-build", - agents = {"queue": AgentQueue.AMD_GPU.value}, - soft_fail = test_step.soft_fail, - env = {"DOCKER_BUILDKIT": "1"}, - commands = [" ".join(amd_test_command)], + label=f"AMD: {test_step.label}", + key=f"amd_{get_step_key(test_step.label)}", + depends_on="amd-build", + agents={"queue": AgentQueue.AMD_GPU.value}, + soft_fail=test_step.soft_fail, + env={"DOCKER_BUILDKIT": "1"}, + commands=[" ".join(amd_test_command)], ) mirrored_buildkite_steps.append(mirrored_buildkite_step) return mirrored_buildkite_steps + @click.command() @click.option("--run_all", type=str) @click.option("--list_file_diff", type=str) diff --git a/scripts/pipeline_generator/plugin.py b/scripts/pipeline_generator/plugin.py index d36c0a2..3a79e0e 100644 --- a/scripts/pipeline_generator/plugin.py +++ b/scripts/pipeline_generator/plugin.py @@ -39,6 +39,7 @@ ] DEFAULT_KUBERNETES_NODE_SELECTOR = {"nvidia.com/gpu.product": "NVIDIA-A100-SXM4-80GB"} + class DockerPluginConfig(BaseModel): """ Configuration for Docker plugin running in a Buildkite step. 
diff --git a/scripts/pipeline_generator/step.py b/scripts/pipeline_generator/step.py index 07b0d29..bdf34ee 100644 --- a/scripts/pipeline_generator/step.py +++ b/scripts/pipeline_generator/step.py @@ -1,10 +1,11 @@ -from pydantic import BaseModel, Field +from pydantic import BaseModel from typing import List, Dict, Any, Optional from .utils import AgentQueue BUILD_STEP_KEY = "build" + class TestStep(BaseModel): """This class represents a test step defined in the test configuration file.""" label: str @@ -22,6 +23,7 @@ class TestStep(BaseModel): command: Optional[str] = None commands: Optional[List[str]] = None + class BuildkiteStep(BaseModel): """This class represents a step in Buildkite format.""" label: str @@ -35,6 +37,7 @@ class BuildkiteStep(BaseModel): env: Optional[Dict[str, str]] = None retry: Optional[Dict[str, Any]] = None + class BuildkiteBlockStep(BaseModel): """This class represents a block step in Buildkite format.""" block: str diff --git a/scripts/tests/pipeline_generator/test_pipeline_generator.py b/scripts/tests/pipeline_generator/test_pipeline_generator.py index a405bb6..40236c2 100644 --- a/scripts/tests/pipeline_generator/test_pipeline_generator.py +++ b/scripts/tests/pipeline_generator/test_pipeline_generator.py @@ -7,11 +7,17 @@ from scripts.pipeline_generator.step import TestStep, BuildkiteStep, BuildkiteBlockStep from scripts.pipeline_generator.utils import ( AgentQueue, - get_full_test_command, VLLM_ECR_REPO, MULTI_NODE_TEST_SCRIPT, ) -from scripts.pipeline_generator.plugin import DEFAULT_DOCKER_ENVIRONMENT_VARIBLES, DEFAULT_DOCKER_VOLUMES, DEFAULT_KUBERNETES_CONTAINER_VOLUME_MOUNTS, DEFAULT_KUBERNETES_CONTAINER_ENVIRONMENT_VARIABLES, DEFAULT_KUBERNETES_NODE_SELECTOR, DEFAULT_KUBERNETES_POD_VOLUMES +from scripts.pipeline_generator.plugin import ( + DEFAULT_DOCKER_ENVIRONMENT_VARIBLES, + DEFAULT_DOCKER_VOLUMES, + DEFAULT_KUBERNETES_CONTAINER_VOLUME_MOUNTS, + DEFAULT_KUBERNETES_CONTAINER_ENVIRONMENT_VARIABLES, + 
DEFAULT_KUBERNETES_NODE_SELECTOR, + DEFAULT_KUBERNETES_POD_VOLUMES, +) TEST_COMMIT = "123456789abcdef123456789abcdef123456789a" TEST_FILE_PATH = "scripts/tests/pipeline_generator/tests.yaml" @@ -35,7 +41,7 @@ def test_read_test_steps(): assert steps[2].num_gpus == 2 assert steps[2].num_nodes == 2 assert steps[3].gpu == "a100" - assert steps[3].optional == True + assert steps[3].optional is True @pytest.mark.parametrize( @@ -96,7 +102,6 @@ def test_read_test_steps(): ) def test_get_plugin_config(test_step, expected_plugin_config): pipeline_generator = get_test_pipeline_generator() - container_image_path = f"{VLLM_ECR_REPO}:{TEST_COMMIT}" plugin_config = pipeline_generator.get_plugin_config(test_step) assert plugin_config == expected_plugin_config @@ -153,7 +158,7 @@ def test_get_plugin_config(test_step, expected_plugin_config): "podSpec": { "containers": [ { - "image": "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:123456789abcdef123456789abcdef123456789a", + "image": f"{VLLM_ECR_REPO}:{TEST_COMMIT}", "command": [ 'bash -c "(command nvidia-smi || true);\nexport VLLM_LOGGING_LEVEL=DEBUG;\nexport VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1;\ncd /vllm-workspace/tests;\ntest command 1;\ntest command 2"' ], @@ -184,9 +189,11 @@ def test_get_plugin_config(test_step, expected_plugin_config): label="Test 2", key="test-2", agents={"queue": AgentQueue.AWS_4xL4.value}, - commands=[f"{MULTI_NODE_TEST_SCRIPT} /tests 2 2 {VLLM_ECR_REPO}:{TEST_COMMIT} 'test command 1' 'test command 2'"], + commands=[ + f"{MULTI_NODE_TEST_SCRIPT} /tests 2 2 {VLLM_ECR_REPO}:{TEST_COMMIT} 'test command 1' 'test command 2'" + ], ), - ) + ), ], ) def test_create_buildkite_step(test_step, expected_buildkite_step): @@ -195,6 +202,7 @@ def test_create_buildkite_step(test_step, expected_buildkite_step): buildkite_step = pipeline_generator.create_buildkite_step(test_step) assert buildkite_step == expected_buildkite_step + @pytest.mark.parametrize( ("test_step", "expected_value_without_runall", 
"expected_value_with_runall"), [ @@ -205,7 +213,7 @@ def test_create_buildkite_step(test_step, expected_buildkite_step): commands=["test command 1", "test command 2"], ), True, - True + True, ), ( TestStep( @@ -213,7 +221,7 @@ def test_create_buildkite_step(test_step, expected_buildkite_step): commands=["test command 1", "test command 2"], ), True, - True + True, ), ( TestStep( @@ -222,7 +230,7 @@ def test_create_buildkite_step(test_step, expected_buildkite_step): commands=["test command 1", "test command 2"], ), False, - True + True, ), ( TestStep( @@ -233,19 +241,24 @@ def test_create_buildkite_step(test_step, expected_buildkite_step): num_gpus=4, ), False, - False + False, ), ], ) -def test_step_should_run(test_step, expected_value_without_runall, expected_value_with_runall): +def test_step_should_run( + test_step, expected_value_without_runall, expected_value_with_runall +): pipeline_generator = get_test_pipeline_generator() pipeline_generator.list_file_diff = ["dir1/a.py", "dir3/file2.py"] - assert pipeline_generator.step_should_run(test_step) == expected_value_without_runall + assert ( + pipeline_generator.step_should_run(test_step) == expected_value_without_runall + ) - # With run_all + # With run_all pipeline_generator.run_all = True assert pipeline_generator.step_should_run(test_step) == expected_value_with_runall + @pytest.mark.parametrize( ("test_step", "expected_buildkite_steps"), [ @@ -279,7 +292,7 @@ def test_step_should_run(test_step, expected_value_without_runall, expected_valu } ], ), - ] + ], ), # Test doesn't automatically run because dependencies are not matched -> with block step ( @@ -314,34 +327,38 @@ def test_step_should_run(test_step, expected_value_without_runall, expected_valu } ], ), - ] - ) - ] + ], + ), + ], ) def test_process_step(test_step, expected_buildkite_steps): pipeline_generator = get_test_pipeline_generator() buildkite_steps = pipeline_generator.process_step(test_step) assert buildkite_steps == expected_buildkite_steps + def 
test_generate_build_step(): pipeline_generator = get_test_pipeline_generator() - pipeline_generator.get_build_commands = mock.MagicMock(return_value=["build command 1", "build command 2"]) + pipeline_generator.get_build_commands = mock.MagicMock( + return_value=["build command 1", "build command 2"] + ) build_step = pipeline_generator.generate_build_step() expected_build_step = BuildkiteStep( - label=":docker: build image", - key="build", - agents={"queue": AgentQueue.AWS_CPU.value}, - env={"DOCKER_BUILDKIT": "1"}, + label=":docker: build image", + key="build", + agents={"queue": AgentQueue.AWS_CPU.value}, + env={"DOCKER_BUILDKIT": "1"}, retry={ "automatic": [ - {"exit_status": -1, "limit": 2}, - {"exit_status": -10, "limit": 2} + {"exit_status": -1, "limit": 2}, + {"exit_status": -10, "limit": 2}, ] - }, + }, commands=["build command 1", "build command 2"], depends_on=None, ) assert build_step == expected_build_step + if __name__ == "__main__": sys.exit(pytest.main(["-v", __file__])) From be935718c44f84b61d020ff820087864baee5371 Mon Sep 17 00:00:00 2001 From: kevin Date: Thu, 26 Sep 2024 10:18:35 +0000 Subject: [PATCH 17/20] format Signed-off-by: kevin --- scripts/tests/pipeline_generator/test_pipeline_generator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/tests/pipeline_generator/test_pipeline_generator.py b/scripts/tests/pipeline_generator/test_pipeline_generator.py index 40236c2..7e16839 100644 --- a/scripts/tests/pipeline_generator/test_pipeline_generator.py +++ b/scripts/tests/pipeline_generator/test_pipeline_generator.py @@ -102,7 +102,6 @@ def test_read_test_steps(): ) def test_get_plugin_config(test_step, expected_plugin_config): pipeline_generator = get_test_pipeline_generator() - plugin_config = pipeline_generator.get_plugin_config(test_step) assert plugin_config == expected_plugin_config From 4bb37902e2d677e3dbcb801ac3aadcd07c4fe5f8 Mon Sep 17 00:00:00 2001 From: kevin Date: Thu, 26 Sep 2024 10:23:41 +0000 Subject: [PATCH 18/20] fix test 
Signed-off-by: kevin --- scripts/tests/pipeline_generator/test_plugin.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/tests/pipeline_generator/test_plugin.py b/scripts/tests/pipeline_generator/test_plugin.py index 7e85ac0..4ced8ab 100644 --- a/scripts/tests/pipeline_generator/test_plugin.py +++ b/scripts/tests/pipeline_generator/test_plugin.py @@ -11,7 +11,7 @@ def test_get_kubernetes_plugin_config(): docker_image_path = "test_image:latest" - test_bash_command = ["echo", "Hello, Kubernetes!"] + test_bash_command = ["bash", "-c", "echo A"] num_gpus = 1 expected_config = { @@ -20,7 +20,7 @@ def test_get_kubernetes_plugin_config(): "containers": [ { "image": docker_image_path, - "command": [" ".join(test_bash_command)], + "command": ['bash -c "echo A"'], "resources": {"limits": {"nvidia.com/gpu": num_gpus}}, "volumeMounts": [ {"name": "devshm", "mountPath": "/dev/shm"}, From 7dab0214968a0104423d2922917c5c309dd3145d Mon Sep 17 00:00:00 2001 From: kevin Date: Fri, 27 Sep 2024 23:19:12 +0000 Subject: [PATCH 19/20] p Signed-off-by: kevin --- scripts/ci_aws_bootstrap.sh | 2 +- .../pipeline_generator/pipeline_generator.py | 205 +++++++----------- .../pipeline_generator_helper.py | 84 +++++++ 3 files changed, 164 insertions(+), 127 deletions(-) create mode 100644 scripts/pipeline_generator/pipeline_generator_helper.py diff --git a/scripts/ci_aws_bootstrap.sh b/scripts/ci_aws_bootstrap.sh index dbead3d..74a1298 100644 --- a/scripts/ci_aws_bootstrap.sh +++ b/scripts/ci_aws_bootstrap.sh @@ -10,7 +10,7 @@ generate_pipeline() { # Download necessary files mkdir -p .buildkite/pipeline_generator - for FILE in pipeline_generator.py plugin.py step.py utils.py __init__.py; do + for FILE in pipeline_generator.py pipeline_generator_helper.py plugin.py step.py utils.py __init__.py; do curl -o ".buildkite/pipeline_generator/$FILE" "https://raw.githubusercontent.com/vllm-project/buildkite-ci/$VLLM_CI_BRANCH/scripts/pipeline_generator/$FILE" done diff --git 
a/scripts/pipeline_generator/pipeline_generator.py b/scripts/pipeline_generator/pipeline_generator.py index 573f6ff..984caed 100644 --- a/scripts/pipeline_generator/pipeline_generator.py +++ b/scripts/pipeline_generator/pipeline_generator.py @@ -2,6 +2,7 @@ import click from typing import List, Dict, Union import os +from pydantic import BaseModel from .plugin import ( get_kubernetes_plugin_config, @@ -28,47 +29,37 @@ get_block_step, get_step_key ) +from .pipeline_generator_helper import ( + step_should_run, + get_plugin_config, + create_buildkite_step, + get_build_commands, +) +class PipelineGeneratorConfig(BaseModel): + run_all: bool + list_file_diff: List[str] + container_registry: str + container_registry_repo: str + commit: str + test_path: str + external_hardware_test_path: str + pipeline_file_path: str -class PipelineGenerator: - def __init__(self, run_all: bool, list_file_diff: List[str]): - self.run_all = run_all - self.list_file_diff = list_file_diff - self.commit = os.getenv("BUILDKITE_COMMIT") - - def read_test_steps(self, file_path: str) -> List[TestStep]: - """Read test steps from test pipeline yaml and parse them into Step objects.""" - with open(file_path, "r") as f: - content = yaml.safe_load(f) - return [TestStep(**step) for step in content["steps"]] - - def step_should_run(self, step: TestStep) -> bool: - """Determine whether the step should automatically run or not.""" - if step.optional: - return False - if not step.source_file_dependencies or self.run_all: - return True - return any(source_file in diff_file - for source_file in step.source_file_dependencies - for diff_file in self.list_file_diff) - - def process_step(self, step: TestStep) -> List[Union[BuildkiteStep, BuildkiteBlockStep]]: - """Process test step and return corresponding BuildkiteStep.""" - steps = [] - current_step = self.create_buildkite_step(step) - - if not self.step_should_run(step): - block_step = get_block_step(step.label) - steps.append(block_step) - 
current_step.depends_on = block_step.key + @property + def container_image(self) -> str: + return f"{self.container_registry}/{self.container_registry_repo}:{self.commit}" - steps.append(current_step) - return steps +class PipelineGenerator: + def __init__( + self, + config: PipelineGeneratorConfig + ): + self.config = config def generate_build_step(self) -> BuildkiteStep: - """Build the Docker image and push it to ECR.""" - docker_image = f"{VLLM_ECR_REPO}:{self.commit}" - build_commands = self.get_build_commands(docker_image) + """Build the Docker image and push it to container registry.""" + build_commands = get_build_commands(self.config.container_registry, self.config.commit, self.config.container_image) return BuildkiteStep( label=":docker: build image", @@ -85,15 +76,24 @@ def generate_build_step(self) -> BuildkiteStep: depends_on=None, ) - def write_buildkite_steps( - self, - buildkite_steps: List[Union[BuildkiteStep, BuildkiteBlockStep]], - output_file_path: str - ) -> None: - """Output the buildkite steps to the Buildkite pipeline yaml file.""" - buildkite_steps_dict = {"steps": [step.dict(exclude_none=True) for step in buildkite_steps]} - with open(output_file_path, "w") as f: - yaml.dump(buildkite_steps_dict, f, sort_keys=False) + def read_test_steps(self, file_path: str) -> List[TestStep]: + """Read test steps from test pipeline yaml and parse them into Step objects.""" + with open(file_path, "r") as f: + content = yaml.safe_load(f) + return [TestStep(**step) for step in content["steps"]] + + def convert_test_step_to_buildkite_steps(self, step: TestStep) -> List[Union[BuildkiteStep, BuildkiteBlockStep]]: + """Process test step and return corresponding BuildkiteStep.""" + steps = [] + current_step = create_buildkite_step(step, self.config.container_image) + + if not step_should_run(step, self.config.run_all, self.config.list_file_diff): + block_step = get_block_step(step.label) + steps.append(block_step) + current_step.depends_on = block_step.key + + 
steps.append(current_step) + return steps def get_external_hardware_tests(self, test_steps: List[TestStep]) -> List[Union[BuildkiteStep, BuildkiteBlockStep]]: """Process the external hardware tests from the yaml file and convert to Buildkite steps.""" @@ -101,81 +101,12 @@ def get_external_hardware_tests(self, test_steps: List[TestStep]) -> List[Union[ buildkite_steps.extend(self._mirror_amd_test_steps(test_steps)) return buildkite_steps - def get_plugin_config(self, step: TestStep) -> Dict: - """Returns the plugin configuration for the step.""" - test_step_commands = [step.command] if step.command else step.commands - test_bash_command = [ - "bash", - "-c", - get_full_test_command(test_step_commands, step.working_dir) - ] - container_image = f"{VLLM_ECR_REPO}:{self.commit}" - if step.gpu == A100_GPU: - return get_kubernetes_plugin_config( - container_image, - test_bash_command, - step.num_gpus - ) - return get_docker_plugin_config( - container_image, - test_bash_command, - step.no_gpu - ) - - def create_buildkite_step(self, step: TestStep) -> BuildkiteStep: - buildkite_step = BuildkiteStep( - label=step.label, - key=get_step_key(step.label), - parallelism=step.parallelism, - soft_fail=step.soft_fail, - plugins=[self.get_plugin_config(step)], - agents={"queue": get_agent_queue(step.no_gpu, step.gpu, step.num_gpus).value} - ) - if step.num_nodes and step.num_nodes > 1: - self._configure_multi_node_step(buildkite_step, step) - return buildkite_step - - def _configure_multi_node_step(self, current_step: BuildkiteStep, step: TestStep): - current_step.commands = [get_multi_node_test_command( - step.commands, - step.working_dir, - step.num_nodes, - step.num_gpus, - f"{VLLM_ECR_REPO}:{self.commit}" - ) - ] - current_step.plugins = None - - def get_build_commands(self, docker_image: str) -> List[str]: - ecr_login_command = ( - "aws ecr-public get-login-password --region us-east-1 | " - f"docker login --username AWS --password-stdin {VLLM_ECR_URL}" - ) - image_check_command 
= f"""#!/bin/bash -if [[ -z $(docker manifest inspect {docker_image}) ]]; then - echo "Image not found, proceeding with build..." -else - echo "Image found" - exit 0 -fi -""" - docker_build_command = ( - f"docker build " - f"--build-arg max_jobs=64 " - f"--build-arg buildkite_commit={self.commit} " - f"--build-arg USE_SCCACHE=1 " - f"--tag {docker_image} " - f"--target test " - f"--progress plain ." - ) - docker_push_command = f"docker push {docker_image}" - return [ecr_login_command, image_check_command, docker_build_command, docker_push_command] def _process_external_hardware_steps(self) -> List[Union[BuildkiteStep, BuildkiteBlockStep]]: with open(EXTERNAL_HARDWARE_TEST_PATH, "r") as f: content = yaml.safe_load(f) buildkite_steps = [] - amd_docker_image = f"{AMD_REPO}:{self.commit}" + amd_docker_image = f"{AMD_REPO}:{self.config.commit}" for step in content["steps"]: step["commands"] = [cmd.replace("DOCKER_IMAGE_AMD", amd_docker_image) for cmd in step["commands"]] buildkite_step = BuildkiteStep(**step) @@ -211,23 +142,45 @@ def _mirror_amd_test_steps(self, test_steps: List[TestStep]) -> List[BuildkiteSt mirrored_buildkite_steps.append(mirrored_buildkite_step) return mirrored_buildkite_steps + def write_buildkite_steps( + self, + buildkite_steps: List[Union[BuildkiteStep, BuildkiteBlockStep]], + output_file_path: str + ) -> None: + """Output the buildkite steps to the Buildkite pipeline yaml file.""" + buildkite_steps_dict = {"steps": [step.dict(exclude_none=True) for step in buildkite_steps]} + with open(output_file_path, "w") as f: + yaml.dump(buildkite_steps_dict, f, sort_keys=False) + + def generate(self): + test_steps = self.read_test_steps(self.config.test_path) + buildkite_steps = [self.generate_build_step()] + + for test_step in test_steps: + test_buildkite_steps = self.convert_test_step_to_buildkite_steps(test_step) + buildkite_steps.extend(test_buildkite_steps) + buildkite_steps.extend(self.get_external_hardware_tests(test_steps)) + + 
self.write_buildkite_steps(buildkite_steps, self.config.pipeline_file_path) + @click.command() @click.option("--run_all", type=str) @click.option("--list_file_diff", type=str) def main(run_all: str = "-1", list_file_diff: str = None): list_file_diff = list_file_diff.split("|") if list_file_diff else [] - pipeline_generator = PipelineGenerator(run_all == "1", list_file_diff) - - test_steps = pipeline_generator.read_test_steps(TEST_PATH) - - buildkite_steps = [ - pipeline_generator.generate_build_step(), - *[step for test_step in test_steps for step in pipeline_generator.process_step(test_step)], - *pipeline_generator.get_external_hardware_tests(test_steps) - ] - - pipeline_generator.write_buildkite_steps(buildkite_steps, PIPELINE_FILE_PATH) + pipeline_generator_config = PipelineGeneratorConfig( + run_all=run_all == "1", + list_file_diff=list_file_diff, + container_registry=VLLM_ECR_URL, + container_registry_repo=VLLM_ECR_REPO, + commit=os.getenv("BUILDKITE_COMMIT"), + test_path=TEST_PATH, + external_hardware_test_path=EXTERNAL_HARDWARE_TEST_PATH, + pipeline_file_path=PIPELINE_FILE_PATH + ) + pipeline_generator = PipelineGenerator(pipeline_generator_config) + pipeline_generator.generate() if __name__ == "__main__": diff --git a/scripts/pipeline_generator/pipeline_generator_helper.py b/scripts/pipeline_generator/pipeline_generator_helper.py new file mode 100644 index 0000000..17476e5 --- /dev/null +++ b/scripts/pipeline_generator/pipeline_generator_helper.py @@ -0,0 +1,84 @@ +from typing import List, Dict +from .plugin import get_kubernetes_plugin_config, get_docker_plugin_config +from .utils import get_agent_queue, get_full_test_command, get_multi_node_test_command, A100_GPU +from .step import BuildkiteStep, TestStep, get_step_key + +def step_should_run(step: TestStep, run_all: bool, list_file_diff: List[str]) -> bool: + """Determine whether the step should automatically run or not.""" + if step.optional: + return False + if not step.source_file_dependencies or 
run_all: + return True + return any(source_file in diff_file + for source_file in step.source_file_dependencies + for diff_file in list_file_diff) + +def get_plugin_config(step: TestStep, container_image: str) -> Dict: + """Returns the plugin configuration for the step.""" + test_step_commands = [step.command] if step.command else step.commands + test_bash_command = [ + "bash", + "-c", + get_full_test_command(test_step_commands, step.working_dir) + ] + if step.gpu == A100_GPU: + return get_kubernetes_plugin_config( + container_image, + test_bash_command, + step.num_gpus + ) + return get_docker_plugin_config( + container_image, + test_bash_command, + step.no_gpu + ) + + +def create_buildkite_step(step: TestStep, container_image: str) -> BuildkiteStep: + """Convert TestStep into BuildkiteStep.""" + buildkite_step = BuildkiteStep( + label=step.label, + key=get_step_key(step.label), + parallelism=step.parallelism, + soft_fail=step.soft_fail, + plugins=[get_plugin_config(step, container_image)], + agents={"queue": get_agent_queue(step.no_gpu, step.gpu, step.num_gpus).value} + ) + # If test is multi-node, configure step to run with custom script + if step.num_nodes and step.num_nodes > 1: + buildkite_step.commands = [get_multi_node_test_command( + step.commands, + step.working_dir, + step.num_nodes, + step.num_gpus, + container_image + ) + ] + buildkite_step.plugins = None + return buildkite_step + + +def get_build_commands(container_registry: str, buildkite_commit: str, container_image: str) -> List[str]: + ecr_login_command = ( + "aws ecr-public get-login-password --region us-east-1 | " + f"docker login --username AWS --password-stdin {container_registry}" + ) + image_check_command = f"""#!/bin/bash +if [[ -z $(docker manifest inspect {container_image}) ]]; then +echo "Image not found, proceeding with build..." 
+else +echo "Image found" +exit 0 +fi +""" + docker_build_command = ( + f"docker build " + f"--build-arg max_jobs=64 " + f"--build-arg buildkite_commit={buildkite_commit} " + f"--build-arg USE_SCCACHE=1 " + f"--tag {container_image} " + f"--target test " + f"--progress plain ." + ) + docker_push_command = f"docker push {container_image}" + return [ecr_login_command, image_check_command, docker_build_command, docker_push_command] \ No newline at end of file From 88e406bf1f5024fef3ec7ad56b4067d21c456f94 Mon Sep 17 00:00:00 2001 From: kevin Date: Sat, 28 Sep 2024 00:35:14 +0000 Subject: [PATCH 20/20] p Signed-off-by: kevin --- scripts/pipeline_generator/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/pipeline_generator/utils.py b/scripts/pipeline_generator/utils.py index 81818bd..8117963 100644 --- a/scripts/pipeline_generator/utils.py +++ b/scripts/pipeline_generator/utils.py @@ -5,7 +5,7 @@ HF_HOME = "/root/.cache/huggingface" DEFAULT_WORKING_DIR = "/vllm-workspace/tests" VLLM_ECR_URL = "public.ecr.aws/q9t5s3a7" -VLLM_ECR_REPO = f"{VLLM_ECR_URL}/vllm-ci-test-repo" +VLLM_ECR_REPO = f"vllm-ci-test-repo" AMD_REPO = "rocm/vllm-ci" A100_GPU = "a100"