Pipeline generator #36

Open
wants to merge 22 commits into main
28 changes: 24 additions & 4 deletions scripts/ci_aws_bootstrap.sh
@@ -2,9 +2,24 @@

set -euo pipefail

if [[ -z "${RUN_ALL:-}" ]]; then
RUN_ALL=0
fi
RUN_ALL=${RUN_ALL:-0}
VLLM_CI_BRANCH=${VLLM_CI_BRANCH:-main}

generate_pipeline() {
  python -m pip install "click==8.1.7" "pydantic==2.9.2"

  # Download necessary files
  mkdir -p .buildkite/pipeline_generator
  for FILE in pipeline_generator.py pipeline_generator_helper.py plugin.py step.py utils.py __init__.py; do
    curl -o ".buildkite/pipeline_generator/$FILE" "https://raw.githubusercontent.com/vllm-project/buildkite-ci/$VLLM_CI_BRANCH/scripts/pipeline_generator/$FILE"
  done

  # Generate and upload pipeline
  cd .buildkite
  python -m pipeline_generator.pipeline_generator --run_all="$RUN_ALL" --list_file_diff="$LIST_FILE_DIFF"
  cat pipeline.yaml
  buildkite-agent pipeline upload pipeline.yaml
}

upload_pipeline() {
echo "Uploading pipeline..."
@@ -74,4 +89,9 @@ LIST_FILE_DIFF=$(get_diff | tr ' ' '|')
if [[ $BUILDKITE_BRANCH == "main" ]]; then
LIST_FILE_DIFF=$(get_diff_main | tr ' ' '|')
fi
-upload_pipeline
+if [[ $BUILDKITE_PIPELINE_SLUG == "fastcheck" ]]; then
+  upload_pipeline
+else
+  generate_pipeline
+fi
176 changes: 171 additions & 5 deletions scripts/pipeline_generator/pipeline_generator.py
@@ -1,9 +1,39 @@
import yaml
import click
from typing import List, Dict, Union
import os
import re
-from typing import List, Optional

-from pydantic import BaseModel, field_validator
+from pydantic import BaseModel

from .plugin import (
    get_kubernetes_plugin_config,
    get_docker_plugin_config,
)
from .utils import (
    AgentQueue,
    AMD_REPO,
    TEST_PATH,
    EXTERNAL_HARDWARE_TEST_PATH,
    PIPELINE_FILE_PATH,
    STEPS_TO_BLOCK,
    VLLM_ECR_URL,
    VLLM_ECR_REPO,
    get_agent_queue,
    get_full_test_command,
    get_multi_node_test_command,
)
from .step import (
    TestStep,
    BuildkiteStep,
    BuildkiteBlockStep,
    get_block_step,
    get_step_key
)
from .pipeline_generator_helper import (
    step_should_run,
    get_plugin_config,
    create_buildkite_step,
    get_build_commands,
)

class PipelineGeneratorConfig:
    def __init__(
@@ -45,11 +75,147 @@ def validate(self):
        if not os.path.isfile(self.external_hardware_test_path):
            raise FileNotFoundError(f"External hardware test file {self.external_hardware_test_path} not found")

    def read_test_steps(self, file_path: str) -> List[TestStep]:
        """Read test steps from test pipeline yaml and parse them into TestStep objects."""
        with open(file_path, "r") as f:
            content = yaml.safe_load(f)
        return [TestStep(**step) for step in content["steps"]]

    def write_buildkite_steps(
        self,
        buildkite_steps: List[Union[BuildkiteStep, BuildkiteBlockStep]],
        output_file_path: str
    ) -> None:
        """Output the buildkite steps to the Buildkite pipeline yaml file."""
        buildkite_steps_dict = {"steps": [step.dict(exclude_none=True) for step in buildkite_steps]}
        with open(output_file_path, "w") as f:
            yaml.dump(buildkite_steps_dict, f, sort_keys=False)

class PipelineGenerator:
    def __init__(
        self,
        config: PipelineGeneratorConfig
    ):
        config.validate()
        self.config = config

    def generate_build_step(self) -> BuildkiteStep:
        """Build the Docker image and push it to container registry."""
        build_commands = get_build_commands(self.config.container_registry, self.config.commit, self.config.container_image)

        return BuildkiteStep(
            label=":docker: build image",
            key="build",
            agents={"queue": AgentQueue.AWS_CPU.value},
            env={"DOCKER_BUILDKIT": "1"},
            retry={
                "automatic": [
                    {"exit_status": -1, "limit": 2},
                    {"exit_status": -10, "limit": 2}
                ]
            },
            commands=build_commands,
            depends_on=None,
        )

    def convert_test_step_to_buildkite_steps(self, step: TestStep) -> List[Union[BuildkiteStep, BuildkiteBlockStep]]:
        """Process test step and return corresponding BuildkiteStep."""
        steps = []
        current_step = create_buildkite_step(step, self.config.container_image)

        if not step_should_run(step, self.config.run_all, self.config.list_file_diff):
            block_step = get_block_step(step.label)
            steps.append(block_step)
            current_step.depends_on = block_step.key

        steps.append(current_step)
        return steps

    def get_external_hardware_tests(self, test_steps: List[TestStep]) -> List[Union[BuildkiteStep, BuildkiteBlockStep]]:
        """Process the external hardware tests from the yaml file and convert to Buildkite steps."""
        buildkite_steps = self._process_external_hardware_steps()
        buildkite_steps.extend(self._mirror_amd_test_steps(test_steps))
        return buildkite_steps

    def _process_external_hardware_steps(self) -> List[Union[BuildkiteStep, BuildkiteBlockStep]]:
        with open(EXTERNAL_HARDWARE_TEST_PATH, "r") as f:
            content = yaml.safe_load(f)
        buildkite_steps = []
        amd_docker_image = f"{AMD_REPO}:{self.config.commit}"
        for step in content["steps"]:
            step["commands"] = [cmd.replace("DOCKER_IMAGE_AMD", amd_docker_image) for cmd in step["commands"]]
            buildkite_step = BuildkiteStep(**step)
            buildkite_step.depends_on = "bootstrap"

            # Add block step if step is in blocklist
            if buildkite_step.key in STEPS_TO_BLOCK:
                block_step = get_block_step(buildkite_step.label)
                buildkite_steps.append(block_step)
                buildkite_step.depends_on = block_step.key
            buildkite_steps.append(buildkite_step)
        return buildkite_steps

    def _mirror_amd_test_steps(self, test_steps: List[TestStep]) -> List[BuildkiteStep]:
        mirrored_buildkite_steps = []
        for test_step in test_steps:
            if test_step.mirror_hardwares and "amd" in test_step.mirror_hardwares:
                test_commands = [test_step.command] if test_step.command else test_step.commands
                amd_test_command = [
                    "bash",
                    ".buildkite/run-amd-test.sh",
                    f"'{get_full_test_command(test_commands, test_step.working_dir)}'",
                ]
                mirrored_buildkite_step = BuildkiteStep(
                    label=f"AMD: {test_step.label}",
                    key=f"amd_{get_step_key(test_step.label)}",
                    depends_on="amd-build",
                    agents={"queue": AgentQueue.AMD_GPU.value},
                    soft_fail=test_step.soft_fail,
                    env={"DOCKER_BUILDKIT": "1"},
                    commands=[" ".join(amd_test_command)],
                )
                mirrored_buildkite_steps.append(mirrored_buildkite_step)
        return mirrored_buildkite_steps


    def generate(self):
        test_steps = self.read_test_steps(self.config.test_path)
        buildkite_steps = [self.generate_build_step()]

        for test_step in test_steps:
            test_buildkite_steps = self.convert_test_step_to_buildkite_steps(test_step)
            buildkite_steps.extend(test_buildkite_steps)
        buildkite_steps.extend(self.get_external_hardware_tests(test_steps))

        self.write_buildkite_steps(buildkite_steps, self.config.pipeline_file_path)


@click.command()
@click.option("--run_all", type=str)
@click.option("--list_file_diff", type=str)
def main(run_all: str = "-1", list_file_diff: str = None):
    list_file_diff = list_file_diff.split("|") if list_file_diff else []
    pipeline_generator_config = PipelineGeneratorConfig(
        run_all=run_all == "1",
        list_file_diff=list_file_diff,
        container_registry=VLLM_ECR_URL,
        container_registry_repo=VLLM_ECR_REPO,
        commit=os.getenv("BUILDKITE_COMMIT"),
        test_path=TEST_PATH,
        external_hardware_test_path=EXTERNAL_HARDWARE_TEST_PATH,
        pipeline_file_path=PIPELINE_FILE_PATH
    )
    pipeline_generator = PipelineGenerator(pipeline_generator_config)
    pipeline_generator.generate()


if __name__ == "__main__":
main()
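
For context, a minimal sketch of driving the new CLI the same way `ci_aws_bootstrap.sh` does. It is not part of this diff: it assumes the package is importable as `pipeline_generator` (as when the bootstrap runs it from `.buildkite`), that the yaml files named in `utils.py` exist in the working directory, and the commit and file names below are illustrative.

```python
import os
from click.testing import CliRunner

from pipeline_generator.pipeline_generator import main

# BUILDKITE_COMMIT is normally set by the Buildkite agent; placeholder here.
os.environ["BUILDKITE_COMMIT"] = "abc1234"

# --run_all takes the string "0"/"1"; --list_file_diff is a "|"-separated file list.
result = CliRunner().invoke(
    main,
    ["--run_all", "0", "--list_file_diff", "vllm/engine/llm_engine.py|tests/test_basic.py"],
)
print(result.exit_code)  # 0 on success; pipeline.yaml is written to PIPELINE_FILE_PATH
```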
84 changes: 84 additions & 0 deletions scripts/pipeline_generator/pipeline_generator_helper.py
@@ -0,0 +1,84 @@
from typing import List, Dict
from .plugin import get_kubernetes_plugin_config, get_docker_plugin_config
from .utils import get_agent_queue, get_full_test_command, get_multi_node_test_command, GPUType
from .step import BuildkiteStep, TestStep, get_step_key

def step_should_run(step: TestStep, run_all: bool, list_file_diff: List[str]) -> bool:
    """Determine whether the step should automatically run or not."""
    if step.optional:
        return False
    if not step.source_file_dependencies or run_all:
        return True
    return any(source_file in diff_file
               for source_file in step.source_file_dependencies
               for diff_file in list_file_diff)

def get_plugin_config(step: TestStep, container_image: str) -> Dict:
    """Returns the plugin configuration for the step."""
    test_step_commands = [step.command] if step.command else step.commands
    test_bash_command = [
        "bash",
        "-c",
        get_full_test_command(test_step_commands, step.working_dir)
    ]
    if step.gpu == GPUType.A100:
        return get_kubernetes_plugin_config(
            container_image,
            test_bash_command,
            step.num_gpus
        )
    return get_docker_plugin_config(
        container_image,
        test_bash_command,
        step.no_gpu
    )


def create_buildkite_step(step: TestStep, container_image: str) -> BuildkiteStep:
    """Convert TestStep into BuildkiteStep."""
    buildkite_step = BuildkiteStep(
        label=step.label,
        key=get_step_key(step.label),
        parallelism=step.parallelism,
        soft_fail=step.soft_fail,
        plugins=[get_plugin_config(step, container_image)],
        agents={"queue": get_agent_queue(step.no_gpu, step.gpu, step.num_gpus).value}
    )
    # If test is multi-node, configure step to run with custom script
    if step.num_nodes and step.num_nodes > 1:
        buildkite_step.commands = [get_multi_node_test_command(
            step.commands,
            step.working_dir,
            step.num_nodes,
            step.num_gpus,
            container_image
        )]
        buildkite_step.plugins = None
    return buildkite_step


def get_build_commands(container_registry: str, buildkite_commit: str, container_image: str) -> List[str]:
    ecr_login_command = (
        "aws ecr-public get-login-password --region us-east-1 | "
        f"docker login --username AWS --password-stdin {container_registry}"
    )
    image_check_command = f"""#!/bin/bash
if [[ -z $(docker manifest inspect {container_image}) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
exit 0
fi
"""
    docker_build_command = (
        f"docker build "
        f"--build-arg max_jobs=64 "
        f"--build-arg buildkite_commit={buildkite_commit} "
        f"--build-arg USE_SCCACHE=1 "
        f"--tag {container_image} "
        f"--target test "
        f"--progress plain ."
    )
    docker_push_command = f"docker push {container_image}"
    return [ecr_login_command, image_check_command, docker_build_command, docker_push_command]
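
A small usage sketch of the gating helper above (not part of this diff). The import path assumes the package layout used by `ci_aws_bootstrap.sh`, the `TestStep` field values are illustrative, and it is assumed that `label`, `command`, and `source_file_dependencies` are the only fields needed here.

```python
from pipeline_generator.pipeline_generator_helper import step_should_run
from pipeline_generator.step import TestStep

step = TestStep(
    label="Engine Test",                       # illustrative label
    command="pytest -v -s engine",             # illustrative command
    source_file_dependencies=["vllm/engine"],  # illustrative dependency prefix
)

# A changed file containing a declared dependency substring triggers the step...
assert step_should_run(step, run_all=False, list_file_diff=["vllm/engine/llm_engine.py"])
# ...while unrelated changes leave it gated behind its manual block step.
assert not step_should_run(step, run_all=False, list_file_diff=["docs/index.rst"])
```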
1 change: 1 addition & 0 deletions scripts/pipeline_generator/plugin.py
@@ -95,6 +95,7 @@ class KubernetesPluginConfig(BaseModel):


def get_kubernetes_plugin_config(container_image: str, test_bash_command: List[str], num_gpus: int) -> Dict:
    test_bash_command[-1] = f'"{test_bash_command[-1]}"'
    pod_spec = KubernetesPodSpec(
        containers=[
            KubernetesPodContainerConfig(
8 changes: 4 additions & 4 deletions scripts/pipeline_generator/step.py
@@ -28,11 +28,11 @@ class TestStep(BaseModel):
    @classmethod
    def validate_and_convert_command(cls, values) -> Any:
        """
-        Validate that either 'command' or 'commands' is defined.
+        Validate that either 'command' or 'commands' or `plugins` is defined.
        If 'command' is defined, convert it to 'commands'.
        """
-        if not values.get("command") and not values.get("commands"):
-            raise ValueError("Either 'command' or 'commands' must be defined.")
+        if not values.get("command") and not values.get("commands") and not values.get("plugins"):
+            raise ValueError("Either 'command' or 'commands' or `plugins` must be defined.")
        if values.get("command") and values.get("commands"):
            raise ValueError("Only one of 'command' or 'commands' can be defined.")
        if values.get("command"):
@@ -59,7 +59,7 @@ class BuildkiteStep(BaseModel):
"""This class represents a step in Buildkite format."""
label: str
agents: Dict[str, AgentQueue] = {"queue": AgentQueue.AWS_CPU}
commands: List[str]
commands: Optional[List[str]] = None
key: Optional[str] = None
plugins: Optional[List[Dict]] = None
parallelism: Optional[int] = None
8 changes: 4 additions & 4 deletions scripts/pipeline_generator/utils.py
@@ -5,13 +5,13 @@
HF_HOME = "/root/.cache/huggingface"
DEFAULT_WORKING_DIR = "/vllm-workspace/tests"
VLLM_ECR_URL = "public.ecr.aws/q9t5s3a7"
VLLM_ECR_REPO = f"{VLLM_ECR_URL}/vllm-ci-test-repo"
VLLM_ECR_REPO = f"vllm-ci-test-repo"
AMD_REPO = "rocm/vllm-ci"

# File paths
TEST_PATH = ".buildkite/test-pipeline.yaml"
EXTERNAL_HARDWARE_TEST_PATH = ".buildkite/external-tests.yaml"
PIPELINE_FILE_PATH = ".buildkite/pipeline.yaml"
TEST_PATH = "./test-pipeline.yaml"
EXTERNAL_HARDWARE_TEST_PATH = "./external-tests.yaml"
PIPELINE_FILE_PATH = "./pipeline.yaml"
MULTI_NODE_TEST_SCRIPT = ".buildkite/run-multi-node-test.sh"

TEST_DEFAULT_COMMANDS = [