Pipeline generator #36

Open
wants to merge 22 commits into main
28 changes: 24 additions & 4 deletions scripts/ci_aws_bootstrap.sh
@@ -2,9 +2,24 @@

set -euo pipefail

if [[ -z "${RUN_ALL:-}" ]]; then
RUN_ALL=0
fi
RUN_ALL=${RUN_ALL:-0}
VLLM_CI_BRANCH=${VLLM_CI_BRANCH:-main}

generate_pipeline() {
  python -m pip install "click==8.1.7" "pydantic==2.9.2"

  # Download necessary files
  mkdir -p .buildkite/pipeline_generator
  for FILE in pipeline_generator.py pipeline_generator_helper.py plugin.py step.py utils.py __init__.py; do
    curl -o ".buildkite/pipeline_generator/$FILE" "https://raw.githubusercontent.com/vllm-project/buildkite-ci/$VLLM_CI_BRANCH/scripts/pipeline_generator/$FILE"
  done

  # Generate and upload pipeline
  cd .buildkite
  python -m pipeline_generator.pipeline_generator --run_all="$RUN_ALL" --list_file_diff="$LIST_FILE_DIFF"
  cat pipeline.yaml
  buildkite-agent pipeline upload pipeline.yaml
}

upload_pipeline() {
echo "Uploading pipeline..."
@@ -74,4 +89,9 @@ LIST_FILE_DIFF=$(get_diff | tr ' ' '|')
if [[ $BUILDKITE_BRANCH == "main" ]]; then
LIST_FILE_DIFF=$(get_diff_main | tr ' ' '|')
fi
-upload_pipeline
+if [[ $BUILDKITE_PIPELINE_SLUG == "fastcheck" ]]; then
+  upload_pipeline
+else
+  generate_pipeline
+fi
176 changes: 171 additions & 5 deletions scripts/pipeline_generator/pipeline_generator.py
@@ -1,9 +1,39 @@
import yaml
import click
from typing import List, Dict, Union
import os
import re
-from typing import List, Optional

-from pydantic import BaseModel, field_validator
+from pydantic import BaseModel

from .plugin import (
    get_kubernetes_plugin_config,
    get_docker_plugin_config,
)
from .utils import (
    AgentQueue,
    AMD_REPO,
    TEST_PATH,
    EXTERNAL_HARDWARE_TEST_PATH,
    PIPELINE_FILE_PATH,
    STEPS_TO_BLOCK,
    VLLM_ECR_URL,
    VLLM_ECR_REPO,
    get_agent_queue,
    get_full_test_command,
    get_multi_node_test_command,
)
from .step import (
    TestStep,
    BuildkiteStep,
    BuildkiteBlockStep,
    get_block_step,
    get_step_key
)
from .pipeline_generator_helper import (
    step_should_run,
    get_plugin_config,
    create_buildkite_step,
    get_build_commands,
)

class PipelineGeneratorConfig:
    def __init__(
@@ -45,11 +75,147 @@ def validate(self):
        if not os.path.isfile(self.external_hardware_test_path):
            raise FileNotFoundError(f"External hardware test file {self.external_hardware_test_path} not found")

    def read_test_steps(self, file_path: str) -> List[TestStep]:
        """Read test steps from test pipeline yaml and parse them into TestStep objects."""
        with open(file_path, "r") as f:
            content = yaml.safe_load(f)
        return [TestStep(**step) for step in content["steps"]]

    def write_buildkite_steps(
        self,
        buildkite_steps: List[Union[BuildkiteStep, BuildkiteBlockStep]],
        output_file_path: str
    ) -> None:
        """Output the buildkite steps to the Buildkite pipeline yaml file."""
        buildkite_steps_dict = {"steps": [step.dict(exclude_none=True) for step in buildkite_steps]}
        with open(output_file_path, "w") as f:
            yaml.dump(buildkite_steps_dict, f, sort_keys=False)

class PipelineGenerator:
    def __init__(
        self,
        config: PipelineGeneratorConfig
    ):
        config.validate()
        self.config = config

    def generate_build_step(self) -> BuildkiteStep:
        """Build the Docker image and push it to container registry."""
        build_commands = get_build_commands(self.config.container_registry, self.config.commit, self.config.container_image)

        return BuildkiteStep(
            label=":docker: build image",
            key="build",
            agents={"queue": AgentQueue.AWS_CPU.value},
            env={"DOCKER_BUILDKIT": "1"},
            retry={
                "automatic": [
                    {"exit_status": -1, "limit": 2},
                    {"exit_status": -10, "limit": 2}
                ]
            },
            commands=build_commands,
            depends_on=None,
        )

    def convert_test_step_to_buildkite_steps(self, step: TestStep) -> List[Union[BuildkiteStep, BuildkiteBlockStep]]:
        """Process test step and return corresponding BuildkiteStep."""
        steps = []
        current_step = create_buildkite_step(step, self.config.container_image)

        if not step_should_run(step, self.config.run_all, self.config.list_file_diff):
            block_step = get_block_step(step.label)
            steps.append(block_step)
            current_step.depends_on = block_step.key

        steps.append(current_step)
        return steps

    def get_external_hardware_tests(self, test_steps: List[TestStep]) -> List[Union[BuildkiteStep, BuildkiteBlockStep]]:
        """Process the external hardware tests from the yaml file and convert to Buildkite steps."""
        buildkite_steps = self._process_external_hardware_steps()
        buildkite_steps.extend(self._mirror_amd_test_steps(test_steps))
        return buildkite_steps

    def _process_external_hardware_steps(self) -> List[Union[BuildkiteStep, BuildkiteBlockStep]]:
        with open(EXTERNAL_HARDWARE_TEST_PATH, "r") as f:
            content = yaml.safe_load(f)
        buildkite_steps = []
        amd_docker_image = f"{AMD_REPO}:{self.config.commit}"
        for step in content["steps"]:
            step["commands"] = [cmd.replace("DOCKER_IMAGE_AMD", amd_docker_image) for cmd in step["commands"]]
            buildkite_step = BuildkiteStep(**step)
            buildkite_step.depends_on = "bootstrap"

            # Add block step if step is in blocklist
            if buildkite_step.key in STEPS_TO_BLOCK:
                block_step = get_block_step(buildkite_step.label)
                buildkite_steps.append(block_step)
                buildkite_step.depends_on = block_step.key
            buildkite_steps.append(buildkite_step)
        return buildkite_steps

    def _mirror_amd_test_steps(self, test_steps: List[TestStep]) -> List[BuildkiteStep]:
        mirrored_buildkite_steps = []
        for test_step in test_steps:
            if test_step.mirror_hardwares and "amd" in test_step.mirror_hardwares:
                test_commands = [test_step.command] if test_step.command else test_step.commands
                amd_test_command = [
                    "bash",
                    ".buildkite/run-amd-test.sh",
                    f"'{get_full_test_command(test_commands, test_step.working_dir)}'",
                ]
                mirrored_buildkite_step = BuildkiteStep(
                    label=f"AMD: {test_step.label}",
                    key=f"amd_{get_step_key(test_step.label)}",
                    depends_on="amd-build",
                    agents={"queue": AgentQueue.AMD_GPU.value},
                    soft_fail=test_step.soft_fail,
                    env={"DOCKER_BUILDKIT": "1"},
                    commands=[" ".join(amd_test_command)],
                )
                mirrored_buildkite_steps.append(mirrored_buildkite_step)
        return mirrored_buildkite_steps


    def generate(self):
        test_steps = self.read_test_steps(self.config.test_path)
        buildkite_steps = [self.generate_build_step()]

        for test_step in test_steps:
            test_buildkite_steps = self.convert_test_step_to_buildkite_steps(test_step)
            buildkite_steps.extend(test_buildkite_steps)
        buildkite_steps.extend(self.get_external_hardware_tests(test_steps))

        self.write_buildkite_steps(buildkite_steps, self.config.pipeline_file_path)


@click.command()
@click.option("--run_all", type=str)
@click.option("--list_file_diff", type=str)
def main(run_all: str = "-1", list_file_diff: str = None):
    list_file_diff = list_file_diff.split("|") if list_file_diff else []
    pipeline_generator_config = PipelineGeneratorConfig(
        run_all=run_all == "1",
        list_file_diff=list_file_diff,
        container_registry=VLLM_ECR_URL,
        container_registry_repo=VLLM_ECR_REPO,
        commit=os.getenv("BUILDKITE_COMMIT"),
        test_path=TEST_PATH,
        external_hardware_test_path=EXTERNAL_HARDWARE_TEST_PATH,
        pipeline_file_path=PIPELINE_FILE_PATH
    )
    pipeline_generator = PipelineGenerator(pipeline_generator_config)
    pipeline_generator.generate()


if __name__ == "__main__":
main()
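
For context, a minimal sketch of driving the new CLI the same way `ci_aws_bootstrap.sh` does. It is not part of this diff: it assumes the package is importable as `pipeline_generator` (as when the bootstrap runs it from `.buildkite`), that the yaml files named in `utils.py` exist in the working directory, and the commit and file names below are illustrative.

```python
import os
from click.testing import CliRunner

from pipeline_generator.pipeline_generator import main

# BUILDKITE_COMMIT is normally set by the Buildkite agent; placeholder here.
os.environ["BUILDKITE_COMMIT"] = "abc1234"

# --run_all takes the string "0"/"1"; --list_file_diff is a "|"-separated file list.
result = CliRunner().invoke(
    main,
    ["--run_all", "0", "--list_file_diff", "vllm/engine/llm_engine.py|tests/test_basic.py"],
)
print(result.exit_code)  # 0 on success; pipeline.yaml is written to PIPELINE_FILE_PATH
```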
84 changes: 84 additions & 0 deletions scripts/pipeline_generator/pipeline_generator_helper.py
@@ -0,0 +1,84 @@
from typing import List, Dict
from .plugin import get_kubernetes_plugin_config, get_docker_plugin_config
from .utils import get_agent_queue, get_full_test_command, get_multi_node_test_command, GPUType
from .step import BuildkiteStep, TestStep, get_step_key

def step_should_run(step: TestStep, run_all: bool, list_file_diff: List[str]) -> bool:
    """Determine whether the step should automatically run or not."""
    if step.optional:
        return False
    if not step.source_file_dependencies or run_all:
        return True
    return any(source_file in diff_file
               for source_file in step.source_file_dependencies
               for diff_file in list_file_diff)

def get_plugin_config(step: TestStep, container_image: str) -> Dict:
    """Returns the plugin configuration for the step."""
    test_step_commands = [step.command] if step.command else step.commands
    test_bash_command = [
        "bash",
        "-c",
        get_full_test_command(test_step_commands, step.working_dir)
    ]
    if step.gpu == GPUType.A100:
        return get_kubernetes_plugin_config(
            container_image,
            test_bash_command,
            step.num_gpus
        )
    return get_docker_plugin_config(
        container_image,
        test_bash_command,
        step.no_gpu
    )


def create_buildkite_step(step: TestStep, container_image: str) -> BuildkiteStep:
    """Convert TestStep into BuildkiteStep."""
    buildkite_step = BuildkiteStep(
        label=step.label,
        key=get_step_key(step.label),
        parallelism=step.parallelism,
        soft_fail=step.soft_fail,
        plugins=[get_plugin_config(step, container_image)],
        agents={"queue": get_agent_queue(step.no_gpu, step.gpu, step.num_gpus).value}
    )
    # If test is multi-node, configure step to run with custom script
    if step.num_nodes and step.num_nodes > 1:
        buildkite_step.commands = [get_multi_node_test_command(
            step.commands,
            step.working_dir,
            step.num_nodes,
            step.num_gpus,
            container_image
        )]
        buildkite_step.plugins = None
    return buildkite_step


def get_build_commands(container_registry: str, buildkite_commit: str, container_image: str) -> List[str]:
    ecr_login_command = (
        "aws ecr-public get-login-password --region us-east-1 | "
        f"docker login --username AWS --password-stdin {container_registry}"
    )
    image_check_command = f"""#!/bin/bash
if [[ -z $(docker manifest inspect {container_image}) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
exit 0
fi
"""
    docker_build_command = (
        f"docker build "
        f"--build-arg max_jobs=64 "
        f"--build-arg buildkite_commit={buildkite_commit} "
        f"--build-arg USE_SCCACHE=1 "
        f"--tag {container_image} "
        f"--target test "
        f"--progress plain ."
    )
    docker_push_command = f"docker push {container_image}"
    return [ecr_login_command, image_check_command, docker_build_command, docker_push_command]
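
A small usage sketch of the gating helper above (not part of this diff). The import path assumes the package layout used by `ci_aws_bootstrap.sh`, the `TestStep` field values are illustrative, and it is assumed that `label`, `command`, and `source_file_dependencies` are the only fields needed here.

```python
from pipeline_generator.pipeline_generator_helper import step_should_run
from pipeline_generator.step import TestStep

step = TestStep(
    label="Engine Test",                       # illustrative label
    command="pytest -v -s engine",             # illustrative command
    source_file_dependencies=["vllm/engine"],  # illustrative dependency prefix
)

# A changed file containing a declared dependency substring triggers the step...
assert step_should_run(step, run_all=False, list_file_diff=["vllm/engine/llm_engine.py"])
# ...while unrelated changes leave it gated behind its manual block step.
assert not step_should_run(step, run_all=False, list_file_diff=["docs/index.rst"])
```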
1 change: 1 addition & 0 deletions scripts/pipeline_generator/plugin.py
@@ -95,6 +95,7 @@ class KubernetesPluginConfig(BaseModel):


def get_kubernetes_plugin_config(container_image: str, test_bash_command: List[str], num_gpus: int) -> Dict:
    test_bash_command[-1] = f'"{test_bash_command[-1]}"'
    pod_spec = KubernetesPodSpec(
        containers=[
            KubernetesPodContainerConfig(
8 changes: 4 additions & 4 deletions scripts/pipeline_generator/step.py
@@ -28,11 +28,11 @@ class TestStep(BaseModel):
    @classmethod
    def validate_and_convert_command(cls, values) -> Any:
        """
-        Validate that either 'command' or 'commands' is defined.
+        Validate that either 'command' or 'commands' or `plugins` is defined.
        If 'command' is defined, convert it to 'commands'.
        """
-        if not values.get("command") and not values.get("commands"):
-            raise ValueError("Either 'command' or 'commands' must be defined.")
+        if not values.get("command") and not values.get("commands") and not values.get("plugins"):
+            raise ValueError("Either 'command' or 'commands' or `plugins` must be defined.")
        if values.get("command") and values.get("commands"):
            raise ValueError("Only one of 'command' or 'commands' can be defined.")
        if values.get("command"):
@@ -59,7 +59,7 @@ class BuildkiteStep(BaseModel):
"""This class represents a step in Buildkite format."""
label: str
agents: Dict[str, AgentQueue] = {"queue": AgentQueue.AWS_CPU}
commands: List[str]
commands: Optional[List[str]] = None
key: Optional[str] = None
plugins: Optional[List[Dict]] = None
parallelism: Optional[int] = None
8 changes: 4 additions & 4 deletions scripts/pipeline_generator/utils.py
@@ -5,13 +5,13 @@
HF_HOME = "/root/.cache/huggingface"
DEFAULT_WORKING_DIR = "/vllm-workspace/tests"
VLLM_ECR_URL = "public.ecr.aws/q9t5s3a7"
VLLM_ECR_REPO = f"{VLLM_ECR_URL}/vllm-ci-test-repo"
VLLM_ECR_REPO = f"vllm-ci-test-repo"
AMD_REPO = "rocm/vllm-ci"

# File paths
TEST_PATH = ".buildkite/test-pipeline.yaml"
EXTERNAL_HARDWARE_TEST_PATH = ".buildkite/external-tests.yaml"
PIPELINE_FILE_PATH = ".buildkite/pipeline.yaml"
TEST_PATH = "./test-pipeline.yaml"
EXTERNAL_HARDWARE_TEST_PATH = "./external-tests.yaml"
PIPELINE_FILE_PATH = "./pipeline.yaml"
MULTI_NODE_TEST_SCRIPT = ".buildkite/run-multi-node-test.sh"

TEST_DEFAULT_COMMANDS = [