diff --git a/.github/workflows/build_linux_jax_wheels.yml b/.github/workflows/build_linux_jax_wheels.yml
new file mode 100644
index 000000000..8b4f18ae5
--- /dev/null
+++ b/.github/workflows/build_linux_jax_wheels.yml
@@ -0,0 +1,305 @@
+# Builds portable Linux JAX wheels, uploads them to the staging S3 prefix,
+# runs the JAX wheel tests, and copies the wheels to the release S3 prefix
+# when the `upload` env flag is 'true' after the promotion-policy script runs.
+name: Build Portable Linux JAX Wheels
+
+on:
+  workflow_call:
+    inputs:
+      amdgpu_family:
+        required: true
+        type: string
+      python_version:
+        required: true
+        type: string
+      release_type:
+        description: The type of release to build ("dev", "nightly", or "prerelease"). All developer-triggered jobs should use "dev"!
+        required: true
+        type: string
+      s3_subdir:
+        description: S3 subdirectory, not including the GPU-family
+        required: true
+        type: string
+      s3_staging_subdir:
+        description: S3 staging subdirectory, not including the GPU-family
+        required: true
+        type: string
+      rocm_version:
+        description: ROCm version to install
+        type: string
+      tar_url:
+        description: URL to TheRock tarball to build against
+        type: string
+      cloudfront_url:
+        description: CloudFront URL pointing to Python index
+        required: true
+        type: string
+      cloudfront_staging_url:
+        description: CloudFront base URL pointing to staging Python index
+        required: true
+        type: string
+      repository:
+        description: "Repository to checkout. Defaults to `ROCm/TheRock`."
+        type: string
+        default: "ROCm/TheRock"
+      ref:
+        description: "Branch, tag or SHA to checkout. Defaults to the reference or SHA that triggered the workflow."
+        type: string
+      jax_ref:
+        description: rocm-jax repository ref/branch to check out
+        type: string
+        default: rocm-jaxlib-v0.8.0
+  workflow_dispatch:
+    inputs:
+      amdgpu_family:
+        type: choice
+        options:
+          - gfx101X-dgpu
+          - gfx103X-dgpu
+          - gfx110X-all
+          - gfx1150
+          - gfx1151
+          - gfx120X-all
+          - gfx90X-dcgpu
+          - gfx94X-dcgpu
+          - gfx950-dcgpu
+        default: gfx94X-dcgpu
+      python_version:
+        required: true
+        type: string
+        default: "3.12"
+      release_type:
+        type: choice
+        description: Type of release to create. All developer-triggered jobs should use "dev"!
+        options:
+          - dev
+          - nightly
+          - prerelease
+        default: dev
+      s3_subdir:
+        description: S3 subdirectory, not including the GPU-family
+        type: string
+        default: "v2"
+      s3_staging_subdir:
+        description: S3 staging subdirectory, not including the GPU-family
+        type: string
+        default: "v2-staging"
+      rocm_version:
+        description: ROCm version to install
+        type: string
+      tar_url:
+        description: URL to TheRock tarball to build against
+        type: string
+      cloudfront_url:
+        description: CloudFront base URL pointing to Python index
+        type: string
+        default: "https://rocm.devreleases.amd.com/v2"
+      cloudfront_staging_url:
+        description: CloudFront base URL pointing to staging Python index
+        type: string
+        default: "https://rocm.devreleases.amd.com/v2-staging"
+      jax_ref:
+        description: rocm-jax repository ref/branch to check out
+        type: string
+        default: rocm-jaxlib-v0.8.0
+
+permissions:
+  id-token: write
+  contents: read
+
+run-name: Build Linux JAX Wheels (${{ inputs.amdgpu_family }}, ${{ inputs.python_version }}, ${{ inputs.release_type }})
+
+jobs:
+  build_jax_wheels:
+    strategy:
+      matrix:
+        # Build the same rocm-jax ref that is handed to the test workflow below.
+        jax_ref: ["${{ inputs.jax_ref || 'rocm-jaxlib-v0.8.0' }}"]
+    name: Build Linux JAX Wheels | ${{ inputs.amdgpu_family }} | Python ${{ inputs.python_version }}
+    runs-on: ${{ github.repository_owner == 'ROCm' && 'azure-linux-scale-rocm' || 'ubuntu-24.04' }}
+    env:
+      PACKAGE_DIST_DIR: ${{ github.workspace }}/jax/jax_rocm_plugin/wheelhouse
+      S3_BUCKET_PY: "therock-${{ inputs.release_type }}-python"
+    outputs:
+      # NOTE(review): `cp_version` is read from the job env; presumably exported
+      # via GITHUB_ENV by python_to_cp_version.py - confirm.
+      cp_version: ${{ env.cp_version }}
+      jax_version: ${{ steps.extract_jax_version.outputs.jax_version }}
+    steps:
+      - name: Checkout TheRock
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        with:
+          repository: ${{ inputs.repository || github.repository }}
+          ref: ${{ inputs.ref || '' }}
+
+      - name: Checkout JAX
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        with:
+          path: jax
+          repository: rocm/rocm-jax
+          ref: ${{ matrix.jax_ref }}
+
+      - name: Configure Git Identity
+        run: |
+          git config --global user.name "therockbot"
+          git config --global user.email "therockbot@amd.com"
+
+      - name: "Setting up Python"
+        uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
+        with:
+          python-version: ${{ inputs.python_version }}
+
+      - name: Select Python version
+        run: |
+          python build_tools/github_actions/python_to_cp_version.py \
+            --python-version ${{ inputs.python_version }}
+
+      - name: Build JAX Wheels
+        env:
+          ROCM_VERSION: ${{ inputs.rocm_version }}
+        run: |
+          ls -lah
+          pushd jax
+          python3 build/ci_build \
+            --compiler=clang \
+            --python-versions="${{ inputs.python_version }}" \
+            --rocm-version="${ROCM_VERSION}" \
+            --therock-path="${{ inputs.tar_url }}" \
+            dist_wheels
+
+      - name: Extract JAX version
+        id: extract_jax_version
+        run: |
+          # Extract JAX version from requirements.txt (e.g., "jax==0.8.0")
+          # Remove all whitespace from requirements.txt to simplify parsing
+          # Search for lines starting with "jax==" or "jaxlib==" followed by version (excluding comments)
+          # Extract the version number by splitting on '=' and taking the 3rd field
+          # [^#]+ matches one or more characters that are NOT '#', ensuring we stop before any inline comments
+          JAX_VERSION=$(tr -d ' ' < jax/build/requirements.txt \
+            | grep -oE '^(jax|jaxlib)==[^#]+' | head -n1 | cut -d'=' -f3)
+          echo "jax_version=$JAX_VERSION" >> "$GITHUB_OUTPUT"
+
+      - name: Install AWS CLI
+        if: always()
+        run: bash ./dockerfiles/install_awscli.sh
+
+      - name: Configure AWS Credentials
+        if: always()
+        uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1
+        with:
+          aws-region: us-east-2
+          role-to-assume: arn:aws:iam::692859939525:role/therock-${{ inputs.release_type }}-releases
+
+      - name: Upload wheels to S3
+        if: ${{ github.repository_owner == 'ROCm' }}
+        run: |
+          aws s3 cp ${{ env.PACKAGE_DIST_DIR }}/ s3://${{ env.S3_BUCKET_PY }}/${{ inputs.s3_staging_subdir }}/${{ inputs.amdgpu_family }}/ \
+            --recursive --exclude "*" --include "*.whl"
+
+      - name: (Re-)Generate Python package release index
+        if: ${{ github.repository_owner == 'ROCm' }}
+        run: |
+          python3 -m venv .venv
+          source .venv/bin/activate
+          pip3 install boto3 packaging
+          python3 ./build_tools/third_party/s3_management/manage.py ${{ inputs.s3_staging_subdir }}/${{ inputs.amdgpu_family }}
+
+  generate_target_to_run:
+    name: Generate target_to_run
+    runs-on: ubuntu-24.04
+    outputs:
+      test_runs_on: ${{ steps.configure.outputs.test-runs-on }}
+      bypass_tests_for_releases: ${{ steps.configure.outputs.bypass_tests_for_releases }}
+    steps:
+      - name: Checking out repository
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        with:
+          repository: ${{ inputs.repository || github.repository }}
+          ref: ${{ inputs.ref || '' }}
+
+      - name: Generating target to run
+        id: configure
+        env:
+          TARGET: ${{ inputs.amdgpu_family }}
+          PLATFORM: "linux"
+          # Variable comes from ROCm organization variable 'ROCM_THEROCK_TEST_RUNNERS'
+          ROCM_THEROCK_TEST_RUNNERS: ${{ vars.ROCM_THEROCK_TEST_RUNNERS }}
+          LOAD_TEST_RUNNERS_FROM_VAR: false
+        run: python ./build_tools/github_actions/configure_target_run.py
+
+  test_jax_wheels:
+    name: Test JAX wheels | ${{ inputs.amdgpu_family }} | ${{ needs.generate_target_to_run.outputs.test_runs_on }}
+    needs: [build_jax_wheels, generate_target_to_run]
+    permissions:
+      contents: read
+      packages: read
+    uses: ./.github/workflows/test_linux_jax_wheels.yml
+    with:
+      amdgpu_family: ${{ inputs.amdgpu_family }}
+      release_type: ${{ inputs.release_type }}
+      s3_subdir: ${{ inputs.s3_subdir }}
+      package_index_url: ${{ inputs.cloudfront_staging_url }}
+      rocm_version: ${{ inputs.rocm_version }}
+      tar_url: ${{ inputs.tar_url }}
+      python_version: ${{ inputs.python_version }}
+      repository: ${{ inputs.repository || github.repository }}
+      ref: ${{ inputs.ref || '' }}
+      jax_ref: ${{ inputs.jax_ref }}
+      test_runs_on: ${{ needs.generate_target_to_run.outputs.test_runs_on }}
+
+  upload_jax_wheels:
+    name: Release JAX Wheels to S3
+    needs: [build_jax_wheels, generate_target_to_run, test_jax_wheels]
+    # Runs unless the run is cancelled, even if the tests failed or were skipped;
+    # the upload steps below are gated on the `upload` env flag instead.
+    if: ${{ !cancelled() }}
+    runs-on: ubuntu-24.04
+    env:
+      S3_BUCKET_PY: "therock-${{ inputs.release_type }}-python"
+      JAX_VERSION: "${{ needs.build_jax_wheels.outputs.jax_version }}"
+      ROCM_VERSION: "${{ inputs.rocm_version }}"
+      CP_VERSION: "${{ needs.build_jax_wheels.outputs.cp_version }}"
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        with:
+          repository: ${{ inputs.repository || github.repository }}
+          ref: ${{ inputs.ref || '' }}
+
+      - name: Configure AWS Credentials
+        if: always()
+        uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1
+        with:
+          aws-region: us-east-2
+          role-to-assume: arn:aws:iam::692859939525:role/therock-${{ inputs.release_type }}-releases
+
+      - name: Determine upload flag
+        env:
+          BUILD_RESULT: ${{ needs.build_jax_wheels.result }}
+          TEST_RESULT: ${{ needs.test_jax_wheels.result }}
+          TEST_RUNS_ON: ${{ needs.generate_target_to_run.outputs.test_runs_on }}
+          BYPASS_TESTS_FOR_RELEASES: ${{ needs.generate_target_to_run.outputs.bypass_tests_for_releases }}
+        run: python ./build_tools/github_actions/promote_wheels_based_on_policy.py
+
+      - name: Copy JAX wheels from staging to release S3
+        if: ${{ env.upload == 'true' }}
+        run: |
+          echo "Copying exact tested wheels to release S3 bucket..."
+          aws s3 cp \
+            s3://${S3_BUCKET_PY}/${{ inputs.s3_staging_subdir }}/${{ inputs.amdgpu_family }}/ \
+            s3://${S3_BUCKET_PY}/${{ inputs.s3_subdir }}/${{ inputs.amdgpu_family }}/ \
+            --recursive \
+            --exclude "*" \
+            --include "jaxlib-${JAX_VERSION}+rocm${ROCM_VERSION}-${CP_VERSION}-manylinux_2_27_x86_64.whl" \
+            --include "jax_rocm7_plugin-${JAX_VERSION}+rocm${ROCM_VERSION}-${CP_VERSION}-manylinux_2_28_x86_64.whl" \
+            --include "jax_rocm7_pjrt-${JAX_VERSION}+rocm${ROCM_VERSION}-py3-none-manylinux_2_28_x86_64.whl"
+
+      - name: (Re-)Generate Python package release index
+        if: ${{ env.upload == 'true' }}
+        env:
+          # Environment variables to be set for `manage.py`
+          CUSTOM_PREFIX: "${{ inputs.s3_subdir }}/${{ inputs.amdgpu_family }}"
+        run: |
+          pip install boto3 packaging
+          python ./build_tools/third_party/s3_management/manage.py ${{ env.CUSTOM_PREFIX }}
diff --git a/.github/workflows/build_native_linux_packages.yml b/.github/workflows/build_native_linux_packages.yml
new file mode 100644
index 000000000..ead640630
--- /dev/null
+++ b/.github/workflows/build_native_linux_packages.yml
@@ -0,0 +1,138 @@
+# Builds native Linux (deb or rpm) packages from previously built artifacts
+# and uploads the resulting package repo to S3.
+name: Build Native Linux Packages
+
+on:
+  workflow_call:
+    inputs:
+      artifact_group:
+        description: gfx arch group for the s3 server
+        type: string
+        default: gfx94X-dcgpu
+      artifact_run_id:
+        description: workflow run id to download the artifacts from.
+        required: true
+        type: string
+      rocm_version:
+        description: ROCm version to append to the package (8.0.0, 8.0.1rc1, ...).
+        required: true
+        type: string
+      native_package_type:
+        description: Specify whether debian or rpm packages are needed (deb or rpm).
+        required: true
+        type: string
+      package_suffix:
+        description: The suffix to be added to package name (asan, static or rpath).
+        required: false
+        type: string
+      release_type:
+        description: The type of release to build ("dev", "nightly", or "prerelease"). All developer-triggered jobs should use "dev"!
+        required: false
+        type: string
+  workflow_dispatch:
+    inputs:
+      artifact_group:
+        type: string
+        default: gfx94X-dcgpu
+      artifact_run_id:
+        description: workflow run id to download the artifacts from
+        type: string
+      rocm_version:
+        description: ROCm version to append to the package (8.0.0, 8.0.1rc1, ...).
+        type: string
+        default: "0.0.1"
+      native_package_type:
+        description: Specify whether debian or rpm packages are needed (deb or rpm).
+        required: true
+        type: choice
+        options:
+          - rpm
+          - deb
+        default: "rpm"
+      package_suffix:
+        description: The suffix to be added to package name (asan, static or rpath).
+        type: string
+        required: false
+      release_type:
+        description: The type of release to build ("dev", "nightly", or "prerelease"). All developer-triggered jobs should use "dev"!
+        type: string
+        default: "dev"
+
+permissions:
+  id-token: write
+  contents: read
+
+run-name: Build native Linux packages (${{ inputs.artifact_group }}, ${{ inputs.rocm_version }}, ${{ inputs.native_package_type }}, ${{ inputs.package_suffix }}, ${{ inputs.release_type }})
+
+jobs:
+  build_native_packages:
+    name: Build Linux native Packages
+    strategy:
+      fail-fast: false
+    runs-on: ${{ github.repository_owner == 'ROCm' && 'azure-linux-scale-rocm' || 'ubuntu-24.04' }}
+    env:
+      BUILD_IMAGE: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:583d473f263a289222c48d4b493e2956b2354a45796f09dee6f2c8ecd4504ab6
+      # Falls back to the current run when no artifact_run_id input is supplied.
+      ARTIFACT_RUN_ID: ${{ inputs.artifact_run_id || github.run_id }}
+      PACKAGE_SUFFIX: ${{ inputs.package_suffix || '' }}
+      OUTPUT_DIR: ${{ github.workspace }}/output
+      ARTIFACTS_DIR: ${{ github.workspace }}/output/artifacts
+      PACKAGE_DIST_DIR: ${{ github.workspace }}/output/packages
+      RELEASE_TYPE: ${{ inputs.release_type || '' }}
+    steps:
+      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+      - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
+        with:
+          python-version: '3.12'
+      - name: Install Python requirements
+        run: |
+          pip install pyelftools boto3 jinja2
+
+      - name: Install System requirements
+        run: |
+          # Install the needed tools for creating rpm / deb packages
+          # Also install tools for creating repo files
+          sudo apt update
+          sudo apt install -y llvm
+          sudo apt install -y rpm debhelper-compat build-essential
+          sudo apt install -y dpkg-dev createrepo-c
+
+      - name: Fetch Artifacts
+        run: |
+          echo "Fetching artifacts for build ${{ env.ARTIFACT_RUN_ID }}"
+          python ./build_tools/fetch_artifacts.py \
+            --run-id=${{ env.ARTIFACT_RUN_ID }} \
+            --run-github-repo="ROCm/TheRock" \
+            --artifact-group=${{ inputs.artifact_group }} \
+            --output-dir=${{ env.ARTIFACTS_DIR }}
+
+      - name: Build Packages
+        id: build-packages
+        run: |
+          echo "Building ${{ inputs.native_package_type }} packages for ${{ inputs.artifact_group }} ${{ env.ARTIFACT_RUN_ID }}"
+          python ./build_tools/packaging/linux/build_package.py \
+            --dest-dir ${{ env.PACKAGE_DIST_DIR }} \
+            --rocm-version ${{ inputs.rocm_version }} \
+            --target ${{ inputs.artifact_group }} \
+            --artifacts-dir ${{ env.ARTIFACTS_DIR }} \
+            --pkg-type ${{ inputs.native_package_type }} \
+            --version-suffix ${{ env.ARTIFACT_RUN_ID }}
+
+      - name: Install AWS CLI
+        run: bash ./dockerfiles/install_awscli.sh
+
+      - name: Configure AWS Credentials for non-forked repos
+        uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1
+        with:
+          aws-region: us-east-2
+          role-to-assume: arn:aws:iam::692859939525:role/therock-artifacts-external
+
+      - name: Upload Package repo to S3
+        id: upload-packages
+        run: |
+          echo "Uploading to s3 bucket"
+          python ./build_tools/packaging/linux/upload_package_repo.py \
+            --pkg-type ${{ inputs.native_package_type }} \
+            --s3-bucket therock-deb-rpm-test \
+            --amdgpu-family ${{ inputs.artifact_group }} \
+            --artifact-id ${{ env.ARTIFACT_RUN_ID }}
diff --git a/.github/workflows/build_portable_linux_artifacts.yml b/.github/workflows/build_portable_linux_artifacts.yml
new file
mode 100644
index 000000000..785229fc6
--- /dev/null
+++ b/.github/workflows/build_portable_linux_artifacts.yml
@@ -0,0 +1,236 @@
+# Builds TheRock portable Linux artifacts with the PR's compiler/hipify commit
+# and amd-mainline LLVM sources, then runs the post-build upload script.
+name: Build Portable Linux Artifacts
+
+on:
+  workflow_dispatch:
+    inputs:
+      amdgpu_families:
+        type: string
+        default: gfx94X-dcgpu
+      artifact_group:
+        type: string
+        default: gfx94X-dcgpu
+      build_variant_label:
+        type: string
+        description: "A label for the build variant (ex: 'release', 'asan')"
+        default: "release"
+      build_variant_suffix:
+        type: string
+        description: "The build variant suffix (ex: 'asan' suffix -> 'gfx94X-dcgpu-asan')"
+        default: ""
+      build_variant_cmake_preset:
+        type: string
+        description: "The name of the cmake preset to use for this build variant, matching an entry in CMakePresets.json (ex: 'linux-release-asan')"
+        default: ""
+      package_version:
+        type: string
+        default: ADHOCBUILD
+      expect_failure:
+        type: boolean
+        default: false
+      extra_cmake_options:
+        type: string
+
+  workflow_call:
+    inputs:
+      package_version:
+        type: string
+        default: ADHOCBUILD
+      amdgpu_families:
+        type: string
+      artifact_group:
+        type: string
+      build_variant_label:
+        type: string
+      build_variant_suffix:
+        type: string
+      build_variant_cmake_preset:
+        type: string
+      expect_failure:
+        type: boolean
+      extra_cmake_options:
+        type: string
+
+# See the details regarding permissions from the link:
+# https://github.com/aws-actions/configure-aws-credentials?tab=readme-ov-file#oidc
+permissions:
+  contents: read
+
+jobs:
+  build_portable_linux_artifacts:
+    name: Build (xfail ${{ inputs.expect_failure }})
+    # azure-linux-scale-rocm is used for regular CI builds
+    # azure-linux-u2404-hx176-cpu-rocm is used for CI builds that require more resources (ex: ASAN builds)
+    runs-on: ${{ inputs.build_variant_label == 'asan' && 'azure-linux-u2404-hx176-cpu-rocm' || 'azure-linux-scale-rocm' }}
+    continue-on-error: ${{ inputs.expect_failure }}
+    timeout-minutes: 720 # 12 hour timeout
+    permissions:
+      # Job-level permissions replace the workflow-level block, so `contents: read`
+      # must be repeated here alongside the OIDC token permission.
+      contents: read
+      id-token: write
+    container:
+      image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:6e8242d347af7e0c43c82d5031a3ac67b669f24898ea8dc2f1d5b7e4798b66bd
+      options: -v /runner/config:/home/awsconfig/
+    env:
+      AWS_SHARED_CREDENTIALS_FILE: /home/awsconfig/credentials.ini
+      CACHE_DIR: ${{ github.workspace }}/.container-cache
+      # The ccache.conf will be written by setup_ccache.py before this gets used.
+      CCACHE_CONFIGPATH: ${{ github.workspace }}/.ccache/ccache.conf
+      AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
+      TEATIME_FORCE_INTERACTIVE: 0
+      IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }}
+    steps:
+      - name: Checkout TheRock repository
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        with:
+          repository: "ROCm/TheRock"
+          ref: ${{ secrets.THEROCK_MAINLINE_REF }}
+          fetch-depth: 10
+
+      - name: Update Submodule Pointer to the PR
+        run: |
+          git config --global --add safe.directory $PWD
+          # Fetch the latest commit SHA from the PR branch
+          PR_SHA=${{ github.event.pull_request.head.sha }}
+          # Update the submodule pointer using cacheinfo
+          git update-index --cacheinfo 160000,$PR_SHA,compiler/hipify
+          git config --global user.email "z1-cciauto@amd.com"
+          git config --global user.name "Z1 cciauto"
+          git commit -m "Update submodule reference for compiler/hipify"
+          # Verify the pointer update
+          git submodule status
+          git submodule
+
+      - name: Install python deps
+        run: |
+          pip install -r requirements.txt
+
+      # safe.directory must be set before Runner Health Status
+      - name: Adjust git config
+        run: |
+          git config --global --add safe.directory $PWD
+          git config fetch.parallel 10
+
+      # TODO: We shouldn't be using a cache on actual release branches, but it
+      # really helps for iteration time.
+      - name: Setup ccache
+        run: |
+          ./build_tools/setup_ccache.py \
+            --config-preset "github-oss-presubmit" \
+            --dir "$(dirname $CCACHE_CONFIGPATH)" \
+            --local-path "$CACHE_DIR/ccache"
+
+      - name: Runner health status
+        run: |
+          ./build_tools/health_status.py
+
+      - name: Test build_tools
+        run: |
+          python -m pytest build_tools/tests build_tools/github_actions/tests
+
+      - name: Fetch sources
+        timeout-minutes: 30
+        run: |
+          ./build_tools/fetch_sources.py --jobs 12
+
+      - name: "Checking out repository for llvm-project"
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        with:
+          repository: "ROCm/llvm-project"
+          path: compiler/amd-llvm
+          ref: amd-mainline
+
+      - name: "Checking out repository for spirv-llvm-translator"
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        with:
+          repository: "ROCm/SPIRV-LLVM-Translator"
+          path: compiler/spirv-llvm-translator
+          ref: amd-mainline
+
+      - name: Apply patches
+        run: |
+          cp -v patches/amd-mainline/llvm-project/*.patch compiler/amd-llvm
+          cd compiler/amd-llvm
+          git log -10
+          git config --global --add safe.directory $PWD
+          find . -type f -name '*.patch' -exec git apply --check {} \;
+          find . -type f -name '*.patch' -exec git apply {} \;
+          git log -15
+          cd -
+
+      - name: TheRock and llvm SHA
+        run: |
+          git config --global --add safe.directory $PWD
+          git log --oneline -1
+          ls -l compiler/amd-llvm
+          cd compiler/amd-llvm/llvm
+          ls -l
+          git log --oneline -3
+          cd -
+
+      - name: Configure Projects
+        env:
+          cmake_preset: ${{ inputs.build_variant_cmake_preset }}
+          amdgpu_families: ${{ inputs.amdgpu_families }}
+          package_version: ${{ inputs.package_version }}
+          extra_cmake_options: ${{ inputs.extra_cmake_options }}
+          BUILD_DIR: build
+        run: |
+          python3 build_tools/github_actions/build_configure.py --manylinux
+
+      - name: Build therock-archives and therock-dist
+        run: |
+          cmake --build build --target therock-archives therock-dist -- -j32
+
+      - name: Test Packaging
+        if: ${{ github.event.repository.name == 'TheRock' }}
+        run: |
+          ctest --test-dir build --output-on-failure
+
+      - name: Report
+        if: ${{ !cancelled() }}
+        shell: bash
+        run: |
+          if [ -d "./build" ]; then
+            echo "Full SDK du:"
+            echo "------------"
+            du -h -d 1 build/dist/rocm
+            echo "Artifact Archives:"
+            echo "------------------"
+            ls -lh build/artifacts/*.tar.xz
+            echo "Artifacts:"
+            echo "----------"
+            du -h -d 1 build/artifacts
+            echo "CCache Stats:"
+            echo "-------------"
+            ccache -s -v
+            tail -v -n +1 .ccache/compiler_check_cache/* > build/logs/ccache_compiler_check_cache.log
+          else
+            echo "[ERROR] Build directory ./build does not exist. Skipping report!"
+            echo " This should only happen if the CI is cancelled before the build step."
+            exit 1
+          fi
+
+      # Analyze ninja build log to generate per-component timing report
+      - name: Analyze Build Times
+        if: ${{ !cancelled() }}
+        run: |
+          python3 build_tools/analyze_build_times.py --build-dir build
+
+      - name: Configure AWS Credentials for non-forked repos
+        if: ${{ always() && !github.event.pull_request.head.repo.fork }}
+        uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1
+        with:
+          aws-region: us-east-2
+          role-to-assume: arn:aws:iam::692859939525:role/therock-ci-external
+
+      - name: Post Build Upload
+        if: always()
+        run: |
+          python3 build_tools/github_actions/post_build_upload.py \
+            --run-id ${{ github.run_id }} \
+            --artifact-group "${{ inputs.artifact_group }}" \
+            --build-dir build \
+            --upload
diff --git a/.github/workflows/build_portable_linux_python_packages.yml b/.github/workflows/build_portable_linux_python_packages.yml
new file mode 100644
index 000000000..69390ff9f
--- /dev/null
+++ b/.github/workflows/build_portable_linux_python_packages.yml
@@ -0,0 +1,97 @@
+# Fetches TheRock artifacts from a workflow run and builds Python packages
+# from them via linux_portable_build.py (--build-python-only).
+name: Build Portable Linux Python Packages
+
+on:
+  workflow_dispatch:
+    inputs:
+      artifact_github_repo:
+        description: GitHub repository for artifact_run_id
+        type: string
+        default: ROCm/TheRock
+      artifact_run_id:
+        description: Workflow run ID to download artifacts from
+        type: string
+        default: "17865324892" # TODO: default to the most recent successful run (using a script)
+      artifact_group:
+        description: "The artifact group to build (ex: gfx94X-dcgpu, gfx101X-dgpu, gfx1151, gfx120X-all)"
+        type: string
+      package_version:
+        type: string
+  workflow_call:
+    inputs:
+      artifact_github_repo:
+        type: string
+      artifact_run_id:
+        type: string
+        default: ""
+      artifact_group:
+        type: string
+      package_version:
+        type: string
+
+permissions:
+  contents: read
+
+run-name: Build portable Linux Python Packages (${{ inputs.artifact_group }}, ${{ inputs.package_version }})
+
+jobs:
+  build:
+    name: Build Python | ${{ inputs.artifact_group }}
+    # Note: GitHub-hosted runners run out of disk space for some gpu families
+    runs-on: ${{ github.repository_owner == 'ROCm' && 'azure-linux-scale-rocm' || 'ubuntu-24.04' }}
+    env:
+      BUILD_IMAGE: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:583d473f263a289222c48d4b493e2956b2354a45796f09dee6f2c8ecd4504ab6
+      ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id != '' && inputs.artifact_run_id || github.run_id }}"
+      ARTIFACTS_DIR: "${{ github.workspace }}/artifacts"
+      PACKAGES_DIR: "${{ github.workspace }}/packages"
+      MANYLINUX: 1
+
+    steps:
+      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        with:
+          repository: "ROCm/TheRock"
+          ref: ${{ secrets.THEROCK_MAINLINE_REF }}
+      - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
+        with:
+          python-version: '3.12'
+
+      - name: Install Python requirements
+        run: pip install boto3 packaging piprepo setuptools
+
+      # Note: we could fetch "all" artifacts if we wanted to include more files
+      - name: Fetch artifacts
+        env:
+          IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }}
+        run: |
+          python ./build_tools/fetch_artifacts.py \
+            --run-github-repo=${{ inputs.artifact_github_repo }} \
+            --run-id=${{ env.ARTIFACT_RUN_ID }} \
+            --artifact-group=${{ inputs.artifact_group }} \
+            --output-dir=${{ env.ARTIFACTS_DIR }} \
+            _dev_ _lib_ _run_
+
+      - name: Build Python packages
+        run: |
+          ./build_tools/linux_portable_build.py \
+            --image=${{ env.BUILD_IMAGE }} \
+            --output-dir=${{ env.PACKAGES_DIR }} \
+            --artifact-dir=${{ env.ARTIFACTS_DIR }} \
+            --build-python-only \
+            -- \
+            "--version=${{ inputs.package_version }}"
+
+      - name: Inspect Python packages
+        run: |
+          ls -la "${{ env.PACKAGES_DIR }}"
+
+      # TODO(#1559): Sanity check (Linux can't find the directories, maybe Docker issues?)
+ + # - name: Sanity check Python packages + # run: | + # piprepo build "${{ env.PACKAGES_DIR }}/dist" + # pip install rocm[devel]==${{ inputs.package_version }} \ + # --extra-index-url "${{ env.PACKAGES_DIR }}/dist/simple/" + # rocm-sdk test + + # TODO(#1559): upload packages to artifacts S3 bucket and/or a dedicated Python packages bucket diff --git a/.github/workflows/build_portable_linux_pytorch_wheels.yml b/.github/workflows/build_portable_linux_pytorch_wheels.yml new file mode 100644 index 000000000..59a811ee6 --- /dev/null +++ b/.github/workflows/build_portable_linux_pytorch_wheels.yml @@ -0,0 +1,325 @@ +name: Build Portable Linux PyTorch Wheels + +on: + workflow_call: + inputs: + amdgpu_family: + required: true + type: string + python_version: + required: true + type: string + release_type: + description: The type of release to build ("dev", "nightly", or "prerelease"). All developer-triggered jobs should use "dev"! + required: true + type: string + s3_subdir: + description: S3 subdirectory, not including the GPU-family + required: true + type: string + s3_staging_subdir: + description: S3 staging subdirectory, not including the GPU-family + required: true + type: string + cloudfront_url: + description: CloudFront URL pointing to Python index + required: true + type: string + cloudfront_staging_url: + description: CloudFront base URL pointing to staging Python index + required: true + type: string + rocm_version: + description: ROCm version to pip install (e.g. "7.10.0a20251124") + type: string + pytorch_git_ref: + description: PyTorch ref to checkout. (typically "nightly", or "release/X.Y") + required: true + type: string + pytorch_patchset: + description: Patch directory name from where to apply existing patches. + required: true + type: string + repository: + description: "Repository to checkout. Otherwise, defaults to `github.repository`." + type: string + ref: + description: "Branch, tag or SHA to checkout. 
Defaults to the reference or SHA that triggered the workflow." + type: string + workflow_dispatch: + inputs: + amdgpu_family: + type: choice + options: + - gfx101X-dgpu + - gfx103X-dgpu + - gfx110X-all + - gfx1150 + - gfx1151 + - gfx120X-all + - gfx90X-dcgpu + - gfx94X-dcgpu + - gfx950-dcgpu + default: gfx94X-dcgpu + python_version: + required: true + type: string + default: "3.12" + release_type: + description: The type of release to build ("dev", "nightly", or "prerelease"). All developer-triggered jobs should use "dev"! + type: string + default: "dev" + s3_subdir: + description: S3 subdirectory, not including the GPU-family + type: string + default: "v2" + s3_staging_subdir: + description: S3 staging subdirectory, not including the GPU-family + type: string + default: "v2-staging" + cloudfront_url: + description: CloudFront base URL pointing to Python index + type: string + default: "https://rocm.devreleases.amd.com/v2" + cloudfront_staging_url: + description: CloudFront base URL pointing to staging Python index + type: string + default: "https://rocm.devreleases.amd.com/v2-staging" + rocm_version: + description: ROCm version to pip install (e.g. "7.10.0a20251124") + type: string + pytorch_git_ref: + description: PyTorch ref to checkout. (typically "nightly", or "release/X.Y") + required: true + type: string + default: "release/2.7" + pytorch_patchset: + description: Patch directory name from where to apply existing patches. 
+ required: true + type: string + default: "rocm_2.7" + +permissions: + id-token: write + contents: read + +run-name: Build portable Linux PyTorch Wheels (${{ inputs.amdgpu_family }}, ${{ inputs.python_version }}, ${{ inputs.release_type }}) + +jobs: + build_pytorch_wheels: + name: Build | ${{ inputs.amdgpu_family }} | py ${{ inputs.python_version }} | torch ${{ inputs.pytorch_git_ref }} + runs-on: ${{ github.repository_owner == 'ROCm' && 'azure-linux-scale-rocm' || 'ubuntu-24.04' }} + container: + image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:583d473f263a289222c48d4b493e2956b2354a45796f09dee6f2c8ecd4504ab6 + env: + OUTPUT_DIR: ${{ github.workspace }}/output + PACKAGE_DIST_DIR: ${{ github.workspace }}/output/packages/dist + S3_BUCKET_PY: "therock-${{ inputs.release_type }}-python" + optional_build_prod_arguments: "" + outputs: + cp_version: ${{ env.cp_version }} + # The following are python package versions produced by the build. The + # exact versions will depend on workflow inputs and the underlying code. + # For example: + # Inputs + # rocm_version : 7.10.0a20251120 + # pytorch_git_ref : release/2.9 + # Outputs + # torch_version : 2.9.1+rocm7.10.0a20251120 + # torchaudio_version : 2.9.0+rocm7.10.0a20251120 + # torchvision_version: 0.24.0+rocm7.10.0a20251120 + # triton_version : 3.5.1+rocm7.10.0a20251120 + # Future jobs can use these version outputs to identify newly built + # packages, for example via `pip install torch==${TORCH_VERSION}`. 
+ torch_version: ${{ steps.build-pytorch-wheels.outputs.torch_version }} + torchaudio_version: ${{ steps.build-pytorch-wheels.outputs.torchaudio_version }} + torchvision_version: ${{ steps.build-pytorch-wheels.outputs.torchvision_version }} + triton_version: ${{ steps.build-pytorch-wheels.outputs.triton_version }} + steps: + - name: Checkout + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' }} + + - name: Configure Git Identity + run: | + git config --global user.name "therockbot" + git config --global user.email "therockbot@amd.com" + + - name: Select Python version + run: | + python build_tools/github_actions/python_to_cp_version.py \ + --python-version ${{ inputs.python_version }} + + - name: Add selected Python version to PATH + run: | + python_dir="/opt/python/${{ env.cp_version }}" + if ! [ -x "${python_dir}/bin/python" ]; then + echo "ERROR: Could not find python: ${python_dir}" + exit 1 + fi + echo "${python_dir}/bin" >> "$GITHUB_PATH" + + # Checkout nightly sources from https://github.com/pytorch/pytorch + - name: Checkout PyTorch Source Repos from nightly branch + if: ${{ inputs.pytorch_git_ref == 'nightly' }} + run: | + ./external-builds/pytorch/pytorch_torch_repo.py checkout --repo-hashtag nightly + ./external-builds/pytorch/pytorch_audio_repo.py checkout --repo-hashtag nightly + ./external-builds/pytorch/pytorch_vision_repo.py checkout --repo-hashtag nightly + ./external-builds/pytorch/pytorch_triton_repo.py checkout --patch --patchset nightly + + # Checkout stable sources from https://github.com/ROCm/pytorch + - name: Checkout PyTorch Source Repos from stable branch + if: ${{ inputs.pytorch_git_ref != 'nightly' }} + run: | + ./external-builds/pytorch/pytorch_torch_repo.py checkout --gitrepo-origin https://github.com/ROCm/pytorch.git --repo-hashtag ${{ inputs.pytorch_git_ref }} --patchset ${{ inputs.pytorch_patchset }} + 
./external-builds/pytorch/pytorch_audio_repo.py checkout --require-related-commit + ./external-builds/pytorch/pytorch_vision_repo.py checkout --require-related-commit + ./external-builds/pytorch/pytorch_triton_repo.py checkout + + - name: Create pip cache directory + run: mkdir -p /tmp/pipcache + + - name: Determine optional arguments passed to `build_prod_wheels.py` + if: ${{ inputs.rocm_version }} + run: | + pip install packaging + python build_tools/github_actions/determine_version.py \ + --rocm-version ${{ inputs.rocm_version }} + + - name: Build PyTorch Wheels + id: build-pytorch-wheels + run: | + echo "Building PyTorch wheels for ${{ inputs.amdgpu_family }}" + ./external-builds/pytorch/build_prod_wheels.py \ + build \ + --install-rocm \ + --pip-cache-dir /tmp/pipcache \ + --index-url "${{ inputs.cloudfront_url }}/${{ inputs.amdgpu_family }}/" \ + --clean \ + --output-dir ${{ env.PACKAGE_DIST_DIR }} ${{ env.optional_build_prod_arguments }} + python ./build_tools/github_actions/write_torch_versions.py --dist-dir ${{ env.PACKAGE_DIST_DIR }} + + - name: Sanity Check Wheel + run: | + python external-builds/pytorch/sanity_check_wheel.py ${{ env.PACKAGE_DIST_DIR }}/ + + - name: Configure AWS Credentials + if: always() + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-${{ inputs.release_type }}-releases + + - name: Upload wheels to S3 staging + if: ${{ github.repository_owner == 'ROCm' }} + run: | + aws s3 cp ${{ env.PACKAGE_DIST_DIR }}/ s3://${{ env.S3_BUCKET_PY }}/${{ inputs.s3_staging_subdir }}/${{ inputs.amdgpu_family }}/ \ + --recursive --exclude "*" --include "*.whl" + + - name: (Re-)Generate Python package release index for staging + if: ${{ github.repository_owner == 'ROCm' }} + env: + # Environment variables to be set for `manage.py` + CUSTOM_PREFIX: "${{ inputs.s3_staging_subdir }}/${{ inputs.amdgpu_family }}" + run: | + pip 
install boto3 packaging + python ./build_tools/third_party/s3_management/manage.py ${{ env.CUSTOM_PREFIX }} + + generate_target_to_run: + name: Generate target_to_run + runs-on: ubuntu-24.04 + outputs: + test_runs_on: ${{ steps.configure.outputs.test-runs-on }} + bypass_tests_for_releases: ${{ steps.configure.outputs.bypass_tests_for_releases }} + steps: + - name: Checking out repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' }} + + - name: Generating target to run + id: configure + env: + TARGET: ${{ inputs.amdgpu_family }} + PLATFORM: "linux" + # Variable comes from ROCm organization variable 'ROCM_THEROCK_TEST_RUNNERS' + ROCM_THEROCK_TEST_RUNNERS: ${{ vars.ROCM_THEROCK_TEST_RUNNERS }} + LOAD_TEST_RUNNERS_FROM_VAR: false + run: python ./build_tools/github_actions/configure_target_run.py + + test_pytorch_wheels: + name: Test | ${{ inputs.amdgpu_family }} | ${{ needs.generate_target_to_run.outputs.test_runs_on }} + if: ${{ needs.generate_target_to_run.outputs.test_runs_on != '' }} + needs: [build_pytorch_wheels, generate_target_to_run] + uses: ./.github/workflows/test_pytorch_wheels.yml + with: + amdgpu_family: ${{ inputs.amdgpu_family }} + test_runs_on: ${{ needs.generate_target_to_run.outputs.test_runs_on }} + package_index_url: ${{ inputs.cloudfront_staging_url }} + python_version: ${{ inputs.python_version }} + torch_version: ${{ needs.build_pytorch_wheels.outputs.torch_version }} + pytorch_git_ref: ${{ inputs.pytorch_git_ref }} + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' }} + + upload_pytorch_wheels: + name: Release PyTorch Wheels to S3 + needs: [build_pytorch_wheels, generate_target_to_run, test_pytorch_wheels] + if: ${{ !cancelled() }} + runs-on: ubuntu-24.04 + env: + S3_BUCKET_PY: "therock-${{ inputs.release_type }}-python" + CP_VERSION: "${{ 
needs.build_pytorch_wheels.outputs.cp_version }}" + TORCH_VERSION: "${{ needs.build_pytorch_wheels.outputs.torch_version }}" + TORCHAUDIO_VERSION: "${{ needs.build_pytorch_wheels.outputs.torchaudio_version }}" + TORCHVISION_VERSION: "${{ needs.build_pytorch_wheels.outputs.torchvision_version }}" + TRITON_VERSION: "${{ needs.build_pytorch_wheels.outputs.triton_version }}" + + steps: + - name: Checkout + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' }} + + - name: Configure AWS Credentials + if: always() + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-${{ inputs.release_type }}-releases + + - name: Determine upload flag + env: + BUILD_RESULT: ${{ needs.build_pytorch_wheels.result }} + TEST_RESULT: ${{ needs.test_pytorch_wheels.result }} + TEST_RUNS_ON: ${{ needs.generate_target_to_run.outputs.test_runs_on }} + BYPASS_TESTS_FOR_RELEASES: ${{ needs.generate_target_to_run.outputs.bypass_tests_for_releases }} + run: python ./build_tools/github_actions/promote_wheels_based_on_policy.py + + - name: Copy PyTorch wheels from staging to release S3 + if: ${{ env.upload == 'true' }} + run: | + echo "Copying exact tested wheels to release S3 bucket..." 
+ aws s3 cp \ + s3://${S3_BUCKET_PY}/${{ inputs.s3_staging_subdir }}/${{ inputs.amdgpu_family }}/ \ + s3://${S3_BUCKET_PY}/${{ inputs.s3_subdir }}/${{ inputs.amdgpu_family }}/ \ + --recursive \ + --exclude "*" \ + --include "torch-${TORCH_VERSION}-${CP_VERSION}-linux_x86_64.whl" \ + --include "torchaudio-${TORCHAUDIO_VERSION}-${CP_VERSION}-linux_x86_64.whl" \ + --include "torchvision-${TORCHVISION_VERSION}-${CP_VERSION}-linux_x86_64.whl" \ + --include "triton-${TRITON_VERSION}-${CP_VERSION}-linux_x86_64.whl" + + - name: (Re-)Generate Python package release index + if: ${{ env.upload == 'true' }} + env: + # Environment variables to be set for `manage.py` + CUSTOM_PREFIX: "${{ inputs.s3_subdir }}/${{ inputs.amdgpu_family }}" + run: | + pip install boto3 packaging + python ./build_tools/third_party/s3_management/manage.py ${{ env.CUSTOM_PREFIX }} diff --git a/.github/workflows/build_windows_artifacts.yml b/.github/workflows/build_windows_artifacts.yml new file mode 100644 index 000000000..68ddfa76a --- /dev/null +++ b/.github/workflows/build_windows_artifacts.yml @@ -0,0 +1,230 @@ +name: Build Windows Artifacts + +on: + workflow_dispatch: + inputs: + amdgpu_families: + type: string + default: gfx1151 + artifact_group: + type: string + default: gfx1151 + build_variant_label: + type: string + description: "A label for the build variant (ex: 'release', 'asan')" + default: "release" + build_variant_suffix: + type: string + description: "The build variant suffix (ex: 'asan' suffix -> 'gfx94X-dcgpu-asan')" + default: "" + build_variant_cmake_preset: + type: string + description: "The name of the cmake preset to use for this build variant, matching an entry in CMakePresets.json (ex: 'linux-release-asan')" + default: "" + package_version: + type: string + default: ADHOCBUILD + expect_failure: + type: boolean + extra_cmake_options: + type: string + + workflow_call: + inputs: + package_version: + type: string + default: ADHOCBUILD + amdgpu_families: + type: string + 
+ artifact_group: + type: string + build_variant_label: + type: string + build_variant_suffix: + type: string + build_variant_cmake_preset: + type: string + expect_failure: + type: boolean + extra_cmake_options: + type: string + +permissions: + contents: read + +jobs: + build_windows_artifacts: + name: Build ${{ inputs.build_variant_label }} (xfail ${{ inputs.expect_failure }}) + runs-on: azure-windows-scale-rocm + continue-on-error: ${{ inputs.expect_failure }} + timeout-minutes: 720 # 12 hour timeout + permissions: + id-token: write + defaults: + run: + shell: bash + strategy: + fail-fast: true + env: + BUILD_DIR: B:\build + CACHE_DIR: "${{github.workspace}}/.cache" + CCACHE_DIR: "${{github.workspace}}/.cache/ccache" + CCACHE_MAXSIZE: "4000M" + TEATIME_FORCE_INTERACTIVE: 0 + AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }} + IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }} + steps: + - name: Checkout TheRock repository + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + repository: "ROCm/TheRock" + ref: ${{ secrets.THEROCK_MAINLINE_REF }} + fetch-depth: 10 + + - name: SHA of TheRock + run: | + git rev-parse HEAD + git log -1 + + - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 + with: + python-version: '3.12' + + - name: Install python deps + run: | + pip install -r requirements.txt + + - name: Install requirements + # The first two lines remove the default community feed and use the internal proxy feed + run: | + choco source disable -n=chocolatey + choco source add -n=internal -s http://10.0.167.96:8081/repository/choco-group/ --priority=1 + choco install --no-progress -y ccache + # ninja pinned due to a bug in the 1.13.0 release: + # https://github.com/ninja-build/ninja/issues/2616 + choco install --no-progress -y ninja --version 1.12.1 + choco install --no-progress -y strawberryperl + echo "$PATH;C:\Strawberry\c\bin" >> $GITHUB_PATH + choco install --no-progress -y awscli + choco
install --no-progress -y pkgconfiglite + echo "$PATH;C:\Program Files\Amazon\AWSCLIV2" >> $GITHUB_PATH + + - uses: iterative/setup-dvc@4bdfd2b0f6f1ad7e08afadb03b1a895c352a5239 # v2.0.0 + with: + version: '3.62.0' + + # After other installs, so MSVC get priority in the PATH. + - name: Configure MSVC + uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0 + + - name: Runner health status + run: | + ccache --zero-stats + python ./build_tools/health_status.py + + - name: Test build_tools + run: | + python -m pytest build_tools/tests build_tools/github_actions/tests + + # TODO: We shouldn't be using a cache on actual release branches, but it + # really helps for iteration time. + - name: Enable cache + uses: actions/cache/restore@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0 + with: + path: ${{ env.CACHE_DIR }} + key: windows-build-packages-v4-${{ inputs.amdgpu_families }}-${{ github.sha }} + restore-keys: | + windows-build-packages-v4-${{ inputs.amdgpu_families }}- + + - name: Fetch sources + timeout-minutes: 30 + run: | + git config fetch.parallel 10 + git config --global core.symlinks true + git config --global core.longpaths true + python ./build_tools/fetch_sources.py --jobs 12 + + - name: "Checking out repository for llvm-project" + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + path: compiler/amd-llvm + + - name: Apply patches + run: | + cp -v patches/amd-mainline/llvm-project/*.patch compiler/amd-llvm + cd compiler/amd-llvm + git config --global --add safe.directory /__w/llvm-project/llvm-project + find . -type f -name '*.patch' -exec git apply --check {} \; + find . 
-type f -name '*.patch' -exec git apply {} \; + git log -15 + cd - + + - name: Configure Projects + env: + cmake_preset: ${{ inputs.build_variant_cmake_preset }} + amdgpu_families: ${{ inputs.amdgpu_families }} + package_version: ${{ inputs.package_version }} + extra_cmake_options: ${{ inputs.extra_cmake_options }} + run: | + # clear cache before build and after download + ccache -z + python3 build_tools/github_actions/build_configure.py + + - name: Build therock-archives and therock-dist + run: cmake --build "${{ env.BUILD_DIR }}" --target therock-archives therock-dist -- -k 0 + + - name: Report + if: ${{ !cancelled() }} + shell: bash + run: | + if [ -d "${{ env.BUILD_DIR }}" ]; then + echo "Build dir:" + echo "------------" + ls -lh "${{ env.BUILD_DIR }}" + echo "Artifact Archives:" + echo "------------------" + ls -lh "${{ env.BUILD_DIR }}"/artifacts/*.tar.xz + echo "Artifacts:" + echo "----------" + du -h -d 1 "${{ env.BUILD_DIR }}"/artifacts + echo "CCache Stats:" + echo "-------------" + ccache -s + else + echo "[ERROR] Build directory ${{ env.BUILD_DIR }} does not exist. Skipping report!" + echo " This should only happen if the CI is cancelled before the build step." 
+ exit 1 + fi + + - name: "Build size report" + if: always() + shell: powershell + run: | + $fs = Get-PSDrive -PSProvider "FileSystem" + $fsout = $fs | Select-Object -Property Name,Used,Free,Root + $fsout | % {$_.Used/=1GB;$_.Free/=1GB;$_} | Write-Host + get-disk | Select-object @{Name="Size(GB)";Expression={$_.Size/1GB}} | Write-Host + + - name: Configure AWS Credentials for non-forked repos + if: ${{ always() && !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + special-characters-workaround: true + + - name: Post Build Upload + if: always() + run: | + python3 build_tools/github_actions/post_build_upload.py \ + --run-id ${{ github.run_id }} \ + --artifact-group ${{ inputs.artifact_group }} \ + --build-dir ${{ env.BUILD_DIR }} \ + --upload + + - name: Save cache + uses: actions/cache/save@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0 + if: ${{ !cancelled() }} + with: + path: ${{ env.CACHE_DIR }} + key: windows-build-packages-v4-${{ inputs.amdgpu_families }}-${{ github.sha }} diff --git a/.github/workflows/build_windows_python_packages.yml b/.github/workflows/build_windows_python_packages.yml new file mode 100644 index 000000000..40c3d184a --- /dev/null +++ b/.github/workflows/build_windows_python_packages.yml @@ -0,0 +1,87 @@ +name: Build Windows Python Packages + +on: + workflow_dispatch: + inputs: + artifact_github_repo: + description: GitHub repository for artifact_run_id + type: string + default: ROCm/TheRock + artifact_run_id: + description: Workflow run ID to download artifacts from + type: string + default: "17865324892" # TODO: default to the most recent successful run (using a script) + artifact_group: + description: "The artifact group to build (ex: gfx94X-dcgpu, gfx101X-dgpu, gfx1151, gfx120X-all)" + type: string + package_version: + type: string + workflow_call: + 
inputs: + artifact_github_repo: + type: string + artifact_run_id: + type: string + default: "" + artifact_group: + type: string + package_version: + type: string + +permissions: + contents: read + +jobs: + build: + name: Build Python | ${{ inputs.artifact_group }} + runs-on: ${{ github.repository_owner == 'ROCm' && 'azure-windows-scale-rocm' || 'windows-2022' }} + env: + ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id != '' && inputs.artifact_run_id || github.run_id }}" + ARTIFACTS_DIR: "${{ github.workspace }}/artifacts" + PACKAGES_DIR: "${{ github.workspace }}/packages" + defaults: + run: + shell: bash + steps: + - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: "ROCm/TheRock" + ref: ${{ secrets.THEROCK_MAINLINE_REF }} + - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 + with: + python-version: '3.12' + + - name: Install Python requirements + run: pip install boto3 packaging piprepo setuptools + + # Note: we could fetch "all" artifacts if we wanted to include more files + - name: Fetch artifacts + env: + IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }} + run: | + python ./build_tools/fetch_artifacts.py \ + --run-github-repo=${{ inputs.artifact_github_repo }} \ + --run-id=${{ env.ARTIFACT_RUN_ID }} \ + --artifact-group=${{ inputs.artifact_group }} \ + --output-dir="${{ env.ARTIFACTS_DIR }}" \ + _dev_ _lib_ _run_ + + - name: Build Python packages + run: | + python ./build_tools/build_python_packages.py \ + --artifact-dir="${{ env.ARTIFACTS_DIR }}" \ + --dest-dir="${{ env.PACKAGES_DIR }}" \ + --version="${{ inputs.package_version }}" + + - name: Inspect Python packages + run: | + ls -la "${{ env.PACKAGES_DIR }}" + + - name: Sanity check Python packages + run: | + piprepo build "${{ env.PACKAGES_DIR }}/dist" + pip install rocm[libraries,devel]==${{ inputs.package_version }} \ + --extra-index-url "${{ env.PACKAGES_DIR }}/dist/simple/" + rocm-sdk test + + # TODO(#1559): upload 
packages to artifacts S3 bucket and/or a dedicated Python packages bucket diff --git a/.github/workflows/build_windows_pytorch_wheels.yml b/.github/workflows/build_windows_pytorch_wheels.yml new file mode 100644 index 000000000..aa1fc5d43 --- /dev/null +++ b/.github/workflows/build_windows_pytorch_wheels.yml @@ -0,0 +1,357 @@ +name: Build Windows PyTorch Wheels + +on: + workflow_call: + inputs: + amdgpu_family: + required: true + type: string + python_version: + required: true + type: string + release_type: + description: The type of release to build ("dev", "nightly", or "prerelease"). All developer-triggered jobs should use "dev"! + required: true + type: string + s3_subdir: + description: S3 subdirectory, not including the GPU-family + required: true + type: string + s3_staging_subdir: + description: S3 staging subdirectory, not including the GPU-family + required: true + type: string + cloudfront_url: + description: CloudFront URL pointing to Python index + required: true + type: string + cloudfront_staging_url: + description: CloudFront base URL pointing to staging Python index + required: true + type: string + rocm_version: + description: ROCm version to pip install (e.g. "7.10.0a20251124") + type: string + pytorch_git_ref: + description: PyTorch ref to checkout. (typically "nightly", or "release/X.Y") + required: true + type: string + pytorch_patchset: + description: Patch directory name from where to apply existing patches. + required: true + type: string + repository: + description: "Repository to checkout. Otherwise, defaults to `github.repository`." + type: string + ref: + description: "Branch, tag or SHA to checkout. Defaults to the reference or SHA that triggered the workflow." 
+ type: string + workflow_dispatch: + inputs: + amdgpu_family: + type: choice + options: + - gfx101X-dgpu + - gfx103X-dgpu + - gfx110X-all + - gfx1150 + - gfx1151 + - gfx120X-all + - gfx90X-dcgpu + - gfx94X-dcgpu + - gfx950-dcgpu + default: gfx1151 + python_version: + required: true + type: string + default: "3.12" + release_type: + description: The type of release to build ("dev", "nightly", or "prerelease"). All developer-triggered jobs should use "dev"! + type: string + default: "dev" + s3_subdir: + description: S3 subdirectory, not including the GPU-family + type: string + default: "v2" + s3_staging_subdir: + description: S3 staging subdirectory, not including the GPU-family + type: string + default: "v2-staging" + cloudfront_url: + description: CloudFront base URL pointing to Python index + type: string + default: "https://rocm.devreleases.amd.com/v2" + cloudfront_staging_url: + description: CloudFront base URL pointing to staging Python index + type: string + default: "https://rocm.devreleases.amd.com/v2-staging" + rocm_version: + description: ROCm version to pip install (e.g. "7.10.0a20251124") + type: string + pytorch_git_ref: + description: PyTorch ref to checkout. (typically "nightly", or "release/X.Y") + required: true + type: string + default: "release/2.7" + pytorch_patchset: + description: Patch directory name from where to apply existing patches. + required: true + type: string + default: "rocm_2.7" + +permissions: + id-token: write + contents: read + +jobs: + build_pytorch_wheels: + name: Build | ${{ inputs.amdgpu_family }} | py ${{ inputs.python_version }} | torch ${{ inputs.pytorch_git_ref }} + runs-on: ${{ github.repository_owner == 'ROCm' && 'azure-windows-scale-rocm' || 'windows-2022' }} + env: + CHECKOUT_ROOT: B:/src + OUTPUT_DIR: ${{ github.workspace }}/output + # Note the \ here instead of /. This should be used from 'cmd' not 'bash'! 
+ PACKAGE_DIST_DIR: ${{ github.workspace }}\output\packages\dist + S3_BUCKET_PY: "therock-${{ inputs.release_type }}-python" + optional_build_prod_arguments: "" + outputs: + cp_version: ${{ env.cp_version }} + # The following are python package versions produced by the build. The + # exact versions will depend on workflow inputs and the underlying code. + # For example: + # Inputs + # rocm_version : 7.10.0a20251120 + # pytorch_git_ref : release/2.9 + # Outputs + # torch_version : 2.9.1+rocm7.10.0a20251120 + # torchaudio_version : 2.9.0+rocm7.10.0a20251120 + # torchvision_version: 0.24.0+rocm7.10.0a20251120 + # Future jobs can use these version outputs to identify newly built + # packages, for example via `pip install torch==${TORCH_VERSION}`. + torch_version: ${{ steps.build-pytorch-wheels.outputs.torch_version }} + torchaudio_version: ${{ steps.build-pytorch-wheels.outputs.torchaudio_version }} + torchvision_version: ${{ steps.build-pytorch-wheels.outputs.torchvision_version }} + + defaults: + run: + # Note: there are mixed uses of 'bash' (this default) and 'cmd' below + shell: bash + steps: + - name: Checkout + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' }} + + - name: Configure Git Identity + run: | + git config --global user.name "therockbot" + git config --global user.email "therockbot@amd.com" + + - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 + with: + python-version: ${{ inputs.python_version }} + + - name: Select Python version + run: | + python build_tools/github_actions/python_to_cp_version.py \ + --python-version ${{ inputs.python_version }} + + # TODO(amd-justchen): share with build_windows_artifacts.yml. Include in VM image? Dockerfile? 
+ - name: Install requirements + run: | + choco install --no-progress -y ninja --version 1.13.1 + choco install --no-progress -y awscli + echo "$PATH;C:\Program Files\Amazon\AWSCLIV2" >> $GITHUB_PATH + + # After other installs, so MSVC get priority in the PATH. + - name: Configure MSVC + uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0 + + # Checkout nightly sources from https://github.com/pytorch/pytorch + # TODO: switch to 'nightly' to match our Linux workflows? + - name: Checkout PyTorch source repos (nightly branch) + if: ${{ inputs.pytorch_git_ref == 'nightly' }} + run: | + git config --global core.longpaths true + python ./external-builds/pytorch/pytorch_torch_repo.py checkout \ + --checkout-dir ${{ env.CHECKOUT_ROOT }}/torch \ + --repo-hashtag nightly + python ./external-builds/pytorch/pytorch_audio_repo.py checkout \ + --checkout-dir ${{ env.CHECKOUT_ROOT }}/audio \ + --repo-hashtag nightly + python ./external-builds/pytorch/pytorch_vision_repo.py checkout \ + --checkout-dir ${{ env.CHECKOUT_ROOT }}/vision \ + --repo-hashtag nightly + + # Checkout stable sources from https://github.com/ROCm/pytorch + - name: Checkout PyTorch Source Repos from stable branch + if: ${{ inputs.pytorch_git_ref != 'nightly' }} + run: | + git config --global core.longpaths true + python ./external-builds/pytorch/pytorch_torch_repo.py checkout \ + --checkout-dir ${{ env.CHECKOUT_ROOT }}/torch \ + --gitrepo-origin https://github.com/ROCm/pytorch.git \ + --repo-hashtag ${{ inputs.pytorch_git_ref }} \ + --patchset ${{ inputs.pytorch_patchset }} + python ./external-builds/pytorch/pytorch_audio_repo.py checkout \ + --checkout-dir ${{ env.CHECKOUT_ROOT }}/audio \ + --torch-dir ${{ env.CHECKOUT_ROOT }}/torch \ + --require-related-commit + python ./external-builds/pytorch/pytorch_vision_repo.py checkout \ + --checkout-dir ${{ env.CHECKOUT_ROOT }}/vision \ + --torch-dir ${{ env.CHECKOUT_ROOT }}/torch \ + --require-related-commit + + - name: Determine optional 
arguments passed to `build_prod_wheels.py` + if: ${{ inputs.rocm_version }} + run: | + pip install packaging + python build_tools/github_actions/determine_version.py \ + --rocm-version ${{ inputs.rocm_version }} + + - name: Build PyTorch Wheels + id: build-pytorch-wheels + # Using 'cmd' here is load bearing! There are configuration issues when + # run under 'bash': https://github.com/ROCm/TheRock/issues/827#issuecomment-3025858800 + shell: cmd + run: | + echo "Building PyTorch wheels for ${{ inputs.amdgpu_family }}" + python ./external-builds/pytorch/build_prod_wheels.py ^ + build ^ + --install-rocm ^ + --index-url "${{ inputs.cloudfront_url }}/${{ inputs.amdgpu_family }}/" ^ + --pytorch-dir ${{ env.CHECKOUT_ROOT }}/torch ^ + --pytorch-audio-dir ${{ env.CHECKOUT_ROOT }}/audio ^ + --pytorch-vision-dir ${{ env.CHECKOUT_ROOT }}/vision ^ + --enable-pytorch-flash-attention-windows ^ + --clean ^ + --output-dir ${{ env.PACKAGE_DIST_DIR }} ^ + ${{ env.optional_build_prod_arguments }} + python ./build_tools/github_actions/write_torch_versions.py --dist-dir ${{ env.PACKAGE_DIST_DIR }} + + - name: Sanity Check Wheel + shell: cmd + run: | + python external-builds/pytorch/sanity_check_wheel.py ${{ env.PACKAGE_DIST_DIR }} + + - name: Configure AWS Credentials + if: always() + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-${{ inputs.release_type }}-releases + special-characters-workaround: true + + - name: Upload wheels to S3 staging + if: ${{ github.repository_owner == 'ROCm' }} + # Using 'cmd' here since PACKAGE_DIST_DIR uses \ in paths instead of / + shell: cmd + run: | + aws s3 cp ${{ env.PACKAGE_DIST_DIR }}/ ^ + s3://${{ env.S3_BUCKET_PY }}/${{ inputs.s3_staging_subdir }}/${{ inputs.amdgpu_family }}/ ^ + --recursive --exclude "*" --include "*.whl" + + - name: (Re-)Generate Python package release index for staging + if: ${{ 
github.repository_owner == 'ROCm' }} + env: + # Environment variables to be set for `manage.py` + CUSTOM_PREFIX: "${{ inputs.s3_staging_subdir }}/${{ inputs.amdgpu_family }}" + shell: cmd + run: | + pip install boto3 packaging + python ./build_tools/third_party/s3_management/manage.py ${{ env.CUSTOM_PREFIX }} + + generate_target_to_run: + name: Generate target_to_run + runs-on: ubuntu-24.04 + outputs: + test_runs_on: ${{ steps.configure.outputs.test-runs-on }} + bypass_tests_for_releases: ${{ steps.configure.outputs.bypass_tests_for_releases }} + steps: + - name: Checking out repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' }} + + - name: Generating target to run + id: configure + env: + TARGET: ${{ inputs.amdgpu_family }} + PLATFORM: "windows" + # Variable comes from ROCm organization variable 'ROCM_THEROCK_TEST_RUNNERS' + ROCM_THEROCK_TEST_RUNNERS: ${{ vars.ROCM_THEROCK_TEST_RUNNERS }} + LOAD_TEST_RUNNERS_FROM_VAR: false + run: python ./build_tools/github_actions/configure_target_run.py + + test_pytorch_wheels: + name: Test | ${{ inputs.amdgpu_family }} | ${{ needs.generate_target_to_run.outputs.test_runs_on }} + if: ${{ needs.generate_target_to_run.outputs.test_runs_on != '' }} + needs: [build_pytorch_wheels, generate_target_to_run] + uses: ./.github/workflows/test_pytorch_wheels.yml + with: + amdgpu_family: ${{ inputs.amdgpu_family }} + test_runs_on: ${{ needs.generate_target_to_run.outputs.test_runs_on }} + package_index_url: ${{ inputs.cloudfront_staging_url }} + python_version: ${{ inputs.python_version }} + torch_version: ${{ needs.build_pytorch_wheels.outputs.torch_version }} + pytorch_git_ref: ${{ inputs.pytorch_git_ref }} + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' }} + + upload_pytorch_wheels: + name: Release PyTorch Wheels to S3 + needs: [build_pytorch_wheels, 
generate_target_to_run, test_pytorch_wheels] + if: ${{ !cancelled() }} + runs-on: ubuntu-24.04 + env: + S3_BUCKET_PY: "therock-${{ inputs.release_type }}-python" + CP_VERSION: "${{ needs.build_pytorch_wheels.outputs.cp_version }}" + TORCH_VERSION: "${{ needs.build_pytorch_wheels.outputs.torch_version }}" + TORCHAUDIO_VERSION: "${{ needs.build_pytorch_wheels.outputs.torchaudio_version }}" + TORCHVISION_VERSION: "${{ needs.build_pytorch_wheels.outputs.torchvision_version }}" + steps: + - name: Checkout + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' }} + + - name: Configure AWS Credentials + if: always() + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-${{ inputs.release_type }}-releases + special-characters-workaround: true + + - name: Determine upload flag + env: + BUILD_RESULT: ${{ needs.build_pytorch_wheels.result }} + TEST_RESULT: ${{ needs.test_pytorch_wheels.result }} + TEST_RUNS_ON: ${{ needs.generate_target_to_run.outputs.test_runs_on }} + BYPASS_TESTS_FOR_RELEASES: ${{ needs.generate_target_to_run.outputs.bypass_tests_for_releases }} + run: python ./build_tools/github_actions/promote_wheels_based_on_policy.py + + - name: Copy PyTorch wheels from staging to release S3 + if: ${{ env.upload == 'true' }} + run: | + echo "Copying exact tested wheels to release S3 bucket..." 
+ aws s3 cp \ + s3://${S3_BUCKET_PY}/${{ inputs.s3_staging_subdir }}/${{ inputs.amdgpu_family }}/ \ + s3://${S3_BUCKET_PY}/${{ inputs.s3_subdir }}/${{ inputs.amdgpu_family }}/ \ + --recursive \ + --exclude "*" \ + --include "torch-${TORCH_VERSION}-${CP_VERSION}-win_amd64.whl" \ + --include "torchaudio-${TORCHAUDIO_VERSION}-${CP_VERSION}-win_amd64.whl" \ + --include "torchvision-${TORCHVISION_VERSION}-${CP_VERSION}-win_amd64.whl" + + - name: (Re-)Generate Python package release index + if: ${{ env.upload == 'true' }} + env: + # Environment variables to be set for `manage.py` + CUSTOM_PREFIX: "${{ inputs.s3_subdir }}/${{ inputs.amdgpu_family }}" + run: | + pip install boto3 packaging + python ./build_tools/third_party/s3_management/manage.py ${{ env.CUSTOM_PREFIX }} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 000000000..cd0558e2d --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,137 @@ +# This CI workflow is triggered by: +# - push to main branch +# - pull request +# - workflow dispatch +# +# For pull requests, we run default builds and tests for: +# - Linux: gfx94X gfx110X +# - Windows: gfx110X +# If you want to trigger jobs for additional targets, please add a defined label (ex: gfx120X-linux) to the pull request +# +# For push to main branch, all AMD families will be built and tested from `amdgpu_family_matrix.py`. +# +# Note: If a test machine is not available for a specific AMD GPU family in `amdgpu_family_matrix.py`, tests will be skipped. + +name: CI + +on: + push: + branches: + - main + workflow_dispatch: + inputs: + linux_amdgpu_families: + type: string + description: "Insert comma-separated list of Linux GPU families to build and test. ex: gfx94X, gfx1201X" + default: "" + linux_test_labels: + type: string + description: "If enabled, reduce test set on Linux to the list of labels prefixed with 'test:'.
ex: test:rocprim, test:hipcub" + default: "" + linux_use_prebuilt_artifacts: + type: boolean + description: "If enabled, the CI will pull Linux artifacts using artifact_run_id and only run tests" + windows_amdgpu_families: + type: string + description: "Insert comma-separated list of Windows GPU families to build and test. ex: gfx94X, gfx1201X" + default: "" + windows_test_labels: + type: string + description: "If enabled, reduce test set on Windows to the list of labels prefixed with 'test:' ex: test:rocprim, test:hipcub" + default: "" + windows_use_prebuilt_artifacts: + type: boolean + description: "If enabled, the CI will pull Windows artifacts using artifact_run_id and only run tests" + artifact_run_id: + type: string + description: "If provided, the tests will run on this artifact ID" + default: "" + pull_request: + types: + - labeled + - opened + - synchronize + +permissions: + contents: read + +concurrency: + # A PR number if a pull request and otherwise the commit hash. This cancels + # queued and in-progress runs for the same PR (presubmit) or commit + # (postsubmit). The workflow name is prepended to avoid conflicts between + # different workflows. 
+  group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
+  cancel-in-progress: true
+
+jobs:
+  setup:
+    uses: ./.github/workflows/setup.yml
+    with:
+      build_variant: "release"
+
+  linux_build_and_test:
+    name: Linux::${{ matrix.variant.family }}::${{ matrix.variant.build_variant_label }}
+    needs: setup
+    if: >-
+      ${{
+        needs.setup.outputs.linux_variants != '[]'
+      }}
+    strategy:
+      fail-fast: false
+      matrix:
+        variant: ${{ fromJSON(needs.setup.outputs.linux_variants) }}
+    uses: ./.github/workflows/ci_linux.yml
+    secrets: inherit
+    with:
+      amdgpu_families: ${{ matrix.variant.family }}
+      artifact_group: ${{ matrix.variant.artifact_group }}
+      test_runs_on: ${{ matrix.variant.test-runs-on }}
+      build_variant_label: ${{ matrix.variant.build_variant_label }}
+      build_variant_suffix: ${{ matrix.variant.build_variant_suffix }}
+      build_variant_cmake_preset: ${{ matrix.variant.build_variant_cmake_preset }}
+      test_labels: ${{ needs.setup.outputs.linux_test_labels }}
+      artifact_run_id: ${{ inputs.artifact_run_id }}
+      expect_failure: ${{ matrix.variant.expect_failure == true }}
+      use_prebuilt_artifacts: ${{ inputs.linux_use_prebuilt_artifacts == true && 'true' || 'false' }}
+      rocm_package_version: ${{ needs.setup.outputs.rocm_package_version }}
+      test_type: 'smoke'
+      sanity_check_only_for_family: ${{ matrix.variant.sanity_check_only_for_family == true }}
+    permissions:
+      contents: read
+      id-token: write
+
+  # build_python_packages:
+  #   name: Build Python Packages
+  #   uses: ./.github/workflows/build_python_packages.yml
+
+  ci_summary:
+    name: CI Summary
+    if: always()
+    needs:
+      - setup
+      - linux_build_and_test
+    runs-on: ubuntu-24.04
+    steps:
+      - name: Output failed jobs
+        run: |
+          echo '${{ toJson(needs) }}'
+
+          # Build a list of failed jobs, but ignore those marked continue-on-error
+          FAILED_JOBS="$(echo '${{ toJson(needs) }}' \
+            | jq --raw-output '
+              to_entries
+              | map(select(
+                  (.value.result != "success" and .value.result != "skipped")
+                  and (.value.outputs.continue_on_error | not)
+                ))
+              | map(.key)
+              | join(",")
+            ' \
+          )"
+
+          if [[ -n "${FAILED_JOBS}" ]]; then
+            echo "The following jobs failed: ${FAILED_JOBS}"
+            exit 1
+          else
+            echo "All required jobs succeeded (continue-on-error jobs ignored)."
+          fi
diff --git a/.github/workflows/ci_asan.yml b/.github/workflows/ci_asan.yml
new file mode 100644
index 000000000..4da6ce0b1
--- /dev/null
+++ b/.github/workflows/ci_asan.yml
@@ -0,0 +1,67 @@
+name: CI ASAN
+
+on:
+  schedule:
+    - cron: "0 2 * * *"  # Runs nightly at 2 AM UTC
+  workflow_dispatch:
+    inputs:
+      linux_amdgpu_families:
+        type: string
+        description: "Insert comma-separated list of Linux GPU families to build and test. ex: gfx94X, gfx1201X"
+        default: ""
+      linux_use_prebuilt_artifacts:
+        type: boolean
+        description: "If enabled, the CI will pull Linux artifacts using artifact_run_id and only run tests"
+      artifact_run_id:
+        type: string
+        description: "If provided, the tests will run on this artifact ID"
+        default: ""
+
+permissions:
+  contents: read
+
+concurrency:
+  # A PR number if a pull request and otherwise the commit hash. This cancels
+  # queued and in-progress runs for the same PR (presubmit) or commit
+  # (postsubmit). The workflow name is prepended to avoid conflicts between
+  # different workflows.
+  group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
+  cancel-in-progress: true
+
+jobs:
+  setup:
+    uses: ./.github/workflows/setup.yml
+    with:
+      build_variant: "asan"
+
+  linux_build_and_test:
+    name: Linux::${{ matrix.variant.family }}::${{ matrix.variant.build_variant_label }}
+    needs: setup
+    if: >-
+      ${{
+        needs.setup.outputs.linux_variants != '[]' &&
+        needs.setup.outputs.enable_build_jobs == 'true'
+      }}
+    strategy:
+      fail-fast: false
+      matrix:
+        variant: ${{ fromJSON(needs.setup.outputs.linux_variants) }}
+    uses: ./.github/workflows/ci_linux.yml
+    secrets: inherit
+    with:
+      amdgpu_families: ${{ matrix.variant.family }}
+      artifact_group: ${{ matrix.variant.artifact_group }}
+      test_runs_on: ${{ matrix.variant.test-runs-on }}
+      build_variant_label: ${{ matrix.variant.build_variant_label }}
+      build_variant_suffix: ${{ matrix.variant.build_variant_suffix }}
+      build_variant_cmake_preset: ${{ matrix.variant.build_variant_cmake_preset }}
+      test_labels: ${{ needs.setup.outputs.linux_test_labels }}
+      artifact_run_id: ${{ inputs.artifact_run_id }}
+      expect_failure: ${{ matrix.variant.expect_failure == true }}
+      use_prebuilt_artifacts: ${{ inputs.linux_use_prebuilt_artifacts == true && 'true' || 'false' }}
+      rocm_package_version: ${{ needs.setup.outputs.rocm_package_version }}
+      test_type: ${{ needs.setup.outputs.test_type }}
+      sanity_check_only_for_family: ${{ matrix.variant.sanity_check_only_for_family == true }}
+    permissions:
+      contents: read
+      id-token: write
diff --git a/.github/workflows/ci_linux.yml b/.github/workflows/ci_linux.yml
new file mode 100644
index 000000000..e9522b323
--- /dev/null
+++ b/.github/workflows/ci_linux.yml
@@ -0,0 +1,108 @@
+name: CI - Linux
+
+on:
+  workflow_call:
+    inputs:
+      artifact_group:
+        type: string
+      amdgpu_families:
+        type: string
+      build_variant_label:
+        type: string
+      build_variant_cmake_preset:
+        type: string
+      build_variant_suffix:
+        type: string
+      test_labels:
+        type: string
+      artifact_run_id:
+        type: string
+      test_runs_on:
+        type: string
+      expect_failure:
+        type: boolean
+      use_prebuilt_artifacts:
+        type: string
+      rocm_package_version:
+        type: string
+      test_type:
+        type: string
+      sanity_check_only_for_family:
+        type: boolean
+
+permissions:
+  contents: read
+
+jobs:
+  build_portable_linux_artifacts:
+    name: Build Artifacts
+    if: ${{ inputs.use_prebuilt_artifacts == 'false' }}
+    uses: ./.github/workflows/build_portable_linux_artifacts.yml
+    secrets: inherit
+    with:
+      artifact_group: ${{ inputs.artifact_group }}
+      package_version: ${{ inputs.rocm_package_version }}
+      amdgpu_families: ${{ inputs.amdgpu_families }}
+      build_variant_label: ${{ inputs.build_variant_label }}
+      build_variant_cmake_preset: ${{ inputs.build_variant_cmake_preset }}
+      build_variant_suffix: ${{ inputs.build_variant_suffix }}
+      expect_failure: ${{ inputs.expect_failure }}
+    permissions:
+      contents: read
+      id-token: write
+
+  # TODO: rework "artifact_run_id" and "use_prebuilt_artifacts" here?
+  #     I don't want to copy/paste this condition and special case plumbing
+  #     through multiple workflows. All the packaging and testing workflows need
+  #     to know is what artifact run id to use. That could be the current
+  #     (implicit) run id, or it could be an explicit run id.
+  #   How about having the "build artifacts" job run as a passthrough?
+
+  test_linux_artifacts:
+    needs: [build_portable_linux_artifacts]
+    name: Test Artifacts
+    # If the dependent job failed/cancelled, this job will not be run
+    # The use_prebuilt_artifacts "or" statement ensures that tests will run if
+    # previous build step is run or skipped.
+    # If we are expecting a build failure, do not run tests to save machine capacity
+    if: >-
+      ${{
+        !failure() &&
+        !cancelled() &&
+        (
+          inputs.use_prebuilt_artifacts == 'false' ||
+          inputs.use_prebuilt_artifacts == 'true'
+        ) &&
+        inputs.expect_failure == false
+      }}
+    uses: ./.github/workflows/test_artifacts.yml
+    with:
+      artifact_group: ${{ inputs.artifact_group }}
+      amdgpu_families: ${{ inputs.amdgpu_families }}
+      test_runs_on: ${{ inputs.test_runs_on }}
+      artifact_run_id: ${{ inputs.artifact_run_id }}
+      test_type: ${{ inputs.test_type }}
+      test_labels: ${{ inputs.test_labels }}
+      sanity_check_only_for_family: ${{ inputs.sanity_check_only_for_family == true }}
+
+  build_portable_linux_python_packages:
+    needs: [build_portable_linux_artifacts]
+    name: Build Python
+    # If the dependent job failed/cancelled, this job will not be run
+    # The use_prebuilt_artifacts "or" statement ensures that this job will run
+    # if the previous build step is run or skipped.
+    if: >-
+      ${{
+        !failure() &&
+        !cancelled() &&
+        (
+          inputs.use_prebuilt_artifacts == 'false' ||
+          inputs.use_prebuilt_artifacts == 'true'
+        ) &&
+        inputs.expect_failure == false
+      }}
+    uses: ./.github/workflows/build_portable_linux_python_packages.yml
+    with:
+      artifact_run_id: "${{ inputs.artifact_run_id != '' && inputs.artifact_run_id || github.run_id }}"
+      artifact_group: ${{ inputs.artifact_group }}
+      package_version: ${{ inputs.rocm_package_version }}
diff --git a/.github/workflows/ci_nightly.yml b/.github/workflows/ci_nightly.yml
new file mode 100644
index 000000000..e15f5e887
--- /dev/null
+++ b/.github/workflows/ci_nightly.yml
@@ -0,0 +1,124 @@
+# This CI workflow is triggered by:
+#   - scheduled run
+#
+# In the scheduled run, we run all targets from amdgpu_family_matrix.py and amdgpu_family_matrix_xfail.py
+# As some of these builds are xfail, we allow errors to occur with `continue-on-error`, where the job will fail but the workflow is green
+
+name: CI Nightly
+
+on:
+  # For AMD GPU families that expect_failure, we run builds and tests from this scheduled trigger
+  schedule:
+    - cron: "0 2 * * *"  # Runs nightly at 2 AM UTC
+  workflow_dispatch:
+    inputs:
+      linux_amdgpu_families:
+        type: string
+        description: "Insert comma-separated list of Linux GPU families to build and test. ex: gfx94X, gfx1201X"
+        default: ""
+      linux_test_labels:
+        type: string
+        description: "If enabled, reduce test set on Linux to the list of labels prefixed with 'test:'"
+        default: ""
+      linux_use_prebuilt_artifacts:
+        type: boolean
+        description: "If enabled, the CI will pull Linux artifacts using artifact_run_id and only run tests"
+      windows_amdgpu_families:
+        type: string
+        description: "Insert comma-separated list of Windows GPU families to build and test. ex: gfx94X, gfx1201X"
+        default: ""
+      windows_test_labels:
+        type: string
+        description: "If enabled, reduce test set on Windows to the list of labels prefixed with 'test:'"
+        default: ""
+      windows_use_prebuilt_artifacts:
+        type: boolean
+        description: "If enabled, the CI will pull Windows artifacts using artifact_run_id and only run tests"
+      artifact_run_id:
+        type: string
+        description: "If provided, the tests will run on this artifact ID"
+        default: ""
+
+permissions:
+  contents: read
+
+concurrency:
+  # A PR number if a pull request and otherwise the commit hash. This cancels
+  # queued and in-progress runs for the same PR (presubmit) or commit
+  # (postsubmit). The workflow name is prepended to avoid conflicts between
+  # different workflows.
+  group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
+  cancel-in-progress: true
+
+jobs:
+  setup:
+    uses: ./.github/workflows/setup.yml
+    with:
+      build_variant: "release"
+
+  linux_build_and_test:
+    name: Linux::${{ matrix.variant.family }}::${{ matrix.variant.build_variant_label }}
+    needs: setup
+    if: >-
+      ${{
+        needs.setup.outputs.linux_variants != '[]' &&
+        needs.setup.outputs.enable_build_jobs == 'true'
+      }}
+    strategy:
+      fail-fast: false
+      matrix:
+        variant: ${{ fromJSON(needs.setup.outputs.linux_variants) }}
+    uses: ./.github/workflows/ci_linux.yml
+    secrets: inherit
+    with:
+      amdgpu_families: ${{ matrix.variant.family }}
+      artifact_group: ${{ matrix.variant.artifact_group }}
+      test_runs_on: ${{ matrix.variant.test-runs-on }}
+      build_variant_label: ${{ matrix.variant.build_variant_label }}
+      build_variant_suffix: ${{ matrix.variant.build_variant_suffix }}
+      build_variant_cmake_preset: ${{ matrix.variant.build_variant_cmake_preset }}
+      test_labels: ${{ needs.setup.outputs.linux_test_labels }}
+      artifact_run_id: ${{ inputs.artifact_run_id }}
+      expect_failure: ${{ matrix.variant.expect_failure == true }}
+      use_prebuilt_artifacts: ${{ inputs.linux_use_prebuilt_artifacts == true && 'true' || 'false' }}
+      rocm_package_version: ${{ needs.setup.outputs.rocm_package_version }}
+      test_type: ${{ needs.setup.outputs.test_type }}
+      sanity_check_only_for_family: ${{ matrix.variant.sanity_check_only_for_family == true }}
+    permissions:
+      contents: read
+      id-token: write
+
+  windows_build_and_test:
+    name: Windows::${{ matrix.variant.family }}::${{ matrix.variant.build_variant_label }}
+    needs: setup
+    if: >-
+      ${{
+        needs.setup.outputs.windows_variants != '[]' &&
+        needs.setup.outputs.enable_build_jobs == 'true'
+      }}
+    strategy:
+      fail-fast: false
+      matrix:
+        variant: ${{ fromJSON(needs.setup.outputs.windows_variants) }}
+    uses: ./.github/workflows/ci_windows.yml
+    with:
+      amdgpu_families: ${{ matrix.variant.family }}
+      artifact_group: ${{ matrix.variant.artifact_group }}
+      test_runs_on: ${{ matrix.variant.test-runs-on }}
+      build_variant_label: ${{ matrix.variant.build_variant_label }}
+      build_variant_suffix: ${{ matrix.variant.build_variant_suffix }}
+      build_variant_cmake_preset: ${{ matrix.variant.build_variant_cmake_preset }}
+      test_labels: ${{ needs.setup.outputs.windows_test_labels }}
+      artifact_run_id: ${{ inputs.artifact_run_id }}
+      expect_failure: ${{ matrix.variant.expect_failure == true }}
+      use_prebuilt_artifacts: ${{ inputs.windows_use_prebuilt_artifacts == true && 'true' || 'false' }}
+      rocm_package_version: ${{ needs.setup.outputs.rocm_package_version }}
+      test_type: ${{ needs.setup.outputs.test_type }}
+      sanity_check_only_for_family: ${{ matrix.variant.sanity_check_only_for_family == true }}
+    permissions:
+      contents: read
+      id-token: write
+
+  # build_python_packages:
+  #   name: Build Python Packages
+  #   uses: ./.github/workflows/build_python_packages.yml
diff --git a/.github/workflows/ci_weekly.yml b/.github/workflows/ci_weekly.yml
new file mode 100644
index 000000000..9570a74f3
--- /dev/null
+++ b/.github/workflows/ci_weekly.yml
@@ -0,0 +1,14 @@
+name: WIP Placeholder CI Weekly
+
+on:
+  # For AMD GPU families that expect_failure, we run builds and tests from this scheduled trigger
+  # schedule:
+  #   - cron: "0 3 * * 0"  # Runs weekly at 3 AM UTC Sundays
+  workflow_dispatch:
+
+
+jobs:
+  donothing:
+    runs-on: ubuntu-latest
+    steps:
+      - run: echo "Skipped"
diff --git a/.github/workflows/ci_windows.yml b/.github/workflows/ci_windows.yml
new file mode 100644
index 000000000..536463a2c
--- /dev/null
+++ b/.github/workflows/ci_windows.yml
@@ -0,0 +1,108 @@
+name: CI - Windows
+
+on:
+  workflow_call:
+    inputs:
+      artifact_group:
+        type: string
+      amdgpu_families:
+        type: string
+      build_variant_label:
+        type: string
+      build_variant_cmake_preset:
+        type: string
+      build_variant_suffix:
+        type: string
+      test_labels:
+        type: string
+      artifact_run_id:
+        type: string
+      test_runs_on:
+        type: string
+      expect_failure:
+        type: boolean
+      use_prebuilt_artifacts:
+        type: string
+      rocm_package_version:
+        type: string
+      test_type:
+        type: string
+      sanity_check_only_for_family:
+        type: boolean
+
+permissions:
+  contents: read
+
+jobs:
+  build_windows_artifacts:
+    name: Build Artifacts
+    if: ${{ inputs.use_prebuilt_artifacts == 'false' }}
+    uses: ./.github/workflows/build_windows_artifacts.yml
+    secrets: inherit
+    with:
+      artifact_group: ${{ inputs.artifact_group }}
+      amdgpu_families: ${{ inputs.amdgpu_families }}
+      build_variant_label: ${{ inputs.build_variant_label }}
+      build_variant_cmake_preset: ${{ inputs.build_variant_cmake_preset }}
+      build_variant_suffix: ${{ inputs.build_variant_suffix }}
+      package_version: ${{ inputs.rocm_package_version }}
+      expect_failure: ${{ inputs.expect_failure }}
+    permissions:
+      contents: read
+      id-token: write
+
+  # TODO: rework "artifact_run_id" and "use_prebuilt_artifacts" here?
+  #     I don't want to copy/paste this condition and special case plumbing
+  #     through multiple workflows. All the packaging and testing workflows need
+  #     to know is what artifact run id to use. That could be the current
+  #     (implicit) run id, or it could be an explicit run id.
+  #   How about having the "build artifacts" job run as a passthrough?
+
+  test_windows_artifacts:
+    needs: [build_windows_artifacts]
+    name: Test Artifacts
+    # If the dependent job failed/cancelled, this job will not be run
+    # The use_prebuilt_artifacts "or" statement ensures that tests will run if
+    # previous build step is run or skipped.
+    # If we are expecting a build failure, do not run tests to save machine capacity
+    if: >-
+      ${{
+        !failure() &&
+        !cancelled() &&
+        (
+          inputs.use_prebuilt_artifacts == 'false' ||
+          inputs.use_prebuilt_artifacts == 'true'
+        ) &&
+        inputs.expect_failure == false
+      }}
+    uses: ./.github/workflows/test_artifacts.yml
+    with:
+      artifact_group: ${{ inputs.artifact_group }}
+      amdgpu_families: ${{ inputs.amdgpu_families }}
+      test_runs_on: ${{ inputs.test_runs_on }}
+      artifact_run_id: ${{ inputs.artifact_run_id }}
+      test_type: ${{ inputs.test_type }}
+      test_labels: ${{ inputs.test_labels }}
+      sanity_check_only_for_family: ${{ inputs.sanity_check_only_for_family == true }}
+
+  build_windows_python_packages:
+    needs: [build_windows_artifacts]
+    name: Build Python
+    # If the dependent job failed/cancelled, this job will not be run
+    # The use_prebuilt_artifacts "or" statement ensures that this job will run
+    # if the previous build step is run or skipped.
+    if: >-
+      ${{
+        !failure() &&
+        !cancelled() &&
+        (
+          inputs.use_prebuilt_artifacts == 'false' ||
+          inputs.use_prebuilt_artifacts == 'true'
+        ) &&
+        inputs.expect_failure == false
+      }}
+    uses: ./.github/workflows/build_windows_python_packages.yml
+    with:
+      artifact_run_id: "${{ inputs.artifact_run_id != '' && inputs.artifact_run_id || github.run_id }}"
+      artifact_group: ${{ inputs.artifact_group }}
+      package_version: ${{ inputs.rocm_package_version }}
diff --git a/.github/workflows/copy_release.yml b/.github/workflows/copy_release.yml
new file mode 100644
index 000000000..fd4a49dbe
--- /dev/null
+++ b/.github/workflows/copy_release.yml
@@ -0,0 +1,101 @@
+name: Copy release to dev bucket
+
+on:
+  workflow_dispatch:
+    inputs:
+      rocm_version:
+        description: ROCm version to copy, e.g. 7.0.0rc20250912
+        type: string
+      amdgpu_family:
+        type: choice
+        options:
+          - gfx101X-dgpu
+          - gfx103X-dgpu
+          - gfx110X-all
+          - gfx1150
+          - gfx1151
+          - gfx120X-all
+          - gfx90X-dcgpu
+          - gfx94X-dcgpu
+          - gfx950-dcgpu
+        default: gfx94X-dcgpu
+      python_version:
+        type: choice
+        options:
+          - "3.11"
+          - "3.12"
+          - "3.13"
+        default: "3.12"
+      include_torch:
+        type: boolean
+        default: false
+      sourcesubdir:
+        type: choice
+        options:
+          - v2
+          - v2-staging
+      destsubdir:
+        type: string
+        default: v2
+      sourcebucket:
+        type: choice
+        options:
+          - nightly
+          - dev
+        default: nightly
+      destbucket:
+        type: choice
+        options:
+          - dev
+          - nightly
+        default: dev
+permissions:
+  contents: read
+
+jobs:
+  copy_python_packages:
+    name: Copy ${{ inputs.sourcebucket }} ${{ inputs.sourcesubdir }} -> ${{ inputs.destbucket }} ${{ inputs.destsubdir }} | ${{ inputs.amdgpu_family }} | rocm ${{ inputs.rocm_version }} | py ${{ inputs.python_version }}
+    runs-on: ubuntu-24.04
+    permissions:
+      id-token: write
+
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+
+      - name: Install the AWS tool
+        run: ./dockerfiles/install_awscli.sh
+
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1
+        with:
+          aws-region: us-east-2
+          role-to-assume: arn:aws:iam::692859939525:role/therock-${{ inputs.destbucket }}-releases
+
+      - name: Select Python version
+        run: |
+          python build_tools/github_actions/python_to_cp_version.py \
+            --python-version ${{ inputs.python_version }}
+
+      - name: Copy ROCm packages between S3 buckets
+        run: |
+          aws s3 cp \
+            s3://therock-${{ inputs.sourcebucket }}-python/${{ inputs.sourcesubdir }}/${{ inputs.amdgpu_family }}/ \
+            s3://therock-${{ inputs.destbucket }}-python/${{ inputs.destsubdir }}/${{ inputs.amdgpu_family }}/ \
+            --recursive --exclude "*" --include "rocm*${{ inputs.rocm_version }}*"
+
+      - name: Copy torch wheels between S3 buckets
+        if: ${{ inputs.include_torch }}
+        run: |
+          aws s3 cp \
+            s3://therock-${{ inputs.sourcebucket }}-python/${{ inputs.sourcesubdir }}/${{ inputs.amdgpu_family }}/ \
+            s3://therock-${{ inputs.destbucket }}-python/${{ inputs.destsubdir }}/${{ inputs.amdgpu_family }}/ \
+            --recursive --exclude "*" --include "*torch*${{ inputs.rocm_version }}*${{ env.cp_version }}*"
+
+      - name: (Re-)Generate Python package release index
+        env:
+          S3_BUCKET_PY: "therock-${{ inputs.destbucket }}-python"
+          CUSTOM_PREFIX: "${{ inputs.destsubdir }}/${{ inputs.amdgpu_family }}"
+        run: |
+          pip install boto3 packaging
+          python ./build_tools/third_party/s3_management/manage.py ${CUSTOM_PREFIX}
diff --git a/.github/workflows/multi_arch_build_portable_linux.yml b/.github/workflows/multi_arch_build_portable_linux.yml
new file mode 100644
index 000000000..acffe4306
--- /dev/null
+++ b/.github/workflows/multi_arch_build_portable_linux.yml
@@ -0,0 +1,785 @@
+# Multi-Arch Build - Sharded Pipeline for Linux
+#
+# This workflow builds TheRock in stages:
+#   1. foundation (generic) - sysdeps, base
+#   2. compiler-runtime (generic) - compiler, runtimes, profiler-core
+#   3. math-libs (per-arch) - BLAS, FFT, etc.
+#   4. comm-libs (per-arch) - RCCL (parallel to math-libs)
+#   5. dctools-core (generic) - RDC (parallel to math-libs)
+#   6. profiler-apps (generic) - rocprofiler-systems (parallel to math-libs)
+#   7. media (generic) - sysdeps-amd-mesa, rocdecode (todo), rocjpeg (todo)
+#
+# Artifacts flow between stages via S3 using the artifact_manager.py tool.
+ +name: Multi-Arch Build (Linux) + +on: + workflow_call: + inputs: + artifact_group: + type: string + matrix_per_family_json: + type: string + description: "JSON array of {amdgpu_family, test-runs-on} objects for per-arch stages" + dist_amdgpu_families: + type: string + description: "Semicolon-separated list of all GPU families for dist targets" + build_variant_label: + type: string + build_variant_cmake_preset: + type: string + build_variant_suffix: + type: string + test_labels: + type: string + artifact_run_id: + type: string + expect_failure: + type: boolean + use_prebuilt_artifacts: + type: string + rocm_package_version: + type: string + test_type: + type: string + +permissions: + contents: read + +env: + CONTAINER_IMAGE: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:583d473f263a289222c48d4b493e2956b2354a45796f09dee6f2c8ecd4504ab6 + CCACHE_CONFIGPATH: ${{ github.workspace }}/.ccache/ccache.conf + CACHE_DIR: ${{ github.workspace }}/.container-cache + TEATIME_FORCE_INTERACTIVE: 0 + +jobs: + # ========================================================================== + # STAGE: foundation (generic) + # ========================================================================== + foundation: + name: Stage - Foundation + # Always run all stages + runs-on: azure-linux-scale-rocm + timeout-minutes: 180 # 3 hours + permissions: + id-token: write + container: + image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:583d473f263a289222c48d4b493e2956b2354a45796f09dee6f2c8ecd4504ab6 + options: -v /runner/config:/home/awsconfig/ + env: + AWS_SHARED_CREDENTIALS_FILE: /home/awsconfig/credentials.ini + STAGE_NAME: foundation + steps: + - name: Checkout Repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + + - name: Install python deps + run: pip install -r requirements.txt + + - name: Adjust git config + run: | + git config --global --add safe.directory $PWD + git config fetch.parallel 10 + + - name: Setup ccache + run: | + 
./build_tools/setup_ccache.py \ + --config-preset "github-oss-presubmit" \ + --dir "$(dirname $CCACHE_CONFIGPATH)" \ + --local-path "$CACHE_DIR/ccache" + + - name: Runner health status + run: | + ./build_tools/health_status.py + + - name: Fetch sources + timeout-minutes: 30 + run: ./build_tools/fetch_sources.py --stage ${STAGE_NAME} --jobs 12 --depth 1 + + - name: Get stage configuration + id: stage_config + run: | + python build_tools/configure_stage.py \ + --stage ${STAGE_NAME} \ + --dist-amdgpu-families "${{ inputs.dist_amdgpu_families }}" \ + --gha-output + + - name: Install stage python deps + if: ${{ steps.stage_config.outputs.pip_install_cmd }} + run: pip install ${{ steps.stage_config.outputs.pip_install_cmd }} + + - name: Configure + run: | + cmake -B build -S . -GNinja \ + -DTHEROCK_PACKAGE_VERSION=${{ inputs.rocm_package_version }} \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + ${{ steps.stage_config.outputs.cmake_args }} + + - name: Build stage + run: | + cmake --build build --target stage-${STAGE_NAME} therock-artifacts -- -k 0 + + - name: Report + if: ${{ !cancelled() }} + run: | + echo "CCache Stats:" + ccache -s -v + echo "Artifacts:" + ls -lh build/artifacts/*.tar.xz 2>/dev/null || echo "No artifacts found" + + - name: Configure AWS Credentials + if: ${{ always() && !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Push stage artifacts + if: ${{ !github.event.pull_request.head.repo.fork }} + run: | + python build_tools/artifact_manager.py push --run-id ${{ github.run_id }} \ + --stage ${STAGE_NAME} \ + --build-dir build + + # ========================================================================== + # STAGE: compiler-runtime (generic) + # ========================================================================== + 
compiler-runtime: + name: Stage - Compiler Runtime + needs: foundation + runs-on: azure-linux-scale-rocm + timeout-minutes: 480 # 8 hours (compiler is big) + permissions: + id-token: write + container: + image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:583d473f263a289222c48d4b493e2956b2354a45796f09dee6f2c8ecd4504ab6 + options: -v /runner/config:/home/awsconfig/ + env: + AWS_SHARED_CREDENTIALS_FILE: /home/awsconfig/credentials.ini + STAGE_NAME: compiler-runtime + steps: + - name: Checkout Repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + + - name: Install python deps + run: pip install -r requirements.txt + + - name: Adjust git config + run: | + git config --global --add safe.directory $PWD + git config fetch.parallel 10 + + - name: Setup ccache + run: | + ./build_tools/setup_ccache.py \ + --config-preset "github-oss-presubmit" \ + --dir "$(dirname $CCACHE_CONFIGPATH)" \ + --local-path "$CACHE_DIR/ccache" + + - name: Runner health status + run: | + ./build_tools/health_status.py + + - name: Configure AWS Credentials + if: ${{ !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Fetch inbound artifacts + if: ${{ !github.event.pull_request.head.repo.fork }} + run: | + python build_tools/artifact_manager.py fetch --run-id ${{ github.run_id }} \ + --stage ${STAGE_NAME} \ + --output-dir build \ + --bootstrap + + - name: Fetch sources + timeout-minutes: 30 + run: ./build_tools/fetch_sources.py --stage ${STAGE_NAME} --jobs 12 --depth 1 + + - name: Get stage configuration + id: stage_config + run: | + python build_tools/configure_stage.py \ + --stage ${STAGE_NAME} \ + --dist-amdgpu-families "${{ inputs.dist_amdgpu_families }}" \ + --gha-output + + - name: Install stage python deps + if: ${{ steps.stage_config.outputs.pip_install_cmd }} + 
run: pip install ${{ steps.stage_config.outputs.pip_install_cmd }} + + - name: Configure + run: | + cmake -B build -S . -GNinja \ + -DTHEROCK_PACKAGE_VERSION=${{ inputs.rocm_package_version }} \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + ${{ steps.stage_config.outputs.cmake_args }} + + - name: Build stage + run: | + cmake --build build --target stage-${STAGE_NAME} therock-artifacts -- -k 0 + + - name: Report + if: ${{ !cancelled() }} + run: | + echo "CCache Stats:" + ccache -s -v + echo "Artifacts:" + ls -lh build/artifacts/*.tar.xz 2>/dev/null || echo "No artifacts found" + + - name: Configure AWS Credentials (refresh for push) + if: ${{ !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Push stage artifacts + if: ${{ !github.event.pull_request.head.repo.fork }} + run: | + python build_tools/artifact_manager.py push --run-id ${{ github.run_id }} \ + --stage ${STAGE_NAME} \ + --build-dir build + + # ========================================================================== + # STAGE: math-libs (per-arch) + # ========================================================================== + math-libs: + name: Stage - Math Libs (${{ matrix.family_info.amdgpu_family }}) + needs: compiler-runtime + strategy: + fail-fast: false + matrix: + family_info: ${{ fromJSON(inputs.matrix_per_family_json) }} + runs-on: azure-linux-scale-rocm + timeout-minutes: 480 # 8 hours + permissions: + id-token: write + container: + image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:583d473f263a289222c48d4b493e2956b2354a45796f09dee6f2c8ecd4504ab6 + options: -v /runner/config:/home/awsconfig/ + env: + AWS_SHARED_CREDENTIALS_FILE: /home/awsconfig/credentials.ini + STAGE_NAME: math-libs + AMDGPU_FAMILIES: ${{ matrix.family_info.amdgpu_family }} + steps: + - name: 
Checkout Repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + + - name: Install python deps + run: pip install -r requirements.txt + + - name: Adjust git config + run: | + git config --global --add safe.directory $PWD + git config fetch.parallel 10 + + - name: Setup ccache + run: | + ./build_tools/setup_ccache.py \ + --config-preset "github-oss-presubmit" \ + --dir "$(dirname $CCACHE_CONFIGPATH)" \ + --local-path "$CACHE_DIR/ccache" + + - name: Runner health status + run: | + ./build_tools/health_status.py + + - name: Configure AWS Credentials + if: ${{ !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Fetch inbound artifacts + if: ${{ !github.event.pull_request.head.repo.fork }} + run: | + python build_tools/artifact_manager.py fetch --run-id ${{ github.run_id }} \ + --stage ${STAGE_NAME} \ + --amdgpu-families ${{ matrix.family_info.amdgpu_family }} \ + --output-dir build \ + --bootstrap + + - name: Fetch sources + timeout-minutes: 30 + run: ./build_tools/fetch_sources.py --stage ${STAGE_NAME} --jobs 12 --depth 1 + + - name: Get stage configuration + id: stage_config + run: | + python build_tools/configure_stage.py \ + --stage ${STAGE_NAME} \ + --amdgpu-families ${{ matrix.family_info.amdgpu_family }} \ + --dist-amdgpu-families "${{ inputs.dist_amdgpu_families }}" \ + --gha-output + + - name: Install stage python deps + if: ${{ steps.stage_config.outputs.pip_install_cmd }} + run: pip install ${{ steps.stage_config.outputs.pip_install_cmd }} + + - name: Configure + run: | + cmake -B build -S . 
-GNinja \ + -DTHEROCK_PACKAGE_VERSION=${{ inputs.rocm_package_version }} \ + -DTHEROCK_AMDGPU_FAMILIES=${{ matrix.family_info.amdgpu_family }} \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + ${{ steps.stage_config.outputs.cmake_args }} + + - name: Build stage + run: | + cmake --build build --target stage-${STAGE_NAME} therock-artifacts -- -k 0 + + - name: Report + if: ${{ !cancelled() }} + run: | + echo "CCache Stats:" + ccache -s -v + echo "Artifacts:" + ls -lh build/artifacts/*.tar.xz 2>/dev/null || echo "No artifacts found" + + - name: Configure AWS Credentials (refresh for push) + if: ${{ !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Push stage artifacts + if: ${{ !github.event.pull_request.head.repo.fork }} + run: | + python build_tools/artifact_manager.py push --run-id ${{ github.run_id }} \ + --stage ${STAGE_NAME} \ + --amdgpu-families ${{ matrix.family_info.amdgpu_family }} \ + --build-dir build + + # ========================================================================== + # STAGE: comm-libs (per-arch, parallel to math-libs) + # ========================================================================== + comm-libs: + name: Stage - Comm Libs (${{ matrix.family_info.amdgpu_family }}) + needs: compiler-runtime + strategy: + fail-fast: false + matrix: + family_info: ${{ fromJSON(inputs.matrix_per_family_json) }} + runs-on: azure-linux-scale-rocm + timeout-minutes: 240 # 4 hours + permissions: + id-token: write + container: + image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:583d473f263a289222c48d4b493e2956b2354a45796f09dee6f2c8ecd4504ab6 + options: -v /runner/config:/home/awsconfig/ + env: + AWS_SHARED_CREDENTIALS_FILE: /home/awsconfig/credentials.ini + STAGE_NAME: comm-libs + AMDGPU_FAMILIES: ${{ 
matrix.family_info.amdgpu_family }} + steps: + - name: Checkout Repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + + - name: Install python deps + run: pip install -r requirements.txt + + - name: Adjust git config + run: | + git config --global --add safe.directory $PWD + git config fetch.parallel 10 + + - name: Setup ccache + run: | + ./build_tools/setup_ccache.py \ + --config-preset "github-oss-presubmit" \ + --dir "$(dirname $CCACHE_CONFIGPATH)" \ + --local-path "$CACHE_DIR/ccache" + + - name: Runner health status + run: | + ./build_tools/health_status.py + + - name: Configure AWS Credentials + if: ${{ !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Fetch inbound artifacts + if: ${{ !github.event.pull_request.head.repo.fork }} + run: | + python build_tools/artifact_manager.py fetch --run-id ${{ github.run_id }} \ + --stage ${STAGE_NAME} \ + --amdgpu-families ${{ matrix.family_info.amdgpu_family }} \ + --output-dir build \ + --bootstrap + + - name: Fetch sources + timeout-minutes: 30 + run: ./build_tools/fetch_sources.py --stage ${STAGE_NAME} --jobs 12 --depth 1 + + - name: Get stage configuration + id: stage_config + run: | + python build_tools/configure_stage.py \ + --stage ${STAGE_NAME} \ + --amdgpu-families ${{ matrix.family_info.amdgpu_family }} \ + --dist-amdgpu-families "${{ inputs.dist_amdgpu_families }}" \ + --gha-output + + - name: Install stage python deps + if: ${{ steps.stage_config.outputs.pip_install_cmd }} + run: pip install ${{ steps.stage_config.outputs.pip_install_cmd }} + + - name: Configure + run: | + cmake -B build -S . 
-GNinja \ + -DTHEROCK_PACKAGE_VERSION=${{ inputs.rocm_package_version }} \ + -DTHEROCK_AMDGPU_FAMILIES=${{ matrix.family_info.amdgpu_family }} \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + ${{ steps.stage_config.outputs.cmake_args }} + + - name: Build stage + run: | + cmake --build build --target stage-${STAGE_NAME} therock-artifacts -- -k 0 + + - name: Report + if: ${{ !cancelled() }} + run: | + echo "CCache Stats:" + ccache -s -v + echo "Artifacts:" + ls -lh build/artifacts/*.tar.xz 2>/dev/null || echo "No artifacts found" + + - name: Configure AWS Credentials (refresh for push) + if: ${{ !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Push stage artifacts + if: ${{ !github.event.pull_request.head.repo.fork }} + run: | + python build_tools/artifact_manager.py push --run-id ${{ github.run_id }} \ + --stage ${STAGE_NAME} \ + --amdgpu-families ${{ matrix.family_info.amdgpu_family }} \ + --build-dir build + + # ========================================================================== + # STAGE: dctools-core (generic, parallel to math-libs) + # ========================================================================== + dctools-core: + name: Stage - DC Tools Core + needs: compiler-runtime + runs-on: azure-linux-scale-rocm + timeout-minutes: 120 # 2 hours + permissions: + id-token: write + container: + image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:583d473f263a289222c48d4b493e2956b2354a45796f09dee6f2c8ecd4504ab6 + options: -v /runner/config:/home/awsconfig/ + env: + AWS_SHARED_CREDENTIALS_FILE: /home/awsconfig/credentials.ini + STAGE_NAME: dctools-core + steps: + - name: Checkout Repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + + - name: Install python deps + run: pip install -r 
requirements.txt + + - name: Adjust git config + run: | + git config --global --add safe.directory $PWD + git config fetch.parallel 10 + + - name: Setup ccache + run: | + ./build_tools/setup_ccache.py \ + --config-preset "github-oss-presubmit" \ + --dir "$(dirname $CCACHE_CONFIGPATH)" \ + --local-path "$CACHE_DIR/ccache" + + - name: Runner health status + run: | + ./build_tools/health_status.py + + - name: Configure AWS Credentials + if: ${{ !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Fetch inbound artifacts + if: ${{ !github.event.pull_request.head.repo.fork }} + run: | + python build_tools/artifact_manager.py fetch --run-id ${{ github.run_id }} \ + --stage ${STAGE_NAME} \ + --output-dir build \ + --bootstrap + + - name: Fetch sources + timeout-minutes: 30 + run: ./build_tools/fetch_sources.py --stage ${STAGE_NAME} --jobs 12 --depth 1 + + - name: Get stage configuration + id: stage_config + run: | + python build_tools/configure_stage.py \ + --stage ${STAGE_NAME} \ + --dist-amdgpu-families "${{ inputs.dist_amdgpu_families }}" \ + --gha-output + + - name: Install stage python deps + if: ${{ steps.stage_config.outputs.pip_install_cmd }} + run: pip install ${{ steps.stage_config.outputs.pip_install_cmd }} + + - name: Configure + run: | + cmake -B build -S . 
-GNinja \ + -DTHEROCK_PACKAGE_VERSION=${{ inputs.rocm_package_version }} \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + ${{ steps.stage_config.outputs.cmake_args }} + + - name: Build stage + run: | + cmake --build build --target stage-${STAGE_NAME} therock-artifacts -- -k 0 + + - name: Report + if: ${{ !cancelled() }} + run: | + echo "CCache Stats:" + ccache -s -v + echo "Artifacts:" + ls -lh build/artifacts/*.tar.xz 2>/dev/null || echo "No artifacts found" + + - name: Configure AWS Credentials (refresh for push) + if: ${{ !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Push stage artifacts + if: ${{ !github.event.pull_request.head.repo.fork }} + run: | + python build_tools/artifact_manager.py push --run-id ${{ github.run_id }} \ + --stage ${STAGE_NAME} \ + --build-dir build + + # ========================================================================== + # STAGE: profiler-apps (generic, parallel to math-libs) + # ========================================================================== + profiler-apps: + name: Stage - Profiler Apps + needs: compiler-runtime + runs-on: azure-linux-scale-rocm + timeout-minutes: 180 # 3 hours + permissions: + id-token: write + container: + image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:583d473f263a289222c48d4b493e2956b2354a45796f09dee6f2c8ecd4504ab6 + options: -v /runner/config:/home/awsconfig/ + env: + AWS_SHARED_CREDENTIALS_FILE: /home/awsconfig/credentials.ini + STAGE_NAME: profiler-apps + steps: + - name: Checkout Repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + + - name: Install python deps + run: pip install -r requirements.txt + + - name: Adjust git config + run: | + git config --global --add safe.directory $PWD + git config fetch.parallel 10 + + 
- name: Setup ccache + run: | + ./build_tools/setup_ccache.py \ + --config-preset "github-oss-presubmit" \ + --dir "$(dirname $CCACHE_CONFIGPATH)" \ + --local-path "$CACHE_DIR/ccache" + + - name: Runner health status + run: | + ./build_tools/health_status.py + + - name: Configure AWS Credentials + if: ${{ !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Fetch inbound artifacts + if: ${{ !github.event.pull_request.head.repo.fork }} + run: | + python build_tools/artifact_manager.py fetch --run-id ${{ github.run_id }} \ + --stage ${STAGE_NAME} \ + --output-dir build \ + --bootstrap + + - name: Fetch sources + timeout-minutes: 30 + run: ./build_tools/fetch_sources.py --stage ${STAGE_NAME} --jobs 12 --depth 1 + + - name: Get stage configuration + id: stage_config + run: | + python build_tools/configure_stage.py \ + --stage ${STAGE_NAME} \ + --dist-amdgpu-families "${{ inputs.dist_amdgpu_families }}" \ + --gha-output + + - name: Install stage python deps + if: ${{ steps.stage_config.outputs.pip_install_cmd }} + run: pip install ${{ steps.stage_config.outputs.pip_install_cmd }} + + - name: Configure + run: | + cmake -B build -S . 
-GNinja \ + -DTHEROCK_PACKAGE_VERSION=${{ inputs.rocm_package_version }} \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + ${{ steps.stage_config.outputs.cmake_args }} + + - name: Build stage + run: | + cmake --build build --target stage-${STAGE_NAME} therock-artifacts -- -k 0 + + - name: Report + if: ${{ !cancelled() }} + run: | + echo "CCache Stats:" + ccache -s -v + echo "Artifacts:" + ls -lh build/artifacts/*.tar.xz 2>/dev/null || echo "No artifacts found" + + - name: Configure AWS Credentials (refresh for push) + if: ${{ !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Push stage artifacts + if: ${{ !github.event.pull_request.head.repo.fork }} + run: | + python build_tools/artifact_manager.py push --run-id ${{ github.run_id }} \ + --stage ${STAGE_NAME} \ + --build-dir build + + # ========================================================================== + # STAGE: media (generic) + # ========================================================================== + media: + name: Stage - Media + needs: foundation + runs-on: azure-linux-scale-rocm + timeout-minutes: 180 # 3 hours + permissions: + id-token: write + container: + image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:583d473f263a289222c48d4b493e2956b2354a45796f09dee6f2c8ecd4504ab6 + options: -v /runner/config:/home/awsconfig/ + env: + AWS_SHARED_CREDENTIALS_FILE: /home/awsconfig/credentials.ini + STAGE_NAME: media + steps: + - name: Checkout Repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + + - name: Install python deps + run: pip install -r requirements.txt + + - name: Adjust git config + run: | + git config --global --add safe.directory $PWD + git config fetch.parallel 10 + + - name: Setup ccache + run: | + 
./build_tools/setup_ccache.py \ + --config-preset "github-oss-presubmit" \ + --dir "$(dirname $CCACHE_CONFIGPATH)" \ + --local-path "$CACHE_DIR/ccache" + + - name: Runner health status + run: | + ./build_tools/health_status.py + + - name: Configure AWS Credentials + if: ${{ !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Fetch inbound artifacts + if: ${{ !github.event.pull_request.head.repo.fork }} + run: | + python build_tools/artifact_manager.py fetch --run-id ${{ github.run_id }} \ + --stage ${STAGE_NAME} \ + --output-dir build \ + --bootstrap + + - name: Fetch sources + timeout-minutes: 30 + run: ./build_tools/fetch_sources.py --stage ${STAGE_NAME} --jobs 12 --depth 1 + + - name: Get stage configuration + id: stage_config + run: | + python build_tools/configure_stage.py \ + --stage ${STAGE_NAME} \ + --dist-amdgpu-families "${{ inputs.dist_amdgpu_families }}" \ + --gha-output + + - name: Install stage python deps + if: ${{ steps.stage_config.outputs.pip_install_cmd }} + run: pip install ${{ steps.stage_config.outputs.pip_install_cmd }} + + - name: Configure + run: | + cmake -B build -S . 
-GNinja \ + -DTHEROCK_PACKAGE_VERSION=${{ inputs.rocm_package_version }} \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + ${{ steps.stage_config.outputs.cmake_args }} + + - name: Build stage + run: | + cmake --build build --target stage-${STAGE_NAME} therock-artifacts -- -k 0 + + - name: Report + if: ${{ !cancelled() }} + run: | + echo "CCache Stats:" + ccache -s -v + echo "Artifacts:" + ls -lh build/artifacts/*.tar.xz 2>/dev/null || echo "No artifacts found" + + - name: Configure AWS Credentials + if: ${{ !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Push stage artifacts + if: ${{ !github.event.pull_request.head.repo.fork }} + run: | + python build_tools/artifact_manager.py push --run-id ${{ github.run_id }} \ + --stage ${STAGE_NAME} \ + --build-dir build diff --git a/.github/workflows/multi_arch_ci.yml b/.github/workflows/multi_arch_ci.yml new file mode 100644 index 000000000..73a6a74b9 --- /dev/null +++ b/.github/workflows/multi_arch_ci.yml @@ -0,0 +1,142 @@ +# Multi-Arch CI +# +# This is a staging workflow for the sharded multi-arch build pipeline. +# It mirrors ci.yml but uses multi_arch_build_portable_linux.yml instead of +# ci_linux.yml. Once validated, ci.yml will be updated to use the multi-arch +# sub-workflows directly. + +name: Multi-Arch CI + +on: + push: + branches: + # While we are iterating on testing. + - 'multi_arch/**' + workflow_dispatch: + inputs: + linux_amdgpu_families: + type: string + description: "Insert comma-separated list of Linux GPU families to build and test. ex: gfx94X, gfx1201X" + default: "" + linux_test_labels: + type: string + description: "If enabled, reduce test set on Linux to the list of labels prefixed with 'test:'. 
ex: test:rocprim, test:hipcub" + default: "" + linux_use_prebuilt_artifacts: + type: boolean + description: "If enabled, the CI will pull Linux artifacts using artifact_run_id and only run tests" + windows_amdgpu_families: + type: string + description: "Insert comma-separated list of Windows GPU families to build and test. ex: gfx94X, gfx1201X" + default: "" + windows_test_labels: + type: string + description: "If enabled, reduce test set on Windows to the list of labels prefixed with 'test:' ex: test:rocprim, test:hipcub" + default: "" + windows_use_prebuilt_artifacts: + type: boolean + description: "If enabled, the CI will pull Windows artifacts using artifact_run_id and only run tests" + artifact_run_id: + type: string + description: "If provided, the tests will run on this artifact ID" + default: "" + # pull_request: + # types: + # - labeled + # - opened + # - synchronize + +permissions: + contents: read + +concurrency: + # A PR number if a pull request and otherwise the commit hash. This cancels + # queued and in-progress runs for the same PR (presubmit) or commit + # (postsubmit). The workflow name is prepended to avoid conflicts between + # different workflows. 
+ group: ${{ github.workflow }}-${{ github.event.number || github.sha }} + cancel-in-progress: true + +jobs: + setup: + uses: ./.github/workflows/setup.yml + with: + build_variant: "release" + multi_arch: true + + linux_build_and_test: + name: Linux::${{ matrix.variant.build_variant_label }} + needs: setup + if: >- + ${{ + needs.setup.outputs.linux_variants != '[]' && + needs.setup.outputs.enable_build_jobs == 'true' + }} + strategy: + fail-fast: false + matrix: + variant: ${{ fromJSON(needs.setup.outputs.linux_variants) }} + uses: ./.github/workflows/multi_arch_build_portable_linux.yml + secrets: inherit + with: + matrix_per_family_json: ${{ matrix.variant.matrix_per_family_json }} + dist_amdgpu_families: ${{ matrix.variant.dist_amdgpu_families }} + artifact_group: ${{ matrix.variant.artifact_group }} + build_variant_label: ${{ matrix.variant.build_variant_label }} + build_variant_suffix: ${{ matrix.variant.build_variant_suffix }} + build_variant_cmake_preset: ${{ matrix.variant.build_variant_cmake_preset }} + test_labels: ${{ needs.setup.outputs.linux_test_labels }} + artifact_run_id: ${{ inputs.artifact_run_id }} + expect_failure: ${{ matrix.variant.expect_failure == true }} + use_prebuilt_artifacts: ${{ inputs.linux_use_prebuilt_artifacts == true && 'true' || 'false' }} + rocm_package_version: ${{ needs.setup.outputs.rocm_package_version }} + test_type: ${{ needs.setup.outputs.test_type }} + permissions: + contents: read + id-token: write + + # TODO: Add windows_build_and_test when ready + # windows_build_and_test: + # name: Windows::${{ matrix.variant.family }}::${{ matrix.variant.build_variant_label }} + # needs: setup + # if: >- + # ${{ + # needs.setup.outputs.windows_variants != '[]' && + # needs.setup.outputs.enable_build_jobs == 'true' + # }} + # strategy: + # fail-fast: false + # matrix: + # variant: ${{ fromJSON(needs.setup.outputs.windows_variants) }} + # uses: ./.github/workflows/ci_windows.yml + # ... 
+ + ci_summary: + name: CI Summary + if: always() + needs: + - setup + - linux_build_and_test + runs-on: ubuntu-24.04 + steps: + - name: Output failed jobs + env: + # Pass the needs context through the environment rather than interpolating it into the script, so a quote inside a job output cannot terminate the shell string. + NEEDS_JSON: ${{ toJson(needs) }} + run: | + # Build a list of failed jobs, but ignore those marked continue-on-error + FAILED_JOBS="$(printf '%s' "${NEEDS_JSON}" \ + | jq --raw-output ' + to_entries + | map(select( + (.value.result != "success" and .value.result != "skipped") + and (.value.outputs.continue_on_error | not) + )) + | map(.key) + | join(",") + ' \ + )" + + if [[ -n "${FAILED_JOBS}" ]]; then + echo "The following jobs failed: ${FAILED_JOBS}" + exit 1 + else + echo "All required jobs succeeded (continue-on-error jobs ignored)." + fi diff --git a/.github/workflows/pr-request-release-note.yml b/.github/workflows/pr-request-release-note.yml new file mode 100644 index 000000000..c2dc2de65 --- /dev/null +++ b/.github/workflows/pr-request-release-note.yml @@ -0,0 +1,49 @@ +name: PR Request Release Note + +permissions: + contents: read + +on: + pull_request: + types: + - closed + +jobs: + request-release-note: + if: >- + github.repository_owner == 'llvm' && + startsWith(github.ref, 'refs/heads/release') + + runs-on: ubuntu-24.04 + steps: + # We need to pull the script from the main branch, so that we ensure + # we get the latest version of this script. + - name: Checkout Scripts + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + sparse-checkout: | + llvm/utils/git/requirements.txt + llvm/utils/git/github-automation.py + sparse-checkout-cone-mode: false + + - name: Install Dependencies + run: | + pip install --require-hashes -r llvm/utils/git/requirements.txt + + - name: Request Release Note + env: + # We need to use an llvmbot token here, because we are mentioning a user.
+ GITHUB_TOKEN: ${{ github.token }} + run: | + python3 llvm/utils/git/github-automation.py \ + --repo "$GITHUB_REPOSITORY" \ + --token "$GITHUB_TOKEN" \ + request-release-note \ + --pr-number ${{ github.event.pull_request.number}} + + - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 + if: always() + with: + name: workflow-args + path: | + comments diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 000000000..a129cad3f --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,21 @@ +name: pre-commit + +on: + pull_request: + push: + branches: [main] + +permissions: + contents: read + +jobs: + pre-commit: + runs-on: ubuntu-24.04 + steps: + - name: Checkout TheRock repository + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + repository: "ROCm/TheRock" + fetch-depth: 10 + - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 + - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 diff --git a/.github/workflows/setup.yml b/.github/workflows/setup.yml new file mode 100644 index 000000000..e878e6d2b --- /dev/null +++ b/.github/workflows/setup.yml @@ -0,0 +1,93 @@ +name: Setup + +on: + workflow_call: + inputs: + build_variant: + type: string + default: "release" + multi_arch: + type: boolean + default: false + description: "If true, group all families into one entry per build_variant instead of expanding cross-product" + outputs: + enable_build_jobs: + description: Whether to enable build jobs. + value: true + linux_variants: + description: Matrix variants to run on Linux + value: ${{ jobs.setup.outputs.linux_variants }} + linux_test_labels: + description: ROCm projects to run Linux tests on. Optional filter. + value: ${{ jobs.setup.outputs.linux_test_labels }} + windows_variants: + description: Matrix variants to run on Windows. 
+ value: ${{ jobs.setup.outputs.windows_variants }} + test_type: + description: The test type to run for component tests (i.e. smoke, full) + value: 'smoke' + windows_test_labels: + description: ROCm projects to run Windows tests on. Optional filter. + value: ${{ jobs.setup.outputs.windows_test_labels }} + rocm_package_version: + description: ROCm package version (primarily for Python packages). + value: ${{ jobs.setup.outputs.rocm_package_version }} + +permissions: + contents: read + +jobs: + setup: + runs-on: ubuntu-24.04 + env: + # The commit being checked out is the merge commit for a PR. Its first + # parent will be the tip of the base branch. + BASE_REF: HEAD^ + outputs: + enable_build_jobs: true + linux_variants: ${{ steps.configure.outputs.linux_variants }} + linux_test_labels: ${{ steps.configure.outputs.linux_test_labels }} + windows_variants: ${{ steps.configure.outputs.windows_variants }} + test_type: 'smoke' + windows_test_labels: ${{ steps.configure.outputs.windows_test_labels }} + rocm_package_version: ${{ steps.rocm_package_version.outputs.rocm_package_version }} + steps: + - name: Checkout TheRock repository + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + repository: "ROCm/TheRock" + ref: ${{ secrets.THEROCK_MAINLINE_REF }} + fetch-depth: 10 + - name: SHA of TheRock + run: | + git rev-parse HEAD + git log -1 + - name: Set PR_LABELS variable with labels assigned to pull request + if: ${{ github.event.pull_request }} # only set PR labels var if this is a pull request + env: + GITHUB_TOKEN: ${{ github.token }} + PR_NUMBER: ${{ github.event.number }} + run: | + echo "PR_LABELS=$(gh pr view ${PR_NUMBER} --repo ROCm/HIPIFY --json labels)" >> $GITHUB_ENV + + - name: Configuring CI options + id: configure + env: + #INPUT_LINUX_AMDGPU_FAMILIES: ${{ github.event.inputs.linux_amdgpu_families }} + INPUT_LINUX_AMDGPU_FAMILIES: "gfx94X" + LINUX_TEST_LABELS: ${{ github.event.inputs.linux_test_labels }} + 
LINUX_USE_PREBUILT_ARTIFACTS: ${{ github.event.inputs.linux_use_prebuilt_artifacts }} + #INPUT_WINDOWS_AMDGPU_FAMILIES: ${{ github.event.inputs.windows_amdgpu_families }} + INPUT_WINDOWS_AMDGPU_FAMILIES: "gfx1151" + WINDOWS_TEST_LABELS: ${{ github.event.inputs.windows_test_labels }} + WINDOWS_USE_PREBUILT_ARTIFACTS: ${{ github.event.inputs.windows_use_prebuilt_artifacts }} + BUILD_VARIANT: ${{ inputs.build_variant }} + MULTI_ARCH: ${{ inputs.multi_arch }} + # Variable comes from ROCm organization variable 'ROCM_THEROCK_TEST_RUNNERS' + ROCM_THEROCK_TEST_RUNNERS: ${{ vars.ROCM_THEROCK_TEST_RUNNERS }} + LOAD_TEST_RUNNERS_FROM_VAR: false + run: ./build_tools/github_actions/configure_ci.py + + - name: Compute package version + id: rocm_package_version + run: python ./build_tools/compute_rocm_package_version.py --release-type=dev diff --git a/.github/workflows/test_artifacts.yml b/.github/workflows/test_artifacts.yml new file mode 100644 index 000000000..53a1e2442 --- /dev/null +++ b/.github/workflows/test_artifacts.yml @@ -0,0 +1,122 @@ +name: Test Artifacts + +on: + workflow_dispatch: + inputs: + artifact_group: + type: string + artifact_run_id: + type: string + default: "" + amdgpu_families: + type: string + test_runs_on: + type: string + sanity_check_only_for_family: + type: boolean + default: false + test_type: + type: string + test_labels: + type: string + workflow_call: + inputs: + artifact_group: + type: string + artifact_run_id: + type: string + default: "" + amdgpu_families: + type: string + test_runs_on: + type: string + sanity_check_only_for_family: + type: boolean + default: false + test_type: + type: string + test_labels: + type: string + push: + branches: + - ADHOCBUILD + +permissions: + contents: read + +jobs: + configure_test_matrix: + name: "Configure test matrix" + # if there is a test machine available + if: ${{ inputs.test_runs_on != '' }} + runs-on: ${{ inputs.test_runs_on }} + outputs: + components: ${{ steps.configure.outputs.components }} + 
platform: ${{ steps.configure.outputs.platform }} + shard_arr: ${{ steps.configure.outputs.shard_arr }} + steps: + - name: "Fetch 'build_tools' from repository" + if: ${{ runner.os == 'Windows' }} + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: "ROCm/TheRock" + sparse-checkout: build_tools + ref: ${{ secrets.THEROCK_MAINLINE_REF }} + path: "prejob" + + # Checkout failure is possible on Windows, as it's the first job on a GPU test runner. + # Post-job cleanup isn't necessary since no executables are launched in this job. + - name: Pre-job cleanup processes on Windows + if: ${{ runner.os == 'Windows' }} + shell: powershell + run: . '${{ github.workspace }}\prejob\build_tools\github_actions\cleanup_processes.ps1' + + - name: "Checking out repository" + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: "ROCm/TheRock" + ref: ${{ secrets.THEROCK_MAINLINE_REF }} + + + - name: Setting up Python + uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 + with: + python-version: "3.12" # quoted so YAML keeps the version a string instead of parsing it as a float + + - name: "Configuring CI options" + id: configure + env: + ARTIFACT_GROUP: ${{ inputs.artifact_group }} + AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }} + TEST_TYPE: ${{ inputs.test_type }} + TEST_LABELS: ${{ inputs.test_labels }} + run: python ./build_tools/github_actions/fetch_test_configurations.py + + test_sanity_check: + name: 'Test Sanity Check' + needs: configure_test_matrix + uses: './.github/workflows/test_sanity_check.yml' + with: + artifact_group: ${{ inputs.artifact_group }} + artifact_run_id: ${{ inputs.artifact_run_id }} + amdgpu_families: ${{ inputs.amdgpu_families }} + test_runs_on: ${{ inputs.test_runs_on }} + platform: ${{ needs.configure_test_matrix.outputs.platform }} + + test_components: + name: 'Test ${{ matrix.components.job_name }}' + needs: [test_sanity_check, configure_test_matrix] + # skip tests if no test matrix to run and sanity check only requested
if: ${{ needs.configure_test_matrix.outputs.components != '[]' && !inputs.sanity_check_only_for_family }} + strategy: + fail-fast: false + matrix: + components: ${{ fromJSON(needs.configure_test_matrix.outputs.components) }} + uses: './.github/workflows/test_component.yml' + with: + artifact_run_id: ${{ inputs.artifact_run_id }} + artifact_group: ${{ inputs.artifact_group }} + amdgpu_families: ${{ inputs.amdgpu_families }} + test_runs_on: ${{ inputs.test_runs_on }} + platform: ${{ needs.configure_test_matrix.outputs.platform }} + component: ${{ toJSON(matrix.components) }} diff --git a/.github/workflows/test_component.yml b/.github/workflows/test_component.yml new file mode 100644 index 000000000..7475e9643 --- /dev/null +++ b/.github/workflows/test_component.yml @@ -0,0 +1,110 @@ +name: Test component + +on: + workflow_call: + inputs: + artifact_run_id: + type: string + default: "" + artifact_group: + type: string + amdgpu_families: + type: string + test_runs_on: + type: string + platform: + type: string + component: + type: string + +permissions: + contents: read + +jobs: + test_component: + name: 'Test ${{ fromJSON(inputs.component).job_name }} (shard ${{ matrix.shard }} of ${{ fromJSON(inputs.component).total_shards }})' + runs-on: ${{ inputs.test_runs_on }} + timeout-minutes: 210 + container: + image: ${{ inputs.platform == 'linux' && 'ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:4150afe4759d14822f0e3f8930e1124f26e11f68b5c7b91ec9a02b20b1ebbb98' || null }} + options: --ipc host + --group-add video + --device /dev/kfd + --device /dev/dri + --group-add 110 + --env-file /etc/podinfo/gha-gpu-isolation-settings + --user 0:0 # Running as root, by recommendation of GitHub: https://docs.github.com/en/actions/reference/workflows-and-actions/dockerfile-support#user + strategy: + fail-fast: false + matrix: + # The shard array is based on "total_shards" from "fetch_test_configurations.py" + # The test executable will shard based on the array. 
(ex: [1, 2, 3, 4] = four test shards) + shard: ${{ fromJSON(inputs.component).shard_arr }} + defaults: + run: + shell: bash + env: + VENV_DIR: ${{ github.workspace }}/.venv + ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id != '' && inputs.artifact_run_id || github.run_id }}" + OUTPUT_ARTIFACTS_DIR: "./build" + THEROCK_BIN_DIR: "./build/bin" + AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }} + steps: + - name: "Fetch 'build_tools' from repository" + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: "ROCm/TheRock" + ref: ${{ secrets.THEROCK_MAINLINE_REF }} + sparse-checkout: build_tools + path: "prejob" + + - name: Pre-job cleanup processes on Windows + if: ${{ runner.os == 'Windows' }} + shell: powershell + run: . '${{ github.workspace }}\prejob\build_tools\github_actions\cleanup_processes.ps1' + + - name: Checkout Repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: "ROCm/TheRock" + ref: ${{ secrets.THEROCK_MAINLINE_REF }} + + - name: Run setup test environment workflow + uses: './.github/actions/setup_test_environment' + with: + ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }} + ARTIFACT_GROUP: ${{ inputs.artifact_group }} + OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }} + VENV_DIR: ${{ env.VENV_DIR }} + FETCH_ARTIFACT_ARGS: ${{ fromJSON(inputs.component).fetch_artifact_args }} + IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }} + + # safe.directory must be set before Runner Health Status + - name: Adjust git config + run: | + git config --global --add safe.directory $PWD + git config fetch.parallel 10 + + - name: Runner health status + run: | + python ./build_tools/health_status.py + + - name: Driver / GPU sanity check + run: | + python ./build_tools/print_driver_gpu_info.py + + - name: Test + timeout-minutes: ${{ fromJSON(inputs.component).timeout_minutes }} + env: + SHARD_INDEX: ${{ matrix.shard }} + TOTAL_SHARDS: ${{ 
fromJSON(inputs.component).total_shards }} + TEST_TYPE: ${{ fromJSON(inputs.component).test_type }} + run: | + ${{ fromJSON(inputs.component).test_script }} + + # GitHub's 'Complete job' step is unaware of launched executables + # and will fail to clean up orphan processes. + - name: Post-job cleanup processes on Windows + if: ${{ always() && runner.os == 'Windows' }} + shell: powershell + run: . '${{ github.workspace }}\build_tools\github_actions\cleanup_processes.ps1' diff --git a/.github/workflows/test_jax_dockerfile.yml b/.github/workflows/test_jax_dockerfile.yml new file mode 100644 index 000000000..a577dbe5e --- /dev/null +++ b/.github/workflows/test_jax_dockerfile.yml @@ -0,0 +1,55 @@ +name: Test JAX Wheels + +on: + workflow_dispatch: + inputs: + test_runs_on: + required: true + type: string + default: "linux-mi325-1gpu-ossci-rocm-frac" + image_name: + required: true + description: JAX docker image to run tests with + type: string + jax_version: + description: Version of JAX to install + required: false + type: string + jax_plugin_branch: + required: true + description: JAX plugin branch to checkout + type: string + default: "rocm-jaxlib-v0.6.0" + + workflow_call: + inputs: + test_runs_on: + required: true + type: string + image_name: + required: true + description: JAX docker image to run tests with + type: string + jax_version: + description: Version of JAX to install instead of the one on the docker image + required: false + type: string + jax_plugin_branch: + description: JAX plugin branch to checkout to use for test scripts + type: string + default: "rocm-jaxlib-v0.8.0" + +permissions: + contents: read + +jobs: + test_wheels: + name: Test + runs-on: ${{ inputs.test_runs_on }} + steps: + - name: Checkout + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: rocm/rocm-jax # actions/checkout reads `repository`; `repo` is not a recognized input and left the calling repo checked out + ref: ${{ inputs.jax_plugin_branch }} # check out the plugin branch requested by the caller + # TODO: Add steps for creating the JAX docker image with an install of TheRock and then running JAX tests on the container diff --git
a/.github/workflows/test_linux_jax_wheels.yml b/.github/workflows/test_linux_jax_wheels.yml
new file mode 100644
index 000000000..00823960f
--- /dev/null
+++ b/.github/workflows/test_linux_jax_wheels.yml
@@ -0,0 +1,207 @@
+name: Test Linux JAX Wheels
+
+on:
+  workflow_call:
+    inputs:
+      amdgpu_family:
+        required: true
+        type: string
+      release_type:
+        required: true
+        type: string
+      s3_subdir:
+        required: true
+        type: string
+      package_index_url:
+        description: Base CloudFront URL for the Python package index
+        required: true
+        type: string
+      rocm_version:
+        description: ROCm version used to pick the TheRock release to install and the +rocm<version> wheel suffix
+        required: false
+        type: string
+      tar_url:
+        description: URL to TheRock tarball to configure ROCm
+        required: true
+        type: string
+      python_version:
+        description: Python version(s) to test (e.g., "3.12")
+        required: true
+        type: string
+      repository:
+        description: "Repository to checkout. Otherwise, defaults to `github.repository`."
+        type: string
+      jax_ref:
+        description: rocm-jax repository ref/branch to check out
+        required: false
+        type: string
+      ref:
+        description: "Branch, tag or SHA to checkout. Defaults to the reference or SHA that triggered the workflow."
+        type: string
+      test_runs_on:
+        required: true
+        type: string
+
+  workflow_dispatch:
+    inputs:
+      amdgpu_family:
+        type: choice
+        options:
+          - gfx101X-dgpu
+          - gfx103X-dgpu
+          - gfx110X-all
+          - gfx1150
+          - gfx1151
+          - gfx120X-all
+          - gfx90X-dcgpu
+          - gfx94X-dcgpu
+          - gfx950-dcgpu
+        default: gfx94X-dcgpu
+      release_type:
+        description: The type of release ("nightly" or "dev")
+        required: true
+        type: string
+        default: dev
+      s3_subdir:
+        description: S3 subdirectory, not including the GPU-family
+        required: true
+        type: string
+        default: v2
+      package_index_url:
+        description: Base CloudFront URL for the Python package index
+        required: true
+        type: string
+        default: https://rocm.nightlies.amd.com/v2-staging/
+      rocm_version:
+        description: ROCm version
+        required: false
+        type: string
+      tar_url:
+        description: URL to TheRock tarball to configure ROCm
+        required: true
+        type: string
+      python_version:
+        description: Python version(s) to test (e.g., "3.12")
+        required: true
+        type: string
+        default: "3.12"
+      jax_ref:
+        description: rocm-jax repository ref/branch to check out
+        required: false
+        type: string
+      test_runs_on:
+        description: Runner label to use. The selected runner should have a GPU supported by amdgpu_family
+        required: true
+        type: string
+        default: "linux-mi325-1gpu-ossci-rocm-frac"
+      ref:
+        description: "Branch, tag or SHA to checkout. Defaults to the reference or SHA that triggered the workflow."
+        type: string
+
+permissions:
+  contents: read
+  packages: read
+
+jobs:
+  test_jax_wheels:
+    name: Test JAX Wheels | ${{ inputs.amdgpu_family }}
+    runs-on: ${{ inputs.test_runs_on }}
+    container:
+      image: ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:405945a40deaff9db90b9839c0f41d4cba4a383c1a7459b28627047bf6302a26
+      options: >-
+        --device /dev/kfd
+        --device /dev/dri
+        --group-add render
+        --group-add video
+        --user root
+        --env-file /etc/podinfo/gha-gpu-isolation-settings
+    defaults:
+      run:
+        shell: bash
+    env:
+      VIRTUAL_ENV: ${{ github.workspace }}/.venv
+      AMDGPU_FAMILY: ${{ inputs.amdgpu_family }}
+      THEROCK_TAR_URL: ${{ inputs.tar_url }}
+      PYTHON_VERSION: ${{ inputs.python_version }}
+      PACKAGE_INDEX_URL: ${{ inputs.package_index_url }}
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        with:
+          repository: ${{ inputs.repository || github.repository }}
+          ref: ${{ inputs.ref || '' }}
+
+      - name: Checkout rocm-jax (plugin + build scripts)
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        with:
+          path: jax
+          repository: rocm/rocm-jax
+          ref: ${{ inputs.jax_ref }}
+
+      - name: Checkout JAX extended tests repo
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        with:
+          repository: rocm/jax
+          ref: ${{ inputs.jax_ref }}
+          path: jax/jax_tests
+
+      - name: Set up Python
+        uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
+        with:
+          python-version: ${{ inputs.python_version }}
+          check-latest: true
+
+      - name: System deps, venv configure
+        run: |
+          python3 -m venv "${VIRTUAL_ENV}"
+          echo "PATH=${VIRTUAL_ENV}/bin:${PATH}" >> "$GITHUB_ENV"
+          # This script sets up the venv and activates it across steps; keep it consistent
+          python3 build_tools/setup_venv.py "${VIRTUAL_ENV}" --activate-in-future-github-actions-steps
+
+      - name: Install base JAX test requirements
+        run: |
+          pip install -r external-builds/jax/requirements-jax.txt
+
+      - name: Configure ROCm from TheRock tarball
+        env:
+          ROCM_VERSION: ${{ inputs.rocm_version }}
+          AMDGPU_FAMILY: ${{ inputs.amdgpu_family }}
+        run: |
+          DEST="/opt/rocm-${ROCM_VERSION}"
+          # Install directly from TheRock release buckets (nightly/dev) using the provided version
+          python build_tools/install_rocm_from_artifacts.py \
+            --release "${ROCM_VERSION}" \
+            --artifact-group "${AMDGPU_FAMILY}" \
+            --output-dir "${DEST}"
+
+      - name: Extract JAX version and set to GITHUB_ENV
+        run: |
+          # Extract JAX version from requirements.txt (e.g., "jax==0.8.0")
+          # Remove all whitespace from requirements.txt to simplify parsing
+          # Search for lines starting with "jax==" or "jaxlib==" followed by version (excluding comments)
+          # Extract the version number by splitting on '=' and taking the 3rd field
+          # [^#]+ matches one or more characters that are NOT '#', ensuring we stop before any inline comments
+          JAX_VERSION=$(tr -d ' ' < jax/build/requirements.txt \
+            | grep -E '^(jax|jaxlib)==[^#]+' | head -n1 | cut -d'=' -f3)
+          echo "JAX_VERSION=$JAX_VERSION" >> "$GITHUB_ENV"
+
+      - name: Install JAX wheels from package index
+        env:
+          ROCM_VERSION: ${{ inputs.rocm_version }}
+        run: |
+          # Drop any trailing slash from the base URL before appending the GPU family
+          WHEEL_INDEX_URL="${PACKAGE_INDEX_URL%/}/${AMDGPU_FAMILY}"
+          # Install jaxlib/plugin/pjrt from the GPU-family index; install jax from PyPI to match the version
+          pip install --index-url "${WHEEL_INDEX_URL}" \
+            "jaxlib==${JAX_VERSION}+rocm${ROCM_VERSION}" \
+            "jax-rocm7-plugin==${JAX_VERSION}+rocm${ROCM_VERSION}" \
+            "jax-rocm7-pjrt==${JAX_VERSION}+rocm${ROCM_VERSION}"
+          pip install --extra-index-url https://pypi.org/simple "jax==${JAX_VERSION}"
+
+      - name: Run JAX tests
+        run: |
+          pytest jax/jax_tests/tests/multi_device_test.py -q --log-cli-level=INFO
+          pytest jax/jax_tests/tests/core_test.py -q --log-cli-level=INFO
+          pytest jax/jax_tests/tests/util_test.py -q --log-cli-level=INFO
+          pytest jax/jax_tests/tests/scipy_stats_test.py -q --log-cli-level=INFO
diff --git 
a/.github/workflows/test_pytorch_wheels.yml b/.github/workflows/test_pytorch_wheels.yml
new file mode 100644
index 000000000..93fe73a70
--- /dev/null
+++ b/.github/workflows/test_pytorch_wheels.yml
@@ -0,0 +1,200 @@
+name: Test PyTorch Wheels
+
+on:
+  workflow_dispatch:
+    inputs:
+      amdgpu_family:
+        description: GPU family to test
+        required: true
+        type: string
+        default: "gfx94X-dcgpu"
+      test_runs_on:
+        description: Runner label to use. The selected runner should have a GPU supported by amdgpu_family
+        required: true
+        type: string
+        default: "linux-mi325-1gpu-ossci-rocm-frac"
+      package_index_url:
+        description: Base Python package index URL to test, typically nightly/dev URL with a "v2" or "v2-staging" subdir (without a GPU family subdir)
+        required: true
+        type: string
+        default: "https://rocm.nightlies.amd.com/v2"
+      python_version:
+        required: true
+        type: string
+        default: "3.12"
+      torch_version:
+        description: torch package version to install. (e.g. "2.7.1+rocm7.10.0a20251120")
+        required: true
+        type: string
+      pytorch_git_ref:
+        description: PyTorch ref to checkout test sources from. (e.g. "nightly", or "release/2.7")
+        type: string
+        default: "release/2.7"
+
+  workflow_call:
+    inputs:
+      amdgpu_family:
+        required: true
+        type: string
+      test_runs_on:
+        required: true
+        type: string
+      package_index_url:
+        required: true
+        type: string
+      python_version:
+        required: true
+        type: string
+      torch_version:
+        required: true
+        type: string
+      pytorch_git_ref:
+        type: string
+        default: "release/2.7"
+      repository:
+        description: "Repository to checkout. Otherwise, defaults to `github.repository`."
+        type: string
+      ref:
+        description: "Branch, tag or SHA to checkout. Defaults to the reference or SHA that triggered the workflow."
+        type: string
+
+permissions:
+  contents: read
+
+run-name: Test PyTorch (${{ inputs.amdgpu_family }}, ${{ inputs.torch_version }}, ${{ inputs.test_runs_on }})
+
+jobs:
+  test_wheels:
+    name: Test PyTorch | ${{ inputs.amdgpu_family }}
+    runs-on: ${{ inputs.test_runs_on }}
+    container:
+      image: ${{ contains(inputs.test_runs_on, 'linux') && 'ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:405945a40deaff9db90b9839c0f41d4cba4a383c1a7459b28627047bf6302a26' || null }}
+      options: --ipc host
+        --group-add video
+        --device /dev/kfd
+        --device /dev/dri
+        --group-add 110
+        --env-file /etc/podinfo/gha-gpu-isolation-settings
+        --user 0:0 # Running as root, by recommendation of GitHub: https://docs.github.com/en/actions/reference/workflows-and-actions/dockerfile-support#user
+    defaults:
+      run:
+        shell: bash
+    env:
+      VENV_DIR: ${{ github.workspace }}/.venv
+      AMDGPU_FAMILY: ${{ inputs.amdgpu_family }}
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        with:
+          repository: ${{ inputs.repository || github.repository }}
+          ref: ${{ inputs.ref || '' }}
+
+      - name: Set up Python
+        uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
+        with:
+          python-version: ${{ inputs.python_version }}
+
+      # TODO: also upload and reference test report together with this logging?
+      - name: Summarize workflow inputs
+        # Inputs reach the script as environment variables, quoted, instead of being expanded into the shell source
+        env:
+          TORCH_VERSION: ${{ inputs.torch_version }}
+          PYTORCH_GIT_REF: ${{ inputs.pytorch_git_ref }}
+          PACKAGE_INDEX_URL: ${{ inputs.package_index_url }}
+        run: |
+          python build_tools/github_actions/summarize_test_pytorch_workflow.py \
+            --torch-version="${TORCH_VERSION}" \
+            --pytorch-git-ref="${PYTORCH_GIT_REF}" \
+            --index-url="${PACKAGE_INDEX_URL}" \
+            --index-subdir="${AMDGPU_FAMILY}"
+
+      - name: Set git options
+        run: |
+          git config --global core.longpaths true
+
+      # Here we checkout the same version of PyTorch that wheels were built from
+      # so we have the right set of test source files. We _probably_ don't need
+      # to run HIPIFY or apply any patches, so we skip those steps to save time.
+      - name: Checkout PyTorch Source Repos from nightly branch
+        if: ${{ (inputs.pytorch_git_ref == 'nightly') }}
+        run: |
+          python external-builds/pytorch/pytorch_torch_repo.py checkout \
+            --gitrepo-origin https://github.com/pytorch/pytorch.git \
+            --repo-hashtag nightly \
+            --no-hipify --no-patch
+
+      - name: Checkout PyTorch Source Repos from stable branch
+        if: ${{ (inputs.pytorch_git_ref != 'nightly') }}
+        env:
+          PYTORCH_GIT_REF: ${{ inputs.pytorch_git_ref }}
+        run: |
+          python external-builds/pytorch/pytorch_torch_repo.py checkout \
+            --gitrepo-origin https://github.com/ROCm/pytorch.git \
+            --repo-hashtag "${PYTORCH_GIT_REF}" \
+            --no-hipify --no-patch
+
+      - name: Set up virtual environment
+        env:
+          TORCH_VERSION: ${{ inputs.torch_version }}
+          PACKAGE_INDEX_URL: ${{ inputs.package_index_url }}
+        run: |
+          python build_tools/setup_venv.py "${VENV_DIR}" \
+            --packages "torch==${TORCH_VERSION}" \
+            --index-url="${PACKAGE_INDEX_URL}" \
+            --index-subdir="${AMDGPU_FAMILY}" \
+            --activate-in-future-github-actions-steps
+
+      - name: Install test requirements
+        run: |
+          python -m pip install -r external-builds/pytorch/requirements-test.txt
+          pip freeze
+
+      - name: Run rocm-sdk sanity tests
+        run: |
+          rocm-sdk test
+
+      - name: Run PyTorch smoketests
+        run: |
+          python ./external-builds/pytorch/run_pytorch_smoke_tests.py -- \
+            --log-cli-level=INFO \
+            -v
+
+      - name: (Linux) Run PyTorch tests
+        if: ${{ runner.os == 'Linux' }}
+        run: |
+          python ./external-builds/pytorch/run_pytorch_tests.py -- \
+            --continue-on-collection-errors \
+            --import-mode=importlib \
+            -v
+
+      # Windows testing is a recent addition and is being enabled incrementally.
+      # See https://github.com/ROCm/TheRock/issues/2258.
+      #
+      # Many tests are failing on torch 2.10+ so we limit testing to 2.9.
+      # (Obviously that's not ideal, but we need to start somewhere)
+      #
+      # HACK: The test process does not terminate on its own gracefully,
+      # so we write to run_pytorch_tests_exit_code.txt and then kill the process.
+      # After killing the process we read the return code to signal it normally.
+      # See https://github.com/ROCm/TheRock/issues/999.
+      - name: (Windows) Run PyTorch tests
+        if: ${{ runner.os == 'Windows' && startsWith(inputs.torch_version, '2.9') }}
+        continue-on-error: true
+        run: |
+          python ./external-builds/pytorch/run_pytorch_tests.py -- \
+            --continue-on-collection-errors \
+            --import-mode=importlib \
+            -v
+
+      - name: (Windows) Read and propagate exit code
+        if: ${{ runner.os == 'Windows' && startsWith(inputs.torch_version, '2.9') }}
+        run: |
+          if [ -f run_pytorch_tests_exit_code.txt ]; then
+            EXIT_CODE=$(cat run_pytorch_tests_exit_code.txt)
+            echo "Exit code from file: ${EXIT_CODE}"
+            exit "${EXIT_CODE}"
+          else
+            echo "No run_pytorch_tests_exit_code.txt found"
+            exit 1
+          fi
diff --git a/.github/workflows/test_sanity_check.yml b/.github/workflows/test_sanity_check.yml
new file mode 100644
index 000000000..830e6beae
--- /dev/null
+++ b/.github/workflows/test_sanity_check.yml
@@ -0,0 +1,118 @@
+name: TheRock Sanity Check
+
+on:
+  workflow_dispatch:
+    inputs:
+      artifact_group:
+        type: string
+      artifact_run_id:
+        type: string
+        default: ""
+      amdgpu_families:
+        type: string
+        default: ""
+      test_runs_on:
+        type: string
+      platform:
+        type: string
+  workflow_call:
+    inputs:
+      artifact_group:
+        type: string
+      artifact_run_id:
+        type: string
+        default: ""
+      amdgpu_families:
+        type: string
+        default: ""
+      test_runs_on:
+        type: string
+      platform:
+        type: string
+  push:
+    branches:
+      - ADHOCBUILD
+
+permissions:
+  contents: read
+
+jobs:
+  test_sanity_check:
+    name: "Sanity ROCM Test"
+    runs-on: ${{ inputs.test_runs_on }}
+    container:
+      image: ${{ inputs.platform == 'linux' && 'ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:405945a40deaff9db90b9839c0f41d4cba4a383c1a7459b28627047bf6302a26' || null }}
+      options: --ipc host
+        --group-add video
+        --device /dev/kfd
+        --device /dev/dri
+        --group-add 110
+        --env-file /etc/podinfo/gha-gpu-isolation-settings
+        --user 0:0 # Running as root, by recommendation of GitHub: https://docs.github.com/en/actions/reference/workflows-and-actions/dockerfile-support#user
+    defaults:
+      run:
+        shell: bash
+    env:
+      VENV_DIR: ${{ github.workspace }}/.venv
+      ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id != '' && inputs.artifact_run_id || github.run_id }}"
+      OUTPUT_ARTIFACTS_DIR: ${{ github.workspace }}/build
+      THEROCK_BIN_DIR: ${{ github.workspace }}/build/bin
+    steps:
+      - name: "Fetch 'build_tools' from repository"
+        if: ${{ runner.os == 'Windows' }}
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        with:
+          sparse-checkout: build_tools
+          path: prejob
+
+      - name: Pre-job cleanup processes on Windows
+        if: ${{ runner.os == 'Windows' }}
+        shell: powershell
+        run: . '${{ github.workspace }}\prejob\build_tools\github_actions\cleanup_processes.ps1'
+
+      - name: Checkout Repository
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        with:
+          repository: "ROCm/TheRock"
+          ref: ${{ secrets.THEROCK_MAINLINE_REF }}
+
+      - name: Pre-job cleanup Docker containers on Linux
+        if: ${{ runner.os == 'Linux' }}
+        shell: bash
+        run: |
+          # Remove any stopped containers
+          docker container prune -f || true
+          # Remove dangling networks
+          docker network prune -f || true
+
+      - name: Run setup test environment workflow
+        uses: './.github/actions/setup_test_environment'
+        with:
+          ARTIFACT_GROUP: ${{ inputs.artifact_group }}
+          ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }}
+          OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }}
+          VENV_DIR: ${{ env.VENV_DIR }}
+          FETCH_ARTIFACT_ARGS: "--base-only"
+          IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }}
+
+      - name: Set HIP_CLANG_PATH for windows
+        if: ${{ runner.os == 'Windows' }}
+        run: echo "HIP_CLANG_PATH=${OUTPUT_ARTIFACTS_DIR}\lib\llvm\bin" >> "$GITHUB_ENV"
+
+      - name: Driver / GPU sanity check
+        run: |
+          python ./build_tools/print_driver_gpu_info.py
+
+      - name: Run ROCm Sanity Tests
+        timeout-minutes: 5
+        env:
+          # Enable verbose logging, see
+          # https://rocm.docs.amd.com/projects/HIP/en/latest/how-to/debugging.html
+          AMD_LOG_LEVEL: "4"
+        run: |
+          pytest tests/ --log-cli-level=info --timeout=60
+
+      - name: Post-job cleanup processes on Windows
+        if: ${{ always() && runner.os == 'Windows' }}
+        shell: powershell
+        run: . '${{ github.workspace }}\build_tools\github_actions\cleanup_processes.ps1'
diff --git a/.github/workflows/therock_test_harness.yml b/.github/workflows/therock_test_harness.yml
new file mode 100644
index 000000000..1699af369
--- /dev/null
+++ b/.github/workflows/therock_test_harness.yml
@@ -0,0 +1,103 @@
+name: TheRock Test Harness
+
+on:
+  workflow_dispatch:
+    inputs:
+      families:
+        type: string
+        description: 'The AMD GPU family to test. ex: gfx94X, gfx120X'
+        default: 'gfx94X'
+      release_version:
+        type: string
+        description: 'TheRock release version. (ex: nightly-tarball (X.Y.ZrcYYYYMMDD) or dev-tarball (X.Y.Z.dev0+{hash}))'
+        default: '7.9.0rc20251008'
+      tests_to_run:
+        type: string
+        description: 'The list of tests to run with "or" expression. (ex: "hipcub or rocprim")'
+        default: 'hipcub or rocprim or rocrand or rocthrust'
+
+permissions:
+  contents: read
+
+concurrency:
+  # A PR number if a pull request and otherwise the commit hash. This cancels
+  # queued and in-progress runs for the same PR (presubmit) or commit
+  # (postsubmit). The workflow name is prepended to avoid conflicts between
+  # different workflows.
+  group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
+  cancel-in-progress: true
+
+jobs:
+  setup_metadata:
+    runs-on: ubuntu-24.04
+    outputs:
+      package_targets: ${{ steps.configure.outputs.package_targets }}
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        with:
+          repository: "ROCm/TheRock"
+
+      - name: Setup Python
+        uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
+        with:
+          python-version: "3.12"
+
+      - name: Generating package target matrix
+        id: configure
+        env:
+          AMDGPU_FAMILIES: ${{ inputs.families }}
+          THEROCK_PACKAGE_PLATFORM: "linux"
+          TEST_HARNESS_TARGET_FETCH: "true"
+          # Variable comes from ROCm organization variable 'ROCM_THEROCK_TEST_RUNNERS'
+          ROCM_THEROCK_TEST_RUNNERS: ${{ vars.ROCM_THEROCK_TEST_RUNNERS }}
+          LOAD_TEST_RUNNERS_FROM_VAR: "false"
+        run: python ./build_tools/github_actions/fetch_package_targets.py
+
+
+  therock_test_harness_linux:
+    name: TheRock Tests Sharded Linux Nightly
+    needs: [setup_metadata]
+    runs-on: ${{ matrix.target_bundle.test_machine }}
+    container:
+      image: 'ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:4150afe4759d14822f0e3f8930e1124f26e11f68b5c7b91ec9a02b20b1ebbb98'
+      options: --ipc host
+        --group-add video
+        --device /dev/kfd
+        --device /dev/dri
+        --group-add 110
+        --env-file /etc/podinfo/gha-gpu-isolation-settings
+    strategy:
+      fail-fast: false
+      matrix:
+        target_bundle: ${{ fromJSON(needs.setup_metadata.outputs.package_targets) }}
+    defaults:
+      run:
+        shell: bash
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        with:
+          repository: "ROCm/TheRock"
+
+      - name: Setup Python
+        uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
+        with:
+          python-version: "3.12"
+
+      - name: Install TheRock
+        env:
+          release_version: ${{ inputs.release_version }}
+        run: |
+          pip install -r requirements-test.txt
+          python3 build_tools/install_rocm_from_artifacts.py --tests --amdgpu-family "${{ matrix.target_bundle.amdgpu_family }}" --release "${release_version}"
+
+      # TODO: add parallelism
+      - name: Running test harness
+        env:
+          TESTS_TO_RUN: ${{ inputs.tests_to_run }}
+        # The -k expression contains spaces, so it is quoted to reach pytest as a single argument
+        run: |
+          python3 -m pytest -s -v --tb=short --therock-path=./therock-build tests/harness/tests*.py -k "${TESTS_TO_RUN}"
+
+# TODO: Add windows tests