diff --git a/.github/workflows/backend-test.yaml b/.github/workflows/backend-test.yaml
index 061dd2a05e..c6942f017c 100644
--- a/.github/workflows/backend-test.yaml
+++ b/.github/workflows/backend-test.yaml
@@ -26,14 +26,11 @@ on:
         required: false
         type: string
         default: ''
-
-# NOTE: The following environment variables are for CI runners to
-# mitigate the network connection problems. However, this may not work
-# if the PR is triggered from a forked repository because in that case
-# GitHub doesn't allow the workflow to access any secrets.
-# env:
-#   http_proxy: ${{ secrets.HTTP_PROXY }}
-#   https_proxy: ${{ secrets.HTTPS_PROXY }}
+    secrets:
+      # NOTE: For PRs triggered from a forked repository GitHub doesn't allow
+      # the workflow to access any secrets, so this key is not available there.
+      RUNNER_SSH_KEY:
+        required: true
 
 permissions:
   contents: read
diff --git a/.github/workflows/unittest.yaml b/.github/workflows/unittest.yaml
index 066fd69b34..fe76bc47f9 100644
--- a/.github/workflows/unittest.yaml
+++ b/.github/workflows/unittest.yaml
@@ -113,6 +113,8 @@ jobs:
       runner_label: h20
       gpu_check_script: tools/gpu_check.sh
       test_script: tools/run_backend_tests_nvidia.sh
+    secrets:
+      RUNNER_SSH_KEY: ${{ secrets.RUNNER_SSH_KEY }}
 
   backend-ascend:
     needs: preprocess
@@ -123,6 +125,8 @@ jobs:
       runner_label: ascend_CI
       gpu_check_script: tools/gpu_check_ascend.sh
       test_script: tools/run_backend_tests.sh
+    secrets:
+      RUNNER_SSH_KEY: ${{ secrets.RUNNER_SSH_KEY }}
 
   backend-iluvatar:
     needs: preprocess
@@ -130,9 +134,11 @@ jobs:
     uses: ./.github/workflows/backend-test.yaml
     with:
       vendor: iluvatar
-      runner_label: iluvatar_CI
+      runner_label: iluvatar
       gpu_check_script: tools/gpu_check_iluvatar.sh
       test_script: tools/run_backend_tests.sh
+    secrets:
+      RUNNER_SSH_KEY: ${{ secrets.RUNNER_SSH_KEY }}
 
   backend-metax:
     needs: preprocess
@@ -142,6 +148,8 @@ jobs:
       vendor: metax
       runner_label: metax_CI
       test_script: tools/run_backend_tests_metax.sh
+    secrets:
+      RUNNER_SSH_KEY: ${{ secrets.RUNNER_SSH_KEY }}
 
   backend-moore:
     needs: preprocess
@@ -152,6 +160,8 @@ jobs:
       runner_label: moore_CI
       gpu_check_script: tools/gpu_check_moore.sh
       test_script: tools/run_backend_tests.sh
+    secrets:
+      RUNNER_SSH_KEY: ${{ secrets.RUNNER_SSH_KEY }}
 
   # TODO(Qiming): This job doesn't require an nvidia backend, the generic
   # test-op workflow should be fine.
@@ -165,3 +175,5 @@ jobs:
       gpu_check_script: tools/gpu_check.sh
       test_script: tools/test-op-experimental.sh
       changed_files: ${{ needs.preprocess.outputs.changed_files }}
+    secrets:
+      RUNNER_SSH_KEY: ${{ secrets.RUNNER_SSH_KEY }}
diff --git a/pyproject.toml b/pyproject.toml
index c8eaee5adb..d9060694bb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -62,6 +62,15 @@ official_torch_270 = [
   "torchaudio==2.7.0",
 ]
 
+# NOTE: It turned out that iluvatar requires the native CUDA toolkit 10.2, so
+# the cu12 runtime extra below is kept commented out.
+# cuda_runtime = [
+#   "nvidia-cublas-cu12",
+#   "nvidia-cuda-runtime-cu12",
+#   "nvidia-cuda-nvrtc-cu12",
+#   "nvidia-cudnn-cu12",
+# ]
+
 nvidia = [
   "flag_gems[official_torch_290]",
 ]
diff --git a/src/flag_gems/runtime/backend/_iluvatar/op_black_list.yaml b/src/flag_gems/runtime/backend/_iluvatar/op_black_list.yaml
index b9c43ec695..e046a0943b 100644
--- a/src/flag_gems/runtime/backend/_iluvatar/op_black_list.yaml
+++ b/src/flag_gems/runtime/backend/_iluvatar/op_black_list.yaml
@@ -2,14 +2,17 @@
 # Description: List of operators unsupported (FALSE) in the current environment
 # test based: 2026-03-11
 # Source: Operator Compatibility Test Table
-
+#
 unsupported_operators:
   - name: grouped_topk
-    reason: "All dtypes failed"
+    reason: All dtypes failed
 
   - name: topk_softmax
-    reason: "vllm not supported"
+    reason: vllm not supported
 
 summary:
   total_unsupported: 2
-  note: "This list is based on items explicitly marked as FALSE in the provided table. Certain operators (e.g., conv1d, index_add) have blank entries in some columns and are therefore excluded; their support status requires further verification."
+  note: >-
+    This list is based on items explicitly marked as FALSE in the provided table.
+    Certain operators (e.g., `conv1d`, `index_add`) have blank entries in some columns and
+    are therefore excluded; their support status requires further verification.
diff --git a/src/flag_gems/runtime/backend/_iluvatar/ops/div.py b/src/flag_gems/runtime/backend/_iluvatar/ops/div.py
index e74bb00321..8aaaabdef3 100644
--- a/src/flag_gems/runtime/backend/_iluvatar/ops/div.py
+++ b/src/flag_gems/runtime/backend/_iluvatar/ops/div.py
@@ -6,6 +6,7 @@
 
 from flag_gems.utils import pointwise_dynamic, tl_extra_shim
 
+# TODO: Confirm that a module-level logging.getLogger(__name__) is appropriate here.
 logger = logging.getLogger(__name__)
 div_rn = tl_extra_shim.div_rn
 div_rz = tl_extra_shim.div_rz
diff --git a/tools/gpu_check_iluvatar.sh b/tools/gpu_check_iluvatar.sh
index 49278a4dee..aa0a4a29c8 100644
--- a/tools/gpu_check_iluvatar.sh
+++ b/tools/gpu_check_iluvatar.sh
@@ -4,6 +4,10 @@
 memory_usage_max=30000 # Maximum memory usage limit (MB)
 sleep_time=120 # Wait time (seconds), default is 2 minutes
 
+# Prepend the corex library directory to the loader search path. The
+# ${VAR:+...} form avoids a trailing ':' (an empty entry) when it is unset.
+export LD_LIBRARY_PATH="/usr/local/corex/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
+
 # Get the number of GPUs
 gpu_count=$(ixsmi --query-gpu=name --format=csv,noheader 2>/dev/null | wc -l)
 
diff --git a/tools/run_backend_tests_iluvatar.sh b/tools/run_backend_tests_iluvatar.sh
index 8f3b102fd4..751a58d276 100644
--- a/tools/run_backend_tests_iluvatar.sh
+++ b/tools/run_backend_tests_iluvatar.sh
@@ -3,12 +3,24 @@
 VENDOR=${1:?"Usage: bash tools/run_backend_tests_iluvatar.sh "}
 export GEMS_VENDOR=$VENDOR
 
-source tools/run_command.sh
-
 echo "Running FlagGems tests with GEMS_VENDOR=$VENDOR"
 
-run_command python3 -m pytest -s tests/test_tensor_constructor_ops.py
-run_command python3 -m pytest -s tests/test_shape_utils.py
-run_command python3 -m pytest -s tests/test_tensor_wrapper.py
-run_command python3 -m pytest -s tests/test_pointwise_dynamic.py
-run_command python3 -m pytest -s tests/test_distribution_ops.py
+# Prepend the corex and CUDA compat library directories. The ${VAR:+...} form
+# avoids a trailing ':' (an empty entry) when LD_LIBRARY_PATH is unset.
+# NOTE(review): tools/gpu_check_iluvatar.sh uses /usr/local/corex/lib instead
+# of the versioned path below -- confirm which one is intended.
+export LD_LIBRARY_PATH="/usr/local/corex-4.4.0/lib:/usr/local/cuda/compat${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
+echo "$LD_LIBRARY_PATH"
+export PYENV_ROOT="$HOME/.pyenv"
+export PATH="$PYENV_ROOT/bin:$PATH"
+eval "$(pyenv init - bash)"
+
+pip install -U pip
+pip install uv
+uv venv
+source .venv/bin/activate
+uv pip install setuptools==82.0.1 scikit-build-core==0.12.2 pybind11==3.0.3 cmake==3.31.10 ninja==1.13.0
+# Quote the extras spec so the shell cannot treat [...] as a glob pattern.
+uv pip install -e '.[iluvatar,test]'
+
+pytest -s tests