diff --git a/.github/workflows/qa-l0-pytorch-wheel.yml b/.github/workflows/qa-l0-pytorch-wheel.yml index aef4396ae8..1da2058249 100644 --- a/.github/workflows/qa-l0-pytorch-wheel.yml +++ b/.github/workflows/qa-l0-pytorch-wheel.yml @@ -14,15 +14,14 @@ concurrency: jobs: qa-l0-pytorch-wheel: - runs-on: [ self-hosted, Linux, X64, nvidia, gpu-8 ] + runs-on: [ dx4-nvidia-8gpu-v1 ] defaults: run: shell: bash container: image: harbor.baai.ac.cn/flagscale/cuda12.8.1-torch2.7.1-python3.10-te2.9:20260209 - ports: - - 80:80 options: >- + -v /home/runner/_work/${{ github.run_id }}:${{ github.workspace }} --gpus all --shm-size=500g --privileged @@ -34,6 +33,9 @@ jobs: --pull always steps: + - name: Configure git safe directory + run: git config --system --add safe.directory '*' + - name: Checkout Code uses: actions/checkout@v6.0.1 with: diff --git a/.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml b/.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml index 52299cf411..ffac442b1f 100644 --- a/.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml +++ b/.github/workflows/qa-l0-te-cpp-unittest-pytorch-lint.yml @@ -24,25 +24,28 @@ concurrency: jobs: run-qa-l0-core-tests: - runs-on: [ self-hosted, Linux, X64, nvidia, gpu-8 ] + runs-on: [ dx4-nvidia-8gpu-v1 ] defaults: run: shell: bash container: image: harbor.baai.ac.cn/flagscale/cuda12.8.1-torch2.7.1-python3.10-te2.9:20260209 - ports: - - 80:80 options: >- - --gpus all - --shm-size=500g - --privileged - --ipc=host - --ulimit memlock=-1 - --ulimit stack=67108864 - --ulimit nofile=65535:65535 + -v /home/runner/_work/${{ github.run_id }}:${{ github.workspace }} + --gpus all + --shm-size=500g + --privileged + --ipc=host + --ulimit memlock=-1 + --ulimit stack=67108864 + --ulimit nofile=65535:65535 --user root --pull always steps: + + - name: Configure git safe directory + run: git config --system --add safe.directory '*' + - name: Checkout Code uses: actions/checkout@v6.0.1 with: @@ -59,6 +62,7 @@ jobs: submodules: recursive set-safe-directory: true + - name: Install Dependencies & Build Transformer Engine # timeout-minutes: 40 env: diff --git a/.github/workflows/qa-l1-te-cpp-pytorch-tests.yml b/.github/workflows/qa-l1-te-cpp-pytorch-tests.yml index 51f071aa3b..963410dd0a 100644 --- a/.github/workflows/qa-l1-te-cpp-pytorch-tests.yml +++ b/.github/workflows/qa-l1-te-cpp-pytorch-tests.yml @@ -35,15 +35,14 @@ concurrency: jobs: run-qa-l1-comprehensive-tests: - runs-on: [ self-hosted, Linux, X64, nvidia, gpu-8 ] + runs-on: [ dx4-nvidia-8gpu-v1 ] defaults: run: shell: bash container: image: harbor.baai.ac.cn/flagscale/cuda12.8.1-torch2.7.1-python3.10-te2.9:20260209 - ports: - - 80:80 options: >- + -v /home/runner/_work/${{ github.run_id }}:${{ github.workspace }} --gpus all --shm-size=500g --privileged @@ -54,6 +53,9 @@ jobs: --user root --pull always steps: + - name: Configure git safe directory + run: git config --system --add safe.directory '*' + - name: Checkout Code uses: actions/checkout@v6.0.1 with: diff --git a/.github/workflows/qa-l3-te-pytorch-fa-versions-test.yml b/.github/workflows/qa-l3-te-pytorch-fa-versions-test.yml index 9a881dd2d9..f1498304ff 100644 --- a/.github/workflows/qa-l3-te-pytorch-fa-versions-test.yml +++ b/.github/workflows/qa-l3-te-pytorch-fa-versions-test.yml @@ -20,15 +20,14 @@ concurrency: jobs: run-qa-l3-attention-tests: - runs-on: [ self-hosted, Linux, X64, nvidia, gpu-8 ] + runs-on: [ dx4-nvidia-8gpu-v1 ] defaults: run: shell: bash container: image: harbor.baai.ac.cn/flagscale/cuda12.8.1-torch2.7.1-python3.10-te2.9:20260209 - ports: - - 80:80 options: >- + -v /home/runner/_work/${{ github.run_id }}:${{ github.workspace }} --gpus all --shm-size=500g --privileged @@ -39,6 +38,9 @@ jobs: --user root --pull always steps: + - name: Configure git safe directory + run: git config --system --add safe.directory '*' + - name: Checkout Code uses: actions/checkout@v6.0.1 with: diff --git a/.github/workflows/te-plugin-tests.yml b/.github/workflows/te-plugin-tests.yml index f487673444..af08eb3f9b 100644 --- a/.github/workflows/te-plugin-tests.yml +++ b/.github/workflows/te-plugin-tests.yml @@ -18,15 +18,14 @@ concurrency: jobs: run-plugin-tests: - runs-on: [ self-hosted, Linux, X64, nvidia, gpu-8 ] + runs-on: [ dx4-nvidia-8gpu-v1 ] defaults: run: shell: bash container: image: harbor.baai.ac.cn/flagscale/cuda12.8.1-torch2.7.1-python3.10-te2.9:20260209 - ports: - - 80:80 options: >- + -v /home/runner/_work/${{ github.run_id }}:${{ github.workspace }} --gpus all --shm-size=500g --privileged @@ -37,6 +36,9 @@ jobs: --user root --pull always steps: + - name: Configure git safe directory + run: git config --system --add safe.directory '*' + - name: Checkout Code uses: actions/checkout@v6.0.1 with: