
Commit d9bd9f1

[CI] remove legacy tests & skip intel tests & disable flash_attn for some models (#1722)
* [CI] remove legacy tests & skip intel tests
* [CI] move if down
* [CI] filter in py
* [CI] remove legacy dep
* [CI] skip tests if ipex was not available
* fix models don't support flash_attn
1 parent: 5ba8809

File tree: 13 files changed, 23 additions and 207 deletions

.github/workflows/unit_tests.yml

Lines changed: 2 additions & 200 deletions
@@ -160,7 +160,7 @@ jobs:
 
           torch_test_files = [f for f in all_tests+all_tests_models if (not input_test_files_list or f in input_test_files_list) and f not in transformers_test_files and 'mlx' not in f]
 
-          torch_test_files = [test for test in torch_test_files if re.match(f'{TEST_REGEX}', test)]
+          torch_test_files = [test for test in torch_test_files if re.match(f'{TEST_REGEX}', test) and 'ipex' not in test and 'xpu' not in test]
           transformers_test_files = [test for test in transformers_test_files if re.match(f'{TEST_REGEX}', test)]
 
           m4_test_files = [f for f in all_tests if ('mlx' in f or 'apple' in f) and (f.strip().removesuffix('.py') in input_test_files_list if input_test_files_list else True)]
@@ -320,204 +320,12 @@ jobs:
         if: always()
         run: rm -rf ./* .[^.] .??* # pip cache purge && uv cache clean &&
 
-  legacy:
-    needs:
-      - build
-      - list-test-files
-      - check-vm
-    runs-on: [ self-hosted, xeon5 ]
-    if: always() && !cancelled() && (needs.build.result == 'success' || github.event.inputs.artifact_id != '') && needs.list-test-files.outputs.transformers-files != '[]'
-    container:
-      image: ${{ needs.check-vm.outputs.ip }}:5000/nvidia/cuda:${{ needs.check-vm.outputs.cuda_version }}-ubuntu22.04
-      volumes:
-        - /home/ci/models:/monster/data/model
-        - /home/ci/models/huggingface:/github/home/.cache/huggingface
-        - /home/ci/models/pyenv:/opt/pyenv
-    strategy:
-      fail-fast: false
-      max-parallel: ${{ fromJson(needs.check-vm.outputs.max-parallel).size || 20 }}
-      matrix:
-        test_script: ${{ fromJSON(needs.list-test-files.outputs.transformers-files) }}
-    steps:
-      - name: Checkout Codes
-        uses: actions/checkout@v5
-        with:
-          repository: ${{ github.event.inputs.repo }}
-          ref: ${{ github.event.inputs.ref }}
-
-      - name: Fetch PR by number
-        if: ${{ github.event.inputs.pr_number != 0 }}
-        run: |
-          PR_NUMBER=${{ github.event.inputs.pr_number }}
-          echo "pr number $PR_NUMBER"
-          git config --global --add safe.directory $(pwd)
-          git fetch origin pull/${PR_NUMBER}/head:pr-${PR_NUMBER}
-          git checkout pr-${PR_NUMBER}
-
-      - name: Print Env
-        run: |
-          python_version=${{ env.PYTHON_VERSION }}
-          if [[ "$python_version" != *"."* ]]; then
-            python_version="${python_version/3/3.}"
-          fi
-          test_name=${{ matrix.test_script }}
-          test_name=${test_name//\//_}
-          env_name="cu${{ needs.check-vm.outputs.cuda_version }}_torch${{ env.TORCH_VERSION }}_py${python_version}_test_${test_name}"
-
-          if [ -d "$(pyenv root)/versions/$env_name" ]; then
-            echo "env exists, skip"
-            pyenv local $env_name
-            pyenv activate $env_name
-          else
-            echo "creating venv..."
-            pyenv virtualenv "$python_version" "$env_name"
-            pyenv local $env_name
-            pyenv activate $env_name
-            bash -c "$(curl -L http://${RUNNER}/scripts/env/init_compiler_no_env.sh)" @ ${{ needs.check-vm.outputs.cuda_version }} ${{ env.TORCH_VERSION }} $python_version
-          fi
-
-          pyenv local $env_name
-          pyenv activate $env_name
-
-          echo "== pyenv =="
-          pyenv versions
-          echo "== python =="
-          python --version
-          echo "== nvcc =="
-          nvcc --version
-          echo "== torch =="
-          pip show torch || true
-          echo "== pip list =="
-          pip list
-
-      # - name: Install requirements
-      #   run: |
-      #     bash -c "$(curl -L http://${RUNNER}/scripts/env/init_compiler_no_env.sh)" @ ${{ needs.check-vm.outputs.cuda_version }} ${{ env.TORCH_VERSION }} $python_version
-
-      - name: Download source from local
-        continue-on-error: true
-        run: |
-          curl -s -O http://$RUNNER/whl/${{ env.repo }}/${{ github.run_id }}/gptqmodel_source.tar.gz
-          ls -ahl .
-          sha256=$(sha256sum $file_name)
-          echo "sha256=$sha256"
-          echo "SOURCE_DOWNLOADED=1" >> $GITHUB_ENV
-
-      # - name: Download source from github
-      #   if: env.SOURCE_DOWNLOADED == '' && !cancelled()
-      #   uses: actions/download-artifact@v5
-      #   with:
-      #     name: source
-      #     path: dist
-      #     run-id: ${{ github.run_id }}
-
-      # - name: Uncompress source
-      #   continue-on-error: true
-      #   run: |
-      #     find . -mindepth 1 ! -name "gptqmodel_source.tar.gz" -exec rm -rf {} +
-      #     ls -ahl .
-      #     tar -zxf gptqmodel_source.tar.gz
-
-      - name: Download wheel from local
-        continue-on-error: true
-        run: |
-          file_name=$(curl -s -F "runid=${{ needs.check-vm.outputs.run_id }}" -F "repo=${{ env.repo }}" -F "ref=${{ env.ref }}" -F "fuzz=1" "http://$RUNNER/gpu/whl/download")
-
-          echo "file_name=$file_name"
-
-          if echo "$file_name" | grep -q "gptqmodel"; then
-            mkdir dist || true
-            cd dist
-            curl -s -O http://$RUNNER/whl/${{ env.repo }}/${{ needs.check-vm.outputs.run_id }}/$file_name
-            ls -ahl .
-            sha256=$(sha256sum $file_name)
-            echo "sha256=$sha256"
-            echo "WHL_DOWNLOADED=1" >> $GITHUB_ENV
-          fi
-
-      - name: Download artifact from github
-        if: env.WHL_DOWNLOADED == '' && !cancelled()
-        uses: actions/download-artifact@v5
-        with:
-          name: whl
-          path: dist
-          run-id: ${{ needs.check-vm.outputs.run_id }}
-
-      - name: Install wheel
-        run: |
-          pip install uv -U
-          uv pip install -r requirements.txt
-          echo "===== install optimum bitblas parameterized uvicorn ====="
-          uv pip install optimum bitblas==0.0.1.dev13 parameterized uvicorn -i http://$RUNNER/simple/ --trusted-host $RUNNER --extra-index-url https://pypi.org/simple
-          echo "===== install dist/whl ====="
-          uv pip install dist/*.whl -i http://$RUNNER/simple/ --trusted-host $RUNNER --extra-index-url https://pypi.org/simple
-          echo "===== init test env ====="
-          echo "===== install transformers==4.38.2 typing-extensions numpy==1.26.4 peft==0.13.2 ====="
-          uv pip install transformers==4.38.2 typing-extensions numpy==1.26.4 peft==0.13.2 -U -i http://$RUNNER/simple/ --trusted-host $RUNNER --extra-index-url https://pypi.org/simple
-          if [ "${{ matrix.test_script }}" == "models/test_xverse" ]; then
-            echo "===== install tokenizers==0.15.2 ====="
-            uv pip install tokenizers==0.15.2 -i http://$RUNNER/simple/ --trusted-host $RUNNER --extra-index-url https://pypi.org/simple
-          fi
-          if [ "${{ matrix.test_script }}" == "test_quant_formats" ] || [ "${{ matrix.test_script }}" == "test_perplexity" ]; then
-            echo "===== install auto_round ====="
-            uv pip install auto_round -i http://$RUNNER/simple/ --trusted-host $RUNNER --extra-index-url https://pypi.org/simple
-          fi
-
-          echo "== pip list =="
-          pip list
-
-      - name: Find suitable GPU
-        if: ${{ !contains(matrix.test_script, 'ipex') && !cancelled() }}
-        run: |
-          timestamp=$(date +%s%3N)
-          gpu_id=-1
-
-          url="http://$XEON5/gpu/get?id=${{ github.run_id }}&timestamp=$timestamp&test=${{ matrix.test_script }}&runner=${RUNNER_NAME}&exclusive=${{ github.event.inputs.exclusive-gpu }}"
-          echo "$url"
-          while [ "$gpu_id" -lt 0 ]; do
-            gpu_id=$(curl -s "$url")
-
-            if [ "$gpu_id" -lt 0 ]; then
-              echo "http://$XEON5/gpu/get?id=${{ github.run_id }}&timestamp=$timestamp&test=${{ matrix.test_script }}&runner=${RUNNER_NAME}&exclusive=${{ github.event.inputs.exclusive-gpu }} returned $gpu_id"
-              echo "No available GPU, waiting 5 seconds..."
-              curl http://$XEON5/gpu/status2
-              sleep 5
-            else
-              echo "Allocated GPU ID: $gpu_id"
-            fi
-          done
-          if [[ ! "$gpu_id" =~ ^[0-9]+$ ]]; then
-            echo "gpu_id: $gpu_id is not a number"
-          fi
-          echo "CUDA_VISIBLE_DEVICES=$gpu_id" >> $GITHUB_ENV
-          echo "STEP_TIMESTAMP=$timestamp" >> $GITHUB_ENV
-          echo "CUDA_VISIBLE_DEVICES set to $gpu_id, timestamp=$timestamp"
-          curl http://$XEON5/gpu/status2
-
-      - name: Run tests
-        if: ${{ (!github.event.inputs.test_names || contains(github.event.inputs.test_names, matrix.test_script)) && !cancelled() }}
-        run: |
-          start_time=$(date +%s)
-          pytest --durations=0 tests/${{ matrix.test_script }}.py || { echo "ERROR=1" >> $GITHUB_ENV; exit 1; }
-          execution_time=$(( $(date +%s) - start_time ))
-          echo "$((execution_time / 60))m $((execution_time % 60))s"
-          curl "http://$RUNNER/gpu/log_test_vram?id=${{ github.run_id }}&gpu=${{ env.CUDA_VISIBLE_DEVICES }}&range=$execution_time&unit=second&name=${{ matrix.test_script }}"
-
-      - name: Release GPU
-        if: always() && !contains(matrix.test_script, 'ipex')
-        run: curl -X GET "http://$XEON5/gpu/release?id=${{ github.run_id }}&gpu=${{ env.CUDA_VISIBLE_DEVICES }}&timestamp=${{ env.STEP_TIMESTAMP }}&test=${{ matrix.test_script }}&runner=${RUNNER_NAME}"
-
-      - name: Clean cache
-        if: always()
-        run: rm -rf ./* .[^.] .??* #pip cache purge && uv cache clean &&
-
   torch:
     needs:
       - build
       - list-test-files
       - check-vm
     runs-on: [ self-hosted, xeon5 ]
-    if: always() && !cancelled() && (needs.build.result == 'success' || github.event.inputs.artifact_id != '') && needs.list-test-files.outputs.torch-files != '[]'
     container:
       image: ${{ needs.check-vm.outputs.ip }}:5000/nvidia/cuda:${{ needs.check-vm.outputs.cuda_version }}-ubuntu22.04
       options: --device /dev/dri --ipc=host --runtime=nvidia --gpus all
@@ -531,6 +339,7 @@ jobs:
       max-parallel: ${{ fromJson(needs.check-vm.outputs.max-parallel).size || 20 }}
       matrix:
         test_script: ${{ fromJSON(needs.list-test-files.outputs.torch-files) }}
+    if: always() && !cancelled() && (needs.build.result == 'success' || github.event.inputs.artifact_id != '') && needs.list-test-files.outputs.torch-files != '[]'
     steps:
       - name: Checkout Codes
         uses: actions/checkout@v5
@@ -712,12 +521,6 @@ jobs:
             uv pip install numpy==1.26.3
           fi
 
-
-          if [[ "${{ matrix.test_script }}" != "models/test_qwen2_vl" ]]; then
-            echo "===== uninstall torchvision ====="
-            uv pip uninstall torchvision || true
-          fi
-
          echo "===== install -r requirements.txt ====="
           uv pip install -r requirements.txt
 
@@ -790,7 +593,6 @@ jobs:
     container:
       image: modelcloud/gptqmodel:alpine-ci-v1
     needs:
-      - legacy
       - torch
     steps:
       - name: Print statistics
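
Note on the first hunk: test selection now happens in Python ("[CI] filter in py" above), and the rewritten line both applies TEST_REGEX and drops Intel tests by filename. A minimal standalone sketch of that selection logic, using placeholder file names and a placeholder TEST_REGEX value:

    import re

    # Placeholder inputs standing in for the workflow's collected test files.
    TEST_REGEX = r".*"
    torch_test_files = [
        "test_bits",
        "test_ipex",        # dropped: Intel IPEX backend test
        "test_kernel_xpu",  # dropped: Intel XPU device test
        "models/test_bloom",
    ]

    # Mirrors the new workflow line: keep regex matches, minus anything ipex/xpu.
    torch_test_files = [
        test for test in torch_test_files
        if re.match(f"{TEST_REGEX}", test) and "ipex" not in test and "xpu" not in test
    ]

    print(torch_test_files)  # ['test_bits', 'models/test_bloom']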

tests/models/test_bloom.py

Lines changed: 1 addition & 0 deletions
@@ -23,6 +23,7 @@ class TestBloom(ModelTest):
     NATIVE_ARC_CHALLENGE_ACC = 0.2201
     NATIVE_ARC_CHALLENGE_ACC_NORM = 0.2440
     TORCH_DTYPE = torch.float16
+    USE_FLASH_ATTN = False
 
     def test_bloom(self):
         self.quant_lm_eval()
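
The same one-line USE_FLASH_ATTN = False override appears in each model test that follows: BLOOM and the other listed architectures do not support flash_attn, so the suite must not request it. The diff does not show how the ModelTest base class consumes the flag; a plausible sketch, assuming it maps onto Transformers' attn_implementation load argument (load_eval_model is a hypothetical name):

    from transformers import AutoModelForCausalLM

    class ModelTest:
        USE_FLASH_ATTN = True  # subclasses such as TestBloom override this to False

        def load_eval_model(self, model_id, **kwargs):
            # flash_attention_2 requires the flash-attn package and a supported
            # architecture; models without support use the default attention path.
            if self.USE_FLASH_ATTN:
                kwargs.setdefault("attn_implementation", "flash_attention_2")
            return AutoModelForCausalLM.from_pretrained(model_id, **kwargs)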

tests/models/test_chatglm.py

Lines changed: 1 addition & 0 deletions
@@ -25,6 +25,7 @@ class TestChatGlm(ModelTest):
     NATIVE_ARC_CHALLENGE_ACC = 0.3319
     NATIVE_ARC_CHALLENGE_ACC_NORM = 0.3729
     TRUST_REMOTE_CODE = True
+    USE_FLASH_ATTN = False
 
     def test_chatglm(self):
         self.quant_lm_eval()

tests/models/test_codegen.py

Lines changed: 1 addition & 0 deletions
@@ -23,6 +23,7 @@ class TestCodeGen(ModelTest):
     NATIVE_ARC_CHALLENGE_ACC_NORM = 0.2005
     TRUST_REMOTE_CODE = True
     USE_VLLM = False
+    USE_FLASH_ATTN = False
 
     def test_codegen(self):
         self.quant_lm_eval()

tests/models/test_ernie4_5.py

Lines changed: 1 addition & 0 deletions
@@ -23,6 +23,7 @@ class TestErnie4_5(ModelTest):
     NATIVE_ARC_CHALLENGE_ACC_NORM = 0.3183
     TRUST_REMOTE_CODE = True
     EVAL_BATCH_SIZE = 6
+    USE_FLASH_ATTN = False
 
     def test_exaone(self):
         self.quant_lm_eval()

tests/models/test_longllama.py

Lines changed: 1 addition & 0 deletions
@@ -24,6 +24,7 @@ class TestLongLlama(ModelTest):
     TRUST_REMOTE_CODE = True
     QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.5
     USE_VLLM = False
+    USE_FLASH_ATTN = False
 
     def test_longllama(self):
         self.quant_lm_eval()

tests/models/test_mpt.py

Lines changed: 1 addition & 0 deletions
@@ -24,6 +24,7 @@ class TestMpt(ModelTest):
     APPLY_CHAT_TEMPLATE = False
     TRUST_REMOTE_CODE = False
     EVAL_BATCH_SIZE = 6
+    USE_FLASH_ATTN = False
 
     def test_mpt(self):
         self.quant_lm_eval()

tests/models/test_ovis_1_6_llama.py

Lines changed: 1 addition & 0 deletions
@@ -27,6 +27,7 @@ class TestOvis1_6_Llama(ModelTest):
     TRUST_REMOTE_CODE = True
     APPLY_CHAT_TEMPLATE = False
     EVAL_BATCH_SIZE = 1
+    USE_FLASH_ATTN = False
 
     def test_ovis_1_6(self):
         model, tokenizer = self.quantModel(self.NATIVE_MODEL_ID, trust_remote_code=self.TRUST_REMOTE_CODE,

tests/models/test_telechat2.py

Lines changed: 1 addition & 0 deletions
@@ -24,6 +24,7 @@ class TestTeleChat_2(ModelTest):
     TRUST_REMOTE_CODE = True
     EVAL_BATCH_SIZE = 6
     USE_VLLM = False
+    USE_FLASH_ATTN = False
 
 
     def test_telechat2(self):

tests/test_bits.py

Lines changed: 3 additions & 2 deletions
@@ -29,7 +29,7 @@
 from gptqmodel.nn_modules.qlinear.bitblas import BitBLASQuantLinear  # noqa: E402
 from gptqmodel.nn_modules.qlinear.exllama import ExllamaQuantLinear  # noqa: E402
 from gptqmodel.nn_modules.qlinear.exllamav2 import ExllamaV2QuantLinear  # noqa: E402
-from gptqmodel.nn_modules.qlinear.ipex import IPEXQuantLinear  # noqa: E402
+from gptqmodel.nn_modules.qlinear.ipex import IPEXQuantLinear, HAS_IPEX  # noqa: E402
 from gptqmodel.nn_modules.qlinear.marlin import MarlinQuantLinear  # noqa: E402
 from gptqmodel.nn_modules.qlinear.torch import TorchQuantLinear  # noqa: E402
 from gptqmodel.nn_modules.qlinear.tritonv2 import TritonV2QuantLinear  # noqa: E402
@@ -49,10 +49,11 @@ class TestBits(unittest.TestCase):
         BACKEND.TRITON: TritonV2QuantLinear,
         BACKEND.TORCH: TorchQuantLinear,
         BACKEND.BITBLAS: BitBLASQuantLinear,
-        BACKEND.IPEX: IPEXQuantLinear,
         BACKEND.MARLIN: MarlinQuantLinear,
         BACKEND.EXLLAMA_EORA: ExllamaEoraQuantLinear,
     }
+    if HAS_IPEX:
+        QLINEAR_DICT[BACKEND.IPEX] = IPEXQuantLinear
 
     QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.2
     QUANT_ARC_MAX_POSITIVE_DELTA_CEIL_PERCENT = 0.2
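
Registering the IPEX backend only when HAS_IPEX is true lets QLINEAR_DICT build cleanly on machines without Intel extensions, which is what the "skip tests if ipex was not available" part of the commit refers to. The flag is imported from gptqmodel.nn_modules.qlinear.ipex; a sketch of the guarded-import pattern such a flag usually follows (the exact detection logic inside that module is an assumption):

    # Assumed shape of HAS_IPEX in gptqmodel/nn_modules/qlinear/ipex.py:
    # record availability at import time instead of raising.
    try:
        import intel_extension_for_pytorch  # noqa: F401
        HAS_IPEX = True
    except ImportError:
        HAS_IPEX = False

Call sites can then branch on the flag, as the dict registration above does, or skip whole test cases with unittest.skipUnless(HAS_IPEX, "IPEX not installed").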
