From 943c0b94ea7d131c4c78def0d687bb10f00fe57f Mon Sep 17 00:00:00 2001
From: fluctlux <38945811+fluctlux@users.noreply.github.com>
Date: Fri, 5 Dec 2025 23:03:10 +0800
Subject: [PATCH 1/2] fix ngram & suffix ci oom issue

Signed-off-by: fluctlux <38945811+fluctlux@users.noreply.github.com>
---
 .../spec_decode_v1/test_v1_spec_decode.py | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py b/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
index 0902fe6dd68..2d12506085c 100644
--- a/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
+++ b/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations
 
+import os
 import random
 from typing import Any
 
@@ -9,6 +10,8 @@
 
 from tests.e2e.conftest import VllmRunner
 
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
 
 @pytest.fixture
 def test_prompts():
@@ -61,7 +64,6 @@ def eagle3_model_name():
     return "vllm-ascend/EAGLE3-LLaMA3.1-Instruct-8B"
 
 
-@pytest.mark.skip("TODO: Revert me after ngram oom issue on ci is fixed")
 def test_ngram_correctness(
     test_prompts: list[list[dict[str, Any]]],
     sampling_config: SamplingParams,
@@ -71,9 +73,11 @@ def test_ngram_correctness(
     Compare the outputs of a original LLM and a speculative LLM
     should be the same when using ngram speculative decoding.
     '''
-    ref_llm = LLM(model=model_name, max_model_len=1024, enforce_eager=False)
-    ref_outputs = ref_llm.chat(test_prompts, sampling_config)
-    del ref_llm
+
+    with VllmRunner(model_name, max_model_len=1024,
+                    enforce_eager=False) as ref_llm:
+        ref_outputs = ref_llm.model.chat(test_prompts, sampling_config)
+
     with VllmRunner(model_name,
                     speculative_config={
                         "method": "ngram",
@@ -156,9 +160,10 @@ def test_suffix_correctness(
     Compare the outputs of a original LLM and a speculative LLM
     should be the same when using ngram speculative decoding.
     '''
-    ref_llm = LLM(model=model_name, max_model_len=1024, enforce_eager=False)
-    ref_outputs = ref_llm.chat(test_prompts, sampling_config)
-    del ref_llm
+    with VllmRunner(model_name, max_model_len=1024,
+                    enforce_eager=False) as ref_llm:
+        ref_outputs = ref_llm.model.chat(test_prompts, sampling_config)
+
     with VllmRunner(model_name,
                     speculative_config={
                         "method": "suffix",

From ad3444f95483eaf681f02332bd6e10adef9eba59 Mon Sep 17 00:00:00 2001
From: fluctlux <38945811+fluctlux@users.noreply.github.com>
Date: Sat, 6 Dec 2025 17:21:16 +0800
Subject: [PATCH 2/2] enable e2e/test_v1_spec_decode

Signed-off-by: fluctlux <38945811+fluctlux@users.noreply.github.com>
---
 .github/workflows/_e2e_test.yaml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml
index cab9ca927bd..5ec5fbb3d00 100644
--- a/.github/workflows/_e2e_test.yaml
+++ b/.github/workflows/_e2e_test.yaml
@@ -107,8 +107,7 @@ jobs:
           # ------------------------------------ v1 spec decode test ------------------------------------ #
           pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
           pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_torchair_correctness.py
-          # Fix me: test_eagle_correctness OOM error
-          #pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
+          pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
 
   e2e-2-cards:
     name: multicard-2
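Both test changes above follow the same pattern: build the reference engine inside a context manager so its device memory is released before the speculative engine starts, and set the spawn worker start method before any engine is created. Below is a minimal sketch of that pattern, assuming the repo's tests.e2e.conftest.VllmRunner exposes the underlying engine as .model and frees device memory on exit; the speculative_config values and the helper name compare_ref_and_spec are illustrative and not taken from the patch.

import os

# Must be set before vLLM creates any workers so they start via "spawn"
# rather than "fork"; forked workers can inherit device state from the
# parent process and inflate memory use across consecutive engine runs.
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

from vllm import SamplingParams  # noqa: E402

from tests.e2e.conftest import VllmRunner  # noqa: E402


def compare_ref_and_spec(model_name: str, test_prompts,
                         sampling_config: SamplingParams):
    # Reference run: __exit__ tears the engine down deterministically,
    # instead of relying on `del ref_llm` plus garbage collection.
    with VllmRunner(model_name, max_model_len=1024,
                    enforce_eager=False) as ref_llm:
        ref_outputs = ref_llm.model.chat(test_prompts, sampling_config)

    # Speculative run starts only after the reference engine is gone,
    # which is what avoids the CI OOM named in the commit subject.
    with VllmRunner(model_name,
                    speculative_config={
                        "method": "ngram",
                        # illustrative values, not from the patch
                        "prompt_lookup_max": 5,
                        "prompt_lookup_min": 3,
                        "num_speculative_tokens": 3,
                    },
                    max_model_len=1024,
                    enforce_eager=False) as spec_llm:
        spec_outputs = spec_llm.model.chat(test_prompts, sampling_config)

    return ref_outputs, spec_outputs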