diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml
index bacdfd656f6..d1504a2753e 100644
--- a/.github/workflows/_e2e_test.yaml
+++ b/.github/workflows/_e2e_test.yaml
@@ -108,8 +108,7 @@ jobs:
           # ------------------------------------ v1 spec decode test ------------------------------------ #
           pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
           pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_torchair_correctness.py
-          # Fix me: test_eagle_correctness OOM error
-          #pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
+          pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
 
   e2e-2-cards:
     name: multicard-2
diff --git a/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py b/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
index 5d74b5d4b8e..d8c0fabe30c 100644
--- a/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
+++ b/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations
 
+import os
 import random
 from typing import Any
 
@@ -9,6 +10,8 @@
 
 from tests.e2e.conftest import VllmRunner
 
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
 
 @pytest.fixture
 def test_prompts():
@@ -61,7 +64,6 @@ def eagle3_model_name():
     return "vllm-ascend/EAGLE3-LLaMA3.1-Instruct-8B"
 
 
-@pytest.mark.skip("TODO: Revert me after ngram oom issue on ci is fixed")
 def test_ngram_correctness(
     test_prompts: list[list[dict[str, Any]]],
     sampling_config: SamplingParams,
@@ -71,9 +73,11 @@ def test_ngram_correctness(
     Compare the outputs of a original LLM and a speculative LLM
     should be the same when using ngram speculative decoding.
     '''
-    ref_llm = LLM(model=model_name, max_model_len=1024, enforce_eager=False)
-    ref_outputs = ref_llm.chat(test_prompts, sampling_config)
-    del ref_llm
+
+    with VllmRunner(model_name, max_model_len=1024,
+                    enforce_eager=False) as ref_llm:
+        ref_outputs = ref_llm.model.chat(test_prompts, sampling_config)
+
     with VllmRunner(model_name,
                     speculative_config={
                         "method": "ngram",
@@ -156,9 +160,10 @@ def test_suffix_correctness(
     Compare the outputs of a original LLM and a speculative LLM
     should be the same when using ngram speculative decoding.
     '''
-    ref_llm = LLM(model=model_name, max_model_len=1024, enforce_eager=False)
-    ref_outputs = ref_llm.chat(test_prompts, sampling_config)
-    del ref_llm
+    with VllmRunner(model_name, max_model_len=1024,
+                    enforce_eager=False) as ref_llm:
+        ref_outputs = ref_llm.model.chat(test_prompts, sampling_config)
+
     with VllmRunner(model_name,
                     speculative_config={
                         "method": "suffix",
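For reference, a minimal sketch of the pattern this diff moves the reference runs to: the baseline model is built inside a VllmRunner context manager (the helper from tests/e2e/conftest.py) instead of a bare LLM object, and VLLM_WORKER_MULTIPROC_METHOD is forced to "spawn" before any engine is created. That VllmRunner releases the engine and device memory on exit is an assumption drawn from its use as a context manager here, and the reference_outputs helper name is illustrative only, not part of the change.

```python
# Sketch of the reference-output pattern the tests now use.
# The env var must be set before vLLM spins up any worker process.
import os

os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

from vllm import SamplingParams  # noqa: E402

from tests.e2e.conftest import VllmRunner  # noqa: E402


def reference_outputs(model_name: str, prompts, sampling_config: SamplingParams):
    # Entering the context builds the engine; leaving it is assumed to tear the
    # engine down and free NPU memory before the speculative-decoding run,
    # which is the point of replacing the bare `LLM(...)` / `del ref_llm` flow.
    with VllmRunner(model_name, max_model_len=1024,
                    enforce_eager=False) as ref_llm:
        # VllmRunner wraps the underlying vLLM LLM object as `.model`, so
        # chat() is reached through that attribute rather than on the runner.
        return ref_llm.model.chat(prompts, sampling_config)
```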