10 changes: 10 additions & 0 deletions tests/integration/llm/client.py
@@ -338,6 +338,12 @@ def get_model_name():
"adapters": ["french", "spanish"],
"tokenizer": "unsloth/llama-3-8b-Instruct"
},
"gpt-oss-20b-lora": {
"batch_size": [4],
"seq_length": [16, 32],
"worker": 1,
"adapters": ["promptinj", "uncensored"],
},
"gemma-7b-unmerged-lora": {
"batch_size": [4],
"seq_length": [16, 32],
@@ -365,6 +371,10 @@ def get_model_name():
"batch_size": [1, 4],
"seq_length": [256],
},
"gpt-oss": {
"batch_size": [1, 4],
"seq_length": [256],
},
"tinyllama-input-len-exceeded": {
"batch_size": [1],
"seq_length": [25],
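For context, the client.py entries above are plain request specs: every batch_size is paired with every seq_length, and for LoRA specs, with each adapter name as well. Below is a minimal sketch of how such a spec could fan out into request payloads; `iter_requests` and the payload fields are illustrative assumptions, not the repo's actual client logic.

```python
# Illustrative only: expand a client.py-style spec into request payloads.
# The real client.py drives HTTP calls against the endpoint; this sketch
# just shows the batch_size x seq_length x adapter fan-out the config implies.
from itertools import product

spec = {
    "batch_size": [4],
    "seq_length": [16, 32],
    "worker": 1,
    "adapters": ["promptinj", "uncensored"],
}

def iter_requests(spec):
    # Specs without adapters (e.g. "gpt-oss") fall back to a single None entry.
    adapters = spec.get("adapters", [None])
    for batch, seq_len, adapter in product(spec["batch_size"],
                                           spec["seq_length"], adapters):
        yield {"batch_size": batch, "seq_length": seq_len, "adapter": adapter}

for request in iter_requests(spec):
    print(request)  # 1 x 2 x 2 = 4 request shapes for the spec above
```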
38 changes: 38 additions & 0 deletions tests/integration/llm/prepare.py
@@ -322,6 +322,11 @@
"option.task": "text-generation",
"option.tensor_parallel_degree": 4
},
"gpt-oss-20b": {
"option.model_id": "s3://djl-llm/gpt-oss-20b",
"option.task": "text-generation",
"option.tensor_parallel_degree": 4
},
"mistral-7b": {
"option.model_id": "s3://djl-llm/mistral-7b-instruct-v03",
"option.task": "text-generation",
@@ -372,6 +377,16 @@
"option.tensor_parallel_degree": 1,
"option.max_rolling_batch_size": 4,
},
"gpt-oss-20b-speculative-eagle3": {
"option.model_id": "s3://djl-llm/gpt-oss-20b",
"option.task": "text-generation",
"option.speculative_model": "zhuyksir/EAGLE3-gpt-oss-20b-bf16",
"option.speculative_method": "eagle3",
"option.num_speculative_tokens": 4,
"option.use_v2_block_manager": True,
"option.tensor_parallel_degree": 1,
"option.max_rolling_batch_size": 4,
},
"llama-68m-speculative-eagle": {
"option.model_id": "s3://djl-llm/llama-68m/",
"option.task": "text-generation",
@@ -554,6 +569,29 @@
"option.gpu_memory_utilization":
"0.8",
},
"gpt-oss-20b-unmerged-lora": {
"option.model_id":
"s3://djl-llm/gpt-oss-20b",
"option.tensor_parallel_degree":
"max",
"option.enable_lora":
"true",
"option.max_loras":
1,
"option.max_lora_rank":
128,
"option.long_lora_scaling_factors":
"4.0",
"option.adapters":
"adapters",
"adapter_ids": [
"waliboii/gpt-oss-20b-promptinj-lora",
"jworks/gpt-oss-20b-uncensored-lora",
],
"adapter_names": ["promptinj", "uncensored"],
"option.gpu_memory_utilization":
"0.8",
},
"phi2-unmerged-lora": {
"option.model_id":
"s3://djl-llm/phi-2/",
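The prepare.py entries above are flat key/value configs whose "option."-prefixed keys map onto LMI serving.properties settings, while bookkeeping keys such as adapter_ids/adapter_names drive extra setup (fetching the listed adapters into the directory named by option.adapters). A hedged sketch of the flattening step follows; `write_properties` is a hypothetical helper, not the repo's actual implementation.

```python
# Hypothetical helper: flatten an "option.*" config dict into a
# serving.properties file. List-valued bookkeeping keys like "adapter_ids"
# are skipped here; how prepare.py really handles them is not shown.
def write_properties(config: dict, path: str = "serving.properties") -> None:
    with open(path, "w") as f:
        for key, value in config.items():
            if key.startswith("option."):
                f.write(f"{key}={value}\n")

write_properties({
    "option.model_id": "s3://djl-llm/gpt-oss-20b",
    "option.task": "text-generation",
    "option.tensor_parallel_degree": 4,
})
# serving.properties now contains:
#   option.model_id=s3://djl-llm/gpt-oss-20b
#   option.task=text-generation
#   option.tensor_parallel_degree=4
```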
18 changes: 18 additions & 0 deletions tests/integration/tests.py
@@ -376,6 +376,12 @@ def test_llama2_7b_chat(self):
r.launch()
client.run("vllm_chat llama2-7b-chat".split())

def test_gpt_oss_20b(self):
with Runner('lmi', 'gpt-oss-20b') as r:
prepare.build_vllm_async_model("gpt-oss-20b")
r.launch()
client.run("vllm gpt-oss".split())

@pytest.mark.skipif(not is_applicable_cuda_capability(89),
reason="Unsupported CUDA capability")
def test_qwen2_7b_fp8(self):
@@ -409,6 +415,12 @@ def test_llama_68m_speculative_medusa(self):
@pytest.mark.gpu_4
class TestVllm2:

def test_gpt_oss_speculative_eagle3(self):
with Runner('lmi', 'gpt-oss-20b-speculative-eagle3') as r:
prepare.build_vllm_async_model("gpt-oss-20b-speculative-eagle3")
r.launch()
client.run("vllm gpt-oss".split())

def test_llama_68m_speculative_eagle(self):
with Runner('lmi', 'llama-68m-speculative-eagle') as r:
prepare.build_vllm_async_model("llama-68m-speculative-eagle")
@@ -482,6 +494,12 @@ def test_lora_phi2(self):
@pytest.mark.gpu_4
class TestVllmAsyncLora:

def test_gpt_oss_20b_lora(self):
with Runner('lmi', 'gpt-oss-20b-unmerged-lora') as r:
prepare.build_vllm_async_model("gpt-oss-20b-unmerged-lora")
r.launch()
client.run("vllm_async_adapters gpt-oss-20b-lora".split())

def test_lora_llama3_8b_async(self):
with Runner('lmi', 'llama3-8b-unmerged-lora-async') as r:
prepare.build_vllm_async_model("llama3-8b-unmerged-lora")
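Each new test in tests.py follows the same three-step pattern: build the model configuration with prepare.build_vllm_async_model, launch the container through the Runner context manager, then drive it with client.run. The sketch below mirrors that shape under stated assumptions; `runner` and `_StubRunner` are stand-ins, since the real Runner manages the LMI container lifecycle in the repo.

```python
# Stand-in sketch of the prepare -> launch -> client.run pattern used above;
# the real Runner starts and tears down the serving container.
from contextlib import contextmanager

class _StubRunner:
    def launch(self):
        print("container launched")  # placeholder for the real startup

@contextmanager
def runner(image: str, model: str):
    print(f"preparing {image} runner for {model}")
    try:
        yield _StubRunner()
    finally:
        print("container stopped")  # cleanup runs even if the test fails

with runner("lmi", "gpt-oss-20b") as r:
    # In the real test: prepare.build_vllm_async_model("gpt-oss-20b")
    r.launch()
    # In the real test: client.run("vllm gpt-oss".split())
```

Assuming the usual integration-test environment, an individual test can be selected with pytest's `-k` filter, e.g. `pytest tests/integration/tests.py -k test_gpt_oss_20b`.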