10 changes: 10 additions & 0 deletions tests/integration/llm/client.py
@@ -338,6 +338,12 @@ def get_model_name():
"adapters": ["french", "spanish"],
"tokenizer": "unsloth/llama-3-8b-Instruct"
},
"gpt-oss-20b-lora": {
"batch_size": [4],
"seq_length": [16, 32],
"worker": 1,
"adapters": ["promptinj", "uncensored"],
},
"gemma-7b-unmerged-lora": {
"batch_size": [4],
"seq_length": [16, 32],
@@ -365,6 +371,10 @@ def get_model_name():
"batch_size": [1, 4],
"seq_length": [256],
},
"gpt-oss": {
"batch_size": [1, 4],
"seq_length": [256],
},
"tinyllama-input-len-exceeded": {
"batch_size": [1],
"seq_length": [25],
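For context, the client.py entries above are plain request specs: every batch_size is paired with every seq_length, and for LoRA specs, with each adapter name as well. Below is a minimal sketch of how such a spec could fan out into request payloads; `iter_requests` and the payload fields are illustrative assumptions, not the repo's actual client logic.

```python
# Illustrative only: expand a client.py-style spec into request payloads.
# The real client.py drives HTTP calls against the endpoint; this sketch
# just shows the batch_size x seq_length x adapter fan-out the config implies.
from itertools import product

spec = {
    "batch_size": [4],
    "seq_length": [16, 32],
    "worker": 1,
    "adapters": ["promptinj", "uncensored"],
}

def iter_requests(spec):
    # Specs without adapters (e.g. "gpt-oss") fall back to a single None entry.
    adapters = spec.get("adapters", [None])
    for batch, seq_len, adapter in product(spec["batch_size"],
                                           spec["seq_length"], adapters):
        yield {"batch_size": batch, "seq_length": seq_len, "adapter": adapter}

for request in iter_requests(spec):
    print(request)  # 1 x 2 x 2 = 4 request shapes for the spec above
```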
38 changes: 38 additions & 0 deletions tests/integration/llm/prepare.py
@@ -322,6 +322,11 @@
"option.task": "text-generation",
"option.tensor_parallel_degree": 4
},
"gpt-oss-20b": {
"option.model_id": "s3://djl-llm/gpt-oss-20b",
"option.task": "text-generation",
"option.tensor_parallel_degree": 4
},
"mistral-7b": {
"option.model_id": "s3://djl-llm/mistral-7b-instruct-v03",
"option.task": "text-generation",
@@ -372,6 +377,16 @@
"option.tensor_parallel_degree": 1,
"option.max_rolling_batch_size": 4,
},
"gpt-oss-20b-speculative-eagle3": {
"option.model_id": "s3://djl-llm/gpt-oss-20b",
"option.task": "text-generation",
"option.speculative_model": "zhuyksir/EAGLE3-gpt-oss-20b-bf16",
"option.speculative_method": "eagle3",
"option.num_speculative_tokens": 4,
"option.use_v2_block_manager": True,
"option.tensor_parallel_degree": 1,
"option.max_rolling_batch_size": 4,
},
"llama-68m-speculative-eagle": {
"option.model_id": "s3://djl-llm/llama-68m/",
"option.task": "text-generation",
@@ -554,6 +569,29 @@
"option.gpu_memory_utilization":
"0.8",
},
"gpt-oss-20b-unmerged-lora": {
"option.model_id":
"s3://djl-llm/gpt-oss-20b",
"option.tensor_parallel_degree":
"max",
"option.enable_lora":
"true",
"option.max_loras":
1,
"option.max_lora_rank":
128,
"option.long_lora_scaling_factors":
"4.0",
"option.adapters":
"adapters",
"adapter_ids": [
"waliboii/gpt-oss-20b-promptinj-lora",
"jworks/gpt-oss-20b-uncensored-lora",
],
"adapter_names": ["promptinj", "uncensored"],
"option.gpu_memory_utilization":
"0.8",
},
"phi2-unmerged-lora": {
"option.model_id":
"s3://djl-llm/phi-2/",
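The prepare.py entries above are flat key/value configs whose "option."-prefixed keys map onto LMI serving.properties settings, while bookkeeping keys such as adapter_ids/adapter_names drive extra setup (fetching the listed adapters into the directory named by option.adapters). A hedged sketch of the flattening step follows; `write_properties` is a hypothetical helper, not the repo's actual implementation.

```python
# Hypothetical helper: flatten an "option.*" config dict into a
# serving.properties file. List-valued bookkeeping keys like "adapter_ids"
# are skipped here; how prepare.py really handles them is not shown.
def write_properties(config: dict, path: str = "serving.properties") -> None:
    with open(path, "w") as f:
        for key, value in config.items():
            if key.startswith("option."):
                f.write(f"{key}={value}\n")

write_properties({
    "option.model_id": "s3://djl-llm/gpt-oss-20b",
    "option.task": "text-generation",
    "option.tensor_parallel_degree": 4,
})
# serving.properties now contains:
#   option.model_id=s3://djl-llm/gpt-oss-20b
#   option.task=text-generation
#   option.tensor_parallel_degree=4
```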
18 changes: 18 additions & 0 deletions tests/integration/tests.py
@@ -376,6 +376,12 @@ def test_llama2_7b_chat(self):
r.launch()
client.run("vllm_chat llama2-7b-chat".split())

def test_gpt_oss_20b(self):
with Runner('lmi', 'gpt-oss-20b') as r:
prepare.build_vllm_async_model("gpt-oss-20b")
r.launch()
client.run("vllm gpt-oss".split())

@pytest.mark.skipif(not is_applicable_cuda_capability(89),
reason="Unsupported CUDA capability")
def test_qwen2_7b_fp8(self):
@@ -409,6 +415,12 @@ def test_llama_68m_speculative_medusa(self):
@pytest.mark.gpu_4
class TestVllm2:

def test_gpt_oss_speculative_eagle3(self):
with Runner('lmi', 'gpt-oss-20b-speculative-eagle3') as r:
prepare.build_vllm_async_model("gpt-oss-20b-speculative-eagle3")
r.launch()
client.run("vllm gpt-oss".split())

def test_llama_68m_speculative_eagle(self):
with Runner('lmi', 'llama-68m-speculative-eagle') as r:
prepare.build_vllm_async_model("llama-68m-speculative-eagle")
@@ -482,6 +494,12 @@ def test_lora_phi2(self):
@pytest.mark.gpu_4
class TestVllmAsyncLora:

def test_gpt_oss_20b_lora(self):
with Runner('lmi', 'gpt-oss-20b-unmerged-lora') as r:
prepare.build_vllm_async_model("gpt-oss-20b-unmerged-lora")
r.launch()
client.run("vllm_async_adapters gpt-oss-20b-lora".split())

def test_lora_llama3_8b_async(self):
with Runner('lmi', 'llama3-8b-unmerged-lora-async') as r:
prepare.build_vllm_async_model("llama3-8b-unmerged-lora")
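Each new test in tests.py follows the same three-step pattern: build the model configuration with prepare.build_vllm_async_model, launch the container through the Runner context manager, then drive it with client.run. The sketch below mirrors that shape under stated assumptions; `runner` and `_StubRunner` are stand-ins, since the real Runner manages the LMI container lifecycle in the repo.

```python
# Stand-in sketch of the prepare -> launch -> client.run pattern used above;
# the real Runner starts and tears down the serving container.
from contextlib import contextmanager

class _StubRunner:
    def launch(self):
        print("container launched")  # placeholder for the real startup

@contextmanager
def runner(image: str, model: str):
    print(f"preparing {image} runner for {model}")
    try:
        yield _StubRunner()
    finally:
        print("container stopped")  # cleanup runs even if the test fails

with runner("lmi", "gpt-oss-20b") as r:
    # In the real test: prepare.build_vllm_async_model("gpt-oss-20b")
    r.launch()
    # In the real test: client.run("vllm gpt-oss".split())
```

Assuming the usual integration-test environment, an individual test can be selected with pytest's `-k` filter, e.g. `pytest tests/integration/tests.py -k test_gpt_oss_20b`.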