Changes from all commits (38 commits):
- `f5c4ed4` (Nov 20, 2025): Add lora in vllm & some tests
- `67b9651` (Nov 21, 2025): add batched method in async + more tests
- `04b6b70` (Nov 21, 2025): decrease difference error for lora because of precision issues (e.g. …
- `bec6fa5` (Nov 24, 2025): set lora_request as class attribute
- `b129c83` (Nov 25, 2025): change hf backend to support lora + add testing
- `78b2039` (Nov 25, 2025): clean hf lora tests
- `450cd2a` (Nov 25, 2025): add testing for swapping lora and no-lora
- `e2a6a81` (Nov 25, 2025): remove unnecessary import
- `3416104` (Nov 25, 2025): remove double batch method
- `2a5f93e` (Dec 3, 2025): add comments in the new methods
- `245743a` (Dec 3, 2025): remove comment
- `ab0860f` (Dec 5, 2025): add more tests
- `f357d0f` (Dec 8, 2025): update dependencies
- `31334ff` (Dec 8, 2025): cleaning
- `b345d53` (Dec 9, 2025): change model for testing
- `809bf82` (Dec 9, 2025): add lora dependencies in pytest
- `c00c8d7` (Dec 9, 2025): fix dependencies lora
- `eae7c8a` (Dec 9, 2025): change test model
- `797c8d6` (Dec 10, 2025): fix lora test on transformer
- `998af61` (Dec 10, 2025): increase gpu memory util
- `18e114e` (Dec 10, 2025): decrease gpu memory util
- `daddc42` (Dec 11, 2025): check gpu github
- `a951037` (Dec 11, 2025): change gpu memory util
- `bff9a75` (Dec 11, 2025): debug github
- `42f0402` (Dec 11, 2025): decrease tests
- `51f2e08` (Dec 12, 2025): downgrade triton
- `ca8bd0f` (Dec 12, 2025): trition 3.2
- `ec5ceb1` (Dec 12, 2025): debug models github
- `d766049` (Dec 12, 2025): change model on tests
- `5568324` (Dec 12, 2025): remove test for cache reasons
- `73955d2` (Dec 12, 2025): free disk space
- `bda3699` (Dec 12, 2025): triton
- `709d279` (Dec 12, 2025): add testing for error path
- `cc0ba95` (Dec 12, 2025): add readme
- `c52faf2` (Dec 12, 2025): cleaning
- `565f87f` (Dec 15, 2025): triton reinstall
- `bc151d5` (Dec 15, 2025): triton 3.2
- `24c8dee` (Dec 15, 2025): remove unnecessary reinstall
`.github/workflows/coverage.yml` (2 changes: 1 addition & 1 deletion)
@@ -26,7 +26,7 @@ jobs:
run: |
python -m venv venv
source venv/bin/activate
- pip install -e .[test]
+ pip install -e .[lora]
pip install -r requirements-dev.txt
- name: Run tests
`.github/workflows/pytest.yml` (12 changes: 11 additions & 1 deletion)
@@ -17,6 +17,16 @@ jobs:
- uses: actions/checkout@v4
with:
fetch-depth: 1
- name: Free Disk Space (Ubuntu)
uses: jlumbroso/[email protected]
with:
tool-cache: false
android: true
dotnet: true
haskell: true
large-packages: true
docker-images: true
swap-storage: false

- uses: actions/setup-python@v4
with:
@@ -27,6 +37,6 @@ jobs:
run: |
python -m venv venv
source venv/bin/activate
- pip install -e .[test]
+ pip install -e .[lora]
pip install -r requirements-dev.txt
python -m pytest tests --ignore=tests/test_mlx.py
`README.md` (5 changes: 5 additions & 0 deletions)
@@ -35,6 +35,11 @@ Or to install with MLX support, run:
pip install genlm-backend[mlx]
```

Or to install with LoRA support, run:

```bash
pip install genlm-backend[lora]
```

## 🧪 Example: Autobatched Sequential Importance Sampling with LLMs

`genlm/backend/llm/hf.py` (31 changes: 31 additions & 0 deletions)
@@ -162,6 +162,37 @@ def cache_kv(self, prompt_tokens):
result = self.model(torch.tensor([prompt_tokens]).to(self.device))
node = self.cache.extend_cache(0, prompt_tokens, result.logits[0], 0)
node.past_key_values = result.past_key_values

def load_lora(self, lora_path, lora_name='lora_1'):
"""Load a LoRA adapter into the base model.

Args:
lora_path (str): Path to a directory containing the adapter weights, or a Hugging Face Hub identifier.
lora_name (str): Name to assign to the loaded adapter.

Notes:
This does not activate the adapter immediately. Call `set_lora()` to enable the adapter.
"""
self.model.load_adapter(lora_path, lora_name)

def set_lora(self, lora_name='lora_1'):
"""Activate a previously loaded LoRA adapter.

Args:
lora_name (str): Name of the LoRA adapter to activate.

Notes:
The output cache and KV cache are cleared before activation, since cached results were computed without this adapter.
"""
self.clear_kv_cache()
self.clear_cache()
self.model.set_adapter(lora_name)

def clear_lora(self):
"""
Deactivate all LoRA adapters.
"""
self.clear_kv_cache()
self.clear_cache()
self.model.set_adapter([])

@torch.no_grad()
def batch_evaluate_queries(self):
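A minimal usage sketch of the new HF-backend methods. Names outside the diff are assumptions for illustration: the HF backend class is taken to be `AsyncTransformer` as exposed by genlm.backend, `gpt2` is just an example base model, and `path/to/adapter` is a hypothetical LoRA checkpoint; the `lora` extra (peft) must be installed.

```python
# Sketch only: AsyncTransformer, gpt2, and path/to/adapter are illustrative
# assumptions, not part of this diff.
import asyncio

from genlm.backend import AsyncTransformer


async def main():
    llm = AsyncTransformer.from_name("gpt2")  # example base model

    # Load adapter weights; this alone does not activate them.
    llm.load_lora("path/to/adapter", lora_name="my_lora")

    # Activate the adapter; the KV and output caches are cleared automatically.
    llm.set_lora("my_lora")
    with_lora = await llm.next_token_logprobs([0, 1, 2])

    # Fall back to the base model.
    llm.clear_lora()
    without_lora = await llm.next_token_logprobs([0, 1, 2])
    return with_lora, without_lora


asyncio.run(main())
```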
`genlm/backend/llm/vllm.py` (23 changes: 22 additions & 1 deletion)
@@ -7,6 +7,7 @@

try:
from vllm import AsyncLLMEngine, SamplingParams, AsyncEngineArgs
from vllm.lora.request import LoRARequest
from vllm.utils import Counter
from vllm.inputs import TokensPrompt

@@ -81,6 +82,7 @@ def __init__(self, async_llm_engine, cache_size=0, cache_opts={}):
if cache_size > 0
else None
)
self.lora_request = None

async_llm_engine.engine.log_stats = False

@@ -128,6 +130,22 @@ def from_name(cls, model_name, engine_opts=None, **kwargs):
def underlying_model(self):
return self.async_llm_engine.engine.model_executor.driver_worker.model_runner.model

def clear_lora(self):
"""
Disable any active LoRA adapter for the vLLM engine.
"""
self.lora_request = None

def set_lora(self, lora_path, lora_name="current_lora", lora_id=1):
"""Configure a LoRA adapter request for the vLLM engine.

Args:
lora_path (str): Path to a directory containing the adapter weights, or a Hugging Face Hub identifier.
lora_name (str): Identifier name to associate with this LoRA adapter within vLLM.
lora_id (int): Globally unique ID for the adapter.

Notes:
The adapter is not applied immediately; the stored request is attached to subsequent generation and log-probability requests.
"""
self.lora_request = LoRARequest(lora_name, lora_id, lora_path)

async def next_token_logprobs(self, token_ids):
"""Request log probabilities of next token asynchronously with output caching.

@@ -172,6 +190,7 @@ async def _next_token_logprobs(self, token_ids):
sampling_params=SamplingParams(
**self.default_params, logits_processors=[processor]
),
lora_request=self.lora_request,
request_id=req_id,
):
if output.finished:
@@ -215,11 +234,12 @@ def batch_next_token_logprobs_sync(self, token_ids_list):
params=SamplingParams(
**self.default_params, logits_processors=[processor]
),
lora_request=self.lora_request,
request_id=req_id,
)

while self.async_llm_engine.engine.has_unfinished_requests():
- output = self.async_llm_engine.engine.step()
+ output = self.async_llm_engine.engine.step()
for out in output:
if out.finished:
assert out.request_id in req_id2processors, (
@@ -275,6 +295,7 @@ async def sample(
seed=seed,
stop=[self.byte_vocab[i].decode() for i in eos_token_ids],
),
lora_request=self.lora_request,
request_id=str(next(self.request_counter)),
):
if output.finished:
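On the vLLM side, `set_lora` only stores a `LoRARequest`, which then rides along with every log-probability and sampling request until `clear_lora` is called. A minimal sketch, with assumptions flagged: the vLLM backend class is taken to be `AsyncVirtualLM` from genlm.backend, the model name and adapter path are placeholders, and vLLM is assumed to need LoRA enabled at engine construction (`enable_lora=True` in the engine options).

```python
# Sketch only: AsyncVirtualLM, the model name, and path/to/adapter are
# illustrative assumptions, not part of this diff.
import asyncio

from genlm.backend import AsyncVirtualLM


async def main():
    # vLLM needs to know up front that LoRA adapters will be used.
    llm = AsyncVirtualLM.from_name(
        "Qwen/Qwen2.5-0.5B-Instruct",  # example base model
        engine_opts={"enable_lora": True},
    )

    # Attach the adapter; it is passed with every request from here on.
    llm.set_lora("path/to/adapter", lora_name="my_lora", lora_id=1)
    with_lora = await llm.next_token_logprobs([1, 2, 3])

    # Detach it; requests go back to the base weights.
    llm.clear_lora()
    without_lora = await llm.next_token_logprobs([1, 2, 3])
    return with_lora, without_lora


asyncio.run(main())
```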
`pyproject.toml` (5 changes: 4 additions & 1 deletion)
@@ -16,14 +16,17 @@ dependencies = [
"bitsandbytes; sys_platform == 'linux'",
"numba",
"vllm>=0.6.6,<=0.10.0; sys_platform == 'linux'",
"triton>=3.2.0; sys_platform == 'linux'",
"triton==3.2.0; sys_platform == 'linux'",
]

[project.optional-dependencies]
mlx = [
"mlx",
"mlx-lm"
]
lora = [
'peft'
]
docs = [
"mkdocs",
"mkdocstrings[python]",
`tests/conftest.py` (10 changes: 10 additions & 0 deletions)
@@ -9,6 +9,7 @@
destroy_model_parallel,
destroy_distributed_environment,
)
from vllm.lora.request import LoRARequest

HAS_VLLM = True
except ImportError:
@@ -142,6 +143,7 @@ def __init__(self, llm):
stop=None,
ignore_eos=True,
)
self.lora_request = None

self.llm.llm_engine.log_stats = False

@@ -158,11 +160,18 @@ def from_name(cls, model_name, llm_opts=None):
llm = LLM(model=model_name, tokenizer=model_name, **llm_opts)
return cls(llm)

def clear_lora(self):
self.lora_request = None

def set_lora(self, lora_path, lora_name="current_lora", lora_id=1):
self.lora_request = LoRARequest(lora_name, lora_id, lora_path)

def next_token_logprobs_sync(self, token_ids):
outputs = self.llm.generate(
prompts=TokensPrompt(prompt_token_ids=token_ids),
sampling_params=self.DEFAULT_SAMPLING_PARAMS,
use_tqdm=False,
lora_request=self.lora_request
)
logprobs = np.array(
[
@@ -185,6 +194,7 @@ async def batch_next_token_logprobs(self, token_ids_list):
prompts=prompts,
sampling_params=self.DEFAULT_SAMPLING_PARAMS,
use_tqdm=False,
lora_request=self.lora_request
)
logprobs = np.array(
[
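The conftest wrapper mirrors the backend API so the synchronous vLLM reference path can carry the same `LoRARequest`. A sketch of the kind of adapter-swap test this enables, with assumptions flagged: class and attribute names (`AsyncVirtualLM`, `tokenizer`, `next_token_logprobs` returning a torch tensor) follow the genlm-backend API as understood here, `pytest-asyncio` is assumed available, and the model name, adapter path, and tolerance are placeholders.

```python
# Sketch only: the model name, adapter path, and tolerance are illustrative
# assumptions, not taken from this diff.
import numpy as np
import pytest

from genlm.backend import AsyncVirtualLM


@pytest.mark.asyncio
async def test_lora_swap_changes_and_restores_logprobs():
    llm = AsyncVirtualLM.from_name(
        "Qwen/Qwen2.5-0.5B-Instruct",  # example base model
        engine_opts={"enable_lora": True},
    )
    token_ids = llm.tokenizer.encode("hello world")

    base = (await llm.next_token_logprobs(token_ids)).float().cpu().numpy()

    llm.set_lora("path/to/adapter")
    adapted = (await llm.next_token_logprobs(token_ids)).float().cpu().numpy()

    llm.clear_lora()
    restored = (await llm.next_token_logprobs(token_ids)).float().cpu().numpy()

    # The adapter should move the distribution, and clearing it should
    # restore the base model's outputs up to numerical precision.
    assert not np.allclose(base, adapted, atol=1e-3)
    np.testing.assert_allclose(base, restored, atol=1e-3)
```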