2 changes: 0 additions & 2 deletions conda/environment.yml
@@ -5,5 +5,3 @@ channels:
dependencies:
- python=3.12 # note: at the time of writing, xformer (< vllm) has a broken wheel for 3.13. https://github.com/facebookresearch/xformers/issues/740#issuecomment-2753869337
- uv
variables:
VLLM_USE_V1: 0 # need this to make outlines work
3 changes: 1 addition & 2 deletions conda/install.sh
@@ -4,8 +4,7 @@ CONDA=""
if which mamba > /dev/null
then
CONDA=$(which mamba)
fi
if which conda > /dev/null
elif which conda > /dev/null
then
CONDA=$(which conda)
fi
128 changes: 78 additions & 50 deletions mellea/backends/huggingface.py
@@ -18,15 +18,17 @@
from typing import TYPE_CHECKING, Any, cast

import granite_common
import outlines
import outlines_core
import llguidance
import llguidance.hf
import llguidance.torch
import peft
import torch
from transformers import (
AsyncTextIteratorStreamer,
AutoModelForCausalLM,
AutoTokenizer,
DynamicCache,
LogitsProcessorList,
PreTrainedModel,
PreTrainedTokenizer,
set_seed,
@@ -69,8 +71,6 @@
from mellea.stdlib.intrinsics.intrinsic import Intrinsic
from mellea.stdlib.requirement import ALoraRequirement, LLMaJRequirement, Requirement

assert outlines, "outlines needs to be present to make outlines_core work"

"""A configuration type for the unhappy path: Tokenizer * Model * torch device string

Huggingface backends can initialize themselves from a model string if the transformers `Auto*` classes can be used. Therefore, a TransformersTorchConfig usually isn't required. However, sometimes a model needs special care to instantiate properly, or a custom device type needs to be used. Instead of trying to do a lot of partial magic, we basically have two modalities: either the constructor can figure out everything from the model_id, or the user has to provide an entire config.
@@ -90,6 +90,59 @@ class HFAloraCacheInfo:
q_end: int = -1


# modified from VLLM v0.9.2 code base
# https://github.com/vllm-project/vllm/blob/v0.9.2/vllm/model_executor/guided_decoding/guidance_logits_processors.py
class _GuidanceLogitsProcessor:
def __init__(self, grammar: str, ll_tokenizer: llguidance.LLTokenizer) -> None:
self.grammar = grammar
self.vocab_size: int = ll_tokenizer.vocab_size
self.ll_tokenizer: llguidance.LLTokenizer = ll_tokenizer
self.ll_matchers: list[llguidance.LLMatcher] = []
self.bitmasks: list[torch.Tensor] = []
self.new_sampling: bool = False
self.batch_size: int = -1

def __call__(
self, batch_input_ids: torch.Tensor, batch_scores: torch.Tensor
) -> torch.Tensor:
i_batch, i_seqlen = batch_input_ids.shape
s_batch, s_vocab = batch_scores.shape
assert i_batch == s_batch
assert s_vocab == self.vocab_size

if self.batch_size != i_batch:
self.batch_size = i_batch
self.bitmasks = [
llguidance.torch.allocate_token_bitmask(1, self.vocab_size) # type: ignore[attr-defined]
for _ in range(self.batch_size)
]

self.ll_matchers = [
llguidance.LLMatcher(self.ll_tokenizer, self.grammar)
for _ in range(self.batch_size)
]

for input_ids, scores, ll_matcher, bitmask in zip(
batch_input_ids, batch_scores, self.ll_matchers, self.bitmasks
):
if self.new_sampling and len(input_ids) > 0:
ll_matcher.consume_token( # type: ignore[attr-defined]
input_ids.tolist()[-1]
)
err = ll_matcher.get_error() # type: ignore[attr-defined]
if err:
FancyLogger.get_logger().warning("Error in LLMatcher: %s", err)

llguidance.torch.fill_next_token_bitmask(ll_matcher, bitmask, 0)
llguidance.torch.apply_token_bitmask_inplace(
scores, bitmask.to(scores.device)
) # type: ignore[attr-defined]

self.new_sampling = True

return scores


class LocalHFBackend(FormatterBackend, AdapterMixin):
"""The LocalHFBackend uses Huggingface's transformers library for inference, and uses a Formatter to convert `Component`s into prompts. This backend also supports Activated LoRAs (ALoras)](https://arxiv.org/pdf/2504.12397).

@@ -178,6 +231,10 @@ def __init__(
case _:
self._tokenizer, self._model, self._device = custom_config

self._llguidance_tokenizer: llguidance.LLTokenizer = (
llguidance.hf.from_tokenizer(self._tokenizer) # type:ignore
)

self._use_caches = use_caches
self._cache = cache if cache is not None else SimpleLRUCache(3)

@@ -596,24 +653,15 @@ async def _generate_from_context_with_kv_cache(

format_kwargs = {}
if _format:
# outlines.generate.json always parses the resulting json into a python dict.
# We however want to keep it as a json string for later storing it in ModelOutputThunk
schema: dict[str, Any] = _format.model_json_schema()
schema_json: str = json.dumps(schema)
regex_str: str = outlines_core.fsm.json_schema.build_regex_from_schema( # type: ignore
schema_json
grammar: str = llguidance.LLMatcher.grammar_from_json_schema(
schema, defaults={"whitespace_flexible": False}
)
logits_processor = _GuidanceLogitsProcessor(
grammar, self._llguidance_tokenizer
)

from outlines.models.transformers import TransformerTokenizer
from outlines.processors.structured import RegexLogitsProcessor
from transformers import LogitsProcessorList

format_kwargs["logits_processor"] = LogitsProcessorList(
[
RegexLogitsProcessor(
regex_str, tokenizer=TransformerTokenizer(self._tokenizer)
)
]
[logits_processor]
)

streaming_kwargs = {}
@@ -763,24 +811,15 @@ async def _generate_from_context_standard(

format_kwargs = {}
if _format:
# outlines.generate.json always parses the resulting json into a python dict.
# We however want to keep it as a json string for later storing it in ModelOutputThunk
schema: dict[str, Any] = _format.model_json_schema()
schema_json: str = json.dumps(schema)
regex_str: str = outlines_core.fsm.json_schema.build_regex_from_schema( # type: ignore
schema_json
grammar: str = llguidance.LLMatcher.grammar_from_json_schema(
schema, defaults={"whitespace_flexible": False}
)
logits_processor = _GuidanceLogitsProcessor(
grammar, self._llguidance_tokenizer
)

from outlines.models.transformers import TransformerTokenizer
from outlines.processors.structured import RegexLogitsProcessor
from transformers import LogitsProcessorList

format_kwargs["logits_processor"] = LogitsProcessorList(
[
RegexLogitsProcessor(
regex_str, tokenizer=TransformerTokenizer(self._tokenizer)
)
]
[logits_processor]
)

streaming_kwargs = {}
@@ -990,25 +1029,14 @@ async def generate_from_raw(

format_kwargs = {}
if format:
# outlines.generate.json always parses the resulting json into a python dict.
# We however want to keep it as a json string for later storing it in ModelOutputThunk
schema: dict[str, Any] = format.model_json_schema()
schema_json: str = json.dumps(schema)
regex_str: str = outlines_core.fsm.json_schema.build_regex_from_schema( # type: ignore
schema_json
grammar: str = llguidance.LLMatcher.grammar_from_json_schema(
schema, defaults={"whitespace_flexible": False}
)

from outlines.models.transformers import TransformerTokenizer
from outlines.processors.structured import RegexLogitsProcessor
from transformers import LogitsProcessorList

format_kwargs["logits_processor"] = LogitsProcessorList(
[
RegexLogitsProcessor(
regex_str, tokenizer=TransformerTokenizer(self._tokenizer)
)
]
logits_processor = _GuidanceLogitsProcessor(
grammar, self._llguidance_tokenizer
)
format_kwargs["logits_processor"] = LogitsProcessorList([logits_processor])

outputs = await asyncio.to_thread(
self._generate_with_adapter_lock,
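As context for the diff above, here is a minimal, untested sketch of how the new _GuidanceLogitsProcessor is meant to plug into a plain transformers generate call. The model id, schema, and prompt are illustrative placeholders; only the llguidance calls mirror what LocalHFBackend now does internally.

# Sketch only: grammar-constrained decoding with llguidance + transformers.
# Model id, schema, and prompt are placeholders, not part of the PR.
import llguidance
import llguidance.hf
from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessorList

from mellea.backends.huggingface import _GuidanceLogitsProcessor

model_id = "ibm-granite/granite-3.3-2b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

# Build the llguidance tokenizer and a grammar from a JSON schema,
# mirroring LocalHFBackend.__init__ and the _format branches above.
ll_tokenizer = llguidance.hf.from_tokenizer(tokenizer)
schema = {
    "type": "object",
    "properties": {"name": {"type": "string"}, "value": {"type": "integer"}},
    "required": ["name", "value"],
}
grammar = llguidance.LLMatcher.grammar_from_json_schema(
    schema, defaults={"whitespace_flexible": False}
)
processor = _GuidanceLogitsProcessor(grammar, ll_tokenizer)

inputs = tokenizer("Answer as JSON: what is 2+2?", return_tensors="pt")
outputs = model.generate(
    **inputs,
    max_new_tokens=128,
    logits_processor=LogitsProcessorList([processor]),
)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))

The processor masks disallowed tokens in place at every step, so any decoding strategy supported by generate should still apply.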
74 changes: 21 additions & 53 deletions mellea/backends/vllm.py
@@ -19,8 +19,6 @@
from typing import TYPE_CHECKING, Any, Optional

import msgspec # type:ignore
import outlines
import outlines_core
import torch
import vllm # type:ignore
from transformers import AutoTokenizer, PreTrainedTokenizerBase
@@ -50,8 +48,6 @@
from mellea.stdlib.chat import Message
from mellea.stdlib.requirement import LLMaJRequirement, Requirement

assert outlines, "outlines needs to be present to make outlines_core work"

format: None = None # typing this variable in order to shadow the global format function and ensure mypy checks for errors


@@ -83,14 +79,6 @@ def __init__(
formatter (Formatter): A mechanism for turning `stdlib` stuff into strings. Experimental Span-based models should use `mellea.backends.span.*` backends.
model_options (Optional[dict]): Default model options.
"""
if os.environ.get("VLLM_USE_V1", -1) != "0":
FancyLogger.get_logger().error(
"Mellea LocalVLLMBackend doesn't support VLLM V1. Must `export VLLM_USE_V1=0`."
)
raise ValueError(
"Mellea LocalVLLMBackend doesn't support VLLM V1. Must `export VLLM_USE_V1=0`."
)

formatter = (
formatter if formatter is not None else TemplateFormatter(model_id=model_id)
)
@@ -205,23 +193,20 @@ def __init__(

# Keep track of the event loop the engine was instantiated in.
self._event_loop = get_current_event_loop()
# we store the engine args because we have to reset the engine with a different event loop. See _model .
self.engine_args = engine_args

self._tokenizer: PreTrainedTokenizerBase = AutoTokenizer.from_pretrained(
self._hf_model_id
) # type:ignore

# See the notes in outlines.models.vllm.adapt_tokenizer for why this is needed.
# Note: there is a module named outlines.models.vllm and a function named outlines.models.vllm.vllm .
# However, outlines.models import outlines.models.vllm.vllm as vllm,
# thus the module outlines.models.vllm becomes inaccessible,
# hence the use of importlib to get the module.
self._tokenizer_for_outlines: PreTrainedTokenizerBase = importlib.import_module(
"outlines.models.vllm"
).adapt_tokenizer(self._tokenizer)

@property
def _model(self) -> vllm.AsyncLLMEngine:
"""Use model when making generation requests."""
# 2026/01/06 Masa: Temporarily canceling the mechanism below.
# After vllm 0.11.0, start/shutdown_background_loop is gone.
# 2026/01/07 Masa: Rewrote it to reinstantiate the engine.

el = get_current_event_loop()

# vLLM attaches itself to the event loop that is running when instantiated /
@@ -231,8 +216,13 @@ def _model(self) -> vllm.AsyncLLMEngine:
# Most of the time, this should be a no-op. The event loop will only change
# if switching between async and sync calls.
if el != self._event_loop:
self._underlying_model.shutdown_background_loop()
self._underlying_model.start_background_loop()
FancyLogger.get_logger().warning("restarting the vllm event loop")
# self._underlying_model.shutdown_background_loop()
# self._underlying_model.start_background_loop()
self._underlying_model.shutdown()
self._underlying_model = vllm.AsyncLLMEngine.from_engine_args(
vllm.AsyncEngineArgs(model=self._hf_model_id, **self.engine_args)
)
self._event_loop = el

return self._underlying_model
@@ -320,22 +310,10 @@ async def _generate_from_context_standard(
)

if _format is not None:
# outlines.generate.json always parses the resulting json into a python dict.
# We however want to keep it as a json string for later storing it in ModelOutputThunk
schema: dict[str, Any] = _format.model_json_schema()
schema_json: str = json.dumps(schema)
regex_str: str = outlines_core.fsm.json_schema.build_regex_from_schema( # type: ignore
schema_json # type: ignore
) # type: ignore

from outlines.processors import RegexLogitsProcessor

logits_processor = RegexLogitsProcessor(
regex_str,
tokenizer=self._tokenizer_for_outlines, # type: ignore
)
sampling_params.logits_processors = (
[logits_processor] if logits_processor is not None else []
sampling_params.structured_outputs = (
vllm.sampling_params.StructuredOutputsParams(
json=_format.model_json_schema()
)
)

# stream = model_options.get(ModelOption.STREAM, False)
@@ -458,20 +436,10 @@ async def generate_from_raw(
)

if format is not None:
schema: dict[str, Any] = format.model_json_schema()
schema_json: str = json.dumps(schema)
regex_str: str = outlines_core.fsm.json_schema.build_regex_from_schema( # type: ignore
schema_json # type: ignore
) # type: ignore

from outlines.processors import RegexLogitsProcessor

logits_processor = RegexLogitsProcessor(
regex_str,
tokenizer=self._tokenizer_for_outlines, # type: ignore
)
sampling_params.logits_processors = (
[logits_processor] if logits_processor is not None else []
sampling_params.structured_outputs = (
vllm.sampling_params.StructuredOutputsParams(
json=format.model_json_schema()
)
Comment on lines +461 to +463
Member:

FWIW, when I was trying this locally (uv run pytest test/backends/test_vllm.py on Fedora 43, Python 3.12.8, CUDA 13.0), the test_generate_from_raw_with_format test failed. Here's a snippet of the output:

session = <mellea.stdlib.session.MelleaSession object at 0x7ff08f129a90>

    @pytest.mark.qualitative
    async def test_generate_from_raw_with_format(session):
        prompts = ["what is 1+1?", "what is 2+2?", "what is 3+3?", "what is 4+4?"]

        class Answer(pydantic.BaseModel):
            name: str
            value: int

        results = await session.backend.generate_from_raw(
            actions=[CBlock(value=prompt) for prompt in prompts],
            ctx=session.ctx,
            format=Answer,
        )

        assert len(results) == len(prompts)

        random_result = results[0]
        try:
            answer = Answer.model_validate_json(random_result.value)
        except pydantic.ValidationError as e:
>           assert False, (
                f"formatting directive failed for {random_result.value}: {e.json()}"
            )
E           AssertionError: formatting directive failed for {
E
E
E                 "name": "binary",
E                 "value": 1
E              : [{"type":"json_invalid","loc":[],"msg":"Invalid JSON: EOF while parsing an object at line 6 column 1","input":"{\n\n\n    \"name\": \"binary\",\n    \"value\": 1\n ","ctx":{"error":"EOF while parsing an object at line 6 column 1"},"url":"https://errors.pydantic.dev/2.12/v/json_invalid"}]
E           assert False

test/backends/test_vllm.py:133: AssertionError

Seems like what's happening is that the model is following the grammar but not producing valid JSON (which may just be a fact of life with a tiny model; I see the vllm test is using qwen3 0.6b).

Member:

A quick hack I tried locally that got the test to pass was to add some additional prompting to the list of prompts about producing proper json content:

diff --git a/mellea/backends/vllm.py b/mellea/backends/vllm.py
index a59ced7..6e3ee5a 100644
--- a/mellea/backends/vllm.py
+++ b/mellea/backends/vllm.py
@@ -447,8 +447,21 @@ class LocalVLLMBackend(FormatterBackend):

         model_options = self._simplify_and_merge(model_options)

+        # When structured output is requested, ensure there's a reasonable max_tokens limit
+        # to prevent excessive whitespace generation and ensure output completion.
+        if format is not None and ModelOption.MAX_NEW_TOKENS not in model_options:
+            model_options[ModelOption.MAX_NEW_TOKENS] = 512
+
         prompts = [self.formatter.print(action) for action in actions]

+        # When structured output is requested, prepend format instructions to help the model
+        # understand what JSON content to generate. Without this, models may produce valid JSON
+        # structure (due to constrained decoding) but with meaningless content like whitespace.
+        if format is not None:
+            schema_str = json.dumps(format.model_json_schema(), indent=2)
+            format_prefix = f"Output a JSON object matching this schema:\n{schema_str}\n\nQuery: "
+            prompts = [format_prefix + p for p in prompts]
+
         sampling_params = vllm.SamplingParams(
             **self._make_backend_specific_and_remove(
                 model_options, vllm.SamplingParams

Contributor:

We could make this error message better, but I think what's happening is that the model runs out of tokens before the json completes (at least I see this error with hf sometimes):

"{\n\n\n    \"name\": \"binary\",\n    \"value\": 1\n " <-- missing closing bracket

Member (@psschwei, Jan 22, 2026):

yeah, that seems to be it. just re-ran the test and I see:

E             Invalid JSON: EOF while parsing an object at line 130 column 0 [type=json_invalid, input_value='{  \n\n\n\n\n\n\n\n\n\n\...n\n\n\n\n\n\n\n\n\n\n\n', input_type=str]

and also

E             : [{"type":"json_invalid","loc":[],"msg":"Invalid JSON: EOF while parsing an object at line 130 column 0","input":"{  \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n","ctx":{"error":"EOF while parsing an object at line 130 column 0"},"url":"https://errors.pydantic.dev/2.12/v/json_invalid"}]

though it's interesting, on my run looks like the string is an opening bracket and then a LOT of newlines...

)

async def generate(prompt, request_id):
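On the vLLM side the custom logits processor goes away entirely. Below is a minimal, untested sketch of the built-in structured-outputs path the backend now relies on, using the offline vllm.LLM API; the model id is a placeholder (the test suite uses a small Qwen3 model), and it assumes vllm>=0.13 as pinned in pyproject.toml below.

# Sketch only: vLLM native structured outputs, mirroring the
# StructuredOutputsParams usage added in this diff.
import pydantic
import vllm


class Answer(pydantic.BaseModel):
    name: str
    value: int


llm = vllm.LLM(model="Qwen/Qwen3-0.6B")  # placeholder model id
params = vllm.SamplingParams(
    max_tokens=512,  # leave room for the model to close the JSON object
    structured_outputs=vllm.sampling_params.StructuredOutputsParams(
        json=Answer.model_json_schema()
    ),
)
for request_output in llm.generate(["what is 2+2? Answer as JSON."], params):
    print(request_output.outputs[0].text)

The max_tokens cap echoes the review discussion above: a tiny model can satisfy the grammar token-by-token yet still get cut off before the JSON object closes.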
20 changes: 2 additions & 18 deletions pyproject.toml
@@ -57,34 +57,18 @@ m = "cli.m:cli"
# uv pip install -e .[hf, watsonx]
# if you want to install all dependencies, use uv sync --all-extras


# note on outlines versions:
# outlines>=1.2.0 requires outlines-core==0.2.11
# outlines<=1.1.* requires outlines-core==0.1.26
# vllm==0.10.0 requires outlines-core==0.2.10
# vllm==0.9.* requires outlines-core==0.1.26
#
# thus the following version combination allows installing vllm and outlines
# (main library) at the same time.

hf = [
"accelerate>=1.9.0",
"alora==0.2.0",
"datasets>=4.0.0",
"outlines-core==0.1.26",
"outlines", # intentionally un-versioned, expecting a minor update. coutlines-core version should be enough to specify it
"llguidance",
"peft>=0.18.0", # aLoRA support was added in Peft 0.18.0
"transformers>=4.53.2",
"trl==0.19.1",
]

vllm = [
"transformers<4.54.0",
# see https://github.com/vllm-project/vllm-ascend/issues/2046
"numpy<2.0.0", # patching incorrect dependencies in vllm and outlines.
# see https://github.com/vllm-project/vllm/issues/5587
"outlines-core==0.1.26",
"vllm>=0.9.1",
"vllm>=0.13.0",
]

litellm = [
2 changes: 1 addition & 1 deletion test/backends/test_huggingface.py
@@ -39,7 +39,7 @@
def backend():
"""Shared HuggingFace backend for all tests in this module."""
backend = LocalHFBackend(
model_id="ibm-granite/granite-3.3-8b-instruct",
model_id="ibm-granite/granite-3.3-2b-instruct",
formatter=TemplateFormatter(model_id="ibm-granite/granite-4.0-tiny-preview"),
cache=SimpleLRUCache(5),
)
2 changes: 1 addition & 1 deletion test/backends/test_huggingface_tools.py
@@ -22,7 +22,7 @@
def backend():
"""Shared HuggingFace backend for all tests in this module."""
backend = LocalHFBackend(
model_id=model_ids.MISTRALAI_MISTRAL_0_3_7B, cache=SimpleLRUCache(5)
model_id="ibm-granite/granite-3.3-2b-instruct", cache=SimpleLRUCache(5)
)
# add_granite_aloras(backend)
return backend