1 change: 1 addition & 0 deletions docs/features/structured_outputs.md
@@ -7,6 +7,7 @@
Structured Outputs refer to predefined format constraints that force large language models to generate content strictly following specified structures. This feature significantly improves output controllability and is suitable for scenarios requiring precise format outputs (such as API calls, data parsing, code generation, etc.), while supporting dynamic grammar extensions to balance flexibility and standardization.

FastDeploy supports using the [XGrammar](https://xgrammar.mlc.ai/docs/) backend to generate structured outputs.
FastDeploy supports using the [LLguidance](https://github.com/guidance-ai/llguidance) backend to generate structured outputs.
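
For illustration, once a FastDeploy service with guided decoding enabled is running, a structured output can be requested through the OpenAI-compatible API. The endpoint URL, model name, and schema below are placeholder assumptions, not part of this change — a minimal sketch only.

# Hypothetical request to a locally deployed FastDeploy OpenAI-compatible server;
# base_url, model name, and the JSON schema are illustrative placeholders.
from openai import OpenAI

client = OpenAI(base_url="http://127.0.0.1:8000/v1", api_key="EMPTY")

schema = {
    "type": "object",
    "properties": {"city": {"type": "string"}, "temperature_c": {"type": "number"}},
    "required": ["city", "temperature_c"],
}

resp = client.chat.completions.create(
    model="my-model",  # placeholder model name
    messages=[{"role": "user", "content": "Report the weather in Beijing as JSON."}],
    response_format={"type": "json_schema", "json_schema": {"name": "weather", "schema": schema}},
)
print(resp.choices[0].message.content)  # generation is constrained to match the schema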

Supported output formats:

2 changes: 1 addition & 1 deletion docs/parameters.md
@@ -44,7 +44,7 @@ When using FastDeploy to deploy models (including offline inference and service
| ```disable_sequence_parallel_moe``` | `bool` | Disable sequence parallel moe, default: False |
| ```splitwise_role``` | `str` | Whether to enable splitwise inference, default value: mixed, supported parameters: ["mixed", "decode", "prefill"] |
| ```innode_prefill_ports``` | `str` | Internal engine startup ports for prefill instances (only required for single-machine PD separation), default: None |
| ```guided_decoding_backend``` | `str` | Specify the guided decoding backend to use, supports `auto`, `xgrammar`, `off`, default: `off` |
| ```guided_decoding_backend``` | `str` | Specify the guided decoding backend to use, supports `auto`, `xgrammar`, `guidance`, `off`, default: `off` |
| ```guided_decoding_disable_any_whitespace``` | `bool` | Whether to disable whitespace generation during guided decoding, default: False |
| ```speculative_config``` | `dict[str]` | Speculative decoding configuration, only supports standard format JSON string, default: None |
| ```dynamic_load_weight``` | `int` | Whether to enable dynamic weight loading, default: 0 |
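As a usage sketch for the `guided_decoding_backend` entry above: the offline `LLM` entrypoint and keyword spelling shown here are assumptions inferred from the parameter table, not confirmed by this diff.

# Hypothetical offline-inference configuration; the import path and model path are placeholders.
from fastdeploy import LLM

llm = LLM(
    model="./my-model",                  # placeholder model directory
    guided_decoding_backend="guidance",  # per config.py below, falls back to "off" on XPU or with speculative decoding
)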
1 change: 1 addition & 0 deletions docs/zh/features/structured_outputs.md
@@ -7,6 +7,7 @@
Structured Outputs refers to constraining model generation with predefined format rules so that the output strictly follows a specified structure. This significantly improves the controllability of generated results and suits scenarios that require precisely formatted output (such as API calls, data parsing, and code generation), while also supporting dynamic grammar extension to balance flexibility and standardization.

FastDeploy supports using the [XGrammar](https://xgrammar.mlc.ai/docs/) backend to generate structured outputs.
FastDeploy supports using the [LLguidance](https://github.com/guidance-ai/llguidance) backend to generate structured outputs.

Supported output formats

2 changes: 1 addition & 1 deletion docs/zh/parameters.md
@@ -42,7 +42,7 @@
| ```disable_sequence_parallel_moe``` | `bool` | Disable the sequence-parallel MoE optimization under TP+EP, default: False |
| ```splitwise_role``` | `str` | Whether to enable splitwise inference, default: mixed, supported values: ["mixed", "decode", "prefill"] |
| ```innode_prefill_ports``` | `str` | Internal engine startup ports for prefill instances (only required for single-machine PD separation), default: None |
| ```guided_decoding_backend``` | `str` | Specify the guided decoding backend to use, supports `auto`, `xgrammar`, `off`, default: `off` |
| ```guided_decoding_backend``` | `str` | Specify the guided decoding backend to use, supports `auto`, `xgrammar`, `guidance`, `off`, default: `off` |
| ```guided_decoding_disable_any_whitespace``` | `bool` | Whether to disable whitespace generation during guided decoding, default: False |
| ```speculative_config``` | `dict[str]` | Speculative decoding configuration, only supports standard-format JSON strings, default: None |
| ```dynamic_load_weight``` | `int` | Whether to enable dynamic weight loading, default: 0 |
21 changes: 18 additions & 3 deletions fastdeploy/config.py
@@ -1595,13 +1595,27 @@ def postprocess(self):

if (
self.structured_outputs_config is not None
and self.structured_outputs_config.guided_decoding_backend == "auto"
and self.structured_outputs_config.guided_decoding_backend != "off"
):
if current_platform.is_xpu() or self.speculative_config.method is not None:
logger.warning("Speculative Decoding and XPU currently do not support Guided decoding, set off.")
self.structured_outputs_config.guided_decoding_backend = "off"
else:
elif self.structured_outputs_config.guided_decoding_backend in ["auto", "xgrammar"]:
self.structured_outputs_config.guided_decoding_backend = "xgrammar"
elif self.structured_outputs_config.guided_decoding_backend == "guidance":
try:
import llguidance.torch

llguidance.torch
except ImportError:
raise ImportError(
"The 'llguidance' package is required for using guidance as the guided decoding backend. "
"Please install it via the appropriate method."
)
else:
raise NotImplementedError(
f"Guided decoding backend '{self.structured_outputs_config.guided_decoding_backend}' is not implemented. [auto, xgrammar, guidance, off]"
)

if self.model_config.enable_mm:
if self.cache_config.max_encoder_cache is None or self.cache_config.max_encoder_cache < 0:
@@ -1711,7 +1725,8 @@ def check(self):
"XGrammar",
"auto",
"off",
], f"Only support xgrammar、auto guided decoding backend, but got {self.structured_outputs_config.guided_decoding_backend}."
"guidance",
], f"Only support [auto, xgrammar, guidance, off] guided decoding backend, but got {self.structured_outputs_config.guided_decoding_backend}."

if self.structured_outputs_config.guided_decoding_backend != "off":
# TODO: speculative decoding support guided_decoding
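The `postprocess()` change above only accepts the `guidance` backend when `llguidance` is importable. A hedged pre-flight probe mirroring that guard (the helper name is illustrative):

# Illustrative availability check for the optional llguidance dependency.
import importlib.util

def llguidance_available() -> bool:
    # find_spec reports importability without eagerly importing the package
    return importlib.util.find_spec("llguidance") is not None

if not llguidance_available():
    print("guided_decoding_backend='guidance' requires the 'llguidance' package.")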
2 changes: 2 additions & 0 deletions fastdeploy/envs.py
@@ -161,6 +161,8 @@
"FD_ENGINE_TASK_QUEUE_WITH_SHM": lambda: int(os.getenv("FD_ENGINE_TASK_QUEUE_WITH_SHM", "0")),
"FD_FILL_BITMASK_BATCH": lambda: int(os.getenv("FD_FILL_BITMASK_BATCH", "4")),
"FD_ENABLE_PDL": lambda: int(os.getenv("FD_ENABLE_PDL", "1")),
"FD_GUIDANCE_DISABLE_ADDITIONAL": lambda: bool(int(os.getenv("FD_GUIDANCE_DISABLE_ADDITIONAL", "1"))),
"FD_LLGUIDANCE_LOG_LEVEL": lambda: int(os.getenv("FD_LLGUIDANCE_LOG_LEVEL", "0")),
}


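Because each entry in the table above is a lambda over `os.getenv`, the new flags are read from the environment when accessed; exporting them before the FastDeploy process starts is the straightforward path. The meanings below are inferred from the variable names, so treat this as a sketch:

# Illustrative overrides for the new llguidance-related environment variables.
import os

os.environ["FD_GUIDANCE_DISABLE_ADDITIONAL"] = "1"  # default per the diff above
os.environ["FD_LLGUIDANCE_LOG_LEVEL"] = "2"         # assumed to raise llguidance log verbosity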
70 changes: 70 additions & 0 deletions fastdeploy/lazy_loader.py
@@ -0,0 +1,70 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A LazyLoader class."""

import importlib
import sys
import types
from typing import Any


class LazyLoader(types.ModuleType):
"""
LazyLoader module borrowed from Tensorflow
https://github.com/tensorflow/tensorflow/blob/main/tensorflow/python/util/lazy_loader.py
with an addition of "module caching".

Lazily import a module, mainly to avoid pulling in large dependencies.
Modules such as `xgrammar` might do additional side effects, so we
only want to use this when it is needed, delaying all eager effects
"""

def __init__(
self,
local_name: str,
parent_module_globals: dict[str, Any],
name: str,
):
self._local_name = local_name
self._parent_module_globals = parent_module_globals
self._module: types.ModuleType | None = None

super().__init__(str(name))

def _load(self) -> types.ModuleType:
# Import the target module and insert it into the parent's namespace
try:
module = importlib.import_module(self.__name__)
self._parent_module_globals[self._local_name] = module
# The additional add to sys.modules
# ensures library is actually loaded.
sys.modules[self._local_name] = module
except ModuleNotFoundError as err:
raise err from None

# Update this object's dict so that if someone keeps a
# reference to the LazyLoader, lookups are efficient
# (__getattr__ is only called on lookups that fail).
self.__dict__.update(module.__dict__)
return module

def __getattr__(self, item: Any) -> Any:
if self._module is None:
self._module = self._load()
return getattr(self._module, item)

def __dir__(self) -> list[str]:
if self._module is None:
self._module = self._load()
return dir(self._module)
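
A usage sketch for the class above; the target module name is only an example, and nothing is imported until the first attribute access.

# Illustrative use of LazyLoader: constructing it does not import 'xgrammar'.
xgr = LazyLoader("xgr", globals(), "xgrammar")

# The first real attribute lookup (or dir()) triggers the import; if 'xgrammar' is
# not installed, ModuleNotFoundError is raised here instead of at process startup.
available_symbols = dir(xgr)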
15 changes: 15 additions & 0 deletions fastdeploy/model_executor/guided_decoding/__init__.py
@@ -50,6 +50,15 @@ def get_guided_backend(
fd_config=fd_config,
**kwargs,
)
elif fd_config.structured_outputs_config.guided_decoding_backend.lower() == "guidance":
from fastdeploy.model_executor.guided_decoding.guidance_backend import (
LLGuidanceBackend,
)

return LLGuidanceBackend(
fd_config=fd_config,
**kwargs,
)
else:
raise ValueError(
f"Get unsupported backend {fd_config.structured_outputs_config.guided_decoding_backend},"
@@ -77,5 +86,11 @@ def schema_checker(backend_name: str, **kwargs):
)

return XGrammarChecker(**kwargs)
elif backend_name.lower() == "guidance":
from fastdeploy.model_executor.guided_decoding.guidance_backend import (
LLGuidanceChecker,
)

return LLGuidanceChecker(**kwargs)
else:
raise ValueError(f"Get unsupported backend {backend_name}, please check your configuration.")
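
The two factory helpers above dispatch on the configured backend name (compared case-insensitively), so "guidance" now resolves to LLGuidanceBackend / LLGuidanceChecker. A schematic call only — any keyword arguments the guidance checker requires are not visible in this diff:

# Schematic dispatch; kwargs required by LLGuidanceChecker, if any, are omitted here.
checker = schema_checker("guidance")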
@@ -294,7 +294,12 @@ def _get_tokenizer_hf(self):
"""
try:
architectures = self.fd_config.model_config.architectures
if not ErnieArchitectures.contains_ernie_arch(architectures):
is_guidance_backend = (
self.fd_config.structured_outputs_config is not None
and self.fd_config.structured_outputs_config.guided_decoding_backend is not None
and self.fd_config.structured_outputs_config.guided_decoding_backend == "guidance"
)
if not ErnieArchitectures.contains_ernie_arch(architectures) or is_guidance_backend:
Collaborator
Can the vocabularies of all Yiyan (ERNIE) models now be loaded with AutoTokenizer? There seemed to be problems with all of them before.

Collaborator Author
@ST-XX ST-XX Nov 20, 2025
The 4.5 22B works, but the 0.3B crashes.
If it doesn't go through this FastTokenizer path, it simply can't be used — awkward.

from transformers import AutoTokenizer, PreTrainedTokenizerFast

tokenizer = AutoTokenizer.from_pretrained(