
Commit 4e29f01

drisspg authored and pytorchmergebot committed
Remove sdp_kernel and replace with sdpa_kernel in attention namespace (pytorch#114689)
# Summary: Simplification of Backend Selection

This PR deprecates the `torch.backends.cuda.sdp_kernel` context manager and replaces it with a new context manager, `torch.nn.attention.sdpa_kernel`. The new context manager also changes the API: with `sdp_kernel`, one selected a backend by negation, disabling the kernels one did *not* want to run. That manager was intended only as a debugging tool: "turn off the math backend" and see whether one of the fused implementations can run.

Problems:
- The old pattern makes sense if the majority of users don't care to know anything about the backends that can be run. However, users who reach for this context manager are explicitly trying to run a specific backend.
- It is not scalable. We are working on adding the cuDNN backend, and with this API every additional implementation is one more thing a user has to turn off to run a given backend explicitly.
- Discoverability. It is somewhat unintuitive that this backend manager lives in `torch/backends/cuda/__init__.py` when it now also controls the CPU fused-kernel behavior. Centralizing it in the attention namespace should help.

Other concerns:
- Typically, backends (kernels) for operators are implementation details of the framework, entirely hidden from users. We have already exposed this to users, albeit not by default and with beta warnings. Does making backend choices even more explicit lead to problems when we eventually want to remove existing backends (for example, when newer backends come to cover the same input shapes)?

A nice side effect: now that `test_transformers` no longer uses the `BACKEND_MAP`, many, many dynamo failures pass for the CPU tests.

Pull Request resolved: pytorch#114689
Approved by: https://github.com/cpuhrsch
1 parent 77186af commit 4e29f01
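
For readers comparing the two APIs, the following is a minimal sketch (not part of the commit itself) of the old negation-style selection next to the new allow-list style. It assumes a CUDA device on which the flash kernel is eligible:

    import torch
    import torch.nn.functional as F
    from torch.nn.attention import SDPBackend, sdpa_kernel

    q = k = v = torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16)

    # Deprecated: ask for flash attention by turning everything else off.
    with torch.backends.cuda.sdp_kernel(
        enable_flash=True, enable_math=False, enable_mem_efficient=False
    ):
        out = F.scaled_dot_product_attention(q, k, v)

    # New: name the backend(s) you actually want to allow.
    with sdpa_kernel([SDPBackend.FLASH_ATTENTION]):
        out = F.scaled_dot_product_attention(q, k, v)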

File tree

13 files changed: +225 -1073 lines

@@ -0,0 +1,11 @@
+.. role:: hidden
+    :class: hidden-section
+.. currentmodule:: {{ module }}
+
+
+{{ name | underline}}
+
+.. autoclass:: {{ name }}
+    :members:
+
+.. autogenerated from source/_templates/autosummary/class.rst

docs/source/backends.rst (-2)

@@ -68,8 +68,6 @@ torch.backends.cuda
 
 .. autofunction:: torch.backends.cuda.preferred_linalg_library
 
-.. autoclass:: torch.backends.cuda.SDPBackend
-
 .. autoclass:: torch.backends.cuda.SDPAParams
 
 .. autofunction:: torch.backends.cuda.flash_sdp_enabled

docs/source/index.rst (+1 -1)

@@ -93,7 +93,7 @@ Features described in this documentation are classified by release status:
    torch.package <package>
    profiler
    nn.init
-   nn.attention.bias
+   nn.attention
    onnx
    optim
    complex_numbers

docs/source/nn.attention.bias.rst (+7 -1)

@@ -10,7 +10,13 @@ torch.nn.attention.bias
 CausalBias
 ==========
 
-.. autoclass:: CausalBias
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+    :template: classnoinheritance.rst
+
+    CausalBias
+
 
 .. autosummary::
     :toctree: generated

docs/source/nn.attention.rst (+28)

@@ -0,0 +1,28 @@
+.. role:: hidden
+    :class: hidden-section
+
+torch.nn.attention
+==================
+
+.. automodule:: torch.nn.attention
+
+Utils
+-------------------
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+
+    sdpa_kernel
+    SDPBackend
+
+Submodules
+----------
+.. autosummary::
+    :nosignatures:
+
+    bias
+
+.. toctree::
+    :hidden:
+
+    nn.attention.bias

docs/source/nn.rst (-1)

@@ -527,7 +527,6 @@ Lazy Modules Initialization
 
 .. This module needs to be documented. Adding here in the meantime
 .. for tracking purposes
-.. py:module:: torch.nn.attention
 .. py:module:: torch.nn.backends
 .. py:module:: torch.nn.utils.stateless
 .. py:module:: torch.nn.backends.thnn

test/test_transformers.py (+66 -75)

Large diffs are not rendered by default.

torch/backends/cuda/__init__.py (+26 -15)

@@ -1,4 +1,5 @@
 import contextlib
+import warnings
 
 from typing import Union
 
@@ -13,7 +14,6 @@
     "preferred_linalg_library",
     "cufft_plan_cache",
     "matmul",
-    "SDPBackend",
     "SDPAParams",
     "enable_flash_sdp",
     "flash_sdp_enabled",
@@ -204,10 +204,9 @@ def preferred_linalg_library(
     return torch._C._get_linalg_preferred_backend()
 
 
-from torch._C import _SDPAParams as SDPAParams, _SDPBackend as SDPBackend
+from torch._C import _SDPAParams as SDPAParams
 
 # Set the __module__ attribute
-SDPBackend.__module__ = "torch.backends.cuda"
 SDPAParams.__module__ = "torch.backends.cuda"
 SDPAParams.__name__ = "SDPAParams"
 
@@ -318,18 +317,30 @@ def sdp_kernel(
     This context manager can be used to temporarily enable or disable any of the three backends for scaled dot product attention.
     Upon exiting the context manager, the previous state of the flags will be restored.
     """
-    previous_flash: bool = flash_sdp_enabled()
-    previous_mem_efficient: bool = mem_efficient_sdp_enabled()
-    previous_math: bool = math_sdp_enabled()
-    try:
-        enable_flash_sdp(enable_flash)
-        enable_mem_efficient_sdp(enable_mem_efficient)
-        enable_math_sdp(enable_math)
-        yield {}
-    finally:
-        enable_flash_sdp(previous_flash)
-        enable_mem_efficient_sdp(previous_mem_efficient)
-        enable_math_sdp(previous_math)
+    warnings.warn(
+        (
+            "torch.backends.cuda.sdp_kernel() "
+            "is deprecated. In the future, this context manager will be removed. "
+            "Please see, torch.nn.attention.sdpa_kernel() for the new context manager, with updated "
+            "signature."
+        ),
+        FutureWarning,
+    )
+    from torch.nn.attention import sdpa_kernel, SDPBackend
+
+    backend_list = []
+    if enable_flash:
+        backend_list.append(SDPBackend.FLASH_ATTENTION)
+    if enable_mem_efficient:
+        backend_list.append(SDPBackend.EFFICIENT_ATTENTION)
+    if enable_math:
+        backend_list.append(SDPBackend.MATH)
+
+    with sdpa_kernel(backend_list) as context:
+        try:
+            yield context
+        finally:
+            pass
 
 
 cufft_plan_cache = cuFFTPlanCacheManager()
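
The deprecated `sdp_kernel` is now a thin shim: it emits a `FutureWarning`, translates its boolean flags into a list of `SDPBackend` values, and delegates to `torch.nn.attention.sdpa_kernel`. A small sketch of the observable behavior after this change (illustrative, not from the diff):

    import warnings
    import torch

    # Entering the deprecated manager raises a FutureWarning and delegates to
    # torch.nn.attention.sdpa_kernel with the equivalent backend list.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        with torch.backends.cuda.sdp_kernel(
            enable_flash=False, enable_math=True, enable_mem_efficient=False
        ):
            pass  # equivalent to sdpa_kernel([SDPBackend.MATH])

    assert any(issubclass(w.category, FutureWarning) for w in caught)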

torch/csrc/Module.cpp (+3 -1)

@@ -1809,7 +1809,9 @@ Call this whenever a new thread is created in order to propagate values from
   py::enum_<sdp::SDPBackend>(
       py_module,
       "_SDPBackend",
-      "Enum class for the scaled dot product attention backends\n\n... warning:: This class is in beta and subject to change.")
+      "An enum-like class that contains the different backends for scaled dot product attention.\n\n... warning:: This class is in beta and subject to change.\n\n"
+      "This backend class is designed to be used with the sdpa_kernel context manager."
+      "See :func: torch.nn.attention.sdpa_kernel for more details.")
       .value("ERROR", sdp::SDPBackend::error)
       .value("MATH", sdp::SDPBackend::math)
       .value("FLASH_ATTENTION", sdp::SDPBackend::flash_attention)

torch/nested/_internal/sdpa.py (+2 -1)

@@ -12,9 +12,10 @@
     math_sdp_enabled,
     mem_efficient_sdp_enabled,
     SDPAParams,
-    SDPBackend,
 )
 
+from torch.nn.attention import SDPBackend
+
 from .nested_tensor import buffer_from_jagged, NestedTensor, ViewNestedFromBuffer
 
 log = logging.getLogger(__name__)

torch/nn/attention/__init__.py (+62 -3)

@@ -1,16 +1,24 @@
-from typing import List
+""" This module contains functions and classes that alter the behavior of torch.nn.functional.scaled_dot_product_attention """
+import contextlib
+from typing import List, Union
 from warnings import warn
 
 from torch.backends.cuda import (
     can_use_efficient_attention,
     can_use_flash_attention,
+    enable_flash_sdp,
+    enable_math_sdp,
+    enable_mem_efficient_sdp,
+    flash_sdp_enabled,
+    math_sdp_enabled,
+    mem_efficient_sdp_enabled,
     SDPAParams,
 )
 
-__all__: List[str] = []
+__all__: List[str] = ["SDPBackend", "sdpa_kernel", "WARN_FOR_UNFUSED_KERNELS"]
 
 # Note: [SDPA warnings]
-# TODO: Consider using this to sdpa regardless of subclasses
+# TODO: Consider using this for sdpa regardless of subclasses
 # This only effects users of bias subclasses
 # If this is set to True, we will warn the user if they are not using the fused kernels
 # As well, it will raise warnings for all the reasons why the fused kernels can't be run.
@@ -19,6 +27,21 @@
 WARN_FOR_UNFUSED_KERNELS = False
 
 
+from torch._C import _SDPBackend as SDPBackend
+
+# Hacks for Sphinx documentation:
+# https://stackoverflow.com/questions/38765577/overriding-sphinx-autodoc-alias-of-for-import-of-private-class
+SDPBackend = SDPBackend
+r"""An enum-like class that contains the different backends for scaled dot product attention.
+This backend class is designed to be used with the sdpa_kernel context manager.
+See :func: torch.nn.attention.sdpa_kernel for more details.
+
+... warning:: This class is in beta and subject to change.
+"""
+SDPBackend.__module__ = __name__
+SDPBackend.__name__ = "SDPBackend"
+
+
 def _raise_kernel_warnings(params: SDPAParams) -> None:
     """
     If WARN_FOR_UNFUSED_KERNELS is set to True, this will raise warnings
@@ -31,3 +54,39 @@ def _raise_kernel_warnings(params: SDPAParams) -> None:
         if not can_use_flash_attention(params):
             warn("Flash attention can't be used because:")
             can_use_flash_attention(params, True)
+
+
+@contextlib.contextmanager
+def sdpa_kernel(backends: List[SDPBackend]):
+    r"""
+    Context manager to select which backend to use for scaled dot product attention.
+
+    .. warning:: This function is beta and subject to change.
+
+    Args:
+        backend (Union[List[SDPBackend], SDPBackend]): A backend or list of backends for scaled dot product attention.
+
+    This context manager can be used to select which backend to use for scaled dot product attention.
+    Upon exiting the context manager, the previous state of the flags will be restored, enabling all backends.
+    """
+    assert backends is None or isinstance(
+        backends, list
+    ), "Backend must be an instance of SDPBackend or a list of SDPBackend instances"
+
+    backends = set(backends)
+    previous_flash: bool = flash_sdp_enabled()
+    previous_mem_efficient: bool = mem_efficient_sdp_enabled()
+    previous_math: bool = math_sdp_enabled()
+    try:
+        enable_flash = SDPBackend.FLASH_ATTENTION in backends
+        enable_mem_efficient = SDPBackend.EFFICIENT_ATTENTION in backends
+        enable_math = SDPBackend.MATH in backends
+
+        enable_flash_sdp(enable_flash)
+        enable_mem_efficient_sdp(enable_mem_efficient)
+        enable_math_sdp(enable_math)
+        yield {}
+    finally:
+        enable_flash_sdp(previous_flash)
+        enable_mem_efficient_sdp(previous_mem_efficient)
+        enable_math_sdp(previous_math)
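
The new `sdpa_kernel` maps the allowed-backend list onto the existing `enable_*_sdp` flags and restores the previous flag state on exit. A short usage sketch, assuming the default configuration in which all backends start enabled (the final assert is illustrative under that assumption):

    import torch
    import torch.nn.functional as F
    from torch.nn.attention import SDPBackend, sdpa_kernel

    q = k = v = torch.randn(1, 4, 32, 16)

    # Only the math (reference) backend is allowed inside the block;
    # flash and memory-efficient attention are disabled for its duration.
    with sdpa_kernel([SDPBackend.MATH]):
        out = F.scaled_dot_product_attention(q, k, v)

    # On exit the previous flag state is restored, so with default settings
    # the flash backend is enabled again.
    assert torch.backends.cuda.flash_sdp_enabled()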

torch/nn/attention/bias.py (+1 -1)

@@ -1,4 +1,4 @@
-"""Defines utilities for interacting with scaled_dot_product_attention"""
+"""Defines bias subclasses that work with scaled_dot_product_attention"""
 from enum import auto, IntEnum
 from typing import Optional
 from warnings import warn
