
Commit ca777fb

albanD authored and pytorchmergebot committed
Add Accelerator device and shell hooks (pytorch#119329)
This adds a concept of Accelerator that points to one of our devices. See DeviceAccelerator.h in this PR for details: https://github.com/pytorch/pytorch/pull/119329/files#diff-83cc748bed5df1a453c272cc5ecc7e572d4eb694c5125384d8fbd17a0b5f50c8

It also adds scaffolding for a shared C++ API to allow generic feature implementation. This PR in particular updates the autograd engine to use this generic API.

Pull Request resolved: pytorch#119329
Approved by: https://github.com/ezyang, https://github.com/huydhn
1 parent e9b78f2 commit ca777fb

16 files changed: +186 -103 lines

Diff for: aten/src/ATen/Context.h (+18)

@@ -1,11 +1,13 @@
 #pragma once
 
 #include <ATen/CPUGeneratorImpl.h>
+#include <ATen/DeviceAccelerator.h>
 #include <ATen/LinalgBackend.h>
 #include <ATen/core/ATenGeneral.h>
 #include <ATen/core/DeprecatedTypeProperties.h>
 #include <ATen/core/Generator.h>
 #include <ATen/core/LegacyTypeDispatch.h>
+#include <ATen/detail/AcceleratorHooksInterface.h>
 #include <ATen/detail/CUDAHooksInterface.h>
 #include <ATen/detail/HIPHooksInterface.h>
 #include <ATen/detail/IPUHooksInterface.h>
@@ -56,6 +58,22 @@ class TORCH_API Context {
       AT_ERROR(c10::DeviceTypeName(device_type), " device type not enabled.");
     }
   }
+  const AcceleratorHooksInterface& getAcceleratorHooksInterface(
+      c10::optional<c10::DeviceType> opt_device_type = c10::nullopt) {
+    c10::DeviceType device_type = opt_device_type.has_value()
+        ? opt_device_type.value()
+        : at::getAccelerator(true).value();
+    if (device_type == at::kCUDA) {
+      return at::detail::getCUDAHooks();
+    } else if (device_type == at::kMPS) {
+      return at::detail::getMPSHooks();
+    } else if (device_type == at::kPrivateUse1) {
+      return at::detail::getPrivateUse1Hooks();
+    } else {
+      AT_ERROR(
+          c10::DeviceTypeName(device_type), " device type not an accelerator.");
+    }
+  }
   Device getDeviceFromPtr(void* data, c10::DeviceType device_type) {
     initCUDAIfNeeded(device_type);
     initHIPIfNeeded(device_type);
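
For illustration, a small usage sketch of the new accessor (mine, not part of the diff), assuming an ATen translation unit that includes <ATen/Context.h>. With no argument, the accessor resolves the device type via at::getAccelerator(true), so it throws when no accelerator is present; an explicit non-accelerator type hits the AT_ERROR branch instead.

#include <ATen/Context.h>

// Hypothetical helper: check device readiness without naming a backend.
bool accelerator_device_ready(c10::DeviceIndex index) {
  // No argument: device type falls back to at::getAccelerator(true),
  // which errors out if this build/process has no accelerator.
  const at::AcceleratorHooksInterface& hooks =
      at::globalContext().getAcceleratorHooksInterface();
  return hooks.hasPrimaryContext(index);
}

// Explicit device type: dispatches to the CUDA hooks; a type outside
// {kCUDA, kMPS, kPrivateUse1} would raise the AT_ERROR above instead.
bool cuda_device_ready(c10::DeviceIndex index) {
  return at::globalContext()
      .getAcceleratorHooksInterface(at::kCUDA)
      .hasPrimaryContext(index);
}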

Diff for: aten/src/ATen/DeviceAccelerator.cpp (+31)

@@ -0,0 +1,31 @@
+#include <ATen/DeviceAccelerator.h>
+#include <ATen/Context.h>
+
+namespace at {
+
+C10_API std::optional<DeviceType> getAccelerator(bool checked) {
+#define CHECK_NO_CUDA \
+  TORCH_CHECK(!at::hasCUDA(), "Cannot have both CUDA and PrivateUse1");
+
+#define CHECK_NO_PU1 \
+  TORCH_CHECK(!is_privateuse1_backend_registered(), "Cannot have both CUDA and PrivateUse1");
+
+  if (is_privateuse1_backend_registered()) {
+    // We explicitly allow PrivateUse1 and another device at the same time
+    // as we use this for testing.
+    // Whenever a PrivateUse1 device is registered, use it first.
+    return kPrivateUse1;
+  } else if (at::hasCUDA()) {
+    CHECK_NO_PU1
+    return kCUDA;
+  } else {
+    TORCH_CHECK(!checked, "Cannot access accelerator device when none is available.")
+    return std::nullopt;
+  }
+
+#undef CHECK_NO_CUDA
+#undef CHECK_NO_PU1
+}
+
+
+} // namespace at

Diff for: aten/src/ATen/DeviceAccelerator.h (+27)

@@ -0,0 +1,27 @@
+#pragma once
+
+#include <c10/core/DeviceType.h>
+#include <c10/macros/Macros.h>
+
+#include <ATen/detail/MTIAHooksInterface.h>
+#include <optional>
+
+// This file defines the top level Accelerator concept for PyTorch.
+// A device is an accelerator per the definition here if:
+// - It is mutually exclusive with all other accelerators
+// - It performs asynchronous compute via a Stream/Event system
+// - It provides a set of common APIs as defined by AcceleratorHooksInterface
+//
+// As of today, accelerator devices are (in no particular order):
+// CUDA, MTIA, PrivateUse1
+// We want to add once all the proper APIs are supported and tested:
+// HIP, MPS, XPU
+
+namespace at {
+
+// Ensures that only one accelerator is available (at
+// compile time if possible) and return it.
+// When checked is true, the returned optional always has a value.
+TORCH_API std::optional<DeviceType> getAccelerator(bool checked = false);
+
+} // namespace at
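
As a usage sketch (assumed from the implementation above, not part of the PR), the checked flag trades an optional for a guarantee:

#include <ATen/DeviceAccelerator.h>

void probe_accelerator() {
  // checked = false (default): returns std::nullopt on a build with
  // no accelerator instead of throwing, so callers can branch.
  if (auto accel = at::getAccelerator()) {
    // PrivateUse1 takes precedence when registered; otherwise this
    // is kCUDA when CUDA is available.
    at::DeviceType device_type = *accel;
    (void)device_type;
  }

  // checked = true: the optional is guaranteed to hold a value, and
  // TORCH_CHECK fires instead when no accelerator is available.
  at::DeviceType required = at::getAccelerator(/*checked=*/true).value();
  (void)required;
}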

Diff for: aten/src/ATen/detail/AcceleratorHooksInterface.h (+21)

@@ -0,0 +1,21 @@
+#pragma once
+
+#include <c10/core/Device.h>
+
+namespace at {
+
+// AcceleratorHooksInterface is a shared interface provided by all
+// accelerators to allow generic code.
+// This interface is hook-based as it corresponds to all the functions
+// that are going to be called in a generic way from the CPU code.
+
+struct TORCH_API AcceleratorHooksInterface {
+  // This should never actually be implemented, but it is used to
+  // squelch -Werror=non-virtual-dtor
+  virtual ~AcceleratorHooksInterface() = default;
+
+  // Whether the device at device_index is fully initialized or not.
+  virtual bool hasPrimaryContext(DeviceIndex device_index) const = 0;
+};
+
+} // namespace at
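
To make the contract concrete, here is a minimal hypothetical backend satisfying the interface (my sketch, not from the PR; real integrations also register their hooks with ATen, which is omitted here):

#include <ATen/detail/AcceleratorHooksInterface.h>

// Hypothetical out-of-tree accelerator implementing the one pure
// virtual hook. A real backend would query its runtime instead of
// hard-coding the answer.
struct ToyAcceleratorHooks final : at::AcceleratorHooksInterface {
  bool hasPrimaryContext(at::DeviceIndex device_index) const override {
    // Pretend only device 0 exists and is always initialized.
    return device_index == 0;
  }
};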

Diff for: aten/src/ATen/detail/CUDAHooksInterface.h (+4 -2)

@@ -4,6 +4,8 @@
 #include <c10/util/Exception.h>
 #include <c10/util/Registry.h>
 
+#include <ATen/detail/AcceleratorHooksInterface.h>
+
 // Forward-declares at::Generator and at::cuda::NVRTC
 namespace at {
 struct Generator;
@@ -57,7 +59,7 @@ constexpr const char* CUDA_HELP =
 // TODO: Consider putting the stub definitions in another class, so that one
 // never forgets to implement each virtual function in the real implementation
 // in CUDAHooks. This probably doesn't buy us much though.
-struct TORCH_API CUDAHooksInterface {
+struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface {
   // This should never actually be implemented, but it is used to
   // squelch -Werror=non-virtual-dtor
   virtual ~CUDAHooksInterface() = default;
@@ -107,7 +109,7 @@ struct TORCH_API CUDAHooksInterface {
     TORCH_CHECK(false, "NVRTC requires CUDA. ", CUDA_HELP);
   }
 
-  virtual bool hasPrimaryContext(DeviceIndex device_index) const {
+  virtual bool hasPrimaryContext(DeviceIndex device_index) const override {
    TORCH_CHECK(false, "Cannot call hasPrimaryContext(", device_index, ") without ATen_cuda library. ", CUDA_HELP);
   }
 

Diff for: aten/src/ATen/detail/MPSHooksInterface.h (+5 -2)

@@ -4,14 +4,15 @@
 
 #include <c10/core/Allocator.h>
 #include <ATen/core/Generator.h>
+#include <ATen/detail/AcceleratorHooksInterface.h>
 #include <c10/util/Exception.h>
 #include <c10/util/Registry.h>
 
 #include <cstddef>
 
 namespace at {
 
-struct TORCH_API MPSHooksInterface {
+struct TORCH_API MPSHooksInterface : AcceleratorHooksInterface {
   // this fails the implementation if MPSHooks functions are called, but
   // MPS backend is not present.
   #define FAIL_MPSHOOKS_FUNC(func) \
@@ -86,7 +87,9 @@ struct TORCH_API MPSHooksInterface {
   virtual double elapsedTimeOfEvents(uint32_t start_event_id, uint32_t end_event_id) const {
     FAIL_MPSHOOKS_FUNC(__func__);
   }
-
+  virtual bool hasPrimaryContext(DeviceIndex device_index) const override {
+    FAIL_MPSHOOKS_FUNC(__func__);
+  }
   #undef FAIL_MPSHOOKS_FUNC
 };
 

Diff for: aten/src/ATen/detail/MTIAHooksInterface.h (+11 -1)

@@ -4,6 +4,8 @@
 
 #include <c10/util/Registry.h>
 
+#include <ATen/detail/AcceleratorHooksInterface.h>
+
 #include <string>
 
 namespace at {
@@ -17,7 +19,7 @@ constexpr const char* MTIA_HELP =
     "this error has occurred because you are trying "
     "to use some MTIA's functionality without MTIA extension included.";
 
-struct TORCH_API MTIAHooksInterface {
+struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface {
   virtual ~MTIAHooksInterface() = default;
 
   virtual void initMTIA() const {
@@ -37,6 +39,14 @@ struct TORCH_API MTIAHooksInterface {
         "Cannot query detailed MTIA version without MTIA Extension for PyTorch.",
         MTIA_HELP);
   }
+
+  virtual bool hasPrimaryContext(DeviceIndex device_index) const override {
+    TORCH_CHECK(
+        false,
+        "Cannot check MTIA primary context without MTIA Extension for PyTorch.",
+        MTIA_HELP);
+  }
+
 };
 
 struct TORCH_API MTIAHooksArgs {};

Diff for: aten/src/ATen/detail/PrivateUse1HooksInterface.cpp (+11)

@@ -22,4 +22,15 @@ TORCH_API bool isPrivateUse1HooksRegistered() {
   return privateuse1_hooks != nullptr;
 }
 
+namespace detail {
+
+TORCH_API const at::PrivateUse1HooksInterface& getPrivateUse1Hooks() {
+  TORCH_CHECK(
+      privateuse1_hooks != nullptr,
+      "Please register PrivateUse1HooksInterface by `RegisterPrivateUse1HooksInterface` first.");
+  return *privateuse1_hooks;
 }
+
+} // namespace detail
+
+} // namespace at

Diff for: aten/src/ATen/detail/PrivateUse1HooksInterface.h (+9 -2)

@@ -1,13 +1,14 @@
 #pragma once
 
 #include <ATen/core/Generator.h>
+#include <ATen/detail/AcceleratorHooksInterface.h>
 #include <c10/core/Allocator.h>
 #include <c10/core/Device.h>
 #include <c10/core/Storage.h>
 #include <c10/util/Exception.h>
 namespace at {
 
-struct TORCH_API PrivateUse1HooksInterface {
+struct TORCH_API PrivateUse1HooksInterface : AcceleratorHooksInterface {
   virtual ~PrivateUse1HooksInterface() = default;
   virtual const at::Generator& getDefaultGenerator(
       c10::DeviceIndex device_index) {
@@ -28,7 +29,7 @@ struct TORCH_API PrivateUse1HooksInterface {
         "You should register `PrivateUse1HooksInterface` for PrivateUse1 before call `getPinnedMemoryAllocator`.");
   }
 
-  virtual bool hasPrimaryContext(DeviceIndex device_index) const {
+  virtual bool hasPrimaryContext(DeviceIndex device_index) const override {
     TORCH_CHECK_NOT_IMPLEMENTED(
         false,
         "You should register `PrivateUse1HooksInterface` for PrivateUse1 before call `hasPrimaryContext`.");
@@ -51,4 +52,10 @@ TORCH_API at::PrivateUse1HooksInterface* GetPrivateUse1HooksInterface();
 
 TORCH_API bool isPrivateUse1HooksRegistered();
 
+namespace detail {
+
+TORCH_API const at::PrivateUse1HooksInterface& getPrivateUse1Hooks();
+
+} // namespace detail
+
 } // namespace at
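
A registration sketch to tie these pieces together (my assumptions: `RegisterPrivateUse1HooksInterface`, which the error strings above reference, takes a pointer to a long-lived hooks object; its exact signature is not shown in this diff):

#include <ATen/detail/PrivateUse1HooksInterface.h>

// Hypothetical PrivateUse1 backend: only hasPrimaryContext is
// overridden for this sketch; the other virtuals keep their
// "not registered" stub behavior from the base class.
struct ToyPrivateUse1Hooks : at::PrivateUse1HooksInterface {
  bool hasPrimaryContext(at::DeviceIndex device_index) const override {
    return true; // assume the single device is always initialized
  }
};

void register_toy_backend() {
  // The object must outlive every use: after registration,
  // at::detail::getPrivateUse1Hooks() returns it instead of throwing.
  static ToyPrivateUse1Hooks hooks;
  at::RegisterPrivateUse1HooksInterface(&hooks);
}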

Diff for: aten/src/ATen/mps/MPSHooks.h (+6)

@@ -46,6 +46,12 @@ struct MPSHooks : public at::MPSHooksInterface {
   void synchronizeEvent(uint32_t event_id) const override;
   bool queryEvent(uint32_t event_id) const override;
   double elapsedTimeOfEvents(uint32_t start_event_id, uint32_t end_event_id) const override;
+
+  // Compatibility with Accelerator API
+  bool hasPrimaryContext(DeviceIndex device_index) const override {
+    // When MPS is available, it is always in use for the one device.
+    return true;
+  }
 };
 
 } // namespace at::mps

Diff for: build_variables.bzl (+2)

@@ -956,6 +956,7 @@ aten_cpu_non_globed_sources = [
 aten_cpu_non_globed_headers = [
     "aten/src/ATen/CPUGeneratorImpl.h",
     "aten/src/ATen/NumericUtils.h",
+    "aten/src/ATen/detail/AcceleratorHooksInterface.h",
     "aten/src/ATen/detail/CUDAHooksInterface.h",
     "aten/src/ATen/detail/MPSHooksInterface.h",
     "aten/src/ATen/detail/HIPHooksInterface.h",
@@ -970,6 +971,7 @@ aten_cpu_source_non_codegen_list = [
     "aten/src/ATen/AccumulateType.cpp",
     "aten/src/ATen/LegacyBatchedTensorImpl.cpp",
     "aten/src/ATen/CPUGeneratorImpl.cpp",
+    "aten/src/ATen/DeviceAccelerator.cpp",
     "aten/src/ATen/Context.cpp",
     "aten/src/ATen/DLConvertor.cpp",
     "aten/src/ATen/EmptyTensor.cpp",
