
Commit 6900f7d

pavanbalaji authored and meta-codesync[bot] committed
Pass default values for hints more cleanly
Summary: Reduces the possibility of garbage values.

Reviewed By: siyengar

Differential Revision: D85972057

fbshipit-source-id: 4786a97ef5833eddbb9e26fc05d3e30145a5cce0
1 parent 7475b39 commit 6900f7d
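For readers skimming the diff below, here is a minimal, self-contained sketch of the pattern this commit applies; DemoComm, the bare hint key, and the placement of kDefaultMaxEventPoolSize are illustrative stand-ins, not the actual TorchComm API. Defaults move out of else branches in init() and into default member initializers on a nested config struct, so every hint-configurable field holds a well-defined value even when no hint is supplied.

// Hedged sketch only: illustrative names, not the real TorchComm classes.
#include <cstddef>
#include <map>
#include <string>

constexpr std::size_t kDefaultMaxEventPoolSize = 1000;

class DemoComm {
 public:
  void init(const std::map<std::string, std::string>& hints) {
    // Override the default only when the hint is present; no else branch is
    // needed because the member already carries its default value.
    if (auto it = hints.find("max_event_pool_size"); it != hints.end()) {
      configs_.max_event_pool_size = std::stoull(it->second);
    }
  }

 private:
  struct Configs {
    std::size_t max_event_pool_size{kDefaultMaxEventPoolSize};
  };
  Configs configs_;
};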

File tree

5 files changed: +19 -20 lines changed

comms/torchcomms/nccl/TorchCommNCCL.cpp

Lines changed: 1 addition & 1 deletion
@@ -151,7 +151,7 @@ void TorchCommNCCL::init(
     max_event_pool_size_ =
         std::stoull(options_.hints.at("torchcomm::nccl::max_event_pool_size"));
   } else {
-    max_event_pool_size_ = kMaxEventPoolSize;
+    max_event_pool_size_ = kDefaultMaxEventPoolSize;
   }
 
   // Give up our internal reference to the store object here. The caller

comms/torchcomms/nccl/TorchCommNCCL.hpp

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@
 namespace torch {
 namespace comms {
 
-constexpr size_t kMaxEventPoolSize = 1000;
+constexpr size_t kDefaultMaxEventPoolSize = 1000;
 
 // Custom exception class for better error handling
 class NCCLException : public std::exception {

comms/torchcomms/ncclx/TorchCommNCCLX.cpp

Lines changed: 3 additions & 9 deletions
@@ -151,25 +151,19 @@ void TorchCommNCCLX::init(
       "Failed to allocate barrier buffer");
 
   if (options_.hints.contains("torchcomm::ncclx::max_event_pool_size")) {
-    max_event_pool_size_ =
+    configs_.max_event_pool_size_ =
         std::stoull(options_.hints.at("torchcomm::ncclx::max_event_pool_size"));
-  } else {
-    max_event_pool_size_ = kMaxEventPoolSize;
   }
 
   if (options_.hints.contains(
           "torchcomm::ncclx::garbage_collect_interval_ms")) {
-    garbage_collect_interval_ms_ = std::stoull(
+    configs_.garbage_collect_interval_ms_ = std::stoull(
         options_.hints.at("torchcomm::ncclx::garbage_collect_interval_ms"));
-  } else {
-    garbage_collect_interval_ms_ = kGarbageCollectIntervalMs;
   }
 
   if (options_.hints.contains("torchcomm::ncclx::enable_cuda_graph_support")) {
-    enable_cuda_graph_support_ = string_to_bool(
+    configs_.enable_cuda_graph_support_ = string_to_bool(
         options_.hints.at("torchcomm::ncclx::enable_cuda_graph_support"));
-  } else {
-    enable_cuda_graph_support_ = kEnableCudaGraphSupport;
   }
 
   // Give up our internal reference to the store object here. The caller
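As a standalone illustration of the hint-override idiom in this hunk (assuming a plain std::map<std::string, std::string> as a stand-in for options_.hints, and omitting the repo's string_to_bool helper), the sketch below shows that a key left out of the hints map simply leaves the default in place, while std::stoull parses any value that is supplied.

// Sketch under the assumptions stated above; not the real TorchCommNCCLX::init().
#include <cstddef>
#include <iostream>
#include <map>
#include <string>

int main() {
  std::map<std::string, std::string> hints{
      {"torchcomm::ncclx::max_event_pool_size", "256"}};

  std::size_t max_event_pool_size = 1000;         // default already in place
  std::size_t garbage_collect_interval_ms = 100;  // no hint given, stays 100

  if (hints.contains("torchcomm::ncclx::max_event_pool_size")) {
    // std::stoull throws std::invalid_argument if the hint is not numeric.
    max_event_pool_size =
        std::stoull(hints.at("torchcomm::ncclx::max_event_pool_size"));
  }
  if (hints.contains("torchcomm::ncclx::garbage_collect_interval_ms")) {
    garbage_collect_interval_ms = std::stoull(
        hints.at("torchcomm::ncclx::garbage_collect_interval_ms"));
  }

  std::cout << max_event_pool_size << ' ' << garbage_collect_interval_ms
            << '\n';  // prints: 256 100
}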

comms/torchcomms/ncclx/TorchCommNCCLX.hpp

Lines changed: 11 additions & 6 deletions
@@ -28,9 +28,9 @@
 namespace torch {
 namespace comms {
 
-constexpr size_t kMaxEventPoolSize = 1000;
-constexpr size_t kGarbageCollectIntervalMs = 100;
-constexpr bool kEnableCudaGraphSupport = true;
+constexpr size_t kDefaultMaxEventPoolSize = 1000;
+constexpr size_t kDefaultGarbageCollectIntervalMs = 100;
+constexpr bool kDefaultEnableCudaGraphSupport = true;
 
 // Custom exception class for better error handling
 class NCCLException : public std::exception {
@@ -332,9 +332,14 @@ class TorchCommNCCLX : public TorchCommBackend,
   int comm_size_{};
   int rank_{};
   CommOptions options_;
-  size_t max_event_pool_size_{};
-  size_t garbage_collect_interval_ms_{};
-  bool enable_cuda_graph_support_{};
+
+  struct Configs {
+    size_t max_event_pool_size_{kDefaultMaxEventPoolSize};
+    size_t garbage_collect_interval_ms_{kDefaultGarbageCollectIntervalMs};
+    bool enable_cuda_graph_support_{kDefaultEnableCudaGraphSupport};
+  };
+  Configs configs_;
+
   cudaStream_t internal_stream_{};
   void* barrier_buffer_{}; // Pre-allocated CUDA buffer for barrier operations
   enum class InitializationState {
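One point worth noting about this header change: because the new nested Configs struct uses default member initializers, the configuration fields already hold usable values before init() touches them. A tiny sketch follows; DemoConfigs mirrors the shape of the nested struct but is not the real class.

#include <cassert>
#include <cstddef>

constexpr std::size_t kDefaultMaxEventPoolSize = 1000;
constexpr std::size_t kDefaultGarbageCollectIntervalMs = 100;
constexpr bool kDefaultEnableCudaGraphSupport = true;

// Illustrative mirror of the nested Configs aggregate.
struct DemoConfigs {
  std::size_t max_event_pool_size{kDefaultMaxEventPoolSize};
  std::size_t garbage_collect_interval_ms{kDefaultGarbageCollectIntervalMs};
  bool enable_cuda_graph_support{kDefaultEnableCudaGraphSupport};
};

int main() {
  DemoConfigs c;  // defaults hold without any init-style assignment
  assert(c.max_event_pool_size == 1000);
  assert(c.garbage_collect_interval_ms == 100);
  assert(c.enable_cuda_graph_support);
  return 0;
}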

comms/torchcomms/ncclx/TorchCommNCCLXUtils.cpp

Lines changed: 3 additions & 3 deletions
@@ -184,7 +184,7 @@ void TorchCommNCCLX::timeoutWatchdog() noexcept {
     // Wake up either after some time or immediately if shutdown is requested
     timeout_cv_.wait_for(
         lock,
-        std::chrono::milliseconds(garbage_collect_interval_ms_),
+        std::chrono::milliseconds(configs_.garbage_collect_interval_ms_),
         [this]() { return shutdown_.load(); });
 
     // If we're shutting down, exit the loop
@@ -254,7 +254,7 @@ void TorchCommNCCLX::checkAndAbortIfTimedOutOrError() {
 }
 
 bool TorchCommNCCLX::getGraphCaptureMode() {
-  if (!enable_cuda_graph_support_) {
+  if (!configs_.enable_cuda_graph_support_) {
     return false;
   }
 
@@ -430,7 +430,7 @@ cudaEvent_t TorchCommNCCLX::getEvent() {
 void TorchCommNCCLX::returnEvent(cudaEvent_t event) {
   std::lock_guard<std::mutex> lock(event_pool_mutex_);
 
-  if (event_pool_.size() < max_event_pool_size_) {
+  if (event_pool_.size() < configs_.max_event_pool_size_) {
     event_pool_.push(event);
   } else {
     // Pool is full, destroy the event