10 changes: 8 additions & 2 deletions HeterogeneousCore/CUDACore/src/GPUCuda.cc
@@ -34,14 +34,20 @@ namespace heterogeneous {
return;
}

// TODO: possible ideas to improve the "assignment" logic include
// For starters we "statically" assign the device based on
// edm::Stream number. This is suboptimal if the number of
// edm::Streams is not a multiple of the number of CUDA devices
// (and even then there is no load balancing).
//
// TODO: improve. Possible ideas include
// - allocate M (< N(edm::Streams)) buffers per device per module, choose dynamically which (buffer, device) to use
// * the first module of a chain dictates the device for the rest of the chain
// - our own CUDA memory allocator
// * being able to cheaply allocate+deallocate scratch memory allows to make the execution fully dynamic e.g. based on current load
// * would probably still need some buffer space/device to hold e.g. conditions data
// - for conditions, how to handle multiple lumis per job?
deviceId_ = cudacore::chooseCUDADevice(id);
deviceId_ = id % cudaService->numberOfDevices();

cuda::device::current::scoped_override_t<> setDeviceForThisScope(deviceId_);

// Create the CUDA stream for this module-edm::Stream pair
2 changes: 1 addition & 1 deletion HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc
@@ -13,6 +13,6 @@ namespace cudacore {
// (and even then there is no load balancing).
//
// TODO: improve the "assignment" logic
return cudaService->devices()[id % cudaService->numberOfDevices()];
return id % cudaService->numberOfDevices();
}
}
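
As a quick illustration of the assignment logic touched in these two files: the mapping is a plain round-robin of edm::Stream numbers over the available devices. The snippet below is a standalone sketch (not part of this PR, with an assumed device count of 3) showing why a stream count that is not a multiple of the device count leaves the load uneven.

#include <iostream>

// Standalone sketch of the static assignment; chooseDevice stands in for
// cudacore::chooseCUDADevice(id) / the inlined "id % numberOfDevices" above.
int chooseDevice(int streamId, int numberOfDevices) {
  return streamId % numberOfDevices;
}

int main() {
  const int numberOfDevices = 3;  // assumed value for the example
  for (int streamId = 0; streamId < 4; ++streamId) {
    // With 4 edm::Streams and 3 devices, device 0 gets two streams and the
    // others get one each: static round-robin does no load balancing.
    std::cout << "edm::Stream " << streamId << " -> CUDA device "
              << chooseDevice(streamId, numberOfDevices) << '\n';
  }
  return 0;
}
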
1 change: 0 additions & 1 deletion HeterogeneousCore/CUDAServices/bin/BuildFile.xml
@@ -4,5 +4,4 @@

<bin name="cudaIsEnabled" file="cudaIsEnabled.cpp">
<use name="cuda"/>
<use name="HeterogeneousCore/CUDAUtilities"/>
</bin>
28 changes: 26 additions & 2 deletions HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp
@@ -1,7 +1,31 @@
#include <algorithm>
#include <array>
#include <cstdlib>
#include <iostream>

#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"
#include <cuda_runtime.h>

int main() {
return supportedCUDADevices().empty() ? EXIT_FAILURE : EXIT_SUCCESS;
int devices = 0;
auto status = cudaGetDeviceCount(& devices);
if (status != cudaSuccess) {
return EXIT_FAILURE;
}

int minimumMajor = 6; // min minor is implicitly 0

// This approach (requiring all devices are supported) is rather
// conservative. In principle we could consider just dropping the
// unsupported devices. Currently that would be easiest to achieve
// in CUDAService though.
for (int i = 0; i < devices; ++i) {
cudaDeviceProp properties;
cudaGetDeviceProperties(&properties, i);

if(properties.major < minimumMajor) {
return EXIT_FAILURE;
}
}

return EXIT_SUCCESS;
}
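
The comment above notes that requiring every device to pass the compute-capability check is conservative, and that unsupported devices could instead just be dropped. A rough sketch of that alternative (illustrative only, not part of this change; the helper name is hypothetical) could look like:

#include <cuda_runtime.h>
#include <vector>

// Hypothetical helper that keeps only the devices meeting the compute
// capability floor, instead of failing when any device is below it.
std::vector<int> usableDevices(int minimumMajor = 6) {
  std::vector<int> result;
  int devices = 0;
  if (cudaGetDeviceCount(&devices) != cudaSuccess) {
    return result;  // no usable CUDA runtime -> empty list
  }
  for (int i = 0; i < devices; ++i) {
    cudaDeviceProp properties;
    if (cudaGetDeviceProperties(&properties, i) == cudaSuccess
        && properties.major >= minimumMajor) {
      result.push_back(i);
    }
  }
  return result;
}
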
4 changes: 0 additions & 4 deletions HeterogeneousCore/CUDAServices/interface/CUDAService.h
@@ -52,9 +52,6 @@ class CUDAService {

int numberOfDevices() const { return numberOfDevices_; }

// devices supported by the CUDA configuration and compilation flags
std::vector<int> const& devices() const { return supportedDevices_; }

// major, minor
std::pair<int, int> computeCapability(int device) { return computeCapabilities_.at(device); }

@@ -155,7 +152,6 @@ class CUDAService {
std::unique_ptr<CUDAEventCache> cudaEventCache_;

int numberOfDevices_ = 0;
std::vector<int> supportedDevices_;
std::vector<std::pair<int, int>> computeCapabilities_;
bool enabled_ = false;
};
14 changes: 7 additions & 7 deletions HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc
@@ -30,15 +30,15 @@ class CUDAMonitoringService {
void postEvent(edm::StreamContext const& sc);

private:
std::vector<int> devices_;
int numberOfDevices_ = 0;
};

CUDAMonitoringService::CUDAMonitoringService(edm::ParameterSet const& config, edm::ActivityRegistry& registry) {
// make sure that CUDA is initialised, and that the CUDAService destructor is called after this service's destructor
edm::Service<CUDAService> cudaService;
if(!cudaService->enabled())
return;
devices_ = cudaService->devices();
numberOfDevices_ = cudaService->numberOfDevices();

if(config.getUntrackedParameter<bool>("memoryConstruction")) {
registry.watchPostModuleConstruction(this, &CUDAMonitoringService::postModuleConstruction);
@@ -66,10 +66,10 @@ void CUDAMonitoringService::fillDescriptions(edm::ConfigurationDescriptions & de
// activity handlers
namespace {
template <typename T>
void dumpUsedMemory(T& log, std::vector<int> const& devices) {
void dumpUsedMemory(T& log, int num) {
int old = 0;
cudaCheck(cudaGetDevice(&old));
for(int i: devices) {
for(int i = 0; i < num; ++i) {
size_t freeMemory, totalMemory;
cudaCheck(cudaSetDevice(i));
cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
@@ -82,19 +82,19 @@ namespace {
void CUDAMonitoringService::postModuleConstruction(edm::ModuleDescription const& desc) {
auto log = edm::LogPrint("CUDAMonitoringService");
log << "CUDA device memory after construction of " << desc.moduleLabel() << " (" << desc.moduleName() << ")";
dumpUsedMemory(log, devices_);
dumpUsedMemory(log, numberOfDevices_);
}

void CUDAMonitoringService::postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) {
auto log = edm::LogPrint("CUDAMonitoringService");
log<< "CUDA device memory after beginStream() of " << mcc.moduleDescription()->moduleLabel() << " (" << mcc.moduleDescription()->moduleName() << ")";
dumpUsedMemory(log, devices_);
dumpUsedMemory(log, numberOfDevices_);
}

void CUDAMonitoringService::postEvent(edm::StreamContext const& sc) {
auto log = edm::LogPrint("CUDAMonitoringService");
log << "CUDA device memory after event";
dumpUsedMemory(log, devices_);
dumpUsedMemory(log, numberOfDevices_);
}

DEFINE_FWK_SERVICE(CUDAMonitoringService);
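
For reference, the per-device loop in dumpUsedMemory() boils down to saving the current device, visiting each device index in turn, and querying cudaMemGetInfo(). A minimal standalone version (assuming std::cout in place of edm::LogPrint, and omitting the cudaCheck error handling) is:

#include <cuda_runtime.h>
#include <iostream>

void dumpUsedMemory(int numberOfDevices) {
  int old = 0;
  cudaGetDevice(&old);                 // remember the currently active device
  for (int i = 0; i < numberOfDevices; ++i) {
    size_t freeMemory = 0, totalMemory = 0;
    cudaSetDevice(i);
    cudaMemGetInfo(&freeMemory, &totalMemory);
    std::cout << "  device " << i << ": used "
              << (totalMemory - freeMemory) << " / " << totalMemory << " bytes\n";
  }
  cudaSetDevice(old);                  // restore the original device
}
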
32 changes: 16 additions & 16 deletions HeterogeneousCore/CUDAServices/src/CUDAService.cc
@@ -12,7 +12,6 @@
#include "FWCore/Utilities/interface/ReusableObjectHolder.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"

#include "CachingDeviceAllocator.h"
#include "CachingHostAllocator.h"
@@ -95,10 +94,10 @@ namespace {
}
}

void devicePreallocate(CUDAService& cs, const std::vector<unsigned int>& bufferSizes) {
void devicePreallocate(CUDAService& cs, int numberOfDevices, const std::vector<unsigned int>& bufferSizes) {
int device;
cudaCheck(cudaGetDevice(&device));
for (int i : cs.devices()) {
for(int i=0; i<numberOfDevices; ++i) {
cudaCheck(cudaSetDevice(i));
preallocate<cudautils::device::unique_ptr>([&](size_t size, cuda::stream_t<>& stream) {
return cs.make_device_unique<char[]>(size, stream);
@@ -122,14 +121,14 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
return;
}

supportedDevices_ = supportedCUDADevices();
numberOfDevices_ = supportedDevices_.size();
if (numberOfDevices_ == 0) {
auto status = cudaGetDeviceCount(&numberOfDevices_);
if (cudaSuccess != status) {
edm::LogWarning("CUDAService") << "Failed to initialize the CUDA runtime.\n" << "Disabling the CUDAService.";
return;
}
edm::LogInfo log("CUDAService");
log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " supported compute devices.\n\n";
computeCapabilities_.reserve(numberOfDevices_);
log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n\n";

auto const& limits = config.getUntrackedParameter<edm::ParameterSet>("limits");
auto printfFifoSize = limits.getUntrackedParameter<int>("cudaLimitPrintfFifoSize");
@@ -138,20 +137,18 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
auto devRuntimeSyncDepth = limits.getUntrackedParameter<int>("cudaLimitDevRuntimeSyncDepth");
auto devRuntimePendingLaunchCount = limits.getUntrackedParameter<int>("cudaLimitDevRuntimePendingLaunchCount");

int lastDevice = supportedDevices_.back();
computeCapabilities_.resize(lastDevice + 1, std::make_pair(0, 0));
for (int i: supportedDevices_) {
for (int i = 0; i < numberOfDevices_; ++i) {
// read information about the compute device.
// see the documentation of cudaGetDeviceProperties() for more information.
cudaDeviceProp properties;
cudaCheck(cudaGetDeviceProperties(&properties, i));
log << "CUDA device " << i << ": " << properties.name << '\n';

// compute capabilities
computeCapabilities_[i] = std::make_pair(properties.major, properties.minor);
log << " compute capability: " << properties.major << "." << properties.minor << " (sm_" << properties.major << properties.minor << ")\n";
computeCapabilities_.emplace_back(properties.major, properties.minor);
log << " streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n';
log << " CUDA cores: " << std::setw(28) << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor) << '\n';
log << " CUDA cores: " << std::setw(28) << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor ) << '\n';
log << " single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio << ":1\n";

// compute mode
@@ -294,7 +291,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
size_t minCachedBytes = std::numeric_limits<size_t>::max();
int currentDevice;
cudaCheck(cudaGetDevice(&currentDevice));
for (int i: supportedDevices_) {
for (int i = 0; i < numberOfDevices_; ++i) {
size_t freeMemory, totalMemory;
cudaCheck(cudaSetDevice(i));
cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
@@ -343,7 +340,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
enabled_ = true;

// Preallocate buffers if asked to
devicePreallocate(*this, allocator.getUntrackedParameter<std::vector<unsigned int> >("devicePreallocate"));
devicePreallocate(*this, numberOfDevices_, allocator.getUntrackedParameter<std::vector<unsigned int> >("devicePreallocate"));
hostPreallocate(*this, allocator.getUntrackedParameter<std::vector<unsigned int> >("hostPreallocate"));
}

@@ -356,7 +353,7 @@ CUDAService::~CUDAService() {
cudaEventCache_.reset();
cudaStreamCache_.reset();

for (int i: supportedDevices_) {
for (int i = 0; i < numberOfDevices_; ++i) {
cudaCheck(cudaSetDevice(i));
cudaCheck(cudaDeviceSynchronize());
// Explicitly destroys and cleans up all resources associated with the current device in the
@@ -401,7 +398,7 @@ int CUDAService::deviceWithMostFreeMemory() const {

size_t maxFreeMemory = 0;
int device = -1;
for (int i: supportedDevices_) {
for(int i = 0; i < numberOfDevices_; ++i) {
/*
// TODO: understand why the api-wrappers version gives same value for all devices
auto device = cuda::device::get(i);
@@ -435,6 +432,9 @@ struct CUDAService::Allocator {
template <typename ...Args>
Allocator(size_t max, Args&&... args): maxAllocation(max), deviceAllocator(args...), hostAllocator(std::forward<Args>(args)...) {}

void devicePreallocate(int numberOfDevices, const std::vector<unsigned int>& bytes);
void hostPreallocate(int numberOfDevices, const std::vector<unsigned int>& bytes);

size_t maxAllocation;
notcub::CachingDeviceAllocator deviceAllocator;
notcub::CachingHostAllocator hostAllocator;
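
Similarly, deviceWithMostFreeMemory() (touched in the hunk above) just scans the devices and keeps the index with the largest free memory reported by cudaMemGetInfo(). A stripped-down sketch, without the framework's cudaCheck wrappers or device save/restore:

#include <cuda_runtime.h>

int deviceWithMostFreeMemory(int numberOfDevices) {
  size_t maxFreeMemory = 0;
  int device = -1;                     // -1 means no device selected
  for (int i = 0; i < numberOfDevices; ++i) {
    size_t freeMemory = 0, totalMemory = 0;
    cudaSetDevice(i);
    cudaMemGetInfo(&freeMemory, &totalMemory);
    if (freeMemory > maxFreeMemory) {
      maxFreeMemory = freeMemory;
      device = i;
    }
  }
  return device;
}
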
11 changes: 6 additions & 5 deletions HeterogeneousCore/CUDAServices/test/testCUDAService.cpp
@@ -14,7 +14,6 @@
#include "FWCore/ServiceRegistry/interface/ActivityRegistry.h"
#include "FWCore/Utilities/interface/Exception.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"

namespace {
CUDAService makeCUDAService(edm::ParameterSet ps, edm::ActivityRegistry& ar) {
@@ -30,10 +29,13 @@ TEST_CASE("Tests of CUDAService", "[CUDAService]") {

// Test setup: check if a simple CUDA runtime API call fails:
// if so, skip the test with the CUDAService enabled
int deviceCount = supportedCUDADevices().size();
int deviceCount = 0;
auto ret = cudaGetDeviceCount( &deviceCount );

if (deviceCount == 0) {
WARN("No supported CUDA devices available. Running only tests not requiring devices.");
if( ret != cudaSuccess ) {
WARN("Unable to query the CUDA capable devices from the CUDA runtime API: ("
<< ret << ") " << cudaGetErrorString( ret )
<< ". Running only tests not requiring devices.");
}

SECTION("CUDAService enabled") {
@@ -56,7 +58,6 @@ }
}

auto cs = makeCUDAService(ps, ar);
cudaError_t ret;

SECTION("CUDA Queries") {
int driverVersion = 0, runtimeVersion = 0;
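
The reworked guard in the test follows the usual pattern of querying the device count and reporting the runtime error string when the call fails. A self-contained version of just that guard (assumed standalone, outside the Catch2 test) would be:

#include <cuda_runtime.h>
#include <iostream>

int main() {
  int deviceCount = 0;
  auto ret = cudaGetDeviceCount(&deviceCount);
  if (ret != cudaSuccess) {
    // cudaError_t streams as an int; cudaGetErrorString gives the readable text.
    std::cerr << "Unable to query the CUDA capable devices from the CUDA runtime API: ("
              << ret << ") " << cudaGetErrorString(ret) << '\n';
    return 0;  // skip device-dependent checks rather than fail
  }
  std::cout << "Found " << deviceCount << " CUDA device(s)\n";
  return 0;
}
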

This file was deleted.

6 changes: 0 additions & 6 deletions HeterogeneousCore/CUDAUtilities/src/exitSansCUDADevices.cc
@@ -4,7 +4,6 @@
#include <cuda_runtime.h>

#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"

void exitSansCUDADevices() {
int devices = 0;
@@ -17,9 +16,4 @@ void exitSansCUDADevices() {
std::cerr << "No CUDA devices available, the test will be skipped." << "\n";
exit(EXIT_SUCCESS);
}
int supported = supportedCUDADevices().size();
if (supported == 0) {
std::cerr << "No supported CUDA devices available, the test will be skipped." << "\n";
exit(EXIT_SUCCESS);
}
}
42 changes: 0 additions & 42 deletions HeterogeneousCore/CUDAUtilities/src/supportedCUDADevices.cu

This file was deleted.