Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 24 additions & 2 deletions HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "FWCore/ServiceRegistry/interface/ServiceMaker.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/deviceAllocatorStatus.h"

namespace edm {
class StreamContext;
Expand All @@ -27,6 +28,7 @@ class CUDAMonitoringService {

void postModuleConstruction(edm::ModuleDescription const& desc);
void postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc);
void postModuleEvent(edm::StreamContext const& sc, edm::ModuleCallingContext const& mcc);
void postEvent(edm::StreamContext const& sc);

private:
Expand All @@ -46,6 +48,9 @@ CUDAMonitoringService::CUDAMonitoringService(edm::ParameterSet const& config, ed
if (config.getUntrackedParameter<bool>("memoryBeginStream")) {
registry.watchPostModuleBeginStream(this, &CUDAMonitoringService::postModuleBeginStream);
}
if (config.getUntrackedParameter<bool>("memoryPerModule")) {
registry.watchPostModuleEvent(this, &CUDAMonitoringService::postModuleEvent);
}
if (config.getUntrackedParameter<bool>("memoryPerEvent")) {
registry.watchPostEvent(this, &CUDAMonitoringService::postEvent);
}
Expand All @@ -58,6 +63,8 @@ void CUDAMonitoringService::fillDescriptions(edm::ConfigurationDescriptions& des
->setComment("Print memory information for each device after the construction of each module");
desc.addUntracked<bool>("memoryBeginStream", true)
->setComment("Print memory information for each device after the beginStream() of each module");
desc.addUntracked<bool>("memoryPerModule", true)
->setComment("Print memory information for each device after the event of each module");
desc.addUntracked<bool>("memoryPerEvent", true)
->setComment("Print memory information for each device after each event");

Expand All @@ -71,15 +78,23 @@ void CUDAMonitoringService::fillDescriptions(edm::ConfigurationDescriptions& des
namespace {
// Print, into the given log stream, the used/total device memory reported by
// the CUDA runtime for each of the first `num` devices, followed (when
// available) by the live/free/cached statistics of the CachingDeviceAllocator
// for that device. Restores the previously-current device before returning.
//
// NOTE(review): the pasted diff contained both the pre- and post-change stream
// statements; this is the reconstructed post-change version.
template <typename T>
void dumpUsedMemory(T& log, int num) {
  // snapshot of the caching allocator counters, keyed by device ordinal
  auto const cachingDeviceAllocatorStatus = cms::cuda::deviceAllocatorStatus();
  // remember the current device so it can be restored at the end
  int old = 0;
  cudaCheck(cudaGetDevice(&old));
  constexpr auto mbytes = 1 << 20;  // bytes per MB
  for (int i = 0; i < num; ++i) {
    size_t freeMemory, totalMemory;
    cudaCheck(cudaSetDevice(i));
    cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
    log << "\n"
        << i << ": " << (totalMemory - freeMemory) / mbytes << " MB used / " << totalMemory / mbytes << " MB total";
    auto found = cachingDeviceAllocatorStatus.find(i);
    if (found != cachingDeviceAllocatorStatus.end()) {
      auto const& cached = found->second;
      log << "; CachingDeviceAllocator " << cached.live / mbytes << " MB live "
          << "(" << cached.liveRequested / mbytes << " MB requested) " << cached.free / mbytes << " MB free "
          << (cached.live + cached.free) / mbytes << " MB total cached";
    }
  }
  cudaCheck(cudaSetDevice(old));
}
Expand All @@ -98,6 +113,13 @@ void CUDAMonitoringService::postModuleBeginStream(edm::StreamContext const&, edm
dumpUsedMemory(log, numberOfDevices_);
}

// Callback for PostModuleEvent: print per-device memory usage after the named
// module has processed an event.
void CUDAMonitoringService::postModuleEvent(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) {
  auto const* description = mcc.moduleDescription();
  auto log = edm::LogPrint("CUDAMonitoringService");
  log << "CUDA device memory after processing an event by " << description->moduleLabel() << " ("
      << description->moduleName() << ")";
  dumpUsedMemory(log, numberOfDevices_);
}

void CUDAMonitoringService::postEvent(edm::StreamContext const& sc) {
auto log = edm::LogPrint("CUDAMonitoringService");
log << "CUDA device memory after event";
Expand Down
70 changes: 69 additions & 1 deletion HeterogeneousCore/CUDAServices/scripts/cmsCudaRebuild.sh
Original file line number Diff line number Diff line change
@@ -1,10 +1,78 @@
#! /bin/bash -e

# Print usage instructions (heredoc body is emitted verbatim).
function help() {
cat <<@EOF
Usage:
cmsCudaRebuild.sh [-g|-G] [-v] [-h]

Check out and build all packages that contain CUDA code in .cu files.

Options:
-g Compile with debugging symbols, passing
"-g -rdynamic" to the host compiler, and
"-g -lineinfo" to CUDA compiler

-G Compile with debugging symbols and enable asserts on the GPU, passing
"-g -rdynamic -DGPU_DEBUG" to the host compiler, and
"-g -lineinfo -DGPU_DEBUG" to the CUDA compiler.

-h Show this help, and exit.

-v Make scram be verbose.

@EOF
}

# command-line option state
DEBUG=0
VERBOSE=0

# parse the command line; any unknown argument prints the help and fails
while [ "$*" ]; do
  case "$1" in
    -h)
      help
      exit 0
      ;;
    -g)
      DEBUG=1
      shift
      ;;
    -G)
      DEBUG=2
      shift
      ;;
    -v)
      VERBOSE=$((VERBOSE + 1))
      shift
      ;;
    *)
      help
      exit 1
      ;;
  esac
done

# move to the .../src directory
cd $CMSSW_BASE/src/

# check out all packages containing .cu files
git ls-files --full-name | grep '.*\.cu$' | cut -d/ -f-2 | sort -u | xargs git cms-addpkg

# set additional compilation flags
if (( DEBUG == 1 )); then
  export USER_CXXFLAGS="-g -rdynamic $USER_CXXFLAGS"
  export USER_CUDA_FLAGS="-g -lineinfo $USER_CUDA_FLAGS"
elif (( DEBUG == 2 )); then
  export USER_CXXFLAGS="-g -rdynamic -DGPU_DEBUG $USER_CXXFLAGS"
  export USER_CUDA_FLAGS="-g -lineinfo -DGPU_DEBUG $USER_CUDA_FLAGS"
fi

if (( VERBOSE > 0 )); then
  SCRAM_VERBOSE="-v"
fi

# clean all built packages
scram b clean

# rebuild all checked out packages
# (the pre-change line "scram b -j" was stale diff residue and has been removed
# so that the build runs only once)
scram b $SCRAM_VERBOSE -j
3 changes: 2 additions & 1 deletion HeterogeneousCore/CUDAServices/scripts/cmsCudaSetup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@ DOTS=$(cudaComputeCapabilities | awk '{ print $2 }' | sort -u)
CAPS=$(echo $DOTS | sed -e's#\.*##g')

# remove existing capabilities
sed -i $TOOL -e'\#<flags CUDA_FLAGS="-gencode arch=compute_..,code=sm_.."/>#d'
sed -i $TOOL -e"s#-gencode arch=compute_..,code=sm_.. *##g"
sed -i $TOOL -e"\#<flags CUDA_FLAGS=\"\"/>#d"

# add support for the capabilities found on this machine
for CAP in $CAPS; do
Expand Down
21 changes: 21 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/copyAsync.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,16 @@

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"

#include <type_traits>

namespace cms {
namespace cuda {

// Single element

template <typename T>
inline void copyAsync(device::unique_ptr<T>& dst, const host::unique_ptr<T>& src, cudaStream_t stream) {
// Shouldn't compile for array types because of sizeof(T), but
Expand All @@ -19,6 +22,15 @@ namespace cms {
cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyHostToDevice, stream));
}

// Asynchronously copy a single element from a non-cached (write-combining)
// host buffer to the device, on the given CUDA stream.
template <typename T>
inline void copyAsync(device::unique_ptr<T>& dst, const host::noncached::unique_ptr<T>& src, cudaStream_t stream) {
  // sizeof(T) would already fail to compile for array types; the static_assert
  // provides a clearer diagnostic pointing to the sized overload
  static_assert(not std::is_array<T>::value,
                "For array types, use the other overload with the size parameter");
  cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyHostToDevice, stream));
}

template <typename T>
inline void copyAsync(host::unique_ptr<T>& dst, const device::unique_ptr<T>& src, cudaStream_t stream) {
static_assert(std::is_array<T>::value == false,
Expand All @@ -27,6 +39,7 @@ namespace cms {
}

// Multiple elements

template <typename T>
inline void copyAsync(device::unique_ptr<T[]>& dst,
const host::unique_ptr<T[]>& src,
Expand All @@ -35,6 +48,14 @@ namespace cms {
cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyHostToDevice, stream));
}

// Asynchronously copy `nelements` elements from a non-cached (write-combining)
// host buffer to the device, on the given CUDA stream.
template <typename T>
inline void copyAsync(device::unique_ptr<T[]>& dst,
                      const host::noncached::unique_ptr<T[]>& src,
                      size_t nelements,
                      cudaStream_t stream) {
  const size_t nbytes = nelements * sizeof(T);
  cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nbytes, cudaMemcpyHostToDevice, stream));
}

template <typename T>
inline void copyAsync(host::unique_ptr<T[]>& dst,
const device::unique_ptr<T[]>& src,
Expand Down
7 changes: 7 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,13 @@ namespace cms {
extern thread_local dim3 blockIdx;
extern thread_local dim3 gridDim;

// Serial CPU stand-in for CUDA's atomicCAS: store `val` at `address` only if
// the current value equals `compare`, and return the value that was read.
// No actual atomicity is provided (single-threaded compatibility layer).
template <typename T1, typename T2>
T1 atomicCAS(T1* address, T1 compare, T2 val) {
  T1 previous = *address;
  if (previous == compare) {
    *address = val;
  }
  return previous;
}

template <typename T1, typename T2>
T1 atomicInc(T1* a, T2 b) {
auto ret = *a;
Expand Down
63 changes: 0 additions & 63 deletions HeterogeneousCore/CUDAUtilities/interface/cuda_cxx17.h

This file was deleted.

23 changes: 23 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/deviceAllocatorStatus.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#ifndef HeterogeneousCore_CUDAUtilities_deviceAllocatorStatus_h
#define HeterogeneousCore_CUDAUtilities_deviceAllocatorStatus_h

#include <map>

namespace cms {
  namespace cuda {
    namespace allocator {
      // Byte counters for the caching device allocator, all zero-initialized.
      struct TotalBytes {
        size_t free = 0;           // bytes held in the cache, available for reuse
        size_t live = 0;           // bytes currently handed out to clients
        size_t liveRequested = 0;  // CMS: monitor also requested amount
      };
      /// Maps a device ordinal to the byte counters of that device's cache
      using GpuCachedBytes = std::map<int, TotalBytes>;
    }  // namespace allocator

    // Returns a snapshot of the caching allocator counters for every device.
    allocator::GpuCachedBytes deviceAllocatorStatus();
  }  // namespace cuda
}  // namespace cms

#endif
11 changes: 7 additions & 4 deletions HeterogeneousCore/CUDAUtilities/interface/prefixScan.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,9 @@ namespace cms {
namespace cuda {

// limited to 32*32 elements....
template <typename T>
__host__ __device__ __forceinline__ void blockPrefixScan(T const* __restrict__ ci,
T* __restrict__ co,
template <typename VT, typename T>
__host__ __device__ __forceinline__ void blockPrefixScan(VT const* ci,
VT* co,
uint32_t size,
T* ws
#ifndef __CUDA_ARCH__
Expand Down Expand Up @@ -138,7 +138,9 @@ namespace cms {

// in principle not limited....
template <typename T>
__global__ void multiBlockPrefixScan(T const* ci, T* co, int32_t size, int32_t* pc) {
__global__ void multiBlockPrefixScan(T const* ici, T* ico, int32_t size, int32_t* pc) {
volatile T const* ci = ici;
volatile T* co = ico;
__shared__ T ws[32];
#ifdef __CUDA_ARCH__
assert(sizeof(T) * gridDim.x <= dynamic_smem_size()); // size of psum below
Expand All @@ -152,6 +154,7 @@ namespace cms {
// count blocks that finished
__shared__ bool isLastBlockDone;
if (0 == threadIdx.x) {
__threadfence();
auto value = atomicAdd(pc, 1); // block counter
isLastBlockDone = (value == (int(gridDim.x) - 1));
}
Expand Down
Loading