Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 24 additions & 2 deletions HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "FWCore/ServiceRegistry/interface/ServiceMaker.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/deviceAllocatorStatus.h"

namespace edm {
class StreamContext;
Expand All @@ -27,6 +28,7 @@ class CUDAMonitoringService {

void postModuleConstruction(edm::ModuleDescription const& desc);
void postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc);
void postModuleEvent(edm::StreamContext const& sc, edm::ModuleCallingContext const& mcc);
void postEvent(edm::StreamContext const& sc);

private:
Expand All @@ -46,6 +48,9 @@ CUDAMonitoringService::CUDAMonitoringService(edm::ParameterSet const& config, ed
if (config.getUntrackedParameter<bool>("memoryBeginStream")) {
registry.watchPostModuleBeginStream(this, &CUDAMonitoringService::postModuleBeginStream);
}
if (config.getUntrackedParameter<bool>("memoryPerModule")) {
registry.watchPostModuleEvent(this, &CUDAMonitoringService::postModuleEvent);
}
if (config.getUntrackedParameter<bool>("memoryPerEvent")) {
registry.watchPostEvent(this, &CUDAMonitoringService::postEvent);
}
Expand All @@ -58,6 +63,8 @@ void CUDAMonitoringService::fillDescriptions(edm::ConfigurationDescriptions& des
->setComment("Print memory information for each device after the construction of each module");
desc.addUntracked<bool>("memoryBeginStream", true)
->setComment("Print memory information for each device after the beginStream() of each module");
desc.addUntracked<bool>("memoryPerModule", true)
->setComment("Print memory information for each device after the event of each module");
desc.addUntracked<bool>("memoryPerEvent", true)
->setComment("Print memory information for each device after each event");

Expand All @@ -71,15 +78,23 @@ void CUDAMonitoringService::fillDescriptions(edm::ConfigurationDescriptions& des
namespace {
// Print, into the given log stream, the used/total device memory reported by
// the CUDA runtime for each of the first `num` devices, followed (when
// available) by the live/free/cached statistics of the CachingDeviceAllocator
// for that device. Restores the previously-current device before returning.
//
// NOTE(review): the pasted diff contained both the pre- and post-change stream
// statements; this is the reconstructed post-change version.
template <typename T>
void dumpUsedMemory(T& log, int num) {
  // snapshot of the caching allocator counters, keyed by device ordinal
  auto const cachingDeviceAllocatorStatus = cms::cuda::deviceAllocatorStatus();
  // remember the current device so it can be restored at the end
  int old = 0;
  cudaCheck(cudaGetDevice(&old));
  constexpr auto mbytes = 1 << 20;  // bytes per MB
  for (int i = 0; i < num; ++i) {
    size_t freeMemory, totalMemory;
    cudaCheck(cudaSetDevice(i));
    cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
    log << "\n"
        << i << ": " << (totalMemory - freeMemory) / mbytes << " MB used / " << totalMemory / mbytes << " MB total";
    auto found = cachingDeviceAllocatorStatus.find(i);
    if (found != cachingDeviceAllocatorStatus.end()) {
      auto const& cached = found->second;
      log << "; CachingDeviceAllocator " << cached.live / mbytes << " MB live "
          << "(" << cached.liveRequested / mbytes << " MB requested) " << cached.free / mbytes << " MB free "
          << (cached.live + cached.free) / mbytes << " MB total cached";
    }
  }
  cudaCheck(cudaSetDevice(old));
}
Expand All @@ -98,6 +113,13 @@ void CUDAMonitoringService::postModuleBeginStream(edm::StreamContext const&, edm
dumpUsedMemory(log, numberOfDevices_);
}

// Callback for PostModuleEvent: print per-device memory usage after the named
// module has processed an event.
void CUDAMonitoringService::postModuleEvent(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) {
  auto const* description = mcc.moduleDescription();
  auto log = edm::LogPrint("CUDAMonitoringService");
  log << "CUDA device memory after processing an event by " << description->moduleLabel() << " ("
      << description->moduleName() << ")";
  dumpUsedMemory(log, numberOfDevices_);
}

void CUDAMonitoringService::postEvent(edm::StreamContext const& sc) {
auto log = edm::LogPrint("CUDAMonitoringService");
log << "CUDA device memory after event";
Expand Down
70 changes: 69 additions & 1 deletion HeterogeneousCore/CUDAServices/scripts/cmsCudaRebuild.sh
Original file line number Diff line number Diff line change
@@ -1,10 +1,78 @@
#! /bin/bash -e

# Print usage instructions (heredoc body is emitted verbatim).
function help() {
cat <<@EOF
Usage:
cmsCudaRebuild.sh [-g|-G] [-v] [-h]

Check out and build all packages that contain CUDA code in .cu files.

Options:
-g Compile with debugging symbols, passing
"-g -rdynamic" to the host compiler, and
"-g -lineinfo" to CUDA compiler

-G Compile with debugging symbols and enable asserts on the GPU, passing
"-g -rdynamic -DGPU_DEBUG" to the host compiler, and
"-g -lineinfo -DGPU_DEBUG" to the CUDA compiler.

-h Show this help, and exit.

-v Make scram be verbose.

@EOF
}

# command-line option state
DEBUG=0
VERBOSE=0

# parse the command line; any unknown argument prints the help and fails
while [ "$*" ]; do
  case "$1" in
    -h)
      help
      exit 0
      ;;
    -g)
      DEBUG=1
      shift
      ;;
    -G)
      DEBUG=2
      shift
      ;;
    -v)
      VERBOSE=$((VERBOSE + 1))
      shift
      ;;
    *)
      help
      exit 1
      ;;
  esac
done

# move to the .../src directory
cd $CMSSW_BASE/src/

# check out all packages containing .cu files
git ls-files --full-name | grep '.*\.cu$' | cut -d/ -f-2 | sort -u | xargs git cms-addpkg

# set additional compilation flags
if (( DEBUG == 1 )); then
  export USER_CXXFLAGS="-g -rdynamic $USER_CXXFLAGS"
  export USER_CUDA_FLAGS="-g -lineinfo $USER_CUDA_FLAGS"
elif (( DEBUG == 2 )); then
  export USER_CXXFLAGS="-g -rdynamic -DGPU_DEBUG $USER_CXXFLAGS"
  export USER_CUDA_FLAGS="-g -lineinfo -DGPU_DEBUG $USER_CUDA_FLAGS"
fi

if (( VERBOSE > 0 )); then
  SCRAM_VERBOSE="-v"
fi

# clean all built packages
scram b clean

# rebuild all checked out packages
# (the pre-change line "scram b -j" was stale diff residue and has been removed
# so that the build runs only once)
scram b $SCRAM_VERBOSE -j
3 changes: 2 additions & 1 deletion HeterogeneousCore/CUDAServices/scripts/cmsCudaSetup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@ DOTS=$(cudaComputeCapabilities | awk '{ print $2 }' | sort -u)
CAPS=$(echo $DOTS | sed -e's#\.*##g')

# remove existing capabilities
sed -i $TOOL -e'\#<flags CUDA_FLAGS="-gencode arch=compute_..,code=sm_.."/>#d'
sed -i $TOOL -e"s#-gencode arch=compute_..,code=sm_.. *##g"
sed -i $TOOL -e"\#<flags CUDA_FLAGS=\"\"/>#d"

# add support for the capabilities found on this machine
for CAP in $CAPS; do
Expand Down
21 changes: 21 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/copyAsync.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,16 @@

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"

#include <type_traits>

namespace cms {
namespace cuda {

// Single element

template <typename T>
inline void copyAsync(device::unique_ptr<T>& dst, const host::unique_ptr<T>& src, cudaStream_t stream) {
// Shouldn't compile for array types because of sizeof(T), but
Expand All @@ -19,6 +22,15 @@ namespace cms {
cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyHostToDevice, stream));
}

// Asynchronously copy a single element from a non-cached (write-combining)
// host buffer to the device, on the given CUDA stream.
template <typename T>
inline void copyAsync(device::unique_ptr<T>& dst, const host::noncached::unique_ptr<T>& src, cudaStream_t stream) {
  // sizeof(T) would already fail to compile for array types; the static_assert
  // provides a clearer diagnostic pointing to the sized overload
  static_assert(not std::is_array<T>::value,
                "For array types, use the other overload with the size parameter");
  cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), sizeof(T), cudaMemcpyHostToDevice, stream));
}

template <typename T>
inline void copyAsync(host::unique_ptr<T>& dst, const device::unique_ptr<T>& src, cudaStream_t stream) {
static_assert(std::is_array<T>::value == false,
Expand All @@ -27,6 +39,7 @@ namespace cms {
}

// Multiple elements

template <typename T>
inline void copyAsync(device::unique_ptr<T[]>& dst,
const host::unique_ptr<T[]>& src,
Expand All @@ -35,6 +48,14 @@ namespace cms {
cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nelements * sizeof(T), cudaMemcpyHostToDevice, stream));
}

// Asynchronously copy `nelements` elements from a non-cached (write-combining)
// host buffer to the device, on the given CUDA stream.
template <typename T>
inline void copyAsync(device::unique_ptr<T[]>& dst,
                      const host::noncached::unique_ptr<T[]>& src,
                      size_t nelements,
                      cudaStream_t stream) {
  const size_t nbytes = nelements * sizeof(T);
  cudaCheck(cudaMemcpyAsync(dst.get(), src.get(), nbytes, cudaMemcpyHostToDevice, stream));
}

template <typename T>
inline void copyAsync(host::unique_ptr<T[]>& dst,
const device::unique_ptr<T[]>& src,
Expand Down
7 changes: 7 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,13 @@ namespace cms {
extern thread_local dim3 blockIdx;
extern thread_local dim3 gridDim;

// Serial CPU stand-in for CUDA's atomicCAS: store `val` at `address` only if
// the current value equals `compare`, and return the value that was read.
// No actual atomicity is provided (single-threaded compatibility layer).
template <typename T1, typename T2>
T1 atomicCAS(T1* address, T1 compare, T2 val) {
  T1 previous = *address;
  if (previous == compare) {
    *address = val;
  }
  return previous;
}

template <typename T1, typename T2>
T1 atomicInc(T1* a, T2 b) {
auto ret = *a;
Expand Down
63 changes: 0 additions & 63 deletions HeterogeneousCore/CUDAUtilities/interface/cuda_cxx17.h

This file was deleted.

23 changes: 23 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/deviceAllocatorStatus.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#ifndef HeterogeneousCore_CUDAUtilities_deviceAllocatorStatus_h
#define HeterogeneousCore_CUDAUtilities_deviceAllocatorStatus_h

#include <map>

namespace cms {
  namespace cuda {
    namespace allocator {
      // Byte counters for the caching device allocator, all zero-initialized.
      struct TotalBytes {
        size_t free = 0;           // bytes held in the cache, available for reuse
        size_t live = 0;           // bytes currently handed out to clients
        size_t liveRequested = 0;  // CMS: monitor also requested amount
      };
      /// Maps a device ordinal to the byte counters of that device's cache
      using GpuCachedBytes = std::map<int, TotalBytes>;
    }  // namespace allocator

    // Returns a snapshot of the caching allocator counters for every device.
    allocator::GpuCachedBytes deviceAllocatorStatus();
  }  // namespace cuda
}  // namespace cms

#endif
11 changes: 7 additions & 4 deletions HeterogeneousCore/CUDAUtilities/interface/prefixScan.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,9 @@ namespace cms {
namespace cuda {

// limited to 32*32 elements....
template <typename T>
__host__ __device__ __forceinline__ void blockPrefixScan(T const* __restrict__ ci,
T* __restrict__ co,
template <typename VT, typename T>
__host__ __device__ __forceinline__ void blockPrefixScan(VT const* ci,
VT* co,
uint32_t size,
T* ws
#ifndef __CUDA_ARCH__
Expand Down Expand Up @@ -138,7 +138,9 @@ namespace cms {

// in principle not limited....
template <typename T>
__global__ void multiBlockPrefixScan(T const* ci, T* co, int32_t size, int32_t* pc) {
__global__ void multiBlockPrefixScan(T const* ici, T* ico, int32_t size, int32_t* pc) {
volatile T const* ci = ici;
volatile T* co = ico;
__shared__ T ws[32];
#ifdef __CUDA_ARCH__
assert(sizeof(T) * gridDim.x <= dynamic_smem_size()); // size of psum below
Expand All @@ -152,6 +154,7 @@ namespace cms {
// count blocks that finished
__shared__ bool isLastBlockDone;
if (0 == threadIdx.x) {
__threadfence();
auto value = atomicAdd(pc, 1); // block counter
isLastBlockDone = (value == (int(gridDim.x) - 1));
}
Expand Down
Loading