10 changes: 8 additions & 2 deletions HeterogeneousCore/CUDACore/src/GPUCuda.cc
@@ -34,14 +34,20 @@ namespace heterogeneous {
return;
}

// TODO: possible ideas to improve the "assignment" logic include
// For starters we "statically" assign the device based on
// edm::Stream number. This is suboptimal if the number of
// edm::Streams is not a multiple of the number of CUDA devices
// (and even then there is no load balancing).
//
// TODO: improve. Possible ideas include
// - allocate M (< N(edm::Streams)) buffers per device per module, choose dynamically which (buffer, device) to use
// * the first module of a chain dictates the device for the rest of the chain
// - our own CUDA memory allocator
// * being able to cheaply allocate+deallocate scratch memory allows to make the execution fully dynamic e.g. based on current load
// * would probably still need some buffer space/device to hold e.g. conditions data
// - for conditions, how to handle multiple lumis per job?
deviceId_ = cudacore::chooseCUDADevice(id);
deviceId_ = id % cudaService->numberOfDevices();

cuda::device::current::scoped_override_t<> setDeviceForThisScope(deviceId_);

// Create the CUDA stream for this module-edm::Stream pair
2 changes: 1 addition & 1 deletion HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc
@@ -13,6 +13,6 @@ namespace cudacore {
// (and even then there is no load balancing).
//
// TODO: improve the "assignment" logic
return cudaService->devices()[id % cudaService->numberOfDevices()];
return id % cudaService->numberOfDevices();
}
}
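
As a quick illustration of the assignment logic touched in these two files: the mapping is a plain round-robin of edm::Stream numbers over the available devices. The snippet below is a standalone sketch (not part of this PR, with an assumed device count of 3) showing why a stream count that is not a multiple of the device count leaves the load uneven.

#include <iostream>

// Standalone sketch of the static assignment; chooseDevice stands in for
// cudacore::chooseCUDADevice(id) / the inlined "id % numberOfDevices" above.
int chooseDevice(int streamId, int numberOfDevices) {
  return streamId % numberOfDevices;
}

int main() {
  const int numberOfDevices = 3;  // assumed value for the example
  for (int streamId = 0; streamId < 4; ++streamId) {
    // With 4 edm::Streams and 3 devices, device 0 gets two streams and the
    // others get one each: static round-robin does no load balancing.
    std::cout << "edm::Stream " << streamId << " -> CUDA device "
              << chooseDevice(streamId, numberOfDevices) << '\n';
  }
  return 0;
}
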
1 change: 0 additions & 1 deletion HeterogeneousCore/CUDAServices/bin/BuildFile.xml
@@ -4,5 +4,4 @@

<bin name="cudaIsEnabled" file="cudaIsEnabled.cpp">
<use name="cuda"/>
<use name="HeterogeneousCore/CUDAUtilities"/>
</bin>
28 changes: 26 additions & 2 deletions HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp
@@ -1,7 +1,31 @@
#include <algorithm>
#include <array>
#include <cstdlib>
#include <iostream>

#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"
#include <cuda_runtime.h>

int main() {
return supportedCUDADevices().empty() ? EXIT_FAILURE : EXIT_SUCCESS;
int devices = 0;
auto status = cudaGetDeviceCount(& devices);
if (status != cudaSuccess) {
return EXIT_FAILURE;
}

int minimumMajor = 6; // min minor is implicitly 0

// This approach (requiring all devices are supported) is rather
// conservative. In principle we could consider just dropping the
// unsupported devices. Currently that would be easiest to achieve
// in CUDAService though.
for (int i = 0; i < devices; ++i) {
cudaDeviceProp properties;
cudaGetDeviceProperties(&properties, i);

if(properties.major < minimumMajor) {
return EXIT_FAILURE;
}
}

return EXIT_SUCCESS;
}
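
The comment above notes that requiring every device to pass the compute-capability check is conservative, and that unsupported devices could instead just be dropped. A rough sketch of that alternative (illustrative only, not part of this change; the helper name is hypothetical) could look like:

#include <cuda_runtime.h>
#include <vector>

// Hypothetical helper that keeps only the devices meeting the compute
// capability floor, instead of failing when any device is below it.
std::vector<int> usableDevices(int minimumMajor = 6) {
  std::vector<int> result;
  int devices = 0;
  if (cudaGetDeviceCount(&devices) != cudaSuccess) {
    return result;  // no usable CUDA runtime -> empty list
  }
  for (int i = 0; i < devices; ++i) {
    cudaDeviceProp properties;
    if (cudaGetDeviceProperties(&properties, i) == cudaSuccess
        && properties.major >= minimumMajor) {
      result.push_back(i);
    }
  }
  return result;
}
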
4 changes: 0 additions & 4 deletions HeterogeneousCore/CUDAServices/interface/CUDAService.h
@@ -52,9 +52,6 @@ class CUDAService {

int numberOfDevices() const { return numberOfDevices_; }

// devices supported by the CUDA configuration and compilation flags
std::vector<int> const& devices() const { return supportedDevices_; }

// major, minor
std::pair<int, int> computeCapability(int device) { return computeCapabilities_.at(device); }

@@ -155,7 +152,6 @@ class CUDAService {
std::unique_ptr<CUDAEventCache> cudaEventCache_;

int numberOfDevices_ = 0;
std::vector<int> supportedDevices_;
std::vector<std::pair<int, int>> computeCapabilities_;
bool enabled_ = false;
};
14 changes: 7 additions & 7 deletions HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc
@@ -30,15 +30,15 @@ class CUDAMonitoringService {
void postEvent(edm::StreamContext const& sc);

private:
std::vector<int> devices_;
int numberOfDevices_ = 0;
};

CUDAMonitoringService::CUDAMonitoringService(edm::ParameterSet const& config, edm::ActivityRegistry& registry) {
// make sure that CUDA is initialised, and that the CUDAService destructor is called after this service's destructor
edm::Service<CUDAService> cudaService;
if(!cudaService->enabled())
return;
devices_ = cudaService->devices();
numberOfDevices_ = cudaService->numberOfDevices();

if(config.getUntrackedParameter<bool>("memoryConstruction")) {
registry.watchPostModuleConstruction(this, &CUDAMonitoringService::postModuleConstruction);
@@ -66,10 +66,10 @@ void CUDAMonitoringService::fillDescriptions(edm::ConfigurationDescriptions & de
// activity handlers
namespace {
template <typename T>
void dumpUsedMemory(T& log, std::vector<int> const& devices) {
void dumpUsedMemory(T& log, int num) {
int old = 0;
cudaCheck(cudaGetDevice(&old));
for(int i: devices) {
for(int i = 0; i < num; ++i) {
size_t freeMemory, totalMemory;
cudaCheck(cudaSetDevice(i));
cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
@@ -82,19 +82,19 @@ namespace {
void CUDAMonitoringService::postModuleConstruction(edm::ModuleDescription const& desc) {
auto log = edm::LogPrint("CUDAMonitoringService");
log << "CUDA device memory after construction of " << desc.moduleLabel() << " (" << desc.moduleName() << ")";
dumpUsedMemory(log, devices_);
dumpUsedMemory(log, numberOfDevices_);
}

void CUDAMonitoringService::postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) {
auto log = edm::LogPrint("CUDAMonitoringService");
log<< "CUDA device memory after beginStream() of " << mcc.moduleDescription()->moduleLabel() << " (" << mcc.moduleDescription()->moduleName() << ")";
dumpUsedMemory(log, devices_);
dumpUsedMemory(log, numberOfDevices_);
}

void CUDAMonitoringService::postEvent(edm::StreamContext const& sc) {
auto log = edm::LogPrint("CUDAMonitoringService");
log << "CUDA device memory after event";
dumpUsedMemory(log, devices_);
dumpUsedMemory(log, numberOfDevices_);
}

DEFINE_FWK_SERVICE(CUDAMonitoringService);
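
For reference, the per-device loop in dumpUsedMemory() boils down to saving the current device, visiting each device index in turn, and querying cudaMemGetInfo(). A minimal standalone version (assuming std::cout in place of edm::LogPrint, and omitting the cudaCheck error handling) is:

#include <cuda_runtime.h>
#include <iostream>

void dumpUsedMemory(int numberOfDevices) {
  int old = 0;
  cudaGetDevice(&old);                 // remember the currently active device
  for (int i = 0; i < numberOfDevices; ++i) {
    size_t freeMemory = 0, totalMemory = 0;
    cudaSetDevice(i);
    cudaMemGetInfo(&freeMemory, &totalMemory);
    std::cout << "  device " << i << ": used "
              << (totalMemory - freeMemory) << " / " << totalMemory << " bytes\n";
  }
  cudaSetDevice(old);                  // restore the original device
}
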
32 changes: 16 additions & 16 deletions HeterogeneousCore/CUDAServices/src/CUDAService.cc
@@ -12,7 +12,6 @@
#include "FWCore/Utilities/interface/ReusableObjectHolder.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"

#include "CachingDeviceAllocator.h"
#include "CachingHostAllocator.h"
@@ -95,10 +94,10 @@ namespace {
}
}

void devicePreallocate(CUDAService& cs, const std::vector<unsigned int>& bufferSizes) {
void devicePreallocate(CUDAService& cs, int numberOfDevices, const std::vector<unsigned int>& bufferSizes) {
int device;
cudaCheck(cudaGetDevice(&device));
for (int i : cs.devices()) {
for(int i=0; i<numberOfDevices; ++i) {
cudaCheck(cudaSetDevice(i));
preallocate<cudautils::device::unique_ptr>([&](size_t size, cuda::stream_t<>& stream) {
return cs.make_device_unique<char[]>(size, stream);
@@ -122,14 +121,14 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
return;
}

supportedDevices_ = supportedCUDADevices();
numberOfDevices_ = supportedDevices_.size();
if (numberOfDevices_ == 0) {
auto status = cudaGetDeviceCount(&numberOfDevices_);
if (cudaSuccess != status) {
edm::LogWarning("CUDAService") << "Failed to initialize the CUDA runtime.\n" << "Disabling the CUDAService.";
return;
}
edm::LogInfo log("CUDAService");
log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " supported compute devices.\n\n";
computeCapabilities_.reserve(numberOfDevices_);
log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n\n";

auto const& limits = config.getUntrackedParameter<edm::ParameterSet>("limits");
auto printfFifoSize = limits.getUntrackedParameter<int>("cudaLimitPrintfFifoSize");
@@ -138,20 +137,18 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
auto devRuntimeSyncDepth = limits.getUntrackedParameter<int>("cudaLimitDevRuntimeSyncDepth");
auto devRuntimePendingLaunchCount = limits.getUntrackedParameter<int>("cudaLimitDevRuntimePendingLaunchCount");

int lastDevice = supportedDevices_.back();
computeCapabilities_.resize(lastDevice + 1, std::make_pair(0, 0));
for (int i: supportedDevices_) {
for (int i = 0; i < numberOfDevices_; ++i) {
// read information about the compute device.
// see the documentation of cudaGetDeviceProperties() for more information.
cudaDeviceProp properties;
cudaCheck(cudaGetDeviceProperties(&properties, i));
log << "CUDA device " << i << ": " << properties.name << '\n';

// compute capabilities
computeCapabilities_[i] = std::make_pair(properties.major, properties.minor);
log << " compute capability: " << properties.major << "." << properties.minor << " (sm_" << properties.major << properties.minor << ")\n";
computeCapabilities_.emplace_back(properties.major, properties.minor);
log << " streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n';
log << " CUDA cores: " << std::setw(28) << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor) << '\n';
log << " CUDA cores: " << std::setw(28) << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor ) << '\n';
log << " single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio << ":1\n";

// compute mode
@@ -294,7 +291,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
size_t minCachedBytes = std::numeric_limits<size_t>::max();
int currentDevice;
cudaCheck(cudaGetDevice(&currentDevice));
for (int i: supportedDevices_) {
for (int i = 0; i < numberOfDevices_; ++i) {
size_t freeMemory, totalMemory;
cudaCheck(cudaSetDevice(i));
cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
@@ -343,7 +340,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
enabled_ = true;

// Preallocate buffers if asked to
devicePreallocate(*this, allocator.getUntrackedParameter<std::vector<unsigned int> >("devicePreallocate"));
devicePreallocate(*this, numberOfDevices_, allocator.getUntrackedParameter<std::vector<unsigned int> >("devicePreallocate"));
hostPreallocate(*this, allocator.getUntrackedParameter<std::vector<unsigned int> >("hostPreallocate"));
}

@@ -356,7 +353,7 @@ CUDAService::~CUDAService() {
cudaEventCache_.reset();
cudaStreamCache_.reset();

for (int i: supportedDevices_) {
for (int i = 0; i < numberOfDevices_; ++i) {
cudaCheck(cudaSetDevice(i));
cudaCheck(cudaDeviceSynchronize());
// Explicitly destroys and cleans up all resources associated with the current device in the
@@ -401,7 +398,7 @@ int CUDAService::deviceWithMostFreeMemory() const {

size_t maxFreeMemory = 0;
int device = -1;
for (int i: supportedDevices_) {
for(int i = 0; i < numberOfDevices_; ++i) {
/*
// TODO: understand why the api-wrappers version gives same value for all devices
auto device = cuda::device::get(i);
@@ -435,6 +432,9 @@ struct CUDAService::Allocator {
template <typename ...Args>
Allocator(size_t max, Args&&... args): maxAllocation(max), deviceAllocator(args...), hostAllocator(std::forward<Args>(args)...) {}

void devicePreallocate(int numberOfDevices, const std::vector<unsigned int>& bytes);
void hostPreallocate(int numberOfDevices, const std::vector<unsigned int>& bytes);

size_t maxAllocation;
notcub::CachingDeviceAllocator deviceAllocator;
notcub::CachingHostAllocator hostAllocator;
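
Similarly, deviceWithMostFreeMemory() (touched in the hunk above) just scans the devices and keeps the index with the largest free memory reported by cudaMemGetInfo(). A stripped-down sketch, without the framework's cudaCheck wrappers or device save/restore:

#include <cuda_runtime.h>

int deviceWithMostFreeMemory(int numberOfDevices) {
  size_t maxFreeMemory = 0;
  int device = -1;                     // -1 means no device selected
  for (int i = 0; i < numberOfDevices; ++i) {
    size_t freeMemory = 0, totalMemory = 0;
    cudaSetDevice(i);
    cudaMemGetInfo(&freeMemory, &totalMemory);
    if (freeMemory > maxFreeMemory) {
      maxFreeMemory = freeMemory;
      device = i;
    }
  }
  return device;
}
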
11 changes: 6 additions & 5 deletions HeterogeneousCore/CUDAServices/test/testCUDAService.cpp
@@ -14,7 +14,6 @@
#include "FWCore/ServiceRegistry/interface/ActivityRegistry.h"
#include "FWCore/Utilities/interface/Exception.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"

namespace {
CUDAService makeCUDAService(edm::ParameterSet ps, edm::ActivityRegistry& ar) {
@@ -30,10 +29,13 @@ TEST_CASE("Tests of CUDAService", "[CUDAService]") {

// Test setup: check if a simple CUDA runtime API call fails:
// if so, skip the test with the CUDAService enabled
int deviceCount = supportedCUDADevices().size();
int deviceCount = 0;
auto ret = cudaGetDeviceCount( &deviceCount );

if (deviceCount == 0) {
WARN("No supported CUDA devices available. Running only tests not requiring devices.");
if( ret != cudaSuccess ) {
WARN("Unable to query the CUDA capable devices from the CUDA runtime API: ("
<< ret << ") " << cudaGetErrorString( ret )
<< ". Running only tests not requiring devices.");
}

SECTION("CUDAService enabled") {
@@ -56,7 +58,6 @@ }
}

auto cs = makeCUDAService(ps, ar);
cudaError_t ret;

SECTION("CUDA Queries") {
int driverVersion = 0, runtimeVersion = 0;
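
The reworked guard in the test follows the usual pattern of querying the device count and reporting the runtime error string when the call fails. A self-contained version of just that guard (assumed standalone, outside the Catch2 test) would be:

#include <cuda_runtime.h>
#include <iostream>

int main() {
  int deviceCount = 0;
  auto ret = cudaGetDeviceCount(&deviceCount);
  if (ret != cudaSuccess) {
    // cudaError_t streams as an int; cudaGetErrorString gives the readable text.
    std::cerr << "Unable to query the CUDA capable devices from the CUDA runtime API: ("
              << ret << ") " << cudaGetErrorString(ret) << '\n';
    return 0;  // skip device-dependent checks rather than fail
  }
  std::cout << "Found " << deviceCount << " CUDA device(s)\n";
  return 0;
}
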

This file was deleted.

6 changes: 0 additions & 6 deletions HeterogeneousCore/CUDAUtilities/src/exitSansCUDADevices.cc
@@ -4,7 +4,6 @@
#include <cuda_runtime.h>

#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"

void exitSansCUDADevices() {
int devices = 0;
@@ -17,9 +16,4 @@ void exitSansCUDADevices() {
std::cerr << "No CUDA devices available, the test will be skipped." << "\n";
exit(EXIT_SUCCESS);
}
int supported = supportedCUDADevices().size();
if (supported == 0) {
std::cerr << "No supported CUDA devices available, the test will be skipped." << "\n";
exit(EXIT_SUCCESS);
}
}
42 changes: 0 additions & 42 deletions HeterogeneousCore/CUDAUtilities/src/supportedCUDADevices.cu

This file was deleted.