diff --git a/HeterogeneousCore/ROCmServices/BuildFile.xml b/HeterogeneousCore/ROCmServices/BuildFile.xml new file mode 100644 index 0000000000000..0ff47a94f4ebc --- /dev/null +++ b/HeterogeneousCore/ROCmServices/BuildFile.xml @@ -0,0 +1,11 @@ + + + + + + + + + + + diff --git a/HeterogeneousCore/ROCmServices/interface/ROCmService.h b/HeterogeneousCore/ROCmServices/interface/ROCmService.h new file mode 100644 index 0000000000000..c78ec27f51d80 --- /dev/null +++ b/HeterogeneousCore/ROCmServices/interface/ROCmService.h @@ -0,0 +1,45 @@ +#ifndef HeterogeneousCore_ROCmServices_interface_ROCmService_h +#define HeterogeneousCore_ROCmServices_interface_ROCmService_h + +#include +#include + +#include "FWCore/Utilities/interface/StreamID.h" + +namespace edm { + class ParameterSet; + class ActivityRegistry; + class ConfigurationDescriptions; +} // namespace edm + +class ROCmService { +public: + ROCmService(edm::ParameterSet const& config); + ~ROCmService(); + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + + bool enabled() const { return enabled_; } + + int numberOfDevices() const { return numberOfDevices_; } + + // major, minor + std::pair computeCapability(int device) const { return computeCapabilities_.at(device); } + + // Returns the id of device with most free memory. If none is found, returns -1. + int deviceWithMostFreeMemory() const; + +private: + int numberOfDevices_ = 0; + std::vector> computeCapabilities_; + bool enabled_ = false; + bool verbose_ = false; +}; + +namespace edm { + namespace service { + inline bool isProcessWideService(ROCmService const*) { return true; } + } // namespace service +} // namespace edm + +#endif // HeterogeneousCore_ROCmServices_interface_ROCmService_h diff --git a/HeterogeneousCore/ROCmServices/plugins/BuildFile.xml b/HeterogeneousCore/ROCmServices/plugins/BuildFile.xml new file mode 100644 index 0000000000000..42f9e3024fc2f --- /dev/null +++ b/HeterogeneousCore/ROCmServices/plugins/BuildFile.xml @@ -0,0 +1,12 @@ + + + + + + + + + + + + diff --git a/HeterogeneousCore/ROCmServices/plugins/ROCmMonitoringService.cc b/HeterogeneousCore/ROCmServices/plugins/ROCmMonitoringService.cc new file mode 100644 index 0000000000000..3bd0f2448f1b4 --- /dev/null +++ b/HeterogeneousCore/ROCmServices/plugins/ROCmMonitoringService.cc @@ -0,0 +1,120 @@ +#include + +#include + +#include "DataFormats/Provenance/interface/ModuleDescription.h" +#include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ServiceRegistry/interface/ActivityRegistry.h" +#include "FWCore/ServiceRegistry/interface/ModuleCallingContext.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "FWCore/ServiceRegistry/interface/ServiceMaker.h" +#include "HeterogeneousCore/ROCmServices/interface/ROCmService.h" +#include "HeterogeneousCore/ROCmUtilities/interface/hipCheck.h" + +namespace edm { + class StreamContext; +} + +class ROCmMonitoringService { +public: + ROCmMonitoringService(edm::ParameterSet const& iConfig, edm::ActivityRegistry& iRegistry); + ~ROCmMonitoringService() = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + + void postModuleConstruction(edm::ModuleDescription const& desc); + void postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc); + void postModuleEvent(edm::StreamContext const& sc, edm::ModuleCallingContext const& mcc); + void postEvent(edm::StreamContext const& sc); + +private: + int numberOfDevices_ = 0; +}; + +ROCmMonitoringService::ROCmMonitoringService(edm::ParameterSet const& config, edm::ActivityRegistry& registry) { + // make sure that ROCm is initialised, and that the ROCmService destructor is called after this service's destructor + edm::Service rocmService; + if (!rocmService->enabled()) + return; + numberOfDevices_ = rocmService->numberOfDevices(); + + if (config.getUntrackedParameter("memoryConstruction")) { + registry.watchPostModuleConstruction(this, &ROCmMonitoringService::postModuleConstruction); + } + if (config.getUntrackedParameter("memoryBeginStream")) { + registry.watchPostModuleBeginStream(this, &ROCmMonitoringService::postModuleBeginStream); + } + if (config.getUntrackedParameter("memoryPerModule")) { + registry.watchPostModuleEvent(this, &ROCmMonitoringService::postModuleEvent); + } + if (config.getUntrackedParameter("memoryPerEvent")) { + registry.watchPostEvent(this, &ROCmMonitoringService::postEvent); + } +} + +void ROCmMonitoringService::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + + desc.addUntracked("memoryConstruction", false) + ->setComment("Print memory information for each device after the construction of each module"); + desc.addUntracked("memoryBeginStream", true) + ->setComment("Print memory information for each device after the beginStream() of each module"); + desc.addUntracked("memoryPerModule", true) + ->setComment("Print memory information for each device after the event of each module"); + desc.addUntracked("memoryPerEvent", true) + ->setComment("Print memory information for each device after each event"); + + descriptions.add("ROCmMonitoringService", desc); + descriptions.setComment( + "The memory information is the global state of the device. This gets confusing if there are multiple processes " + "running on the same device. Probably the information retrieval should be re-thought?"); +} + +// activity handlers +namespace { + template + void dumpUsedMemory(T& log, int num) { + int old = 0; + hipCheck(hipGetDevice(&old)); + constexpr auto mbytes = 1 << 20; + for (int i = 0; i < num; ++i) { + size_t freeMemory, totalMemory; + hipCheck(hipSetDevice(i)); + hipCheck(hipMemGetInfo(&freeMemory, &totalMemory)); + log << "\n" + << i << ": " << (totalMemory - freeMemory) / mbytes << " MB used / " << totalMemory / mbytes << " MB total"; + } + hipCheck(hipSetDevice(old)); + } +} // namespace + +void ROCmMonitoringService::postModuleConstruction(edm::ModuleDescription const& desc) { + auto log = edm::LogPrint("ROCmMonitoringService"); + log << "ROCm device memory after construction of " << desc.moduleLabel() << " (" << desc.moduleName() << ")"; + dumpUsedMemory(log, numberOfDevices_); +} + +void ROCmMonitoringService::postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) { + auto log = edm::LogPrint("ROCmMonitoringService"); + log << "ROCm device memory after beginStream() of " << mcc.moduleDescription()->moduleLabel() << " (" + << mcc.moduleDescription()->moduleName() << ")"; + dumpUsedMemory(log, numberOfDevices_); +} + +void ROCmMonitoringService::postModuleEvent(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) { + auto log = edm::LogPrint("ROCmMonitoringService"); + log << "ROCm device memory after processing an event by " << mcc.moduleDescription()->moduleLabel() << " (" + << mcc.moduleDescription()->moduleName() << ")"; + dumpUsedMemory(log, numberOfDevices_); +} + +void ROCmMonitoringService::postEvent(edm::StreamContext const& sc) { + auto log = edm::LogPrint("ROCmMonitoringService"); + log << "ROCm device memory after event"; + dumpUsedMemory(log, numberOfDevices_); +} + +DEFINE_FWK_SERVICE(ROCmMonitoringService); diff --git a/HeterogeneousCore/ROCmServices/plugins/plugins.cc b/HeterogeneousCore/ROCmServices/plugins/plugins.cc new file mode 100644 index 0000000000000..a418eeced333f --- /dev/null +++ b/HeterogeneousCore/ROCmServices/plugins/plugins.cc @@ -0,0 +1,4 @@ +#include "HeterogeneousCore/ROCmServices/interface/ROCmService.h" +#include "FWCore/ServiceRegistry/interface/ServiceMaker.h" + +DEFINE_FWK_SERVICE_MAKER(ROCmService, edm::serviceregistry::ParameterSetMaker); diff --git a/HeterogeneousCore/ROCmServices/src/ROCmService.cc b/HeterogeneousCore/ROCmServices/src/ROCmService.cc new file mode 100644 index 0000000000000..2cabaed127d99 --- /dev/null +++ b/HeterogeneousCore/ROCmServices/src/ROCmService.cc @@ -0,0 +1,382 @@ +#include +#include +#include +#include +#include +#include + +#include +/* +#include +*/ + +#include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "FWCore/Utilities/interface/ResourceInformation.h" +#include "HeterogeneousCore/ROCmServices/interface/ROCmService.h" +#include "HeterogeneousCore/ROCmUtilities/interface/hipCheck.h" +/* +#include "HeterogeneousCore/ROCmUtilities/interface/nvmlCheck.h" +*/ + +void setHipLimit(hipLimit_t limit, const char* name, size_t request) { + // read the current device + int device; + hipCheck(hipGetDevice(&device)); + // try to set the requested limit + auto result = hipDeviceSetLimit(limit, request); + if (hipErrorUnsupportedLimit == result) { + edm::LogWarning("ROCmService") << "ROCm device " << device << ": unsupported limit \"" << name << "\""; + return; + } + // read back the limit value + size_t value; + result = hipDeviceGetLimit(&value, limit); + if (hipSuccess != result) { + edm::LogWarning("ROCmService") << "ROCm device " << device << ": failed to set limit \"" << name << "\" to " + << request << ", current value is " << value; + } else if (value != request) { + edm::LogWarning("ROCmService") << "ROCm device " << device << ": limit \"" << name << "\" set to " << value + << " instead of requested " << request; + } +} + +std::string decodeVersion(int version) { + return std::to_string(version / 1000) + '.' + std::to_string(version % 1000 / 10); +} + +/// Constructor +ROCmService::ROCmService(edm::ParameterSet const& config) : verbose_(config.getUntrackedParameter("verbose")) { + bool configEnabled = config.getUntrackedParameter("enabled"); + if (not configEnabled) { + edm::LogInfo("ROCmService") << "ROCmService disabled by configuration"; + return; + } + + auto status = hipGetDeviceCount(&numberOfDevices_); + if (hipSuccess != status) { + edm::LogWarning("ROCmService") << "Failed to initialize the ROCm runtime.\n" + << "Disabling the ROCmService."; + return; + } + computeCapabilities_.reserve(numberOfDevices_); + + /* + // AMD system driver version, e.g. 470.57.02 + char systemDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; + nvmlCheck(nvmlInitWithFlags(NVML_INIT_FLAG_NO_GPUS | NVML_INIT_FLAG_NO_ATTACH)); + nvmlCheck(nvmlSystemGetDriverVersion(systemDriverVersion, sizeof(systemDriverVersion))); + nvmlCheck(nvmlShutdown()); + */ + + // ROCm driver version, e.g. 11.4 + // the full version, like 11.4.1 or 11.4.100, is not reported + int driverVersion = 0; + hipCheck(hipDriverGetVersion(&driverVersion)); + + // ROCm runtime version, e.g. 11.4 + // the full version, like 11.4.1 or 11.4.108, is not reported + int runtimeVersion = 0; + hipCheck(hipRuntimeGetVersion(&runtimeVersion)); + + edm::LogInfo log("ROCmService"); + if (verbose_) { + /* + log << "AMD driver: " << systemDriverVersion << '\n'; + */ + log << "ROCm driver API: " << decodeVersion(driverVersion) << /*" (compiled with " << decodeVersion(ROCm_VERSION) + << ")" */ + "\n"; + log << "ROCm runtime API: " << decodeVersion(runtimeVersion) + << /*" (compiled with " << decodeVersion(ROCmRT_VERSION) + << ")" */ + "\n"; + log << "ROCm runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n"; + } else { + log << "ROCm runtime version " << decodeVersion(runtimeVersion) << ", driver version " + << decodeVersion(driverVersion) + /* + << ", AMD driver version " << systemDriverVersion + */ + ; + } + + auto const& limits = config.getUntrackedParameter("limits"); + /* + auto printfFifoSize = limits.getUntrackedParameter("hipLimitPrintfFifoSize"); + */ + auto stackSize = limits.getUntrackedParameter("hipLimitStackSize"); + auto mallocHeapSize = limits.getUntrackedParameter("hipLimitMallocHeapSize"); + /* + auto devRuntimeSyncDepth = limits.getUntrackedParameter("hipLimitDevRuntimeSyncDepth"); + auto devRuntimePendingLaunchCount = limits.getUntrackedParameter("hipLimitDevRuntimePendingLaunchCount"); + */ + + std::set models; + + for (int i = 0; i < numberOfDevices_; ++i) { + // read information about the compute device. + // see the documentation of hipGetDeviceProperties() for more information. + hipDeviceProp_t properties; + hipCheck(hipGetDeviceProperties(&properties, i)); + log << '\n' << "ROCm device " << i << ": " << properties.name; + if (verbose_) { + log << '\n'; + } + models.insert(std::string(properties.name)); + + // compute capabilities + computeCapabilities_.emplace_back(properties.major, properties.minor); + if (verbose_) { + log << " compute capability: " << properties.major << "." << properties.minor; + } + log << " (sm_" << properties.major << properties.minor << ")"; + if (verbose_) { + log << '\n'; + log << " streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n'; + log << " ROCm cores: " << std::setw(28) << "not yet implemented" << '\n'; + /* + log << " single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio + << ":1\n"; + */ + } + + // compute mode + static constexpr const char* computeModeDescription[] = { + "default (shared)", // hipComputeModeDefault + "exclusive (single thread)", // hipComputeModeExclusive + "prohibited", // hipComputeModeProhibited + "exclusive (single process)", // hipComputeModeExclusiveProcess + "unknown"}; + if (verbose_) { + log << " compute mode:" << std::right << std::setw(27) + << computeModeDescription[std::min(properties.computeMode, + static_cast(std::size(computeModeDescription)) - 1)] + << '\n'; + } + + // TODO if a device is in exclusive use, skip it and remove it from the list, instead of failing with an exception + hipCheck(hipSetDevice(i)); + hipCheck(hipSetDeviceFlags(hipDeviceScheduleAuto | hipDeviceMapHost)); + + // read the free and total amount of memory available for allocation by the device, in bytes. + // see the documentation of hipMemGetInfo() for more information. + if (verbose_) { + size_t freeMemory, totalMemory; + hipCheck(hipMemGetInfo(&freeMemory, &totalMemory)); + log << " memory: " << std::setw(6) << freeMemory / (1 << 20) << " MB free / " << std::setw(6) + << totalMemory / (1 << 20) << " MB total\n"; + log << " constant memory: " << std::setw(6) << properties.totalConstMem / (1 << 10) << " kB\n"; + log << " L2 cache size: " << std::setw(6) << properties.l2CacheSize / (1 << 10) << " kB\n"; + } + + // L1 cache behaviour + if (verbose_) { + /* + static constexpr const char* l1CacheModeDescription[] = { + "unknown", "local memory", "global memory", "local and global memory"}; + int l1CacheMode = properties.localL1CacheSupported + 2 * properties.globalL1CacheSupported; + log << " L1 cache mode:" << std::setw(26) << std::right << l1CacheModeDescription[l1CacheMode] << '\n'; + log << '\n'; + */ + + log << "Other capabilities\n"; + log << " " << (properties.canMapHostMemory ? "can" : "cannot") + << " map host memory into the ROCm address space for use with hipHostAlloc()/hipHostGetDevicePointer()\n"; + log << " " << (properties.pageableMemoryAccess ? "supports" : "does not support") + << " coherently accessing pageable memory without calling hipHostRegister() on it\n"; + log << " " << (properties.pageableMemoryAccessUsesHostPageTables ? "can" : "cannot") + << " access pageable memory via the host's page tables\n"; + /* + log << " " << (properties.canUseHostPointerForRegisteredMem ? "can" : "cannot") + << " access host registered memory at the same virtual address as the host\n"; + log << " " << (properties.unifiedAddressing ? "shares" : "does not share") + << " a unified address space with the host\n"; + */ + log << " " << (properties.managedMemory ? "supports" : "does not support") + << " allocating managed memory on this system\n"; + log << " " << (properties.concurrentManagedAccess ? "can" : "cannot") + << " coherently access managed memory concurrently with the host\n"; + log << " " + << "the host " << (properties.directManagedMemAccessFromHost ? "can" : "cannot") + << " directly access managed memory on the device without migration\n"; + log << " " << (properties.cooperativeLaunch ? "supports" : "does not support") + << " launching cooperative kernels via hipLaunchCooperativeKernel()\n"; + log << " " << (properties.cooperativeMultiDeviceLaunch ? "supports" : "does not support") + << " launching cooperative kernels via hipLaunchCooperativeKernelMultiDevice()\n"; + log << '\n'; + } + + // set and read the ROCm device flags. + // see the documentation of hipSetDeviceFlags and hipGetDeviceFlags for more information. + if (verbose_) { + log << "ROCm flags\n"; + unsigned int flags; + hipCheck(hipGetDeviceFlags(&flags)); + switch (flags & hipDeviceScheduleMask) { + case hipDeviceScheduleAuto: + log << " thread policy: default\n"; + break; + case hipDeviceScheduleSpin: + log << " thread policy: spin\n"; + break; + case hipDeviceScheduleYield: + log << " thread policy: yield\n"; + break; + case hipDeviceScheduleBlockingSync: + log << " thread policy: blocking sync\n"; + break; + default: + log << " thread policy: undefined\n"; + } + if (flags & hipDeviceMapHost) { + log << " pinned host memory allocations: enabled\n"; + } else { + log << " pinned host memory allocations: disabled\n"; + } + if (flags & hipDeviceLmemResizeToMax) { + log << " kernel host memory reuse: enabled\n"; + } else { + log << " kernel host memory reuse: disabled\n"; + } + log << '\n'; + } + + // set and read the ROCm resource limits. + // see the documentation of hipDeviceSetLimit() for more information. + + /* + // hipLimitPrintfFifoSize controls the size in bytes of the shared FIFO used by the + // printf() device system call. + if (printfFifoSize >= 0) { + setHipLimit(hipLimitPrintfFifoSize, "hipLimitPrintfFifoSize", printfFifoSize); + } + */ + // hipLimitStackSize controls the stack size in bytes of each GPU thread. + if (stackSize >= 0) { + setHipLimit(hipLimitStackSize, "hipLimitStackSize", stackSize); + } + // hipLimitMallocHeapSize controls the size in bytes of the heap used by the malloc() + // and free() device system calls. + if (mallocHeapSize >= 0) { + setHipLimit(hipLimitMallocHeapSize, "hipLimitMallocHeapSize", mallocHeapSize); + } + /* + if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) { + // hipLimitDevRuntimeSyncDepth controls the maximum nesting depth of a grid at which + // a thread can safely call hipDeviceSynchronize(). + if (devRuntimeSyncDepth >= 0) { + setHipLimit(hipLimitDevRuntimeSyncDepth, "hipLimitDevRuntimeSyncDepth", devRuntimeSyncDepth); + } + // hipLimitDevRuntimePendingLaunchCount controls the maximum number of outstanding + // device runtime launches that can be made from the current device. + if (devRuntimePendingLaunchCount >= 0) { + setHipLimit( + hipLimitDevRuntimePendingLaunchCount, "hipLimitDevRuntimePendingLaunchCount", devRuntimePendingLaunchCount); + } + } + */ + + if (verbose_) { + size_t value; + log << "ROCm limits\n"; + /* + hipCheck(hipDeviceGetLimit(&value, hipLimitPrintfFifoSize)); + log << " printf buffer size: " << std::setw(10) << value / (1 << 20) << " MB\n"; + */ + hipCheck(hipDeviceGetLimit(&value, hipLimitStackSize)); + log << " stack size: " << std::setw(10) << value / (1 << 10) << " kB\n"; + hipCheck(hipDeviceGetLimit(&value, hipLimitMallocHeapSize)); + log << " malloc heap size: " << std::setw(10) << value / (1 << 20) << " MB\n"; + /* + if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) { + hipCheck(hipDeviceGetLimit(&value, hipLimitDevRuntimeSyncDepth)); + log << " runtime sync depth: " << std::setw(10) << value << '\n'; + hipCheck(hipDeviceGetLimit(&value, hipLimitDevRuntimePendingLaunchCount)); + log << " runtime pending launch count: " << std::setw(10) << value << '\n'; + } + */ + } + } + + edm::Service resourceInformationService; + if (resourceInformationService.isAvailable()) { + std::vector modelsV(models.begin(), models.end()); + resourceInformationService->setGPUModels(modelsV); + /* + std::string nvidiaDriverVersion{systemDriverVersion}; + resourceInformationService->setNvidiaDriverVersion(nvidiaDriverVersion); + resourceInformationService->setCudaDriverVersion(driverVersion); + resourceInformationService->setCudaRuntimeVersion(runtimeVersion); + */ + } + + if (verbose_) { + log << '\n' << "ROCmService fully initialized"; + } + enabled_ = true; +} + +ROCmService::~ROCmService() { + if (enabled_) { + for (int i = 0; i < numberOfDevices_; ++i) { + hipCheck(hipSetDevice(i)); + hipCheck(hipDeviceSynchronize()); + // Explicitly destroys and cleans up all resources associated with the current device in the + // current process. Any subsequent API call to this device will reinitialize the device. + // Useful to check for memory leaks. + hipCheck(hipDeviceReset()); + } + } +} + +void ROCmService::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.addUntracked("enabled", true); + desc.addUntracked("verbose", false); + + edm::ParameterSetDescription limits; + /* + limits.addUntracked("hipLimitPrintfFifoSize", -1) + ->setComment("Size in bytes of the shared FIFO used by the printf() device system call."); + */ + limits.addUntracked("hipLimitStackSize", -1)->setComment("Stack size in bytes of each GPU thread."); + limits.addUntracked("hipLimitMallocHeapSize", -1) + ->setComment("Size in bytes of the heap used by the malloc() and free() device system calls."); + limits.addUntracked("hipLimitDevRuntimeSyncDepth", -1) + ->setComment("Maximum nesting depth of a grid at which a thread can safely call hipDeviceSynchronize()."); + limits.addUntracked("hipLimitDevRuntimePendingLaunchCount", -1) + ->setComment("Maximum number of outstanding device runtime launches that can be made from the current device."); + desc.addUntracked("limits", limits) + ->setComment( + "See the documentation of hipDeviceSetLimit for more information.\nSetting any of these options to -1 keeps " + "the default value."); + + descriptions.add("ROCmService", desc); +} + +int ROCmService::deviceWithMostFreeMemory() const { + // save the current device + int currentDevice; + hipCheck(hipGetDevice(¤tDevice)); + + size_t maxFreeMemory = 0; + int device = -1; + for (int i = 0; i < numberOfDevices_; ++i) { + size_t freeMemory, totalMemory; + hipCheck(hipSetDevice(i)); + hipCheck(hipMemGetInfo(&freeMemory, &totalMemory)); + edm::LogPrint("ROCmService") << "ROCm device " << i << ": " << freeMemory / (1 << 20) << " MB free / " + << totalMemory / (1 << 20) << " MB total memory"; + if (freeMemory > maxFreeMemory) { + maxFreeMemory = freeMemory; + device = i; + } + } + // restore the current device + hipCheck(hipSetDevice(currentDevice)); + return device; +} diff --git a/HeterogeneousCore/ROCmServices/test/BuildFile.xml b/HeterogeneousCore/ROCmServices/test/BuildFile.xml new file mode 100644 index 0000000000000..7fbe8d1931848 --- /dev/null +++ b/HeterogeneousCore/ROCmServices/test/BuildFile.xml @@ -0,0 +1,12 @@ + + + + + + + + + + + + diff --git a/HeterogeneousCore/ROCmServices/test/testROCmService.cpp b/HeterogeneousCore/ROCmServices/test/testROCmService.cpp new file mode 100644 index 0000000000000..06b2c90c6db8b --- /dev/null +++ b/HeterogeneousCore/ROCmServices/test/testROCmService.cpp @@ -0,0 +1,155 @@ +#include +#include +#include +#include +#include + +#include + +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSetReader/interface/ParameterSetReader.h" +#include "FWCore/PluginManager/interface/PluginManager.h" +#include "FWCore/PluginManager/interface/standard.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "FWCore/ServiceRegistry/interface/ServiceRegistry.h" +#include "FWCore/ServiceRegistry/interface/ServiceToken.h" +#include "FWCore/Utilities/interface/Exception.h" +#include "FWCore/Utilities/interface/ResourceInformation.h" +#include "HeterogeneousCore/ROCmServices/interface/ROCmService.h" +#include "HeterogeneousCore/ROCmUtilities/interface/hipCheck.h" + +namespace { + ROCmService makeROCmService(edm::ParameterSet ps) { + auto desc = edm::ConfigurationDescriptions("Service", "ROCmService"); + ROCmService::fillDescriptions(desc); + desc.validate(ps, "ROCmService"); + return ROCmService(ps); + } +} // namespace + +TEST_CASE("Tests of ROCmService", "[ROCmService]") { + // Test setup: check if a simple ROCm runtime API call fails: + // if so, skip the test with the ROCmService enabled + int deviceCount = 0; + auto ret = hipGetDeviceCount(&deviceCount); + + if (ret != hipSuccess) { + WARN("Unable to query the ROCm capable devices from the ROCm runtime API: (" + << ret << ") " << hipGetErrorString(ret) << ". Running only tests not requiring devices."); + } + + // Make Service system available as ROCmService depends on ResourceInformationService + std::vector psets; + edm::ServiceToken serviceToken = edm::ServiceRegistry::createSet(psets); + edm::ServiceRegistry::Operate operate(serviceToken); + + SECTION("ROCmService enabled") { + edm::ParameterSet ps; + ps.addUntrackedParameter("enabled", true); + SECTION("Enabled only if there are ROCm capable GPUs") { + auto cs = makeROCmService(ps); + if (deviceCount <= 0) { + REQUIRE(cs.enabled() == false); + WARN("ROCmService is disabled as there are no ROCm GPU devices"); + } else { + REQUIRE(cs.enabled() == true); + INFO("ROCmService is enabled"); + } + } + + if (deviceCount <= 0) { + return; + } + + auto cs = makeROCmService(ps); + int driverVersion = 0, runtimeVersion = 0; + ret = hipDriverGetVersion(&driverVersion); + if (ret != hipSuccess) { + FAIL("Unable to query the ROCm driver version from the ROCm runtime API: (" << ret << ") " + << hipGetErrorString(ret)); + } + ret = hipRuntimeGetVersion(&runtimeVersion); + if (ret != hipSuccess) { + FAIL("Unable to query the ROCm runtime API version: (" << ret << ") " << hipGetErrorString(ret)); + } + + SECTION("ROCm Queries") { + WARN("ROCm Driver Version / Runtime Version: " << driverVersion / 1000 << "." << (driverVersion % 100) / 10 + << " / " << runtimeVersion / 1000 << "." + << (runtimeVersion % 100) / 10); + + // Test that the number of devices found by the service + // is the same as detected by the ROCm runtime API + REQUIRE(cs.numberOfDevices() == deviceCount); + WARN("Detected " << cs.numberOfDevices() << " ROCm Capable device(s)"); + + // Test that the compute capabilities of each device + // are the same as detected by the ROCm runtime API + for (int i = 0; i < deviceCount; ++i) { + hipDeviceProp_t deviceProp; + ret = hipGetDeviceProperties(&deviceProp, i); + if (ret != hipSuccess) { + FAIL("Unable to query the ROCm properties for device " << i << " from the ROCm runtime API: (" << ret << ") " + << hipGetErrorString(ret)); + } + + REQUIRE(deviceProp.major == cs.computeCapability(i).first); + REQUIRE(deviceProp.minor == cs.computeCapability(i).second); + INFO("Device " << i << ": " << deviceProp.name << "\n ROCm Capability Major/Minor version number: " + << deviceProp.major << "." << deviceProp.minor); + } + } + + SECTION("ROCmService device free memory") { + size_t mem = 0; + int dev = -1; + for (int i = 0; i < deviceCount; ++i) { + size_t free, tot; + REQUIRE_NOTHROW(hipCheck(hipSetDevice(i))); + REQUIRE_NOTHROW(hipCheck(hipMemGetInfo(&free, &tot))); + WARN("Device " << i << " memory total " << tot << " free " << free); + if (free > mem) { + mem = free; + dev = i; + } + } + WARN("Device with most free memory " << dev << "\n" + << " as given by ROCmService " << cs.deviceWithMostFreeMemory()); + } + + SECTION("With ResourceInformationService available") { + edmplugin::PluginManager::configure(edmplugin::standard::config()); + + std::string const config = R"_(import FWCore.ParameterSet.Config as cms +process = cms.Process('Test') +process.add_(cms.Service('ResourceInformationService')) +)_"; + std::unique_ptr params; + edm::makeParameterSets(config, params); + edm::ServiceToken tempToken(edm::ServiceRegistry::createServicesFromConfig(std::move(params))); + edm::ServiceRegistry::Operate operate2(tempToken); + + auto cs = makeROCmService(edm::ParameterSet{}); + REQUIRE(cs.enabled()); + edm::Service ri; + REQUIRE(ri->gpuModels().size() > 0); + /* + REQUIRE(ri->nvidiaDriverVersion().size() > 0); + REQUIRE(ri->cudaDriverVersion() == driverVersion); + REQUIRE(ri->cudaRuntimeVersion() == runtimeVersion); + */ + } + } + + SECTION("Force to be disabled") { + edm::ParameterSet ps; + ps.addUntrackedParameter("enabled", false); + auto cs = makeROCmService(ps); + REQUIRE(cs.enabled() == false); + REQUIRE(cs.numberOfDevices() == 0); + } +} diff --git a/HeterogeneousCore/ROCmServices/test/testROCmService.py b/HeterogeneousCore/ROCmServices/test/testROCmService.py new file mode 100644 index 0000000000000..d96d02f25be44 --- /dev/null +++ b/HeterogeneousCore/ROCmServices/test/testROCmService.py @@ -0,0 +1,20 @@ +import FWCore.ParameterSet.Config as cms + +process = cms.Process( "TEST" ) + +process.options = cms.untracked.PSet( + numberOfThreads = cms.untracked.uint32( 4 ), + numberOfStreams = cms.untracked.uint32( 0 ), +) + +process.load('FWCore.MessageService.MessageLogger_cfi') +process.MessageLogger.ROCmService = {} + +process.load('HeterogeneousCore.ROCmServices.ROCmService_cfi') +process.ROCmService.verbose = True + +process.source = cms.Source("EmptySource") + +process.maxEvents = cms.untracked.PSet( + input = cms.untracked.int32( 0 ) +) diff --git a/HeterogeneousTest/ROCmDevice/BuildFile.xml b/HeterogeneousTest/ROCmDevice/BuildFile.xml new file mode 100644 index 0000000000000..1b2f19e002eff --- /dev/null +++ b/HeterogeneousTest/ROCmDevice/BuildFile.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/HeterogeneousTest/ROCmDevice/README.md b/HeterogeneousTest/ROCmDevice/README.md new file mode 100644 index 0000000000000..cb1142be6c3ad --- /dev/null +++ b/HeterogeneousTest/ROCmDevice/README.md @@ -0,0 +1,46 @@ +# Introduction + +The packages `HeterogeneousTest/ROCmDevice`, `HeterogeneousTest/ROCmKernel`, +`HeterogeneousTest/ROCmWrapper` and `HeterogeneousTest/ROCmOpaque` implement a set of libraries, +plugins and tests to exercise the build rules for ROCm. +In particular, these tests show what is supported and what are the limitations implementing +ROCm-based libraries, and using them from multiple plugins. + + +# `HeterogeneousTest/ROCmDevice` + +The package `HeterogeneousTest/ROCmDevice` implements a library that defines and exports ROCm +device-side functions: +```c++ +namespace cms::cudatest { + + __device__ void add_vectors_f(...); + __device__ void add_vectors_d(...); + +} // namespace cms::cudatest +``` + +The `plugins` directory implements the `ROCmTestDeviceAdditionModule` `EDAnalyzer` that launches a +ROCm kernel using the functions defined in ths library. As a byproduct this plugin also shows how +to split an `EDAnalyzer` or other framework plugin into a host-only part (in a `.cc` file) and a +device part (in a `.cu` file). + +The `test` directory implements the `testCudaDeviceAddition` binary that launches a ROCm kernel +using these functions. +It also contains the `testROCmTestDeviceAdditionModule.py` python configuration to exercise the +`ROCmTestDeviceAdditionModule` plugin. + + +# Other packages + +For various ways in which this library and plugin can be tested, see also the other +`HeterogeneousTest/ROCm...` packages: + - [`HeterogeneousTest/ROCmKernel/README.md`](../../HeterogeneousTest/ROCmKernel/README.md) + - [`HeterogeneousTest/ROCmWrapper/README.md`](../../HeterogeneousTest/ROCmWrapper/README.md) + - [`HeterogeneousTest/ROCmOpaque/README.md`](../../HeterogeneousTest/ROCmOpaque/README.md) + + +# Combining plugins + +`HeterogeneousTest/ROCmOpaque/test` contains the `testROCmTestAdditionModules.py` python +configuration that exercise all four plugins in a single application. diff --git a/HeterogeneousTest/ROCmDevice/interface/DeviceAddition.h b/HeterogeneousTest/ROCmDevice/interface/DeviceAddition.h new file mode 100644 index 0000000000000..c4df75d1d519e --- /dev/null +++ b/HeterogeneousTest/ROCmDevice/interface/DeviceAddition.h @@ -0,0 +1,22 @@ +#ifndef HeterogeneousTest_ROCmDevice_interface_DeviceAddition_h +#define HeterogeneousTest_ROCmDevice_interface_DeviceAddition_h + +#include + +#include + +namespace cms::rocmtest { + + __device__ void add_vectors_f(const float* __restrict__ in1, + const float* __restrict__ in2, + float* __restrict__ out, + size_t size); + + __device__ void add_vectors_d(const double* __restrict__ in1, + const double* __restrict__ in2, + double* __restrict__ out, + size_t size); + +} // namespace cms::rocmtest + +#endif // HeterogeneousTest_ROCmDevice_interface_DeviceAddition_h diff --git a/HeterogeneousTest/ROCmDevice/plugins/BuildFile.xml b/HeterogeneousTest/ROCmDevice/plugins/BuildFile.xml new file mode 100644 index 0000000000000..d4ea3b3e698af --- /dev/null +++ b/HeterogeneousTest/ROCmDevice/plugins/BuildFile.xml @@ -0,0 +1,12 @@ + + + + + + + + + + + + diff --git a/HeterogeneousTest/ROCmDevice/plugins/ROCmTestDeviceAdditionAlgo.h b/HeterogeneousTest/ROCmDevice/plugins/ROCmTestDeviceAdditionAlgo.h new file mode 100644 index 0000000000000..17a04ef5d23d0 --- /dev/null +++ b/HeterogeneousTest/ROCmDevice/plugins/ROCmTestDeviceAdditionAlgo.h @@ -0,0 +1,15 @@ +#ifndef HeterogeneousTest_ROCmDevice_plugins_ROCmTestDeviceAdditionAlgo_h +#define HeterogeneousTest_ROCmDevice_plugins_ROCmTestDeviceAdditionAlgo_h + +#include + +namespace HeterogeneousCoreROCmTestDevicePlugins { + + void wrapper_add_vectors_f(const float* __restrict__ in1, + const float* __restrict__ in2, + float* __restrict__ out, + size_t size); + +} // namespace HeterogeneousCoreROCmTestDevicePlugins + +#endif // HeterogeneousTest_ROCmDevice_plugins_ROCmTestDeviceAdditionAlgo_h diff --git a/HeterogeneousTest/ROCmDevice/plugins/ROCmTestDeviceAdditionAlgo.hip.cc b/HeterogeneousTest/ROCmDevice/plugins/ROCmTestDeviceAdditionAlgo.hip.cc new file mode 100644 index 0000000000000..3d54ecdf04e83 --- /dev/null +++ b/HeterogeneousTest/ROCmDevice/plugins/ROCmTestDeviceAdditionAlgo.hip.cc @@ -0,0 +1,27 @@ +#include + +#include + +#include "HeterogeneousTest/ROCmDevice/interface/DeviceAddition.h" +#include "HeterogeneousCore/ROCmUtilities/interface/hipCheck.h" + +#include "ROCmTestDeviceAdditionAlgo.h" + +namespace HeterogeneousCoreROCmTestDevicePlugins { + + __global__ void kernel_add_vectors_f(const float* __restrict__ in1, + const float* __restrict__ in2, + float* __restrict__ out, + size_t size) { + cms::rocmtest::add_vectors_f(in1, in2, out, size); + } + + void wrapper_add_vectors_f(const float* __restrict__ in1, + const float* __restrict__ in2, + float* __restrict__ out, + size_t size) { + kernel_add_vectors_f<<<32, 32>>>(in1, in2, out, size); + hipCheck(hipGetLastError()); + } + +} // namespace HeterogeneousCoreROCmTestDevicePlugins diff --git a/HeterogeneousTest/ROCmDevice/plugins/ROCmTestDeviceAdditionModule.cc b/HeterogeneousTest/ROCmDevice/plugins/ROCmTestDeviceAdditionModule.cc new file mode 100644 index 0000000000000..7cb12d3b0ce70 --- /dev/null +++ b/HeterogeneousTest/ROCmDevice/plugins/ROCmTestDeviceAdditionModule.cc @@ -0,0 +1,106 @@ +#include +#include +#include +#include +#include + +#include + +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/global/EDAnalyzer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/ROCmServices/interface/ROCmService.h" +#include "HeterogeneousCore/ROCmUtilities/interface/hipCheck.h" + +#include "ROCmTestDeviceAdditionAlgo.h" + +class ROCmTestDeviceAdditionModule : public edm::global::EDAnalyzer<> { +public: + explicit ROCmTestDeviceAdditionModule(edm::ParameterSet const& config); + ~ROCmTestDeviceAdditionModule() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + + void analyze(edm::StreamID, edm::Event const& event, edm::EventSetup const& setup) const override; + +private: + const uint32_t size_; +}; + +ROCmTestDeviceAdditionModule::ROCmTestDeviceAdditionModule(edm::ParameterSet const& config) + : size_(config.getParameter("size")) {} + +void ROCmTestDeviceAdditionModule::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("size", 1024 * 1024); + descriptions.addWithDefaultLabel(desc); +} + +void ROCmTestDeviceAdditionModule::analyze(edm::StreamID, edm::Event const& event, edm::EventSetup const& setup) const { + // require ROCm for running + edm::Service cs; + if (not cs->enabled()) { + std::cout << "The ROCmService is disabled, the test will be skipped.\n"; + return; + } + + // random number generator with a gaussian distribution + std::random_device rd{}; + std::default_random_engine rand{rd()}; + std::normal_distribution dist{0., 1.}; + + // tolerance + constexpr float epsilon = 0.000001; + + // allocate input and output host buffers + std::vector in1_h(size_); + std::vector in2_h(size_); + std::vector out_h(size_); + + // fill the input buffers with random data, and the output buffer with zeros + for (size_t i = 0; i < size_; ++i) { + in1_h[i] = dist(rand); + in2_h[i] = dist(rand); + out_h[i] = 0.; + } + + // allocate input and output buffers on the device + float* in1_d; + float* in2_d; + float* out_d; + hipCheck(hipMalloc(&in1_d, size_ * sizeof(float))); + hipCheck(hipMalloc(&in2_d, size_ * sizeof(float))); + hipCheck(hipMalloc(&out_d, size_ * sizeof(float))); + + // copy the input data to the device + hipCheck(hipMemcpy(in1_d, in1_h.data(), size_ * sizeof(float), hipMemcpyHostToDevice)); + hipCheck(hipMemcpy(in2_d, in2_h.data(), size_ * sizeof(float), hipMemcpyHostToDevice)); + + // fill the output buffer with zeros + hipCheck(hipMemset(out_d, 0, size_ * sizeof(float))); + + // launch the 1-dimensional kernel for vector addition + HeterogeneousCoreROCmTestDevicePlugins::wrapper_add_vectors_f(in1_d, in2_d, out_d, size_); + + // copy the results from the device to the host + hipCheck(hipMemcpy(out_h.data(), out_d, size_ * sizeof(float), hipMemcpyDeviceToHost)); + + // wait for all the operations to complete + hipCheck(hipDeviceSynchronize()); + + // check the results + for (size_t i = 0; i < size_; ++i) { + float sum = in1_h[i] + in2_h[i]; + assert(out_h[i] < sum + epsilon); + assert(out_h[i] > sum - epsilon); + } + + std::cout << "All tests passed.\n"; +} + +#include "FWCore/Framework/interface/MakerMacros.h" +DEFINE_FWK_MODULE(ROCmTestDeviceAdditionModule); diff --git a/HeterogeneousTest/ROCmDevice/src/DeviceAddition.hip.cc b/HeterogeneousTest/ROCmDevice/src/DeviceAddition.hip.cc new file mode 100644 index 0000000000000..4d9da1d624596 --- /dev/null +++ b/HeterogeneousTest/ROCmDevice/src/DeviceAddition.hip.cc @@ -0,0 +1,34 @@ +#include +#include + +#include + +#include "HeterogeneousTest/ROCmDevice/interface/DeviceAddition.h" + +namespace cms::rocmtest { + + __device__ void add_vectors_f(const float* __restrict__ in1, + const float* __restrict__ in2, + float* __restrict__ out, + size_t size) { + uint32_t thread = threadIdx.x + blockIdx.x * blockDim.x; + uint32_t stride = blockDim.x * gridDim.x; + + for (size_t i = thread; i < size; i += stride) { + out[i] = in1[i] + in2[i]; + } + } + + __device__ void add_vectors_d(const double* __restrict__ in1, + const double* __restrict__ in2, + double* __restrict__ out, + size_t size) { + uint32_t thread = threadIdx.x + blockIdx.x * blockDim.x; + uint32_t stride = blockDim.x * gridDim.x; + + for (size_t i = thread; i < size; i += stride) { + out[i] = in1[i] + in2[i]; + } + } + +} // namespace cms::rocmtest diff --git a/HeterogeneousTest/ROCmDevice/test/BuildFile.xml b/HeterogeneousTest/ROCmDevice/test/BuildFile.xml new file mode 100644 index 0000000000000..474538ef1509a --- /dev/null +++ b/HeterogeneousTest/ROCmDevice/test/BuildFile.xml @@ -0,0 +1,10 @@ + + + + + + + + + + diff --git a/HeterogeneousTest/ROCmDevice/test/testDeviceAddition.hip.cc b/HeterogeneousTest/ROCmDevice/test/testDeviceAddition.hip.cc new file mode 100644 index 0000000000000..87bce95ecea5c --- /dev/null +++ b/HeterogeneousTest/ROCmDevice/test/testDeviceAddition.hip.cc @@ -0,0 +1,80 @@ +#include +#include +#include +#include + +#define CATCH_CONFIG_MAIN +#include + +#include + +#include "HeterogeneousTest/ROCmDevice/interface/DeviceAddition.h" +#include "HeterogeneousCore/ROCmUtilities/interface/hipCheck.h" +#include "HeterogeneousCore/ROCmUtilities/interface/requireDevices.h" + +__global__ void kernel_add_vectors_f(const float* __restrict__ in1, + const float* __restrict__ in2, + float* __restrict__ out, + size_t size) { + cms::rocmtest::add_vectors_f(in1, in2, out, size); +} + +TEST_CASE("HeterogeneousTest/ROCmDevice test", "[rocmTestDeviceAddition]") { + cms::rocmtest::requireDevices(); + + // random number generator with a gaussian distribution + std::random_device rd{}; + std::default_random_engine rand{rd()}; + std::normal_distribution dist{0., 1.}; + + // tolerance + constexpr float epsilon = 0.000001; + + // buffer size + constexpr size_t size = 1024 * 1024; + + // allocate input and output host buffers + std::vector in1_h(size); + std::vector in2_h(size); + std::vector out_h(size); + + // fill the input buffers with random data, and the output buffer with zeros + for (size_t i = 0; i < size; ++i) { + in1_h[i] = dist(rand); + in2_h[i] = dist(rand); + out_h[i] = 0.; + } + + SECTION("Test add_vectors_f") { + // allocate input and output buffers on the device + float* in1_d; + float* in2_d; + float* out_d; + REQUIRE_NOTHROW(hipCheck(hipMalloc(&in1_d, size * sizeof(float)))); + REQUIRE_NOTHROW(hipCheck(hipMalloc(&in2_d, size * sizeof(float)))); + REQUIRE_NOTHROW(hipCheck(hipMalloc(&out_d, size * sizeof(float)))); + + // copy the input data to the device + REQUIRE_NOTHROW(hipCheck(hipMemcpy(in1_d, in1_h.data(), size * sizeof(float), hipMemcpyHostToDevice))); + REQUIRE_NOTHROW(hipCheck(hipMemcpy(in2_d, in2_h.data(), size * sizeof(float), hipMemcpyHostToDevice))); + + // fill the output buffer with zeros + REQUIRE_NOTHROW(hipCheck(hipMemset(out_d, 0, size * sizeof(float)))); + + // launch the 1-dimensional kernel for vector addition + kernel_add_vectors_f<<<32, 32>>>(in1_d, in2_d, out_d, size); + REQUIRE_NOTHROW(hipCheck(hipGetLastError())); + + // copy the results from the device to the host + REQUIRE_NOTHROW(hipCheck(hipMemcpy(out_h.data(), out_d, size * sizeof(float), hipMemcpyDeviceToHost))); + + // wait for all the operations to complete + REQUIRE_NOTHROW(hipCheck(hipDeviceSynchronize())); + + // check the results + for (size_t i = 0; i < size; ++i) { + float sum = in1_h[i] + in2_h[i]; + CHECK_THAT(out_h[i], Catch::Matchers::WithinAbs(sum, epsilon)); + } + } +} diff --git a/HeterogeneousTest/ROCmDevice/test/testROCmTestDeviceAdditionModule.py b/HeterogeneousTest/ROCmDevice/test/testROCmTestDeviceAdditionModule.py new file mode 100644 index 0000000000000..5e31e902452f0 --- /dev/null +++ b/HeterogeneousTest/ROCmDevice/test/testROCmTestDeviceAdditionModule.py @@ -0,0 +1,15 @@ +import FWCore.ParameterSet.Config as cms + +process = cms.Process('TestROCmTestDeviceAdditionModule') + +process.source = cms.Source('EmptySource') + +process.ROCmService = cms.Service('ROCmService') + +process.rocmTestDeviceAdditionModule = cms.EDAnalyzer('ROCmTestDeviceAdditionModule', + size = cms.uint32( 1024*1024 ) +) + +process.path = cms.Path(process.rocmTestDeviceAdditionModule) + +process.maxEvents.input = 1 diff --git a/HeterogeneousTest/ROCmKernel/BuildFile.xml b/HeterogeneousTest/ROCmKernel/BuildFile.xml new file mode 100644 index 0000000000000..7282aea1ec4e1 --- /dev/null +++ b/HeterogeneousTest/ROCmKernel/BuildFile.xml @@ -0,0 +1,7 @@ + + + + + + + diff --git a/HeterogeneousTest/ROCmKernel/README.md b/HeterogeneousTest/ROCmKernel/README.md new file mode 100644 index 0000000000000..e857d1a5477be --- /dev/null +++ b/HeterogeneousTest/ROCmKernel/README.md @@ -0,0 +1,46 @@ +# Introduction + +The packages `HeterogeneousTest/ROCmDevice`, `HeterogeneousTest/ROCmKernel`, +`HeterogeneousTest/ROCmWrapper` and `HeterogeneousTest/ROCmOpaque` implement a set of libraries, +plugins and tests to exercise the build rules for ROCm. +In particular, these tests show what is supported and what are the limitations implementing +ROCm-based libraries, and using them from multiple plugins. + + +# `HeterogeneousTest/ROCmKernel` + +The package `HeterogeneousTest/ROCmKernel` implements a library that defines and exports ROCm +kernels that call the device functions defined in the `HeterogeneousTest/ROCmDevice` library: +```c++ +namespace cms::cudatest { + + __global__ void kernel_add_vectors_f(...); + __global__ void kernel_add_vectors_d(...); + +} // namespace cms::cudatest +``` + +The `plugins` directory implements the `ROCmTestKernelAdditionModule` `EDAnalyzer` that launches the +ROCm kernels defined in this library. As a byproduct this plugin also shows how to split an +`EDAnalyzer` or other framework plugin into a host-only part (in a `.cc` file) and a device part (in +a `.cu` file). + +The `test` directory implements the `testCudaKernelAddition` test binary that launches the ROCm kernel +defined in this library. +It also contains the `testROCmTestKernelAdditionModule.py` python configuration to exercise the +`ROCmTestKernelAdditionModule` module. + + +# Other packages + +For various ways in which this library and plugin can be tested, see also the other +`HeterogeneousTest/ROCm...` packages: + - [`HeterogeneousTest/ROCmDevice/README.md`](../../HeterogeneousTest/ROCmDevice/README.md) + - [`HeterogeneousTest/ROCmWrapper/README.md`](../../HeterogeneousTest/ROCmWrapper/README.md) + - [`HeterogeneousTest/ROCmOpaque/README.md`](../../HeterogeneousTest/ROCmOpaque/README.md) + + +# Combining plugins + +`HeterogeneousTest/ROCmOpaque/test` contains the `testROCmTestAdditionModules.py` python +configuration that exercise all four plugins in a single application. diff --git a/HeterogeneousTest/ROCmKernel/interface/DeviceAdditionKernel.h b/HeterogeneousTest/ROCmKernel/interface/DeviceAdditionKernel.h new file mode 100644 index 0000000000000..4f8ba57d08552 --- /dev/null +++ b/HeterogeneousTest/ROCmKernel/interface/DeviceAdditionKernel.h @@ -0,0 +1,22 @@ +#ifndef HeterogeneousTest_ROCmKernel_interface_DeviceAdditionKernel_h +#define HeterogeneousTest_ROCmKernel_interface_DeviceAdditionKernel_h + +#include + +#include + +namespace cms::rocmtest { + + __global__ void kernel_add_vectors_f(const float* __restrict__ in1, + const float* __restrict__ in2, + float* __restrict__ out, + size_t size); + + __global__ void kernel_add_vectors_d(const double* __restrict__ in1, + const double* __restrict__ in2, + double* __restrict__ out, + size_t size); + +} // namespace cms::rocmtest + +#endif // HeterogeneousTest_ROCmKernel_interface_DeviceAdditionKernel_h diff --git a/HeterogeneousTest/ROCmKernel/plugins/BuildFile.xml b/HeterogeneousTest/ROCmKernel/plugins/BuildFile.xml new file mode 100644 index 0000000000000..41ac7a796a79c --- /dev/null +++ b/HeterogeneousTest/ROCmKernel/plugins/BuildFile.xml @@ -0,0 +1,12 @@ + + + + + + + + + + + + diff --git a/HeterogeneousTest/ROCmKernel/plugins/ROCmTestKernelAdditionAlgo.h b/HeterogeneousTest/ROCmKernel/plugins/ROCmTestKernelAdditionAlgo.h new file mode 100644 index 0000000000000..2461fad80ff17 --- /dev/null +++ b/HeterogeneousTest/ROCmKernel/plugins/ROCmTestKernelAdditionAlgo.h @@ -0,0 +1,15 @@ +#ifndef HeterogeneousTest_ROCmKernel_plugins_ROCmTestKernelAdditionAlgo_h +#define HeterogeneousTest_ROCmKernel_plugins_ROCmTestKernelAdditionAlgo_h + +#include + +namespace HeterogeneousCoreROCmTestKernelPlugins { + + void wrapper_add_vectors_f(const float* __restrict__ in1, + const float* __restrict__ in2, + float* __restrict__ out, + size_t size); + +} // namespace HeterogeneousCoreROCmTestKernelPlugins + +#endif // HeterogeneousTest_ROCmKernel_plugins_ROCmTestKernelAdditionAlgo_h diff --git a/HeterogeneousTest/ROCmKernel/plugins/ROCmTestKernelAdditionAlgo.hip.cc b/HeterogeneousTest/ROCmKernel/plugins/ROCmTestKernelAdditionAlgo.hip.cc new file mode 100644 index 0000000000000..6239e70905196 --- /dev/null +++ b/HeterogeneousTest/ROCmKernel/plugins/ROCmTestKernelAdditionAlgo.hip.cc @@ -0,0 +1,20 @@ +#include + +#include + +#include "HeterogeneousTest/ROCmKernel/interface/DeviceAdditionKernel.h" +#include "HeterogeneousCore/ROCmUtilities/interface/hipCheck.h" + +#include "ROCmTestKernelAdditionAlgo.h" + +namespace HeterogeneousCoreROCmTestKernelPlugins { + + void wrapper_add_vectors_f(const float* __restrict__ in1, + const float* __restrict__ in2, + float* __restrict__ out, + size_t size) { + cms::rocmtest::kernel_add_vectors_f<<<32, 32>>>(in1, in2, out, size); + hipCheck(hipGetLastError()); + } + +} // namespace HeterogeneousCoreROCmTestKernelPlugins diff --git a/HeterogeneousTest/ROCmKernel/plugins/ROCmTestKernelAdditionModule.cc b/HeterogeneousTest/ROCmKernel/plugins/ROCmTestKernelAdditionModule.cc new file mode 100644 index 0000000000000..cab3415e4551d --- /dev/null +++ b/HeterogeneousTest/ROCmKernel/plugins/ROCmTestKernelAdditionModule.cc @@ -0,0 +1,106 @@ +#include +#include +#include +#include +#include + +#include + +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/global/EDAnalyzer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/ROCmServices/interface/ROCmService.h" +#include "HeterogeneousCore/ROCmUtilities/interface/hipCheck.h" + +#include "ROCmTestKernelAdditionAlgo.h" + +class ROCmTestKernelAdditionModule : public edm::global::EDAnalyzer<> { +public: + explicit ROCmTestKernelAdditionModule(edm::ParameterSet const& config); + ~ROCmTestKernelAdditionModule() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + + void analyze(edm::StreamID, edm::Event const& event, edm::EventSetup const& setup) const override; + +private: + const uint32_t size_; +}; + +ROCmTestKernelAdditionModule::ROCmTestKernelAdditionModule(edm::ParameterSet const& config) + : size_(config.getParameter("size")) {} + +void ROCmTestKernelAdditionModule::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("size", 1024 * 1024); + descriptions.addWithDefaultLabel(desc); +} + +void ROCmTestKernelAdditionModule::analyze(edm::StreamID, edm::Event const& event, edm::EventSetup const& setup) const { + // require ROCm for running + edm::Service cs; + if (not cs->enabled()) { + std::cout << "The ROCmService is disabled, the test will be skipped.\n"; + return; + } + + // random number generator with a gaussian distribution + std::random_device rd{}; + std::default_random_engine rand{rd()}; + std::normal_distribution dist{0., 1.}; + + // tolerance + constexpr float epsilon = 0.000001; + + // allocate input and output host buffers + std::vector in1_h(size_); + std::vector in2_h(size_); + std::vector out_h(size_); + + // fill the input buffers with random data, and the output buffer with zeros + for (size_t i = 0; i < size_; ++i) { + in1_h[i] = dist(rand); + in2_h[i] = dist(rand); + out_h[i] = 0.; + } + + // allocate input and output buffers on the device + float* in1_d; + float* in2_d; + float* out_d; + hipCheck(hipMalloc(&in1_d, size_ * sizeof(float))); + hipCheck(hipMalloc(&in2_d, size_ * sizeof(float))); + hipCheck(hipMalloc(&out_d, size_ * sizeof(float))); + + // copy the input data to the device + hipCheck(hipMemcpy(in1_d, in1_h.data(), size_ * sizeof(float), hipMemcpyHostToDevice)); + hipCheck(hipMemcpy(in2_d, in2_h.data(), size_ * sizeof(float), hipMemcpyHostToDevice)); + + // fill the output buffer with zeros + hipCheck(hipMemset(out_d, 0, size_ * sizeof(float))); + + // launch the 1-dimensional kernel for vector addition + HeterogeneousCoreROCmTestKernelPlugins::wrapper_add_vectors_f(in1_d, in2_d, out_d, size_); + + // copy the results from the device to the host + hipCheck(hipMemcpy(out_h.data(), out_d, size_ * sizeof(float), hipMemcpyDeviceToHost)); + + // wait for all the operations to complete + hipCheck(hipDeviceSynchronize()); + + // check the results + for (size_t i = 0; i < size_; ++i) { + float sum = in1_h[i] + in2_h[i]; + assert(out_h[i] < sum + epsilon); + assert(out_h[i] > sum - epsilon); + } + + std::cout << "All tests passed.\n"; +} + +#include "FWCore/Framework/interface/MakerMacros.h" +DEFINE_FWK_MODULE(ROCmTestKernelAdditionModule); diff --git a/HeterogeneousTest/ROCmKernel/src/DeviceAdditionKernel.hip.cc b/HeterogeneousTest/ROCmKernel/src/DeviceAdditionKernel.hip.cc new file mode 100644 index 0000000000000..6928328a2446b --- /dev/null +++ b/HeterogeneousTest/ROCmKernel/src/DeviceAdditionKernel.hip.cc @@ -0,0 +1,24 @@ +#include + +#include + +#include "HeterogeneousTest/ROCmDevice/interface/DeviceAddition.h" +#include "HeterogeneousTest/ROCmKernel/interface/DeviceAdditionKernel.h" + +namespace cms::rocmtest { + + __global__ void kernel_add_vectors_f(const float* __restrict__ in1, + const float* __restrict__ in2, + float* __restrict__ out, + size_t size) { + add_vectors_f(in1, in2, out, size); + } + + __global__ void kernel_add_vectors_d(const double* __restrict__ in1, + const double* __restrict__ in2, + double* __restrict__ out, + size_t size) { + add_vectors_d(in1, in2, out, size); + } + +} // namespace cms::rocmtest diff --git a/HeterogeneousTest/ROCmKernel/test/BuildFile.xml b/HeterogeneousTest/ROCmKernel/test/BuildFile.xml new file mode 100644 index 0000000000000..893f80e2ba861 --- /dev/null +++ b/HeterogeneousTest/ROCmKernel/test/BuildFile.xml @@ -0,0 +1,10 @@ + + + + + + + + + + diff --git a/HeterogeneousTest/ROCmKernel/test/testDeviceAdditionKernel.hip.cc b/HeterogeneousTest/ROCmKernel/test/testDeviceAdditionKernel.hip.cc new file mode 100644 index 0000000000000..a31ff6d61d5f4 --- /dev/null +++ b/HeterogeneousTest/ROCmKernel/test/testDeviceAdditionKernel.hip.cc @@ -0,0 +1,73 @@ +#include +#include +#include +#include + +#define CATCH_CONFIG_MAIN +#include + +#include + +#include "HeterogeneousTest/ROCmKernel/interface/DeviceAdditionKernel.h" +#include "HeterogeneousCore/ROCmUtilities/interface/hipCheck.h" +#include "HeterogeneousCore/ROCmUtilities/interface/requireDevices.h" + +TEST_CASE("HeterogeneousTest/ROCmKernel test", "[rocmTestKernelAdditionKernel]") { + cms::rocmtest::requireDevices(); + + // random number generator with a gaussian distribution + std::random_device rd{}; + std::default_random_engine rand{rd()}; + std::normal_distribution dist{0., 1.}; + + // tolerance + constexpr float epsilon = 0.000001; + + // buffer size + constexpr size_t size = 1024 * 1024; + + // allocate input and output host buffers + std::vector in1_h(size); + std::vector in2_h(size); + std::vector out_h(size); + + // fill the input buffers with random data, and the output buffer with zeros + for (size_t i = 0; i < size; ++i) { + in1_h[i] = dist(rand); + in2_h[i] = dist(rand); + out_h[i] = 0.; + } + + SECTION("Test add_vectors_f") { + // allocate input and output buffers on the device + float* in1_d; + float* in2_d; + float* out_d; + REQUIRE_NOTHROW(hipCheck(hipMalloc(&in1_d, size * sizeof(float)))); + REQUIRE_NOTHROW(hipCheck(hipMalloc(&in2_d, size * sizeof(float)))); + REQUIRE_NOTHROW(hipCheck(hipMalloc(&out_d, size * sizeof(float)))); + + // copy the input data to the device + REQUIRE_NOTHROW(hipCheck(hipMemcpy(in1_d, in1_h.data(), size * sizeof(float), hipMemcpyHostToDevice))); + REQUIRE_NOTHROW(hipCheck(hipMemcpy(in2_d, in2_h.data(), size * sizeof(float), hipMemcpyHostToDevice))); + + // fill the output buffer with zeros + REQUIRE_NOTHROW(hipCheck(hipMemset(out_d, 0, size * sizeof(float)))); + + // launch the 1-dimensional kernel for vector addition + cms::rocmtest::kernel_add_vectors_f<<<32, 32>>>(in1_d, in2_d, out_d, size); + REQUIRE_NOTHROW(hipCheck(hipGetLastError())); + + // copy the results from the device to the host + REQUIRE_NOTHROW(hipCheck(hipMemcpy(out_h.data(), out_d, size * sizeof(float), hipMemcpyDeviceToHost))); + + // wait for all the operations to complete + REQUIRE_NOTHROW(hipCheck(hipDeviceSynchronize())); + + // check the results + for (size_t i = 0; i < size; ++i) { + float sum = in1_h[i] + in2_h[i]; + CHECK_THAT(out_h[i], Catch::Matchers::WithinAbs(sum, epsilon)); + } + } +} diff --git a/HeterogeneousTest/ROCmKernel/test/testROCmTestKernelAdditionModule.py b/HeterogeneousTest/ROCmKernel/test/testROCmTestKernelAdditionModule.py new file mode 100644 index 0000000000000..b05991338da3b --- /dev/null +++ b/HeterogeneousTest/ROCmKernel/test/testROCmTestKernelAdditionModule.py @@ -0,0 +1,15 @@ +import FWCore.ParameterSet.Config as cms + +process = cms.Process('TestROCmTestKernelAdditionModule') + +process.source = cms.Source('EmptySource') + +process.ROCmService = cms.Service('ROCmService') + +process.rocmTestKernelAdditionModule = cms.EDAnalyzer('ROCmTestKernelAdditionModule', + size = cms.uint32( 1024*1024 ) +) + +process.path = cms.Path(process.rocmTestKernelAdditionModule) + +process.maxEvents.input = 1 diff --git a/HeterogeneousTest/ROCmOpaque/BuildFile.xml b/HeterogeneousTest/ROCmOpaque/BuildFile.xml new file mode 100644 index 0000000000000..1ce9781f55dab --- /dev/null +++ b/HeterogeneousTest/ROCmOpaque/BuildFile.xml @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/HeterogeneousTest/ROCmOpaque/README.md b/HeterogeneousTest/ROCmOpaque/README.md new file mode 100644 index 0000000000000..4da89f879e12d --- /dev/null +++ b/HeterogeneousTest/ROCmOpaque/README.md @@ -0,0 +1,46 @@ +# Introduction + +The packages `HeterogeneousTest/ROCmDevice`, `HeterogeneousTest/ROCmKernel`, +`HeterogeneousTest/ROCmWrapper` and `HeterogeneousTest/ROCmOpaque` implement a set of libraries, +plugins and tests to exercise the build rules for ROCm. +In particular, these tests show what is supported and what are the limitations implementing +ROCm-based libraries, and using them from multiple plugins. + + +# `HeterogeneousTest/ROCmOpaque` + +The package `HeterogeneousTest/ROCmOpaque` implements a non-ROCm aware library, with functions that +call the wrappers defined in the `HeterogeneousTest/ROCmWrapper` library: +```c++ +namespace cms::cudatest { + + void opaque_add_vectors_f(...); + void opaque_add_vectors_d(...); + +} // namespace cms::cudatest +``` + +The `plugins` directory implements the `ROCmTestOpqaueAdditionModule` `EDAnalyzer` that calls the +function defined in this library. This plugin shows how the function can be used directly from a +host-only, non-ROCm aware plugin. + +The `test` directory implements the `testCudaDeviceAdditionOpqaue` test binary that calls the +function defined in this library, and shows how they can be used directly from a host-only, non-ROCm +aware application. +It also contains the `testROCmTestOpqaueAdditionModule.py` python configuration to exercise the +`ROCmTestOpqaueAdditionModule` module. + + +# Other packages + +For various ways in which this library and plugin can be tested, see also the other +`HeterogeneousTest/ROCm...` packages: + - [`HeterogeneousTest/ROCmDevice/README.md`](../../HeterogeneousTest/ROCmDevice/README.md) + - [`HeterogeneousTest/ROCmKernel/README.md`](../../HeterogeneousTest/ROCmKernel/README.md) + - [`HeterogeneousTest/ROCmWrapper/README.md`](../../HeterogeneousTest/ROCmWrapper/README.md) + + +# Combining plugins + +`HeterogeneousTest/ROCmOpaque/test` contains also the `testROCmTestAdditionModules.py` python +configuration that exercise all four plugins in a single application. diff --git a/HeterogeneousTest/ROCmOpaque/interface/DeviceAdditionOpaque.h b/HeterogeneousTest/ROCmOpaque/interface/DeviceAdditionOpaque.h new file mode 100644 index 0000000000000..9d4a314bd17c9 --- /dev/null +++ b/HeterogeneousTest/ROCmOpaque/interface/DeviceAdditionOpaque.h @@ -0,0 +1,14 @@ +#ifndef HeterogeneousTest_ROCmOpaque_interface_DeviceAdditionOpaque_h +#define HeterogeneousTest_ROCmOpaque_interface_DeviceAdditionOpaque_h + +#include + +namespace cms::rocmtest { + + void opqaue_add_vectors_f(const float* in1, const float* in2, float* out, size_t size); + + void opqaue_add_vectors_d(const double* in1, const double* in2, double* out, size_t size); + +} // namespace cms::rocmtest + +#endif // HeterogeneousTest_ROCmOpaque_interface_DeviceAdditionOpaque_h diff --git a/HeterogeneousTest/ROCmOpaque/plugins/BuildFile.xml b/HeterogeneousTest/ROCmOpaque/plugins/BuildFile.xml new file mode 100644 index 0000000000000..91fdd97658824 --- /dev/null +++ b/HeterogeneousTest/ROCmOpaque/plugins/BuildFile.xml @@ -0,0 +1,12 @@ + + + + + + + + + + + + diff --git a/HeterogeneousTest/ROCmOpaque/plugins/ROCmTestOpaqueAdditionModule.cc b/HeterogeneousTest/ROCmOpaque/plugins/ROCmTestOpaqueAdditionModule.cc new file mode 100644 index 0000000000000..901b6eac51122 --- /dev/null +++ b/HeterogeneousTest/ROCmOpaque/plugins/ROCmTestOpaqueAdditionModule.cc @@ -0,0 +1,81 @@ +#include +#include +#include +#include +#include + +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/global/EDAnalyzer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/ROCmServices/interface/ROCmService.h" +#include "HeterogeneousTest/ROCmOpaque/interface/DeviceAdditionOpaque.h" + +class ROCmTestOpaqueAdditionModule : public edm::global::EDAnalyzer<> { +public: + explicit ROCmTestOpaqueAdditionModule(edm::ParameterSet const& config); + ~ROCmTestOpaqueAdditionModule() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + + void analyze(edm::StreamID, edm::Event const& event, edm::EventSetup const& setup) const override; + +private: + const uint32_t size_; +}; + +ROCmTestOpaqueAdditionModule::ROCmTestOpaqueAdditionModule(edm::ParameterSet const& config) + : size_(config.getParameter("size")) {} + +void ROCmTestOpaqueAdditionModule::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("size", 1024 * 1024); + descriptions.addWithDefaultLabel(desc); +} + +void ROCmTestOpaqueAdditionModule::analyze(edm::StreamID, edm::Event const& event, edm::EventSetup const& setup) const { + // require ROCm for running + edm::Service cs; + if (not cs->enabled()) { + std::cout << "The ROCmService is disabled, the test will be skipped.\n"; + return; + } + + // random number generator with a gaussian distribution + std::random_device rd{}; + std::default_random_engine rand{rd()}; + std::normal_distribution dist{0., 1.}; + + // tolerance + constexpr float epsilon = 0.000001; + + // allocate input and output host buffers + std::vector in1(size_); + std::vector in2(size_); + std::vector out(size_); + + // fill the input buffers with random data, and the output buffer with zeros + for (size_t i = 0; i < size_; ++i) { + in1[i] = dist(rand); + in2[i] = dist(rand); + out[i] = 0.; + } + + // launch the 1-dimensional kernel for vector addition + cms::rocmtest::opqaue_add_vectors_f(in1.data(), in2.data(), out.data(), size_); + + // check the results + for (size_t i = 0; i < size_; ++i) { + float sum = in1[i] + in2[i]; + assert(out[i] < sum + epsilon); + assert(out[i] > sum - epsilon); + } + + std::cout << "All tests passed.\n"; +} + +#include "FWCore/Framework/interface/MakerMacros.h" +DEFINE_FWK_MODULE(ROCmTestOpaqueAdditionModule); diff --git a/HeterogeneousTest/ROCmOpaque/src/DeviceAdditionOpaque.cc b/HeterogeneousTest/ROCmOpaque/src/DeviceAdditionOpaque.cc new file mode 100644 index 0000000000000..3de89369df8a9 --- /dev/null +++ b/HeterogeneousTest/ROCmOpaque/src/DeviceAdditionOpaque.cc @@ -0,0 +1,73 @@ +#include + +#include + +#include "HeterogeneousTest/ROCmWrapper/interface/DeviceAdditionWrapper.h" +#include "HeterogeneousTest/ROCmOpaque/interface/DeviceAdditionOpaque.h" +#include "HeterogeneousCore/ROCmUtilities/interface/hipCheck.h" + +namespace cms::rocmtest { + + void opqaue_add_vectors_f(const float* in1_h, const float* in2_h, float* out_h, size_t size) { + // allocate input and output buffers on the device + float* in1_d; + float* in2_d; + float* out_d; + hipCheck(hipMalloc(&in1_d, size * sizeof(float))); + hipCheck(hipMalloc(&in2_d, size * sizeof(float))); + hipCheck(hipMalloc(&out_d, size * sizeof(float))); + + // copy the input data to the device + hipCheck(hipMemcpy(in1_d, in1_h, size * sizeof(float), hipMemcpyHostToDevice)); + hipCheck(hipMemcpy(in2_d, in2_h, size * sizeof(float), hipMemcpyHostToDevice)); + + // fill the output buffer with zeros + hipCheck(hipMemset(out_d, 0, size * sizeof(float))); + + // launch the 1-dimensional kernel for vector addition + wrapper_add_vectors_f(in1_d, in2_d, out_d, size); + + // copy the results from the device to the host + hipCheck(hipMemcpy(out_h, out_d, size * sizeof(float), hipMemcpyDeviceToHost)); + + // wait for all the operations to complete + hipCheck(hipDeviceSynchronize()); + + // free the input and output buffers on the device + hipCheck(hipFree(in1_d)); + hipCheck(hipFree(in2_d)); + hipCheck(hipFree(out_d)); + } + + void opqaue_add_vectors_d(const double* in1_h, const double* in2_h, double* out_h, size_t size) { + // allocate input and output buffers on the device + double* in1_d; + double* in2_d; + double* out_d; + hipCheck(hipMalloc(&in1_d, size * sizeof(double))); + hipCheck(hipMalloc(&in2_d, size * sizeof(double))); + hipCheck(hipMalloc(&out_d, size * sizeof(double))); + + // copy the input data to the device + hipCheck(hipMemcpy(in1_d, in1_h, size * sizeof(double), hipMemcpyHostToDevice)); + hipCheck(hipMemcpy(in2_d, in2_h, size * sizeof(double), hipMemcpyHostToDevice)); + + // fill the output buffer with zeros + hipCheck(hipMemset(out_d, 0, size * sizeof(double))); + + // launch the 1-dimensional kernel for vector addition + wrapper_add_vectors_d(in1_d, in2_d, out_d, size); + + // copy the results from the device to the host + hipCheck(hipMemcpy(out_h, out_d, size * sizeof(double), hipMemcpyDeviceToHost)); + + // wait for all the operations to complete + hipCheck(hipDeviceSynchronize()); + + // free the input and output buffers on the device + hipCheck(hipFree(in1_d)); + hipCheck(hipFree(in2_d)); + hipCheck(hipFree(out_d)); + } + +} // namespace cms::rocmtest diff --git a/HeterogeneousTest/ROCmOpaque/test/BuildFile.xml b/HeterogeneousTest/ROCmOpaque/test/BuildFile.xml new file mode 100644 index 0000000000000..a26e1a8a43b05 --- /dev/null +++ b/HeterogeneousTest/ROCmOpaque/test/BuildFile.xml @@ -0,0 +1,11 @@ + + + + + + + + + + + diff --git a/HeterogeneousTest/ROCmOpaque/test/testDeviceAdditionOpaque.cc b/HeterogeneousTest/ROCmOpaque/test/testDeviceAdditionOpaque.cc new file mode 100644 index 0000000000000..c3ea68dbce85d --- /dev/null +++ b/HeterogeneousTest/ROCmOpaque/test/testDeviceAdditionOpaque.cc @@ -0,0 +1,48 @@ +#include +#include +#include +#include + +#define CATCH_CONFIG_MAIN +#include + +#include "HeterogeneousTest/ROCmOpaque/interface/DeviceAdditionOpaque.h" +#include "HeterogeneousCore/ROCmUtilities/interface/requireDevices.h" + +TEST_CASE("HeterogeneousTest/ROCmOpaque test", "[rocmTestOpaqueAdditionOpaque]") { + cms::rocmtest::requireDevices(); + + // random number generator with a gaussian distribution + std::random_device rd{}; + std::default_random_engine rand{rd()}; + std::normal_distribution dist{0., 1.}; + + // tolerance + constexpr float epsilon = 0.000001; + + // buffer size + constexpr size_t size = 1024 * 1024; + + // allocate input and output host buffers + std::vector in1(size); + std::vector in2(size); + std::vector out(size); + + // fill the input buffers with random data, and the output buffer with zeros + for (size_t i = 0; i < size; ++i) { + in1[i] = dist(rand); + in2[i] = dist(rand); + out[i] = 0.; + } + + SECTION("Test add_vectors_f") { + // launch the 1-dimensional kernel for vector addition + REQUIRE_NOTHROW(cms::rocmtest::opqaue_add_vectors_f(in1.data(), in2.data(), out.data(), size)); + + // check the results + for (size_t i = 0; i < size; ++i) { + float sum = in1[i] + in2[i]; + CHECK_THAT(out[i], Catch::Matchers::WithinAbs(sum, epsilon)); + } + } +} diff --git a/HeterogeneousTest/ROCmOpaque/test/testROCmTestAdditionModules.py b/HeterogeneousTest/ROCmOpaque/test/testROCmTestAdditionModules.py new file mode 100644 index 0000000000000..2ae6853a8e7ee --- /dev/null +++ b/HeterogeneousTest/ROCmOpaque/test/testROCmTestAdditionModules.py @@ -0,0 +1,31 @@ +import FWCore.ParameterSet.Config as cms + +process = cms.Process('TestROCmTestOpaqueAdditionModule') + +process.source = cms.Source('EmptySource') + +process.ROCmService = cms.Service('ROCmService') + +process.rocmTestDeviceAdditionModule = cms.EDAnalyzer('ROCmTestDeviceAdditionModule', + size = cms.uint32( 1024*1024 ) +) + +process.rocmTestKernelAdditionModule = cms.EDAnalyzer('ROCmTestKernelAdditionModule', + size = cms.uint32( 1024*1024 ) +) + +process.rocmTestWrapperAdditionModule = cms.EDAnalyzer('ROCmTestWrapperAdditionModule', + size = cms.uint32( 1024*1024 ) +) + +process.rocmTestOpaqueAdditionModule = cms.EDAnalyzer('ROCmTestOpaqueAdditionModule', + size = cms.uint32( 1024*1024 ) +) + +process.path = cms.Path( + process.rocmTestDeviceAdditionModule + + process.rocmTestKernelAdditionModule + + process.rocmTestWrapperAdditionModule + + process.rocmTestOpaqueAdditionModule) + +process.maxEvents.input = 1 diff --git a/HeterogeneousTest/ROCmOpaque/test/testROCmTestOpaqueAdditionModule.py b/HeterogeneousTest/ROCmOpaque/test/testROCmTestOpaqueAdditionModule.py new file mode 100644 index 0000000000000..05c4bf20d3f17 --- /dev/null +++ b/HeterogeneousTest/ROCmOpaque/test/testROCmTestOpaqueAdditionModule.py @@ -0,0 +1,15 @@ +import FWCore.ParameterSet.Config as cms + +process = cms.Process('TestROCmTestOpaqueAdditionModule') + +process.source = cms.Source('EmptySource') + +process.ROCmService = cms.Service('ROCmService') + +process.rocmTestOpaqueAdditionModule = cms.EDAnalyzer('ROCmTestOpaqueAdditionModule', + size = cms.uint32( 1024*1024 ) +) + +process.path = cms.Path(process.rocmTestOpaqueAdditionModule) + +process.maxEvents.input = 1 diff --git a/HeterogeneousTest/ROCmWrapper/BuildFile.xml b/HeterogeneousTest/ROCmWrapper/BuildFile.xml new file mode 100644 index 0000000000000..4e9ba08b32e57 --- /dev/null +++ b/HeterogeneousTest/ROCmWrapper/BuildFile.xml @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/HeterogeneousTest/ROCmWrapper/README.md b/HeterogeneousTest/ROCmWrapper/README.md new file mode 100644 index 0000000000000..4ccdcea86958f --- /dev/null +++ b/HeterogeneousTest/ROCmWrapper/README.md @@ -0,0 +1,48 @@ +# Introduction + +The packages `HeterogeneousTest/ROCmDevice`, `HeterogeneousTest/ROCmKernel`, +`HeterogeneousTest/ROCmWrapper` and `HeterogeneousTest/ROCmOpaque` implement a set of libraries, +plugins and tests to exercise the build rules for ROCm. +In particular, these tests show what is supported and what are the limitations implementing +ROCm-based libraries, and using them from multiple plugins. + + +# `HeterogeneousTest/ROCmWrapper` + +The package `HeterogeneousTest/ROCmWrapper` implements a library that defines and exports host-side +wrappers that launch the kernels defined in the `HeterogeneousTest/ROCmKernel` library: +```c++ +namespace cms::cudatest { + + void wrapper_add_vectors_f(...); + void wrapper_add_vectors_d(...); + +} // namespace cms::cudatest +``` +These wrappers can be used from host-only, non-ROCm aware libraries, plugins and applications. They +can be linked with the standard host linker. + +The `plugins` directory implements the `ROCmTestWrapperAdditionModule` `EDAnalyzer` that calls the +wrappers defined in this library. This plugin shows how the wrappers can be used directly from a +host-only, non-ROCm aware plugin. + +The `test` directory implements the `testCudaDeviceAdditionWrapper` test binary that calls the +wrappers defined in this library, and shows how they can be used directly from a host-only, non-ROCm +aware application. +It also contains the `testROCmTestWrapperAdditionModule.py` python configuration to exercise the +`ROCmTestWrapperAdditionModule` module. + + +# Other packages + +For various ways in which this library and plugin can be tested, see also the other +`HeterogeneousTest/ROCm...` packages: + - [`HeterogeneousTest/ROCmDevice/README.md`](../../HeterogeneousTest/ROCmDevice/README.md) + - [`HeterogeneousTest/ROCmKernel/README.md`](../../HeterogeneousTest/ROCmKernel/README.md) + - [`HeterogeneousTest/ROCmOpaque/README.md`](../../HeterogeneousTest/ROCmOpaque/README.md) + + +# Combining plugins + +`HeterogeneousTest/ROCmOpaque/test` contains the `testROCmTestAdditionModules.py` python +configuration that exercise all four plugins in a single application. diff --git a/HeterogeneousTest/ROCmWrapper/interface/DeviceAdditionWrapper.h b/HeterogeneousTest/ROCmWrapper/interface/DeviceAdditionWrapper.h new file mode 100644 index 0000000000000..2dc6aadaa8a96 --- /dev/null +++ b/HeterogeneousTest/ROCmWrapper/interface/DeviceAdditionWrapper.h @@ -0,0 +1,20 @@ +#ifndef HeterogeneousTest_ROCmWrapper_interface_DeviceAdditionWrapper_h +#define HeterogeneousTest_ROCmWrapper_interface_DeviceAdditionWrapper_h + +#include + +namespace cms::rocmtest { + + void wrapper_add_vectors_f(const float* __restrict__ in1, + const float* __restrict__ in2, + float* __restrict__ out, + size_t size); + + void wrapper_add_vectors_d(const double* __restrict__ in1, + const double* __restrict__ in2, + double* __restrict__ out, + size_t size); + +} // namespace cms::rocmtest + +#endif // HeterogeneousTest_ROCmWrapper_interface_DeviceAdditionWrapper_h diff --git a/HeterogeneousTest/ROCmWrapper/plugins/BuildFile.xml b/HeterogeneousTest/ROCmWrapper/plugins/BuildFile.xml new file mode 100644 index 0000000000000..e00d97b1654c2 --- /dev/null +++ b/HeterogeneousTest/ROCmWrapper/plugins/BuildFile.xml @@ -0,0 +1,12 @@ + + + + + + + + + + + + diff --git a/HeterogeneousTest/ROCmWrapper/plugins/ROCmTestWrapperAdditionModule.cc b/HeterogeneousTest/ROCmWrapper/plugins/ROCmTestWrapperAdditionModule.cc new file mode 100644 index 0000000000000..48b2b9dc91a20 --- /dev/null +++ b/HeterogeneousTest/ROCmWrapper/plugins/ROCmTestWrapperAdditionModule.cc @@ -0,0 +1,107 @@ +#include +#include +#include +#include +#include + +#include + +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/global/EDAnalyzer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/ROCmServices/interface/ROCmService.h" +#include "HeterogeneousTest/ROCmWrapper/interface/DeviceAdditionWrapper.h" +#include "HeterogeneousCore/ROCmUtilities/interface/hipCheck.h" + +class ROCmTestWrapperAdditionModule : public edm::global::EDAnalyzer<> { +public: + explicit ROCmTestWrapperAdditionModule(edm::ParameterSet const& config); + ~ROCmTestWrapperAdditionModule() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + + void analyze(edm::StreamID, edm::Event const& event, edm::EventSetup const& setup) const override; + +private: + const uint32_t size_; +}; + +ROCmTestWrapperAdditionModule::ROCmTestWrapperAdditionModule(edm::ParameterSet const& config) + : size_(config.getParameter("size")) {} + +void ROCmTestWrapperAdditionModule::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("size", 1024 * 1024); + descriptions.addWithDefaultLabel(desc); +} + +void ROCmTestWrapperAdditionModule::analyze(edm::StreamID, + edm::Event const& event, + edm::EventSetup const& setup) const { + // require ROCm for running + edm::Service cs; + if (not cs->enabled()) { + std::cout << "The ROCmService is disabled, the test will be skipped.\n"; + return; + } + + // random number generator with a gaussian distribution + std::random_device rd{}; + std::default_random_engine rand{rd()}; + std::normal_distribution dist{0., 1.}; + + // tolerance + constexpr float epsilon = 0.000001; + + // allocate input and output host buffers + std::vector in1_h(size_); + std::vector in2_h(size_); + std::vector out_h(size_); + + // fill the input buffers with random data, and the output buffer with zeros + for (size_t i = 0; i < size_; ++i) { + in1_h[i] = dist(rand); + in2_h[i] = dist(rand); + out_h[i] = 0.; + } + + // allocate input and output buffers on the device + float* in1_d; + float* in2_d; + float* out_d; + hipCheck(hipMalloc(&in1_d, size_ * sizeof(float))); + hipCheck(hipMalloc(&in2_d, size_ * sizeof(float))); + hipCheck(hipMalloc(&out_d, size_ * sizeof(float))); + + // copy the input data to the device + hipCheck(hipMemcpy(in1_d, in1_h.data(), size_ * sizeof(float), hipMemcpyHostToDevice)); + hipCheck(hipMemcpy(in2_d, in2_h.data(), size_ * sizeof(float), hipMemcpyHostToDevice)); + + // fill the output buffer with zeros + hipCheck(hipMemset(out_d, 0, size_ * sizeof(float))); + + // launch the 1-dimensional kernel for vector addition + cms::rocmtest::wrapper_add_vectors_f(in1_d, in2_d, out_d, size_); + + // copy the results from the device to the host + hipCheck(hipMemcpy(out_h.data(), out_d, size_ * sizeof(float), hipMemcpyDeviceToHost)); + + // wait for all the operations to complete + hipCheck(hipDeviceSynchronize()); + + // check the results + for (size_t i = 0; i < size_; ++i) { + float sum = in1_h[i] + in2_h[i]; + assert(out_h[i] < sum + epsilon); + assert(out_h[i] > sum - epsilon); + } + + std::cout << "All tests passed.\n"; +} + +#include "FWCore/Framework/interface/MakerMacros.h" +DEFINE_FWK_MODULE(ROCmTestWrapperAdditionModule); diff --git a/HeterogeneousTest/ROCmWrapper/src/DeviceAdditionWrapper.hip.cc b/HeterogeneousTest/ROCmWrapper/src/DeviceAdditionWrapper.hip.cc new file mode 100644 index 0000000000000..fcf7d41bd4974 --- /dev/null +++ b/HeterogeneousTest/ROCmWrapper/src/DeviceAdditionWrapper.hip.cc @@ -0,0 +1,29 @@ +#include + +#include + +#include "HeterogeneousTest/ROCmKernel/interface/DeviceAdditionKernel.h" +#include "HeterogeneousTest/ROCmWrapper/interface/DeviceAdditionWrapper.h" +#include "HeterogeneousCore/ROCmUtilities/interface/hipCheck.h" + +namespace cms::rocmtest { + + void wrapper_add_vectors_f(const float* __restrict__ in1, + const float* __restrict__ in2, + float* __restrict__ out, + size_t size) { + // launch the 1-dimensional kernel for vector addition + kernel_add_vectors_f<<<32, 32>>>(in1, in2, out, size); + hipCheck(hipGetLastError()); + } + + void wrapper_add_vectors_d(const double* __restrict__ in1, + const double* __restrict__ in2, + double* __restrict__ out, + size_t size) { + // launch the 1-dimensional kernel for vector addition + kernel_add_vectors_d<<<32, 32>>>(in1, in2, out, size); + hipCheck(hipGetLastError()); + } + +} // namespace cms::rocmtest diff --git a/HeterogeneousTest/ROCmWrapper/test/BuildFile.xml b/HeterogeneousTest/ROCmWrapper/test/BuildFile.xml new file mode 100644 index 0000000000000..6cc8125fcaa27 --- /dev/null +++ b/HeterogeneousTest/ROCmWrapper/test/BuildFile.xml @@ -0,0 +1,10 @@ + + + + + + + + + + diff --git a/HeterogeneousTest/ROCmWrapper/test/testDeviceAdditionWrapper.cc b/HeterogeneousTest/ROCmWrapper/test/testDeviceAdditionWrapper.cc new file mode 100644 index 0000000000000..b34cd285b7373 --- /dev/null +++ b/HeterogeneousTest/ROCmWrapper/test/testDeviceAdditionWrapper.cc @@ -0,0 +1,72 @@ +#include +#include +#include +#include + +#define CATCH_CONFIG_MAIN +#include + +#include + +#include "HeterogeneousTest/ROCmWrapper/interface/DeviceAdditionWrapper.h" +#include "HeterogeneousCore/ROCmUtilities/interface/hipCheck.h" +#include "HeterogeneousCore/ROCmUtilities/interface/requireDevices.h" + +TEST_CASE("HeterogeneousTest/ROCmWrapper test", "[rocmTestWrapperAdditionWrapper]") { + cms::rocmtest::requireDevices(); + + // random number generator with a gaussian distribution + std::random_device rd{}; + std::default_random_engine rand{rd()}; + std::normal_distribution dist{0., 1.}; + + // tolerance + constexpr float epsilon = 0.000001; + + // buffer size + constexpr size_t size = 1024 * 1024; + + // allocate input and output host buffers + std::vector in1_h(size); + std::vector in2_h(size); + std::vector out_h(size); + + // fill the input buffers with random data, and the output buffer with zeros + for (size_t i = 0; i < size; ++i) { + in1_h[i] = dist(rand); + in2_h[i] = dist(rand); + out_h[i] = 0.; + } + + SECTION("Test add_vectors_f") { + // allocate input and output buffers on the device + float* in1_d; + float* in2_d; + float* out_d; + REQUIRE_NOTHROW(hipCheck(hipMalloc(&in1_d, size * sizeof(float)))); + REQUIRE_NOTHROW(hipCheck(hipMalloc(&in2_d, size * sizeof(float)))); + REQUIRE_NOTHROW(hipCheck(hipMalloc(&out_d, size * sizeof(float)))); + + // copy the input data to the device + REQUIRE_NOTHROW(hipCheck(hipMemcpy(in1_d, in1_h.data(), size * sizeof(float), hipMemcpyHostToDevice))); + REQUIRE_NOTHROW(hipCheck(hipMemcpy(in2_d, in2_h.data(), size * sizeof(float), hipMemcpyHostToDevice))); + + // fill the output buffer with zeros + REQUIRE_NOTHROW(hipCheck(hipMemset(out_d, 0, size * sizeof(float)))); + + // launch the 1-dimensional kernel for vector addition + REQUIRE_NOTHROW(cms::rocmtest::wrapper_add_vectors_f(in1_d, in2_d, out_d, size)); + + // copy the results from the device to the host + REQUIRE_NOTHROW(hipCheck(hipMemcpy(out_h.data(), out_d, size * sizeof(float), hipMemcpyDeviceToHost))); + + // wait for all the operations to complete + REQUIRE_NOTHROW(hipCheck(hipDeviceSynchronize())); + + // check the results + for (size_t i = 0; i < size; ++i) { + float sum = in1_h[i] + in2_h[i]; + CHECK_THAT(out_h[i], Catch::Matchers::WithinAbs(sum, epsilon)); + } + } +} diff --git a/HeterogeneousTest/ROCmWrapper/test/testROCmTestWrapperAdditionModule.py b/HeterogeneousTest/ROCmWrapper/test/testROCmTestWrapperAdditionModule.py new file mode 100644 index 0000000000000..b493b484ed82a --- /dev/null +++ b/HeterogeneousTest/ROCmWrapper/test/testROCmTestWrapperAdditionModule.py @@ -0,0 +1,15 @@ +import FWCore.ParameterSet.Config as cms + +process = cms.Process('TestROCmTestWrapperAdditionModule') + +process.source = cms.Source('EmptySource') + +process.ROCmService = cms.Service('ROCmService') + +process.rocmTestWrapperAdditionModule = cms.EDAnalyzer('ROCmTestWrapperAdditionModule', + size = cms.uint32( 1024*1024 ) +) + +process.path = cms.Path(process.rocmTestWrapperAdditionModule) + +process.maxEvents.input = 1