diff --git a/projects/hip-tests/catch/perftests/memory/hipPerfBufferCopySpeed.cc b/projects/hip-tests/catch/perftests/memory/hipPerfBufferCopySpeed.cc index 5158200741e..41c900f61be 100644 --- a/projects/hip-tests/catch/perftests/memory/hipPerfBufferCopySpeed.cc +++ b/projects/hip-tests/catch/perftests/memory/hipPerfBufferCopySpeed.cc @@ -23,20 +23,22 @@ THE SOFTWARE. * @ingroup perfMemoryTest * `hipMemcpyAsync(void* dst, const void* src, size_t count, * hipMemcpyKind kind, hipStream_t stream = 0)` - - * Copies data between host and device. + * Copies data between host and device, or device to device etc. */ - +#include #include +#include +#include // Add this at the top if not already included +#define ENABLE_DEBUG 1 #define NUM_SIZES 9 -// 4KB, 8KB, 64KB, 256KB, 1 MB, 4MB, 16 MB, 16MB+10 -static const unsigned int Sizes[NUM_SIZES] = {4096, 8192, 65536, 262144, 524288, - 1048576, 4194304, 16777216, 16777216 + 10}; - +// 4KB, 8KB, 64KB, 256KB, 1 MB, 4MB, 16 MB, 16MB+10, 128 MB, 512 MB +static const unsigned int Sizes[NUM_SIZES] = {4096, 8192, 65536, 1048576, 4194304, + 16777216, 16777216 + 10, 134217728, 536870912}; static const unsigned int Iterations[2] = {1, 1000}; -#define BUF_TYPES 4 -// 16 ways to combine 4 different buffer types +#define BUF_TYPES 5 +// 25 ways to combine 5 different buffer types #define NUM_SUBTESTS (BUF_TYPES * BUF_TYPES) static void setData(void* ptr, unsigned int size, char value) { @@ -57,165 +59,335 @@ static void checkData(void* ptr, unsigned int size, char value) { } static bool hipPerfBufferCopySpeed_test(int p_tests) { + int testIdx = 0; unsigned int bufSize_; unsigned int numIter; bool hostMalloc[2] = {false}; bool hostRegister[2] = {false}; bool unpinnedMalloc[2] = {false}; + bool deviceMallocUncached[2] = {false}; void* memptr[2] = {NULL}; void* alignedmemptr[2] = {NULL}; void* srcBuffer = NULL; void* dstBuffer = NULL; - int numTests = (p_tests == -1) ? (NUM_SIZES * NUM_SUBTESTS * 2 - 1) : p_tests; - int test = (p_tests == -1) ? 0 : p_tests; - - for (; test <= numTests; test++) { - unsigned int srcTest = (test / NUM_SIZES) % BUF_TYPES; - unsigned int dstTest = (test / (NUM_SIZES * BUF_TYPES)) % BUF_TYPES; - bufSize_ = Sizes[test % NUM_SIZES]; - hostMalloc[0] = hostMalloc[1] = false; - hostRegister[0] = hostRegister[1] = false; - unpinnedMalloc[0] = unpinnedMalloc[1] = false; - srcBuffer = dstBuffer = 0; - memptr[0] = memptr[1] = NULL; - alignedmemptr[0] = alignedmemptr[1] = NULL; - - if (srcTest == 3) { - hostRegister[0] = true; - } else if (srcTest == 2) { - hostMalloc[0] = true; - } else if (srcTest == 1) { - unpinnedMalloc[0] = true; - } - - if (dstTest == 1) { - unpinnedMalloc[1] = true; - } else if (dstTest == 2) { - hostMalloc[1] = true; - } else if (dstTest == 3) { - hostRegister[1] = true; - } - - numIter = Iterations[test / (NUM_SIZES * NUM_SUBTESTS)]; - - if (hostMalloc[0]) { - HIP_CHECK(hipHostMalloc(reinterpret_cast(&srcBuffer), bufSize_, 0)); - setData(srcBuffer, bufSize_, 0xd0); - } else if (hostRegister[0]) { - memptr[0] = malloc(bufSize_ + 4096); - alignedmemptr[0] = reinterpret_cast(memptr[0]); - srcBuffer = alignedmemptr[0]; - setData(srcBuffer, bufSize_, 0xd0); - HIP_CHECK(hipHostRegister(srcBuffer, bufSize_, 0)); - } else if (unpinnedMalloc[0]) { - memptr[0] = malloc(bufSize_ + 4096); - alignedmemptr[0] = reinterpret_cast(memptr[0]); - srcBuffer = alignedmemptr[0]; - setData(srcBuffer, bufSize_, 0xd0); - } else { + // int test = (p_tests == -1) ? 0 : p_tests; + int numDevices = 0; + HIP_CHECK(hipGetDeviceCount(&numDevices)); + int test = 0; + // 1. Run all P2P for all sizes + if (numDevices >= 2) { + for (int sizeIdx = 0; sizeIdx < NUM_SIZES; ++sizeIdx) { + if (p_tests != -1 && testIdx != p_tests) { + ++testIdx; + continue; + } + unsigned int bufSize_ = Sizes[sizeIdx]; + void* srcBuffer = NULL; + void* dstBuffer = NULL; + numIter = Iterations[1]; + HIP_CHECK(hipSetDevice(0)); HIP_CHECK(hipMalloc(&srcBuffer, bufSize_)); - HIP_CHECK(hipMemset(srcBuffer, 0xd0, bufSize_)); - } - - if (hostMalloc[1]) { - HIP_CHECK(hipHostMalloc(reinterpret_cast(&dstBuffer), bufSize_, 0)); - } else if (hostRegister[1]) { - memptr[1] = malloc(bufSize_ + 4096); - alignedmemptr[1] = reinterpret_cast(memptr[1]); - dstBuffer = alignedmemptr[1]; - HIP_CHECK(hipHostRegister(dstBuffer, bufSize_, 0)); - } else if (unpinnedMalloc[1]) { - memptr[1] = malloc(bufSize_ + 4096); - alignedmemptr[1] = reinterpret_cast(memptr[1]); - dstBuffer = alignedmemptr[1]; - } else { + hipError_t errMemset = hipMemset(srcBuffer, 0xd0, bufSize_); + if (errMemset != hipSuccess) { + hipFree(srcBuffer); + continue; + } + HIP_CHECK(hipSetDevice(1)); HIP_CHECK(hipMalloc(&dstBuffer, bufSize_)); + int canAccessPeer01 = 0, canAccessPeer10 = 0; + HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer01, 0, 1)); + HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer10, 1, 0)); + if (!canAccessPeer01 || !canAccessPeer10) { + HIP_CHECK(hipSetDevice(0)); + hipDeviceDisablePeerAccess(1); + HIP_CHECK(hipSetDevice(1)); + hipDeviceDisablePeerAccess(0); + HIP_CHECK(hipSetDevice(0)); + HIP_CHECK(hipFree(srcBuffer)); + HIP_CHECK(hipSetDevice(1)); + HIP_CHECK(hipFree(dstBuffer)); + HIP_CHECK(hipSetDevice(0)); + continue; + } + HIP_CHECK(hipSetDevice(0)); + hipError_t errPeer0 = hipDeviceEnablePeerAccess(1, 0); + HIP_CHECK(hipSetDevice(1)); + hipError_t errPeer1 = hipDeviceEnablePeerAccess(0, 0); + if (errPeer0 != hipSuccess || errPeer1 != hipSuccess) { + HIP_CHECK(hipSetDevice(0)); + HIP_CHECK(hipFree(srcBuffer)); + HIP_CHECK(hipSetDevice(1)); + HIP_CHECK(hipFree(dstBuffer)); + HIP_CHECK(hipSetDevice(0)); + continue; + } + HIP_CHECK(hipMemcpyPeer(dstBuffer, 1, srcBuffer, 0, bufSize_)); + auto all_start = std::chrono::steady_clock::now(); + for (unsigned int i = 0; i < numIter; i++) { + HIP_CHECK(hipMemcpyPeerAsync(dstBuffer, 1, srcBuffer, 0, bufSize_, 0)); + } + HIP_CHECK(hipSetDevice(1)); + HIP_CHECK(hipDeviceSynchronize()); + hipError_t syncErr = hipGetLastError(); + if (syncErr != hipSuccess) { + DEBUG_PRINT("WARNING: hipDeviceSynchronize error: %s\n", hipGetErrorString(syncErr)); + } + HIP_CHECK(hipDeviceSynchronize()); + auto all_end = std::chrono::steady_clock::now(); + std::chrono::duration elapsed_secs = all_end - all_start; + DEBUG_PRINT("Elapsed seconds: %f\n", elapsed_secs.count()); + double bufSizeWithIter = static_cast(bufSize_); + DEBUG_PRINT("%f\n", bufSizeWithIter); + double perf_pre = bufSizeWithIter / elapsed_secs.count(); + DEBUG_PRINT("%f\n", perf_pre); + double perf = perf_pre * static_cast(numIter); + DEBUG_PRINT("%f\n", perf_pre); + perf *= static_cast(1e-09); + CONSOLE_PRINT("%f\n", perf); + CONSOLE_PRINT("HIPPerfBufferCopySpeedP2P[%d] %u s:dev0 d:dev1 i:%u (GB/s) perf %f\n", test, + bufSize_, numIter, (float)perf); + CONSOLE_PRINT("P2P,%d,%u,dev0,dev1,%u,%f\n", test, bufSize_, numIter, (float)perf); + test++; + void* temp = malloc(bufSize_ + 4096); + void* chkBuf = reinterpret_cast(temp); + HIP_CHECK(hipMemcpy(chkBuf, dstBuffer, bufSize_, hipMemcpyDefault)); + checkData(chkBuf, bufSize_, 0xd0); + free(temp); + HIP_CHECK(hipSetDevice(0)); + hipDeviceDisablePeerAccess(1); + HIP_CHECK(hipSetDevice(1)); + hipDeviceDisablePeerAccess(0); + HIP_CHECK(hipSetDevice(0)); + HIP_CHECK(hipFree(srcBuffer)); + HIP_CHECK(hipSetDevice(1)); + HIP_CHECK(hipFree(dstBuffer)); + HIP_CHECK(hipSetDevice(0)); + ++testIdx; } - - // warm up - HIP_CHECK(hipMemcpy(dstBuffer, srcBuffer, bufSize_, hipMemcpyDefault)); - - // measure performance based on host time + } + int dstTest = 0; + int srcTest = 0; + // 2. Run all NoCU (intra) for all sizes + for (int sizeIdx = 0; sizeIdx < NUM_SIZES; ++sizeIdx) { + if (p_tests != -1 && testIdx != p_tests) { + ++testIdx; + continue; + } + unsigned int bufSize_ = Sizes[sizeIdx]; + void* srcBuffer = NULL; + void* dstBuffer = NULL; + numIter = Iterations[1]; + HIP_CHECK(hipSetDevice(0)); + HIP_CHECK(hipMalloc(&srcBuffer, bufSize_)); + HIP_CHECK(hipMalloc(&dstBuffer, bufSize_)); + HIP_CHECK(hipMemset(srcBuffer, 0xd0, bufSize_)); + HIP_CHECK(hipMemcpy(dstBuffer, srcBuffer, bufSize_, hipMemcpyDeviceToDeviceNoCU)); auto all_start = std::chrono::steady_clock::now(); - for (unsigned int i = 0; i < numIter; i++) { - HIP_CHECK(hipMemcpyAsync(dstBuffer, srcBuffer, bufSize_, hipMemcpyDefault, NULL)); + HIP_CHECK(hipMemcpyAsync(dstBuffer, srcBuffer, bufSize_, hipMemcpyDeviceToDeviceNoCU, NULL)); + } + HIP_CHECK(hipDeviceSynchronize()); + hipError_t syncErr = hipGetLastError(); + if (syncErr != hipSuccess) { + DEBUG_PRINT("WARNING: hipDeviceSynchronize error: %s\n", hipGetErrorString(syncErr)); } HIP_CHECK(hipDeviceSynchronize()); - auto all_end = std::chrono::steady_clock::now(); std::chrono::duration elapsed_secs = all_end - all_start; - - // read speed in GB/s - double perf = (static_cast(bufSize_ * numIter) * static_cast(1e-09)) / - elapsed_secs.count(); - - const char* strSrc = NULL; - const char* strDst = NULL; - if (hostMalloc[0]) - strSrc = "hHM"; - else if (hostRegister[0]) - strSrc = "hHR"; - else if (unpinnedMalloc[0]) - strSrc = "unp"; - else - strSrc = "hM"; - - if (hostMalloc[1]) - strDst = "hHM"; - else if (hostRegister[1]) - strDst = "hHR"; - else if (unpinnedMalloc[1]) - strDst = "unp"; - else - strDst = "hM"; - - // Double results when src and dst are both on device - if ((!hostMalloc[0] && !hostRegister[0] && !unpinnedMalloc[0]) && - (!hostMalloc[1] && !hostRegister[1] && !unpinnedMalloc[1])) - perf *= 2.0; - // Double results when src and dst are both in sysmem - if ((hostMalloc[0] || hostRegister[0] || unpinnedMalloc[0]) && - (hostMalloc[1] || hostRegister[1] || unpinnedMalloc[1])) - perf *= 2.0; - - INFO("HIPPerfBufferCopySpeed[" << test << "]\t( " << bufSize_ << ")\ts:" << strSrc - << " d:" << strDst << "\ti:" << numIter << "\t(GB/s) perf\t" - << (float)perf); - - // Verification + DEBUG_PRINT("Elapsed seconds: %f\n", elapsed_secs.count()); + double bufSizeWithIter = static_cast(bufSize_); + DEBUG_PRINT("%f\n", bufSizeWithIter); + double perf_pre = bufSizeWithIter / elapsed_secs.count(); + DEBUG_PRINT("%f\n", perf_pre); + double perf = perf_pre * static_cast(numIter); + DEBUG_PRINT("%f\n", perf_pre); + perf *= static_cast(1e-09); + CONSOLE_PRINT("%f\n", perf); + CONSOLE_PRINT("HIPPerfBufferCopySpeedNoCU[%d] %u s:dev0 d:dev0 i:%u (GB/s) perf %f\n", test, + bufSize_, numIter, (float)perf); + CONSOLE_PRINT("NoCU,%d,%u,dev0,dev0,%u,%f\n", test, bufSize_, numIter, (float)perf); + test++; void* temp = malloc(bufSize_ + 4096); void* chkBuf = reinterpret_cast(temp); HIP_CHECK(hipMemcpy(chkBuf, dstBuffer, bufSize_, hipMemcpyDefault)); checkData(chkBuf, bufSize_, 0xd0); free(temp); + HIP_CHECK(hipFree(srcBuffer)); + HIP_CHECK(hipFree(dstBuffer)); + ++testIdx; + } - // Free src - if (hostMalloc[0]) { - HIP_CHECK(hipHostFree(srcBuffer)); - } else if (hostRegister[0]) { - HIP_CHECK(hipHostUnregister(srcBuffer)); - free(memptr[0]); - } else if (unpinnedMalloc[0]) { - free(memptr[0]); - } else { - HIP_CHECK(hipFree(srcBuffer)); - } - - // Free dst - if (hostMalloc[1]) { - HIP_CHECK(hipHostFree(dstBuffer)); - } else if (hostRegister[1]) { - HIP_CHECK(hipHostUnregister(dstBuffer)); - free(memptr[1]); - } else if (unpinnedMalloc[1]) { - free(memptr[1]); - } else { - HIP_CHECK(hipFree(dstBuffer)); + // 3. Run all buffer type (default) for all sizes + + for (int srcTest = 0; srcTest < BUF_TYPES; ++srcTest) { + for (int dstTest = 0; dstTest < BUF_TYPES; ++dstTest) { + for (int sizeIdx = 0; sizeIdx < NUM_SIZES; ++sizeIdx) { + if (p_tests != -1 && testIdx != p_tests) { + ++testIdx; + continue; + } + unsigned int bufSize_ = Sizes[sizeIdx]; + bool hostMalloc[2] = {false}; + bool hostRegister[2] = {false}; + bool unpinnedMalloc[2] = {false}; + bool deviceMallocUncached[2] = {false}; + void* memptr[2] = {NULL}; + void* alignedmemptr[2] = {NULL}; + void* srcBuffer = NULL; + void* dstBuffer = NULL; + numIter = Iterations[1]; + if (srcTest == 4) { + deviceMallocUncached[0] = true; + } else if (srcTest == 3) { + hostRegister[0] = true; + } else if (srcTest == 2) { + hostMalloc[0] = true; + } else if (srcTest == 1) { + unpinnedMalloc[0] = true; + } + if (dstTest == 1) { + unpinnedMalloc[1] = true; + } else if (dstTest == 2) { + hostMalloc[1] = true; + } else if (dstTest == 3) { + hostRegister[1] = true; + } else if (dstTest == 4) { + deviceMallocUncached[1] = true; + } + if (deviceMallocUncached[0]) { + HIP_CHECK(hipExtMallocWithFlags(&srcBuffer, bufSize_, hipDeviceMallocUncached)); + HIP_CHECK(hipMemset(srcBuffer, 0xd0, bufSize_)); + } else if (hostMalloc[0]) { + HIP_CHECK(hipHostMalloc(reinterpret_cast(&srcBuffer), bufSize_, 0)); + setData(srcBuffer, bufSize_, 0xd0); + } else if (hostRegister[0]) { + memptr[0] = malloc(bufSize_ + 4096); + uintptr_t raw = reinterpret_cast(memptr[0]); + uintptr_t aligned = (raw + 4095) & ~static_cast(4095); + alignedmemptr[0] = reinterpret_cast(aligned); + srcBuffer = alignedmemptr[0]; + setData(srcBuffer, bufSize_, 0xd0); + HIP_CHECK(hipHostRegister(srcBuffer, bufSize_, 0)); + } else if (unpinnedMalloc[0]) { + memptr[0] = malloc(bufSize_ + 4096); + uintptr_t raw = reinterpret_cast(memptr[0]); + uintptr_t aligned = (raw + 4095) & ~static_cast(4095); + alignedmemptr[0] = reinterpret_cast(aligned); + srcBuffer = alignedmemptr[0]; + setData(srcBuffer, bufSize_, 0xd0); + } else { + HIP_CHECK(hipMalloc(&srcBuffer, bufSize_)); + HIP_CHECK(hipMemset(srcBuffer, 0xd0, bufSize_)); + } + if (deviceMallocUncached[1]) { + HIP_CHECK(hipExtMallocWithFlags(&dstBuffer, bufSize_, hipDeviceMallocUncached)); + } else if (hostMalloc[1]) { + HIP_CHECK(hipHostMalloc(reinterpret_cast(&dstBuffer), bufSize_, 0)); + } else if (hostRegister[1]) { + memptr[1] = malloc(bufSize_ + 4096); + uintptr_t raw = reinterpret_cast(memptr[1]); + uintptr_t aligned = (raw + 4095) & ~static_cast(4095); + alignedmemptr[1] = reinterpret_cast(aligned); + dstBuffer = alignedmemptr[1]; + HIP_CHECK(hipHostRegister(dstBuffer, bufSize_, 0)); + } else if (unpinnedMalloc[1]) { + memptr[1] = malloc(bufSize_ + 4096); + uintptr_t raw = reinterpret_cast(memptr[1]); + uintptr_t aligned = (raw + 4095) & ~static_cast(4095); + alignedmemptr[1] = reinterpret_cast(aligned); + dstBuffer = alignedmemptr[1]; + } else { + HIP_CHECK(hipMalloc(&dstBuffer, bufSize_)); + } + HIP_CHECK(hipMemcpy(dstBuffer, srcBuffer, bufSize_, hipMemcpyDefault)); + auto all_start = std::chrono::steady_clock::now(); + for (unsigned int i = 0; i < numIter; i++) { + HIP_CHECK(hipMemcpyAsync(dstBuffer, srcBuffer, bufSize_, hipMemcpyDefault, NULL)); + } + HIP_CHECK(hipDeviceSynchronize()); + hipError_t syncErr = hipGetLastError(); + if (syncErr != hipSuccess) { + DEBUG_PRINT("WARNING: hipDeviceSynchronize error: %s\n", hipGetErrorString(syncErr)); + } + HIP_CHECK(hipDeviceSynchronize()); + auto all_end = std::chrono::steady_clock::now(); + std::chrono::duration elapsed_secs = all_end - all_start; + DEBUG_PRINT("Elapsed seconds: %f\n", elapsed_secs.count()); + double bufSizeWithIter = static_cast(bufSize_); + DEBUG_PRINT("%f\n", bufSizeWithIter); + double perf_pre = bufSizeWithIter / elapsed_secs.count(); + DEBUG_PRINT("%f\n", perf_pre); + double perf = perf_pre * static_cast(numIter); + DEBUG_PRINT("%f\n", perf_pre); + perf *= static_cast(1e-09); + CONSOLE_PRINT("%f\n", perf); + const char* strSrc = NULL; + const char* strDst = NULL; + if (deviceMallocUncached[0]) + strSrc = "hMUC"; + else if (hostMalloc[0]) + strSrc = "hHM"; + else if (hostRegister[0]) + strSrc = "hHR"; + else if (unpinnedMalloc[0]) + strSrc = "unp"; + else + strSrc = "hM"; + if (deviceMallocUncached[1]) + strDst = "hMUC"; + else if (hostMalloc[1]) + strDst = "hHM"; + else if (hostRegister[1]) + strDst = "hHR"; + else if (unpinnedMalloc[1]) + strDst = "unp"; + else + strDst = "hM"; + if ((!hostMalloc[0] && !hostRegister[0] && !unpinnedMalloc[0]) && + (!hostMalloc[1] && !hostRegister[1] && !unpinnedMalloc[1])) + perf *= 2.0; + if ((hostMalloc[0] || hostRegister[0] || unpinnedMalloc[0]) && + (hostMalloc[1] || hostRegister[1] || unpinnedMalloc[1])) + perf *= 2.0; + CONSOLE_PRINT("HIPPerfBufferCopySpeed[%d] %u s:%s d:%s i:%u (GB/s) perf %f\n", test, + bufSize_, strSrc, strDst, numIter, (float)perf); + std::cout << "Type," << bufSize_ << "," << strSrc << "," << strDst << "," << numIter << "," + << (float)perf << std::endl; + test++; + void* temp = malloc(bufSize_ + 4096); + void* chkBuf = reinterpret_cast(temp); + HIP_CHECK(hipMemcpy(chkBuf, dstBuffer, bufSize_, hipMemcpyDefault)); + checkData(chkBuf, bufSize_, 0xd0); + free(temp); + if (deviceMallocUncached[0]) { + HIP_CHECK(hipFree(srcBuffer)); + } else if (hostMalloc[0]) { + HIP_CHECK(hipHostFree(srcBuffer)); + } else if (hostRegister[0]) { + HIP_CHECK(hipHostUnregister(srcBuffer)); + free(memptr[0]); + } else if (unpinnedMalloc[0]) { + free(memptr[0]); + } else { + HIP_CHECK(hipFree(srcBuffer)); + } + if (deviceMallocUncached[1]) { + HIP_CHECK(hipFree(dstBuffer)); + } else if (hostMalloc[1]) { + HIP_CHECK(hipHostFree(dstBuffer)); + } else if (hostRegister[1]) { + HIP_CHECK(hipHostUnregister(dstBuffer)); + free(memptr[1]); + } else if (unpinnedMalloc[1]) { + free(memptr[1]); + } else { + HIP_CHECK(hipFree(dstBuffer)); + } + } + ++testIdx; } } - return true; } @@ -234,7 +406,6 @@ static bool hipPerfBufferCopySpeed_test(int p_tests) { TEST_CASE("Perf_hipPerfBufferCopySpeed_test") { int numDevices = 0; HIP_CHECK(hipGetDeviceCount(&numDevices)); - if (numDevices <= 0) { SUCCEED( "Skipped testcase hipPerfBufferCopySpeed as" @@ -245,17 +416,18 @@ TEST_CASE("Perf_hipPerfBufferCopySpeed_test") { hipDeviceProp_t props; HIP_CHECK(hipGetDeviceProperties(&props, deviceId)); - INFO("hipPerfBufferCopySpeed - info: Set device to " - << deviceId << " : " << props.name - << "Legend: unp - unpinned(malloc)," - " hM - hipMalloc(device)\n hHR - hipHostRegister(pinned)," - " hHM - hipHostMalloc(prePinned)\n"); + CONSOLE_PRINT( + "hipPerfBufferCopySpeed - info: Set device to %d : %s\nLegend: unp - unpinned(malloc), hM " + "- hipMalloc(device), hHR - hipHostRegister(pinned), hHM - hipHostMalloc(prePinned), hMUC " + "- hipMallocUncached\n", + deviceId, props.name); - REQUIRE(true == hipPerfBufferCopySpeed_test(1)); + // Run the test with all sizes and buffer types, alter p_tests to run a specific test + REQUIRE(true == hipPerfBufferCopySpeed_test(-1)); } } /** * End doxygen group perfMemoryTest. * @} - */ + */ \ No newline at end of file diff --git a/projects/hip-tests/perftests/memory/hipPerfBufferCopySpeed.cpp b/projects/hip-tests/perftests/memory/hipPerfBufferCopySpeed.cpp deleted file mode 100644 index ac1bf8c4ceb..00000000000 --- a/projects/hip-tests/perftests/memory/hipPerfBufferCopySpeed.cpp +++ /dev/null @@ -1,282 +0,0 @@ -/* -Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -/* HIT_START - * BUILD: %t %s ../../src/test_common.cpp ../../src/timer.cpp - * TEST: %t - * HIT_END - */ - -#include -#include -#include -#include - -#include "timer.h" -#include "test_common.h" - -// Quiet pesky warnings -#ifdef WIN_OS -#define SNPRINTF sprintf_s -#else -#define SNPRINTF snprintf -#endif - -#define NUM_SIZES 11 -// 4KB, 8KB, 64KB, 256KB, 1 MB, 4MB, 16 MB, 16MB+10 -static const unsigned int Sizes[NUM_SIZES] = { - 8, 64, 1024, 8192, 65536, 262144, 524288, 1048576, 4194304, 16777216, 16777216 + 10}; - -static const unsigned int Iterations[2] = {1, 1000}; - -#define BUF_TYPES 5 -// 16 ways to combine 4 different buffer types -#define NUM_SUBTESTS (BUF_TYPES * BUF_TYPES) - -#define CHECK_RESULT(test, msg) \ - if ((test)) { \ - printf("\n%s\n", msg); \ - abort(); \ - } - -void setData(void* ptr, unsigned int size, char value) { - char* ptr2 = (char*)ptr; - for (unsigned int i = 0; i < size; i++) { - ptr2[i] = value; - } -} - -void checkData(void* ptr, unsigned int size, char value) { - char* ptr2 = (char*)ptr; - for (unsigned int i = 0; i < size; i++) { - if (ptr2[i] != value) { - printf("Data validation failed at %d! Got 0x%08x\n", i, ptr2[i]); - printf("Expected 0x%08x\n", value); - CHECK_RESULT(true, "Data validation failed!"); - break; - } - } -} - - -int main(int argc, char* argv[]) { - HipTest::parseStandardArguments(argc, argv, true); - - hipError_t err = hipSuccess; - hipDeviceProp_t props = {0}; - hipGetDeviceProperties(&props, p_gpuDevice); - CHECK_RESULT(err != hipSuccess, "hipGetDeviceProperties failed"); - printf("Set device to %d : %s\n", p_gpuDevice, props.name); - printf("Legend: unp - unpinned(malloc), hM - hipMalloc(device)\n"); - printf(" hHR - hipHostRegister(pinned), hHM - hipHostMalloc(prePinned)\n"); - printf(" hMUC - hipMallocUncached\n"); - err = hipSetDevice(p_gpuDevice); - CHECK_RESULT(err != hipSuccess, "hipSetDevice failed"); - - unsigned int bufSize_; - bool hostMalloc[2] = {false}; - bool hostRegister[2] = {false}; - bool unpinnedMalloc[2] = {false}; - bool deviceMallocUncached[2] = {false}; - unsigned int numIter; - void* memptr[2] = {NULL}; - void* alignedmemptr[2] = {NULL}; - void* srcBuffer = NULL; - void* dstBuffer = NULL; - - int numTests = (p_tests == -1) ? (NUM_SIZES * NUM_SUBTESTS * 2 - 1) : p_tests; - int test = (p_tests == -1) ? 0 : p_tests; - - for (; test <= numTests; test++) { - unsigned int srcTest = (test / NUM_SIZES) % BUF_TYPES; - unsigned int dstTest = (test / (NUM_SIZES * BUF_TYPES)) % BUF_TYPES; - bufSize_ = Sizes[test % NUM_SIZES]; - hostMalloc[0] = hostMalloc[1] = false; - hostRegister[0] = hostRegister[1] = false; - unpinnedMalloc[0] = unpinnedMalloc[1] = false; - deviceMallocUncached[0] = deviceMallocUncached[1] = false; - srcBuffer = dstBuffer = 0; - memptr[0] = memptr[1] = NULL; - alignedmemptr[0] = alignedmemptr[1] = NULL; - - if (srcTest == 4) { - deviceMallocUncached[0] = true; - } else if (srcTest == 3) { - hostRegister[0] = true; - } else if (srcTest == 2) { - hostMalloc[0] = true; - } else if (srcTest == 1) { - unpinnedMalloc[0] = true; - } - - - if (dstTest == 1) { - unpinnedMalloc[1] = true; - } else if (dstTest == 2) { - hostMalloc[1] = true; - } else if (dstTest == 3) { - hostRegister[1] = true; - } else if (dstTest == 4) { - deviceMallocUncached[1] = true; - } - - numIter = Iterations[test / (NUM_SIZES * NUM_SUBTESTS)]; - - if (hostMalloc[0]) { - err = hipHostMalloc((void**)&srcBuffer, bufSize_, 0); - setData(srcBuffer, bufSize_, 0xd0); - CHECK_RESULT(err != hipSuccess, "hipHostMalloc failed"); - } else if (hostRegister[0]) { - memptr[0] = malloc(bufSize_ + 4096); - alignedmemptr[0] = (void*)(((size_t)memptr[0] + 4095) & ~4095); - srcBuffer = alignedmemptr[0]; - setData(srcBuffer, bufSize_, 0xd0); - err = hipHostRegister(srcBuffer, bufSize_, 0); - CHECK_RESULT(err != hipSuccess, "hipHostRegister failed"); - } else if (unpinnedMalloc[0]) { - memptr[0] = malloc(bufSize_ + 4096); - alignedmemptr[0] = (void*)(((size_t)memptr[0] + 4095) & ~4095); - srcBuffer = alignedmemptr[0]; - setData(srcBuffer, bufSize_, 0xd0); - } else if (deviceMallocUncached[0]) { - err = hipExtMallocWithFlags(&srcBuffer, bufSize_, hipDeviceMallocUncached); - CHECK_RESULT(err != hipSuccess, "hipExtMallocWithFlags failed"); - err = hipMemset(srcBuffer, 0xd0, bufSize_); - CHECK_RESULT(err != hipSuccess, "hipMemset failed") - } else { - err = hipMalloc(&srcBuffer, bufSize_); - CHECK_RESULT(err != hipSuccess, "hipMalloc failed"); - err = hipMemset(srcBuffer, 0xd0, bufSize_); - CHECK_RESULT(err != hipSuccess, "hipMemset failed"); - } - - if (hostMalloc[1]) { - err = hipHostMalloc((void**)&dstBuffer, bufSize_, 0); - CHECK_RESULT(err != hipSuccess, "hipHostMalloc failed"); - } else if (hostRegister[1]) { - memptr[1] = malloc(bufSize_ + 4096); - alignedmemptr[1] = (void*)(((size_t)memptr[1] + 4095) & ~4095); - dstBuffer = alignedmemptr[1]; - err = hipHostRegister(dstBuffer, bufSize_, 0); - CHECK_RESULT(err != hipSuccess, "hipHostRegister failed"); - } else if (unpinnedMalloc[1]) { - memptr[1] = malloc(bufSize_ + 4096); - alignedmemptr[1] = (void*)(((size_t)memptr[1] + 4095) & ~4095); - dstBuffer = alignedmemptr[1]; - } else if (deviceMallocUncached[1]) { - err = hipExtMallocWithFlags(&dstBuffer, bufSize_, hipDeviceMallocUncached); - CHECK_RESULT(err != hipSuccess, "hipExtMallocWithFlags failed"); - } else { - err = hipMalloc(&dstBuffer, bufSize_); - CHECK_RESULT(err != hipSuccess, "hipMalloc failed"); - } - - CPerfCounter timer; - - // warm up - err = hipMemcpy(dstBuffer, srcBuffer, bufSize_, hipMemcpyDefault); - CHECK_RESULT(err, "hipMemcpy failed"); - - timer.Reset(); - timer.Start(); - for (unsigned int i = 0; i < numIter; i++) { - err = hipMemcpyAsync(dstBuffer, srcBuffer, bufSize_, hipMemcpyDefault, NULL); - CHECK_RESULT(err, "hipMemcpyAsync failed"); - } - err = hipDeviceSynchronize(); - CHECK_RESULT(err, "hipDeviceSynchronize failed"); - timer.Stop(); - double sec = timer.GetElapsedTime(); - - // Buffer copy bandwidth in GB/s - double perf = ((double)bufSize_ * numIter * (double)(1e-09)) / sec; - - const char* strSrc = NULL; - const char* strDst = NULL; - if (hostMalloc[0]) - strSrc = "hHM"; - else if (hostRegister[0]) - strSrc = "hHR"; - else if (unpinnedMalloc[0]) - strSrc = "unp"; - else if (deviceMallocUncached[0]) - strSrc = "hMUC"; - else - strSrc = "hM"; - - if (hostMalloc[1]) - strDst = "hHM"; - else if (hostRegister[1]) - strDst = "hHR"; - else if (unpinnedMalloc[1]) - strDst = "unp"; - else if (deviceMallocUncached[1]) - strDst = "hMUC"; - else - strDst = "hM"; - // Double results when src and dst are both on device - if ((!hostMalloc[0] && !hostRegister[0] && !unpinnedMalloc[0]) && - (!hostMalloc[1] && !hostRegister[1] && !unpinnedMalloc[1])) - perf *= 2.0; - // Double results when src and dst are both in sysmem - if ((hostMalloc[0] || hostRegister[0] || unpinnedMalloc[0]) && - (hostMalloc[1] || hostRegister[1] || unpinnedMalloc[1])) - perf *= 2.0; - - char buf[256]; - SNPRINTF(buf, sizeof(buf), - "HIPPerfBufferCopySpeed[%d]\t(%8d bytes)\ts:%s d:%s\ti:%4d\t(GB/s) perf\t%f", test, - bufSize_, strSrc, strDst, numIter, (float)perf); - printf("%s\n", buf); - - // Verification - void* temp = malloc(bufSize_ + 4096); - void* chkBuf = (void*)(((size_t)temp + 4095) & ~4095); - err = hipMemcpy(chkBuf, dstBuffer, bufSize_, hipMemcpyDefault); - CHECK_RESULT(err, "hipMemcpy failed"); - checkData(chkBuf, bufSize_, 0xd0); - free(temp); - - // Free src - if (hostMalloc[0]) { - hipHostFree(srcBuffer); - } else if (hostRegister[0]) { - hipHostUnregister(srcBuffer); - free(memptr[0]); - } else if (unpinnedMalloc[0]) { - free(memptr[0]); - } else { - hipFree(srcBuffer); - } - - // Free dst - if (hostMalloc[1]) { - hipHostFree(dstBuffer); - } else if (hostRegister[1]) { - hipHostUnregister(dstBuffer); - free(memptr[1]); - } else if (unpinnedMalloc[1]) { - free(memptr[1]); - } else { - hipFree(dstBuffer); - } - } - - passed(); -}