diff --git a/catch/hipTestMain/config/config_amd_linux b/catch/hipTestMain/config/config_amd_linux
index 16340e45f..8d891b9e5 100644
--- a/catch/hipTestMain/config/config_amd_linux
+++ b/catch/hipTestMain/config/config_amd_linux
@@ -744,6 +744,8 @@
 "Unit_hipGetLastError_KernelFailure_ValidAndInvalidOperations",
 "Unit_hipGetLastError_KernelFailure_TwoDevices",
 "Unit_hipGetLastError_KernelFailure_TwoStreams",
+"=== Enable the below test when multi-device graph launches are fully supported ===",
+"Unit_hipGraphInstantiateWithFlags_DependencyGraphDeviceCtxtChg",
 #endif
 #if defined gfx90a || defined gfx942 || defined gfx950
 "=== SWDEV-443630 : Below test failed in stress test on 19/01/24 ===",
diff --git a/catch/include/hip_test_common.hh b/catch/include/hip_test_common.hh
index d2cd7ec4e..cf0b2f392 100644
--- a/catch/include/hip_test_common.hh
+++ b/catch/include/hip_test_common.hh
@@ -44,6 +44,29 @@ THE SOFTWARE.

 #define HIP_PRINT_STATUS(status) INFO(hipGetErrorName(status) << " at line: " << __LINE__);

+#define CHAR_BUF_SIZE 512
+
+// CONSOLE_PRINT: printf-style output to stdout; a trailing newline is always appended.
+#define CONSOLE_PRINT(fmt, ...) \
+  do { \
+    std::printf(fmt "\n", ##__VA_ARGS__); \
+  } while (0)
+
+// DEBUG_PRINT: If ENABLE_DEBUG is defined, prints immediately to console.
+// Otherwise the message is handed to Catch2 and is reported only if the next assertion fails.
+// UNSCOPED_INFO is required here: INFO() is bound to the enclosing scope, so inside the
+// do/while wrapper it would be discarded before any assertion could report it.
+#if defined(ENABLE_DEBUG)
+#define DEBUG_PRINT(fmt, ...) CONSOLE_PRINT("[DEBUG]: " fmt, ##__VA_ARGS__)
+#else
+#define DEBUG_PRINT(fmt, ...) \
+  do { \
+    char buf[CHAR_BUF_SIZE]; \
+    std::snprintf(buf, CHAR_BUF_SIZE, "[INFO]: " fmt, ##__VA_ARGS__); \
+    UNSCOPED_INFO(buf); \
+  } while (0)
+#endif
+
 // Not thread-safe
 #define HIP_CHECK(error) \
 { \
@@ -323,6 +346,26 @@ inline bool isPcieAtomicsSupported() {
   return pcieAtomics != 0;
 }

+// Checks whether every ordered pair of distinct devices reports peer-to-peer access.
+// Returns true when all pairs are accessible; d1 and d2 are then left untouched.
+// On the first inaccessible pair, stores it in d1 (device) / d2 (peer) and returns false.
+inline bool isP2PSupported(int& d1, int& d2) {
+  const int num_devices = HipTest::getDeviceCount();
+  for (int i = 0; i < num_devices; ++i) {
+    for (int j = 0; j < num_devices; ++j) {
+      if (i == j) continue;
+      int canAccess = 0;
+      HIP_CHECK(hipDeviceCanAccessPeer(&canAccess, i, j));
+      if (!canAccess) {
+        d1 = i;
+        d2 = j;
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
 inline bool areWarpMatchFunctionsSupported() {
   int matchFunctionsSupported = 1;
 #if HT_NVIDIA
@@ -516,6 +559,20 @@ class BlockingContext {
     return; \
   }

+// Skips the calling test (by returning from it) unless every device pair supports P2P access.
+// The device indices live in their own block so the macro does not leak locals into the test.
+#define CHECK_P2P_SUPPORT \
+  { \
+    int d1 = -1; \
+    int d2 = -1; \
+    if (!HipTest::isP2PSupported(d1, d2)) { \
+      std::string msg = "P2P access check failed between dev1:" + std::to_string(d1) + \
+                        ",dev2:" + std::to_string(d2); \
+      HipTest::HIP_SKIP_TEST(msg.c_str()); \
+      return; \
+    } \
+  }
+
 // This must be called in the beginning of warp test app's main() to indicate warp match functions
 // are supported.
#define CHECK_WARP_MATCH_FUNCTIONS_SUPPORT \ diff --git a/catch/include/hip_test_kernels.hh b/catch/include/hip_test_kernels.hh index 68f452ba5..e47614ee6 100644 --- a/catch/include/hip_test_kernels.hh +++ b/catch/include/hip_test_kernels.hh @@ -93,7 +93,12 @@ template __global__ void vector_square(const T* A_d, T* C_d, size_t size_t gputhread = (blockIdx.x * blockDim.x + threadIdx.x); size_t stride = blockDim.x * gridDim.x; for (size_t i = gputhread; i < N_ELMTS; i += stride) { +#if HT_AMD + T result = A_d[i] * A_d[i]; + __hip_atomic_store(&C_d[i], result, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); +#else C_d[i] = A_d[i] * A_d[i]; +#endif } } diff --git a/catch/multiproc/hipIpcEventHandle.cc b/catch/multiproc/hipIpcEventHandle.cc index bd6fdff29..499ef0862 100644 --- a/catch/multiproc/hipIpcEventHandle.cc +++ b/catch/multiproc/hipIpcEventHandle.cc @@ -196,6 +196,9 @@ void runMultiProcKernel(ipcEventInfo_t *shmEventInfo, int index) { } } } + for (int i = 1; i < g_processCnt; i++) { + HIP_CHECK(hipEventDestroy(event[i])); + } } else { hipEvent_t event; HIP_CHECK(hipEventCreateWithFlags(&event, diff --git a/catch/perftests/CMakeLists.txt b/catch/perftests/CMakeLists.txt index 38078c0a4..a6e4ecd91 100644 --- a/catch/perftests/CMakeLists.txt +++ b/catch/perftests/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -27,3 +27,4 @@ add_subdirectory(dispatch) add_subdirectory(compute) add_subdirectory(graph) add_subdirectory(event) +add_subdirectory(vmm) \ No newline at end of file diff --git a/catch/perftests/compute/hipPerfDotProduct.cc b/catch/perftests/compute/hipPerfDotProduct.cc index aad97f063..eddcf4bb2 100644 --- a/catch/perftests/compute/hipPerfDotProduct.cc +++ b/catch/perftests/compute/hipPerfDotProduct.cc @@ -18,10 +18,10 @@ */ /** -* @addtogroup hipPerfDotProduct hipPerfDotProduct -* @{ -* @ingroup perfComputeTest -*/ + * @addtogroup hipPerfDotProduct hipPerfDotProduct + * @{ + * @ingroup perfComputeTest + */ #include #include @@ -31,11 +31,9 @@ using namespace std; template -__launch_bounds__(BLOCKSIZE) -__global__ void vectors_not_equal(int n, - const double* __restrict__ x, - const double* __restrict__ y, - double* __restrict__ workspace) { +__launch_bounds__(BLOCKSIZE) __global__ + void vectors_not_equal(int n, const double* __restrict__ x, const double* __restrict__ y, + double* __restrict__ workspace) { int gid = blockIdx.x * blockDim.x + threadIdx.x; double sum = 0.0; @@ -93,9 +91,8 @@ __global__ void vectors_not_equal(int n, } template -__launch_bounds__(BLOCKSIZE) -__global__ void vectors_equal(int n, const double* __restrict__ x, - double* __restrict__ workspace) { +__launch_bounds__(BLOCKSIZE) __global__ + void vectors_equal(int n, const double* __restrict__ x, double* __restrict__ workspace) { int gid = blockIdx.x * blockDim.x + threadIdx.x; double sum = 0.0; @@ -129,7 +126,7 @@ __global__ void vectors_equal(int n, const double* __restrict__ x, __syncthreads(); if (threadIdx.x < 8) { - sdata[threadIdx.x] += sdata[threadIdx.x + 8]; + sdata[threadIdx.x] += sdata[threadIdx.x + 8]; } __syncthreads(); @@ -149,12 +146,11 @@ __global__ void vectors_equal(int n, const double* __restrict__ x, if 
(threadIdx.x == 0) { workspace[blockIdx.x] = sdata[0]; - } + } } template -__launch_bounds__(BLOCKSIZE) -__global__ void dot_reduction(double* __restrict__ workspace) { +__launch_bounds__(BLOCKSIZE) __global__ void dot_reduction(double* __restrict__ workspace) { __shared__ double sdata[BLOCKSIZE]; sdata[threadIdx.x] = workspace[threadIdx.x]; @@ -187,7 +183,8 @@ __global__ void dot_reduction(double* __restrict__ workspace) { if (threadIdx.x < 4) { sdata[threadIdx.x] += sdata[threadIdx.x + 4]; - } __syncthreads(); + } + __syncthreads(); if (threadIdx.x < 2) { sdata[threadIdx.x] += sdata[threadIdx.x + 2]; @@ -203,8 +200,7 @@ __global__ void dot_reduction(double* __restrict__ workspace) { } } -void computeDotProduct(int n, const double* x, const double* y, double& result, - double* workspace) { +void computeDotProduct(int n, const double* x, const double* y, double& result, double* workspace) { dim3 blocks(DOT_DIM); dim3 threadsPerBlock(DOT_DIM); @@ -225,16 +221,16 @@ void computeDotProduct(int n, const double* x, const double* y, double& result, } /** -* Test Description -* ------------------------ -* - Verify the device kernel results comparing it with the host results. -* Test source -* ------------------------ -* - perftests/compute/hipPerfDotProduct.cc -* Test requirements -* ------------------------ -* - HIP_VERSION >= 5.6 -*/ + * Test Description + * ------------------------ + * - Verify the device kernel results comparing it with the host results. 
+ * Test source + * ------------------------ + * - perftests/compute/hipPerfDotProduct.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.6 + */ TEST_CASE("Perf_hipPerfDotProduct") { int nGpu = 0; @@ -252,120 +248,120 @@ TEST_CASE("Perf_hipPerfDotProduct") { for (unsigned int testCase = 0; testCase < 3; testCase++) { vector vectorSize = {200, 300, 50}; switch (testCase) { - case 0: - nx = vectorSize[0]; - ny = vectorSize[0]; - nz = vectorSize[0]; - break; - - case 1: - nx = vectorSize[1]; - ny = vectorSize[1]; - nz = vectorSize[1]; - break; - - case 2: - nx = vectorSize[0]; - ny = vectorSize[1]; - nz = vectorSize[2]; - break; - - default: - break; - } - - int trials = 200; - int size = nx * ny * nz; + case 0: + nx = vectorSize[0]; + ny = vectorSize[0]; + nz = vectorSize[0]; + break; + + case 1: + nx = vectorSize[1]; + ny = vectorSize[1]; + nz = vectorSize[1]; + break; + + case 2: + nx = vectorSize[0]; + ny = vectorSize[1]; + nz = vectorSize[2]; + break; + + default: + break; + } - vector hx(size); - vector hy(size); - double hresult_xy = 0.0; - double hresult_xx = 0.0; + int trials = 200; + int size = nx * ny * nz; - srand(time(NULL)); + vector hx(size); + vector hy(size); + double hresult_xy = 0.0; + double hresult_xx = 0.0; - for (int i = 0; i < size; ++i) { - hx[i] = 2.0 * static_cast(rand()) / static_cast(RAND_MAX) - 1.0; - hy[i] = 2.0 * static_cast(rand()) / static_cast(RAND_MAX) - 1.0; + srand(time(NULL)); - hresult_xy += hx[i] * hy[i]; - hresult_xx += hx[i] * hx[i]; - } + for (int i = 0; i < size; ++i) { + hx[i] = 2.0 * static_cast(rand()) / static_cast(RAND_MAX) - 1.0; + hy[i] = 2.0 * static_cast(rand()) / static_cast(RAND_MAX) - 1.0; - double* dx; - double* dy; - double* workspace; - double dresult; - - HIP_CHECK(hipMalloc(reinterpret_cast(&dx), sizeof(double) * size)); - HIP_CHECK(hipMalloc(reinterpret_cast(&dy), sizeof(double) * size)); - HIP_CHECK(hipMalloc(reinterpret_cast(&workspace), sizeof(double) * DOT_DIM)); + hresult_xy 
+= hx[i] * hy[i]; + hresult_xx += hx[i] * hx[i]; + } - HIP_CHECK(hipMemcpy(dx, hx.data(), sizeof(double) * size, hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(dy, hy.data(), sizeof(double) * size, hipMemcpyHostToDevice)); + double* dx; + double* dy; + double* workspace; + double dresult; - // Warm up - computeDotProduct(size, dx, dy, dresult, workspace); - computeDotProduct(size, dx, dy, dresult, workspace); - computeDotProduct(size, dx, dy, dresult, workspace); + HIP_CHECK(hipMalloc(reinterpret_cast(&dx), sizeof(double) * size)); + HIP_CHECK(hipMalloc(reinterpret_cast(&dy), sizeof(double) * size)); + HIP_CHECK(hipMalloc(reinterpret_cast(&workspace), sizeof(double) * DOT_DIM)); - // Timed run for - HIP_CHECK(hipDeviceSynchronize()); - auto all_start = std::chrono::steady_clock::now(); + HIP_CHECK(hipMemcpy(dx, hx.data(), sizeof(double) * size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(dy, hy.data(), sizeof(double) * size, hipMemcpyHostToDevice)); - for (int i = 0; i < trials; ++i) { + // Warm up + computeDotProduct(size, dx, dy, dresult, workspace); + computeDotProduct(size, dx, dy, dresult, workspace); computeDotProduct(size, dx, dy, dresult, workspace); - } - float time = 0; - auto all_end = std::chrono::steady_clock::now(); - std::chrono::duration all_kernel_time = all_end - all_start; - time = all_kernel_time.count(); + // Timed run for + HIP_CHECK(hipDeviceSynchronize()); + auto all_start = std::chrono::steady_clock::now(); - time /= trials; + for (int i = 0; i < trials; ++i) { + computeDotProduct(size, dx, dy, dresult, workspace); + } - double bw = sizeof(double) * size * 2.0 / 1e9; - double gf = 2.0 * size / 1e9; + float time = 0; + auto all_end = std::chrono::steady_clock::now(); + std::chrono::duration all_kernel_time = all_end - all_start; + time = all_kernel_time.count(); - cout << "\nVector Size: " << size << "\n[ddot] " << time << "msec ;" << bw/ (time / 1e3) << " GByte/s ;" - << gf/(time / 1e3) << " GFlop/s" << endl; + time /= trials; - // 
Verify the device kernel results comparing it with the host results - REQUIRE(std::abs(dresult - hresult_xy) < std::max(dresult * 1e-10, 1e-8)); + double bw = sizeof(double) * size * 2.0 / 1e9; + double gf = 2.0 * size / 1e9; - // Warm up - computeDotProduct(size, dx, dx, dresult, workspace); - computeDotProduct(size, dx, dx, dresult, workspace); - computeDotProduct(size, dx, dx, dresult, workspace); + CONSOLE_PRINT("\nVector Size: %d\n[ddot] %.6f msec ; %.6f GByte/s ; %.6f GFlop/s", size, + time, bw / (time / 1e3), gf / (time / 1e3)); - // Timed run for - HIP_CHECK(hipDeviceSynchronize()); - all_start = std::chrono::steady_clock::now(); + // Verify the device kernel results comparing it with the host results + REQUIRE(std::abs(dresult - hresult_xy) < std::max(dresult * 1e-10, 1e-8)); - for (int i = 0; i < trials; ++i) { + // Warm up computeDotProduct(size, dx, dx, dresult, workspace); - } + computeDotProduct(size, dx, dx, dresult, workspace); + computeDotProduct(size, dx, dx, dresult, workspace); + + // Timed run for + HIP_CHECK(hipDeviceSynchronize()); + all_start = std::chrono::steady_clock::now(); - all_end = std::chrono::steady_clock::now(); - all_kernel_time = all_end - all_start; - time = all_kernel_time.count(); + for (int i = 0; i < trials; ++i) { + computeDotProduct(size, dx, dx, dresult, workspace); + } + + all_end = std::chrono::steady_clock::now(); + all_kernel_time = all_end - all_start; + time = all_kernel_time.count(); - time /= trials; - bw = sizeof(double) * size / 1e9; + time /= trials; + bw = sizeof(double) * size / 1e9; - cout << "[ddot] " << time << "msec ;" << bw/ (time / 1e3) << " GByte/s ;" - << gf/(time / 1e3) << " GFlop/s" << endl; + CONSOLE_PRINT("[ddot] %.6f msec ; %.6f GByte/s ; %.6f GFlop/s", time, bw / (time / 1e3), + gf / (time / 1e3)); - // Verify the device kernel results comparing it with the host results - REQUIRE(abs(dresult - hresult_xx) < max(dresult * 1e-10, 1e-8)); + // Verify the device kernel results comparing it with the 
host results + REQUIRE(abs(dresult - hresult_xx) < max(dresult * 1e-10, 1e-8)); - HIP_CHECK(hipFree(dx)); - HIP_CHECK(hipFree(dy)); - HIP_CHECK(hipFree(workspace)); + HIP_CHECK(hipFree(dx)); + HIP_CHECK(hipFree(dy)); + HIP_CHECK(hipFree(workspace)); } } /** -* End doxygen group perfComputeTest. -* @} -*/ + * End doxygen group perfComputeTest. + * @} + */ diff --git a/catch/perftests/compute/hipPerfMandelbrot.cc b/catch/perftests/compute/hipPerfMandelbrot.cc index a500b7df3..ef007e06c 100644 --- a/catch/perftests/compute/hipPerfMandelbrot.cc +++ b/catch/perftests/compute/hipPerfMandelbrot.cc @@ -18,10 +18,10 @@ */ /** -* @addtogroup hipPerfMandelbrot hipPerfMandelbrot -* @{ -* @ingroup perfComputeTest -*/ + * @addtogroup hipPerfMandelbrot hipPerfMandelbrot + * @{ + * @ingroup perfComputeTest + */ #include #include @@ -45,36 +45,35 @@ coordRec coords[] = { static unsigned int numCoords = sizeof(coords) / sizeof(coordRec); template -__global__ void float_mad_kernel(uint *out, uint width, T xPos, T yPos, - T xStep, T yStep, uint maxIter) { +__global__ void float_mad_kernel(uint* out, uint width, T xPos, T yPos, T xStep, T yStep, + uint maxIter) { int tid = (blockIdx.x * blockDim.x + threadIdx.x); int i = tid % width; int j = tid / width; - float x0 = static_cast(xPos + xStep*i); - float y0 = static_cast(yPos + yStep*j); + float x0 = static_cast(xPos + xStep * i); + float y0 = static_cast(yPos + yStep * j); float x = x0; float y = y0; uint iter = 0; float tmp; - for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) { + for (iter = 0; (x * x + y * y <= 4.0f) && (iter < maxIter); iter++) { tmp = x; x = fma(-y, y, fma(x, x, x0)); - y = fma(2.0f*tmp, y, y0); + y = fma(2.0f * tmp, y, y0); } out[tid] = iter; } template -__global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos, - T yPos, T xStep, T yStep, uint maxIter) { - +__global__ void float_mandel_unroll_kernel(uint* out, uint width, T xPos, T yPos, T xStep, T yStep, + uint maxIter) { int tid = 
(blockIdx.x * blockDim.x + threadIdx.x); int i = tid % width; int j = tid / width; - float x0 = static_cast(xPos + xStep*static_cast(i)); - float y0 = static_cast(yPos + yStep*static_cast(j)); + float x0 = static_cast(xPos + xStep * static_cast(i)); + float y0 = static_cast(yPos + yStep * static_cast(j)); float x = x0; float y = y0; @@ -84,72 +83,71 @@ __global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos, float tmp; int stay; int ccount = 0; - stay = (x*x+y*y) <= 4.0; + stay = (x * x + y * y) <= 4.0; float savx = x; float savy = y; #ifdef FAST - for (iter = 0; (iter < maxIter); iter+=16) { + for (iter = 0; (iter < maxIter); iter += 16) { #else - for (iter = 0; stay && (iter < maxIter); iter+=16) { + for (iter = 0; stay && (iter < maxIter); iter += 16) { #endif x = savx; y = savy; // Two iterations - tmp = fma(-y, y, fma(x, x, x0)); - y = fma(2.0f*x, y, y0); - x = fma(-y, y, fma(tmp, tmp, x0)); - y = fma(2.0f*tmp, y, y0); + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); // Two iterations - tmp = fma(-y, y, fma(x, x, x0)); - y = fma(2.0f*x, y, y0); - x = fma(-y, y, fma(tmp, tmp, x0)); - y = fma(2.0f*tmp, y, y0); + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); // Two iterations - tmp = fma(-y, y, fma(x, x, x0)); - y = fma(2.0f*x, y, y0); - x = fma(-y, y, fma(tmp, tmp, x0)); - y = fma(2.0f*tmp, y, y0); + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); // Two iterations - tmp = fma(-y, y, fma(x, x, x0)); - y = fma(2.0f*x, y, y0); - x = fma(-y, y, fma(tmp, tmp, x0)); - y = fma(2.0f*tmp, y, y0); + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); // Two iterations - tmp = fma(-y, y, fma(x, x, x0)); - y = fma(2.0f*x, y, y0); - x = fma(-y, 
y, fma(tmp, tmp, x0)); - y = fma(2.0f*tmp, y, y0); + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); // Two iterations - tmp = fma(-y, y, fma(x, x, x0)); - y = fma(2.0f*x, y, y0); - x = fma(-y, y, fma(tmp, tmp, x0)); - y = fma(2.0f*tmp, y, y0); + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); // Two iterations - tmp = fma(-y, y, fma(x, x, x0)); - y = fma(2.0f*x, y, y0); - x = fma(-y, y, fma(tmp, tmp, x0)); - y = fma(2.0f*tmp, y, y0); + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); // Two iterations - tmp = fma(-y, y, fma(x, x, x0)); - y = fma(2.0f*x, y, y0); - x = fma(-y, y, fma(tmp, tmp, x0)); - y = fma(2.0f*tmp, y, y0); + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); - stay = (x*x+y*y) <= 4.0; + stay = (x * x + y * y) <= 4.0; savx = (stay ? x : savx); savy = (stay ? y : savy); - ccount += stay*16; + ccount += stay * 16; #ifdef FAST - if (!stay) - break; + if (!stay) break; #endif } // Handle remainder @@ -158,10 +156,10 @@ __global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos, do { x = savx; y = savy; - stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter); + stay = ((x * x + y * y) <= 4.0) && (ccount < maxIter); tmp = x; - x = fma(-y, y, fma(x, x, x0)); - y = fma(2.0f*tmp, y, y0); + x = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * tmp, y, y0); ccount += stay; iter--; savx = (stay ? 
x : savx); @@ -172,36 +170,36 @@ __global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos, } template -__global__ void double_mad_kernel(uint *out, uint width, T xPos, T yPos, T xStep, T yStep, - uint maxIter) { +__global__ void double_mad_kernel(uint* out, uint width, T xPos, T yPos, T xStep, T yStep, + uint maxIter) { int tid = (blockIdx.x * blockDim.x + threadIdx.x); int i = tid % width; int j = tid / width; - double x0 = static_cast(xPos + xStep*i); - double y0 = static_cast(yPos + yStep*j); + double x0 = static_cast(xPos + xStep * i); + double y0 = static_cast(yPos + yStep * j); double x = x0; double y = y0; uint iter = 0; double tmp; - for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) { + for (iter = 0; (x * x + y * y <= 4.0f) && (iter < maxIter); iter++) { tmp = x; - x = fma(-y, y,fma(x, x, x0)); - y = fma(2.0f*tmp, y, y0); + x = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * tmp, y, y0); } out[tid] = iter; }; template -__global__ void double_mandel_unroll_kernel(uint *out, uint width, T xPos, - T yPos, T xStep, T yStep, uint maxIter) { +__global__ void double_mandel_unroll_kernel(uint* out, uint width, T xPos, T yPos, T xStep, T yStep, + uint maxIter) { int tid = (blockIdx.x * blockDim.x + threadIdx.x); int i = tid % width; int j = tid / width; - double x0 = static_cast(xPos + xStep*static_cast(i)); - double y0 = static_cast(yPos + yStep*static_cast(j)); + double x0 = static_cast(xPos + xStep * static_cast(i)); + double y0 = static_cast(yPos + yStep * static_cast(j)); double x = x0; double y = y0; @@ -211,13 +209,13 @@ __global__ void double_mandel_unroll_kernel(uint *out, uint width, T xPos, double tmp; int stay; int ccount = 0; - stay = (x*x+y*y) <= 4.0; + stay = (x * x + y * y) <= 4.0; double savx = x; double savy = y; #ifdef FAST - for (iter = 0; (iter < maxIter); iter+=16) + for (iter = 0; (iter < maxIter); iter += 16) #else - for (iter = 0; stay && (iter < maxIter); iter+=16) + for (iter = 0; stay && (iter < maxIter); iter 
+= 16) #endif { x = savx; @@ -225,141 +223,131 @@ __global__ void double_mandel_unroll_kernel(uint *out, uint width, T xPos, // Two iterations tmp = fma(-y, y, fma(x, x, x0)); - y = fma(2.0f*x, y, y0); - x = fma(-y, y, fma(tmp, tmp, x0)); - y = fma(2.0f*tmp, y, y0); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); // Two iterations tmp = fma(-y, y, fma(x, x, x0)); - y = fma(2.0f*x, y, y0); - x = fma(-y, y, fma(tmp, tmp, x0)); - y = fma(2.0f*tmp, y, y0); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); // Two iterations tmp = fma(-y, y, fma(x, x, x0)); - y = fma(2.0f*x, y, y0); - x = fma(-y, y, fma(tmp, tmp, x0)); - y = fma(2.0f*tmp, y, y0); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); // Two iterations - tmp = fma(-y, y, fma(x, x, x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y, y, fma(tmp, tmp, x0)); - y = fma(2.0f*tmp, y, y0); + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); // Two iterations - tmp = fma(-y, y, fma(x, x, x0)); - y = fma(2.0f*x, y, y0); - x = fma(-y, y, fma(tmp, tmp, x0)); - y = fma(2.0f*tmp, y, y0); + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); // Two iterations - tmp = fma(-y, y, fma(x, x, x0)); - y = fma(2.0f*x, y, y0); - x = fma(-y, y, fma(tmp, tmp, x0)); - y = fma(2.0f*tmp, y, y0); + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); // Two iterations - tmp = fma(-y, y, fma(x, x, x0)); - y = fma(2.0f*x, y, y0); - x = fma(-y, y, fma(tmp, tmp, x0)); - y = fma(2.0f*tmp, y, y0); + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); // Two iterations - tmp = fma(-y, y, fma(x, x, x0)); - y = 
fma(2.0f*x, y, y0); - x = fma(-y, y, fma(tmp, tmp, x0)); - y = fma(2.0f*tmp, y, y0); + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); - stay = (x*x+y*y) <= 4.0; + stay = (x * x + y * y) <= 4.0; savx = (stay ? x : savx); savy = (stay ? y : savy); - ccount += stay*16; + ccount += stay * 16; #ifdef FAST - if (!stay) - break; + if (!stay) break; #endif - } + } // Handle remainder - if (!stay) { - iter = 16; - do { - x = savx; - y = savy; - stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter); - tmp = x; - x = fma(-y,y, fma(x, x, x0)); - y = fma(2.0f*tmp,y,y0); - ccount += stay; - iter--; - savx = (stay ? x : savx); - savy = (stay ? y : savy); - } - while (stay && iter); - } - out[tid] = (uint)ccount; + if (!stay) { + iter = 16; + do { + x = savx; + y = savy; + stay = ((x * x + y * y) <= 4.0) && (ccount < maxIter); + tmp = x; + x = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * tmp, y, y0); + ccount += stay; + iter--; + savx = (stay ? x : savx); + savy = (stay ? 
y : savy); + } while (stay && iter); + } + out[tid] = (uint)ccount; }; // Expected results for each kernel run at each coord unsigned long long expectedIters[] = { - 203277748ull, 2147483648ull, 120254651ull, 203277748ull, 2147483648ull, - 120254651ull, 203277748ull, 2147483648ull, 120254651ull, 203315114ull, - 2147483648ull, 120042599ull, 203315114ull, 2147483648ull, 120042599ull, - 203280620ull, 2147483648ull, 120485704ull, 203280620ull, 2147483648ull, - 120485704ull, 203280620ull, 2147483648ull, 120485704ull, 203315114ull, - 2147483648ull, 120042599ull, 203315114ull, 2147483648ull, 120042599ull}; + 203277748ull, 2147483648ull, 120254651ull, 203277748ull, 2147483648ull, 120254651ull, + 203277748ull, 2147483648ull, 120254651ull, 203315114ull, 2147483648ull, 120042599ull, + 203315114ull, 2147483648ull, 120042599ull, 203280620ull, 2147483648ull, 120485704ull, + 203280620ull, 2147483648ull, 120485704ull, 203280620ull, 2147483648ull, 120485704ull, + 203315114ull, 2147483648ull, 120042599ull, 203315114ull, 2147483648ull, 120042599ull}; class hipPerfMandelBrot { public: hipPerfMandelBrot(); ~hipPerfMandelBrot(); - void setNumKernels(unsigned int num) { - numKernels = num; - } + void setNumKernels(unsigned int num) { numKernels = num; } - unsigned int getNumKernels() { - return numKernels; - } + unsigned int getNumKernels() { return numKernels; } - void setNumStreams(unsigned int num) { - numStreams = num; - } - unsigned int getNumStreams() { - return numStreams; - } + void setNumStreams(unsigned int num) { numStreams = num; } + unsigned int getNumStreams() { return numStreams; } void open(int deviceID); bool run(unsigned int testCase); void printResults(void); // array of funtion pointers - typedef void (hipPerfMandelBrot::*funPtr)(uint *out, uint width, float xPos, float yPos, - float xStep, float yStep, uint maxIter, hipStream_t* streams, int blocks, - int threads_per_block, int kernelCnt); + typedef void (hipPerfMandelBrot::*funPtr)(uint* out, uint width, float xPos, 
float yPos, + float xStep, float yStep, uint maxIter, + hipStream_t* streams, int blocks, int threads_per_block, + int kernelCnt); // Wrappers - void float_mad(uint *out, uint width, float xPos, float yPos, - float xStep, float yStep, uint maxIter, hipStream_t* streams, - int blocks, int threads_per_block, int kernelCnt); + void float_mad(uint* out, uint width, float xPos, float yPos, float xStep, float yStep, + uint maxIter, hipStream_t* streams, int blocks, int threads_per_block, + int kernelCnt); - void float_mandel_unroll(uint *out, uint width, float xPos, float yPos, - float xStep, float yStep, uint maxIter, hipStream_t* streams, - int blocks, int threads_per_block, int kernelCnt); + void float_mandel_unroll(uint* out, uint width, float xPos, float yPos, float xStep, float yStep, + uint maxIter, hipStream_t* streams, int blocks, int threads_per_block, + int kernelCnt); - void double_mad(uint *out, uint width, float xPos, float yPos, float xStep, - float yStep, uint maxIter, hipStream_t* streams, int blocks, - int threads_per_block, int kernelCnt); + void double_mad(uint* out, uint width, float xPos, float yPos, float xStep, float yStep, + uint maxIter, hipStream_t* streams, int blocks, int threads_per_block, + int kernelCnt); - void double_mandel_unroll(uint *out, uint width, float xPos, float yPos, float xStep, - float yStep, uint maxIter, hipStream_t* streams, int blocks, - int threads_per_block, int kernelCnt); + void double_mandel_unroll(uint* out, uint width, float xPos, float yPos, float xStep, float yStep, + uint maxIter, hipStream_t* streams, int blocks, int threads_per_block, + int kernelCnt); hipStream_t streams[2]; private: - void setData(void *ptr, unsigned int value); - void checkData(uint *ptr); + void setData(void* ptr, unsigned int value); + void checkData(uint* ptr); unsigned int numKernels; unsigned int numStreams; @@ -387,9 +375,9 @@ void hipPerfMandelBrot::open(int deviceId) { HIP_CHECK(hipSetDevice(deviceId)); hipDeviceProp_t props; 
HIP_CHECK(hipGetDeviceProperties(&props, deviceId)); - std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name - << " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId - << std::endl; + + CONSOLE_PRINT("info: running on bus 0x%x %s with %d CUs and device id: %d\n", props.pciBusID, + props.name, props.multiProcessorCount, deviceId); numCUs = props.multiProcessorCount; } @@ -397,52 +385,52 @@ void hipPerfMandelBrot::open(int deviceId) { void hipPerfMandelBrot::printResults() { int numStreams = getNumStreams(); - std::cout << "\n" <<"Measured perf for kernels in GFLOPS on " - << numStreams << " streams (s)" << std::endl; + CONSOLE_PRINT("Measured perf for kernels in GFLOPS on %d streams (s)", numStreams); - std::map>:: iterator itr; + std::map>::iterator itr; for (itr = results.begin(); itr != results.end(); itr++) { - std::cout << "\n" << std::setw(20) << itr->first << " "; - for (auto i : results[itr->first]) { - std::cout << std::setw(10) << i << " "; - } - } + CONSOLE_PRINT("\n%s ", itr->first.c_str()); + for (auto i : results[itr->first]) { + CONSOLE_PRINT("%10f ", i); + } + } results.clear(); - std::cout << std::endl; + CONSOLE_PRINT("\n"); } // Wrappers for the kernel launches -void hipPerfMandelBrot::float_mad(uint *out, uint width, float xPos, float yPos, float xStep, - float yStep, uint maxIter, hipStream_t* streams, - int blocks, int threads_per_block, int kernelCnt) { +void hipPerfMandelBrot::float_mad(uint* out, uint width, float xPos, float yPos, float xStep, + float yStep, uint maxIter, hipStream_t* streams, int blocks, + int threads_per_block, int kernelCnt) { int streamCnt = getNumStreams(); hipLaunchKernelGGL(float_mad_kernel, dim3(blocks), dim3(threads_per_block), 0, - streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, - maxIter); + streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter); } -void hipPerfMandelBrot::float_mandel_unroll(uint *out, uint 
width, float xPos, float yPos, - float xStep, float yStep, uint maxIter, hipStream_t * streams, - int blocks, int threads_per_block, int kernelCnt) { +void hipPerfMandelBrot::float_mandel_unroll(uint* out, uint width, float xPos, float yPos, + float xStep, float yStep, uint maxIter, + hipStream_t* streams, int blocks, int threads_per_block, + int kernelCnt) { int streamCnt = getNumStreams(); hipLaunchKernelGGL(float_mandel_unroll_kernel, dim3(blocks), dim3(threads_per_block), 0, - streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter); + streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter); } -void hipPerfMandelBrot::double_mad(uint *out, uint width, float xPos, float yPos, - float xStep, float yStep, uint maxIter, hipStream_t * streams, - int blocks, int threads_per_block, int kernelCnt) { +void hipPerfMandelBrot::double_mad(uint* out, uint width, float xPos, float yPos, float xStep, + float yStep, uint maxIter, hipStream_t* streams, int blocks, + int threads_per_block, int kernelCnt) { int streamCnt = getNumStreams(); hipLaunchKernelGGL(double_mad_kernel, dim3(blocks), dim3(threads_per_block), 0, - streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter); + streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter); } -void hipPerfMandelBrot::double_mandel_unroll(uint *out, uint width, float xPos, float yPos, - float xStep, float yStep, uint maxIter, hipStream_t * streams, - int blocks, int threads_per_block, int kernelCnt) { +void hipPerfMandelBrot::double_mandel_unroll(uint* out, uint width, float xPos, float yPos, + float xStep, float yStep, uint maxIter, + hipStream_t* streams, int blocks, + int threads_per_block, int kernelCnt) { int streamCnt = getNumStreams(); hipLaunchKernelGGL(float_mandel_unroll_kernel, dim3(blocks), dim3(threads_per_block), 0, - streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter); + streams[kernelCnt % streamCnt], out, 
width, xPos, yPos, xStep, yStep, maxIter); } bool hipPerfMandelBrot::run(unsigned int testCase) { @@ -450,18 +438,18 @@ bool hipPerfMandelBrot::run(unsigned int testCase) { coordIdx = testCase % numCoords; funPtr p[] = {&hipPerfMandelBrot::float_mad, &hipPerfMandelBrot::float_mandel_unroll, - &hipPerfMandelBrot::double_mad, &hipPerfMandelBrot::double_mandel_unroll}; + &hipPerfMandelBrot::double_mad, &hipPerfMandelBrot::double_mandel_unroll}; // Maximum iteration count maxIter = 32768; - uint ** hPtr = new uint *[numKernels]; - uint ** dPtr = new uint *[numKernels]; + uint** hPtr = new uint*[numKernels]; + uint** dPtr = new uint*[numKernels]; // Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once. width_ = 256; - bufSize = width_ * width_ * sizeof(uint); + bufSize = width_ * width_ * sizeof(uint); // Create streams for concurrency for (uint i = 0; i < numStreams; i++) { @@ -470,15 +458,15 @@ bool hipPerfMandelBrot::run(unsigned int testCase) { // Allocate memory on the host and device for (uint i = 0; i < numKernels; i++) { - HIP_CHECK(hipHostMalloc(reinterpret_cast(&hPtr[i]), bufSize, hipHostMallocDefault)); + HIP_CHECK(hipHostMalloc(reinterpret_cast(&hPtr[i]), bufSize, hipHostMallocDefault)); setData(hPtr[i], 0xdeadbeef); - HIP_CHECK(hipMalloc(reinterpret_cast(&dPtr[i]), bufSize)) + HIP_CHECK(hipMalloc(reinterpret_cast(&dPtr[i]), bufSize)) } // Prepare kernel launch parameters - int threads = (bufSize/sizeof(uint)); - int threads_per_block = 64; - int blocks = (threads/threads_per_block) + (threads % threads_per_block); + int threads = (bufSize / sizeof(uint)); + int threads_per_block = 64; + int blocks = (threads / threads_per_block) + (threads % threads_per_block); // Copy memory asynchronously and concurrently from host to device for (uint i = 0; i < numKernels; i++) { @@ -489,90 +477,88 @@ bool hipPerfMandelBrot::run(unsigned int testCase) { HIP_CHECK(hipStreamSynchronize(0)); int kernelIdx; - if(testCase == 0 || testCase == 5 || 
testCase == 10) { + if (testCase == 0 || testCase == 5 || testCase == 10) { kernelIdx = 0; - } else if(testCase == 1 || testCase == 6 || testCase == 11) { + } else if (testCase == 1 || testCase == 6 || testCase == 11) { kernelIdx = 1; - } else if(testCase == 2 || testCase == 7 || testCase == 12) { + } else if (testCase == 2 || testCase == 7 || testCase == 12) { kernelIdx = 2; - } else if(testCase == 3 || testCase == 8 || testCase == 13){ + } else if (testCase == 3 || testCase == 8 || testCase == 13) { kernelIdx = 3; } double totalTime = 0.0; for (unsigned int k = 0; k < numLoops; k++) { - if ((testCase == 0 || testCase == 1 || testCase == 2 || - testCase == 5 || testCase == 6 || testCase == 7 || - testCase == 10 || testCase == 11 || testCase == 12)) { - float xStep = static_cast(coords[coordIdx].width / static_cast(width_)); - float yStep = static_cast(-coords[coordIdx].width / static_cast(width_)); - float xPos = static_cast(coords[coordIdx].x - 0.5 * coords[coordIdx].width); - float yPos = static_cast(coords[coordIdx].y + 0.5 * coords[coordIdx].width); - - // Time the kernel execution - auto all_start = std::chrono::steady_clock::now(); - - for (uint i = 0; i < numKernels; i++) { - (this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks, - threads_per_block, i); - } - - // Synchronize all the concurrent streams to have completed execution - HIP_CHECK(hipStreamSynchronize(0)); + if ((testCase == 0 || testCase == 1 || testCase == 2 || testCase == 5 || testCase == 6 || + testCase == 7 || testCase == 10 || testCase == 11 || testCase == 12)) { + float xStep = static_cast(coords[coordIdx].width / static_cast(width_)); + float yStep = static_cast(-coords[coordIdx].width / static_cast(width_)); + float xPos = static_cast(coords[coordIdx].x - 0.5 * coords[coordIdx].width); + float yPos = static_cast(coords[coordIdx].y + 0.5 * coords[coordIdx].width); + + // Time the kernel execution + auto all_start = std::chrono::steady_clock::now(); + + 
for (uint i = 0; i < numKernels; i++) { + (this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks, + threads_per_block, i); + } - auto all_end = std::chrono::steady_clock::now(); - std::chrono::duration all_kernel_time = all_end - all_start; - totalTime += all_kernel_time.count(); - } else { - double xStep = coords[coordIdx].width / static_cast(width_); - double yStep = -coords[coordIdx].width / static_cast(width_); - double xPos = coords[coordIdx].x - 0.5 * coords[coordIdx].width; - double yPos = coords[coordIdx].y + 0.5 * coords[coordIdx].width; - - // Time the kernel execution - auto all_start = std::chrono::steady_clock::now(); - for (uint i = 0; i < numKernels; i++) { - (this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks, - threads_per_block, i); - } - // Synchronize all the concurrent streams to have completed execution - HIP_CHECK(hipStreamSynchronize(0)); + // Synchronize all the concurrent streams to have completed execution + HIP_CHECK(hipStreamSynchronize(0)); + + auto all_end = std::chrono::steady_clock::now(); + std::chrono::duration all_kernel_time = all_end - all_start; + totalTime += all_kernel_time.count(); + } else { + double xStep = coords[coordIdx].width / static_cast(width_); + double yStep = -coords[coordIdx].width / static_cast(width_); + double xPos = coords[coordIdx].x - 0.5 * coords[coordIdx].width; + double yPos = coords[coordIdx].y + 0.5 * coords[coordIdx].width; + + // Time the kernel execution + auto all_start = std::chrono::steady_clock::now(); + for (uint i = 0; i < numKernels; i++) { + (this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks, + threads_per_block, i); + } + // Synchronize all the concurrent streams to have completed execution + HIP_CHECK(hipStreamSynchronize(0)); - auto all_end = std::chrono::steady_clock::now(); - std::chrono::duration all_kernel_time = all_end - all_start; - totalTime += all_kernel_time.count(); 
- } + auto all_end = std::chrono::steady_clock::now(); + std::chrono::duration all_kernel_time = all_end - all_start; + totalTime += all_kernel_time.count(); + } } // Copy data back from device to the host - for(uint i = 0; i < numKernels; i++) { - HIP_CHECK(hipMemcpy(hPtr[i] ,dPtr[i], bufSize, hipMemcpyDeviceToHost)); - } - for(uint i = 0; i < numKernels; i++) { - checkData(hPtr[i]); - int j =0; - while((totalIters != expectedIters[j] && totalIters > expectedIters[j]) && j < 30) { - j++; + for (uint i = 0; i < numKernels; i++) { + HIP_CHECK(hipMemcpy(hPtr[i], dPtr[i], bufSize, hipMemcpyDeviceToHost)); } + for (uint i = 0; i < numKernels; i++) { + checkData(hPtr[i]); + int j = 0; + while ((totalIters != expectedIters[j] && totalIters > expectedIters[j]) && j < 30) { + j++; + } - if(j==30) { - std::cout << "Incorrect iteration count detected. "; - } + if (j == 30) { + CONSOLE_PRINT("Incorrect iteration count detected. "); + } } // Compute GFLOPS. There are 7 FLOPs per iteration - double perf = (static_cast(totalIters*numKernels) * 7 * static_cast(1e-09)) / - (totalTime / (double)numLoops); + double perf = (static_cast(totalIters * numKernels) * 7 * static_cast(1e-09)) / + (totalTime / (double)numLoops); - std::vector kernelName = {"float", "float_unroll", - "double", "double_unroll"}; + std::vector kernelName = {"float", "float_unroll", "double", "double_unroll"}; // Print results except for Warm-up kernel if (testCase != 100) { - results[kernelName[testCase % 4]].push_back(perf); - } + results[kernelName[testCase % 4]].push_back(perf); + } - for(uint i = 0 ; i < numStreams; i++) { + for (uint i = 0; i < numStreams; i++) { HIP_CHECK(hipStreamDestroy(streams[i])); } @@ -581,19 +567,19 @@ bool hipPerfMandelBrot::run(unsigned int testCase) { HIP_CHECK(hipHostFree(hPtr[i])); HIP_CHECK(hipFree(dPtr[i])); } - delete [] hPtr; - delete [] dPtr; + delete[] hPtr; + delete[] dPtr; return true; } -void hipPerfMandelBrot::setData(void *ptr, unsigned int value) { - unsigned int 
*ptr2 = (unsigned int *)ptr; +void hipPerfMandelBrot::setData(void* ptr, unsigned int value) { + unsigned int* ptr2 = (unsigned int*)ptr; for (unsigned int i = 0; i < width_ * width_; i++) { - ptr2[i] = value; + ptr2[i] = value; } } -void hipPerfMandelBrot::checkData(uint *ptr) { +void hipPerfMandelBrot::checkData(uint* ptr) { totalIters = 0; for (unsigned int i = 0; i < width_ * width_; i++) { totalIters += ptr[i]; @@ -601,30 +587,30 @@ void hipPerfMandelBrot::checkData(uint *ptr) { } /** -* Test Description -* ------------------------ -* - Verify the warm-up kernel default stream executes serially. -* - verify by running all kernels - sync. -* - verify by running all kernels - async. -* Test source -* ------------------------ -* - perftests/compute/hipPerfMandelbrot.cc -* Test requirements -* ------------------------ -* - HIP_VERSION >= 5.6 -*/ + * Test Description + * ------------------------ + * - Verify the warm-up kernel default stream executes serially. + * - verify by running all kernels - sync. + * - verify by running all kernels - async. 
+ * Test source + * ------------------------ + * - perftests/compute/hipPerfMandelbrot.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.6 + */ TEST_CASE("Perf_hipPerfMandelbrot") { hipPerfMandelBrot mandelbrotCompute; int deviceId = 0; mandelbrotCompute.open(deviceId); - #if HT_AMD +#if HT_AMD SECTION("warm-up kernel default stream executes serially") { mandelbrotCompute.setNumStreams(1); mandelbrotCompute.setNumKernels(1); - REQUIRE(true == mandelbrotCompute.run(100/*Random number*/)); + REQUIRE(true == mandelbrotCompute.run(100 /*Random number*/)); } - #endif +#endif SECTION("run all - sync") { int i = 0; do { @@ -632,7 +618,7 @@ TEST_CASE("Perf_hipPerfMandelbrot") { mandelbrotCompute.setNumKernels(1); REQUIRE(true == mandelbrotCompute.run(i)); i++; - }while(i < 12); + } while (i < 12); mandelbrotCompute.printResults(); } @@ -643,12 +629,12 @@ TEST_CASE("Perf_hipPerfMandelbrot") { mandelbrotCompute.setNumKernels(2); REQUIRE(true == mandelbrotCompute.run(i)); i++; - }while(i < 12); + } while (i < 12); mandelbrotCompute.printResults(); } } /** -* End doxygen group perfComputeTest. -* @} -*/ + * End doxygen group perfComputeTest. 
+ * @} + */ diff --git a/catch/perftests/dispatch/hipPerfDispatchSpeed.cc b/catch/perftests/dispatch/hipPerfDispatchSpeed.cc index 897999d8b..4054dbd05 100644 --- a/catch/perftests/dispatch/hipPerfDispatchSpeed.cc +++ b/catch/perftests/dispatch/hipPerfDispatchSpeed.cc @@ -18,163 +18,190 @@ */ /** -* @addtogroup hipPerfDispatchSpeed hipPerfDispatchSpeed -* @{ -* @ingroup perfDispatchTest -*/ + * @addtogroup hipPerfDispatchSpeed hipPerfDispatchSpeed + * @{ + * @ingroup perfDispatchTest + */ + +// #define ENABLE_DEBUG 1 #include #include #include -// Quiet pesky warnings -#ifdef WIN_OS -#define SNPRINTF sprintf_s -#else -#define SNPRINTF snprintf -#endif -#define CHAR_BUF_SIZE 512 - -typedef struct { - unsigned int iterations; - int flushEvery; -} testStruct; - -testStruct testList[] = { - { 1, -1}, - { 1, -1}, - { 10, 1}, - { 10, -1}, - { 100, 1}, - { 100, 10}, - { 100, -1}, - { 1000, 1}, - { 1000, 10}, - { 1000, 100}, - { 1000, -1}, - { 10000, 1}, - { 10000, 10}, - { 10000, 100}, - { 10000, 1000}, - { 10000, -1}, - { 100000, 1}, - { 100000, 10}, - { 100000, 100}, - { 100000, 1000}, - { 100000, 10000}, - { 100000, -1}, -}; +/** + * Test Description + * ------------------------ + * - Verify the hipPerf Dispatch and Execution speed, AKA total kernel latency + * Test source + * ------------------------ + * - perftests/dispatch/hipPerfDispatchSpeed.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.6 + */ -unsigned int mapTestList[] = {1, 1, 10, 100, 1000, 10000, 100000}; +unsigned int testList[] = {1, 10, 100, 1000, 10000}; -__global__ void _dispatchSpeed(float *outBuf) { +// dummy kernel that just dispatches and does nothing +__global__ void _dispatchSpeed(float* outBuf) { int i = (blockIdx.x * blockDim.x + threadIdx.x); - if (i < 0) - outBuf[i] = 0.0f; + if (i < 0) outBuf[i] = 0.0f; }; -/** -* Test Description -* ------------------------ -* - Verify the hipPerf Dispatch speed. 
-* Test source -* ------------------------ -* - perftests/compute/hipPerfMandelbrot.cc -* Test requirements -* ------------------------ -* - HIP_VERSION >= 5.6 -*/ - -TEST_CASE("Perf_hipPerfDispatchSpeed") { - int p_gpuDevice = 0; - int p_tests = -1; +// kernel that has an execution of count, in GPU clock ticks +__global__ void _TimingKernel(uint64_t count) { + uint64_t begin_time = __builtin_amdgcn_s_memrealtime(); + uint64_t curr_time = begin_time; + do { + curr_time = __builtin_amdgcn_s_memrealtime(); + } while (begin_time + count > curr_time); +} + +enum TimingMode { TimingMode_WallTime, TimingMode_HIPEvent, TimingMode_NumModes }; + +TEST_CASE("Perf_hipPerfDispatchAndExecutionSpeed") { hipError_t err = hipSuccess; - hipDeviceProp_t props; - HIP_CHECK(hipGetDeviceProperties(&props, p_gpuDevice)); - unsigned int testListSize = sizeof(testList) / sizeof(testStruct); - int numTests = (p_tests == -1) ? (2*2*testListSize - 1) : p_tests; - int test = (p_tests == -1) ? 0 : p_tests; + unsigned int testListSize = sizeof(testList) / sizeof(testList[0]); + int numTests = testListSize; + int warmup = 10; // number of warmup iterations + + DEBUG_PRINT("numTests %d", numTests); + + // set up timing kernel + uint64_t timer_freq_in_hz; + int clock_rate = 0; // in kHz + HIP_CHECK(hipDeviceGetAttribute(&clock_rate, hipDeviceAttributeWallClockRate, 0)); + timer_freq_in_hz = clock_rate * 1000; + uint64_t timing_in_us = 4; // CHANGE THIS TO CHANGE EXECUTION TIME + const uint64_t timing_count = timer_freq_in_hz * timing_in_us / 1000000; + + int iterations = 100; // number of times to run the test to get an average time float* srcBuffer = NULL; - unsigned int bufSize_ = 64*sizeof(float); + unsigned int bufSize_ = 64 * sizeof(float); err = hipMalloc(&srcBuffer, bufSize_); REQUIRE(err == hipSuccess); - for (; test <= numTests; test++) { - int openTest = test % testListSize; - bool sleep = false; + hipEvent_t startEvent, stopEvent; + + HIP_CHECK(hipEventCreate(&startEvent)); + 
HIP_CHECK(hipEventCreate(&stopEvent)); + + + // run twice for both dispatch speed and full kernel latency + for (int j = 0; j < 2; j++) { + bool useTimingKernel = (j == 1); + if (useTimingKernel) { + CONSOLE_PRINT("\nTIMING KERNEL TEST ()"); + CONSOLE_PRINT("--------------------------------------------------------------"); - if (test >= (testListSize * 2)) { - sleep = true; + } else { + CONSOLE_PRINT("EMPTY KERNEL TEST"); + CONSOLE_PRINT("--------------------------------------------------------------"); } - int threads = (bufSize_ / sizeof(float)); - int threads_per_block = 64; - int blocks = (threads/threads_per_block) + (threads % threads_per_block); - - // warmup - hipLaunchKernelGGL(_dispatchSpeed, dim3(blocks), dim3(threads_per_block), - 0, hipStream_t(0), srcBuffer); - err = hipDeviceSynchronize(); - REQUIRE(err == hipSuccess); - - auto start = std::chrono::high_resolution_clock::now(); - for (unsigned int i = 0; i < testList[openTest].iterations; i++) { - hipLaunchKernelGGL(_dispatchSpeed, dim3(blocks), - dim3(threads_per_block), 0, hipStream_t(0), srcBuffer); - if ((testList[openTest].flushEvery > 0) && - (((i + 1) % testList[openTest].flushEvery) == 0)) { - if (sleep) { - err = hipDeviceSynchronize(); - REQUIRE(err == hipSuccess); - } else { - do { - err = hipStreamQuery(NULL); - } while (err == hipErrorNotReady); + + + // loop through all possible timing methods + for (unsigned int i = 0; i < TimingMode_NumModes; i++) { + TimingMode mode = static_cast(i); + CONSOLE_PRINT("\nTIMING METHOD:"); + + switch (mode) { + case TimingMode_WallTime: + CONSOLE_PRINT("Wall Time"); + break; + case TimingMode_HIPEvent: + CONSOLE_PRINT("HIP Events"); + break; + default: + CONSOLE_PRINT("Unknown Mode"); + } + + // go through test iterations + for (int test = 0; test < numTests; test++) { + int openTest = test % testListSize; + + int threads = (bufSize_ / sizeof(float)); + int threads_per_block = 64; + int blocks = (threads / threads_per_block) + (threads % 
threads_per_block); + double finalPerf = 0.0; + double wallMicroSec = 0.0; + + std::chrono::high_resolution_clock::time_point startWall, stopWall; + + // warmup + for (int i = 0; i < warmup; i++) { + hipLaunchKernelGGL(_TimingKernel, dim3(blocks), dim3(threads_per_block), 0, + hipStream_t(0), timing_count); + } + HIP_CHECK(hipStreamSynchronize(0)); + + for (int it = 0; it < iterations; it++) { + switch (mode) { + case TimingMode_WallTime: + startWall = std::chrono::high_resolution_clock::now(); + break; + case TimingMode_HIPEvent: + HIP_CHECK(hipEventRecord(startEvent, 0)); + break; + default: + CONSOLE_PRINT("Unknown Mode"); + } + + for (unsigned int i = 0; i < testList[openTest]; i++) { + if (useTimingKernel) { + // use the timing kernel to measure dispatch and execution speed + hipLaunchKernelGGL(_TimingKernel, dim3(blocks), dim3(threads_per_block), 0, + hipStream_t(0), timing_count); + } else { + // use the dispatch speed kernel + hipLaunchKernelGGL(_dispatchSpeed, dim3(blocks), dim3(threads_per_block), 0, + hipStream_t(0), srcBuffer); + } + } + + switch (mode) { + case TimingMode_WallTime: { + err = hipStreamSynchronize(0); + REQUIRE(err == hipSuccess); + stopWall = std::chrono::high_resolution_clock::now(); + wallMicroSec = + std::chrono::duration(stopWall - startWall).count(); + finalPerf += wallMicroSec / testList[openTest]; + break; + } + case TimingMode_HIPEvent: { + HIP_CHECK(hipEventRecord(stopEvent, 0)); + HIP_CHECK(hipEventSynchronize(stopEvent)); + float elapsed; + HIP_CHECK(hipEventElapsedTime(&elapsed, startEvent, stopEvent)); // in milliseconds + finalPerf += (elapsed * 1000.0f) / testList[openTest]; // convert ms to µs + break; + } + default: + CONSOLE_PRINT("Unknown Mode"); + } } + + finalPerf /= iterations; // average the performance over all iterations + + + CONSOLE_PRINT("HIPPerfDispatchSpeed[%3d] %7d dispatches (us/disp) %3f", test, + testList[openTest], (float)finalPerf); } } - if (sleep) { - err = hipDeviceSynchronize(); - REQUIRE(err == 
hipSuccess); - } else { - do { - err = hipStreamQuery(NULL); - } while (err == hipErrorNotReady); - } - auto stop = std::chrono::high_resolution_clock::now(); - double microSec = std::chrono::duration(stop - start).count(); - - // microseconds per launch - double perf = (microSec/testList[openTest].iterations); - const char *waitType; - const char *extraChar; - const char *n; - if (sleep) { - waitType = "sleep"; - extraChar = ""; - n = ""; - } else { - waitType = "spin"; - n = "n"; - extraChar = " "; - } - char buf[256]; - if (testList[openTest].flushEvery > 0) { - SNPRINTF(buf, sizeof(buf), "HIPPerfDispatchSpeed[%3d] %7d dispatches %s%sing every %5d (us/disp) %3f", - test, testList[openTest].iterations, - waitType, n, testList[openTest].flushEvery, (float)perf); - } else { - SNPRINTF(buf, sizeof(buf), "HIPPerfDispatchSpeed[%3d] %7d dispatches (%s%s) (us/disp) %3f", - test, testList[openTest].iterations, - waitType, extraChar, (float)perf); - } - printf("%s\n", buf); } + + HIP_CHECK(hipEventDestroy(startEvent)); + HIP_CHECK(hipEventDestroy(stopEvent)); + HIP_CHECK(hipFree(srcBuffer)); } + /** -* End doxygen group perfDispatchTest. -* @} -*/ + * End doxygen group perfDispatchTest. + * @} + */ diff --git a/catch/perftests/memory/hipPerfBufferCopyRectSpeed.cc b/catch/perftests/memory/hipPerfBufferCopyRectSpeed.cc index 5fc601b14..61a9bd7b4 100644 --- a/catch/perftests/memory/hipPerfBufferCopyRectSpeed.cc +++ b/catch/perftests/memory/hipPerfBufferCopyRectSpeed.cc @@ -18,30 +18,31 @@ THE SOFTWARE. */ /** -* @addtogroup hipMemcpy2DAsync hipMemcpy2DAsync -* @{ -* @ingroup perfMemoryTest -* `hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch, -* size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream = 0)` - -* Copies data between host and device. 
-*/ + * @addtogroup hipMemcpy2DAsync hipMemcpy2DAsync + * @{ + * @ingroup perfMemoryTest + * `hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch, + * size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream = 0)` - + * Copies data between host and device. + */ #include +// #define ENABLE_DEBUG 1 #define NUM_SIZES 8 // 4KB, 8KB, 64KB, 256KB, 1 MB, 4MB, 16 MB, 16MB+10 -static const unsigned int Sizes[NUM_SIZES] = - {4096, 8192, 65536, 262144, 1048576, 4194304, 16777216, 16777216+10}; +static const unsigned int Sizes[NUM_SIZES] = {4096, 8192, 65536, 262144, + 1048576, 4194304, 16777216, 16777216 + 10}; static const unsigned int Iterations[2] = {1, 1000}; #define BUF_TYPES 4 // 16 ways to combine 4 different buffer types -#define NUM_SUBTESTS (BUF_TYPES*BUF_TYPES) +#define NUM_SUBTESTS (BUF_TYPES * BUF_TYPES) -static void setData(void *ptr, unsigned int size, char value) { - char *ptr2 = reinterpret_cast(ptr); - for (unsigned int i = 0; i < size ; i++) { +static void setData(void* ptr, unsigned int size, char value) { + char* ptr2 = reinterpret_cast(ptr); + for (unsigned int i = 0; i < size; i++) { ptr2[i] = value; } } @@ -52,17 +53,17 @@ static bool hipPerfBufferCopyRectSpeed_test(int p_tests) { bool hostMalloc[2] = {false}; bool hostRegister[2] = {false}; bool unpinnedMalloc[2] = {false}; - void *memptr[2] = {NULL}; - void *alignedmemptr[2] = {NULL}; - void *srcBuffer = NULL; - void *dstBuffer = NULL; + void* memptr[2] = {NULL}; + void* alignedmemptr[2] = {NULL}; + void* srcBuffer = NULL; + void* dstBuffer = NULL; - int numTests = (p_tests == -1) ? (NUM_SIZES*NUM_SUBTESTS*2 - 1) : p_tests; + int numTests = (p_tests == -1) ? (NUM_SIZES * NUM_SUBTESTS * 2 - 1) : p_tests; int test = (p_tests == -1) ? 
0 : p_tests; - for ( ; test <= numTests ; test++ ) { + for (; test <= numTests; test++) { unsigned int srcTest = (test / NUM_SIZES) % BUF_TYPES; - unsigned int dstTest = (test / (NUM_SIZES*BUF_TYPES)) % BUF_TYPES; + unsigned int dstTest = (test / (NUM_SIZES * BUF_TYPES)) % BUF_TYPES; bufSize_ = Sizes[test % NUM_SIZES]; hostMalloc[0] = hostMalloc[1] = false; hostRegister[0] = hostRegister[1] = false; @@ -92,8 +93,7 @@ static bool hipPerfBufferCopyRectSpeed_test(int p_tests) { numIter = Iterations[test / (NUM_SIZES * NUM_SUBTESTS)]; if (hostMalloc[0]) { - HIP_CHECK(hipHostMalloc(reinterpret_cast(&srcBuffer), - bufSize_, 0)); + HIP_CHECK(hipHostMalloc(reinterpret_cast(&srcBuffer), bufSize_, 0)); setData(srcBuffer, bufSize_, 0xd0); } else if (hostRegister[0]) { memptr[0] = malloc(bufSize_ + 4096); @@ -112,8 +112,7 @@ static bool hipPerfBufferCopyRectSpeed_test(int p_tests) { } if (hostMalloc[1]) { - HIP_CHECK(hipHostMalloc(reinterpret_cast(&dstBuffer), - bufSize_, 0)); + HIP_CHECK(hipHostMalloc(reinterpret_cast(&dstBuffer), bufSize_, 0)); } else if (hostRegister[1]) { memptr[1] = malloc(bufSize_ + 4096); alignedmemptr[1] = reinterpret_cast(memptr[0]); @@ -128,15 +127,14 @@ static bool hipPerfBufferCopyRectSpeed_test(int p_tests) { } // warm up - HIP_CHECK(hipMemcpy2D(dstBuffer, width, srcBuffer, - width, width, width, hipMemcpyDefault)); + HIP_CHECK(hipMemcpy2D(dstBuffer, width, srcBuffer, width, width, width, hipMemcpyDefault)); // measure performance based on host time auto all_start = std::chrono::steady_clock::now(); for (unsigned int i = 0; i < numIter; i++) { - HIP_CHECK(hipMemcpy2DAsync(dstBuffer, width, srcBuffer, - width, width, width, hipMemcpyDefault, NULL)); + HIP_CHECK(hipMemcpy2DAsync(dstBuffer, width, srcBuffer, width, width, width, hipMemcpyDefault, + NULL)); } HIP_CHECK(hipDeviceSynchronize()); @@ -144,11 +142,11 @@ static bool hipPerfBufferCopyRectSpeed_test(int p_tests) { std::chrono::duration elapsed_secs = all_end - all_start; // read speed in GB/s 
- double perf = (static_cast(bufSize_ * numIter) * - static_cast(1e-09)) / elapsed_secs.count(); + double perf = (static_cast(bufSize_ * numIter) * static_cast(1e-09)) / + elapsed_secs.count(); - const char *strSrc = NULL; - const char *strDst = NULL; + const char* strSrc = NULL; + const char* strDst = NULL; if (hostMalloc[0]) strSrc = "hHM"; else if (hostRegister[0]) @@ -170,15 +168,14 @@ static bool hipPerfBufferCopyRectSpeed_test(int p_tests) { // Double results when src and dst are both on device if ((!hostMalloc[0] && !hostRegister[0] && !unpinnedMalloc[0]) && (!hostMalloc[1] && !hostRegister[1] && !unpinnedMalloc[1])) - perf *= 2.0; + perf *= 2.0; // Double results when src and dst are both in sysmem if ((hostMalloc[0] || hostRegister[0] || unpinnedMalloc[0]) && (hostMalloc[1] || hostRegister[1] || unpinnedMalloc[1])) - perf *= 2.0; + perf *= 2.0; - INFO("hipPerfBufferCopyRectSpeed[" << test << "]\t( " << bufSize_ << - ")\ts:" << strSrc << " d:" << strDst << "\ti:" << numIter << - "\t(GB/s) perf\t" << (float)perf); + CONSOLE_PRINT("hipPerfBufferCopyRectSpeed[%d]\t( %u )\ts:%s d:%s\ti:%u\t(GB/s) perf\t%.2f\n", + test, bufSize_, strSrc, strDst, numIter, (float)perf); // Free src if (hostMalloc[0]) { @@ -208,40 +205,42 @@ static bool hipPerfBufferCopyRectSpeed_test(int p_tests) { } /** -* Test Description -* ------------------------ -*  - Verify hipPerfBufferCopy status. -* Test source -* ------------------------ -*  - perftests/memory/hipPerfBufferCopyRectSpeed.cc -* Test requirements -* ------------------------ -*  - HIP_VERSION >= 5.6 -*/ + * Test Description + * ------------------------ + *  - Verify hipPerfBufferCopy status. 
+ * Test source + * ------------------------ + *  - perftests/memory/hipPerfBufferCopyRectSpeed.cc + * Test requirements + * ------------------------ + *  - HIP_VERSION >= 5.6 + */ TEST_CASE("Perf_hipPerfBufferCopyRectSpeed_test") { int numDevices = 0; HIP_CHECK(hipGetDeviceCount(&numDevices)); if (numDevices <= 0) { - SUCCEED("Skipped testcase hipPerfBufferCopyRectSpeed" - "as there is no device to test."); + SUCCEED( + "Skipped testcase hipPerfBufferCopyRectSpeed" + "as there is no device to test."); } else { int deviceId = 0; HIP_CHECK(hipSetDevice(deviceId)); hipDeviceProp_t props; HIP_CHECK(hipGetDeviceProperties(&props, deviceId)); - INFO("hipPerfBufferCopyRectSpeed - info: Set device to " << deviceId - << " : " << props.name << "Legend: unp - unpinned(malloc)," - " hM - hipMalloc(device)\n hHR - hipHostRegister(pinned)," - " hHM - hipHostMalloc(prePinned)\n"); + CONSOLE_PRINT( + "hipPerfBufferCopyRectSpeed - info: Set device to %d : %s Legend: unp - unpinned(malloc), " + "hM - hipMalloc(device)\n hHR - hipHostRegister(pinned), hHM - " + "hipHostMalloc(prePinned)\n", + deviceId, props.name); REQUIRE(true == hipPerfBufferCopyRectSpeed_test(1)); } } /** -* End doxygen group perfMemoryTest. -* @} -*/ + * End doxygen group perfMemoryTest. + * @} + */ diff --git a/catch/perftests/memory/hipPerfBufferCopySpeed.cc b/catch/perftests/memory/hipPerfBufferCopySpeed.cc index 3a30b2501..4a8d9a571 100644 --- a/catch/perftests/memory/hipPerfBufferCopySpeed.cc +++ b/catch/perftests/memory/hipPerfBufferCopySpeed.cc @@ -18,246 +18,441 @@ THE SOFTWARE. */ /** -* @addtogroup hipMemcpyAsync hipMemcpyAsync -* @{ -* @ingroup perfMemoryTest -* `hipMemcpyAsync(void* dst, const void* src, size_t count, -* hipMemcpyKind kind, hipStream_t stream = 0)` - -* Copies data between host and device. 
-*/ - + * @addtogroup hipMemcpyAsync hipMemcpyAsync + * @{ + * @ingroup perfMemoryTest + * `hipMemcpyAsync(void* dst, const void* src, size_t count, + * hipMemcpyKind kind, hipStream_t stream = 0)` - + * Copies data between host and device, or device to device etc. + */ +#include #include +#include +#include // Add this at the top if not already included +#define ENABLE_DEBUG 1 #define NUM_SIZES 9 // 4KB, 8KB, 64KB, 256KB, 1 MB, 4MB, 16 MB, 16MB+10 -static const unsigned int Sizes[NUM_SIZES] = - {4096, 8192, 65536, 262144, 524288, 1048576, 4194304, 16777216, 16777216+10}; - +static const unsigned int Sizes[NUM_SIZES] = {4096, 8192, 65536, 1048576, 4194304, + 16777216, 16777216 + 10, 134217728, 536870912}; +// static const unsigned int Sizes[NUM_SIZES] = {134217728}; static const unsigned int Iterations[2] = {1, 1000}; -#define BUF_TYPES 4 -// 16 ways to combine 4 different buffer types -#define NUM_SUBTESTS (BUF_TYPES*BUF_TYPES) +#define BUF_TYPES 5 +// 25 ways to combine 5 different buffer types +#define NUM_SUBTESTS (BUF_TYPES * BUF_TYPES) -static void setData(void *ptr, unsigned int size, char value) { - char *ptr2 = reinterpret_cast(ptr); - for (unsigned int i = 0; i < size ; i++) { +static void setData(void* ptr, unsigned int size, char value) { + char* ptr2 = reinterpret_cast(ptr); + for (unsigned int i = 0; i < size; i++) { ptr2[i] = value; } } -static void checkData(void *ptr, unsigned int size, char value) { - char *ptr2 = reinterpret_cast(ptr); +static void checkData(void* ptr, unsigned int size, char value) { + char* ptr2 = reinterpret_cast(ptr); for (unsigned int i = 0; i < size; i++) { if (ptr2[i] != value) { - INFO("Validation failed at " << i << " Got " << ptr2[i] << - " Expected " << value); + INFO("Validation failed at " << i << " Got " << ptr2[i] << " Expected " << value); REQUIRE(false); } } } static bool hipPerfBufferCopySpeed_test(int p_tests) { + int testIdx = 0; unsigned int bufSize_; unsigned int numIter; bool hostMalloc[2] = {false}; bool 
hostRegister[2] = {false}; bool unpinnedMalloc[2] = {false}; - void *memptr[2] = {NULL}; - void *alignedmemptr[2] = {NULL}; - void *srcBuffer = NULL; - void *dstBuffer = NULL; - - int numTests = (p_tests == -1) ? (NUM_SIZES*NUM_SUBTESTS*2 - 1) : p_tests; - int test = (p_tests == -1) ? 0 : p_tests; - - for ( ; test <= numTests; test++ ) { - unsigned int srcTest = (test / NUM_SIZES) % BUF_TYPES; - unsigned int dstTest = (test / (NUM_SIZES*BUF_TYPES)) % BUF_TYPES; - bufSize_ = Sizes[test % NUM_SIZES]; - hostMalloc[0] = hostMalloc[1] = false; - hostRegister[0] = hostRegister[1] = false; - unpinnedMalloc[0] = unpinnedMalloc[1] = false; - srcBuffer = dstBuffer = 0; - memptr[0] = memptr[1] = NULL; - alignedmemptr[0] = alignedmemptr[1] = NULL; - - if (srcTest == 3) { - hostRegister[0] = true; - } else if (srcTest == 2) { - hostMalloc[0] = true; - } else if (srcTest == 1) { - unpinnedMalloc[0] = true; - } - - if (dstTest == 1) { - unpinnedMalloc[1] = true; - } else if (dstTest == 2) { - hostMalloc[1] = true; - } else if (dstTest == 3) { - hostRegister[1] = true; - } - - numIter = Iterations[test / (NUM_SIZES * NUM_SUBTESTS)]; - - if (hostMalloc[0]) { - HIP_CHECK(hipHostMalloc(reinterpret_cast(&srcBuffer), - bufSize_, 0)); - setData(srcBuffer, bufSize_, 0xd0); - } else if (hostRegister[0]) { - memptr[0] = malloc(bufSize_ + 4096); - alignedmemptr[0] = reinterpret_cast(memptr[0]); - srcBuffer = alignedmemptr[0]; - setData(srcBuffer, bufSize_, 0xd0); - HIP_CHECK(hipHostRegister(srcBuffer, bufSize_, 0)); - } else if (unpinnedMalloc[0]) { - memptr[0] = malloc(bufSize_ + 4096); - alignedmemptr[0] = reinterpret_cast(memptr[0]); - srcBuffer = alignedmemptr[0]; - setData(srcBuffer, bufSize_, 0xd0); - } else { + bool deviceMallocUncached[2] = {false}; + void* memptr[2] = {NULL}; + void* alignedmemptr[2] = {NULL}; + void* srcBuffer = NULL; + void* dstBuffer = NULL; + int numTests = (p_tests == -1) ? (NUM_SIZES * NUM_SUBTESTS * 2 - 1) : p_tests; + // int test = (p_tests == -1) ? 
0 : p_tests; + int numDevices = 0; + HIP_CHECK(hipGetDeviceCount(&numDevices)); + int test = 0; + // 1. Run all P2P for all sizes + if (numDevices >= 2) { + for (int sizeIdx = 0; sizeIdx < NUM_SIZES; ++sizeIdx) { + if (p_tests != -1 && testIdx != p_tests) { + ++testIdx; + continue; + } + unsigned int bufSize_ = Sizes[sizeIdx]; + void* srcBuffer = NULL; + void* dstBuffer = NULL; + numIter = Iterations[1]; + HIP_CHECK(hipSetDevice(0)); HIP_CHECK(hipMalloc(&srcBuffer, bufSize_)); - HIP_CHECK(hipMemset(srcBuffer, 0xd0, bufSize_)); - } - - if (hostMalloc[1]) { - HIP_CHECK(hipHostMalloc(reinterpret_cast(&dstBuffer), - bufSize_, 0)); - } else if (hostRegister[1]) { - memptr[1] = malloc(bufSize_ + 4096); - alignedmemptr[1] = reinterpret_cast(memptr[1]); - dstBuffer = alignedmemptr[1]; - HIP_CHECK(hipHostRegister(dstBuffer, bufSize_, 0)); - } else if (unpinnedMalloc[1]) { - memptr[1] = malloc(bufSize_ + 4096); - alignedmemptr[1] = reinterpret_cast(memptr[1]); - dstBuffer = alignedmemptr[1]; - } else { + hipError_t errMemset = hipMemset(srcBuffer, 0xd0, bufSize_); + if (errMemset != hipSuccess) { + hipFree(srcBuffer); + continue; + } + HIP_CHECK(hipSetDevice(1)); HIP_CHECK(hipMalloc(&dstBuffer, bufSize_)); + int canAccessPeer01 = 0, canAccessPeer10 = 0; + HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer01, 0, 1)); + HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer10, 1, 0)); + if (!canAccessPeer01 || !canAccessPeer10) { + HIP_CHECK(hipSetDevice(0)); + hipDeviceDisablePeerAccess(1); + HIP_CHECK(hipSetDevice(1)); + hipDeviceDisablePeerAccess(0); + HIP_CHECK(hipSetDevice(0)); + HIP_CHECK(hipFree(srcBuffer)); + HIP_CHECK(hipSetDevice(1)); + HIP_CHECK(hipFree(dstBuffer)); + HIP_CHECK(hipSetDevice(0)); + continue; + } + HIP_CHECK(hipSetDevice(0)); + hipError_t errPeer0 = hipDeviceEnablePeerAccess(1, 0); + HIP_CHECK(hipSetDevice(1)); + hipError_t errPeer1 = hipDeviceEnablePeerAccess(0, 0); + if (errPeer0 != hipSuccess || errPeer1 != hipSuccess) { + HIP_CHECK(hipSetDevice(0)); + 
HIP_CHECK(hipFree(srcBuffer)); + HIP_CHECK(hipSetDevice(1)); + HIP_CHECK(hipFree(dstBuffer)); + HIP_CHECK(hipSetDevice(0)); + continue; + } + HIP_CHECK(hipMemcpyPeer(dstBuffer, 1, srcBuffer, 0, bufSize_)); + auto all_start = std::chrono::steady_clock::now(); + for (unsigned int i = 0; i < numIter; i++) { + HIP_CHECK(hipMemcpyPeerAsync(dstBuffer, 1, srcBuffer, 0, bufSize_, 0)); + } + HIP_CHECK(hipSetDevice(1)); + HIP_CHECK(hipDeviceSynchronize()); + hipError_t syncErr = hipGetLastError(); + if (syncErr != hipSuccess) { + DEBUG_PRINT("WARNING: hipDeviceSynchronize error: %s\n", hipGetErrorString(syncErr)); + } + HIP_CHECK(hipDeviceSynchronize()); + auto all_end = std::chrono::steady_clock::now(); + std::chrono::duration elapsed_secs = all_end - all_start; + auto start_s = + std::chrono::duration_cast>(all_start.time_since_epoch()) + .count(); + auto end_s = + std::chrono::duration_cast>(all_end.time_since_epoch()) + .count(); + + DEBUG_PRINT("All_start: %f s, All_end: %f s\n", start_s, end_s); + DEBUG_PRINT("Elapsed seconds: %f\n", elapsed_secs.count()); + double bufSizeWithIter = static_cast(bufSize_); + DEBUG_PRINT("%f\n", bufSizeWithIter); + double perf_pre = bufSizeWithIter / elapsed_secs.count(); + DEBUG_PRINT("%f\n", perf_pre); + double perf = perf_pre * static_cast(numIter); + DEBUG_PRINT("%f\n", perf_pre); + perf *= static_cast(1e-09); + CONSOLE_PRINT("%f\n", perf); + CONSOLE_PRINT("HIPPerfBufferCopySpeedP2P[%d] %u s:dev0 d:dev1 i:%u (GB/s) perf %f\n", test, + bufSize_, numIter, (float)perf); + CONSOLE_PRINT("P2P,%d,%u,dev0,dev1,%u,%f\n", test, bufSize_, numIter, (float)perf); + test++; + void* temp = malloc(bufSize_ + 4096); + void* chkBuf = reinterpret_cast(temp); + HIP_CHECK(hipMemcpy(chkBuf, dstBuffer, bufSize_, hipMemcpyDefault)); + checkData(chkBuf, bufSize_, 0xd0); + free(temp); + HIP_CHECK(hipSetDevice(0)); + hipDeviceDisablePeerAccess(1); + HIP_CHECK(hipSetDevice(1)); + hipDeviceDisablePeerAccess(0); + HIP_CHECK(hipSetDevice(0)); + 
HIP_CHECK(hipFree(srcBuffer)); + HIP_CHECK(hipSetDevice(1)); + HIP_CHECK(hipFree(dstBuffer)); + HIP_CHECK(hipSetDevice(0)); + ++testIdx; } - - // warm up - HIP_CHECK(hipMemcpy(dstBuffer, srcBuffer, bufSize_, hipMemcpyDefault)); - - // measure performance based on host time + } + int dstTest = 0; + int srcTest = 0; + // 2. Run all NoCU (intra) for all sizes + for (int sizeIdx = 0; sizeIdx < NUM_SIZES; ++sizeIdx) { + if (p_tests != -1 && testIdx != p_tests) { + ++testIdx; + continue; + } + unsigned int bufSize_ = Sizes[sizeIdx]; + void* srcBuffer = NULL; + void* dstBuffer = NULL; + numIter = Iterations[1]; + HIP_CHECK(hipSetDevice(0)); + HIP_CHECK(hipMalloc(&srcBuffer, bufSize_)); + HIP_CHECK(hipMalloc(&dstBuffer, bufSize_)); + HIP_CHECK(hipMemset(srcBuffer, 0xd0, bufSize_)); + HIP_CHECK(hipMemcpy(dstBuffer, srcBuffer, bufSize_, hipMemcpyDeviceToDeviceNoCU)); auto all_start = std::chrono::steady_clock::now(); - for (unsigned int i = 0; i < numIter; i++) { - HIP_CHECK(hipMemcpyAsync(dstBuffer, srcBuffer, bufSize_, - hipMemcpyDefault, NULL)); + HIP_CHECK(hipMemcpyAsync(dstBuffer, srcBuffer, bufSize_, hipMemcpyDeviceToDeviceNoCU, NULL)); + } + HIP_CHECK(hipDeviceSynchronize()); + hipError_t syncErr = hipGetLastError(); + if (syncErr != hipSuccess) { + DEBUG_PRINT("WARNING: hipDeviceSynchronize error: %s\n", hipGetErrorString(syncErr)); } HIP_CHECK(hipDeviceSynchronize()); - auto all_end = std::chrono::steady_clock::now(); + auto start_s = + std::chrono::duration_cast>(all_start.time_since_epoch()) + .count(); + auto end_s = + std::chrono::duration_cast>(all_end.time_since_epoch()) + .count(); + + DEBUG_PRINT("All_start: %f s, All_end: %f s\n", start_s, end_s); std::chrono::duration elapsed_secs = all_end - all_start; - - // read speed in GB/s - double perf = (static_cast(bufSize_ * numIter) * - static_cast(1e-09)) / elapsed_secs.count(); - - const char *strSrc = NULL; - const char *strDst = NULL; - if (hostMalloc[0]) - strSrc = "hHM"; - else if (hostRegister[0]) - 
strSrc = "hHR"; - else if (unpinnedMalloc[0]) - strSrc = "unp"; - else - strSrc = "hM"; - - if (hostMalloc[1]) - strDst = "hHM"; - else if (hostRegister[1]) - strDst = "hHR"; - else if (unpinnedMalloc[1]) - strDst = "unp"; - else - strDst = "hM"; - - // Double results when src and dst are both on device - if ((!hostMalloc[0] && !hostRegister[0] && !unpinnedMalloc[0]) && - (!hostMalloc[1] && !hostRegister[1] && !unpinnedMalloc[1])) - perf *= 2.0; - // Double results when src and dst are both in sysmem - if ((hostMalloc[0] || hostRegister[0] || unpinnedMalloc[0]) && - (hostMalloc[1] || hostRegister[1] || unpinnedMalloc[1])) - perf *= 2.0; - - INFO("HIPPerfBufferCopySpeed[" << test << "]\t( " << bufSize_ << - ")\ts:" << strSrc << " d:" << strDst << "\ti:" << numIter << - "\t(GB/s) perf\t" << (float)perf); - - // Verification + DEBUG_PRINT("Elapsed seconds: %f\n", elapsed_secs.count()); + double bufSizeWithIter = static_cast(bufSize_); + DEBUG_PRINT("%f\n", bufSizeWithIter); + double perf_pre = bufSizeWithIter / elapsed_secs.count(); + DEBUG_PRINT("%f\n", perf_pre); + double perf = perf_pre * static_cast(numIter); + DEBUG_PRINT("%f\n", perf_pre); + perf *= static_cast(1e-09); + CONSOLE_PRINT("%f\n", perf); + CONSOLE_PRINT("HIPPerfBufferCopySpeedNoCU[%d] %u s:dev0 d:dev0 i:%u (GB/s) perf %f\n", test, + bufSize_, numIter, (float)perf); + CONSOLE_PRINT("NoCU,%d,%u,dev0,dev0,%u,%f\n", test, bufSize_, numIter, (float)perf); + test++; void* temp = malloc(bufSize_ + 4096); void* chkBuf = reinterpret_cast(temp); HIP_CHECK(hipMemcpy(chkBuf, dstBuffer, bufSize_, hipMemcpyDefault)); checkData(chkBuf, bufSize_, 0xd0); free(temp); + HIP_CHECK(hipFree(srcBuffer)); + HIP_CHECK(hipFree(dstBuffer)); + ++testIdx; + } - // Free src - if (hostMalloc[0]) { - HIP_CHECK(hipHostFree(srcBuffer)); - } else if (hostRegister[0]) { - HIP_CHECK(hipHostUnregister(srcBuffer)); - free(memptr[0]); - } else if (unpinnedMalloc[0]) { - free(memptr[0]); - } else { - HIP_CHECK(hipFree(srcBuffer)); - } - - 
// Free dst - if (hostMalloc[1]) { - HIP_CHECK(hipHostFree(dstBuffer)); - } else if (hostRegister[1]) { - HIP_CHECK(hipHostUnregister(dstBuffer)); - free(memptr[1]); - } else if (unpinnedMalloc[1]) { - free(memptr[1]); - } else { - HIP_CHECK(hipFree(dstBuffer)); + // 3. Run all buffer type (default) for all sizes + + for (int srcTest = 0; srcTest < BUF_TYPES; ++srcTest) { + for (int dstTest = 0; dstTest < BUF_TYPES; ++dstTest) { + for (int sizeIdx = 0; sizeIdx < NUM_SIZES; ++sizeIdx) { + if (p_tests != -1 && testIdx != p_tests) { + ++testIdx; + continue; + } + unsigned int bufSize_ = Sizes[sizeIdx]; + bool hostMalloc[2] = {false}; + bool hostRegister[2] = {false}; + bool unpinnedMalloc[2] = {false}; + bool deviceMallocUncached[2] = {false}; + void* memptr[2] = {NULL}; + void* alignedmemptr[2] = {NULL}; + void* srcBuffer = NULL; + void* dstBuffer = NULL; + numIter = Iterations[1]; + if (srcTest == 4) { + deviceMallocUncached[0] = true; + } else if (srcTest == 3) { + hostRegister[0] = true; + } else if (srcTest == 2) { + hostMalloc[0] = true; + } else if (srcTest == 1) { + unpinnedMalloc[0] = true; + } + if (dstTest == 1) { + unpinnedMalloc[1] = true; + } else if (dstTest == 2) { + hostMalloc[1] = true; + } else if (dstTest == 3) { + hostRegister[1] = true; + } else if (dstTest == 4) { + deviceMallocUncached[1] = true; + } + if (deviceMallocUncached[0]) { + HIP_CHECK(hipExtMallocWithFlags(&srcBuffer, bufSize_, hipDeviceMallocUncached)); + HIP_CHECK(hipMemset(srcBuffer, 0xd0, bufSize_)); + } else if (hostMalloc[0]) { + HIP_CHECK(hipHostMalloc(reinterpret_cast(&srcBuffer), bufSize_, 0)); + setData(srcBuffer, bufSize_, 0xd0); + } else if (hostRegister[0]) { + memptr[0] = malloc(bufSize_ + 4096); + uintptr_t raw = reinterpret_cast(memptr[0]); + uintptr_t aligned = (raw + 4095) & ~static_cast(4095); + alignedmemptr[0] = reinterpret_cast(aligned); + srcBuffer = alignedmemptr[0]; + setData(srcBuffer, bufSize_, 0xd0); + HIP_CHECK(hipHostRegister(srcBuffer, bufSize_, 0)); + } 
else if (unpinnedMalloc[0]) { + memptr[0] = malloc(bufSize_ + 4096); + uintptr_t raw = reinterpret_cast(memptr[0]); + uintptr_t aligned = (raw + 4095) & ~static_cast(4095); + alignedmemptr[0] = reinterpret_cast(aligned); + srcBuffer = alignedmemptr[0]; + setData(srcBuffer, bufSize_, 0xd0); + } else { + HIP_CHECK(hipMalloc(&srcBuffer, bufSize_)); + HIP_CHECK(hipMemset(srcBuffer, 0xd0, bufSize_)); + } + if (deviceMallocUncached[1]) { + HIP_CHECK(hipExtMallocWithFlags(&dstBuffer, bufSize_, hipDeviceMallocUncached)); + } else if (hostMalloc[1]) { + HIP_CHECK(hipHostMalloc(reinterpret_cast(&dstBuffer), bufSize_, 0)); + } else if (hostRegister[1]) { + memptr[1] = malloc(bufSize_ + 4096); + uintptr_t raw = reinterpret_cast(memptr[1]); + uintptr_t aligned = (raw + 4095) & ~static_cast(4095); + alignedmemptr[1] = reinterpret_cast(aligned); + dstBuffer = alignedmemptr[1]; + HIP_CHECK(hipHostRegister(dstBuffer, bufSize_, 0)); + } else if (unpinnedMalloc[1]) { + memptr[1] = malloc(bufSize_ + 4096); + uintptr_t raw = reinterpret_cast(memptr[1]); + uintptr_t aligned = (raw + 4095) & ~static_cast(4095); + alignedmemptr[1] = reinterpret_cast(aligned); + dstBuffer = alignedmemptr[1]; + } else { + HIP_CHECK(hipMalloc(&dstBuffer, bufSize_)); + } + HIP_CHECK(hipMemcpy(dstBuffer, srcBuffer, bufSize_, hipMemcpyDefault)); + auto all_start = std::chrono::steady_clock::now(); + for (unsigned int i = 0; i < numIter; i++) { + HIP_CHECK(hipMemcpyAsync(dstBuffer, srcBuffer, bufSize_, hipMemcpyDefault, NULL)); + } + HIP_CHECK(hipDeviceSynchronize()); + hipError_t syncErr = hipGetLastError(); + if (syncErr != hipSuccess) { + DEBUG_PRINT("WARNING: hipDeviceSynchronize error: %s\n", hipGetErrorString(syncErr)); + } + HIP_CHECK(hipDeviceSynchronize()); + auto all_end = std::chrono::steady_clock::now(); + auto start_s = + std::chrono::duration_cast>(all_start.time_since_epoch()) + .count(); + auto end_s = + std::chrono::duration_cast>(all_end.time_since_epoch()) + .count(); + + 
DEBUG_PRINT("All_start: %f s, All_end: %f s\n", start_s, end_s); + std::chrono::duration elapsed_secs = all_end - all_start; + DEBUG_PRINT("Elapsed seconds: %f\n", elapsed_secs.count()); + double bufSizeWithIter = static_cast(bufSize_); + DEBUG_PRINT("%f\n", bufSizeWithIter); + double perf_pre = bufSizeWithIter / elapsed_secs.count(); + DEBUG_PRINT("%f\n", perf_pre); + double perf = perf_pre * static_cast(numIter); + DEBUG_PRINT("%f\n", perf_pre); + perf *= static_cast(1e-09); + CONSOLE_PRINT("%f\n", perf); + const char* strSrc = NULL; + const char* strDst = NULL; + if (deviceMallocUncached[0]) + strSrc = "hMUC"; + else if (hostMalloc[0]) + strSrc = "hHM"; + else if (hostRegister[0]) + strSrc = "hHR"; + else if (unpinnedMalloc[0]) + strSrc = "unp"; + else + strSrc = "hM"; + if (deviceMallocUncached[1]) + strDst = "hMUC"; + else if (hostMalloc[1]) + strDst = "hHM"; + else if (hostRegister[1]) + strDst = "hHR"; + else if (unpinnedMalloc[1]) + strDst = "unp"; + else + strDst = "hM"; + if ((!hostMalloc[0] && !hostRegister[0] && !unpinnedMalloc[0]) && + (!hostMalloc[1] && !hostRegister[1] && !unpinnedMalloc[1])) + perf *= 2.0; + if ((hostMalloc[0] || hostRegister[0] || unpinnedMalloc[0]) && + (hostMalloc[1] || hostRegister[1] || unpinnedMalloc[1])) + perf *= 2.0; + CONSOLE_PRINT("HIPPerfBufferCopySpeed[%d] %u s:%s d:%s i:%u (GB/s) perf %f\n", test, + bufSize_, strSrc, strDst, numIter, (float)perf); + std::cout << "Type," << bufSize_ << "," << strSrc << "," << strDst << "," << numIter << "," + << (float)perf << std::endl; + test++; + void* temp = malloc(bufSize_ + 4096); + void* chkBuf = reinterpret_cast(temp); + HIP_CHECK(hipMemcpy(chkBuf, dstBuffer, bufSize_, hipMemcpyDefault)); + checkData(chkBuf, bufSize_, 0xd0); + free(temp); + if (deviceMallocUncached[0]) { + HIP_CHECK(hipFree(srcBuffer)); + } else if (hostMalloc[0]) { + HIP_CHECK(hipHostFree(srcBuffer)); + } else if (hostRegister[0]) { + HIP_CHECK(hipHostUnregister(srcBuffer)); + free(memptr[0]); + } else if 
(unpinnedMalloc[0]) { + free(memptr[0]); + } else { + HIP_CHECK(hipFree(srcBuffer)); + } + if (deviceMallocUncached[1]) { + HIP_CHECK(hipFree(dstBuffer)); + } else if (hostMalloc[1]) { + HIP_CHECK(hipHostFree(dstBuffer)); + } else if (hostRegister[1]) { + HIP_CHECK(hipHostUnregister(dstBuffer)); + free(memptr[1]); + } else if (unpinnedMalloc[1]) { + free(memptr[1]); + } else { + HIP_CHECK(hipFree(dstBuffer)); + } + } + ++testIdx; } } - return true; } /** -* Test Description -* ------------------------ -*  - Verify hipPerfBufferCopySpeed status. -* Test source -* ------------------------ -*  - perftests/memory/hipPerfBufferCopySpeed.cc -* Test requirements -* ------------------------ -*  - HIP_VERSION >= 5.6 -*/ + * Test Description + * ------------------------ + *  - Verify hipPerfBufferCopySpeed status. + * Test source + * ------------------------ + *  - perftests/memory/hipPerfBufferCopySpeed.cc + * Test requirements + * ------------------------ + *  - HIP_VERSION >= 5.6 + */ TEST_CASE("Perf_hipPerfBufferCopySpeed_test") { int numDevices = 0; HIP_CHECK(hipGetDeviceCount(&numDevices)); - if (numDevices <= 0) { - SUCCEED("Skipped testcase hipPerfBufferCopySpeed as" - "there is no device to test."); + SUCCEED( + "Skipped testcase hipPerfBufferCopySpeed as" + "there is no device to test."); } else { int deviceId = 0; HIP_CHECK(hipSetDevice(deviceId)); hipDeviceProp_t props; HIP_CHECK(hipGetDeviceProperties(&props, deviceId)); - INFO("hipPerfBufferCopySpeed - info: Set device to " << deviceId - << " : " << props.name << "Legend: unp - unpinned(malloc)," - " hM - hipMalloc(device)\n hHR - hipHostRegister(pinned)," - " hHM - hipHostMalloc(prePinned)\n"); + CONSOLE_PRINT( + "hipPerfBufferCopySpeed - info: Set device to %d : %s\nLegend: unp - unpinned(malloc), hM " + "- hipMalloc(device), hHR - hipHostRegister(pinned), hHM - hipHostMalloc(prePinned), hMUC " + "- hipMallocUncached\n", + deviceId, props.name); - REQUIRE(true == hipPerfBufferCopySpeed_test(1)); + // Run the 
test with all sizes and buffer types, alter p_tests to run a specific test + REQUIRE(true == hipPerfBufferCopySpeed_test(-1)); } } /** -* End doxygen group perfMemoryTest. -* @} -*/ + * End doxygen group perfMemoryTest. + * @} + */ \ No newline at end of file diff --git a/catch/perftests/memory/hipPerfDevMemReadSpeed.cc b/catch/perftests/memory/hipPerfDevMemReadSpeed.cc index ae5f63186..9b874b660 100644 --- a/catch/perftests/memory/hipPerfDevMemReadSpeed.cc +++ b/catch/perftests/memory/hipPerfDevMemReadSpeed.cc @@ -18,13 +18,14 @@ THE SOFTWARE. */ /** -* @addtogroup hipMemcpyKernel hipMemcpyKernel -* @{ -* @ingroup perfMemoryTest -* `hipMemcpy(void* dst, const void* src, size_t count, hipMemcpyKind kind)` - -* Copies data between host and device. -*/ - + * @addtogroup hipMemcpyKernel hipMemcpyKernel + * @{ + * @ingroup perfMemoryTest + * `hipMemcpy(void* dst, const void* src, size_t count, hipMemcpyKind kind)` - + * Copies data between host and device. + */ + +// #define ENABLE_DEBUG 1 #include #define ARRAY_SIZE 16 @@ -33,7 +34,7 @@ typedef struct d_uint16 { uint data[ARRAY_SIZE]; } d_uint16; -__global__ static void read_kernel(d_uint16 *src, ulong N, uint *dst) { +__global__ static void read_kernel(d_uint16* src, ulong N, uint* dst) { size_t idx = (blockIdx.x * blockDim.x + threadIdx.x); size_t stride = blockDim.x * gridDim.x; @@ -59,8 +60,8 @@ static bool hipPerfDevMemReadSpeed_test() { hipDeviceProp_t props; HIP_CHECK(hipGetDeviceProperties(&props, deviceId)); - INFO("info: running on bus " << "0x" << props.pciBusID << " " << - props.name << " with " << props.multiProcessorCount << " CUs \n"); + CONSOLE_PRINT("info: running on bus 0x%x %s with %d CUs\n", props.pciBusID, props.name, + props.multiProcessorCount); const unsigned threadsPerBlock = 64; const unsigned blocks = props.multiProcessorCount * 4; @@ -70,7 +71,7 @@ static bool hipPerfDevMemReadSpeed_test() { hSrc = new d_uint16[nBytes]; REQUIRE(hSrc != nullptr); - hDst = new uint; + hDst = new uint; 
REQUIRE(hDst != nullptr); hDst[0] = 0; @@ -88,15 +89,15 @@ static bool hipPerfDevMemReadSpeed_test() { HIP_CHECK(hipMemcpy(dSrc, hSrc, nBytes, hipMemcpyHostToDevice)); HIP_CHECK(hipMemcpy(dDst, hDst, sizeof(uint), hipMemcpyHostToDevice)); - hipLaunchKernelGGL(read_kernel, dim3(blocks), dim3(threadsPerBlock), - 0, stream, dSrc, N, dDst); + hipLaunchKernelGGL(read_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dSrc, N, dDst); HIP_CHECK(hipGetLastError()); HIP_CHECK(hipMemcpy(hDst, dDst, sizeof(uint), hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); if (hDst[0] != (nBytes / sizeof(uint))) { - INFO("hipPerfDevMemReadSpeed - Data validation failed for warm up run!" << - " expected " << nBytes / sizeof(uint) << " got " << hDst[0]); + DEBUG_PRINT( + "hipPerfDevMemReadSpeed - Data validation failed for warm up run! expected %u got %u\n", + nBytes / sizeof(uint), hDst[0]); return false; } @@ -104,8 +105,7 @@ static bool hipPerfDevMemReadSpeed_test() { auto all_start = std::chrono::steady_clock::now(); for (int i = 0; i < nIter; i++) { - hipLaunchKernelGGL(read_kernel, dim3(blocks), dim3(threadsPerBlock), - 0, stream, dSrc, N, dDst); + hipLaunchKernelGGL(read_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dSrc, N, dDst); HIP_CHECK(hipGetLastError()); } HIP_CHECK(hipDeviceSynchronize()); @@ -114,14 +114,14 @@ static bool hipPerfDevMemReadSpeed_test() { std::chrono::duration all_kernel_time = all_end - all_start; // read speed in GB/s - double perf = (static_cast(nBytes * nIter * (1e-09))) / - all_kernel_time.count(); + double perf = (static_cast(nBytes * nIter * (1e-09))) / all_kernel_time.count(); - INFO("hipPerfDevMemReadSpeed - info: average read speed of " << - perf << " GB/s " << "achieved for memory size of " << - nBytes / (1024 * 1024) << " MB"); + CONSOLE_PRINT( + "hipPerfDevMemReadSpeed - average read speed of %.2f GB/s achieved for memory size of %u " + "MB\n", + perf, nBytes / (1024 * 1024)); - delete [] hSrc; + delete[] hSrc; delete 
hDst; HIP_CHECK(hipFree(dSrc)); HIP_CHECK(hipFree(dDst)); @@ -130,30 +130,31 @@ static bool hipPerfDevMemReadSpeed_test() { } /** -* Test Description -* ------------------------ -*  - Verify hipPerfDevMemReadSpeed status. -* Test source -* ------------------------ -*  - perftests/memory/hipPerfDevMemReadSpeed.cc -* Test requirements -* ------------------------ -*  - HIP_VERSION >= 5.6 -*/ + * Test Description + * ------------------------ + *  - Verify hipPerfDevMemReadSpeed status. + * Test source + * ------------------------ + *  - perftests/memory/hipPerfDevMemReadSpeed.cc + * Test requirements + * ------------------------ + *  - HIP_VERSION >= 5.6 + */ TEST_CASE("Perf_hipPerfDevMemReadSpeed_test") { int numDevices = 0; HIP_CHECK(hipGetDeviceCount(&numDevices)); if (numDevices <= 0) { - SUCCEED("Skipped testcase hipPerfDevMemReadSpeed as" - "there is no device to test."); + SUCCEED( + "Skipped testcase hipPerfDevMemReadSpeed as" + "there is no device to test."); } else { REQUIRE(true == hipPerfDevMemReadSpeed_test()); } } /** -* End doxygen group perfMemoryTest. -* @} -*/ + * End doxygen group perfMemoryTest. + * @} + */ diff --git a/catch/perftests/memory/hipPerfDevMemWriteSpeed.cc b/catch/perftests/memory/hipPerfDevMemWriteSpeed.cc index 77eac4b8b..e96b206c8 100644 --- a/catch/perftests/memory/hipPerfDevMemWriteSpeed.cc +++ b/catch/perftests/memory/hipPerfDevMemWriteSpeed.cc @@ -18,12 +18,12 @@ THE SOFTWARE. */ /** -* @addtogroup hipMemcpyKernel hipMemcpyKernel -* @{ -* @ingroup perfMemoryTest -* `hipMemcpy(void* dst, const void* src, size_t count, hipMemcpyKind kind)` - -* Copies data between host and device. -*/ + * @addtogroup hipMemcpyKernel hipMemcpyKernel + * @{ + * @ingroup perfMemoryTest + * `hipMemcpy(void* dst, const void* src, size_t count, hipMemcpyKind kind)` - + * Copies data between host and device. 
+ */ #include @@ -33,12 +33,12 @@ typedef struct d_uint16 { uint data[ARRAY_SIZE]; } d_uint16; -__global__ void write_kernel(d_uint16 *dst, ulong N, d_uint16 pval) { - size_t idx = (blockIdx.x * blockDim.x + threadIdx.x); - size_t stride = blockDim.x * gridDim.x; - for (size_t i = idx; i < N; i += stride) { - dst[i] = pval; - } +__global__ void write_kernel(d_uint16* dst, ulong N, d_uint16 pval) { + size_t idx = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x; + for (size_t i = idx; i < N; i += stride) { + dst[i] = pval; + } } static bool hipPerfDevMemWriteSpeed_test() { @@ -55,8 +55,8 @@ static bool hipPerfDevMemWriteSpeed_test() { hipDeviceProp_t props; HIP_CHECK(hipGetDeviceProperties(&props, deviceId)); - INFO("info: running on bus " << "0x" << props.pciBusID << " " << - props.name << " with " << props.multiProcessorCount << " CUs \n"); + CONSOLE_PRINT("info: running on bus 0x%x %s with %d CUs\n", props.pciBusID, props.name, + props.multiProcessorCount); const unsigned threadsPerBlock = 64; const unsigned blocks = props.multiProcessorCount * 4; @@ -65,7 +65,7 @@ static bool hipPerfDevMemWriteSpeed_test() { pval.data[i] = inputData; } - hDst = new d_uint16[nBytes]; + hDst = new d_uint16[nBytes]; REQUIRE(hDst != nullptr); for (size_t i = 0; i < N; i++) { @@ -78,18 +78,18 @@ static bool hipPerfDevMemWriteSpeed_test() { HIP_CHECK(hipStreamCreate(&stream)); HIP_CHECK(hipMalloc(&dDst, nBytes)); - hipLaunchKernelGGL(write_kernel, dim3(blocks), dim3(threadsPerBlock), - 0, stream, dDst, N, pval); + hipLaunchKernelGGL(write_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dDst, N, pval); HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipMemcpy(hDst, dDst, nBytes , hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(hDst, dDst, nBytes, hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); for (uint i = 0; i < N; i++) { for (uint j = 0; j < ARRAY_SIZE; j++) { if (hDst[i].data[j] != inputData) { - INFO("hipPerfDevMemWriteSpeed - Data 
validation failed for warm up run!" - << "at index i: " << i << " element j: " << j << - "expected " << inputData << " but got " << hDst[i].data[j]); + DEBUG_PRINT( + "hipPerfDevMemWriteSpeed - Data validation failed for warm up run! at index i: %u " + "element j: %u expected 0x%x but got 0x%x\n", + i, j, inputData, hDst[i].data[j]); return false; } } @@ -99,8 +99,7 @@ static bool hipPerfDevMemWriteSpeed_test() { auto all_start = std::chrono::steady_clock::now(); for (int i = 0; i < nIter; i++) { - hipLaunchKernelGGL(write_kernel, dim3(blocks), dim3(threadsPerBlock), - 0, stream, dDst, N, pval); + hipLaunchKernelGGL(write_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dDst, N, pval); HIP_CHECK(hipGetLastError()); } HIP_CHECK(hipDeviceSynchronize()); @@ -109,44 +108,45 @@ static bool hipPerfDevMemWriteSpeed_test() { std::chrono::duration all_kernel_time = all_end - all_start; // read speed in GB/s - double perf = (static_cast(nBytes * nIter * (1e-09))) / - all_kernel_time.count(); + double perf = (static_cast(nBytes * nIter * (1e-09))) / all_kernel_time.count(); - INFO("hipPerfDevMemReadSpeed - info: average write speed of " << - perf << " GB/s " << "achieved for memory size of " << - nBytes / (1024 * 1024) << " MB"); + CONSOLE_PRINT( + "hipPerfDevMemWriteSpeed - average write speed of %.2f GB/s achieved for memory size of %u " + "MB\n", + perf, nBytes / (1024 * 1024)); - delete [] hDst; + delete[] hDst; HIP_CHECK(hipFree(dDst)); HIP_CHECK(hipStreamDestroy(stream)); return true; } /** -* Test Description -* ------------------------ -*  - Verify hipPerfDevMemWriteSpeed status. -* Test source -* ------------------------ -*  - perftests/memory/hipPerfDevMemWriteSpeed.cc -* Test requirements -* ------------------------ -*  - HIP_VERSION >= 5.6 -*/ + * Test Description + * ------------------------ + *  - Verify hipPerfDevMemWriteSpeed status. 
+ * Test source + * ------------------------ + *  - perftests/memory/hipPerfDevMemWriteSpeed.cc + * Test requirements + * ------------------------ + *  - HIP_VERSION >= 5.6 + */ TEST_CASE("Perf_hipPerfDevMemWriteSpeed_test") { int numDevices = 0; HIP_CHECK(hipGetDeviceCount(&numDevices)); if (numDevices <= 0) { - SUCCEED("Skipped testcase hipPerfDevMemWriteSpeed as" - "there is no device to test."); + SUCCEED( + "Skipped testcase hipPerfDevMemWriteSpeed as" + "there is no device to test."); } else { REQUIRE(true == hipPerfDevMemWriteSpeed_test()); } } /** -* End doxygen group perfMemoryTest. -* @} -*/ + * End doxygen group perfMemoryTest. + * @} + */ diff --git a/catch/perftests/memory/hipPerfHostNumaAlloc.cc b/catch/perftests/memory/hipPerfHostNumaAlloc.cc index b1f056681..39a437b68 100644 --- a/catch/perftests/memory/hipPerfHostNumaAlloc.cc +++ b/catch/perftests/memory/hipPerfHostNumaAlloc.cc @@ -18,28 +18,27 @@ THE SOFTWARE. */ /** -* @addtogroup hipMemcpyKernel hipMemcpyKernel -* @{ -* @ingroup perfMemoryTest -* `hipMemcpy(void* dst, const void* src, size_t count, hipMemcpyKind kind)` - -* Copies data between host and device. -*/ + * @addtogroup hipMemcpyKernel hipMemcpyKernel + * @{ + * @ingroup perfMemoryTest + * `hipMemcpy(void* dst, const void* src, size_t count, hipMemcpyKind kind)` - + * Copies data between host and device. + */ #include #include - +// #define ENABLE_DEBUG 1 // To run it correctly, we must not export HIP_VISIBLE_DEVICES. // And we must explicitly link libnuma because of numa api move_pages(). 
#define NUM_PAGES 4 -char *h = nullptr; -char *d_h = nullptr; -char *m = nullptr; -char *d_m = nullptr; +char* h = nullptr; +char* d_h = nullptr; +char* m = nullptr; +char* d_m = nullptr; int page_size = 1024; -const int mode[] = { MPOL_DEFAULT, MPOL_BIND, MPOL_PREFERRED, MPOL_INTERLEAVE }; -const char* modeStr[] = { "MPOL_DEFAULT", "MPOL_BIND", - "MPOL_PREFERRED", "MPOL_INTERLEAVE" }; +const int mode[] = {MPOL_DEFAULT, MPOL_BIND, MPOL_PREFERRED, MPOL_INTERLEAVE}; +const char* modeStr[] = {"MPOL_DEFAULT", "MPOL_BIND", "MPOL_PREFERRED", "MPOL_INTERLEAVE"}; std::string exeCommand(const char* cmd) { std::array buff; @@ -55,23 +54,22 @@ std::string exeCommand(const char* cmd) { } int getCpuAgentCount() { - const char* cmd = - "cat /proc/cpuinfo | grep \"physical id\" | sort | uniq | wc -l"; + const char* cmd = "cat /proc/cpuinfo | grep \"physical id\" | sort | uniq | wc -l"; int cpuAgentCount = std::atoi(exeCommand(cmd).c_str()); return cpuAgentCount; } bool test(int cpuId, int gpuId, int numaMode, unsigned int hostMallocflags) { - void *pages[NUM_PAGES]; + void* pages[NUM_PAGES]; int status[NUM_PAGES]; int ret_code; - INFO("set cpu " << cpuId << ", gpu " << gpuId << ", numaMode " - << numaMode << ", hostMallocflags " << hostMallocflags << "\n"); + CONSOLE_PRINT("set cpu %d, gpu %d, numaMode %d, hostMallocflags %u\n", cpuId, gpuId, numaMode, + hostMallocflags); if (cpuId >= 0) { - unsigned long nodeMask = 1 << cpuId; //NOLINT - unsigned long maxNode = sizeof(nodeMask) * 8; //NOLINT + unsigned long nodeMask = 1 << cpuId; // NOLINT + unsigned long maxNode = sizeof(nodeMask) * 8; // NOLINT if (set_mempolicy(numaMode, numaMode == MPOL_DEFAULT ? NULL : &nodeMask, numaMode == MPOL_DEFAULT ? 
0 : maxNode) == -1) { WARN("set_mempolicy() failed with err " << errno << "\n"); @@ -83,7 +81,7 @@ bool test(int cpuId, int gpuId, int numaMode, unsigned int hostMallocflags) { HIP_CHECK(hipSetDevice(gpuId)); } - posix_memalign(reinterpret_cast(&m), page_size, page_size*NUM_PAGES); + posix_memalign(reinterpret_cast(&m), page_size, page_size * NUM_PAGES); HIP_CHECK(hipHostRegister(m, page_size * NUM_PAGES, hipHostRegisterMapped)); HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast(&d_m), m, 0)); @@ -94,15 +92,13 @@ bool test(int cpuId, int gpuId, int numaMode, unsigned int hostMallocflags) { } ret_code = move_pages(0, NUM_PAGES, pages, NULL, status, 0); - INFO("Memory (malloc) ret " << ret_code << " at " << m << - " (dev " << d_m << "%p) is at node: "); + CONSOLE_PRINT("Memory (malloc) ret %d at %p (dev %p) is at node: ", ret_code, m, d_m); for (int i = 0; i < NUM_PAGES; i++) { - INFO(status[i]); // Don't verify as it's out of our control + CONSOLE_PRINT("%d ", status[i]); // Don't verify as it's out of our control } - INFO("\n"); + CONSOLE_PRINT("\n"); - HIP_CHECK(hipHostMalloc(reinterpret_cast(&h), - page_size*NUM_PAGES, hostMallocflags)); + HIP_CHECK(hipHostMalloc(reinterpret_cast(&h), page_size * NUM_PAGES, hostMallocflags)); pages[0] = h; for (int i = 1; i < NUM_PAGES; i++) { pages[i] = reinterpret_cast(pages[0]) + page_size; @@ -111,16 +107,14 @@ bool test(int cpuId, int gpuId, int numaMode, unsigned int hostMallocflags) { d_h = nullptr; if (hostMallocflags & hipHostMallocMapped) { HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast(&d_h), h, 0)); - INFO("Memory (hipHostMalloc) ret " << ret_code << " at " << h - << " (dev " << d_h << ") is at node: "); + CONSOLE_PRINT("Memory (hipHostMalloc) ret %d at %p (dev %p) is at node: ", ret_code, h, d_h); } else { - INFO("Memory (hipHostMalloc) ret " << ret_code << " at " - << h << " is at node: "); + CONSOLE_PRINT("Memory (hipHostMalloc) ret %d at %p is at node: ", ret_code, h); } for (int i = 0; i < NUM_PAGES; i++) { 
- INFO(status[i]); // Always print it even if it's wrong. Verify later + CONSOLE_PRINT("%d ", status[i]); // Always print it even if it's wrong. Verify later } - INFO("\n"); + CONSOLE_PRINT("\n"); HIP_CHECK(hipHostFree(reinterpret_cast(h))); HIP_CHECK(hipHostUnregister(m)); @@ -129,8 +123,7 @@ bool test(int cpuId, int gpuId, int numaMode, unsigned int hostMallocflags) { if (cpuId >= 0 && (numaMode == MPOL_BIND || numaMode == MPOL_PREFERRED)) { for (int i = 0; i < NUM_PAGES; i++) { if (status[i] != cpuId) { // Now verify - WARN("Failed at " << i << " status[i] = " << status[i] - << " cpuId " << cpuId << "\n"); + WARN("Failed at " << i << " status[i] = " << status[i] << " cpuId " << cpuId << "\n"); return false; } } @@ -138,12 +131,12 @@ bool test(int cpuId, int gpuId, int numaMode, unsigned int hostMallocflags) { return true; } -bool runTest(const int &cpuCount, const int &gpuCount, - unsigned int hostMallocflags, const std::string &str) { - INFO("Test- " << str.c_str() << "\n"); +bool runTest(const int& cpuCount, const int& gpuCount, unsigned int hostMallocflags, + const std::string& str) { + CONSOLE_PRINT("Test- %s\n", str.c_str()); for (int m = 0; m < sizeof(mode) / sizeof(mode[0]); m++) { - INFO("Testing " << modeStr[m] << "\n"); + CONSOLE_PRINT("Testing %s\n", modeStr[m]); for (int i = 0; i < cpuCount; i++) { for (int j = 0; j < gpuCount; j++) { @@ -157,39 +150,40 @@ bool runTest(const int &cpuCount, const int &gpuCount, } /** -* Test Description -* ------------------------ -*  - Verify hipPerfHostNumaAlloc status. -* Test source -* ------------------------ -*  - perftests/memory/hipPerfHostNumaAlloc.cc -* Test requirements -* ------------------------ -*  - HIP_VERSION >= 5.6 -*/ + * Test Description + * ------------------------ + *  - Verify hipPerfHostNumaAlloc status. 
+ * Test source + * ------------------------ + *  - perftests/memory/hipPerfHostNumaAlloc.cc + * Test requirements + * ------------------------ + *  - HIP_VERSION >= 5.6 + */ TEST_CASE("Perf_hipPerfHostNumaAlloc_test") { int gpuCount = 0; HIP_CHECK(hipGetDeviceCount(&gpuCount)); int cpuCount = getCpuAgentCount(); - INFO("Cpu count " << cpuCount << ", Gpu count " << gpuCount << "\n"); + CONSOLE_PRINT("Cpu count %d, Gpu count %d\n", cpuCount, gpuCount); if (cpuCount < 0 || gpuCount < 0) { - SUCCEED("Skipped testcase hipPerfHostNumaAlloc as " - "there is no device to test.\n"); + SUCCEED( + "Skipped testcase hipPerfHostNumaAlloc as " + "there is no device to test.\n"); return; } - REQUIRE(true == runTest(cpuCount, gpuCount, - hipHostMallocDefault | hipHostMallocNumaUser, - "Testing hipHostMallocDefault | hipHostMallocNumaUser......")); + REQUIRE(true == + runTest(cpuCount, gpuCount, hipHostMallocDefault | hipHostMallocNumaUser, + "Testing hipHostMallocDefault | hipHostMallocNumaUser......")); - REQUIRE(true == runTest(cpuCount, gpuCount, - hipHostMallocMapped | hipHostMallocNumaUser, - "Testing hipHostMallocMapped | hipHostMallocNumaUser.......")); + REQUIRE(true == + runTest(cpuCount, gpuCount, hipHostMallocMapped | hipHostMallocNumaUser, + "Testing hipHostMallocMapped | hipHostMallocNumaUser.......")); } /** -* End doxygen group perfMemoryTest. -* @} -*/ + * End doxygen group perfMemoryTest. + * @} + */ diff --git a/catch/perftests/memory/hipPerfMemFill.cc b/catch/perftests/memory/hipPerfMemFill.cc index 5892c10f8..32bfa622c 100644 --- a/catch/perftests/memory/hipPerfMemFill.cc +++ b/catch/perftests/memory/hipPerfMemFill.cc @@ -18,20 +18,19 @@ */ /** -* @addtogroup hipMemcpyKernel hipMemcpyKernel -* @{ -* @ingroup perfMemoryTest -* `hipMemcpy(void* dst, const void* src, size_t count, hipMemcpyKind kind)` - -* Copies data between host and device. 
-*/ + * @addtogroup hipMemcpyKernel hipMemcpyKernel + * @{ + * @ingroup perfMemoryTest + * `hipMemcpy(void* dst, const void* src, size_t count, hipMemcpyKind kind)` - + * Copies data between host and device. + */ #include #define SIMPLY_ASSIGN 0 #define USE_HIPTEST_SETNUMBLOCKS 0 -template -__global__ void vec_fill(T *x, T coef, int N) { +template __global__ void vec_fill(T* x, T coef, int N) { const int istart = threadIdx.x + blockIdx.x * blockDim.x; const int ishift = blockDim.x * gridDim.x; for (int i = istart; i < N; i += ishift) { @@ -51,8 +50,7 @@ __device__ void print_log(int i, int value, int expected) { printf("failed at %d: val=%d, expected=%d\n", i, value, expected); } -template -__global__ void vec_verify(T *x, T coef, int N) { +template __global__ void vec_verify(T* x, T coef, int N) { const int istart = threadIdx.x + blockIdx.x * blockDim.x; const int ishift = blockDim.x * gridDim.x; for (int i = istart; i < N; i += ishift) { @@ -68,20 +66,17 @@ __global__ void vec_verify(T *x, T coef, int N) { } } -template -__global__ void daxpy(T *__restrict__ x, T *__restrict__ y, - const T coef, int Niter, int N) { +template +__global__ void daxpy(T* __restrict__ x, T* __restrict__ y, const T coef, int Niter, int N) { const int istart = threadIdx.x + blockIdx.x * blockDim.x; const int ishift = blockDim.x * gridDim.x; for (int iter = 0; iter < Niter; ++iter) { T iv = coef * iter; - for (int i = istart; i < N; i += ishift) - y[i] = iv * x[i] + y[i]; + for (int i = istart; i < N; i += ishift) y[i] = iv * x[i] + y[i]; } } -template -class hipPerfMemFill { +template class hipPerfMemFill { private: static constexpr int NUM_START = 27; static constexpr int NUM_SIZE = 4; @@ -96,26 +91,20 @@ class hipPerfMemFill { public: hipPerfMemFill() { for (int i = 0; i < NUM_SIZE; i++) { - // 128M, 256M, 512M, 1024M + // 128M, 256M, 512M, 1024M totalSizes_[i] = 1ull << (i + NUM_START); } } - ~hipPerfMemFill() { } + ~hipPerfMemFill() {} - bool supportLargeBar() { - return 
props_.isLargeBar != 0; - } + bool supportLargeBar() { return props_.isLargeBar != 0; } - bool supportManagedMemory() { - return props_.managedMemory != 0; - } + bool supportManagedMemory() { return props_.managedMemory != 0; } - const T getCoefficient(double val) { - return static_cast(val); - } + const T getCoefficient(double val) { return static_cast(val); } - void setHostBuffer(T *A, T val, size_t size) { + void setHostBuffer(T* A, T val, size_t size) { size_t len = size / sizeof(T); for (int i = 0; i < len; i++) { A[i] = val; @@ -138,33 +127,29 @@ class hipPerfMemFill { HIP_CHECK(hipGetDeviceProperties(&props_, deviceId)); blocksPerCU_ = props_.multiProcessorCount * 4; - std::cout << "Info: running on device: id: " << deviceId << ", bus: 0x" - << props_.pciBusID << " " << props_.name << " with " - << props_.multiProcessorCount << " CUs, large bar: " - << supportLargeBar() << ", managed memory: " << supportManagedMemory() - << ", DeviceMallocFinegrained: " << supportDeviceMallocFinegrained() - << std::endl; + std::cout << "Info: running on device: id: " << deviceId << ", bus: 0x" << props_.pciBusID + << " " << props_.name << " with " << props_.multiProcessorCount + << " CUs, large bar: " << supportLargeBar() + << ", managed memory: " << supportManagedMemory() + << ", DeviceMallocFinegrained: " << supportDeviceMallocFinegrained() << std::endl; return true; } void log_host(const char* title, double GBytes, double sec) { - std::cout << title << " [" << std::setw(7) << GBytes << " GB]: cost " - << std::setw(10) << sec << " s in bandwidth " << std::setw(10) - << GBytes / sec << " [GB/s]" << std::endl; + std::cout << title << " [" << std::setw(7) << GBytes << " GB]: cost " << std::setw(10) << sec + << " s in bandwidth " << std::setw(10) << GBytes / sec << " [GB/s]" << std::endl; } - void log_kernel(const char* title, double GBytes, double sec, - double sec_hv, double sec_kv) { - std::cout << title << " [" << std::setw(7) << GBytes << " GB]: cost " - << std::setw(10) 
<< sec << " s in bandwidth " << std::setw(10) - << GBytes / sec << " [GB/s]" << ", hostVerify cost " - << std::setw(10) << sec_hv << " s in bandwidth " << std::setw(10) - << GBytes / sec_hv << " [GB/s]" << ", kernelVerify cost " - << std::setw(10) << sec_kv << " s in bandwidth " << std::setw(10) - << GBytes / sec_kv << " [GB/s]" << std::endl; + void log_kernel(const char* title, double GBytes, double sec, double sec_hv, double sec_kv) { + std::cout << title << " [" << std::setw(7) << GBytes << " GB]: cost " << std::setw(10) << sec + << " s in bandwidth " << std::setw(10) << GBytes / sec << " [GB/s]" + << ", hostVerify cost " << std::setw(10) << sec_hv << " s in bandwidth " + << std::setw(10) << GBytes / sec_hv << " [GB/s]" << ", kernelVerify cost " + << std::setw(10) << sec_kv << " s in bandwidth " << std::setw(10) << GBytes / sec_kv + << " [GB/s]" << std::endl; } - void hostFill(size_t size, T *data, T coef, double *sec) { + void hostFill(size_t size, T* data, T coef, double* sec) { size_t num = size / sizeof(T); // Size of elements auto start = std::chrono::steady_clock::now(); for (int i = 0; i < num; ++i) { @@ -179,29 +164,29 @@ class hipPerfMemFill { *sec = diff.count(); } - void kernelFill(size_t size, T *data, T coef, double *sec) { + void kernelFill(size_t size, T* data, T coef, double* sec) { size_t num = size / sizeof(T); // Size of elements unsigned blocks = setNumBlocks(num); // kernel will be loaded first time - hipLaunchKernelGGL(HIP_KERNEL_NAME(vec_fill), dim3(blocks), - dim3(threadsPerBlock_), 0, 0, data, 0, num); + hipLaunchKernelGGL(HIP_KERNEL_NAME(vec_fill), dim3(blocks), dim3(threadsPerBlock_), 0, 0, + data, 0, num); HIP_CHECK(hipDeviceSynchronize()); auto start = std::chrono::steady_clock::now(); for (int iter = 0; iter < NUM_ITER; ++iter) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(vec_fill), dim3(blocks), - dim3(threadsPerBlock_), 0, 0, data, coef, num); + hipLaunchKernelGGL(HIP_KERNEL_NAME(vec_fill), dim3(blocks), dim3(threadsPerBlock_), 0, 0, + 
data, coef, num); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::steady_clock::now(); std::chrono::duration diff = end - start; // in second - *sec = diff.count() / NUM_ITER; // in second + *sec = diff.count() / NUM_ITER; // in second } - void hostVerify(size_t size, T *data, T coef, double *sec) { + void hostVerify(size_t size, T* data, T coef, double* sec) { size_t num = size / sizeof(T); // Size of elements auto start = std::chrono::steady_clock::now(); for (int i = 0; i < num; ++i) { @@ -224,27 +209,27 @@ class hipPerfMemFill { *sec = diff.count(); } - void kernelVerify(size_t size, T *data, T coef, double *sec) { + void kernelVerify(size_t size, T* data, T coef, double* sec) { size_t num = size / sizeof(T); // Size of elements unsigned blocks = setNumBlocks(num); // kernel will be loaded first time - hipLaunchKernelGGL(HIP_KERNEL_NAME(vec_verify), dim3(blocks), - dim3(threadsPerBlock_), 0, 0, data, coef, num); + hipLaunchKernelGGL(HIP_KERNEL_NAME(vec_verify), dim3(blocks), dim3(threadsPerBlock_), 0, 0, + data, coef, num); HIP_CHECK(hipDeviceSynchronize()); // Now all data verified. The following is to test bandwidth. 
auto start = std::chrono::steady_clock::now(); for (int iter = 0; iter < NUM_ITER; ++iter) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(vec_verify), dim3(blocks), - dim3(threadsPerBlock_), 0, 0, data, coef, num); + hipLaunchKernelGGL(HIP_KERNEL_NAME(vec_verify), dim3(blocks), dim3(threadsPerBlock_), 0, 0, + data, coef, num); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::steady_clock::now(); std::chrono::duration diff = end - start; // in second - *sec = diff.count() / NUM_ITER; // in second + *sec = diff.count() / NUM_ITER; // in second } bool testLargeBarDeviceMemoryHostFill(size_t size) { @@ -254,7 +239,7 @@ class hipPerfMemFill { double GBytes = static_cast(size) / NUM_1GB; - T *A; + T* A; HIP_CHECK(hipMalloc(&A, size)); double sec = 0; hostFill(size, A, coef_, &sec); // Cpu can access device mem in LB @@ -285,7 +270,7 @@ class hipPerfMemFill { } double GBytes = static_cast(size) / NUM_1GB; - T *A; + T* A; HIP_CHECK(hipMallocManaged(&A, size)); double sec = 0; hostFill(size, A, coef_, &sec); // Cpu can access HMM mem @@ -301,7 +286,7 @@ class hipPerfMemFill { } double GBytes = static_cast(size) / NUM_1GB; - T *A; + T* A; HIP_CHECK(hipMallocManaged(&A, size)); double sec = 0, sec_hv = 0, sec_kv = 0; @@ -340,7 +325,7 @@ class hipPerfMemFill { bool testHostMemoryHostFill(size_t size, unsigned int flags) { double GBytes = static_cast(size) / NUM_1GB; - T *A; + T* A; HIP_CHECK(hipHostMalloc(&A, size, flags)); double sec = 0; hostFill(size, A, coef_, &sec); @@ -353,8 +338,8 @@ class hipPerfMemFill { bool testHostMemoryKernelFill(size_t size, unsigned int flags) { double GBytes = static_cast(size) / NUM_1GB; - T *A; - HIP_CHECK(hipHostMalloc(reinterpret_cast(&A), size, flags)); + T* A; + HIP_CHECK(hipHostMalloc(reinterpret_cast(&A), size, flags)); double sec = 0, sec_hv = 0, sec_kv = 0; kernelFill(size, A, coef_, &sec); hostVerify(size, A, coef_, &sec_hv); @@ -400,10 +385,11 @@ class hipPerfMemFill { /* This function should be via device attribute query*/ bool 
supportDeviceMallocFinegrained() { #ifdef __HIP_PLATFORM_AMD__ - T *A = nullptr; + T* A = nullptr; hipError_t err; - err = hipExtMallocWithFlags(reinterpret_cast(&A), sizeof(T), - hipDeviceMallocFinegrained); + + err = + hipExtMallocWithFlags(reinterpret_cast(&A), sizeof(T), hipDeviceMallocFinegrained); if (err || !A) { return false; } @@ -415,7 +401,7 @@ class hipPerfMemFill { } unsigned int setNumBlocks(size_t size) { - size_t num = size/sizeof(T); + size_t num = size / sizeof(T); #if USE_HIPTEST_SETNUMBLOCKS return HipTest::setNumBlocks(blocksPerCU_, threadsPerBlock_, num); @@ -428,12 +414,11 @@ class hipPerfMemFill { bool testExtDeviceMemoryHostFill(size_t size, unsigned int flags) { double GBytes = static_cast(size) / NUM_1GB; - T *A = nullptr; - HIP_CHECK(hipExtMallocWithFlags(reinterpret_cast(&A), - size, flags)); + T* A = nullptr; + HIP_CHECK(hipExtMallocWithFlags(reinterpret_cast(&A), size, flags)); if (!A) { - std::cout << "failed hipExtMallocWithFlags() with size =" << - size << " flags="<< std::hex << flags << std::endl; + std::cout << "failed hipExtMallocWithFlags() with size =" << size << " flags=" << std::hex + << flags << std::endl; return false; } @@ -448,12 +433,11 @@ class hipPerfMemFill { bool testExtDeviceMemoryKernelFill(size_t size, unsigned int flags) { double GBytes = static_cast(size) / NUM_1GB; - T *A = nullptr; - HIP_CHECK(hipExtMallocWithFlags(reinterpret_cast(&A), - size, flags)); + T* A = nullptr; + HIP_CHECK(hipExtMallocWithFlags(reinterpret_cast(&A), size, flags)); if (!A) { - std::cout << "failed hipExtMallocWithFlags() with size =" << - size << " flags=" << std::hex << flags << std::endl; + std::cout << "failed hipExtMallocWithFlags() with size =" << size << " flags=" << std::hex + << flags << std::endl; return false; } @@ -470,20 +454,16 @@ class hipPerfMemFill { } bool testExtDeviceMemory() { - std::cout << "Test fine grained device memory host filling" - << std::endl; + std::cout << "Test fine grained device memory host 
filling" << std::endl; for (int i = 0; i < NUM_SIZE; i++) { - if (!testExtDeviceMemoryHostFill(totalSizes_[i], - hipDeviceMallocFinegrained)) { + if (!testExtDeviceMemoryHostFill(totalSizes_[i], hipDeviceMallocFinegrained)) { return false; } } - std::cout << "Test fine grained device memory kernel filling" - << std::endl; + std::cout << "Test fine grained device memory kernel filling" << std::endl; for (int i = 0; i < NUM_SIZE; i++) { - if (!testExtDeviceMemoryKernelFill(totalSizes_[i], - hipDeviceMallocFinegrained)) { + if (!testExtDeviceMemoryKernelFill(totalSizes_[i], hipDeviceMallocFinegrained)) { return false; } } @@ -521,16 +501,16 @@ class hipPerfMemFill { }; /** -* Test Description -* ------------------------ -*  - Verify hipPerfMemFill status. -* Test source -* ------------------------ -*  - perftests/memory/hipPerfMemFill.cc -* Test requirements -* ------------------------ -*  - HIP_VERSION >= 5.6 -*/ + * Test Description + * ------------------------ + *  - Verify hipPerfMemFill status. + * Test source + * ------------------------ + *  - perftests/memory/hipPerfMemFill.cc + * Test requirements + * ------------------------ + *  - HIP_VERSION >= 5.6 + */ TEST_CASE("Perf_hipPerfMemFill_test") { std::cout << "Test int" << std::endl; @@ -545,6 +525,6 @@ TEST_CASE("Perf_hipPerfMemFill_test") { } /** -* End doxygen group perfMemoryTest. -* @} -*/ + * End doxygen group perfMemoryTest. + * @} + */ diff --git a/catch/perftests/memory/hipPerfMemMallocCpyFree.cc b/catch/perftests/memory/hipPerfMemMallocCpyFree.cc index 3960a16bd..e48bc53e5 100644 --- a/catch/perftests/memory/hipPerfMemMallocCpyFree.cc +++ b/catch/perftests/memory/hipPerfMemMallocCpyFree.cc @@ -18,13 +18,13 @@ THE SOFTWARE. */ /** -* @addtogroup hipMemcpy hipMemcpy -* @{ -* @ingroup perfMemoryTest -* `hipMemcpy(void* dst, const void* src, size_t count, hipMemcpyKind kind)` - -* Copies data between host and device. 
-*/ - + * @addtogroup hipMemcpy hipMemcpy + * @{ + * @ingroup perfMemoryTest + * `hipMemcpy(void* dst, const void* src, size_t count, hipMemcpyKind kind)` - + * Copies data between host and device. + */ +// #define ENABLE_DEBUG 1 #include #include @@ -38,7 +38,7 @@ void valSet(int* A, int val, size_t size) { } } -void setup(size_t *size, int *num, int **pA, const size_t totalGlobalMem) { +void setup(size_t* size, int* num, int** pA, const size_t totalGlobalMem) { for (int i = 0; i < *num; i++) { size[i] = 1 << (i + 6); if ((NUM_ITER + 1) * size[i] > totalGlobalMem) { @@ -50,39 +50,39 @@ void setup(size_t *size, int *num, int **pA, const size_t totalGlobalMem) { valSet(*pA, 1, size[*num - 1]); } -void testInit(size_t size, int *A) { - int *Ad; +void testInit(size_t size, int* A) { + int* Ad; clock_t start = clock(); - HIP_CHECK(hipMalloc(&Ad, size)); // hip::init() will be called + HIP_CHECK(hipMalloc(&Ad, size)); // hip::init() will be called clock_t end = clock(); double uS = (end - start) * 1000000. / CLOCKS_PER_SEC; - INFO("Initial: hipMalloc(" << size << ") cost " << uS << "us" << "\n"); + CONSOLE_PRINT("Initial: hipMalloc(%zu) cost %.2fus\n", size, uS); start = clock(); HIP_CHECK(hipMemcpy(Ad, A, size, hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); end = clock(); uS = (end - start) * 1000000. / CLOCKS_PER_SEC; - INFO("hipMemcpy(" << size << ") cost " << uS << "us" << "\n"); + CONSOLE_PRINT("hipMemcpy(%zu) cost %.2fus\n", size, uS); start = clock(); HIP_CHECK(hipFree(Ad)); end = clock(); uS = (end - start) * 1000000. 
/ CLOCKS_PER_SEC; - INFO("hipFree(" << size << ") cost " << uS << "us" << "\n"); + CONSOLE_PRINT("hipFree(%zu) cost %.2fus\n", size, uS); } static bool hipPerfMemMallocCpyFree_test() { double uS; clock_t start, end; - size_t size[NUM_SIZE] = { 0 }; - int *Ad[NUM_ITER] = { nullptr }; - int *A; + size_t size[NUM_SIZE] = {0}; + int* Ad[NUM_ITER] = {nullptr}; + int* A; hipDeviceProp_t props; memset(&props, 0, sizeof(props)); HIP_CHECK(hipGetDeviceProperties(&props, 0)); - INFO("totalGlobalMem: " << props.totalGlobalMem << "\n"); + CONSOLE_PRINT("totalGlobalMem: %zu\n", props.totalGlobalMem); int num = NUM_SIZE; setup(size, &num, &A, props.totalGlobalMem); @@ -91,59 +91,60 @@ static bool hipPerfMemMallocCpyFree_test() { for (int i = 0; i < num; i++) { start = clock(); for (int j = 0; j < NUM_ITER; j++) { - HIP_CHECK(hipMalloc(&Ad[j], size[i])); + HIP_CHECK(hipMalloc(&Ad[j], size[i])); } end = clock(); uS = (end - start) * 1000000. / (NUM_ITER * CLOCKS_PER_SEC); - INFO("hipMalloc(" << size[i] << ") cost " << uS << "us" << "\n"); + CONSOLE_PRINT("hipMalloc(%zu) cost %.2fus\n", size[i], uS); start = clock(); for (int j = 0; j < NUM_ITER; j++) { - HIP_CHECK(hipMemcpy(Ad[j], A, size[i], hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(Ad[j], A, size[i], hipMemcpyHostToDevice)); } HIP_CHECK(hipDeviceSynchronize()); end = clock(); uS = (end - start) * 1000000. / (NUM_ITER * CLOCKS_PER_SEC); - INFO("hipMemcpy(" << size[i] << ") cost " << uS << "us" << "\n"); + CONSOLE_PRINT("hipMemcpy(%zu) cost %.2fus\n", size[i], uS); start = clock(); for (int j = 0; j < NUM_ITER; j++) { - HIP_CHECK(hipFree(Ad[j])); - Ad[j] = nullptr; + HIP_CHECK(hipFree(Ad[j])); + Ad[j] = nullptr; } end = clock(); double uS = (end - start) * 1000000. 
/ (NUM_ITER * CLOCKS_PER_SEC); - INFO("hipFree(" << size[i] << ") cost " << uS << "us" << "\n"); + CONSOLE_PRINT("hipFree(%zu) cost %.2fus\n", size[i], uS); } free(A); return true; } /** -* Test Description -* ------------------------ -*  - Verify hipPerfMemMallocCpyFree status. -* Test source -* ------------------------ -*  - perftests/memory/hipPerfMemMallocCpyFree.cc -* Test requirements -* ------------------------ -*  - HIP_VERSION >= 5.6 -*/ + * Test Description + * ------------------------ + *  - Verify hipPerfMemMallocCpyFree status. + * Test source + * ------------------------ + *  - perftests/memory/hipPerfMemMallocCpyFree.cc + * Test requirements + * ------------------------ + *  - HIP_VERSION >= 5.6 + */ TEST_CASE("Perf_hipPerfMemMallocCpyFree_test") { int numDevices = 0; HIP_CHECK(hipGetDeviceCount(&numDevices)); if (numDevices <= 0) { - SUCCEED("Skipped testcase hipPerfDevMemReadSpeed as" - "there is no device to test."); + SUCCEED( + "Skipped testcase hipPerfDevMemReadSpeed as" + "there is no device to test."); } else { REQUIRE(true == hipPerfMemMallocCpyFree_test()); } } /** -* End doxygen group perfMemoryTest. -* @} -*/ + * End doxygen group perfMemoryTest. + * @} + */ diff --git a/catch/perftests/memory/hipPerfMemcpy.cc b/catch/perftests/memory/hipPerfMemcpy.cc index badc3d43c..d5ad0786a 100644 --- a/catch/perftests/memory/hipPerfMemcpy.cc +++ b/catch/perftests/memory/hipPerfMemcpy.cc @@ -18,15 +18,15 @@ */ /** -* @addtogroup hipMemcpy hipMemcpy -* @{ -* @ingroup perfMemoryTest -* `hipMemcpy(void* dst, const void* src, size_t count, hipMemcpyKind kind)` - -* Copies data between host and device. -*/ + * @addtogroup hipMemcpy hipMemcpy + * @{ + * @ingroup perfMemoryTest + * `hipMemcpy(void* dst, const void* src, size_t count, hipMemcpyKind kind)` - + * Copies data between host and device. 
+ */ #include - +// #define ENABLE_DEBUG 1 #define NUM_SIZE 14 #define NUM_ITER 1000 // max BW number for DevicetoDeviceNoCU @@ -35,7 +35,8 @@ class hipPerfMemcpy { private: size_t totalSizes_[NUM_SIZE]; - void setHostBuffer(int *A, int val, size_t size); + void setHostBuffer(int* A, int val, size_t size); + public: hipPerfMemcpy(); ~hipPerfMemcpy() {} @@ -53,7 +54,7 @@ hipPerfMemcpy::hipPerfMemcpy() { } } -void hipPerfMemcpy::setHostBuffer(int *A, int val, size_t size) { +void hipPerfMemcpy::setHostBuffer(int* A, int val, size_t size) { size_t len = size / sizeof(int); for (int i = 0; i < len; i++) { A[i] = val; @@ -61,36 +62,31 @@ void hipPerfMemcpy::setHostBuffer(int *A, int val, size_t size) { } void hipPerfMemcpy::TestResult(unsigned int numTests, - std::chrono::duration diff, hipMemcpyKind type) -{ + std::chrono::duration diff, hipMemcpyKind type) { // BW in GB/s - double perf = (static_cast(totalSizes_[numTests] * NUM_ITER) * - static_cast(1e-03)) / diff.count(); - - const char *typestr = NULL; - - if(type == hipMemcpyHostToDevice){ - typestr = "Host to Device"; - } - else if(type == hipMemcpyDeviceToHost){ - typestr = "Device to Host"; + double perf = + (static_cast(totalSizes_[numTests] * NUM_ITER) * static_cast(1e-03)) / + diff.count(); + + const char* typestr = NULL; + + if (type == hipMemcpyHostToDevice) { + typestr = "Host to Device"; + } else if (type == hipMemcpyDeviceToHost) { + typestr = "Device to Host"; + } else if (type == hipMemcpyDeviceToDevice) { + typestr = "Device to Device"; + perf *= 2.0; + } else if (type == hipMemcpyDeviceToDeviceNoCU) { + typestr = "Device to Device No CU"; + perf *= 2.0; } - else if(type == hipMemcpyDeviceToDevice){ - typestr = "Device to Device"; - perf *= 2.0; - } - else if(type == hipMemcpyDeviceToDeviceNoCU){ - typestr = "Device to Device No CU"; - perf *= 2.0; - } - - UNSCOPED_INFO("hipPerfMemcpy[" << numTests << "] " << typestr << " copy BW " - << perf << " GB/s for memory size of " << - totalSizes_[numTests] << 
" Bytes."); - if(totalSizes_[numTests] == 4194304 && type == hipMemcpyDeviceToDeviceNoCU) - REQUIRE(perf < NOCU_MAX_BW); + CONSOLE_PRINT("hipPerfMemcpy[%d] %s copy BW %.2f GB/s for memory size of %lu Bytes.\n", numTests, + typestr, perf, totalSizes_[numTests]); + if (totalSizes_[numTests] == 4194304 && type == hipMemcpyDeviceToDeviceNoCU) + REQUIRE(perf < NOCU_MAX_BW); } bool hipPerfMemcpy::run_h2d(unsigned int numTests) { @@ -115,7 +111,7 @@ bool hipPerfMemcpy::run_h2d(unsigned int numTests) { TestResult(numTests, diff, hipMemcpyHostToDevice); HIP_CHECK(hipHostUnregister(A)); - delete [] A; + delete[] A; HIP_CHECK(hipFree(Ad)); return true; @@ -143,7 +139,7 @@ bool hipPerfMemcpy::run_d2h(unsigned int numTests) { TestResult(numTests, diff, hipMemcpyDeviceToHost); HIP_CHECK(hipHostUnregister(A)); - delete [] A; + delete[] A; HIP_CHECK(hipFree(Ad)); return true; @@ -186,8 +182,8 @@ bool hipPerfMemcpy::run_d2d_nocu(unsigned int numTests) { auto all_start = std::chrono::steady_clock::now(); for (int j = 0; j < NUM_ITER; j++) { - HIP_CHECK(hipMemcpyAsync(Ad1, Ad2, totalSizes_[numTests], hipMemcpyDeviceToDeviceNoCU, - nullptr)); + HIP_CHECK( + hipMemcpyAsync(Ad1, Ad2, totalSizes_[numTests], hipMemcpyDeviceToDeviceNoCU, nullptr)); } HIP_CHECK(hipDeviceSynchronize()); @@ -204,16 +200,16 @@ bool hipPerfMemcpy::run_d2d_nocu(unsigned int numTests) { } /** -* Test Description -* ------------------------ -*  - Verify hipPerfMemcpy status. -* Test source -* ------------------------ -*  - perftests/memory/hipPerfMemcpy.cc -* Test requirements -* ------------------------ -*  - HIP_VERSION >= 5.6 -*/ + * Test Description + * ------------------------ + *  - Verify hipPerfMemcpy status. 
+ * Test source + * ------------------------ + *  - perftests/memory/hipPerfMemcpy.cc + * Test requirements + * ------------------------ + *  - HIP_VERSION >= 5.6 + */ TEST_CASE("Perf_hipPerfMemcpy_test") { int numDevices = 0; @@ -227,35 +223,34 @@ TEST_CASE("Perf_hipPerfMemcpy_test") { hipDeviceProp_t props; HIP_CHECK(hipGetDeviceProperties(&props, deviceId)); - UNSCOPED_INFO("info: running on bus " << "0x" << props.pciBusID << " " << - props.name << " with " << props.multiProcessorCount << " CUs " - << " and device id: " << deviceId); + CONSOLE_PRINT("info: running on bus 0x%x %s with %d CUs and device id: %d\n", props.pciBusID, + props.name, props.multiProcessorCount, deviceId); hipPerfMemcpy hipPerfMemcpy; - SECTION("Perf test Host Memory to Device Memory"){ + SECTION("Perf test Host Memory to Device Memory") { for (auto testCase = 0; testCase < NUM_SIZE; testCase++) { - REQUIRE(true == hipPerfMemcpy.run_h2d(testCase)); + REQUIRE(true == hipPerfMemcpy.run_h2d(testCase)); } } - SECTION("Perf test Device Memory to Host Memory"){ + SECTION("Perf test Device Memory to Host Memory") { for (auto testCase = 0; testCase < NUM_SIZE; testCase++) { - REQUIRE(true == hipPerfMemcpy.run_d2h(testCase)); + REQUIRE(true == hipPerfMemcpy.run_d2h(testCase)); } } - SECTION("Perf test Device Memory to Device Memory"){ + SECTION("Perf test Device Memory to Device Memory") { for (auto testCase = 0; testCase < NUM_SIZE; testCase++) { - REQUIRE(true == hipPerfMemcpy.run_d2d(testCase)); + REQUIRE(true == hipPerfMemcpy.run_d2d(testCase)); } } - SECTION("Perf test Device Memory to Device Memory No CU"){ + SECTION("Perf test Device Memory to Device Memory No CU") { for (auto testCase = 0; testCase < NUM_SIZE; testCase++) { - REQUIRE(true == hipPerfMemcpy.run_d2d_nocu(testCase)); + REQUIRE(true == hipPerfMemcpy.run_d2d_nocu(testCase)); } } } } /** -* End doxygen group perfMemoryTest. -* @} -*/ + * End doxygen group perfMemoryTest. 
+ * @} + */ diff --git a/catch/perftests/memory/hipPerfMemcpyAsyncSpeed.cc b/catch/perftests/memory/hipPerfMemcpyAsyncSpeed.cc index 94b5e6e79..c9e963de8 100644 --- a/catch/perftests/memory/hipPerfMemcpyAsyncSpeed.cc +++ b/catch/perftests/memory/hipPerfMemcpyAsyncSpeed.cc @@ -17,12 +17,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include -// Quiet pesky warnings -#ifdef WIN_OS -#define SNPRINTF sprintf_s -#else -#define SNPRINTF snprintf -#endif #define NUM_SIZES 6 // 256 Bytes, 512 Bytes, 1024 Bytes, 2048 Bytes, 3072 Bytes, 4096 Bytes @@ -38,9 +32,9 @@ void checkData(void* ptr, unsigned int size, char value) { char* ptr2 = (char*)ptr; for (unsigned int i = 0; i < size; i++) { if (ptr2[i] != value) { - printf("Data validation failed at %d! Got 0x%08x\n", i, ptr2[i]); - printf("Expected 0x%08x\n", value); - printf("Data validation failed!"); + CONSOLE_PRINT("Data validation failed at %d! Got 0x%08x\n", i, ptr2[i]); + CONSOLE_PRINT("Expected 0x%08x\n", value); + CONSOLE_PRINT("Data validation failed!"); break; } } @@ -50,7 +44,7 @@ bool extraWarmup = true; TEST_CASE("Perf_hipPerfMemcpyAsyncSpeed_test") { hipDeviceProp_t props; HIP_CHECK(hipGetDeviceProperties(&props, 0)); - printf("Set device to %d : %s\n", 0, props.name); + CONSOLE_PRINT("Set device to %d : %s\n", 0, props.name); HIP_CHECK(hipSetDevice(0)); unsigned int bufSize_; @@ -66,9 +60,9 @@ TEST_CASE("Perf_hipPerfMemcpyAsyncSpeed_test") { int test = 0; uint32_t kMaxSize = (t == 0) ? 
128 * 1024 * 1024 : 1024 * 1024 * 1024; if (t < 2) { - printf("----- Global buffer (MiB): %d\n", kMaxSize / (1024 * 1024)); + CONSOLE_PRINT("\n----- Global buffer (MiB): %d", kMaxSize / (1024 * 1024)); } else { - printf("----- Same buffer copy repeat\n"); + CONSOLE_PRINT("\n----- Same buffer copy repeat"); } for (; test <= numTests; test++) { bufSize_ = Sizes[test % NUM_SIZES]; @@ -131,12 +125,11 @@ TEST_CASE("Perf_hipPerfMemcpyAsyncSpeed_test") { // Double results when src and dst are both on device perf *= 2.0; char buf[256]; - SNPRINTF(buf, sizeof(buf), - "hipMemcpyAsync[%d]\t(%8d bytes)\ts:%s d:%s\ti:%4d\t(GB/s) " - "perf\t%.2f, time per iter(us):\t%.1f, time per iter CPU (us):\t%.1f", - test, bufSize_, strSrc, strDst, numIter, (float)perf, - sec.count() / numIter * 1000 * 1000, sec_cpu.count() / numIter * 1000 * 1000); - printf("%s\n", buf); + CONSOLE_PRINT( + "hipMemcpyAsync[%d]\t(%8d bytes)\ts:%s d:%s\ti:%4d\t(GB/s) " + "perf\t%.2f, time per iter(us):\t%.1f, time per iter CPU (us):\t%.1f", + test, bufSize_, strSrc, strDst, numIter, (float)perf, sec.count() / numIter * 1000 * 1000, + sec_cpu.count() / numIter * 1000 * 1000); // Verification void* temp = malloc(bufSize_ + 4096); diff --git a/catch/perftests/memory/hipPerfMemset.cc b/catch/perftests/memory/hipPerfMemset.cc index 5b84e9194..aaf434245 100644 --- a/catch/perftests/memory/hipPerfMemset.cc +++ b/catch/perftests/memory/hipPerfMemset.cc @@ -18,30 +18,29 @@ */ /** -* @addtogroup hipMemsetKernel hipMemsetKernel -* @{ -* @ingroup perfMemoryTest -* `hipMemset(void* devPtr, int value, size_t count)` - -* Initializes or sets device memory to a value. -*/ - + * @addtogroup hipMemsetKernel hipMemsetKernel + * @{ + * @ingroup perfMemoryTest + * `hipMemset(void* devPtr, int value, size_t count)` - + * Initializes or sets device memory to a value. 
+ */ +// #define ENABLE_DEBUG 1 #include static unsigned int sizeList[] = { - 256, 512, 1024, 2048, 4096, 8192, + 256, 512, 1024, 2048, 4096, 8192, }; -static unsigned int eleNumList[] = { - 0x100, 0x400, 0x1000, 0x4000, 0x10000, 0x20000, 0x40000, 0x80000, 0x100000, - 0x200000, 0x400000, 0x800000, 0x1000000 -}; +static unsigned int eleNumList[] = {0x100, 0x400, 0x1000, 0x4000, 0x10000, + 0x20000, 0x40000, 0x80000, 0x100000, 0x200000, + 0x400000, 0x800000, 0x1000000}; typedef struct _dataType { char memsetval = 0x42; char memsetD8val = 0xDE; int16_t memsetD16val = 0xDEAD; int memsetD32val = 0xDEADBEEF; -}dataType; +} dataType; #define NUM_ITER 1000 @@ -56,7 +55,7 @@ enum MemsetType { class hipPerfMemset { private: - uint64_t bufSize_; + uint64_t bufSize_; unsigned int num_elements_; unsigned int testNumEle_; unsigned int _numSubTests = 0; @@ -78,25 +77,19 @@ class hipPerfMemset { bool open(int deviceID); - template + template void run1D(unsigned int test, T memsetval, enum MemsetType type, bool async); - template + template void run2D(unsigned int test, T memsetval, enum MemsetType type, bool async); - template + template void run3D(unsigned int test, T memsetval, enum MemsetType type, bool async); - uint getNumTests() { - return _numSubTests; - } + uint getNumTests() { return _numSubTests; } - uint getNumTests2D() { - return _numSubTests2D; - } - uint getNumTests3D() { - return _numSubTests3D; - } + uint getNumTests2D() { return _numSubTests2D; } + uint getNumTests3D() { return _numSubTests3D; } }; bool hipPerfMemset::open(int deviceId) { @@ -109,15 +102,13 @@ bool hipPerfMemset::open(int deviceId) { HIP_CHECK(hipSetDevice(deviceId)); hipDeviceProp_t props; HIP_CHECK(hipGetDeviceProperties(&props, deviceId)); - INFO("info: running on bus " << "0x" << props.pciBusID << " " << props.name - << " with " << props.multiProcessorCount << " CUs and device id: " - << deviceId << "\n"); + CONSOLE_PRINT("info: running on bus 0x%x %s with %d CUs and device id: %d\n", 
props.pciBusID, + props.name, props.multiProcessorCount, deviceId); return true; } -template -void hipPerfMemset::run1D(unsigned int test, T memsetval, - enum MemsetType type, bool async) { +template +void hipPerfMemset::run1D(unsigned int test, T memsetval, enum MemsetType type, bool async) { T *A_h, *A_d; testNumEle_ = eleNumList[test % num_elements_]; @@ -126,17 +117,17 @@ void hipPerfMemset::run1D(unsigned int test, T memsetval, HIP_CHECK(hipMalloc(&A_d, bufSize_)); - A_h = reinterpret_cast (malloc(bufSize_)); + A_h = reinterpret_cast(malloc(bufSize_)); hipStream_t stream; HIP_CHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); // Warm-up if (async) { - HIP_CHECK(hipMemsetAsync((void *)A_d, memsetval, bufSize_, stream)); + HIP_CHECK(hipMemsetAsync((void*)A_d, memsetval, bufSize_, stream)); HIP_CHECK(hipStreamSynchronize(stream)); } else { - HIP_CHECK(hipMemset((void *)A_d, memsetval, bufSize_)); + HIP_CHECK(hipMemset((void*)A_d, memsetval, bufSize_)); HIP_CHECK(hipDeviceSynchronize()); } @@ -144,7 +135,7 @@ void hipPerfMemset::run1D(unsigned int test, T memsetval, for (uint i = 0; i < NUM_ITER; i++) { if (type == hipMemsetTypeDefault && !async) { - HIP_CHECK(hipMemset(reinterpret_cast(A_d), memsetval, bufSize_)); + HIP_CHECK(hipMemset(reinterpret_cast(A_d), memsetval, bufSize_)); } else if (type == hipMemsetTypeDefault && async) { HIP_CHECK(hipMemsetAsync(A_d, memsetval, bufSize_, stream)); } else if (type == hipMemsetTypeD8 && !async) { @@ -152,13 +143,13 @@ void hipPerfMemset::run1D(unsigned int test, T memsetval, } else if (type == hipMemsetTypeD8 && async) { HIP_CHECK(hipMemsetD8Async((hipDeviceptr_t)A_d, memsetval, bufSize_, stream)); } else if (type == hipMemsetTypeD16 && !async) { - HIP_CHECK(hipMemsetD16((hipDeviceptr_t)A_d, memsetval, bufSize_/sizeof(T))); + HIP_CHECK(hipMemsetD16((hipDeviceptr_t)A_d, memsetval, bufSize_ / sizeof(T))); } else if (type == hipMemsetTypeD16 && async) { - HIP_CHECK(hipMemsetD16Async((hipDeviceptr_t)A_d, 
memsetval, bufSize_/sizeof(T), stream)); + HIP_CHECK(hipMemsetD16Async((hipDeviceptr_t)A_d, memsetval, bufSize_ / sizeof(T), stream)); } else if (type == hipMemsetTypeD32 && !async) { - HIP_CHECK(hipMemsetD32((hipDeviceptr_t)A_d, memsetval, bufSize_/sizeof(T))); + HIP_CHECK(hipMemsetD32((hipDeviceptr_t)A_d, memsetval, bufSize_ / sizeof(T))); } else if (type == hipMemsetTypeD32 && async) { - HIP_CHECK(hipMemsetD32Async((hipDeviceptr_t)A_d, memsetval, bufSize_/sizeof(T), stream)); + HIP_CHECK(hipMemsetD32Async((hipDeviceptr_t)A_d, memsetval, bufSize_ / sizeof(T), stream)); } } if (async) { @@ -169,13 +160,12 @@ void hipPerfMemset::run1D(unsigned int test, T memsetval, auto end = std::chrono::steady_clock::now(); - HIP_CHECK(hipMemcpy(A_h, A_d, bufSize_, hipMemcpyDeviceToHost) ); + HIP_CHECK(hipMemcpy(A_h, A_d, bufSize_, hipMemcpyDeviceToHost)); for (int i = 0; i < bufSize_ / sizeof(T); i++) { if (A_h[i] != memsetval) { - INFO("mismatch at index " << i << " computed: " << - static_cast (A_h[i]) << ", memsetval: " << - static_cast (memsetval) << "\n"); + DEBUG_PRINT("mismatch at index %d computed: %d, memsetval: %d\n", i, static_cast(A_h[i]), + static_cast(memsetval)); REQUIRE(false); } } @@ -188,30 +178,27 @@ void hipPerfMemset::run1D(unsigned int test, T memsetval, auto sec = diff.count(); auto perf = static_cast((bufSize_ * NUM_ITER * (1e-09)) / sec); - std::cout << "[" << std::setw(2) - << test << "] " << std::setw(5) << bufSize_/1024 - << " Kb " << std::setw(4) << " typeSize " << sizeof(T) << " : " - << std::setw(7) << perf << " GB/s \n"; + std::cout << "[" << std::setw(2) << test << "] " << std::setw(5) << bufSize_ / 1024 << " Kb " + << std::setw(4) << " typeSize " << sizeof(T) << " : " << std::setw(7) << perf + << " GB/s \n"; } -template -void hipPerfMemset::run2D(unsigned int test, T memsetval, - enum MemsetType type, bool async) { +template +void hipPerfMemset::run2D(unsigned int test, T memsetval, enum MemsetType type, bool async) { bufSize_ = sizeList[test % 
num_sizes_]; size_t numH = bufSize_; size_t numW = bufSize_; size_t pitch_A; size_t width = numW * sizeof(char); size_t sizeElements = width * numH; - size_t elements = numW* numH; + size_t elements = numW * numH; - T * A_h, * A_d; + T *A_h, *A_d; - HIP_CHECK(hipMallocPitch(reinterpret_cast(&A_d), - &pitch_A, width, numH)); + HIP_CHECK(hipMallocPitch(reinterpret_cast(&A_d), &pitch_A, width, numH)); A_h = reinterpret_cast(malloc(sizeElements)); - for (size_t i=0; i < elements; i++) { + for (size_t i = 0; i < elements; i++) { A_h[i] = 1; } @@ -244,14 +231,12 @@ void hipPerfMemset::run2D(unsigned int test, T memsetval, auto end = std::chrono::steady_clock::now(); - HIP_CHECK(hipMemcpy2D(A_h, width, A_d, pitch_A, numW, numH, - hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy2D(A_h, width, A_d, pitch_A, numW, numH, hipMemcpyDeviceToHost)); - for (int i=0; i < elements; i++) { + for (int i = 0; i < elements; i++) { if (A_h[i] != memsetval) { - INFO("mismatch at index " << i << " computed: " << - static_cast (A_h[i]) << ", memsetval: " << - static_cast (memsetval) << "\n"); + DEBUG_PRINT("mismatch at index %d computed: %d, memsetval: %d\n", i, static_cast(A_h[i]), + static_cast(memsetval)); REQUIRE(false); } } @@ -259,20 +244,19 @@ void hipPerfMemset::run2D(unsigned int test, T memsetval, std::chrono::duration diff = end - start; auto sec = diff.count(); - auto perf = static_cast((sizeElements* NUM_ITER * (1e-09)) / sec); + auto perf = static_cast((sizeElements * NUM_ITER * (1e-09)) / sec); - std::cout << "hipPerf2DMemset" << (async ? "Async" : " ") << "[" << test << "] " - << " " << "(GB/s) for " << std::setw(5) << bufSize_ - << " x " << std::setw(5) << bufSize_ << " bytes : " << std::setw(7) << perf << "\n"; + std::cout << "hipPerf2DMemset" << (async ? 
"Async" : " ") << "[" << test << "] " << " " + << "(GB/s) for " << std::setw(5) << bufSize_ << " x " << std::setw(5) << bufSize_ + << " bytes : " << std::setw(7) << perf << "\n"; HIP_CHECK(hipStreamDestroy(stream)); HIP_CHECK(hipFree(A_d)); free(A_h); } -template -void hipPerfMemset::run3D(unsigned int test, T memsetval, - enum MemsetType type, bool async) { +template +void hipPerfMemset::run3D(unsigned int test, T memsetval, enum MemsetType type, bool async) { bufSize_ = sizeList[test % num_sizes_]; size_t numH = bufSize_; @@ -280,12 +264,12 @@ void hipPerfMemset::run3D(unsigned int test, T memsetval, size_t depth = 10; size_t width = numW * sizeof(char); size_t sizeElements = width * numH * depth; - size_t elements = numW* numH* depth; + size_t elements = numW * numH * depth; hipStream_t stream; HIP_CHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); - T *A_h; + T* A_h; hipExtent extent = make_hipExtent(width, numH, depth); hipPitchedPtr devPitchedPtr; @@ -325,12 +309,12 @@ void hipPerfMemset::run3D(unsigned int test, T memsetval, auto end = std::chrono::steady_clock::now(); - hipMemcpy3DParms myparms ; + hipMemcpy3DParms myparms; myparms.srcArray = nullptr; myparms.dstArray = nullptr; myparms.srcPos = make_hipPos(0, 0, 0); myparms.dstPos = make_hipPos(0, 0, 0); - myparms.dstPtr = make_hipPitchedPtr(A_h, width , numW, numH); + myparms.dstPtr = make_hipPitchedPtr(A_h, width, numW, numH); myparms.srcPtr = devPitchedPtr; myparms.extent = extent; @@ -338,11 +322,10 @@ void hipPerfMemset::run3D(unsigned int test, T memsetval, HIP_CHECK(hipMemcpy3D(&myparms)); - for (int i=0; i < elements; i++) { + for (int i = 0; i < elements; i++) { if (A_h[i] != memsetval) { - INFO("mismatch at index " << i << " computed: " << - static_cast (A_h[i]) << ", memsetval: " << - static_cast (memsetval) << "\n"); + DEBUG_PRINT("mismatch at index %d computed: %d, memsetval: %d\n", i, static_cast(A_h[i]), + static_cast(memsetval)); REQUIRE(false); } } @@ -352,24 +335,23 @@ void 
hipPerfMemset::run3D(unsigned int test, T memsetval, auto sec = diff.count(); auto perf = static_cast((sizeElements * NUM_ITER * (1e-09)) / sec); - std::cout << "hipPerf3DMemset" << (async ? "Async" : " ") << "[" << test << "] " << " " - << "(GB/s) for " << std::setw(5) << bufSize_ << " x " << std::setw(5) - << bufSize_ << " x " << depth << " bytes : " << std::setw(7) << perf << "\n"; + CONSOLE_PRINT("hipPerf3DMemset%s[%d] (GB/s) for %5lu x %5lu x %lu bytes : %7.2f\n", + (async ? "Async" : " "), test, bufSize_, bufSize_, depth, perf); HIP_CHECK(hipFree(devPitchedPtr.ptr)); free(A_h); } /** -* Test Description -* ------------------------ -*  - Verify hipPerfMemset status. -* Test source -* ------------------------ -*  - perftests/memory/hipPerfMemset.cc -* Test requirements -* ------------------------ -*  - HIP_VERSION >= 5.6 -*/ + * Test Description + * ------------------------ + *  - Verify hipPerfMemset status. + * Test source + * ------------------------ + *  - perftests/memory/hipPerfMemset.cc + * Test requirements + * ------------------------ + *  - HIP_VERSION >= 5.6 + */ TEST_CASE("Perf_hipPerfMemset_test") { hipPerfMemset hipPerfMemset; @@ -385,44 +367,44 @@ TEST_CASE("Perf_hipPerfMemset_test") { bool async = false; - for (uint i = 0; i < 2 ; i++) { - std::cout << "--------------------- 1D buffer -------------------\n"; + for (uint i = 0; i < 2; i++) { + CONSOLE_PRINT("--------------------- 1D buffer -------------------\n"); for (auto testCase = 0; testCase < numTests; testCase++) { if (testCase < sizeof(eleNumList) / sizeof(uint32_t)) { - std::cout << "hipMemsetD8" << (async ? "Async " : " "); + CONSOLE_PRINT("hipMemsetD8%s", (async ? "Async " : " ")); hipPerfMemset.run1D(testCase, pattern.memsetval, hipMemsetTypeD8, async); } else if (testCase < 2 * sizeof(eleNumList) / sizeof(uint32_t)) { - std::cout << "hipMemsetD16" << (async ? "Async" : " "); + CONSOLE_PRINT("hipMemsetD16%s", (async ? 
"Async" : " ")); hipPerfMemset.run1D(testCase, pattern.memsetD16val, hipMemsetTypeD16, async); } else if (testCase < 3 * sizeof(eleNumList) / sizeof(uint32_t)) { - std::cout << "hipMemsetD32" << (async ? "Async" : " "); + CONSOLE_PRINT("hipMemsetD32%s", (async ? "Async" : " ")); hipPerfMemset.run1D(testCase, pattern.memsetD32val, hipMemsetTypeD32, async); } else { - std::cout << "hipMemset" << (async ? "Async " : " "); + CONSOLE_PRINT("hipMemset%s", (async ? "Async " : " ")); hipPerfMemset.run1D(testCase, pattern.memsetval, hipMemsetTypeDefault, async); } } async = true; } - INFO("\n"); - std::cout << "------------------ 2D buffer arrays ---------------\n"; + CONSOLE_PRINT("\n"); + CONSOLE_PRINT("\n------------------ 2D buffer arrays ---------------\n"); async = false; for (uint i = 0; i < 2; i++) { - INFO("\n"); + CONSOLE_PRINT("\n"); for (uint test = 0; test < numTests2D; test++) { hipPerfMemset.run2D(test, pattern.memsetval, hipMemsetTypeDefault, async); } async = true; } - INFO("\n"); - std::cout << "------------------ 3D buffer arrays ---------------\n"; + CONSOLE_PRINT("\n"); + CONSOLE_PRINT("\n------------------ 3D buffer arrays ---------------\n"); async = false; for (uint i = 0; i < 2; i++) { - INFO("\n"); + CONSOLE_PRINT("\n"); for (uint test = 0; test < numTests3D; test++) { hipPerfMemset.run3D(test, pattern.memsetval, hipMemsetTypeDefault, async); } @@ -431,6 +413,6 @@ TEST_CASE("Perf_hipPerfMemset_test") { } /** -* End doxygen group perfMemoryTest. -* @} -*/ + * End doxygen group perfMemoryTest. + * @} + */ diff --git a/catch/perftests/memory/hipPerfMemsetAsyncSpeed.cc b/catch/perftests/memory/hipPerfMemsetAsyncSpeed.cc index 7e138be4d..fe2fcdb33 100644 --- a/catch/perftests/memory/hipPerfMemsetAsyncSpeed.cc +++ b/catch/perftests/memory/hipPerfMemsetAsyncSpeed.cc @@ -18,13 +18,6 @@ THE SOFTWARE. 
*/ #include -// Quiet pesky warnings -#ifdef WIN_OS -#define SNPRINTF sprintf_s -#else -#define SNPRINTF snprintf -#endif - #define NUM_SIZES 6 // 256 Bytes, 512 Bytes, 1024 Bytes, 2048 Bytes, 3072 Bytes, 4096 Bytes constexpr uint32_t Mi = 1024 * 1024; @@ -39,9 +32,9 @@ void checkData_(void* ptr, unsigned int size, char value) { char* ptr2 = (char*)ptr; for (unsigned int i = 0; i < size; i++) { if (ptr2[i] != value) { - printf("Data validation failed at %d! Got 0x%08x\n", i, ptr2[i]); - printf("Expected 0x%08x\n", value); - printf("Data validation failed!"); + CONSOLE_PRINT("Data validation failed at %d! Got 0x%08x\n", i, ptr2[i]); + CONSOLE_PRINT("Expected 0x%08x\n", value); + CONSOLE_PRINT("Data validation failed!"); break; } } @@ -51,7 +44,7 @@ bool extraWarmup_ = true; TEST_CASE("Perf_hipPerfMemsetAsyncSpeed_test") { hipDeviceProp_t props; HIP_CHECK(hipGetDeviceProperties(&props, 0)); - printf("Set device to %d : %s\n", 0, props.name); + CONSOLE_PRINT("Set device to %d : %s", 0, props.name); HIP_CHECK(hipSetDevice(0)); unsigned int bufSize_; @@ -66,7 +59,7 @@ TEST_CASE("Perf_hipPerfMemsetAsyncSpeed_test") { int numTests = (NUM_SIZES * NUM_SUBTESTS - 1); int test = 0; uint32_t kMaxSize = (t == 0) ? 
128 * 1024 * 1024 : 1024 * 1024 * 1024; - printf("----- Global buffer (MiB): %d\n", kMaxSize / (1024 * 1024)); + CONSOLE_PRINT("\n----- Global buffer (MiB): %d", kMaxSize / (1024 * 1024)); for (; test <= numTests; test++) { bufSize_ = Sizes[test % NUM_SIZES]; hostMalloc[0] = hostMalloc[1] = false; @@ -123,13 +116,11 @@ TEST_CASE("Perf_hipPerfMemsetAsyncSpeed_test") { const char* strSrc = "dM"; const char* strDst = "dM"; - char buf[256]; - SNPRINTF(buf, sizeof(buf), - "hipMemsetAsync[%d]\t(%8d bytes)\ts:%s d:%s\ti:%4d\t(GB/s) " - "perf\t%.2f, time per iter(us):\t%.1f, time per iter CPU (us):\t%.1f", - test, bufSize_, strSrc, strDst, numIter, (float)perf, - sec.count() / numIter * 1000 * 1000, sec_cpu.count() / numIter * 1000 * 1000); - printf("%s\n", buf); + CONSOLE_PRINT( + "hipMemsetAsync[%d]\t(%8d bytes)\ts:%s d:%s\ti:%4d\t(GB/s) " + "perf\t%.2f, time per iter(us):\t%.1f, time per iter CPU (us):\t%.1f", + test, bufSize_, strSrc, strDst, numIter, (float)perf, sec.count() / numIter * 1000 * 1000, + sec_cpu.count() / numIter * 1000 * 1000); // Verification void* temp = malloc(bufSize_ + 4096); diff --git a/catch/perftests/memory/hipPerfSampleRate.cc b/catch/perftests/memory/hipPerfSampleRate.cc index 9a083fa3e..a591f5a09 100644 --- a/catch/perftests/memory/hipPerfSampleRate.cc +++ b/catch/perftests/memory/hipPerfSampleRate.cc @@ -19,66 +19,69 @@ /** -* @addtogroup hipMemcpyKernel hipMemcpyKernel -* @{ -* @ingroup perfMemoryTest -* `hipMemcpy(void* dst, const void* src, size_t count, hipMemcpyKind kind)` - -* Copies data between host and device. -*/ + * @addtogroup hipMemcpyKernel hipMemcpyKernel + * @{ + * @ingroup perfMemoryTest + * `hipMemcpy(void* dst, const void* src, size_t count, hipMemcpyKind kind)` - + * Copies data between host and device. 
+ */ #include - +// #define ENABLE_DEBUG 1 #define NUM_TYPES 3 std::vector types = {"float", "float2", "float4"}; std::vector typeSizes = {4, 8, 16}; #define NUM_SIZES 12 -std::vector sizes = {1, 2, 4, 8, 16, 32, - 64, 128, 256, 512, 1024, 2048}; +std::vector sizes = {1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048}; #define NUM_BUFS 6 #define MAX_BUFS (1 << (NUM_BUFS - 1)) #ifdef __HIP_PLATFORM_NVIDIA__ -__host__ __device__ void operator+=(float2 &a, float2 b) { //NOLINT - a.x += b.x; a.y += b.y; +__host__ __device__ void operator+=(float2& a, float2 b) { // NOLINT + a.x += b.x; + a.y += b.y; } -__host__ __device__ void operator+=(float4 &a, float4 b) { //NOLINT - a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; +__host__ __device__ void operator+=(float4& a, float4 b) { // NOLINT + a.x += b.x; + a.y += b.y; + a.z += b.z; + a.w += b.w; } #endif template -__global__ void sampleRate(T * outBuffer, unsigned int inBufSize, - unsigned int writeIt, T **inBuffer, int numBufs) { +__global__ void sampleRate(T* outBuffer, unsigned int inBufSize, unsigned int writeIt, T** inBuffer, + int numBufs) { uint gid = (blockIdx.x * blockDim.x + threadIdx.x); uint inputIdx = gid % inBufSize; T tmp; memset(&tmp, 0, sizeof(T)); for (int i = 0; i < numBufs; i++) { - tmp += *(*(inBuffer+i)+inputIdx); + tmp += *(*(inBuffer + i) + inputIdx); } - if (writeIt*(unsigned int)tmp.x) { + if (writeIt * (unsigned int)tmp.x) { outBuffer[gid] = tmp; } } template -__global__ void sampleRateFloat(T * outBuffer, unsigned int inBufSize, - unsigned int writeIt, T ** inBuffer, int numBufs) { +__global__ void sampleRateFloat(T* outBuffer, unsigned int inBufSize, unsigned int writeIt, + T** inBuffer, int numBufs) { uint gid = (blockIdx.x * blockDim.x + threadIdx.x); uint inputIdx = gid % inBufSize; T tmp = (T)0.0f; for (int i = 0; i < numBufs; i++) { - tmp += *((*inBuffer+i)+inputIdx); + tmp += *((*inBuffer + i) + inputIdx); } - if (writeIt*(unsigned int)tmp) { + if (writeIt * (unsigned int)tmp) { 
outBuffer[gid] = tmp; } } @@ -93,26 +96,23 @@ class hipPerfSampleRate { void close(void); // array of funtion pointers - typedef void (hipPerfSampleRate::*funPtr)(void * outBuffer, unsigned int - inBufSize, unsigned int writeIt, void **inBuffer, int numBufs, - int grids, int blocks); + typedef void (hipPerfSampleRate::*funPtr)(void* outBuffer, unsigned int inBufSize, + unsigned int writeIt, void** inBuffer, int numBufs, + int grids, int blocks); // Wrappers - void float_kernel(void * outBuffer, unsigned int inBufSize, - unsigned int writeIt, void **inBuffer, int numBufs, - int grids, int blocks); + void float_kernel(void* outBuffer, unsigned int inBufSize, unsigned int writeIt, void** inBuffer, + int numBufs, int grids, int blocks); - void float2_kernel(void * outBuffer, unsigned int inBufSize, - unsigned int writeIt, void **inBuffer, int numBufs, - int grids, int blocks); + void float2_kernel(void* outBuffer, unsigned int inBufSize, unsigned int writeIt, void** inBuffer, + int numBufs, int grids, int blocks); - void float4_kernel(void * outBuffer, unsigned int inBufSize, - unsigned int writeIt, void **inBuffer, int numBufs, - int grids, int blocks); + void float4_kernel(void* outBuffer, unsigned int inBufSize, unsigned int writeIt, void** inBuffer, + int numBufs, int grids, int blocks); private: - void setData(void *ptr, unsigned int value); - void checkData(uint *ptr); + void setData(void* ptr, unsigned int value); + void checkData(uint* ptr); unsigned int width_; unsigned int bufSize_; @@ -139,41 +139,36 @@ bool hipPerfSampleRate::open(void) { hipDeviceProp_t props; HIP_CHECK(hipSetDevice(deviceId)); HIP_CHECK(hipGetDeviceProperties(&props, deviceId)); - INFO("info: running on bus " << "0x" << props.pciBusID << " " << - props.name << " with " << props.multiProcessorCount << - " CUs" << " and device id: " << deviceId << "\n"); + CONSOLE_PRINT("info: running on bus 0x%x %s with %d CUs and device id: %d\n", props.pciBusID, + props.name, props.multiProcessorCount, 
deviceId); numCUs = props.multiProcessorCount; return true; } // Wrappers for the kernel launches -void hipPerfSampleRate::float_kernel(void * outBuffer, unsigned int inBufSize, - unsigned int writeIt, void **inBuffer, int numBufs, - int grids, int blocks) { - hipLaunchKernelGGL(sampleRateFloat, dim3(grids, grids, grids), - dim3(blocks), 0, 0, reinterpret_cast(outBuffer), - inBufSize, writeIt, reinterpret_cast(inBuffer), numBufs); +void hipPerfSampleRate::float_kernel(void* outBuffer, unsigned int inBufSize, unsigned int writeIt, + void** inBuffer, int numBufs, int grids, int blocks) { + hipLaunchKernelGGL(sampleRateFloat, dim3(grids, grids, grids), dim3(blocks), 0, 0, + reinterpret_cast(outBuffer), inBufSize, writeIt, + reinterpret_cast(inBuffer), numBufs); } -void hipPerfSampleRate::float2_kernel(void * outBuffer, unsigned int inBufSize, - unsigned int writeIt, void **inBuffer, int grids, - int blocks, int numBufs) { - hipLaunchKernelGGL(sampleRate, dim3(grids, grids, grids), - dim3(blocks), 0, 0, reinterpret_cast(outBuffer), - inBufSize, writeIt, reinterpret_cast(inBuffer), numBufs); +void hipPerfSampleRate::float2_kernel(void* outBuffer, unsigned int inBufSize, unsigned int writeIt, + void** inBuffer, int grids, int blocks, int numBufs) { + hipLaunchKernelGGL(sampleRate, dim3(grids, grids, grids), dim3(blocks), 0, 0, + reinterpret_cast(outBuffer), inBufSize, writeIt, + reinterpret_cast(inBuffer), numBufs); } -void hipPerfSampleRate::float4_kernel(void * outBuffer, unsigned int inBufSize, - unsigned int writeIt, void **inBuffer, int grids, - int blocks, int numBufs) { - hipLaunchKernelGGL(sampleRate, dim3(grids, grids, grids), - dim3(blocks), 0, 0, reinterpret_cast(outBuffer), - inBufSize, writeIt, reinterpret_cast(inBuffer), numBufs); +void hipPerfSampleRate::float4_kernel(void* outBuffer, unsigned int inBufSize, unsigned int writeIt, + void** inBuffer, int grids, int blocks, int numBufs) { + hipLaunchKernelGGL(sampleRate, dim3(grids, grids, grids), 
dim3(blocks), 0, 0, + reinterpret_cast(outBuffer), inBufSize, writeIt, + reinterpret_cast(inBuffer), numBufs); } void hipPerfSampleRate::run(unsigned int test) { - funPtr p[] = {&hipPerfSampleRate::float_kernel, - &hipPerfSampleRate::float2_kernel, + funPtr p[] = {&hipPerfSampleRate::float_kernel, &hipPerfSampleRate::float2_kernel, &hipPerfSampleRate::float4_kernel}; // We compute a square domain @@ -182,35 +177,30 @@ void hipPerfSampleRate::run(unsigned int test) { bufSize_ = width_ * width_ * typeSizes[typeIdx_]; numBufs_ = (1 << (test / (NUM_SIZES * NUM_TYPES))); - void ** dPtr; - void * hOutPtr; - void * dOutPtr; - void ** hInPtr = new void *[numBufs_]; - void ** dInPtr = new void *[numBufs_]; + void** dPtr; + void* hOutPtr; + void* dOutPtr; + void** hInPtr = new void*[numBufs_]; + void** dInPtr = new void*[numBufs_]; - outBufSize_ = - sizes[NUM_SIZES - 1] * sizes[NUM_SIZES - 1] * typeSizes[NUM_TYPES - 1]; + outBufSize_ = sizes[NUM_SIZES - 1] * sizes[NUM_SIZES - 1] * typeSizes[NUM_TYPES - 1]; // Allocate memory on the host and device - HIP_CHECK(hipHostMalloc(reinterpret_cast(&hOutPtr), outBufSize_, - hipHostMallocDefault)); - setData(reinterpret_cast(hOutPtr), 0xdeadbeef); - HIP_CHECK(hipMalloc(reinterpret_cast(&dOutPtr), outBufSize_)); + HIP_CHECK(hipHostMalloc(reinterpret_cast(&hOutPtr), outBufSize_, hipHostMallocDefault)); + setData(reinterpret_cast(hOutPtr), 0xdeadbeef); + HIP_CHECK(hipMalloc(reinterpret_cast(&dOutPtr), outBufSize_)); // Allocate 2D array in Device - HIP_CHECK(hipMalloc(reinterpret_cast(&dPtr), - numBufs_* sizeof(void *))); + HIP_CHECK(hipMalloc(reinterpret_cast(&dPtr), numBufs_ * sizeof(void*))); for (uint i = 0; i < numBufs_; i++) { - HIP_CHECK(hipHostMalloc(reinterpret_cast(&hInPtr[i]), bufSize_, - hipHostMallocDefault)); - HIP_CHECK(hipMalloc(reinterpret_cast(&dInPtr[i]), bufSize_)); + HIP_CHECK(hipHostMalloc(reinterpret_cast(&hInPtr[i]), bufSize_, hipHostMallocDefault)); + HIP_CHECK(hipMalloc(reinterpret_cast(&dInPtr[i]), bufSize_)); 
setData(hInPtr[i], 0x3f800000); } // Populate array of pointers with array addresses - HIP_CHECK(hipMemcpy(dPtr, dInPtr, numBufs_* sizeof(void *), - hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(dPtr, dInPtr, numBufs_ * sizeof(void*), hipMemcpyHostToDevice)); // Copy memory from host to device for (uint i = 0; i < numBufs_; i++) { @@ -241,20 +231,19 @@ void hipPerfSampleRate::run(unsigned int test) { // Time the kernel execution auto all_start = std::chrono::steady_clock::now(); for (uint i = 0; i < maxIter; i++) { - (this->*p[idx]) (reinterpret_cast(dOutPtr), sizeDW, writeIt, - dPtr, numBufs_, grids, blocks); + (this->*p[idx])(reinterpret_cast(dOutPtr), sizeDW, writeIt, dPtr, numBufs_, grids, + blocks); } HIP_CHECK(hipDeviceSynchronize()); auto all_end = std::chrono::steady_clock::now(); std::chrono::duration all_kernel_time = all_end - all_start; - double perf = (static_cast(outBufSize_ * numBufs_ * - maxIter * (1e-09))) / all_kernel_time.count(); + double perf = + (static_cast(outBufSize_ * numBufs_ * maxIter * (1e-09))) / all_kernel_time.count(); - INFO("Domain " << sizes[NUM_SIZES - 1] << "x"<< sizes[NUM_SIZES - 1] - << " bufs " << numBufs_ << " " << types[typeIdx_] << " " << width_ - << "x" <(numBufs_)) { - INFO("Data validation failed at "<< i << " Got "<< ptr[i] - << ", expected " << (float)numBufs_ << "\n"); + DEBUG_PRINT("Data validation failed at %u Got %u, expected %f\n", i, ptr[i], (float)numBufs_); REQUIRE(false); } } } /** -* Test Description -* ------------------------ -*  - Verify hipPerfSampleRate status. -* Test source -* ------------------------ -*  - perftests/memory/hipPerfSampleRate.cc -* Test requirements -* ------------------------ -*  - HIP_VERSION >= 5.6 -*/ + * Test Description + * ------------------------ + *  - Verify hipPerfSampleRate status. 
+ * Test source + * ------------------------ + *  - perftests/memory/hipPerfSampleRate.cc + * Test requirements + * ------------------------ + *  - HIP_VERSION >= 5.6 + */ TEST_CASE("Perf_hipPerfSampleRate_test") { hipPerfSampleRate sampleTypes; REQUIRE(true == sampleTypes.open()); - for (unsigned int testCase = 0; testCase < 216 ; testCase+=36) { + for (unsigned int testCase = 0; testCase < 216; testCase += 36) { sampleTypes.run(testCase); } } /** -* End doxygen group perfMemoryTest. -* @} -*/ + * End doxygen group perfMemoryTest. + * @} + */ diff --git a/catch/perftests/memory/hipPerfSharedMemReadSpeed.cc b/catch/perftests/memory/hipPerfSharedMemReadSpeed.cc index dbf10a04a..67f2e59b6 100644 --- a/catch/perftests/memory/hipPerfSharedMemReadSpeed.cc +++ b/catch/perftests/memory/hipPerfSharedMemReadSpeed.cc @@ -18,19 +18,19 @@ */ /** -* @addtogroup hipMemcpyKernel hipMemcpyKernel -* @{ -* @ingroup perfMemoryTest -* `hipMemcpy(void* dst, const void* src, size_t count, hipMemcpyKind kind)` - -* Copies data between host and device. -*/ + * @addtogroup hipMemcpyKernel hipMemcpyKernel + * @{ + * @ingroup perfMemoryTest + * `hipMemcpy(void* dst, const void* src, size_t count, hipMemcpyKind kind)` - + * Copies data between host and device. 
+ */ #include - +// #define ENABLE_DEBUG 1 #define sharedMemSize1 2048 #define sharedMemSize2 256 -__global__ void sharedMemReadSpeed1(float *outBuf, ulong N) { +__global__ void sharedMemReadSpeed1(float* outBuf, ulong N) { size_t gid = (blockIdx.x * blockDim.x + threadIdx.x); size_t lid = threadIdx.x; __shared__ float local[sharedMemSize1]; @@ -84,7 +84,7 @@ __global__ void sharedMemReadSpeed1(float *outBuf, ulong N) { } } -__global__ void sharedMemReadSpeed2(float *outBuf, ulong N) { +__global__ void sharedMemReadSpeed2(float* outBuf, ulong N) { size_t gid = (blockIdx.x * blockDim.x + threadIdx.x); size_t lid = threadIdx.x; __shared__ float local[sharedMemSize2]; @@ -116,8 +116,8 @@ __global__ void sharedMemReadSpeed2(float *outBuf, ulong N) { } static bool hipPerfSharedMemReadSpeed_test() { - float *dDst; - float *hDst; + float* dDst; + float* hDst; hipStream_t stream; constexpr uint numSizes = 4; constexpr uint Sizes[numSizes] = {262144, 1048576, 4194304, 16777216}; @@ -132,8 +132,8 @@ static bool hipPerfSharedMemReadSpeed_test() { HIP_CHECK(hipSetDevice(device)); hipDeviceProp_t props; HIP_CHECK(hipGetDeviceProperties(&props, device)); - INFO("info: running on bus " << "0x" << props.pciBusID << " " << props.name - << " with " << props.multiProcessorCount << " CUs \n"); + CONSOLE_PRINT("info: running on bus 0x%x %s with %d CUs\n", props.pciBusID, props.name, + props.multiProcessorCount); HIP_CHECK(hipStreamCreate(&stream)); @@ -149,8 +149,8 @@ static bool hipPerfSharedMemReadSpeed_test() { HIP_CHECK(hipMalloc(&dDst, nBytes)); HIP_CHECK(hipMemcpy(dDst, hDst, nBytes, hipMemcpyHostToDevice)); - hipLaunchKernelGGL(sharedMemReadSpeed1, dim3(blocks), - dim3(threadsPerBlock), 0, stream, dDst, N); + hipLaunchKernelGGL(sharedMemReadSpeed1, dim3(blocks), dim3(threadsPerBlock), 0, stream, dDst, + N); HIP_CHECK(hipMemcpy(hDst, dDst, nBytes, hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); @@ -160,8 +160,7 @@ static bool hipPerfSharedMemReadSpeed_test() { tmp = 
0; } if (hDst[i] != tmp) { - INFO("info: Data validation failed for warm up run! \n"); - INFO("info: expected " << tmp << " got " << hDst[i] << " \n"); + DEBUG_PRINT("Data validation failed for warm up run! expected %d got %f\n", tmp, hDst[i]); return false; } tmp += threadsPerBlock / 2; @@ -169,8 +168,8 @@ static bool hipPerfSharedMemReadSpeed_test() { auto all_start = std::chrono::steady_clock::now(); for (int i = 0; i < nIter; i++) { - hipLaunchKernelGGL(sharedMemReadSpeed1, dim3(blocks), - dim3(threadsPerBlock), 0, stream, dDst, N); + hipLaunchKernelGGL(sharedMemReadSpeed1, dim3(blocks), dim3(threadsPerBlock), 0, stream, dDst, + N); } HIP_CHECK(hipDeviceSynchronize()); @@ -178,15 +177,14 @@ static bool hipPerfSharedMemReadSpeed_test() { std::chrono::duration all_kernel_time = all_end - all_start; // read speed in GB/s - double perf = (static_cast(blocks * threadsPerBlock) - * (numReads1 * sizeof(float) + sharedMemSizeBytes1 / 64) - * nIter * (1e-09)) / all_kernel_time.count(); + double perf = (static_cast(blocks * threadsPerBlock) * + (numReads1 * sizeof(float) + sharedMemSizeBytes1 / 64) * nIter * (1e-09)) / + all_kernel_time.count(); - INFO("info: read speed = " << std::setw(8) << perf << " GB/s for " << - sharedMemSizeBytes1 / 1024 << " KB shared memory with " << - std::setw(8) << blocks * threadsPerBlock << " threads, " - << std::setw(4) << numReads1 << - " reads in sharedMemReadSpeed1 kernel \n"); + CONSOLE_PRINT( + "info: read speed = %.2f GB/s for %d KB shared memory with %d threads, %d reads in " + "sharedMemReadSpeed1 kernel\n", + perf, sharedMemSizeBytes1 / 1024, blocks * threadsPerBlock, numReads1); delete[] hDst; HIP_CHECK(hipFree(dDst)); @@ -204,15 +202,15 @@ static bool hipPerfSharedMemReadSpeed_test() { HIP_CHECK(hipMalloc(&dDst, nBytes)); HIP_CHECK(hipMemcpy(dDst, hDst, nBytes, hipMemcpyHostToDevice)); - hipLaunchKernelGGL(sharedMemReadSpeed2, dim3(blocks), - dim3(threadsPerBlock), 0, stream, dDst, N); + hipLaunchKernelGGL(sharedMemReadSpeed2, 
dim3(blocks), dim3(threadsPerBlock), 0, stream, dDst, + N); HIP_CHECK(hipMemcpy(hDst, dDst, nBytes, hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); auto all_start = std::chrono::steady_clock::now(); for (int i = 0; i < nIter; i++) { - hipLaunchKernelGGL(sharedMemReadSpeed2, dim3(blocks), - dim3(threadsPerBlock), 0, stream, dDst, N); + hipLaunchKernelGGL(sharedMemReadSpeed2, dim3(blocks), dim3(threadsPerBlock), 0, stream, dDst, + N); } HIP_CHECK(hipDeviceSynchronize()); @@ -220,15 +218,14 @@ static bool hipPerfSharedMemReadSpeed_test() { std::chrono::duration all_kernel_time = all_end - all_start; // read speed in GB/s - double perf = (static_cast(blocks * threadsPerBlock) - * (numReads2 * sizeof(float) + sharedMemSizeBytes2 / 64) - * nIter * (1e-09)) / all_kernel_time.count(); + double perf = (static_cast(blocks * threadsPerBlock) * + (numReads2 * sizeof(float) + sharedMemSizeBytes2 / 64) * nIter * (1e-09)) / + all_kernel_time.count(); - INFO("info: read speed = " << std::setw(8) << perf << " GB/s for " - << sharedMemSizeBytes2 / 1024 << " KB shared memory with " - << std::setw(8) << blocks * threadsPerBlock << " threads, " - << std::setw(4) << numReads2 << - " reads in sharedMemReadSpeed2 kernel \n"); + CONSOLE_PRINT( + "info: read speed = %.2f GB/s for %d KB shared memory with %d threads, %d reads in " + "sharedMemReadSpeed2 kernel\n", + perf, sharedMemSizeBytes2 / 1024, blocks * threadsPerBlock, numReads2); delete[] hDst; HIP_CHECK(hipFree(dDst)); @@ -238,30 +235,31 @@ static bool hipPerfSharedMemReadSpeed_test() { } /** -* Test Description -* ------------------------ -*  - Verify hipPerfSharedMemReadSpeed status. -* Test source -* ------------------------ -*  - perftests/memory/hipPerfSharedMemReadSpeed.cc -* Test requirements -* ------------------------ -*  - HIP_VERSION >= 5.6 -*/ + * Test Description + * ------------------------ + *  - Verify hipPerfSharedMemReadSpeed status. 
+ * Test source + * ------------------------ + *  - perftests/memory/hipPerfSharedMemReadSpeed.cc + * Test requirements + * ------------------------ + *  - HIP_VERSION >= 5.6 + */ TEST_CASE("Perf_hipPerfSharedMemReadSpeed_test") { int numDevices = 0; HIP_CHECK(hipGetDeviceCount(&numDevices)); if (numDevices <= 0) { - SUCCEED("Skipped testcase hipPerfSharedMemReadSpeed as" - "there is no device to test.\n"); + SUCCEED( + "Skipped testcase hipPerfSharedMemReadSpeed as" + "there is no device to test.\n"); } else { REQUIRE(true == hipPerfSharedMemReadSpeed_test()); } } /** -* End doxygen group perfMemoryTest. -* @} -*/ + * End doxygen group perfMemoryTest. + * @} + */ diff --git a/catch/perftests/stream/hipPerfDeviceConcurrency.cc b/catch/perftests/stream/hipPerfDeviceConcurrency.cc index b07c9f49e..dfe1d83c0 100644 --- a/catch/perftests/stream/hipPerfDeviceConcurrency.cc +++ b/catch/perftests/stream/hipPerfDeviceConcurrency.cc @@ -18,12 +18,12 @@ */ /** -* @addtogroup hipPerfDeviceConcurrency hipPerfDeviceConcurrency -* @{ -* @ingroup perfStreamTest -* `hipError_t hipStreamCreate(hipStream_t* stream)` - -* Create an asynchronous stream. -*/ + * @addtogroup hipPerfDeviceConcurrency hipPerfDeviceConcurrency + * @{ + * @ingroup perfStreamTest + * `hipError_t hipStreamCreate(hipStream_t* stream)` - + * Create an asynchronous stream. 
+ */ #include @@ -34,28 +34,28 @@ typedef struct { } coordRec; static coordRec coords[] = { - {0.0, 0.0, 0.00001}, // All black + {0.0, 0.0, 0.00001}, // All black }; static unsigned int numCoords = sizeof(coords) / sizeof(coordRec); -__global__ void mandelbrot(uint *out, uint width, float xPos, - float yPos, float xStep, float yStep, uint maxIter) { +__global__ void mandelbrot(uint* out, uint width, float xPos, float yPos, float xStep, float yStep, + uint maxIter) { int tid = (blockIdx.x * blockDim.x + threadIdx.x); int i = tid % width; int j = tid / width; - float x0 = static_cast(xPos + xStep*i); - float y0 = static_cast(yPos + yStep*j); + float x0 = static_cast(xPos + xStep * i); + float y0 = static_cast(yPos + yStep * j); float x = x0; float y = y0; uint iter = 0; float tmp; - for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) { + for (iter = 0; (x * x + y * y <= 4.0f) && (iter < maxIter); iter++) { tmp = x; x = fma(-y, y, fma(x, x, x0)); - y = fma(2.0f*tmp, y, y0); + y = fma(2.0f * tmp, y, y0); } out[tid] = iter; }; @@ -65,20 +65,16 @@ class hipPerfDeviceConcurrency { hipPerfDeviceConcurrency(); ~hipPerfDeviceConcurrency(); - void setNumGpus(unsigned int num) { - numDevices = num; - } - unsigned int getNumGpus() { - return numDevices; - } + void setNumGpus(unsigned int num) { numDevices = num; } + unsigned int getNumGpus() { return numDevices; } void open(void); void close(void); bool run(unsigned int testCase, int numGpus); private: - void setData(void *ptr, unsigned int value); - void checkData(uint *ptr); + void setData(void* ptr, unsigned int value); + void checkData(uint* ptr); unsigned int numDevices; unsigned int width_; @@ -100,17 +96,16 @@ void hipPerfDeviceConcurrency::open(void) { } } -void hipPerfDeviceConcurrency::close() { -} +void hipPerfDeviceConcurrency::close() {} bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) { static int deviceId; - uint ** hPtr = new uint*[numGpus]; - uint ** dPtr = new uint*[numGpus]; 
- hipStream_t * streams = new hipStream_t[numGpus]; - int *numCUs = new int[numGpus]; - unsigned int *maxIter = new unsigned int[numGpus]; - unsigned long long *expectedIters = new unsigned long long[numGpus]; + uint** hPtr = new uint*[numGpus]; + uint** dPtr = new uint*[numGpus]; + hipStream_t* streams = new hipStream_t[numGpus]; + int* numCUs = new int[numGpus]; + unsigned int* maxIter = new unsigned int[numGpus]; + unsigned long long* expectedIters = new unsigned long long[numGpus]; int threads, threads_per_block, blocks; float xStep, yStep, xPos, yPos; @@ -124,25 +119,21 @@ bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) { hipDeviceProp_t props; HIP_CHECK(hipGetDeviceProperties(&props, i)); if (testCase != 0) { - std::cout << "info: running on bus " << "0x" << props.pciBusID - << " " << props.name << " with " << props.multiProcessorCount - << " CUs" << " and device ID: " << i << std::endl; + CONSOLE_PRINT("info: running on bus 0x%x %s with %d CUs and device ID: %d", props.pciBusID, + props.name, props.multiProcessorCount, i); } - numCUs[i] = props.multiProcessorCount; int clkFrequency = 0; - HIP_CHECK(hipDeviceGetAttribute(&clkFrequency, - hipDeviceAttributeClockRate, i)); + HIP_CHECK(hipDeviceGetAttribute(&clkFrequency, hipDeviceAttributeClockRate, i)); if (clkFrequency == 0) { - std::cout << "clkFrequency = 0, set it to 1000000\n"; + CONSOLE_PRINT("clkFrequency = 0, set it to 1000000"); clkFrequency = 1000000; } - clkFrequency =(unsigned int)clkFrequency/1000; + clkFrequency = (unsigned int)clkFrequency / 1000; // Maximum iteration count // maxIter = 8388608 * (engine_clock / 1000).serial execution - maxIter[i] = (unsigned int)(((8388608 * ((float)clkFrequency / 1000)) - * numCUs[i]) / 128); + maxIter[i] = (unsigned int)(((8388608 * ((float)clkFrequency / 1000)) * numCUs[i]) / 128); maxIter[i] = (maxIter[i] + 15) & ~15; // Width is divisible by 4 because the mandelbrot @@ -153,15 +144,14 @@ bool hipPerfDeviceConcurrency::run(unsigned 
int testCase, int numGpus) { HIP_CHECK(hipStreamCreate(&streams[i])); // Allocate memory on the host and device - HIP_CHECK(hipHostMalloc(reinterpret_cast(&hPtr[i]), - bufSize, hipHostMallocDefault)); + HIP_CHECK(hipHostMalloc(reinterpret_cast(&hPtr[i]), bufSize, hipHostMallocDefault)); setData(hPtr[i], 0xdeadbeef); - HIP_CHECK(hipMalloc(reinterpret_cast(&dPtr[i]), bufSize)) + HIP_CHECK(hipMalloc(reinterpret_cast(&dPtr[i]), bufSize)) // Prepare kernel launch parameters - threads = (bufSize/sizeof(uint)); - threads_per_block = 64; - blocks = (threads/threads_per_block) + (threads % threads_per_block); + threads = (bufSize / sizeof(uint)); + threads_per_block = 64; + blocks = (threads / threads_per_block) + (threads % threads_per_block); coordIdx = testCase % numCoords; xStep = static_cast(coords[coordIdx].width / static_cast(width_)); @@ -180,10 +170,9 @@ bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) { deviceId = i; } - HIP_CHECK(hipSetDevice(deviceId)); - hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0, - streams[i], dPtr[i], width_, xPos, yPos, xStep, - yStep, maxIter[i]); + HIP_CHECK(hipSetDevice(deviceId)); + hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0, streams[i], dPtr[i], + width_, xPos, yPos, xStep, yStep, maxIter[i]); } for (int i = 0; i < numGpus; i++) { HIP_CHECK(hipStreamSynchronize(0)); @@ -192,8 +181,8 @@ bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) { auto all_end = std::chrono::steady_clock::now(); std::chrono::duration all_kernel_time = all_end - all_start; - for(int i = 0; i < numGpus; i++) { - if(testCase != 0) { + for (int i = 0; i < numGpus; i++) { + if (testCase != 0) { deviceId = i; } HIP_CHECK(hipSetDevice(deviceId)); @@ -201,11 +190,11 @@ bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) { // Copy data back from device to the host HIP_CHECK(hipMemcpy(hPtr[i], dPtr[i], bufSize, hipMemcpyDeviceToHost)); checkData(hPtr[i]); 
- expectedIters[i] = width_ * width_ * (unsigned long long) maxIter[i]; + expectedIters[i] = width_ * width_ * (unsigned long long)maxIter[i]; if (testCase != 0) { checkData(hPtr[i]); if (totalIters != expectedIters[i]) { - std::cout << "Incorrect iteration count detected" << std::endl; + CONSOLE_PRINT("Incorrect iteration count detected"); } } @@ -216,31 +205,30 @@ bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) { } if (testCase != 0) { - std::cout << '\n' << "Measured time for kernel computation on " << numGpus - << " device (s): " << all_kernel_time.count() << " (s) " - << '\n' << std::endl; + CONSOLE_PRINT("\nMeasured time for kernel computation on %d device(s): %.6f (s)\n", numGpus, + all_kernel_time.count()); } if (testCase == 0) { deviceId++; } - delete [] hPtr; - delete [] dPtr; - delete [] streams; - delete [] numCUs; - delete [] maxIter; - delete [] expectedIters; + delete[] hPtr; + delete[] dPtr; + delete[] streams; + delete[] numCUs; + delete[] maxIter; + delete[] expectedIters; return true; } -void hipPerfDeviceConcurrency::setData(void *ptr, unsigned int value) { - unsigned int *ptr2 = (unsigned int *)ptr; - for (unsigned int i = 0; i < width_ * width_ ; i++) { - ptr2[i] = value; +void hipPerfDeviceConcurrency::setData(void* ptr, unsigned int value) { + unsigned int* ptr2 = (unsigned int*)ptr; + for (unsigned int i = 0; i < width_ * width_; i++) { + ptr2[i] = value; } } -void hipPerfDeviceConcurrency::checkData(uint *ptr) { +void hipPerfDeviceConcurrency::checkData(uint* ptr) { totalIters = 0; for (unsigned int i = 0; i < width_ * width_; i++) { totalIters += ptr[i]; @@ -248,16 +236,16 @@ void hipPerfDeviceConcurrency::checkData(uint *ptr) { } /** -* Test Description -* ------------------------ -* - Verify the different levels of device concurrency. 
-* Test source -* ------------------------ -* - perftests/stream/hipPerfDeviceConcurrency.cc -* Test requirements -* ------------------------ -* - HIP_VERSION >= 5.6 -*/ + * Test Description + * ------------------------ + * - Verify the different levels of device concurrency. + * Test source + * ------------------------ + * - perftests/stream/hipPerfDeviceConcurrency.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.6 + */ TEST_CASE("Perf_hipPerfDeviceConcurrency") { hipPerfDeviceConcurrency deviceConcurrency; @@ -279,6 +267,6 @@ TEST_CASE("Perf_hipPerfDeviceConcurrency") { } /** -* End doxygen group perfStreamTest. -* @} -*/ + * End doxygen group perfStreamTest. + * @} + */ diff --git a/catch/perftests/stream/hipPerfStreamConcurrency.cc b/catch/perftests/stream/hipPerfStreamConcurrency.cc index ba4a04aa9..aa069e2fd 100644 --- a/catch/perftests/stream/hipPerfStreamConcurrency.cc +++ b/catch/perftests/stream/hipPerfStreamConcurrency.cc @@ -18,12 +18,12 @@ */ /** -* @addtogroup hipPerfStreamConcurrency hipPerfStreamConcurrency -* @{ -* @ingroup perfComputeTest -* `hipError_t hipStreamCreate(hipStream_t* stream)` - -* Create an asynchronous stream. -*/ + * @addtogroup hipPerfStreamConcurrency hipPerfStreamConcurrency + * @{ + * @ingroup perfComputeTest + * `hipError_t hipStreamCreate(hipStream_t* stream)` - + * Create an asynchronous stream. 
+ */ #include #include @@ -55,23 +55,23 @@ static coordRec coords[] = { static unsigned int numCoords = sizeof(coords) / sizeof(coordRec); -__global__ static void mandelbrot(uint *out, uint width, float xPos, float yPos, - float xStep, float yStep, uint maxIter) { +__global__ static void mandelbrot(uint* out, uint width, float xPos, float yPos, float xStep, + float yStep, uint maxIter) { int tid = (blockIdx.x * blockDim.x + threadIdx.x); - int i = tid % (width/4); - int j = tid / (width/4); - int4 veci = make_int4(4*i, 4*i+1, 4*i+2, 4*i+3); + int i = tid % (width / 4); + int j = tid / (width / 4); + int4 veci = make_int4(4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3); int4 vecj = make_int4(j, j, j, j); float4 x0; - x0.x = static_cast(xPos + xStep*veci.x); - x0.y = static_cast(xPos + xStep*veci.y); - x0.z = static_cast(xPos + xStep*veci.z); - x0.w = static_cast(xPos + xStep*veci.w); + x0.x = static_cast(xPos + xStep * veci.x); + x0.y = static_cast(xPos + xStep * veci.y); + x0.z = static_cast(xPos + xStep * veci.z); + x0.w = static_cast(xPos + xStep * veci.w); float4 y0; - y0.x = static_cast(yPos + yStep*vecj.x); - y0.y = static_cast(yPos + yStep*vecj.y); - y0.z = static_cast(yPos + yStep*vecj.z); - y0.w = static_cast(yPos + yStep*vecj.w); + y0.x = static_cast(yPos + yStep * vecj.x); + y0.y = static_cast(yPos + yStep * vecj.y); + y0.z = static_cast(yPos + yStep * vecj.z); + y0.w = static_cast(yPos + yStep * vecj.w); float4 x = x0; float4 y = y0; uint iter = 0; @@ -80,53 +80,52 @@ __global__ static void mandelbrot(uint *out, uint width, float xPos, float yPos, int4 ccount = make_int4(0, 0, 0, 0); float4 savx = x; float4 savy = y; - stay.x = (x.x*x.x+y.x*y.x) <= static_cast(4.0f); - stay.y = (x.y*x.y+y.y*y.y) <= static_cast(4.0f); - stay.z = (x.z*x.z+y.z*y.z) <= static_cast(4.0f); - stay.w = (x.w*x.w+y.w*y.w) <= static_cast(4.0f); - for (iter = 0; (stay.x | stay.y | stay.z | stay.w) && (iter < maxIter); - iter+=16) { + stay.x = (x.x * x.x + y.x * y.x) <= static_cast(4.0f); + 
stay.y = (x.y * x.y + y.y * y.y) <= static_cast(4.0f); + stay.z = (x.z * x.z + y.z * y.z) <= static_cast(4.0f); + stay.w = (x.w * x.w + y.w * y.w) <= static_cast(4.0f); + for (iter = 0; (stay.x | stay.y | stay.z | stay.w) && (iter < maxIter); iter += 16) { x = savx; y = savy; // Two iterations - tmp = x*x + x0 - y*y; + tmp = x * x + x0 - y * y; y = 2.0f * x * y + y0; - x = tmp*tmp + x0 - y*y; + x = tmp * tmp + x0 - y * y; y = 2.0f * tmp * y + y0; // Two iterations - tmp = x*x + x0 - y*y; + tmp = x * x + x0 - y * y; y = 2.0f * x * y + y0; - x = tmp*tmp + x0 - y*y; + x = tmp * tmp + x0 - y * y; y = 2.0f * tmp * y + y0; // Two iterations - tmp = x*x + x0 - y*y; + tmp = x * x + x0 - y * y; y = 2.0f * x * y + y0; - x = tmp*tmp + x0 - y*y; + x = tmp * tmp + x0 - y * y; y = 2.0f * tmp * y + y0; // Two iterations - tmp = x*x + x0 - y*y; + tmp = x * x + x0 - y * y; y = 2.0f * x * y + y0; - x = tmp*tmp + x0 - y*y; + x = tmp * tmp + x0 - y * y; y = 2.0f * tmp * y + y0; // Two iterations - tmp = x*x + x0 - y*y; + tmp = x * x + x0 - y * y; y = 2.0f * x * y + y0; - x = tmp*tmp + x0 - y*y; + x = tmp * tmp + x0 - y * y; y = 2.0f * tmp * y + y0; // Two iterations - tmp = x*x + x0 - y*y; + tmp = x * x + x0 - y * y; y = 2.0f * x * y + y0; - x = tmp*tmp + x0 - y*y; + x = tmp * tmp + x0 - y * y; y = 2.0f * tmp * y + y0; // Two iterations - tmp = x*x + x0 - y*y; + tmp = x * x + x0 - y * y; y = 2.0f * x * y + y0; - x = tmp*tmp + x0 - y*y; + x = tmp * tmp + x0 - y * y; y = 2.0f * tmp * y + y0; - stay.x = (x.x*x.x+y.x*y.x) <= static_cast(4.0f); - stay.y = (x.y*x.y+y.y*y.y) <= static_cast(4.0f); - stay.z = (x.z*x.z+y.z*y.z) <= static_cast(4.0f); - stay.w = (x.w*x.w+y.w*y.w) <= static_cast(4.0f); + stay.x = (x.x * x.x + y.x * y.x) <= static_cast(4.0f); + stay.y = (x.y * x.y + y.y * y.y) <= static_cast(4.0f); + stay.z = (x.z * x.z + y.z * y.z) <= static_cast(4.0f); + stay.w = (x.w * x.w + y.w * y.w) <= static_cast(4.0f); savx.x = static_cast(stay.x ? 
x.x : savx.x); savx.y = static_cast(stay.y ? x.y : savx.y); savx.z = static_cast(stay.z ? x.z : savx.z); @@ -135,10 +134,10 @@ __global__ static void mandelbrot(uint *out, uint width, float xPos, float yPos, savy.y = static_cast(stay.y ? y.y : savy.y); savy.z = static_cast(stay.z ? y.z : savy.z); savy.w = static_cast(stay.w ? y.w : savy.w); - ccount.x -= stay.x*16; - ccount.y -= stay.y*16; - ccount.z -= stay.z*16; - ccount.w -= stay.w*16; + ccount.x -= stay.x * 16; + ccount.y -= stay.y * 16; + ccount.z -= stay.z * 16; + ccount.w -= stay.w * 16; } // Handle remainder if (!(stay.x & stay.y & stay.z & stay.w)) { @@ -146,13 +145,13 @@ __global__ static void mandelbrot(uint *out, uint width, float xPos, float yPos, do { x = savx; y = savy; - stay.x = ((x.x*x.x+y.x*y.x) <= 4.0f) && (ccount.x < maxIter); - stay.y = ((x.y*x.y+y.y*y.y) <= 4.0f) && (ccount.y < maxIter); - stay.z = ((x.z*x.z+y.z*y.z) <= 4.0f) && (ccount.z < maxIter); - stay.w = ((x.w*x.w+y.w*y.w) <= 4.0f) && (ccount.w < maxIter); + stay.x = ((x.x * x.x + y.x * y.x) <= 4.0f) && (ccount.x < maxIter); + stay.y = ((x.y * x.y + y.y * y.y) <= 4.0f) && (ccount.y < maxIter); + stay.z = ((x.z * x.z + y.z * y.z) <= 4.0f) && (ccount.z < maxIter); + stay.w = ((x.w * x.w + y.w * y.w) <= 4.0f) && (ccount.w < maxIter); tmp = x; - x = x*x + x0 - y*y; - y = 2.0f*tmp*y + y0; + x = x * x + x0 - y * y; + y = 2.0f * tmp * y + y0; ccount.x += stay.x; ccount.y += stay.y; ccount.z += stay.z; @@ -168,7 +167,7 @@ __global__ static void mandelbrot(uint *out, uint width, float xPos, float yPos, savy.w = (stay.w ? 
y.w : savy.w); } while ((stay.x | stay.y | stay.z | stay.w) && iter); } - uint4 *vecOut = reinterpret_cast(out); + uint4* vecOut = reinterpret_cast(out); vecOut[tid].x = (uint)(ccount.x); vecOut[tid].y = (uint)(ccount.y); vecOut[tid].z = (uint)(ccount.z); @@ -180,27 +179,19 @@ class hipPerfStreamConcurrency { hipPerfStreamConcurrency(); ~hipPerfStreamConcurrency(); - void setNumKernels(unsigned int num) { - numKernels = num; - } - void setNumStreams(unsigned int num) { - numStreams = num; - } - unsigned int getNumStreams() { - return numStreams; - } + void setNumKernels(unsigned int num) { numKernels = num; } + void setNumStreams(unsigned int num) { numStreams = num; } + unsigned int getNumStreams() { return numStreams; } - unsigned int getNumKernels() { - return numKernels; - } + unsigned int getNumKernels() { return numKernels; } bool open(int deviceID); bool run(unsigned int testCase, unsigned int deviceId); void close(void); private: - void setData(void *ptr, unsigned int value); - void checkData(uint *ptr); + void setData(void* ptr, unsigned int value); + void checkData(uint* ptr); unsigned int numKernels; unsigned int numStreams; @@ -227,38 +218,34 @@ bool hipPerfStreamConcurrency::open(int deviceId) { HIP_CHECK(hipSetDevice(deviceId)); hipDeviceProp_t props; HIP_CHECK(hipGetDeviceProperties(&props, deviceId)); - std::cout << "info: running on bus " << "0x" << props.pciBusID - << " " << props.name << " with " << props.multiProcessorCount << " CUs" - << " and device id: " << deviceId << std::endl; + CONSOLE_PRINT("info: running on bus 0x%x %s with %d CUs and device ID: %d", props.pciBusID, + props.name, props.multiProcessorCount, deviceId); + numCUs = props.multiProcessorCount; return true; } -void hipPerfStreamConcurrency::close() { -} +void hipPerfStreamConcurrency::close() {} -bool hipPerfStreamConcurrency::run(unsigned int testCase, - unsigned int deviceId) { +bool hipPerfStreamConcurrency::run(unsigned int testCase, unsigned int deviceId) { int 
clkFrequency = 0; unsigned int numStreams = getNumStreams(); unsigned int numKernels = getNumKernels(); - HIP_CHECK(hipDeviceGetAttribute(&clkFrequency, - hipDeviceAttributeClockRate, deviceId)); + HIP_CHECK(hipDeviceGetAttribute(&clkFrequency, hipDeviceAttributeClockRate, deviceId)); if (clkFrequency == 0) { - std::cout << "clkFrequency = 0, set it to 1000000\n"; + CONSOLE_PRINT("clkFrequency = 0, set it to 1000000\n"); clkFrequency = 1000000; } - clkFrequency =(unsigned int)clkFrequency/1000; + clkFrequency = (unsigned int)clkFrequency / 1000; // Maximum iteration count // maxIter = 8388608 * (engine_clock / 1000).serial execution - maxIter = (unsigned int)(((8388608 * (static_cast(clkFrequency) / 1000)) - * numCUs) / 128); + maxIter = (unsigned int)(((8388608 * (static_cast(clkFrequency) / 1000)) * numCUs) / 128); maxIter = (maxIter + 15) & ~15; - hipStream_t *streams = new hipStream_t[numStreams]; - uint ** hPtr = new uint*[numKernels]; - uint ** dPtr = new uint*[numKernels]; + hipStream_t* streams = new hipStream_t[numStreams]; + uint** hPtr = new uint*[numKernels]; + uint** dPtr = new uint*[numKernels]; // Width is divisible by 4 because the mandelbrot kernel // processes 4 pixels at once. 
@@ -271,16 +258,15 @@ bool hipPerfStreamConcurrency::run(unsigned int testCase, // Allocate memory on the host and device for (uint i = 0; i < numKernels; i++) { - HIP_CHECK(hipHostMalloc(reinterpret_cast(&hPtr[i]), - bufSize, hipHostMallocDefault)); + HIP_CHECK(hipHostMalloc(reinterpret_cast(&hPtr[i]), bufSize, hipHostMallocDefault)); setData(hPtr[i], 0xdeadbeef); - HIP_CHECK(hipMalloc(reinterpret_cast(&dPtr[i]), bufSize)) + HIP_CHECK(hipMalloc(reinterpret_cast(&dPtr[i]), bufSize)) } // Prepare kernel launch parameters - int threads = (bufSize/sizeof(uint)); - int threads_per_block = 64; - int blocks = (threads/threads_per_block) + (threads % threads_per_block); + int threads = (bufSize / sizeof(uint)); + int threads_per_block = 64; + int blocks = (threads / threads_per_block) + (threads % threads_per_block); coordIdx = testCase % numCoords; float xStep = static_cast(coords[coordIdx].width / static_cast(width_)); float yStep = static_cast(-coords[coordIdx].width / static_cast(width_)); @@ -289,8 +275,8 @@ bool hipPerfStreamConcurrency::run(unsigned int testCase, // Copy memory asynchronously and concurrently from host to device for (uint i = 0; i < numKernels; i++) { - HIP_CHECK(hipMemcpyHtoDAsync(reinterpret_cast(dPtr[i]), - hPtr[i], bufSize, streams[i % numStreams])); + HIP_CHECK(hipMemcpyHtoDAsync(reinterpret_cast(dPtr[i]), hPtr[i], bufSize, + streams[i % numStreams])); } // Synchronize to make sure all the copies are completed @@ -305,9 +291,8 @@ bool hipPerfStreamConcurrency::run(unsigned int testCase, auto all_start = std::chrono::steady_clock::now(); for (uint i = 0; i < numKernels; i++) { - hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), - 0, streams[i%numStreams], dPtr[i], width_, xPos, yPos, xStep, - yStep, maxIter); + hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0, + streams[i % numStreams], dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter); } // Synchronize all the concurrent streans to have completed 
execution @@ -320,17 +305,16 @@ bool hipPerfStreamConcurrency::run(unsigned int testCase, // Copy data back from device to the host for (uint i = 0; i < numKernels; i++) { - HIP_CHECK(hipMemcpyDtoHAsync(hPtr[i], - reinterpret_cast(dPtr[i]), bufSize, - streams[i % numStreams])); + HIP_CHECK(hipMemcpyDtoHAsync(hPtr[i], reinterpret_cast(dPtr[i]), bufSize, + streams[i % numStreams])); } if (testCase != 0) { - std::cout <<"Measured time for " << numKernels <<" kernels (s) on " - << numStreams <<" stream (s): " << all_kernel_time.count() << std::endl; + CONSOLE_PRINT("Measured time for %d kernels (s) on %d stream(s): %e\n", numKernels, numStreams, + all_kernel_time.count()); } - for (uint i = 0 ; i < numStreams; i++) { + for (uint i = 0; i < numStreams; i++) { HIP_CHECK(hipStreamDestroy(streams[i])); } @@ -340,20 +324,20 @@ bool hipPerfStreamConcurrency::run(unsigned int testCase, HIP_CHECK(hipFree(dPtr[i])); } - delete [] streams; - delete [] hPtr; - delete [] dPtr; + delete[] streams; + delete[] hPtr; + delete[] dPtr; return true; } -void hipPerfStreamConcurrency::setData(void *ptr, unsigned int value) { - unsigned int *ptr2 = (unsigned int *)ptr; - for (unsigned int i = 0; i < width_ ; i++) { - ptr2[i] = value; +void hipPerfStreamConcurrency::setData(void* ptr, unsigned int value) { + unsigned int* ptr2 = (unsigned int*)ptr; + for (unsigned int i = 0; i < width_; i++) { + ptr2[i] = value; } } -void hipPerfStreamConcurrency::checkData(uint *ptr) { +void hipPerfStreamConcurrency::checkData(uint* ptr) { totalIters = 0; for (unsigned int i = 0; i < width_; i++) { totalIters += ptr[i]; @@ -361,16 +345,16 @@ void hipPerfStreamConcurrency::checkData(uint *ptr) { } /** -* Test Description -* ------------------------ -* - Verify the different levels of stream concurrency. 
-* Test source -* ------------------------ -* - perftests/stream/hipPerfStreamConcurrency.cc -* Test requirements -* ------------------------ -* - HIP_VERSION >= 5.6 -*/ + * Test Description + * ------------------------ + * - Verify the different levels of stream concurrency. + * Test source + * ------------------------ + * - perftests/stream/hipPerfStreamConcurrency.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.6 + */ TEST_CASE("Perf_hipPerfStreamConcurrency") { hipPerfStreamConcurrency streamConcurrency; @@ -386,10 +370,10 @@ TEST_CASE("Perf_hipPerfStreamConcurrency") { break; case 1: - // default stream executes serially - streamConcurrency.setNumStreams(1); - streamConcurrency.setNumKernels(1); - break; + // default stream executes serially + streamConcurrency.setNumStreams(1); + streamConcurrency.setNumKernels(1); + break; case 2: // 2-way concurrency @@ -419,6 +403,6 @@ TEST_CASE("Perf_hipPerfStreamConcurrency") { } /** -* End doxygen group perfComputeTest. -* @} -*/ + * End doxygen group perfComputeTest. + * @} + */ diff --git a/catch/perftests/stream/hipPerfStreamCreateCopyDestroy.cc b/catch/perftests/stream/hipPerfStreamCreateCopyDestroy.cc index edbe4c004..9b240ac13 100644 --- a/catch/perftests/stream/hipPerfStreamCreateCopyDestroy.cc +++ b/catch/perftests/stream/hipPerfStreamCreateCopyDestroy.cc @@ -18,19 +18,17 @@ */ /** -* @addtogroup hipPerfStreamCreateCopyDestroy hipPerfStreamCreateCopyDestroy -* @{ -* @ingroup perfStreamTest -* `hipError_t hipStreamCreate(hipStream_t* stream)` - -* Create an asynchronous stream. -*/ + * @addtogroup hipPerfStreamCreateCopyDestroy hipPerfStreamCreateCopyDestroy + * @{ + * @ingroup perfStreamTest + * `hipError_t hipStreamCreate(hipStream_t* stream)` - + * Create an asynchronous stream. 
+ */ #include #include #include -using namespace std; - #define BufSize 0x1000 #define Iterations 0x100 #define TotalStreams 4 @@ -39,17 +37,20 @@ using namespace std; class hipPerfStreamCreateCopyDestroy { private: - unsigned int numBuffers_; - unsigned int numStreams_; - const size_t totalStreams_[TotalStreams]; - const size_t totalBuffers_[TotalBufs]; + unsigned int numBuffers_; + unsigned int numStreams_; + const size_t totalStreams_[TotalStreams]; + const size_t totalBuffers_[TotalBufs]; + public: - hipPerfStreamCreateCopyDestroy() : numBuffers_(0), numStreams_(0), - totalStreams_{1, 2, 4, 8}, - totalBuffers_{1, 100, 1000, 5000} {}; - ~hipPerfStreamCreateCopyDestroy() {}; - bool open(int deviceID); - bool run(unsigned int testNumber); + hipPerfStreamCreateCopyDestroy() + : numBuffers_(0), + numStreams_(0), + totalStreams_{1, 2, 4, 8}, + totalBuffers_{1, 100, 1000, 5000} {}; + ~hipPerfStreamCreateCopyDestroy(){}; + bool open(int deviceID); + bool run(unsigned int testNumber); }; bool hipPerfStreamCreateCopyDestroy::open(int deviceId) { @@ -61,20 +62,20 @@ bool hipPerfStreamCreateCopyDestroy::open(int deviceId) { HIP_CHECK(hipSetDevice(deviceId)); hipDeviceProp_t props; HIP_CHECK(hipGetDeviceProperties(&props, deviceId)); - std::cout << "info: running on bus " << "0x" << props.pciBusID - << " " << props.name << " with " << props.multiProcessorCount << " CUs" - << " and device id: " << deviceId << std::endl; + + CONSOLE_PRINT("info: running on bus 0x%x %s with %d CUs and device id: %d\n", props.pciBusID, + props.name, props.multiProcessorCount, deviceId); return true; } bool hipPerfStreamCreateCopyDestroy::run(unsigned int testNumber) { numStreams_ = totalStreams_[testNumber % TotalStreams]; - size_t iter = Iterations / (numStreams_ * (static_cast(1) - << (testNumber / TotalBufs + 1))); - hipStream_t *streams = new hipStream_t[numStreams_]; + size_t iter = + Iterations / (numStreams_ * (static_cast(1) << (testNumber / TotalBufs + 1))); + hipStream_t* streams = 
new hipStream_t[numStreams_]; numBuffers_ = totalBuffers_[testNumber / TotalBufs]; - float ** dSrc = new float*[numBuffers_]; + float** dSrc = new float*[numBuffers_]; size_t nBytes = BufSize * sizeof(float); for (size_t b = 0; b < numBuffers_; ++b) { @@ -97,8 +98,7 @@ bool hipPerfStreamCreateCopyDestroy::run(unsigned int testNumber) { for (size_t s = 0; s < numStreams_; ++s) { for (size_t b = 0; b < numBuffers_; ++b) { - HIP_CHECK(hipMemcpyWithStream(dSrc[b], hSrc, nBytes, - hipMemcpyHostToDevice, streams[s])); + HIP_CHECK(hipMemcpyWithStream(dSrc[b], hSrc, nBytes, hipMemcpyHostToDevice, streams[s])); } } @@ -112,31 +112,31 @@ bool hipPerfStreamCreateCopyDestroy::run(unsigned int testNumber) { auto time = static_cast(diff.count() * 1000 / (iter * numStreams_)); - cout << "Create+Copy+Destroy time for " << numStreams_ << " streams and " - << setw(4) << numBuffers_ << " buffers " << " and " << setw(4) - << iter << " iterations " << time << " (ms) " << endl; + CONSOLE_PRINT( + "Create+Copy+Destroy time for %u streams and %u buffers and %zu iterations %.6f (ms)\n", + numStreams_, numBuffers_, iter, time); - delete [] hSrc; + delete[] hSrc; for (size_t b = 0; b < numBuffers_; ++b) { HIP_CHECK(hipFree(dSrc[b])); } - delete [] streams; - delete [] dSrc; + delete[] streams; + delete[] dSrc; return true; } /** -* Test Description -* ------------------------ -* - Verify the Create+Copy+Destroy time for different stream. -* Test source -* ------------------------ -* - perftests/stream/hipPerfDeviceConcurrency.cc -* Test requirements -* ------------------------ -* - HIP_VERSION >= 5.6 -*/ + * Test Description + * ------------------------ + * - Verify the Create+Copy+Destroy time for different stream. 
+ * Test source + * ------------------------ + * - perftests/stream/hipPerfDeviceConcurrency.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.6 + */ TEST_CASE("Perf_hipPerfStreamCreateCopyDestroy") { hipPerfStreamCreateCopyDestroy streamCCD; @@ -149,6 +149,6 @@ TEST_CASE("Perf_hipPerfStreamCreateCopyDestroy") { } /** -* End doxygen group perfStreamTest. -* @} -*/ + * End doxygen group perfStreamTest. + * @} + */ diff --git a/catch/perftests/vmm/CMakeLists.txt b/catch/perftests/vmm/CMakeLists.txt new file mode 100644 index 000000000..57f6d0a51 --- /dev/null +++ b/catch/perftests/vmm/CMakeLists.txt @@ -0,0 +1,27 @@ +# Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+ +set(TEST_SRC + hipPerfVMMAlloc.cc +) + +hip_add_exe_to_target(NAME perfVMMTest + TEST_SRC ${TEST_SRC} + TEST_TARGET_NAME perf_test) diff --git a/catch/perftests/vmm/hipPerfVMMAlloc.cc b/catch/perftests/vmm/hipPerfVMMAlloc.cc new file mode 100644 index 000000000..702615b2a --- /dev/null +++ b/catch/perftests/vmm/hipPerfVMMAlloc.cc @@ -0,0 +1,305 @@ +/* +Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/** + * @addtogroup hipMemCreate hipMemCreate + * @{ + * @ingroup perfVMMTest + * `hipMemCreate(hipMemGenericAllocationHandle_t* handle, size_t size, + * const hipMemAllocationProp* prop, unsigned long long flags)` - + * Creates a memory allocation described by the properties and size. + */ + +#include + +/** + * Test Description + * ------------------------ + * - Measure the CPU time of the VMM reserve/create/map/unmap/release/free calls.
+ * Test source + * ------------------------ + * - perftests/vmm/hipPerfVMMAlloc.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.6 + */ + +// Control Variables +constexpr bool single_map = false; +constexpr bool debug_failure = false; +constexpr size_t kMB = (1024 * 1024); +constexpr size_t kGB = (1024 * 1024 * 1024); +constexpr size_t chunk_size = (64 * kMB); + +bool CheckVMMSupportedOnDevice(int deviceId) { + int value = 0; + hipDeviceAttribute_t attr = hipDeviceAttributeVirtualMemoryManagementSupported; + HIP_CHECK(hipDeviceGetAttribute(&value, attr, deviceId)); + return static_cast(value); +} + +bool GetVMMGranularityOnDevice(int deviceId, size_t& granularity) { + hipMemAllocationProp prop{}; + prop.type = hipMemAllocationTypePinned; + prop.location.type = hipMemLocationTypeDevice; + prop.location.id = deviceId; // Current device + HIP_CHECK( + hipMemGetAllocationGranularity(&granularity, &prop, hipMemAllocationGranularityMinimum)); + return true; +} + +template size_t GetSizeN(size_t total_size) { + if (total_size % sizeof(T) != 0) { + INFO("Size " << total_size << " is not a multiple of size of type T"); + assert(false); + } + return (total_size / sizeof(T)); +} + +bool ValidateUsingCopy(int deviceId, void* dev_ptr, size_t data_size, + std::chrono::microseconds& h2d_elapsed, + std::chrono::microseconds& d2h_elapsed) { + // Get Host Data + std::vector A_h(data_size), B_h(data_size); + size_t size_n = GetSizeN(data_size); + + for (size_t idx = 0; idx < size_n; ++idx) { + A_h[idx] = idx; + B_h[idx] = 0; + } + + HIP_CHECK(hipSetDevice(deviceId)); + auto start = std::chrono::high_resolution_clock::now(); + HIP_CHECK(hipMemcpy(dev_ptr, A_h.data(), data_size, hipMemcpyHostToDevice)); + auto end = std::chrono::high_resolution_clock::now(); + h2d_elapsed = std::chrono::duration_cast(end - start); + + start = std::chrono::high_resolution_clock::now(); + HIP_CHECK(hipMemcpy(B_h.data(), dev_ptr, data_size, hipMemcpyDeviceToHost));
+ end = std::chrono::high_resolution_clock::now(); + d2h_elapsed = std::chrono::duration_cast(end - start); + + if (debug_failure) { + REQUIRE(true == std::equal(B_h.begin(), B_h.end(), A_h.data())); + } else { + assert(A_h.size() == B_h.size()); + for (size_t idx = 0; idx < A_h.size(); ++idx) { + if (A_h[idx] != B_h[idx]) { + std::cout << "Failed at first index: " << idx + << " Expected: " << A_h[idx] + << " Value: " << B_h[idx] << std::endl; + break; + } + } + } + return true; +} + +bool TestOnDevice(int deviceId) { + HIP_CHECK(hipSetDevice(deviceId)); + // Check if VMM is supported + if (!CheckVMMSupportedOnDevice(deviceId)) { + INFO("VMM is not suppored on device: " << deviceId); + return false; + } + // Get VMM granularity + size_t granularity = 0; + if (!GetVMMGranularityOnDevice(deviceId, granularity)) { + INFO("Cannot get granularity on device: " << deviceId); + return false; + } + + // Measure CPU time of allocation taken + size_t start_size = 1 * kGB; + size_t max_size = 64 * kGB; + for (size_t size_idx = start_size; size_idx <= max_size; (size_idx <<= 1)) { + void* dev_ptr = nullptr; + // This seems to be a completely blocking call, measuring CPU time in this test for now. 
+ // Create Memory Reservation + auto start = std::chrono::high_resolution_clock::now(); + HIP_CHECK(hipMemAddressReserve(&dev_ptr, size_idx, granularity, nullptr, 0)); + auto end = std::chrono::high_resolution_clock::now(); + std::chrono::microseconds reserve_elapsed + = std::chrono::duration_cast(end - start); + std::vector physmem_handles; + std::chrono::microseconds alloc_elapsed; + std::chrono::microseconds map_elapsed; + assert(size_idx % chunk_size == 0); + size_t chunk_max = size_idx / chunk_size; + size_t chunk_idx = 0; + + size_t freeVRAM = 0, totalVRAM = 0; + HIP_CHECK(hipMemGetInfo(&freeVRAM, &totalVRAM)); + INFO("Available total device memory : " << totalVRAM); + + if (freeVRAM < size_idx) { + WARN("Further free device memory unavailable, hence exiting!"); + break; + } + + if (single_map) { + // Create Physical memory + hipMemAllocationProp prop{}; + prop.type = hipMemAllocationTypePinned; + prop.location.type = hipMemLocationTypeDevice; + prop.location.id = deviceId; + hipMemGenericAllocationHandle_t handle; + + start = std::chrono::high_resolution_clock::now(); + HIP_CHECK(hipMemCreate(&handle, size_idx, &prop, 0)); + end = std::chrono::high_resolution_clock::now(); + alloc_elapsed = std::chrono::duration_cast(end - start); + physmem_handles.push_back(handle); + + // Map the memory + start = std::chrono::high_resolution_clock::now(); + HIP_CHECK(hipMemMap(dev_ptr, size_idx, 0, handle, 0)); + end = std::chrono::high_resolution_clock::now(); + map_elapsed = std::chrono::duration_cast(end - start); + } else { + start = std::chrono::high_resolution_clock::now(); + while (chunk_idx < chunk_max) { + hipMemAllocationProp prop{}; + prop.type = hipMemAllocationTypePinned; + prop.location.type = hipMemLocationTypeDevice; + prop.location.id = deviceId; + hipMemGenericAllocationHandle_t handle; + HIP_CHECK(hipMemCreate(&handle, chunk_size, &prop, 0)); + physmem_handles.push_back(handle); + ++chunk_idx; + } + end = std::chrono::high_resolution_clock::now(); + 
alloc_elapsed = std::chrono::duration_cast(end - start); + + chunk_idx = 0; + start = std::chrono::high_resolution_clock::now(); + while (chunk_idx < chunk_max) { + // Use chunk size to map multiple maps + uint64_t uiptr = reinterpret_cast(dev_ptr); + uiptr = uiptr + chunk_idx * chunk_size; + HIP_CHECK(hipMemMap(reinterpret_cast(uiptr), chunk_size, 0, + physmem_handles[chunk_idx], 0)); + ++chunk_idx; + } + end = std::chrono::high_resolution_clock::now(); + map_elapsed = std::chrono::duration_cast(end - start); + + chunk_idx = 0; + while (chunk_idx < chunk_max) { + // Set access + hipMemAccessDesc accessDesc = {}; + accessDesc.location.type = hipMemLocationTypeDevice; + accessDesc.location.id = deviceId; + accessDesc.flags = hipMemAccessFlagsProtReadWrite; + uint64_t uiptr = reinterpret_cast(dev_ptr); + uiptr = uiptr + chunk_idx * chunk_size; + // Make the address accessible to GPU 0 + HIP_CHECK(hipMemSetAccess(reinterpret_cast(uiptr), chunk_size, &accessDesc, 1)); + ++chunk_idx; + } + } + + // Also measure the memcpy time elapsed + std::chrono::microseconds h2d_elapsed; + std::chrono::microseconds d2h_elapsed; + + // Validate using copy + if (!ValidateUsingCopy(deviceId, dev_ptr, size_idx, h2d_elapsed, d2h_elapsed)) { + INFO("Validation failed for size: " << size_idx); + } + + start = std::chrono::high_resolution_clock::now(); + chunk_idx = 0; + while (chunk_idx < chunk_max) { + uint64_t uiptr = reinterpret_cast(dev_ptr); + uiptr = uiptr + chunk_idx * chunk_size; + HIP_CHECK(hipMemUnmap(reinterpret_cast(uiptr), chunk_size)); + ++chunk_idx; + } + end = std::chrono::high_resolution_clock::now(); + std::chrono::microseconds unmap_elapsed + = std::chrono::duration_cast(end - start); + + start = std::chrono::high_resolution_clock::now(); + for (auto& physmem_handle : physmem_handles) { + HIP_CHECK(hipMemRelease(physmem_handle)); + } + end = std::chrono::high_resolution_clock::now(); + std::chrono::microseconds release_elapsed + = std::chrono::duration_cast(end - start); 
+ + start = std::chrono::high_resolution_clock::now(); + HIP_CHECK(hipMemAddressFree(dev_ptr, size_idx)); + end = std::chrono::high_resolution_clock::now(); + std::chrono::microseconds free_elapsed + = std::chrono::duration_cast(end - start); + + // Print the results + std::cout << "-------------Size: " << (size_idx / kGB) << " GB----------------" << std::endl; + std::cout << "Time taken to reserve : " << reserve_elapsed.count() + << " micro seconds and free: " << free_elapsed.count() + << " micro seconds" << std::endl; + std::cout <<"Time taken to alloc : " << alloc_elapsed.count() + << " micro seconds and release: "<< release_elapsed.count() + << " micro seconds" << std::endl; + std::cout << "Time taken to map : " << map_elapsed.count() + << " micro seconds and unmap: " << unmap_elapsed.count() + << " micro seconds" << std::endl; + std::cout << "Time taken to H2D : " << h2d_elapsed.count() + << " micro seconds and D2H: " << d2h_elapsed.count() << " micro seconds" << std::endl; + std::cout << "-------------------------/hipMallocPerf------------------------" << std::endl; + + void* dev_ptr_legacy = nullptr; + start = std::chrono::high_resolution_clock::now(); + HIP_CHECK(hipMalloc(&dev_ptr_legacy, size_idx)); + end = std::chrono::high_resolution_clock::now(); + std::chrono::microseconds hm_elapsed + = std::chrono::duration_cast(end - start); + start = std::chrono::high_resolution_clock::now(); + HIP_CHECK(hipFree(dev_ptr_legacy)); + end = std::chrono::high_resolution_clock::now(); + std::chrono::microseconds hf_elapsed + = std::chrono::duration_cast(end - start); + std::cout << "Time taken for hipMalloc : " << hm_elapsed.count() + << " micro seconds and hipFree: " << hf_elapsed.count() + << " micro seconds" << std::endl; + std::cout << "---------------------------------------------------------------" << std::endl; + std::cout << std::endl; + } + + return true; +} + +TEST_CASE("Perf_hipPerfVMMAllocSpeed_test") { + int numDevices = 0; + 
HIP_CHECK(hipGetDeviceCount(&numDevices)); + if (numDevices <= 0) { + SUCCEED( + "Skipped testcase hipPerfBufferCopySpeed as" + "there is no device to test."); + } else { + // Test on Primary Device first + int deviceId = 0; + TestOnDevice(deviceId); + } +} +/** + * End doxygen group perfVMMTest. + * @} + */ diff --git a/catch/unit/atomics/arithmetic_common.hh b/catch/unit/atomics/arithmetic_common.hh index 3ff5bb4d7..2b4db75dd 100644 --- a/catch/unit/atomics/arithmetic_common.hh +++ b/catch/unit/atomics/arithmetic_common.hh @@ -446,6 +446,7 @@ void TestCore(const TestParams& p) { // Launch Kernel for (auto i = 0u; i < p.num_devices; ++i) { + HIP_CHECK(hipSetDevice(i)); for (auto j = 0u; j < p.kernel_count; ++j) { const auto& stream = streams[i * p.kernel_count + j].stream(); const auto old_vals = old_vals_devs[i].ptr() + j * p.ThreadCount(); @@ -580,6 +581,8 @@ void MultipleDeviceMultipleKernelAndHostTest(const unsigned int num_devices, } } + CHECK_P2P_SUPPORT + if (kernel_count > 1) { for (auto i = 0u; i < num_devices; ++i) { int canAccess = 0; diff --git a/catch/unit/atomics/atomicExch_common.hh b/catch/unit/atomics/atomicExch_common.hh index e0fcf84c8..ba3105de6 100644 --- a/catch/unit/atomics/atomicExch_common.hh +++ b/catch/unit/atomics/atomicExch_common.hh @@ -395,8 +395,21 @@ void AtomicExchMultipleDeviceMultipleKernelAndHostTest(const unsigned int num_de } } + CHECK_P2P_SUPPORT + if (kernel_count > 1) { for (auto i = 0u; i < num_devices; ++i) { + int canAccess = 0; + for (auto j = 0u; j < num_devices; ++j) { + if (i != j) { + HIP_CHECK(hipDeviceCanAccessPeer(&canAccess, i, j)); + if(canAccess == 0) { + std::string msg = "P2P access check failed between dev1:" + std::to_string(i) + ",dev2:" + std::to_string(j); + HipTest::HIP_SKIP_TEST(msg.c_str()); + return; + } + } + } int concurrent_kernels = 0; HIP_CHECK(hipDeviceGetAttribute(&concurrent_kernels, hipDeviceAttributeConcurrentKernels, i)); if (!concurrent_kernels) { diff --git 
a/catch/unit/atomics/bitwise_common.hh b/catch/unit/atomics/bitwise_common.hh index b61167968..9c7bf0f5d 100644 --- a/catch/unit/atomics/bitwise_common.hh +++ b/catch/unit/atomics/bitwise_common.hh @@ -272,6 +272,7 @@ void TestCore(const TestParams& p) { } // Launch Kernel and get back old vals for (auto i = 0u; i < p.num_devices; ++i) { + HIP_CHECK(hipSetDevice(i)); for (auto j = 0u; j < p.kernel_count; ++j) { const auto& stream = streams[i * p.kernel_count + j].stream(); const auto old_vals = old_vals_devs[i].ptr() + j * p.ThreadCount(); diff --git a/catch/unit/atomics/min_max_common.hh b/catch/unit/atomics/min_max_common.hh index 467cfdcae..1f6d180f8 100644 --- a/catch/unit/atomics/min_max_common.hh +++ b/catch/unit/atomics/min_max_common.hh @@ -302,6 +302,7 @@ void TestCore(const TestParams& p) { // Launch kernel for (auto i = 0u; i < p.num_devices; ++i) { + HIP_CHECK(hipSetDevice(i)); for (auto j = 0u; j < p.kernel_count; ++j) { const auto& stream = streams[i * p.kernel_count + j].stream(); const auto old_vals = old_vals_devs[i].ptr() + j * p.ThreadCount(); @@ -422,8 +423,21 @@ void MultipleDeviceMultipleKernelTest(const unsigned int num_devices, } } + CHECK_P2P_SUPPORT + if (kernel_count > 1) { for (auto i = 0u; i < num_devices; ++i) { + int canAccess = 0; + for (auto j = 0u; j < num_devices; ++j) { + if (i != j) { + HIP_CHECK(hipDeviceCanAccessPeer(&canAccess, i, j)); + if(canAccess == 0) { + std::string msg = "P2P access check failed between dev1:" + std::to_string(i) + ",dev2:" + std::to_string(j); + HipTest::HIP_SKIP_TEST(msg.c_str()); + return; + } + } + } int concurrent_kernels = 0; HIP_CHECK(hipDeviceGetAttribute(&concurrent_kernels, hipDeviceAttributeConcurrentKernels, i)); if (!concurrent_kernels) { diff --git a/catch/unit/atomics/unsafeAtomicAdd.cc b/catch/unit/atomics/unsafeAtomicAdd.cc index 1fde5861c..0e31a38e6 100644 --- a/catch/unit/atomics/unsafeAtomicAdd.cc +++ b/catch/unit/atomics/unsafeAtomicAdd.cc @@ -172,6 +172,7 @@ 
TEMPLATE_TEST_CASE("Unit_unsafe_atomic_add_half_and_bfloat", "", __half2, __hip_ REQUIRE(hout.x == 32.0f); REQUIRE(hout.y == 64.0f); + HIP_CHECK(hipFree(out)); } /** diff --git a/catch/unit/compiler/CMakeLists.txt b/catch/unit/compiler/CMakeLists.txt index 2b2f50c52..6f1278d40 100644 --- a/catch/unit/compiler/CMakeLists.txt +++ b/catch/unit/compiler/CMakeLists.txt @@ -19,7 +19,7 @@ if(HIP_PLATFORM MATCHES "amd") TEST_TARGET_NAME build_tests) set(OFFLOAD_ARCH_GENERIC_STR "--offload-arch=gfx9-generic --offload-arch=gfx9-4-generic:sramecc+:xnack- --offload-arch=gfx9-4-generic:sramecc-:xnack- --offload-arch=gfx9-4-generic:xnack+ --offload-arch=gfx10-1-generic --offload-arch=gfx10-3-generic --offload-arch=gfx11-generic --offload-arch=gfx12-generic") - + set(DISABLE_GENERIC_TARGET_ONLY) # Build hipSquareGenericTargetOnly to cover generic targets only @@ -62,7 +62,7 @@ if(HIP_PLATFORM MATCHES "amd") set_property(GLOBAL APPEND PROPERTY G_INSTALL_CUSTOM_TARGETS ${CMAKE_CURRENT_BINARY_DIR}/${GENERIC_TARGET_ONLY_EXE}) set_property(GLOBAL APPEND PROPERTY G_INSTALL_CUSTOM_TARGETS ${CMAKE_CURRENT_BINARY_DIR}/${GENERIC_TARGET_ONLY_COMPRESSED_EXE}) else() - set(DISABLE_GENERIC_TARGET_ONLY "-DNO_GENERIC_TARGET_ONLY_TEST") + set(DISABLE_GENERIC_TARGET_ONLY "-DNO_GENERIC_TARGET_ONLY_TEST") endif() # Build hipSquareGenericTarget to cover generic targets and the specific target @@ -84,4 +84,19 @@ if(HIP_PLATFORM MATCHES "amd") add_dependencies(hipSquareGenericTarget hipSquareGenericTargetOnly) add_dependencies(hipSquareGenericTarget hipSquareGenericTargetOnlyCompressed) endif() + + # SWDEV-548807 skip building hipSpirvTest + if(false) + add_custom_target(hipSpirvTest ALL + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CURRENT_SOURCE_DIR}/hipSpirvTest.cc + ${CMAKE_CURRENT_SOURCE_DIR}/../../hipTestMain/hip_test_context.cc + ${CMAKE_CURRENT_SOURCE_DIR}/../../hipTestMain/main.cc + -I${CMAKE_CURRENT_SOURCE_DIR}/../../include + -I${CMAKE_CURRENT_SOURCE_DIR}/../../external/Catch2 + 
-I${CMAKE_CURRENT_SOURCE_DIR}/../../external/picojson + -I${HIP_PATH}/include/ --rocm-path=${ROCM_PATH} --offload-arch=amdgcnspirv + -o ${CMAKE_CURRENT_BINARY_DIR}/../../unit/compiler/hipSpirvTest) + add_dependencies(CompilerTest hipSpirvTest) + endif() + endif() diff --git a/catch/unit/compiler/hipSpirvTest.cc b/catch/unit/compiler/hipSpirvTest.cc new file mode 100644 index 000000000..cd54350fd --- /dev/null +++ b/catch/unit/compiler/hipSpirvTest.cc @@ -0,0 +1,28 @@ +/* +Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include + +__global__ void kernel() { asm volatile("v_nop" ::: "memory"); } + +// This test case compiles with --offload-arch=amdgcnspirv to verify SPIRV mode +TEST_CASE("Unit_test_spirv_mode") { kernel<<<1, 32>>>(); } \ No newline at end of file diff --git a/catch/unit/compiler/hipSquare.cc b/catch/unit/compiler/hipSquare.cc index 693f2b2e5..94f4529cd 100644 --- a/catch/unit/compiler/hipSquare.cc +++ b/catch/unit/compiler/hipSquare.cc @@ -78,6 +78,10 @@ TEST_CASE("Unit_test_compressed_codeobject") { HIP_CHECK(hipErrorUnknown); } } + HIP_CHECK(hipFree(A_d)); + HIP_CHECK(hipFree(C_d)); + free(A_h); + free(C_h); printf("PASSED!\n"); REQUIRE(true); } diff --git a/catch/unit/context/CMakeLists.txt b/catch/unit/context/CMakeLists.txt index 0d20df1bf..8f5ac897a 100644 --- a/catch/unit/context/CMakeLists.txt +++ b/catch/unit/context/CMakeLists.txt @@ -22,6 +22,7 @@ set(TEST_SRC hipDrvGetPCIBusId.cc hipDrvMemcpy.cc hipMemsetD8.cc + hipCtxNotSupported.cc ) hip_add_exe_to_target(NAME Context TEST_SRC ${TEST_SRC} diff --git a/catch/unit/context/hipCtxNotSupported.cc b/catch/unit/context/hipCtxNotSupported.cc new file mode 100644 index 000000000..cf6392bc0 --- /dev/null +++ b/catch/unit/context/hipCtxNotSupported.cc @@ -0,0 +1,264 @@ +/* +Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include + +/** + * Test Description + * ------------------------ + * - Validates handling of invalid arguments: + * -# When a negative (invalid) device id is passed + * - Expected output: return `hipErrorInvalidDevice` + * -# When a valid device id is passed + * - Expected output: return `hipErrorContextAlreadyInUse` + * Test source + * ------------------------ + * - unit/context/hipCtxNotSupported.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 6.4 + */ +TEST_CASE("Unit_hipDevicePrimaryCtxSetFlags_Negative") { + hipDevice_t dev; + unsigned int flags = 0; + SECTION("Negative device index") { + dev = static_cast(-1); + auto res = hipDevicePrimaryCtxSetFlags(dev, flags); + REQUIRE(res == hipErrorInvalidDevice); + } + SECTION("Valid device index") { + dev = static_cast(0); + auto res = hipDevicePrimaryCtxSetFlags(dev, flags); + REQUIRE(res == hipErrorContextAlreadyInUse); + } +} + +/** + * Test Description + * ------------------------ + * - Validates handling of invalid arguments: + * -# When nullptr passed to hipCtxGetDevice + * - Expected output: return `hipErrorInvalidValue` + * -# When a non-nullptr is passed + * - Expected output: returned device ID, within [0, numDevices) + * Test source + * ------------------------ + * - unit/context/hipCtxNotSupported.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 6.4 + */ +TEST_CASE("Unit_hipDeviceAPIs_not_supported") { + hipDevice_t device; + int numDevices = -1; + HIP_CHECK(hipDeviceGet(&device, 0)); + auto res =
hipGetDeviceCount(&numDevices); + REQUIRE(res == hipSuccess); + REQUIRE(numDevices > 0); + + SECTION("hipDevicePrimaryCtxReset_not_supported") { HIP_CHECK(hipDevicePrimaryCtxReset(device)); } + + SECTION("hipCtxGetDevice_not_supported") { + SECTION("hipCtxGetDevice") { + auto res = hipCtxGetDevice(nullptr); + REQUIRE(res == hipErrorInvalidValue); + } + SECTION("hipCtxGetDevice_deviceCount") { + hipDevice_t dev = static_cast(-1); + HIP_CHECK(hipCtxGetDevice(&dev)); + // Ensure the returned device ID is within [0, numDevices) + REQUIRE(dev >= 0); + REQUIRE(dev < numDevices); + } + } +} + +/** + * Test Description + * ------------------------ + * - Validates handling of invalid arguments: + * -# any value not equal to the four valid hipFuncCache_t constants + * - Expected output: return `hipErrorInvalidValue` + * -# When valid enum values are passed + * - Expected output: return `hipErrorNotSupported` + * Test source + * ------------------------ + * - unit/context/hipCtxNotSupported.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 6.4 + */ +TEST_CASE("Unit_hipCtxGetSetCacheConfig_not_supported") { + hipFuncCache_t cacheConfig; + SECTION("hipCtxSetCacheConfig_not_supported") { + SECTION("Invalid enum value") { + // any value not equal to the four valid hipFuncCache_t constants + cacheConfig = static_cast(0x100); + auto res = hipCtxSetCacheConfig(cacheConfig); + REQUIRE(res == hipErrorInvalidValue); + } + + SECTION("Valid enum values") { + std::array validCfgs = {hipFuncCachePreferNone, hipFuncCachePreferShared, + hipFuncCachePreferL1, hipFuncCachePreferEqual}; + + for (auto cfg : validCfgs) { + auto res = hipCtxSetCacheConfig(cfg); + REQUIRE(res == hipErrorNotSupported); + } + } + } + SECTION("hipCtxGetCacheConfig_not_supported") { + auto res = hipCtxGetCacheConfig(&cacheConfig); + REQUIRE(res == hipErrorNotSupported); + } +} + +/** + * Test Description + * ------------------------ + * - hipCtxGetSetSharedMemConfig APIs are verified to be
unsupported + * or return an empty hipSuccess + * Test source + * ------------------------ + * - unit/context/hipCtxNotSupported.cc + * Test requirements + * ------------------------ + * - Textures supported on device + * - HIP_VERSION >= 6.4 + */ +TEST_CASE("Unit_hipCtxGetSetSharedMemConfig_not_supported") { + hipSharedMemConfig config; + config = hipSharedMemBankSizeEightByte; + SECTION("hipCtxSetSharedMemConfig_not_supported") { + auto res = hipCtxGetSharedMemConfig(&config); + REQUIRE(res == hipSuccess); + REQUIRE(config == hipSharedMemBankSizeFourByte); + } + SECTION("hipCtxSetSharedMemConfig_not_supported") { + auto res = hipCtxSetSharedMemConfig(config); + REQUIRE(res == hipErrorNotSupported); + } +} + +/** + * Test Description + * ------------------------ + * - hipCtxEnable/DisablePeerAccess APIs are verified to return hipSuccess: + * Test source + * ------------------------ + * - unit/context/hipCtxNotSupported.cc + * Test requirements + * ------------------------ + * - Textures supported on device + * - HIP_VERSION >= 6.4 + */ +TEST_CASE("Unit_hipCtxPeerAccess_not_supported") { + hipCtx_t peerCtx = nullptr; + unsigned int flags = 0; + SECTION("hipCtxEnablePeerAccess_not_supported") { + auto res = hipCtxEnablePeerAccess(peerCtx, flags); + REQUIRE(res == hipSuccess); + } + SECTION("hipCtxDisablePeerAccess_not_supported") { + auto res = hipCtxDisablePeerAccess(peerCtx); + REQUIRE(res == hipSuccess); + } +} + +/** + * Test Description + * ------------------------ + * - hipCtx APIs are verified to be unsupported: + * Test source + * ------------------------ + * - unit/context/hipCtxNotSupported.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 6.4 + */ +TEST_CASE("Unit_hipCtxAPIs_not_supported") { + SECTION("hipCtxGetFlags_not_supported") { + unsigned int flags = 0x100; + auto res = hipCtxGetFlags(&flags); + REQUIRE(res == hipErrorNotSupported); + // In release builds (asserts disabled), flags should remain unchanged + REQUIRE(flags ==
0x100); + } + + SECTION("hipCtxSynchronize_not_supported") { + auto res = hipCtxSynchronize(); + REQUIRE(res == hipErrorNotSupported); + } + + SECTION("hipCtxGetApiVersion_not_supported") { + hipCtx_t ctx = nullptr; + unsigned int apiVersion; + auto res = hipCtxGetApiVersion(ctx, &apiVersion); + REQUIRE(res == hipErrorNotSupported); + } +} + +/** + * Test Description + * ------------------------ + * - Goes through the retain-reset-release cycle on a valid and invalid device: + * Verifies + * - a valid primary context is returned + * - an active state is returned + * - an invalidDevice is returned + * Test source + * ------------------------ + * - unit/context/hipCtxNotSupported.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 6.4 + */ +TEST_CASE("hipDevicePrimaryCtxGetState_Negative") { + hipDevice_t device; + HIP_CHECK(hipDeviceGet(&device, 0)); + hipCtx_t primaryCtx = nullptr; + + SECTION("Valid device") { + HIP_CHECK(hipDevicePrimaryCtxRetain(&primaryCtx, device)); + REQUIRE(primaryCtx != nullptr); + // Make it current + HIP_CHECK(hipCtxSetCurrent(primaryCtx)); + unsigned int flags = 0; + int active = 0; + HIP_CHECK(hipDevicePrimaryCtxGetState(device, &flags, &active)); + // Reset the primary context + HIP_CHECK(hipDevicePrimaryCtxReset(device)); + // Release our retain-handle + HIP_CHECK(hipDevicePrimaryCtxRelease(device)); + } + SECTION("Invalid device") { + device = -1; + // Retain the primary context + auto res = hipDevicePrimaryCtxRetain(&primaryCtx, device); + REQUIRE(res == hipErrorInvalidDevice); + unsigned int flags = 0; + int active = 0; + res = hipDevicePrimaryCtxGetState(device, &flags, &active); + REQUIRE(res == hipErrorInvalidDevice); + // Release our retain-handle + res = hipDevicePrimaryCtxRelease(device); + REQUIRE(res == hipErrorInvalidDevice); + } +} diff --git a/catch/unit/cooperativeGrps/coalesced_groups_shfl_down_old.cc b/catch/unit/cooperativeGrps/coalesced_groups_shfl_down_old.cc index 675da24dd..f395c5978 100644 
--- a/catch/unit/cooperativeGrps/coalesced_groups_shfl_down_old.cc +++ b/catch/unit/cooperativeGrps/coalesced_groups_shfl_down_old.cc @@ -233,6 +233,7 @@ static void test_shfl_down() { HIPCHECK(hipHostFree(hPtr)); HIPCHECK(hipFree(dPtr)); + HIPCHECK(hipFree(dResults)); free(cpuResultsArr); } } diff --git a/catch/unit/cooperativeGrps/coalesced_groups_shfl_up_old.cc b/catch/unit/cooperativeGrps/coalesced_groups_shfl_up_old.cc index 54cd62c19..0118caf45 100644 --- a/catch/unit/cooperativeGrps/coalesced_groups_shfl_up_old.cc +++ b/catch/unit/cooperativeGrps/coalesced_groups_shfl_up_old.cc @@ -221,6 +221,7 @@ static void test_shfl_up() { HIPCHECK(hipHostFree(hPtr)); HIPCHECK(hipFree(dPtr)); + HIPCHECK(hipFree(dResults)); free(cpuResultsArr); } } diff --git a/catch/unit/cooperativeGrps/hipCGCoalescedGroups_old.cc b/catch/unit/cooperativeGrps/hipCGCoalescedGroups_old.cc index 11933e99d..9f0d2d854 100644 --- a/catch/unit/cooperativeGrps/hipCGCoalescedGroups_old.cc +++ b/catch/unit/cooperativeGrps/hipCGCoalescedGroups_old.cc @@ -394,6 +394,8 @@ static void test_shfl_any_to_any() { HIPCHECK(hipHostFree(hPtr)); HIPCHECK(hipFree(dPtr)); + HIPCHECK(hipFree(dResults)); + HIPCHECK(hipFree(dsrcArr)); free(srcArr); free(srcArrCpu); free(cpuResultsArr); @@ -461,6 +463,7 @@ static void test_shfl_broadcast() { HIPCHECK(hipHostFree(hPtr)); HIPCHECK(hipFree(dPtr)); + HIPCHECK(hipFree(dResults)); free(cpuResultsArr); } } @@ -554,4 +557,11 @@ TEST_CASE("Unit_coalesced_groups") { std::cout << "Now grouping active threads based on branch divergence" << '\n' << std::endl; test_active_threads_grouping(); + + HIPCHECK(hipFree(d_data_to_filter)); + HIPCHECK(hipFree(d_filtered_data)); + HIPCHECK(hipFree(d_nres)); + free(data_to_filter); + free(filtered_data); + free(host_filtered_data); } diff --git a/catch/unit/executionControl/hipExtLaunchMultiKernelMultiDevice.cc b/catch/unit/executionControl/hipExtLaunchMultiKernelMultiDevice.cc index fd1dcc829..2c19a33d1 100644 --- 
a/catch/unit/executionControl/hipExtLaunchMultiKernelMultiDevice.cc +++ b/catch/unit/executionControl/hipExtLaunchMultiKernelMultiDevice.cc @@ -141,4 +141,4 @@ TEST_CASE("Unit_hipExtLaunchMultiKernelMultiDevice_Negative_MultiKernelSameDevic for (const auto params : params_list) { HIP_CHECK(hipStreamDestroy(params.stream)); } -} \ No newline at end of file +} diff --git a/catch/unit/graph/hipGetProcAddressGraphApis.cc b/catch/unit/graph/hipGetProcAddressGraphApis.cc index 25fbc6cd2..63e50979a 100644 --- a/catch/unit/graph/hipGetProcAddressGraphApis.cc +++ b/catch/unit/graph/hipGetProcAddressGraphApis.cc @@ -375,9 +375,7 @@ TEST_CASE("Unit_hipGetProcAddress_GraphAPIs_AddMemsetMemcpyNodes") { hipGraphExec_t graphExec; HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); HIP_CHECK(hipGraphLaunch(graphExec, 0)); - #ifdef _WIN32 HIP_CHECK(hipStreamSynchronize(0)); - #endif REQUIRE(validateArrayT(hostMemDst, N, value) == true); diff --git a/catch/unit/memory/hipMallocAsync.cc b/catch/unit/memory/hipMallocAsync.cc index 3a5baf4f1..6adb1088a 100644 --- a/catch/unit/memory/hipMallocAsync.cc +++ b/catch/unit/memory/hipMallocAsync.cc @@ -319,8 +319,8 @@ TEST_CASE("Unit_hipMallocAsync_Multidevice") { * - HIP_VERSION >= 6.2 */ #if HT_AMD -static void threadQAsyncCommands(streamMemAllocTest* testObj, - hipStream_t strm) { +static void threadQAsyncCommands(streamMemAllocTest* testObj, hipStream_t strm, int idx) { + HIP_CHECK(hipSetDevice(idx)); // Create host buffer with test data. testObj->createHostBufferWithData(); // Allocate device memory and transfer data to it asyncronously on stream. 
@@ -350,7 +350,7 @@ TEST_CASE("Unit_hipMallocAsync_Multidevice_Concurrent") { // Queue commands in each device for (int idx = 0; idx < num_devices; idx++) { HIP_CHECK(hipSetDevice(idx)); - std::thread test(threadQAsyncCommands, tesObjBuf[idx], stream_buf[idx]); + std::thread test(threadQAsyncCommands, tesObjBuf[idx], stream_buf[idx], idx); test.join(); } // Wait for the streams @@ -405,10 +405,10 @@ TEST_CASE("Unit_hipMallocAsync_Multidevice_MultiStream") { // Queue commands in each device for (int idx = 0; idx < num_devices; idx++) { HIP_CHECK(hipSetDevice(idx)); - std::thread test1(threadQAsyncCommands, tesObjBuf[streamPerAsic*idx], - stream_buf[streamPerAsic*idx]); - std::thread test2(threadQAsyncCommands, tesObjBuf[streamPerAsic*idx + 1], - stream_buf[streamPerAsic*idx + 1]); + std::thread test1(threadQAsyncCommands, tesObjBuf[streamPerAsic * idx], + stream_buf[streamPerAsic * idx], idx); + std::thread test2(threadQAsyncCommands, tesObjBuf[streamPerAsic * idx + 1], + stream_buf[streamPerAsic * idx + 1], idx); test1.join(); test2.join(); } diff --git a/catch/unit/memory/hipMallocFromPoolAsync.cc b/catch/unit/memory/hipMallocFromPoolAsync.cc index 6993dfa87..ca9f41f07 100644 --- a/catch/unit/memory/hipMallocFromPoolAsync.cc +++ b/catch/unit/memory/hipMallocFromPoolAsync.cc @@ -371,8 +371,8 @@ TEST_CASE("Unit_hipMallocFromPoolAsync_ReleaseThreshold_Mgpu") { /** * Local Thread Functions */ -static void threadQAsyncCommands(streamMemAllocTest* testObj, - hipStream_t strm) { +static void threadQAsyncCommands(streamMemAllocTest* testObj, hipStream_t strm, int idx) { + HIP_CHECK(hipSetDevice(idx)); // Create host buffer with test data. testObj->createHostBufferWithData(); // Allocate device memory and transfer data to it asyncronously on stream. 
@@ -616,7 +616,7 @@ TEST_CASE("Unit_hipMallocFromPoolAsync_Multidevice_Concurrent") { // Queue commands in each device for (int idx = 0; idx < num_devices; idx++) { HIP_CHECK(hipSetDevice(idx)); - std::thread test(threadQAsyncCommands, tesObjBuf[idx], stream_buf[idx]); + std::thread test(threadQAsyncCommands, tesObjBuf[idx], stream_buf[idx], idx); test.join(); } // Wait for the streams @@ -675,10 +675,10 @@ TEST_CASE("Unit_hipMallocFromPoolAsync_Multidevice_MultiStream") { // Queue commands in each device for (int idx = 0; idx < num_devices; idx++) { HIP_CHECK(hipSetDevice(idx)); - std::thread test1(threadQAsyncCommands, tesObjBuf[streamPerAsic*idx], - stream_buf[streamPerAsic*idx]); - std::thread test2(threadQAsyncCommands, tesObjBuf[streamPerAsic*idx + 1], - stream_buf[streamPerAsic*idx + 1]); + std::thread test1(threadQAsyncCommands, tesObjBuf[streamPerAsic * idx], + stream_buf[streamPerAsic * idx], idx); + std::thread test2(threadQAsyncCommands, tesObjBuf[streamPerAsic * idx + 1], + stream_buf[streamPerAsic * idx + 1], idx); test1.join(); test2.join(); } diff --git a/catch/unit/memory/hipMemAdvise_old.cc b/catch/unit/memory/hipMemAdvise_old.cc index 9b785aa61..6116aa34b 100644 --- a/catch/unit/memory/hipMemAdvise_old.cc +++ b/catch/unit/memory/hipMemAdvise_old.cc @@ -836,7 +836,6 @@ TEST_CASE("Unit_hipMemAdvise_ReadMosltyMgpuTst") { int *Hmm = NULL, NumElms = (1024 * 1024), InitVal = 123, blockSize = 64; int *Hmm1 = NULL, DataMismatch = 0; hipStream_t strm; - HIP_CHECK(hipStreamCreate(&strm)); HIP_CHECK(hipMallocManaged(&Hmm, (NumElms * sizeof(int)))); // Initializing memory for (int i = 0; i < NumElms; ++i) { @@ -852,6 +851,7 @@ TEST_CASE("Unit_hipMemAdvise_ReadMosltyMgpuTst") { for (int i = 1; i < Ngpus; ++i) { DataMismatch = 0; HIP_CHECK(hipSetDevice(i)); + HIP_CHECK(hipStreamCreate(&strm)); HIP_CHECK(hipMallocManaged(&Hmm1, (NumElms * sizeof(int)))); MemAdvise3<<>>(Hmm, Hmm1, NumElms); HIP_CHECK(hipStreamSynchronize(strm)); @@ -865,6 +865,7 @@ 
TEST_CASE("Unit_hipMemAdvise_ReadMosltyMgpuTst") { WARN("DataMismatch is observed with the gpu: " << i); REQUIRE(false); } + HIP_CHECK(hipStreamDestroy(strm)); HIP_CHECK(hipFree(Hmm1)); } } @@ -873,10 +874,12 @@ TEST_CASE("Unit_hipMemAdvise_ReadMosltyMgpuTst") { for (int i = 0; i < Ngpus; ++i) { DataMismatch = 0; HIP_CHECK(hipSetDevice(i)); + HIP_CHECK(hipStreamCreate(&strm)); HIP_CHECK(hipMemAdvise(Hmm, (NumElms * sizeof(int)), hipMemAdviseSetReadMostly, i)); MemAdvise2<<>>(Hmm, NumElms); HIP_CHECK(hipStreamSynchronize(strm)); + HIP_CHECK(hipStreamDestroy(strm)); } // verifying the final result for (int i = 0; i < NumElms; ++i) { @@ -892,7 +895,7 @@ TEST_CASE("Unit_hipMemAdvise_ReadMosltyMgpuTst") { } #endif HIP_CHECK(hipFree(Hmm)); - HIP_CHECK(hipStreamDestroy(strm)); + } else { SUCCEED("GPU 0 doesn't support hipDeviceAttributeManagedMemory " "attribute. Hence skipping the testing with Pass result.\n"); diff --git a/catch/unit/memory/hipMemcpyPeerAsync.cc b/catch/unit/memory/hipMemcpyPeerAsync.cc index 5e1b384ff..c27fa9c0b 100644 --- a/catch/unit/memory/hipMemcpyPeerAsync.cc +++ b/catch/unit/memory/hipMemcpyPeerAsync.cc @@ -51,9 +51,6 @@ TEST_CASE("Unit_hipMemcpyPeerAsync_Positive_Default") { HipTest::HIP_SKIP_TEST("Skipping because devices < 2"); return; } - const auto stream_type = GENERATE(Streams::nullstream, Streams::perThread, Streams::created); - const StreamGuard stream_guard(stream_type); - const hipStream_t stream = stream_guard.stream(); const auto allocation_size = GENERATE(kPageSize / 2, kPageSize, kPageSize * 2); @@ -64,6 +61,11 @@ TEST_CASE("Unit_hipMemcpyPeerAsync_Positive_Default") { INFO("Src device: " << src_device << ", Dst device: " << dst_device); HIP_CHECK(hipSetDevice(src_device)); + + const auto stream_type = GENERATE(Streams::nullstream, Streams::perThread, Streams::created); + const StreamGuard stream_guard(stream_type); + const hipStream_t stream = stream_guard.stream(); + HIP_CHECK(hipDeviceCanAccessPeer(&can_access_peer, src_device, 
dst_device)); if (can_access_peer) { HIP_CHECK(hipDeviceEnablePeerAccess(dst_device, 0)); diff --git a/catch/unit/memory/hipMemcpyWithStreamMultiThread.cc b/catch/unit/memory/hipMemcpyWithStreamMultiThread.cc index 6e906d6ab..45d455e5c 100644 --- a/catch/unit/memory/hipMemcpyWithStreamMultiThread.cc +++ b/catch/unit/memory/hipMemcpyWithStreamMultiThread.cc @@ -510,6 +510,7 @@ void HipMemcpyWithStreamMultiThreadtests::TestkindDefaultForDtoD(bool& val_res) } for (int i = 0; i < numDevices; ++i) { + HIP_CHECK_THREAD(hipSetDevice(i)); hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, stream[i], static_cast(A_d[i]), static_cast(B_d[i]), C_d[i], N); HIP_CHECK_THREAD(hipGetLastError()); diff --git a/catch/unit/memory/hipMemcpyWithStream_old.cc b/catch/unit/memory/hipMemcpyWithStream_old.cc index 2b9bba966..b0f730994 100644 --- a/catch/unit/memory/hipMemcpyWithStream_old.cc +++ b/catch/unit/memory/hipMemcpyWithStream_old.cc @@ -475,6 +475,7 @@ void TestkindDefaultForDtoD(void) { } for (int i=0; i < NumDevices; ++i) { + HIP_CHECK(hipSetDevice(i)); hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, stream[i], static_cast(A_d[i]), diff --git a/catch/unit/memory/hipSVMTestByteGranularity.cpp b/catch/unit/memory/hipSVMTestByteGranularity.cpp index 9f3a89614..207bf7d1b 100644 --- a/catch/unit/memory/hipSVMTestByteGranularity.cpp +++ b/catch/unit/memory/hipSVMTestByteGranularity.cpp @@ -109,6 +109,7 @@ TEST_CASE("test_svm_byte_granularity") { // get all the devices going simultaneously for(unsigned int d = 0; d < num_devices; d++) // device ids starting at 1. 
{ + HIP_CHECK(hipSetDevice(d)); write_owned_locations<<>>(pA, num_devices_plus_host, d); HIP_CHECK(hipGetLastError()); } @@ -125,6 +126,7 @@ TEST_CASE("test_svm_byte_granularity") { size_t adjusted_num_elements = num_elements - num_devices; for(unsigned int d = 0; d < num_devices; d++) { + HIP_CHECK(hipSetDevice(d)); sum_neighbor_locations<<>>(pA, num_devices_plus_host, error_counts[d]); HIP_CHECK(hipGetLastError()); diff --git a/catch/unit/memory/hipSVMTestFineGrainMemoryConsistency.cpp b/catch/unit/memory/hipSVMTestFineGrainMemoryConsistency.cpp index cd5dd8fa6..655327ba5 100644 --- a/catch/unit/memory/hipSVMTestFineGrainMemoryConsistency.cpp +++ b/catch/unit/memory/hipSVMTestFineGrainMemoryConsistency.cpp @@ -129,6 +129,7 @@ void launch_kernels_and_verify(std::vector &streams, unsigned int n // all the pixels. for(unsigned int d=0; d < num_devices; d++) { + HIP_CHECK(hipSetDevice(d)); build_hash_table_on_device<<<(num_pixels + 255) / 256, 256, 0, streams[d]>>>( pInputImage, num_pixels, pNodes, pNumNodes, numBins, d); HIP_CHECK(hipGetLastError()); diff --git a/catch/unit/memory/hipSVMTestSharedAddressSpaceFineGrain.cpp b/catch/unit/memory/hipSVMTestSharedAddressSpaceFineGrain.cpp index 83dc5b870..ee7944fc9 100644 --- a/catch/unit/memory/hipSVMTestSharedAddressSpaceFineGrain.cpp +++ b/catch/unit/memory/hipSVMTestSharedAddressSpaceFineGrain.cpp @@ -208,6 +208,7 @@ TEST_CASE("test_svm_shared_address_space_fine_grain_buffers") { } else { + HIP_CHECK(hipSetDevice(ci)); create_linked_lists_on_device(streams[ci], pNodes, pAllocator, numLists, ListLength); } @@ -218,6 +219,7 @@ TEST_CASE("test_svm_shared_address_space_fine_grain_buffers") { } else { + HIP_CHECK(hipSetDevice(vi)); verify_linked_lists_on_device(streams[vi], pNodes, pNumCorrect, numLists, ListLength); } diff --git a/catch/unit/memory/hipStreamAttachMemAsync.cc b/catch/unit/memory/hipStreamAttachMemAsync.cc index df9c5895b..f2b0a9a0e 100644 --- a/catch/unit/memory/hipStreamAttachMemAsync.cc +++ 
b/catch/unit/memory/hipStreamAttachMemAsync.cc @@ -87,6 +87,9 @@ TEST_CASE("Unit_hipStreamAttachMemAsync_Positive_AttachGlobal") { HIP_CHECK(hipStreamSynchronize(nullptr)); for (int i = 0; i < stream_count; ++i) { + if (device_count > 1) { + HIP_CHECK(hipSetDevice(i)); + } HipTest::launchKernel(Set, 1, 1, 0, streams.at(i)->stream(), managed_global.ptr() + i, i); } diff --git a/catch/unit/memory/mempool_common.hh b/catch/unit/memory/mempool_common.hh index 50d0a2f56..1a01f567b 100644 --- a/catch/unit/memory/mempool_common.hh +++ b/catch/unit/memory/mempool_common.hh @@ -407,6 +407,7 @@ class streamMemAllocTest { dim3(THREADS_PER_BLOCK), 0, stream, static_cast(A_d), static_cast(B_d), C_d, size); + HIP_CHECK(hipGetLastError()); } // Transfer data from device to host asynchronously. void transferFromMempool(hipStream_t stream) { diff --git a/catch/unit/module/hipExtModuleLaunchKernel.cc b/catch/unit/module/hipExtModuleLaunchKernel.cc index 3fe64b8bb..4faf712d9 100644 --- a/catch/unit/module/hipExtModuleLaunchKernel.cc +++ b/catch/unit/module/hipExtModuleLaunchKernel.cc @@ -389,7 +389,7 @@ void ModuleLaunchKernel::AllocateMemory() { args2.clockRate = clkRate; size1 = sizeof(args1); size2 = sizeof(args2); - size3 = sizeof(args3); + size3 = 0; HIP_CHECK(hipEventCreate(&start_event1)); HIP_CHECK(hipEventCreate(&end_event1)); HIP_CHECK(hipEventCreate(&start_event2)); diff --git a/catch/unit/stream/hipStreamLegacy_Ext.cc b/catch/unit/stream/hipStreamLegacy_Ext.cc index 9b7b1d554..0f7c64327 100644 --- a/catch/unit/stream/hipStreamLegacy_Ext.cc +++ b/catch/unit/stream/hipStreamLegacy_Ext.cc @@ -731,9 +731,8 @@ TEST_CASE("Unit_hipStreamLegacy_TwoThreadsInTwoDevicesEachOneDiffOperation") { HIP_CHECK(hipSetDevice(0)); std::thread dev0Thread(operationsInDev0, devArrDev0, devArrDev1); - std::thread dev1Thread(operationsInDev1, devArrDev1, hostArrDst); - dev0Thread.join(); + std::thread dev1Thread(operationsInDev1, devArrDev1, hostArrDst); dev1Thread.join(); for ( int i = 0; i < N; 
i++ ) { diff --git a/catch/unit/streamperthread/hipStreamPerThread_Basic.cc b/catch/unit/streamperthread/hipStreamPerThread_Basic.cc index 51437b453..0d2ccc961 100644 --- a/catch/unit/streamperthread/hipStreamPerThread_Basic.cc +++ b/catch/unit/streamperthread/hipStreamPerThread_Basic.cc @@ -1,5 +1,5 @@ /* -Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. +Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights @@ -135,4 +135,8 @@ TEST_CASE("Unit_hipStreamPerThread_MemcpyAsync") { for (unsigned int i = 0; i < ele_size; ++i) { REQUIRE(A_h[i] == 123); } -} \ No newline at end of file + + // Clean-up + HIP_CHECK(hipHostFree(A_h)); + HIP_CHECK(hipFree(A_d)); +} diff --git a/catch/unit/streamperthread/hipStreamPerThread_DeviceReset.cc b/catch/unit/streamperthread/hipStreamPerThread_DeviceReset.cc index 9eea33da6..ca11e50d6 100644 --- a/catch/unit/streamperthread/hipStreamPerThread_DeviceReset.cc +++ b/catch/unit/streamperthread/hipStreamPerThread_DeviceReset.cc @@ -1,5 +1,5 @@ /* -Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. +Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights @@ -88,6 +88,9 @@ TEST_CASE("Unit_hipStreamPerThread_DeviceReset_2") { if (status != hipSuccess) return; HIP_CHECK(hipStreamSynchronize(hipStreamPerThread)); + // Host Memory is not destroyed with hipDeviceReset, need to free it + // explicitly to avoid memory leaks + HIP_CHECK(hipHostFree(A_h)); HIP_CHECK(hipDeviceReset()); // After reset all memory objects will be destroyed hence allocating them again diff --git a/catch/unit/streamperthread/hipStreamPerThread_MultiThread.cc b/catch/unit/streamperthread/hipStreamPerThread_MultiThread.cc index ec34c6de7..914cf6acd 100644 --- a/catch/unit/streamperthread/hipStreamPerThread_MultiThread.cc +++ b/catch/unit/streamperthread/hipStreamPerThread_MultiThread.cc @@ -1,5 +1,5 @@ /* -Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. +Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights @@ -34,6 +34,9 @@ static void Copy_to_device() { } HIP_CHECK(hipMemcpyAsync(A_d, A_h, ele_size * sizeof(int), hipMemcpyHostToDevice, hipStreamPerThread)); + // Clean up + HIP_CHECK(hipHostFree(A_h)); + HIP_CHECK(hipFree(A_d)); } /* diff --git a/catch/unit/texture/CMakeLists.txt b/catch/unit/texture/CMakeLists.txt index 69a578542..19ca57e57 100644 --- a/catch/unit/texture/CMakeLists.txt +++ b/catch/unit/texture/CMakeLists.txt @@ -64,6 +64,12 @@ set(TEST_SRC hipTexRefGetFlags.cc hipTexRefSetAddressMode.cc hipTexRefGetAddressMode.cc + hipTexRefSetGetFilterMode.cc + hipTexRefSetGetMipmapFilterMode.cc + hipTexRefSetGetMipmapLevelBias.cc + hipTexRefSetGetMipmapLevelClamp.cc + hipTexRefSetGetMipmappedArray.cc + ) # tests not for gfx90a+ diff --git a/catch/unit/texture/hipTexRefSetGetFilterMode.cc b/catch/unit/texture/hipTexRefSetGetFilterMode.cc new file mode 100644 index 000000000..6203e2d85 --- /dev/null +++ b/catch/unit/texture/hipTexRefSetGetFilterMode.cc @@ -0,0 +1,65 @@ +/* +Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +texture tex; +// Test for hipTexRefSetFilterMode and hipTexRefGetFilterMode, including error handling +TEST_CASE("Unit_hipTexRefSetGetFilterMode") { + CHECK_IMAGE_SUPPORT; + + // Retrieve the texture reference for our symbol + const textureReference* texRefConst = nullptr; + HIP_CHECK(hipGetTextureReference(&texRefConst, &tex)); + REQUIRE(texRefConst != nullptr); + // Implementation expects non-const textureReference* + textureReference* texRef = const_cast(texRefConst); + + hipTextureFilterMode mode; + + SECTION("Default filter mode is Point") { + HIP_CHECK(hipTexRefGetFilterMode(&mode, texRef)); + REQUIRE(mode == hipFilterModePoint); + } + + SECTION("Set filter mode to Linear and verify") { + HIP_CHECK(hipTexRefSetFilterMode(texRef, hipFilterModeLinear)); + HIP_CHECK(hipTexRefGetFilterMode(&mode, texRef)); + REQUIRE(mode == hipFilterModeLinear); + } + + SECTION("Set filter mode back to Point and verify") { + HIP_CHECK(hipTexRefSetFilterMode(texRef, hipFilterModePoint)); + HIP_CHECK(hipTexRefGetFilterMode(&mode, texRef)); + REQUIRE(mode == hipFilterModePoint); + } + + SECTION("Invalid arguments: null texture reference pointer") { + // Setting filter mode with null texRef should fail + hipError_t errSet = hipTexRefSetFilterMode(nullptr, hipFilterModeLinear); + REQUIRE(errSet == hipErrorInvalidValue); + + // Getting filter mode with null texRef should fail + hipError_t errGetRef = hipTexRefGetFilterMode(&mode, nullptr); + REQUIRE(errGetRef == hipErrorInvalidValue); + + // Getting filter
mode with null mode pointer should fail + hipError_t errGetMode = hipTexRefGetFilterMode(nullptr, texRef); + REQUIRE(errGetMode == hipErrorInvalidValue); + } +} diff --git a/catch/unit/texture/hipTexRefSetGetMipmapFilterMode.cc b/catch/unit/texture/hipTexRefSetGetMipmapFilterMode.cc new file mode 100644 index 000000000..73406b563 --- /dev/null +++ b/catch/unit/texture/hipTexRefSetGetMipmapFilterMode.cc @@ -0,0 +1,58 @@ +/* +Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE.
+*/ + +#include +texture tex; +// Test for hipTexRefSetMipmapFilterMode and hipTexRefGetMipmapFilterMode, including error handling +TEST_CASE("Unit_hipTexRefSetGetMipmapFilterMode") { + CHECK_IMAGE_SUPPORT; + + // Retrieve the texture reference for our symbol + const textureReference* texRefConst = nullptr; + HIP_CHECK(hipGetTextureReference(&texRefConst, &tex)); + REQUIRE(texRefConst != nullptr); + // Implementation expects non-const textureReference* + textureReference* texRef = const_cast(texRefConst); + + hipTextureFilterMode mipMode; + + SECTION("Set mipmap filter mode to Linear and verify") { + HIP_CHECK(hipTexRefSetMipmapFilterMode(texRef, hipFilterModeLinear)); + auto res = hipTexRefGetMipmapFilterMode(&mipMode, texRef); + REQUIRE(res == hipErrorInvalidValue); + REQUIRE(mipMode == hipFilterModeLinear); + } + + SECTION("Set mipmap filter mode back to Point and verify") { + HIP_CHECK(hipTexRefSetMipmapFilterMode(texRef, hipFilterModePoint)); + auto res = hipTexRefGetMipmapFilterMode(&mipMode, texRef); + REQUIRE(res == hipErrorInvalidValue); + REQUIRE(mipMode == hipFilterModePoint); + } + + SECTION("Invalid arguments: null pointers") { + hipError_t err; + err = hipTexRefSetMipmapFilterMode(nullptr, hipFilterModeLinear); + REQUIRE(err == hipErrorInvalidValue); + err = hipTexRefGetMipmapFilterMode(&mipMode, nullptr); + REQUIRE(err == hipErrorInvalidValue); + err = hipTexRefGetMipmapFilterMode(nullptr, texRef); + REQUIRE(err == hipErrorInvalidValue); + } +} diff --git a/catch/unit/texture/hipTexRefSetGetMipmapLevelBias.cc b/catch/unit/texture/hipTexRefSetGetMipmapLevelBias.cc new file mode 100644 index 000000000..e2e4298d8 --- /dev/null +++ b/catch/unit/texture/hipTexRefSetGetMipmapLevelBias.cc @@ -0,0 +1,52 @@ +/* +Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. 
+Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE.
+*/ + +#include +texture tex; +// Test for hipTexRefSetMipmapLevelBias and hipTexRefGetMipmapLevelBias, including error handling +TEST_CASE("Unit_hipTexRefSetGetMipmapLevelBias") { + CHECK_IMAGE_SUPPORT; + + // Retrieve the texture reference for our symbol + const textureReference* texRefConst = nullptr; + HIP_CHECK(hipGetTextureReference(&texRefConst, &tex)); + REQUIRE(texRefConst != nullptr); + // Implementation expects non-const textureReference* + textureReference* texRef = const_cast(texRefConst); + + float bias = 0.0; + + SECTION("Set mipmap level bias to custom value and verify") { + float newBias = 2.25; + HIP_CHECK(hipTexRefSetMipmapLevelBias(texRef, newBias)); + auto res = hipTexRefGetMipmapLevelBias(&bias, texRef); + REQUIRE(res == hipErrorInvalidValue); + REQUIRE(bias == newBias); + } + + SECTION("Invalid arguments: null pointers") { + hipError_t err; + err = hipTexRefSetMipmapLevelBias(nullptr, 1.0f); + REQUIRE(err == hipErrorInvalidValue); + err = hipTexRefGetMipmapLevelBias(nullptr, texRef); + REQUIRE(err == hipErrorInvalidValue); + err = hipTexRefGetMipmapLevelBias(&bias, nullptr); + REQUIRE(err == hipErrorInvalidValue); + } +} diff --git a/catch/unit/texture/hipTexRefSetGetMipmapLevelClamp.cc b/catch/unit/texture/hipTexRefSetGetMipmapLevelClamp.cc new file mode 100644 index 000000000..764464a85 --- /dev/null +++ b/catch/unit/texture/hipTexRefSetGetMipmapLevelClamp.cc @@ -0,0 +1,57 @@ +/* +Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. 
+Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE.
+*/ + +#include +#include +texture tex; +// Test for hipTexRefSetMipmapLevelClamp and hipTexRefGetMipmapLevelClamp, including error handling +TEST_CASE("Unit_texRefSetGetMipmapLevelClamp") { + CHECK_IMAGE_SUPPORT; + + // Retrieve the texture reference for our symbol + const textureReference* texRefConst = nullptr; + HIP_CHECK(hipGetTextureReference(&texRefConst, &tex)); + REQUIRE(texRefConst != nullptr); + // Implementation expects non-const textureReference* + textureReference* texRef = const_cast(texRefConst); + + + float minClamp = 0.0f, maxClamp = 0.0f; + + SECTION("Set mipmap level clamp to custom values and verify") { + float newMin = 1.5f, newMax = 5.5f; + HIP_CHECK(hipTexRefSetMipmapLevelClamp(texRef, newMin, newMax)); + auto res = hipTexRefGetMipmapLevelClamp(&minClamp, &maxClamp, texRefConst); + REQUIRE(res == hipErrorInvalidValue); + REQUIRE(minClamp == newMin); + REQUIRE(maxClamp == newMax); + } + + SECTION("Invalid arguments: null pointers") { + hipError_t err; + err = hipTexRefSetMipmapLevelClamp(nullptr, 1.0f, 2.0f); + REQUIRE(err == hipErrorInvalidValue); + err = hipTexRefGetMipmapLevelClamp(nullptr, &maxClamp, texRefConst); + REQUIRE(err == hipErrorInvalidValue); + err = hipTexRefGetMipmapLevelClamp(&minClamp, nullptr, texRefConst); + REQUIRE(err == hipErrorInvalidValue); + err = hipTexRefGetMipmapLevelClamp(&minClamp, &maxClamp, nullptr); + REQUIRE(err == hipErrorInvalidValue); + } +} diff --git a/catch/unit/texture/hipTexRefSetGetMipmappedArray.cc b/catch/unit/texture/hipTexRefSetGetMipmappedArray.cc new file mode 100644 index 000000000..d12bd3159 --- /dev/null +++ b/catch/unit/texture/hipTexRefSetGetMipmappedArray.cc @@ -0,0 +1,76 @@ +/* +Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. 
+Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE.
+*/ +#include +texture tex; + +// Test for hipTexRefSetMipmappedArray and hipTexRefGetMipmappedArray, including error handling +TEST_CASE("Unit_hipTexRefSetGetMipmappedArray") { + CHECK_IMAGE_SUPPORT; + + // Retrieve the texture reference for our symbol + const textureReference* texRefConst = nullptr; + HIP_CHECK(hipGetTextureReference(&texRefConst, &tex)); + REQUIRE(texRefConst != nullptr); + // Implementation expects non-const textureReference* + textureReference* texRef = const_cast(texRefConst); + hipMipmappedArray_t mipArr = nullptr; + hipMipmappedArray_t outArr = nullptr; + unsigned int Flags = 0; + + + SECTION("Default mipmapped array GET returns invalid value when none bound") { + hipError_t err = hipTexRefGetMipMappedArray(&outArr, texRef); + REQUIRE(err == hipErrorInvalidValue); + } + + SECTION("Set and get mipmapped array") { + hipMipmappedArray_t mipmapped_array; + HIP_RESOURCE_DESC res_desc{}; + hipExtent extent; + hipChannelFormatDesc channel_desc; + unsigned int width = 256, height = 256, mipmap_level = 2; + + res_desc.resType = HIP_RESOURCE_TYPE_MIPMAPPED_ARRAY; + + channel_desc = hipCreateChannelDesc(); + extent = make_hipExtent(width, height, 0); + auto res = hipMallocMipmappedArray(&mipmapped_array, &channel_desc, extent, 2 * mipmap_level, + hipArrayDefault); + if (res == hipErrorNotSupported) { + SUCCEED("Mipmapped arrays not supported on this device"); + return; + } + HIP_CHECK(res); + + HIP_CHECK(hipTexRefSetMipmappedArray(texRef, mipmapped_array, Flags)); + HIP_CHECK(hipTexRefGetMipMappedArray(&outArr, texRef)); + REQUIRE(outArr == mipmapped_array); + HIP_CHECK(hipFreeMipmappedArray(mipmapped_array)); + } + + SECTION("Invalid arguments: null pointers") { + hipError_t err; + err = hipTexRefSetMipmappedArray(nullptr, mipArr, Flags); + REQUIRE(err == hipErrorInvalidValue); + err = hipTexRefGetMipMappedArray(&outArr, nullptr); + REQUIRE(err == hipErrorInvalidValue); + err = hipTexRefGetMipMappedArray(nullptr, texRef); + REQUIRE(err == 
hipErrorInvalidValue); + } +} diff --git a/perftests/compute/hipPerfDotProduct.cpp b/perftests/compute/hipPerfDotProduct.cpp deleted file mode 100644 index e30d5ab03..000000000 --- a/perftests/compute/hipPerfDotProduct.cpp +++ /dev/null @@ -1,382 +0,0 @@ -/* - Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. 
- */ - -/* HIT_START - * BUILD: %t %s ../../src/test_common.cpp - * TEST: %t - * HIT_END - */ - -#include -#include -#include "test_common.h" -#include - -#define DOT_DIM 256 - -using namespace std; - -template -__launch_bounds__(BLOCKSIZE) -__global__ void vectors_not_equal(int n, - const double* __restrict__ x, - const double* __restrict__ y, - double* __restrict__ workspace) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - - double sum = 0.0; - for(int idx = gid; idx < n; idx += hipGridDim_x * hipBlockDim_x) { - sum = fma(y[idx], x[idx], sum); - } - - __shared__ double sdata[BLOCKSIZE]; - sdata[threadIdx.x] = sum; - - __syncthreads(); - - if(threadIdx.x < 128) { - sdata[threadIdx.x] += sdata[threadIdx.x + 128]; - } - __syncthreads(); - - if(threadIdx.x < 64){ - sdata[threadIdx.x] += sdata[threadIdx.x + 64]; - } - __syncthreads(); - - if(threadIdx.x < 32){ - sdata[threadIdx.x] += sdata[threadIdx.x + 32]; - } - __syncthreads(); - - if(threadIdx.x < 16) { - sdata[threadIdx.x] += sdata[threadIdx.x + 16]; - } - __syncthreads(); - - if(threadIdx.x < 8) { - sdata[threadIdx.x] += sdata[threadIdx.x + 8]; - } - __syncthreads(); - - if(threadIdx.x < 4) { - sdata[threadIdx.x] += sdata[threadIdx.x + 4]; - } - __syncthreads(); - - if(threadIdx.x < 2) { - sdata[threadIdx.x] += sdata[threadIdx.x + 2]; - } - __syncthreads(); - - if(threadIdx.x < 1) { - sdata[threadIdx.x] += sdata[threadIdx.x + 1]; - } - - if(threadIdx.x == 0) { - workspace[blockIdx.x] = sdata[0]; - } - -} - -template -__launch_bounds__(BLOCKSIZE) -__global__ void vectors_equal(int n, const double* __restrict__ x, - double* __restrict__ workspace) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - - double sum = 0.0; - for(int idx = gid; idx < n; idx += hipGridDim_x * blockDim.x) { - sum = fma(x[idx], x[idx], sum); - } - - __shared__ double sdata[BLOCKSIZE]; - sdata[threadIdx.x] = sum; - - __syncthreads(); - - if(threadIdx.x < 128) { - sdata[threadIdx.x] += sdata[threadIdx.x + 128]; - } - 
__syncthreads(); - - if(threadIdx.x < 64) { - sdata[threadIdx.x] += sdata[threadIdx.x + 64]; - } - __syncthreads(); - - if(threadIdx.x < 32) { - sdata[threadIdx.x] += sdata[threadIdx.x + 32]; - } - __syncthreads(); - - if(threadIdx.x < 16) { - sdata[threadIdx.x] += sdata[threadIdx.x + 16]; - } - __syncthreads(); - - if(threadIdx.x < 8) { - sdata[threadIdx.x] += sdata[threadIdx.x + 8]; - } - __syncthreads(); - - if(threadIdx.x < 4) { - sdata[threadIdx.x] += sdata[threadIdx.x + 4]; - } - __syncthreads(); - - if(threadIdx.x < 2) { - sdata[threadIdx.x] += sdata[threadIdx.x + 2]; - } - __syncthreads(); - - if(threadIdx.x < 1) { - sdata[threadIdx.x] += sdata[threadIdx.x + 1]; - } - - if(threadIdx.x == 0) { - workspace[blockIdx.x] = sdata[0]; - } -} - -template -__launch_bounds__(BLOCKSIZE) -__global__ void dot_reduction(double* __restrict__ workspace) { - - __shared__ double sdata[BLOCKSIZE]; - - sdata[threadIdx.x] = workspace[threadIdx.x]; - - __syncthreads(); - - if(threadIdx.x < 128) { - sdata[threadIdx.x] += sdata[threadIdx.x + 128]; - } - __syncthreads(); - - if(threadIdx.x < 64) { - sdata[threadIdx.x] += sdata[threadIdx.x + 64]; - } - __syncthreads(); - - if(threadIdx.x < 32) { - sdata[threadIdx.x] += sdata[threadIdx.x + 32]; - } - __syncthreads(); - - if(threadIdx.x < 16) { - sdata[threadIdx.x] += sdata[threadIdx.x + 16]; - } - __syncthreads(); - - if(threadIdx.x < 8) { - sdata[threadIdx.x] += sdata[threadIdx.x + 8]; - } - __syncthreads(); - - if(threadIdx.x < 4) { - sdata[threadIdx.x] += sdata[threadIdx.x + 4]; - } __syncthreads(); - - if(threadIdx.x < 2) { - sdata[threadIdx.x] += sdata[threadIdx.x + 2]; - } - __syncthreads(); - - if(threadIdx.x < 1) { - sdata[threadIdx.x] += sdata[threadIdx.x + 1]; - } - - if(threadIdx.x == 0) { - workspace[0] = sdata[0]; - } - -} - -void computeDotProduct(int n, const double* x, const double* y, double& result, - double* workspace) -{ - dim3 blocks(DOT_DIM); - dim3 threadsPerBlock(DOT_DIM); - - if(x != y) { - 
hipLaunchKernelGGL(vectors_not_equal, blocks, threadsPerBlock, 0, 0, n, x, y, - workspace); - } - else { - hipLaunchKernelGGL(vectors_equal, blocks, threadsPerBlock, 0, 0, n, x, workspace); - } - - // Part 2 of dot product computation - hipLaunchKernelGGL(dot_reduction, dim3(1), threadsPerBlock, 0, 0, workspace); - - // Copy the final dot product result back from the device - HIPCHECK(hipMemcpy(&result, workspace, sizeof(double), hipMemcpyDeviceToHost)); - - return; -} - -int main(int argc, char* argv[]) { - - int nGpu = 0; - HIPCHECK(hipGetDeviceCount(&nGpu)); - - if (nGpu < 1) { - failed("No GPU!"); - } - hipDeviceProp_t props = {0}; - props = {0}; - HIPCHECK(hipSetDevice(p_gpuDevice)); - HIPCHECK(hipGetDeviceProperties(&props, p_gpuDevice)); - std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name - << " with " << props.multiProcessorCount << " CUs" << " and device id: " << p_gpuDevice - << std::endl; - - int nx, ny, nz; - - for (unsigned int testCase = 0; testCase < 3; testCase++) { - - vector vectorSize = {200, 300, 50}; - switch(testCase) { - - case 0: - nx = vectorSize[0]; - ny = vectorSize[0]; - nz = vectorSize[0]; - break; - - case 1: - nx = vectorSize[1]; - ny = vectorSize[1]; - nz = vectorSize[1]; - break; - - case 2: - nx = vectorSize[0]; - ny = vectorSize[1]; - nz = vectorSize[2]; - break; - - default: - break; - - } - - int trials = 200; - - int size = nx * ny * nz; - - vector hx(size); - vector hy(size); - double hresult_xy = 0.0; - double hresult_xx = 0.0; - - srand(time(NULL)); - - for(int i = 0; i < size; ++i) { - hx[i] = 2.0 * (double)rand() / (double)RAND_MAX - 1.0; - hy[i] = 2.0 * (double)rand() / (double)RAND_MAX - 1.0; - - hresult_xy += hx[i] * hy[i]; - hresult_xx += hx[i] * hx[i]; - } - - double* dx; - double* dy; - double* workspace; - double dresult; - - HIPCHECK(hipMalloc((void**)&dx, sizeof(double) * size)); - HIPCHECK(hipMalloc((void**)&dy, sizeof(double) * size)); - HIPCHECK(hipMalloc((void**)&workspace, 
sizeof(double) * DOT_DIM)); - - HIPCHECK(hipMemcpy(dx, hx.data(), sizeof(double) * size, hipMemcpyHostToDevice)); - HIPCHECK(hipMemcpy(dy, hy.data(), sizeof(double) * size, hipMemcpyHostToDevice)); - - // Warm up - computeDotProduct(size, dx, dy, dresult, workspace); - computeDotProduct(size, dx, dy, dresult, workspace); - computeDotProduct(size, dx, dy, dresult, workspace); - - // Timed run for - HIPCHECK(hipDeviceSynchronize()); - auto all_start = std::chrono::steady_clock::now(); - - for(int i = 0; i < trials; ++i) { - computeDotProduct(size, dx, dy, dresult, workspace); - } - - float time = 0; - auto all_end = std::chrono::steady_clock::now(); - std::chrono::duration all_kernel_time = all_end - all_start; - time = all_kernel_time.count(); - - time /= trials; - - double bw = sizeof(double) * size * 2.0 / 1e9; - double gf = 2.0 * size / 1e9; - - cout << "\nVector Size: " << size << "\n[ddot] " << time << "msec ;" << bw/ (time / 1e3) << " GByte/s ;" - << gf/(time / 1e3) << " GFlop/s" << endl; - - // Verify the device kernel results comparing it with the host results - if(std::abs(dresult - hresult_xy) > std::max(dresult * 1e-10, 1e-8)) { - cerr << " Device results inconsistent with host results. 
" - << " Host result: " << hresult_xy - << " Device result: " << dresult; - } - - // Warm up - computeDotProduct(size, dx, dx, dresult, workspace); - computeDotProduct(size, dx, dx, dresult, workspace); - computeDotProduct(size, dx, dx, dresult, workspace); - - // Timed run for - HIPCHECK(hipDeviceSynchronize()); - all_start = std::chrono::steady_clock::now(); - - for(int i = 0; i < trials; ++i) { - computeDotProduct(size, dx, dx, dresult, workspace); - } - - all_end = std::chrono::steady_clock::now(); - all_kernel_time = all_end - all_start; - time = all_kernel_time.count(); - - time /= trials; - bw = sizeof(double) * size / 1e9; - - cout << "[ddot] " << time << "msec ;" << bw/ (time / 1e3) << " GByte/s ;" - << gf/(time / 1e3) << " GFlop/s" << endl; - - // Verify the device kernel results comparing it with the host results - if(abs(dresult - hresult_xx) > max(dresult * 1e-10, 1e-8)) { - cerr << " Device results inconsistent with host results" - << " Host result: " << hresult_xy - << " Device result: " << dresult; - } - - HIPCHECK(hipFree(dx)); - HIPCHECK(hipFree(dy)); - HIPCHECK(hipFree(workspace)); - - } - passed(); - return 0; -} diff --git a/perftests/compute/hipPerfMandelbrot.cpp b/perftests/compute/hipPerfMandelbrot.cpp deleted file mode 100644 index 9f9d6b404..000000000 --- a/perftests/compute/hipPerfMandelbrot.cpp +++ /dev/null @@ -1,743 +0,0 @@ -/* - Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. 
- Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. 
- */ - -/* HIT_START - * BUILD: %t %s ../../src/test_common.cpp - * TEST: %t - * HIT_END - */ - -#include -#include -#include "test_common.h" -#include -#include -#include -#include -#include - -typedef struct { - double x; - double y; - double width; -} coordRec; - -coordRec coords[] = { - {0.0, 0.0, 4.0}, // Whole set - {0.0, 0.0, 0.00001}, // All black - {-0.0180789661868, 0.6424294066162, 0.00003824140}, // Hit detail -}; - -static unsigned int numCoords = sizeof(coords) / sizeof(coordRec); - -template -__global__ void float_mad_kernel(uint *out, uint width, T xPos, T yPos, T xStep, T yStep, - uint maxIter) { - -#pragma FP_CONTRACT ON - int tid = (blockIdx.x * blockDim.x + threadIdx.x); - int i = tid % width; - int j = tid / width; - float x0 = (float)(xPos + xStep*i); - float y0 = (float)(yPos + yStep*j); - - float x = x0; - float y = y0; - - uint iter = 0; - float tmp; - for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) { - tmp = x; - x = fma(-y,y,fma(x,x,x0)); - y = fma(2.0f*tmp,y,y0); - } - - out[tid] = iter; -}; - -template -__global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos, - T yPos, T xStep, T yStep, uint maxIter) { - -#pragma FP_CONTRACT ON - int tid = (blockIdx.x * blockDim.x + threadIdx.x); - int i = tid % width; - int j = tid / width; - float x0 = (float)(xPos + xStep*(float)i); - float y0 = (float)(yPos + yStep*(float)j); - - float x = x0; - float y = y0; - -#define FAST - uint iter = 0; - float tmp; - int stay; - int ccount = 0; - stay = (x*x+y*y) <= 4.0; - float savx = x; - float savy = y; -#ifdef FAST - for (iter = 0; (iter < maxIter); iter+=16) { -#else - for (iter = 0; stay && (iter < maxIter); iter+=16) { -#endif - x = savx; - y = savy; - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); 
- - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - stay = (x*x+y*y) <= 4.0; - savx = (stay ? x : savx); - savy = (stay ? y : savy); - ccount += stay*16; -#ifdef FAST - if (!stay) - break; -#endif - } - // Handle remainder - if (!stay) { - iter = 16; - do { - x = savx; - y = savy; - stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter); - tmp = x; - x = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*tmp,y,y0); - ccount += stay; - iter--; - savx = (stay ? x : savx); - savy = (stay ? 
y : savy); - } while (stay && iter); - } - - - out[tid] = (uint)ccount; - -}; - - -template -__global__ void double_mad_kernel(uint *out, uint width, T xPos, T yPos, T xStep, T yStep, - uint maxIter) { - -#pragma FP_CONTRACT ON - int tid = (blockIdx.x * blockDim.x + threadIdx.x); - int i = tid % width; - int j = tid / width; - double x0 = (double)(xPos + xStep*i); - double y0 = (double)(yPos + yStep*j); - - double x = x0; - double y = y0; - - uint iter = 0; - double tmp; - for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) { - tmp = x; - x = fma(-y,y,fma(x,x,x0)); - y = fma(2.0f*tmp,y,y0); - } - out[tid] = iter; -}; - - -template -__global__ void double_mandel_unroll_kernel(uint *out, uint width, T xPos, - T yPos, T xStep, T yStep, uint maxIter) { - -#pragma FP_CONTRACT ON - int tid = (blockIdx.x * blockDim.x + threadIdx.x); - - int i = tid % width; - int j = tid / width; - double x0 = (double)(xPos + xStep*(double)i); - double y0 = (double)(yPos + yStep*(double)j); - - double x = x0; - double y = y0; - -#define FAST - uint iter = 0; - double tmp; - int stay; - int ccount = 0; - stay = (x*x+y*y) <= 4.0; - double savx = x; - double savy = y; -#ifdef FAST - for (iter = 0; (iter < maxIter); iter+=16) -#else - for (iter = 0; stay && (iter < maxIter); iter+=16) -#endif - { - x = savx; - y = savy; - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = 
fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - stay = (x*x+y*y) <= 4.0; - savx = (stay ? x : savx); - savy = (stay ? y : savy); - ccount += stay*16; -#ifdef FAST - if (!stay) - break; -#endif - } - // Handle remainder - if (!stay) { - iter = 16; - do { - x = savx; - y = savy; - stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter); - tmp = x; - x = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*tmp,y,y0); - ccount += stay; - iter--; - savx = (stay ? x : savx); - savy = (stay ? y : savy); - } - while (stay && iter); - - } - out[tid] = (uint)ccount; -}; - -static const unsigned int FMA_EXPECTEDVALUES_INDEX = 15; - -// Expected results for each kernel run at each coord -unsigned long long expectedIters[] = { - 203277748ull, 2147483648ull, 120254651ull, 203277748ull, 2147483648ull, - 120254651ull, 203277748ull, 2147483648ull, 120254651ull, 203315114ull, - 2147483648ull, 120042599ull, 203315114ull, 2147483648ull, 120042599ull, - 203280620ull, 2147483648ull, 120485704ull, 203280620ull, 2147483648ull, - 120485704ull, 203280620ull, 2147483648ull, 120485704ull, 203315114ull, - 2147483648ull, 120042599ull, 203315114ull, 2147483648ull, 120042599ull}; - -class hipPerfMandelBrot { - public: - hipPerfMandelBrot(); - ~hipPerfMandelBrot(); - - void setNumKernels(unsigned int num) { - numKernels = num; - } - - unsigned int getNumKernels() { - return numKernels; - } - - void setNumStreams(unsigned int num) { - numStreams = num; - } - unsigned int getNumStreams() { - return numStreams; - } - - void open(int deviceID); - void run(unsigned int testCase, unsigned int deviceId); - void printResults(void); - - // array 
of funtion pointers - typedef void (hipPerfMandelBrot::*funPtr)(uint *out, uint width, float xPos, float yPos, - float xStep, float yStep, uint maxIter, hipStream_t* streams, int blocks, - int threads_per_block, int kernelCnt); - - // Wrappers - void float_mad(uint *out, uint width, float xPos, float yPos, - float xStep, float yStep, uint maxIter, hipStream_t* streams, - int blocks, int threads_per_block, int kernelCnt); - - void float_mandel_unroll(uint *out, uint width, float xPos, float yPos, - float xStep, float yStep, uint maxIter, hipStream_t* streams, - int blocks, int threads_per_block, int kernelCnt); - - void double_mad(uint *out, uint width, float xPos, float yPos, float xStep, - float yStep, uint maxIter, hipStream_t* streams, int blocks, - int threads_per_block, int kernelCnt); - - void double_mandel_unroll(uint *out, uint width, float xPos, float yPos, float xStep, - float yStep, uint maxIter, hipStream_t* streams, int blocks, - int threads_per_block, int kernelCnt); - - hipStream_t streams[2]; - - private: - void setData(void *ptr, unsigned int value); - void checkData(uint *ptr); - - unsigned int numKernels; - unsigned int numStreams; - - std::map> results; - unsigned int width_; - unsigned int bufSize; - unsigned int maxIter; - unsigned int coordIdx; - volatile unsigned long long totalIters = 0; - int numCUs; - static const unsigned int numLoops = 10; -}; - - -hipPerfMandelBrot::hipPerfMandelBrot() {} - -hipPerfMandelBrot::~hipPerfMandelBrot() {} - -void hipPerfMandelBrot::open(int deviceId) { - - - int nGpu = 0; - HIPCHECK(hipGetDeviceCount(&nGpu)); - if (nGpu < 1) { - failed("No GPU!"); - } - - - HIPCHECK(hipSetDevice(deviceId)); - hipDeviceProp_t props = {0}; - HIPCHECK(hipGetDeviceProperties(&props, deviceId)); - std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name - << " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId - << std::endl; - - numCUs = props.multiProcessorCount; -} - - 
-void hipPerfMandelBrot::printResults() { - - int numkernels = getNumKernels(); - int numStreams = getNumStreams(); - - std::cout << "\n" <<"Measured perf for kernels in GFLOPS on " - << numStreams << " streams (s)" << std::endl; - - std::map>:: iterator itr; - for (itr = results.begin(); itr != results.end(); itr++) { - std::cout << "\n" << std::setw(20) << itr->first << " "; - for(auto i : results[itr->first]) { - std::cout << std::setw(10) << i << " "; - } - } - results.clear(); - - std::cout << std::endl; -} - - -// Wrappers for the kernel launches -void hipPerfMandelBrot::float_mad(uint *out, uint width, float xPos, float yPos, float xStep, - float yStep, uint maxIter, hipStream_t* streams, - int blocks, int threads_per_block, int kernelCnt) { - - int streamCnt = getNumStreams(); - hipLaunchKernelGGL(float_mad_kernel, dim3(blocks), dim3(threads_per_block), 0, - streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, - maxIter); - - -} - - -void hipPerfMandelBrot::float_mandel_unroll(uint *out, uint width, float xPos, float yPos, - float xStep, float yStep, uint maxIter, hipStream_t * streams, - int blocks, int threads_per_block, int kernelCnt) { - - int streamCnt = getNumStreams(); - hipLaunchKernelGGL(float_mandel_unroll_kernel, dim3(blocks), dim3(threads_per_block), 0, - streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter); - -} - - -void hipPerfMandelBrot::double_mad(uint *out, uint width, float xPos, float yPos, - float xStep, float yStep, uint maxIter, hipStream_t * streams, - int blocks, int threads_per_block, int kernelCnt) { - - int streamCnt = getNumStreams(); - hipLaunchKernelGGL(double_mad_kernel, dim3(blocks), dim3(threads_per_block), 0, - streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter); - -} - - -void hipPerfMandelBrot::double_mandel_unroll(uint *out, uint width, float xPos, float yPos, - float xStep, float yStep, uint maxIter, hipStream_t * streams, - int blocks, int 
threads_per_block, int kernelCnt) { - - int streamCnt = getNumStreams(); - hipLaunchKernelGGL(float_mandel_unroll_kernel, dim3(blocks), dim3(threads_per_block), 0, - streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter); - -} - - -void hipPerfMandelBrot::run(unsigned int testCase,unsigned int deviceId) { - - unsigned int numStreams = getNumStreams(); - coordIdx = testCase % numCoords; - - funPtr p[] = {&hipPerfMandelBrot::float_mad, &hipPerfMandelBrot::float_mandel_unroll, - &hipPerfMandelBrot::double_mad, &hipPerfMandelBrot::double_mandel_unroll}; - - // Maximum iteration count - maxIter = 32768; - - uint * hPtr[numKernels]; - uint * dPtr[numKernels]; - - // Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once. - width_ = 256; - - bufSize = width_ * width_ * sizeof(uint); - - // Create streams for concurrency - for (uint i = 0; i < numStreams; i++) { - HIPCHECK(hipStreamCreate(&streams[i])); - } - - - // Allocate memory on the host and device - for (uint i = 0; i < numKernels; i++) { - HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault)); - setData(hPtr[i], 0xdeadbeef); - HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize)) - } - - - // Prepare kernel launch parameters - int threads = (bufSize/sizeof(uint)); - int threads_per_block = 64; - int blocks = (threads/threads_per_block) + (threads % threads_per_block); - - float xStep = (float)(coords[coordIdx].width / (double)width_); - float yStep = (float)(-coords[coordIdx].width / (double)width_); - float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width); - float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width); - - // Copy memory asynchronously and concurrently from host to device - for (uint i = 0; i < numKernels; i++) { - HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice)); - } - - // Synchronize to make sure all the copies are completed - HIPCHECK(hipStreamSynchronize(0)); - - int kernelIdx; - 
if(testCase == 0 || testCase == 5 || testCase == 10) { - kernelIdx = 0; - } - - else if(testCase == 1 || testCase == 6 || testCase == 11) { - kernelIdx = 1; - } - else if(testCase == 2 || testCase == 7 || testCase == 12) { - kernelIdx = 2; - } - else if(testCase == 3 || testCase == 8 || testCase == 13){ - kernelIdx = 3; - } - - - double totalTime = 0.0; - - for (unsigned int k = 0; k < numLoops; k++) { - if ((testCase == 0 || testCase == 1 || testCase == 2 || - testCase == 5 || testCase == 6 || testCase == 7 || - testCase == 10 || testCase == 11 || testCase == 12)) { - float xStep = (float)(coords[coordIdx].width / (double)width_); - float yStep = (float)(-coords[coordIdx].width / (double)width_); - float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width); - float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width); - - // Time the kernel execution - auto all_start = std::chrono::steady_clock::now(); - - for (uint i = 0; i < numKernels; i++) { - (this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks, - threads_per_block, i); - } - - - // Synchronize all the concurrent streams to have completed execution - HIPCHECK(hipStreamSynchronize(0)); - - auto all_end = std::chrono::steady_clock::now(); - std::chrono::duration all_kernel_time = all_end - all_start; - totalTime += all_kernel_time.count(); - - } - - - else { - double xStep = coords[coordIdx].width / (double)width_; - double yStep = -coords[coordIdx].width / (double)width_; - double xPos = coords[coordIdx].x - 0.5 * coords[coordIdx].width; - double yPos = coords[coordIdx].y + 0.5 * coords[coordIdx].width; - - // Time the kernel execution - auto all_start = std::chrono::steady_clock::now(); - - for (uint i = 0; i < numKernels; i++) { - (this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks, - threads_per_block, i); - } - - - // Synchronize all the concurrent streams to have completed execution - 
HIPCHECK(hipStreamSynchronize(0)); - - auto all_end = std::chrono::steady_clock::now(); - std::chrono::duration all_kernel_time = all_end - all_start; - totalTime += all_kernel_time.count(); - } - - - } - - // Copy data back from device to the host - for(uint i = 0; i < numKernels; i++) { - HIPCHECK(hipMemcpy(hPtr[i] ,dPtr[i], bufSize, hipMemcpyDeviceToHost)); - } - - - for(uint i = 0; i < numKernels; i++) { - checkData(hPtr[i]); - - int j =0; - while((totalIters != expectedIters[j] && totalIters > expectedIters[j]) && j < 30) { - j++; - } - - if(j==30) { - std::cout << "Incorrect iteration count detected. "; - } - - } - - - // Compute GFLOPS. There are 7 FLOPs per iteration - double perf = ((double)(totalIters*numKernels) * 7 * (double)(1e-09)) / - (totalTime / (double)numLoops); - - - std::vector kernelName = {"float", "float_unroll", - "double", "double_unroll"}; - - // Print results except for Warm-up kernel - if(testCase!=100) { - results[kernelName[testCase % 4]].push_back(perf); - } - - - for(uint i = 0 ; i < numStreams; i++) { - HIPCHECK(hipStreamDestroy(streams[i])); - } - - - // Free host and device memory - for (uint i = 0; i < numKernels; i++) { - HIPCHECK(hipHostFree(hPtr[i])); - HIPCHECK(hipFree(dPtr[i])); - } - - -} - - -void hipPerfMandelBrot::setData(void *ptr, unsigned int value) { - unsigned int *ptr2 = (unsigned int *)ptr; - for (unsigned int i = 0; i < width_ * width_; i++) { - ptr2[i] = value; - } -} - - -void hipPerfMandelBrot::checkData(uint *ptr) { - totalIters = 0; - for (unsigned int i = 0; i < width_ * width_; i++) { - totalIters += ptr[i]; - } -} - - -int main(int argc, char* argv[]) { - hipPerfMandelBrot mandelbrotCompute; - int deviceId = 0; - - mandelbrotCompute.open(deviceId); - - for (unsigned int testCase = 0; testCase < 3; testCase++) { - - - switch (testCase) { - - - case 0: { - // Warmup-kernel - default stream executes serially - mandelbrotCompute.setNumStreams(1); - mandelbrotCompute.setNumKernels(1); - 
mandelbrotCompute.run(100/*Random number*/, deviceId); - break; - } - - - case 1: { - // run all - sync - int i = 0; - do { - mandelbrotCompute.setNumStreams(1); - mandelbrotCompute.setNumKernels(1); - mandelbrotCompute.run(i, deviceId); - i++; - }while(i < 12); - mandelbrotCompute.printResults(); - - break; - } - - - case 2: { - // run all - async - int i = 0; - do { - mandelbrotCompute.setNumStreams(2); - mandelbrotCompute.setNumKernels(2); - mandelbrotCompute.run(i, deviceId); - i++; - }while(i < 12); - mandelbrotCompute.printResults(); - - break; - - } - - - default: { - break; - } - - - } - - - - } - - - passed(); -} diff --git a/perftests/dispatch/hipPerfDispatchSpeed.cpp b/perftests/dispatch/hipPerfDispatchSpeed.cpp deleted file mode 100644 index 56a757a54..000000000 --- a/perftests/dispatch/hipPerfDispatchSpeed.cpp +++ /dev/null @@ -1,207 +0,0 @@ -/* - Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. 
- */ - -/* HIT_START - * BUILD: %t %s ../../src/test_common.cpp ../../src/timer.cpp - * TEST: %t - * HIT_END - */ - -#include -#include -#include -#include - -#include "timer.h" -#include "test_common.h" - -// Quiet pesky warnings -#ifdef WIN_OS -#define SNPRINTF sprintf_s -#else -#define SNPRINTF snprintf -#endif - -#define CHAR_BUF_SIZE 512 - -#define CHECK_RESULT(test, msg) \ - if ((test)) \ - { \ - printf("\n%s\n", msg); \ - abort(); \ - } - -typedef struct { - unsigned int iterations; - int flushEvery; -} testStruct; - -testStruct testList[] = -{ - { 1, -1}, - { 1, -1}, - { 10, 1}, - { 10, -1}, - { 100, 1}, - { 100, 10}, - { 100, -1}, - { 1000, 1}, - { 1000, 10}, - { 1000, 100}, - { 1000, -1}, - { 10000, 1}, - { 10000, 10}, - { 10000, 100}, - { 10000, 1000}, - { 10000, -1}, - { 100000, 1}, - { 100000, 10}, - { 100000, 100}, - { 100000, 1000}, - { 100000, 10000}, - { 100000, -1}, -}; - -unsigned int mapTestList[] = {1, 1, 10, 100, 1000, 10000, 100000}; - -__global__ void _dispatchSpeed(float *outBuf) -{ - int i = (blockIdx.x * blockDim.x + threadIdx.x); - if (i < 0) - outBuf[i] = 0.0f; -}; - - -int main(int argc, char* argv[]) { - HipTest::parseStandardArguments(argc, argv, true); - - hipError_t err = hipSuccess; - hipDeviceProp_t props = {0}; - hipGetDeviceProperties(&props, p_gpuDevice); - CHECK_RESULT(err != hipSuccess, "hipGetDeviceProperties failed" ); - printf("Set device to %d : %s\n", p_gpuDevice, props.name); - - unsigned int testListSize = sizeof(testList) / sizeof(testStruct); - int numTests = (p_tests == -1) ? (2*2*testListSize - 1) : p_tests; - int test = (p_tests == -1) ? 
0 : p_tests; - - float* srcBuffer = NULL; - unsigned int bufSize_ = 64*sizeof(float); - err = hipMalloc(&srcBuffer, bufSize_); - CHECK_RESULT(err != hipSuccess, "hipMalloc failed"); - - for(;test <= numTests; test++) - { - int openTest = test % testListSize; - bool sleep = false; - - if (test >= (testListSize * 2)) - { - sleep = true; - } - - int threads = (bufSize_ / sizeof(float)); - int threads_per_block = 64; - int blocks = (threads/threads_per_block) + (threads % threads_per_block); - - // warmup - hipLaunchKernelGGL(_dispatchSpeed, dim3(blocks), dim3(threads_per_block), - 0, hipStream_t(0), srcBuffer); - err = hipDeviceSynchronize(); - CHECK_RESULT(err != hipSuccess, "hipDeviceSynchronize failed"); - - CPerfCounter timer; - - timer.Reset(); - timer.Start(); - for (unsigned int i = 0; i < testList[openTest].iterations; i++) - { - hipLaunchKernelGGL(_dispatchSpeed, dim3(blocks), dim3(threads_per_block), - 0, hipStream_t(0), srcBuffer); - - if ((testList[openTest].flushEvery > 0) && - (((i + 1) % testList[openTest].flushEvery) == 0)) - { - if (sleep) - { - err = hipDeviceSynchronize(); - CHECK_RESULT(err != hipSuccess, "hipDeviceSynchronize failed"); - } - else - { - do { - err = hipStreamQuery(NULL); - } while (err == hipErrorNotReady); - } - } - } - if (sleep) - { - err = hipDeviceSynchronize(); - CHECK_RESULT(err != hipSuccess, "hipDeviceSynchronize failed"); - } - else - { - do { - err = hipStreamQuery(NULL); - } while (err == hipErrorNotReady); - } - timer.Stop(); - - double sec = timer.GetElapsedTime(); - - // microseconds per launch - double perf = (1000000.f*sec/testList[openTest].iterations); - const char *waitType; - const char *extraChar; - const char *n; - if (sleep) - { - waitType = "sleep"; - extraChar = ""; - n = ""; - } - else - { - waitType = "spin"; - n = "n"; - extraChar = " "; - } - - - char buf[256]; - if (testList[openTest].flushEvery > 0) - { - SNPRINTF(buf, sizeof(buf), - "HIPPerfDispatchSpeed[%3d] %7d dispatches %s%sing every %5d 
(us/disp) %3f", - test, testList[openTest].iterations, - waitType, n, testList[openTest].flushEvery, (float)perf); - } - else - { - SNPRINTF(buf, sizeof(buf), - "HIPPerfDispatchSpeed[%3d] %7d dispatches (%s%s) (us/disp) %3f", - test, testList[openTest].iterations, waitType, extraChar, (float)perf); - } - printf("%s\n", buf); - } - - hipFree(srcBuffer); - passed(); -} diff --git a/perftests/memory/hipPerfBufferCopyRectSpeed.cpp b/perftests/memory/hipPerfBufferCopyRectSpeed.cpp deleted file mode 100644 index 78096844f..000000000 --- a/perftests/memory/hipPerfBufferCopyRectSpeed.cpp +++ /dev/null @@ -1,300 +0,0 @@ -/* -Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
-*/ - -/* HIT_START - * BUILD: %t %s ../../src/test_common.cpp ../../src/timer.cpp - * TEST: %t - * HIT_END - */ - -#include -#include -#include -#include - -#include "timer.h" -#include "test_common.h" - -// Quiet pesky warnings -#ifdef WIN_OS -#define SNPRINTF sprintf_s -#else -#define SNPRINTF snprintf -#endif - -#define NUM_SIZES 8 -//4KB, 8KB, 64KB, 256KB, 1 MB, 4MB, 16 MB, 16MB+10 -static const unsigned int Sizes[NUM_SIZES] = {4096, 8192, 65536, 262144, 1048576, 4194304, 16777216, 16777216+10}; - -static const unsigned int Iterations[2] = {1, 1000}; - -#define BUF_TYPES 4 -// 16 ways to combine 4 different buffer types -#define NUM_SUBTESTS (BUF_TYPES*BUF_TYPES) - -#define CHECK_RESULT(test, msg) \ - if ((test)) \ - { \ - printf("\n%s\n", msg); \ - abort(); \ - } - -void setData(void *ptr, unsigned int size, char value) -{ - char *ptr2 = (char *)ptr; - for (unsigned int i = 0; i < size ; i++) - { - ptr2[i] = value; - } -} - -void checkData(void *ptr, unsigned int size, char value) -{ - char *ptr2 = (char *)ptr; - for (unsigned int i = 0; i < size; i++) - { - if (ptr2[i] != value) - { - printf("Data validation failed at %d! 
Got 0x%08x\n", i, ptr2[i]); - printf("Expected 0x%08x\n", value); - CHECK_RESULT(true, "Data validation failed!"); - break; - } - } -} - - -int main(int argc, char* argv[]) { - HipTest::parseStandardArguments(argc, argv, true); - - hipError_t err = hipSuccess; - hipDeviceProp_t props = {0}; - hipGetDeviceProperties(&props, p_gpuDevice); - CHECK_RESULT(err != hipSuccess, "hipGetDeviceProperties failed" ); - printf("Set device to %d : %s\n", p_gpuDevice, props.name); - printf("Legend: unp - unpinned(malloc), hM - hipMalloc(device)\n"); - printf(" hHR - hipHostRegister(pinned), hHM - hipHostMalloc(prePinned)\n"); - err = hipSetDevice(p_gpuDevice); - CHECK_RESULT(err != hipSuccess, "hipSetDevice failed" ); - - unsigned int bufSize_; - bool hostMalloc[2] = {false}; - bool hostRegister[2] = {false}; - bool unpinnedMalloc[2] = {false}; - unsigned int numIter; - void *memptr[2] = {NULL}; - void *alignedmemptr[2] = {NULL}; - void* srcBuffer = NULL; - void* dstBuffer = NULL; - - int numTests = (p_tests == -1) ? (NUM_SIZES*NUM_SUBTESTS*2 - 1) : p_tests; - int test = (p_tests == -1) ? 
0 : p_tests; - - for(;test <= numTests; test++) - { - unsigned int srcTest = (test / NUM_SIZES) % BUF_TYPES; - unsigned int dstTest = (test / (NUM_SIZES*BUF_TYPES)) % BUF_TYPES; - bufSize_ = Sizes[test % NUM_SIZES]; - hostMalloc[0] = hostMalloc[1] = false; - hostRegister[0] = hostRegister[1] = false; - unpinnedMalloc[0] = unpinnedMalloc[1] = false; - srcBuffer = dstBuffer = 0; - memptr[0] = memptr[1] = NULL; - alignedmemptr[0] = alignedmemptr[1] = NULL; - - size_t width = static_cast(sqrt(static_cast(bufSize_))); - - if (srcTest == 3) - { - hostRegister[0] = true; - } - else if (srcTest == 2) - { - hostMalloc[0] = true; - } - else if (srcTest == 1) - { - unpinnedMalloc[0] = true; - } - - if (dstTest == 1) - { - unpinnedMalloc[1] = true; - } - else if (dstTest == 2) - { - hostMalloc[1] = true; - } - else if (dstTest == 3) - { - hostRegister[1] = true; - } - - numIter = Iterations[test / (NUM_SIZES * NUM_SUBTESTS)]; - - if (hostMalloc[0]) - { - err = hipHostMalloc((void**)&srcBuffer, bufSize_, 0); - setData(srcBuffer, bufSize_, 0xd0); - CHECK_RESULT(err != hipSuccess, "hipHostMalloc failed"); - } - else if (hostRegister[0]) - { - memptr[0] = malloc(bufSize_ + 4096); - alignedmemptr[0] = (void*)(((size_t)memptr[0] + 4095) & ~4095); - srcBuffer = alignedmemptr[0]; - setData(srcBuffer, bufSize_, 0xd0); - err = hipHostRegister(srcBuffer, bufSize_, 0); - CHECK_RESULT(err != hipSuccess, "hipHostRegister failed"); - } - else if (unpinnedMalloc[0]) - { - memptr[0] = malloc(bufSize_ + 4096); - alignedmemptr[0] = (void*)(((size_t)memptr[0] + 4095) & ~4095); - srcBuffer = alignedmemptr[0]; - setData(srcBuffer, bufSize_, 0xd0); - } - else - { - err = hipMalloc(&srcBuffer, bufSize_); - CHECK_RESULT(err != hipSuccess, "hipMalloc failed"); - err = hipMemset(srcBuffer, 0xd0, bufSize_); - CHECK_RESULT(err != hipSuccess, "hipMemset failed"); - } - - if (hostMalloc[1]) - { - err = hipHostMalloc((void**)&dstBuffer, bufSize_, 0); - CHECK_RESULT(err != hipSuccess, "hipHostMalloc failed"); 
- } - else if (hostRegister[1]) - { - memptr[1] = malloc(bufSize_ + 4096); - alignedmemptr[1] = (void*)(((size_t)memptr[1] + 4095) & ~4095); - dstBuffer = alignedmemptr[1]; - err = hipHostRegister(dstBuffer, bufSize_, 0); - CHECK_RESULT(err != hipSuccess, "hipHostRegister failed"); - } - else if (unpinnedMalloc[1]) - { - memptr[1] = malloc(bufSize_ + 4096); - alignedmemptr[1] = (void*)(((size_t)memptr[1] + 4095) & ~4095); - dstBuffer = alignedmemptr[1]; - } - else - { - err = hipMalloc(&dstBuffer, bufSize_); - CHECK_RESULT(err != hipSuccess, "hipMalloc failed"); - } - - CPerfCounter timer; - - //warm up - err = hipMemcpy2D(dstBuffer, width, srcBuffer, width, width, width, hipMemcpyDefault); - CHECK_RESULT(err, "hipMemcpy2D failed"); - - timer.Reset(); - timer.Start(); - for (unsigned int i = 0; i < numIter; i++) - { - err = hipMemcpy2DAsync(dstBuffer, width, srcBuffer, width, width, width, hipMemcpyDefault, NULL); - CHECK_RESULT(err, "hipMemcpyAsync2D failed"); - } - err = hipDeviceSynchronize(); - CHECK_RESULT(err, "hipDeviceSynchronize failed"); - timer.Stop(); - double sec = timer.GetElapsedTime(); - - // Buffer copy bandwidth in GB/s - double perf = ((double)bufSize_*numIter*(double)(1e-09)) / sec; - - const char *strSrc = NULL; - const char *strDst = NULL; - if (hostMalloc[0]) - strSrc = "hHM"; - else if (hostRegister[0]) - strSrc = "hHR"; - else if (unpinnedMalloc[0]) - strSrc = "unp"; - else - strSrc = "hM"; - - if (hostMalloc[1]) - strDst = "hHM"; - else if (hostRegister[1]) - strDst = "hHR"; - else if (unpinnedMalloc[1]) - strDst = "unp"; - else - strDst = "hM"; - // Double results when src and dst are both on device - if ((!hostMalloc[0] && !hostRegister[0] && !unpinnedMalloc[0]) && - (!hostMalloc[1] && !hostRegister[1] && !unpinnedMalloc[1])) - perf *= 2.0; - // Double results when src and dst are both in sysmem - if ((hostMalloc[0] || hostRegister[0] || unpinnedMalloc[0]) && - (hostMalloc[1] || hostRegister[1] || unpinnedMalloc[1])) - perf *= 2.0; - - 
char buf[256]; - SNPRINTF(buf, sizeof(buf), "HIPPerfBufferCopyRectSpeed[%d]\t(%8d bytes)\ts:%s d:%s\ti:%4d\t(GB/s) perf\t%f", - test, bufSize_, strSrc, strDst, numIter, (float)perf); - printf("%s\n", buf); - - //Free src - if (hostMalloc[0]) - { - hipHostFree(srcBuffer); - } - else if (hostRegister[0]) - { - hipHostUnregister(srcBuffer); - free(memptr[0]); - } - else if (unpinnedMalloc[0]) - { - free(memptr[0]); - } - else - { - hipFree(srcBuffer); - } - - //Free dst - if (hostMalloc[1]) - { - hipHostFree(dstBuffer); - } - else if (hostRegister[1]) - { - hipHostUnregister(dstBuffer); - free(memptr[1]); - } - else if (unpinnedMalloc[1]) - { - free(memptr[1]); - } - else - { - hipFree(dstBuffer); - } - } - - passed(); -} diff --git a/perftests/memory/hipPerfDevMemReadSpeed.cpp b/perftests/memory/hipPerfDevMemReadSpeed.cpp deleted file mode 100644 index 6548da94c..000000000 --- a/perftests/memory/hipPerfDevMemReadSpeed.cpp +++ /dev/null @@ -1,136 +0,0 @@ -/* -Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -/* HIT_START - * BUILD: %t %s ../../src/test_common.cpp - * TEST: %t - * HIT_END - */ - -#include -#include -#include "test_common.h" - -using namespace std; - -#define arraySize 16 - -typedef struct d_uint16 { - uint data[arraySize]; -} d_uint16; - -__global__ void read_kernel(d_uint16 *src, ulong N, uint *dst) { - - size_t idx = (blockIdx.x * blockDim.x + threadIdx.x); - size_t stride = blockDim.x * gridDim.x ; - - uint tmp = 0; - for (size_t i = idx; i < N; i += stride) { - for (size_t j = 0; j < arraySize; j++) { - tmp += src[i].data[j]; - } - } - - atomicAdd(dst, tmp); -} - -int main(int argc, char* argv[]) { - d_uint16 *dSrc; - d_uint16 *hSrc; - uint *dDst; - uint *hDst; - hipStream_t stream; - ulong N = 4 * 1024 * 1024; - uint nBytes = N * sizeof(d_uint16); - - int nGpu = 0; - HIPCHECK(hipGetDeviceCount(&nGpu)); - if (nGpu < 1) { - cout << "info: didn't find any GPU! skipping the test!\n"; - passed(); - return 0; - } - - static int device = 0; - HIPCHECK(hipSetDevice(device)); - hipDeviceProp_t props; - HIPCHECK(hipGetDeviceProperties(&props, device)); - cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name << - " with " << props.multiProcessorCount << " CUs" << endl; - - const unsigned threadsPerBlock = 64; - const unsigned blocks = props.multiProcessorCount * 4; - - uint inputData = 0x1; - int nIter = 1000; - - hSrc = new d_uint16[nBytes]; - HIPCHECK(hSrc == 0 ? hipErrorOutOfMemory : hipSuccess); - hDst = new uint; - hDst[0] = 0; - HIPCHECK(hDst == 0 ? 
hipErrorOutOfMemory : hipSuccess); - for (size_t i = 0; i < N; i++) { - for (int j = 0; j < arraySize; j++) { - hSrc[i].data[j] = inputData; - } - } - - HIPCHECK(hipMalloc(&dSrc, nBytes)); - HIPCHECK(hipMalloc(&dDst, sizeof(uint))); - - HIPCHECK(hipStreamCreate(&stream)); - - HIPCHECK(hipMemcpy(dSrc, hSrc, nBytes, hipMemcpyHostToDevice)); - HIPCHECK(hipMemcpy(dDst, hDst, sizeof(uint), hipMemcpyHostToDevice)); - - hipLaunchKernelGGL(read_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dSrc, N, dDst); - HIPCHECK(hipMemcpy(hDst, dDst, sizeof(uint), hipMemcpyDeviceToHost)); - hipDeviceSynchronize(); - - if (hDst[0] != (nBytes / sizeof(uint))) { - cout << "info: Data validation failed for warm up run!" << endl; - cout << "info: expected " << nBytes / sizeof(uint) << " got " << hDst[0] << endl; - HIPCHECK(hipErrorUnknown); - } - - // measure performance based on host time - auto all_start = chrono::steady_clock::now(); - - for(int i = 0; i < nIter; i++) { - hipLaunchKernelGGL(read_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dSrc, N, dDst); - } - hipDeviceSynchronize(); - - auto all_end = chrono::steady_clock::now(); - chrono::duration all_kernel_time = all_end - all_start; - - // read speed in GB/s - double perf = ((double)nBytes * nIter * (double)(1e-09)) / all_kernel_time.count(); - - cout << "info: average read speed of " << perf << " GB/s " << "achieved for memory size of " << - nBytes / (1024 * 1024) << " MB" << endl; - - delete [] hSrc; - delete hDst; - hipFree(dSrc); - hipFree(dDst); - HIPCHECK(hipStreamDestroy(stream)); - - passed(); -} diff --git a/perftests/memory/hipPerfDevMemWriteSpeed.cpp b/perftests/memory/hipPerfDevMemWriteSpeed.cpp deleted file mode 100644 index cc4883660..000000000 --- a/perftests/memory/hipPerfDevMemWriteSpeed.cpp +++ /dev/null @@ -1,126 +0,0 @@ -/* -Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. 
-Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -/* HIT_START - * BUILD: %t %s ../../src/test_common.cpp - * TEST: %t - * HIT_END - */ - -#include -#include -#include "test_common.h" - -using namespace std; - -#define arraySize 16 - -typedef struct d_uint16 { - uint data[arraySize]; -} d_uint16; - -__global__ void write_kernel(d_uint16 *dst, ulong N, d_uint16 pval) { - size_t idx = (blockIdx.x * blockDim.x + threadIdx.x); - size_t stride = blockDim.x * gridDim.x; - for (size_t i = idx; i < N; i += stride) { - dst[i] = pval; - } -}; - -int main(int argc, char* argv[]) { - d_uint16 *dDst; - d_uint16 *hDst; - hipStream_t stream; - ulong N = 4 * 1024 * 1024; - uint nBytes = N * sizeof(d_uint16); - d_uint16 pval; - - for (int i = 0; i < arraySize; i++) { - pval.data[i] = 0xabababab; - } - - int nGpu = 0; - HIPCHECK(hipGetDeviceCount(&nGpu)); - if (nGpu < 1) { - cout << "info: didn't find any GPU! 
skipping the test!\n"; - passed(); - return 0; - } - - static int device = 0; - HIPCHECK(hipSetDevice(device)); - hipDeviceProp_t props; - HIPCHECK(hipGetDeviceProperties(&props, device)); - cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name << - " with " << props.multiProcessorCount << " CUs" << endl; - - size_t threadsPerBlock = 64; - size_t blocks = props.multiProcessorCount * 4; - - uint inputData = 0xabababab; - int nIter = 1000; - - hDst = new d_uint16[nBytes]; - HIPCHECK(hDst == 0 ? hipErrorOutOfMemory : hipSuccess); - for (size_t i = 0; i < N; i++) { - for (size_t j = 0; j < arraySize; j++) { - hDst[i].data[j] = 0; - } - } - - HIPCHECK(hipMalloc(&dDst, nBytes)); - - HIPCHECK(hipStreamCreate(&stream)); - - hipLaunchKernelGGL(write_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dDst, N, pval); - HIPCHECK(hipMemcpy(hDst, dDst, nBytes , hipMemcpyDeviceToHost)); - hipDeviceSynchronize(); - - for (uint i = 0; i < N; i++) { - for (uint j = 0; j < arraySize; j++) { - if (hDst[i].data[j] != inputData) { - cout << "info: Data validation failed for warm up run! 
" << endl; - cout << "at index i: " << i << " element j: " << j << endl; - cout << hex << "expected 0x" << inputData << " but got 0x" << hDst[i].data[j] << endl; - HIPCHECK(hipErrorUnknown); - } - } - } - - auto all_start = chrono::steady_clock::now(); - for(int i = 0; i < nIter; i++) { - hipLaunchKernelGGL(write_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dDst, N, pval); - } - hipDeviceSynchronize(); - auto all_end = chrono::steady_clock::now(); - chrono::duration all_kernel_time = all_end - all_start; - - // read speed in GB/s - double perf = ((double)nBytes * nIter * (double)(1e-09)) / all_kernel_time.count(); - - cout << "info: average write speed of " << perf << " GB/s " << "achieved for memory size of " << - nBytes / (1024 * 1024) << " MB" << endl; - - - delete [] hDst; - hipFree(dDst); - HIPCHECK(hipStreamDestroy(stream)); - - passed(); -} diff --git a/perftests/memory/hipPerfHostNumaAlloc.cpp b/perftests/memory/hipPerfHostNumaAlloc.cpp deleted file mode 100644 index 93cd71ace..000000000 --- a/perftests/memory/hipPerfHostNumaAlloc.cpp +++ /dev/null @@ -1,190 +0,0 @@ -/* -Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -/* HIT_START - * BUILD_CMD: hipPerfHostNumaAlloc %hc -I%S/../../src %S/%s %S/../../src/test_common.cpp -lnuma -o %T/%t EXCLUDE_HIP_PLATFORM nvidia - * TEST: %t - * HIT_END - */ - -#include "test_common.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "hip/hip_runtime.h" - -// To run it correctly, we must not export HIP_VISIBLE_DEVICES. -// And we must explicitly link libnuma because of numa api move_pages(). -#define NUM_PAGES 4 -char *h = nullptr; -char *d_h = nullptr; -char *m = nullptr; -char *d_m = nullptr; -int page_size = 0; -const int mode[] = { MPOL_DEFAULT, MPOL_BIND, MPOL_PREFERRED, MPOL_INTERLEAVE }; -const char* modeStr[] = { "MPOL_DEFAULT", "MPOL_BIND", "MPOL_PREFERRED", "MPOL_INTERLEAVE" }; - -std::string exeCommand(const char* cmd) { - std::array buff; - std::string result; - std::unique_ptr pipe(popen(cmd, "r"), pclose); - if (!pipe) { - return result; - } - while (fgets(buff.data(), buff.size(), pipe.get()) != nullptr) { - result += buff.data(); - } - return result; -} - -int getCpuAgentCount() { - const char* cmd = "cat /proc/cpuinfo | grep \"physical id\" | sort | uniq | wc -l"; - int cpuAgentCount = std::atoi(exeCommand(cmd).c_str()); - return cpuAgentCount; -} - -bool test(int cpuId, int gpuId, int numaMode, unsigned int hostMallocflags) { - void *pages[NUM_PAGES]; - int status[NUM_PAGES]; - int nodes[NUM_PAGES]; - int ret_code; - - printf("set cpu %d, gpu %d, numaMode %d, hostMallocflags 0x%x\n", cpuId, - gpuId, numaMode, hostMallocflags); - - if (cpuId >= 0) { - unsigned long nodeMask = 1 << cpuId; - unsigned long maxNode = sizeof(nodeMask) * 8; - if (set_mempolicy(numaMode, 
numaMode == MPOL_DEFAULT ? NULL : &nodeMask, - numaMode == MPOL_DEFAULT ? 0 : maxNode) == -1) { - printf("set_mempolicy() failed with err %d\n", errno); - return false; - } - } - - if (gpuId >= 0) { - HIPCHECK(hipSetDevice(gpuId)); - } - - posix_memalign((void**) &m, page_size, page_size * NUM_PAGES); - hipHostRegister(m, page_size * NUM_PAGES, hipHostRegisterMapped); - hipHostGetDevicePointer((void**) &d_m, m, 0); - - status[0] = -1; - pages[0] = m; - for (int i = 1; i < NUM_PAGES; i++) { - pages[i] = (char*) pages[0] + page_size; - } - ret_code = move_pages(0, NUM_PAGES, pages, NULL, status, 0); - printf("Memory (malloc) ret %d at %p (dev %p) is at node: ", ret_code, m, d_m); - for (int i = 0; i < NUM_PAGES; i++) { - printf("%d ", status[i]); // Don't verify as it's out of our control - } - printf("\n"); - - HIPCHECK(hipHostMalloc((void**) &h, page_size*NUM_PAGES, hostMallocflags)); - pages[0] = h; - for (int i = 1; i < NUM_PAGES; i++) { - pages[i] = (char*) pages[0] + page_size; - } - ret_code = move_pages(0, NUM_PAGES, pages, NULL, status, 0); - d_h = nullptr; - if (hostMallocflags & hipHostMallocMapped) { - hipHostGetDevicePointer((void**) &d_h, h, 0); - printf("Memory (hipHostMalloc) ret %d at %p (dev %p) is at node: ", - ret_code, h, d_h); - } else { - printf("Memory (hipHostMalloc) ret %d at %p is at node: ", ret_code, h); - } - for (int i = 0; i < NUM_PAGES; i++) { - printf("%d ", status[i]); // Always print it even if it's wrong. 
Verify later - } - printf("\n"); - - HIPCHECK(hipHostFree((void* )h)); - hipHostUnregister(m); - free(m); - - if (cpuId >= 0 && (numaMode == MPOL_BIND || numaMode == MPOL_PREFERRED)) { - for (int i = 0; i < NUM_PAGES; i++) { - if (status[i] != cpuId) { // Now verify - printf("Failed at %d", i); - return false; - } - } - } - return true; -} - -bool runTest(const int &cpuCount, const int &gpuCount, - const unsigned int &hostMallocflags, const std::string &str) { - printf("%s\n", str.c_str()); - - for (int m = 0; m < sizeof(mode) / sizeof(mode[0]); m++) { - printf("Testing %s\n", modeStr[m]); - - for (int i = 0; i < cpuCount; i++) { - for (int j = 0; j < gpuCount; j++) { - if (!test(i, j, mode[m], hostMallocflags)) { - return false; - } - } - } - } - return true; -} - -int main(int argc, char *argv[]) { - int gpuCount = 0; - HIPCHECK(hipGetDeviceCount(&gpuCount)); - int cpuCount = getCpuAgentCount(); - page_size = getpagesize(); - printf("Cpu count %d, Gpu count %d, Page size %d\n", cpuCount, gpuCount, - page_size); - - if (cpuCount < 0 || gpuCount < 0) { - failed("Bad device count\n"); - return -1; - } - - if (!runTest(cpuCount, gpuCount, hipHostMallocDefault | hipHostMallocNumaUser, - "Testing hipHostMallocDefault | hipHostMallocNumaUser........................")) { - failed("Failed testing hipHostMallocDefault | hipHostMallocNumaUser\n"); - return -1; - } - - if (!runTest(cpuCount, gpuCount, hipHostMallocMapped | hipHostMallocNumaUser, - "Testing hipHostMallocMapped | hipHostMallocNumaUser.........................")) { - failed("Failed testing hipHostMallocMapped | hipHostMallocNumaUser\n"); - return -1; - } - - passed(); -} diff --git a/perftests/memory/hipPerfMemFill.cpp b/perftests/memory/hipPerfMemFill.cpp deleted file mode 100644 index dd54ec685..000000000 --- a/perftests/memory/hipPerfMemFill.cpp +++ /dev/null @@ -1,534 +0,0 @@ -/* - Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. 
- Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. 
- */ - -/* HIT_START - * BUILD: %t %s ../../src/test_common.cpp - * TEST: %t - * HIT_END - */ - -#include "test_common.h" -#include -#include -#include -#include - -#define SIMPLY_ASSIGN 0 -#define USE_HIPTEST_SETNUMBLOCKS 0 - -using namespace std; - -template -__global__ void vec_fill(T *x, T coef, int N) { - const int istart = threadIdx.x + blockIdx.x * blockDim.x; - const int ishift = blockDim.x * gridDim.x; - for (int i = istart; i < N; i += ishift) { -#if SIMPLY_ASSIGN - x[i] = coef; -#else - x[i] = coef * i; -#endif - } -} - -__device__ void print_log(int i, double value, double expected) { - printf("failed at %d: val=%g, expected=%g\n", i, value, expected); -} - -__device__ void print_log(int i, int value, int expected) { - printf("failed at %d: val=%d, expected=%d\n", i, value, expected); -} - -template -__global__ void vec_verify(T *x, T coef, int N) { - const int istart = threadIdx.x + blockIdx.x * blockDim.x; - const int ishift = blockDim.x * gridDim.x; - for (int i = istart; i < N; i += ishift) { -#if SIMPLY_ASSIGN - if(x[i] != coef) { - print_log(i, x[i], coef); - } -#else - if(x[i] != coef * i) { - print_log(i, x[i], coef * i); - } -#endif - } -} - -template -__global__ void daxpy(T *__restrict__ x, T *__restrict__ y, - const T coef, int Niter, int N) { - const int istart = threadIdx.x + blockIdx.x * blockDim.x; - const int ishift = blockDim.x * gridDim.x; - for (int iter = 0; iter < Niter; ++iter) { - T iv = coef * iter; - for (int i = istart; i < N; i += ishift) - y[i] = iv * x[i] + y[i]; - } -} - -template -class hipPerfMemFill { - private: - static constexpr int NUM_START = 27; - static constexpr int NUM_SIZE = 5; - static constexpr int NUM_ITER = 10; - size_t totalSizes_[NUM_SIZE]; - hipDeviceProp_t props_; - const T coef_ = getCoefficient(3.14159); - const unsigned int blocksPerCU_; - const unsigned int threadsPerBlock_; - - public: - hipPerfMemFill(unsigned int blocksPerCU, unsigned int threadsPerBlock) : - blocksPerCU_(blocksPerCU), 
threadsPerBlock_(threadsPerBlock) { - for (int i = 0; i < NUM_SIZE; i++) { - totalSizes_[i] = 1ull << (i + NUM_START); // 128M, 256M, 512M, 1024M, 2048M - } - } - - ~hipPerfMemFill() { - } - - bool supportLargeBar() { - return props_.isLargeBar != 0; - } - - bool supportManagedMemory() { - return props_.managedMemory != 0; - } - - const T getCoefficient(double val) { - return static_cast(val); - } - - void setHostBuffer(T *A, T val, size_t size) { - size_t len = size / sizeof(T); - for (int i = 0; i < len; i++) { - A[i] = val; - } - } - - void open(int deviceId) { - int nGpu = 0; - HIPCHECK(hipGetDeviceCount(&nGpu)); - if (nGpu < 1) { - failed("No GPU!"); - } else if (deviceId >= nGpu) { - failed("Info: wrong GPU Id %d\n", deviceId); - } - - HIPCHECK(hipSetDevice(deviceId)); - memset(&props_, 0, sizeof(props_)); - HIPCHECK(hipGetDeviceProperties(&props_, deviceId)); - std::cout << "Info: running on device: id: " << deviceId << ", bus: 0x" - << props_.pciBusID << " " << props_.name << " with " - << props_.multiProcessorCount << " CUs, large bar: " - << supportLargeBar() << ", managed memory: " << supportManagedMemory() - << ", DeviceMallocFinegrained: " << supportDeviceMallocFinegrained() - << std::endl; - } - - void log_host(const char* title, double GBytes, double sec) { - cout << title << " [" << setw(7) << GBytes << " GB]: cost " << setw(10) << sec - << " s in bandwidth " << setw(10) << GBytes / sec << " [GB/s]" << endl; - } - - void log_kernel(const char* title, double GBytes, double sec, double sec_hv, double sec_kv) { - cout << title << " [" << setw(7) << GBytes << " GB]: cost " << setw(10) << sec - << " s in bandwidth " << setw(10) << GBytes / sec << " [GB/s]" << ", hostVerify cost " - << setw(10) << sec_hv << " s in bandwidth " << setw(10) << GBytes / sec_hv << " [GB/s]" - << ", kernelVerify cost "<< setw(10) << sec_kv << " s in bandwidth " << setw(10) - << GBytes / sec_kv << " [GB/s]" << endl; - } - - void hostFill(size_t size, T *data, T coef, double 
&sec) { - size_t num = size / sizeof(T); // Size of elements - auto start = chrono::steady_clock::now(); - for (int i = 0; i < num; ++i) { -#if SIMPLY_ASSIGN - data[i] = coef; -#else - data[i] = coef * i; -#endif - } - auto end = chrono::steady_clock::now(); - chrono::duration diff = end - start; // in second - sec = diff.count(); - } - - void kernelFill(size_t size, T *data, T coef, double &sec) { - size_t num = size / sizeof(T); // Size of elements - unsigned blocks = setNumBlocks(num); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(vec_fill), dim3(blocks), - dim3(threadsPerBlock), 0, 0, data, 0, num); // kernel will be loaded first time - HIPCHECK(hipDeviceSynchronize()); - - auto start = chrono::steady_clock::now(); - - for (int iter = 0; iter < NUM_ITER; ++iter) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(vec_fill), dim3(blocks), - dim3(threadsPerBlock), 0, 0, data, coef, num); - } - HIPCHECK(hipDeviceSynchronize()); - - auto end = chrono::steady_clock::now(); - chrono::duration diff = end - start; // in second - sec = diff.count() / NUM_ITER; // in second - } - - void hostVerify(size_t size, T *data, T coef, double &sec) { - size_t num = size / sizeof(T); // Size of elements - auto start = chrono::steady_clock::now(); - for (int i = 0; i < num; ++i) { -#if SIMPLY_ASSIGN - if(data[i] != coef) { - cout << "hostVerify failed: i=" << i << ", data[i]=" << data[i] << ", expected=" << coef << endl; - failed("failed\n"); - } -#else - if(data[i] != coef * i) { - cout << "hostVerify failed: i=" << i << ", data[i]=" << data[i] << ", expected=" << coef * i << endl; - failed("failed\n"); - } -#endif - } - auto end = chrono::steady_clock::now(); - chrono::duration diff = end - start; // in second - sec = diff.count(); - } - - void kernelVerify(size_t size, T *data, T coef, double &sec) { - size_t num = size / sizeof(T); // Size of elements - unsigned blocks = setNumBlocks(num); - - CaptureStream *capture = new CaptureStream(stdout); - capture->Begin(); - - 
hipLaunchKernelGGL(HIP_KERNEL_NAME(vec_verify), dim3(blocks), - dim3(threadsPerBlock), 0, 0, data, coef, num); // kernel will be loaded first time - HIPCHECK(hipDeviceSynchronize()); - - capture->End(); - capture->Truncate(1000); // Don't want too long log if existing - std::string device_output = capture->getData(); - delete capture; - if (device_output.length() > 0) { - failed("kernelVerify failed:\n%s\n", device_output.c_str()); - } - - // Now all data verified. The following is to test bandwidth. - auto start = chrono::steady_clock::now(); - - for (int iter = 0; iter < NUM_ITER; ++iter) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(vec_verify), dim3(blocks), - dim3(threadsPerBlock), 0, 0, data, coef, num); - } - HIPCHECK(hipDeviceSynchronize()); - - auto end = chrono::steady_clock::now(); - chrono::duration diff = end - start; // in second - sec = diff.count() / NUM_ITER; // in second - } - - bool testLargeBarDeviceMemoryHostFill(size_t size) { - if (!supportLargeBar()) { - return false; - } - - double GBytes = (double) size / (1024.0 * 1024.0 * 1024.0); - - T *A; - HIPCHECK(hipMalloc(&A, size)); - double sec = 0; - hostFill(size, A, coef_, sec); // Cpu can access device mem in LB - HIPCHECK(hipFree(A)); - - log_host("Largebar: host fill", GBytes, sec); - return true; - } - - bool testLargeBar() { - if (!supportLargeBar()) { - return false; - } - - cout << "Test large bar device memory host filling" << endl; - for (int i = 0; i < NUM_SIZE; i++) { - if (!testLargeBarDeviceMemoryHostFill(totalSizes_[i])) { - return false; - } - } - - return true; - } - - bool testManagedMemoryHostFill(size_t size) { - if (!supportManagedMemory()) { - return false; - } - double GBytes = (double) size / (1024.0 * 1024.0 * 1024.0); - - T *A; - HIPCHECK(hipMallocManaged(&A, size)); - double sec = 0; - hostFill(size, A, coef_, sec); // Cpu can access HMM mem - HIPCHECK(hipFree(A)); - - log_host("Managed: host fill", GBytes, sec); - return true; - } - - bool testManagedMemoryKernelFill(size_t 
size) { - if (!supportManagedMemory()) { - return false; - } - double GBytes = (double) size / (1024.0 * 1024.0 * 1024.0); - - T *A; - HIPCHECK(hipMallocManaged(&A, size)); - - double sec = 0, sec_hv = 0, sec_kv = 0; - kernelFill(size, A, coef_, sec); - hostVerify(size, A, coef_, sec_hv); // Managed memory can be verified by host - kernelVerify(size, A, coef_, sec_kv); - HIPCHECK(hipFree(A)); - - log_kernel("Managed: kernel fill", GBytes, sec, sec_hv, sec_kv); - - return true; - } - - bool testManagedMemory() { - if (!supportManagedMemory()) { - return false; - } - - cout << "Test managed memory host filling" << endl; - for (int i = 0; i < NUM_SIZE; i++) { - if (!testManagedMemoryHostFill(totalSizes_[i])) { - return false; - } - } - - cout << "Test managed memory kernel filling" << endl; - for (int i = 0; i < NUM_SIZE; i++) { - if (!testManagedMemoryKernelFill(totalSizes_[i])) { - return false; - } - } - - return true; - } - - bool testHostMemoryHostFill(size_t size, unsigned int flags) { - double GBytes = (double) size / (1024.0 * 1024.0 * 1024.0); - T *A; - HIPCHECK(hipHostMalloc(&A, size, flags)); - double sec = 0; - hostFill(size, A, coef_, sec); - HIPCHECK(hipHostFree(A)); - - log_host("Host: host fill", GBytes, sec); - return true; - } - - bool testHostMemoryKernelFill(size_t size, unsigned int flags) { - double GBytes = (double) size / (1024.0 * 1024.0 * 1024.0); - - T *A; - HIPCHECK(hipHostMalloc((void** ) &A, size, flags)); - double sec = 0, sec_hv = 0, sec_kv = 0; - kernelFill(size, A, coef_, sec); - hostVerify(size, A, coef_, sec_hv); - kernelVerify(size, A, coef_, sec_kv); - HIPCHECK(hipHostFree(A)); - - log_kernel("Host: kernel fill", GBytes, sec, sec_hv, sec_kv); - return true; - } - - bool testHostMemory() { - cout << "Test coherent host memory host filling" << endl; - for (int i = 0; i < NUM_SIZE; i++) { - if (!testHostMemoryHostFill(totalSizes_[i], hipHostMallocCoherent)) { - return false; - } - } - - cout << "Test non-coherent host memory host 
filling" << endl; - for (int i = 0; i < NUM_SIZE; i++) { - if (!testHostMemoryHostFill(totalSizes_[i], hipHostMallocNonCoherent)) { - return false; - } - } - - cout << "Test coherent host memory kernel filling" << endl; - for (int i = 0; i < NUM_SIZE; i++) { - if (!testHostMemoryKernelFill(totalSizes_[i], hipHostMallocCoherent)) { - return false; - } - } - - cout << "Test non-coherent host memory kernel filling" << endl; - for (int i = 0; i < NUM_SIZE; i++) { - if (!testHostMemoryKernelFill(totalSizes_[i], hipHostMallocNonCoherent)) { - return false; - } - } - - return true; - } - - /* This function should be via device attribute query*/ - bool supportDeviceMallocFinegrained() { -#ifdef __HIP_PLATFORM_AMD__ - T *A = nullptr; - hipExtMallocWithFlags((void **)&A, sizeof(T), hipDeviceMallocFinegrained); - if (!A) { - return false; - } - HIPCHECK(hipFree(A)); - return true; -#else - return false; -#endif - } - - unsigned int setNumBlocks(size_t size) { - size_t num = size/sizeof(T); - -#if USE_HIPTEST_SETNUMBLOCKS - return HipTest::setNumBlocks(blocksPerCU_, threadsPerBlock_, - num); -#else - return (num + threadsPerBlock_ - 1) / threadsPerBlock_; -#endif - } - -#ifdef __HIP_PLATFORM_AMD__ - bool testExtDeviceMemoryHostFill(size_t size, unsigned int flags) { - double GBytes = (double) size / (1024.0 * 1024.0 * 1024.0); - - T *A = nullptr; - HIPCHECK(hipExtMallocWithFlags((void **)&A, size, flags)); - if (!A) { - cout << "failed hipExtMallocWithFlags() with size =" << size << " flags=" - << std::hex << flags << endl; - return false; - } - - double sec = 0; - hostFill(size, A, coef_, sec); // Cpu can access this mem - HIPCHECK(hipFree(A)); - - log_host("ExtDevice: host fill", GBytes, sec); - return true; - } - - bool testExtDeviceMemoryKernelFill(size_t size, unsigned int flags) { - double GBytes = (double) size / (1024.0 * 1024.0 * 1024.0); - - T *A = nullptr; - HIPCHECK(hipExtMallocWithFlags((void **)&A, size, flags)); - if (!A) { - cout << "failed 
hipExtMallocWithFlags() with size =" << size << " flags=" - << std::hex << flags << endl; - return false; - } - - double sec = 0, sec_hv = 0, sec_kv = 0; - kernelFill(size, A, coef_, sec); - hostVerify(size, A, coef_, sec_hv); // Fine grained device memory can be verified by host - kernelVerify(size, A, coef_, sec_kv); - HIPCHECK(hipFree(A)); - - log_kernel("ExtDevice: kernel fill", GBytes, sec, sec_hv, sec_kv); - - return true; - } - - bool testExtDeviceMemory() { - cout << "Test fine grained device memory host filling" - << endl; - for (int i = 0; i < NUM_SIZE; i++) { - if (!testExtDeviceMemoryHostFill(totalSizes_[i], - hipDeviceMallocFinegrained)) { - return false; - } - } - - cout << "Test fine grained device memory kernel filling" - << endl; - for (int i = 0; i < NUM_SIZE; i++) { - if (!testExtDeviceMemoryKernelFill(totalSizes_[i], - hipDeviceMallocFinegrained)) { - return false; - } - } - - return true; - } -#endif - - bool run() { - if (supportLargeBar()) { - if (!testLargeBar()) { - return false; - } - } - - if (supportManagedMemory()) { - if (!testManagedMemory()) { - return false; - } - } - - if (!testHostMemory()) { - return false; - } - -#ifdef __HIP_PLATFORM_AMD__ - if (supportDeviceMallocFinegrained()) { - if (!testExtDeviceMemory()) { - return false; - } - } -#endif - return true; - } - -}; - -int main(int argc, char *argv[]) { - HipTest::parseStandardArguments(argc, argv, true); // For ::p_gpuDevice, ::blocksPerCU, ::threadsPerBlock - cout << "Test int" << endl; - hipPerfMemFill hipPerfMemFillInt(::blocksPerCU, ::threadsPerBlock); - hipPerfMemFillInt.open(::p_gpuDevice); - HIPASSERT(hipPerfMemFillInt.run()); - - cout << "Test double" << endl; - hipPerfMemFill hipPerfMemFillDouble(::blocksPerCU, ::threadsPerBlock); - hipPerfMemFillDouble.open(::p_gpuDevice); - HIPASSERT(hipPerfMemFillDouble.run()); - - passed(); -} diff --git a/perftests/memory/hipPerfMemMallocCpyFree.cpp b/perftests/memory/hipPerfMemMallocCpyFree.cpp deleted file mode 100644 index 
94ceb68cb..000000000 --- a/perftests/memory/hipPerfMemMallocCpyFree.cpp +++ /dev/null @@ -1,124 +0,0 @@ -/* -Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
-*/ - -#include "test_common.h" -#include -#include - -/* HIT_START - * BUILD: %t %s ../../src/test_common.cpp - * TEST: %t - * HIT_END - */ - -#define NUM_SIZE 19 //size up to 16M -#define NUM_ITER 500 //Total GPU memory up to 16M*500=8G - -void valSet(int* A, int val, size_t size) { - size_t len = size / sizeof(int); - for (int i = 0; i < len; i++) { - A[i] = val; - } -} - -void setup(size_t *size, int &num, int **pA, const size_t totalGlobalMem) { - - std::cout << "size: "; - for (int i = 0; i < num; i++) { - size[i] = 1 << (i + 6); - if((NUM_ITER + 1) * size[i] > totalGlobalMem) { - num = i; - break; - } - std::cout << size[i] << " "; - } - std::cout << std::endl; - *pA = (int*)malloc(size[num - 1]); - valSet(*pA, 1, size[num - 1]); -} - -void testInit(size_t size, int *A) { - int *Ad; - clock_t start = clock(); - hipMalloc(&Ad, size); //hip::init() will be called - clock_t end = clock(); - double uS = (end - start) * 1000000. / CLOCKS_PER_SEC; - std::cout << "Initial" << std::endl; - std::cout << "hipMalloc(" << size << ") cost " << uS << "us" << std::endl; - - start = clock(); - hipMemcpy(Ad, A, size, hipMemcpyHostToDevice); - hipDeviceSynchronize(); - end = clock(); - uS = (end - start) * 1000000. / CLOCKS_PER_SEC; - std::cout << "hipMemcpy(" << size << ") cost " << uS << "us" << std::endl; - - start = clock(); - hipFree(Ad); - end = clock(); - uS = (end - start) * 1000000. 
/ CLOCKS_PER_SEC; - std::cout << "hipFree(" << size << ") cost " << uS << "us" << std::endl; -} - -int main() { - double uS; - clock_t start, end; - size_t size[NUM_SIZE] = { 0 }; - int *Ad[NUM_ITER] = { nullptr }; - int *A; - hipDeviceProp_t props; - memset(&props, 0, sizeof(props)); - HIPCHECK(hipGetDeviceProperties(&props, 0)); - std::cout << "totalGlobalMem: " << props.totalGlobalMem << std::endl; - - int num = NUM_SIZE; - setup(size, num, &A, props.totalGlobalMem); - testInit(size[0], A); - - for (int i = 0; i < num; i++) { - std::cout << size[i] << std::endl; - start = clock(); - for (int j = 0; j < NUM_ITER; j++) { - HIPCHECK(hipMalloc(&Ad[j], size[i])); - } - end = clock(); - uS = (end - start) * 1000000. / (NUM_ITER * CLOCKS_PER_SEC); - std::cout << "hipMalloc(" << size[i] << ") cost " << uS << "us" << std::endl; - - start = clock(); - for (int j = 0; j < NUM_ITER; j++) { - HIPCHECK(hipMemcpy(Ad[j], A, size[i], hipMemcpyHostToDevice)); - } - hipDeviceSynchronize(); - end = clock(); - uS = (end - start) * 1000000. / (NUM_ITER * CLOCKS_PER_SEC); - std::cout << "hipMemcpy(" << size[i] << ") cost " << uS << "us" << std::endl; - - start = clock(); - for (int j = 0; j < NUM_ITER; j++) { - HIPCHECK(hipFree(Ad[j])); - Ad[j] = nullptr; - } - end = clock(); - double uS = (end - start) * 1000000. / (NUM_ITER * CLOCKS_PER_SEC); - std::cout << "hipFree(" << size[i] << ") cost " << uS << "us" << std::endl; - } - free(A); - passed(); -} diff --git a/perftests/memory/hipPerfMemcpy.cpp b/perftests/memory/hipPerfMemcpy.cpp deleted file mode 100644 index 9751117ec..000000000 --- a/perftests/memory/hipPerfMemcpy.cpp +++ /dev/null @@ -1,114 +0,0 @@ -/* - Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. 
- Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. 
- */ - -/* HIT_START - * BUILD: %t %s ../../src/test_common.cpp - * TEST: %t - * HIT_END - */ - -#include "test_common.h" -#include -#include - -#define NUM_SIZE 8 -#define NUM_ITER 0x40000 - - -using namespace std; - -class hipPerfMemcpy { - private: - unsigned int numBuffers_; - size_t totalSizes_[NUM_SIZE]; - void setHostBuffer(int *A, int val, size_t size); - public: - hipPerfMemcpy(); - ~hipPerfMemcpy() {}; - void open(int deviceID); - void run(unsigned int testNumber); -}; - -hipPerfMemcpy::hipPerfMemcpy() : numBuffers_(0) { - for (int i = 0; i < NUM_SIZE; i++) { - totalSizes_[i] = 1 << (i + 6); - } -}; - -void hipPerfMemcpy::setHostBuffer(int *A, int val, size_t size) { - size_t len = size / sizeof(int); - for (int i = 0; i < len; i++) { - A[i] = val; - } -} - -void hipPerfMemcpy::open(int deviceId) { - int nGpu = 0; - HIPCHECK(hipGetDeviceCount(&nGpu)); - if (nGpu < 1) { - failed("No GPU!"); - } - - HIPCHECK(hipSetDevice(deviceId)); - hipDeviceProp_t props = {0}; - HIPCHECK(hipGetDeviceProperties(&props, deviceId)); - std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name - << " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId << std::endl; -} - -void hipPerfMemcpy::run(unsigned int testNumber) { - int *A, *Ad; - A = new int[totalSizes_[testNumber]]; - setHostBuffer(A, 1, totalSizes_[testNumber]); - hipMalloc(&Ad, totalSizes_[testNumber]); - - auto start = chrono::steady_clock::now(); - - for (int j = 0; j < NUM_ITER; j++) { - hipMemcpy(Ad, A, totalSizes_[testNumber], hipMemcpyHostToDevice); - } - - hipDeviceSynchronize(); - - auto end = chrono::steady_clock::now(); - chrono::duration diff = end - start; - - cout << "hipPerfMemcpy[" << testNumber << "] " << "Host to Device copy took " - << diff.count() / NUM_ITER << " us for memory size of " << totalSizes_[testNumber] - << " Bytes" << endl; - - delete [] A; - HIPCHECK(hipFree(Ad)); - -} - - -int main() { - hipPerfMemcpy hipPerfMemcpy; - - int 
deviceId = 0; - hipPerfMemcpy.open(deviceId); - - for (auto testCase = 0; testCase < NUM_SIZE; testCase++) { - hipPerfMemcpy.run(testCase); - } - - passed(); - -} diff --git a/perftests/memory/hipPerfMemset.cpp b/perftests/memory/hipPerfMemset.cpp deleted file mode 100644 index 2df0c9727..000000000 --- a/perftests/memory/hipPerfMemset.cpp +++ /dev/null @@ -1,437 +0,0 @@ -/* - Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved. - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. 
- */ - -/* HIT_START - * BUILD: %t %s ../../src/test_common.cpp - * TEST: %t - * HIT_END - */ - -#include "test_common.h" -#include -#include - -static unsigned int sizeList[] = { - 256, 512, 1024, 2048, 4096, 8192, -}; - -static unsigned int eleNumList[] = { - 0x100, 0x400, 0x1000, 0x4000, 0x10000, 0x20000, 0x40000, 0x80000, 0x100000, - 0x200000, 0x400000, 0x800000, 0x1000000 -}; - -typedef struct _dataType { -char memsetval = 0x42; -char memsetD8val = 0xDE; -int16_t memsetD16val = 0xDEAD; -int memsetD32val = 0xDEADBEEF; -}dataType; - -#define NUM_ITER 1000 - -enum MemsetType { - hipMemsetTypeDefault, - hipMemsetTypeD8, - hipMemsetTypeD16, - hipMemsetTypeD32, - hipMemsetTypeMax - -}; - -using namespace std; - -class hipPerfMemset { - private: - uint64_t bufSize_; - unsigned int num_elements_; - unsigned int testNumEle_; - unsigned int _numSubTests = 0; - unsigned int _numSubTests2D = 0; - unsigned int _numSubTests3D = 0; - unsigned int num_sizes_ =0; - - public: - hipPerfMemset() { - num_elements_ = sizeof(eleNumList) / sizeof(unsigned int); - _numSubTests = num_elements_ * hipMemsetTypeMax; - - num_sizes_ = sizeof(sizeList) / sizeof(unsigned int); - _numSubTests2D = num_sizes_; - _numSubTests3D = _numSubTests2D; - }; - - ~hipPerfMemset() {}; - - void open(int deviceID); - - template - void run1D(unsigned int test, T memsetval, enum MemsetType type, bool async); - - template - void run2D(unsigned int test, T memsetval, enum MemsetType type, bool async); - - template - void run3D(unsigned int test, T memsetval, enum MemsetType type, bool async); - - uint getNumTests() { - return _numSubTests; - } - - uint getNumTests2D() { - return _numSubTests2D; - } - uint getNumTests3D() { - return _numSubTests3D; - } -}; - - -void hipPerfMemset::open(int deviceId) { - int nGpu = 0; - HIPCHECK(hipGetDeviceCount(&nGpu)); - if (nGpu < 1) { - failed("No GPU!"); - } - - HIPCHECK(hipSetDevice(deviceId)); - hipDeviceProp_t props = {0}; - HIPCHECK(hipGetDeviceProperties(&props, 
deviceId)); - std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name - << " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId - << std::endl; -} - -template -void hipPerfMemset::run1D(unsigned int test, T memsetval, enum MemsetType type, bool async) { - - T * A_h; - T * A_d; - - testNumEle_ = eleNumList[test % num_elements_]; - - bufSize_ = testNumEle_ * sizeof(uint32_t); - - HIPCHECK(hipMalloc(&A_d, bufSize_)); - - A_h = reinterpret_cast (malloc(bufSize_)); - - hipStream_t stream; - HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); - - // Warm-up - if (async) { - HIPCHECK(hipMemsetAsync((void *)A_d, memsetval, bufSize_, stream)); - HIPCHECK(hipStreamSynchronize(stream)); - } else { - HIPCHECK(hipMemset((void *)A_d, memsetval, bufSize_)); - HIPCHECK(hipDeviceSynchronize()); - } - - auto start = chrono::high_resolution_clock::now(); - for (uint i = 0; i < NUM_ITER; i++) { - if (type == hipMemsetTypeDefault && !async) { - HIPCHECK(hipMemset((void *)A_d, memsetval, bufSize_)); - } - else if (type == hipMemsetTypeDefault && async) { - HIPCHECK(hipMemsetAsync(A_d, memsetval, bufSize_, stream)); - } - else if (type == hipMemsetTypeD8 && !async){ - HIPCHECK(hipMemsetD8((hipDeviceptr_t)A_d, memsetval, bufSize_)); - } - else if (type == hipMemsetTypeD8 && async) { - HIPCHECK(hipMemsetD8Async((hipDeviceptr_t)A_d, memsetval, bufSize_, stream)); - } - else if (type == hipMemsetTypeD16 && !async) { - HIPCHECK(hipMemsetD16((hipDeviceptr_t)A_d, memsetval, bufSize_/sizeof(T))); - } - else if (type == hipMemsetTypeD16 && async) { - HIPCHECK(hipMemsetD16Async((hipDeviceptr_t)A_d, memsetval, bufSize_/sizeof(T), stream)); - } - else if (type == hipMemsetTypeD32 && !async) { - HIPCHECK(hipMemsetD32((hipDeviceptr_t)A_d, memsetval, bufSize_/sizeof(T))); - } - else if (type == hipMemsetTypeD32 && async) { - HIPCHECK(hipMemsetD32Async((hipDeviceptr_t)A_d, memsetval, bufSize_/sizeof(T), stream)); - } - } - if 
(async) { - HIPCHECK(hipStreamSynchronize(stream)); - } else { - HIPCHECK(hipDeviceSynchronize()); - } - - auto end = chrono::high_resolution_clock::now(); - - HIPCHECK(hipMemcpy(A_h, A_d, bufSize_, hipMemcpyDeviceToHost) ); - - for (int i = 0; i < bufSize_ / sizeof(T); i++) { - if (A_h[i] != memsetval) { - cout << "mismatch at index " << i << " computed: " << static_cast (A_h[i]) - << ", memsetval: " << static_cast (memsetval) << endl; - break; - } - } - - HIPCHECK(hipFree(A_d)); - free(A_h); - - auto diff = std::chrono::duration(end - start); - auto sec = diff.count(); - - auto perf = static_cast((bufSize_ * NUM_ITER * (double)(1e-09)) / sec); - - cout << "[" << setw(2) << test << "] " << setw(5) << bufSize_/1024 << " Kb " << setw(4) - << " typeSize " << (int)sizeof(T) << " : " << setw(7) << perf << " GB/s " << endl; -} - -template -void hipPerfMemset::run2D(unsigned int test, T memsetval, enum MemsetType type, bool async) { - - bufSize_ = sizeList[test % num_sizes_]; - - size_t numH = bufSize_; - size_t numW = bufSize_; - size_t pitch_A; - size_t width = numW * sizeof(char); - size_t sizeElements = width * numH; - size_t elements = numW* numH; - - T * A_h; - T * A_d; - - HIPCHECK(hipMallocPitch(reinterpret_cast(&A_d), &pitch_A, width , - numH)); - A_h = reinterpret_cast(malloc(sizeElements)); - - for (size_t i=0; i < elements; i++) { - A_h[i] = 1; - } - - hipStream_t stream; - HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); - - // Warm-up - if (async) { - HIPCHECK(hipMemset2DAsync(A_d, pitch_A, memsetval, numW, numH, stream)); - HIPCHECK(hipStreamSynchronize(stream)); - } else { - HIPCHECK(hipMemset2D(A_d, pitch_A, memsetval, numW, numH)); - HIPCHECK(hipDeviceSynchronize()); - } - - auto start = chrono::steady_clock::now(); - - for (uint i = 0; i < NUM_ITER; i++) { - if (type == hipMemsetTypeDefault && !async) { - HIPCHECK(hipMemset2D(A_d, pitch_A, memsetval, numW, numH)); - } - else if (type == hipMemsetTypeDefault && async) { - 
HIPCHECK(hipMemset2DAsync(A_d, pitch_A, memsetval, numW, numH, stream)); - } - } - - if (async) { - HIPCHECK(hipStreamSynchronize(stream)); - } else { - HIPCHECK(hipDeviceSynchronize()); - } - - auto end = chrono::steady_clock::now(); - - HIPCHECK(hipMemcpy2D(A_h, width, A_d, pitch_A, numW, numH, - hipMemcpyDeviceToHost)); - - for (int i=0; i < elements; i++) { - if (A_h[i] != memsetval) { - cout << "mismatch at index " << i << " computed: " << static_cast (A_h[i]) - << ", memsetval: " << static_cast (memsetval) << endl; - break; - } - } - - chrono::duration diff = end - start; - - auto sec = diff.count(); - - auto perf = static_cast((sizeElements* NUM_ITER * (double)(1e-09)) / sec); - - cout << " hipPerf2DMemset" << (async ? "Async" : " ") << "[" << test << "] " - << " " << "(GB/s) for " << setw(5) << bufSize_ - << " x " << setw(5) << bufSize_ << " bytes : " << setw(7) << perf << endl; - - HIPCHECK(hipStreamDestroy(stream)); - HIPCHECK(hipFree(A_d)); - free(A_h); -} - -template -void hipPerfMemset::run3D(unsigned int test, T memsetval, enum MemsetType type, bool async) { - - bufSize_ = sizeList[test % num_sizes_]; - - size_t numH = bufSize_; - size_t numW = bufSize_; - size_t depth = 10; - size_t width = numW * sizeof(char); - size_t sizeElements = width * numH * depth; - size_t elements = numW* numH* depth; - - hipStream_t stream; - HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); - - T *A_h; - - hipExtent extent = make_hipExtent(width, numH, depth); - hipPitchedPtr devPitchedPtr; - - HIPCHECK(hipMalloc3D(&devPitchedPtr, extent)); - A_h = (char*)malloc(sizeElements); - HIPASSERT(A_h != NULL); - - for (size_t i=0; i (A_h[i]) - << ", memsetval: " << static_cast (memsetval) << endl; - break; - } - } - - chrono::duration diff = end - start; - - auto sec = diff.count(); - - auto perf = static_cast((sizeElements * NUM_ITER * (double)(1e-09)) / sec); - - cout << " hipPerf3DMemset" << (async ? 
"Async" : " ") << "[" << test << "] " << " " - << "(GB/s) for " << setw(5) << bufSize_ << " x " << setw(5) - << bufSize_ << " x " << depth << " bytes : " << setw(7) << perf << endl; - HIPCHECK(hipFree(devPitchedPtr.ptr)); - free(A_h); -} - -int main() { - hipPerfMemset hipPerfMemset; - - dataType pattern; - int deviceId = 0; - hipPerfMemset.open(deviceId); - MemsetType type; - - int numTests = hipPerfMemset.getNumTests(); - int numTests2D = hipPerfMemset.getNumTests2D(); - int numTests3D = hipPerfMemset.getNumTests3D(); - - - cout << "--------------------- 1D buffer -------------------" << endl; - bool async= false; - for (uint i = 0; i < 2 ; i++) { - cout << endl; - - for (auto testCase = 0; testCase < numTests; testCase++) { - if (testCase < sizeof(eleNumList) / sizeof(uint32_t)) { - cout << "API: hipMemsetD8" << (async ? "Async " : " "); - hipPerfMemset.run1D(testCase, pattern.memsetval, hipMemsetTypeD8, async); - } - - else if (testCase < 2 * sizeof(eleNumList) / sizeof(uint32_t)) { - cout << "API: hipMemsetD16" << (async ? "Async" : " "); - hipPerfMemset.run1D(testCase,pattern.memsetD16val, hipMemsetTypeD16, async); - } - - else if (testCase < 3 * sizeof(eleNumList) / sizeof(uint32_t)) { - cout << "API: hipMemsetD32" << (async ? "Async" : " "); - hipPerfMemset.run1D(testCase,pattern.memsetD32val, hipMemsetTypeD32, async); - } - - else { - cout << "API: hipMemset" << (async ? 
"Async " : " "); - hipPerfMemset.run1D(testCase,pattern.memsetval, hipMemsetTypeDefault, async); - } - } - async = true; - } - - cout << endl; - cout << "------------------ 2D buffer arrays ---------------" << endl; - - async = false; - for (uint i = 0; i < 2; i++) { - cout << endl; - for (uint test = 0; test < numTests2D; test++) { - hipPerfMemset.run2D(test, pattern.memsetval, hipMemsetTypeDefault, async); - } - async = true; - } - - cout << endl; - cout << "------------------ 3D buffer arrays ---------------" << endl; - - async = false; - for (uint i = 0; i < 2; i++) { - cout << endl; - for (uint test =0; test < numTests3D; test++) { - hipPerfMemset.run3D(test, pattern.memsetval, hipMemsetTypeDefault, async); - } - async = true; - } - - passed(); -} diff --git a/perftests/memory/hipPerfSampleRate.cpp b/perftests/memory/hipPerfSampleRate.cpp deleted file mode 100644 index 1ecadfe74..000000000 --- a/perftests/memory/hipPerfSampleRate.cpp +++ /dev/null @@ -1,319 +0,0 @@ -/* - Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. - */ - -/* HIT_START - * BUILD: %t %s ../../src/test_common.cpp - * TEST: %t - * HIT_END - */ - -#include -#include -#include "test_common.h" -#include -#include - -using namespace std; - -#define NUM_TYPES 3 -vector types= {"float", "float2", "float4"}; -vector typeSizes = {4, 8, 16}; - -#define NUM_SIZES 12 -vector sizes = {1, 2, 4, 8, 16, 32, - 64, 128, 256, 512, 1024, 2048}; - -#define NUM_BUFS 6 -#define MAX_BUFS (1 << (NUM_BUFS - 1)) - -#ifdef __HIP_PLATFORM_NVIDIA__ -inline __host__ __device__ void operator+=(float2 &a, float2 b) -{ - a.x += b.x; a.y += b.y; -} - -inline __host__ __device__ void operator+=(float4 &a, float4 b) -{ - a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; -} -#endif - -template -__global__ void sampleRate(T * outBuffer, unsigned int inBufSize, unsigned int writeIt, - T **inBuffer, int numBufs) { - - uint gid = (blockIdx.x * blockDim.x + threadIdx.x); - uint inputIdx = gid % inBufSize; - - T tmp; - memset(&tmp, 0, sizeof(T)); - for(int i = 0; i < numBufs; i++) { - tmp += *(*(inBuffer+i)+inputIdx); - } - - if (writeIt*(unsigned int)tmp.x) { - outBuffer[gid] = tmp; - } -}; - -template -__global__ void sampleRateFloat(T * outBuffer, unsigned int inBufSize, unsigned int writeIt, - T ** inBuffer, int numBufs) { - - uint gid = (blockIdx.x * blockDim.x + threadIdx.x); - uint inputIdx = gid % inBufSize; - - T tmp = (T)0.0f; - - for(int i = 0; i < numBufs; i++) { - tmp += *((*inBuffer+i)+inputIdx); - } - - if (writeIt*(unsigned int)tmp) { - outBuffer[gid] = tmp; - } -}; - -class hipPerfSampleRate { - public: - hipPerfSampleRate(); - ~hipPerfSampleRate(); - - void open(void); - void run(unsigned int testCase); - void close(void); - - // array of funtion pointers - typedef void 
(hipPerfSampleRate::*funPtr)(void * outBuffer, unsigned int - inBufSize, unsigned int writeIt, void **inBuffer, int numBufs, int grids, int blocks, - int threads_per_block); - - // Wrappers - void float_kernel(void * outBuffer, unsigned int - inBufSize, unsigned int writeIt, void **inBuffer, int numBufs, int grids, int blocks, - int threads_per_block); - - void float2_kernel(void * outBuffer, unsigned int - inBufSize, unsigned int writeIt, void **inBuffer, int numBufs, int grids, int blocks, - int threads_per_block); - - void float4_kernel(void * outBuffer, unsigned int - inBufSize, unsigned int writeIt, void **inBuffer, int numBufs, int grids, int blocks, - int threads_per_block); - - private: - void setData(void *ptr, unsigned int value); - void checkData(uint *ptr); - - unsigned int width_; - unsigned int bufSize_; - unsigned long long totalIters = 0; - int numCUs; - - unsigned int outBufSize_; - static const unsigned int MAX_ITERATIONS = 25; - unsigned int numBufs_; - unsigned int typeIdx_; -}; - - -hipPerfSampleRate::hipPerfSampleRate() {} - -hipPerfSampleRate::~hipPerfSampleRate() {} - -void hipPerfSampleRate::open(void) { - - int nGpu = 0; - HIPCHECK(hipGetDeviceCount(&nGpu)); - if (nGpu < 1) { - failed("No GPU!"); - } - - int deviceId = 0; - hipDeviceProp_t props = {0}; - props = {0}; - HIPCHECK(hipSetDevice(deviceId)); - HIPCHECK(hipGetDeviceProperties(&props, deviceId)); - std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name - << " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId - << std::endl; - numCUs = props.multiProcessorCount; - } - - -void hipPerfSampleRate::close() { - -} - - -// Wrappers for the kernel launches -void hipPerfSampleRate::float_kernel(void * outBuffer, unsigned int inBufSize, - unsigned int writeIt, void **inBuffer, - int numBufs, int grids, int blocks, int threads_per_block) { - - hipLaunchKernelGGL(sampleRateFloat, dim3(grids, grids, grids), dim3 (blocks), 0, 0, - 
(float*)outBuffer, inBufSize, writeIt, (float**)inBuffer, numBufs); - -} - -void hipPerfSampleRate::float2_kernel(void * outBuffer, unsigned int inBufSize, - unsigned int writeIt, void **inBuffer, - int grids, int blocks, int threads_per_block, int numBufs) { - - hipLaunchKernelGGL(sampleRate, dim3(grids, grids, grids), dim3(blocks), 0, 0, - (float2 *)outBuffer, inBufSize, writeIt, (float2**)inBuffer, numBufs); -} - -void hipPerfSampleRate::float4_kernel(void * outBuffer, unsigned int inBufSize, - unsigned int writeIt, void **inBuffer, - int grids, int blocks, int threads_per_block, int numBufs) { - - hipLaunchKernelGGL(sampleRate, dim3(grids, grids, grids), dim3(blocks), 0, 0, - (float4 *) outBuffer, inBufSize, writeIt, (float4**)inBuffer, numBufs); -} - -void hipPerfSampleRate::run(unsigned int test) { - - funPtr p[] = {&hipPerfSampleRate::float_kernel, &hipPerfSampleRate::float2_kernel, - &hipPerfSampleRate::float4_kernel}; - - // We compute a square domain - width_ = sizes[test % NUM_SIZES]; - typeIdx_ = (test / NUM_SIZES) % NUM_TYPES; - bufSize_ = width_ * width_ * typeSizes[typeIdx_]; - numBufs_ = (1 << (test / (NUM_SIZES * NUM_TYPES))); - - void * hOutPtr; - void * dOutPtr; - void * hInPtr[numBufs_]; - void ** dPtr; - void * dInPtr[numBufs_]; - - outBufSize_ = - sizes[NUM_SIZES - 1] * sizes[NUM_SIZES - 1] * typeSizes[NUM_TYPES - 1]; - - // Allocate memory on the host and device - HIPCHECK(hipHostMalloc((void **)&hOutPtr, outBufSize_, hipHostMallocDefault)); - setData((void *)hOutPtr, 0xdeadbeef); - HIPCHECK(hipMalloc((uint **)&dOutPtr, outBufSize_)); - - // Allocate 2D array in Device - hipMalloc((void **)&dPtr, numBufs_* sizeof(void *)); - - for (uint i = 0; i < numBufs_; i++) { - HIPCHECK(hipHostMalloc((void **)&hInPtr[i], bufSize_, hipHostMallocDefault)); - HIPCHECK(hipMalloc((uint **)&dInPtr[i], bufSize_)); - setData(hInPtr[i], 0x3f800000); - } - - // Populate array of pointers with array addresses - hipMemcpy(dPtr, dInPtr, numBufs_* sizeof(void *), 
hipMemcpyHostToDevice); - - // Copy memory from host to device - for (uint i = 0; i < numBufs_; i++) { - HIPCHECK(hipMemcpy(dInPtr[i], hInPtr[i], bufSize_, hipMemcpyHostToDevice)); - } - - HIPCHECK(hipMemcpy(dOutPtr, hOutPtr, outBufSize_, hipMemcpyHostToDevice)); - - // Prepare kernel launch parameters - // outBufSize_/sizeof(uint) - Grid size in 3D - int grids = 64; - int blocks = 64; - int threads_per_block = 1; - - unsigned int maxIter = MAX_ITERATIONS * (MAX_BUFS / numBufs_); - unsigned int sizeDW = width_ * width_; - unsigned int writeIt = 0; - - int idx = 0; - - if (!types[typeIdx_].compare("float")) { - idx = 0; - } - else if(!types[typeIdx_].compare("float2")) { - idx = 1; - } - else if(!types[typeIdx_].compare("float4")) { - idx = 2; - } - - - // Time the kernel execution - auto all_start = std::chrono::steady_clock::now(); - for (uint i = 0; i < maxIter; i++) { - (this->*p[idx]) ((void *)dOutPtr, sizeDW, writeIt, dPtr, numBufs_, grids, blocks, - threads_per_block); - } - - hipDeviceSynchronize(); - auto all_end = std::chrono::steady_clock::now(); - std::chrono::duration all_kernel_time = all_end - all_start; - - double perf = ((double)outBufSize_ * numBufs_ * (double)maxIter * (double)(1e-09)) / - all_kernel_time.count(); - - cout << "Domain " << sizes[NUM_SIZES - 1] << "x"<< sizes[NUM_SIZES - 1] << " bufs " - << numBufs_ << " " << types[typeIdx_] << " " << width_<<"x"< -#include -#include "test_common.h" - -using namespace std; - -#define sharedMemSize1 2048 -#define sharedMemSize2 256 - -__global__ void sharedMemReadSpeed1(float *outBuf, ulong N) { - - size_t gid = (blockIdx.x * blockDim.x + threadIdx.x); - size_t lid = threadIdx.x; - __shared__ float local[sharedMemSize1]; - - float val1 = 0; - float val2 = 0; - float val3 = 0; - float val4 = 0; - - for (int i = 0; i < (sharedMemSize1 / 64); i++) { - local[lid + i * 64] = lid; - } - - __syncthreads(); - - val1 += local[lid]; - val2 += local[lid + 64]; - val3 += local[lid + 128]; - val4 += local[lid + 
192]; - val1 += local[lid + 256]; - val2 += local[lid + 320]; - val3 += local[lid + 384]; - val4 += local[lid + 448]; - val1 += local[lid + 512]; - val2 += local[lid + 576]; - val3 += local[lid + 640]; - val4 += local[lid + 704]; - val1 += local[lid + 768]; - val2 += local[lid + 832]; - val3 += local[lid + 896]; - val4 += local[lid + 960]; - val1 += local[lid + 1024]; - val2 += local[lid + 1088]; - val3 += local[lid + 1152]; - val4 += local[lid + 1216]; - val1 += local[lid + 1280]; - val2 += local[lid + 1344]; - val3 += local[lid + 1408]; - val4 += local[lid + 1472]; - val1 += local[lid + 1536]; - val2 += local[lid + 1600]; - val3 += local[lid + 1664]; - val4 += local[lid + 1728]; - val1 += local[lid + 1792]; - val2 += local[lid + 1856]; - val3 += local[lid + 1920]; - val4 += local[lid + 1984]; - - if (gid < N) { - outBuf[gid] = val1 + val2 + val3 + val4; - } -}; - -__global__ void sharedMemReadSpeed2(float *outBuf, ulong N) { - size_t gid = (blockIdx.x * blockDim.x + threadIdx.x); - size_t lid = threadIdx.x; - __shared__ float local[sharedMemSize2]; - - float val0 = 0.0f; - float val1 = 0.0f; - - for (int i = 0; i < (sharedMemSize2 / 64); i++) { - local[lid + i * 64] = lid; - } - - __syncthreads(); - -#pragma nounroll - for (uint i = 0; i < 32; i++) { - val0 += local[8 * i + 0]; - val1 += local[8 * i + 1]; - val0 += local[8 * i + 2]; - val1 += local[8 * i + 3]; - val0 += local[8 * i + 4]; - val1 += local[8 * i + 5]; - val0 += local[8 * i + 6]; - val1 += local[8 * i + 7]; - } - - if (gid < N) { - outBuf[gid] = val0 + val1; - } -}; - -int main(int argc, char *argv[]) { - float *dDst; - float *hDst; - hipStream_t stream; - constexpr uint numSizes = 4; - constexpr uint Sizes[numSizes] = {262144, 1048576, 4194304, 16777216}; - uint numReads1 = 32; - uint numReads2 = 256; - uint sharedMemSizeBytes1 = sharedMemSize1 * sizeof(float); - uint sharedMemSizeBytes2 = sharedMemSize2 * sizeof(float); - int nIter = 1000; - const unsigned threadsPerBlock = 64; - - int nGpu = 0; - 
HIPCHECK(hipGetDeviceCount(&nGpu)); - if (nGpu < 1) { - cout << "info: didn't find any GPU! skipping the test!\n"; - passed(); - return 0; - } - - static int device = 0; - HIPCHECK(hipSetDevice(device)); - hipDeviceProp_t props; - HIPCHECK(hipGetDeviceProperties(&props, device)); - cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name - << " with " << props.multiProcessorCount << " CUs" << endl; - - HIPCHECK(hipStreamCreate(&stream)); - - for (int nTest = 0; nTest < numSizes; nTest++) { - uint nBytes = Sizes[nTest % numSizes]; - ulong N = nBytes / sizeof(float); - const unsigned blocks = N / threadsPerBlock; - - hDst = new float[nBytes]; - HIPCHECK(hDst == 0 ? hipErrorOutOfMemory : hipSuccess); - memset(hDst, 0, nBytes); - - HIPCHECK(hipMalloc(&dDst, nBytes)); - HIPCHECK(hipMemcpy(dDst, hDst, nBytes, hipMemcpyHostToDevice)); - - hipLaunchKernelGGL(sharedMemReadSpeed1, dim3(blocks), dim3(threadsPerBlock), - 0, stream, dDst, N); - HIPCHECK(hipMemcpy(hDst, dDst, nBytes, hipMemcpyDeviceToHost)); - hipDeviceSynchronize(); - - int tmp = 0; - for (int i = 0; i < N; i++) { - if (i % threadsPerBlock == 0) { - tmp = 0; - } - if (hDst[i] != tmp) { - cout << "info: Data validation failed for warm up run!" 
<< endl; - cout << "info: expected " << tmp << " got " << hDst[i] << endl; - HIPCHECK (hipErrorUnknown); - } - tmp += threadsPerBlock / 2; - } - - auto all_start = chrono::steady_clock::now(); - for (int i = 0; i < nIter; i++) { - hipLaunchKernelGGL(sharedMemReadSpeed1, dim3(blocks), - dim3(threadsPerBlock), 0, stream, dDst, N); - } - hipDeviceSynchronize(); - - auto all_end = chrono::steady_clock::now(); - chrono::duration all_kernel_time = all_end - all_start; - - // read speed in GB/s - double perf = ((double) blocks * threadsPerBlock - * (numReads1 * sizeof(float) + sharedMemSizeBytes1 / 64) * nIter - * (double) (1e-09)) / all_kernel_time.count(); - - cout << "info: read speed = " << setw(8) << perf << " GB/s for " - << sharedMemSizeBytes1 / 1024 << " KB shared memory" - " with " << setw(8) << blocks * threadsPerBlock << " threads, " - << setw(4) << numReads1 << " reads in sharedMemReadSpeed1 kernel" << endl; - - delete[] hDst; - hipFree(dDst); - } - - - for (int nTest = 0; nTest < numSizes; nTest++) { - uint nBytes = Sizes[nTest % numSizes]; - ulong N = nBytes / sizeof(float); - const unsigned blocks = N / threadsPerBlock; - - hDst = new float[nBytes]; - HIPCHECK(hDst == 0 ? 
hipErrorOutOfMemory : hipSuccess); - memset(hDst, 0, nBytes); - - HIPCHECK(hipMalloc(&dDst, nBytes)); - HIPCHECK(hipMemcpy(dDst, hDst, nBytes, hipMemcpyHostToDevice)); - - hipLaunchKernelGGL(sharedMemReadSpeed2, dim3(blocks), dim3(threadsPerBlock), - 0, stream, dDst, N); - HIPCHECK(hipMemcpy(hDst, dDst, nBytes, hipMemcpyDeviceToHost)); - hipDeviceSynchronize(); - - auto all_start = chrono::steady_clock::now(); - for (int i = 0; i < nIter; i++) { - hipLaunchKernelGGL(sharedMemReadSpeed2, dim3(blocks), - dim3(threadsPerBlock), 0, stream, dDst, N); - } - hipDeviceSynchronize(); - - auto all_end = chrono::steady_clock::now(); - chrono::duration all_kernel_time = all_end - all_start; - - // read speed in GB/s - double perf = ((double) blocks * threadsPerBlock - * (numReads2 * sizeof(float) + sharedMemSizeBytes2 / 64) * nIter - * (double) (1e-09)) / all_kernel_time.count(); - - cout << "info: read speed = " << setw(8) << perf << " GB/s for " - << sharedMemSizeBytes2 / 1024 << " KB shared memory" - " with " << setw(8) << blocks * threadsPerBlock << " threads, " - << setw(4) << numReads2 << " reads in sharedMemReadSpeed2 kernel" << endl; - - delete[] hDst; - hipFree(dDst); - } - - HIPCHECK(hipStreamDestroy(stream)); - - passed(); -} diff --git a/perftests/stream/hipPerfDeviceConcurrency.cpp b/perftests/stream/hipPerfDeviceConcurrency.cpp deleted file mode 100644 index 664bdb47e..000000000 --- a/perftests/stream/hipPerfDeviceConcurrency.cpp +++ /dev/null @@ -1,284 +0,0 @@ -/* - Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. 
- Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. 
- */ - -/* HIT_START - * BUILD: %t %s ../../src/test_common.cpp - * TEST: %t - * HIT_END - */ - -#include -#include -#include "test_common.h" - -typedef struct { - double x; - double y; - double width; -} coordRec; - -static coordRec coords[] = { - {0.0, 0.0, 0.00001}, // All black -}; - -static unsigned int numCoords = sizeof(coords) / sizeof(coordRec); - -__global__ void mandelbrot(uint *out, uint width, float xPos, float yPos, float xStep, - float yStep, uint maxIter) { - - int tid = (blockIdx.x * blockDim.x + threadIdx.x); - int i = tid % width; - int j = tid / width; - float x0 = (float)(xPos + xStep*i); - float y0 = (float)(yPos + yStep*j); - - float x = x0; - float y = y0; - - uint iter = 0; - float tmp; - for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) { - tmp = x; - x = fma(-y,y,fma(x,x,x0)); - y = fma(2.0f*tmp,y,y0); - } - - out[tid] = iter; -}; - -class hipPerfDeviceConcurrency { - public: - hipPerfDeviceConcurrency(); - ~hipPerfDeviceConcurrency(); - - void setNumGpus(unsigned int num) { - numDevices = num; - } - unsigned int getNumGpus() { - return numDevices; - } - - void open(void); - void close(void); - void run(unsigned int testCase, int numGpus); - - private: - void setData(void *ptr, unsigned int value); - void checkData(uint *ptr); - - unsigned int numDevices; - unsigned int width_; - unsigned int bufSize; - unsigned int coordIdx; - unsigned long long totalIters = 0; -}; - - -hipPerfDeviceConcurrency::hipPerfDeviceConcurrency() {} - -hipPerfDeviceConcurrency::~hipPerfDeviceConcurrency() {} - -void hipPerfDeviceConcurrency::open(void) { - int nGpu = 0; - HIPCHECK(hipGetDeviceCount(&nGpu)); - setNumGpus(nGpu); - if (nGpu < 1) { - failed("No GPU!"); - } -} - - -void hipPerfDeviceConcurrency::close() { -} - -void hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) { - - - static int deviceId; - uint * hPtr[numGpus]; - uint * dPtr[numGpus]; - hipStream_t streams[numGpus]; - int numCUs[numGpus]; - unsigned int 
maxIter[numGpus]; - unsigned long long expectedIters[numGpus]; - - int threads, threads_per_block, blocks; - float xStep, yStep, xPos, yPos; - - for(int i = 0; i < numGpus; i++) { - - if(testCase != 0) { - deviceId = i; - } - - HIPCHECK(hipSetDevice(deviceId)); - - hipDeviceProp_t props = {0}; - HIPCHECK(hipGetDeviceProperties(&props, i)); - - if (testCase != 0) { - std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name - << " with " << props.multiProcessorCount << " CUs" << " and device ID: " - << i << std::endl; - } - - numCUs[i] = props.multiProcessorCount; - int clkFrequency = 0; - HIPCHECK(hipDeviceGetAttribute(&clkFrequency, hipDeviceAttributeClockRate, i)); - - clkFrequency =(unsigned int)clkFrequency/1000; - - // Maximum iteration count - // maxIter = 8388608 * (engine_clock / 1000).serial execution - maxIter[i] = (unsigned int)(((8388608 * ((float)clkFrequency / 1000)) * numCUs[i]) / 128); - maxIter[i] = (maxIter[i] + 15) & ~15; - - // Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once. 
- width_ = 256; - - bufSize = width_ * width_ * sizeof(uint); - - // Create streams for concurrency - HIPCHECK(hipStreamCreate(&streams[i])); - - // Allocate memory on the host and device - HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault)); - setData(hPtr[i], 0xdeadbeef); - HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize)) - - // Prepare kernel launch parameters - threads = (bufSize/sizeof(uint)); - threads_per_block = 64; - blocks = (threads/threads_per_block) + (threads % threads_per_block); - - coordIdx = testCase % numCoords; - xStep = (float)(coords[coordIdx].width / (double)width_); - yStep = (float)(-coords[coordIdx].width / (double)width_); - xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width); - yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width); - - // Copy memory from host to device - HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice)); - - } - - // Time the kernel execution - auto all_start = std::chrono::steady_clock::now(); - - for(int i = 0; i < numGpus; i++) { - - if(testCase != 0) { - deviceId = i; - } - - HIPCHECK(hipSetDevice(deviceId)); - - hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0, streams[i], - dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter[i]); - - } - - for(int i = 0; i < numGpus; i++) { - HIPCHECK(hipStreamSynchronize(0)); - } - - - auto all_end = std::chrono::steady_clock::now(); - std::chrono::duration all_kernel_time = all_end - all_start; - - for(int i = 0; i < numGpus; i++) { - - if(testCase != 0) { - deviceId = i; - } - HIPCHECK(hipSetDevice(deviceId)); - - // Copy data back from device to the host - HIPCHECK(hipMemcpy(hPtr[i], dPtr[i], bufSize, hipMemcpyDeviceToHost)); - - checkData(hPtr[i]); - expectedIters[i] = width_ * width_ * (unsigned long long) maxIter[i]; - - if (testCase != 0) { - checkData(hPtr[i]); - if(totalIters != expectedIters[i]) { - std::cout << "Incorrect iteration count detected" << std::endl; - } - } - - - 
HIPCHECK(hipStreamDestroy(streams[i])); - - // Free host and device memory - HIPCHECK(hipHostFree(hPtr[i])); - HIPCHECK(hipFree(dPtr[i])); - } - - if (testCase != 0) { - std::cout << '\n' << "Measured time for kernel computation on " << numGpus << " device (s): " - << all_kernel_time.count() << " (s) " << '\n' << std::endl; - } - - if(testCase == 0) { - deviceId++; - } - - -} - - -void hipPerfDeviceConcurrency::setData(void *ptr, unsigned int value) { - unsigned int *ptr2 = (unsigned int *)ptr; - for (unsigned int i = 0; i < width_ * width_ ; i++) { - ptr2[i] = value; - } -} - - -void hipPerfDeviceConcurrency::checkData(uint *ptr) { - totalIters = 0; - for (unsigned int i = 0; i < width_ * width_; i++) { - totalIters += ptr[i]; - } -} - - -int main(int argc, char* argv[]) { - hipPerfDeviceConcurrency deviceConcurrency; - - deviceConcurrency.open(); - - int nGpu = deviceConcurrency.getNumGpus(); - - // testCase = 0 refers to warmup kernel run - int testCase = 0; - - for (int i = 0; i < nGpu; i++) { - // Warm-up kernel on all devices - deviceConcurrency.run(testCase, 1); - } - - // Time for kernel on 1 device - deviceConcurrency.run(++testCase, 1); - - // Time for kernel on all available devices - deviceConcurrency.run(++testCase, nGpu); - - passed(); -} diff --git a/perftests/stream/hipPerfStreamConcurrency.cpp b/perftests/stream/hipPerfStreamConcurrency.cpp deleted file mode 100644 index 16e29bc06..000000000 --- a/perftests/stream/hipPerfStreamConcurrency.cpp +++ /dev/null @@ -1,432 +0,0 @@ -/* - Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. 
- Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. 
- */ - -/* HIT_START - * BUILD: %t %s ../../src/test_common.cpp - * TEST: %t - * HIT_END - */ - -#include -#include -#include "test_common.h" -#include - -#ifdef __HIP_PLATFORM_NVIDIA__ -inline __device__ float4 operator*(float s, float4 a) -{ - return make_float4(a.x * s, a.y * s, a.z * s, a.w * s); -} -inline __device__ float4 operator*(float4 a, float4 b) -{ - return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); -} -inline __device__ float4 operator+(float4 a, float4 b) -{ - return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); -} -inline __device__ float4 operator-(float4 a, float4 b) -{ - return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); -} -#endif - -typedef struct { - double x; - double y; - double width; -} coordRec; - -static coordRec coords[] = { - {0.0, 0.0, 0.00001}, // All black -}; - -static unsigned int numCoords = sizeof(coords) / sizeof(coordRec); - -__global__ void mandelbrot(uint *out, uint width, float xPos, float yPos, - float xStep, float yStep, uint maxIter) { - int tid = (blockIdx.x * blockDim.x + threadIdx.x); - int i = tid % (width/4); - int j = tid / (width/4); - int4 veci = make_int4(4*i, 4*i+1, 4*i+2, 4*i+3); - int4 vecj = make_int4(j, j, j, j); - float4 x0; - x0.x = (float)(xPos + xStep*veci.x); - x0.y = (float)(xPos + xStep*veci.y); - x0.z = (float)(xPos + xStep*veci.z); - x0.w = (float)(xPos + xStep*veci.w); - float4 y0; - y0.x = (float)(yPos + yStep*vecj.x); - y0.y = (float)(yPos + yStep*vecj.y); - y0.z = (float)(yPos + yStep*vecj.z); - y0.w = (float)(yPos + yStep*vecj.w); - float4 x = x0; - float4 y = y0; - uint iter = 0; - float4 tmp; - int4 stay; - int4 ccount = make_int4(0, 0, 0, 0); - float4 savx = x; - float4 savy = y; - stay.x = (x.x*x.x+y.x*y.x) <= (float)(4.0f); - stay.y = (x.y*x.y+y.y*y.y) <= (float)(4.0f); - stay.z = (x.z*x.z+y.z*y.z) <= (float)(4.0f); - stay.w = (x.w*x.w+y.w*y.w) <= (float)(4.0f); - for (iter = 0; (stay.x | stay.y | stay.z | stay.w) && (iter < maxIter); - iter+=16) { - 
x = savx; - y = savy; - // Two iterations - tmp = x*x + x0 - y*y; - y = 2.0f * x * y + y0; - x = tmp*tmp + x0 - y*y; - y = 2.0f * tmp * y + y0; - // Two iterations - tmp = x*x + x0 - y*y; - y = 2.0f * x * y + y0; - x = tmp*tmp + x0 - y*y; - y = 2.0f * tmp * y + y0; - // Two iterations - tmp = x*x + x0 - y*y; - y = 2.0f * x * y + y0; - x = tmp*tmp + x0 - y*y; - y = 2.0f * tmp * y + y0; - // Two iterations - tmp = x*x + x0 - y*y; - y = 2.0f * x * y + y0; - x = tmp*tmp + x0 - y*y; - y = 2.0f * tmp * y + y0; - // Two iterations - tmp = x*x + x0 - y*y; - y = 2.0f * x * y + y0; - x = tmp*tmp + x0 - y*y; - y = 2.0f * tmp * y + y0; - // Two iterations - tmp = x*x + x0 - y*y; - y = 2.0f * x * y + y0; - x = tmp*tmp + x0 - y*y; - y = 2.0f * tmp * y + y0; - // Two iterations - tmp = x*x + x0 - y*y; - y = 2.0f * x * y + y0; - x = tmp*tmp + x0 - y*y; - y = 2.0f * tmp * y + y0; - stay.x = (x.x*x.x+y.x*y.x) <= (float)(4.0f); - stay.y = (x.y*x.y+y.y*y.y) <= (float)(4.0f); - stay.z = (x.z*x.z+y.z*y.z) <= (float)(4.0f); - stay.w = (x.w*x.w+y.w*y.w) <= (float)(4.0f); - savx.x = (bool)(stay.x ? x.x : savx.x); - savx.y = (bool)(stay.y ? x.y : savx.y); - savx.z = (bool)(stay.z ? x.z : savx.z); - savx.w = (bool)(stay.w ? x.w : savx.w); - savy.x = (bool)(stay.x ? y.x : savy.x); - savy.y = (bool)(stay.y ? y.y : savy.y); - savy.z = (bool)(stay.z ? y.z : savy.z); - savy.w = (bool)(stay.w ? 
y.w : savy.w); - ccount.x -= stay.x*16; - ccount.y -= stay.y*16; - ccount.z -= stay.z*16; - ccount.w -= stay.w*16; - } - // Handle remainder - if (!(stay.x & stay.y & stay.z & stay.w)) - { - iter = 16; - do - { - x = savx; - y = savy; - stay.x = ((x.x*x.x+y.x*y.x) <= 4.0f) && (ccount.x < maxIter); - stay.y = ((x.y*x.y+y.y*y.y) <= 4.0f) && (ccount.y < maxIter); - stay.z = ((x.z*x.z+y.z*y.z) <= 4.0f) && (ccount.z < maxIter); - stay.w = ((x.w*x.w+y.w*y.w) <= 4.0f) && (ccount.w < maxIter); - tmp = x; - x = x*x + x0 - y*y; - y = 2.0f*tmp*y + y0; - ccount.x += stay.x; - ccount.y += stay.y; - ccount.z += stay.z; - ccount.w += stay.w; - iter--; - savx.x = (stay.x ? x.x : savx.x); - savx.y = (stay.y ? x.y : savx.y); - savx.z = (stay.z ? x.z : savx.z); - savx.w = (stay.w ? x.w : savx.w); - savy.x = (stay.x ? y.x : savy.x); - savy.y = (stay.y ? y.y : savy.y); - savy.z = (stay.z ? y.z : savy.z); - savy.w = (stay.w ? y.w : savy.w); - } while ((stay.x | stay.y | stay.z | stay.w) && iter); - } - uint4 *vecOut = (uint4 *)out; - vecOut[tid].x = (uint)(ccount.x); - vecOut[tid].y = (uint)(ccount.y); - vecOut[tid].z = (uint)(ccount.z); - vecOut[tid].w = (uint)(ccount.w); -} - -class hipPerfStreamConcurrency { - public: - hipPerfStreamConcurrency(); - ~hipPerfStreamConcurrency(); - - void setNumKernels(unsigned int num) { - numKernels = num; - } - void setNumStreams(unsigned int num) { - numStreams = num; - } - unsigned int getNumStreams() { - return numStreams; - } - - unsigned int getNumKernels() { - return numKernels; - } - - void open(int deviceID); - void run(unsigned int testCase, unsigned int deviceId); - void close(void); - - private: - void setData(void *ptr, unsigned int value); - void checkData(uint *ptr); - - unsigned int numKernels; - unsigned int numStreams; - - unsigned int width_; - unsigned int bufSize; - unsigned int maxIter; - unsigned int coordIdx; - unsigned long long totalIters; - int numCUs; - -}; - - -hipPerfStreamConcurrency::hipPerfStreamConcurrency() {} - 
-hipPerfStreamConcurrency::~hipPerfStreamConcurrency() {} - -void hipPerfStreamConcurrency::open(int deviceId) { - int nGpu = 0; - HIPCHECK(hipGetDeviceCount(&nGpu)); - if (nGpu < 1) { - failed("No GPU!"); - } - - HIPCHECK(hipSetDevice(deviceId)); - hipDeviceProp_t props = {0}; - HIPCHECK(hipGetDeviceProperties(&props, deviceId)); - std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name - << " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId << std::endl; - - numCUs = props.multiProcessorCount; -} - - -void hipPerfStreamConcurrency::close() { -} - - -void hipPerfStreamConcurrency::run(unsigned int testCase,unsigned int deviceId) { - - int clkFrequency = 0; - unsigned int numStreams = getNumStreams(); - unsigned int numKernels = getNumKernels(); - - HIPCHECK(hipDeviceGetAttribute(&clkFrequency, hipDeviceAttributeClockRate, deviceId)); - - clkFrequency =(unsigned int)clkFrequency/1000; - - // Maximum iteration count - // maxIter = 8388608 * (engine_clock / 1000).serial execution - maxIter = (unsigned int)(((8388608 * ((float)clkFrequency / 1000)) * numCUs) / 128); - maxIter = (maxIter + 15) & ~15; - - hipStream_t streams[numStreams]; - - uint * hPtr[numKernels]; - uint * dPtr[numKernels]; - - // Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once. 
- width_ = 256; - - bufSize = width_ * sizeof(uint); - - // Create streams for concurrency - for (uint i = 0; i < numStreams; i++) { - HIPCHECK(hipStreamCreate(&streams[i])); - } - - - // Allocate memory on the host and device - for (uint i = 0; i < numKernels; i++) { - HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault)); - setData(hPtr[i], 0xdeadbeef); - HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize)) - } - - - // Prepare kernel launch parameters - int threads = (bufSize/sizeof(uint)); - int threads_per_block = 64; - int blocks = (threads/threads_per_block) + (threads % threads_per_block); - - coordIdx = testCase % numCoords; - float xStep = (float)(coords[coordIdx].width / (double)width_); - float yStep = (float)(-coords[coordIdx].width / (double)width_); - float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width); - float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width); - - // Copy memory asynchronously and concurrently from host to device - for (uint i = 0; i < numKernels; i++) { - HIPCHECK(hipMemcpyHtoDAsync(reinterpret_cast(dPtr[i]), hPtr[i], bufSize, streams[i % numStreams])); - } - - - // Synchronize to make sure all the copies are completed - for(uint i = 0; i < numStreams; i++) { - HIPCHECK(hipStreamSynchronize(streams[i])); - } - - // Warm-up kernel with lower iteration - if (testCase == 0) { - maxIter = 256; - } - - // Time the kernel execution - auto all_start = std::chrono::steady_clock::now(); - - for (uint i = 0; i < numKernels; i++) { - hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0, streams[i%numStreams], - dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter); - } - - - // Synchronize all the concurrent streans to have completed execution - for(uint i = 0; i < numStreams; i++) { - HIPCHECK(hipStreamSynchronize(streams[i])); - } - - - auto all_end = std::chrono::steady_clock::now(); - std::chrono::duration all_kernel_time = all_end - all_start; - - // Copy data back from 
device to the host - for(uint i = 0; i < numKernels; i++) { - HIPCHECK(hipMemcpyDtoHAsync(hPtr[i], reinterpret_cast(dPtr[i]), bufSize, streams[i % numStreams])); - } - - - if (testCase != 0) { - std::cout <<"Measured time for " << numKernels <<" kernels (s) on " << numStreams <<" stream (s): " - << all_kernel_time.count() << std::endl; - } - - - unsigned long long expected = - (unsigned long long)width_ * (unsigned long long)maxIter; - - for(uint i = 0 ; i < numStreams; i++) { - HIPCHECK(hipStreamDestroy(streams[i])); - } - - - // Free host and device memory - for (uint i = 0; i < numKernels; i++) { - HIPCHECK(hipHostFree(hPtr[i])); - HIPCHECK(hipFree(dPtr[i])); - } - - -} - - -void hipPerfStreamConcurrency::setData(void *ptr, unsigned int value) { - unsigned int *ptr2 = (unsigned int *)ptr; - for (unsigned int i = 0; i < width_ ; i++) { - ptr2[i] = value; - } -} - - -void hipPerfStreamConcurrency::checkData(uint *ptr) { - totalIters = 0; - for (unsigned int i = 0; i < width_; i++) { - totalIters += ptr[i]; - } -} - - -int main(int argc, char* argv[]) { - hipPerfStreamConcurrency streamConcurrency; - int deviceId = 0; - - streamConcurrency.open(deviceId); - - for (unsigned int testCase = 0; testCase < 5; testCase++) { - - - switch (testCase) { - - - case 0: - // Warm-up kernel - streamConcurrency.setNumStreams(1); - streamConcurrency.setNumKernels(1); - break; - - case 1: - // default stream executes serially - streamConcurrency.setNumStreams(1); - streamConcurrency.setNumKernels(1); - break; - - case 2: - // 2-way concurrency - streamConcurrency.setNumStreams(2); - streamConcurrency.setNumKernels(2); - break; - - case 3: - // 4-way concurrency - streamConcurrency.setNumStreams(4); - streamConcurrency.setNumKernels(4); - break; - - case 4: - streamConcurrency.setNumStreams(2); - streamConcurrency.setNumKernels(4); - break; - - case 5: - break; - - default: - break; - } - streamConcurrency.run(testCase, deviceId); - - } - - - passed(); -} diff --git 
a/perftests/stream/hipPerfStreamCreateCopyDestroy.cpp b/perftests/stream/hipPerfStreamCreateCopyDestroy.cpp deleted file mode 100644 index 103f40c7b..000000000 --- a/perftests/stream/hipPerfStreamCreateCopyDestroy.cpp +++ /dev/null @@ -1,131 +0,0 @@ -/* - Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. 
- */ - -/* HIT_START - * BUILD: %t %s ../../src/test_common.cpp - * TEST: %t - * HIT_END - */ - -#include -#include -#include "test_common.h" - -using namespace std; - -#define BufSize 0x1000 -#define Iterations 0x100 -#define TotalStreams 4 -#define TotalBufs 4 - - -class hipPerfStreamCreateCopyDestroy { - private: - unsigned int numBuffers_; - unsigned int numStreams_; - const size_t totalStreams_[TotalStreams]; - const size_t totalBuffers_[TotalBufs]; - public: - hipPerfStreamCreateCopyDestroy() : numBuffers_(0), numStreams_(0), - totalStreams_{1, 2, 4, 8}, - totalBuffers_{1, 100, 1000, 5000} {}; - ~hipPerfStreamCreateCopyDestroy() {}; - void open(int deviceID); - void run(unsigned int testNumber); -}; - -void hipPerfStreamCreateCopyDestroy::open(int deviceId) { - int nGpu = 0; - HIPCHECK(hipGetDeviceCount(&nGpu)); - if (nGpu < 1) { - failed("No GPU!"); - } - - HIPCHECK(hipSetDevice(deviceId)); - hipDeviceProp_t props = {0}; - HIPCHECK(hipGetDeviceProperties(&props, deviceId)); - std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name - << " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId << std::endl; -} - -void hipPerfStreamCreateCopyDestroy::run(unsigned int testNumber) { - numStreams_ = totalStreams_[testNumber % TotalStreams]; - size_t iter = Iterations / (numStreams_ * ((size_t)1 << (testNumber / TotalBufs + 1))); - hipStream_t streams[numStreams_]; - - numBuffers_ = totalBuffers_[testNumber / TotalBufs]; - float* dSrc[numBuffers_]; - size_t nBytes = BufSize * sizeof(float); - - for (size_t b = 0; b < numBuffers_; ++b) { - HIPCHECK(hipMalloc(&dSrc[b], nBytes)); - } - - float* hSrc; - hSrc = new float[nBytes]; - HIPCHECK(hSrc == 0 ? 
hipErrorOutOfMemory : hipSuccess); - for (size_t i = 0; i < BufSize; i++) { - hSrc[i] = 1.618f + i; - } - - auto start = std::chrono::steady_clock::now(); - - for (size_t i = 0; i < iter; ++i) { - for (size_t s = 0; s < numStreams_; ++s) { - HIPCHECK(hipStreamCreate(&streams[s])); - } - - for (size_t s = 0; s < numStreams_; ++s) { - for (size_t b = 0; b < numBuffers_; ++b) { - HIPCHECK(hipMemcpyWithStream(dSrc[b], hSrc, nBytes, hipMemcpyHostToDevice, streams[s])); - } - } - - for (size_t s = 0; s < numStreams_; ++s) { - HIPCHECK(hipStreamDestroy(streams[s])); - } - } - - auto end = std::chrono::steady_clock::now(); - std::chrono::duration diff = end - start; - - auto time = static_cast(diff.count() * 1000 / (iter * numStreams_)); - - cout << "Create+Copy+Destroy time for " << numStreams_ << " streams and " - << setw(4) << numBuffers_ << " buffers " << " and " << setw(4) - << iter << " iterations " << time << " (ms) " << endl; - - delete [] hSrc; - for (size_t b = 0; b < numBuffers_; ++b) { - HIPCHECK(hipFree(dSrc[b])); - } -} - -int main(int argc, char* argv[]) { - hipPerfStreamCreateCopyDestroy streamCCD; - - int deviceId = 0; - streamCCD.open(deviceId); - - for (auto testCase = 0; testCase < TotalStreams * TotalBufs; testCase++) { - streamCCD.run(testCase); - } - - passed(); -}