diff --git a/catch/hipTestMain/config/config_amd_linux b/catch/hipTestMain/config/config_amd_linux
index 16340e45f..8d891b9e5 100644
--- a/catch/hipTestMain/config/config_amd_linux
+++ b/catch/hipTestMain/config/config_amd_linux
@@ -744,6 +744,8 @@
         "Unit_hipGetLastError_KernelFailure_ValidAndInvalidOperations",
         "Unit_hipGetLastError_KernelFailure_TwoDevices",
         "Unit_hipGetLastError_KernelFailure_TwoStreams",
+        "=== Enable the below test when multi-device graph launches are fully supported ===",
+        "Unit_hipGraphInstantiateWithFlags_DependencyGraphDeviceCtxtChg",
     #endif
     #if defined gfx90a || defined gfx942 || defined gfx950
         "=== SWDEV-443630 : Below test failed in stress test on 19/01/24 ===",
diff --git a/catch/include/hip_test_common.hh b/catch/include/hip_test_common.hh
index d2cd7ec4e..cf0b2f392 100644
--- a/catch/include/hip_test_common.hh
+++ b/catch/include/hip_test_common.hh
@@ -44,6 +44,26 @@ THE SOFTWARE.
 
 #define HIP_PRINT_STATUS(status) INFO(hipGetErrorName(status) << " at line: " << __LINE__);
 
+#define CHAR_BUF_SIZE 512  // size of the stack buffer DEBUG_PRINT formats into
+// CONSOLE_PRINT: printf-style print to stdout; fmt must be a string literal, '\n' is appended.
+#define CONSOLE_PRINT(fmt, ...)                                                                    \
+  do {                                                                                             \
+    std::printf(fmt "\n", ##__VA_ARGS__);                                                          \
+  } while (0)
+
+// DEBUG_PRINT: with ENABLE_DEBUG, prints immediately; otherwise records a Catch2 UNSCOPED_INFO()
+// (a scoped INFO() would expire inside the do/while) that the next assertion reports on failure.
+#if defined(ENABLE_DEBUG)
+#define DEBUG_PRINT(fmt, ...) CONSOLE_PRINT("[DEBUG]: " fmt, ##__VA_ARGS__)
+#else
+#define DEBUG_PRINT(fmt, ...)                                                                      \
+  do {                                                                                             \
+    char buf[CHAR_BUF_SIZE];                                                                       \
+    std::snprintf(buf, CHAR_BUF_SIZE, "[INFO]: " fmt, ##__VA_ARGS__);                              \
+    UNSCOPED_INFO(buf);                                                                            \
+  } while (0)
+#endif
+
 // Not thread-safe
 #define HIP_CHECK(error)                                                                           \
   {                                                                                                \
@@ -323,6 +343,26 @@ inline bool isPcieAtomicsSupported() {
   return pcieAtomics != 0;
 }
 
+// Returns true when every device can access every other device as a peer. On failure, d1/d2
+// receive the first (device, peer) pair without access; they are left untouched on success.
+inline bool isP2PSupported(int& d1, int& d2) {
+  const int num_devices = HipTest::getDeviceCount();
+  for (int i = 0; i < num_devices; ++i) {
+    for (int j = 0; j < num_devices; ++j) {
+      if (i == j) continue;
+      int canAccess = 0;
+      HIP_CHECK(hipDeviceCanAccessPeer(&canAccess, i, j));
+      if (!canAccess) {
+        d1 = i;
+        d2 = j;
+        // Stop at the first failure so the reported pair is the one that was actually rejected.
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
 inline bool areWarpMatchFunctionsSupported() {
   int matchFunctionsSupported = 1;
 #if HT_NVIDIA
@@ -516,6 +556,14 @@ class BlockingContext {
     return;                                                                                        \
   }
 
+// Skips the calling test (and returns from it) unless all device pairs support P2P access.
+#define CHECK_P2P_SUPPORT                                                                          \
+  int d1 = -1, d2 = -1;                                                                            \
+  if (!HipTest::isP2PSupported(d1, d2)) {                                                          \
+    std::string msg = "P2P access check failed between dev1:" + std::to_string(d1);                \
+    HipTest::HIP_SKIP_TEST((msg + ",dev2:" + std::to_string(d2)).c_str());                         \
+    return;                                                                                        \
+  }
 // This must be called in the beginning of warp test app's main() to indicate warp match functions
 // are supported.
 #define CHECK_WARP_MATCH_FUNCTIONS_SUPPORT                                                         \
diff --git a/catch/include/hip_test_kernels.hh b/catch/include/hip_test_kernels.hh
index 68f452ba5..e47614ee6 100644
--- a/catch/include/hip_test_kernels.hh
+++ b/catch/include/hip_test_kernels.hh
@@ -93,7 +93,12 @@ template <typename T> __global__ void vector_square(const T* A_d, T* C_d, size_t
   size_t gputhread = (blockIdx.x * blockDim.x + threadIdx.x);
   size_t stride = blockDim.x * gridDim.x;
   for (size_t i = gputhread; i < N_ELMTS; i += stride) {
+#if HT_AMD
+    T result = A_d[i] * A_d[i];
+    __hip_atomic_store(&C_d[i], result, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+#else
     C_d[i] = A_d[i] * A_d[i];
+#endif
   }
 }
 
diff --git a/catch/multiproc/hipIpcEventHandle.cc b/catch/multiproc/hipIpcEventHandle.cc
index bd6fdff29..499ef0862 100644
--- a/catch/multiproc/hipIpcEventHandle.cc
+++ b/catch/multiproc/hipIpcEventHandle.cc
@@ -196,6 +196,9 @@ void runMultiProcKernel(ipcEventInfo_t *shmEventInfo, int index) {
             }
         }
     }
+    for (int i = 1; i < g_processCnt; i++) {
+      HIP_CHECK(hipEventDestroy(event[i]));
+    }
   } else {
     hipEvent_t event;
     HIP_CHECK(hipEventCreateWithFlags(&event,
diff --git a/catch/perftests/CMakeLists.txt b/catch/perftests/CMakeLists.txt
index 38078c0a4..a6e4ecd91 100644
--- a/catch/perftests/CMakeLists.txt
+++ b/catch/perftests/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved.
+# Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -27,3 +27,4 @@ add_subdirectory(dispatch)
 add_subdirectory(compute)
 add_subdirectory(graph)
 add_subdirectory(event)
+add_subdirectory(vmm)
diff --git a/catch/perftests/compute/hipPerfDotProduct.cc b/catch/perftests/compute/hipPerfDotProduct.cc
index aad97f063..eddcf4bb2 100644
--- a/catch/perftests/compute/hipPerfDotProduct.cc
+++ b/catch/perftests/compute/hipPerfDotProduct.cc
@@ -18,10 +18,10 @@
  */
 
 /**
-* @addtogroup hipPerfDotProduct hipPerfDotProduct
-* @{
-* @ingroup perfComputeTest
-*/
+ * @addtogroup hipPerfDotProduct hipPerfDotProduct
+ * @{
+ * @ingroup perfComputeTest
+ */
 
 #include <hip_test_common.hh>
 #include <vector>
@@ -31,11 +31,9 @@
 using namespace std;
 
 template <unsigned int BLOCKSIZE>
-__launch_bounds__(BLOCKSIZE)
-__global__ void vectors_not_equal(int n,
-                                 const double* __restrict__ x,
-                                 const double* __restrict__ y,
-                                 double* __restrict__ workspace) {
+__launch_bounds__(BLOCKSIZE) __global__
+    void vectors_not_equal(int n, const double* __restrict__ x, const double* __restrict__ y,
+                           double* __restrict__ workspace) {
   int gid = blockIdx.x * blockDim.x + threadIdx.x;
 
   double sum = 0.0;
@@ -93,9 +91,8 @@ __global__ void vectors_not_equal(int n,
 }
 
 template <unsigned int BLOCKSIZE>
-__launch_bounds__(BLOCKSIZE)
-__global__ void vectors_equal(int n, const double* __restrict__ x,
-                                  double* __restrict__ workspace) {
+__launch_bounds__(BLOCKSIZE) __global__
+    void vectors_equal(int n, const double* __restrict__ x, double* __restrict__ workspace) {
   int gid = blockIdx.x * blockDim.x + threadIdx.x;
 
   double sum = 0.0;
@@ -129,7 +126,7 @@ __global__ void vectors_equal(int n, const double* __restrict__ x,
   __syncthreads();
 
   if (threadIdx.x < 8) {
-          sdata[threadIdx.x] += sdata[threadIdx.x + 8];
+    sdata[threadIdx.x] += sdata[threadIdx.x + 8];
   }
   __syncthreads();
 
@@ -149,12 +146,11 @@ __global__ void vectors_equal(int n, const double* __restrict__ x,
 
   if (threadIdx.x == 0) {
     workspace[blockIdx.x] = sdata[0];
-    }
+  }
 }
 
 template <unsigned int BLOCKSIZE>
-__launch_bounds__(BLOCKSIZE)
-__global__ void dot_reduction(double* __restrict__ workspace) {
+__launch_bounds__(BLOCKSIZE) __global__ void dot_reduction(double* __restrict__ workspace) {
   __shared__ double sdata[BLOCKSIZE];
 
   sdata[threadIdx.x] = workspace[threadIdx.x];
@@ -187,7 +183,8 @@ __global__ void dot_reduction(double* __restrict__ workspace) {
 
   if (threadIdx.x < 4) {
     sdata[threadIdx.x] += sdata[threadIdx.x + 4];
-  } __syncthreads();
+  }
+  __syncthreads();
 
   if (threadIdx.x < 2) {
     sdata[threadIdx.x] += sdata[threadIdx.x + 2];
@@ -203,8 +200,7 @@ __global__ void dot_reduction(double* __restrict__ workspace) {
   }
 }
 
-void computeDotProduct(int n, const double* x, const double* y, double& result,
-                      double* workspace) {
+void computeDotProduct(int n, const double* x, const double* y, double& result, double* workspace) {
   dim3 blocks(DOT_DIM);
   dim3 threadsPerBlock(DOT_DIM);
 
@@ -225,16 +221,16 @@ void computeDotProduct(int n, const double* x, const double* y, double& result,
 }
 
 /**
-* Test Description
-* ------------------------
-*  - Verify the device kernel results comparing it with the host results.
-* Test source
-* ------------------------
-*  - perftests/compute/hipPerfDotProduct.cc
-* Test requirements
-* ------------------------
-*  - HIP_VERSION >= 5.6
-*/
+ * Test Description
+ * ------------------------
+ *  - Verify the device kernel results comparing it with the host results.
+ * Test source
+ * ------------------------
+ *  - perftests/compute/hipPerfDotProduct.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.6
+ */
 
 TEST_CASE("Perf_hipPerfDotProduct") {
   int nGpu = 0;
@@ -252,120 +248,120 @@ TEST_CASE("Perf_hipPerfDotProduct") {
   for (unsigned int testCase = 0; testCase < 3; testCase++) {
     vector<int> vectorSize = {200, 300, 50};
     switch (testCase) {
-    case 0:
-    nx = vectorSize[0];
-    ny = vectorSize[0];
-    nz = vectorSize[0];
-    break;
-
-    case 1:
-    nx = vectorSize[1];
-    ny = vectorSize[1];
-    nz = vectorSize[1];
-    break;
-
-    case 2:
-    nx = vectorSize[0];
-    ny = vectorSize[1];
-    nz = vectorSize[2];
-    break;
-
-    default:
-      break;
-  }
-
-  int trials = 200;
-  int size = nx * ny * nz;
+      case 0:
+        nx = vectorSize[0];
+        ny = vectorSize[0];
+        nz = vectorSize[0];
+        break;
+
+      case 1:
+        nx = vectorSize[1];
+        ny = vectorSize[1];
+        nz = vectorSize[1];
+        break;
+
+      case 2:
+        nx = vectorSize[0];
+        ny = vectorSize[1];
+        nz = vectorSize[2];
+        break;
+
+      default:
+        break;
+    }
 
-  vector<double> hx(size);
-  vector<double> hy(size);
-  double hresult_xy = 0.0;
-  double hresult_xx = 0.0;
+    int trials = 200;
+    int size = nx * ny * nz;
 
-  srand(time(NULL));
+    vector<double> hx(size);
+    vector<double> hy(size);
+    double hresult_xy = 0.0;
+    double hresult_xx = 0.0;
 
-  for (int i = 0; i < size; ++i) {
-    hx[i] = 2.0 * static_cast<double>(rand()) / static_cast<double>(RAND_MAX) - 1.0;
-    hy[i] = 2.0 * static_cast<double>(rand()) / static_cast<double>(RAND_MAX) - 1.0;
+    srand(time(NULL));
 
-    hresult_xy += hx[i] * hy[i];
-    hresult_xx += hx[i] * hx[i];
-  }
+    for (int i = 0; i < size; ++i) {
+      hx[i] = 2.0 * static_cast<double>(rand()) / static_cast<double>(RAND_MAX) - 1.0;
+      hy[i] = 2.0 * static_cast<double>(rand()) / static_cast<double>(RAND_MAX) - 1.0;
 
-  double* dx;
-  double* dy;
-  double* workspace;
-  double  dresult;
-
-  HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&dx), sizeof(double) * size));
-  HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&dy), sizeof(double) * size));
-  HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&workspace), sizeof(double) * DOT_DIM));
+      hresult_xy += hx[i] * hy[i];
+      hresult_xx += hx[i] * hx[i];
+    }
 
-  HIP_CHECK(hipMemcpy(dx, hx.data(), sizeof(double) * size, hipMemcpyHostToDevice));
-  HIP_CHECK(hipMemcpy(dy, hy.data(), sizeof(double) * size, hipMemcpyHostToDevice));
+    double* dx;
+    double* dy;
+    double* workspace;
+    double dresult;
 
-  // Warm up
-  computeDotProduct(size, dx, dy, dresult, workspace);
-  computeDotProduct(size, dx, dy, dresult, workspace);
-  computeDotProduct(size, dx, dy, dresult, workspace);
+    HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&dx), sizeof(double) * size));
+    HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&dy), sizeof(double) * size));
+    HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&workspace), sizeof(double) * DOT_DIM));
 
-  // Timed run for <x,y>
-  HIP_CHECK(hipDeviceSynchronize());
-  auto all_start = std::chrono::steady_clock::now();
+    HIP_CHECK(hipMemcpy(dx, hx.data(), sizeof(double) * size, hipMemcpyHostToDevice));
+    HIP_CHECK(hipMemcpy(dy, hy.data(), sizeof(double) * size, hipMemcpyHostToDevice));
 
-  for (int i = 0; i < trials; ++i) {
+    // Warm up
+    computeDotProduct(size, dx, dy, dresult, workspace);
+    computeDotProduct(size, dx, dy, dresult, workspace);
     computeDotProduct(size, dx, dy, dresult, workspace);
-  }
 
-  float time = 0;
-  auto all_end = std::chrono::steady_clock::now();
-  std::chrono::duration<double> all_kernel_time = all_end - all_start;
-  time = all_kernel_time.count();
+    // Timed run for <x,y>
+    HIP_CHECK(hipDeviceSynchronize());
+    auto all_start = std::chrono::steady_clock::now();
 
-  time /= trials;
+    for (int i = 0; i < trials; ++i) {
+      computeDotProduct(size, dx, dy, dresult, workspace);
+    }
 
-  double bw = sizeof(double) * size * 2.0 / 1e9;
-  double gf = 2.0 * size / 1e9;
+    auto all_end = std::chrono::steady_clock::now();
+    // Millisecond period: the report labels this value "msec" and divides it by 1e3 for rates.
+    std::chrono::duration<double, std::milli> all_kernel_time = all_end - all_start;
+    float time = all_kernel_time.count();
 
-  cout << "\nVector Size: " << size << "\n[ddot] <x,y> " << time << "msec ;" << bw/ (time / 1e3) << " GByte/s ;"
-       << gf/(time / 1e3) << " GFlop/s" << endl;
+    time /= trials;
 
-  // Verify the device kernel results comparing it with the host results
-  REQUIRE(std::abs(dresult - hresult_xy) < std::max(dresult * 1e-10, 1e-8));
+    double bw = sizeof(double) * size * 2.0 / 1e9;
+    double gf = 2.0 * size / 1e9;
 
-  // Warm up
-  computeDotProduct(size, dx, dx, dresult, workspace);
-  computeDotProduct(size, dx, dx, dresult, workspace);
-  computeDotProduct(size, dx, dx, dresult, workspace);
+    CONSOLE_PRINT("\nVector Size: %d\n[ddot] <x,y> %.6f msec ; %.6f GByte/s ; %.6f GFlop/s", size,
+                  time, bw / (time / 1e3), gf / (time / 1e3));
 
-  // Timed run for <x,x>
-  HIP_CHECK(hipDeviceSynchronize());
-  all_start = std::chrono::steady_clock::now();
+    // Verify against the host result; <x,y> may be negative, so scale the tolerance by |dresult|.
+    REQUIRE(std::abs(dresult - hresult_xy) < std::max(std::abs(dresult) * 1e-10, 1e-8));
 
-  for (int i = 0; i < trials; ++i) {
+    // Warm up
     computeDotProduct(size, dx, dx, dresult, workspace);
-  }
+    computeDotProduct(size, dx, dx, dresult, workspace);
+    computeDotProduct(size, dx, dx, dresult, workspace);
+
+    // Timed run for <x,x>
+    HIP_CHECK(hipDeviceSynchronize());
+    all_start = std::chrono::steady_clock::now();
 
-  all_end = std::chrono::steady_clock::now();
-  all_kernel_time = all_end - all_start;
-  time = all_kernel_time.count();
+    for (int i = 0; i < trials; ++i) {
+      computeDotProduct(size, dx, dx, dresult, workspace);
+    }
+
+    all_end = std::chrono::steady_clock::now();
+    all_kernel_time = all_end - all_start;
+    time = all_kernel_time.count();
 
-  time /= trials;
-  bw = sizeof(double) * size / 1e9;
+    time /= trials;
+    bw = sizeof(double) * size / 1e9;
 
-  cout << "[ddot] <x,y> " << time << "msec ;" << bw/ (time / 1e3) << " GByte/s ;"
-       << gf/(time / 1e3) << " GFlop/s" << endl;
+    CONSOLE_PRINT("[ddot] <x,x> %.6f msec ; %.6f GByte/s ; %.6f GFlop/s", time, bw / (time / 1e3),
+                  gf / (time / 1e3));  // this timed run is the <x,x> product
 
-  // Verify the device kernel results comparing it with the host results
-  REQUIRE(abs(dresult - hresult_xx) < max(dresult * 1e-10, 1e-8));
+    // Verify the device kernel results comparing it with the host results
+    REQUIRE(abs(dresult - hresult_xx) < max(dresult * 1e-10, 1e-8));
 
-  HIP_CHECK(hipFree(dx));
-  HIP_CHECK(hipFree(dy));
-  HIP_CHECK(hipFree(workspace));
+    HIP_CHECK(hipFree(dx));
+    HIP_CHECK(hipFree(dy));
+    HIP_CHECK(hipFree(workspace));
   }
 }
 
 /**
-* End doxygen group perfComputeTest.
-* @}
-*/
+ * End doxygen group perfComputeTest.
+ * @}
+ */
diff --git a/catch/perftests/compute/hipPerfMandelbrot.cc b/catch/perftests/compute/hipPerfMandelbrot.cc
index a500b7df3..ef007e06c 100644
--- a/catch/perftests/compute/hipPerfMandelbrot.cc
+++ b/catch/perftests/compute/hipPerfMandelbrot.cc
@@ -18,10 +18,10 @@
  */
 
 /**
-* @addtogroup hipPerfMandelbrot hipPerfMandelbrot
-* @{
-* @ingroup perfComputeTest
-*/
+ * @addtogroup hipPerfMandelbrot hipPerfMandelbrot
+ * @{
+ * @ingroup perfComputeTest
+ */
 
 #include <hip_test_common.hh>
 #include <hip/hip_vector_types.h>
@@ -45,36 +45,35 @@ coordRec coords[] = {
 static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
 
 template <typename T>
-__global__ void float_mad_kernel(uint *out, uint width, T xPos, T yPos,
-                                   T xStep, T yStep, uint maxIter) {
+__global__ void float_mad_kernel(uint* out, uint width, T xPos, T yPos, T xStep, T yStep,
+                                 uint maxIter) {
   int tid = (blockIdx.x * blockDim.x + threadIdx.x);
   int i = tid % width;
   int j = tid / width;
-  float x0 = static_cast<float>(xPos + xStep*i);
-  float y0 = static_cast<float>(yPos + yStep*j);
+  float x0 = static_cast<float>(xPos + xStep * i);
+  float y0 = static_cast<float>(yPos + yStep * j);
 
   float x = x0;
   float y = y0;
 
   uint iter = 0;
   float tmp;
-  for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
+  for (iter = 0; (x * x + y * y <= 4.0f) && (iter < maxIter); iter++) {
     tmp = x;
     x = fma(-y, y, fma(x, x, x0));
-    y = fma(2.0f*tmp, y, y0);
+    y = fma(2.0f * tmp, y, y0);
   }
   out[tid] = iter;
 }
 
 template <typename T>
-__global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos,
-    T yPos, T xStep, T yStep, uint maxIter) {
-
+__global__ void float_mandel_unroll_kernel(uint* out, uint width, T xPos, T yPos, T xStep, T yStep,
+                                           uint maxIter) {
   int tid = (blockIdx.x * blockDim.x + threadIdx.x);
   int i = tid % width;
   int j = tid / width;
-  float x0 = static_cast<float>(xPos + xStep*static_cast<float>(i));
-  float y0 = static_cast<float>(yPos + yStep*static_cast<float>(j));
+  float x0 = static_cast<float>(xPos + xStep * static_cast<float>(i));
+  float y0 = static_cast<float>(yPos + yStep * static_cast<float>(j));
 
   float x = x0;
   float y = y0;
@@ -84,72 +83,71 @@ __global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos,
   float tmp;
   int stay;
   int ccount = 0;
-  stay = (x*x+y*y) <= 4.0;
+  stay = (x * x + y * y) <= 4.0;
   float savx = x;
   float savy = y;
 #ifdef FAST
-  for (iter = 0; (iter < maxIter); iter+=16) {
+  for (iter = 0; (iter < maxIter); iter += 16) {
 #else
-  for (iter = 0; stay && (iter < maxIter); iter+=16) {
+  for (iter = 0; stay && (iter < maxIter); iter += 16) {
 #endif
     x = savx;
     y = savy;
 
     // Two iterations
-    tmp =  fma(-y, y, fma(x, x, x0));
-    y =  fma(2.0f*x, y, y0);
-    x =  fma(-y, y, fma(tmp, tmp, x0));
-    y =  fma(2.0f*tmp, y, y0);
+    tmp = fma(-y, y, fma(x, x, x0));
+    y = fma(2.0f * x, y, y0);
+    x = fma(-y, y, fma(tmp, tmp, x0));
+    y = fma(2.0f * tmp, y, y0);
 
     // Two iterations
-    tmp =  fma(-y, y, fma(x, x, x0));
-    y =  fma(2.0f*x, y, y0);
-    x =  fma(-y, y, fma(tmp, tmp, x0));
-    y =  fma(2.0f*tmp, y, y0);
+    tmp = fma(-y, y, fma(x, x, x0));
+    y = fma(2.0f * x, y, y0);
+    x = fma(-y, y, fma(tmp, tmp, x0));
+    y = fma(2.0f * tmp, y, y0);
 
     // Two iterations
-    tmp =  fma(-y, y, fma(x, x, x0));
-    y =  fma(2.0f*x, y, y0);
-    x =  fma(-y, y, fma(tmp, tmp, x0));
-    y =  fma(2.0f*tmp, y, y0);
+    tmp = fma(-y, y, fma(x, x, x0));
+    y = fma(2.0f * x, y, y0);
+    x = fma(-y, y, fma(tmp, tmp, x0));
+    y = fma(2.0f * tmp, y, y0);
 
     // Two iterations
-    tmp =  fma(-y, y, fma(x, x, x0));
-    y =  fma(2.0f*x, y, y0);
-    x =  fma(-y, y, fma(tmp, tmp, x0));
-    y =  fma(2.0f*tmp, y, y0);
+    tmp = fma(-y, y, fma(x, x, x0));
+    y = fma(2.0f * x, y, y0);
+    x = fma(-y, y, fma(tmp, tmp, x0));
+    y = fma(2.0f * tmp, y, y0);
 
     // Two iterations
-    tmp =  fma(-y, y, fma(x, x, x0));
-    y =  fma(2.0f*x, y, y0);
-    x =  fma(-y, y, fma(tmp, tmp, x0));
-    y =  fma(2.0f*tmp, y, y0);
+    tmp = fma(-y, y, fma(x, x, x0));
+    y = fma(2.0f * x, y, y0);
+    x = fma(-y, y, fma(tmp, tmp, x0));
+    y = fma(2.0f * tmp, y, y0);
 
     // Two iterations
-    tmp =  fma(-y, y, fma(x, x, x0));
-    y =  fma(2.0f*x, y, y0);
-    x =  fma(-y, y, fma(tmp, tmp, x0));
-    y =  fma(2.0f*tmp, y, y0);
+    tmp = fma(-y, y, fma(x, x, x0));
+    y = fma(2.0f * x, y, y0);
+    x = fma(-y, y, fma(tmp, tmp, x0));
+    y = fma(2.0f * tmp, y, y0);
 
     // Two iterations
-    tmp =  fma(-y, y, fma(x, x, x0));
-    y =  fma(2.0f*x, y, y0);
-    x =  fma(-y, y, fma(tmp, tmp, x0));
-    y =  fma(2.0f*tmp, y, y0);
+    tmp = fma(-y, y, fma(x, x, x0));
+    y = fma(2.0f * x, y, y0);
+    x = fma(-y, y, fma(tmp, tmp, x0));
+    y = fma(2.0f * tmp, y, y0);
 
     // Two iterations
-    tmp =  fma(-y, y, fma(x, x, x0));
-    y =  fma(2.0f*x, y, y0);
-    x =  fma(-y, y, fma(tmp, tmp, x0));
-    y =  fma(2.0f*tmp, y, y0);
+    tmp = fma(-y, y, fma(x, x, x0));
+    y = fma(2.0f * x, y, y0);
+    x = fma(-y, y, fma(tmp, tmp, x0));
+    y = fma(2.0f * tmp, y, y0);
 
-    stay = (x*x+y*y) <= 4.0;
+    stay = (x * x + y * y) <= 4.0;
     savx = (stay ? x : savx);
     savy = (stay ? y : savy);
-    ccount += stay*16;
+    ccount += stay * 16;
 #ifdef FAST
-    if (!stay)
-      break;
+    if (!stay) break;
 #endif
   }
   // Handle remainder
@@ -158,10 +156,10 @@ __global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos,
     do {
       x = savx;
       y = savy;
-      stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter);
+      stay = ((x * x + y * y) <= 4.0) && (ccount < maxIter);
       tmp = x;
-      x =  fma(-y, y, fma(x, x, x0));
-      y =  fma(2.0f*tmp, y, y0);
+      x = fma(-y, y, fma(x, x, x0));
+      y = fma(2.0f * tmp, y, y0);
       ccount += stay;
       iter--;
       savx = (stay ? x : savx);
@@ -172,36 +170,36 @@ __global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos,
 }
 
 template <typename T>
-__global__ void double_mad_kernel(uint *out, uint width, T xPos,  T yPos, T xStep, T yStep,
-                                   uint maxIter) {
+__global__ void double_mad_kernel(uint* out, uint width, T xPos, T yPos, T xStep, T yStep,
+                                  uint maxIter) {
   int tid = (blockIdx.x * blockDim.x + threadIdx.x);
   int i = tid % width;
   int j = tid / width;
-  double x0 = static_cast<double>(xPos + xStep*i);
-  double y0 = static_cast<double>(yPos + yStep*j);
+  double x0 = static_cast<double>(xPos + xStep * i);
+  double y0 = static_cast<double>(yPos + yStep * j);
 
   double x = x0;
   double y = y0;
 
   uint iter = 0;
   double tmp;
-  for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
+  for (iter = 0; (x * x + y * y <= 4.0f) && (iter < maxIter); iter++) {
     tmp = x;
-    x = fma(-y, y,fma(x, x, x0));
-    y = fma(2.0f*tmp, y, y0);
+    x = fma(-y, y, fma(x, x, x0));
+    y = fma(2.0f * tmp, y, y0);
   }
   out[tid] = iter;
 };
 
 template <typename T>
-__global__ void double_mandel_unroll_kernel(uint *out, uint width, T xPos,
-                  T yPos, T xStep, T yStep, uint maxIter) {
+__global__ void double_mandel_unroll_kernel(uint* out, uint width, T xPos, T yPos, T xStep, T yStep,
+                                            uint maxIter) {
   int tid = (blockIdx.x * blockDim.x + threadIdx.x);
 
   int i = tid % width;
   int j = tid / width;
-  double x0 = static_cast<double>(xPos + xStep*static_cast<double>(i));
-  double y0 = static_cast<double>(yPos + yStep*static_cast<double>(j));
+  double x0 = static_cast<double>(xPos + xStep * static_cast<double>(i));
+  double y0 = static_cast<double>(yPos + yStep * static_cast<double>(j));
 
   double x = x0;
   double y = y0;
@@ -211,13 +209,13 @@ __global__ void double_mandel_unroll_kernel(uint *out, uint width, T xPos,
   double tmp;
   int stay;
   int ccount = 0;
-  stay = (x*x+y*y) <= 4.0;
+  stay = (x * x + y * y) <= 4.0;
   double savx = x;
   double savy = y;
 #ifdef FAST
-  for (iter = 0; (iter < maxIter); iter+=16)
+  for (iter = 0; (iter < maxIter); iter += 16)
 #else
-  for (iter = 0; stay && (iter < maxIter); iter+=16)
+  for (iter = 0; stay && (iter < maxIter); iter += 16)
 #endif
   {
     x = savx;
@@ -225,141 +223,131 @@ __global__ void double_mandel_unroll_kernel(uint *out, uint width, T xPos,
 
     // Two iterations
     tmp = fma(-y, y, fma(x, x, x0));
-    y =   fma(2.0f*x, y, y0);
-    x =   fma(-y, y, fma(tmp, tmp, x0));
-    y =   fma(2.0f*tmp, y, y0);
+    y = fma(2.0f * x, y, y0);
+    x = fma(-y, y, fma(tmp, tmp, x0));
+    y = fma(2.0f * tmp, y, y0);
 
     // Two iterations
     tmp = fma(-y, y, fma(x, x, x0));
-    y =   fma(2.0f*x, y, y0);
-    x =   fma(-y, y, fma(tmp, tmp, x0));
-    y =   fma(2.0f*tmp, y, y0);
+    y = fma(2.0f * x, y, y0);
+    x = fma(-y, y, fma(tmp, tmp, x0));
+    y = fma(2.0f * tmp, y, y0);
 
     // Two iterations
     tmp = fma(-y, y, fma(x, x, x0));
-    y =   fma(2.0f*x, y, y0);
-    x =   fma(-y, y, fma(tmp, tmp, x0));
-    y =   fma(2.0f*tmp, y, y0);
+    y = fma(2.0f * x, y, y0);
+    x = fma(-y, y, fma(tmp, tmp, x0));
+    y = fma(2.0f * tmp, y, y0);
 
     // Two iterations
-    tmp =  fma(-y, y, fma(x, x, x0));
-    y =    fma(2.0f*x,y,y0);
-    x =    fma(-y, y, fma(tmp, tmp, x0));
-    y =    fma(2.0f*tmp, y, y0);
+    tmp = fma(-y, y, fma(x, x, x0));
+    y = fma(2.0f * x, y, y0);
+    x = fma(-y, y, fma(tmp, tmp, x0));
+    y = fma(2.0f * tmp, y, y0);
 
     // Two iterations
-    tmp =  fma(-y, y, fma(x, x, x0));
-    y =    fma(2.0f*x, y, y0);
-    x =    fma(-y, y, fma(tmp, tmp, x0));
-    y =    fma(2.0f*tmp, y, y0);
+    tmp = fma(-y, y, fma(x, x, x0));
+    y = fma(2.0f * x, y, y0);
+    x = fma(-y, y, fma(tmp, tmp, x0));
+    y = fma(2.0f * tmp, y, y0);
 
     // Two iterations
-    tmp =  fma(-y, y, fma(x, x, x0));
-    y =    fma(2.0f*x, y, y0);
-    x =    fma(-y, y, fma(tmp, tmp, x0));
-    y =    fma(2.0f*tmp, y, y0);
+    tmp = fma(-y, y, fma(x, x, x0));
+    y = fma(2.0f * x, y, y0);
+    x = fma(-y, y, fma(tmp, tmp, x0));
+    y = fma(2.0f * tmp, y, y0);
 
     // Two iterations
-    tmp =  fma(-y, y, fma(x, x, x0));
-    y =    fma(2.0f*x, y, y0);
-    x =    fma(-y, y, fma(tmp, tmp, x0));
-    y =    fma(2.0f*tmp, y, y0);
+    tmp = fma(-y, y, fma(x, x, x0));
+    y = fma(2.0f * x, y, y0);
+    x = fma(-y, y, fma(tmp, tmp, x0));
+    y = fma(2.0f * tmp, y, y0);
 
     // Two iterations
-    tmp =  fma(-y, y, fma(x, x, x0));
-    y =    fma(2.0f*x, y, y0);
-    x =    fma(-y, y, fma(tmp, tmp, x0));
-    y =    fma(2.0f*tmp, y, y0);
+    tmp = fma(-y, y, fma(x, x, x0));
+    y = fma(2.0f * x, y, y0);
+    x = fma(-y, y, fma(tmp, tmp, x0));
+    y = fma(2.0f * tmp, y, y0);
 
-    stay = (x*x+y*y) <= 4.0;
+    stay = (x * x + y * y) <= 4.0;
     savx = (stay ? x : savx);
     savy = (stay ? y : savy);
-    ccount += stay*16;
+    ccount += stay * 16;
 #ifdef FAST
-    if (!stay)
-      break;
+    if (!stay) break;
 #endif
-    }
+  }
   // Handle remainder
-    if (!stay) {
-      iter = 16;
-      do {
-        x = savx;
-        y = savy;
-        stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter);
-        tmp = x;
-        x =  fma(-y,y, fma(x, x, x0));
-        y =  fma(2.0f*tmp,y,y0);
-        ccount += stay;
-        iter--;
-        savx = (stay ? x : savx);
-        savy = (stay ? y : savy);
-      }
-      while (stay && iter);
-    }
-    out[tid] = (uint)ccount;
+  if (!stay) {
+    iter = 16;
+    do {
+      x = savx;
+      y = savy;
+      stay = ((x * x + y * y) <= 4.0) && (ccount < maxIter);
+      tmp = x;
+      x = fma(-y, y, fma(x, x, x0));
+      y = fma(2.0f * tmp, y, y0);
+      ccount += stay;
+      iter--;
+      savx = (stay ? x : savx);
+      savy = (stay ? y : savy);
+    } while (stay && iter);
+  }
+  out[tid] = (uint)ccount;
 };
 
 // Expected results for each kernel run at each coord
 unsigned long long expectedIters[] = {
-    203277748ull,  2147483648ull, 120254651ull,  203277748ull,  2147483648ull,
-    120254651ull,  203277748ull,  2147483648ull, 120254651ull,  203315114ull,
-    2147483648ull, 120042599ull,  203315114ull,  2147483648ull, 120042599ull,
-    203280620ull,  2147483648ull, 120485704ull,  203280620ull,  2147483648ull,
-    120485704ull,  203280620ull,  2147483648ull, 120485704ull,  203315114ull,
-    2147483648ull, 120042599ull,  203315114ull,  2147483648ull, 120042599ull};
+    203277748ull, 2147483648ull, 120254651ull, 203277748ull, 2147483648ull, 120254651ull,
+    203277748ull, 2147483648ull, 120254651ull, 203315114ull, 2147483648ull, 120042599ull,
+    203315114ull, 2147483648ull, 120042599ull, 203280620ull, 2147483648ull, 120485704ull,
+    203280620ull, 2147483648ull, 120485704ull, 203280620ull, 2147483648ull, 120485704ull,
+    203315114ull, 2147483648ull, 120042599ull, 203315114ull, 2147483648ull, 120042599ull};
 
 class hipPerfMandelBrot {
  public:
   hipPerfMandelBrot();
   ~hipPerfMandelBrot();
 
-  void setNumKernels(unsigned int num) {
-    numKernels = num;
-  }
+  void setNumKernels(unsigned int num) { numKernels = num; }
 
-  unsigned int getNumKernels() {
-    return numKernels;
-  }
+  unsigned int getNumKernels() { return numKernels; }
 
-  void setNumStreams(unsigned int num) {
-    numStreams = num;
-  }
-  unsigned int getNumStreams() {
-    return numStreams;
-  }
+  void setNumStreams(unsigned int num) { numStreams = num; }
+  unsigned int getNumStreams() { return numStreams; }
 
   void open(int deviceID);
   bool run(unsigned int testCase);
   void printResults(void);
 
   // array of funtion pointers
-  typedef void (hipPerfMandelBrot::*funPtr)(uint *out, uint width, float xPos,  float yPos,
-                 float xStep, float yStep, uint maxIter,  hipStream_t* streams, int blocks,
-                 int threads_per_block, int kernelCnt);
+  typedef void (hipPerfMandelBrot::*funPtr)(uint* out, uint width, float xPos, float yPos,
+                                            float xStep, float yStep, uint maxIter,
+                                            hipStream_t* streams, int blocks, int threads_per_block,
+                                            int kernelCnt);
 
   // Wrappers
-  void float_mad(uint *out, uint width, float xPos,  float yPos,
-                  float xStep, float yStep, uint maxIter, hipStream_t* streams,
-                  int blocks, int threads_per_block, int kernelCnt);
+  void float_mad(uint* out, uint width, float xPos, float yPos, float xStep, float yStep,
+                 uint maxIter, hipStream_t* streams, int blocks, int threads_per_block,
+                 int kernelCnt);
 
-  void float_mandel_unroll(uint *out, uint width, float xPos,  float yPos,
-                            float xStep, float yStep, uint maxIter, hipStream_t* streams,
-                            int blocks, int threads_per_block, int kernelCnt);
+  void float_mandel_unroll(uint* out, uint width, float xPos, float yPos, float xStep, float yStep,
+                           uint maxIter, hipStream_t* streams, int blocks, int threads_per_block,
+                           int kernelCnt);
 
-  void double_mad(uint *out, uint width, float xPos,  float yPos, float xStep,
-                   float yStep, uint maxIter, hipStream_t* streams, int blocks,
-                   int threads_per_block, int kernelCnt);
+  void double_mad(uint* out, uint width, float xPos, float yPos, float xStep, float yStep,
+                  uint maxIter, hipStream_t* streams, int blocks, int threads_per_block,
+                  int kernelCnt);
 
-  void double_mandel_unroll(uint *out, uint width, float xPos,  float yPos, float xStep,
-                             float yStep, uint maxIter, hipStream_t* streams, int blocks,
-                             int threads_per_block, int kernelCnt);
+  void double_mandel_unroll(uint* out, uint width, float xPos, float yPos, float xStep, float yStep,
+                            uint maxIter, hipStream_t* streams, int blocks, int threads_per_block,
+                            int kernelCnt);
 
   hipStream_t streams[2];
 
  private:
-  void setData(void *ptr, unsigned int value);
-  void checkData(uint *ptr);
+  void setData(void* ptr, unsigned int value);
+  void checkData(uint* ptr);
 
   unsigned int numKernels;
   unsigned int numStreams;
@@ -387,9 +375,9 @@ void hipPerfMandelBrot::open(int deviceId) {
   HIP_CHECK(hipSetDevice(deviceId));
   hipDeviceProp_t props;
   HIP_CHECK(hipGetDeviceProperties(&props, deviceId));
-  std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
-    << " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId
-    << std::endl;
+
+  CONSOLE_PRINT("info: running on bus 0x%x %s with %d CUs and device id: %d", props.pciBusID,
+                props.name, props.multiProcessorCount, deviceId);  // macro appends the newline
 
   numCUs = props.multiProcessorCount;
 }
@@ -397,52 +385,52 @@ void hipPerfMandelBrot::open(int deviceId) {
 void hipPerfMandelBrot::printResults() {
   int numStreams = getNumStreams();
 
-  std::cout << "\n" <<"Measured perf for kernels in GFLOPS on "
-            << numStreams << " streams (s)" <<  std::endl;
+  CONSOLE_PRINT("Measured perf for kernels in GFLOPS on %d streams (s)", numStreams);
 
-  std::map<std::string, std::vector<double>>:: iterator itr;
+  std::map<std::string, std::vector<double>>::iterator itr;
   for (itr = results.begin(); itr != results.end(); itr++) {
-          std::cout << "\n" << std::setw(20) << itr->first << " ";
-          for (auto i : results[itr->first]) {
-            std::cout << std::setw(10) << i << " ";
-            }
-     }
+    std::printf("\n%20s ", itr->first.c_str());  // one row per kernel, name padded as before
+    for (auto i : results[itr->first]) {
+      std::printf("%10f ", i);  // raw printf: CONSOLE_PRINT would end the row after each value
+    }
+  }
   results.clear();
-  std::cout << std::endl;
+  std::printf("\n");  // terminate the last row
 }
 
 // Wrappers for the kernel launches
-void hipPerfMandelBrot::float_mad(uint *out, uint width, float xPos,  float yPos, float xStep,
-                                   float yStep, uint maxIter, hipStream_t* streams,
-                                   int blocks, int threads_per_block, int kernelCnt) {
+void hipPerfMandelBrot::float_mad(uint* out, uint width, float xPos, float yPos, float xStep,
+                                  float yStep, uint maxIter, hipStream_t* streams, int blocks,
+                                  int threads_per_block, int kernelCnt) {
   int streamCnt = getNumStreams();
   hipLaunchKernelGGL(float_mad_kernel<float>, dim3(blocks), dim3(threads_per_block), 0,
-                      streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep,
-                      maxIter);
+                     streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter);
 }
 
-void hipPerfMandelBrot::float_mandel_unroll(uint *out, uint width, float xPos,  float yPos,
-                             float xStep, float yStep, uint maxIter, hipStream_t * streams,
-                             int blocks, int threads_per_block, int kernelCnt) {
+void hipPerfMandelBrot::float_mandel_unroll(uint* out, uint width, float xPos, float yPos,
+                                            float xStep, float yStep, uint maxIter,
+                                            hipStream_t* streams, int blocks, int threads_per_block,
+                                            int kernelCnt) {
   int streamCnt = getNumStreams();
   hipLaunchKernelGGL(float_mandel_unroll_kernel<float>, dim3(blocks), dim3(threads_per_block), 0,
-                  streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter);
+                     streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter);
 }
 
-void hipPerfMandelBrot::double_mad(uint *out, uint width, float xPos,  float yPos,
-                               float xStep, float yStep, uint maxIter, hipStream_t * streams,
-                               int blocks, int threads_per_block, int kernelCnt) {
+void hipPerfMandelBrot::double_mad(uint* out, uint width, float xPos, float yPos, float xStep,
+                                   float yStep, uint maxIter, hipStream_t* streams, int blocks,
+                                   int threads_per_block, int kernelCnt) {
   int streamCnt = getNumStreams();
   hipLaunchKernelGGL(double_mad_kernel<double>, dim3(blocks), dim3(threads_per_block), 0,
-                  streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter);
+                     streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter);
 }
 
-void hipPerfMandelBrot::double_mandel_unroll(uint *out, uint width, float xPos,  float yPos,
-                              float xStep, float yStep, uint maxIter, hipStream_t * streams,
-                              int blocks, int threads_per_block, int kernelCnt) {
+void hipPerfMandelBrot::double_mandel_unroll(uint* out, uint width, float xPos, float yPos,
+                                             float xStep, float yStep, uint maxIter,
+                                             hipStream_t* streams, int blocks,
+                                             int threads_per_block, int kernelCnt) {
   int streamCnt = getNumStreams();
   hipLaunchKernelGGL(float_mandel_unroll_kernel<double>, dim3(blocks), dim3(threads_per_block), 0,
-                  streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter);
+                     streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter);
 }
 
 bool hipPerfMandelBrot::run(unsigned int testCase) {
@@ -450,18 +438,18 @@ bool hipPerfMandelBrot::run(unsigned int testCase) {
   coordIdx = testCase % numCoords;
 
   funPtr p[] = {&hipPerfMandelBrot::float_mad, &hipPerfMandelBrot::float_mandel_unroll,
-               &hipPerfMandelBrot::double_mad, &hipPerfMandelBrot::double_mandel_unroll};
+                &hipPerfMandelBrot::double_mad, &hipPerfMandelBrot::double_mandel_unroll};
 
   // Maximum iteration count
   maxIter = 32768;
 
-  uint ** hPtr = new uint *[numKernels];
-  uint ** dPtr = new uint *[numKernels];
+  uint** hPtr = new uint*[numKernels];
+  uint** dPtr = new uint*[numKernels];
 
   // Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once.
   width_ = 256;
 
-  bufSize = width_  * width_ * sizeof(uint);
+  bufSize = width_ * width_ * sizeof(uint);
 
   // Create streams for concurrency
   for (uint i = 0; i < numStreams; i++) {
@@ -470,15 +458,15 @@ bool hipPerfMandelBrot::run(unsigned int testCase) {
 
   // Allocate memory on the host and device
   for (uint i = 0; i < numKernels; i++) {
-    HIP_CHECK(hipHostMalloc(reinterpret_cast<void **>(&hPtr[i]), bufSize, hipHostMallocDefault));
+    HIP_CHECK(hipHostMalloc(reinterpret_cast<void**>(&hPtr[i]), bufSize, hipHostMallocDefault));
     setData(hPtr[i], 0xdeadbeef);
-    HIP_CHECK(hipMalloc(reinterpret_cast<uint **>(&dPtr[i]), bufSize))
+    HIP_CHECK(hipMalloc(reinterpret_cast<uint**>(&dPtr[i]), bufSize))
   }
 
   // Prepare kernel launch parameters
-  int threads = (bufSize/sizeof(uint));
-  int threads_per_block  = 64;
-  int blocks = (threads/threads_per_block) + (threads % threads_per_block);
+  int threads = (bufSize / sizeof(uint));
+  int threads_per_block = 64;
+  int blocks = (threads / threads_per_block) + (threads % threads_per_block);
 
   // Copy memory asynchronously and concurrently from host to device
   for (uint i = 0; i < numKernels; i++) {
@@ -489,90 +477,88 @@ bool hipPerfMandelBrot::run(unsigned int testCase) {
   HIP_CHECK(hipStreamSynchronize(0));
 
   int kernelIdx;
-  if(testCase == 0 || testCase == 5 || testCase == 10) {
+  if (testCase == 0 || testCase == 5 || testCase == 10) {
     kernelIdx = 0;
-  } else if(testCase == 1 || testCase == 6 || testCase == 11) {
+  } else if (testCase == 1 || testCase == 6 || testCase == 11) {
     kernelIdx = 1;
-  } else if(testCase == 2 || testCase == 7 || testCase == 12) {
+  } else if (testCase == 2 || testCase == 7 || testCase == 12) {
     kernelIdx = 2;
-  } else if(testCase == 3 || testCase == 8 || testCase == 13){
+  } else if (testCase == 3 || testCase == 8 || testCase == 13) {
     kernelIdx = 3;
   }
   double totalTime = 0.0;
   for (unsigned int k = 0; k < numLoops; k++) {
-  if ((testCase == 0 || testCase == 1 || testCase == 2 ||
-                  testCase == 5 || testCase == 6 || testCase == 7 ||
-                  testCase == 10 || testCase == 11 || testCase == 12)) {
-  float xStep = static_cast<float>(coords[coordIdx].width / static_cast<double>(width_));
-  float yStep = static_cast<float>(-coords[coordIdx].width / static_cast<double>(width_));
-  float xPos = static_cast<float>(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
-  float yPos = static_cast<float>(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
-
-  // Time the kernel execution
-  auto all_start = std::chrono::steady_clock::now();
-
-  for (uint i = 0; i < numKernels; i++) {
-    (this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
-                           threads_per_block, i);
-  }
-
-  // Synchronize all the concurrent streams to have completed execution
-  HIP_CHECK(hipStreamSynchronize(0));
+    if ((testCase == 0 || testCase == 1 || testCase == 2 || testCase == 5 || testCase == 6 ||
+         testCase == 7 || testCase == 10 || testCase == 11 || testCase == 12)) {
+      float xStep = static_cast<float>(coords[coordIdx].width / static_cast<double>(width_));
+      float yStep = static_cast<float>(-coords[coordIdx].width / static_cast<double>(width_));
+      float xPos = static_cast<float>(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
+      float yPos = static_cast<float>(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
+
+      // Time the kernel execution
+      auto all_start = std::chrono::steady_clock::now();
+
+      for (uint i = 0; i < numKernels; i++) {
+        (this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
+                              threads_per_block, i);
+      }
 
-  auto all_end = std::chrono::steady_clock::now();
-  std::chrono::duration<double> all_kernel_time = all_end - all_start;
-  totalTime += all_kernel_time.count();
-  } else {
-  double xStep = coords[coordIdx].width / static_cast<double>(width_);
-  double yStep = -coords[coordIdx].width / static_cast<double>(width_);
-  double xPos = coords[coordIdx].x - 0.5 * coords[coordIdx].width;
-  double yPos = coords[coordIdx].y + 0.5 * coords[coordIdx].width;
-
-  // Time the kernel execution
-  auto all_start = std::chrono::steady_clock::now();
-  for (uint i = 0; i < numKernels; i++) {
-  (this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
-                         threads_per_block, i);
-  }
-  // Synchronize all the concurrent streams to have completed execution
-  HIP_CHECK(hipStreamSynchronize(0));
+      // Synchronize all the concurrent streams to have completed execution
+      HIP_CHECK(hipStreamSynchronize(0));
+
+      auto all_end = std::chrono::steady_clock::now();
+      std::chrono::duration<double> all_kernel_time = all_end - all_start;
+      totalTime += all_kernel_time.count();
+    } else {
+      double xStep = coords[coordIdx].width / static_cast<double>(width_);
+      double yStep = -coords[coordIdx].width / static_cast<double>(width_);
+      double xPos = coords[coordIdx].x - 0.5 * coords[coordIdx].width;
+      double yPos = coords[coordIdx].y + 0.5 * coords[coordIdx].width;
+
+      // Time the kernel execution
+      auto all_start = std::chrono::steady_clock::now();
+      for (uint i = 0; i < numKernels; i++) {
+        (this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
+                              threads_per_block, i);
+      }
+      // Synchronize all the concurrent streams to have completed execution
+      HIP_CHECK(hipStreamSynchronize(0));
 
-  auto all_end = std::chrono::steady_clock::now();
-  std::chrono::duration<double> all_kernel_time = all_end - all_start;
-  totalTime += all_kernel_time.count();
-  }
+      auto all_end = std::chrono::steady_clock::now();
+      std::chrono::duration<double> all_kernel_time = all_end - all_start;
+      totalTime += all_kernel_time.count();
+    }
   }
 
   // Copy data back from device to the host
-  for(uint i = 0; i < numKernels; i++) {
-    HIP_CHECK(hipMemcpy(hPtr[i] ,dPtr[i], bufSize, hipMemcpyDeviceToHost));
-  }
-  for(uint i = 0; i < numKernels; i++) {
-  checkData(hPtr[i]);
-  int j =0;
-  while((totalIters != expectedIters[j] && totalIters > expectedIters[j]) && j < 30) {
-          j++;
+  for (uint i = 0; i < numKernels; i++) {
+    HIP_CHECK(hipMemcpy(hPtr[i], dPtr[i], bufSize, hipMemcpyDeviceToHost));
   }
+  for (uint i = 0; i < numKernels; i++) {
+    checkData(hPtr[i]);
+    int j = 0;
+    while (j < 30 && totalIters > expectedIters[j]) {  // check the bound before indexing
+      j++;
+    }
 
-  if(j==30) {
-    std::cout << "Incorrect iteration count detected. ";
-  }
+    if (j == 30) {
+      CONSOLE_PRINT("Incorrect iteration count detected. ");
+    }
   }
 
   // Compute GFLOPS.  There are 7 FLOPs per iteration
-  double perf = (static_cast<double>(totalIters*numKernels) * 7 * static_cast<double>(1e-09)) /
-                (totalTime / (double)numLoops);
+  double perf = (static_cast<double>(totalIters * numKernels) * 7 * static_cast<double>(1e-09)) /
+      (totalTime / (double)numLoops);
 
 
-  std::vector<std::string> kernelName = {"float", "float_unroll",
-                      "double", "double_unroll"};
+  std::vector<std::string> kernelName = {"float", "float_unroll", "double", "double_unroll"};
 
   // Print results except for Warm-up kernel
   if (testCase != 100) {
-  results[kernelName[testCase % 4]].push_back(perf);
- }
+    results[kernelName[testCase % 4]].push_back(perf);
+  }
 
-  for(uint i = 0 ; i < numStreams; i++) {
+  for (uint i = 0; i < numStreams; i++) {
     HIP_CHECK(hipStreamDestroy(streams[i]));
   }
 
@@ -581,19 +567,19 @@ bool hipPerfMandelBrot::run(unsigned int testCase) {
     HIP_CHECK(hipHostFree(hPtr[i]));
     HIP_CHECK(hipFree(dPtr[i]));
   }
-  delete [] hPtr;
-  delete [] dPtr;
+  delete[] hPtr;
+  delete[] dPtr;
   return true;
 }
 
-void hipPerfMandelBrot::setData(void *ptr, unsigned int value) {
-  unsigned int *ptr2 = (unsigned int *)ptr;
+void hipPerfMandelBrot::setData(void* ptr, unsigned int value) {
+  unsigned int* ptr2 = (unsigned int*)ptr;
   for (unsigned int i = 0; i < width_ * width_; i++) {
-      ptr2[i] = value;
+    ptr2[i] = value;
   }
 }
 
-void hipPerfMandelBrot::checkData(uint *ptr) {
+void hipPerfMandelBrot::checkData(uint* ptr) {
   totalIters = 0;
   for (unsigned int i = 0; i < width_ * width_; i++) {
     totalIters += ptr[i];
@@ -601,30 +587,30 @@ void hipPerfMandelBrot::checkData(uint *ptr) {
 }
 
 /**
-* Test Description
-* ------------------------
-*  - Verify the warm-up kernel default stream executes serially.
-*  - verify by running all kernels - sync.
-*  - verify by running all kernels - async.
-* Test source
-* ------------------------
-*  - perftests/compute/hipPerfMandelbrot.cc
-* Test requirements
-* ------------------------
-*  - HIP_VERSION >= 5.6
-*/
+ * Test Description
+ * ------------------------
+ *  - Verify the warm-up kernel default stream executes serially.
+ *  - verify by running all kernels - sync.
+ *  - verify by running all kernels - async.
+ * Test source
+ * ------------------------
+ *  - perftests/compute/hipPerfMandelbrot.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.6
+ */
 
 TEST_CASE("Perf_hipPerfMandelbrot") {
   hipPerfMandelBrot mandelbrotCompute;
   int deviceId = 0;
   mandelbrotCompute.open(deviceId);
-  #if HT_AMD
+#if HT_AMD
   SECTION("warm-up kernel default stream executes serially") {
     mandelbrotCompute.setNumStreams(1);
     mandelbrotCompute.setNumKernels(1);
-    REQUIRE(true == mandelbrotCompute.run(100/*Random number*/));
+    REQUIRE(true == mandelbrotCompute.run(100 /*Random number*/));
   }
-  #endif
+#endif
   SECTION("run all - sync") {
     int i = 0;
     do {
@@ -632,7 +618,7 @@ TEST_CASE("Perf_hipPerfMandelbrot") {
       mandelbrotCompute.setNumKernels(1);
       REQUIRE(true == mandelbrotCompute.run(i));
       i++;
-    }while(i < 12);
+    } while (i < 12);
     mandelbrotCompute.printResults();
   }
 
@@ -643,12 +629,12 @@ TEST_CASE("Perf_hipPerfMandelbrot") {
       mandelbrotCompute.setNumKernels(2);
       REQUIRE(true == mandelbrotCompute.run(i));
       i++;
-    }while(i < 12);
+    } while (i < 12);
     mandelbrotCompute.printResults();
   }
 }
 
 /**
-* End doxygen group perfComputeTest.
-* @}
-*/
+ * End doxygen group perfComputeTest.
+ * @}
+ */
diff --git a/catch/perftests/dispatch/hipPerfDispatchSpeed.cc b/catch/perftests/dispatch/hipPerfDispatchSpeed.cc
index 897999d8b..4054dbd05 100644
--- a/catch/perftests/dispatch/hipPerfDispatchSpeed.cc
+++ b/catch/perftests/dispatch/hipPerfDispatchSpeed.cc
@@ -18,163 +18,190 @@
  */
 
 /**
-* @addtogroup hipPerfDispatchSpeed hipPerfDispatchSpeed
-* @{
-* @ingroup perfDispatchTest
-*/
+ * @addtogroup hipPerfDispatchSpeed hipPerfDispatchSpeed
+ * @{
+ * @ingroup perfDispatchTest
+ */
+
+// #define ENABLE_DEBUG 1
 
 #include <hip_test_common.hh>
 #include <string.h>
 #include <complex>
 
-// Quiet pesky warnings
-#ifdef WIN_OS
-#define SNPRINTF sprintf_s
-#else
-#define SNPRINTF snprintf
-#endif
-#define CHAR_BUF_SIZE 512
-
-typedef struct {
-    unsigned int iterations;
-    int flushEvery;
-} testStruct;
-
-testStruct testList[] = {
-    { 1, -1},
-    { 1, -1},
-    { 10, 1},
-    { 10, -1},
-    { 100, 1},
-    { 100, 10},
-    { 100, -1},
-    { 1000, 1},
-    { 1000, 10},
-    { 1000, 100},
-    { 1000, -1},
-    { 10000, 1},
-    { 10000, 10},
-    { 10000, 100},
-    { 10000, 1000},
-    { 10000, -1},
-    { 100000, 1},
-    { 100000, 10},
-    { 100000, 100},
-    { 100000, 1000},
-    { 100000, 10000},
-    { 100000, -1},
-};
+/**
+ * Test Description
+ * ------------------------
+ *  - Verify the hipPerf Dispatch and Execution speed, AKA total kernel latency
+ * Test source
+ * ------------------------
+ *  - perftests/dispatch/hipPerfDispatchSpeed.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.6
+ */
 
-unsigned int mapTestList[] = {1, 1, 10, 100, 1000, 10000, 100000};
+unsigned int testList[] = {1, 10, 100, 1000, 10000};
 
-__global__ void _dispatchSpeed(float *outBuf) {
+// dummy kernel that just dispatches and does nothing
+__global__ void _dispatchSpeed(float* outBuf) {
   int i = (blockIdx.x * blockDim.x + threadIdx.x);
-  if (i < 0)
-    outBuf[i] = 0.0f;
+  if (i < 0) outBuf[i] = 0.0f;
 };
 
-/**
-* Test Description
-* ------------------------
-*  - Verify the hipPerf Dispatch speed.
-* Test source
-* ------------------------
-*  - perftests/compute/hipPerfMandelbrot.cc
-* Test requirements
-* ------------------------
-*  - HIP_VERSION >= 5.6
-*/
-
-TEST_CASE("Perf_hipPerfDispatchSpeed") {
-  int p_gpuDevice = 0;
-  int p_tests = -1;
+// Busy-wait kernel: spins until `count` ticks of the device wall clock have elapsed.
+__global__ void _TimingKernel(uint64_t count) {
+  const uint64_t begin_time = static_cast<uint64_t>(wall_clock64());
+  uint64_t curr_time = begin_time;
+  do {
+    curr_time = static_cast<uint64_t>(wall_clock64());
+  } while (curr_time - begin_time < count);  // elapsed-ticks form is wraparound-safe
+}
+
+enum TimingMode { TimingMode_WallTime, TimingMode_HIPEvent, TimingMode_NumModes };
+
+TEST_CASE("Perf_hipPerfDispatchAndExecutionSpeed") {
   hipError_t err = hipSuccess;
-  hipDeviceProp_t props;
-  HIP_CHECK(hipGetDeviceProperties(&props, p_gpuDevice));
 
-  unsigned int testListSize = sizeof(testList) / sizeof(testStruct);
-  int numTests = (p_tests == -1) ? (2*2*testListSize - 1) : p_tests;
-  int test = (p_tests == -1) ? 0 : p_tests;
+  unsigned int testListSize = sizeof(testList) / sizeof(testList[0]);
+  int numTests = testListSize;
+  int warmup = 10;  // number of warmup iterations
+
+  DEBUG_PRINT("numTests %d", numTests);
+
+  // set up timing kernel
+  int clock_rate = 0;  // wall clock rate in kHz
+  HIP_CHECK(hipDeviceGetAttribute(&clock_rate, hipDeviceAttributeWallClockRate, 0));
+  // Widen before scaling so the kHz -> Hz conversion cannot overflow int.
+  const uint64_t timer_freq_in_hz = static_cast<uint64_t>(clock_rate) * 1000;
+  uint64_t timing_in_us = 4;  // CHANGE THIS TO CHANGE EXECUTION TIME
+  const uint64_t timing_count = timer_freq_in_hz * timing_in_us / 1000000;
+
+  int iterations = 100;  // number of times to run the test to get an average time
 
   float* srcBuffer = NULL;
-  unsigned int bufSize_ = 64*sizeof(float);
+  unsigned int bufSize_ = 64 * sizeof(float);
   err = hipMalloc(&srcBuffer, bufSize_);
   REQUIRE(err == hipSuccess);
 
-  for (; test <= numTests; test++) {
-    int openTest = test % testListSize;
-    bool sleep = false;
+  hipEvent_t startEvent, stopEvent;
+
+  HIP_CHECK(hipEventCreate(&startEvent));
+  HIP_CHECK(hipEventCreate(&stopEvent));
+
+
+  // run twice for both dispatch speed and full kernel latency
+  for (int j = 0; j < 2; j++) {
+    bool useTimingKernel = (j == 1);
+    if (useTimingKernel) {
+      CONSOLE_PRINT("\nTIMING KERNEL TEST ()");
+      CONSOLE_PRINT("--------------------------------------------------------------");
 
-    if (test >= (testListSize * 2)) {
-        sleep = true;
+    } else {
+      CONSOLE_PRINT("EMPTY KERNEL TEST");
+      CONSOLE_PRINT("--------------------------------------------------------------");
     }
-    int threads = (bufSize_ / sizeof(float));
-    int threads_per_block  = 64;
-    int blocks = (threads/threads_per_block) + (threads % threads_per_block);
-
-    // warmup
-    hipLaunchKernelGGL(_dispatchSpeed, dim3(blocks), dim3(threads_per_block),
-                       0, hipStream_t(0), srcBuffer);
-    err = hipDeviceSynchronize();
-    REQUIRE(err == hipSuccess);
-
-    auto start = std::chrono::high_resolution_clock::now();
-    for (unsigned int i = 0; i < testList[openTest].iterations; i++) {
-      hipLaunchKernelGGL(_dispatchSpeed, dim3(blocks),
-                      dim3(threads_per_block), 0, hipStream_t(0), srcBuffer);
-      if ((testList[openTest].flushEvery > 0) &&
-        (((i + 1) % testList[openTest].flushEvery) == 0)) {
-          if (sleep) {
-            err = hipDeviceSynchronize();
-            REQUIRE(err == hipSuccess);
-          } else {
-            do {
-              err = hipStreamQuery(NULL);
-          } while (err == hipErrorNotReady);
+
+
+    // loop through all possible timing methods
+    for (unsigned int i = 0; i < TimingMode_NumModes; i++) {
+      TimingMode mode = static_cast<TimingMode>(i);
+      CONSOLE_PRINT("\nTIMING METHOD:");
+
+      switch (mode) {
+        case TimingMode_WallTime:
+          CONSOLE_PRINT("Wall Time");
+          break;
+        case TimingMode_HIPEvent:
+          CONSOLE_PRINT("HIP Events");
+          break;
+        default:
+          CONSOLE_PRINT("Unknown Mode");
+      }
+
+      // go through test iterations
+      for (int test = 0; test < numTests; test++) {
+        int openTest = test % testListSize;
+
+        int threads = (bufSize_ / sizeof(float));
+        int threads_per_block = 64;
+        int blocks = (threads + threads_per_block - 1) / threads_per_block;  // ceiling division
+        double finalPerf = 0.0;
+        double wallMicroSec = 0.0;
+
+        std::chrono::high_resolution_clock::time_point startWall, stopWall;
+
+        // warmup
+        for (int i = 0; i < warmup; i++) {
+          hipLaunchKernelGGL(_TimingKernel, dim3(blocks), dim3(threads_per_block), 0,
+                             hipStream_t(0), timing_count);
+        }
+        HIP_CHECK(hipStreamSynchronize(0));
+
+        for (int it = 0; it < iterations; it++) {
+          switch (mode) {
+            case TimingMode_WallTime:
+              startWall = std::chrono::high_resolution_clock::now();
+              break;
+            case TimingMode_HIPEvent:
+              HIP_CHECK(hipEventRecord(startEvent, 0));
+              break;
+            default:
+              CONSOLE_PRINT("Unknown Mode");
+          }
+
+          for (unsigned int i = 0; i < testList[openTest]; i++) {
+            if (useTimingKernel) {
+              // use the timing kernel to measure dispatch and execution speed
+              hipLaunchKernelGGL(_TimingKernel, dim3(blocks), dim3(threads_per_block), 0,
+                                 hipStream_t(0), timing_count);
+            } else {
+              // use the dispatch speed kernel
+              hipLaunchKernelGGL(_dispatchSpeed, dim3(blocks), dim3(threads_per_block), 0,
+                                 hipStream_t(0), srcBuffer);
+            }
+          }
+
+          switch (mode) {
+            case TimingMode_WallTime: {
+              err = hipStreamSynchronize(0);
+              REQUIRE(err == hipSuccess);
+              stopWall = std::chrono::high_resolution_clock::now();
+              wallMicroSec =
+                  std::chrono::duration<double, std::micro>(stopWall - startWall).count();
+              finalPerf += wallMicroSec / testList[openTest];
+              break;
+            }
+            case TimingMode_HIPEvent: {
+              HIP_CHECK(hipEventRecord(stopEvent, 0));
+              HIP_CHECK(hipEventSynchronize(stopEvent));
+              float elapsed;
+              HIP_CHECK(hipEventElapsedTime(&elapsed, startEvent, stopEvent));  // in milliseconds
+              finalPerf += (elapsed * 1000.0f) / testList[openTest];            // convert ms to µs
+              break;
+            }
+            default:
+              CONSOLE_PRINT("Unknown Mode");
+          }
         }
+
+        finalPerf /= iterations;  // average the performance over all iterations
+
+
+        CONSOLE_PRINT("HIPPerfDispatchSpeed[%3d] %7u dispatches              (us/disp) %3f", test,
+                      testList[openTest], (float)finalPerf);  // %u: testList is unsigned int
       }
     }
-    if (sleep) {
-      err = hipDeviceSynchronize();
-      REQUIRE(err == hipSuccess);
-    } else {
-      do {
-        err = hipStreamQuery(NULL);
-      } while (err == hipErrorNotReady);
-    }
-    auto stop = std::chrono::high_resolution_clock::now();
-    double microSec = std::chrono::duration<double, std::micro>(stop - start).count();
-
-    // microseconds per launch
-    double perf = (microSec/testList[openTest].iterations);
-    const char *waitType;
-    const char *extraChar;
-    const char *n;
-    if (sleep) {
-      waitType = "sleep";
-      extraChar = "";
-      n = "";
-    } else {
-      waitType = "spin";
-      n = "n";
-      extraChar = " ";
-    }
-    char buf[256];
-    if (testList[openTest].flushEvery > 0) {
-      SNPRINTF(buf, sizeof(buf), "HIPPerfDispatchSpeed[%3d] %7d dispatches %s%sing every %5d (us/disp) %3f",
-                test, testList[openTest].iterations,
-                waitType, n, testList[openTest].flushEvery, (float)perf);
-    } else {
-      SNPRINTF(buf, sizeof(buf), "HIPPerfDispatchSpeed[%3d] %7d dispatches (%s%s)              (us/disp) %3f",
-                test, testList[openTest].iterations,
-                waitType, extraChar, (float)perf);
-    }
-    printf("%s\n", buf);
   }
+
+  HIP_CHECK(hipEventDestroy(startEvent));
+  HIP_CHECK(hipEventDestroy(stopEvent));
+
   HIP_CHECK(hipFree(srcBuffer));
 }
 
+
 /**
-* End doxygen group perfDispatchTest.
-* @}
-*/
+ * End doxygen group perfDispatchTest.
+ * @}
+ */
diff --git a/catch/perftests/memory/hipPerfBufferCopyRectSpeed.cc b/catch/perftests/memory/hipPerfBufferCopyRectSpeed.cc
index 5fc601b14..61a9bd7b4 100644
--- a/catch/perftests/memory/hipPerfBufferCopyRectSpeed.cc
+++ b/catch/perftests/memory/hipPerfBufferCopyRectSpeed.cc
@@ -18,30 +18,31 @@ THE SOFTWARE.
 */
 
 /**
-* @addtogroup hipMemcpy2DAsync hipMemcpy2DAsync
-* @{
-* @ingroup perfMemoryTest
-* `hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch,
-*     size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream = 0)` -
-* Copies data between host and device.
-*/
+ * @addtogroup hipMemcpy2DAsync hipMemcpy2DAsync
+ * @{
+ * @ingroup perfMemoryTest
+ * `hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch,
+ *     size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream = 0)` -
+ * Copies data between host and device.
+ */
 
 #include <hip_test_common.hh>
+// #define ENABLE_DEBUG 1
 
 #define NUM_SIZES 8
 //  4KB, 8KB, 64KB, 256KB, 1 MB, 4MB, 16 MB, 16MB+10
-static const unsigned int Sizes[NUM_SIZES] =
-        {4096, 8192, 65536, 262144, 1048576, 4194304, 16777216, 16777216+10};
+static const unsigned int Sizes[NUM_SIZES] = {4096,    8192,    65536,    262144,
+                                              1048576, 4194304, 16777216, 16777216 + 10};
 
 static const unsigned int Iterations[2] = {1, 1000};
 
 #define BUF_TYPES 4
 //  16 ways to combine 4 different buffer types
-#define NUM_SUBTESTS (BUF_TYPES*BUF_TYPES)
+#define NUM_SUBTESTS (BUF_TYPES * BUF_TYPES)
 
-static void setData(void *ptr, unsigned int size, char value) {
-  char *ptr2 =  reinterpret_cast<char *>(ptr);
-  for (unsigned int i = 0; i < size ; i++) {
+static void setData(void* ptr, unsigned int size, char value) {
+  char* ptr2 = reinterpret_cast<char*>(ptr);
+  for (unsigned int i = 0; i < size; i++) {
     ptr2[i] = value;
   }
 }
@@ -52,17 +53,17 @@ static bool hipPerfBufferCopyRectSpeed_test(int p_tests) {
   bool hostMalloc[2] = {false};
   bool hostRegister[2] = {false};
   bool unpinnedMalloc[2] = {false};
-  void *memptr[2] = {NULL};
-  void *alignedmemptr[2] = {NULL};
-  void *srcBuffer = NULL;
-  void *dstBuffer = NULL;
+  void* memptr[2] = {NULL};
+  void* alignedmemptr[2] = {NULL};
+  void* srcBuffer = NULL;
+  void* dstBuffer = NULL;
 
-  int numTests = (p_tests == -1) ? (NUM_SIZES*NUM_SUBTESTS*2 - 1) : p_tests;
+  int numTests = (p_tests == -1) ? (NUM_SIZES * NUM_SUBTESTS * 2 - 1) : p_tests;
   int test = (p_tests == -1) ? 0 : p_tests;
 
-  for ( ; test <= numTests ; test++ ) {
+  for (; test <= numTests; test++) {
     unsigned int srcTest = (test / NUM_SIZES) % BUF_TYPES;
-    unsigned int dstTest = (test / (NUM_SIZES*BUF_TYPES)) % BUF_TYPES;
+    unsigned int dstTest = (test / (NUM_SIZES * BUF_TYPES)) % BUF_TYPES;
     bufSize_ = Sizes[test % NUM_SIZES];
     hostMalloc[0] = hostMalloc[1] = false;
     hostRegister[0] = hostRegister[1] = false;
@@ -92,8 +93,7 @@ static bool hipPerfBufferCopyRectSpeed_test(int p_tests) {
     numIter = Iterations[test / (NUM_SIZES * NUM_SUBTESTS)];
 
     if (hostMalloc[0]) {
-      HIP_CHECK(hipHostMalloc(reinterpret_cast<void**>(&srcBuffer),
-                              bufSize_, 0));
+      HIP_CHECK(hipHostMalloc(reinterpret_cast<void**>(&srcBuffer), bufSize_, 0));
       setData(srcBuffer, bufSize_, 0xd0);
     } else if (hostRegister[0]) {
       memptr[0] = malloc(bufSize_ + 4096);
@@ -112,8 +112,7 @@ static bool hipPerfBufferCopyRectSpeed_test(int p_tests) {
     }
 
     if (hostMalloc[1]) {
-      HIP_CHECK(hipHostMalloc(reinterpret_cast<void**>(&dstBuffer),
-                              bufSize_, 0));
+      HIP_CHECK(hipHostMalloc(reinterpret_cast<void**>(&dstBuffer), bufSize_, 0));
     } else if (hostRegister[1]) {
       memptr[1] = malloc(bufSize_ + 4096);
       alignedmemptr[1] = reinterpret_cast<void*>(memptr[0]);
@@ -128,15 +127,14 @@ static bool hipPerfBufferCopyRectSpeed_test(int p_tests) {
     }
 
     //  warm up
-    HIP_CHECK(hipMemcpy2D(dstBuffer, width, srcBuffer,
-                          width, width, width, hipMemcpyDefault));
+    HIP_CHECK(hipMemcpy2D(dstBuffer, width, srcBuffer, width, width, width, hipMemcpyDefault));
 
     // measure performance based on host time
     auto all_start = std::chrono::steady_clock::now();
 
     for (unsigned int i = 0; i < numIter; i++) {
-      HIP_CHECK(hipMemcpy2DAsync(dstBuffer, width, srcBuffer,
-                                 width, width, width, hipMemcpyDefault, NULL));
+      HIP_CHECK(hipMemcpy2DAsync(dstBuffer, width, srcBuffer, width, width, width, hipMemcpyDefault,
+                                 NULL));
     }
     HIP_CHECK(hipDeviceSynchronize());
 
@@ -144,11 +142,11 @@ static bool hipPerfBufferCopyRectSpeed_test(int p_tests) {
     std::chrono::duration<double> elapsed_secs = all_end - all_start;
 
     // read speed in GB/s
-    double perf = (static_cast<double>(bufSize_ * numIter) *
-                   static_cast<double>(1e-09)) / elapsed_secs.count();
+    double perf = (static_cast<double>(bufSize_ * numIter) * static_cast<double>(1e-09)) /
+        elapsed_secs.count();
 
-    const char *strSrc = NULL;
-    const char *strDst = NULL;
+    const char* strSrc = NULL;
+    const char* strDst = NULL;
     if (hostMalloc[0])
       strSrc = "hHM";
     else if (hostRegister[0])
@@ -170,15 +168,14 @@ static bool hipPerfBufferCopyRectSpeed_test(int p_tests) {
     // Double results when src and dst are both on device
     if ((!hostMalloc[0] && !hostRegister[0] && !unpinnedMalloc[0]) &&
         (!hostMalloc[1] && !hostRegister[1] && !unpinnedMalloc[1]))
-        perf *= 2.0;
+      perf *= 2.0;
     // Double results when src and dst are both in sysmem
     if ((hostMalloc[0] || hostRegister[0] || unpinnedMalloc[0]) &&
         (hostMalloc[1] || hostRegister[1] || unpinnedMalloc[1]))
-        perf *= 2.0;
+      perf *= 2.0;
 
-    INFO("hipPerfBufferCopyRectSpeed[" << test << "]\t( " << bufSize_ <<
-         ")\ts:" << strSrc << " d:" << strDst << "\ti:" << numIter <<
-         "\t(GB/s) perf\t" << (float)perf);
+    CONSOLE_PRINT("hipPerfBufferCopyRectSpeed[%d]\t( %u )\ts:%s d:%s\ti:%u\t(GB/s) perf\t%.2f\n",
+                  test, bufSize_, strSrc, strDst, numIter, (float)perf);
 
     //  Free src
     if (hostMalloc[0]) {
@@ -208,40 +205,42 @@ static bool hipPerfBufferCopyRectSpeed_test(int p_tests) {
 }
 
 /**
-* Test Description
-* ------------------------
-*  - Verify hipPerfBufferCopy status.
-* Test source
-* ------------------------
-*  - perftests/memory/hipPerfBufferCopyRectSpeed.cc
-* Test requirements
-* ------------------------
-*  - HIP_VERSION >= 5.6
-*/
+ * Test Description
+ * ------------------------
+ *  - Verify hipPerfBufferCopy status.
+ * Test source
+ * ------------------------
+ *  - perftests/memory/hipPerfBufferCopyRectSpeed.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.6
+ */
 
 TEST_CASE("Perf_hipPerfBufferCopyRectSpeed_test") {
   int numDevices = 0;
   HIP_CHECK(hipGetDeviceCount(&numDevices));
 
   if (numDevices <= 0) {
-    SUCCEED("Skipped testcase hipPerfBufferCopyRectSpeed"
-            "as there is no device to test.");
+    SUCCEED(
+        "Skipped testcase hipPerfBufferCopyRectSpeed "
+        "as there is no device to test.");
   } else {
     int deviceId = 0;
     HIP_CHECK(hipSetDevice(deviceId));
     hipDeviceProp_t props;
     HIP_CHECK(hipGetDeviceProperties(&props, deviceId));
 
-    INFO("hipPerfBufferCopyRectSpeed - info: Set device to " << deviceId
-         << " : " << props.name << "Legend: unp - unpinned(malloc),"
-         " hM - hipMalloc(device)\n        hHR - hipHostRegister(pinned),"
-         " hHM - hipHostMalloc(prePinned)\n");
+    CONSOLE_PRINT(
+        "hipPerfBufferCopyRectSpeed - info: Set device to %d : %s Legend: unp - unpinned(malloc), "
+        "hM - hipMalloc(device)\n        hHR - hipHostRegister(pinned), hHM - "
+        "hipHostMalloc(prePinned)\n",
+        deviceId, props.name);
 
     REQUIRE(true == hipPerfBufferCopyRectSpeed_test(1));
   }
 }
 
 /**
-* End doxygen group perfMemoryTest.
-* @}
-*/
+ * End doxygen group perfMemoryTest.
+ * @}
+ */
diff --git a/catch/perftests/memory/hipPerfBufferCopySpeed.cc b/catch/perftests/memory/hipPerfBufferCopySpeed.cc
index 3a30b2501..4a8d9a571 100644
--- a/catch/perftests/memory/hipPerfBufferCopySpeed.cc
+++ b/catch/perftests/memory/hipPerfBufferCopySpeed.cc
@@ -18,246 +18,441 @@ THE SOFTWARE.
 */
 
 /**
-* @addtogroup hipMemcpyAsync hipMemcpyAsync
-* @{
-* @ingroup perfMemoryTest
-* `hipMemcpyAsync(void* dst, const void* src, size_t count,
-*                 hipMemcpyKind kind, hipStream_t stream = 0)` -
-* Copies data between host and device.
-*/
-
+ * @addtogroup hipMemcpyAsync hipMemcpyAsync
+ * @{
+ * @ingroup perfMemoryTest
+ * `hipMemcpyAsync(void* dst, const void* src, size_t count,
+ *                 hipMemcpyKind kind, hipStream_t stream = 0)` -
+ * Copies data between host and device, or device to device etc.
+ */
+// #define ENABLE_DEBUG 1  // only effective when defined before hip_test_common.hh
+#include <hip/hip_ext.h>
 #include <hip_test_common.hh>
+#include <cstdlib>
+#include <iomanip>
 
 #define NUM_SIZES 9
-//  4KB, 8KB, 64KB, 256KB, 1 MB, 4MB, 16 MB, 16MB+10
+//  4KB, 8KB, 64KB, 1MB, 4MB, 16MB, 16MB+10, 128MB, 512MB
-static const unsigned int Sizes[NUM_SIZES] =
-  {4096, 8192, 65536, 262144, 524288, 1048576, 4194304, 16777216, 16777216+10};
-
+static const unsigned int Sizes[NUM_SIZES] = {4096,     8192,          65536,     1048576,  4194304,
+                                              16777216, 16777216 + 10, 134217728, 536870912};
+// static const unsigned int Sizes[NUM_SIZES] = {134217728};
 static const unsigned int Iterations[2] = {1, 1000};
 
-#define BUF_TYPES 4
-//  16 ways to combine 4 different buffer types
-#define NUM_SUBTESTS (BUF_TYPES*BUF_TYPES)
+#define BUF_TYPES 5
+//  25 ways to combine 5 different buffer types
+#define NUM_SUBTESTS (BUF_TYPES * BUF_TYPES)
 
-static void setData(void *ptr, unsigned int size, char value) {
-  char *ptr2 = reinterpret_cast<char *>(ptr);
-  for (unsigned int i = 0; i < size ; i++) {
+static void setData(void* ptr, unsigned int size, char value) {
+  char* ptr2 = reinterpret_cast<char*>(ptr);
+  for (unsigned int i = 0; i < size; i++) {
     ptr2[i] = value;
   }
 }
 
-static void checkData(void *ptr, unsigned int size, char value) {
-  char *ptr2 = reinterpret_cast<char *>(ptr);
+static void checkData(void* ptr, unsigned int size, char value) {
+  char* ptr2 = reinterpret_cast<char*>(ptr);
   for (unsigned int i = 0; i < size; i++) {
     if (ptr2[i] != value) {
-      INFO("Validation failed at " << i << " Got " << ptr2[i] <<
-                                           " Expected " << value);
+      INFO("Validation failed at " << i << " Got " << ptr2[i] << " Expected " << value);
       REQUIRE(false);
     }
   }
 }
 
 static bool hipPerfBufferCopySpeed_test(int p_tests) {
+  int testIdx = 0;
   unsigned int bufSize_;
   unsigned int numIter;
   bool hostMalloc[2] = {false};
   bool hostRegister[2] = {false};
   bool unpinnedMalloc[2] = {false};
-  void *memptr[2] = {NULL};
-  void *alignedmemptr[2] = {NULL};
-  void *srcBuffer = NULL;
-  void *dstBuffer = NULL;
-
-  int numTests = (p_tests == -1) ? (NUM_SIZES*NUM_SUBTESTS*2 - 1) : p_tests;
-  int test = (p_tests == -1) ? 0 : p_tests;
-
-  for ( ; test <= numTests; test++ ) {
-    unsigned int srcTest = (test / NUM_SIZES) % BUF_TYPES;
-    unsigned int dstTest = (test / (NUM_SIZES*BUF_TYPES)) % BUF_TYPES;
-    bufSize_ = Sizes[test % NUM_SIZES];
-    hostMalloc[0] = hostMalloc[1] = false;
-    hostRegister[0] = hostRegister[1] = false;
-    unpinnedMalloc[0] = unpinnedMalloc[1] = false;
-    srcBuffer = dstBuffer = 0;
-    memptr[0] = memptr[1] = NULL;
-    alignedmemptr[0] = alignedmemptr[1] = NULL;
-
-    if (srcTest == 3) {
-      hostRegister[0] = true;
-    } else if (srcTest == 2) {
-      hostMalloc[0] = true;
-    } else if (srcTest == 1) {
-      unpinnedMalloc[0] = true;
-    }
-
-    if (dstTest == 1) {
-      unpinnedMalloc[1] = true;
-    } else if (dstTest == 2) {
-      hostMalloc[1] = true;
-    } else if (dstTest == 3) {
-      hostRegister[1] = true;
-    }
-
-    numIter = Iterations[test / (NUM_SIZES * NUM_SUBTESTS)];
-
-    if (hostMalloc[0]) {
-      HIP_CHECK(hipHostMalloc(reinterpret_cast<void**>(&srcBuffer),
-                              bufSize_, 0));
-      setData(srcBuffer, bufSize_, 0xd0);
-    } else if (hostRegister[0]) {
-      memptr[0] = malloc(bufSize_ + 4096);
-      alignedmemptr[0] = reinterpret_cast<void*>(memptr[0]);
-      srcBuffer = alignedmemptr[0];
-      setData(srcBuffer, bufSize_, 0xd0);
-      HIP_CHECK(hipHostRegister(srcBuffer, bufSize_, 0));
-    } else if (unpinnedMalloc[0]) {
-      memptr[0] = malloc(bufSize_ + 4096);
-      alignedmemptr[0] = reinterpret_cast<void*>(memptr[0]);
-      srcBuffer = alignedmemptr[0];
-      setData(srcBuffer, bufSize_, 0xd0);
-    } else {
+  bool deviceMallocUncached[2] = {false};
+  void* memptr[2] = {NULL};
+  void* alignedmemptr[2] = {NULL};
+  void* srcBuffer = NULL;
+  void* dstBuffer = NULL;
+  int numTests = (p_tests == -1) ? (NUM_SIZES * NUM_SUBTESTS * 2 - 1) : p_tests;
+  // int test = (p_tests == -1) ? 0 : p_tests;
+  int numDevices = 0;
+  HIP_CHECK(hipGetDeviceCount(&numDevices));
+  int test = 0;
+  // 1. Run all P2P for all sizes
+  if (numDevices >= 2) {
+    for (int sizeIdx = 0; sizeIdx < NUM_SIZES; ++sizeIdx) {
+      if (p_tests != -1 && testIdx != p_tests) {
+        ++testIdx;
+        continue;
+      }
+      unsigned int bufSize_ = Sizes[sizeIdx];
+      void* srcBuffer = NULL;
+      void* dstBuffer = NULL;
+      numIter = Iterations[1];
+      HIP_CHECK(hipSetDevice(0));
       HIP_CHECK(hipMalloc(&srcBuffer, bufSize_));
-      HIP_CHECK(hipMemset(srcBuffer, 0xd0, bufSize_));
-    }
-
-    if (hostMalloc[1]) {
-      HIP_CHECK(hipHostMalloc(reinterpret_cast<void**>(&dstBuffer),
-                                                       bufSize_, 0));
-    } else if (hostRegister[1]) {
-      memptr[1] = malloc(bufSize_ + 4096);
-      alignedmemptr[1] = reinterpret_cast<void*>(memptr[1]);
-      dstBuffer = alignedmemptr[1];
-      HIP_CHECK(hipHostRegister(dstBuffer, bufSize_, 0));
-    } else if (unpinnedMalloc[1]) {
-      memptr[1] = malloc(bufSize_ + 4096);
-      alignedmemptr[1] = reinterpret_cast<void*>(memptr[1]);
-      dstBuffer = alignedmemptr[1];
-    } else {
+      hipError_t errMemset = hipMemset(srcBuffer, 0xd0, bufSize_);
+      if (errMemset != hipSuccess) {
+        hipFree(srcBuffer);
+        continue;
+      }
+      HIP_CHECK(hipSetDevice(1));
       HIP_CHECK(hipMalloc(&dstBuffer, bufSize_));
+      int canAccessPeer01 = 0, canAccessPeer10 = 0;
+      HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer01, 0, 1));
+      HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer10, 1, 0));
+      if (!canAccessPeer01 || !canAccessPeer10) {
+        // Peer access was never enabled on this path, so there is nothing to disable
+        // (hipDeviceDisablePeerAccess would just return hipErrorPeerAccessNotEnabled).
+        // Release each buffer on its owning device and consume this test slot.
+        HIP_CHECK(hipSetDevice(0));
+        HIP_CHECK(hipFree(srcBuffer));
+        HIP_CHECK(hipSetDevice(1));
+        HIP_CHECK(hipFree(dstBuffer));
+        HIP_CHECK(hipSetDevice(0));
+        ++testIdx;
+        continue;
+      }
+      HIP_CHECK(hipSetDevice(0));
+      hipError_t errPeer0 = hipDeviceEnablePeerAccess(1, 0);
+      HIP_CHECK(hipSetDevice(1));
+      hipError_t errPeer1 = hipDeviceEnablePeerAccess(0, 0);
+      if (errPeer0 != hipSuccess || errPeer1 != hipSuccess) {
+        HIP_CHECK(hipSetDevice(0));
+        HIP_CHECK(hipFree(srcBuffer));
+        HIP_CHECK(hipSetDevice(1));
+        HIP_CHECK(hipFree(dstBuffer));
+        HIP_CHECK(hipSetDevice(0));
+        continue;
+      }
+      HIP_CHECK(hipMemcpyPeer(dstBuffer, 1, srcBuffer, 0, bufSize_));
+      auto all_start = std::chrono::steady_clock::now();
+      for (unsigned int i = 0; i < numIter; i++) {
+        HIP_CHECK(hipMemcpyPeerAsync(dstBuffer, 1, srcBuffer, 0, bufSize_, 0));
+      }
+      HIP_CHECK(hipSetDevice(1));
+      HIP_CHECK(hipDeviceSynchronize());
+      hipError_t syncErr = hipGetLastError();
+      if (syncErr != hipSuccess) {
+        DEBUG_PRINT("WARNING: hipDeviceSynchronize error: %s\n", hipGetErrorString(syncErr));
+      }
+      HIP_CHECK(hipDeviceSynchronize());
+      auto all_end = std::chrono::steady_clock::now();
+      std::chrono::duration<double> elapsed_secs = all_end - all_start;
+      auto start_s =
+          std::chrono::duration_cast<std::chrono::duration<double>>(all_start.time_since_epoch())
+              .count();
+      auto end_s =
+          std::chrono::duration_cast<std::chrono::duration<double>>(all_end.time_since_epoch())
+              .count();
+
+      DEBUG_PRINT("All_start: %f s, All_end: %f s\n", start_s, end_s);
+      DEBUG_PRINT("Elapsed seconds: %f\n", elapsed_secs.count());
+      double bufSizeWithIter = static_cast<double>(bufSize_);  // bytes moved by one copy
+      DEBUG_PRINT("%f\n", bufSizeWithIter);
+      double perf_pre = bufSizeWithIter / elapsed_secs.count();
+      DEBUG_PRINT("%f\n", perf_pre);
+      double perf = perf_pre * static_cast<double>(numIter);
+      DEBUG_PRINT("%f\n", perf);  // bytes/s over all iterations, before scaling to GB/s
+      perf *= static_cast<double>(1e-09);
+      CONSOLE_PRINT("%f\n", perf);
+      CONSOLE_PRINT("HIPPerfBufferCopySpeedP2P[%d] %u s:dev0 d:dev1 i:%u (GB/s) perf %f\n", test,
+                    bufSize_, numIter, (float)perf);
+      CONSOLE_PRINT("P2P,%d,%u,dev0,dev1,%u,%f\n", test, bufSize_, numIter, (float)perf);
+      test++;
+      void* temp = malloc(bufSize_ + 4096);
+      void* chkBuf = reinterpret_cast<void*>(temp);
+      HIP_CHECK(hipMemcpy(chkBuf, dstBuffer, bufSize_, hipMemcpyDefault));
+      checkData(chkBuf, bufSize_, 0xd0);
+      free(temp);
+      HIP_CHECK(hipSetDevice(0));
+      hipDeviceDisablePeerAccess(1);
+      HIP_CHECK(hipSetDevice(1));
+      hipDeviceDisablePeerAccess(0);
+      HIP_CHECK(hipSetDevice(0));
+      HIP_CHECK(hipFree(srcBuffer));
+      HIP_CHECK(hipSetDevice(1));
+      HIP_CHECK(hipFree(dstBuffer));
+      HIP_CHECK(hipSetDevice(0));
+      ++testIdx;
     }
-
-    //  warm up
-    HIP_CHECK(hipMemcpy(dstBuffer, srcBuffer, bufSize_, hipMemcpyDefault));
-
-    // measure performance based on host time
+  }
+  int dstTest = 0;
+  int srcTest = 0;
+  // 2. Run all NoCU (intra) for all sizes
+  for (int sizeIdx = 0; sizeIdx < NUM_SIZES; ++sizeIdx) {
+    if (p_tests != -1 && testIdx != p_tests) {
+      ++testIdx;
+      continue;
+    }
+    unsigned int bufSize_ = Sizes[sizeIdx];
+    void* srcBuffer = NULL;
+    void* dstBuffer = NULL;
+    numIter = Iterations[1];
+    HIP_CHECK(hipSetDevice(0));
+    HIP_CHECK(hipMalloc(&srcBuffer, bufSize_));
+    HIP_CHECK(hipMalloc(&dstBuffer, bufSize_));
+    HIP_CHECK(hipMemset(srcBuffer, 0xd0, bufSize_));
+    HIP_CHECK(hipMemcpy(dstBuffer, srcBuffer, bufSize_, hipMemcpyDeviceToDeviceNoCU));
     auto all_start = std::chrono::steady_clock::now();
-
     for (unsigned int i = 0; i < numIter; i++) {
-      HIP_CHECK(hipMemcpyAsync(dstBuffer, srcBuffer, bufSize_,
-                               hipMemcpyDefault, NULL));
+      HIP_CHECK(hipMemcpyAsync(dstBuffer, srcBuffer, bufSize_, hipMemcpyDeviceToDeviceNoCU, NULL));
+    }
+    HIP_CHECK(hipDeviceSynchronize());
+    hipError_t syncErr = hipGetLastError();
+    if (syncErr != hipSuccess) {
+      DEBUG_PRINT("WARNING: hipDeviceSynchronize error: %s\n", hipGetErrorString(syncErr));
     }
     HIP_CHECK(hipDeviceSynchronize());
-
     auto all_end = std::chrono::steady_clock::now();
+    auto start_s =
+        std::chrono::duration_cast<std::chrono::duration<double>>(all_start.time_since_epoch())
+            .count();
+    auto end_s =
+        std::chrono::duration_cast<std::chrono::duration<double>>(all_end.time_since_epoch())
+            .count();
+
+    DEBUG_PRINT("All_start: %f s, All_end: %f s\n", start_s, end_s);
     std::chrono::duration<double> elapsed_secs = all_end - all_start;
-
-    // read speed in GB/s
-    double perf = (static_cast<double>(bufSize_ * numIter) *
-                   static_cast<double>(1e-09)) / elapsed_secs.count();
-
-    const char *strSrc = NULL;
-    const char *strDst = NULL;
-    if (hostMalloc[0])
-      strSrc = "hHM";
-    else if (hostRegister[0])
-      strSrc = "hHR";
-    else if (unpinnedMalloc[0])
-      strSrc = "unp";
-    else
-      strSrc = "hM";
-
-    if (hostMalloc[1])
-      strDst = "hHM";
-    else if (hostRegister[1])
-      strDst = "hHR";
-    else if (unpinnedMalloc[1])
-      strDst = "unp";
-    else
-      strDst = "hM";
-
-    // Double results when src and dst are both on device
-    if ((!hostMalloc[0] && !hostRegister[0] && !unpinnedMalloc[0]) &&
-        (!hostMalloc[1] && !hostRegister[1] && !unpinnedMalloc[1]))
-        perf *= 2.0;
-    // Double results when src and dst are both in sysmem
-    if ((hostMalloc[0] || hostRegister[0] || unpinnedMalloc[0]) &&
-        (hostMalloc[1] || hostRegister[1] || unpinnedMalloc[1]))
-        perf *= 2.0;
-
-    INFO("HIPPerfBufferCopySpeed[" << test << "]\t( " << bufSize_ <<
-         ")\ts:" << strSrc << " d:" << strDst << "\ti:" << numIter <<
-         "\t(GB/s) perf\t" << (float)perf);
-
-    // Verification
+    DEBUG_PRINT("Elapsed seconds: %f\n", elapsed_secs.count());
+    double bufSizeWithIter = static_cast<double>(bufSize_);  // bytes moved by one copy
+    DEBUG_PRINT("%f\n", bufSizeWithIter);
+    double perf_pre = bufSizeWithIter / elapsed_secs.count();
+    DEBUG_PRINT("%f\n", perf_pre);
+    double perf = perf_pre * static_cast<double>(numIter);
+    DEBUG_PRINT("%f\n", perf);  // bytes/s over all iterations, before scaling to GB/s
+    perf *= static_cast<double>(1e-09);
+    CONSOLE_PRINT("%f\n", perf);
+    CONSOLE_PRINT("HIPPerfBufferCopySpeedNoCU[%d]  %u  s:dev0 d:dev0 i:%u (GB/s) perf %f\n", test,
+                  bufSize_, numIter, (float)perf);
+    CONSOLE_PRINT("NoCU,%d,%u,dev0,dev0,%u,%f\n", test, bufSize_, numIter, (float)perf);
+    test++;
     void* temp = malloc(bufSize_ + 4096);
     void* chkBuf = reinterpret_cast<void*>(temp);
     HIP_CHECK(hipMemcpy(chkBuf, dstBuffer, bufSize_, hipMemcpyDefault));
     checkData(chkBuf, bufSize_, 0xd0);
     free(temp);
+    HIP_CHECK(hipFree(srcBuffer));
+    HIP_CHECK(hipFree(dstBuffer));
+    ++testIdx;
+  }
 
-    //  Free src
-    if (hostMalloc[0]) {
-      HIP_CHECK(hipHostFree(srcBuffer));
-    } else if (hostRegister[0]) {
-      HIP_CHECK(hipHostUnregister(srcBuffer));
-      free(memptr[0]);
-    } else if (unpinnedMalloc[0]) {
-      free(memptr[0]);
-    } else {
-      HIP_CHECK(hipFree(srcBuffer));
-    }
-
-    //  Free dst
-    if (hostMalloc[1]) {
-      HIP_CHECK(hipHostFree(dstBuffer));
-    } else if (hostRegister[1]) {
-      HIP_CHECK(hipHostUnregister(dstBuffer));
-      free(memptr[1]);
-    } else if (unpinnedMalloc[1]) {
-      free(memptr[1]);
-    } else {
-      HIP_CHECK(hipFree(dstBuffer));
+  // 3. Run all buffer type (default) for all sizes
+
+  for (int srcTest = 0; srcTest < BUF_TYPES; ++srcTest) {
+    for (int dstTest = 0; dstTest < BUF_TYPES; ++dstTest) {
+      for (int sizeIdx = 0; sizeIdx < NUM_SIZES; ++sizeIdx) {
+        if (p_tests != -1 && testIdx != p_tests) {
+          ++testIdx;
+          continue;
+        }
+        unsigned int bufSize_ = Sizes[sizeIdx];
+        bool hostMalloc[2] = {false};
+        bool hostRegister[2] = {false};
+        bool unpinnedMalloc[2] = {false};
+        bool deviceMallocUncached[2] = {false};
+        void* memptr[2] = {NULL};
+        void* alignedmemptr[2] = {NULL};
+        void* srcBuffer = NULL;
+        void* dstBuffer = NULL;
+        numIter = Iterations[1];
+        if (srcTest == 4) {
+          deviceMallocUncached[0] = true;
+        } else if (srcTest == 3) {
+          hostRegister[0] = true;
+        } else if (srcTest == 2) {
+          hostMalloc[0] = true;
+        } else if (srcTest == 1) {
+          unpinnedMalloc[0] = true;
+        }
+        if (dstTest == 1) {
+          unpinnedMalloc[1] = true;
+        } else if (dstTest == 2) {
+          hostMalloc[1] = true;
+        } else if (dstTest == 3) {
+          hostRegister[1] = true;
+        } else if (dstTest == 4) {
+          deviceMallocUncached[1] = true;
+        }
+        if (deviceMallocUncached[0]) {
+          HIP_CHECK(hipExtMallocWithFlags(&srcBuffer, bufSize_, hipDeviceMallocUncached));
+          HIP_CHECK(hipMemset(srcBuffer, 0xd0, bufSize_));
+        } else if (hostMalloc[0]) {
+          HIP_CHECK(hipHostMalloc(reinterpret_cast<void**>(&srcBuffer), bufSize_, 0));
+          setData(srcBuffer, bufSize_, 0xd0);
+        } else if (hostRegister[0]) {
+          memptr[0] = malloc(bufSize_ + 4096);
+          uintptr_t raw = reinterpret_cast<uintptr_t>(memptr[0]);
+          uintptr_t aligned = (raw + 4095) & ~static_cast<uintptr_t>(4095);
+          alignedmemptr[0] = reinterpret_cast<void*>(aligned);
+          srcBuffer = alignedmemptr[0];
+          setData(srcBuffer, bufSize_, 0xd0);
+          HIP_CHECK(hipHostRegister(srcBuffer, bufSize_, 0));
+        } else if (unpinnedMalloc[0]) {
+          memptr[0] = malloc(bufSize_ + 4096);
+          uintptr_t raw = reinterpret_cast<uintptr_t>(memptr[0]);
+          uintptr_t aligned = (raw + 4095) & ~static_cast<uintptr_t>(4095);
+          alignedmemptr[0] = reinterpret_cast<void*>(aligned);
+          srcBuffer = alignedmemptr[0];
+          setData(srcBuffer, bufSize_, 0xd0);
+        } else {
+          HIP_CHECK(hipMalloc(&srcBuffer, bufSize_));
+          HIP_CHECK(hipMemset(srcBuffer, 0xd0, bufSize_));
+        }
+        if (deviceMallocUncached[1]) {
+          HIP_CHECK(hipExtMallocWithFlags(&dstBuffer, bufSize_, hipDeviceMallocUncached));
+        } else if (hostMalloc[1]) {
+          HIP_CHECK(hipHostMalloc(reinterpret_cast<void**>(&dstBuffer), bufSize_, 0));
+        } else if (hostRegister[1]) {
+          memptr[1] = malloc(bufSize_ + 4096);
+          uintptr_t raw = reinterpret_cast<uintptr_t>(memptr[1]);
+          uintptr_t aligned = (raw + 4095) & ~static_cast<uintptr_t>(4095);
+          alignedmemptr[1] = reinterpret_cast<void*>(aligned);
+          dstBuffer = alignedmemptr[1];
+          HIP_CHECK(hipHostRegister(dstBuffer, bufSize_, 0));
+        } else if (unpinnedMalloc[1]) {
+          memptr[1] = malloc(bufSize_ + 4096);
+          uintptr_t raw = reinterpret_cast<uintptr_t>(memptr[1]);
+          uintptr_t aligned = (raw + 4095) & ~static_cast<uintptr_t>(4095);
+          alignedmemptr[1] = reinterpret_cast<void*>(aligned);
+          dstBuffer = alignedmemptr[1];
+        } else {
+          HIP_CHECK(hipMalloc(&dstBuffer, bufSize_));
+        }
+        HIP_CHECK(hipMemcpy(dstBuffer, srcBuffer, bufSize_, hipMemcpyDefault));
+        auto all_start = std::chrono::steady_clock::now();
+        for (unsigned int i = 0; i < numIter; i++) {
+          HIP_CHECK(hipMemcpyAsync(dstBuffer, srcBuffer, bufSize_, hipMemcpyDefault, NULL));
+        }
+        HIP_CHECK(hipDeviceSynchronize());
+        hipError_t syncErr = hipGetLastError();
+        if (syncErr != hipSuccess) {
+          DEBUG_PRINT("WARNING: hipDeviceSynchronize error: %s\n", hipGetErrorString(syncErr));
+        }
+        HIP_CHECK(hipDeviceSynchronize());
+        auto all_end = std::chrono::steady_clock::now();
+        auto start_s =
+            std::chrono::duration_cast<std::chrono::duration<double>>(all_start.time_since_epoch())
+                .count();
+        auto end_s =
+            std::chrono::duration_cast<std::chrono::duration<double>>(all_end.time_since_epoch())
+                .count();
+
+        DEBUG_PRINT("All_start: %f s, All_end: %f s\n", start_s, end_s);
+        std::chrono::duration<double> elapsed_secs = all_end - all_start;
+        DEBUG_PRINT("Elapsed seconds: %f\n", elapsed_secs.count());
+        double bufSizeWithIter = static_cast<double>(bufSize_);  // bytes moved by one copy
+        DEBUG_PRINT("%f\n", bufSizeWithIter);
+        double perf_pre = bufSizeWithIter / elapsed_secs.count();
+        DEBUG_PRINT("%f\n", perf_pre);
+        double perf = perf_pre * static_cast<double>(numIter);
+        DEBUG_PRINT("%f\n", perf);  // bytes/s over all iterations, before scaling to GB/s
+        perf *= static_cast<double>(1e-09);
+        CONSOLE_PRINT("%f\n", perf);
+        const char* strSrc = NULL;
+        const char* strDst = NULL;
+        if (deviceMallocUncached[0])
+          strSrc = "hMUC";
+        else if (hostMalloc[0])
+          strSrc = "hHM";
+        else if (hostRegister[0])
+          strSrc = "hHR";
+        else if (unpinnedMalloc[0])
+          strSrc = "unp";
+        else
+          strSrc = "hM";
+        if (deviceMallocUncached[1])
+          strDst = "hMUC";
+        else if (hostMalloc[1])
+          strDst = "hHM";
+        else if (hostRegister[1])
+          strDst = "hHR";
+        else if (unpinnedMalloc[1])
+          strDst = "unp";
+        else
+          strDst = "hM";
+        if ((!hostMalloc[0] && !hostRegister[0] && !unpinnedMalloc[0]) &&
+            (!hostMalloc[1] && !hostRegister[1] && !unpinnedMalloc[1]))
+          perf *= 2.0;
+        if ((hostMalloc[0] || hostRegister[0] || unpinnedMalloc[0]) &&
+            (hostMalloc[1] || hostRegister[1] || unpinnedMalloc[1]))
+          perf *= 2.0;
+        CONSOLE_PRINT("HIPPerfBufferCopySpeed[%d] %u s:%s d:%s i:%u (GB/s) perf %f\n", test,
+                      bufSize_, strSrc, strDst, numIter, (float)perf);
+        std::cout << "Type," << bufSize_ << "," << strSrc << "," << strDst << "," << numIter << ","
+                  << (float)perf << std::endl;
+        test++;
+        void* temp = malloc(bufSize_ + 4096);
+        void* chkBuf = reinterpret_cast<void*>(temp);
+        HIP_CHECK(hipMemcpy(chkBuf, dstBuffer, bufSize_, hipMemcpyDefault));
+        checkData(chkBuf, bufSize_, 0xd0);
+        free(temp);
+        if (deviceMallocUncached[0]) {
+          HIP_CHECK(hipFree(srcBuffer));
+        } else if (hostMalloc[0]) {
+          HIP_CHECK(hipHostFree(srcBuffer));
+        } else if (hostRegister[0]) {
+          HIP_CHECK(hipHostUnregister(srcBuffer));
+          free(memptr[0]);
+        } else if (unpinnedMalloc[0]) {
+          free(memptr[0]);
+        } else {
+          HIP_CHECK(hipFree(srcBuffer));
+        }
+        if (deviceMallocUncached[1]) {
+          HIP_CHECK(hipFree(dstBuffer));
+        } else if (hostMalloc[1]) {
+          HIP_CHECK(hipHostFree(dstBuffer));
+        } else if (hostRegister[1]) {
+          HIP_CHECK(hipHostUnregister(dstBuffer));
+          free(memptr[1]);
+        } else if (unpinnedMalloc[1]) {
+          free(memptr[1]);
+        } else {
+          HIP_CHECK(hipFree(dstBuffer));
+        }
+        ++testIdx;  // advance once per size so a specific p_tests selects a single test
+      }
+    }
+  }
-
   return true;
 }
 
 /**
-* Test Description
-* ------------------------
-*  - Verify hipPerfBufferCopySpeed status.
-* Test source
-* ------------------------
-*  - perftests/memory/hipPerfBufferCopySpeed.cc
-* Test requirements
-* ------------------------
-*  - HIP_VERSION >= 5.6
-*/
+ * Test Description
+ * ------------------------
+ *  - Verify hipPerfBufferCopySpeed status.
+ * Test source
+ * ------------------------
+ *  - perftests/memory/hipPerfBufferCopySpeed.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.6
+ */
 
 TEST_CASE("Perf_hipPerfBufferCopySpeed_test") {
   int numDevices = 0;
   HIP_CHECK(hipGetDeviceCount(&numDevices));
-
   if (numDevices <= 0) {
-    SUCCEED("Skipped testcase hipPerfBufferCopySpeed as"
-            "there is no device to test.");
+    SUCCEED(
+        "Skipped testcase hipPerfBufferCopySpeed as "
+        "there is no device to test.");
   } else {
     int deviceId = 0;
     HIP_CHECK(hipSetDevice(deviceId));
     hipDeviceProp_t props;
     HIP_CHECK(hipGetDeviceProperties(&props, deviceId));
 
-    INFO("hipPerfBufferCopySpeed - info: Set device to " << deviceId
-         << " : " << props.name << "Legend: unp - unpinned(malloc),"
-         " hM - hipMalloc(device)\n        hHR - hipHostRegister(pinned),"
-         " hHM - hipHostMalloc(prePinned)\n");
+    CONSOLE_PRINT(
+        "hipPerfBufferCopySpeed - info: Set device to %d : %s\nLegend: unp - unpinned(malloc), hM "
+        "- hipMalloc(device), hHR - hipHostRegister(pinned), hHM - hipHostMalloc(prePinned), hMUC "
+        "- hipMallocUncached\n",
+        deviceId, props.name);
 
-    REQUIRE(true == hipPerfBufferCopySpeed_test(1));
+    // Run the test with all sizes and buffer types, alter p_tests to run a specific test
+    REQUIRE(true == hipPerfBufferCopySpeed_test(-1));
   }
 }
 
 /**
-* End doxygen group perfMemoryTest.
-* @}
-*/
+ * End doxygen group perfMemoryTest.
+ * @}
+ */
diff --git a/catch/perftests/memory/hipPerfDevMemReadSpeed.cc b/catch/perftests/memory/hipPerfDevMemReadSpeed.cc
index ae5f63186..9b874b660 100644
--- a/catch/perftests/memory/hipPerfDevMemReadSpeed.cc
+++ b/catch/perftests/memory/hipPerfDevMemReadSpeed.cc
@@ -18,13 +18,14 @@ THE SOFTWARE.
 */
 
 /**
-* @addtogroup hipMemcpyKernel hipMemcpyKernel
-* @{
-* @ingroup perfMemoryTest
-* `hipMemcpy(void* dst, const void* src, size_t count, hipMemcpyKind kind)` -
-* Copies data between host and device.
-*/
-
+ * @addtogroup hipMemcpyKernel hipMemcpyKernel
+ * @{
+ * @ingroup perfMemoryTest
+ * `hipMemcpy(void* dst, const void* src, size_t count, hipMemcpyKind kind)` -
+ * Copies data between host and device.
+ */
+
+// #define ENABLE_DEBUG 1
 #include <hip_test_common.hh>
 
 #define ARRAY_SIZE 16
@@ -33,7 +34,7 @@ typedef struct d_uint16 {
   uint data[ARRAY_SIZE];
 } d_uint16;
 
-__global__ static void read_kernel(d_uint16 *src, ulong N, uint *dst) {
+__global__ static void read_kernel(d_uint16* src, ulong N, uint* dst) {
   size_t idx = (blockIdx.x * blockDim.x + threadIdx.x);
   size_t stride = blockDim.x * gridDim.x;
 
@@ -59,8 +60,8 @@ static bool hipPerfDevMemReadSpeed_test() {
   hipDeviceProp_t props;
   HIP_CHECK(hipGetDeviceProperties(&props, deviceId));
 
-  INFO("info: running on bus " << "0x" << props.pciBusID << " " <<
-       props.name << " with " << props.multiProcessorCount << " CUs \n");
+  CONSOLE_PRINT("info: running on bus 0x%x %s with %d CUs\n", props.pciBusID, props.name,
+                props.multiProcessorCount);
 
   const unsigned threadsPerBlock = 64;
   const unsigned blocks = props.multiProcessorCount * 4;
@@ -70,7 +71,7 @@ static bool hipPerfDevMemReadSpeed_test() {
 
   hSrc = new d_uint16[nBytes];
   REQUIRE(hSrc != nullptr);
-  hDst =  new uint;
+  hDst = new uint;
   REQUIRE(hDst != nullptr);
   hDst[0] = 0;
 
@@ -88,15 +89,15 @@ static bool hipPerfDevMemReadSpeed_test() {
   HIP_CHECK(hipMemcpy(dSrc, hSrc, nBytes, hipMemcpyHostToDevice));
   HIP_CHECK(hipMemcpy(dDst, hDst, sizeof(uint), hipMemcpyHostToDevice));
 
-  hipLaunchKernelGGL(read_kernel, dim3(blocks), dim3(threadsPerBlock),
-                                  0, stream, dSrc, N, dDst);
+  hipLaunchKernelGGL(read_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dSrc, N, dDst);
   HIP_CHECK(hipGetLastError());
   HIP_CHECK(hipMemcpy(hDst, dDst, sizeof(uint), hipMemcpyDeviceToHost));
   HIP_CHECK(hipDeviceSynchronize());
 
   if (hDst[0] != (nBytes / sizeof(uint))) {
-    INFO("hipPerfDevMemReadSpeed - Data validation failed for warm up run!" <<
-         " expected " << nBytes / sizeof(uint) << " got " << hDst[0]);
+    DEBUG_PRINT(
+        "hipPerfDevMemReadSpeed - Data validation failed for warm up run! expected %zu got %u\n",
+        static_cast<size_t>(nBytes / sizeof(uint)), hDst[0]);
     return false;
   }
 
@@ -104,8 +105,7 @@ static bool hipPerfDevMemReadSpeed_test() {
   auto all_start = std::chrono::steady_clock::now();
 
   for (int i = 0; i < nIter; i++) {
-    hipLaunchKernelGGL(read_kernel, dim3(blocks), dim3(threadsPerBlock),
-                                    0, stream, dSrc, N, dDst);
+    hipLaunchKernelGGL(read_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dSrc, N, dDst);
     HIP_CHECK(hipGetLastError());
   }
   HIP_CHECK(hipDeviceSynchronize());
@@ -114,14 +114,14 @@ static bool hipPerfDevMemReadSpeed_test() {
   std::chrono::duration<double> all_kernel_time = all_end - all_start;
 
   // read speed in GB/s
-  double perf = (static_cast<double>(nBytes * nIter * (1e-09))) /
-                                     all_kernel_time.count();
+  double perf = (static_cast<double>(nBytes * nIter * (1e-09))) / all_kernel_time.count();
 
-  INFO("hipPerfDevMemReadSpeed - info: average read speed of " <<
-        perf << " GB/s " << "achieved for memory size of " <<
-        nBytes / (1024 * 1024) << " MB");
+  CONSOLE_PRINT(
+      "hipPerfDevMemReadSpeed - average read speed of %.2f GB/s achieved for memory size of %u "
+      "MB\n",
+      perf, nBytes / (1024 * 1024));
 
-  delete [] hSrc;
+  delete[] hSrc;
   delete hDst;
   HIP_CHECK(hipFree(dSrc));
   HIP_CHECK(hipFree(dDst));
@@ -130,30 +130,31 @@ static bool hipPerfDevMemReadSpeed_test() {
 }
 
 /**
-* Test Description
-* ------------------------
-*  - Verify hipPerfDevMemReadSpeed status.
-* Test source
-* ------------------------
-*  - perftests/memory/hipPerfDevMemReadSpeed.cc
-* Test requirements
-* ------------------------
-*  - HIP_VERSION >= 5.6
-*/
+ * Test Description
+ * ------------------------
+ *  - Verify hipPerfDevMemReadSpeed status.
+ * Test source
+ * ------------------------
+ *  - perftests/memory/hipPerfDevMemReadSpeed.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.6
+ */
 
 TEST_CASE("Perf_hipPerfDevMemReadSpeed_test") {
   int numDevices = 0;
   HIP_CHECK(hipGetDeviceCount(&numDevices));
 
   if (numDevices <= 0) {
-    SUCCEED("Skipped testcase hipPerfDevMemReadSpeed as"
-            "there is no device to test.");
+    SUCCEED(
+        "Skipped testcase hipPerfDevMemReadSpeed as "
+        "there is no device to test.");
   } else {
     REQUIRE(true == hipPerfDevMemReadSpeed_test());
   }
 }
 
 /**
-* End doxygen group perfMemoryTest.
-* @}
-*/
+ * End doxygen group perfMemoryTest.
+ * @}
+ */
diff --git a/catch/perftests/memory/hipPerfDevMemWriteSpeed.cc b/catch/perftests/memory/hipPerfDevMemWriteSpeed.cc
index 77eac4b8b..e96b206c8 100644
--- a/catch/perftests/memory/hipPerfDevMemWriteSpeed.cc
+++ b/catch/perftests/memory/hipPerfDevMemWriteSpeed.cc
@@ -18,12 +18,12 @@ THE SOFTWARE.
 */
 
 /**
-* @addtogroup hipMemcpyKernel hipMemcpyKernel
-* @{
-* @ingroup perfMemoryTest
-* `hipMemcpy(void* dst, const void* src, size_t count, hipMemcpyKind kind)` -
-* Copies data between host and device.
-*/
+ * @addtogroup hipMemcpyKernel hipMemcpyKernel
+ * @{
+ * @ingroup perfMemoryTest
+ * `hipMemcpy(void* dst, const void* src, size_t count, hipMemcpyKind kind)` -
+ * Copies data between host and device.
+ */
 
 #include <hip_test_common.hh>
 
@@ -33,12 +33,12 @@ typedef struct d_uint16 {
   uint data[ARRAY_SIZE];
 } d_uint16;
 
-__global__ void write_kernel(d_uint16 *dst, ulong N, d_uint16 pval) {
-    size_t idx = (blockIdx.x * blockDim.x + threadIdx.x);
-    size_t stride = blockDim.x * gridDim.x;
-    for (size_t i = idx; i < N; i += stride) {
-      dst[i] = pval;
-    }
+__global__ void write_kernel(d_uint16* dst, ulong N, d_uint16 pval) {
+  size_t idx = (blockIdx.x * blockDim.x + threadIdx.x);
+  size_t stride = blockDim.x * gridDim.x;
+  for (size_t i = idx; i < N; i += stride) {
+    dst[i] = pval;
+  }
 }
 
 static bool hipPerfDevMemWriteSpeed_test() {
@@ -55,8 +55,8 @@ static bool hipPerfDevMemWriteSpeed_test() {
   hipDeviceProp_t props;
   HIP_CHECK(hipGetDeviceProperties(&props, deviceId));
 
-  INFO("info: running on bus " << "0x" << props.pciBusID << " " <<
-       props.name << " with " << props.multiProcessorCount << " CUs \n");
+  CONSOLE_PRINT("info: running on bus 0x%x %s with %d CUs\n", props.pciBusID, props.name,
+                props.multiProcessorCount);
 
   const unsigned threadsPerBlock = 64;
   const unsigned blocks = props.multiProcessorCount * 4;
@@ -65,7 +65,7 @@ static bool hipPerfDevMemWriteSpeed_test() {
     pval.data[i] = inputData;
   }
 
-  hDst =  new d_uint16[nBytes];
+  hDst = new d_uint16[nBytes];
   REQUIRE(hDst != nullptr);
 
   for (size_t i = 0; i < N; i++) {
@@ -78,18 +78,18 @@ static bool hipPerfDevMemWriteSpeed_test() {
   HIP_CHECK(hipStreamCreate(&stream));
 
   HIP_CHECK(hipMalloc(&dDst, nBytes));
-  hipLaunchKernelGGL(write_kernel, dim3(blocks), dim3(threadsPerBlock),
-                                   0, stream, dDst, N, pval);
+  hipLaunchKernelGGL(write_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dDst, N, pval);
   HIP_CHECK(hipGetLastError());
-  HIP_CHECK(hipMemcpy(hDst, dDst, nBytes , hipMemcpyDeviceToHost));
+  HIP_CHECK(hipMemcpy(hDst, dDst, nBytes, hipMemcpyDeviceToHost));
   HIP_CHECK(hipDeviceSynchronize());
 
   for (uint i = 0; i < N; i++) {
     for (uint j = 0; j < ARRAY_SIZE; j++) {
       if (hDst[i].data[j] != inputData) {
-        INFO("hipPerfDevMemWriteSpeed - Data validation failed for warm up run!"
-              << "at index i: " << i << " element j: " << j <<
-              "expected " << inputData << " but got " << hDst[i].data[j]);
+        DEBUG_PRINT(
+            "hipPerfDevMemWriteSpeed - Data validation failed for warm up run! at index i: %u "
+            "element j: %u expected 0x%x but got 0x%x\n",
+            i, j, inputData, hDst[i].data[j]);
         return false;
       }
     }
@@ -99,8 +99,7 @@ static bool hipPerfDevMemWriteSpeed_test() {
   auto all_start = std::chrono::steady_clock::now();
 
   for (int i = 0; i < nIter; i++) {
-    hipLaunchKernelGGL(write_kernel, dim3(blocks), dim3(threadsPerBlock),
-                                     0, stream, dDst, N, pval);
+    hipLaunchKernelGGL(write_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dDst, N, pval);
     HIP_CHECK(hipGetLastError());
   }
   HIP_CHECK(hipDeviceSynchronize());
@@ -109,44 +108,45 @@ static bool hipPerfDevMemWriteSpeed_test() {
   std::chrono::duration<double> all_kernel_time = all_end - all_start;
 
   // read speed in GB/s
-  double perf = (static_cast<double>(nBytes * nIter * (1e-09))) /
-                                     all_kernel_time.count();
+  double perf = (static_cast<double>(nBytes * nIter * (1e-09))) / all_kernel_time.count();
 
-  INFO("hipPerfDevMemReadSpeed - info: average write speed of " <<
-        perf << " GB/s " << "achieved for memory size of " <<
-        nBytes / (1024 * 1024) << " MB");
+  CONSOLE_PRINT(
+      "hipPerfDevMemWriteSpeed - average write speed of %.2f GB/s achieved for memory size of %u "
+      "MB\n",
+      perf, static_cast<unsigned>(nBytes / (1024 * 1024)));
 
-  delete [] hDst;
+  delete[] hDst;
   HIP_CHECK(hipFree(dDst));
   HIP_CHECK(hipStreamDestroy(stream));
   return true;
 }
 
 /**
-* Test Description
-* ------------------------
-*  - Verify hipPerfDevMemWriteSpeed status.
-* Test source
-* ------------------------
-*  - perftests/memory/hipPerfDevMemWriteSpeed.cc
-* Test requirements
-* ------------------------
-*  - HIP_VERSION >= 5.6
-*/
+ * Test Description
+ * ------------------------
+ *  - Verify hipPerfDevMemWriteSpeed status.
+ * Test source
+ * ------------------------
+ *  - perftests/memory/hipPerfDevMemWriteSpeed.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.6
+ */
 
 TEST_CASE("Perf_hipPerfDevMemWriteSpeed_test") {
   int numDevices = 0;
   HIP_CHECK(hipGetDeviceCount(&numDevices));
 
   if (numDevices <= 0) {
-    SUCCEED("Skipped testcase hipPerfDevMemWriteSpeed as"
-            "there is no device to test.");
+    SUCCEED(
+        "Skipped testcase hipPerfDevMemWriteSpeed as "
+        "there is no device to test.");
   } else {
     REQUIRE(true == hipPerfDevMemWriteSpeed_test());
   }
 }
 
 /**
-* End doxygen group perfMemoryTest.
-* @}
-*/
+ * End doxygen group perfMemoryTest.
+ * @}
+ */
diff --git a/catch/perftests/memory/hipPerfHostNumaAlloc.cc b/catch/perftests/memory/hipPerfHostNumaAlloc.cc
index b1f056681..39a437b68 100644
--- a/catch/perftests/memory/hipPerfHostNumaAlloc.cc
+++ b/catch/perftests/memory/hipPerfHostNumaAlloc.cc
@@ -18,28 +18,27 @@ THE SOFTWARE.
 */
 
 /**
-* @addtogroup hipMemcpyKernel hipMemcpyKernel
-* @{
-* @ingroup perfMemoryTest
-* `hipMemcpy(void* dst, const void* src, size_t count, hipMemcpyKind kind)` -
-* Copies data between host and device.
-*/
+ * @addtogroup hipMemcpyKernel hipMemcpyKernel
+ * @{
+ * @ingroup perfMemoryTest
+ * `hipMemcpy(void* dst, const void* src, size_t count, hipMemcpyKind kind)` -
+ * Copies data between host and device.
+ */
 
+// #define ENABLE_DEBUG 1
 #include <numaif.h>
 #include <hip_test_common.hh>
-
 // To run it correctly, we must not export HIP_VISIBLE_DEVICES.
 // And we must explicitly link libnuma because of numa api move_pages().
 #define NUM_PAGES 4
-char *h = nullptr;
-char *d_h = nullptr;
-char *m = nullptr;
-char *d_m = nullptr;
+char* h = nullptr;
+char* d_h = nullptr;
+char* m = nullptr;
+char* d_m = nullptr;
 int page_size = 1024;
 
-const int mode[] = { MPOL_DEFAULT, MPOL_BIND, MPOL_PREFERRED, MPOL_INTERLEAVE };
-const char* modeStr[] = { "MPOL_DEFAULT", "MPOL_BIND",
-                          "MPOL_PREFERRED", "MPOL_INTERLEAVE" };
+const int mode[] = {MPOL_DEFAULT, MPOL_BIND, MPOL_PREFERRED, MPOL_INTERLEAVE};
+const char* modeStr[] = {"MPOL_DEFAULT", "MPOL_BIND", "MPOL_PREFERRED", "MPOL_INTERLEAVE"};
 
 std::string exeCommand(const char* cmd) {
   std::array<char, 128> buff;
@@ -55,23 +54,22 @@ std::string exeCommand(const char* cmd) {
 }
 
 int getCpuAgentCount() {
-  const char* cmd =
-              "cat /proc/cpuinfo | grep \"physical id\" | sort | uniq | wc -l";
+  const char* cmd = "cat /proc/cpuinfo | grep \"physical id\" | sort | uniq | wc -l";
   int cpuAgentCount = std::atoi(exeCommand(cmd).c_str());
   return cpuAgentCount;
 }
 
 bool test(int cpuId, int gpuId, int numaMode, unsigned int hostMallocflags) {
-  void *pages[NUM_PAGES];
+  void* pages[NUM_PAGES];
   int status[NUM_PAGES];
   int ret_code;
 
-  INFO("set cpu " << cpuId << ", gpu " << gpuId << ", numaMode "
-        << numaMode << ", hostMallocflags " << hostMallocflags << "\n");
+  CONSOLE_PRINT("set cpu %d, gpu %d, numaMode %d, hostMallocflags %u\n", cpuId, gpuId, numaMode,
+                hostMallocflags);
 
   if (cpuId >= 0) {
-    unsigned long nodeMask = 1 << cpuId;            //NOLINT
-    unsigned long maxNode = sizeof(nodeMask) * 8;   //NOLINT
+    unsigned long nodeMask = 1UL << cpuId;         // NOLINT
+    unsigned long maxNode = sizeof(nodeMask) * 8;  // NOLINT
     if (set_mempolicy(numaMode, numaMode == MPOL_DEFAULT ? NULL : &nodeMask,
                       numaMode == MPOL_DEFAULT ? 0 : maxNode) == -1) {
       WARN("set_mempolicy() failed with err " << errno << "\n");
@@ -83,7 +81,7 @@ bool test(int cpuId, int gpuId, int numaMode, unsigned int hostMallocflags) {
     HIP_CHECK(hipSetDevice(gpuId));
   }
 
-  posix_memalign(reinterpret_cast<void**>(&m), page_size, page_size*NUM_PAGES);
+  posix_memalign(reinterpret_cast<void**>(&m), page_size, page_size * NUM_PAGES);
   HIP_CHECK(hipHostRegister(m, page_size * NUM_PAGES, hipHostRegisterMapped));
   HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&d_m), m, 0));
 
@@ -94,15 +92,13 @@ bool test(int cpuId, int gpuId, int numaMode, unsigned int hostMallocflags) {
   }
 
   ret_code = move_pages(0, NUM_PAGES, pages, NULL, status, 0);
-  INFO("Memory (malloc) ret " << ret_code << " at " << m <<
-                            " (dev " << d_m << "%p) is at node: ");
+  CONSOLE_PRINT("Memory (malloc) ret %d at %p (dev %p) is at node: ", ret_code, m, d_m);
   for (int i = 0; i < NUM_PAGES; i++) {
-    INFO(status[i]);  // Don't verify as it's out of our control
+    CONSOLE_PRINT("%d ", status[i]);  // Don't verify as it's out of our control
   }
-  INFO("\n");
+  CONSOLE_PRINT("\n");
 
-  HIP_CHECK(hipHostMalloc(reinterpret_cast<void**>(&h),
-                          page_size*NUM_PAGES, hostMallocflags));
+  HIP_CHECK(hipHostMalloc(reinterpret_cast<void**>(&h), page_size * NUM_PAGES, hostMallocflags));
   pages[0] = h;
   for (int i = 1; i < NUM_PAGES; i++) {
     pages[i] = reinterpret_cast<char*>(pages[0]) + page_size;
@@ -111,16 +107,14 @@ bool test(int cpuId, int gpuId, int numaMode, unsigned int hostMallocflags) {
   d_h = nullptr;
   if (hostMallocflags & hipHostMallocMapped) {
     HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&d_h), h, 0));
-    INFO("Memory (hipHostMalloc) ret " << ret_code << " at " << h
-                                  << " (dev " << d_h << ") is at node: ");
+    CONSOLE_PRINT("Memory (hipHostMalloc) ret %d at %p (dev %p) is at node: ", ret_code, h, d_h);
   } else {
-    INFO("Memory (hipHostMalloc) ret " << ret_code << " at "
-                                       << h << " is at node: ");
+    CONSOLE_PRINT("Memory (hipHostMalloc) ret %d at %p is at node: ", ret_code, h);
   }
   for (int i = 0; i < NUM_PAGES; i++) {
-    INFO(status[i]);  // Always print it even if it's wrong. Verify later
+    CONSOLE_PRINT("%d ", status[i]);  // Always print it even if it's wrong. Verify later
   }
-  INFO("\n");
+  CONSOLE_PRINT("\n");
 
   HIP_CHECK(hipHostFree(reinterpret_cast<void*>(h)));
   HIP_CHECK(hipHostUnregister(m));
@@ -129,8 +123,7 @@ bool test(int cpuId, int gpuId, int numaMode, unsigned int hostMallocflags) {
   if (cpuId >= 0 && (numaMode == MPOL_BIND || numaMode == MPOL_PREFERRED)) {
     for (int i = 0; i < NUM_PAGES; i++) {
       if (status[i] != cpuId) {  // Now verify
-        WARN("Failed at " << i << " status[i] = " << status[i]
-                          << " cpuId " << cpuId << "\n");
+        WARN("Failed at " << i << " status[i] = " << status[i] << " cpuId " << cpuId << "\n");
         return false;
       }
     }
@@ -138,12 +131,12 @@ bool test(int cpuId, int gpuId, int numaMode, unsigned int hostMallocflags) {
   return true;
 }
 
-bool runTest(const int &cpuCount, const int &gpuCount,
-             unsigned int hostMallocflags, const std::string &str) {
-  INFO("Test- " << str.c_str() << "\n");
+bool runTest(const int& cpuCount, const int& gpuCount, unsigned int hostMallocflags,
+             const std::string& str) {
+  CONSOLE_PRINT("Test- %s\n", str.c_str());
 
   for (int m = 0; m < sizeof(mode) / sizeof(mode[0]); m++) {
-    INFO("Testing " << modeStr[m] << "\n");
+    CONSOLE_PRINT("Testing %s\n", modeStr[m]);
 
     for (int i = 0; i < cpuCount; i++) {
       for (int j = 0; j < gpuCount; j++) {
@@ -157,39 +150,40 @@ bool runTest(const int &cpuCount, const int &gpuCount,
 }
 
 /**
-* Test Description
-* ------------------------
-*  - Verify hipPerfHostNumaAlloc status.
-* Test source
-* ------------------------
-*  - perftests/memory/hipPerfHostNumaAlloc.cc
-* Test requirements
-* ------------------------
-*  - HIP_VERSION >= 5.6
-*/
+ * Test Description
+ * ------------------------
+ *  - Verify hipPerfHostNumaAlloc status.
+ * Test source
+ * ------------------------
+ *  - perftests/memory/hipPerfHostNumaAlloc.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.6
+ */
 
 TEST_CASE("Perf_hipPerfHostNumaAlloc_test") {
   int gpuCount = 0;
   HIP_CHECK(hipGetDeviceCount(&gpuCount));
   int cpuCount = getCpuAgentCount();
-  INFO("Cpu count " << cpuCount << ", Gpu count " << gpuCount << "\n");
+  CONSOLE_PRINT("Cpu count %d, Gpu count %d\n", cpuCount, gpuCount);
 
   if (cpuCount < 0 || gpuCount < 0) {
-    SUCCEED("Skipped testcase hipPerfHostNumaAlloc as "
-            "there is no device to test.\n");
+    SUCCEED(
+        "Skipped testcase hipPerfHostNumaAlloc as "
+        "there is no device to test.\n");
     return;
   }
 
-  REQUIRE(true == runTest(cpuCount, gpuCount,
-                          hipHostMallocDefault | hipHostMallocNumaUser,
-               "Testing hipHostMallocDefault | hipHostMallocNumaUser......"));
+  REQUIRE(true ==
+          runTest(cpuCount, gpuCount, hipHostMallocDefault | hipHostMallocNumaUser,
+                  "Testing hipHostMallocDefault | hipHostMallocNumaUser......"));
 
-  REQUIRE(true == runTest(cpuCount, gpuCount,
-                          hipHostMallocMapped | hipHostMallocNumaUser,
-               "Testing hipHostMallocMapped | hipHostMallocNumaUser......."));
+  REQUIRE(true ==
+          runTest(cpuCount, gpuCount, hipHostMallocMapped | hipHostMallocNumaUser,
+                  "Testing hipHostMallocMapped | hipHostMallocNumaUser......."));
 }
 
 /**
-* End doxygen group perfMemoryTest.
-* @}
-*/
+ * End doxygen group perfMemoryTest.
+ * @}
+ */
diff --git a/catch/perftests/memory/hipPerfMemFill.cc b/catch/perftests/memory/hipPerfMemFill.cc
index 5892c10f8..32bfa622c 100644
--- a/catch/perftests/memory/hipPerfMemFill.cc
+++ b/catch/perftests/memory/hipPerfMemFill.cc
@@ -18,20 +18,19 @@
  */
 
 /**
-* @addtogroup hipMemcpyKernel hipMemcpyKernel
-* @{
-* @ingroup perfMemoryTest
-* `hipMemcpy(void* dst, const void* src, size_t count, hipMemcpyKind kind)` -
-* Copies data between host and device.
-*/
+ * @addtogroup hipMemcpyKernel hipMemcpyKernel
+ * @{
+ * @ingroup perfMemoryTest
+ * `hipMemcpy(void* dst, const void* src, size_t count, hipMemcpyKind kind)` -
+ * Copies data between host and device.
+ */
 
 #include <hip_test_common.hh>
 
 #define SIMPLY_ASSIGN 0
 #define USE_HIPTEST_SETNUMBLOCKS 0
 
-template<class T>
-__global__ void vec_fill(T *x, T coef, int N) {
+template <class T> __global__ void vec_fill(T* x, T coef, int N) {
   const int istart = threadIdx.x + blockIdx.x * blockDim.x;
   const int ishift = blockDim.x * gridDim.x;
   for (int i = istart; i < N; i += ishift) {
@@ -51,8 +50,7 @@ __device__ void print_log(int i, int value, int expected) {
   printf("failed at %d: val=%d, expected=%d\n", i, value, expected);
 }
 
-template<class T>
-__global__ void vec_verify(T *x, T coef, int N) {
+template <class T> __global__ void vec_verify(T* x, T coef, int N) {
   const int istart = threadIdx.x + blockIdx.x * blockDim.x;
   const int ishift = blockDim.x * gridDim.x;
   for (int i = istart; i < N; i += ishift) {
@@ -68,20 +66,17 @@ __global__ void vec_verify(T *x, T coef, int N) {
   }
 }
 
-template<class T>
-__global__ void daxpy(T *__restrict__ x, T *__restrict__ y,
-    const T coef, int Niter, int N) {
+template <class T>
+__global__ void daxpy(T* __restrict__ x, T* __restrict__ y, const T coef, int Niter, int N) {
   const int istart = threadIdx.x + blockIdx.x * blockDim.x;
   const int ishift = blockDim.x * gridDim.x;
   for (int iter = 0; iter < Niter; ++iter) {
     T iv = coef * iter;
-    for (int i = istart; i < N; i += ishift)
-      y[i] = iv * x[i] + y[i];
+    for (int i = istart; i < N; i += ishift) y[i] = iv * x[i] + y[i];
   }
 }
 
-template<class T>
-class hipPerfMemFill {
+template <class T> class hipPerfMemFill {
  private:
   static constexpr int NUM_START = 27;
   static constexpr int NUM_SIZE = 4;
@@ -96,26 +91,20 @@ class hipPerfMemFill {
  public:
   hipPerfMemFill() {
     for (int i = 0; i < NUM_SIZE; i++) {
-       // 128M, 256M, 512M, 1024M
+      // 128M, 256M, 512M, 1024M
       totalSizes_[i] = 1ull << (i + NUM_START);
     }
   }
 
-  ~hipPerfMemFill() { }
+  ~hipPerfMemFill() {}
 
-  bool supportLargeBar() {
-    return props_.isLargeBar != 0;
-  }
+  bool supportLargeBar() { return props_.isLargeBar != 0; }
 
-  bool supportManagedMemory() {
-    return props_.managedMemory != 0;
-  }
+  bool supportManagedMemory() { return props_.managedMemory != 0; }
 
-  const T getCoefficient(double val) {
-    return static_cast<T>(val);
-  }
+  const T getCoefficient(double val) { return static_cast<T>(val); }
 
-  void setHostBuffer(T *A, T val, size_t size) {
+  void setHostBuffer(T* A, T val, size_t size) {
     size_t len = size / sizeof(T);
     for (int i = 0; i < len; i++) {
       A[i] = val;
@@ -138,33 +127,29 @@ class hipPerfMemFill {
     HIP_CHECK(hipGetDeviceProperties(&props_, deviceId));
     blocksPerCU_ = props_.multiProcessorCount * 4;
 
-    std::cout << "Info: running on device: id: " << deviceId << ", bus: 0x"
-        << props_.pciBusID << " " << props_.name << " with "
-        << props_.multiProcessorCount << " CUs, large bar: "
-        << supportLargeBar() << ", managed memory: " << supportManagedMemory()
-        << ", DeviceMallocFinegrained: " << supportDeviceMallocFinegrained()
-        << std::endl;
+    std::cout << "Info: running on device: id: " << deviceId << ", bus: 0x" << props_.pciBusID
+              << " " << props_.name << " with " << props_.multiProcessorCount
+              << " CUs, large bar: " << supportLargeBar()
+              << ", managed memory: " << supportManagedMemory()
+              << ", DeviceMallocFinegrained: " << supportDeviceMallocFinegrained() << std::endl;
     return true;
   }
 
   void log_host(const char* title, double GBytes, double sec) {
-    std::cout << title << " [" << std::setw(7) << GBytes << " GB]: cost "
-              << std::setw(10) << sec << " s in bandwidth " << std::setw(10)
-              << GBytes / sec << " [GB/s]" << std::endl;
+    std::cout << title << " [" << std::setw(7) << GBytes << " GB]: cost " << std::setw(10) << sec
+              << " s in bandwidth " << std::setw(10) << GBytes / sec << " [GB/s]" << std::endl;
   }
 
-  void log_kernel(const char* title, double GBytes, double sec,
-                                     double sec_hv, double sec_kv) {
-    std::cout << title << " [" << std::setw(7) << GBytes << " GB]: cost "
-              << std::setw(10) << sec << " s in bandwidth " << std::setw(10)
-              << GBytes / sec << " [GB/s]" << ", hostVerify cost "
-              << std::setw(10) << sec_hv << " s in bandwidth " << std::setw(10)
-              << GBytes / sec_hv << " [GB/s]" << ", kernelVerify cost "
-              << std::setw(10) << sec_kv << " s in bandwidth " << std::setw(10)
-              << GBytes / sec_kv << " [GB/s]" << std::endl;
+  void log_kernel(const char* title, double GBytes, double sec, double sec_hv, double sec_kv) {
+    std::cout << title << " [" << std::setw(7) << GBytes << " GB]: cost " << std::setw(10) << sec
+              << " s in bandwidth " << std::setw(10) << GBytes / sec << " [GB/s]"
+              << ", hostVerify cost " << std::setw(10) << sec_hv << " s in bandwidth "
+              << std::setw(10) << GBytes / sec_hv << " [GB/s]" << ", kernelVerify cost "
+              << std::setw(10) << sec_kv << " s in bandwidth " << std::setw(10) << GBytes / sec_kv
+              << " [GB/s]" << std::endl;
   }
 
-  void hostFill(size_t size, T *data, T coef, double *sec) {
+  void hostFill(size_t size, T* data, T coef, double* sec) {
     size_t num = size / sizeof(T);  // Size of elements
     auto start = std::chrono::steady_clock::now();
     for (int i = 0; i < num; ++i) {
@@ -179,29 +164,29 @@ class hipPerfMemFill {
     *sec = diff.count();
   }
 
-  void kernelFill(size_t size, T *data, T coef, double *sec) {
+  void kernelFill(size_t size, T* data, T coef, double* sec) {
     size_t num = size / sizeof(T);  // Size of elements
     unsigned blocks = setNumBlocks(num);
 
     // kernel will be loaded first time
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(vec_fill<T>), dim3(blocks),
-                           dim3(threadsPerBlock_), 0, 0, data, 0, num);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(vec_fill<T>), dim3(blocks), dim3(threadsPerBlock_), 0, 0,
+                       data, 0, num);
     HIP_CHECK(hipDeviceSynchronize());
 
     auto start = std::chrono::steady_clock::now();
 
     for (int iter = 0; iter < NUM_ITER; ++iter) {
-      hipLaunchKernelGGL(HIP_KERNEL_NAME(vec_fill<T>), dim3(blocks),
-                             dim3(threadsPerBlock_), 0, 0, data, coef, num);
+      hipLaunchKernelGGL(HIP_KERNEL_NAME(vec_fill<T>), dim3(blocks), dim3(threadsPerBlock_), 0, 0,
+                         data, coef, num);
     }
     HIP_CHECK(hipDeviceSynchronize());
 
     auto end = std::chrono::steady_clock::now();
     std::chrono::duration<double> diff = end - start;  // in second
-    *sec = diff.count() / NUM_ITER;  // in second
+    *sec = diff.count() / NUM_ITER;                    // in second
   }
 
-  void hostVerify(size_t size, T *data, T coef, double *sec) {
+  void hostVerify(size_t size, T* data, T coef, double* sec) {
     size_t num = size / sizeof(T);  // Size of elements
     auto start = std::chrono::steady_clock::now();
     for (int i = 0; i < num; ++i) {
@@ -224,27 +209,27 @@ class hipPerfMemFill {
     *sec = diff.count();
   }
 
-  void kernelVerify(size_t size, T *data, T coef, double *sec) {
+  void kernelVerify(size_t size, T* data, T coef, double* sec) {
     size_t num = size / sizeof(T);  // Size of elements
     unsigned blocks = setNumBlocks(num);
 
     // kernel will be loaded first time
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(vec_verify<T>), dim3(blocks),
-                       dim3(threadsPerBlock_), 0, 0, data, coef, num);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(vec_verify<T>), dim3(blocks), dim3(threadsPerBlock_), 0, 0,
+                       data, coef, num);
     HIP_CHECK(hipDeviceSynchronize());
 
     // Now all data verified. The following is to test bandwidth.
     auto start = std::chrono::steady_clock::now();
 
     for (int iter = 0; iter < NUM_ITER; ++iter) {
-      hipLaunchKernelGGL(HIP_KERNEL_NAME(vec_verify<T>), dim3(blocks),
-                             dim3(threadsPerBlock_), 0, 0, data, coef, num);
+      hipLaunchKernelGGL(HIP_KERNEL_NAME(vec_verify<T>), dim3(blocks), dim3(threadsPerBlock_), 0, 0,
+                         data, coef, num);
     }
     HIP_CHECK(hipDeviceSynchronize());
 
     auto end = std::chrono::steady_clock::now();
     std::chrono::duration<double> diff = end - start;  // in second
-    *sec = diff.count() / NUM_ITER;  // in second
+    *sec = diff.count() / NUM_ITER;                    // in second
   }
 
   bool testLargeBarDeviceMemoryHostFill(size_t size) {
@@ -254,7 +239,7 @@ class hipPerfMemFill {
 
     double GBytes = static_cast<double>(size) / NUM_1GB;
 
-    T *A;
+    T* A;
     HIP_CHECK(hipMalloc(&A, size));
     double sec = 0;
     hostFill(size, A, coef_, &sec);  // Cpu can access device mem in LB
@@ -285,7 +270,7 @@ class hipPerfMemFill {
     }
     double GBytes = static_cast<double>(size) / NUM_1GB;
 
-    T *A;
+    T* A;
     HIP_CHECK(hipMallocManaged(&A, size));
     double sec = 0;
     hostFill(size, A, coef_, &sec);  // Cpu can access HMM mem
@@ -301,7 +286,7 @@ class hipPerfMemFill {
     }
     double GBytes = static_cast<double>(size) / NUM_1GB;
 
-    T *A;
+    T* A;
     HIP_CHECK(hipMallocManaged(&A, size));
 
     double sec = 0, sec_hv = 0, sec_kv = 0;
@@ -340,7 +325,7 @@ class hipPerfMemFill {
 
   bool testHostMemoryHostFill(size_t size, unsigned int flags) {
     double GBytes = static_cast<double>(size) / NUM_1GB;
-    T *A;
+    T* A;
     HIP_CHECK(hipHostMalloc(&A, size, flags));
     double sec = 0;
     hostFill(size, A, coef_, &sec);
@@ -353,8 +338,8 @@ class hipPerfMemFill {
   bool testHostMemoryKernelFill(size_t size, unsigned int flags) {
     double GBytes = static_cast<double>(size) / NUM_1GB;
 
-    T *A;
-    HIP_CHECK(hipHostMalloc(reinterpret_cast<void **>(&A), size, flags));
+    T* A;
+    HIP_CHECK(hipHostMalloc(reinterpret_cast<void**>(&A), size, flags));
     double sec = 0, sec_hv = 0, sec_kv = 0;
     kernelFill(size, A, coef_, &sec);
     hostVerify(size, A, coef_, &sec_hv);
@@ -400,10 +385,11 @@ class hipPerfMemFill {
   /* This function should be via device attribute query*/
   bool supportDeviceMallocFinegrained() {
 #ifdef __HIP_PLATFORM_AMD__
-    T *A = nullptr;
+    T* A = nullptr;
     hipError_t err;
-    err = hipExtMallocWithFlags(reinterpret_cast<void**>(&A), sizeof(T),
-                                hipDeviceMallocFinegrained);
+
+    err =
+        hipExtMallocWithFlags(reinterpret_cast<void**>(&A), sizeof(T), hipDeviceMallocFinegrained);
     if (err || !A) {
       return false;
     }
@@ -415,7 +401,7 @@ class hipPerfMemFill {
   }
 
   unsigned int setNumBlocks(size_t size) {
-    size_t num = size/sizeof(T);
+    size_t num = size / sizeof(T);
 
 #if USE_HIPTEST_SETNUMBLOCKS
     return HipTest::setNumBlocks(blocksPerCU_, threadsPerBlock_, num);
@@ -428,12 +414,11 @@ class hipPerfMemFill {
   bool testExtDeviceMemoryHostFill(size_t size, unsigned int flags) {
     double GBytes = static_cast<double>(size) / NUM_1GB;
 
-    T *A = nullptr;
-    HIP_CHECK(hipExtMallocWithFlags(reinterpret_cast<void **>(&A),
-                                    size, flags));
+    T* A = nullptr;
+    HIP_CHECK(hipExtMallocWithFlags(reinterpret_cast<void**>(&A), size, flags));
     if (!A) {
-      std::cout << "failed hipExtMallocWithFlags() with size =" <<
-                   size << " flags="<< std::hex << flags << std::endl;
+      std::cout << "failed hipExtMallocWithFlags() with size =" << size << " flags=" << std::hex
+                << flags << std::endl;
       return false;
     }
 
@@ -448,12 +433,11 @@ class hipPerfMemFill {
   bool testExtDeviceMemoryKernelFill(size_t size, unsigned int flags) {
     double GBytes = static_cast<double>(size) / NUM_1GB;
 
-    T *A = nullptr;
-    HIP_CHECK(hipExtMallocWithFlags(reinterpret_cast<void **>(&A),
-                                    size, flags));
+    T* A = nullptr;
+    HIP_CHECK(hipExtMallocWithFlags(reinterpret_cast<void**>(&A), size, flags));
     if (!A) {
-      std::cout << "failed hipExtMallocWithFlags() with size =" <<
-                   size << " flags=" << std::hex << flags << std::endl;
+      std::cout << "failed hipExtMallocWithFlags() with size =" << size << " flags=" << std::hex
+                << flags << std::endl;
       return false;
     }
 
@@ -470,20 +454,16 @@ class hipPerfMemFill {
   }
 
   bool testExtDeviceMemory() {
-    std::cout << "Test fine grained device memory host filling"
-        << std::endl;
+    std::cout << "Test fine grained device memory host filling" << std::endl;
     for (int i = 0; i < NUM_SIZE; i++) {
-      if (!testExtDeviceMemoryHostFill(totalSizes_[i],
-                                       hipDeviceMallocFinegrained)) {
+      if (!testExtDeviceMemoryHostFill(totalSizes_[i], hipDeviceMallocFinegrained)) {
         return false;
       }
     }
 
-    std::cout << "Test fine grained device memory kernel filling"
-        << std::endl;
+    std::cout << "Test fine grained device memory kernel filling" << std::endl;
     for (int i = 0; i < NUM_SIZE; i++) {
-      if (!testExtDeviceMemoryKernelFill(totalSizes_[i],
-                                         hipDeviceMallocFinegrained)) {
+      if (!testExtDeviceMemoryKernelFill(totalSizes_[i], hipDeviceMallocFinegrained)) {
         return false;
       }
     }
@@ -521,16 +501,16 @@ class hipPerfMemFill {
 };
 
 /**
-* Test Description
-* ------------------------
-*  - Verify hipPerfMemFill status.
-* Test source
-* ------------------------
-*  - perftests/memory/hipPerfMemFill.cc
-* Test requirements
-* ------------------------
-*  - HIP_VERSION >= 5.6
-*/
+ * Test Description
+ * ------------------------
+ *  - Verify hipPerfMemFill status.
+ * Test source
+ * ------------------------
+ *  - perftests/memory/hipPerfMemFill.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.6
+ */
 
 TEST_CASE("Perf_hipPerfMemFill_test") {
   std::cout << "Test int" << std::endl;
@@ -545,6 +525,6 @@ TEST_CASE("Perf_hipPerfMemFill_test") {
 }
 
 /**
-* End doxygen group perfMemoryTest.
-* @}
-*/
+ * End doxygen group perfMemoryTest.
+ * @}
+ */
diff --git a/catch/perftests/memory/hipPerfMemMallocCpyFree.cc b/catch/perftests/memory/hipPerfMemMallocCpyFree.cc
index 3960a16bd..e48bc53e5 100644
--- a/catch/perftests/memory/hipPerfMemMallocCpyFree.cc
+++ b/catch/perftests/memory/hipPerfMemMallocCpyFree.cc
@@ -18,13 +18,13 @@ THE SOFTWARE.
 */
 
 /**
-* @addtogroup hipMemcpy hipMemcpy
-* @{
-* @ingroup perfMemoryTest
-* `hipMemcpy(void* dst, const void* src, size_t count, hipMemcpyKind kind)` -
-* Copies data between host and device.
-*/
-
+ * @addtogroup hipMemcpy hipMemcpy
+ * @{
+ * @ingroup perfMemoryTest
+ * `hipMemcpy(void* dst, const void* src, size_t count, hipMemcpyKind kind)` -
+ * Copies data between host and device.
+ */
+// #define ENABLE_DEBUG 1
 #include <time.h>
 #include <hip_test_common.hh>
 
@@ -38,7 +38,7 @@ void valSet(int* A, int val, size_t size) {
   }
 }
 
-void setup(size_t *size, int *num, int **pA, const size_t totalGlobalMem) {
+void setup(size_t* size, int* num, int** pA, const size_t totalGlobalMem) {
   for (int i = 0; i < *num; i++) {
     size[i] = 1 << (i + 6);
     if ((NUM_ITER + 1) * size[i] > totalGlobalMem) {
@@ -50,39 +50,39 @@ void setup(size_t *size, int *num, int **pA, const size_t totalGlobalMem) {
   valSet(*pA, 1, size[*num - 1]);
 }
 
-void testInit(size_t size, int *A) {
-  int *Ad;
+void testInit(size_t size, int* A) {
+  int* Ad;
 
   clock_t start = clock();
-  HIP_CHECK(hipMalloc(&Ad, size));   //  hip::init() will be called
+  HIP_CHECK(hipMalloc(&Ad, size));  //  hip::init() will be called
   clock_t end = clock();
   double uS = (end - start) * 1000000. / CLOCKS_PER_SEC;
-  INFO("Initial: hipMalloc(" << size << ") cost " << uS << "us" << "\n");
+  CONSOLE_PRINT("Initial: hipMalloc(%zu) cost %.2fus\n", size, uS);
 
   start = clock();
   HIP_CHECK(hipMemcpy(Ad, A, size, hipMemcpyHostToDevice));
   HIP_CHECK(hipDeviceSynchronize());
   end = clock();
   uS = (end - start) * 1000000. / CLOCKS_PER_SEC;
-  INFO("hipMemcpy(" << size << ") cost " << uS << "us" << "\n");
+  CONSOLE_PRINT("hipMemcpy(%zu) cost %.2fus\n", size, uS);
 
   start = clock();
   HIP_CHECK(hipFree(Ad));
   end = clock();
   uS = (end - start) * 1000000. / CLOCKS_PER_SEC;
-  INFO("hipFree(" << size << ") cost " << uS << "us" << "\n");
+  CONSOLE_PRINT("hipFree(%zu) cost %.2fus\n", size, uS);
 }
 
 static bool hipPerfMemMallocCpyFree_test() {
   double uS;
   clock_t start, end;
-  size_t size[NUM_SIZE] = { 0 };
-  int *Ad[NUM_ITER] = { nullptr };
-  int *A;
+  size_t size[NUM_SIZE] = {0};
+  int* Ad[NUM_ITER] = {nullptr};
+  int* A;
   hipDeviceProp_t props;
   memset(&props, 0, sizeof(props));
   HIP_CHECK(hipGetDeviceProperties(&props, 0));
-  INFO("totalGlobalMem: " << props.totalGlobalMem << "\n");
+  CONSOLE_PRINT("totalGlobalMem: %zu\n", props.totalGlobalMem);
 
   int num = NUM_SIZE;
   setup(size, &num, &A, props.totalGlobalMem);
@@ -91,59 +91,60 @@ static bool hipPerfMemMallocCpyFree_test() {
   for (int i = 0; i < num; i++) {
     start = clock();
     for (int j = 0; j < NUM_ITER; j++) {
-        HIP_CHECK(hipMalloc(&Ad[j], size[i]));
+      HIP_CHECK(hipMalloc(&Ad[j], size[i]));
     }
     end = clock();
     uS = (end - start) * 1000000. / (NUM_ITER * CLOCKS_PER_SEC);
-    INFO("hipMalloc(" << size[i] << ") cost " << uS << "us" << "\n");
+    CONSOLE_PRINT("hipMalloc(%zu) cost %.2fus\n", size[i], uS);
 
     start = clock();
     for (int j = 0; j < NUM_ITER; j++) {
-        HIP_CHECK(hipMemcpy(Ad[j], A, size[i], hipMemcpyHostToDevice));
+      HIP_CHECK(hipMemcpy(Ad[j], A, size[i], hipMemcpyHostToDevice));
     }
     HIP_CHECK(hipDeviceSynchronize());
     end = clock();
     uS = (end - start) * 1000000. / (NUM_ITER * CLOCKS_PER_SEC);
-    INFO("hipMemcpy(" << size[i] << ") cost " << uS << "us" << "\n");
+    CONSOLE_PRINT("hipMemcpy(%zu) cost %.2fus\n", size[i], uS);
 
     start = clock();
     for (int j = 0; j < NUM_ITER; j++) {
-        HIP_CHECK(hipFree(Ad[j]));
-        Ad[j] = nullptr;
+      HIP_CHECK(hipFree(Ad[j]));
+      Ad[j] = nullptr;
     }
     end = clock();
     double uS = (end - start) * 1000000. / (NUM_ITER * CLOCKS_PER_SEC);
-    INFO("hipFree(" << size[i] << ") cost " << uS << "us" << "\n");
+    CONSOLE_PRINT("hipFree(%zu) cost %.2fus\n", size[i], uS);
   }
   free(A);
   return true;
 }
 
 /**
-* Test Description
-* ------------------------
-*  - Verify hipPerfMemMallocCpyFree status.
-* Test source
-* ------------------------
-*  - perftests/memory/hipPerfMemMallocCpyFree.cc
-* Test requirements
-* ------------------------
-*  - HIP_VERSION >= 5.6
-*/
+ * Test Description
+ * ------------------------
+ *  - Verify hipPerfMemMallocCpyFree status.
+ * Test source
+ * ------------------------
+ *  - perftests/memory/hipPerfMemMallocCpyFree.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.6
+ */
 
 TEST_CASE("Perf_hipPerfMemMallocCpyFree_test") {
   int numDevices = 0;
   HIP_CHECK(hipGetDeviceCount(&numDevices));
 
   if (numDevices <= 0) {
-    SUCCEED("Skipped testcase hipPerfDevMemReadSpeed as"
-            "there is no device to test.");
+    SUCCEED(
+        "Skipped testcase hipPerfMemMallocCpyFree as "
+        "there is no device to test.");
   } else {
     REQUIRE(true == hipPerfMemMallocCpyFree_test());
   }
 }
 
 /**
-* End doxygen group perfMemoryTest.
-* @}
-*/
+ * End doxygen group perfMemoryTest.
+ * @}
+ */
diff --git a/catch/perftests/memory/hipPerfMemcpy.cc b/catch/perftests/memory/hipPerfMemcpy.cc
index badc3d43c..d5ad0786a 100644
--- a/catch/perftests/memory/hipPerfMemcpy.cc
+++ b/catch/perftests/memory/hipPerfMemcpy.cc
@@ -18,15 +18,15 @@
  */
 
 /**
-* @addtogroup hipMemcpy hipMemcpy
-* @{
-* @ingroup perfMemoryTest
-* `hipMemcpy(void* dst, const void* src, size_t count, hipMemcpyKind kind)` -
-* Copies data between host and device.
-*/
+ * @addtogroup hipMemcpy hipMemcpy
+ * @{
+ * @ingroup perfMemoryTest
+ * `hipMemcpy(void* dst, const void* src, size_t count, hipMemcpyKind kind)` -
+ * Copies data between host and device.
+ */
 
 #include <hip_test_common.hh>
-
+// #define ENABLE_DEBUG 1
 #define NUM_SIZE 14
 #define NUM_ITER 1000
 // max BW number for DevicetoDeviceNoCU
@@ -35,7 +35,8 @@
 class hipPerfMemcpy {
  private:
   size_t totalSizes_[NUM_SIZE];
-  void setHostBuffer(int *A, int val, size_t size);
+  void setHostBuffer(int* A, int val, size_t size);
+
  public:
   hipPerfMemcpy();
   ~hipPerfMemcpy() {}
@@ -53,7 +54,7 @@ hipPerfMemcpy::hipPerfMemcpy() {
   }
 }
 
-void hipPerfMemcpy::setHostBuffer(int *A, int val, size_t size) {
+void hipPerfMemcpy::setHostBuffer(int* A, int val, size_t size) {
   size_t len = size / sizeof(int);
   for (int i = 0; i < len; i++) {
     A[i] = val;
@@ -61,36 +62,31 @@ void hipPerfMemcpy::setHostBuffer(int *A, int val, size_t size) {
 }
 
 void hipPerfMemcpy::TestResult(unsigned int numTests,
-                              std::chrono::duration<double, std::micro> diff, hipMemcpyKind type)
-{
+                               std::chrono::duration<double, std::micro> diff, hipMemcpyKind type) {
   // BW in GB/s
-  double perf = (static_cast<double>(totalSizes_[numTests] * NUM_ITER) *
-                   static_cast<double>(1e-03)) / diff.count();
-
-  const char *typestr = NULL;
-
-  if(type == hipMemcpyHostToDevice){
-      typestr = "Host to Device";
-  }
-  else if(type == hipMemcpyDeviceToHost){
-      typestr = "Device to Host";
+  double perf =
+      (static_cast<double>(totalSizes_[numTests] * NUM_ITER) * static_cast<double>(1e-03)) /
+      diff.count();
+
+  const char* typestr = "Unknown";
+
+  if (type == hipMemcpyHostToDevice) {
+    typestr = "Host to Device";
+  } else if (type == hipMemcpyDeviceToHost) {
+    typestr = "Device to Host";
+  } else if (type == hipMemcpyDeviceToDevice) {
+    typestr = "Device to Device";
+    perf *= 2.0;
+  } else if (type == hipMemcpyDeviceToDeviceNoCU) {
+    typestr = "Device to Device No CU";
+    perf *= 2.0;
   }
-  else if(type == hipMemcpyDeviceToDevice){
-      typestr = "Device to Device";
-      perf *= 2.0;
-  }
-  else if(type == hipMemcpyDeviceToDeviceNoCU){
-      typestr = "Device to Device No CU";
-      perf *= 2.0;
-  }
-
-  UNSCOPED_INFO("hipPerfMemcpy[" << numTests << "] " << typestr << " copy BW "
-       << perf << "  GB/s for memory size of " <<
-       totalSizes_[numTests] << " Bytes.");
 
-  if(totalSizes_[numTests] == 4194304 && type == hipMemcpyDeviceToDeviceNoCU)
-          REQUIRE(perf < NOCU_MAX_BW);
+  CONSOLE_PRINT("hipPerfMemcpy[%u] %s copy BW %.2f GB/s for memory size of %zu Bytes.\n", numTests,
+                typestr, perf, totalSizes_[numTests]);
 
+  if (totalSizes_[numTests] == 4194304 && type == hipMemcpyDeviceToDeviceNoCU)
+    REQUIRE(perf < NOCU_MAX_BW);
 }
 
 bool hipPerfMemcpy::run_h2d(unsigned int numTests) {
@@ -115,7 +111,7 @@ bool hipPerfMemcpy::run_h2d(unsigned int numTests) {
   TestResult(numTests, diff, hipMemcpyHostToDevice);
 
   HIP_CHECK(hipHostUnregister(A));
-  delete [] A;
+  delete[] A;
   HIP_CHECK(hipFree(Ad));
 
   return true;
@@ -143,7 +139,7 @@ bool hipPerfMemcpy::run_d2h(unsigned int numTests) {
   TestResult(numTests, diff, hipMemcpyDeviceToHost);
 
   HIP_CHECK(hipHostUnregister(A));
-  delete [] A;
+  delete[] A;
   HIP_CHECK(hipFree(Ad));
 
   return true;
@@ -186,8 +182,8 @@ bool hipPerfMemcpy::run_d2d_nocu(unsigned int numTests) {
   auto all_start = std::chrono::steady_clock::now();
 
   for (int j = 0; j < NUM_ITER; j++) {
-    HIP_CHECK(hipMemcpyAsync(Ad1, Ad2, totalSizes_[numTests], hipMemcpyDeviceToDeviceNoCU,
-                                                                                      nullptr));
+    HIP_CHECK(
+        hipMemcpyAsync(Ad1, Ad2, totalSizes_[numTests], hipMemcpyDeviceToDeviceNoCU, nullptr));
   }
 
   HIP_CHECK(hipDeviceSynchronize());
@@ -204,16 +200,16 @@ bool hipPerfMemcpy::run_d2d_nocu(unsigned int numTests) {
 }
 
 /**
-* Test Description
-* ------------------------
-*  - Verify hipPerfMemcpy status.
-* Test source
-* ------------------------
-*  - perftests/memory/hipPerfMemcpy.cc
-* Test requirements
-* ------------------------
-*  - HIP_VERSION >= 5.6
-*/
+ * Test Description
+ * ------------------------
+ *  - Verify hipPerfMemcpy status.
+ * Test source
+ * ------------------------
+ *  - perftests/memory/hipPerfMemcpy.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.6
+ */
 
 TEST_CASE("Perf_hipPerfMemcpy_test") {
   int numDevices = 0;
@@ -227,35 +223,34 @@ TEST_CASE("Perf_hipPerfMemcpy_test") {
     hipDeviceProp_t props;
     HIP_CHECK(hipGetDeviceProperties(&props, deviceId));
 
-    UNSCOPED_INFO("info: running on bus " << "0x" << props.pciBusID << " " <<
-         props.name << " with " << props.multiProcessorCount << " CUs "
-         << " and device id: " << deviceId);
+    CONSOLE_PRINT("info: running on bus 0x%x %s with %d CUs and device id: %d\n", props.pciBusID,
+                  props.name, props.multiProcessorCount, deviceId);
 
     hipPerfMemcpy hipPerfMemcpy;
-    SECTION("Perf test Host Memory to Device Memory"){
+    SECTION("Perf test Host Memory to Device Memory") {
       for (auto testCase = 0; testCase < NUM_SIZE; testCase++) {
-         REQUIRE(true == hipPerfMemcpy.run_h2d(testCase));
+        REQUIRE(true == hipPerfMemcpy.run_h2d(testCase));
       }
     }
-    SECTION("Perf test Device Memory to Host Memory"){
+    SECTION("Perf test Device Memory to Host Memory") {
       for (auto testCase = 0; testCase < NUM_SIZE; testCase++) {
-         REQUIRE(true == hipPerfMemcpy.run_d2h(testCase));
+        REQUIRE(true == hipPerfMemcpy.run_d2h(testCase));
       }
     }
-    SECTION("Perf test Device Memory to Device Memory"){
+    SECTION("Perf test Device Memory to Device Memory") {
       for (auto testCase = 0; testCase < NUM_SIZE; testCase++) {
-         REQUIRE(true == hipPerfMemcpy.run_d2d(testCase));
+        REQUIRE(true == hipPerfMemcpy.run_d2d(testCase));
       }
     }
-    SECTION("Perf test Device Memory to Device Memory No CU"){
+    SECTION("Perf test Device Memory to Device Memory No CU") {
       for (auto testCase = 0; testCase < NUM_SIZE; testCase++) {
-         REQUIRE(true == hipPerfMemcpy.run_d2d_nocu(testCase));
+        REQUIRE(true == hipPerfMemcpy.run_d2d_nocu(testCase));
       }
     }
   }
 }
 
 /**
-* End doxygen group perfMemoryTest.
-* @}
-*/
+ * End doxygen group perfMemoryTest.
+ * @}
+ */
diff --git a/catch/perftests/memory/hipPerfMemcpyAsyncSpeed.cc b/catch/perftests/memory/hipPerfMemcpyAsyncSpeed.cc
index 94b5e6e79..c9e963de8 100644
--- a/catch/perftests/memory/hipPerfMemcpyAsyncSpeed.cc
+++ b/catch/perftests/memory/hipPerfMemcpyAsyncSpeed.cc
@@ -17,12 +17,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
 #include <hip_test_common.hh>
-// Quiet pesky warnings
-#ifdef WIN_OS
-#define SNPRINTF sprintf_s
-#else
-#define SNPRINTF snprintf
-#endif
 
 #define NUM_SIZES 6
 // 256 Bytes, 512 Bytes, 1024 Bytes, 2048 Bytes, 3072 Bytes, 4096 Bytes
@@ -38,9 +32,9 @@ void checkData(void* ptr, unsigned int size, char value) {
   char* ptr2 = (char*)ptr;
   for (unsigned int i = 0; i < size; i++) {
     if (ptr2[i] != value) {
-      printf("Data validation failed at %d!  Got 0x%08x\n", i, ptr2[i]);
-      printf("Expected 0x%08x\n", value);
-      printf("Data validation failed!");
+      CONSOLE_PRINT("Data validation failed at %u!  Got 0x%08x", i, (unsigned char)ptr2[i]);
+      CONSOLE_PRINT("Expected 0x%08x", (unsigned char)value);
+      CONSOLE_PRINT("Data validation failed!");
       break;
     }
   }
@@ -50,7 +44,7 @@ bool extraWarmup = true;
 TEST_CASE("Perf_hipPerfMemcpyAsyncSpeed_test") {
   hipDeviceProp_t props;
   HIP_CHECK(hipGetDeviceProperties(&props, 0));
-  printf("Set device to %d : %s\n", 0, props.name);
+  CONSOLE_PRINT("Set device to %d : %s", 0, props.name);
   HIP_CHECK(hipSetDevice(0));
 
   unsigned int bufSize_;
@@ -66,9 +60,9 @@ TEST_CASE("Perf_hipPerfMemcpyAsyncSpeed_test") {
     int test = 0;
     uint32_t kMaxSize = (t == 0) ? 128 * 1024 * 1024 : 1024 * 1024 * 1024;
     if (t < 2) {
-      printf("----- Global buffer (MiB): %d\n", kMaxSize / (1024 * 1024));
+      CONSOLE_PRINT("\n----- Global buffer (MiB): %d", kMaxSize / (1024 * 1024));
     } else {
-      printf("----- Same buffer copy repeat\n");
+      CONSOLE_PRINT("\n----- Same buffer copy repeat");
     }
     for (; test <= numTests; test++) {
       bufSize_ = Sizes[test % NUM_SIZES];
@@ -131,12 +125,11 @@ TEST_CASE("Perf_hipPerfMemcpyAsyncSpeed_test") {
       // Double results when src and dst are both on device
       perf *= 2.0;
       char buf[256];
-      SNPRINTF(buf, sizeof(buf),
-               "hipMemcpyAsync[%d]\t(%8d bytes)\ts:%s d:%s\ti:%4d\t(GB/s) "
-               "perf\t%.2f, time per iter(us):\t%.1f, time per iter CPU (us):\t%.1f",
-               test, bufSize_, strSrc, strDst, numIter, (float)perf,
-               sec.count() / numIter * 1000 * 1000, sec_cpu.count() / numIter * 1000 * 1000);
-      printf("%s\n", buf);
+      CONSOLE_PRINT(
+          "hipMemcpyAsync[%d]\t(%8d bytes)\ts:%s d:%s\ti:%4d\t(GB/s) "
+          "perf\t%.2f, time per iter(us):\t%.1f, time per iter CPU (us):\t%.1f",
+          test, bufSize_, strSrc, strDst, numIter, (float)perf, sec.count() / numIter * 1000 * 1000,
+          sec_cpu.count() / numIter * 1000 * 1000);
 
       // Verification
       void* temp = malloc(bufSize_ + 4096);
diff --git a/catch/perftests/memory/hipPerfMemset.cc b/catch/perftests/memory/hipPerfMemset.cc
index 5b84e9194..aaf434245 100644
--- a/catch/perftests/memory/hipPerfMemset.cc
+++ b/catch/perftests/memory/hipPerfMemset.cc
@@ -18,30 +18,29 @@
  */
 
 /**
-* @addtogroup hipMemsetKernel hipMemsetKernel
-* @{
-* @ingroup perfMemoryTest
-* `hipMemset(void* devPtr, int  value, size_t count)` -
-* Initializes or sets device memory to a value.
-*/
-
+ * @addtogroup hipMemsetKernel hipMemsetKernel
+ * @{
+ * @ingroup perfMemoryTest
+ * `hipMemset(void* devPtr, int  value, size_t count)` -
+ * Initializes or sets device memory to a value.
+ */
+// #define ENABLE_DEBUG 1
 #include <hip_test_common.hh>
 
 static unsigned int sizeList[] = {
-  256, 512, 1024, 2048, 4096, 8192,
+    256, 512, 1024, 2048, 4096, 8192,
 };
 
-static unsigned int eleNumList[] = {
-    0x100, 0x400, 0x1000, 0x4000, 0x10000, 0x20000, 0x40000, 0x80000, 0x100000,
-    0x200000, 0x400000, 0x800000, 0x1000000
-};
+static unsigned int eleNumList[] = {0x100,    0x400,    0x1000,   0x4000,   0x10000,
+                                    0x20000,  0x40000,  0x80000,  0x100000, 0x200000,
+                                    0x400000, 0x800000, 0x1000000};
 
 typedef struct _dataType {
   char memsetval = 0x42;
   char memsetD8val = 0xDE;
   int16_t memsetD16val = 0xDEAD;
   int memsetD32val = 0xDEADBEEF;
-}dataType;
+} dataType;
 
 #define NUM_ITER 1000
 
@@ -56,7 +55,7 @@ enum MemsetType {
 
 class hipPerfMemset {
  private:
-  uint64_t     bufSize_;
+  uint64_t bufSize_;
   unsigned int num_elements_;
   unsigned int testNumEle_;
   unsigned int _numSubTests = 0;
@@ -78,25 +77,19 @@ class hipPerfMemset {
 
   bool open(int deviceID);
 
-  template<typename T>
+  template <typename T>
   void run1D(unsigned int test, T memsetval, enum MemsetType type, bool async);
 
-  template<typename T>
+  template <typename T>
   void run2D(unsigned int test, T memsetval, enum MemsetType type, bool async);
 
-  template<typename T>
+  template <typename T>
   void run3D(unsigned int test, T memsetval, enum MemsetType type, bool async);
 
-  uint getNumTests() {
-    return _numSubTests;
-  }
+  uint getNumTests() { return _numSubTests; }
 
-  uint getNumTests2D() {
-    return _numSubTests2D;
-  }
-  uint getNumTests3D() {
-    return _numSubTests3D;
-  }
+  uint getNumTests2D() { return _numSubTests2D; }
+  uint getNumTests3D() { return _numSubTests3D; }
 };
 
 bool hipPerfMemset::open(int deviceId) {
@@ -109,15 +102,13 @@ bool hipPerfMemset::open(int deviceId) {
   HIP_CHECK(hipSetDevice(deviceId));
   hipDeviceProp_t props;
   HIP_CHECK(hipGetDeviceProperties(&props, deviceId));
-  INFO("info: running on bus " << "0x" << props.pciBusID << " " << props.name
-        << " with " << props.multiProcessorCount << " CUs and device id: "
-        << deviceId << "\n");
+  CONSOLE_PRINT("info: running on bus 0x%x %s with %d CUs and device id: %d\n", props.pciBusID,
+                props.name, props.multiProcessorCount, deviceId);
   return true;
 }
 
-template<typename T>
-void hipPerfMemset::run1D(unsigned int test, T memsetval,
-                          enum MemsetType type, bool async) {
+template <typename T>
+void hipPerfMemset::run1D(unsigned int test, T memsetval, enum MemsetType type, bool async) {
   T *A_h, *A_d;
 
   testNumEle_ = eleNumList[test % num_elements_];
@@ -126,17 +117,17 @@ void hipPerfMemset::run1D(unsigned int test, T memsetval,
 
   HIP_CHECK(hipMalloc(&A_d, bufSize_));
 
-  A_h = reinterpret_cast<T*> (malloc(bufSize_));
+  A_h = reinterpret_cast<T*>(malloc(bufSize_));
 
   hipStream_t stream;
   HIP_CHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
 
   // Warm-up
   if (async) {
-    HIP_CHECK(hipMemsetAsync((void *)A_d, memsetval, bufSize_, stream));
+    HIP_CHECK(hipMemsetAsync((void*)A_d, memsetval, bufSize_, stream));
     HIP_CHECK(hipStreamSynchronize(stream));
   } else {
-    HIP_CHECK(hipMemset((void *)A_d, memsetval, bufSize_));
+    HIP_CHECK(hipMemset((void*)A_d, memsetval, bufSize_));
     HIP_CHECK(hipDeviceSynchronize());
   }
 
@@ -144,7 +135,7 @@ void hipPerfMemset::run1D(unsigned int test, T memsetval,
 
   for (uint i = 0; i < NUM_ITER; i++) {
     if (type == hipMemsetTypeDefault && !async) {
-      HIP_CHECK(hipMemset(reinterpret_cast<void *>(A_d), memsetval, bufSize_));
+      HIP_CHECK(hipMemset(reinterpret_cast<void*>(A_d), memsetval, bufSize_));
     } else if (type == hipMemsetTypeDefault && async) {
       HIP_CHECK(hipMemsetAsync(A_d, memsetval, bufSize_, stream));
     } else if (type == hipMemsetTypeD8 && !async) {
@@ -152,13 +143,13 @@ void hipPerfMemset::run1D(unsigned int test, T memsetval,
     } else if (type == hipMemsetTypeD8 && async) {
       HIP_CHECK(hipMemsetD8Async((hipDeviceptr_t)A_d, memsetval, bufSize_, stream));
     } else if (type == hipMemsetTypeD16 && !async) {
-      HIP_CHECK(hipMemsetD16((hipDeviceptr_t)A_d, memsetval, bufSize_/sizeof(T)));
+      HIP_CHECK(hipMemsetD16((hipDeviceptr_t)A_d, memsetval, bufSize_ / sizeof(T)));
     } else if (type == hipMemsetTypeD16 && async) {
-      HIP_CHECK(hipMemsetD16Async((hipDeviceptr_t)A_d, memsetval, bufSize_/sizeof(T), stream));
+      HIP_CHECK(hipMemsetD16Async((hipDeviceptr_t)A_d, memsetval, bufSize_ / sizeof(T), stream));
     } else if (type == hipMemsetTypeD32 && !async) {
-      HIP_CHECK(hipMemsetD32((hipDeviceptr_t)A_d, memsetval, bufSize_/sizeof(T)));
+      HIP_CHECK(hipMemsetD32((hipDeviceptr_t)A_d, memsetval, bufSize_ / sizeof(T)));
     } else if (type == hipMemsetTypeD32 && async) {
-      HIP_CHECK(hipMemsetD32Async((hipDeviceptr_t)A_d, memsetval, bufSize_/sizeof(T), stream));
+      HIP_CHECK(hipMemsetD32Async((hipDeviceptr_t)A_d, memsetval, bufSize_ / sizeof(T), stream));
     }
   }
   if (async) {
@@ -169,13 +160,12 @@ void hipPerfMemset::run1D(unsigned int test, T memsetval,
 
   auto end = std::chrono::steady_clock::now();
 
-  HIP_CHECK(hipMemcpy(A_h, A_d, bufSize_, hipMemcpyDeviceToHost) );
+  HIP_CHECK(hipMemcpy(A_h, A_d, bufSize_, hipMemcpyDeviceToHost));
 
   for (int i = 0; i < bufSize_ / sizeof(T); i++) {
     if (A_h[i] != memsetval) {
-      INFO("mismatch at index " << i << " computed: " <<
-            static_cast<int> (A_h[i]) << ", memsetval: " <<
-            static_cast<int> (memsetval) << "\n");
+      DEBUG_PRINT("mismatch at index %d computed: %d, memsetval: %d\n", i, static_cast<int>(A_h[i]),
+                  static_cast<int>(memsetval));
       REQUIRE(false);
     }
   }
@@ -188,30 +178,27 @@ void hipPerfMemset::run1D(unsigned int test, T memsetval,
   auto sec = diff.count();
   auto perf = static_cast<double>((bufSize_ * NUM_ITER * (1e-09)) / sec);
 
-  std::cout << "[" << std::setw(2)
-        << test << "] " << std::setw(5) <<  bufSize_/1024
-        << " Kb " << std::setw(4) << " typeSize " << sizeof(T) << " : "
-        << std::setw(7) << perf <<  " GB/s \n";
+  std::cout << "[" << std::setw(2) << test << "] " << std::setw(5) << bufSize_ / 1024 << " Kb "
+            << std::setw(4) << " typeSize " << sizeof(T) << " : " << std::setw(7) << perf
+            << " GB/s \n";
 }
 
-template<typename T>
-void hipPerfMemset::run2D(unsigned int test, T memsetval,
-                          enum MemsetType type, bool async) {
+template <typename T>
+void hipPerfMemset::run2D(unsigned int test, T memsetval, enum MemsetType type, bool async) {
   bufSize_ = sizeList[test % num_sizes_];
   size_t numH = bufSize_;
   size_t numW = bufSize_;
   size_t pitch_A;
   size_t width = numW * sizeof(char);
   size_t sizeElements = width * numH;
-  size_t elements = numW* numH;
+  size_t elements = numW * numH;
 
-  T * A_h, * A_d;
+  T *A_h, *A_d;
 
-  HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d),
-                           &pitch_A, width, numH));
+  HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d), &pitch_A, width, numH));
   A_h = reinterpret_cast<char*>(malloc(sizeElements));
 
-  for (size_t i=0; i < elements; i++) {
+  for (size_t i = 0; i < elements; i++) {
     A_h[i] = 1;
   }
 
@@ -244,14 +231,12 @@ void hipPerfMemset::run2D(unsigned int test, T memsetval,
 
   auto end = std::chrono::steady_clock::now();
 
-  HIP_CHECK(hipMemcpy2D(A_h, width, A_d, pitch_A, numW, numH,
-                       hipMemcpyDeviceToHost));
+  HIP_CHECK(hipMemcpy2D(A_h, width, A_d, pitch_A, numW, numH, hipMemcpyDeviceToHost));
 
-  for (int i=0; i < elements; i++) {
+  for (int i = 0; i < elements; i++) {
     if (A_h[i] != memsetval) {
-      INFO("mismatch at index " << i << " computed: " <<
-            static_cast<int> (A_h[i]) << ", memsetval: " <<
-            static_cast<int> (memsetval) << "\n");
+      DEBUG_PRINT("mismatch at index %d computed: %d, memsetval: %d\n", i, static_cast<int>(A_h[i]),
+                  static_cast<int>(memsetval));
       REQUIRE(false);
     }
   }
@@ -259,20 +244,19 @@ void hipPerfMemset::run2D(unsigned int test, T memsetval,
   std::chrono::duration<double> diff = end - start;
 
   auto sec = diff.count();
-  auto perf = static_cast<double>((sizeElements* NUM_ITER * (1e-09)) / sec);
+  auto perf = static_cast<double>((sizeElements * NUM_ITER * (1e-09)) / sec);
 
-  std::cout << "hipPerf2DMemset" << (async ? "Async" : "     ") << "[" << test << "] "
-       << "  " << "(GB/s) for " << std::setw(5) << bufSize_
-       << " x " << std::setw(5) << bufSize_ << " bytes : " << std::setw(7) << perf << "\n";
+  std::cout << "hipPerf2DMemset" << (async ? "Async" : "     ") << "[" << test << "] " << "  "
+            << "(GB/s) for " << std::setw(5) << bufSize_ << " x " << std::setw(5) << bufSize_
+            << " bytes : " << std::setw(7) << perf << "\n";
 
   HIP_CHECK(hipStreamDestroy(stream));
   HIP_CHECK(hipFree(A_d));
   free(A_h);
 }
 
-template<typename T>
-void hipPerfMemset::run3D(unsigned int test, T memsetval,
-                          enum MemsetType type, bool async) {
+template <typename T>
+void hipPerfMemset::run3D(unsigned int test, T memsetval, enum MemsetType type, bool async) {
   bufSize_ = sizeList[test % num_sizes_];
 
   size_t numH = bufSize_;
@@ -280,12 +264,12 @@ void hipPerfMemset::run3D(unsigned int test, T memsetval,
   size_t depth = 10;
   size_t width = numW * sizeof(char);
   size_t sizeElements = width * numH * depth;
-  size_t elements = numW* numH* depth;
+  size_t elements = numW * numH * depth;
 
   hipStream_t stream;
   HIP_CHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
 
-  T *A_h;
+  T* A_h;
 
   hipExtent extent = make_hipExtent(width, numH, depth);
   hipPitchedPtr devPitchedPtr;
@@ -325,12 +309,12 @@ void hipPerfMemset::run3D(unsigned int test, T memsetval,
 
   auto end = std::chrono::steady_clock::now();
 
-  hipMemcpy3DParms myparms ;
+  hipMemcpy3DParms myparms;
   myparms.srcArray = nullptr;
   myparms.dstArray = nullptr;
   myparms.srcPos = make_hipPos(0, 0, 0);
   myparms.dstPos = make_hipPos(0, 0, 0);
-  myparms.dstPtr = make_hipPitchedPtr(A_h, width , numW, numH);
+  myparms.dstPtr = make_hipPitchedPtr(A_h, width, numW, numH);
   myparms.srcPtr = devPitchedPtr;
   myparms.extent = extent;
 
@@ -338,11 +322,10 @@ void hipPerfMemset::run3D(unsigned int test, T memsetval,
 
   HIP_CHECK(hipMemcpy3D(&myparms));
 
-  for (int i=0; i < elements; i++) {
+  for (int i = 0; i < elements; i++) {
     if (A_h[i] != memsetval) {
-      INFO("mismatch at index " << i << " computed: " <<
-           static_cast<int> (A_h[i]) << ", memsetval: " <<
-           static_cast<int> (memsetval) << "\n");
+      DEBUG_PRINT("mismatch at index %d computed: %d, memsetval: %d\n", i, static_cast<int>(A_h[i]),
+                  static_cast<int>(memsetval));
       REQUIRE(false);
     }
   }
@@ -352,24 +335,23 @@ void hipPerfMemset::run3D(unsigned int test, T memsetval,
   auto sec = diff.count();
   auto perf = static_cast<double>((sizeElements * NUM_ITER * (1e-09)) / sec);
 
-  std::cout << "hipPerf3DMemset" << (async ? "Async" : "     ") << "[" << test << "] " << "  "
-      << "(GB/s) for " << std::setw(5) << bufSize_ << " x " << std::setw(5)
-      << bufSize_  << " x " << depth << " bytes : " << std::setw(7) << perf << "\n";
+  CONSOLE_PRINT("hipPerf3DMemset%s[%u] (GB/s) for %5zu x %5zu x %zu bytes : %7.2f\n",
+                (async ? "Async" : "     "), test, numW, numH, depth, perf);
   HIP_CHECK(hipFree(devPitchedPtr.ptr));
   free(A_h);
 }
 
 /**
-* Test Description
-* ------------------------
-*  - Verify hipPerfMemset status.
-* Test source
-* ------------------------
-*  - perftests/memory/hipPerfMemset.cc
-* Test requirements
-* ------------------------
-*  - HIP_VERSION >= 5.6
-*/
+ * Test Description
+ * ------------------------
+ *  - Verify hipPerfMemset status.
+ * Test source
+ * ------------------------
+ *  - perftests/memory/hipPerfMemset.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.6
+ */
 
 TEST_CASE("Perf_hipPerfMemset_test") {
   hipPerfMemset hipPerfMemset;
@@ -385,44 +367,44 @@ TEST_CASE("Perf_hipPerfMemset_test") {
 
   bool async = false;
 
-  for (uint i = 0; i < 2 ; i++) {
-    std::cout << "--------------------- 1D buffer -------------------\n";
+  for (uint i = 0; i < 2; i++) {
+    CONSOLE_PRINT("--------------------- 1D buffer -------------------\n");
     for (auto testCase = 0; testCase < numTests; testCase++) {
       if (testCase < sizeof(eleNumList) / sizeof(uint32_t)) {
-        std::cout << "hipMemsetD8" << (async ? "Async " : "      ");
+        std::printf("hipMemsetD8%s", async ? "Async " : "      ");  // run1D() finishes this row
         hipPerfMemset.run1D(testCase, pattern.memsetval, hipMemsetTypeD8, async);
       } else if (testCase < 2 * sizeof(eleNumList) / sizeof(uint32_t)) {
-        std::cout << "hipMemsetD16" << (async ? "Async" : "     ");
+        std::printf("hipMemsetD16%s", async ? "Async" : "     ");
         hipPerfMemset.run1D(testCase, pattern.memsetD16val, hipMemsetTypeD16, async);
       } else if (testCase < 3 * sizeof(eleNumList) / sizeof(uint32_t)) {
-        std::cout << "hipMemsetD32" << (async ? "Async" : "     ");
+        std::printf("hipMemsetD32%s", async ? "Async" : "     ");
         hipPerfMemset.run1D(testCase, pattern.memsetD32val, hipMemsetTypeD32, async);
       } else {
-        std::cout << "hipMemset" << (async ? "Async   " : "        ");
+        std::printf("hipMemset%s", async ? "Async   " : "        ");
         hipPerfMemset.run1D(testCase, pattern.memsetval, hipMemsetTypeDefault, async);
       }
     }
     async = true;
   }
 
-  INFO("\n");
-  std::cout << "------------------ 2D buffer arrays ---------------\n";
+  CONSOLE_PRINT("\n");
+  CONSOLE_PRINT("\n------------------ 2D buffer arrays ---------------\n");
 
   async = false;
   for (uint i = 0; i < 2; i++) {
-    INFO("\n");
+    CONSOLE_PRINT("\n");
     for (uint test = 0; test < numTests2D; test++) {
       hipPerfMemset.run2D(test, pattern.memsetval, hipMemsetTypeDefault, async);
     }
     async = true;
   }
 
-  INFO("\n");
-  std::cout << "------------------ 3D buffer arrays ---------------\n";
+  CONSOLE_PRINT("\n");
+  CONSOLE_PRINT("\n------------------ 3D buffer arrays ---------------\n");
 
   async = false;
   for (uint i = 0; i < 2; i++) {
-    INFO("\n");
+    CONSOLE_PRINT("\n");
     for (uint test = 0; test < numTests3D; test++) {
       hipPerfMemset.run3D(test, pattern.memsetval, hipMemsetTypeDefault, async);
     }
@@ -431,6 +413,6 @@ TEST_CASE("Perf_hipPerfMemset_test") {
 }
 
 /**
-* End doxygen group perfMemoryTest.
-* @}
-*/
+ * End doxygen group perfMemoryTest.
+ * @}
+ */
diff --git a/catch/perftests/memory/hipPerfMemsetAsyncSpeed.cc b/catch/perftests/memory/hipPerfMemsetAsyncSpeed.cc
index 7e138be4d..fe2fcdb33 100644
--- a/catch/perftests/memory/hipPerfMemsetAsyncSpeed.cc
+++ b/catch/perftests/memory/hipPerfMemsetAsyncSpeed.cc
@@ -18,13 +18,6 @@ THE SOFTWARE.
 */
 #include <hip_test_common.hh>
 
-// Quiet pesky warnings
-#ifdef WIN_OS
-#define SNPRINTF sprintf_s
-#else
-#define SNPRINTF snprintf
-#endif
-
 #define NUM_SIZES 6
 // 256 Bytes, 512 Bytes, 1024 Bytes, 2048 Bytes, 3072 Bytes, 4096 Bytes
 constexpr uint32_t Mi = 1024 * 1024;
@@ -39,9 +32,9 @@ void checkData_(void* ptr, unsigned int size, char value) {
   char* ptr2 = (char*)ptr;
   for (unsigned int i = 0; i < size; i++) {
     if (ptr2[i] != value) {
-      printf("Data validation failed at %d!  Got 0x%08x\n", i, ptr2[i]);
-      printf("Expected 0x%08x\n", value);
-      printf("Data validation failed!");
+      CONSOLE_PRINT("Data validation failed at %u!  Got 0x%08x", i, (unsigned char)ptr2[i]);
+      CONSOLE_PRINT("Expected 0x%08x", (unsigned char)value);
+      CONSOLE_PRINT("Data validation failed!");
       break;
     }
   }
@@ -51,7 +44,7 @@ bool extraWarmup_ = true;
 TEST_CASE("Perf_hipPerfMemsetAsyncSpeed_test") {
   hipDeviceProp_t props;
   HIP_CHECK(hipGetDeviceProperties(&props, 0));
-  printf("Set device to %d : %s\n", 0, props.name);
+  CONSOLE_PRINT("Set device to %d : %s", 0, props.name);
   HIP_CHECK(hipSetDevice(0));
 
   unsigned int bufSize_;
@@ -66,7 +59,7 @@ TEST_CASE("Perf_hipPerfMemsetAsyncSpeed_test") {
     int numTests = (NUM_SIZES * NUM_SUBTESTS - 1);
     int test = 0;
     uint32_t kMaxSize = (t == 0) ? 128 * 1024 * 1024 : 1024 * 1024 * 1024;
-    printf("----- Global buffer (MiB): %d\n", kMaxSize / (1024 * 1024));
+    CONSOLE_PRINT("\n----- Global buffer (MiB): %d", kMaxSize / (1024 * 1024));
     for (; test <= numTests; test++) {
       bufSize_ = Sizes[test % NUM_SIZES];
       hostMalloc[0] = hostMalloc[1] = false;
@@ -123,13 +116,11 @@ TEST_CASE("Perf_hipPerfMemsetAsyncSpeed_test") {
       const char* strSrc = "dM";
       const char* strDst = "dM";
 
-      char buf[256];
-      SNPRINTF(buf, sizeof(buf),
-               "hipMemsetAsync[%d]\t(%8d bytes)\ts:%s d:%s\ti:%4d\t(GB/s) "
-               "perf\t%.2f, time per iter(us):\t%.1f, time per iter CPU (us):\t%.1f",
-               test, bufSize_, strSrc, strDst, numIter, (float)perf,
-               sec.count() / numIter * 1000 * 1000, sec_cpu.count() / numIter * 1000 * 1000);
-      printf("%s\n", buf);
+      CONSOLE_PRINT(
+          "hipMemsetAsync[%d]\t(%8d bytes)\ts:%s d:%s\ti:%4d\t(GB/s) "
+          "perf\t%.2f, time per iter(us):\t%.1f, time per iter CPU (us):\t%.1f",
+          test, bufSize_, strSrc, strDst, numIter, (float)perf, sec.count() / numIter * 1000 * 1000,
+          sec_cpu.count() / numIter * 1000 * 1000);
 
       // Verification
       void* temp = malloc(bufSize_ + 4096);
diff --git a/catch/perftests/memory/hipPerfSampleRate.cc b/catch/perftests/memory/hipPerfSampleRate.cc
index 9a083fa3e..a591f5a09 100644
--- a/catch/perftests/memory/hipPerfSampleRate.cc
+++ b/catch/perftests/memory/hipPerfSampleRate.cc
@@ -19,66 +19,69 @@
 
 
 /**
-* @addtogroup hipMemcpyKernel hipMemcpyKernel
-* @{
-* @ingroup perfMemoryTest
-* `hipMemcpy(void* dst, const void* src, size_t count, hipMemcpyKind kind)` -
-* Copies data between host and device.
-*/
+ * @addtogroup hipMemcpyKernel hipMemcpyKernel
+ * @{
+ * @ingroup perfMemoryTest
+ * `hipMemcpy(void* dst, const void* src, size_t count, hipMemcpyKind kind)` -
+ * Copies data between host and device.
+ */
 
+// #define ENABLE_DEBUG 1  // keep above the include: DEBUG_PRINT is selected in that header
 #include <hip_test_common.hh>
-
 #define NUM_TYPES 3
 std::vector<std::string> types = {"float", "float2", "float4"};
 std::vector<unsigned int> typeSizes = {4, 8, 16};
 
 #define NUM_SIZES 12
-std::vector<unsigned int> sizes = {1,  2,   4,   8,   16,   32,
-                                   64, 128, 256, 512, 1024, 2048};
+std::vector<unsigned int> sizes = {1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048};
 
 #define NUM_BUFS 6
 #define MAX_BUFS (1 << (NUM_BUFS - 1))
 
 #ifdef __HIP_PLATFORM_NVIDIA__
-__host__ __device__ void operator+=(float2 &a, float2 b) {  //NOLINT
-  a.x += b.x; a.y += b.y;
+__host__ __device__ void operator+=(float2& a, float2 b) {  // NOLINT
+  a.x += b.x;
+  a.y += b.y;
 }
 
-__host__ __device__ void operator+=(float4 &a, float4 b) {  //NOLINT
-  a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
+__host__ __device__ void operator+=(float4& a, float4 b) {  // NOLINT
+  a.x += b.x;
+  a.y += b.y;
+  a.z += b.z;
+  a.w += b.w;
 }
 #endif
 
 template <typename T>
-__global__ void sampleRate(T * outBuffer, unsigned int inBufSize,
-                           unsigned int writeIt, T **inBuffer, int numBufs) {
+__global__ void sampleRate(T* outBuffer, unsigned int inBufSize, unsigned int writeIt, T** inBuffer,
+                           int numBufs) {
   uint gid = (blockIdx.x * blockDim.x + threadIdx.x);
   uint inputIdx = gid % inBufSize;
 
   T tmp;
   memset(&tmp, 0, sizeof(T));
   for (int i = 0; i < numBufs; i++) {
-    tmp += *(*(inBuffer+i)+inputIdx);
+    tmp += *(*(inBuffer + i) + inputIdx);
   }
 
-  if (writeIt*(unsigned int)tmp.x) {
+  if (writeIt * (unsigned int)tmp.x) {
     outBuffer[gid] = tmp;
   }
 }
 
 template <typename T>
-__global__ void sampleRateFloat(T * outBuffer, unsigned int inBufSize,
-                          unsigned int writeIt, T ** inBuffer, int numBufs) {
+__global__ void sampleRateFloat(T* outBuffer, unsigned int inBufSize, unsigned int writeIt,
+                                T** inBuffer, int numBufs) {
   uint gid = (blockIdx.x * blockDim.x + threadIdx.x);
   uint inputIdx = gid % inBufSize;
 
   T tmp = (T)0.0f;
 
   for (int i = 0; i < numBufs; i++) {
-    tmp += *((*inBuffer+i)+inputIdx);
+    tmp += *(*(inBuffer + i) + inputIdx);
   }
 
-  if (writeIt*(unsigned int)tmp) {
+  if (writeIt * (unsigned int)tmp) {
     outBuffer[gid] = tmp;
   }
 }
@@ -93,26 +96,23 @@ class hipPerfSampleRate {
   void close(void);
 
   // array of funtion pointers
-  typedef void (hipPerfSampleRate::*funPtr)(void * outBuffer, unsigned int
-                inBufSize, unsigned int writeIt, void **inBuffer, int numBufs,
-                int grids, int blocks);
+  typedef void (hipPerfSampleRate::*funPtr)(void* outBuffer, unsigned int inBufSize,
+                                            unsigned int writeIt, void** inBuffer, int numBufs,
+                                            int grids, int blocks);
 
   // Wrappers
-  void float_kernel(void * outBuffer, unsigned int inBufSize,
-                    unsigned int writeIt, void **inBuffer, int numBufs,
-                    int grids, int blocks);
+  void float_kernel(void* outBuffer, unsigned int inBufSize, unsigned int writeIt, void** inBuffer,
+                    int numBufs, int grids, int blocks);
 
-  void float2_kernel(void * outBuffer, unsigned int inBufSize,
-                     unsigned int writeIt, void **inBuffer, int numBufs,
-                     int grids, int blocks);
+  void float2_kernel(void* outBuffer, unsigned int inBufSize, unsigned int writeIt, void** inBuffer,
+                     int numBufs, int grids, int blocks);
 
-  void float4_kernel(void * outBuffer, unsigned int inBufSize,
-                     unsigned int writeIt, void **inBuffer, int numBufs,
-                     int grids, int blocks);
+  void float4_kernel(void* outBuffer, unsigned int inBufSize, unsigned int writeIt, void** inBuffer,
+                     int numBufs, int grids, int blocks);
 
  private:
-  void setData(void *ptr, unsigned int value);
-  void checkData(uint *ptr);
+  void setData(void* ptr, unsigned int value);
+  void checkData(uint* ptr);
 
   unsigned int width_;
   unsigned int bufSize_;
@@ -139,41 +139,36 @@ bool hipPerfSampleRate::open(void) {
   hipDeviceProp_t props;
   HIP_CHECK(hipSetDevice(deviceId));
   HIP_CHECK(hipGetDeviceProperties(&props, deviceId));
-  INFO("info: running on bus " << "0x" << props.pciBusID << " " <<
-       props.name << " with " << props.multiProcessorCount <<
-       " CUs" << " and device id: " << deviceId << "\n");
+  CONSOLE_PRINT("info: running on bus 0x%x %s with %d CUs and device id: %d\n", props.pciBusID,
+                props.name, props.multiProcessorCount, deviceId);
   numCUs = props.multiProcessorCount;
   return true;
 }
 
 // Wrappers for the kernel launches
-void hipPerfSampleRate::float_kernel(void * outBuffer, unsigned int inBufSize,
-                        unsigned int writeIt, void **inBuffer, int numBufs,
-                        int grids, int blocks) {
-  hipLaunchKernelGGL(sampleRateFloat<float>, dim3(grids, grids, grids),
-            dim3(blocks), 0, 0, reinterpret_cast<float*>(outBuffer),
-            inBufSize, writeIt, reinterpret_cast<float**>(inBuffer), numBufs);
+void hipPerfSampleRate::float_kernel(void* outBuffer, unsigned int inBufSize, unsigned int writeIt,
+                                     void** inBuffer, int numBufs, int grids, int blocks) {
+  hipLaunchKernelGGL(sampleRateFloat<float>, dim3(grids, grids, grids), dim3(blocks), 0, 0,
+                     reinterpret_cast<float*>(outBuffer), inBufSize, writeIt,
+                     reinterpret_cast<float**>(inBuffer), numBufs);
 }
 
-void hipPerfSampleRate::float2_kernel(void * outBuffer, unsigned int inBufSize,
-                        unsigned int writeIt, void **inBuffer, int grids,
-                        int blocks, int numBufs) {
-  hipLaunchKernelGGL(sampleRate<float2>, dim3(grids, grids, grids),
-            dim3(blocks), 0, 0, reinterpret_cast<float2 *>(outBuffer),
-            inBufSize, writeIt, reinterpret_cast<float2 **>(inBuffer), numBufs);
+void hipPerfSampleRate::float2_kernel(void* outBuffer, unsigned int inBufSize, unsigned int writeIt,
+                                      void** inBuffer, int numBufs, int grids, int blocks) {
+  hipLaunchKernelGGL(sampleRate<float2>, dim3(grids, grids, grids), dim3(blocks), 0, 0,
+                     reinterpret_cast<float2*>(outBuffer), inBufSize, writeIt,
+                     reinterpret_cast<float2**>(inBuffer), numBufs);
 }
 
-void hipPerfSampleRate::float4_kernel(void * outBuffer, unsigned int inBufSize,
-                        unsigned int writeIt, void **inBuffer, int grids,
-                        int blocks, int numBufs) {
-  hipLaunchKernelGGL(sampleRate<float4>, dim3(grids, grids, grids),
-            dim3(blocks), 0, 0, reinterpret_cast<float4 *>(outBuffer),
-            inBufSize, writeIt, reinterpret_cast<float4 **>(inBuffer), numBufs);
+void hipPerfSampleRate::float4_kernel(void* outBuffer, unsigned int inBufSize, unsigned int writeIt,
+                                      void** inBuffer, int numBufs, int grids, int blocks) {
+  hipLaunchKernelGGL(sampleRate<float4>, dim3(grids, grids, grids), dim3(blocks), 0, 0,
+                     reinterpret_cast<float4*>(outBuffer), inBufSize, writeIt,
+                     reinterpret_cast<float4**>(inBuffer), numBufs);
 }
 
 void hipPerfSampleRate::run(unsigned int test) {
-  funPtr p[] = {&hipPerfSampleRate::float_kernel,
-                &hipPerfSampleRate::float2_kernel,
+  funPtr p[] = {&hipPerfSampleRate::float_kernel, &hipPerfSampleRate::float2_kernel,
                 &hipPerfSampleRate::float4_kernel};
 
   // We compute a square domain
@@ -182,35 +177,30 @@ void hipPerfSampleRate::run(unsigned int test) {
   bufSize_ = width_ * width_ * typeSizes[typeIdx_];
   numBufs_ = (1 << (test / (NUM_SIZES * NUM_TYPES)));
 
-  void ** dPtr;
-  void *  hOutPtr;
-  void *  dOutPtr;
-  void ** hInPtr = new void *[numBufs_];
-  void ** dInPtr = new void *[numBufs_];
+  void** dPtr;
+  void* hOutPtr;
+  void* dOutPtr;
+  void** hInPtr = new void*[numBufs_];
+  void** dInPtr = new void*[numBufs_];
 
-  outBufSize_ =
-      sizes[NUM_SIZES - 1] * sizes[NUM_SIZES - 1] * typeSizes[NUM_TYPES - 1];
+  outBufSize_ = sizes[NUM_SIZES - 1] * sizes[NUM_SIZES - 1] * typeSizes[NUM_TYPES - 1];
 
   // Allocate memory on the host and device
-  HIP_CHECK(hipHostMalloc(reinterpret_cast<void **>(&hOutPtr), outBufSize_,
-                          hipHostMallocDefault));
-  setData(reinterpret_cast<void *>(hOutPtr), 0xdeadbeef);
-  HIP_CHECK(hipMalloc(reinterpret_cast<uint **>(&dOutPtr), outBufSize_));
+  HIP_CHECK(hipHostMalloc(reinterpret_cast<void**>(&hOutPtr), outBufSize_, hipHostMallocDefault));
+  setData(reinterpret_cast<void*>(hOutPtr), 0xdeadbeef);
+  HIP_CHECK(hipMalloc(reinterpret_cast<uint**>(&dOutPtr), outBufSize_));
 
   // Allocate 2D array in Device
-  HIP_CHECK(hipMalloc(reinterpret_cast<void **>(&dPtr),
-                      numBufs_* sizeof(void *)));
+  HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&dPtr), numBufs_ * sizeof(void*)));
 
   for (uint i = 0; i < numBufs_; i++) {
-    HIP_CHECK(hipHostMalloc(reinterpret_cast<void **>(&hInPtr[i]), bufSize_,
-                            hipHostMallocDefault));
-    HIP_CHECK(hipMalloc(reinterpret_cast<uint **>(&dInPtr[i]), bufSize_));
+    HIP_CHECK(hipHostMalloc(reinterpret_cast<void**>(&hInPtr[i]), bufSize_, hipHostMallocDefault));
+    HIP_CHECK(hipMalloc(reinterpret_cast<uint**>(&dInPtr[i]), bufSize_));
     setData(hInPtr[i], 0x3f800000);
   }
 
   // Populate array of pointers with array addresses
-  HIP_CHECK(hipMemcpy(dPtr, dInPtr, numBufs_* sizeof(void *),
-                      hipMemcpyHostToDevice));
+  HIP_CHECK(hipMemcpy(dPtr, dInPtr, numBufs_ * sizeof(void*), hipMemcpyHostToDevice));
 
   // Copy memory from host to device
   for (uint i = 0; i < numBufs_; i++) {
@@ -241,20 +231,19 @@ void hipPerfSampleRate::run(unsigned int test) {
   // Time the kernel execution
   auto all_start = std::chrono::steady_clock::now();
   for (uint i = 0; i < maxIter; i++) {
-        (this->*p[idx]) (reinterpret_cast<void *>(dOutPtr), sizeDW, writeIt,
-                         dPtr, numBufs_, grids, blocks);
+    (this->*p[idx])(reinterpret_cast<void*>(dOutPtr), sizeDW, writeIt, dPtr, numBufs_, grids,
+                    blocks);
   }
 
   HIP_CHECK(hipDeviceSynchronize());
   auto all_end = std::chrono::steady_clock::now();
   std::chrono::duration<double> all_kernel_time = all_end - all_start;
 
-  double perf = (static_cast<double>(outBufSize_ * numBufs_ *
-                 maxIter * (1e-09))) / all_kernel_time.count();
+  double perf =
+      (static_cast<double>(outBufSize_) * numBufs_ * maxIter * (1e-09)) / all_kernel_time.count();
 
-  INFO("Domain " << sizes[NUM_SIZES - 1] << "x"<< sizes[NUM_SIZES - 1]
-        << " bufs " << numBufs_ << " " << types[typeIdx_] << " " << width_
-        << "x" <<width_<< " (GB/s) " << perf << "\n");
+  CONSOLE_PRINT("Domain %u x %u bufs %u %s %u x %u (GB/s) %f\n", sizes[NUM_SIZES - 1],
+                sizes[NUM_SIZES - 1], numBufs_, types[typeIdx_].c_str(), width_, width_, perf);
 
   HIP_CHECK(hipFree(dOutPtr));
 
@@ -265,52 +254,51 @@ void hipPerfSampleRate::run(unsigned int test) {
   }
   HIP_CHECK(hipHostFree(hOutPtr));
   HIP_CHECK(hipFree(dPtr));
-  delete [] hInPtr;
-  delete [] dInPtr;
+  delete[] hInPtr;
+  delete[] dInPtr;
 }
 
 
-void hipPerfSampleRate::setData(void *ptr, unsigned int value) {
-  unsigned int *ptr2 = (unsigned int *)ptr;
+void hipPerfSampleRate::setData(void* ptr, unsigned int value) {
+  unsigned int* ptr2 = (unsigned int*)ptr;
   for (unsigned int i = 0; i < bufSize_ / sizeof(unsigned int); i++) {
     ptr2[i] = value;
   }
 }
 
 
-void hipPerfSampleRate::checkData(uint *ptr) {
+void hipPerfSampleRate::checkData(uint* ptr) {
   for (unsigned int i = 0; i < outBufSize_ / sizeof(float); i++) {
     if (ptr[i] != static_cast<float>(numBufs_)) {
-      INFO("Data validation failed at "<< i << " Got "<< ptr[i]
-           << ", expected " << (float)numBufs_ << "\n");
+      DEBUG_PRINT("Data validation failed at %u Got %u, expected %f\n", i, ptr[i], (float)numBufs_);
       REQUIRE(false);
     }
   }
 }
 
 /**
-* Test Description
-* ------------------------
-*  - Verify hipPerfSampleRate status.
-* Test source
-* ------------------------
-*  - perftests/memory/hipPerfSampleRate.cc
-* Test requirements
-* ------------------------
-*  - HIP_VERSION >= 5.6
-*/
+ * Test Description
+ * ------------------------
+ *  - Verify hipPerfSampleRate status.
+ * Test source
+ * ------------------------
+ *  - perftests/memory/hipPerfSampleRate.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.6
+ */
 
 TEST_CASE("Perf_hipPerfSampleRate_test") {
   hipPerfSampleRate sampleTypes;
 
   REQUIRE(true == sampleTypes.open());
 
-  for (unsigned int testCase = 0; testCase < 216 ; testCase+=36) {
+  for (unsigned int testCase = 0; testCase < 216; testCase += 36) {
     sampleTypes.run(testCase);
   }
 }
 
 /**
-* End doxygen group perfMemoryTest.
-* @}
-*/
+ * End doxygen group perfMemoryTest.
+ * @}
+ */
diff --git a/catch/perftests/memory/hipPerfSharedMemReadSpeed.cc b/catch/perftests/memory/hipPerfSharedMemReadSpeed.cc
index dbf10a04a..67f2e59b6 100644
--- a/catch/perftests/memory/hipPerfSharedMemReadSpeed.cc
+++ b/catch/perftests/memory/hipPerfSharedMemReadSpeed.cc
@@ -18,19 +18,19 @@
  */
 
 /**
-* @addtogroup hipMemcpyKernel hipMemcpyKernel
-* @{
-* @ingroup perfMemoryTest
-* `hipMemcpy(void* dst, const void* src, size_t count, hipMemcpyKind kind)` -
-* Copies data between host and device.
-*/
+ * @addtogroup hipMemcpyKernel hipMemcpyKernel
+ * @{
+ * @ingroup perfMemoryTest
+ * `hipMemcpy(void* dst, const void* src, size_t count, hipMemcpyKind kind)` -
+ * Copies data between host and device.
+ */
 
+// #define ENABLE_DEBUG 1  // keep above the include: DEBUG_PRINT is selected in that header
 #include <hip_test_common.hh>
-
 #define sharedMemSize1 2048
 #define sharedMemSize2 256
 
-__global__ void sharedMemReadSpeed1(float *outBuf, ulong N) {
+__global__ void sharedMemReadSpeed1(float* outBuf, ulong N) {
   size_t gid = (blockIdx.x * blockDim.x + threadIdx.x);
   size_t lid = threadIdx.x;
   __shared__ float local[sharedMemSize1];
@@ -84,7 +84,7 @@ __global__ void sharedMemReadSpeed1(float *outBuf, ulong N) {
   }
 }
 
-__global__ void sharedMemReadSpeed2(float *outBuf, ulong N) {
+__global__ void sharedMemReadSpeed2(float* outBuf, ulong N) {
   size_t gid = (blockIdx.x * blockDim.x + threadIdx.x);
   size_t lid = threadIdx.x;
   __shared__ float local[sharedMemSize2];
@@ -116,8 +116,8 @@ __global__ void sharedMemReadSpeed2(float *outBuf, ulong N) {
 }
 
 static bool hipPerfSharedMemReadSpeed_test() {
-  float *dDst;
-  float *hDst;
+  float* dDst;
+  float* hDst;
   hipStream_t stream;
   constexpr uint numSizes = 4;
   constexpr uint Sizes[numSizes] = {262144, 1048576, 4194304, 16777216};
@@ -132,8 +132,8 @@ static bool hipPerfSharedMemReadSpeed_test() {
   HIP_CHECK(hipSetDevice(device));
   hipDeviceProp_t props;
   HIP_CHECK(hipGetDeviceProperties(&props, device));
-  INFO("info: running on bus " << "0x" << props.pciBusID << " " << props.name
-       << " with " << props.multiProcessorCount << " CUs \n");
+  CONSOLE_PRINT("info: running on bus 0x%x %s with %d CUs\n", props.pciBusID, props.name,
+                props.multiProcessorCount);
 
   HIP_CHECK(hipStreamCreate(&stream));
 
@@ -149,8 +149,8 @@ static bool hipPerfSharedMemReadSpeed_test() {
     HIP_CHECK(hipMalloc(&dDst, nBytes));
     HIP_CHECK(hipMemcpy(dDst, hDst, nBytes, hipMemcpyHostToDevice));
 
-    hipLaunchKernelGGL(sharedMemReadSpeed1, dim3(blocks),
-                       dim3(threadsPerBlock), 0, stream, dDst, N);
+    hipLaunchKernelGGL(sharedMemReadSpeed1, dim3(blocks), dim3(threadsPerBlock), 0, stream, dDst,
+                       N);
     HIP_CHECK(hipMemcpy(hDst, dDst, nBytes, hipMemcpyDeviceToHost));
     HIP_CHECK(hipDeviceSynchronize());
 
@@ -160,8 +160,7 @@ static bool hipPerfSharedMemReadSpeed_test() {
         tmp = 0;
       }
       if (hDst[i] != tmp) {
-        INFO("info: Data validation failed for warm up run! \n");
-        INFO("info: expected " << tmp << " got " << hDst[i] << " \n");
+        DEBUG_PRINT("Data validation failed for warm up run! expected %d got %f\n", tmp, hDst[i]);
         return false;
       }
       tmp += threadsPerBlock / 2;
@@ -169,8 +168,8 @@ static bool hipPerfSharedMemReadSpeed_test() {
 
     auto all_start = std::chrono::steady_clock::now();
     for (int i = 0; i < nIter; i++) {
-      hipLaunchKernelGGL(sharedMemReadSpeed1, dim3(blocks),
-                         dim3(threadsPerBlock), 0, stream, dDst, N);
+      hipLaunchKernelGGL(sharedMemReadSpeed1, dim3(blocks), dim3(threadsPerBlock), 0, stream, dDst,
+                         N);
     }
     HIP_CHECK(hipDeviceSynchronize());
 
@@ -178,15 +177,14 @@ static bool hipPerfSharedMemReadSpeed_test() {
     std::chrono::duration<double> all_kernel_time = all_end - all_start;
 
     // read speed in GB/s
-    double perf = (static_cast<double>(blocks * threadsPerBlock)
-                   * (numReads1 * sizeof(float) + sharedMemSizeBytes1 / 64)
-                   * nIter * (1e-09)) / all_kernel_time.count();
+    double perf = (static_cast<double>(blocks * threadsPerBlock) *
+                   (numReads1 * sizeof(float) + sharedMemSizeBytes1 / 64) * nIter * (1e-09)) /
+        all_kernel_time.count();
 
-    INFO("info: read speed = " << std::setw(8) << perf << " GB/s for " <<
-          sharedMemSizeBytes1 / 1024 << " KB shared memory with " <<
-          std::setw(8) << blocks * threadsPerBlock << " threads, "
-          << std::setw(4) << numReads1 <<
-          " reads in sharedMemReadSpeed1 kernel \n");
+    CONSOLE_PRINT(
+        "info: read speed = %.2f GB/s for %d KB shared memory with %d threads, %d reads in "
+        "sharedMemReadSpeed1 kernel\n",
+        perf, sharedMemSizeBytes1 / 1024, blocks * threadsPerBlock, numReads1);
 
     delete[] hDst;
     HIP_CHECK(hipFree(dDst));
@@ -204,15 +202,15 @@ static bool hipPerfSharedMemReadSpeed_test() {
     HIP_CHECK(hipMalloc(&dDst, nBytes));
     HIP_CHECK(hipMemcpy(dDst, hDst, nBytes, hipMemcpyHostToDevice));
 
-    hipLaunchKernelGGL(sharedMemReadSpeed2, dim3(blocks),
-                       dim3(threadsPerBlock), 0, stream, dDst, N);
+    hipLaunchKernelGGL(sharedMemReadSpeed2, dim3(blocks), dim3(threadsPerBlock), 0, stream, dDst,
+                       N);
     HIP_CHECK(hipMemcpy(hDst, dDst, nBytes, hipMemcpyDeviceToHost));
     HIP_CHECK(hipDeviceSynchronize());
 
     auto all_start = std::chrono::steady_clock::now();
     for (int i = 0; i < nIter; i++) {
-      hipLaunchKernelGGL(sharedMemReadSpeed2, dim3(blocks),
-                         dim3(threadsPerBlock), 0, stream, dDst, N);
+      hipLaunchKernelGGL(sharedMemReadSpeed2, dim3(blocks), dim3(threadsPerBlock), 0, stream, dDst,
+                         N);
     }
     HIP_CHECK(hipDeviceSynchronize());
 
@@ -220,15 +218,14 @@ static bool hipPerfSharedMemReadSpeed_test() {
     std::chrono::duration<double> all_kernel_time = all_end - all_start;
 
     // read speed in GB/s
-    double perf = (static_cast<double>(blocks * threadsPerBlock)
-                   * (numReads2 * sizeof(float) + sharedMemSizeBytes2 / 64)
-                   * nIter * (1e-09)) / all_kernel_time.count();
+    double perf = (static_cast<double>(blocks * threadsPerBlock) *
+                   (numReads2 * sizeof(float) + sharedMemSizeBytes2 / 64) * nIter * (1e-09)) /
+        all_kernel_time.count();
 
-    INFO("info: read speed = " << std::setw(8) << perf << " GB/s for "
-         << sharedMemSizeBytes2 / 1024 << " KB shared memory with "
-         << std::setw(8) << blocks * threadsPerBlock << " threads, "
-         << std::setw(4) << numReads2 <<
-         " reads in sharedMemReadSpeed2 kernel \n");
+    CONSOLE_PRINT(
+        "info: read speed = %.2f GB/s for %d KB shared memory with %d threads, %d reads in "
+        "sharedMemReadSpeed2 kernel\n",
+        perf, sharedMemSizeBytes2 / 1024, blocks * threadsPerBlock, numReads2);
 
     delete[] hDst;
     HIP_CHECK(hipFree(dDst));
@@ -238,30 +235,31 @@ static bool hipPerfSharedMemReadSpeed_test() {
 }
 
 /**
-* Test Description
-* ------------------------
-*  - Verify hipPerfSharedMemReadSpeed status.
-* Test source
-* ------------------------
-*  - perftests/memory/hipPerfSharedMemReadSpeed.cc
-* Test requirements
-* ------------------------
-*  - HIP_VERSION >= 5.6
-*/
+ * Test Description
+ * ------------------------
+ *  - Verify hipPerfSharedMemReadSpeed status.
+ * Test source
+ * ------------------------
+ *  - perftests/memory/hipPerfSharedMemReadSpeed.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.6
+ */
 
 TEST_CASE("Perf_hipPerfSharedMemReadSpeed_test") {
   int numDevices = 0;
   HIP_CHECK(hipGetDeviceCount(&numDevices));
 
   if (numDevices <= 0) {
-    SUCCEED("Skipped testcase hipPerfSharedMemReadSpeed as"
-            "there is no device to test.\n");
+    SUCCEED(
+        "Skipped testcase hipPerfSharedMemReadSpeed as "
+        "there is no device to test.\n");
   } else {
     REQUIRE(true == hipPerfSharedMemReadSpeed_test());
   }
 }
 
 /**
-* End doxygen group perfMemoryTest.
-* @}
-*/
+ * End doxygen group perfMemoryTest.
+ * @}
+ */
diff --git a/catch/perftests/stream/hipPerfDeviceConcurrency.cc b/catch/perftests/stream/hipPerfDeviceConcurrency.cc
index b07c9f49e..dfe1d83c0 100644
--- a/catch/perftests/stream/hipPerfDeviceConcurrency.cc
+++ b/catch/perftests/stream/hipPerfDeviceConcurrency.cc
@@ -18,12 +18,12 @@
  */
 
 /**
-* @addtogroup hipPerfDeviceConcurrency hipPerfDeviceConcurrency
-* @{
-* @ingroup perfStreamTest
-* `hipError_t hipStreamCreate(hipStream_t* stream)` -
-* Create an asynchronous stream.
-*/
+ * @addtogroup hipPerfDeviceConcurrency hipPerfDeviceConcurrency
+ * @{
+ * @ingroup perfStreamTest
+ * `hipError_t hipStreamCreate(hipStream_t* stream)` -
+ * Create an asynchronous stream.
+ */
 
 #include <hip_test_common.hh>
 
@@ -34,28 +34,28 @@ typedef struct {
 } coordRec;
 
 static coordRec coords[] = {
-    {0.0, 0.0, 0.00001},         // All black
+    {0.0, 0.0, 0.00001},  // All black
 };
 
 static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
 
-__global__ void mandelbrot(uint *out, uint width, float xPos,
-                          float yPos, float xStep, float yStep, uint maxIter) {
+__global__ void mandelbrot(uint* out, uint width, float xPos, float yPos, float xStep, float yStep,
+                           uint maxIter) {
   int tid = (blockIdx.x * blockDim.x + threadIdx.x);
   int i = tid % width;
   int j = tid / width;
-  float x0 = static_cast<float>(xPos + xStep*i);
-  float y0 = static_cast<float>(yPos + yStep*j);
+  float x0 = static_cast<float>(xPos + xStep * i);
+  float y0 = static_cast<float>(yPos + yStep * j);
 
   float x = x0;
   float y = y0;
 
   uint iter = 0;
   float tmp;
-  for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
+  for (iter = 0; (x * x + y * y <= 4.0f) && (iter < maxIter); iter++) {
     tmp = x;
     x = fma(-y, y, fma(x, x, x0));
-    y = fma(2.0f*tmp, y, y0);
+    y = fma(2.0f * tmp, y, y0);
   }
   out[tid] = iter;
 };
@@ -65,20 +65,16 @@ class hipPerfDeviceConcurrency {
   hipPerfDeviceConcurrency();
   ~hipPerfDeviceConcurrency();
 
-  void setNumGpus(unsigned int num) {
-    numDevices = num;
-  }
-  unsigned int getNumGpus() {
-    return numDevices;
-  }
+  void setNumGpus(unsigned int num) { numDevices = num; }
+  unsigned int getNumGpus() { return numDevices; }
 
   void open(void);
   void close(void);
   bool run(unsigned int testCase, int numGpus);
 
  private:
-  void setData(void *ptr, unsigned int value);
-  void checkData(uint *ptr);
+  void setData(void* ptr, unsigned int value);
+  void checkData(uint* ptr);
 
   unsigned int numDevices;
   unsigned int width_;
@@ -100,17 +96,16 @@ void hipPerfDeviceConcurrency::open(void) {
   }
 }
 
-void hipPerfDeviceConcurrency::close() {
-}
+void hipPerfDeviceConcurrency::close() {}
 
 bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
   static int deviceId;
-  uint ** hPtr = new uint*[numGpus];
-  uint ** dPtr = new uint*[numGpus];
-  hipStream_t * streams = new hipStream_t[numGpus];
-  int *numCUs = new int[numGpus];
-  unsigned int *maxIter = new unsigned int[numGpus];
-  unsigned long long *expectedIters = new unsigned long long[numGpus];
+  uint** hPtr = new uint*[numGpus];
+  uint** dPtr = new uint*[numGpus];
+  hipStream_t* streams = new hipStream_t[numGpus];
+  int* numCUs = new int[numGpus];
+  unsigned int* maxIter = new unsigned int[numGpus];
+  unsigned long long* expectedIters = new unsigned long long[numGpus];
 
   int threads, threads_per_block, blocks;
   float xStep, yStep, xPos, yPos;
@@ -124,25 +119,21 @@ bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
     hipDeviceProp_t props;
     HIP_CHECK(hipGetDeviceProperties(&props, i));
     if (testCase != 0) {
-    std::cout << "info: running on bus " << "0x" << props.pciBusID
-              << " " << props.name << " with " << props.multiProcessorCount
-              << " CUs" << " and device ID: " << i << std::endl;
+      CONSOLE_PRINT("info: running on bus 0x%x %s with %d CUs and device ID: %d", props.pciBusID,
+                    props.name, props.multiProcessorCount, i);
     }
-
     numCUs[i] = props.multiProcessorCount;
     int clkFrequency = 0;
-    HIP_CHECK(hipDeviceGetAttribute(&clkFrequency,
-                                    hipDeviceAttributeClockRate, i));
+    HIP_CHECK(hipDeviceGetAttribute(&clkFrequency, hipDeviceAttributeClockRate, i));
     if (clkFrequency == 0) {
-      std::cout << "clkFrequency = 0, set it to 1000000\n";
+      CONSOLE_PRINT("clkFrequency = 0, set it to 1000000");
       clkFrequency = 1000000;
     }
-    clkFrequency =(unsigned int)clkFrequency/1000;
+    clkFrequency = (unsigned int)clkFrequency / 1000;
 
     // Maximum iteration count
     // maxIter = 8388608 * (engine_clock / 1000).serial execution
-    maxIter[i] = (unsigned int)(((8388608 * ((float)clkFrequency / 1000))
-                                              * numCUs[i]) / 128);
+    maxIter[i] = (unsigned int)(((8388608 * ((float)clkFrequency / 1000)) * numCUs[i]) / 128);
     maxIter[i] = (maxIter[i] + 15) & ~15;
 
     // Width is divisible by 4 because the mandelbrot
@@ -153,15 +144,14 @@ bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
     HIP_CHECK(hipStreamCreate(&streams[i]));
 
     // Allocate memory on the host and device
-    HIP_CHECK(hipHostMalloc(reinterpret_cast<void **>(&hPtr[i]),
-                            bufSize, hipHostMallocDefault));
+    HIP_CHECK(hipHostMalloc(reinterpret_cast<void**>(&hPtr[i]), bufSize, hipHostMallocDefault));
     setData(hPtr[i], 0xdeadbeef);
-    HIP_CHECK(hipMalloc(reinterpret_cast<uint **>(&dPtr[i]), bufSize))
+    HIP_CHECK(hipMalloc(reinterpret_cast<uint**>(&dPtr[i]), bufSize))
 
     // Prepare kernel launch parameters
-    threads = (bufSize/sizeof(uint));
-    threads_per_block  = 64;
-    blocks = (threads/threads_per_block) + (threads % threads_per_block);
+    threads = (bufSize / sizeof(uint));
+    threads_per_block = 64;
+    blocks = (threads / threads_per_block) + (threads % threads_per_block);
 
     coordIdx = testCase % numCoords;
     xStep = static_cast<float>(coords[coordIdx].width / static_cast<double>(width_));
@@ -180,10 +170,9 @@ bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
       deviceId = i;
     }
 
-  HIP_CHECK(hipSetDevice(deviceId));
-  hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0,
-                    streams[i], dPtr[i], width_, xPos, yPos, xStep,
-                    yStep, maxIter[i]);
+    HIP_CHECK(hipSetDevice(deviceId));
+    hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0, streams[i], dPtr[i],
+                       width_, xPos, yPos, xStep, yStep, maxIter[i]);
   }
   for (int i = 0; i < numGpus; i++) {
     HIP_CHECK(hipStreamSynchronize(0));
@@ -192,8 +181,8 @@ bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
   auto all_end = std::chrono::steady_clock::now();
   std::chrono::duration<double> all_kernel_time = all_end - all_start;
 
-  for(int i = 0; i < numGpus; i++) {
-    if(testCase != 0) {
+  for (int i = 0; i < numGpus; i++) {
+    if (testCase != 0) {
       deviceId = i;
     }
     HIP_CHECK(hipSetDevice(deviceId));
@@ -201,11 +190,11 @@ bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
     // Copy data back from device to the host
     HIP_CHECK(hipMemcpy(hPtr[i], dPtr[i], bufSize, hipMemcpyDeviceToHost));
     checkData(hPtr[i]);
-    expectedIters[i] = width_ * width_ * (unsigned long long) maxIter[i];
+    expectedIters[i] = width_ * width_ * (unsigned long long)maxIter[i];
     if (testCase != 0) {
       checkData(hPtr[i]);
       if (totalIters != expectedIters[i]) {
-        std::cout << "Incorrect iteration count detected" << std::endl;
+        CONSOLE_PRINT("Incorrect iteration count detected");
       }
     }
 
@@ -216,31 +205,30 @@ bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
   }
 
   if (testCase != 0) {
-  std::cout << '\n' << "Measured time for kernel computation on " << numGpus
-            << " device (s): " << all_kernel_time.count() << " (s) "
-            << '\n' << std::endl;
+    CONSOLE_PRINT("\nMeasured time for kernel computation on %d device(s): %.6f (s)\n", numGpus,
+                  all_kernel_time.count());
   }
 
   if (testCase == 0) {
     deviceId++;
   }
-  delete [] hPtr;
-  delete [] dPtr;
-  delete [] streams;
-  delete [] numCUs;
-  delete [] maxIter;
-  delete [] expectedIters;
+  delete[] hPtr;
+  delete[] dPtr;
+  delete[] streams;
+  delete[] numCUs;
+  delete[] maxIter;
+  delete[] expectedIters;
   return true;
 }
 
-void hipPerfDeviceConcurrency::setData(void *ptr, unsigned int value) {
-  unsigned int *ptr2 = (unsigned int *)ptr;
-  for (unsigned int i = 0; i < width_ * width_ ; i++) {
-      ptr2[i] = value;
+void hipPerfDeviceConcurrency::setData(void* ptr, unsigned int value) {
+  unsigned int* ptr2 = (unsigned int*)ptr;
+  for (unsigned int i = 0; i < width_ * width_; i++) {
+    ptr2[i] = value;
   }
 }
 
-void hipPerfDeviceConcurrency::checkData(uint *ptr) {
+void hipPerfDeviceConcurrency::checkData(uint* ptr) {
   totalIters = 0;
   for (unsigned int i = 0; i < width_ * width_; i++) {
     totalIters += ptr[i];
@@ -248,16 +236,16 @@ void hipPerfDeviceConcurrency::checkData(uint *ptr) {
 }
 
 /**
-* Test Description
-* ------------------------
-*  - Verify the different levels of device concurrency.
-* Test source
-* ------------------------
-*  - perftests/stream/hipPerfDeviceConcurrency.cc
-* Test requirements
-* ------------------------
-*  - HIP_VERSION >= 5.6
-*/
+ * Test Description
+ * ------------------------
+ *  - Verify the different levels of device concurrency.
+ * Test source
+ * ------------------------
+ *  - perftests/stream/hipPerfDeviceConcurrency.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.6
+ */
 
 TEST_CASE("Perf_hipPerfDeviceConcurrency") {
   hipPerfDeviceConcurrency deviceConcurrency;
@@ -279,6 +267,6 @@ TEST_CASE("Perf_hipPerfDeviceConcurrency") {
 }
 
 /**
-* End doxygen group perfStreamTest.
-* @}
-*/
+ * End doxygen group perfStreamTest.
+ * @}
+ */
diff --git a/catch/perftests/stream/hipPerfStreamConcurrency.cc b/catch/perftests/stream/hipPerfStreamConcurrency.cc
index ba4a04aa9..aa069e2fd 100644
--- a/catch/perftests/stream/hipPerfStreamConcurrency.cc
+++ b/catch/perftests/stream/hipPerfStreamConcurrency.cc
@@ -18,12 +18,12 @@
  */
 
 /**
-* @addtogroup hipPerfStreamConcurrency hipPerfStreamConcurrency
-* @{
-* @ingroup perfComputeTest
-* `hipError_t hipStreamCreate(hipStream_t* stream)` -
-* Create an asynchronous stream.
-*/
+ * @addtogroup hipPerfStreamConcurrency hipPerfStreamConcurrency
+ * @{
+ * @ingroup perfComputeTest
+ * `hipError_t hipStreamCreate(hipStream_t* stream)` -
+ * Create an asynchronous stream.
+ */
 
 #include <hip_test_common.hh>
 #include <hip/hip_vector_types.h>
@@ -55,23 +55,23 @@ static coordRec coords[] = {
 
 static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
 
-__global__ static void mandelbrot(uint *out, uint width, float xPos, float yPos,
-         float xStep, float yStep, uint maxIter) {
+__global__ static void mandelbrot(uint* out, uint width, float xPos, float yPos, float xStep,
+                                  float yStep, uint maxIter) {
   int tid = (blockIdx.x * blockDim.x + threadIdx.x);
-  int i = tid % (width/4);
-  int j = tid / (width/4);
-  int4 veci = make_int4(4*i, 4*i+1, 4*i+2, 4*i+3);
+  int i = tid % (width / 4);
+  int j = tid / (width / 4);
+  int4 veci = make_int4(4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3);
   int4 vecj = make_int4(j, j, j, j);
   float4 x0;
-  x0.x = static_cast<float>(xPos + xStep*veci.x);
-  x0.y = static_cast<float>(xPos + xStep*veci.y);
-  x0.z = static_cast<float>(xPos + xStep*veci.z);
-  x0.w = static_cast<float>(xPos + xStep*veci.w);
+  x0.x = static_cast<float>(xPos + xStep * veci.x);
+  x0.y = static_cast<float>(xPos + xStep * veci.y);
+  x0.z = static_cast<float>(xPos + xStep * veci.z);
+  x0.w = static_cast<float>(xPos + xStep * veci.w);
   float4 y0;
-  y0.x = static_cast<float>(yPos + yStep*vecj.x);
-  y0.y = static_cast<float>(yPos + yStep*vecj.y);
-  y0.z = static_cast<float>(yPos + yStep*vecj.z);
-  y0.w = static_cast<float>(yPos + yStep*vecj.w);
+  y0.x = static_cast<float>(yPos + yStep * vecj.x);
+  y0.y = static_cast<float>(yPos + yStep * vecj.y);
+  y0.z = static_cast<float>(yPos + yStep * vecj.z);
+  y0.w = static_cast<float>(yPos + yStep * vecj.w);
   float4 x = x0;
   float4 y = y0;
   uint iter = 0;
@@ -80,53 +80,52 @@ __global__ static void mandelbrot(uint *out, uint width, float xPos, float yPos,
   int4 ccount = make_int4(0, 0, 0, 0);
   float4 savx = x;
   float4 savy = y;
-  stay.x = (x.x*x.x+y.x*y.x) <= static_cast<float>(4.0f);
-  stay.y = (x.y*x.y+y.y*y.y) <= static_cast<float>(4.0f);
-  stay.z = (x.z*x.z+y.z*y.z) <= static_cast<float>(4.0f);
-  stay.w = (x.w*x.w+y.w*y.w) <= static_cast<float>(4.0f);
-  for (iter = 0; (stay.x | stay.y | stay.z | stay.w) && (iter < maxIter);
-  iter+=16) {
+  stay.x = (x.x * x.x + y.x * y.x) <= static_cast<float>(4.0f);
+  stay.y = (x.y * x.y + y.y * y.y) <= static_cast<float>(4.0f);
+  stay.z = (x.z * x.z + y.z * y.z) <= static_cast<float>(4.0f);
+  stay.w = (x.w * x.w + y.w * y.w) <= static_cast<float>(4.0f);
+  for (iter = 0; (stay.x | stay.y | stay.z | stay.w) && (iter < maxIter); iter += 16) {
     x = savx;
     y = savy;
     // Two iterations
-    tmp = x*x + x0 - y*y;
+    tmp = x * x + x0 - y * y;
     y = 2.0f * x * y + y0;
-    x = tmp*tmp + x0 - y*y;
+    x = tmp * tmp + x0 - y * y;
     y = 2.0f * tmp * y + y0;
     // Two iterations
-    tmp = x*x + x0 - y*y;
+    tmp = x * x + x0 - y * y;
     y = 2.0f * x * y + y0;
-    x = tmp*tmp + x0 - y*y;
+    x = tmp * tmp + x0 - y * y;
     y = 2.0f * tmp * y + y0;
     // Two iterations
-    tmp = x*x + x0 - y*y;
+    tmp = x * x + x0 - y * y;
     y = 2.0f * x * y + y0;
-    x = tmp*tmp + x0 - y*y;
+    x = tmp * tmp + x0 - y * y;
     y = 2.0f * tmp * y + y0;
     // Two iterations
-    tmp = x*x + x0 - y*y;
+    tmp = x * x + x0 - y * y;
     y = 2.0f * x * y + y0;
-    x = tmp*tmp + x0 - y*y;
+    x = tmp * tmp + x0 - y * y;
     y = 2.0f * tmp * y + y0;
     // Two iterations
-    tmp = x*x + x0 - y*y;
+    tmp = x * x + x0 - y * y;
     y = 2.0f * x * y + y0;
-    x = tmp*tmp + x0 - y*y;
+    x = tmp * tmp + x0 - y * y;
     y = 2.0f * tmp * y + y0;
     // Two iterations
-    tmp = x*x + x0 - y*y;
+    tmp = x * x + x0 - y * y;
     y = 2.0f * x * y + y0;
-    x = tmp*tmp + x0 - y*y;
+    x = tmp * tmp + x0 - y * y;
     y = 2.0f * tmp * y + y0;
     // Two iterations
-    tmp = x*x + x0 - y*y;
+    tmp = x * x + x0 - y * y;
     y = 2.0f * x * y + y0;
-    x = tmp*tmp + x0 - y*y;
+    x = tmp * tmp + x0 - y * y;
     y = 2.0f * tmp * y + y0;
-    stay.x = (x.x*x.x+y.x*y.x) <= static_cast<float>(4.0f);
-    stay.y = (x.y*x.y+y.y*y.y) <= static_cast<float>(4.0f);
-    stay.z = (x.z*x.z+y.z*y.z) <= static_cast<float>(4.0f);
-    stay.w = (x.w*x.w+y.w*y.w) <= static_cast<float>(4.0f);
+    stay.x = (x.x * x.x + y.x * y.x) <= static_cast<float>(4.0f);
+    stay.y = (x.y * x.y + y.y * y.y) <= static_cast<float>(4.0f);
+    stay.z = (x.z * x.z + y.z * y.z) <= static_cast<float>(4.0f);
+    stay.w = (x.w * x.w + y.w * y.w) <= static_cast<float>(4.0f);
     savx.x = static_cast<bool>(stay.x ? x.x : savx.x);
     savx.y = static_cast<bool>(stay.y ? x.y : savx.y);
     savx.z = static_cast<bool>(stay.z ? x.z : savx.z);
@@ -135,10 +134,10 @@ __global__ static void mandelbrot(uint *out, uint width, float xPos, float yPos,
     savy.y = static_cast<bool>(stay.y ? y.y : savy.y);
     savy.z = static_cast<bool>(stay.z ? y.z : savy.z);
     savy.w = static_cast<bool>(stay.w ? y.w : savy.w);
-    ccount.x -= stay.x*16;
-    ccount.y -= stay.y*16;
-    ccount.z -= stay.z*16;
-    ccount.w -= stay.w*16;
+    ccount.x -= stay.x * 16;
+    ccount.y -= stay.y * 16;
+    ccount.z -= stay.z * 16;
+    ccount.w -= stay.w * 16;
   }
   // Handle remainder
   if (!(stay.x & stay.y & stay.z & stay.w)) {
@@ -146,13 +145,13 @@ __global__ static void mandelbrot(uint *out, uint width, float xPos, float yPos,
     do {
       x = savx;
       y = savy;
-      stay.x = ((x.x*x.x+y.x*y.x) <= 4.0f) && (ccount.x <  maxIter);
-      stay.y = ((x.y*x.y+y.y*y.y) <= 4.0f) && (ccount.y <  maxIter);
-      stay.z = ((x.z*x.z+y.z*y.z) <= 4.0f) && (ccount.z <  maxIter);
-      stay.w = ((x.w*x.w+y.w*y.w) <= 4.0f) && (ccount.w <  maxIter);
+      stay.x = ((x.x * x.x + y.x * y.x) <= 4.0f) && (ccount.x < maxIter);
+      stay.y = ((x.y * x.y + y.y * y.y) <= 4.0f) && (ccount.y < maxIter);
+      stay.z = ((x.z * x.z + y.z * y.z) <= 4.0f) && (ccount.z < maxIter);
+      stay.w = ((x.w * x.w + y.w * y.w) <= 4.0f) && (ccount.w < maxIter);
       tmp = x;
-      x = x*x + x0 - y*y;
-      y = 2.0f*tmp*y + y0;
+      x = x * x + x0 - y * y;
+      y = 2.0f * tmp * y + y0;
       ccount.x += stay.x;
       ccount.y += stay.y;
       ccount.z += stay.z;
@@ -168,7 +167,7 @@ __global__ static void mandelbrot(uint *out, uint width, float xPos, float yPos,
       savy.w = (stay.w ? y.w : savy.w);
     } while ((stay.x | stay.y | stay.z | stay.w) && iter);
   }
-  uint4 *vecOut = reinterpret_cast<uint4 *>(out);
+  uint4* vecOut = reinterpret_cast<uint4*>(out);
   vecOut[tid].x = (uint)(ccount.x);
   vecOut[tid].y = (uint)(ccount.y);
   vecOut[tid].z = (uint)(ccount.z);
@@ -180,27 +179,19 @@ class hipPerfStreamConcurrency {
   hipPerfStreamConcurrency();
   ~hipPerfStreamConcurrency();
 
-  void setNumKernels(unsigned int num) {
-    numKernels = num;
-  }
-  void setNumStreams(unsigned int num) {
-    numStreams = num;
-  }
-  unsigned int getNumStreams() {
-    return numStreams;
-  }
+  void setNumKernels(unsigned int num) { numKernels = num; }
+  void setNumStreams(unsigned int num) { numStreams = num; }
+  unsigned int getNumStreams() { return numStreams; }
 
-  unsigned int getNumKernels() {
-    return numKernels;
-  }
+  unsigned int getNumKernels() { return numKernels; }
 
   bool open(int deviceID);
   bool run(unsigned int testCase, unsigned int deviceId);
   void close(void);
 
  private:
-  void setData(void *ptr, unsigned int value);
-  void checkData(uint *ptr);
+  void setData(void* ptr, unsigned int value);
+  void checkData(uint* ptr);
 
   unsigned int numKernels;
   unsigned int numStreams;
@@ -227,38 +218,34 @@ bool hipPerfStreamConcurrency::open(int deviceId) {
   HIP_CHECK(hipSetDevice(deviceId));
   hipDeviceProp_t props;
   HIP_CHECK(hipGetDeviceProperties(&props, deviceId));
-  std::cout << "info: running on bus " << "0x" << props.pciBusID
-    << " " << props.name << " with " << props.multiProcessorCount << " CUs"
-     << " and device id: " << deviceId  << std::endl;
+  CONSOLE_PRINT("info: running on bus 0x%x %s with %d CUs and device ID: %d", props.pciBusID,
+                props.name, props.multiProcessorCount, deviceId);
+
   numCUs = props.multiProcessorCount;
   return true;
 }
 
-void hipPerfStreamConcurrency::close() {
-}
+void hipPerfStreamConcurrency::close() {}
 
-bool hipPerfStreamConcurrency::run(unsigned int testCase,
-                               unsigned int deviceId) {
+bool hipPerfStreamConcurrency::run(unsigned int testCase, unsigned int deviceId) {
   int clkFrequency = 0;
   unsigned int numStreams = getNumStreams();
   unsigned int numKernels = getNumKernels();
 
-  HIP_CHECK(hipDeviceGetAttribute(&clkFrequency,
-             hipDeviceAttributeClockRate, deviceId));
+  HIP_CHECK(hipDeviceGetAttribute(&clkFrequency, hipDeviceAttributeClockRate, deviceId));
   if (clkFrequency == 0) {
-    std::cout << "clkFrequency = 0, set it to 1000000\n";
+    CONSOLE_PRINT("clkFrequency = 0, set it to 1000000\n");
     clkFrequency = 1000000;
   }
-  clkFrequency =(unsigned int)clkFrequency/1000;
+  clkFrequency = (unsigned int)clkFrequency / 1000;
 
   // Maximum iteration count
   // maxIter = 8388608 * (engine_clock / 1000).serial execution
-  maxIter = (unsigned int)(((8388608 * (static_cast<float>(clkFrequency) / 1000))
-                                                        * numCUs) / 128);
+  maxIter = (unsigned int)(((8388608 * (static_cast<float>(clkFrequency) / 1000)) * numCUs) / 128);
   maxIter = (maxIter + 15) & ~15;
-  hipStream_t *streams = new hipStream_t[numStreams];
-  uint ** hPtr = new uint*[numKernels];
-  uint ** dPtr = new uint*[numKernels];
+  hipStream_t* streams = new hipStream_t[numStreams];
+  uint** hPtr = new uint*[numKernels];
+  uint** dPtr = new uint*[numKernels];
 
   // Width is divisible by 4 because the mandelbrot kernel
   // processes 4 pixels at once.
@@ -271,16 +258,15 @@ bool hipPerfStreamConcurrency::run(unsigned int testCase,
 
   // Allocate memory on the host and device
   for (uint i = 0; i < numKernels; i++) {
-    HIP_CHECK(hipHostMalloc(reinterpret_cast<void **>(&hPtr[i]),
-                            bufSize, hipHostMallocDefault));
+    HIP_CHECK(hipHostMalloc(reinterpret_cast<void**>(&hPtr[i]), bufSize, hipHostMallocDefault));
     setData(hPtr[i], 0xdeadbeef);
-    HIP_CHECK(hipMalloc(reinterpret_cast<void **>(&dPtr[i]), bufSize))
+    HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&dPtr[i]), bufSize))
   }
 
   // Prepare kernel launch parameters
-  int threads = (bufSize/sizeof(uint));
-  int threads_per_block  = 64;
-  int blocks = (threads/threads_per_block) + (threads % threads_per_block);
+  int threads = (bufSize / sizeof(uint));
+  int threads_per_block = 64;
+  int blocks = (threads / threads_per_block) + (threads % threads_per_block);
   coordIdx = testCase % numCoords;
   float xStep = static_cast<float>(coords[coordIdx].width / static_cast<double>(width_));
   float yStep = static_cast<float>(-coords[coordIdx].width / static_cast<double>(width_));
@@ -289,8 +275,8 @@ bool hipPerfStreamConcurrency::run(unsigned int testCase,
 
   // Copy memory asynchronously and concurrently from host to device
   for (uint i = 0; i < numKernels; i++) {
-    HIP_CHECK(hipMemcpyHtoDAsync(reinterpret_cast<hipDeviceptr_t>(dPtr[i]),
-                                 hPtr[i], bufSize, streams[i % numStreams]));
+    HIP_CHECK(hipMemcpyHtoDAsync(reinterpret_cast<hipDeviceptr_t>(dPtr[i]), hPtr[i], bufSize,
+                                 streams[i % numStreams]));
   }
 
   // Synchronize to make sure all the copies are completed
@@ -305,9 +291,8 @@ bool hipPerfStreamConcurrency::run(unsigned int testCase,
   auto all_start = std::chrono::steady_clock::now();
 
   for (uint i = 0; i < numKernels; i++) {
-    hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block),
-    0, streams[i%numStreams], dPtr[i], width_, xPos, yPos, xStep,
-     yStep, maxIter);
+    hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0,
+                       streams[i % numStreams], dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter);
   }
 
   // Synchronize all the concurrent streans to have completed execution
@@ -320,17 +305,16 @@ bool hipPerfStreamConcurrency::run(unsigned int testCase,
 
   // Copy data back from device to the host
   for (uint i = 0; i < numKernels; i++) {
-    HIP_CHECK(hipMemcpyDtoHAsync(hPtr[i],
-     reinterpret_cast<hipDeviceptr_t>(dPtr[i]), bufSize,
-      streams[i % numStreams]));
+    HIP_CHECK(hipMemcpyDtoHAsync(hPtr[i], reinterpret_cast<hipDeviceptr_t>(dPtr[i]), bufSize,
+                                 streams[i % numStreams]));
   }
 
   if (testCase != 0) {
-  std::cout <<"Measured time for " << numKernels <<" kernels (s) on "
-  << numStreams <<" stream (s): " << all_kernel_time.count() << std::endl;
+    CONSOLE_PRINT("Measured time for %d kernels (s) on %d stream(s): %e\n", numKernels, numStreams,
+                  all_kernel_time.count());
   }
 
-  for (uint i = 0 ; i < numStreams; i++) {
+  for (uint i = 0; i < numStreams; i++) {
     HIP_CHECK(hipStreamDestroy(streams[i]));
   }
 
@@ -340,20 +324,20 @@ bool hipPerfStreamConcurrency::run(unsigned int testCase,
     HIP_CHECK(hipFree(dPtr[i]));
   }
 
-  delete [] streams;
-  delete [] hPtr;
-  delete [] dPtr;
+  delete[] streams;
+  delete[] hPtr;
+  delete[] dPtr;
   return true;
 }
 
-void hipPerfStreamConcurrency::setData(void *ptr, unsigned int value) {
-  unsigned int *ptr2 = (unsigned int *)ptr;
-  for (unsigned int i = 0; i < width_ ; i++) {
-      ptr2[i] = value;
+void hipPerfStreamConcurrency::setData(void* ptr, unsigned int value) {
+  unsigned int* ptr2 = (unsigned int*)ptr;
+  for (unsigned int i = 0; i < width_; i++) {
+    ptr2[i] = value;
   }
 }
 
-void hipPerfStreamConcurrency::checkData(uint *ptr) {
+void hipPerfStreamConcurrency::checkData(uint* ptr) {
   totalIters = 0;
   for (unsigned int i = 0; i < width_; i++) {
     totalIters += ptr[i];
@@ -361,16 +345,16 @@ void hipPerfStreamConcurrency::checkData(uint *ptr) {
 }
 
 /**
-* Test Description
-* ------------------------
-*  - Verify the different levels of stream concurrency.
-* Test source
-* ------------------------
-*  - perftests/stream/hipPerfStreamConcurrency.cc
-* Test requirements
-* ------------------------
-*  - HIP_VERSION >= 5.6
-*/
+ * Test Description
+ * ------------------------
+ *  - Verify the different levels of stream concurrency.
+ * Test source
+ * ------------------------
+ *  - perftests/stream/hipPerfStreamConcurrency.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.6
+ */
 
 TEST_CASE("Perf_hipPerfStreamConcurrency") {
   hipPerfStreamConcurrency streamConcurrency;
@@ -386,10 +370,10 @@ TEST_CASE("Perf_hipPerfStreamConcurrency") {
         break;
 
       case 1:
-      // default stream executes serially
-      streamConcurrency.setNumStreams(1);
-      streamConcurrency.setNumKernels(1);
-      break;
+        // default stream executes serially
+        streamConcurrency.setNumStreams(1);
+        streamConcurrency.setNumKernels(1);
+        break;
 
       case 2:
         // 2-way concurrency
@@ -419,6 +403,6 @@ TEST_CASE("Perf_hipPerfStreamConcurrency") {
 }
 
 /**
-* End doxygen group perfComputeTest.
-* @}
-*/
+ * End doxygen group perfComputeTest.
+ * @}
+ */
diff --git a/catch/perftests/stream/hipPerfStreamCreateCopyDestroy.cc b/catch/perftests/stream/hipPerfStreamCreateCopyDestroy.cc
index edbe4c004..9b240ac13 100644
--- a/catch/perftests/stream/hipPerfStreamCreateCopyDestroy.cc
+++ b/catch/perftests/stream/hipPerfStreamCreateCopyDestroy.cc
@@ -18,19 +18,17 @@
  */
 
 /**
-* @addtogroup hipPerfStreamCreateCopyDestroy hipPerfStreamCreateCopyDestroy
-* @{
-* @ingroup perfStreamTest
-* `hipError_t hipStreamCreate(hipStream_t* stream)` -
-* Create an asynchronous stream.
-*/
+ * @addtogroup hipPerfStreamCreateCopyDestroy hipPerfStreamCreateCopyDestroy
+ * @{
+ * @ingroup perfStreamTest
+ * `hipError_t hipStreamCreate(hipStream_t* stream)` -
+ * Create an asynchronous stream.
+ */
 
 #include <hip_test_kernels.hh>
 #include <hip_test_checkers.hh>
 #include <hip_test_common.hh>
 
-using namespace std;
-
 #define BufSize 0x1000
 #define Iterations 0x100
 #define TotalStreams 4
@@ -39,17 +37,20 @@ using namespace std;
 
 class hipPerfStreamCreateCopyDestroy {
  private:
-    unsigned int numBuffers_;
-    unsigned int numStreams_;
-    const size_t totalStreams_[TotalStreams];
-    const size_t totalBuffers_[TotalBufs];
+  unsigned int numBuffers_;
+  unsigned int numStreams_;
+  const size_t totalStreams_[TotalStreams];
+  const size_t totalBuffers_[TotalBufs];
+
  public:
-    hipPerfStreamCreateCopyDestroy() : numBuffers_(0), numStreams_(0),
-                                       totalStreams_{1, 2, 4, 8},
-                                       totalBuffers_{1, 100, 1000, 5000} {};
-    ~hipPerfStreamCreateCopyDestroy() {};
-    bool open(int deviceID);
-    bool run(unsigned int testNumber);
+  hipPerfStreamCreateCopyDestroy()
+      : numBuffers_(0),
+        numStreams_(0),
+        totalStreams_{1, 2, 4, 8},
+        totalBuffers_{1, 100, 1000, 5000} {};
+  ~hipPerfStreamCreateCopyDestroy(){};
+  bool open(int deviceID);
+  bool run(unsigned int testNumber);
 };
 
 bool hipPerfStreamCreateCopyDestroy::open(int deviceId) {
@@ -61,20 +62,20 @@ bool hipPerfStreamCreateCopyDestroy::open(int deviceId) {
   HIP_CHECK(hipSetDevice(deviceId));
   hipDeviceProp_t props;
   HIP_CHECK(hipGetDeviceProperties(&props, deviceId));
-  std::cout << "info: running on bus " << "0x" << props.pciBusID
-  << " " << props.name << " with " << props.multiProcessorCount << " CUs"
-  << " and device id: " << deviceId  << std::endl;
+
+  CONSOLE_PRINT("info: running on bus 0x%x %s with %d CUs and device id: %d\n", props.pciBusID,
+                props.name, props.multiProcessorCount, deviceId);
   return true;
 }
 
 bool hipPerfStreamCreateCopyDestroy::run(unsigned int testNumber) {
   numStreams_ = totalStreams_[testNumber % TotalStreams];
-  size_t iter = Iterations / (numStreams_ * (static_cast<size_t>(1)
-                 << (testNumber / TotalBufs + 1)));
-  hipStream_t *streams = new hipStream_t[numStreams_];
+  size_t iter =
+      Iterations / (numStreams_ * (static_cast<size_t>(1) << (testNumber / TotalBufs + 1)));
+  hipStream_t* streams = new hipStream_t[numStreams_];
 
   numBuffers_ = totalBuffers_[testNumber / TotalBufs];
-  float ** dSrc = new float*[numBuffers_];
+  float** dSrc = new float*[numBuffers_];
   size_t nBytes = BufSize * sizeof(float);
 
   for (size_t b = 0; b < numBuffers_; ++b) {
@@ -97,8 +98,7 @@ bool hipPerfStreamCreateCopyDestroy::run(unsigned int testNumber) {
 
     for (size_t s = 0; s < numStreams_; ++s) {
       for (size_t b = 0; b < numBuffers_; ++b) {
-        HIP_CHECK(hipMemcpyWithStream(dSrc[b], hSrc, nBytes,
-                  hipMemcpyHostToDevice, streams[s]));
+        HIP_CHECK(hipMemcpyWithStream(dSrc[b], hSrc, nBytes, hipMemcpyHostToDevice, streams[s]));
       }
     }
 
@@ -112,31 +112,31 @@ bool hipPerfStreamCreateCopyDestroy::run(unsigned int testNumber) {
 
   auto time = static_cast<float>(diff.count() * 1000 / (iter * numStreams_));
 
-  cout << "Create+Copy+Destroy time for " << numStreams_ << " streams and "
-       << setw(4) << numBuffers_ << " buffers " << " and " << setw(4)
-       << iter << " iterations " << time << " (ms) " << endl;
+  CONSOLE_PRINT(
+      "Create+Copy+Destroy time for %u streams and %u buffers and %zu iterations %.6f (ms)\n",
+      numStreams_, numBuffers_, iter, time);
 
-  delete [] hSrc;
+  delete[] hSrc;
   for (size_t b = 0; b < numBuffers_; ++b) {
     HIP_CHECK(hipFree(dSrc[b]));
   }
 
-  delete [] streams;
-  delete [] dSrc;
+  delete[] streams;
+  delete[] dSrc;
   return true;
 }
 
 /**
-* Test Description
-* ------------------------
-*  - Verify the Create+Copy+Destroy time for different stream.
-* Test source
-* ------------------------
-*  - perftests/stream/hipPerfDeviceConcurrency.cc
-* Test requirements
-* ------------------------
-*  - HIP_VERSION >= 5.6
-*/
+ * Test Description
+ * ------------------------
+ *  - Measure the Create+Copy+Destroy time for different stream and buffer counts.
+ * Test source
+ * ------------------------
+ *  - perftests/stream/hipPerfStreamCreateCopyDestroy.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.6
+ */
 
 TEST_CASE("Perf_hipPerfStreamCreateCopyDestroy") {
   hipPerfStreamCreateCopyDestroy streamCCD;
@@ -149,6 +149,6 @@ TEST_CASE("Perf_hipPerfStreamCreateCopyDestroy") {
 }
 
 /**
-* End doxygen group perfStreamTest.
-* @}
-*/
+ * End doxygen group perfStreamTest.
+ * @}
+ */
diff --git a/catch/perftests/vmm/CMakeLists.txt b/catch/perftests/vmm/CMakeLists.txt
new file mode 100644
index 000000000..57f6d0a51
--- /dev/null
+++ b/catch/perftests/vmm/CMakeLists.txt
@@ -0,0 +1,27 @@
+# Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+set(TEST_SRC
+  hipPerfVMMAlloc.cc
+)
+
+hip_add_exe_to_target(NAME perfVMMTest
+                      TEST_SRC ${TEST_SRC}
+                      TEST_TARGET_NAME perf_test)
diff --git a/catch/perftests/vmm/hipPerfVMMAlloc.cc b/catch/perftests/vmm/hipPerfVMMAlloc.cc
new file mode 100644
index 000000000..702615b2a
--- /dev/null
+++ b/catch/perftests/vmm/hipPerfVMMAlloc.cc
@@ -0,0 +1,305 @@
+/*
+Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ * @addtogroup hipMemCreate hipMemCreate
+ * @{
+ * @ingroup perfVMMTest
+ * `hipMemCreate(hipMemGenericAllocationHandle_t* handle, size_t size,
+ *               const hipMemAllocationProp* prop, unsigned long long flags)` -
+ * Creates a memory allocation described by the properties and size.
+ */
+
+#include <hip_test_common.hh>
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Measure the CPU time of the VMM reserve, create, map, unmap, release and free calls.
+ * Test source
+ * ------------------------
+ *  - perftests/vmm/hipPerfVMMAlloc.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.6
+ */
+
+// Control Variables (compile-time switches and byte-size units used by this test)
+constexpr bool single_map = false;  // true: one hipMemCreate/hipMemMap per size; false: per chunk
+constexpr bool debug_failure = false;  // true: REQUIRE the read-back matches; false: print only
+constexpr size_t kMB = (1024 * 1024);  // bytes in 1 MiB
+constexpr size_t kGB = (1024 * 1024 * 1024);  // bytes in 1 GiB
+constexpr size_t chunk_size = (64 * kMB);  // bytes per physical allocation in the chunked path
+
+bool CheckVMMSupportedOnDevice(int deviceId) {  // true when the device reports VMM support
+  int vmm_supported = 0;
+  const hipDeviceAttribute_t query = hipDeviceAttributeVirtualMemoryManagementSupported;
+  HIP_CHECK(hipDeviceGetAttribute(&vmm_supported, query, deviceId));
+  return vmm_supported != 0;
+}
+
+bool GetVMMGranularityOnDevice(int deviceId, size_t& granularity) {
+  hipMemAllocationProp request{};  // Describes a pinned allocation located on `deviceId`.
+  request.location.id = deviceId;
+  request.location.type = hipMemLocationTypeDevice;
+  request.type = hipMemAllocationTypePinned;
+  const auto option = hipMemAllocationGranularityMinimum;
+  HIP_CHECK(hipMemGetAllocationGranularity(&granularity, &request, option));
+  return true;  // Always true when reached; error handling is left to HIP_CHECK.
+}
+
+template <typename T> size_t GetSizeN(size_t total_size) {  // number of T in total_size bytes
+  if (total_size % sizeof(T) != 0) {
+    INFO("Size " << total_size << " is not a multiple of size of type T");
+    assert(false);  // No-op when NDEBUG is defined; the quotient below is then truncated.
+  }
+  return (total_size / sizeof(T));
+}
+
+// Round-trips an int pattern through `dev_ptr` to validate the mapping and time both copies.
+// h2d_elapsed / d2h_elapsed receive the high_resolution_clock time around each hipMemcpy.
+// Returns false on the first mismatch; with debug_failure set, REQUIRE does the comparison.
+bool ValidateUsingCopy(int deviceId, void* dev_ptr, size_t data_size,
+                       std::chrono::microseconds& h2d_elapsed,
+                       std::chrono::microseconds& d2h_elapsed) {
+  // data_size is in bytes: size the host buffers in ints, not sizeof(int) times larger.
+  const size_t size_n = GetSizeN<int>(data_size);
+  const size_t copy_bytes = size_n * sizeof(int);  // Never exceeds the host buffers.
+  std::vector<int> A_h(size_n), B_h(size_n);  // B_h starts zero-filled.
+  for (size_t idx = 0; idx < size_n; ++idx) {
+    A_h[idx] = static_cast<int>(idx);  // Only equality with the read-back matters.
+  }
+
+  HIP_CHECK(hipSetDevice(deviceId));
+  auto start = std::chrono::high_resolution_clock::now();
+  HIP_CHECK(hipMemcpy(dev_ptr, A_h.data(), copy_bytes, hipMemcpyHostToDevice));
+  auto end = std::chrono::high_resolution_clock::now();
+  h2d_elapsed = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+  start = std::chrono::high_resolution_clock::now();
+  HIP_CHECK(hipMemcpy(B_h.data(), dev_ptr, copy_bytes, hipMemcpyDeviceToHost));
+  end = std::chrono::high_resolution_clock::now();
+  d2h_elapsed = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+  if (debug_failure) {
+    REQUIRE(true == std::equal(B_h.begin(), B_h.end(), A_h.data()));
+    return true;
+  }
+  for (size_t idx = 0; idx < size_n; ++idx) {
+    if (A_h[idx] != B_h[idx]) {
+      CONSOLE_PRINT("Failed at first index: %zu Expected: %d Value: %d", idx, A_h[idx],
+                    B_h[idx]);
+      return false;  // Lets the caller's failure branch fire instead of always seeing true.
+    }
+  }
+  return true;
+}
+
+bool TestOnDevice(int deviceId) {
+  HIP_CHECK(hipSetDevice(deviceId));
+  // Check if VMM is supported
+  if (!CheckVMMSupportedOnDevice(deviceId)) {
+    INFO("VMM is not suppored on device: " << deviceId);
+    return false;
+  }
+  // Get VMM granularity
+  size_t granularity = 0;
+  if (!GetVMMGranularityOnDevice(deviceId, granularity)) {
+    INFO("Cannot get granularity on device: " << deviceId);
+    return false;
+  }
+
+  // Measure CPU time of allocation taken
+  size_t start_size = 1 * kGB;
+  size_t max_size = 64 * kGB;
+  for (size_t size_idx = start_size; size_idx <= max_size; (size_idx <<= 1)) {
+    void* dev_ptr = nullptr;
+    // This seems to be a completely blocking call, measuring CPU time in this test for now.
+    // Create Memory Reservation
+    auto start = std::chrono::high_resolution_clock::now();
+    HIP_CHECK(hipMemAddressReserve(&dev_ptr, size_idx, granularity, nullptr, 0));
+    auto end = std::chrono::high_resolution_clock::now();
+    std::chrono::microseconds reserve_elapsed
+      = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+    std::vector<hipMemGenericAllocationHandle_t> physmem_handles;
+    std::chrono::microseconds alloc_elapsed;
+    std::chrono::microseconds map_elapsed;
+    assert(size_idx % chunk_size == 0);
+    size_t chunk_max = size_idx / chunk_size;
+    size_t chunk_idx = 0;
+
+    size_t freeVRAM = 0, totalVRAM = 0;
+    HIP_CHECK(hipMemGetInfo(&freeVRAM, &totalVRAM));
+    INFO("Available total device memory : " << totalVRAM);
+
+    if (freeVRAM < size_idx) {
+      WARN("Further free device memory unavailable, hence exiting!");
+      break;
+    }
+
+    if (single_map) {
+      // Create Physical memory
+      hipMemAllocationProp prop{};
+      prop.type = hipMemAllocationTypePinned;
+      prop.location.type = hipMemLocationTypeDevice;
+      prop.location.id = deviceId;
+      hipMemGenericAllocationHandle_t handle;
+
+      start = std::chrono::high_resolution_clock::now();
+      HIP_CHECK(hipMemCreate(&handle, size_idx, &prop, 0));
+      end = std::chrono::high_resolution_clock::now();
+      alloc_elapsed = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+      physmem_handles.push_back(handle);
+
+      // Map the memory
+      start = std::chrono::high_resolution_clock::now();
+      HIP_CHECK(hipMemMap(dev_ptr, size_idx, 0, handle, 0));
+      end = std::chrono::high_resolution_clock::now();
+      map_elapsed = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+    } else {
+      start = std::chrono::high_resolution_clock::now();
+      while (chunk_idx < chunk_max) {
+        hipMemAllocationProp prop{};
+        prop.type = hipMemAllocationTypePinned;
+        prop.location.type = hipMemLocationTypeDevice;
+        prop.location.id = deviceId;
+        hipMemGenericAllocationHandle_t handle;
+        HIP_CHECK(hipMemCreate(&handle, chunk_size, &prop, 0));
+        physmem_handles.push_back(handle);
+        ++chunk_idx;
+      }
+      end = std::chrono::high_resolution_clock::now();
+      alloc_elapsed = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+      chunk_idx = 0;
+      start = std::chrono::high_resolution_clock::now();
+      while (chunk_idx < chunk_max) {
+        // Use chunk size to map multiple maps
+        uint64_t uiptr = reinterpret_cast<uint64_t>(dev_ptr);
+        uiptr = uiptr + chunk_idx * chunk_size;
+        HIP_CHECK(hipMemMap(reinterpret_cast<void*>(uiptr), chunk_size, 0,
+                            physmem_handles[chunk_idx], 0));
+        ++chunk_idx;
+      }
+      end = std::chrono::high_resolution_clock::now();
+      map_elapsed = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+      chunk_idx = 0;
+      while (chunk_idx < chunk_max) {
+        // Set access
+        hipMemAccessDesc accessDesc = {};
+        accessDesc.location.type = hipMemLocationTypeDevice;
+        accessDesc.location.id = deviceId;
+        accessDesc.flags = hipMemAccessFlagsProtReadWrite;
+        uint64_t uiptr = reinterpret_cast<uint64_t>(dev_ptr);
+        uiptr = uiptr + chunk_idx * chunk_size;
+        // Make the address accessible to GPU 0
+        HIP_CHECK(hipMemSetAccess(reinterpret_cast<void*>(uiptr), chunk_size, &accessDesc, 1));
+        ++chunk_idx;
+      }
+    }
+
+    // Also measure the memcpy time elapsed
+    std::chrono::microseconds h2d_elapsed;
+    std::chrono::microseconds d2h_elapsed;
+
+    // Validate using copy
+    if (!ValidateUsingCopy(deviceId, dev_ptr, size_idx, h2d_elapsed, d2h_elapsed)) {
+      INFO("Validation failed for size: " << size_idx);
+    }
+
+    start = std::chrono::high_resolution_clock::now();
+    chunk_idx = 0;
+    while (chunk_idx < chunk_max) {
+      uint64_t uiptr = reinterpret_cast<uint64_t>(dev_ptr);
+      uiptr = uiptr + chunk_idx * chunk_size;
+      HIP_CHECK(hipMemUnmap(reinterpret_cast<void*>(uiptr), chunk_size));
+      ++chunk_idx;
+    }
+    end = std::chrono::high_resolution_clock::now();
+    std::chrono::microseconds unmap_elapsed
+      = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+    start = std::chrono::high_resolution_clock::now();
+    for (auto& physmem_handle : physmem_handles) {
+      HIP_CHECK(hipMemRelease(physmem_handle));
+    }
+    end = std::chrono::high_resolution_clock::now();
+    std::chrono::microseconds release_elapsed
+      = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+    start = std::chrono::high_resolution_clock::now();
+    HIP_CHECK(hipMemAddressFree(dev_ptr, size_idx));
+    end = std::chrono::high_resolution_clock::now();
+    std::chrono::microseconds free_elapsed
+      = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+    // Print the results
+    std::cout << "-------------Size: " << (size_idx / kGB) << " GB----------------" << std::endl;
+    std::cout << "Time taken to reserve : " << reserve_elapsed.count()
+              << " micro seconds and free: " << free_elapsed.count()
+              << " micro seconds" << std::endl;
+    std::cout <<"Time taken to alloc : " << alloc_elapsed.count()
+              << " micro seconds and release: "<< release_elapsed.count()
+              << " micro seconds" << std::endl;
+    std::cout << "Time taken to map : " << map_elapsed.count()
+              << " micro seconds and unmap: " << unmap_elapsed.count()
+              << " micro seconds" << std::endl;
+    std::cout << "Time taken to H2D : " << h2d_elapsed.count()
+              << " micro seconds and D2H: " << d2h_elapsed.count() << " micro seconds" << std::endl;
+    std::cout << "-------------------------/hipMallocPerf------------------------" << std::endl;
+
+    void* dev_ptr_legacy = nullptr;
+    start = std::chrono::high_resolution_clock::now();
+    HIP_CHECK(hipMalloc(&dev_ptr_legacy, size_idx));
+    end = std::chrono::high_resolution_clock::now();
+    std::chrono::microseconds hm_elapsed
+      = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+    start = std::chrono::high_resolution_clock::now();
+    HIP_CHECK(hipFree(dev_ptr_legacy));
+    end = std::chrono::high_resolution_clock::now();
+    std::chrono::microseconds hf_elapsed
+      = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+    std::cout << "Time taken for hipMalloc : " << hm_elapsed.count()
+              << " micro seconds and hipFree: " << hf_elapsed.count()
+              << " micro seconds" << std::endl;
+    std::cout << "---------------------------------------------------------------" << std::endl;
+    std::cout << std::endl;
+  }
+ 
+  return true;
+}
+
+TEST_CASE("Perf_hipPerfVMMAllocSpeed_test") {
+  int numDevices = 0;
+  HIP_CHECK(hipGetDeviceCount(&numDevices));
+  if (numDevices <= 0) {
+    // No device to measure: record a pass with a skip note instead of failing.
+    SUCCEED("Skipped testcase Perf_hipPerfVMMAllocSpeed_test as "
+            "there is no device to test.");
+  } else {
+    // Only the primary device (id 0) is exercised; the helper's result is not checked here.
+    int deviceId = 0;
+    TestOnDevice(deviceId);
+  }
+}
+/**
+ * End doxygen group perfVMMTest.
+ * @}
+ */
diff --git a/catch/unit/atomics/arithmetic_common.hh b/catch/unit/atomics/arithmetic_common.hh
index 3ff5bb4d7..2b4db75dd 100644
--- a/catch/unit/atomics/arithmetic_common.hh
+++ b/catch/unit/atomics/arithmetic_common.hh
@@ -446,6 +446,7 @@ void TestCore(const TestParams& p) {
 
  // Launch Kernel
   for (auto i = 0u; i < p.num_devices; ++i) {
+    HIP_CHECK(hipSetDevice(i));
     for (auto j = 0u; j < p.kernel_count; ++j) {
       const auto& stream = streams[i * p.kernel_count + j].stream();
       const auto old_vals = old_vals_devs[i].ptr() + j * p.ThreadCount();
@@ -580,6 +581,8 @@ void MultipleDeviceMultipleKernelAndHostTest(const unsigned int num_devices,
     }
   }
 
+  CHECK_P2P_SUPPORT
+
   if (kernel_count > 1) {
     for (auto i = 0u; i < num_devices; ++i) {
       int canAccess  = 0;
diff --git a/catch/unit/atomics/atomicExch_common.hh b/catch/unit/atomics/atomicExch_common.hh
index e0fcf84c8..ba3105de6 100644
--- a/catch/unit/atomics/atomicExch_common.hh
+++ b/catch/unit/atomics/atomicExch_common.hh
@@ -395,8 +395,21 @@ void AtomicExchMultipleDeviceMultipleKernelAndHostTest(const unsigned int num_de
     }
   }
 
+  CHECK_P2P_SUPPORT
+
   if (kernel_count > 1) {
     for (auto i = 0u; i < num_devices; ++i) {
+      int canAccess  = 0;
+      for (auto j = 0u; j < num_devices; ++j) {
+        if (i != j) {
+          HIP_CHECK(hipDeviceCanAccessPeer(&canAccess, i, j));
+          if(canAccess == 0) {
+            std::string msg = "P2P access check failed between dev1:" + std::to_string(i) + ",dev2:" + std::to_string(j);
+            HipTest::HIP_SKIP_TEST(msg.c_str());
+            return;
+          }
+        }
+      }
       int concurrent_kernels = 0;
       HIP_CHECK(hipDeviceGetAttribute(&concurrent_kernels, hipDeviceAttributeConcurrentKernels, i));
       if (!concurrent_kernels) {
diff --git a/catch/unit/atomics/bitwise_common.hh b/catch/unit/atomics/bitwise_common.hh
index b61167968..9c7bf0f5d 100644
--- a/catch/unit/atomics/bitwise_common.hh
+++ b/catch/unit/atomics/bitwise_common.hh
@@ -272,6 +272,7 @@ void TestCore(const TestParams& p) {
   }
   // Launch Kernel and get back old vals
   for (auto i = 0u; i < p.num_devices; ++i) {
+    HIP_CHECK(hipSetDevice(i));
     for (auto j = 0u; j < p.kernel_count; ++j) {
       const auto& stream = streams[i * p.kernel_count + j].stream();
       const auto old_vals = old_vals_devs[i].ptr() + j * p.ThreadCount();
diff --git a/catch/unit/atomics/min_max_common.hh b/catch/unit/atomics/min_max_common.hh
index 467cfdcae..1f6d180f8 100644
--- a/catch/unit/atomics/min_max_common.hh
+++ b/catch/unit/atomics/min_max_common.hh
@@ -302,6 +302,7 @@ void TestCore(const TestParams& p) {
 
   // Launch kernel
   for (auto i = 0u; i < p.num_devices; ++i) {
+    HIP_CHECK(hipSetDevice(i));
     for (auto j = 0u; j < p.kernel_count; ++j) {
       const auto& stream = streams[i * p.kernel_count + j].stream();
       const auto old_vals = old_vals_devs[i].ptr() + j * p.ThreadCount();
@@ -422,8 +423,21 @@ void MultipleDeviceMultipleKernelTest(const unsigned int num_devices,
     }
   }
 
+  CHECK_P2P_SUPPORT
+
   if (kernel_count > 1) {
     for (auto i = 0u; i < num_devices; ++i) {
+      int canAccess  = 0;
+      for (auto j = 0u; j < num_devices; ++j) {
+        if (i != j) {
+          HIP_CHECK(hipDeviceCanAccessPeer(&canAccess, i, j));
+          if(canAccess == 0) {
+            std::string msg = "P2P access check failed between dev1:" + std::to_string(i) + ",dev2:" + std::to_string(j);
+            HipTest::HIP_SKIP_TEST(msg.c_str());
+            return;
+          }
+        }
+      }
       int concurrent_kernels = 0;
       HIP_CHECK(hipDeviceGetAttribute(&concurrent_kernels, hipDeviceAttributeConcurrentKernels, i));
       if (!concurrent_kernels) {
diff --git a/catch/unit/atomics/unsafeAtomicAdd.cc b/catch/unit/atomics/unsafeAtomicAdd.cc
index 1fde5861c..0e31a38e6 100644
--- a/catch/unit/atomics/unsafeAtomicAdd.cc
+++ b/catch/unit/atomics/unsafeAtomicAdd.cc
@@ -172,6 +172,7 @@ TEMPLATE_TEST_CASE("Unit_unsafe_atomic_add_half_and_bfloat", "", __half2, __hip_
 
   REQUIRE(hout.x == 32.0f);
   REQUIRE(hout.y == 64.0f);
+  HIP_CHECK(hipFree(out));
 }
 
 /**
diff --git a/catch/unit/compiler/CMakeLists.txt b/catch/unit/compiler/CMakeLists.txt
index 2b2f50c52..6f1278d40 100644
--- a/catch/unit/compiler/CMakeLists.txt
+++ b/catch/unit/compiler/CMakeLists.txt
@@ -19,7 +19,7 @@ if(HIP_PLATFORM MATCHES "amd")
                       TEST_TARGET_NAME build_tests)
 
   set(OFFLOAD_ARCH_GENERIC_STR "--offload-arch=gfx9-generic --offload-arch=gfx9-4-generic:sramecc+:xnack- --offload-arch=gfx9-4-generic:sramecc-:xnack- --offload-arch=gfx9-4-generic:xnack+ --offload-arch=gfx10-1-generic --offload-arch=gfx10-3-generic --offload-arch=gfx11-generic --offload-arch=gfx12-generic")
-  
+
   set(DISABLE_GENERIC_TARGET_ONLY)
 
   # Build hipSquareGenericTargetOnly to cover generic targets only
@@ -62,7 +62,7 @@ if(HIP_PLATFORM MATCHES "amd")
     set_property(GLOBAL APPEND PROPERTY G_INSTALL_CUSTOM_TARGETS ${CMAKE_CURRENT_BINARY_DIR}/${GENERIC_TARGET_ONLY_EXE})
     set_property(GLOBAL APPEND PROPERTY G_INSTALL_CUSTOM_TARGETS ${CMAKE_CURRENT_BINARY_DIR}/${GENERIC_TARGET_ONLY_COMPRESSED_EXE})
   else()
-    set(DISABLE_GENERIC_TARGET_ONLY "-DNO_GENERIC_TARGET_ONLY_TEST") 
+    set(DISABLE_GENERIC_TARGET_ONLY "-DNO_GENERIC_TARGET_ONLY_TEST")
   endif()
 
   # Build hipSquareGenericTarget to cover generic targets and the specific target
@@ -84,4 +84,19 @@ if(HIP_PLATFORM MATCHES "amd")
     add_dependencies(hipSquareGenericTarget hipSquareGenericTargetOnly)
     add_dependencies(hipSquareGenericTarget hipSquareGenericTargetOnlyCompressed)
   endif()
+
+  # SWDEV-548807 skip building hipSpirvTest
+  if(false)
+    add_custom_target(hipSpirvTest ALL
+                    COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CURRENT_SOURCE_DIR}/hipSpirvTest.cc
+                    ${CMAKE_CURRENT_SOURCE_DIR}/../../hipTestMain/hip_test_context.cc
+                    ${CMAKE_CURRENT_SOURCE_DIR}/../../hipTestMain/main.cc
+                    -I${CMAKE_CURRENT_SOURCE_DIR}/../../include
+                    -I${CMAKE_CURRENT_SOURCE_DIR}/../../external/Catch2
+                    -I${CMAKE_CURRENT_SOURCE_DIR}/../../external/picojson
+                    -I${HIP_PATH}/include/ --rocm-path=${ROCM_PATH} --offload-arch=amdgcnspirv
+                    -o ${CMAKE_CURRENT_BINARY_DIR}/../../unit/compiler/hipSpirvTest)
+    add_dependencies(CompilerTest hipSpirvTest)
+  endif()
+
 endif()
diff --git a/catch/unit/compiler/hipSpirvTest.cc b/catch/unit/compiler/hipSpirvTest.cc
new file mode 100644
index 000000000..cd54350fd
--- /dev/null
+++ b/catch/unit/compiler/hipSpirvTest.cc
@@ -0,0 +1,28 @@
+/*
+Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+
+__global__ void kernel() { asm volatile("v_nop" ::: "memory"); }
+
+// Compiled with --offload-arch=amdgcnspirv to verify SPIRV mode; the launch is not error-checked.
+TEST_CASE("Unit_test_spirv_mode") { kernel<<<1, 32>>>(); }
\ No newline at end of file
diff --git a/catch/unit/compiler/hipSquare.cc b/catch/unit/compiler/hipSquare.cc
index 693f2b2e5..94f4529cd 100644
--- a/catch/unit/compiler/hipSquare.cc
+++ b/catch/unit/compiler/hipSquare.cc
@@ -78,6 +78,10 @@ TEST_CASE("Unit_test_compressed_codeobject") {
           HIP_CHECK(hipErrorUnknown);
       }
   }
+  HIP_CHECK(hipFree(A_d));
+  HIP_CHECK(hipFree(C_d));
+  free(A_h);
+  free(C_h);
   printf("PASSED!\n");
   REQUIRE(true);
 }
diff --git a/catch/unit/context/CMakeLists.txt b/catch/unit/context/CMakeLists.txt
index 0d20df1bf..8f5ac897a 100644
--- a/catch/unit/context/CMakeLists.txt
+++ b/catch/unit/context/CMakeLists.txt
@@ -22,6 +22,7 @@ set(TEST_SRC
   hipDrvGetPCIBusId.cc
   hipDrvMemcpy.cc
   hipMemsetD8.cc
+  hipCtxNotSupported.cc
 )
 hip_add_exe_to_target(NAME Context
                       TEST_SRC ${TEST_SRC}
diff --git a/catch/unit/context/hipCtxNotSupported.cc b/catch/unit/context/hipCtxNotSupported.cc
new file mode 100644
index 000000000..cf6392bc0
--- /dev/null
+++ b/catch/unit/context/hipCtxNotSupported.cc
@@ -0,0 +1,264 @@
+/*
+Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Validates the error codes returned by hipDevicePrimaryCtxSetFlags:
+ *    -# When the device id passed is negative (-1)
+ *      - Expected output: return `hipErrorInvalidDevice`
+ *    -# When device 0 is passed
+ *      - Expected output: return `hipErrorContextAlreadyInUse`
+ * Test source
+ * ------------------------
+ *  - unit/context/hipCtxNotSupported.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 6.4
+ */
+TEST_CASE("Unit_hipDevicePrimaryCtxSetFlags_Negative") {
+  hipDevice_t dev;
+  unsigned int flags = 0;
+  SECTION("Negative device index") {
+    dev = static_cast<hipDevice_t>(-1);
+    auto res = hipDevicePrimaryCtxSetFlags(dev, flags);
+    REQUIRE(res == hipErrorInvalidDevice);
+  }
+  SECTION("Valid device index") {
+    dev = static_cast<hipDevice_t>(0);
+    auto res = hipDevicePrimaryCtxSetFlags(dev, flags);
+    REQUIRE(res == hipErrorContextAlreadyInUse);
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Checks hipDevicePrimaryCtxReset succeeds on device 0 and validates hipCtxGetDevice:
+ *    -# When nullptr passed to hipCtxGetDevice
+ *      - Expected output: return `hipErrorInvalidValue`
+ *    -# When a non-nullptr is passed
+ *      - Expected output: returned device ID, within [0, numDevices)
+ * Test source
+ * ------------------------
+ *  - unit/context/hipCtxNotSupported.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 6.4
+ */
+TEST_CASE("Unit_hipDeviceAPIs_not_supported") {
+  hipDevice_t device;
+  int numDevices = -1;
+  HIP_CHECK(hipDeviceGet(&device, 0));
+  auto res = hipGetDeviceCount(&numDevices);
+  REQUIRE(res == hipSuccess);
+  REQUIRE(numDevices > 0);
+
+  SECTION("hipDevicePrimaryCtxReset_not_supported") { HIP_CHECK(hipDevicePrimaryCtxReset(device)); }
+
+  SECTION("hipCtxGetDevice_not_supported") {
+    SECTION("hipCtxGetDevice") {
+      auto res = hipCtxGetDevice(nullptr);
+      REQUIRE(res == hipErrorInvalidValue);
+    }
+    SECTION("hipCtxGetDevice_deviceCount") {
+      hipDevice_t dev = static_cast<hipDevice_t>(-1);
+      HIP_CHECK(hipCtxGetDevice(&dev));
+      // Ensure the returned device ID is within [0, numDevices)
+      REQUIRE(dev >= 0);
+      REQUIRE(dev < numDevices);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Validates hipCtxSetCacheConfig and hipCtxGetCacheConfig:
+ *    -# When hipCtxSetCacheConfig gets a value outside the four hipFuncCache_t constants
+ *      - Expected output: return `hipErrorInvalidValue`
+ *    -# When hipCtxSetCacheConfig gets a valid enum value, or hipCtxGetCacheConfig is called
+ *      - Expected output: return `hipErrorNotSupported`
+ * Test source
+ * ------------------------
+ *  - unit/context/hipCtxNotSupported.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 6.4
+ */
+TEST_CASE("Unit_hipCtxGetSetCacheConfig_not_supported") {
+  hipFuncCache_t cacheConfig;
+  SECTION("hipCtxSetCacheConfig_not_supported") {
+    SECTION("Invalid enum value") {
+      // any value not equal to the four valid hipFuncCache_t constants
+      cacheConfig = static_cast<hipFuncCache_t>(0x100);
+      auto res = hipCtxSetCacheConfig(cacheConfig);
+      REQUIRE(res == hipErrorInvalidValue);
+    }
+
+    SECTION("Valid enum values") {
+      std::array<hipFuncCache_t, 4> validCfgs = {hipFuncCachePreferNone, hipFuncCachePreferShared,
+                                                 hipFuncCachePreferL1, hipFuncCachePreferEqual};
+
+      for (auto cfg : validCfgs) {
+        auto res = hipCtxSetCacheConfig(cfg);
+        REQUIRE(res == hipErrorNotSupported);
+      }
+    }
+  }
+  SECTION("hipCtxGetCacheConfig_not_supported") {
+    auto res = hipCtxGetCacheConfig(&cacheConfig);
+    REQUIRE(res == hipErrorNotSupported);
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - hipCtxGetSharedMemConfig is verified to return hipSuccess and report
+ *    hipSharedMemBankSizeFourByte
+ *  - hipCtxSetSharedMemConfig is verified to return hipErrorNotSupported
+ * Test source
+ * ------------------------
+ *  - unit/context/hipCtxNotSupported.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 6.4
+ */
+TEST_CASE("Unit_hipCtxGetSetSharedMemConfig_not_supported") {
+  hipSharedMemConfig config;
+  config = hipSharedMemBankSizeEightByte;
+  SECTION("hipCtxGetSharedMemConfig_not_supported") {
+    auto res = hipCtxGetSharedMemConfig(&config);
+    REQUIRE(res == hipSuccess);
+    REQUIRE(config == hipSharedMemBankSizeFourByte);
+  }
+  SECTION("hipCtxSetSharedMemConfig_not_supported") {
+    auto res = hipCtxSetSharedMemConfig(config);
+    REQUIRE(res == hipErrorNotSupported);
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - hipCtxEnablePeerAccess and hipCtxDisablePeerAccess are called with a null
+ *    peer context and verified to return hipSuccess
+ * Test source
+ * ------------------------
+ *  - unit/context/hipCtxNotSupported.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 6.4
+ */
+TEST_CASE("Unit_hipCtxPeerAccess_not_supported") {
+  hipCtx_t peerCtx = nullptr;
+  unsigned int flags = 0;
+  SECTION("hipCtxEnablePeerAccess_not_supported") {
+    auto res = hipCtxEnablePeerAccess(peerCtx, flags);
+    REQUIRE(res == hipSuccess);
+  }
+  SECTION("hipCtxDisablePeerAccess_not_supported") {
+    auto res = hipCtxDisablePeerAccess(peerCtx);
+    REQUIRE(res == hipSuccess);
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - hipCtxGetFlags, hipCtxSynchronize and hipCtxGetApiVersion return hipErrorNotSupported
+ * Test source
+ * ------------------------
+ *  - unit/context/hipCtxNotSupported.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 6.4
+ */
+TEST_CASE("Unit_hipCtxAPIs_not_supported") {
+  SECTION("hipCtxGetFlags_not_supported") {
+    unsigned int flags = 0x100;
+    auto res = hipCtxGetFlags(&flags);
+    REQUIRE(res == hipErrorNotSupported);
+    // In release builds (asserts disabled), flags should remain unchanged
+    REQUIRE(flags == 0x100);
+  }
+
+  SECTION("hipCtxSynchronize_not_supported") {
+    auto res = hipCtxSynchronize();
+    REQUIRE(res == hipErrorNotSupported);
+  }
+
+  SECTION("hipCtxGetApiVersion_not_supported") {
+    hipCtx_t ctx = nullptr;
+    unsigned int apiVersion;
+    auto res = hipCtxGetApiVersion(ctx, &apiVersion);
+    REQUIRE(res == hipErrorNotSupported);
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Goes through the retain-reset-release cycle on a valid and invalid device:
+ *    Verifies
+ *    - a non-null primary context is returned for device 0
+ *    - hipDevicePrimaryCtxGetState succeeds (the returned flags/active are not asserted)
+ *    - `hipErrorInvalidDevice` is returned by retain, get-state and release for device -1
+ * Test source
+ * ------------------------
+ *  - unit/context/hipCtxNotSupported.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 6.4
+ */
+TEST_CASE("hipDevicePrimaryCtxGetState_Negative") {
+  hipDevice_t device;
+  HIP_CHECK(hipDeviceGet(&device, 0));
+  hipCtx_t primaryCtx = nullptr;
+
+  SECTION("Valid device") {
+    HIP_CHECK(hipDevicePrimaryCtxRetain(&primaryCtx, device));
+    REQUIRE(primaryCtx != nullptr);
+    // Make it current
+    HIP_CHECK(hipCtxSetCurrent(primaryCtx));
+    unsigned int flags = 0;
+    int active = 0;
+    HIP_CHECK(hipDevicePrimaryCtxGetState(device, &flags, &active));
+    // Reset the primary context
+    HIP_CHECK(hipDevicePrimaryCtxReset(device));
+    // Release our retain-handle
+    HIP_CHECK(hipDevicePrimaryCtxRelease(device));
+  }
+  SECTION("Invalid device") {
+    device = -1;
+    // Retain must fail for the invalid device
+    auto res = hipDevicePrimaryCtxRetain(&primaryCtx, device);
+    REQUIRE(res == hipErrorInvalidDevice);
+    unsigned int flags = 0;
+    int active = 0;
+    res = hipDevicePrimaryCtxGetState(device, &flags, &active);
+    REQUIRE(res == hipErrorInvalidDevice);
+    // Release must also reject the invalid device (nothing was retained above)
+    res = hipDevicePrimaryCtxRelease(device);
+    REQUIRE(res == hipErrorInvalidDevice);
+  }
+}
diff --git a/catch/unit/cooperativeGrps/coalesced_groups_shfl_down_old.cc b/catch/unit/cooperativeGrps/coalesced_groups_shfl_down_old.cc
index 675da24dd..f395c5978 100644
--- a/catch/unit/cooperativeGrps/coalesced_groups_shfl_down_old.cc
+++ b/catch/unit/cooperativeGrps/coalesced_groups_shfl_down_old.cc
@@ -233,6 +233,7 @@ static void test_shfl_down() {
 
     HIPCHECK(hipHostFree(hPtr));
     HIPCHECK(hipFree(dPtr));
+    HIPCHECK(hipFree(dResults));
     free(cpuResultsArr);
   }
 }
diff --git a/catch/unit/cooperativeGrps/coalesced_groups_shfl_up_old.cc b/catch/unit/cooperativeGrps/coalesced_groups_shfl_up_old.cc
index 54cd62c19..0118caf45 100644
--- a/catch/unit/cooperativeGrps/coalesced_groups_shfl_up_old.cc
+++ b/catch/unit/cooperativeGrps/coalesced_groups_shfl_up_old.cc
@@ -221,6 +221,7 @@ static void test_shfl_up() {
 
     HIPCHECK(hipHostFree(hPtr));
     HIPCHECK(hipFree(dPtr));
+    HIPCHECK(hipFree(dResults));
     free(cpuResultsArr);
   }
 }
diff --git a/catch/unit/cooperativeGrps/hipCGCoalescedGroups_old.cc b/catch/unit/cooperativeGrps/hipCGCoalescedGroups_old.cc
index 11933e99d..9f0d2d854 100644
--- a/catch/unit/cooperativeGrps/hipCGCoalescedGroups_old.cc
+++ b/catch/unit/cooperativeGrps/hipCGCoalescedGroups_old.cc
@@ -394,6 +394,8 @@ static void test_shfl_any_to_any() {
 
     HIPCHECK(hipHostFree(hPtr));
     HIPCHECK(hipFree(dPtr));
+    HIPCHECK(hipFree(dResults));
+    HIPCHECK(hipFree(dsrcArr));
     free(srcArr);
     free(srcArrCpu);
     free(cpuResultsArr);
@@ -461,6 +463,7 @@ static void test_shfl_broadcast() {
 
     HIPCHECK(hipHostFree(hPtr));
     HIPCHECK(hipFree(dPtr));
+    HIPCHECK(hipFree(dResults));
     free(cpuResultsArr);
   }
 }
@@ -554,4 +557,11 @@ TEST_CASE("Unit_coalesced_groups") {
 
   std::cout << "Now grouping active threads based on branch divergence" << '\n' << std::endl;
   test_active_threads_grouping();
+
+  HIPCHECK(hipFree(d_data_to_filter));
+  HIPCHECK(hipFree(d_filtered_data));
+  HIPCHECK(hipFree(d_nres));
+  free(data_to_filter);
+  free(filtered_data);
+  free(host_filtered_data);
 }
diff --git a/catch/unit/executionControl/hipExtLaunchMultiKernelMultiDevice.cc b/catch/unit/executionControl/hipExtLaunchMultiKernelMultiDevice.cc
index fd1dcc829..2c19a33d1 100644
--- a/catch/unit/executionControl/hipExtLaunchMultiKernelMultiDevice.cc
+++ b/catch/unit/executionControl/hipExtLaunchMultiKernelMultiDevice.cc
@@ -141,4 +141,4 @@ TEST_CASE("Unit_hipExtLaunchMultiKernelMultiDevice_Negative_MultiKernelSameDevic
   for (const auto params : params_list) {
     HIP_CHECK(hipStreamDestroy(params.stream));
   }
-}
\ No newline at end of file
+}
diff --git a/catch/unit/graph/hipGetProcAddressGraphApis.cc b/catch/unit/graph/hipGetProcAddressGraphApis.cc
index 25fbc6cd2..63e50979a 100644
--- a/catch/unit/graph/hipGetProcAddressGraphApis.cc
+++ b/catch/unit/graph/hipGetProcAddressGraphApis.cc
@@ -375,9 +375,7 @@ TEST_CASE("Unit_hipGetProcAddress_GraphAPIs_AddMemsetMemcpyNodes") {
   hipGraphExec_t graphExec;
   HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
   HIP_CHECK(hipGraphLaunch(graphExec, 0));
-  #ifdef _WIN32
   HIP_CHECK(hipStreamSynchronize(0));
-  #endif
 
   REQUIRE(validateArrayT<char>(hostMemDst, N, value) == true);
 
diff --git a/catch/unit/memory/hipMallocAsync.cc b/catch/unit/memory/hipMallocAsync.cc
index 3a5baf4f1..6adb1088a 100644
--- a/catch/unit/memory/hipMallocAsync.cc
+++ b/catch/unit/memory/hipMallocAsync.cc
@@ -319,8 +319,8 @@ TEST_CASE("Unit_hipMallocAsync_Multidevice") {
  *    - HIP_VERSION >= 6.2
  */
 #if HT_AMD
-static void threadQAsyncCommands(streamMemAllocTest* testObj,
-                                hipStream_t strm) {
+static void threadQAsyncCommands(streamMemAllocTest* testObj, hipStream_t strm, int idx) {
+  HIP_CHECK(hipSetDevice(idx));
   // Create host buffer with test data.
   testObj->createHostBufferWithData();
   // Allocate device memory and transfer data to it asyncronously on stream.
@@ -350,7 +350,7 @@ TEST_CASE("Unit_hipMallocAsync_Multidevice_Concurrent") {
   // Queue commands in each device
   for (int idx = 0; idx < num_devices; idx++) {
     HIP_CHECK(hipSetDevice(idx));
-    std::thread test(threadQAsyncCommands, tesObjBuf[idx], stream_buf[idx]);
+    std::thread test(threadQAsyncCommands, tesObjBuf[idx], stream_buf[idx], idx);
     test.join();
   }
   // Wait for the streams
@@ -405,10 +405,10 @@ TEST_CASE("Unit_hipMallocAsync_Multidevice_MultiStream") {
   // Queue commands in each device
   for (int idx = 0; idx < num_devices; idx++) {
     HIP_CHECK(hipSetDevice(idx));
-    std::thread test1(threadQAsyncCommands, tesObjBuf[streamPerAsic*idx],
-                    stream_buf[streamPerAsic*idx]);
-    std::thread test2(threadQAsyncCommands, tesObjBuf[streamPerAsic*idx + 1],
-                    stream_buf[streamPerAsic*idx + 1]);
+    std::thread test1(threadQAsyncCommands, tesObjBuf[streamPerAsic * idx],
+                      stream_buf[streamPerAsic * idx], idx);
+    std::thread test2(threadQAsyncCommands, tesObjBuf[streamPerAsic * idx + 1],
+                      stream_buf[streamPerAsic * idx + 1], idx);
     test1.join();
     test2.join();
   }
diff --git a/catch/unit/memory/hipMallocFromPoolAsync.cc b/catch/unit/memory/hipMallocFromPoolAsync.cc
index 6993dfa87..ca9f41f07 100644
--- a/catch/unit/memory/hipMallocFromPoolAsync.cc
+++ b/catch/unit/memory/hipMallocFromPoolAsync.cc
@@ -371,8 +371,8 @@ TEST_CASE("Unit_hipMallocFromPoolAsync_ReleaseThreshold_Mgpu") {
 /**
  * Local Thread Functions
  */
-static void threadQAsyncCommands(streamMemAllocTest* testObj,
-                                hipStream_t strm) {
+static void threadQAsyncCommands(streamMemAllocTest* testObj, hipStream_t strm, int idx) {
+  HIP_CHECK(hipSetDevice(idx));
   // Create host buffer with test data.
   testObj->createHostBufferWithData();
   // Allocate device memory and transfer data to it asyncronously on stream.
@@ -616,7 +616,7 @@ TEST_CASE("Unit_hipMallocFromPoolAsync_Multidevice_Concurrent") {
   // Queue commands in each device
   for (int idx = 0; idx < num_devices; idx++) {
     HIP_CHECK(hipSetDevice(idx));
-    std::thread test(threadQAsyncCommands, tesObjBuf[idx], stream_buf[idx]);
+    std::thread test(threadQAsyncCommands, tesObjBuf[idx], stream_buf[idx], idx);
     test.join();
   }
   // Wait for the streams
@@ -675,10 +675,10 @@ TEST_CASE("Unit_hipMallocFromPoolAsync_Multidevice_MultiStream") {
   // Queue commands in each device
   for (int idx = 0; idx < num_devices; idx++) {
     HIP_CHECK(hipSetDevice(idx));
-    std::thread test1(threadQAsyncCommands, tesObjBuf[streamPerAsic*idx],
-                    stream_buf[streamPerAsic*idx]);
-    std::thread test2(threadQAsyncCommands, tesObjBuf[streamPerAsic*idx + 1],
-                    stream_buf[streamPerAsic*idx + 1]);
+    std::thread test1(threadQAsyncCommands, tesObjBuf[streamPerAsic * idx],
+                      stream_buf[streamPerAsic * idx], idx);
+    std::thread test2(threadQAsyncCommands, tesObjBuf[streamPerAsic * idx + 1],
+                      stream_buf[streamPerAsic * idx + 1], idx);
     test1.join();
     test2.join();
   }
diff --git a/catch/unit/memory/hipMemAdvise_old.cc b/catch/unit/memory/hipMemAdvise_old.cc
index 9b785aa61..6116aa34b 100644
--- a/catch/unit/memory/hipMemAdvise_old.cc
+++ b/catch/unit/memory/hipMemAdvise_old.cc
@@ -836,7 +836,6 @@ TEST_CASE("Unit_hipMemAdvise_ReadMosltyMgpuTst") {
     int *Hmm = NULL, NumElms = (1024 * 1024), InitVal = 123, blockSize = 64;
     int *Hmm1 = NULL, DataMismatch = 0;
     hipStream_t strm;
-    HIP_CHECK(hipStreamCreate(&strm));
     HIP_CHECK(hipMallocManaged(&Hmm, (NumElms * sizeof(int))));
     // Initializing memory
     for (int i = 0; i < NumElms; ++i) {
@@ -852,6 +851,7 @@ TEST_CASE("Unit_hipMemAdvise_ReadMosltyMgpuTst") {
       for (int i = 1; i < Ngpus; ++i) {
         DataMismatch = 0;
         HIP_CHECK(hipSetDevice(i));
+        HIP_CHECK(hipStreamCreate(&strm));
         HIP_CHECK(hipMallocManaged(&Hmm1, (NumElms * sizeof(int))));
         MemAdvise3<<<dimGrid, dimBlock, 0, strm>>>(Hmm, Hmm1, NumElms);
         HIP_CHECK(hipStreamSynchronize(strm));
@@ -865,6 +865,7 @@ TEST_CASE("Unit_hipMemAdvise_ReadMosltyMgpuTst") {
           WARN("DataMismatch is observed with the gpu: " << i);
           REQUIRE(false);
         }
+        HIP_CHECK(hipStreamDestroy(strm));
         HIP_CHECK(hipFree(Hmm1));
       }
     }
@@ -873,10 +874,12 @@ TEST_CASE("Unit_hipMemAdvise_ReadMosltyMgpuTst") {
       for (int i = 0; i < Ngpus; ++i) {
         DataMismatch = 0;
         HIP_CHECK(hipSetDevice(i));
+        HIP_CHECK(hipStreamCreate(&strm));
         HIP_CHECK(hipMemAdvise(Hmm, (NumElms * sizeof(int)),
                                hipMemAdviseSetReadMostly, i));
         MemAdvise2<<<dimGrid, dimBlock, 0, strm>>>(Hmm, NumElms);
         HIP_CHECK(hipStreamSynchronize(strm));
+        HIP_CHECK(hipStreamDestroy(strm));
       }
       // verifying the final result
       for (int i = 0; i < NumElms; ++i) {
@@ -892,7 +895,7 @@ TEST_CASE("Unit_hipMemAdvise_ReadMosltyMgpuTst") {
     }
 #endif
     HIP_CHECK(hipFree(Hmm));
-    HIP_CHECK(hipStreamDestroy(strm));
+
   } else {
     SUCCEED("GPU 0 doesn't support hipDeviceAttributeManagedMemory "
            "attribute. Hence skipping the testing with Pass result.\n");
diff --git a/catch/unit/memory/hipMemcpyPeerAsync.cc b/catch/unit/memory/hipMemcpyPeerAsync.cc
index 5e1b384ff..c27fa9c0b 100644
--- a/catch/unit/memory/hipMemcpyPeerAsync.cc
+++ b/catch/unit/memory/hipMemcpyPeerAsync.cc
@@ -51,9 +51,6 @@ TEST_CASE("Unit_hipMemcpyPeerAsync_Positive_Default") {
     HipTest::HIP_SKIP_TEST("Skipping because devices < 2");
     return;
   }
-  const auto stream_type = GENERATE(Streams::nullstream, Streams::perThread, Streams::created);
-  const StreamGuard stream_guard(stream_type);
-  const hipStream_t stream = stream_guard.stream();
 
   const auto allocation_size = GENERATE(kPageSize / 2, kPageSize, kPageSize * 2);
 
@@ -64,6 +61,11 @@ TEST_CASE("Unit_hipMemcpyPeerAsync_Positive_Default") {
   INFO("Src device: " << src_device << ", Dst device: " << dst_device);
 
   HIP_CHECK(hipSetDevice(src_device));
+
+  const auto stream_type = GENERATE(Streams::nullstream, Streams::perThread, Streams::created);
+  const StreamGuard stream_guard(stream_type);
+  const hipStream_t stream = stream_guard.stream();
+
   HIP_CHECK(hipDeviceCanAccessPeer(&can_access_peer, src_device, dst_device));
   if (can_access_peer) {
     HIP_CHECK(hipDeviceEnablePeerAccess(dst_device, 0));
diff --git a/catch/unit/memory/hipMemcpyWithStreamMultiThread.cc b/catch/unit/memory/hipMemcpyWithStreamMultiThread.cc
index 6e906d6ab..45d455e5c 100644
--- a/catch/unit/memory/hipMemcpyWithStreamMultiThread.cc
+++ b/catch/unit/memory/hipMemcpyWithStreamMultiThread.cc
@@ -510,6 +510,7 @@ void HipMemcpyWithStreamMultiThreadtests::TestkindDefaultForDtoD(bool& val_res)
   }
 
   for (int i = 0; i < numDevices; ++i) {
+    HIP_CHECK_THREAD(hipSetDevice(i));
     hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, stream[i],
                        static_cast<const int*>(A_d[i]), static_cast<const int*>(B_d[i]), C_d[i], N);
     HIP_CHECK_THREAD(hipGetLastError());
diff --git a/catch/unit/memory/hipMemcpyWithStream_old.cc b/catch/unit/memory/hipMemcpyWithStream_old.cc
index 2b9bba966..b0f730994 100644
--- a/catch/unit/memory/hipMemcpyWithStream_old.cc
+++ b/catch/unit/memory/hipMemcpyWithStream_old.cc
@@ -475,6 +475,7 @@ void TestkindDefaultForDtoD(void) {
     }
 
     for (int i=0; i < NumDevices; ++i) {
+      HIP_CHECK(hipSetDevice(i));
       hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks),
                          dim3(threadsPerBlock),
                          0, stream[i], static_cast<const int*>(A_d[i]),
diff --git a/catch/unit/memory/hipSVMTestByteGranularity.cpp b/catch/unit/memory/hipSVMTestByteGranularity.cpp
index 9f3a89614..207bf7d1b 100644
--- a/catch/unit/memory/hipSVMTestByteGranularity.cpp
+++ b/catch/unit/memory/hipSVMTestByteGranularity.cpp
@@ -109,6 +109,7 @@ TEST_CASE("test_svm_byte_granularity") {
   // get all the devices going simultaneously
   for(unsigned int d = 0; d < num_devices; d++)  // device ids starting at 1.
   {
+    HIP_CHECK(hipSetDevice(d));
     write_owned_locations<<<num_elements, 1, 0, streams[d]>>>(pA, num_devices_plus_host, d);
     HIP_CHECK(hipGetLastError());
   }
@@ -125,6 +126,7 @@ TEST_CASE("test_svm_byte_granularity") {
   size_t adjusted_num_elements = num_elements - num_devices;
   for(unsigned int d = 0; d < num_devices; d++)
   {
+    HIP_CHECK(hipSetDevice(d));
     sum_neighbor_locations<<<adjusted_num_elements, 1, 0, streams[d]>>>(pA, num_devices_plus_host,
                                                                      error_counts[d]);
     HIP_CHECK(hipGetLastError());
diff --git a/catch/unit/memory/hipSVMTestFineGrainMemoryConsistency.cpp b/catch/unit/memory/hipSVMTestFineGrainMemoryConsistency.cpp
index cd5dd8fa6..655327ba5 100644
--- a/catch/unit/memory/hipSVMTestFineGrainMemoryConsistency.cpp
+++ b/catch/unit/memory/hipSVMTestFineGrainMemoryConsistency.cpp
@@ -129,6 +129,7 @@ void launch_kernels_and_verify(std::vector<hipStream_t> &streams, unsigned int n
   // all the pixels.
   for(unsigned int d=0; d < num_devices; d++)
   {
+    HIP_CHECK(hipSetDevice(d));
     build_hash_table_on_device<<<(num_pixels + 255) / 256, 256, 0, streams[d]>>>(
         pInputImage, num_pixels, pNodes, pNumNodes, numBins, d);
     HIP_CHECK(hipGetLastError());
diff --git a/catch/unit/memory/hipSVMTestSharedAddressSpaceFineGrain.cpp b/catch/unit/memory/hipSVMTestSharedAddressSpaceFineGrain.cpp
index 83dc5b870..ee7944fc9 100644
--- a/catch/unit/memory/hipSVMTestSharedAddressSpaceFineGrain.cpp
+++ b/catch/unit/memory/hipSVMTestSharedAddressSpaceFineGrain.cpp
@@ -208,6 +208,7 @@ TEST_CASE("test_svm_shared_address_space_fine_grain_buffers") {
       }
       else
       {
+        HIP_CHECK(hipSetDevice(ci));
         create_linked_lists_on_device(streams[ci], pNodes, pAllocator, numLists,
                                              ListLength);
       }
@@ -218,6 +219,7 @@ TEST_CASE("test_svm_shared_address_space_fine_grain_buffers") {
       }
       else
       {
+        HIP_CHECK(hipSetDevice(vi));
         verify_linked_lists_on_device(streams[vi], pNodes, pNumCorrect, numLists,
                                              ListLength);
       }
diff --git a/catch/unit/memory/hipStreamAttachMemAsync.cc b/catch/unit/memory/hipStreamAttachMemAsync.cc
index df9c5895b..f2b0a9a0e 100644
--- a/catch/unit/memory/hipStreamAttachMemAsync.cc
+++ b/catch/unit/memory/hipStreamAttachMemAsync.cc
@@ -87,6 +87,9 @@ TEST_CASE("Unit_hipStreamAttachMemAsync_Positive_AttachGlobal") {
   HIP_CHECK(hipStreamSynchronize(nullptr));
 
   for (int i = 0; i < stream_count; ++i) {
+    if (device_count > 1) {
+      HIP_CHECK(hipSetDevice(i));
+    }
     HipTest::launchKernel(Set, 1, 1, 0, streams.at(i)->stream(), managed_global.ptr() + i, i);
   }
 
diff --git a/catch/unit/memory/mempool_common.hh b/catch/unit/memory/mempool_common.hh
index 50d0a2f56..1a01f567b 100644
--- a/catch/unit/memory/mempool_common.hh
+++ b/catch/unit/memory/mempool_common.hh
@@ -407,6 +407,7 @@ class streamMemAllocTest {
                         dim3(THREADS_PER_BLOCK), 0, stream,
                         static_cast<const int*>(A_d),
                         static_cast<const int*>(B_d), C_d, size);
+    HIP_CHECK(hipGetLastError());
   }
   // Transfer data from device to host asynchronously.
   void transferFromMempool(hipStream_t stream) {
diff --git a/catch/unit/module/hipExtModuleLaunchKernel.cc b/catch/unit/module/hipExtModuleLaunchKernel.cc
index 3fe64b8bb..4faf712d9 100644
--- a/catch/unit/module/hipExtModuleLaunchKernel.cc
+++ b/catch/unit/module/hipExtModuleLaunchKernel.cc
@@ -389,7 +389,7 @@ void ModuleLaunchKernel::AllocateMemory() {
   args2.clockRate = clkRate;
   size1 = sizeof(args1);
   size2 = sizeof(args2);
-  size3 = sizeof(args3);
+  size3 = 0;
   HIP_CHECK(hipEventCreate(&start_event1));
   HIP_CHECK(hipEventCreate(&end_event1));
   HIP_CHECK(hipEventCreate(&start_event2));
diff --git a/catch/unit/stream/hipStreamLegacy_Ext.cc b/catch/unit/stream/hipStreamLegacy_Ext.cc
index 9b7b1d554..0f7c64327 100644
--- a/catch/unit/stream/hipStreamLegacy_Ext.cc
+++ b/catch/unit/stream/hipStreamLegacy_Ext.cc
@@ -731,9 +731,8 @@ TEST_CASE("Unit_hipStreamLegacy_TwoThreadsInTwoDevicesEachOneDiffOperation") {
   HIP_CHECK(hipSetDevice(0));
 
   std::thread dev0Thread(operationsInDev0, devArrDev0, devArrDev1);
-  std::thread dev1Thread(operationsInDev1, devArrDev1, hostArrDst);
-
   dev0Thread.join();
+  std::thread dev1Thread(operationsInDev1, devArrDev1, hostArrDst);
   dev1Thread.join();
 
   for ( int i = 0; i < N; i++ ) {
diff --git a/catch/unit/streamperthread/hipStreamPerThread_Basic.cc b/catch/unit/streamperthread/hipStreamPerThread_Basic.cc
index 51437b453..0d2ccc961 100644
--- a/catch/unit/streamperthread/hipStreamPerThread_Basic.cc
+++ b/catch/unit/streamperthread/hipStreamPerThread_Basic.cc
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
@@ -135,4 +135,8 @@ TEST_CASE("Unit_hipStreamPerThread_MemcpyAsync") {
   for (unsigned int i = 0; i < ele_size; ++i) {
     REQUIRE(A_h[i] == 123);
   }
-}
\ No newline at end of file
+
+  // Clean-up
+  HIP_CHECK(hipHostFree(A_h));
+  HIP_CHECK(hipFree(A_d));
+}
diff --git a/catch/unit/streamperthread/hipStreamPerThread_DeviceReset.cc b/catch/unit/streamperthread/hipStreamPerThread_DeviceReset.cc
index 9eea33da6..ca11e50d6 100644
--- a/catch/unit/streamperthread/hipStreamPerThread_DeviceReset.cc
+++ b/catch/unit/streamperthread/hipStreamPerThread_DeviceReset.cc
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
@@ -88,6 +88,9 @@ TEST_CASE("Unit_hipStreamPerThread_DeviceReset_2") {
   if (status != hipSuccess) return;
   HIP_CHECK(hipStreamSynchronize(hipStreamPerThread));
 
+  // Host Memory is not destroyed with hipDeviceReset, need to free it
+  // explicitly to avoid memory leaks
+  HIP_CHECK(hipHostFree(A_h));
   HIP_CHECK(hipDeviceReset());
 
   // After reset all memory objects will be destroyed hence allocating them again
diff --git a/catch/unit/streamperthread/hipStreamPerThread_MultiThread.cc b/catch/unit/streamperthread/hipStreamPerThread_MultiThread.cc
index ec34c6de7..914cf6acd 100644
--- a/catch/unit/streamperthread/hipStreamPerThread_MultiThread.cc
+++ b/catch/unit/streamperthread/hipStreamPerThread_MultiThread.cc
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
@@ -34,6 +34,9 @@ static void Copy_to_device() {
   }
   HIP_CHECK(hipMemcpyAsync(A_d, A_h, ele_size * sizeof(int), hipMemcpyHostToDevice,
                  hipStreamPerThread));
+  // Clean up
+  HIP_CHECK(hipHostFree(A_h));
+  HIP_CHECK(hipFree(A_d));
 }
 
 /*
diff --git a/catch/unit/texture/CMakeLists.txt b/catch/unit/texture/CMakeLists.txt
index 69a578542..19ca57e57 100644
--- a/catch/unit/texture/CMakeLists.txt
+++ b/catch/unit/texture/CMakeLists.txt
@@ -64,6 +64,12 @@ set(TEST_SRC
     hipTexRefGetFlags.cc
     hipTexRefSetAddressMode.cc
     hipTexRefGetAddressMode.cc
+    hipTexRefSetGetFilterMode.cc
+    hipTexRefSetGetMipmapFilterMode.cc
+    hipTexRefSetGetMipmapLevelBias.cc
+    hipTexRefSetGetMipmapLevelClamp.cc
+    hipTexRefSetGetMipmappedArray.cc
+
 )
 
 # tests not for gfx90a+
diff --git a/catch/unit/texture/hipTexRefSetGetFilterMode.cc b/catch/unit/texture/hipTexRefSetGetFilterMode.cc
new file mode 100644
index 000000000..6203e2d85
--- /dev/null
+++ b/catch/unit/texture/hipTexRefSetGetFilterMode.cc
@@ -0,0 +1,65 @@
+/*
+Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+texture<float, 2, hipReadModeElementType> tex;
+// Test for hipTexRefSetFilterMode and hipTexRefGetFilterMode, including error handling
+TEST_CASE("Unit_hipTexRefSetGetFilterMode") {
+  CHECK_IMAGE_SUPPORT;
+
+  // Retrieve the texture reference for our symbol
+  const textureReference* texRefConst = nullptr;
+  HIP_CHECK(hipGetTextureReference(&texRefConst, &tex));
+  REQUIRE(texRefConst != nullptr);
+  // Implementation expects non-const textureReference*
+  textureReference* texRef = const_cast<textureReference*>(texRefConst);
+
+  hipTextureFilterMode mode;
+
+  SECTION("Default filter mode is Point") {
+    HIP_CHECK(hipTexRefGetFilterMode(&mode, texRef));
+    REQUIRE(mode == hipFilterModePoint);
+  }
+
+  SECTION("Set filter mode to Linear and verify") {
+    HIP_CHECK(hipTexRefSetFilterMode(texRef, hipFilterModeLinear));
+    HIP_CHECK(hipTexRefGetFilterMode(&mode, texRef));
+    REQUIRE(mode == hipFilterModeLinear);
+  }
+
+  SECTION("Set filter mode back to Point and verify") {
+    HIP_CHECK(hipTexRefSetFilterMode(texRef, hipFilterModePoint));
+    HIP_CHECK(hipTexRefGetFilterMode(&mode, texRef));
+    REQUIRE(mode == hipFilterModePoint);
+  }
+
+  SECTION("Invalid arguments: null texture reference pointer") {
+    // Setting filter mode with null texRef should fail
+    hipError_t errSet = hipTexRefSetFilterMode(nullptr, hipFilterModeLinear);
+    REQUIRE(errSet == hipErrorInvalidValue);
+
+    // Getting filter mode with null texRef should fail
+    hipError_t errGetRef = hipTexRefGetFilterMode(&mode, nullptr);
+    REQUIRE(errGetRef == hipErrorInvalidValue);
+
+    // Getting filter mode with null mode pointer should fail
+    hipError_t errGetMode = hipTexRefGetFilterMode(nullptr, texRef);
+    REQUIRE(errGetMode == hipErrorInvalidValue);
+  }
+}
diff --git a/catch/unit/texture/hipTexRefSetGetMipmapFilterMode.cc b/catch/unit/texture/hipTexRefSetGetMipmapFilterMode.cc
new file mode 100644
index 000000000..73406b563
--- /dev/null
+++ b/catch/unit/texture/hipTexRefSetGetMipmapFilterMode.cc
@@ -0,0 +1,58 @@
+/*
+Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+texture<float, 2, hipReadModeElementType> tex;
+// Test for hipTexRefSetMipmapFilterMode and hipTexRefGetMipmapFilterMode, including error handling
+TEST_CASE("Unit_hipTexRefSetGetMipmapFilterMode") {
+  CHECK_IMAGE_SUPPORT;
+
+  // Retrieve the texture reference for our symbol
+  const textureReference* texRefConst = nullptr;
+  HIP_CHECK(hipGetTextureReference(&texRefConst, &tex));
+  REQUIRE(texRefConst != nullptr);
+  // Implementation expects non-const textureReference*
+  textureReference* texRef = const_cast<textureReference*>(texRefConst);
+
+  hipTextureFilterMode mipMode;
+
+  SECTION("Set mipmap filter mode to Linear and verify") {
+    HIP_CHECK(hipTexRefSetMipmapFilterMode(texRef, hipFilterModeLinear));
+    auto res = hipTexRefGetMipmapFilterMode(&mipMode, texRef);
+    REQUIRE(res == hipErrorInvalidValue);
+    REQUIRE(mipMode == hipFilterModeLinear);
+  }
+
+  SECTION("Set mipmap filter mode back to Point and verify") {
+    HIP_CHECK(hipTexRefSetMipmapFilterMode(texRef, hipFilterModePoint));
+    auto res = hipTexRefGetMipmapFilterMode(&mipMode, texRef);
+    REQUIRE(res == hipErrorInvalidValue);
+    REQUIRE(mipMode == hipFilterModePoint);
+  }
+
+  SECTION("Invalid arguments: null pointers") {
+    hipError_t err;
+    err = hipTexRefSetMipmapFilterMode(nullptr, hipFilterModeLinear);
+    REQUIRE(err == hipErrorInvalidValue);
+    err = hipTexRefGetMipmapFilterMode(&mipMode, nullptr);
+    REQUIRE(err == hipErrorInvalidValue);
+    err = hipTexRefGetMipmapFilterMode(nullptr, texRef);
+    REQUIRE(err == hipErrorInvalidValue);
+  }
+}
diff --git a/catch/unit/texture/hipTexRefSetGetMipmapLevelBias.cc b/catch/unit/texture/hipTexRefSetGetMipmapLevelBias.cc
new file mode 100644
index 000000000..e2e4298d8
--- /dev/null
+++ b/catch/unit/texture/hipTexRefSetGetMipmapLevelBias.cc
@@ -0,0 +1,52 @@
+/*
+Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+texture<float, 2, hipReadModeElementType> tex;
+// Test for hipTexRefSetMipmapLevelBias and hipTexRefGetMipmapLevelBias, including error handling
+TEST_CASE("Unit_hipTexRefSetGetMipmapLevelBias") {
+  CHECK_IMAGE_SUPPORT;
+
+  // Retrieve the texture reference for our symbol
+  const textureReference* texRefConst = nullptr;
+  HIP_CHECK(hipGetTextureReference(&texRefConst, &tex));
+  REQUIRE(texRefConst != nullptr);
+  // Implementation expects non-const textureReference*
+  textureReference* texRef = const_cast<textureReference*>(texRefConst);
+
+  float bias = 0.0;
+
+  SECTION("Set mipmap level bias to custom value and verify") {
+    float newBias = 2.25;
+    HIP_CHECK(hipTexRefSetMipmapLevelBias(texRef, newBias));
+    auto res = hipTexRefGetMipmapLevelBias(&bias, texRef);
+    REQUIRE(res == hipErrorInvalidValue);
+    REQUIRE(bias == newBias);
+  }
+
+  SECTION("Invalid arguments: null pointers") {
+    hipError_t err;
+    err = hipTexRefSetMipmapLevelBias(nullptr, 1.0f);
+    REQUIRE(err == hipErrorInvalidValue);
+    err = hipTexRefGetMipmapLevelBias(nullptr, texRef);
+    REQUIRE(err == hipErrorInvalidValue);
+    err = hipTexRefGetMipmapLevelBias(&bias, nullptr);
+    REQUIRE(err == hipErrorInvalidValue);
+  }
+}
diff --git a/catch/unit/texture/hipTexRefSetGetMipmapLevelClamp.cc b/catch/unit/texture/hipTexRefSetGetMipmapLevelClamp.cc
new file mode 100644
index 000000000..764464a85
--- /dev/null
+++ b/catch/unit/texture/hipTexRefSetGetMipmapLevelClamp.cc
@@ -0,0 +1,57 @@
+/*
+Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <float.h>
+texture<float, 2, hipReadModeElementType> tex;
+// Test for hipTexRefSetMipmapLevelClamp and hipTexRefGetMipmapLevelClamp, including error handling
+TEST_CASE("Unit_texRefSetGetMipmapLevelClamp") {
+  CHECK_IMAGE_SUPPORT;
+
+  // Retrieve the texture reference for our symbol
+  const textureReference* texRefConst = nullptr;
+  HIP_CHECK(hipGetTextureReference(&texRefConst, &tex));
+  REQUIRE(texRefConst != nullptr);
+  // Implementation expects non-const textureReference*
+  textureReference* texRef = const_cast<textureReference*>(texRefConst);
+
+
+  float minClamp = 0.0f, maxClamp = 0.0f;
+
+  SECTION("Set mipmap level clamp to custom values and verify") {
+    float newMin = 1.5f, newMax = 5.5f;
+    HIP_CHECK(hipTexRefSetMipmapLevelClamp(texRef, newMin, newMax));
+    auto res = hipTexRefGetMipmapLevelClamp(&minClamp, &maxClamp, texRefConst);
+    REQUIRE(res == hipErrorInvalidValue);
+    REQUIRE(minClamp == newMin);
+    REQUIRE(maxClamp == newMax);
+  }
+
+  SECTION("Invalid arguments: null pointers") {
+    hipError_t err;
+    err = hipTexRefSetMipmapLevelClamp(nullptr, 1.0f, 2.0f);
+    REQUIRE(err == hipErrorInvalidValue);
+    err = hipTexRefGetMipmapLevelClamp(nullptr, &maxClamp, texRefConst);
+    REQUIRE(err == hipErrorInvalidValue);
+    err = hipTexRefGetMipmapLevelClamp(&minClamp, nullptr, texRefConst);
+    REQUIRE(err == hipErrorInvalidValue);
+    err = hipTexRefGetMipmapLevelClamp(&minClamp, &maxClamp, nullptr);
+    REQUIRE(err == hipErrorInvalidValue);
+  }
+}
diff --git a/catch/unit/texture/hipTexRefSetGetMipmappedArray.cc b/catch/unit/texture/hipTexRefSetGetMipmappedArray.cc
new file mode 100644
index 000000000..d12bd3159
--- /dev/null
+++ b/catch/unit/texture/hipTexRefSetGetMipmappedArray.cc
@@ -0,0 +1,76 @@
+/*
+Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#include <hip_test_common.hh>
+texture<float, 2, hipReadModeElementType> tex;
+
+// Test for hipTexRefSetMipmappedArray and hipTexRefGetMipmappedArray, including error handling
+TEST_CASE("Unit_hipTexRefSetGetMipmappedArray") {
+  CHECK_IMAGE_SUPPORT;
+
+  // Retrieve the texture reference for our symbol
+  const textureReference* texRefConst = nullptr;
+  HIP_CHECK(hipGetTextureReference(&texRefConst, &tex));
+  REQUIRE(texRefConst != nullptr);
+  // Implementation expects non-const textureReference*
+  textureReference* texRef = const_cast<textureReference*>(texRefConst);
+  hipMipmappedArray_t mipArr = nullptr;
+  hipMipmappedArray_t outArr = nullptr;
+  unsigned int Flags = 0;
+
+
+  SECTION("Default mipmapped array GET returns invalid value when none bound") {
+    hipError_t err = hipTexRefGetMipMappedArray(&outArr, texRef);
+    REQUIRE(err == hipErrorInvalidValue);
+  }
+
+  SECTION("Set and get mipmapped array") {
+    hipMipmappedArray_t mipmapped_array;
+    HIP_RESOURCE_DESC res_desc{};
+    hipExtent extent;
+    hipChannelFormatDesc channel_desc;
+    unsigned int width = 256, height = 256, mipmap_level = 2;
+
+    res_desc.resType = HIP_RESOURCE_TYPE_MIPMAPPED_ARRAY;
+
+    channel_desc = hipCreateChannelDesc<float>();
+    extent = make_hipExtent(width, height, 0);
+    auto res = hipMallocMipmappedArray(&mipmapped_array, &channel_desc, extent, 2 * mipmap_level,
+                                       hipArrayDefault);
+    if (res == hipErrorNotSupported) {
+      SUCCEED("Mipmapped arrays not supported on this device");
+      return;
+    }
+    HIP_CHECK(res);
+
+    HIP_CHECK(hipTexRefSetMipmappedArray(texRef, mipmapped_array, Flags));
+    HIP_CHECK(hipTexRefGetMipMappedArray(&outArr, texRef));
+    REQUIRE(outArr == mipmapped_array);
+    HIP_CHECK(hipFreeMipmappedArray(mipmapped_array));
+  }
+
+  SECTION("Invalid arguments: null pointers") {
+    hipError_t err;
+    err = hipTexRefSetMipmappedArray(nullptr, mipArr, Flags);
+    REQUIRE(err == hipErrorInvalidValue);
+    err = hipTexRefGetMipMappedArray(&outArr, nullptr);
+    REQUIRE(err == hipErrorInvalidValue);
+    err = hipTexRefGetMipMappedArray(nullptr, texRef);
+    REQUIRE(err == hipErrorInvalidValue);
+  }
+}
diff --git a/perftests/compute/hipPerfDotProduct.cpp b/perftests/compute/hipPerfDotProduct.cpp
deleted file mode 100644
index e30d5ab03..000000000
--- a/perftests/compute/hipPerfDotProduct.cpp
+++ /dev/null
@@ -1,382 +0,0 @@
-/*
- Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
- */
-
-/* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp
- * TEST: %t
- * HIT_END
- */
-
-#include <iostream>
-#include <chrono>
-#include "test_common.h"
-#include <vector>
-
-#define DOT_DIM 256
-
-using namespace std;
-
-template <unsigned int BLOCKSIZE>
-__launch_bounds__(BLOCKSIZE)
-__global__ void vectors_not_equal(int n,
-                                 const double* __restrict__ x,
-                                 const double* __restrict__ y,
-                                 double* __restrict__ workspace) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-
-  double sum = 0.0;
-  for(int idx = gid; idx < n; idx += hipGridDim_x * hipBlockDim_x) {
-    sum = fma(y[idx], x[idx], sum);
-  }
-
-  __shared__ double sdata[BLOCKSIZE];
-  sdata[threadIdx.x] = sum;
-
-  __syncthreads();
-
-  if(threadIdx.x < 128) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 128];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 64){
-    sdata[threadIdx.x] += sdata[threadIdx.x + 64];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 32){
-    sdata[threadIdx.x] += sdata[threadIdx.x + 32];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 16) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 16];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 8) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 8];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 4) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 4];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 2) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 2];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 1) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 1];
-  }
-
-  if(threadIdx.x == 0) {
-    workspace[blockIdx.x] = sdata[0];
-  }
-
-}
-
-template <unsigned int BLOCKSIZE>
-__launch_bounds__(BLOCKSIZE)
-__global__ void vectors_equal(int n, const double* __restrict__ x,
-                                  double* __restrict__ workspace) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-
-  double sum = 0.0;
-  for(int idx = gid; idx < n; idx += hipGridDim_x * blockDim.x) {
-    sum = fma(x[idx], x[idx], sum);
-  }
-
-  __shared__ double sdata[BLOCKSIZE];
-  sdata[threadIdx.x] = sum;
-
-  __syncthreads();
-
-  if(threadIdx.x < 128) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 128];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 64) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 64];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 32) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 32];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 16) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 16];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 8) {
-          sdata[threadIdx.x] += sdata[threadIdx.x + 8];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 4) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 4];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 2) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 2];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 1) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 1];
-  }
-
-  if(threadIdx.x == 0) {
-    workspace[blockIdx.x] = sdata[0];
-    }
-}
-
-template <unsigned int BLOCKSIZE>
-__launch_bounds__(BLOCKSIZE)
-__global__ void dot_reduction(double* __restrict__ workspace) {
-
-  __shared__ double sdata[BLOCKSIZE];
-
-  sdata[threadIdx.x] = workspace[threadIdx.x];
-
-  __syncthreads();
-
-  if(threadIdx.x < 128) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 128];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 64) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 64];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 32) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 32];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 16) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 16];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 8) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 8];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 4) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 4];
-  } __syncthreads();
-
-  if(threadIdx.x < 2) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 2];
-  }
-  __syncthreads();
-
-  if(threadIdx.x < 1) {
-    sdata[threadIdx.x] += sdata[threadIdx.x + 1];
-  }
-
-  if(threadIdx.x == 0) {
-    workspace[0] = sdata[0];
-  }
-
-}
-
-void computeDotProduct(int n, const double* x, const double* y, double& result,
-                      double* workspace)
-{
-  dim3 blocks(DOT_DIM);
-  dim3 threadsPerBlock(DOT_DIM);
-
-  if(x != y) {
-    hipLaunchKernelGGL(vectors_not_equal<DOT_DIM>, blocks, threadsPerBlock, 0, 0, n, x, y,
-                       workspace);
-  }
-  else {
-    hipLaunchKernelGGL(vectors_equal<DOT_DIM>, blocks, threadsPerBlock, 0, 0, n, x, workspace);
-  }
-
-  // Part 2 of dot product computation
-  hipLaunchKernelGGL(dot_reduction<DOT_DIM>, dim3(1), threadsPerBlock, 0, 0, workspace);
-
-  // Copy the final dot product result back from the device
-  HIPCHECK(hipMemcpy(&result, workspace, sizeof(double), hipMemcpyDeviceToHost));
-
-  return;
-}
-
-int main(int argc, char* argv[]) {
-
-  int nGpu = 0;
-  HIPCHECK(hipGetDeviceCount(&nGpu));
-
-  if (nGpu < 1) {
-    failed("No GPU!");
-  }
-  hipDeviceProp_t props = {0};
-  props = {0};
-  HIPCHECK(hipSetDevice(p_gpuDevice));
-  HIPCHECK(hipGetDeviceProperties(&props, p_gpuDevice));
-  std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
-    << " with " << props.multiProcessorCount << " CUs" << " and device id: " << p_gpuDevice
-    << std::endl;
-
-  int nx, ny, nz;
-
-  for (unsigned int testCase = 0; testCase < 3; testCase++) {
-
-  vector<int> vectorSize = {200, 300, 50};
-  switch(testCase) {
-
-  case 0:
-  nx = vectorSize[0];
-  ny = vectorSize[0];
-  nz = vectorSize[0];
-  break;
-
-  case 1:
-  nx = vectorSize[1];
-  ny = vectorSize[1];
-  nz = vectorSize[1];
-  break;
-
-  case 2:
-  nx = vectorSize[0];
-  ny = vectorSize[1];
-  nz = vectorSize[2];
-  break;
-
-  default:
-    break;
-
-  }
-
-  int trials = 200;
-
-  int size = nx * ny * nz;
-
-  vector<double> hx(size);
-  vector<double> hy(size);
-  double hresult_xy = 0.0;
-  double hresult_xx = 0.0;
-
-  srand(time(NULL));
-
-  for(int i = 0; i < size; ++i) {
-    hx[i] = 2.0 * (double)rand() / (double)RAND_MAX - 1.0;
-    hy[i] = 2.0 * (double)rand() / (double)RAND_MAX - 1.0;
-
-    hresult_xy += hx[i] * hy[i];
-    hresult_xx += hx[i] * hx[i];
-  }
-
-  double* dx;
-  double* dy;
-  double* workspace;
-  double  dresult;
-
-  HIPCHECK(hipMalloc((void**)&dx, sizeof(double) * size));
-  HIPCHECK(hipMalloc((void**)&dy, sizeof(double) * size));
-  HIPCHECK(hipMalloc((void**)&workspace, sizeof(double) * DOT_DIM));
-
-  HIPCHECK(hipMemcpy(dx, hx.data(), sizeof(double) * size, hipMemcpyHostToDevice));
-  HIPCHECK(hipMemcpy(dy, hy.data(), sizeof(double) * size, hipMemcpyHostToDevice));
-
-  // Warm up
-  computeDotProduct(size, dx, dy, dresult, workspace);
-  computeDotProduct(size, dx, dy, dresult, workspace);
-  computeDotProduct(size, dx, dy, dresult, workspace);
-
-  // Timed run for <x,y>
-  HIPCHECK(hipDeviceSynchronize());
-  auto all_start = std::chrono::steady_clock::now();
-
-  for(int i = 0; i < trials; ++i) {
-    computeDotProduct(size, dx, dy, dresult, workspace);
-  }
-
-  float time = 0;
-  auto all_end = std::chrono::steady_clock::now();
-  std::chrono::duration<double> all_kernel_time = all_end - all_start;
-  time = all_kernel_time.count();
-
-  time /= trials;
-
-  double bw = sizeof(double) * size * 2.0 / 1e9;
-  double gf = 2.0 * size / 1e9;
-
-  cout << "\nVector Size: " << size << "\n[ddot] <x,y> " << time << "msec ;" << bw/ (time / 1e3) << " GByte/s ;"
-       << gf/(time / 1e3) << " GFlop/s" << endl;
-
-  // Verify the device kernel results comparing it with the host results
-  if(std::abs(dresult - hresult_xy) > std::max(dresult * 1e-10, 1e-8)) {
-    cerr << " Device results inconsistent with host results. "
-         << " Host result: " << hresult_xy
-         << " Device result: " << dresult;
-  }
-
-  // Warm up
-  computeDotProduct(size, dx, dx, dresult, workspace);
-  computeDotProduct(size, dx, dx, dresult, workspace);
-  computeDotProduct(size, dx, dx, dresult, workspace);
-
-  // Timed run for <x,x>
-  HIPCHECK(hipDeviceSynchronize());
-  all_start = std::chrono::steady_clock::now();
-
-  for(int i = 0; i < trials; ++i) {
-    computeDotProduct(size, dx, dx, dresult, workspace);
-  }
-
-  all_end = std::chrono::steady_clock::now();
-  all_kernel_time = all_end - all_start;
-  time = all_kernel_time.count();
-
-  time /= trials;
-  bw = sizeof(double) * size / 1e9;
-
-  cout << "[ddot] <x,y> " << time << "msec ;" << bw/ (time / 1e3) << " GByte/s ;"
-       << gf/(time / 1e3) << " GFlop/s" << endl;
-
-  // Verify the device kernel results comparing it with the host results
-  if(abs(dresult - hresult_xx) > max(dresult * 1e-10, 1e-8)) {
-    cerr << " Device results inconsistent with host results"
-         << " Host result: " << hresult_xy
-         << " Device result: " << dresult;
-  }
-
-  HIPCHECK(hipFree(dx));
-  HIPCHECK(hipFree(dy));
-  HIPCHECK(hipFree(workspace));
-
-  }
-  passed();
-  return 0;
-}
diff --git a/perftests/compute/hipPerfMandelbrot.cpp b/perftests/compute/hipPerfMandelbrot.cpp
deleted file mode 100644
index 9f9d6b404..000000000
--- a/perftests/compute/hipPerfMandelbrot.cpp
+++ /dev/null
@@ -1,743 +0,0 @@
-/*
- Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
- */
-
-/* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp
- * TEST: %t
- * HIT_END
- */
-
-#include <iostream>
-#include <chrono>
-#include "test_common.h"
-#include <hip/hip_vector_types.h>
-#include <hip/math_functions.h>
-#include <vector>
-#include <string>
-#include <map>
-
-typedef struct {
-  double x;
-  double y;
-  double width;
-} coordRec;
-
-coordRec coords[] = {
-    {0.0, 0.0, 4.0},                                     // Whole set
-    {0.0, 0.0, 0.00001},                                 // All black
-    {-0.0180789661868, 0.6424294066162, 0.00003824140},  // Hit detail
-};
-
-static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
-
-template <typename T>
-__global__ void float_mad_kernel(uint *out, uint width, T xPos,  T yPos, T xStep, T yStep,
-                                  uint maxIter) {
-
-#pragma FP_CONTRACT ON
-  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
-  int i = tid % width;
-  int j = tid / width;
-  float x0 = (float)(xPos + xStep*i);
-  float y0 = (float)(yPos + yStep*j);
-
-  float x = x0;
-  float y = y0;
-
-  uint iter = 0;
-  float tmp;
-  for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
-    tmp = x;
-    x = fma(-y,y,fma(x,x,x0));
-    y = fma(2.0f*tmp,y,y0);
-  }
-
-  out[tid] = iter;
-};
-
-template <typename T>
-__global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos,
-    T yPos, T xStep, T yStep, uint maxIter) {
-
-#pragma FP_CONTRACT ON
-  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
-  int i = tid % width;
-  int j = tid / width;
-  float x0 = (float)(xPos + xStep*(float)i);
-  float y0 = (float)(yPos + yStep*(float)j);
-
-  float x = x0;
-  float y = y0;
-
-#define FAST
-  uint iter = 0;
-  float tmp;
-  int stay;
-  int ccount = 0;
-  stay = (x*x+y*y) <= 4.0;
-  float savx = x;
-  float savy = y;
-#ifdef FAST
-  for (iter = 0; (iter < maxIter); iter+=16) {
-#else
-  for (iter = 0; stay && (iter < maxIter); iter+=16) {
-#endif
-    x = savx;
-    y = savy;
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =  fma(2.0f*x,y,y0);
-    x =  fma(-y,y, fma(tmp,tmp,x0));
-    y =  fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =  fma(2.0f*x,y,y0);
-    x =  fma(-y,y, fma(tmp,tmp,x0));
-    y =  fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =  fma(2.0f*x,y,y0);
-    x =  fma(-y,y, fma(tmp,tmp,x0));
-    y =  fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =  fma(2.0f*x,y,y0);
-    x =  fma(-y,y, fma(tmp,tmp,x0));
-    y =  fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =  fma(2.0f*x,y,y0);
-    x =  fma(-y,y, fma(tmp,tmp,x0));
-    y =  fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =  fma(2.0f*x,y,y0);
-    x =  fma(-y,y, fma(tmp,tmp,x0));
-    y =  fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =  fma(2.0f*x,y,y0);
-    x =  fma(-y,y, fma(tmp,tmp,x0));
-    y =  fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =  fma(2.0f*x,y,y0);
-    x =  fma(-y,y, fma(tmp,tmp,x0));
-    y =  fma(2.0f*tmp,y,y0);
-
-    stay = (x*x+y*y) <= 4.0;
-    savx = (stay ? x : savx);
-    savy = (stay ? y : savy);
-    ccount += stay*16;
-#ifdef FAST
-    if (!stay)
-      break;
-#endif
-  }
-  // Handle remainder
-  if (!stay) {
-    iter = 16;
-    do {
-      x = savx;
-      y = savy;
-      stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter);
-      tmp = x;
-      x =  fma(-y,y, fma(x,x,x0));
-      y =  fma(2.0f*tmp,y,y0);
-      ccount += stay;
-      iter--;
-      savx = (stay ? x : savx);
-      savy = (stay ? y : savy);
-    } while (stay && iter);
-  }
-
-
-  out[tid] = (uint)ccount;
-
-};
-
-
-template <typename T>
-__global__ void double_mad_kernel(uint *out, uint width, T xPos,  T yPos, T xStep, T yStep,
-                                   uint maxIter) {
-
-#pragma FP_CONTRACT ON
-  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
-  int i = tid % width;
-  int j = tid / width;
-  double x0 = (double)(xPos + xStep*i);
-  double y0 = (double)(yPos + yStep*j);
-
-  double x = x0;
-  double y = y0;
-
-  uint iter = 0;
-  double tmp;
-  for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
-    tmp = x;
-    x = fma(-y,y,fma(x,x,x0));
-    y = fma(2.0f*tmp,y,y0);
-  }
-  out[tid] = iter;
-};
-
-
-template <typename T>
-__global__ void double_mandel_unroll_kernel(uint *out, uint width, T xPos,
-                  T yPos, T xStep, T yStep, uint maxIter) {
-
-#pragma FP_CONTRACT ON
-  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
-
-  int i = tid % width;
-  int j = tid / width;
-  double x0 = (double)(xPos + xStep*(double)i);
-  double y0 = (double)(yPos + yStep*(double)j);
-
-  double x = x0;
-  double y = y0;
-
-#define FAST
-  uint iter = 0;
-  double tmp;
-  int stay;
-  int ccount = 0;
-  stay = (x*x+y*y) <= 4.0;
-  double savx = x;
-  double savy = y;
-#ifdef FAST
-  for (iter = 0; (iter < maxIter); iter+=16)
-#else
-  for (iter = 0; stay && (iter < maxIter); iter+=16)
-#endif
-  {
-    x = savx;
-    y = savy;
-
-    // Two iterations
-    tmp = fma(-y,y, fma(x,x,x0));
-    y =   fma(2.0f*x,y,y0);
-    x =   fma(-y,y, fma(tmp,tmp,x0));
-    y =   fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp = fma(-y,y, fma(x,x,x0));
-    y =   fma(2.0f*x,y,y0);
-    x =   fma(-y,y, fma(tmp,tmp,x0));
-    y =   fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp = fma(-y,y, fma(x,x,x0));
-    y =   fma(2.0f*x,y,y0);
-    x =   fma(-y,y, fma(tmp,tmp,x0));
-    y =   fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =    fma(2.0f*x,y,y0);
-    x =    fma(-y,y, fma(tmp,tmp,x0));
-    y =    fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =    fma(2.0f*x,y,y0);
-    x =    fma(-y,y, fma(tmp,tmp,x0));
-    y =    fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =    fma(2.0f*x,y,y0);
-    x =    fma(-y,y, fma(tmp,tmp,x0));
-    y =    fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =    fma(2.0f*x,y,y0);
-    x =    fma(-y,y, fma(tmp,tmp,x0));
-    y =    fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =    fma(2.0f*x,y,y0);
-    x =    fma(-y,y, fma(tmp,tmp,x0));
-    y =    fma(2.0f*tmp,y,y0);
-
-    stay = (x*x+y*y) <= 4.0;
-    savx = (stay ? x : savx);
-    savy = (stay ? y : savy);
-    ccount += stay*16;
-#ifdef FAST
-    if (!stay)
-      break;
-#endif
-    }
-  // Handle remainder
-    if (!stay) {
-      iter = 16;
-      do {
-        x = savx;
-        y = savy;
-        stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter);
-        tmp = x;
-        x =  fma(-y,y, fma(x,x,x0));
-        y =  fma(2.0f*tmp,y,y0);
-        ccount += stay;
-        iter--;
-        savx = (stay ? x : savx);
-        savy = (stay ? y : savy);
-      }
-      while (stay && iter);
-
-    }
-    out[tid] = (uint)ccount;
-};
-
-static const unsigned int FMA_EXPECTEDVALUES_INDEX = 15;
-
-// Expected results for each kernel run at each coord
-unsigned long long expectedIters[] = {
-    203277748ull,  2147483648ull, 120254651ull,  203277748ull,  2147483648ull,
-    120254651ull,  203277748ull,  2147483648ull, 120254651ull,  203315114ull,
-    2147483648ull, 120042599ull,  203315114ull,  2147483648ull, 120042599ull,
-    203280620ull,  2147483648ull, 120485704ull,  203280620ull,  2147483648ull,
-    120485704ull,  203280620ull,  2147483648ull, 120485704ull,  203315114ull,
-    2147483648ull, 120042599ull,  203315114ull,  2147483648ull, 120042599ull};
-
-class hipPerfMandelBrot {
-  public:
-  hipPerfMandelBrot();
-  ~hipPerfMandelBrot();
-
-  void setNumKernels(unsigned int num) {
-    numKernels = num;
-  }
-
-  unsigned int getNumKernels() {
-    return numKernels;
-  }
-
-  void setNumStreams(unsigned int num) {
-    numStreams = num;
-  }
-  unsigned int getNumStreams() {
-    return numStreams;
-  }
-
-  void open(int deviceID);
-  void run(unsigned int testCase, unsigned int deviceId);
-  void printResults(void);
-
-  // array of funtion pointers
-  typedef void (hipPerfMandelBrot::*funPtr)(uint *out, uint width, float xPos,  float yPos,
-                 float xStep, float yStep, uint maxIter,  hipStream_t* streams, int blocks,
-                 int threads_per_block, int kernelCnt);
-
-  // Wrappers
-  void float_mad(uint *out, uint width, float xPos,  float yPos,
-                  float xStep, float yStep, uint maxIter, hipStream_t* streams,
-                  int blocks, int threads_per_block, int kernelCnt);
-
-  void float_mandel_unroll(uint *out, uint width, float xPos,  float yPos,
-                            float xStep, float yStep, uint maxIter, hipStream_t* streams,
-                            int blocks, int threads_per_block, int kernelCnt);
-
-  void double_mad(uint *out, uint width, float xPos,  float yPos, float xStep,
-                   float yStep, uint maxIter, hipStream_t* streams, int blocks,
-                   int threads_per_block, int kernelCnt);
-
-  void double_mandel_unroll(uint *out, uint width, float xPos,  float yPos, float xStep,
-                             float yStep, uint maxIter, hipStream_t* streams, int blocks,
-                             int threads_per_block, int kernelCnt);
-
-  hipStream_t streams[2];
-
-  private:
-  void setData(void *ptr, unsigned int value);
-  void checkData(uint *ptr);
-
-  unsigned int numKernels;
-  unsigned int numStreams;
-
-  std::map<std::string, std::vector<double>> results;
-  unsigned int width_;
-  unsigned int bufSize;
-  unsigned int maxIter;
-  unsigned int coordIdx;
-  volatile unsigned long long totalIters = 0;
-  int numCUs;
-  static const unsigned int numLoops = 10;
-};
-
-
-hipPerfMandelBrot::hipPerfMandelBrot() {}
-
-hipPerfMandelBrot::~hipPerfMandelBrot() {}
-
-void hipPerfMandelBrot::open(int deviceId) {
-
-
-  int nGpu = 0;
-  HIPCHECK(hipGetDeviceCount(&nGpu));
-  if (nGpu < 1) {
-    failed("No GPU!");
-  }
-
-
-  HIPCHECK(hipSetDevice(deviceId));
-  hipDeviceProp_t props = {0};
-  HIPCHECK(hipGetDeviceProperties(&props, deviceId));
-  std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
-    << " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId
-    << std::endl;
-
-  numCUs = props.multiProcessorCount;
-}
-
-
-void hipPerfMandelBrot::printResults() {
-
-  int numkernels = getNumKernels();
-  int numStreams = getNumStreams();
-
-  std::cout << "\n" <<"Measured perf for kernels in GFLOPS on "
-            << numStreams << " streams (s)" <<  std::endl;
-
-  std::map<std::string, std::vector<double>>:: iterator itr;
-  for (itr = results.begin(); itr != results.end(); itr++) {
-          std::cout << "\n" << std::setw(20) << itr->first << " ";
-          for(auto i : results[itr->first]) {
-            std::cout << std::setw(10) << i << " ";
-            }
-     }
-  results.clear();
-
-  std::cout << std::endl;
-}
-
-
-// Wrappers for the kernel launches
-void hipPerfMandelBrot::float_mad(uint *out, uint width, float xPos,  float yPos, float xStep,
-                                   float yStep, uint maxIter, hipStream_t* streams,
-                                   int blocks, int threads_per_block, int kernelCnt) {
-
-  int streamCnt = getNumStreams();
-  hipLaunchKernelGGL(float_mad_kernel<float>, dim3(blocks), dim3(threads_per_block), 0,
-                      streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep,
-                      maxIter);
-
-
-}
-
-
-void hipPerfMandelBrot::float_mandel_unroll(uint *out, uint width, float xPos,  float yPos,
-                             float xStep, float yStep, uint maxIter, hipStream_t * streams,
-                             int blocks, int threads_per_block, int kernelCnt) {
-
-  int streamCnt = getNumStreams();
-  hipLaunchKernelGGL(float_mandel_unroll_kernel<float>, dim3(blocks), dim3(threads_per_block), 0,
-                  streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter);
-
-}
-
-
-void hipPerfMandelBrot::double_mad(uint *out, uint width, float xPos,  float yPos,
-                               float xStep, float yStep, uint maxIter, hipStream_t * streams,
-                               int blocks, int threads_per_block, int kernelCnt) {
-
-  int streamCnt = getNumStreams();
-  hipLaunchKernelGGL(double_mad_kernel<double>, dim3(blocks), dim3(threads_per_block), 0,
-                  streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter);
-
-}
-
-
-void hipPerfMandelBrot::double_mandel_unroll(uint *out, uint width, float xPos,  float yPos,
-                              float xStep, float yStep, uint maxIter, hipStream_t * streams,
-                              int blocks, int threads_per_block, int kernelCnt) {
-
-  int streamCnt = getNumStreams();
-  hipLaunchKernelGGL(float_mandel_unroll_kernel<double>, dim3(blocks), dim3(threads_per_block), 0,
-                  streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter);
-
-}
-
-
-void hipPerfMandelBrot::run(unsigned int testCase,unsigned int deviceId) {
-
-  unsigned int numStreams = getNumStreams();
-  coordIdx = testCase % numCoords;
-
-  funPtr p[] = {&hipPerfMandelBrot::float_mad, &hipPerfMandelBrot::float_mandel_unroll,
-               &hipPerfMandelBrot::double_mad, &hipPerfMandelBrot::double_mandel_unroll};
-
-  // Maximum iteration count
-  maxIter = 32768;
-
-  uint * hPtr[numKernels];
-  uint * dPtr[numKernels];
-
-  // Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once.
-  width_ = 256;
-
-  bufSize = width_  * width_ * sizeof(uint);
-
-  // Create streams for concurrency
-  for (uint i = 0; i < numStreams; i++) {
-    HIPCHECK(hipStreamCreate(&streams[i]));
-  }
-
-
-  // Allocate memory on the host and device
-  for (uint i = 0; i < numKernels; i++) {
-    HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault));
-    setData(hPtr[i], 0xdeadbeef);
-    HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize))
-  }
-
-
-  // Prepare kernel launch parameters
-  int threads = (bufSize/sizeof(uint));
-  int threads_per_block  = 64;
-  int blocks = (threads/threads_per_block) + (threads % threads_per_block);
-
-  float xStep = (float)(coords[coordIdx].width / (double)width_);
-  float yStep = (float)(-coords[coordIdx].width / (double)width_);
-  float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
-  float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
-
-  // Copy memory asynchronously and concurrently from host to device
-  for (uint i = 0; i < numKernels; i++) {
-    HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice));
-  }
-
-  // Synchronize to make sure all the copies are completed
-  HIPCHECK(hipStreamSynchronize(0));
-
-  int kernelIdx;
-  if(testCase == 0 || testCase == 5 || testCase == 10) {
-    kernelIdx = 0;
-  }
-
-  else if(testCase == 1 || testCase == 6 || testCase == 11) {
-    kernelIdx = 1;
-  }
-  else if(testCase == 2 || testCase == 7 || testCase == 12) {
-    kernelIdx = 2;
-  }
-  else if(testCase == 3 || testCase == 8 || testCase == 13){
-    kernelIdx = 3;
-  }
-
-
-  double totalTime = 0.0;
-
-  for (unsigned int k = 0; k < numLoops; k++) {
-  if ((testCase == 0 || testCase == 1 || testCase == 2 ||
-                  testCase == 5 || testCase == 6 || testCase == 7 ||
-                  testCase == 10 || testCase == 11 || testCase == 12)) {
-  float xStep = (float)(coords[coordIdx].width / (double)width_);
-  float yStep = (float)(-coords[coordIdx].width / (double)width_);
-  float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
-  float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
-
-  // Time the kernel execution
-  auto all_start = std::chrono::steady_clock::now();
-
-  for (uint i = 0; i < numKernels; i++) {
-    (this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
-                           threads_per_block, i);
-  }
-
-
-  // Synchronize all the concurrent streams to have completed execution
-  HIPCHECK(hipStreamSynchronize(0));
-
-  auto all_end = std::chrono::steady_clock::now();
-  std::chrono::duration<double> all_kernel_time = all_end - all_start;
-  totalTime += all_kernel_time.count();
-
-  }
-
-
-  else {
-  double xStep = coords[coordIdx].width / (double)width_;
-  double yStep = -coords[coordIdx].width / (double)width_;
-  double xPos = coords[coordIdx].x - 0.5 * coords[coordIdx].width;
-  double yPos = coords[coordIdx].y + 0.5 * coords[coordIdx].width;
-
-  // Time the kernel execution
-  auto all_start = std::chrono::steady_clock::now();
-
-  for (uint i = 0; i < numKernels; i++) {
-  (this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
-                         threads_per_block, i);
-  }
-
-
-  // Synchronize all the concurrent streams to have completed execution
-  HIPCHECK(hipStreamSynchronize(0));
-
-  auto all_end = std::chrono::steady_clock::now();
-  std::chrono::duration<double> all_kernel_time = all_end - all_start;
-  totalTime += all_kernel_time.count();
-  }
-
-
-  }
-
-  // Copy data back from device to the host
-  for(uint i = 0; i < numKernels; i++) {
-    HIPCHECK(hipMemcpy(hPtr[i] ,dPtr[i], bufSize, hipMemcpyDeviceToHost));
-  }
-
-
-  for(uint i = 0; i < numKernels; i++) {
-  checkData(hPtr[i]);
-
-  int j =0;
-  while((totalIters != expectedIters[j] && totalIters > expectedIters[j]) && j < 30) {
-          j++;
-  }
-
-  if(j==30) {
-    std::cout << "Incorrect iteration count detected. ";
-  }
-
-  }
-
-
-  // Compute GFLOPS.  There are 7 FLOPs per iteration
-  double perf = ((double)(totalIters*numKernels) * 7 * (double)(1e-09)) /
-                (totalTime / (double)numLoops);
-
-
-  std::vector<std::string> kernelName = {"float", "float_unroll",
-                      "double", "double_unroll"};
-
-  // Print results except for Warm-up kernel
-  if(testCase!=100) {
-  results[kernelName[testCase % 4]].push_back(perf);
- }
-
-
-  for(uint i = 0 ; i < numStreams; i++) {
-    HIPCHECK(hipStreamDestroy(streams[i]));
-  }
-
-
-  // Free host and device memory
-  for (uint i = 0; i < numKernels; i++) {
-    HIPCHECK(hipHostFree(hPtr[i]));
-    HIPCHECK(hipFree(dPtr[i]));
-  }
-
-
-}
-
-
-void hipPerfMandelBrot::setData(void *ptr, unsigned int value) {
-  unsigned int *ptr2 = (unsigned int *)ptr;
-  for (unsigned int i = 0; i < width_ * width_; i++) {
-      ptr2[i] = value;
-  }
-}
-
-
-void hipPerfMandelBrot::checkData(uint *ptr) {
-  totalIters = 0;
-  for (unsigned int i = 0; i < width_ * width_; i++) {
-    totalIters += ptr[i];
-  }
-}
-
-
-int main(int argc, char* argv[]) {
-  hipPerfMandelBrot mandelbrotCompute;
-  int deviceId = 0;
-
-  mandelbrotCompute.open(deviceId);
-
-  for (unsigned int testCase = 0; testCase < 3; testCase++) {
-
-
-  switch (testCase) {
-
-
-  case 0: {
-    // Warmup-kernel - default stream executes serially
-    mandelbrotCompute.setNumStreams(1);
-    mandelbrotCompute.setNumKernels(1);
-    mandelbrotCompute.run(100/*Random number*/, deviceId);
-    break;
-    }
-
-
-  case 1: {
-    // run all - sync
-    int i = 0;
-    do {
-    mandelbrotCompute.setNumStreams(1);
-    mandelbrotCompute.setNumKernels(1);
-    mandelbrotCompute.run(i, deviceId);
-    i++;
-    }while(i < 12);
-    mandelbrotCompute.printResults();
-
-    break;
-  }
-
-
-  case 2: {
-    // run all - async
-    int i = 0;
-    do {
-    mandelbrotCompute.setNumStreams(2);
-    mandelbrotCompute.setNumKernels(2);
-    mandelbrotCompute.run(i, deviceId);
-    i++;
-    }while(i < 12);
-    mandelbrotCompute.printResults();
-
-    break;
-
-  }
-
-
-  default: {
-    break;
-  }
-
-
-  }
-
-
-
-  }
-
-
-  passed();
-}
diff --git a/perftests/dispatch/hipPerfDispatchSpeed.cpp b/perftests/dispatch/hipPerfDispatchSpeed.cpp
deleted file mode 100644
index 56a757a54..000000000
--- a/perftests/dispatch/hipPerfDispatchSpeed.cpp
+++ /dev/null
@@ -1,207 +0,0 @@
-/*
- Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
- */
-
-/* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp ../../src/timer.cpp
- * TEST: %t
- * HIT_END
- */
-
-#include <stdio.h>
-#include <assert.h>
-#include <string.h>
-#include <complex>
-
-#include "timer.h"
-#include "test_common.h"
-
-// Quiet pesky warnings
-#ifdef WIN_OS
-#define SNPRINTF sprintf_s
-#else
-#define SNPRINTF snprintf
-#endif
-
-#define CHAR_BUF_SIZE 512
-
-#define CHECK_RESULT(test, msg)         \
-    if ((test))                         \
-    {                                   \
-        printf("\n%s\n", msg);          \
-        abort();                        \
-    }
-
-typedef struct {
-    unsigned int iterations;
-    int flushEvery;
-} testStruct;
-
-testStruct testList[] =
-{
-    { 1, -1},
-    { 1, -1},
-    { 10, 1},
-    { 10, -1},
-    { 100, 1},
-    { 100, 10},
-    { 100, -1},
-    { 1000, 1},
-    { 1000, 10},
-    { 1000, 100},
-    { 1000, -1},
-    { 10000, 1},
-    { 10000, 10},
-    { 10000, 100},
-    { 10000, 1000},
-    { 10000, -1},
-    { 100000, 1},
-    { 100000, 10},
-    { 100000, 100},
-    { 100000, 1000},
-    { 100000, 10000},
-    { 100000, -1},
-};
-
-unsigned int mapTestList[] = {1, 1, 10, 100, 1000, 10000, 100000};
-
-__global__ void _dispatchSpeed(float *outBuf)
-{
-   int i = (blockIdx.x * blockDim.x + threadIdx.x);
-   if (i < 0)
-       outBuf[i] = 0.0f;
-};
-
-
-int main(int argc, char* argv[]) {
-    HipTest::parseStandardArguments(argc, argv, true);
-
-    hipError_t err = hipSuccess;
-    hipDeviceProp_t props = {0};
-    hipGetDeviceProperties(&props, p_gpuDevice);
-    CHECK_RESULT(err != hipSuccess, "hipGetDeviceProperties failed" );
-    printf("Set device to %d : %s\n", p_gpuDevice, props.name);
-
-    unsigned int testListSize = sizeof(testList) / sizeof(testStruct);
-    int numTests = (p_tests == -1) ? (2*2*testListSize - 1) : p_tests;
-    int test = (p_tests == -1) ? 0 : p_tests;
-
-    float* srcBuffer = NULL;
-    unsigned int bufSize_ = 64*sizeof(float);
-    err = hipMalloc(&srcBuffer, bufSize_);
-    CHECK_RESULT(err != hipSuccess, "hipMalloc failed");
-
-    for(;test <= numTests; test++)
-    {
-        int openTest = test % testListSize;
-        bool sleep = false;
-
-        if (test >= (testListSize * 2))
-        {
-            sleep = true;
-        }
-
-        int threads = (bufSize_ / sizeof(float));
-        int threads_per_block  = 64;
-        int blocks = (threads/threads_per_block) + (threads % threads_per_block);
-
-        // warmup
-        hipLaunchKernelGGL(_dispatchSpeed, dim3(blocks), dim3(threads_per_block),
-                           0, hipStream_t(0), srcBuffer);
-        err = hipDeviceSynchronize();
-        CHECK_RESULT(err != hipSuccess, "hipDeviceSynchronize failed");
-
-        CPerfCounter timer;
-
-        timer.Reset();
-        timer.Start();
-        for (unsigned int i = 0; i < testList[openTest].iterations; i++)
-        {
-            hipLaunchKernelGGL(_dispatchSpeed, dim3(blocks), dim3(threads_per_block),
-                                0, hipStream_t(0), srcBuffer);
-
-            if ((testList[openTest].flushEvery > 0) &&
-                (((i + 1) % testList[openTest].flushEvery) == 0))
-            {
-                if (sleep)
-                {
-                    err = hipDeviceSynchronize();
-                    CHECK_RESULT(err != hipSuccess, "hipDeviceSynchronize failed");
-                }
-                else
-                {
-                    do {
-                        err = hipStreamQuery(NULL);
-                    } while (err == hipErrorNotReady);
-                }
-            }
-        }
-        if (sleep)
-        {
-            err = hipDeviceSynchronize();
-            CHECK_RESULT(err != hipSuccess, "hipDeviceSynchronize failed");
-        }
-        else
-        {
-            do {
-                err = hipStreamQuery(NULL);
-            } while (err == hipErrorNotReady);
-        }
-        timer.Stop();
-
-        double sec = timer.GetElapsedTime();
-
-        // microseconds per launch
-        double perf = (1000000.f*sec/testList[openTest].iterations);
-        const char *waitType;
-        const char *extraChar;
-        const char *n;
-        if (sleep)
-        {
-            waitType = "sleep";
-            extraChar = "";
-            n = "";
-        }
-        else
-        {
-            waitType = "spin";
-            n = "n";
-            extraChar = " ";
-        }
-
-
-        char buf[256];
-        if (testList[openTest].flushEvery > 0)
-        {
-            SNPRINTF(buf, sizeof(buf),
-                     "HIPPerfDispatchSpeed[%3d] %7d dispatches %s%sing every %5d (us/disp) %3f",
-                     test, testList[openTest].iterations,
-                     waitType, n, testList[openTest].flushEvery, (float)perf);
-        }
-        else
-        {
-            SNPRINTF(buf, sizeof(buf),
-                     "HIPPerfDispatchSpeed[%3d] %7d dispatches (%s%s)              (us/disp) %3f",
-                     test, testList[openTest].iterations, waitType, extraChar, (float)perf);
-        }
-        printf("%s\n", buf);
-    }
-
-    hipFree(srcBuffer);
-    passed();
-}
diff --git a/perftests/memory/hipPerfBufferCopyRectSpeed.cpp b/perftests/memory/hipPerfBufferCopyRectSpeed.cpp
deleted file mode 100644
index 78096844f..000000000
--- a/perftests/memory/hipPerfBufferCopyRectSpeed.cpp
+++ /dev/null
@@ -1,300 +0,0 @@
-/*
-Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-/* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp ../../src/timer.cpp
- * TEST: %t
- * HIT_END
- */
-
-#include <stdio.h>
-#include <assert.h>
-#include <string.h>
-#include <complex>
-
-#include "timer.h"
-#include "test_common.h"
-
-// Quiet pesky warnings
-#ifdef WIN_OS
-#define SNPRINTF sprintf_s
-#else
-#define SNPRINTF snprintf
-#endif
-
-#define NUM_SIZES 8
-//4KB, 8KB, 64KB, 256KB, 1 MB, 4MB, 16 MB, 16MB+10
-static const unsigned int Sizes[NUM_SIZES] = {4096, 8192, 65536, 262144, 1048576, 4194304, 16777216, 16777216+10};
-
-static const unsigned int Iterations[2] = {1, 1000};
-
-#define BUF_TYPES 4
-//  16 ways to combine 4 different buffer types
-#define NUM_SUBTESTS (BUF_TYPES*BUF_TYPES)
-
-#define CHECK_RESULT(test, msg)         \
-    if ((test))                         \
-    {                                   \
-        printf("\n%s\n", msg);          \
-        abort();                        \
-    }
-
-void setData(void *ptr, unsigned int size, char value)
-{
-    char *ptr2 = (char *)ptr;
-    for (unsigned int i = 0; i < size ; i++)
-    {
-        ptr2[i] = value;
-    }
-}
-
-void checkData(void *ptr, unsigned int size, char value)
-{
-    char *ptr2 = (char *)ptr;
-    for (unsigned int i = 0; i < size; i++)
-    {
-        if (ptr2[i] != value)
-        {
-            printf("Data validation failed at %d!  Got 0x%08x\n", i, ptr2[i]);
-            printf("Expected 0x%08x\n", value);
-            CHECK_RESULT(true, "Data validation failed!");
-            break;
-        }
-    }
-}
-
-
-int main(int argc, char* argv[]) {
-    HipTest::parseStandardArguments(argc, argv, true);
-
-    hipError_t err = hipSuccess;
-    hipDeviceProp_t props = {0};
-    hipGetDeviceProperties(&props, p_gpuDevice);
-    CHECK_RESULT(err != hipSuccess, "hipGetDeviceProperties failed" );
-    printf("Set device to %d : %s\n", p_gpuDevice, props.name);
-    printf("Legend: unp - unpinned(malloc), hM - hipMalloc(device)\n");
-    printf("        hHR - hipHostRegister(pinned), hHM - hipHostMalloc(prePinned)\n");
-    err = hipSetDevice(p_gpuDevice);
-    CHECK_RESULT(err != hipSuccess, "hipSetDevice failed" );
-
-    unsigned int bufSize_;
-    bool hostMalloc[2] = {false};
-    bool hostRegister[2] = {false};
-    bool unpinnedMalloc[2] = {false};
-    unsigned int numIter;
-    void *memptr[2] = {NULL};
-    void *alignedmemptr[2] = {NULL};
-    void* srcBuffer = NULL;
-    void* dstBuffer = NULL;
-
-    int numTests = (p_tests == -1) ? (NUM_SIZES*NUM_SUBTESTS*2 - 1) : p_tests;
-    int test = (p_tests == -1) ? 0 : p_tests;
-
-    for(;test <= numTests; test++)
-    {
-        unsigned int srcTest = (test / NUM_SIZES) % BUF_TYPES;
-        unsigned int dstTest = (test / (NUM_SIZES*BUF_TYPES)) % BUF_TYPES;
-        bufSize_ = Sizes[test % NUM_SIZES];
-        hostMalloc[0] = hostMalloc[1] = false;
-        hostRegister[0] = hostRegister[1] = false;
-        unpinnedMalloc[0] = unpinnedMalloc[1] = false;
-        srcBuffer = dstBuffer = 0;
-        memptr[0] = memptr[1] = NULL;
-        alignedmemptr[0] = alignedmemptr[1] = NULL;
-
-        size_t width = static_cast<size_t>(sqrt(static_cast<float>(bufSize_)));
-
-        if (srcTest == 3)
-        {
-            hostRegister[0] = true;
-        }
-        else if (srcTest == 2)
-        {
-            hostMalloc[0] = true;
-        }
-        else if (srcTest == 1)
-        {
-            unpinnedMalloc[0] = true;
-        }
-
-        if (dstTest == 1)
-        {
-            unpinnedMalloc[1] = true;
-        }
-        else if (dstTest == 2)
-        {
-            hostMalloc[1] = true;
-        }
-        else if (dstTest == 3)
-        {
-            hostRegister[1] = true;
-        }
-
-        numIter = Iterations[test / (NUM_SIZES * NUM_SUBTESTS)];
-
-        if (hostMalloc[0])
-        {
-            err = hipHostMalloc((void**)&srcBuffer, bufSize_, 0);
-            setData(srcBuffer, bufSize_, 0xd0);
-            CHECK_RESULT(err != hipSuccess, "hipHostMalloc failed");
-        }
-        else if (hostRegister[0])
-        {
-            memptr[0] = malloc(bufSize_ + 4096);
-            alignedmemptr[0] = (void*)(((size_t)memptr[0] + 4095) & ~4095);
-            srcBuffer = alignedmemptr[0];
-            setData(srcBuffer, bufSize_, 0xd0);
-            err = hipHostRegister(srcBuffer, bufSize_, 0);
-            CHECK_RESULT(err != hipSuccess, "hipHostRegister failed");
-        }
-        else if (unpinnedMalloc[0])
-        {
-            memptr[0] = malloc(bufSize_ + 4096);
-            alignedmemptr[0] = (void*)(((size_t)memptr[0] + 4095) & ~4095);
-            srcBuffer = alignedmemptr[0];
-            setData(srcBuffer, bufSize_, 0xd0);
-        }
-        else
-        {
-            err = hipMalloc(&srcBuffer, bufSize_);
-            CHECK_RESULT(err != hipSuccess, "hipMalloc failed");
-            err = hipMemset(srcBuffer, 0xd0, bufSize_);
-            CHECK_RESULT(err != hipSuccess, "hipMemset failed");
-        }
-
-        if (hostMalloc[1])
-        {
-            err = hipHostMalloc((void**)&dstBuffer, bufSize_, 0);
-            CHECK_RESULT(err != hipSuccess, "hipHostMalloc failed");
-        }
-        else if (hostRegister[1])
-        {
-            memptr[1] = malloc(bufSize_ + 4096);
-            alignedmemptr[1] = (void*)(((size_t)memptr[1] + 4095) & ~4095);
-            dstBuffer = alignedmemptr[1];
-            err = hipHostRegister(dstBuffer, bufSize_, 0);
-            CHECK_RESULT(err != hipSuccess, "hipHostRegister failed");
-        }
-        else if (unpinnedMalloc[1])
-        {
-            memptr[1] = malloc(bufSize_ + 4096);
-            alignedmemptr[1] = (void*)(((size_t)memptr[1] + 4095) & ~4095);
-            dstBuffer = alignedmemptr[1];
-        }
-        else
-        {
-            err = hipMalloc(&dstBuffer, bufSize_);
-            CHECK_RESULT(err != hipSuccess, "hipMalloc failed");
-        }
-
-        CPerfCounter timer;
-
-        //warm up
-        err = hipMemcpy2D(dstBuffer, width, srcBuffer, width, width, width, hipMemcpyDefault);
-        CHECK_RESULT(err, "hipMemcpy2D failed");
-
-        timer.Reset();
-        timer.Start();
-        for (unsigned int i = 0; i < numIter; i++)
-        {
-            err = hipMemcpy2DAsync(dstBuffer, width, srcBuffer, width, width, width, hipMemcpyDefault, NULL);
-            CHECK_RESULT(err, "hipMemcpyAsync2D failed");
-        }
-        err = hipDeviceSynchronize();
-        CHECK_RESULT(err, "hipDeviceSynchronize failed");
-        timer.Stop();
-        double sec = timer.GetElapsedTime();
-
-        // Buffer copy bandwidth in GB/s
-        double perf = ((double)bufSize_*numIter*(double)(1e-09)) / sec;
-
-        const char *strSrc = NULL;
-        const char *strDst = NULL;
-         if (hostMalloc[0])
-            strSrc = "hHM";
-        else if (hostRegister[0])
-            strSrc = "hHR";
-        else if (unpinnedMalloc[0])
-            strSrc = "unp";
-        else
-            strSrc = "hM";
-
-        if (hostMalloc[1])
-            strDst = "hHM";
-        else if (hostRegister[1])
-            strDst = "hHR";
-        else if (unpinnedMalloc[1])
-            strDst = "unp";
-        else
-            strDst = "hM";
-        // Double results when src and dst are both on device
-        if ((!hostMalloc[0] && !hostRegister[0] && !unpinnedMalloc[0]) &&
-            (!hostMalloc[1] && !hostRegister[1] && !unpinnedMalloc[1]))
-            perf *= 2.0;
-        // Double results when src and dst are both in sysmem
-        if ((hostMalloc[0] || hostRegister[0] || unpinnedMalloc[0]) &&
-            (hostMalloc[1] || hostRegister[1] || unpinnedMalloc[1]))
-            perf *= 2.0;
-
-        char buf[256];
-        SNPRINTF(buf, sizeof(buf), "HIPPerfBufferCopyRectSpeed[%d]\t(%8d bytes)\ts:%s d:%s\ti:%4d\t(GB/s) perf\t%f",
-                test, bufSize_, strSrc, strDst, numIter, (float)perf);
-        printf("%s\n", buf);
-
-        //Free src
-        if (hostMalloc[0])
-        {
-            hipHostFree(srcBuffer);
-        }
-        else if (hostRegister[0])
-        {
-            hipHostUnregister(srcBuffer);
-            free(memptr[0]);
-        }
-        else if (unpinnedMalloc[0])
-        {
-            free(memptr[0]);
-        }
-        else
-        {
-            hipFree(srcBuffer);
-        }
-
-        //Free dst
-        if (hostMalloc[1])
-        {
-            hipHostFree(dstBuffer);
-        }
-        else if (hostRegister[1])
-        {
-            hipHostUnregister(dstBuffer);
-            free(memptr[1]);
-        }
-        else if (unpinnedMalloc[1])
-        {
-            free(memptr[1]);
-        }
-        else
-        {
-            hipFree(dstBuffer);
-        }
-    }
-
-    passed();
-}
diff --git a/perftests/memory/hipPerfDevMemReadSpeed.cpp b/perftests/memory/hipPerfDevMemReadSpeed.cpp
deleted file mode 100644
index 6548da94c..000000000
--- a/perftests/memory/hipPerfDevMemReadSpeed.cpp
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
-Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-/* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp
- * TEST: %t
- * HIT_END
- */
-
-#include <iostream>
-#include <chrono>
-#include "test_common.h"
-
-using namespace std;
-
-#define arraySize 16
-
-typedef struct d_uint16 {
-  uint data[arraySize];
-} d_uint16;
-
-__global__ void read_kernel(d_uint16 *src, ulong N, uint *dst) {
-
-  size_t idx = (blockIdx.x * blockDim.x + threadIdx.x);
-  size_t stride = blockDim.x * gridDim.x ;
-
-  uint tmp = 0;
-  for (size_t i = idx; i < N; i += stride) {
-    for (size_t j = 0; j < arraySize; j++) {
-      tmp += src[i].data[j];
-    }
-  }
-
-  atomicAdd(dst, tmp);
-}
-
-int main(int argc, char* argv[]) {
-  d_uint16 *dSrc;
-  d_uint16 *hSrc;
-  uint *dDst;
-  uint *hDst;
-  hipStream_t stream;
-  ulong N = 4 * 1024 * 1024;
-  uint nBytes = N * sizeof(d_uint16);
-
-  int nGpu = 0;
-  HIPCHECK(hipGetDeviceCount(&nGpu));
-  if (nGpu < 1) {
-      cout << "info: didn't find any GPU! skipping the test!\n";
-      passed();
-      return 0;
-  }
-
-  static int device = 0;
-  HIPCHECK(hipSetDevice(device));
-  hipDeviceProp_t props;
-  HIPCHECK(hipGetDeviceProperties(&props, device));
-  cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name <<
-      " with " << props.multiProcessorCount << " CUs" << endl;
-
-  const unsigned threadsPerBlock = 64;
-  const unsigned blocks = props.multiProcessorCount * 4;
-
-  uint inputData = 0x1;
-  int nIter = 1000;
-
-  hSrc =  new d_uint16[nBytes];
-  HIPCHECK(hSrc == 0 ? hipErrorOutOfMemory : hipSuccess);
-  hDst =  new uint;
-  hDst[0] = 0;
-  HIPCHECK(hDst == 0 ? hipErrorOutOfMemory : hipSuccess);
-  for (size_t i = 0; i < N; i++) {
-    for (int j = 0; j < arraySize; j++) {
-      hSrc[i].data[j] = inputData;
-    }
-  }
-
-  HIPCHECK(hipMalloc(&dSrc, nBytes));
-  HIPCHECK(hipMalloc(&dDst, sizeof(uint)));
-
-  HIPCHECK(hipStreamCreate(&stream));
-
-  HIPCHECK(hipMemcpy(dSrc, hSrc, nBytes, hipMemcpyHostToDevice));
-  HIPCHECK(hipMemcpy(dDst, hDst, sizeof(uint), hipMemcpyHostToDevice));
-
-  hipLaunchKernelGGL(read_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dSrc, N, dDst);
-  HIPCHECK(hipMemcpy(hDst, dDst, sizeof(uint), hipMemcpyDeviceToHost));
-  hipDeviceSynchronize();
-
-  if (hDst[0] != (nBytes / sizeof(uint))) {
-    cout << "info: Data validation failed for warm up run!" << endl;
-    cout << "info: expected " << nBytes / sizeof(uint) << " got " << hDst[0] << endl;
-    HIPCHECK(hipErrorUnknown);
-  }
-
-  // measure performance based on host time
-  auto all_start = chrono::steady_clock::now();
-
-  for(int i = 0; i < nIter; i++) {
-    hipLaunchKernelGGL(read_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dSrc, N, dDst);
-  }
-  hipDeviceSynchronize();
-
-  auto all_end = chrono::steady_clock::now();
-  chrono::duration<double> all_kernel_time = all_end - all_start;
-
-  // read speed in GB/s
-  double perf = ((double)nBytes * nIter * (double)(1e-09)) / all_kernel_time.count();
-
-  cout << "info: average read speed of " << perf << " GB/s " << "achieved for memory size of " <<
-      nBytes / (1024 * 1024) << " MB" << endl;
-
-  delete [] hSrc;
-  delete hDst;
-  hipFree(dSrc);
-  hipFree(dDst);
-  HIPCHECK(hipStreamDestroy(stream));
-
-  passed();
-}
diff --git a/perftests/memory/hipPerfDevMemWriteSpeed.cpp b/perftests/memory/hipPerfDevMemWriteSpeed.cpp
deleted file mode 100644
index cc4883660..000000000
--- a/perftests/memory/hipPerfDevMemWriteSpeed.cpp
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
-Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-/* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp
- * TEST: %t
- * HIT_END
- */
-
-#include <iostream>
-#include <chrono>
-#include "test_common.h"
-
-using namespace std;
-
-#define arraySize 16
-
-typedef struct d_uint16 {
-  uint data[arraySize];
-} d_uint16;
-
-__global__ void write_kernel(d_uint16 *dst, ulong N, d_uint16 pval) {
-    size_t idx = (blockIdx.x * blockDim.x + threadIdx.x);
-    size_t stride = blockDim.x * gridDim.x;
-    for (size_t i = idx; i < N; i += stride) {
-      dst[i] = pval;
-    }
-};
-
-int main(int argc, char* argv[]) {
-  d_uint16 *dDst;
-  d_uint16 *hDst;
-  hipStream_t stream;
-  ulong N = 4 * 1024 * 1024;
-  uint nBytes = N * sizeof(d_uint16);
-  d_uint16 pval;
-
-  for (int i = 0; i < arraySize; i++) {
-    pval.data[i] = 0xabababab;
-  }
-
-  int nGpu = 0;
-  HIPCHECK(hipGetDeviceCount(&nGpu));
-  if (nGpu < 1) {
-      cout << "info: didn't find any GPU! skipping the test!\n";
-      passed();
-      return 0;
-  }
-
-  static int device = 0;
-  HIPCHECK(hipSetDevice(device));
-  hipDeviceProp_t props;
-  HIPCHECK(hipGetDeviceProperties(&props, device));
-  cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name <<
-      " with " << props.multiProcessorCount << " CUs" << endl;
-
-  size_t threadsPerBlock = 64;
-  size_t blocks = props.multiProcessorCount * 4;
-
-  uint inputData = 0xabababab;
-  int nIter = 1000;
-
-  hDst =  new d_uint16[nBytes];
-  HIPCHECK(hDst == 0 ? hipErrorOutOfMemory : hipSuccess);
-  for (size_t i = 0; i < N; i++) {
-    for (size_t j = 0; j < arraySize; j++) {
-      hDst[i].data[j] = 0;
-    }
-  }
-
-  HIPCHECK(hipMalloc(&dDst, nBytes));
-
-  HIPCHECK(hipStreamCreate(&stream));
-
-  hipLaunchKernelGGL(write_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dDst, N, pval);
-  HIPCHECK(hipMemcpy(hDst, dDst, nBytes , hipMemcpyDeviceToHost));
-  hipDeviceSynchronize();
-
-  for (uint i = 0; i < N; i++) {
-    for (uint j = 0; j < arraySize; j++) {
-      if (hDst[i].data[j] != inputData) {
-        cout << "info: Data validation failed for warm up run! " << endl;
-        cout << "at index i: " << i << " element j: " << j << endl;
-        cout << hex << "expected 0x" << inputData << " but got 0x" << hDst[i].data[j] << endl;
-        HIPCHECK(hipErrorUnknown);
-      }
-    }
-  }
-
-  auto all_start = chrono::steady_clock::now();
-  for(int i = 0; i < nIter; i++) {
-    hipLaunchKernelGGL(write_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dDst, N, pval);
-  }
-  hipDeviceSynchronize();
-  auto all_end = chrono::steady_clock::now();
-  chrono::duration<double> all_kernel_time = all_end - all_start;
-
-  // read speed in GB/s
-  double perf = ((double)nBytes * nIter * (double)(1e-09)) / all_kernel_time.count();
-
-  cout << "info: average write speed of " << perf << " GB/s " << "achieved for memory size of " <<
-      nBytes / (1024 * 1024) << " MB" << endl;
-
-
-  delete [] hDst;
-  hipFree(dDst);
-  HIPCHECK(hipStreamDestroy(stream));
-
-  passed();
-}
diff --git a/perftests/memory/hipPerfHostNumaAlloc.cpp b/perftests/memory/hipPerfHostNumaAlloc.cpp
deleted file mode 100644
index 93cd71ace..000000000
--- a/perftests/memory/hipPerfHostNumaAlloc.cpp
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
-Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-/* HIT_START
- * BUILD_CMD: hipPerfHostNumaAlloc %hc -I%S/../../src %S/%s %S/../../src/test_common.cpp -lnuma -o %T/%t EXCLUDE_HIP_PLATFORM nvidia
- * TEST: %t
- * HIT_END
- */
-
-#include "test_common.h"
-#include <iostream>
-#include <time.h>
-#include <cstdio>
-#include <unistd.h>
-#include <sys/types.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <numaif.h>
-#include <iostream>
-#include <memory>
-#include <stdexcept>
-#include <string>
-#include <array>
-#include "hip/hip_runtime.h"
-
-// To run it correctly, we must not export HIP_VISIBLE_DEVICES.
-// And we must explicitly link libnuma because of numa api move_pages().
-#define NUM_PAGES 4
-char *h = nullptr;
-char *d_h = nullptr;
-char *m = nullptr;
-char *d_m = nullptr;
-int page_size = 0;
-const int mode[] = { MPOL_DEFAULT, MPOL_BIND, MPOL_PREFERRED, MPOL_INTERLEAVE };
-const char* modeStr[] = { "MPOL_DEFAULT", "MPOL_BIND", "MPOL_PREFERRED", "MPOL_INTERLEAVE" };
-
-std::string exeCommand(const char* cmd) {
-  std::array<char, 128> buff;
-  std::string result;
-  std::unique_ptr<FILE, decltype(&pclose)> pipe(popen(cmd, "r"), pclose);
-  if (!pipe) {
-    return result;
-  }
-  while (fgets(buff.data(), buff.size(), pipe.get()) != nullptr) {
-    result += buff.data();
-  }
-  return result;
-}
-
-int getCpuAgentCount() {
-  const char* cmd = "cat /proc/cpuinfo | grep \"physical id\" | sort | uniq | wc -l";
-  int cpuAgentCount = std::atoi(exeCommand(cmd).c_str());
-  return cpuAgentCount;
-}
-
-bool test(int cpuId, int gpuId, int numaMode, unsigned int hostMallocflags) {
-  void *pages[NUM_PAGES];
-  int status[NUM_PAGES];
-  int nodes[NUM_PAGES];
-  int ret_code;
-
-  printf("set cpu %d, gpu %d, numaMode %d, hostMallocflags 0x%x\n", cpuId,
-         gpuId, numaMode, hostMallocflags);
-
-  if (cpuId >= 0) {
-    unsigned long nodeMask = 1 << cpuId;
-    unsigned long maxNode = sizeof(nodeMask) * 8;
-    if (set_mempolicy(numaMode, numaMode == MPOL_DEFAULT ? NULL : &nodeMask,
-                      numaMode == MPOL_DEFAULT ? 0 : maxNode) == -1) {
-      printf("set_mempolicy() failed with err %d\n", errno);
-      return false;
-    }
-  }
-
-  if (gpuId >= 0) {
-    HIPCHECK(hipSetDevice(gpuId));
-  }
-
-  posix_memalign((void**) &m, page_size, page_size * NUM_PAGES);
-  hipHostRegister(m, page_size * NUM_PAGES, hipHostRegisterMapped);
-  hipHostGetDevicePointer((void**) &d_m, m, 0);
-
-  status[0] = -1;
-  pages[0] = m;
-  for (int i = 1; i < NUM_PAGES; i++) {
-    pages[i] = (char*) pages[0] + page_size;
-  }
-  ret_code = move_pages(0, NUM_PAGES, pages, NULL, status, 0);
-  printf("Memory (malloc) ret %d at %p (dev %p) is at node: ", ret_code, m, d_m);
-  for (int i = 0; i < NUM_PAGES; i++) {
-    printf("%d ", status[i]); // Don't verify as it's out of our control
-  }
-  printf("\n");
-
-  HIPCHECK(hipHostMalloc((void**) &h, page_size*NUM_PAGES, hostMallocflags));
-  pages[0] = h;
-  for (int i = 1; i < NUM_PAGES; i++) {
-    pages[i] = (char*) pages[0] + page_size;
-  }
-  ret_code = move_pages(0, NUM_PAGES, pages, NULL, status, 0);
-  d_h = nullptr;
-  if (hostMallocflags & hipHostMallocMapped) {
-    hipHostGetDevicePointer((void**) &d_h, h, 0);
-    printf("Memory (hipHostMalloc) ret %d at %p (dev %p) is at node: ",
-           ret_code, h, d_h);
-  } else {
-    printf("Memory (hipHostMalloc) ret %d at %p is at node: ", ret_code, h);
-  }
-  for (int i = 0; i < NUM_PAGES; i++) {
-    printf("%d ", status[i]);  // Always print it even if it's wrong. Verify later
-  }
-  printf("\n");
-
-  HIPCHECK(hipHostFree((void* )h));
-  hipHostUnregister(m);
-  free(m);
-
-  if (cpuId >= 0 && (numaMode == MPOL_BIND || numaMode == MPOL_PREFERRED)) {
-    for (int i = 0; i < NUM_PAGES; i++) {
-      if (status[i] != cpuId) {  // Now verify
-        printf("Failed at %d", i);
-        return false;
-      }
-    }
-  }
-  return true;
-}
-
-bool runTest(const int &cpuCount, const int &gpuCount,
-             const unsigned int &hostMallocflags, const std::string &str) {
-  printf("%s\n", str.c_str());
-
-  for (int m = 0; m < sizeof(mode) / sizeof(mode[0]); m++) {
-    printf("Testing %s\n", modeStr[m]);
-
-    for (int i = 0; i < cpuCount; i++) {
-      for (int j = 0; j < gpuCount; j++) {
-        if (!test(i, j, mode[m], hostMallocflags)) {
-          return false;
-        }
-      }
-    }
-  }
-  return true;
-}
-
-int main(int argc, char *argv[]) {
-  int gpuCount = 0;
-  HIPCHECK(hipGetDeviceCount(&gpuCount));
-  int cpuCount = getCpuAgentCount();
-  page_size = getpagesize();
-  printf("Cpu count %d, Gpu count %d, Page size %d\n", cpuCount, gpuCount,
-         page_size);
-
-  if (cpuCount < 0 || gpuCount < 0) {
-    failed("Bad device count\n");
-    return -1;
-  }
-
-  if (!runTest(cpuCount, gpuCount, hipHostMallocDefault | hipHostMallocNumaUser,
-               "Testing hipHostMallocDefault | hipHostMallocNumaUser........................")) {
-    failed("Failed testing hipHostMallocDefault | hipHostMallocNumaUser\n");
-    return -1;
-  }
-
-  if (!runTest(cpuCount, gpuCount, hipHostMallocMapped | hipHostMallocNumaUser,
-               "Testing hipHostMallocMapped | hipHostMallocNumaUser.........................")) {
-    failed("Failed testing hipHostMallocMapped | hipHostMallocNumaUser\n");
-    return -1;
-  }
-
-  passed();
-}
diff --git a/perftests/memory/hipPerfMemFill.cpp b/perftests/memory/hipPerfMemFill.cpp
deleted file mode 100644
index dd54ec685..000000000
--- a/perftests/memory/hipPerfMemFill.cpp
+++ /dev/null
@@ -1,534 +0,0 @@
-/*
- Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
- */
-
-/* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp
- * TEST: %t
- * HIT_END
- */
-
-#include "test_common.h"
-#include <printf/printf_common.h>
-#include <iostream>
-#include <chrono>
-#include <sys/time.h>
-
-#define SIMPLY_ASSIGN 0
-#define USE_HIPTEST_SETNUMBLOCKS 0
-
-using namespace std;
-
-template<class T>
-__global__ void vec_fill(T *x, T coef, int N) {
-  const int istart = threadIdx.x + blockIdx.x * blockDim.x;
-  const int ishift = blockDim.x * gridDim.x;
-  for (int i = istart; i < N; i += ishift) {
-#if SIMPLY_ASSIGN
-    x[i] = coef;
-#else
-    x[i] = coef * i;
-#endif
-  }
-}
-
-__device__ void print_log(int i, double value, double expected) {
-  printf("failed at %d: val=%g, expected=%g\n", i, value, expected);
-}
-
-__device__ void print_log(int i, int value, int expected) {
-  printf("failed at %d: val=%d, expected=%d\n", i, value, expected);
-}
-
-template<class T>
-__global__ void vec_verify(T *x, T coef, int N) {
-  const int istart = threadIdx.x + blockIdx.x * blockDim.x;
-  const int ishift = blockDim.x * gridDim.x;
-  for (int i = istart; i < N; i += ishift) {
-#if SIMPLY_ASSIGN
-    if(x[i] != coef) {
-      print_log(i, x[i], coef);
-    }
-#else
-    if(x[i] != coef * i) {
-      print_log(i, x[i], coef * i);
-    }
-#endif
-  }
-}
-
-template<class T>
-__global__ void daxpy(T *__restrict__ x, T *__restrict__ y,
-    const T coef, int Niter, int N) {
-  const int istart = threadIdx.x + blockIdx.x * blockDim.x;
-  const int ishift = blockDim.x * gridDim.x;
-  for (int iter = 0; iter < Niter; ++iter) {
-    T iv = coef * iter;
-    for (int i = istart; i < N; i += ishift)
-    y[i] = iv * x[i] + y[i];
-  }
-}
-
-template<class T>
-class hipPerfMemFill {
- private:
-  static constexpr int NUM_START = 27;
-  static constexpr int NUM_SIZE = 5;
-  static constexpr int NUM_ITER = 10;
-  size_t totalSizes_[NUM_SIZE];
-  hipDeviceProp_t props_;
-  const T coef_ = getCoefficient(3.14159);
-  const unsigned int blocksPerCU_;
-  const unsigned int threadsPerBlock_;
-
- public:
-  hipPerfMemFill(unsigned int blocksPerCU, unsigned int threadsPerBlock) :
-    blocksPerCU_(blocksPerCU), threadsPerBlock_(threadsPerBlock) {
-    for (int i = 0; i < NUM_SIZE; i++) {
-      totalSizes_[i] = 1ull << (i + NUM_START); // 128M, 256M, 512M, 1024M, 2048M
-    }
-  }
-
-  ~hipPerfMemFill() {
-  }
-
-  bool supportLargeBar() {
-    return props_.isLargeBar != 0;
-  }
-
-  bool supportManagedMemory() {
-    return props_.managedMemory != 0;
-  }
-
-  const T getCoefficient(double val) {
-    return static_cast<T>(val);
-  }
-
-  void setHostBuffer(T *A, T val, size_t size) {
-    size_t len = size / sizeof(T);
-    for (int i = 0; i < len; i++) {
-      A[i] = val;
-    }
-  }
-
-  void open(int deviceId) {
-    int nGpu = 0;
-    HIPCHECK(hipGetDeviceCount(&nGpu));
-    if (nGpu < 1) {
-      failed("No GPU!");
-    } else if (deviceId >= nGpu) {
-      failed("Info: wrong GPU Id %d\n", deviceId);
-    }
-
-    HIPCHECK(hipSetDevice(deviceId));
-    memset(&props_, 0, sizeof(props_));
-    HIPCHECK(hipGetDeviceProperties(&props_, deviceId));
-    std::cout << "Info: running on device: id: " << deviceId << ", bus: 0x"
-        << props_.pciBusID << " " << props_.name << " with "
-        << props_.multiProcessorCount << " CUs, large bar: "
-        << supportLargeBar() << ", managed memory: " << supportManagedMemory()
-        << ", DeviceMallocFinegrained: " << supportDeviceMallocFinegrained()
-        << std::endl;
-  }
-
-  void log_host(const char* title, double GBytes, double sec) {
-    cout << title << " [" << setw(7) << GBytes << " GB]: cost " << setw(10) << sec
-        << " s in bandwidth " << setw(10) << GBytes / sec << " [GB/s]" << endl;
-  }
-
-  void log_kernel(const char* title, double GBytes, double sec, double sec_hv, double sec_kv) {
-    cout << title << " [" << setw(7) << GBytes << " GB]: cost " << setw(10) << sec
-        << " s in bandwidth " << setw(10) << GBytes / sec << " [GB/s]" << ", hostVerify cost "
-        << setw(10) << sec_hv << " s in bandwidth " << setw(10) << GBytes / sec_hv << " [GB/s]"
-        << ", kernelVerify cost "<< setw(10) << sec_kv << " s in bandwidth " << setw(10)
-        << GBytes / sec_kv << " [GB/s]" << endl;
-  }
-
-  void hostFill(size_t size, T *data, T coef, double &sec) {
-    size_t num = size / sizeof(T);  // Size of elements
-    auto start = chrono::steady_clock::now();
-    for (int i = 0; i < num; ++i) {
-#if SIMPLY_ASSIGN
-      data[i] = coef;
-#else
-      data[i] = coef * i;
-#endif
-    }
-    auto end = chrono::steady_clock::now();
-    chrono::duration<double> diff = end - start;  // in second
-    sec = diff.count();
-  }
-
-  void kernelFill(size_t size, T *data, T coef, double &sec) {
-    size_t num = size / sizeof(T);  // Size of elements
-    unsigned blocks = setNumBlocks(num);
-
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(vec_fill<T>), dim3(blocks),
-                           dim3(threadsPerBlock), 0, 0, data, 0, num);  // kernel will be loaded first time
-    HIPCHECK(hipDeviceSynchronize());
-
-    auto start = chrono::steady_clock::now();
-
-    for (int iter = 0; iter < NUM_ITER; ++iter) {
-      hipLaunchKernelGGL(HIP_KERNEL_NAME(vec_fill<T>), dim3(blocks),
-                             dim3(threadsPerBlock), 0, 0, data, coef, num);
-    }
-    HIPCHECK(hipDeviceSynchronize());
-
-    auto end = chrono::steady_clock::now();
-    chrono::duration<double> diff = end - start;  // in second
-    sec = diff.count() / NUM_ITER;  // in second
-  }
-
-  void hostVerify(size_t size, T *data, T coef, double &sec) {
-    size_t num = size / sizeof(T);  // Size of elements
-    auto start = chrono::steady_clock::now();
-    for (int i = 0; i < num; ++i) {
-#if SIMPLY_ASSIGN
-      if(data[i] != coef) {
-        cout << "hostVerify failed: i=" << i << ", data[i]=" << data[i] << ", expected=" << coef << endl;
-        failed("failed\n");
-      }
-#else
-      if(data[i] != coef * i) {
-        cout << "hostVerify failed: i=" << i << ", data[i]=" << data[i] << ", expected=" << coef * i << endl;
-        failed("failed\n");
-      }
-#endif
-    }
-    auto end = chrono::steady_clock::now();
-    chrono::duration<double> diff = end - start;  // in second
-    sec = diff.count();
-  }
-
-  void kernelVerify(size_t size, T *data, T coef, double &sec) {
-    size_t num = size / sizeof(T);  // Size of elements
-    unsigned blocks = setNumBlocks(num);
-
-    CaptureStream *capture = new CaptureStream(stdout);
-    capture->Begin();
-
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(vec_verify<T>), dim3(blocks),
-                       dim3(threadsPerBlock), 0, 0, data, coef, num);  // kernel will be loaded first time
-    HIPCHECK(hipDeviceSynchronize());
-
-    capture->End();
-    capture->Truncate(1000); // Don't want too long log if existing
-    std::string device_output = capture->getData();
-    delete capture;
-    if (device_output.length() > 0) {
-      failed("kernelVerify failed:\n%s\n", device_output.c_str());
-    }
-
-    // Now all data verified. The following is to test bandwidth.
-    auto start = chrono::steady_clock::now();
-
-    for (int iter = 0; iter < NUM_ITER; ++iter) {
-      hipLaunchKernelGGL(HIP_KERNEL_NAME(vec_verify<T>), dim3(blocks),
-                             dim3(threadsPerBlock), 0, 0, data, coef, num);
-    }
-    HIPCHECK(hipDeviceSynchronize());
-
-    auto end = chrono::steady_clock::now();
-    chrono::duration<double> diff = end - start;  // in second
-    sec = diff.count() / NUM_ITER;  // in second
-  }
-
-  bool testLargeBarDeviceMemoryHostFill(size_t size) {
-    if (!supportLargeBar()) {
-      return false;
-    }
-
-    double GBytes = (double) size / (1024.0 * 1024.0 * 1024.0);
-
-    T *A;
-    HIPCHECK(hipMalloc(&A, size));
-    double sec = 0;
-    hostFill(size, A, coef_, sec);  // Cpu can access device mem in LB
-    HIPCHECK(hipFree(A));
-
-    log_host("Largebar: host   fill", GBytes, sec);
-    return true;
-  }
-
-  bool testLargeBar() {
-    if (!supportLargeBar()) {
-      return false;
-    }
-
-    cout << "Test large bar device memory host filling" << endl;
-    for (int i = 0; i < NUM_SIZE; i++) {
-      if (!testLargeBarDeviceMemoryHostFill(totalSizes_[i])) {
-        return false;
-      }
-    }
-
-    return true;
-  }
-
-  bool testManagedMemoryHostFill(size_t size) {
-    if (!supportManagedMemory()) {
-      return false;
-    }
-    double GBytes = (double) size / (1024.0 * 1024.0 * 1024.0);
-
-    T *A;
-    HIPCHECK(hipMallocManaged(&A, size));
-    double sec = 0;
-    hostFill(size, A, coef_, sec);  // Cpu can access HMM mem
-    HIPCHECK(hipFree(A));
-
-    log_host("Managed: host   fill", GBytes, sec);
-    return true;
-  }
-
-  bool testManagedMemoryKernelFill(size_t size) {
-    if (!supportManagedMemory()) {
-      return false;
-    }
-    double GBytes = (double) size / (1024.0 * 1024.0 * 1024.0);
-
-    T *A;
-    HIPCHECK(hipMallocManaged(&A, size));
-
-    double sec = 0, sec_hv = 0, sec_kv = 0;
-    kernelFill(size, A, coef_, sec);
-    hostVerify(size, A, coef_, sec_hv);  // Managed memory can be verified by host
-    kernelVerify(size, A, coef_, sec_kv);
-    HIPCHECK(hipFree(A));
-
-    log_kernel("Managed: kernel fill", GBytes, sec, sec_hv, sec_kv);
-
-    return true;
-  }
-
-  bool testManagedMemory() {
-    if (!supportManagedMemory()) {
-      return false;
-    }
-
-    cout << "Test managed memory host filling" << endl;
-    for (int i = 0; i < NUM_SIZE; i++) {
-      if (!testManagedMemoryHostFill(totalSizes_[i])) {
-        return false;
-      }
-    }
-
-    cout << "Test managed memory kernel filling" << endl;
-    for (int i = 0; i < NUM_SIZE; i++) {
-      if (!testManagedMemoryKernelFill(totalSizes_[i])) {
-        return false;
-      }
-    }
-
-    return true;
-  }
-
-  bool testHostMemoryHostFill(size_t size, unsigned int flags) {
-    double GBytes = (double) size / (1024.0 * 1024.0 * 1024.0);
-    T *A;
-    HIPCHECK(hipHostMalloc(&A, size, flags));
-    double sec = 0;
-    hostFill(size, A, coef_, sec);
-    HIPCHECK(hipHostFree(A));
-
-    log_host("Host: host   fill", GBytes, sec);
-    return true;
-  }
-
-  bool testHostMemoryKernelFill(size_t size, unsigned int flags) {
-    double GBytes = (double) size / (1024.0 * 1024.0 * 1024.0);
-
-    T *A;
-    HIPCHECK(hipHostMalloc((void** ) &A, size, flags));
-    double sec = 0, sec_hv = 0, sec_kv = 0;
-    kernelFill(size, A, coef_, sec);
-    hostVerify(size, A, coef_, sec_hv);
-    kernelVerify(size, A, coef_, sec_kv);
-    HIPCHECK(hipHostFree(A));
-
-    log_kernel("Host: kernel fill", GBytes, sec, sec_hv, sec_kv);
-    return true;
-  }
-
-  bool testHostMemory() {
-    cout << "Test coherent host memory host filling" << endl;
-    for (int i = 0; i < NUM_SIZE; i++) {
-      if (!testHostMemoryHostFill(totalSizes_[i], hipHostMallocCoherent)) {
-        return false;
-      }
-    }
-
-    cout << "Test non-coherent host memory host filling" << endl;
-    for (int i = 0; i < NUM_SIZE; i++) {
-      if (!testHostMemoryHostFill(totalSizes_[i], hipHostMallocNonCoherent)) {
-        return false;
-      }
-    }
-
-    cout << "Test coherent host memory kernel filling" << endl;
-    for (int i = 0; i < NUM_SIZE; i++) {
-      if (!testHostMemoryKernelFill(totalSizes_[i], hipHostMallocCoherent)) {
-        return false;
-      }
-    }
-
-    cout << "Test non-coherent host memory kernel filling" << endl;
-    for (int i = 0; i < NUM_SIZE; i++) {
-      if (!testHostMemoryKernelFill(totalSizes_[i], hipHostMallocNonCoherent)) {
-        return false;
-      }
-    }
-
-    return true;
-  }
-
-  /* This function should be via device attribute query*/
-  bool supportDeviceMallocFinegrained() {
-#ifdef __HIP_PLATFORM_AMD__
-    T *A = nullptr;
-    hipExtMallocWithFlags((void **)&A, sizeof(T), hipDeviceMallocFinegrained);
-    if (!A) {
-      return false;
-    }
-    HIPCHECK(hipFree(A));
-    return true;
-#else
-    return false;
-#endif
-  }
-
-  unsigned int setNumBlocks(size_t size) {
-    size_t num = size/sizeof(T);
-
-#if USE_HIPTEST_SETNUMBLOCKS
-    return HipTest::setNumBlocks(blocksPerCU_, threadsPerBlock_,
-                                 num);
-#else
-    return (num + threadsPerBlock_ - 1) / threadsPerBlock_;
-#endif
-  }
-
-#ifdef __HIP_PLATFORM_AMD__
-  bool testExtDeviceMemoryHostFill(size_t size, unsigned int flags) {
-    double GBytes = (double) size / (1024.0 * 1024.0 * 1024.0);
-
-    T *A = nullptr;
-    HIPCHECK(hipExtMallocWithFlags((void **)&A, size, flags));
-    if (!A) {
-      cout << "failed hipExtMallocWithFlags() with size =" << size << " flags="
-           << std::hex << flags << endl;
-      return false;
-    }
-
-    double sec = 0;
-    hostFill(size, A, coef_, sec);  // Cpu can access this mem
-    HIPCHECK(hipFree(A));
-
-    log_host("ExtDevice: host   fill", GBytes, sec);
-    return true;
-  }
-
-  bool testExtDeviceMemoryKernelFill(size_t size, unsigned int flags) {
-    double GBytes = (double) size / (1024.0 * 1024.0 * 1024.0);
-
-    T *A = nullptr;
-    HIPCHECK(hipExtMallocWithFlags((void **)&A, size, flags));
-    if (!A) {
-      cout << "failed hipExtMallocWithFlags() with size =" << size << " flags="
-           << std::hex << flags << endl;
-      return false;
-    }
-
-    double sec = 0, sec_hv = 0, sec_kv = 0;
-    kernelFill(size, A, coef_, sec);
-    hostVerify(size, A, coef_, sec_hv);  // Fine grained device memory can be verified by host
-    kernelVerify(size, A, coef_, sec_kv);
-    HIPCHECK(hipFree(A));
-
-    log_kernel("ExtDevice: kernel fill", GBytes, sec, sec_hv, sec_kv);
-
-    return true;
-  }
-
-  bool testExtDeviceMemory() {
-    cout << "Test fine grained device memory host filling"
-        << endl;
-    for (int i = 0; i < NUM_SIZE; i++) {
-      if (!testExtDeviceMemoryHostFill(totalSizes_[i],
-                                       hipDeviceMallocFinegrained)) {
-        return false;
-      }
-    }
-
-    cout << "Test fine grained device memory kernel filling"
-        << endl;
-    for (int i = 0; i < NUM_SIZE; i++) {
-      if (!testExtDeviceMemoryKernelFill(totalSizes_[i],
-                                         hipDeviceMallocFinegrained)) {
-        return false;
-      }
-    }
-
-    return true;
-  }
-#endif
-
-  bool run() {
-    if (supportLargeBar()) {
-      if (!testLargeBar()) {
-        return false;
-      }
-    }
-
-    if (supportManagedMemory()) {
-      if (!testManagedMemory()) {
-        return false;
-      }
-    }
-
-    if (!testHostMemory()) {
-      return false;
-    }
-
-#ifdef __HIP_PLATFORM_AMD__
-    if (supportDeviceMallocFinegrained()) {
-      if (!testExtDeviceMemory()) {
-        return false;
-      }
-    }
-#endif
-    return true;
-  }
-
-};
-
-int main(int argc, char *argv[]) {
-  HipTest::parseStandardArguments(argc, argv, true); // For ::p_gpuDevice, ::blocksPerCU, ::threadsPerBlock
-  cout << "Test int" << endl;
-  hipPerfMemFill<int> hipPerfMemFillInt(::blocksPerCU, ::threadsPerBlock);
-  hipPerfMemFillInt.open(::p_gpuDevice);
-  HIPASSERT(hipPerfMemFillInt.run());
-
-  cout << "Test double" << endl;
-  hipPerfMemFill<double> hipPerfMemFillDouble(::blocksPerCU, ::threadsPerBlock);
-  hipPerfMemFillDouble.open(::p_gpuDevice);
-  HIPASSERT(hipPerfMemFillDouble.run());
-
-  passed();
-}
diff --git a/perftests/memory/hipPerfMemMallocCpyFree.cpp b/perftests/memory/hipPerfMemMallocCpyFree.cpp
deleted file mode 100644
index 94ceb68cb..000000000
--- a/perftests/memory/hipPerfMemMallocCpyFree.cpp
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
-Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-#include "test_common.h"
-#include <iostream>
-#include <time.h>
-
-/* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp
- * TEST: %t
- * HIT_END
- */
-
-#define NUM_SIZE 19  //size up to 16M
-#define NUM_ITER 500 //Total GPU memory up to 16M*500=8G
-
-void valSet(int* A, int val, size_t size) {
-    size_t len = size / sizeof(int);
-    for (int i = 0; i < len; i++) {
-        A[i] = val;
-    }
-}
-
-void setup(size_t *size, int &num, int **pA, const size_t totalGlobalMem) {
-
-    std::cout << "size: ";
-    for (int i = 0; i < num; i++) {
-        size[i] = 1 << (i + 6);
-        if((NUM_ITER + 1) * size[i] > totalGlobalMem) {
-          num = i;
-          break;
-        }
-        std::cout << size[i] << " ";
-    }
-    std::cout << std::endl;
-    *pA = (int*)malloc(size[num - 1]);
-    valSet(*pA, 1, size[num - 1]);
-}
-
-void testInit(size_t size, int *A) {
-    int *Ad;
-    clock_t start = clock();
-    hipMalloc(&Ad, size); //hip::init() will be called
-    clock_t end = clock();
-    double uS = (end - start) * 1000000. / CLOCKS_PER_SEC;
-    std::cout << "Initial" << std::endl;
-    std::cout << "hipMalloc(" << size << ") cost " << uS << "us" << std::endl;
-
-    start = clock();
-    hipMemcpy(Ad, A, size, hipMemcpyHostToDevice);
-    hipDeviceSynchronize();
-    end = clock();
-    uS = (end - start) * 1000000. / CLOCKS_PER_SEC;
-    std::cout << "hipMemcpy(" << size << ") cost " << uS << "us" << std::endl;
-
-    start = clock();
-    hipFree(Ad);
-    end = clock();
-    uS = (end - start) * 1000000. / CLOCKS_PER_SEC;
-    std::cout << "hipFree(" << size << ") cost " << uS << "us" << std::endl;
-}
-
-int main() {
-    double uS;
-    clock_t start, end;
-    size_t size[NUM_SIZE] = { 0 };
-    int *Ad[NUM_ITER] = { nullptr };
-    int *A;
-    hipDeviceProp_t props;
-    memset(&props, 0, sizeof(props));
-    HIPCHECK(hipGetDeviceProperties(&props, 0));
-    std::cout << "totalGlobalMem: " << props.totalGlobalMem << std::endl;
-
-    int num = NUM_SIZE;
-    setup(size, num, &A, props.totalGlobalMem);
-    testInit(size[0], A);
-
-    for (int i = 0; i < num; i++) {
-        std::cout << size[i] << std::endl;
-        start = clock();
-        for (int j = 0; j < NUM_ITER; j++) {
-            HIPCHECK(hipMalloc(&Ad[j], size[i]));
-        }
-        end = clock();
-        uS = (end - start) * 1000000. / (NUM_ITER * CLOCKS_PER_SEC);
-        std::cout << "hipMalloc(" << size[i] << ") cost " << uS << "us" << std::endl;
-
-        start = clock();
-        for (int j = 0; j < NUM_ITER; j++) {
-            HIPCHECK(hipMemcpy(Ad[j], A, size[i], hipMemcpyHostToDevice));
-        }
-        hipDeviceSynchronize();
-        end = clock();
-        uS = (end - start) * 1000000. / (NUM_ITER * CLOCKS_PER_SEC);
-        std::cout << "hipMemcpy(" << size[i] << ") cost " << uS << "us" << std::endl;
-
-        start = clock();
-        for (int j = 0; j < NUM_ITER; j++) {
-            HIPCHECK(hipFree(Ad[j]));
-            Ad[j] = nullptr;
-        }
-        end = clock();
-        double uS = (end - start) * 1000000. / (NUM_ITER * CLOCKS_PER_SEC);
-        std::cout << "hipFree(" << size[i] << ") cost " << uS << "us" << std::endl;
-    }
-    free(A);
-    passed();
-}
diff --git a/perftests/memory/hipPerfMemcpy.cpp b/perftests/memory/hipPerfMemcpy.cpp
deleted file mode 100644
index 9751117ec..000000000
--- a/perftests/memory/hipPerfMemcpy.cpp
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
- */
-
-/* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp
- * TEST: %t
- * HIT_END
- */
-
-#include "test_common.h"
-#include <iostream>
-#include <chrono>
-
-#define NUM_SIZE 8
-#define NUM_ITER 0x40000
-
-
-using namespace std;
-
-class hipPerfMemcpy {
-  private:
-    unsigned int numBuffers_;
-    size_t totalSizes_[NUM_SIZE];
-    void setHostBuffer(int *A, int val, size_t size);
-  public:
-    hipPerfMemcpy();
-    ~hipPerfMemcpy() {};
-    void open(int deviceID);
-    void run(unsigned int testNumber);
-};
-
-hipPerfMemcpy::hipPerfMemcpy() : numBuffers_(0) {
-  for (int i = 0; i < NUM_SIZE; i++) {
-    totalSizes_[i] = 1 << (i + 6);
-  }
-};
-
-void hipPerfMemcpy::setHostBuffer(int *A, int val, size_t size) {
-  size_t len = size / sizeof(int);
-  for (int i = 0; i < len; i++) {
-    A[i] = val;
-  }
-}
-
-void hipPerfMemcpy::open(int deviceId) {
-  int nGpu = 0;
-  HIPCHECK(hipGetDeviceCount(&nGpu));
-  if (nGpu < 1) {
-    failed("No GPU!");
-  }
-
-  HIPCHECK(hipSetDevice(deviceId));
-  hipDeviceProp_t props = {0};
-  HIPCHECK(hipGetDeviceProperties(&props, deviceId));
-  std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
-    << " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId  << std::endl;
-}
-
-void hipPerfMemcpy::run(unsigned int testNumber) {
-  int *A, *Ad;
-  A = new int[totalSizes_[testNumber]];
-  setHostBuffer(A, 1, totalSizes_[testNumber]);
-  hipMalloc(&Ad, totalSizes_[testNumber]);
-
-  auto start = chrono::steady_clock::now();
-
-  for (int j = 0; j < NUM_ITER; j++) {
-    hipMemcpy(Ad, A, totalSizes_[testNumber], hipMemcpyHostToDevice);
-  }
-
-  hipDeviceSynchronize();
-
-  auto end = chrono::steady_clock::now();
-  chrono::duration<double, micro> diff = end - start;
-
-  cout << "hipPerfMemcpy[" << testNumber << "] " << "Host to Device copy took "
-      << diff.count() / NUM_ITER << " us for memory size of " << totalSizes_[testNumber]
-      << " Bytes" << endl;
-
-  delete [] A;
-  HIPCHECK(hipFree(Ad));
-
-}
-
-
-int main() {
-  hipPerfMemcpy hipPerfMemcpy;
-
-  int deviceId = 0;
-  hipPerfMemcpy.open(deviceId);
-
-  for (auto testCase = 0; testCase < NUM_SIZE; testCase++) {
-    hipPerfMemcpy.run(testCase);
-  }
-
-  passed();
-
-}
diff --git a/perftests/memory/hipPerfMemset.cpp b/perftests/memory/hipPerfMemset.cpp
deleted file mode 100644
index 2df0c9727..000000000
--- a/perftests/memory/hipPerfMemset.cpp
+++ /dev/null
@@ -1,437 +0,0 @@
-/*
- Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
- */
-
-/* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp
- * TEST: %t
- * HIT_END
- */
-
-#include "test_common.h"
-#include <iostream>
-#include <chrono>
-
-static unsigned int sizeList[] = {
-  256, 512, 1024, 2048, 4096, 8192,
-};
-
-static unsigned int eleNumList[] = {
-    0x100, 0x400, 0x1000, 0x4000, 0x10000, 0x20000, 0x40000, 0x80000, 0x100000,
-    0x200000, 0x400000, 0x800000, 0x1000000
-};
-
-typedef struct _dataType {
-char memsetval = 0x42;
-char memsetD8val = 0xDE;
-int16_t memsetD16val = 0xDEAD;
-int memsetD32val = 0xDEADBEEF;
-}dataType;
-
-#define NUM_ITER 1000
-
-enum MemsetType {
-  hipMemsetTypeDefault,
-  hipMemsetTypeD8,
-  hipMemsetTypeD16,
-  hipMemsetTypeD32,
-  hipMemsetTypeMax
-
-};
-
-using namespace std;
-
-class hipPerfMemset {
-  private:
-    uint64_t     bufSize_;
-    unsigned int num_elements_;
-    unsigned int testNumEle_;
-    unsigned int _numSubTests = 0;
-    unsigned int _numSubTests2D = 0;
-    unsigned int _numSubTests3D = 0;
-    unsigned int num_sizes_ =0;
-
-  public:
-    hipPerfMemset() {
-    num_elements_ = sizeof(eleNumList) / sizeof(unsigned int);
-    _numSubTests = num_elements_ * hipMemsetTypeMax;
-
-    num_sizes_ = sizeof(sizeList) / sizeof(unsigned int);
-    _numSubTests2D = num_sizes_;
-    _numSubTests3D = _numSubTests2D;
-    };
-
-    ~hipPerfMemset() {};
-
-    void open(int deviceID);
-
-    template<typename T>
-    void run1D(unsigned int test, T memsetval, enum MemsetType type, bool async);
-
-    template<typename T>
-    void run2D(unsigned int test, T memsetval, enum MemsetType type, bool async);
-
-    template<typename T>
-    void run3D(unsigned int test, T memsetval, enum MemsetType type, bool async);
-
-    uint getNumTests() {
-      return _numSubTests;
-    }
-
-    uint getNumTests2D() {
-      return _numSubTests2D;
-    }
-    uint getNumTests3D() {
-      return _numSubTests3D;
-    }
-};
-
-
-void hipPerfMemset::open(int deviceId) {
-  int nGpu = 0;
-  HIPCHECK(hipGetDeviceCount(&nGpu));
-  if (nGpu < 1) {
-    failed("No GPU!");
-  }
-
-  HIPCHECK(hipSetDevice(deviceId));
-  hipDeviceProp_t props = {0};
-  HIPCHECK(hipGetDeviceProperties(&props, deviceId));
-  std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
-            << " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId
-            << std::endl;
-}
-
-template<typename T>
-void hipPerfMemset::run1D(unsigned int test, T memsetval, enum MemsetType type, bool async) {
-
-  T * A_h;
-  T * A_d;
-
-  testNumEle_ = eleNumList[test % num_elements_];
-
-  bufSize_ = testNumEle_ * sizeof(uint32_t);
-
-  HIPCHECK(hipMalloc(&A_d, bufSize_));
-
-  A_h = reinterpret_cast<T*> (malloc(bufSize_));
-
-  hipStream_t stream;
-  HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
-
-  // Warm-up
-  if (async) {
-    HIPCHECK(hipMemsetAsync((void *)A_d, memsetval, bufSize_, stream));
-    HIPCHECK(hipStreamSynchronize(stream));
-  } else {
-    HIPCHECK(hipMemset((void *)A_d, memsetval, bufSize_));
-    HIPCHECK(hipDeviceSynchronize());
-  }
-
-  auto start = chrono::high_resolution_clock::now();
-  for (uint i = 0; i < NUM_ITER; i++) {
-    if (type == hipMemsetTypeDefault && !async) {
-      HIPCHECK(hipMemset((void *)A_d, memsetval, bufSize_));
-    }
-    else if (type == hipMemsetTypeDefault && async) {
-      HIPCHECK(hipMemsetAsync(A_d, memsetval, bufSize_, stream));
-    }
-    else if (type == hipMemsetTypeD8 && !async){
-      HIPCHECK(hipMemsetD8((hipDeviceptr_t)A_d, memsetval, bufSize_));
-    }
-    else if (type == hipMemsetTypeD8 && async) {
-      HIPCHECK(hipMemsetD8Async((hipDeviceptr_t)A_d, memsetval, bufSize_, stream));
-    }
-    else if (type == hipMemsetTypeD16 && !async) {
-      HIPCHECK(hipMemsetD16((hipDeviceptr_t)A_d, memsetval, bufSize_/sizeof(T)));
-    }
-    else if (type == hipMemsetTypeD16 && async) {
-      HIPCHECK(hipMemsetD16Async((hipDeviceptr_t)A_d, memsetval, bufSize_/sizeof(T), stream));
-    }
-    else if (type == hipMemsetTypeD32 && !async) {
-      HIPCHECK(hipMemsetD32((hipDeviceptr_t)A_d, memsetval, bufSize_/sizeof(T)));
-    }
-    else if (type == hipMemsetTypeD32 && async) {
-      HIPCHECK(hipMemsetD32Async((hipDeviceptr_t)A_d, memsetval, bufSize_/sizeof(T), stream));
-    }
-  }
-  if (async) {
-    HIPCHECK(hipStreamSynchronize(stream));
-  } else {
-    HIPCHECK(hipDeviceSynchronize());
-  }
-
-  auto end = chrono::high_resolution_clock::now();
-
-  HIPCHECK(hipMemcpy(A_h, A_d, bufSize_, hipMemcpyDeviceToHost) );
-
-  for (int i = 0; i < bufSize_ / sizeof(T); i++) {
-    if (A_h[i] != memsetval) {
-      cout << "mismatch at index " << i << " computed: " << static_cast<int> (A_h[i])
-           << ", memsetval: " << static_cast<int> (memsetval) << endl;
-      break;
-    }
-  }
-
-  HIPCHECK(hipFree(A_d));
-  free(A_h);
-
-  auto diff = std::chrono::duration<double>(end - start);
-  auto sec = diff.count();
-
-  auto perf = static_cast<double>((bufSize_ * NUM_ITER * (double)(1e-09)) / sec);
-
-  cout <<  "[" << setw(2) << test << "] " << setw(5) << bufSize_/1024 << " Kb " << setw(4)
-       << " typeSize " << (int)sizeof(T) << " : " << setw(7) << perf << " GB/s " << endl;
-}
-
-template<typename T>
-void hipPerfMemset::run2D(unsigned int test, T memsetval, enum MemsetType type, bool async) {
-
-  bufSize_ = sizeList[test % num_sizes_];
-
-  size_t numH = bufSize_;
-  size_t numW = bufSize_;
-  size_t pitch_A;
-  size_t width = numW * sizeof(char);
-  size_t sizeElements = width * numH;
-  size_t elements = numW* numH;
-
-  T * A_h;
-  T * A_d;
-
-  HIPCHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d), &pitch_A, width ,
-                          numH));
-  A_h = reinterpret_cast<char*>(malloc(sizeElements));
-
-  for (size_t i=0; i < elements; i++) {
-    A_h[i] = 1;
-  }
-
-  hipStream_t stream;
-  HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
-
-  // Warm-up
-  if (async) {
-    HIPCHECK(hipMemset2DAsync(A_d, pitch_A, memsetval, numW, numH, stream));
-    HIPCHECK(hipStreamSynchronize(stream));
-  } else {
-    HIPCHECK(hipMemset2D(A_d, pitch_A, memsetval, numW, numH));
-    HIPCHECK(hipDeviceSynchronize());
-  }
-
-  auto start = chrono::steady_clock::now();
-
-  for (uint i = 0; i < NUM_ITER; i++) {
-    if (type == hipMemsetTypeDefault && !async) {
-    HIPCHECK(hipMemset2D(A_d, pitch_A, memsetval, numW, numH));
-    }
-    else if (type == hipMemsetTypeDefault && async) {
-      HIPCHECK(hipMemset2DAsync(A_d, pitch_A, memsetval, numW, numH, stream));
-    }
-  }
-
-  if (async) {
-    HIPCHECK(hipStreamSynchronize(stream));
-  } else {
-    HIPCHECK(hipDeviceSynchronize());
-  }
-
-  auto end = chrono::steady_clock::now();
-
-  HIPCHECK(hipMemcpy2D(A_h, width, A_d, pitch_A, numW, numH,
-                       hipMemcpyDeviceToHost));
-
-  for (int i=0; i < elements; i++) {
-    if (A_h[i] != memsetval) {
-      cout << "mismatch at index " << i << " computed: " << static_cast<int> (A_h[i])
-           << ", memsetval: " << static_cast<int> (memsetval) << endl;
-      break;
-    }
-  }
-
-  chrono::duration<double> diff = end - start;
-
-  auto sec = diff.count();
-
-  auto perf = static_cast<double>((sizeElements* NUM_ITER * (double)(1e-09)) / sec);
-
-  cout << " hipPerf2DMemset" << (async ? "Async" : "     ") << "[" << test << "] "
-       << "  " << "(GB/s) for " << setw(5) << bufSize_
-       << " x " << setw(5) << bufSize_ << " bytes : " << setw(7) << perf <<  endl;
-
-  HIPCHECK(hipStreamDestroy(stream));
-  HIPCHECK(hipFree(A_d));
-  free(A_h);
-}
-
-template<typename T>
-void hipPerfMemset::run3D(unsigned int test, T memsetval, enum MemsetType type, bool async) {
-
-    bufSize_ = sizeList[test % num_sizes_];
-
-    size_t numH = bufSize_;
-    size_t numW = bufSize_;
-    size_t depth = 10;
-    size_t width = numW * sizeof(char);
-    size_t sizeElements = width * numH * depth;
-    size_t elements = numW* numH* depth;
-
-    hipStream_t stream;
-    HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
-
-    T *A_h;
-
-    hipExtent extent = make_hipExtent(width, numH, depth);
-    hipPitchedPtr devPitchedPtr;
-
-    HIPCHECK(hipMalloc3D(&devPitchedPtr, extent));
-    A_h = (char*)malloc(sizeElements);
-    HIPASSERT(A_h != NULL);
-
-    for (size_t i=0; i<elements; i++) {
-        A_h[i] = 1;
-    }
-
-  // Warm-up
-  if (async) {
-    HIPCHECK(hipMemset3DAsync( devPitchedPtr, memsetval, extent, stream));
-    HIPCHECK(hipStreamSynchronize(stream));
-  } else {
-    HIPCHECK(hipMemset3D( devPitchedPtr, memsetval, extent));
-    HIPCHECK(hipDeviceSynchronize());
-  }
-   auto start = chrono::steady_clock::now();
-
-   for (uint i = 0; i < NUM_ITER; i++) {
-     if (type == hipMemsetTypeDefault && !async) {
-       HIPCHECK(hipMemset3D( devPitchedPtr, memsetval, extent));
-     }
-     else if (type == hipMemsetTypeDefault && async) {
-       HIPCHECK(hipMemset3DAsync(devPitchedPtr, memsetval, extent, stream));
-     }
-   }
-
-  if (async) {
-    HIPCHECK(hipStreamSynchronize(stream));
-  } else {
-    HIPCHECK(hipDeviceSynchronize());
-  }
-
-  auto end = chrono::steady_clock::now();
-
-  hipMemcpy3DParms myparms = {0};
-  myparms.srcPos = make_hipPos(0,0,0);
-  myparms.dstPos = make_hipPos(0,0,0);
-  myparms.dstPtr = make_hipPitchedPtr(A_h, width , numW, numH);
-  myparms.srcPtr = devPitchedPtr;
-  myparms.extent = extent;
-
-  myparms.kind = hipMemcpyDeviceToHost;
-
-  HIPCHECK(hipMemcpy3D(&myparms));
-
-  for (int i=0; i<elements; i++) {
-    if (A_h[i] != memsetval) {
-      cout << "mismatch at index " << i << " computed: " << static_cast<int> (A_h[i])
-           << ", memsetval: " << static_cast<int> (memsetval) << endl;
-      break;
-      }
-  }
-
-  chrono::duration<double> diff = end - start;
-
-  auto sec = diff.count();
-
-  auto perf = static_cast<double>((sizeElements * NUM_ITER * (double)(1e-09)) / sec);
-
-  cout << " hipPerf3DMemset" << (async ? "Async" : "     ") << "[" << test << "] " << "  "
-       <<  "(GB/s) for " << setw(5) << bufSize_ << " x " << setw(5)
-       << bufSize_  << " x " << depth << " bytes : " << setw(7) << perf <<  endl;
-  HIPCHECK(hipFree(devPitchedPtr.ptr));
-  free(A_h);
-}
-
-int main() {
-  hipPerfMemset hipPerfMemset;
-
-  dataType pattern;
-  int deviceId = 0;
-  hipPerfMemset.open(deviceId);
-  MemsetType type;
-
-  int numTests = hipPerfMemset.getNumTests();
-  int numTests2D = hipPerfMemset.getNumTests2D();
-  int numTests3D = hipPerfMemset.getNumTests3D();
-
-
-  cout << "--------------------- 1D buffer -------------------" << endl;
-  bool async= false;
-  for (uint i = 0; i < 2 ; i++) {
-    cout << endl;
-
-    for (auto testCase = 0; testCase < numTests; testCase++) {
-      if (testCase < sizeof(eleNumList) / sizeof(uint32_t)) {
-        cout << "API: hipMemsetD8" << (async ? "Async " : "      ");
-        hipPerfMemset.run1D(testCase, pattern.memsetval, hipMemsetTypeD8, async);
-      }
-
-      else if (testCase < 2 * sizeof(eleNumList) / sizeof(uint32_t)) {
-        cout << "API: hipMemsetD16" << (async ? "Async" : "     ");
-        hipPerfMemset.run1D(testCase,pattern.memsetD16val, hipMemsetTypeD16, async);
-      }
-
-      else if (testCase < 3 * sizeof(eleNumList) / sizeof(uint32_t)) {
-        cout << "API: hipMemsetD32" << (async ? "Async" : "     ");
-        hipPerfMemset.run1D(testCase,pattern.memsetD32val, hipMemsetTypeD32, async);
-      }
-
-      else {
-        cout << "API: hipMemset" << (async ? "Async   " : "        ");
-        hipPerfMemset.run1D(testCase,pattern.memsetval, hipMemsetTypeDefault, async);
-      }
-    }
-    async = true;
-  }
-
-  cout << endl;
-  cout << "------------------ 2D buffer arrays ---------------" << endl;
-
-  async = false;
-  for (uint i = 0; i < 2; i++) {
-    cout << endl;
-    for (uint test = 0; test < numTests2D; test++) {
-      hipPerfMemset.run2D(test, pattern.memsetval, hipMemsetTypeDefault, async);
-    }
-    async = true;
-  }
-
-  cout << endl;
-  cout << "------------------ 3D buffer arrays ---------------" << endl;
-
-  async = false;
-  for (uint i = 0; i < 2; i++) {
-    cout << endl;
-    for (uint test =0; test < numTests3D; test++) {
-      hipPerfMemset.run3D(test, pattern.memsetval, hipMemsetTypeDefault, async);
-    }
-    async = true;
-  }
-
-  passed();
-}
diff --git a/perftests/memory/hipPerfSampleRate.cpp b/perftests/memory/hipPerfSampleRate.cpp
deleted file mode 100644
index 1ecadfe74..000000000
--- a/perftests/memory/hipPerfSampleRate.cpp
+++ /dev/null
@@ -1,319 +0,0 @@
-/*
- Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
- */
-
-/* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp
- * TEST: %t
- * HIT_END
- */
-
-#include <iostream>
-#include <chrono>
-#include "test_common.h"
-#include <hip/hip_vector_types.h>
-#include <vector>
-
-using namespace std;
-
-#define NUM_TYPES 3
-vector<string> types= {"float", "float2", "float4"};
-vector<unsigned int> typeSizes = {4, 8, 16};
-
-#define NUM_SIZES 12
-vector<unsigned int> sizes = {1,  2,   4,   8,   16,   32,
-                              64, 128, 256, 512, 1024, 2048};
-
-#define NUM_BUFS 6
-#define MAX_BUFS (1 << (NUM_BUFS - 1))
-
-#ifdef __HIP_PLATFORM_NVIDIA__
-inline __host__ __device__ void operator+=(float2 &a, float2 b)
-{
-  a.x += b.x; a.y += b.y;
-}
-
-inline __host__ __device__ void operator+=(float4 &a, float4 b)
-{
-  a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
-}
-#endif
-
-template <typename T>
-__global__ void sampleRate(T * outBuffer, unsigned int inBufSize, unsigned int writeIt,
-                           T **inBuffer, int numBufs) {
-
-  uint gid = (blockIdx.x * blockDim.x + threadIdx.x);
-  uint inputIdx = gid % inBufSize;
-
-  T tmp;
-  memset(&tmp, 0, sizeof(T));
-  for(int i = 0; i < numBufs; i++) {
-    tmp += *(*(inBuffer+i)+inputIdx);
-  }
-
-  if (writeIt*(unsigned int)tmp.x) {
-    outBuffer[gid] = tmp;
-   }
-};
-
-template <typename T>
-__global__ void sampleRateFloat(T * outBuffer, unsigned int inBufSize, unsigned int writeIt,
-                                T ** inBuffer, int numBufs) {
-
-  uint gid = (blockIdx.x * blockDim.x + threadIdx.x);
-  uint inputIdx = gid % inBufSize;
-
-  T tmp = (T)0.0f;
-
-  for(int i = 0; i < numBufs; i++) {
-    tmp += *((*inBuffer+i)+inputIdx);
-  }
-
-  if (writeIt*(unsigned int)tmp) {
-    outBuffer[gid] = tmp;
-  }
-};
-
-class hipPerfSampleRate {
-  public:
-  hipPerfSampleRate();
-  ~hipPerfSampleRate();
-
-  void open(void);
-  void run(unsigned int testCase);
-  void close(void);
-
-  // array of funtion pointers
-  typedef void (hipPerfSampleRate::*funPtr)(void * outBuffer, unsigned int
-    inBufSize, unsigned int writeIt, void **inBuffer, int numBufs, int grids, int blocks,
-    int threads_per_block);
-
-  // Wrappers
-  void float_kernel(void * outBuffer, unsigned int
-    inBufSize, unsigned int writeIt, void **inBuffer, int numBufs, int grids, int blocks,
-    int threads_per_block);
-
-  void float2_kernel(void * outBuffer, unsigned int
-    inBufSize, unsigned int writeIt, void **inBuffer, int numBufs, int grids, int blocks,
-    int threads_per_block);
-
-  void float4_kernel(void * outBuffer, unsigned int
-    inBufSize, unsigned int writeIt, void **inBuffer, int numBufs, int grids, int blocks,
-    int threads_per_block);
-
-  private:
-  void setData(void *ptr, unsigned int value);
-  void checkData(uint *ptr);
-
-  unsigned int width_;
-  unsigned int bufSize_;
-  unsigned long long totalIters = 0;
-  int numCUs;
-
-  unsigned int outBufSize_;
-  static const unsigned int MAX_ITERATIONS = 25;
-  unsigned int numBufs_;
-  unsigned int typeIdx_;
-};
-
-
-hipPerfSampleRate::hipPerfSampleRate() {}
-
-hipPerfSampleRate::~hipPerfSampleRate() {}
-
-void hipPerfSampleRate::open(void) {
-
-  int nGpu = 0;
-  HIPCHECK(hipGetDeviceCount(&nGpu));
-  if (nGpu < 1) {
-    failed("No GPU!");
-  }
-
-  int deviceId = 0;
-  hipDeviceProp_t props = {0};
-  props = {0};
-  HIPCHECK(hipSetDevice(deviceId));
-  HIPCHECK(hipGetDeviceProperties(&props, deviceId));
-  std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
-    << " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId
-    << std::endl;
-  numCUs = props.multiProcessorCount;
-  }
-
-
-void hipPerfSampleRate::close() {
-
-}
-
-
-// Wrappers for the kernel launches
-void hipPerfSampleRate::float_kernel(void * outBuffer, unsigned int inBufSize,
-                                       unsigned int writeIt, void **inBuffer,
-                                       int numBufs, int grids, int blocks, int threads_per_block) {
-
-  hipLaunchKernelGGL(sampleRateFloat<float>, dim3(grids, grids, grids), dim3 (blocks), 0, 0,
-                      (float*)outBuffer, inBufSize, writeIt, (float**)inBuffer, numBufs);
-
-}
-
-void hipPerfSampleRate::float2_kernel(void * outBuffer, unsigned int inBufSize,
-                                       unsigned int writeIt, void **inBuffer,
-                                       int grids, int blocks, int threads_per_block, int numBufs) {
-
-  hipLaunchKernelGGL(sampleRate<float2>, dim3(grids, grids, grids), dim3(blocks), 0, 0,
-                      (float2 *)outBuffer, inBufSize, writeIt, (float2**)inBuffer, numBufs);
-}
-
-void hipPerfSampleRate::float4_kernel(void * outBuffer, unsigned int inBufSize,
-                                       unsigned int writeIt, void **inBuffer,
-                                       int grids, int blocks, int threads_per_block, int numBufs) {
-
-  hipLaunchKernelGGL(sampleRate<float4>, dim3(grids, grids, grids), dim3(blocks), 0, 0,
-                      (float4 *) outBuffer, inBufSize, writeIt, (float4**)inBuffer, numBufs);
-}
-
-void hipPerfSampleRate::run(unsigned int test) {
-
-  funPtr p[] = {&hipPerfSampleRate::float_kernel, &hipPerfSampleRate::float2_kernel,
-               &hipPerfSampleRate::float4_kernel};
-
-  // We compute a square domain
-  width_ = sizes[test % NUM_SIZES];
-  typeIdx_ = (test / NUM_SIZES) % NUM_TYPES;
-  bufSize_ = width_ * width_ * typeSizes[typeIdx_];
-  numBufs_ = (1 << (test / (NUM_SIZES * NUM_TYPES)));
-
-  void *  hOutPtr;
-  void *  dOutPtr;
-  void *  hInPtr[numBufs_];
-  void ** dPtr;
-  void *  dInPtr[numBufs_];
-
- outBufSize_ =
-      sizes[NUM_SIZES - 1] * sizes[NUM_SIZES - 1] * typeSizes[NUM_TYPES - 1];
-
-  // Allocate memory on the host and device
-  HIPCHECK(hipHostMalloc((void **)&hOutPtr, outBufSize_, hipHostMallocDefault));
-  setData((void *)hOutPtr, 0xdeadbeef);
-  HIPCHECK(hipMalloc((uint **)&dOutPtr, outBufSize_));
-
-  // Allocate 2D array in Device
-   hipMalloc((void **)&dPtr, numBufs_* sizeof(void *));
-
-  for (uint i = 0; i < numBufs_; i++) {
-    HIPCHECK(hipHostMalloc((void **)&hInPtr[i], bufSize_, hipHostMallocDefault));
-    HIPCHECK(hipMalloc((uint **)&dInPtr[i], bufSize_));
-    setData(hInPtr[i], 0x3f800000);
-  }
-
-  // Populate array of pointers with array addresses
-  hipMemcpy(dPtr, dInPtr, numBufs_* sizeof(void *), hipMemcpyHostToDevice);
-
-  // Copy memory from host to device
-  for (uint i = 0; i < numBufs_; i++) {
-  HIPCHECK(hipMemcpy(dInPtr[i], hInPtr[i], bufSize_, hipMemcpyHostToDevice));
-  }
-
-  HIPCHECK(hipMemcpy(dOutPtr, hOutPtr, outBufSize_, hipMemcpyHostToDevice));
-
-  // Prepare kernel launch parameters
-  // outBufSize_/sizeof(uint) - Grid size in 3D
-  int grids = 64;
-  int blocks = 64;
-  int threads_per_block  = 1;
-
-  unsigned int maxIter = MAX_ITERATIONS * (MAX_BUFS / numBufs_);
-  unsigned int sizeDW = width_ * width_;
-  unsigned int writeIt = 0;
-
-  int idx = 0;
-
-  if (!types[typeIdx_].compare("float")) {
-    idx = 0;
-  }
-  else if(!types[typeIdx_].compare("float2")) {
-          idx = 1;
-  }
-  else if(!types[typeIdx_].compare("float4")) {
-          idx = 2;
-  }
-
-
-  // Time the kernel execution
-  auto all_start = std::chrono::steady_clock::now();
-  for (uint i = 0; i < maxIter; i++) {
-        (this->*p[idx]) ((void *)dOutPtr, sizeDW, writeIt, dPtr, numBufs_, grids, blocks,
-                          threads_per_block);
-  }
-
-  hipDeviceSynchronize();
-  auto all_end = std::chrono::steady_clock::now();
-  std::chrono::duration<double> all_kernel_time = all_end - all_start;
-
-  double perf = ((double)outBufSize_ * numBufs_ * (double)maxIter * (double)(1e-09)) /
-                          all_kernel_time.count();
-
-  cout << "Domain " << sizes[NUM_SIZES - 1] << "x"<< sizes[NUM_SIZES - 1] << " bufs "
-       << numBufs_ << " " << types[typeIdx_] << " " << width_<<"x"<<width_<< " (GB/s) "
-       << perf << endl;
-
-   HIPCHECK(hipFree(dOutPtr));
-
-   // Free host and device memory
-   for (uint i = 0; i < numBufs_; i++) {
-    HIPCHECK(hipHostFree(hInPtr[i]));
-    HIPCHECK(hipFree(dInPtr[i]));
-    }
-
-   HIPCHECK(hipHostFree(hOutPtr));
-   HIPCHECK(hipFree(dPtr));
-}
-
-
-void hipPerfSampleRate::setData(void *ptr, unsigned int value) {
-  unsigned int *ptr2 = (unsigned int *)ptr;
-  for (unsigned int i = 0; i < bufSize_ / sizeof(unsigned int); i++) {
-      ptr2[i] = value;
-  }
-}
-
-
-void hipPerfSampleRate::checkData(uint *ptr) {
-  for (unsigned int i = 0; i < outBufSize_ / sizeof(float); i++) {
-    if (ptr[i] != (float)numBufs_) {
-      cout << "Data validation failed at "<< i << " Got "<< ptr[i] << ", expected "
-           << (float)numBufs_;
-      break;
-          }
-  }
-}
-
-
-int main(int argc, char* argv[]) {
-  hipPerfSampleRate sampleTypes;
-
-  sampleTypes.open();
-
-  for (unsigned int testCase = 0; testCase < 216 ; testCase+=36) {
-    sampleTypes.run(testCase);
-  }
-
-
-  passed();
-}
diff --git a/perftests/memory/hipPerfSharedMemReadSpeed.cpp b/perftests/memory/hipPerfSharedMemReadSpeed.cpp
deleted file mode 100644
index 539cf4105..000000000
--- a/perftests/memory/hipPerfSharedMemReadSpeed.cpp
+++ /dev/null
@@ -1,250 +0,0 @@
-/*
- Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
- */
-
-/* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp
- * TEST: %t
- * HIT_END
- */
-
-#include <iostream>
-#include <chrono>
-#include "test_common.h"
-
-using namespace std;
-
-#define sharedMemSize1 2048
-#define sharedMemSize2 256
-
-__global__ void sharedMemReadSpeed1(float *outBuf, ulong N) {
-
-  size_t gid = (blockIdx.x * blockDim.x + threadIdx.x);
-  size_t lid = threadIdx.x;
-  __shared__ float local[sharedMemSize1];
-
-  float val1 = 0;
-  float val2 = 0;
-  float val3 = 0;
-  float val4 = 0;
-
-  for (int i = 0; i < (sharedMemSize1 / 64); i++) {
-    local[lid + i * 64] = lid;
-  }
-
-  __syncthreads();
-
-  val1 += local[lid];
-  val2 += local[lid + 64];
-  val3 += local[lid + 128];
-  val4 += local[lid + 192];
-  val1 += local[lid + 256];
-  val2 += local[lid + 320];
-  val3 += local[lid + 384];
-  val4 += local[lid + 448];
-  val1 += local[lid + 512];
-  val2 += local[lid + 576];
-  val3 += local[lid + 640];
-  val4 += local[lid + 704];
-  val1 += local[lid + 768];
-  val2 += local[lid + 832];
-  val3 += local[lid + 896];
-  val4 += local[lid + 960];
-  val1 += local[lid + 1024];
-  val2 += local[lid + 1088];
-  val3 += local[lid + 1152];
-  val4 += local[lid + 1216];
-  val1 += local[lid + 1280];
-  val2 += local[lid + 1344];
-  val3 += local[lid + 1408];
-  val4 += local[lid + 1472];
-  val1 += local[lid + 1536];
-  val2 += local[lid + 1600];
-  val3 += local[lid + 1664];
-  val4 += local[lid + 1728];
-  val1 += local[lid + 1792];
-  val2 += local[lid + 1856];
-  val3 += local[lid + 1920];
-  val4 += local[lid + 1984];
-
-  if (gid < N) {
-    outBuf[gid] = val1 + val2 + val3 + val4;
-  }
-};
-
-__global__ void sharedMemReadSpeed2(float *outBuf, ulong N) {
-  size_t gid = (blockIdx.x * blockDim.x + threadIdx.x);
-  size_t lid = threadIdx.x;
-  __shared__ float local[sharedMemSize2];
-
-  float val0 = 0.0f;
-  float val1 = 0.0f;
-
-  for (int i = 0; i < (sharedMemSize2 / 64); i++) {
-    local[lid + i * 64] = lid;
-  }
-
-  __syncthreads();
-
-#pragma nounroll
-  for (uint i = 0; i < 32; i++) {
-    val0 += local[8 * i + 0];
-    val1 += local[8 * i + 1];
-    val0 += local[8 * i + 2];
-    val1 += local[8 * i + 3];
-    val0 += local[8 * i + 4];
-    val1 += local[8 * i + 5];
-    val0 += local[8 * i + 6];
-    val1 += local[8 * i + 7];
-  }
-
-  if (gid < N) {
-    outBuf[gid] = val0 + val1;
-  }
-};
-
-int main(int argc, char *argv[]) {
-  float *dDst;
-  float *hDst;
-  hipStream_t stream;
-  constexpr uint numSizes = 4;
-  constexpr uint Sizes[numSizes] = {262144, 1048576, 4194304, 16777216};
-  uint numReads1 = 32;
-  uint numReads2 = 256;
-  uint sharedMemSizeBytes1 = sharedMemSize1 * sizeof(float);
-  uint sharedMemSizeBytes2 = sharedMemSize2 * sizeof(float);
-  int nIter = 1000;
-  const unsigned threadsPerBlock = 64;
-
-  int nGpu = 0;
-  HIPCHECK(hipGetDeviceCount(&nGpu));
-  if (nGpu < 1) {
-    cout << "info: didn't find any GPU! skipping the test!\n";
-    passed();
-    return 0;
-  }
-
-  static int device = 0;
-  HIPCHECK(hipSetDevice(device));
-  hipDeviceProp_t props;
-  HIPCHECK(hipGetDeviceProperties(&props, device));
-  cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
-      << " with " << props.multiProcessorCount << " CUs" << endl;
-
-  HIPCHECK(hipStreamCreate(&stream));
-
-  for (int nTest = 0; nTest < numSizes; nTest++) {
-    uint nBytes = Sizes[nTest % numSizes];
-    ulong N = nBytes / sizeof(float);
-    const unsigned blocks = N / threadsPerBlock;
-
-    hDst = new float[nBytes];
-    HIPCHECK(hDst == 0 ? hipErrorOutOfMemory : hipSuccess);
-    memset(hDst, 0, nBytes);
-
-    HIPCHECK(hipMalloc(&dDst, nBytes));
-    HIPCHECK(hipMemcpy(dDst, hDst, nBytes, hipMemcpyHostToDevice));
-
-    hipLaunchKernelGGL(sharedMemReadSpeed1, dim3(blocks), dim3(threadsPerBlock),
-        0, stream, dDst, N);
-    HIPCHECK(hipMemcpy(hDst, dDst, nBytes, hipMemcpyDeviceToHost));
-    hipDeviceSynchronize();
-
-    int tmp = 0;
-    for (int i = 0; i < N; i++) {
-      if (i % threadsPerBlock == 0) {
-        tmp = 0;
-      }
-      if (hDst[i] != tmp) {
-        cout << "info: Data validation failed for warm up run!" << endl;
-        cout << "info: expected " << tmp << " got " << hDst[i] << endl;
-        HIPCHECK (hipErrorUnknown);
-      }
-      tmp += threadsPerBlock / 2;
-    }
-
-    auto all_start = chrono::steady_clock::now();
-    for (int i = 0; i < nIter; i++) {
-      hipLaunchKernelGGL(sharedMemReadSpeed1, dim3(blocks),
-          dim3(threadsPerBlock), 0, stream, dDst, N);
-    }
-    hipDeviceSynchronize();
-
-    auto all_end = chrono::steady_clock::now();
-    chrono::duration<double> all_kernel_time = all_end - all_start;
-
-    // read speed in GB/s
-    double perf = ((double) blocks * threadsPerBlock
-        * (numReads1 * sizeof(float) + sharedMemSizeBytes1 / 64) * nIter
-        * (double) (1e-09)) / all_kernel_time.count();
-
-    cout << "info: read speed = " << setw(8) << perf << " GB/s for "
-        << sharedMemSizeBytes1 / 1024 << " KB shared memory"
-            " with " << setw(8) << blocks * threadsPerBlock << " threads, "
-        << setw(4) << numReads1 << " reads in sharedMemReadSpeed1 kernel" << endl;
-
-    delete[] hDst;
-    hipFree(dDst);
-  }
-
-
-  for (int nTest = 0; nTest < numSizes; nTest++) {
-    uint nBytes = Sizes[nTest % numSizes];
-    ulong N = nBytes / sizeof(float);
-    const unsigned blocks = N / threadsPerBlock;
-
-    hDst = new float[nBytes];
-    HIPCHECK(hDst == 0 ? hipErrorOutOfMemory : hipSuccess);
-    memset(hDst, 0, nBytes);
-
-    HIPCHECK(hipMalloc(&dDst, nBytes));
-    HIPCHECK(hipMemcpy(dDst, hDst, nBytes, hipMemcpyHostToDevice));
-
-    hipLaunchKernelGGL(sharedMemReadSpeed2, dim3(blocks), dim3(threadsPerBlock),
-        0, stream, dDst, N);
-    HIPCHECK(hipMemcpy(hDst, dDst, nBytes, hipMemcpyDeviceToHost));
-    hipDeviceSynchronize();
-
-    auto all_start = chrono::steady_clock::now();
-    for (int i = 0; i < nIter; i++) {
-      hipLaunchKernelGGL(sharedMemReadSpeed2, dim3(blocks),
-          dim3(threadsPerBlock), 0, stream, dDst, N);
-    }
-    hipDeviceSynchronize();
-
-    auto all_end = chrono::steady_clock::now();
-    chrono::duration<double> all_kernel_time = all_end - all_start;
-
-    // read speed in GB/s
-    double perf = ((double) blocks * threadsPerBlock
-        * (numReads2 * sizeof(float) + sharedMemSizeBytes2 / 64) * nIter
-        * (double) (1e-09)) / all_kernel_time.count();
-
-    cout << "info: read speed = " << setw(8) << perf << " GB/s for "
-        << sharedMemSizeBytes2 / 1024 << " KB shared memory"
-            " with " << setw(8) << blocks * threadsPerBlock << " threads, "
-        << setw(4) << numReads2 << " reads in sharedMemReadSpeed2 kernel" << endl;
-
-    delete[] hDst;
-    hipFree(dDst);
-  }
-
-  HIPCHECK(hipStreamDestroy(stream));
-
-  passed();
-}
diff --git a/perftests/stream/hipPerfDeviceConcurrency.cpp b/perftests/stream/hipPerfDeviceConcurrency.cpp
deleted file mode 100644
index 664bdb47e..000000000
--- a/perftests/stream/hipPerfDeviceConcurrency.cpp
+++ /dev/null
@@ -1,284 +0,0 @@
-/*
- Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
- */
-
-/* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp
- * TEST: %t
- * HIT_END
- */
-
-#include <iostream>
-#include <chrono>
-#include "test_common.h"
-
-typedef struct {
-  double x;
-  double y;
-  double width;
-} coordRec;
-
-static coordRec coords[] = {
-    {0.0, 0.0, 0.00001},         // All black
-};
-
-static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
-
-__global__ void mandelbrot(uint *out, uint width, float xPos,  float yPos, float xStep,
-                            float yStep, uint maxIter) {
-
-  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
-  int i = tid % width;
-  int j = tid / width;
-  float x0 = (float)(xPos + xStep*i);
-  float y0 = (float)(yPos + yStep*j);
-
-  float x = x0;
-  float y = y0;
-
-  uint iter = 0;
-  float tmp;
-  for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
-    tmp = x;
-    x = fma(-y,y,fma(x,x,x0));
-    y = fma(2.0f*tmp,y,y0);
-  }
-
-  out[tid] = iter;
-};
-
-class hipPerfDeviceConcurrency {
-  public:
-  hipPerfDeviceConcurrency();
-  ~hipPerfDeviceConcurrency();
-
-  void setNumGpus(unsigned int num) {
-    numDevices = num;
-  }
-  unsigned int getNumGpus() {
-    return numDevices;
-  }
-
-  void open(void);
-  void close(void);
-  void run(unsigned int testCase, int numGpus);
-
-  private:
-  void setData(void *ptr, unsigned int value);
-  void checkData(uint *ptr);
-
-  unsigned int numDevices;
-  unsigned int width_;
-  unsigned int bufSize;
-  unsigned int coordIdx;
-  unsigned long long totalIters = 0;
-};
-
-
-hipPerfDeviceConcurrency::hipPerfDeviceConcurrency() {}
-
-hipPerfDeviceConcurrency::~hipPerfDeviceConcurrency() {}
-
-void hipPerfDeviceConcurrency::open(void) {
-  int nGpu = 0;
-  HIPCHECK(hipGetDeviceCount(&nGpu));
-  setNumGpus(nGpu);
-  if (nGpu < 1) {
-    failed("No GPU!");
-  }
-}
-
-
-void hipPerfDeviceConcurrency::close() {
-}
-
-void hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
-
-
-  static int deviceId;
-  uint * hPtr[numGpus];
-  uint * dPtr[numGpus];
-  hipStream_t streams[numGpus];
-  int numCUs[numGpus];
-  unsigned int maxIter[numGpus];
-  unsigned long long expectedIters[numGpus];
-
-  int threads, threads_per_block, blocks;
-  float xStep, yStep, xPos, yPos;
-
-  for(int i = 0; i < numGpus; i++) {
-
-  if(testCase != 0) {
-    deviceId = i;
-  }
-
-  HIPCHECK(hipSetDevice(deviceId));
-
-  hipDeviceProp_t props = {0};
-  HIPCHECK(hipGetDeviceProperties(&props, i));
-
-  if (testCase != 0) {
-  std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
-            << " with " << props.multiProcessorCount << " CUs" << " and device ID: "
-            << i << std::endl;
-  }
-
-  numCUs[i] = props.multiProcessorCount;
-  int clkFrequency = 0;
-  HIPCHECK(hipDeviceGetAttribute(&clkFrequency, hipDeviceAttributeClockRate, i));
-
-  clkFrequency =(unsigned int)clkFrequency/1000;
-
-  // Maximum iteration count
-  // maxIter = 8388608 * (engine_clock / 1000).serial execution
-  maxIter[i] = (unsigned int)(((8388608 * ((float)clkFrequency / 1000)) * numCUs[i]) / 128);
-  maxIter[i] = (maxIter[i] + 15) & ~15;
-
-  // Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once.
-  width_ = 256;
-
-  bufSize = width_ * width_ * sizeof(uint);
-
-  // Create streams for concurrency
-  HIPCHECK(hipStreamCreate(&streams[i]));
-
-  // Allocate memory on the host and device
-  HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault));
-  setData(hPtr[i], 0xdeadbeef);
-  HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize))
-
-  // Prepare kernel launch parameters
-  threads = (bufSize/sizeof(uint));
-  threads_per_block  = 64;
-  blocks = (threads/threads_per_block) + (threads % threads_per_block);
-
-  coordIdx = testCase % numCoords;
-  xStep = (float)(coords[coordIdx].width / (double)width_);
-  yStep = (float)(-coords[coordIdx].width / (double)width_);
-  xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
-  yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
-
-  // Copy memory from host to device
-  HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice));
-
-  }
-
-  // Time the kernel execution
-  auto all_start = std::chrono::steady_clock::now();
-
-  for(int i = 0; i < numGpus; i++) {
-
-  if(testCase != 0) {
-    deviceId = i;
-  }
-
-  HIPCHECK(hipSetDevice(deviceId));
-
-  hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0, streams[i],
-                      dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter[i]);
-
-  }
-
-  for(int i = 0; i < numGpus; i++) {
-    HIPCHECK(hipStreamSynchronize(0));
-  }
-
-
-  auto all_end = std::chrono::steady_clock::now();
-  std::chrono::duration<double> all_kernel_time = all_end - all_start;
-
-  for(int i = 0; i < numGpus; i++) {
-
-  if(testCase != 0) {
-    deviceId = i;
-  }
-  HIPCHECK(hipSetDevice(deviceId));
-
-  // Copy data back from device to the host
-  HIPCHECK(hipMemcpy(hPtr[i], dPtr[i], bufSize, hipMemcpyDeviceToHost));
-
-  checkData(hPtr[i]);
-  expectedIters[i] = width_ * width_ * (unsigned long long) maxIter[i];
-
-  if (testCase != 0) {
-    checkData(hPtr[i]);
-    if(totalIters != expectedIters[i]) {
-      std::cout << "Incorrect iteration count detected" << std::endl;
-    }
-  }
-
-
-  HIPCHECK(hipStreamDestroy(streams[i]));
-
-  // Free host and device memory
-  HIPCHECK(hipHostFree(hPtr[i]));
-  HIPCHECK(hipFree(dPtr[i]));
-  }
-
-  if (testCase != 0) {
-  std::cout << '\n' << "Measured time for kernel computation on " << numGpus << " device (s): "
-            << all_kernel_time.count() << " (s) " << '\n' << std::endl;
-  }
-
-  if(testCase == 0) {
-    deviceId++;
-  }
-
-
-}
-
-
-void hipPerfDeviceConcurrency::setData(void *ptr, unsigned int value) {
-  unsigned int *ptr2 = (unsigned int *)ptr;
-  for (unsigned int i = 0; i < width_ * width_ ; i++) {
-      ptr2[i] = value;
-  }
-}
-
-
-void hipPerfDeviceConcurrency::checkData(uint *ptr) {
-  totalIters = 0;
-  for (unsigned int i = 0; i < width_ * width_; i++) {
-    totalIters += ptr[i];
-  }
-}
-
-
-int main(int argc, char* argv[]) {
-  hipPerfDeviceConcurrency deviceConcurrency;
-
-  deviceConcurrency.open();
-
-  int nGpu = deviceConcurrency.getNumGpus();
-
-  // testCase = 0 refers to warmup kernel run
-  int testCase = 0;
-
-  for (int i = 0; i < nGpu; i++) {
-    // Warm-up kernel on all devices
-    deviceConcurrency.run(testCase, 1);
-  }
-
-  // Time for kernel on 1 device
-  deviceConcurrency.run(++testCase, 1);
-
-  // Time for kernel on all available devices
-  deviceConcurrency.run(++testCase, nGpu);
-
-  passed();
-}
diff --git a/perftests/stream/hipPerfStreamConcurrency.cpp b/perftests/stream/hipPerfStreamConcurrency.cpp
deleted file mode 100644
index 16e29bc06..000000000
--- a/perftests/stream/hipPerfStreamConcurrency.cpp
+++ /dev/null
@@ -1,432 +0,0 @@
-/*
- Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
- */
-
-/* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp
- * TEST: %t
- * HIT_END
- */
-
-#include <iostream>
-#include <chrono>
-#include "test_common.h"
-#include <hip/hip_vector_types.h>
-
-#ifdef __HIP_PLATFORM_NVIDIA__
-inline __device__ float4 operator*(float s, float4 a)
-{
-  return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);
-}
-inline __device__ float4 operator*(float4 a, float4 b)
-{
-  return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
-}
-inline __device__ float4 operator+(float4 a, float4 b)
-{
-  return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
-}
-inline __device__ float4 operator-(float4 a, float4 b)
-{
-  return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
-}
-#endif
-
-typedef struct {
-  double x;
-  double y;
-  double width;
-} coordRec;
-
-static coordRec coords[] = {
-    {0.0, 0.0, 0.00001},  // All black
-};
-
-static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
-
-__global__ void mandelbrot(uint *out, uint width, float xPos, float yPos,
-         float xStep, float yStep, uint maxIter) {
-  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
-  int i = tid % (width/4);
-  int j = tid / (width/4);
-  int4 veci = make_int4(4*i, 4*i+1, 4*i+2, 4*i+3);
-  int4 vecj = make_int4(j, j, j, j);
-  float4 x0;
-  x0.x = (float)(xPos + xStep*veci.x);
-  x0.y = (float)(xPos + xStep*veci.y);
-  x0.z = (float)(xPos + xStep*veci.z);
-  x0.w = (float)(xPos + xStep*veci.w);
-  float4 y0;
-  y0.x = (float)(yPos + yStep*vecj.x);
-  y0.y = (float)(yPos + yStep*vecj.y);
-  y0.z = (float)(yPos + yStep*vecj.z);
-  y0.w = (float)(yPos + yStep*vecj.w);
-  float4 x = x0;
-  float4 y = y0;
-  uint iter = 0;
-  float4 tmp;
-  int4 stay;
-  int4 ccount = make_int4(0, 0, 0, 0);
-  float4 savx = x;
-  float4 savy = y;
-  stay.x = (x.x*x.x+y.x*y.x) <= (float)(4.0f);
-  stay.y = (x.y*x.y+y.y*y.y) <= (float)(4.0f);
-  stay.z = (x.z*x.z+y.z*y.z) <= (float)(4.0f);
-  stay.w = (x.w*x.w+y.w*y.w) <= (float)(4.0f);
-  for (iter = 0; (stay.x | stay.y | stay.z | stay.w) && (iter < maxIter);
-  iter+=16) {
-    x = savx;
-    y = savy;
-    // Two iterations
-    tmp = x*x + x0 - y*y;
-    y = 2.0f * x * y + y0;
-    x = tmp*tmp + x0 - y*y;
-    y = 2.0f * tmp * y + y0;
-    // Two iterations
-    tmp = x*x + x0 - y*y;
-    y = 2.0f * x * y + y0;
-    x = tmp*tmp + x0 - y*y;
-    y = 2.0f * tmp * y + y0;
-    // Two iterations
-    tmp = x*x + x0 - y*y;
-    y = 2.0f * x * y + y0;
-    x = tmp*tmp + x0 - y*y;
-    y = 2.0f * tmp * y + y0;
-    // Two iterations
-    tmp = x*x + x0 - y*y;
-    y = 2.0f * x * y + y0;
-    x = tmp*tmp + x0 - y*y;
-    y = 2.0f * tmp * y + y0;
-    // Two iterations
-    tmp = x*x + x0 - y*y;
-    y = 2.0f * x * y + y0;
-    x = tmp*tmp + x0 - y*y;
-    y = 2.0f * tmp * y + y0;
-    // Two iterations
-    tmp = x*x + x0 - y*y;
-    y = 2.0f * x * y + y0;
-    x = tmp*tmp + x0 - y*y;
-    y = 2.0f * tmp * y + y0;
-    // Two iterations
-    tmp = x*x + x0 - y*y;
-    y = 2.0f * x * y + y0;
-    x = tmp*tmp + x0 - y*y;
-    y = 2.0f * tmp * y + y0;
-    stay.x = (x.x*x.x+y.x*y.x) <= (float)(4.0f);
-    stay.y = (x.y*x.y+y.y*y.y) <= (float)(4.0f);
-    stay.z = (x.z*x.z+y.z*y.z) <= (float)(4.0f);
-    stay.w = (x.w*x.w+y.w*y.w) <= (float)(4.0f);
-    savx.x = (bool)(stay.x ? x.x : savx.x);
-    savx.y = (bool)(stay.y ? x.y : savx.y);
-    savx.z = (bool)(stay.z ? x.z : savx.z);
-    savx.w = (bool)(stay.w ? x.w : savx.w);
-    savy.x = (bool)(stay.x ? y.x : savy.x);
-    savy.y = (bool)(stay.y ? y.y : savy.y);
-    savy.z = (bool)(stay.z ? y.z : savy.z);
-    savy.w = (bool)(stay.w ? y.w : savy.w);
-    ccount.x -= stay.x*16;
-    ccount.y -= stay.y*16;
-    ccount.z -= stay.z*16;
-    ccount.w -= stay.w*16;
-  }
-  // Handle remainder
-  if (!(stay.x & stay.y & stay.z & stay.w))
-  {
-    iter = 16;
-    do
-    {
-      x = savx;
-      y = savy;
-      stay.x = ((x.x*x.x+y.x*y.x) <= 4.0f) && (ccount.x <  maxIter);
-      stay.y = ((x.y*x.y+y.y*y.y) <= 4.0f) && (ccount.y <  maxIter);
-      stay.z = ((x.z*x.z+y.z*y.z) <= 4.0f) && (ccount.z <  maxIter);
-      stay.w = ((x.w*x.w+y.w*y.w) <= 4.0f) && (ccount.w <  maxIter);
-      tmp = x;
-      x = x*x + x0 - y*y;
-      y = 2.0f*tmp*y + y0;
-      ccount.x += stay.x;
-      ccount.y += stay.y;
-      ccount.z += stay.z;
-      ccount.w += stay.w;
-      iter--;
-      savx.x = (stay.x ? x.x : savx.x);
-      savx.y = (stay.y ? x.y : savx.y);
-      savx.z = (stay.z ? x.z : savx.z);
-      savx.w = (stay.w ? x.w : savx.w);
-      savy.x = (stay.x ? y.x : savy.x);
-      savy.y = (stay.y ? y.y : savy.y);
-      savy.z = (stay.z ? y.z : savy.z);
-      savy.w = (stay.w ? y.w : savy.w);
-    } while ((stay.x | stay.y | stay.z | stay.w) && iter);
-  }
-  uint4 *vecOut = (uint4 *)out;
-  vecOut[tid].x = (uint)(ccount.x);
-  vecOut[tid].y = (uint)(ccount.y);
-  vecOut[tid].z = (uint)(ccount.z);
-  vecOut[tid].w = (uint)(ccount.w);
-}
-
-class hipPerfStreamConcurrency {
-  public:
-  hipPerfStreamConcurrency();
-  ~hipPerfStreamConcurrency();
-
-  void setNumKernels(unsigned int num) {
-    numKernels = num;
-  }
-  void setNumStreams(unsigned int num) {
-    numStreams = num;
-  }
-  unsigned int getNumStreams() {
-    return numStreams;
-  }
-
-  unsigned int getNumKernels() {
-    return numKernels;
-  }
-
-  void open(int deviceID);
-  void run(unsigned int testCase, unsigned int deviceId);
-  void close(void);
-
-  private:
-  void setData(void *ptr, unsigned int value);
-  void checkData(uint *ptr);
-
-  unsigned int numKernels;
-  unsigned int numStreams;
-
-  unsigned int width_;
-  unsigned int bufSize;
-  unsigned int maxIter;
-  unsigned int coordIdx;
-  unsigned long long totalIters;
-  int numCUs;
-
-};
-
-
-hipPerfStreamConcurrency::hipPerfStreamConcurrency() {}
-
-hipPerfStreamConcurrency::~hipPerfStreamConcurrency() {}
-
-void hipPerfStreamConcurrency::open(int deviceId) {
-  int nGpu = 0;
-  HIPCHECK(hipGetDeviceCount(&nGpu));
-  if (nGpu < 1) {
-    failed("No GPU!");
-  }
-
-  HIPCHECK(hipSetDevice(deviceId));
-  hipDeviceProp_t props = {0};
-  HIPCHECK(hipGetDeviceProperties(&props, deviceId));
-  std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
-    << " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId  << std::endl;
-
-  numCUs = props.multiProcessorCount;
-}
-
-
-void hipPerfStreamConcurrency::close() {
-}
-
-
-void hipPerfStreamConcurrency::run(unsigned int testCase,unsigned int deviceId) {
-
-  int clkFrequency = 0;
-  unsigned int numStreams = getNumStreams();
-  unsigned int numKernels = getNumKernels();
-
-  HIPCHECK(hipDeviceGetAttribute(&clkFrequency, hipDeviceAttributeClockRate, deviceId));
-
-  clkFrequency =(unsigned int)clkFrequency/1000;
-
-  // Maximum iteration count
-  // maxIter = 8388608 * (engine_clock / 1000).serial execution
-  maxIter = (unsigned int)(((8388608 * ((float)clkFrequency / 1000)) * numCUs) / 128);
-  maxIter = (maxIter + 15) & ~15;
-
-  hipStream_t streams[numStreams];
-
-  uint * hPtr[numKernels];
-  uint * dPtr[numKernels];
-
-  // Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once.
-  width_ = 256;
-
-  bufSize = width_ * sizeof(uint);
-
-  // Create streams for concurrency
-  for (uint i = 0; i < numStreams; i++) {
-    HIPCHECK(hipStreamCreate(&streams[i]));
-  }
-
-
-  // Allocate memory on the host and device
-  for (uint i = 0; i < numKernels; i++) {
-    HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault));
-    setData(hPtr[i], 0xdeadbeef);
-    HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize))
-  }
-
-
-  // Prepare kernel launch parameters
-  int threads = (bufSize/sizeof(uint));
-  int threads_per_block  = 64;
-  int blocks = (threads/threads_per_block) + (threads % threads_per_block);
-
-  coordIdx = testCase % numCoords;
-  float xStep = (float)(coords[coordIdx].width / (double)width_);
-  float yStep = (float)(-coords[coordIdx].width / (double)width_);
-  float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
-  float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
-
-  // Copy memory asynchronously and concurrently from host to device
-  for (uint i = 0; i < numKernels; i++) {
-    HIPCHECK(hipMemcpyHtoDAsync(reinterpret_cast<hipDeviceptr_t>(dPtr[i]), hPtr[i], bufSize, streams[i % numStreams]));
-  }
-
-
-  // Synchronize to make sure all the copies are completed
-  for(uint i = 0; i < numStreams; i++) {
-    HIPCHECK(hipStreamSynchronize(streams[i]));
-  }
-
-  // Warm-up kernel with lower iteration
-  if (testCase == 0) {
-    maxIter = 256;
-  }
-
-  // Time the kernel execution
-  auto all_start = std::chrono::steady_clock::now();
-
-  for (uint i = 0; i < numKernels; i++) {
-    hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0, streams[i%numStreams],
-                      dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter);
-  }
-
-
-  // Synchronize all the concurrent streans to have completed execution
-  for(uint i = 0; i < numStreams; i++) {
-    HIPCHECK(hipStreamSynchronize(streams[i]));
-  }
-
-
-  auto all_end = std::chrono::steady_clock::now();
-  std::chrono::duration<double> all_kernel_time = all_end - all_start;
-
-  // Copy data back from device to the host
-  for(uint i = 0; i < numKernels; i++) {
-    HIPCHECK(hipMemcpyDtoHAsync(hPtr[i], reinterpret_cast<hipDeviceptr_t>(dPtr[i]), bufSize, streams[i % numStreams]));
-  }
-
-
-  if (testCase != 0) {
-  std::cout <<"Measured time for " << numKernels <<" kernels (s) on " << numStreams <<" stream (s): "
-    << all_kernel_time.count() << std::endl;
-  }
-
-
-  unsigned long long expected =
-    (unsigned long long)width_ * (unsigned long long)maxIter;
-
-  for(uint i = 0 ; i < numStreams; i++) {
-    HIPCHECK(hipStreamDestroy(streams[i]));
-  }
-
-
-  // Free host and device memory
-  for (uint i = 0; i < numKernels; i++) {
-    HIPCHECK(hipHostFree(hPtr[i]));
-    HIPCHECK(hipFree(dPtr[i]));
-  }
-
-
-}
-
-
-void hipPerfStreamConcurrency::setData(void *ptr, unsigned int value) {
-  unsigned int *ptr2 = (unsigned int *)ptr;
-  for (unsigned int i = 0; i < width_ ; i++) {
-      ptr2[i] = value;
-  }
-}
-
-
-void hipPerfStreamConcurrency::checkData(uint *ptr) {
-  totalIters = 0;
-  for (unsigned int i = 0; i < width_; i++) {
-    totalIters += ptr[i];
-  }
-}
-
-
-int main(int argc, char* argv[]) {
-  hipPerfStreamConcurrency streamConcurrency;
-  int deviceId = 0;
-
-  streamConcurrency.open(deviceId);
-
-  for (unsigned int testCase = 0; testCase < 5; testCase++) {
-
-
-  switch (testCase) {
-
-
-  case 0:
-    // Warm-up kernel
-    streamConcurrency.setNumStreams(1);
-    streamConcurrency.setNumKernels(1);
-    break;
-
-  case 1:
-  // default stream executes serially
-  streamConcurrency.setNumStreams(1);
-  streamConcurrency.setNumKernels(1);
-  break;
-
-  case 2:
-    // 2-way concurrency
-    streamConcurrency.setNumStreams(2);
-    streamConcurrency.setNumKernels(2);
-    break;
-
-  case 3:
-    // 4-way concurrency
-    streamConcurrency.setNumStreams(4);
-    streamConcurrency.setNumKernels(4);
-    break;
-
-  case 4:
-    streamConcurrency.setNumStreams(2);
-    streamConcurrency.setNumKernels(4);
-    break;
-
-  case 5:
-    break;
-
-  default:
-    break;
-  }
-  streamConcurrency.run(testCase, deviceId);
-
-  }
-
-
-  passed();
-}
diff --git a/perftests/stream/hipPerfStreamCreateCopyDestroy.cpp b/perftests/stream/hipPerfStreamCreateCopyDestroy.cpp
deleted file mode 100644
index 103f40c7b..000000000
--- a/perftests/stream/hipPerfStreamCreateCopyDestroy.cpp
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
- */
-
-/* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp
- * TEST: %t
- * HIT_END
- */
-
-#include <iostream>
-#include <chrono>
-#include "test_common.h"
-
-using namespace std;
-
-#define BufSize 0x1000
-#define Iterations 0x100
-#define TotalStreams 4
-#define TotalBufs 4
-
-
-class hipPerfStreamCreateCopyDestroy {
-  private:
-    unsigned int numBuffers_;
-    unsigned int numStreams_;
-    const size_t totalStreams_[TotalStreams];
-    const size_t totalBuffers_[TotalBufs];
-  public:
-    hipPerfStreamCreateCopyDestroy() : numBuffers_(0), numStreams_(0),
-                                       totalStreams_{1, 2, 4, 8},
-                                       totalBuffers_{1, 100, 1000, 5000} {};
-    ~hipPerfStreamCreateCopyDestroy() {};
-    void open(int deviceID);
-    void run(unsigned int testNumber);
-};
-
-void hipPerfStreamCreateCopyDestroy::open(int deviceId) {
-  int nGpu = 0;
-  HIPCHECK(hipGetDeviceCount(&nGpu));
-  if (nGpu < 1) {
-    failed("No GPU!");
-  }
-
-  HIPCHECK(hipSetDevice(deviceId));
-  hipDeviceProp_t props = {0};
-  HIPCHECK(hipGetDeviceProperties(&props, deviceId));
-  std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
-    << " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId  << std::endl;
-}
-
-void hipPerfStreamCreateCopyDestroy::run(unsigned int testNumber) {
-  numStreams_ = totalStreams_[testNumber % TotalStreams];
-  size_t iter = Iterations / (numStreams_ * ((size_t)1 << (testNumber / TotalBufs + 1)));
-  hipStream_t streams[numStreams_];
-
-  numBuffers_ = totalBuffers_[testNumber / TotalBufs];
-  float* dSrc[numBuffers_];
-  size_t nBytes = BufSize * sizeof(float);
-
-  for (size_t b = 0; b < numBuffers_; ++b) {
-    HIPCHECK(hipMalloc(&dSrc[b], nBytes));
-  }
-
-  float* hSrc;
-  hSrc = new float[nBytes];
-  HIPCHECK(hSrc == 0 ? hipErrorOutOfMemory : hipSuccess);
-  for (size_t i = 0; i < BufSize; i++) {
-    hSrc[i] = 1.618f + i;
-  }
-
-  auto start = std::chrono::steady_clock::now();
-
-  for (size_t i = 0; i < iter; ++i) {
-    for (size_t s = 0; s < numStreams_; ++s) {
-      HIPCHECK(hipStreamCreate(&streams[s]));
-    }
-
-    for (size_t s = 0; s < numStreams_; ++s) {
-      for (size_t b = 0; b < numBuffers_; ++b) {
-        HIPCHECK(hipMemcpyWithStream(dSrc[b], hSrc, nBytes, hipMemcpyHostToDevice, streams[s]));
-      }
-    }
-
-    for (size_t s = 0; s < numStreams_; ++s) {
-      HIPCHECK(hipStreamDestroy(streams[s]));
-    }
-  }
-
-  auto end = std::chrono::steady_clock::now();
-  std::chrono::duration<double> diff = end - start;
-
-  auto time = static_cast<float>(diff.count() * 1000 / (iter * numStreams_));
-
-  cout << "Create+Copy+Destroy time for " << numStreams_ << " streams and "
-       << setw(4) << numBuffers_ << " buffers " << " and " << setw(4)
-       << iter << " iterations " << time << " (ms) " << endl;
-
-  delete [] hSrc;
-  for (size_t b = 0; b < numBuffers_; ++b) {
-    HIPCHECK(hipFree(dSrc[b]));
-  }
-}
-
-int main(int argc, char* argv[]) {
-  hipPerfStreamCreateCopyDestroy streamCCD;
-
-  int deviceId = 0;
-  streamCCD.open(deviceId);
-
-  for (auto testCase = 0; testCase < TotalStreams * TotalBufs; testCase++) {
-    streamCCD.run(testCase);
-  }
-
-  passed();
-}