diff --git a/include/RAJA/policy/desul/atomic.hpp b/include/RAJA/policy/desul/atomic.hpp index dbcb5e06eb..9fdec5ccc9 100644 --- a/include/RAJA/policy/desul/atomic.hpp +++ b/include/RAJA/policy/desul/atomic.hpp @@ -12,10 +12,9 @@ #if defined(RAJA_ENABLE_DESUL_ATOMICS) -#include "RAJA/util/macros.hpp" - #include "RAJA/policy/atomic_builtin.hpp" - +#include "RAJA/policy/desul/policy.hpp" +#include "RAJA/util/macros.hpp" #include "desul/atomics.hpp" // Default desul options for RAJA @@ -26,153 +25,208 @@ using raja_default_desul_scope = desul::MemoryScopeDevice; namespace RAJA { +namespace detail +{ +template +struct DesulAtomicPolicy { + using memory_order = raja_default_desul_order; + using memory_scope = raja_default_desul_scope; +}; + +template +struct DesulAtomicPolicy> { + using memory_order = OrderingPolicy; + using memory_scope = ScopePolicy; +}; + +} // namespace detail + RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T -atomicAdd(AtomicPolicy, T volatile *acc, T value) { - return desul::atomic_fetch_add(const_cast(acc), +RAJA_HOST_DEVICE RAJA_INLINE T atomicAdd(AtomicPolicy, T volatile *acc, T value) +{ + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return desul::atomic_fetch_add(const_cast(acc), value, - raja_default_desul_order{}, - raja_default_desul_scope{}); + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T -atomicSub(AtomicPolicy, T volatile *acc, T value) { - return desul::atomic_fetch_sub(const_cast(acc), +RAJA_HOST_DEVICE RAJA_INLINE T atomicSub(AtomicPolicy, T volatile *acc, T value) +{ + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return desul::atomic_fetch_sub(const_cast(acc), value, - raja_default_desul_order{}, - raja_default_desul_scope{}); + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T atomicMin(AtomicPolicy, T volatile *acc, T value) +RAJA_HOST_DEVICE RAJA_INLINE T atomicMin(AtomicPolicy, T volatile *acc, T value) { - return desul::atomic_fetch_min(const_cast(acc), + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return desul::atomic_fetch_min(const_cast(acc), value, - raja_default_desul_order{}, - raja_default_desul_scope{}); + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T atomicMax(AtomicPolicy, T volatile *acc, T value) +RAJA_HOST_DEVICE RAJA_INLINE T atomicMax(AtomicPolicy, T volatile *acc, T value) { - return desul::atomic_fetch_max(const_cast(acc), + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return desul::atomic_fetch_max(const_cast(acc), value, - raja_default_desul_order{}, - raja_default_desul_scope{}); + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T atomicInc(AtomicPolicy, T volatile *acc) +RAJA_HOST_DEVICE RAJA_INLINE T atomicInc(AtomicPolicy, T volatile *acc) { - return desul::atomic_fetch_inc(const_cast(acc), - raja_default_desul_order{}, - raja_default_desul_scope{}); + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return 
desul::atomic_fetch_inc(const_cast(acc), + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T atomicInc(AtomicPolicy, T volatile *acc, T val) +RAJA_HOST_DEVICE RAJA_INLINE T atomicInc(AtomicPolicy, T volatile *acc, T val) { + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; // See: // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicinc - return desul::atomic_fetch_inc_mod(const_cast(acc), - val, - raja_default_desul_order{}, - raja_default_desul_scope{}); + return desul::atomic_fetch_inc_mod(const_cast(acc), + val, + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T atomicDec(AtomicPolicy, T volatile *acc) +RAJA_HOST_DEVICE RAJA_INLINE T atomicDec(AtomicPolicy, T volatile *acc) { - return desul::atomic_fetch_dec(const_cast(acc), - raja_default_desul_order{}, - raja_default_desul_scope{}); + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return desul::atomic_fetch_dec(const_cast(acc), + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T atomicDec(AtomicPolicy, T volatile *acc, T val) +RAJA_HOST_DEVICE RAJA_INLINE T atomicDec(AtomicPolicy, T volatile *acc, T val) { + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; // See: // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicdec - return desul::atomic_fetch_dec_mod(const_cast(acc), - val, - raja_default_desul_order{}, - raja_default_desul_scope{}); + return desul::atomic_fetch_dec_mod(const_cast(acc), + val, + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T atomicAnd(AtomicPolicy, T volatile *acc, T value) +RAJA_HOST_DEVICE RAJA_INLINE T atomicAnd(AtomicPolicy, T volatile *acc, T value) { - return desul::atomic_fetch_and(const_cast(acc), + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return desul::atomic_fetch_and(const_cast(acc), value, - raja_default_desul_order{}, - raja_default_desul_scope{}); + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T atomicOr(AtomicPolicy, T volatile *acc, T value) +RAJA_HOST_DEVICE RAJA_INLINE T atomicOr(AtomicPolicy, T volatile *acc, T value) { - return desul::atomic_fetch_or(const_cast(acc), + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return desul::atomic_fetch_or(const_cast(acc), value, - raja_default_desul_order{}, - raja_default_desul_scope{}); + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T atomicXor(AtomicPolicy, T volatile *acc, T value) +RAJA_HOST_DEVICE RAJA_INLINE T atomicXor(AtomicPolicy, T volatile *acc, T value) { - return desul::atomic_fetch_xor(const_cast(acc), + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return desul::atomic_fetch_xor(const_cast(acc), value, - raja_default_desul_order{}, - raja_default_desul_scope{}); + desul_order{}, + desul_scope{}); } 
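Aside (not part of the diff): every hunk in this file follows the same pattern, pulling a memory order and scope out of detail::DesulAtomicPolicy and forwarding them to the corresponding desul call. Because the angle-bracket template arguments were lost when this diff was captured, the standalone sketch below restates the dispatch pattern with placeholder names (ordered_atomic_tag, DesulAtomicPolicySketch, atomicAddSketch) standing in for the RAJA/desul types; it is illustrative only and compiles without desul.

#include <cstdio>

// Placeholder tags standing in for desul's memory-order/scope types and the
// RAJA defaults referenced above (names are illustrative, not desul's).
struct OrderRelaxed {};
struct OrderSeqCst {};
struct ScopeDevice {};
using default_order = OrderRelaxed;
using default_scope = ScopeDevice;

// Ordered-atomic policy tag, analogous in shape to detail_atomic_t below.
template <typename OrderingPolicy, typename ScopePolicy>
struct ordered_atomic_tag {};

// Primary template: any other atomic policy falls back to the defaults.
template <typename AtomicPolicy>
struct DesulAtomicPolicySketch {
  using memory_order = default_order;
  using memory_scope = default_scope;
};

// Specialization: an ordering-aware policy carries its own order and scope.
template <typename OrderingPolicy, typename ScopePolicy>
struct DesulAtomicPolicySketch<ordered_atomic_tag<OrderingPolicy, ScopePolicy>> {
  using memory_order = OrderingPolicy;
  using memory_scope = ScopePolicy;
};

// An atomicAdd-style wrapper deduces the tags from its policy argument,
// mirroring the "using desul_order/desul_scope" lines in the hunks above.
template <typename AtomicPolicy, typename T>
T atomicAddSketch(AtomicPolicy, T volatile *acc, T value)
{
  using order = typename DesulAtomicPolicySketch<AtomicPolicy>::memory_order;
  using scope = typename DesulAtomicPolicySketch<AtomicPolicy>::memory_scope;
  static_cast<void>(order{});
  static_cast<void>(scope{});
  // A real build would forward to
  //   desul::atomic_fetch_add(const_cast<T*>(acc), value, order{}, scope{});
  // here the update is done directly so the sketch runs without desul.
  T old = *acc;
  *acc = old + value;
  return old;
}

struct some_default_policy {};  // any non-ordered policy gets the defaults

int main()
{
  int volatile counter = 0;
  atomicAddSketch(some_default_policy{}, &counter, 1);
  atomicAddSketch(ordered_atomic_tag<OrderSeqCst, ScopeDevice>{}, &counter, 1);
  std::printf("counter = %d\n", static_cast<int>(counter));
  return 0;
}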
 RAJA_SUPPRESS_HD_WARN
 template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicExchange(AtomicPolicy, T volatile *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicExchange(AtomicPolicy,
+                                              T volatile *acc,
+                                              T value)
 {
-  return desul::atomic_exchange(const_cast<T*>(acc),
+  using desul_order =
+      typename detail::DesulAtomicPolicy<AtomicPolicy>::memory_order;
+  using desul_scope =
+      typename detail::DesulAtomicPolicy<AtomicPolicy>::memory_scope;
+  return desul::atomic_exchange(const_cast<T*>(acc),
                                 value,
-                                raja_default_desul_order{},
-                                raja_default_desul_scope{});
+                                desul_order{},
+                                desul_scope{});
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicCAS(AtomicPolicy, T volatile *acc, T compare, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T
+atomicCAS(AtomicPolicy, T volatile *acc, T compare, T value)
 {
-  return desul::atomic_compare_exchange(const_cast<T*>(acc),
-                                        compare,
-                                        value,
-                                        raja_default_desul_order{},
-                                        raja_default_desul_scope{});
+  using desul_order =
+      typename detail::DesulAtomicPolicy<AtomicPolicy>::memory_order;
+  using desul_scope =
+      typename detail::DesulAtomicPolicy<AtomicPolicy>::memory_scope;
+  return desul::atomic_compare_exchange(
+      const_cast<T*>(acc), compare, value, desul_order{}, desul_scope{});
 }
 
 }  // namespace RAJA
 
 #endif  // RAJA_ENABLE_DESUL_ATOMICS
-#endif  // guard
+#endif  // guard
diff --git a/include/RAJA/policy/desul/policy.hpp b/include/RAJA/policy/desul/policy.hpp
new file mode 100644
index 0000000000..b743760592
--- /dev/null
+++ b/include/RAJA/policy/desul/policy.hpp
@@ -0,0 +1,62 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC
+// and RAJA project contributors. See the RAJA/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#ifndef RAJA_policy_desul_HPP
+#define RAJA_policy_desul_HPP
+
+#include "RAJA/config.hpp"
+
+#if defined(RAJA_ENABLE_DESUL_ATOMICS)
+
+#include "desul/atomics.hpp"
+
+namespace RAJA
+{
+
+// Policy to perform an atomic operation with a given memory ordering.
+template +struct detail_atomic_t { +}; + +using atomic_seq_cst = + detail_atomic_t; +using atomic_acq_rel = + detail_atomic_t; +using atomic_acquire = + detail_atomic_t; +using atomic_release = + detail_atomic_t; +using atomic_relaxed = + detail_atomic_t; + +using atomic_seq_cst_block = + detail_atomic_t; +using atomic_acq_rel_block = + detail_atomic_t; +using atomic_acquire_block = + detail_atomic_t; +using atomic_release_block = + detail_atomic_t; +using atomic_relaxed_block = + detail_atomic_t; + +using atomic_seq_cst_sys = + detail_atomic_t; +using atomic_acq_rel_sys = + detail_atomic_t; +using atomic_acquire_sys = + detail_atomic_t; +using atomic_release_sys = + detail_atomic_t; +using atomic_relaxed_sys = + detail_atomic_t; + +} // namespace RAJA + +#endif // RAJA_ENABLE_DESUL_ATOMICS + +#endif diff --git a/test/functional/forall/atomic-basic/CMakeLists.txt b/test/functional/forall/atomic-basic/CMakeLists.txt index 9c2c12d76f..707eabdc78 100644 --- a/test/functional/forall/atomic-basic/CMakeLists.txt +++ b/test/functional/forall/atomic-basic/CMakeLists.txt @@ -28,3 +28,38 @@ foreach( ATOMIC_BACKEND ${FORALL_ATOMIC_BACKENDS} ) target_include_directories(test-forall-atomic-basic-unsigned-${ATOMIC_BACKEND}.exe PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) endforeach() + + +set(ENABLE_LITMUS_TESTS OFF) + +if(RAJA_ENABLE_DESUL_ATOMICS AND RAJA_ENABLE_CUDA) + set(LITMUS_BACKEND "Cuda") + set(ENABLE_LITMUS_TESTS ON) +endif() + +if(RAJA_ENABLE_DESUL_ATOMICS AND RAJA_ENABLE_HIP) + set(LITMUS_BACKEND "Hip") + set(ENABLE_LITMUS_TESTS ON) +endif() + +set(FORALL_LITMUS_TESTS + mp # Message Passing + sb # Store Buffer + lb # Load Buffer + store # Store + read # Read + write2x2 # 2+2 write +) + +if (ENABLE_LITMUS_TESTS) + foreach ( LITMUS_TEST ${FORALL_LITMUS_TESTS} ) + raja_add_test( NAME test-forall-atomic-litmus-${LITMUS_BACKEND}-${LITMUS_TEST} + SOURCES test-forall-atomic-litmus-${LITMUS_TEST}.cpp) + target_include_directories(test-forall-atomic-litmus-${LITMUS_BACKEND}-${LITMUS_TEST}.exe + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) + endforeach() +endif() + +unset(FORALL_LITMUS_TESTS) +unset(ENABLE_LITMUS_TESTS) +unset(LITMUS_BACKEND) diff --git a/test/functional/forall/atomic-basic/test-forall-atomic-litmus-lb.cpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-lb.cpp new file mode 100644 index 0000000000..67a49eb91d --- /dev/null +++ b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-lb.cpp @@ -0,0 +1,235 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-atomic-types.hpp" +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-forall-data.hpp" +#include "RAJA_test-forall-execpol.hpp" +#include "RAJA_test-index-types.hpp" + +#include "test-forall-atomic-litmus-driver.hpp" + +// "Load buffer" litmus test for DESUL ordered atomic +// -------------------------------------------------- +// Initial state: x = 0 && y = 0 +// +// Thread 1: Thread 2: +// ----------- ----------- +// a = load(y) b = load(x) +// store(x, 1) store(y, 1) +// +// Allowed results: +// ---------------- +// Strong behaviors: +// - a = 0, b = 1 +// - a = 1, b = 0 +// - a = 0, b = 0 +// Weak behavior: +// - a = 1, b = 1 + +// Send policy: Relaxed (Weak), Release, AcqRel, SeqCst +// Recv policy: Relaxed (Weak), Acquire, AcqRel, SeqCst +template +struct LoadBufferLitmus { + using DataType = T; + using RelaxedPolicy = RAJA::atomic_relaxed; + constexpr static int PERMUTE_THREAD_FLAG = 97; + size_t m_size; + int m_stride; + T *x; + T *y; + T *a; + T *b; + + int strong_behavior_0{0}; + int strong_behavior_1{0}; + int interleaved_behavior{0}; + int weak_behavior{0}; + + void allocate(camp::resources::Resource work_res, size_t size, int stride) + { + m_size = size; + m_stride = stride; + x = work_res.allocate(size * stride); + y = work_res.allocate(size * stride); + a = work_res.allocate(size * stride); + b = work_res.allocate(size * stride); + } + + void deallocate(camp::resources::Resource work_res) + { + work_res.deallocate(x); + work_res.deallocate(y); + work_res.deallocate(a); + work_res.deallocate(b); + } + + void pre_run(camp::resources::Resource work_res) + { + work_res.memset(x, 0, sizeof(T) * m_size * m_stride); + work_res.memset(y, 0, sizeof(T) * m_size * m_stride); + work_res.memset(a, 0, sizeof(T) * m_size * m_stride); + work_res.memset(b, 0, sizeof(T) * m_size * m_stride); + +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaDeviceSynchronize()); +#endif + +#if defined(RAJA_ENABLE_HIP) + hipErrchk(hipDeviceSynchronize()); +#endif + } + + RAJA_HOST_DEVICE void run(int this_thread, int other_thread, int iter) + { + bool swap = this_thread % 2 == 0; + if (swap) { + load_buffer_1(other_thread, iter); + load_buffer_2(this_thread, iter); + } else { + load_buffer_2(this_thread, iter); + load_buffer_1(other_thread, iter); + } + } + + RAJA_HOST_DEVICE void load_buffer_1(int thread, int iter) + { + int thread_idx = thread * m_stride + iter; + int permute_thread = (thread * PERMUTE_THREAD_FLAG) % m_size; + int permute_idx = permute_thread * m_stride + iter; + a[thread_idx] = RAJA::atomicAdd(&(y[thread_idx]), T{0}); + RAJA::atomicAdd(&(x[permute_idx]), T{1}); + } + + RAJA_HOST_DEVICE void load_buffer_2(int thread, int iter) + { + int thread_idx = thread * m_stride + iter; + int permute_thread = (thread * PERMUTE_THREAD_FLAG) % m_size; + int permute_idx = permute_thread * m_stride + iter; + b[thread_idx] = RAJA::atomicAdd(&(x[permute_idx]), T{0}); + RAJA::atomicAdd(&(y[thread_idx]), T{1}); + } + + void count_results(camp::resources::Resource work_res) + { + +#ifdef RAJA_ENABLE_HIP + using GPUExec = RAJA::hip_exec<256>; + using ReducePolicy = RAJA::hip_reduce; +#endif + +#ifdef RAJA_ENABLE_CUDA + using GPUExec = RAJA::cuda_exec<256>; + using ReducePolicy = RAJA::cuda_reduce; +#endif + RAJA::ReduceSum strong_cnt_0(0); + RAJA::ReduceSum strong_cnt_1(0); + RAJA::ReduceSum 
interleaved_cnt(0); + RAJA::ReduceSum weak_cnt(0); + RAJA::ReduceSum unexpected_cnt(0); + + T *a_local = a; + T *b_local = b; + + auto forall_len = RAJA::TypedRangeSegment(0, m_size * m_stride); + + RAJA::forall(forall_len, [=] RAJA_HOST_DEVICE(int i) { + if (a_local[i] == 1 && b_local[i] == 0) { + // Strong behavior: thread 2 happened before thread 1 + strong_cnt_0 += 1; + } else if (a_local[i] == 0 && b_local[i] == 1) { + // Strong behavior: thread 1 happened before thread 2 + strong_cnt_1 += 1; + } else if (a_local[i] == 0 && b_local[i] == 0) { + // Strong behavior: stores interleaved with receives + interleaved_cnt += 1; + } else if (a_local[i] == 1 && b_local[i] == 1) { + // Weak behavior: stores reordered after receives + weak_cnt += 1; + } else { + unexpected_cnt += 1; + } + }); + + EXPECT_EQ(unexpected_cnt.get(), 0); + + strong_behavior_0 += strong_cnt_0.get(); + strong_behavior_1 += strong_cnt_1.get(); + interleaved_behavior += interleaved_cnt.get(); + weak_behavior += weak_cnt.get(); + } + + void verify() + { + std::cerr << " - Strong behavior (a = 1, b = 0) = " << strong_behavior_0 + << "\n"; + std::cerr << " - Strong behavior (a = 0, b = 1) = " << strong_behavior_1 + << "\n"; + std::cerr << " - Strong behavior (a = 1, b = 1) = " << interleaved_behavior + << "\n"; + std::cerr << " - Weak behaviors = " << weak_behavior << "\n"; + + if (std::is_same::value && + std::is_same::value) { + // In the relaxed case, we should observe some weak behaviors. + // Don't fail the test, but do print out a message. + if (weak_behavior == 0) { + std::cerr << "Warning - no weak behaviors detected in the control case." + << "\nThis litmus test may be insufficient to exercise " + "ordered memory atomics.\n"; + } else { + double overall_behavior_counts = strong_behavior_0 + strong_behavior_1 + + interleaved_behavior + weak_behavior; + std::cerr << "\n Weak behaviors detected in " + << 100 * (weak_behavior / overall_behavior_counts) + << "% of cases.\n"; + } + } else { + // We should not expect any weak behaviors if using a strong ordering. + EXPECT_EQ(weak_behavior, 0); + } + } +}; + +using LBLitmusTestOrderPols = + camp::list, + camp::list, + camp::list, + camp::list >; + +using LBLitmusTestPols = + camp::cartesian_product; + +TYPED_TEST_SUITE_P(ForallAtomicLitmusTest); + +template +class ForallAtomicLitmusTest : public ::testing::Test +{ +}; + +TYPED_TEST_P(ForallAtomicLitmusTest, LoadBufferTest) +{ + using Type = typename camp::at>::type; + using SendRecvPol = typename camp::at>::type; + using SendPol = typename camp::at>::type; + using RecvPol = typename camp::at>::type; + + using LBTest = LoadBufferLitmus; + LitmusTestDriver::run(); +} + +REGISTER_TYPED_TEST_SUITE_P(ForallAtomicLitmusTest, LoadBufferTest); + +using LoadBufferTestTypes = Test::Types; + +INSTANTIATE_TYPED_TEST_SUITE_P(Hip, + ForallAtomicLitmusTest, + LoadBufferTestTypes); diff --git a/test/functional/forall/atomic-basic/test-forall-atomic-litmus-mp.cpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-mp.cpp new file mode 100644 index 0000000000..362686d5c0 --- /dev/null +++ b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-mp.cpp @@ -0,0 +1,242 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-atomic-types.hpp" +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-forall-data.hpp" +#include "RAJA_test-forall-execpol.hpp" +#include "RAJA_test-index-types.hpp" + +#include "test-forall-atomic-litmus-driver.hpp" + +// "Message Passing" litmus test for DESUL ordered atomic +// ------------------------------------------------------ +// Initial state: x = 0 && y = 0 +// +// Thread 1: Thread 2: +// ----------- ----------- +// store(x, 1) a = load(flag) +// store(flag, 1) b = load(x) +// +// Allowed results: +// ---------------- +// Strong behaviors: +// - a = 1, b = 1 +// - a = 0, b = 0 +// - a = 0, b = 1 +// Weak behavior: +// - a = 1, b = 0 +// +// On weak architectures (POWER/ARM/GPUs), the store to "x" can be reordered +// after the store to "flag". Store-release and load-acquire on the "flag" +// variable should prevent observing the weak behavior. + +// Send policy: Relaxed (Weak), Acquire, AcqRel, SeqCst +// Recv policy: Relaxed (Weak), Release, AcqRel, SeqCst +template +struct MessagePassingLitmus { + using DataType = T; + using RelaxedPolicy = RAJA::atomic_relaxed; + constexpr static int PERMUTE_THREAD_FLAG = 97; + size_t m_size; + int m_stride; + T *x; + T *flag; + T *a; + T *b; + + size_t strong_behavior_0{0}; + size_t strong_behavior_1{0}; + size_t interleaved_behavior{0}; + size_t weak_behavior{0}; + + void allocate(camp::resources::Resource work_res, size_t size, int stride) + { + m_size = size; + m_stride = stride; + x = work_res.allocate(size * stride); + flag = work_res.allocate(size * stride); + a = work_res.allocate(size * stride); + b = work_res.allocate(size * stride); + } + + void deallocate(camp::resources::Resource work_res) + { + work_res.deallocate(x); + work_res.deallocate(flag); + work_res.deallocate(a); + work_res.deallocate(b); + } + + void pre_run(camp::resources::Resource work_res) + { + work_res.memset(x, 0, sizeof(T) * m_size * m_stride); + work_res.memset(flag, 0, sizeof(T) * m_size * m_stride); + work_res.memset(a, 0, sizeof(T) * m_size * m_stride); + work_res.memset(b, 0, sizeof(T) * m_size * m_stride); + +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaDeviceSynchronize()); +#endif + +#if defined(RAJA_ENABLE_HIP) + hipErrchk(hipDeviceSynchronize()); +#endif + } + + RAJA_HOST_DEVICE void run(int this_thread, int other_thread, int iter) + { + bool send_first = (this_thread % 2 == 0); + // Send action + if (send_first) { + this->run_send(other_thread, iter); + this->run_recv(this_thread, iter); + } else { + this->run_recv(this_thread, iter); + this->run_send(other_thread, iter); + } + } + + RAJA_HOST_DEVICE void run_send(int other_thread, int iter) + { + int other_thread_idx = other_thread * m_stride + iter; + int permute_other_thread = (other_thread * PERMUTE_THREAD_FLAG) % m_size; + int permute_idx = permute_other_thread * m_stride + iter; + RAJA::atomicAdd(&(x[other_thread_idx]), T{1}); + RAJA::atomicAdd(&(flag[permute_idx]), T{1}); + } + + RAJA_HOST_DEVICE void run_recv(int this_thread, int iter) + { + int this_thread_idx = this_thread * m_stride + iter; + int permute_this_thread = (this_thread * PERMUTE_THREAD_FLAG) % m_size; + int permute_idx = permute_this_thread * m_stride + iter; + a[this_thread_idx] = + RAJA::atomicAdd(&(flag[permute_idx]), T{0}); + b[this_thread_idx] = + RAJA::atomicAdd(&(x[this_thread_idx]), T{0}); + } + + void 
count_results(camp::resources::Resource work_res) + { + +#ifdef RAJA_ENABLE_HIP + using GPUExec = RAJA::hip_exec<256>; + using ReducePolicy = RAJA::hip_reduce; +#endif + +#ifdef RAJA_ENABLE_CUDA + using GPUExec = RAJA::cuda_exec<256>; + using ReducePolicy = RAJA::cuda_reduce; +#endif + RAJA::ReduceSum strong_cnt_0(0); + RAJA::ReduceSum strong_cnt_1(0); + RAJA::ReduceSum interleaved_cnt(0); + RAJA::ReduceSum weak_cnt(0); + RAJA::ReduceSum unexpected_cnt(0); + + T *a_local = a; + T *b_local = b; + + auto forall_len = RAJA::TypedRangeSegment(0, m_size * m_stride); + + RAJA::forall(forall_len, [=] RAJA_HOST_DEVICE(int i) { + if (a_local[i] == 0 && b_local[i] == 0) { + // Strong behavior: neither store from test_send is observable + strong_cnt_0 += 1; + } else if (a_local[i] == 1 && b_local[i] == 1) { + // Strong behavior: both stores from test_send are observable + strong_cnt_1 += 1; + } else if (a_local[i] == 0 && b_local[i] == 1) { + // Strong behavior: stores interleaved with receives + interleaved_cnt += 1; + } else if (a_local[i] == 1 && b_local[i] == 0) { + // Weak behavior: second store observed before first store + weak_cnt += 1; + } else { + unexpected_cnt += 1; + } + }); + + EXPECT_EQ(unexpected_cnt.get(), 0); + + strong_behavior_0 += strong_cnt_0.get(); + strong_behavior_1 += strong_cnt_1.get(); + interleaved_behavior += interleaved_cnt.get(); + weak_behavior += weak_cnt.get(); + } + + void verify() + { + std::cerr << " - Strong behavior (a = 0, b = 0) = " << strong_behavior_0 + << "\n"; + std::cerr << " - Strong behavior (a = 1, b = 1) = " << strong_behavior_1 + << "\n"; + std::cerr << " - Strong behavior (a = 0, b = 1) = " << interleaved_behavior + << "\n"; + std::cerr << " - Weak behaviors = " << weak_behavior << "\n"; + + if (std::is_same::value && + std::is_same::value) { + // In the relaxed case, we should observe some weak behaviors. + // Don't fail the test, but do print out a message. + if (weak_behavior == 0) { + std::cerr << "Warning - no weak behaviors detected in the control case." + << "\nThis litmus test may be insufficient to exercise " + "ordered memory atomics.\n"; + } else { + double overall_behavior_counts = strong_behavior_0 + strong_behavior_1 + + interleaved_behavior + weak_behavior; + std::cerr << "\n Weak behaviors detected in " + << 100 * (weak_behavior / overall_behavior_counts) + << "% of cases.\n"; + } + } else { + // We should not expect any weak behaviors if using a strong ordering. 
+ EXPECT_EQ(weak_behavior, 0); + } + } +}; + +using MPLitmusTestOrderPols = + camp::list, + camp::list, + camp::list, + camp::list >; + +using MPLitmusTestPols = + camp::cartesian_product; + +TYPED_TEST_SUITE_P(ForallAtomicLitmusTest); + +template +class ForallAtomicLitmusTest : public ::testing::Test +{ +}; + +TYPED_TEST_P(ForallAtomicLitmusTest, MessagePassingTest) +{ + using Type = typename camp::at>::type; + using SendRecvPol = typename camp::at>::type; + using SendPol = typename camp::at>::type; + using RecvPol = typename camp::at>::type; + + using MPTest = MessagePassingLitmus; + LitmusTestDriver::run(); +} + +REGISTER_TYPED_TEST_SUITE_P(ForallAtomicLitmusTest, MessagePassingTest); + +using MessagePassingTestTypes = Test::Types; + +INSTANTIATE_TYPED_TEST_SUITE_P(Hip, + ForallAtomicLitmusTest, + MessagePassingTestTypes); diff --git a/test/functional/forall/atomic-basic/test-forall-atomic-litmus-read.cpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-read.cpp new file mode 100644 index 0000000000..16c870e4b8 --- /dev/null +++ b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-read.cpp @@ -0,0 +1,219 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-atomic-types.hpp" +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-forall-data.hpp" +#include "RAJA_test-forall-execpol.hpp" +#include "RAJA_test-index-types.hpp" + +#include "test-forall-atomic-litmus-driver.hpp" + +// "Read" litmus test for DESUL ordered atomic +// --------------------------------------------------- +// Initial state: x = 0 && y = 0 +// +// Thread 1: Thread 2: +// ----------- ----------- +// store(x, 1) store(y, 2) +// store(y, 1) a = load(x) +// +// Allowed results: +// ---------------- +// Strong behaviors: +// - a = 1, x = 2 +// - a = 1, x = 1 +// - a = 0, x = 1 +// Weak behavior: +// - a = 0, x = 2 + +// Atomic policy: Relaxed (Weak), SeqCst +template +struct ReadLitmus { + using DataType = T; + using RelaxedPolicy = RAJA::atomic_relaxed; + size_t m_size; + int m_stride; + T *x; + T *y; + T *a; + + int strong_behavior_0{0}; + int strong_behavior_1{0}; + int interleaved_behavior{0}; + int weak_behavior{0}; + + void allocate(camp::resources::Resource work_res, size_t size, int stride) + { + m_size = size; + m_stride = stride; + x = work_res.allocate(size * stride); + y = work_res.allocate(size * stride); + a = work_res.allocate(size * stride); + } + + void deallocate(camp::resources::Resource work_res) + { + work_res.deallocate(x); + work_res.deallocate(y); + work_res.deallocate(a); + } + + void pre_run(camp::resources::Resource work_res) + { + work_res.memset(x, 0, sizeof(T) * m_size * m_stride); + work_res.memset(y, 0, sizeof(T) * m_size * m_stride); + work_res.memset(a, 0, sizeof(T) * m_size * m_stride); + +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaDeviceSynchronize()); +#endif + +#if defined(RAJA_ENABLE_HIP) + hipErrchk(hipDeviceSynchronize()); +#endif + } + + RAJA_HOST_DEVICE void run(int this_thread, int other_thread, int iter) + { + bool swap = this_thread % 2 == 0; + if (swap) { + read_1(other_thread, iter); + read_2(this_thread, iter); + } else { + read_2(this_thread, iter); + 
read_1(other_thread, iter); + } + } + + RAJA_HOST_DEVICE void read_1(int thread, int iter) + { + int thread_idx = thread * m_stride + iter; + RAJA::atomicExchange(&(x[thread_idx]), T{1}); + RAJA::atomicExchange(&(y[thread_idx]), T{1}); + } + + RAJA_HOST_DEVICE void read_2(int thread, int iter) + { + int thread_idx = thread * m_stride + iter; + RAJA::atomicExchange(&(y[thread_idx]), T{2}); + a[thread_idx] = RAJA::atomicAdd(&(x[thread_idx]), T{0}); + } + + void count_results(camp::resources::Resource work_res) + { + +#ifdef RAJA_ENABLE_HIP + using GPUExec = RAJA::hip_exec<256>; + using ReducePolicy = RAJA::hip_reduce; +#endif + +#ifdef RAJA_ENABLE_CUDA + using GPUExec = RAJA::cuda_exec<256>; + using ReducePolicy = RAJA::cuda_reduce; +#endif + RAJA::ReduceSum strong_cnt_0(0); + RAJA::ReduceSum strong_cnt_1(0); + RAJA::ReduceSum interleaved_cnt(0); + RAJA::ReduceSum weak_cnt(0); + RAJA::ReduceSum unexpected_cnt(0); + + T *a_local = a; + T *y_local = y; + + auto forall_len = RAJA::TypedRangeSegment(0, m_size * m_stride); + + RAJA::forall(forall_len, [=] RAJA_HOST_DEVICE(int i) { + if (a_local[i] == 1 && y_local[i] == 2) { + // Strong behavior: thread 1 happened before thread 2 + strong_cnt_0 += 1; + } else if (a_local[i] == 0 && y_local[i] == 1) { + // Strong behavior: thread 2 happened before thread 1 + strong_cnt_1 += 1; + } else if (a_local[i] == 1 && y_local[i] == 1) { + // Strong behavior: stores interleaved with receives + interleaved_cnt += 1; + } else if (a_local[i] == 0 && y_local[i] == 2) { + // Weak behavior: receives reordered after stores + weak_cnt += 1; + } else { + unexpected_cnt += 1; + } + }); + + EXPECT_EQ(unexpected_cnt.get(), 0); + + strong_behavior_0 += strong_cnt_0.get(); + strong_behavior_1 += strong_cnt_1.get(); + interleaved_behavior += interleaved_cnt.get(); + weak_behavior += weak_cnt.get(); + } + + void verify() + { + std::cerr << " - Strong behavior (a = 1, b = 0) = " << strong_behavior_0 + << "\n"; + std::cerr << " - Strong behavior (a = 0, b = 1) = " << strong_behavior_1 + << "\n"; + std::cerr << " - Strong behavior (a = 1, b = 1) = " << interleaved_behavior + << "\n"; + std::cerr << " - Weak behaviors = " << weak_behavior << "\n"; + + if (std::is_same::value) { + // In the relaxed case, we should observe some weak behaviors. + // Don't fail the test, but do print out a message. + if (weak_behavior == 0) { + std::cerr << "Warning - no weak behaviors detected in the control case." + << "\nThis litmus test may be insufficient to exercise " + "ordered memory atomics.\n"; + } else { + double overall_behavior_counts = strong_behavior_0 + strong_behavior_1 + + interleaved_behavior + weak_behavior; + std::cerr << "\n Weak behaviors detected in " + << 100 * (weak_behavior / overall_behavior_counts) + << "% of cases.\n"; + } + } else { + // We should not expect any weak behaviors if using a strong ordering. 
+ EXPECT_EQ(weak_behavior, 0); + } + } +}; + +using ReadLitmusTestOrderPols = + camp::list; + +using ReadLitmusTestPols = + camp::cartesian_product; + +TYPED_TEST_SUITE_P(ForallAtomicLitmusTest); + +template +class ForallAtomicLitmusTest : public ::testing::Test +{ +}; + +TYPED_TEST_P(ForallAtomicLitmusTest, ReadTest) +{ + using Type = typename camp::at>::type; + using AtomicPol = typename camp::at>::type; + + using ReadTest = ReadLitmus; + LitmusTestDriver::run(); +} + +REGISTER_TYPED_TEST_SUITE_P(ForallAtomicLitmusTest, ReadTest); + +using ReadTestTypes = Test::Types; + +INSTANTIATE_TYPED_TEST_SUITE_P(Hip, + ForallAtomicLitmusTest, + ReadTestTypes); diff --git a/test/functional/forall/atomic-basic/test-forall-atomic-litmus-sb.cpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-sb.cpp new file mode 100644 index 0000000000..666c1470ea --- /dev/null +++ b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-sb.cpp @@ -0,0 +1,228 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-atomic-types.hpp" +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-forall-data.hpp" +#include "RAJA_test-forall-execpol.hpp" +#include "RAJA_test-index-types.hpp" + +#include "test-forall-atomic-litmus-driver.hpp" + +// "Store buffer" litmus test for DESUL ordered atomic +// --------------------------------------------------- +// Initial state: x = 0 && y = 0 +// +// Thread 1: Thread 2: +// ----------- ----------- +// store(x, 1) store(y, 1) +// a = load(y) b = load(x) +// +// Allowed results: +// ---------------- +// Strong behaviors: +// - a = 1, b = 1 +// - a = 0, b = 1 +// - a = 1, b = 0 +// Weak behavior: +// - a = 0, b = 0 +// +// Acquire-release semantics are not enough to disallow the stores to be +// reordered after the load -- full sequential consistency is required in order +// to impose a "single total order" of the stores. 
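Aside (not part of this test file): the comment above notes that acquire/release alone cannot rule out the store-buffer weak outcome and that full sequential consistency is needed to impose a single total order on the stores. The same pattern can be reproduced on the host with std::atomic; a minimal sketch (assuming a -pthread build) is below, where the a == 0 && b == 0 outcome is forbidden under std::memory_order_seq_cst but permitted if order is switched to std::memory_order_relaxed.

#include <atomic>
#include <cstdio>
#include <thread>

// Host-side sketch of the store-buffer litmus pattern using std::atomic.
int main()
{
  int weak = 0;
  constexpr int iters = 10000;
  constexpr auto order = std::memory_order_seq_cst;  // try memory_order_relaxed

  for (int i = 0; i < iters; ++i) {
    std::atomic<int> x{0}, y{0};
    int a = -1, b = -1;

    std::thread t1([&] {
      x.store(1, order);   // store(x, 1)
      a = y.load(order);   // a = load(y)
    });
    std::thread t2([&] {
      y.store(1, order);   // store(y, 1)
      b = x.load(order);   // b = load(x)
    });
    t1.join();
    t2.join();

    if (a == 0 && b == 0) { ++weak; }  // store-buffer weak behavior
  }
  std::printf("weak behaviors: %d / %d\n", weak, iters);
  return 0;
}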
+ +// Send policy: Relaxed, SeqCst (Strong) +// Recv policy: Relaxed, SeqCst (Strong) +template +struct StoreBufferLitmus { + using DataType = T; + using RelaxedPolicy = RAJA::atomic_relaxed; + size_t m_size; + int m_stride; + T *x; + T *y; + T *a; + T *b; + + int strong_behavior_0{0}; + int strong_behavior_1{0}; + int interleaved_behavior{0}; + int weak_behavior{0}; + + void allocate(camp::resources::Resource work_res, size_t size, int stride) + { + m_size = size; + m_stride = stride; + x = work_res.allocate(size * stride); + y = work_res.allocate(size * stride); + a = work_res.allocate(size * stride); + b = work_res.allocate(size * stride); + } + + void deallocate(camp::resources::Resource work_res) + { + work_res.deallocate(x); + work_res.deallocate(y); + work_res.deallocate(a); + work_res.deallocate(b); + } + + void pre_run(camp::resources::Resource work_res) + { + work_res.memset(x, 0, sizeof(T) * m_size * m_stride); + work_res.memset(y, 0, sizeof(T) * m_size * m_stride); + work_res.memset(a, 0, sizeof(T) * m_size * m_stride); + work_res.memset(b, 0, sizeof(T) * m_size * m_stride); + +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaDeviceSynchronize()); +#endif + +#if defined(RAJA_ENABLE_HIP) + hipErrchk(hipDeviceSynchronize()); +#endif + } + + RAJA_HOST_DEVICE void run(int this_thread, int other_thread, int iter) + { + bool swap = this_thread % 2 == 0; + if (swap) { + store_buffer_1(other_thread, iter); + store_buffer_2(this_thread, iter); + } else { + store_buffer_2(this_thread, iter); + store_buffer_1(other_thread, iter); + } + } + + RAJA_HOST_DEVICE void store_buffer_1(int thread, int iter) + { + int thread_idx = thread * m_stride + iter; + RAJA::atomicAdd(&(x[thread_idx]), T{1}); + a[thread_idx] = RAJA::atomicAdd(&(y[thread_idx]), T{0}); + } + + RAJA_HOST_DEVICE void store_buffer_2(int thread, int iter) + { + int thread_idx = thread * m_stride + iter; + RAJA::atomicAdd(&(y[thread_idx]), T{1}); + b[thread_idx] = RAJA::atomicAdd(&(x[thread_idx]), T{0}); + } + + void count_results(camp::resources::Resource work_res) + { + +#ifdef RAJA_ENABLE_HIP + using GPUExec = RAJA::hip_exec<256>; + using ReducePolicy = RAJA::hip_reduce; +#endif + +#ifdef RAJA_ENABLE_CUDA + using GPUExec = RAJA::cuda_exec<256>; + using ReducePolicy = RAJA::cuda_reduce; +#endif + RAJA::ReduceSum strong_cnt_0(0); + RAJA::ReduceSum strong_cnt_1(0); + RAJA::ReduceSum interleaved_cnt(0); + RAJA::ReduceSum weak_cnt(0); + RAJA::ReduceSum unexpected_cnt(0); + + T *a_local = a; + T *b_local = b; + + auto forall_len = RAJA::TypedRangeSegment(0, m_size * m_stride); + + RAJA::forall(forall_len, [=] RAJA_HOST_DEVICE(int i) { + if (a_local[i] == 1 && b_local[i] == 0) { + // Strong behavior: thread 1 happened before thread 2 + strong_cnt_0 += 1; + } else if (a_local[i] == 0 && b_local[i] == 1) { + // Strong behavior: thread 2 happened before thread 1 + strong_cnt_1 += 1; + } else if (a_local[i] == 1 && b_local[i] == 1) { + // Strong behavior: stores interleaved with receives + interleaved_cnt += 1; + } else if (a_local[i] == 0 && b_local[i] == 0) { + // Weak behavior: stores reordered after receives + weak_cnt += 1; + } else { + unexpected_cnt += 1; + } + }); + + EXPECT_EQ(unexpected_cnt.get(), 0); + + strong_behavior_0 += strong_cnt_0.get(); + strong_behavior_1 += strong_cnt_1.get(); + interleaved_behavior += interleaved_cnt.get(); + weak_behavior += weak_cnt.get(); + } + + void verify() + { + std::cerr << " - Strong behavior (a = 1, b = 0) = " << strong_behavior_0 + << "\n"; + std::cerr << " - Strong behavior (a = 0, b = 1) = 
" << strong_behavior_1 + << "\n"; + std::cerr << " - Strong behavior (a = 1, b = 1) = " << interleaved_behavior + << "\n"; + std::cerr << " - Weak behaviors = " << weak_behavior << "\n"; + + if (std::is_same::value) { + // In the relaxed case, we should observe some weak behaviors. + // Don't fail the test, but do print out a message. + if (weak_behavior == 0) { + std::cerr << "Warning - no weak behaviors detected in the control case." + << "\nThis litmus test may be insufficient to exercise " + "ordered memory atomics.\n"; + } else { + double overall_behavior_counts = strong_behavior_0 + strong_behavior_1 + + interleaved_behavior + weak_behavior; + std::cerr << "\n Weak behaviors detected in " + << 100 * (weak_behavior / overall_behavior_counts) + << "% of cases.\n"; + } + } else { + // We should not expect any weak behaviors if using a strong ordering. + EXPECT_EQ(weak_behavior, 0); + } + } +}; + +using SBLitmusTestOrderPols = + camp::list; + +using SBLitmusTestPols = + camp::cartesian_product; + +TYPED_TEST_SUITE_P(ForallAtomicLitmusTest); + +template +class ForallAtomicLitmusTest : public ::testing::Test +{ +}; + +TYPED_TEST_P(ForallAtomicLitmusTest, StoreBufferTest) +{ + using Type = typename camp::at>::type; + using AtomicPol = typename camp::at>::type; + + using SBTest = StoreBufferLitmus; + LitmusTestDriver::run(); +} + +REGISTER_TYPED_TEST_SUITE_P(ForallAtomicLitmusTest, StoreBufferTest); + +using StoreBufferTestTypes = Test::Types; + +INSTANTIATE_TYPED_TEST_SUITE_P(Hip, + ForallAtomicLitmusTest, + StoreBufferTestTypes); diff --git a/test/functional/forall/atomic-basic/test-forall-atomic-litmus-store.cpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-store.cpp new file mode 100644 index 0000000000..10fd9edeb8 --- /dev/null +++ b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-store.cpp @@ -0,0 +1,226 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-atomic-types.hpp" +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-forall-data.hpp" +#include "RAJA_test-forall-execpol.hpp" +#include "RAJA_test-index-types.hpp" + +#include "test-forall-atomic-litmus-driver.hpp" + +// "Store" litmus test for DESUL ordered atomic +// --------------------------------------------------- +// Initial state: x = 0 && y = 0 +// +// Thread 1: Thread 2: +// ----------- ----------- +// store(x, 2) a = load(y) +// store(y, 1) store(x, 1) +// +// Allowed results: +// ---------------- +// Strong behaviors: +// - a = 1, x = 1 +// - a = 0, x = 2 +// - a = 0, x = 1 +// Weak behavior: +// - a = 1, x = 2 + +// Send policy: Relaxed (Weak), Acquire, AcqRel, SeqCst +// Recv policy: Relaxed (Weak), Release, AcqRel, SeqCst +template +struct StoreLitmus { + using DataType = T; + using RelaxedPolicy = RAJA::atomic_relaxed; + size_t m_size; + int m_stride; + T *x; + T *y; + T *a; + + int strong_behavior_0{0}; + int strong_behavior_1{0}; + int interleaved_behavior{0}; + int weak_behavior{0}; + + void allocate(camp::resources::Resource work_res, size_t size, int stride) + { + m_size = size; + m_stride = stride; + x = work_res.allocate(size * stride); + y = work_res.allocate(size * stride); + a = work_res.allocate(size * stride); + } + + void deallocate(camp::resources::Resource work_res) + { + work_res.deallocate(x); + work_res.deallocate(y); + work_res.deallocate(a); + } + + void pre_run(camp::resources::Resource work_res) + { + work_res.memset(x, 0, sizeof(T) * m_size * m_stride); + work_res.memset(y, 0, sizeof(T) * m_size * m_stride); + work_res.memset(a, 0, sizeof(T) * m_size * m_stride); + +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaDeviceSynchronize()); +#endif + +#if defined(RAJA_ENABLE_HIP) + hipErrchk(hipDeviceSynchronize()); +#endif + } + + RAJA_HOST_DEVICE void run(int this_thread, int other_thread, int iter) + { + bool swap = this_thread % 2 == 0; + if (swap) { + store_1(other_thread, iter); + store_2(this_thread, iter); + } else { + store_2(this_thread, iter); + store_1(other_thread, iter); + } + } + + RAJA_HOST_DEVICE void store_1(int thread, int iter) + { + int thread_idx = thread * m_stride + iter; + RAJA::atomicExchange(&(x[thread_idx]), T{2}); + RAJA::atomicAdd(&(y[thread_idx]), T{1}); + } + + RAJA_HOST_DEVICE void store_2(int thread, int iter) + { + int thread_idx = thread * m_stride + iter; + a[thread_idx] = RAJA::atomicAdd(&(y[thread_idx]), T{0}); + RAJA::atomicExchange(&(x[thread_idx]), T{1}); + } + + void count_results(camp::resources::Resource work_res) + { + +#ifdef RAJA_ENABLE_HIP + using GPUExec = RAJA::hip_exec<256>; + using ReducePolicy = RAJA::hip_reduce; +#endif + +#ifdef RAJA_ENABLE_CUDA + using GPUExec = RAJA::cuda_exec<256>; + using ReducePolicy = RAJA::cuda_reduce; +#endif + RAJA::ReduceSum strong_cnt_0(0); + RAJA::ReduceSum strong_cnt_1(0); + RAJA::ReduceSum interleaved_cnt(0); + RAJA::ReduceSum weak_cnt(0); + RAJA::ReduceSum unexpected_cnt(0); + + T *a_local = a; + T *x_local = x; + + auto forall_len = RAJA::TypedRangeSegment(0, m_size * m_stride); + + RAJA::forall(forall_len, [=] RAJA_HOST_DEVICE(int i) { + if (a_local[i] == 1 && x_local[i] == 1) { + // Strong behavior: thread 1 happened before thread 2 + strong_cnt_0 += 1; + } else if (a_local[i] == 0 && x_local[i] == 2) { + // Strong behavior: thread 2 happened before 
thread 1 + strong_cnt_1 += 1; + } else if (a_local[i] == 0 && x_local[i] == 1) { + // Strong behavior: stores interleaved with receives + interleaved_cnt += 1; + } else if (a_local[i] == 1 && x_local[i] == 2) { + // Weak behavior: receives reordered after stores + weak_cnt += 1; + } else { + unexpected_cnt += 1; + } + }); + + EXPECT_EQ(unexpected_cnt.get(), 0); + + strong_behavior_0 += strong_cnt_0.get(); + strong_behavior_1 += strong_cnt_1.get(); + interleaved_behavior += interleaved_cnt.get(); + weak_behavior += weak_cnt.get(); + } + + void verify() + { + std::cerr << " - Strong behavior (a = 1, b = 0) = " << strong_behavior_0 + << "\n"; + std::cerr << " - Strong behavior (a = 0, b = 1) = " << strong_behavior_1 + << "\n"; + std::cerr << " - Strong behavior (a = 1, b = 1) = " << interleaved_behavior + << "\n"; + std::cerr << " - Weak behaviors = " << weak_behavior << "\n"; + + if (std::is_same::value && + std::is_same::value) { + // In the relaxed case, we should observe some weak behaviors. + // Don't fail the test, but do print out a message. + if (weak_behavior == 0) { + std::cerr << "Warning - no weak behaviors detected in the control case." + << "\nThis litmus test may be insufficient to exercise " + "ordered memory atomics.\n"; + } else { + double overall_behavior_counts = strong_behavior_0 + strong_behavior_1 + + interleaved_behavior + weak_behavior; + std::cerr << "\n Weak behaviors detected in " + << 100 * (weak_behavior / overall_behavior_counts) + << "% of cases.\n"; + } + } else { + // We should not expect any weak behaviors if using a strong ordering. + EXPECT_EQ(weak_behavior, 0); + } + } +}; + +using StoreLitmusTestOrderPols = + camp::list, + camp::list, + camp::list, + camp::list >; + +using StoreLitmusTestPols = + camp::cartesian_product; + +TYPED_TEST_SUITE_P(ForallAtomicLitmusTest); + +template +class ForallAtomicLitmusTest : public ::testing::Test +{ +}; + +TYPED_TEST_P(ForallAtomicLitmusTest, StoreTest) +{ + using Type = typename camp::at>::type; + using SendRecvPol = typename camp::at>::type; + using SendPol = typename camp::at>::type; + using RecvPol = typename camp::at>::type; + + using StoreTest = StoreLitmus; + LitmusTestDriver::run(); +} + +REGISTER_TYPED_TEST_SUITE_P(ForallAtomicLitmusTest, StoreTest); + +using StoreTestTypes = Test::Types; + +INSTANTIATE_TYPED_TEST_SUITE_P(Hip, + ForallAtomicLitmusTest, + StoreTestTypes); diff --git a/test/functional/forall/atomic-basic/test-forall-atomic-litmus-write2x2.cpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-write2x2.cpp new file mode 100644 index 0000000000..b5dc28ac5f --- /dev/null +++ b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-write2x2.cpp @@ -0,0 +1,222 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-atomic-types.hpp" +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-forall-data.hpp" +#include "RAJA_test-forall-execpol.hpp" +#include "RAJA_test-index-types.hpp" + +#include "test-forall-atomic-litmus-driver.hpp" + +// "2+2 Write" litmus test for DESUL ordered atomic +// --------------------------------------------------- +// Initial state: x = 0 && y = 0 +// +// Thread 1: Thread 2: +// ----------- ----------- +// store(x, 1) store(y, 1) +// store(y, 2) store(x, 2) +// +// Allowed results: +// ---------------- +// Strong behaviors: +// - a = 1, x = 2 +// - a = 2, x = 1 +// - a = 2, x = 2 +// Weak behavior: +// - a = 1, x = 1 + +// Send policy: Relaxed (Weak), AcqRel, SeqCst +// Recv policy: Relaxed (Weak), AcqRel, SeqCst +template +struct Write2x2Litmus { + using DataType = T; + using RelaxedPolicy = RAJA::atomic_relaxed; + size_t m_size; + int m_stride; + T *x; + T *y; + T *a; + + int strong_behavior_0{0}; + int strong_behavior_1{0}; + int interleaved_behavior{0}; + int weak_behavior{0}; + + void allocate(camp::resources::Resource work_res, size_t size, int stride) + { + m_size = size; + m_stride = stride; + x = work_res.allocate(size * stride); + y = work_res.allocate(size * stride); + a = work_res.allocate(size * stride); + } + + void deallocate(camp::resources::Resource work_res) + { + work_res.deallocate(x); + work_res.deallocate(y); + work_res.deallocate(a); + } + + void pre_run(camp::resources::Resource work_res) + { + work_res.memset(x, 0, sizeof(T) * m_size * m_stride); + work_res.memset(y, 0, sizeof(T) * m_size * m_stride); + work_res.memset(a, 0, sizeof(T) * m_size * m_stride); + +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaDeviceSynchronize()); +#endif + +#if defined(RAJA_ENABLE_HIP) + hipErrchk(hipDeviceSynchronize()); +#endif + } + + RAJA_HOST_DEVICE void run(int this_thread, int other_thread, int iter) + { + bool swap = this_thread % 2 == 0; + if (swap) { + store_1(other_thread, iter); + store_2(this_thread, iter); + } else { + store_2(this_thread, iter); + store_1(other_thread, iter); + } + } + + RAJA_HOST_DEVICE void store_1(int thread, int iter) + { + int thread_idx = thread * m_stride + iter; + RAJA::atomicExchange(&(x[thread_idx]), T{1}); + RAJA::atomicExchange(&(y[thread_idx]), T{2}); + } + + RAJA_HOST_DEVICE void store_2(int thread, int iter) + { + int thread_idx = thread * m_stride + iter; + RAJA::atomicExchange(&(y[thread_idx]), T{1}); + RAJA::atomicExchange(&(x[thread_idx]), T{2}); + } + + void count_results(camp::resources::Resource work_res) + { + +#ifdef RAJA_ENABLE_HIP + using GPUExec = RAJA::hip_exec<256>; + using ReducePolicy = RAJA::hip_reduce; +#endif + +#ifdef RAJA_ENABLE_CUDA + using GPUExec = RAJA::cuda_exec<256>; + using ReducePolicy = RAJA::cuda_reduce; +#endif + RAJA::ReduceSum strong_cnt_0(0); + RAJA::ReduceSum strong_cnt_1(0); + RAJA::ReduceSum interleaved_cnt(0); + RAJA::ReduceSum weak_cnt(0); + RAJA::ReduceSum unexpected_cnt(0); + + T *x_local = x; + T *y_local = y; + + auto forall_len = RAJA::TypedRangeSegment(0, m_size * m_stride); + + RAJA::forall(forall_len, [=] RAJA_HOST_DEVICE(int i) { + if (x_local[i] == 1 && y_local[i] == 2) { + // Strong behavior: thread 1 happened before thread 2 + strong_cnt_0 += 1; + } else if (x_local[i] == 2 && y_local[i] == 1) { + // Strong behavior: thread 2 happened before thread 1 + 
strong_cnt_1 += 1; + } else if (x_local[i] == 2 && y_local[i] == 2) { + // Strong behavior: interleaved stores in-order + interleaved_cnt += 1; + } else if (x_local[i] == 1 && y_local[i] == 1) { + // Weak behavior: stores on each thread were reordered + weak_cnt += 1; + } else { + unexpected_cnt += 1; + } + }); + + EXPECT_EQ(unexpected_cnt.get(), 0); + + strong_behavior_0 += strong_cnt_0.get(); + strong_behavior_1 += strong_cnt_1.get(); + interleaved_behavior += interleaved_cnt.get(); + weak_behavior += weak_cnt.get(); + } + + void verify() + { + std::cerr << " - Strong behavior (a = 1, b = 0) = " << strong_behavior_0 + << "\n"; + std::cerr << " - Strong behavior (a = 0, b = 1) = " << strong_behavior_1 + << "\n"; + std::cerr << " - Strong behavior (a = 1, b = 1) = " << interleaved_behavior + << "\n"; + std::cerr << " - Weak behaviors = " << weak_behavior << "\n"; + + if (std::is_same::value) { + // In the relaxed case, we should observe some weak behaviors. + // Don't fail the test, but do print out a message. + if (weak_behavior == 0) { + std::cerr << "Warning - no weak behaviors detected in the control case." + << "\nThis litmus test may be insufficient to exercise " + "ordered memory atomics.\n"; + } else { + double overall_behavior_counts = strong_behavior_0 + strong_behavior_1 + + interleaved_behavior + weak_behavior; + std::cerr << "\n Weak behaviors detected in " + << 100 * (weak_behavior / overall_behavior_counts) + << "% of cases.\n"; + } + } else { + // We should not expect any weak behaviors if using a strong ordering. + EXPECT_EQ(weak_behavior, 0); + } + } +}; + +using Write2x2LitmusTestOrderPols = + camp::list; + +using Write2x2LitmusTestPols = + camp::cartesian_product; + +TYPED_TEST_SUITE_P(ForallAtomicLitmusTest); + +template +class ForallAtomicLitmusTest : public ::testing::Test +{ +}; + +TYPED_TEST_P(ForallAtomicLitmusTest, Write2x2Test) +{ + using Type = typename camp::at>::type; + using AtomicPol = typename camp::at>::type; + + using Write2x2Test = Write2x2Litmus; + LitmusTestDriver::run(); +} + +REGISTER_TYPED_TEST_SUITE_P(ForallAtomicLitmusTest, Write2x2Test); + +using Write2x2TestTypes = Test::Types; + +INSTANTIATE_TYPED_TEST_SUITE_P(Hip, + ForallAtomicLitmusTest, + Write2x2TestTypes); diff --git a/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-driver.hpp b/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-driver.hpp new file mode 100644 index 0000000000..3e6b9520e8 --- /dev/null +++ b/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-driver.hpp @@ -0,0 +1,325 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-atomic-types.hpp" +#include "RAJA_test-atomicpol.hpp" +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-forall-data.hpp" +#include "RAJA_test-forall-execpol.hpp" +#include "RAJA_test-index-types.hpp" + +// +// Header for tests in ./tests directory +// +// Note: CMake adds ./tests as an include dir for these tests. 
+// +#include +#include + +using IdxType = size_t; +constexpr int NUM_ITERS = 100; +#ifdef RAJA_ENABLE_CUDA +constexpr int STRIDE = 32; +constexpr bool STRESS_BEFORE_TEST = true; +constexpr bool NONTESTING_BLOCKS = true; +constexpr int WARP_SIZE = 32; +constexpr int BLOCK_SIZE = 128; +#elif defined(RAJA_ENABLE_HIP) +constexpr int STRIDE = 32; +constexpr bool STRESS_BEFORE_TEST = true; +constexpr bool NONTESTING_BLOCKS = true; +constexpr int WARP_SIZE = 64; +constexpr int BLOCK_SIZE = 128; +#endif + +constexpr int STRESS_STRIDE = 64; +constexpr int NUM_STRESS_LINES = 2048; +constexpr int DATA_STRESS_SIZE = NUM_STRESS_LINES * STRESS_STRIDE; + +constexpr int PERMUTE_PRIME_BLOCK = 17; +constexpr int PERMUTE_PRIME_GRID = 47; +constexpr int PERMUTE_STRESS = 977; + +template +__global__ void dummy_kernel(IdxType index, Func func) +{ + func(index); +} + +// Generic test driver for memory litmus tests on the GPU. +template +struct LitmusTestDriver { +public: + using T = typename LitmusPolicy::DataType; + struct TestData { + int block_size; + int grid_size; + + int stride = STRIDE; + + int pre_stress_iterations = 4; + int stress_iterations = 12; + + // Number of blocks to run the message-passing litmus test on. + int testing_blocks; + + // Array to shuffle block indices in a test kernel. + IdxType* shuffle_block; + + // Barrier integers to synchronize testing threads. + IdxType* barriers; + + int* data_stress; + + int num_stress_index; + IdxType* stress_index; + + void allocate(camp::resources::Resource work_res, + int grid_size, + int block_size, + int num_testing_blocks) + { + this->grid_size = grid_size; + this->block_size = block_size; + + testing_blocks = num_testing_blocks; + + shuffle_block = work_res.allocate(grid_size); + barriers = work_res.allocate(STRIDE); + data_stress = work_res.allocate(DATA_STRESS_SIZE); + + num_stress_index = 64; + stress_index = work_res.allocate(num_stress_index); + } + + void pre_run(camp::resources::Resource work_res) + { + std::random_device rand_device; + // Create a random permutation for the range [0, grid_size) + std::vector shuffle_block_host(grid_size); + { + std::iota(shuffle_block_host.begin(), shuffle_block_host.end(), 0); + std::shuffle(shuffle_block_host.begin(), + shuffle_block_host.end(), + std::mt19937{rand_device()}); + } + work_res.memcpy(shuffle_block, + shuffle_block_host.data(), + sizeof(IdxType) * grid_size); + + std::vector stress_index_host(num_stress_index); + { + std::mt19937 gen{rand_device()}; + std::uniform_int_distribution rnd_stress_offset(0, + STRESS_STRIDE); + std::uniform_int_distribution rnd_stress_dist( + 0, NUM_STRESS_LINES - 1); + std::generate(stress_index_host.begin(), + stress_index_host.end(), + [&]() -> IdxType { + return rnd_stress_dist(gen) * STRESS_STRIDE; + }); + } + work_res.memcpy(stress_index, + stress_index_host.data(), + sizeof(IdxType) * num_stress_index); + + work_res.memset(barriers, 0, sizeof(IdxType) * STRIDE); + work_res.memset(data_stress, 0, sizeof(int) * DATA_STRESS_SIZE); + + +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaDeviceSynchronize()); +#endif + +#if defined(RAJA_ENABLE_HIP) + hipErrchk(hipDeviceSynchronize()); +#endif + } + + void deallocate(camp::resources::Resource work_res) + { + work_res.deallocate(shuffle_block); + work_res.deallocate(barriers); + work_res.deallocate(data_stress); + } + }; + + RAJA_HOST_DEVICE LitmusTestDriver() {} + + // Run + static void run() + { + int num_blocks = 0; + { + LitmusPolicy dummy_policy{}; + TestData dummy_test_data{}; + auto lambda = [=] 
RAJA_HOST_DEVICE(IdxType index) { + LitmusTestDriver test_inst{}; + test_inst.test_main(index, dummy_test_data, dummy_policy); + }; + RAJA_UNUSED_VAR(lambda); +#ifdef RAJA_ENABLE_CUDA + cudaErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &num_blocks, dummy_kernel, BLOCK_SIZE, 0)); + num_blocks *= RAJA::cuda::device_prop().multiProcessorCount; +#endif +#ifdef RAJA_ENABLE_HIP + hipErrchk(hipOccupancyMaxActiveBlocksPerMultiprocessor( + &num_blocks, dummy_kernel, BLOCK_SIZE, 0)); + num_blocks *= RAJA::hip::device_prop().multiProcessorCount; +#endif + } + std::cout << "Got num_blocks = " << num_blocks + << ", block_size = " << BLOCK_SIZE << "\n" + << std::flush; + if (num_blocks == 0) { + FAIL() << "Grid size wasn't set to a valid value.\n"; + } + +#ifdef RAJA_ENABLE_CUDA + using ResourcePolicy = camp::resources::Cuda; +#endif +#ifdef RAJA_ENABLE_HIP + using ResourcePolicy = camp::resources::Hip; +#endif + camp::resources::Resource work_res{ResourcePolicy()}; + + int num_testing_blocks = num_blocks; + if (NONTESTING_BLOCKS) { + num_testing_blocks = num_blocks / 4; + } + + TestData test_data; + test_data.allocate(work_res, num_blocks, BLOCK_SIZE, num_testing_blocks); + + LitmusPolicy litmus_test; + litmus_test.allocate(work_res, num_testing_blocks * BLOCK_SIZE, STRIDE); + +#ifdef RAJA_ENABLE_HIP + using GPUExec = RAJA::hip_exec; +#endif + +#ifdef RAJA_ENABLE_CUDA + using GPUExec = RAJA::cuda_exec; +#endif + + for (int iter = 0; iter < NUM_ITERS; iter++) { + test_data.pre_run(work_res); + litmus_test.pre_run(work_res); + + RAJA::forall( + RAJA::TypedRangeSegment(0, num_blocks * BLOCK_SIZE), + [=] RAJA_HOST_DEVICE(IdxType index) { + LitmusTestDriver test_inst{}; + test_inst.test_main(index, test_data, litmus_test); + }); + + litmus_test.count_results(work_res); + } + + litmus_test.verify(); + + litmus_test.deallocate(work_res); + test_data.deallocate(work_res); + } + +private: + using NormalAtomic = RAJA::atomic_relaxed; + + RAJA_HOST_DEVICE void test_main(IdxType index, + TestData param, + LitmusPolicy test) + { + IdxType block_idx = index / param.block_size; + IdxType thread_idx = index % param.block_size; + + // Permute the thread index, to promote scattering of memory accesses + // within a block. + IdxType permute_thread_idx = + (thread_idx * PERMUTE_PRIME_BLOCK) % param.block_size; + + // Shuffle the block ID randomly according to a permutation array. + block_idx = param.shuffle_block[block_idx]; + + IdxType data_idx = block_idx * param.block_size + thread_idx; + + for (int i = 0; i < param.stride; i++) { + + // Synchronize all blocks before testing, to increase the chance of + // interleaved requests. + this->sync(param.grid_size, thread_idx, param.barriers[i]); + + if (block_idx < (IdxType)param.testing_blocks) { + + // Block is a testing block. + // + // Each block acts as a "sender" to a unique "partner" block. This is + // done by permuting the block IDs with a function p(i) = i * k mod n, + // where n is the number of blocks being tested, and k and n are + // coprime. + int partner_idx = + (block_idx * PERMUTE_PRIME_GRID + i) % param.testing_blocks; + + // Run specified test, matching threads between the two paired blocks. 
+ int other_data_idx = + partner_idx * param.block_size + permute_thread_idx; + + // Pre-stress pattern - stressing memory accesses before the test may + // increase the rate of weak memory behaviors + // Helps on AMD, doesn't seem to help on NVIDIA + if (STRESS_BEFORE_TEST) { + this->stress(block_idx, thread_idx, param, true); + } + + test.run(data_idx, other_data_idx, i); + } else { + this->stress(block_idx, thread_idx, param); + } + } + }; + + RAJA_HOST_DEVICE void sync(int num_blocks, int thread_idx, IdxType& barrier) + { + if (thread_idx == 0) { + IdxType result = RAJA::atomicAdd(&barrier, IdxType{1}); + // Busy-wait until all blocks perform the above add. + while (result != num_blocks) { + result = RAJA::atomicAdd(&barrier, IdxType{0}); + } + } + +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_CODE__) + __syncthreads(); +#endif + } + + RAJA_HOST_DEVICE void stress(IdxType block_idx, + IdxType thread_idx, + const TestData& param, + bool pre_stress = false) + { + int num_iters = + (pre_stress ? param.pre_stress_iterations : param.stress_iterations); + int volatile* stress_data = param.data_stress; + if (thread_idx % WARP_SIZE == 0) { + int warp_idx = thread_idx / WARP_SIZE; + // int select_idx = block_idx * NUM_WARPS + warp_idx; + int select_idx = block_idx; + select_idx = (select_idx * PERMUTE_STRESS) % param.num_stress_index; + int stress_line = param.stress_index[select_idx]; + for (int i = 0; i < num_iters; i++) { + int data = stress_data[stress_line]; + stress_data[stress_line] = i + 1; + } + } + } +};
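Usage note (not part of the diff): with RAJA configured for desul atomics (RAJA_ENABLE_DESUL_ATOMICS), the ordering-aware policies introduced in policy.hpp are meant to drop into the existing RAJA atomic free functions exactly where atomic_relaxed appears in the tests above. A minimal host-side sketch of the intended call pattern, assuming only the alias names added by this diff and the standard policy-templated RAJA atomic interface, could look like:

#include "RAJA/RAJA.hpp"

#include <cstdio>

int main()
{
  int data = 0;
  int flag = 0;

  // Producer side of a message-passing pattern: publish the payload, then
  // release-store the flag so the payload cannot be reordered after it.
  RAJA::atomicAdd<RAJA::atomic_relaxed>(&data, 42);
  RAJA::atomicExchange<RAJA::atomic_release>(&flag, 1);

  // Consumer side: acquire-read the flag before reading the payload.
  // (Single-threaded here purely to show the call signatures; the litmus
  // tests above exercise these orderings across GPU thread blocks.)
  if (RAJA::atomicAdd<RAJA::atomic_acquire>(&flag, 0) == 1) {
    int value = RAJA::atomicAdd<RAJA::atomic_relaxed>(&data, 0);
    std::printf("flag observed, data = %d\n", value);
  }
  return 0;
}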