diff --git a/include/RAJA/policy/desul/atomic.hpp b/include/RAJA/policy/desul/atomic.hpp index dbcb5e06eb..9fdec5ccc9 100644 --- a/include/RAJA/policy/desul/atomic.hpp +++ b/include/RAJA/policy/desul/atomic.hpp @@ -12,10 +12,9 @@ #if defined(RAJA_ENABLE_DESUL_ATOMICS) -#include "RAJA/util/macros.hpp" - #include "RAJA/policy/atomic_builtin.hpp" - +#include "RAJA/policy/desul/policy.hpp" +#include "RAJA/util/macros.hpp" #include "desul/atomics.hpp" // Default desul options for RAJA @@ -26,153 +25,208 @@ using raja_default_desul_scope = desul::MemoryScopeDevice; namespace RAJA { +namespace detail +{ +template +struct DesulAtomicPolicy { + using memory_order = raja_default_desul_order; + using memory_scope = raja_default_desul_scope; +}; + +template +struct DesulAtomicPolicy> { + using memory_order = OrderingPolicy; + using memory_scope = ScopePolicy; +}; + +} // namespace detail + RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T -atomicAdd(AtomicPolicy, T volatile *acc, T value) { - return desul::atomic_fetch_add(const_cast(acc), +RAJA_HOST_DEVICE RAJA_INLINE T atomicAdd(AtomicPolicy, T volatile *acc, T value) +{ + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return desul::atomic_fetch_add(const_cast(acc), value, - raja_default_desul_order{}, - raja_default_desul_scope{}); + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T -atomicSub(AtomicPolicy, T volatile *acc, T value) { - return desul::atomic_fetch_sub(const_cast(acc), +RAJA_HOST_DEVICE RAJA_INLINE T atomicSub(AtomicPolicy, T volatile *acc, T value) +{ + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return desul::atomic_fetch_sub(const_cast(acc), value, - raja_default_desul_order{}, - raja_default_desul_scope{}); + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T atomicMin(AtomicPolicy, T volatile *acc, T value) +RAJA_HOST_DEVICE RAJA_INLINE T atomicMin(AtomicPolicy, T volatile *acc, T value) { - return desul::atomic_fetch_min(const_cast(acc), + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return desul::atomic_fetch_min(const_cast(acc), value, - raja_default_desul_order{}, - raja_default_desul_scope{}); + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T atomicMax(AtomicPolicy, T volatile *acc, T value) +RAJA_HOST_DEVICE RAJA_INLINE T atomicMax(AtomicPolicy, T volatile *acc, T value) { - return desul::atomic_fetch_max(const_cast(acc), + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return desul::atomic_fetch_max(const_cast(acc), value, - raja_default_desul_order{}, - raja_default_desul_scope{}); + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T atomicInc(AtomicPolicy, T volatile *acc) +RAJA_HOST_DEVICE RAJA_INLINE T atomicInc(AtomicPolicy, T volatile *acc) { - return desul::atomic_fetch_inc(const_cast(acc), - raja_default_desul_order{}, - raja_default_desul_scope{}); + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return 
desul::atomic_fetch_inc(const_cast(acc), + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T atomicInc(AtomicPolicy, T volatile *acc, T val) +RAJA_HOST_DEVICE RAJA_INLINE T atomicInc(AtomicPolicy, T volatile *acc, T val) { + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; // See: // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicinc - return desul::atomic_fetch_inc_mod(const_cast(acc), - val, - raja_default_desul_order{}, - raja_default_desul_scope{}); + return desul::atomic_fetch_inc_mod(const_cast(acc), + val, + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T atomicDec(AtomicPolicy, T volatile *acc) +RAJA_HOST_DEVICE RAJA_INLINE T atomicDec(AtomicPolicy, T volatile *acc) { - return desul::atomic_fetch_dec(const_cast(acc), - raja_default_desul_order{}, - raja_default_desul_scope{}); + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return desul::atomic_fetch_dec(const_cast(acc), + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T atomicDec(AtomicPolicy, T volatile *acc, T val) +RAJA_HOST_DEVICE RAJA_INLINE T atomicDec(AtomicPolicy, T volatile *acc, T val) { + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; // See: // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicdec - return desul::atomic_fetch_dec_mod(const_cast(acc), - val, - raja_default_desul_order{}, - raja_default_desul_scope{}); + return desul::atomic_fetch_dec_mod(const_cast(acc), + val, + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T atomicAnd(AtomicPolicy, T volatile *acc, T value) +RAJA_HOST_DEVICE RAJA_INLINE T atomicAnd(AtomicPolicy, T volatile *acc, T value) { - return desul::atomic_fetch_and(const_cast(acc), + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return desul::atomic_fetch_and(const_cast(acc), value, - raja_default_desul_order{}, - raja_default_desul_scope{}); + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T atomicOr(AtomicPolicy, T volatile *acc, T value) +RAJA_HOST_DEVICE RAJA_INLINE T atomicOr(AtomicPolicy, T volatile *acc, T value) { - return desul::atomic_fetch_or(const_cast(acc), + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return desul::atomic_fetch_or(const_cast(acc), value, - raja_default_desul_order{}, - raja_default_desul_scope{}); + desul_order{}, + desul_scope{}); } RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE -RAJA_INLINE T atomicXor(AtomicPolicy, T volatile *acc, T value) +RAJA_HOST_DEVICE RAJA_INLINE T atomicXor(AtomicPolicy, T volatile *acc, T value) { - return desul::atomic_fetch_xor(const_cast(acc), + using desul_order = + typename detail::DesulAtomicPolicy::memory_order; + using desul_scope = + typename detail::DesulAtomicPolicy::memory_scope; + return desul::atomic_fetch_xor(const_cast(acc), value, - raja_default_desul_order{}, - raja_default_desul_scope{}); + desul_order{}, + desul_scope{}); } 
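Aside (not part of the diff): every hunk in this file follows the same pattern, pulling a memory order and scope out of detail::DesulAtomicPolicy and forwarding them to the corresponding desul call. Because the angle-bracket template arguments were lost when this diff was captured, the standalone sketch below restates the dispatch pattern with placeholder names (ordered_atomic_tag, DesulAtomicPolicySketch, atomicAddSketch) standing in for the RAJA/desul types; it is illustrative only and compiles without desul.

#include <cstdio>

// Placeholder tags standing in for desul's memory-order/scope types and the
// RAJA defaults referenced above (names are illustrative, not desul's).
struct OrderRelaxed {};
struct OrderSeqCst {};
struct ScopeDevice {};
using default_order = OrderRelaxed;
using default_scope = ScopeDevice;

// Ordered-atomic policy tag, analogous in shape to detail_atomic_t below.
template <typename OrderingPolicy, typename ScopePolicy>
struct ordered_atomic_tag {};

// Primary template: any other atomic policy falls back to the defaults.
template <typename AtomicPolicy>
struct DesulAtomicPolicySketch {
  using memory_order = default_order;
  using memory_scope = default_scope;
};

// Specialization: an ordering-aware policy carries its own order and scope.
template <typename OrderingPolicy, typename ScopePolicy>
struct DesulAtomicPolicySketch<ordered_atomic_tag<OrderingPolicy, ScopePolicy>> {
  using memory_order = OrderingPolicy;
  using memory_scope = ScopePolicy;
};

// An atomicAdd-style wrapper deduces the tags from its policy argument,
// mirroring the "using desul_order/desul_scope" lines in the hunks above.
template <typename AtomicPolicy, typename T>
T atomicAddSketch(AtomicPolicy, T volatile *acc, T value)
{
  using order = typename DesulAtomicPolicySketch<AtomicPolicy>::memory_order;
  using scope = typename DesulAtomicPolicySketch<AtomicPolicy>::memory_scope;
  static_cast<void>(order{});
  static_cast<void>(scope{});
  // A real build would forward to
  //   desul::atomic_fetch_add(const_cast<T*>(acc), value, order{}, scope{});
  // here the update is done directly so the sketch runs without desul.
  T old = *acc;
  *acc = old + value;
  return old;
}

struct some_default_policy {};  // any non-ordered policy gets the defaults

int main()
{
  int volatile counter = 0;
  atomicAddSketch(some_default_policy{}, &counter, 1);
  atomicAddSketch(ordered_atomic_tag<OrderSeqCst, ScopeDevice>{}, &counter, 1);
  std::printf("counter = %d\n", static_cast<int>(counter));
  return 0;
}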
 RAJA_SUPPRESS_HD_WARN
 template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicExchange(AtomicPolicy, T volatile *acc, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T atomicExchange(AtomicPolicy,
+                                              T volatile *acc,
+                                              T value)
 {
-  return desul::atomic_exchange(const_cast<T*>(acc),
+  using desul_order =
+      typename detail::DesulAtomicPolicy<AtomicPolicy>::memory_order;
+  using desul_scope =
+      typename detail::DesulAtomicPolicy<AtomicPolicy>::memory_scope;
+  return desul::atomic_exchange(const_cast<T*>(acc),
                                 value,
-                                raja_default_desul_order{},
-                                raja_default_desul_scope{});
+                                desul_order{},
+                                desul_scope{});
 }
 
 RAJA_SUPPRESS_HD_WARN
 template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicCAS(AtomicPolicy, T volatile *acc, T compare, T value)
+RAJA_HOST_DEVICE RAJA_INLINE T
+atomicCAS(AtomicPolicy, T volatile *acc, T compare, T value)
 {
-  return desul::atomic_compare_exchange(const_cast<T*>(acc),
-                                        compare,
-                                        value,
-                                        raja_default_desul_order{},
-                                        raja_default_desul_scope{});
+  using desul_order =
+      typename detail::DesulAtomicPolicy<AtomicPolicy>::memory_order;
+  using desul_scope =
+      typename detail::DesulAtomicPolicy<AtomicPolicy>::memory_scope;
+  return desul::atomic_compare_exchange(
+      const_cast<T*>(acc), compare, value, desul_order{}, desul_scope{});
 }
 
 }  // namespace RAJA
 
 #endif  // RAJA_ENABLE_DESUL_ATOMICS
-#endif  // guard
+#endif  // guard
diff --git a/include/RAJA/policy/desul/policy.hpp b/include/RAJA/policy/desul/policy.hpp
new file mode 100644
index 0000000000..b743760592
--- /dev/null
+++ b/include/RAJA/policy/desul/policy.hpp
@@ -0,0 +1,62 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC
+// and RAJA project contributors. See the RAJA/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#ifndef RAJA_policy_desul_HPP
+#define RAJA_policy_desul_HPP
+
+#include "RAJA/config.hpp"
+
+#if defined(RAJA_ENABLE_DESUL_ATOMICS)
+
+#include "desul/atomics.hpp"
+
+namespace RAJA
+{
+
+// Policy to perform an atomic operation with a given memory ordering.
+template +struct detail_atomic_t { +}; + +using atomic_seq_cst = + detail_atomic_t; +using atomic_acq_rel = + detail_atomic_t; +using atomic_acquire = + detail_atomic_t; +using atomic_release = + detail_atomic_t; +using atomic_relaxed = + detail_atomic_t; + +using atomic_seq_cst_block = + detail_atomic_t; +using atomic_acq_rel_block = + detail_atomic_t; +using atomic_acquire_block = + detail_atomic_t; +using atomic_release_block = + detail_atomic_t; +using atomic_relaxed_block = + detail_atomic_t; + +using atomic_seq_cst_sys = + detail_atomic_t; +using atomic_acq_rel_sys = + detail_atomic_t; +using atomic_acquire_sys = + detail_atomic_t; +using atomic_release_sys = + detail_atomic_t; +using atomic_relaxed_sys = + detail_atomic_t; + +} // namespace RAJA + +#endif // RAJA_ENABLE_DESUL_ATOMICS + +#endif diff --git a/test/functional/forall/atomic-basic/CMakeLists.txt b/test/functional/forall/atomic-basic/CMakeLists.txt index 9c2c12d76f..707eabdc78 100644 --- a/test/functional/forall/atomic-basic/CMakeLists.txt +++ b/test/functional/forall/atomic-basic/CMakeLists.txt @@ -28,3 +28,38 @@ foreach( ATOMIC_BACKEND ${FORALL_ATOMIC_BACKENDS} ) target_include_directories(test-forall-atomic-basic-unsigned-${ATOMIC_BACKEND}.exe PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) endforeach() + + +set(ENABLE_LITMUS_TESTS OFF) + +if(RAJA_ENABLE_DESUL_ATOMICS AND RAJA_ENABLE_CUDA) + set(LITMUS_BACKEND "Cuda") + set(ENABLE_LITMUS_TESTS ON) +endif() + +if(RAJA_ENABLE_DESUL_ATOMICS AND RAJA_ENABLE_HIP) + set(LITMUS_BACKEND "Hip") + set(ENABLE_LITMUS_TESTS ON) +endif() + +set(FORALL_LITMUS_TESTS + mp # Message Passing + sb # Store Buffer + lb # Load Buffer + store # Store + read # Read + write2x2 # 2+2 write +) + +if (ENABLE_LITMUS_TESTS) + foreach ( LITMUS_TEST ${FORALL_LITMUS_TESTS} ) + raja_add_test( NAME test-forall-atomic-litmus-${LITMUS_BACKEND}-${LITMUS_TEST} + SOURCES test-forall-atomic-litmus-${LITMUS_TEST}.cpp) + target_include_directories(test-forall-atomic-litmus-${LITMUS_BACKEND}-${LITMUS_TEST}.exe + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) + endforeach() +endif() + +unset(FORALL_LITMUS_TESTS) +unset(ENABLE_LITMUS_TESTS) +unset(LITMUS_BACKEND) diff --git a/test/functional/forall/atomic-basic/test-forall-atomic-litmus-lb.cpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-lb.cpp new file mode 100644 index 0000000000..67a49eb91d --- /dev/null +++ b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-lb.cpp @@ -0,0 +1,235 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-atomic-types.hpp" +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-forall-data.hpp" +#include "RAJA_test-forall-execpol.hpp" +#include "RAJA_test-index-types.hpp" + +#include "test-forall-atomic-litmus-driver.hpp" + +// "Load buffer" litmus test for DESUL ordered atomic +// -------------------------------------------------- +// Initial state: x = 0 && y = 0 +// +// Thread 1: Thread 2: +// ----------- ----------- +// a = load(y) b = load(x) +// store(x, 1) store(y, 1) +// +// Allowed results: +// ---------------- +// Strong behaviors: +// - a = 0, b = 1 +// - a = 1, b = 0 +// - a = 0, b = 0 +// Weak behavior: +// - a = 1, b = 1 + +// Send policy: Relaxed (Weak), Release, AcqRel, SeqCst +// Recv policy: Relaxed (Weak), Acquire, AcqRel, SeqCst +template +struct LoadBufferLitmus { + using DataType = T; + using RelaxedPolicy = RAJA::atomic_relaxed; + constexpr static int PERMUTE_THREAD_FLAG = 97; + size_t m_size; + int m_stride; + T *x; + T *y; + T *a; + T *b; + + int strong_behavior_0{0}; + int strong_behavior_1{0}; + int interleaved_behavior{0}; + int weak_behavior{0}; + + void allocate(camp::resources::Resource work_res, size_t size, int stride) + { + m_size = size; + m_stride = stride; + x = work_res.allocate(size * stride); + y = work_res.allocate(size * stride); + a = work_res.allocate(size * stride); + b = work_res.allocate(size * stride); + } + + void deallocate(camp::resources::Resource work_res) + { + work_res.deallocate(x); + work_res.deallocate(y); + work_res.deallocate(a); + work_res.deallocate(b); + } + + void pre_run(camp::resources::Resource work_res) + { + work_res.memset(x, 0, sizeof(T) * m_size * m_stride); + work_res.memset(y, 0, sizeof(T) * m_size * m_stride); + work_res.memset(a, 0, sizeof(T) * m_size * m_stride); + work_res.memset(b, 0, sizeof(T) * m_size * m_stride); + +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaDeviceSynchronize()); +#endif + +#if defined(RAJA_ENABLE_HIP) + hipErrchk(hipDeviceSynchronize()); +#endif + } + + RAJA_HOST_DEVICE void run(int this_thread, int other_thread, int iter) + { + bool swap = this_thread % 2 == 0; + if (swap) { + load_buffer_1(other_thread, iter); + load_buffer_2(this_thread, iter); + } else { + load_buffer_2(this_thread, iter); + load_buffer_1(other_thread, iter); + } + } + + RAJA_HOST_DEVICE void load_buffer_1(int thread, int iter) + { + int thread_idx = thread * m_stride + iter; + int permute_thread = (thread * PERMUTE_THREAD_FLAG) % m_size; + int permute_idx = permute_thread * m_stride + iter; + a[thread_idx] = RAJA::atomicAdd(&(y[thread_idx]), T{0}); + RAJA::atomicAdd(&(x[permute_idx]), T{1}); + } + + RAJA_HOST_DEVICE void load_buffer_2(int thread, int iter) + { + int thread_idx = thread * m_stride + iter; + int permute_thread = (thread * PERMUTE_THREAD_FLAG) % m_size; + int permute_idx = permute_thread * m_stride + iter; + b[thread_idx] = RAJA::atomicAdd(&(x[permute_idx]), T{0}); + RAJA::atomicAdd(&(y[thread_idx]), T{1}); + } + + void count_results(camp::resources::Resource work_res) + { + +#ifdef RAJA_ENABLE_HIP + using GPUExec = RAJA::hip_exec<256>; + using ReducePolicy = RAJA::hip_reduce; +#endif + +#ifdef RAJA_ENABLE_CUDA + using GPUExec = RAJA::cuda_exec<256>; + using ReducePolicy = RAJA::cuda_reduce; +#endif + RAJA::ReduceSum strong_cnt_0(0); + RAJA::ReduceSum strong_cnt_1(0); + RAJA::ReduceSum 
interleaved_cnt(0); + RAJA::ReduceSum weak_cnt(0); + RAJA::ReduceSum unexpected_cnt(0); + + T *a_local = a; + T *b_local = b; + + auto forall_len = RAJA::TypedRangeSegment(0, m_size * m_stride); + + RAJA::forall(forall_len, [=] RAJA_HOST_DEVICE(int i) { + if (a_local[i] == 1 && b_local[i] == 0) { + // Strong behavior: thread 2 happened before thread 1 + strong_cnt_0 += 1; + } else if (a_local[i] == 0 && b_local[i] == 1) { + // Strong behavior: thread 1 happened before thread 2 + strong_cnt_1 += 1; + } else if (a_local[i] == 0 && b_local[i] == 0) { + // Strong behavior: stores interleaved with receives + interleaved_cnt += 1; + } else if (a_local[i] == 1 && b_local[i] == 1) { + // Weak behavior: stores reordered after receives + weak_cnt += 1; + } else { + unexpected_cnt += 1; + } + }); + + EXPECT_EQ(unexpected_cnt.get(), 0); + + strong_behavior_0 += strong_cnt_0.get(); + strong_behavior_1 += strong_cnt_1.get(); + interleaved_behavior += interleaved_cnt.get(); + weak_behavior += weak_cnt.get(); + } + + void verify() + { + std::cerr << " - Strong behavior (a = 1, b = 0) = " << strong_behavior_0 + << "\n"; + std::cerr << " - Strong behavior (a = 0, b = 1) = " << strong_behavior_1 + << "\n"; + std::cerr << " - Strong behavior (a = 1, b = 1) = " << interleaved_behavior + << "\n"; + std::cerr << " - Weak behaviors = " << weak_behavior << "\n"; + + if (std::is_same::value && + std::is_same::value) { + // In the relaxed case, we should observe some weak behaviors. + // Don't fail the test, but do print out a message. + if (weak_behavior == 0) { + std::cerr << "Warning - no weak behaviors detected in the control case." + << "\nThis litmus test may be insufficient to exercise " + "ordered memory atomics.\n"; + } else { + double overall_behavior_counts = strong_behavior_0 + strong_behavior_1 + + interleaved_behavior + weak_behavior; + std::cerr << "\n Weak behaviors detected in " + << 100 * (weak_behavior / overall_behavior_counts) + << "% of cases.\n"; + } + } else { + // We should not expect any weak behaviors if using a strong ordering. + EXPECT_EQ(weak_behavior, 0); + } + } +}; + +using LBLitmusTestOrderPols = + camp::list, + camp::list, + camp::list, + camp::list >; + +using LBLitmusTestPols = + camp::cartesian_product; + +TYPED_TEST_SUITE_P(ForallAtomicLitmusTest); + +template +class ForallAtomicLitmusTest : public ::testing::Test +{ +}; + +TYPED_TEST_P(ForallAtomicLitmusTest, LoadBufferTest) +{ + using Type = typename camp::at>::type; + using SendRecvPol = typename camp::at>::type; + using SendPol = typename camp::at>::type; + using RecvPol = typename camp::at>::type; + + using LBTest = LoadBufferLitmus; + LitmusTestDriver::run(); +} + +REGISTER_TYPED_TEST_SUITE_P(ForallAtomicLitmusTest, LoadBufferTest); + +using LoadBufferTestTypes = Test::Types; + +INSTANTIATE_TYPED_TEST_SUITE_P(Hip, + ForallAtomicLitmusTest, + LoadBufferTestTypes); diff --git a/test/functional/forall/atomic-basic/test-forall-atomic-litmus-mp.cpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-mp.cpp new file mode 100644 index 0000000000..362686d5c0 --- /dev/null +++ b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-mp.cpp @@ -0,0 +1,242 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-atomic-types.hpp" +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-forall-data.hpp" +#include "RAJA_test-forall-execpol.hpp" +#include "RAJA_test-index-types.hpp" + +#include "test-forall-atomic-litmus-driver.hpp" + +// "Message Passing" litmus test for DESUL ordered atomic +// ------------------------------------------------------ +// Initial state: x = 0 && y = 0 +// +// Thread 1: Thread 2: +// ----------- ----------- +// store(x, 1) a = load(flag) +// store(flag, 1) b = load(x) +// +// Allowed results: +// ---------------- +// Strong behaviors: +// - a = 1, b = 1 +// - a = 0, b = 0 +// - a = 0, b = 1 +// Weak behavior: +// - a = 1, b = 0 +// +// On weak architectures (POWER/ARM/GPUs), the store to "x" can be reordered +// after the store to "flag". Store-release and load-acquire on the "flag" +// variable should prevent observing the weak behavior. + +// Send policy: Relaxed (Weak), Acquire, AcqRel, SeqCst +// Recv policy: Relaxed (Weak), Release, AcqRel, SeqCst +template +struct MessagePassingLitmus { + using DataType = T; + using RelaxedPolicy = RAJA::atomic_relaxed; + constexpr static int PERMUTE_THREAD_FLAG = 97; + size_t m_size; + int m_stride; + T *x; + T *flag; + T *a; + T *b; + + size_t strong_behavior_0{0}; + size_t strong_behavior_1{0}; + size_t interleaved_behavior{0}; + size_t weak_behavior{0}; + + void allocate(camp::resources::Resource work_res, size_t size, int stride) + { + m_size = size; + m_stride = stride; + x = work_res.allocate(size * stride); + flag = work_res.allocate(size * stride); + a = work_res.allocate(size * stride); + b = work_res.allocate(size * stride); + } + + void deallocate(camp::resources::Resource work_res) + { + work_res.deallocate(x); + work_res.deallocate(flag); + work_res.deallocate(a); + work_res.deallocate(b); + } + + void pre_run(camp::resources::Resource work_res) + { + work_res.memset(x, 0, sizeof(T) * m_size * m_stride); + work_res.memset(flag, 0, sizeof(T) * m_size * m_stride); + work_res.memset(a, 0, sizeof(T) * m_size * m_stride); + work_res.memset(b, 0, sizeof(T) * m_size * m_stride); + +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaDeviceSynchronize()); +#endif + +#if defined(RAJA_ENABLE_HIP) + hipErrchk(hipDeviceSynchronize()); +#endif + } + + RAJA_HOST_DEVICE void run(int this_thread, int other_thread, int iter) + { + bool send_first = (this_thread % 2 == 0); + // Send action + if (send_first) { + this->run_send(other_thread, iter); + this->run_recv(this_thread, iter); + } else { + this->run_recv(this_thread, iter); + this->run_send(other_thread, iter); + } + } + + RAJA_HOST_DEVICE void run_send(int other_thread, int iter) + { + int other_thread_idx = other_thread * m_stride + iter; + int permute_other_thread = (other_thread * PERMUTE_THREAD_FLAG) % m_size; + int permute_idx = permute_other_thread * m_stride + iter; + RAJA::atomicAdd(&(x[other_thread_idx]), T{1}); + RAJA::atomicAdd(&(flag[permute_idx]), T{1}); + } + + RAJA_HOST_DEVICE void run_recv(int this_thread, int iter) + { + int this_thread_idx = this_thread * m_stride + iter; + int permute_this_thread = (this_thread * PERMUTE_THREAD_FLAG) % m_size; + int permute_idx = permute_this_thread * m_stride + iter; + a[this_thread_idx] = + RAJA::atomicAdd(&(flag[permute_idx]), T{0}); + b[this_thread_idx] = + RAJA::atomicAdd(&(x[this_thread_idx]), T{0}); + } + + void 
count_results(camp::resources::Resource work_res) + { + +#ifdef RAJA_ENABLE_HIP + using GPUExec = RAJA::hip_exec<256>; + using ReducePolicy = RAJA::hip_reduce; +#endif + +#ifdef RAJA_ENABLE_CUDA + using GPUExec = RAJA::cuda_exec<256>; + using ReducePolicy = RAJA::cuda_reduce; +#endif + RAJA::ReduceSum strong_cnt_0(0); + RAJA::ReduceSum strong_cnt_1(0); + RAJA::ReduceSum interleaved_cnt(0); + RAJA::ReduceSum weak_cnt(0); + RAJA::ReduceSum unexpected_cnt(0); + + T *a_local = a; + T *b_local = b; + + auto forall_len = RAJA::TypedRangeSegment(0, m_size * m_stride); + + RAJA::forall(forall_len, [=] RAJA_HOST_DEVICE(int i) { + if (a_local[i] == 0 && b_local[i] == 0) { + // Strong behavior: neither store from test_send is observable + strong_cnt_0 += 1; + } else if (a_local[i] == 1 && b_local[i] == 1) { + // Strong behavior: both stores from test_send are observable + strong_cnt_1 += 1; + } else if (a_local[i] == 0 && b_local[i] == 1) { + // Strong behavior: stores interleaved with receives + interleaved_cnt += 1; + } else if (a_local[i] == 1 && b_local[i] == 0) { + // Weak behavior: second store observed before first store + weak_cnt += 1; + } else { + unexpected_cnt += 1; + } + }); + + EXPECT_EQ(unexpected_cnt.get(), 0); + + strong_behavior_0 += strong_cnt_0.get(); + strong_behavior_1 += strong_cnt_1.get(); + interleaved_behavior += interleaved_cnt.get(); + weak_behavior += weak_cnt.get(); + } + + void verify() + { + std::cerr << " - Strong behavior (a = 0, b = 0) = " << strong_behavior_0 + << "\n"; + std::cerr << " - Strong behavior (a = 1, b = 1) = " << strong_behavior_1 + << "\n"; + std::cerr << " - Strong behavior (a = 0, b = 1) = " << interleaved_behavior + << "\n"; + std::cerr << " - Weak behaviors = " << weak_behavior << "\n"; + + if (std::is_same::value && + std::is_same::value) { + // In the relaxed case, we should observe some weak behaviors. + // Don't fail the test, but do print out a message. + if (weak_behavior == 0) { + std::cerr << "Warning - no weak behaviors detected in the control case." + << "\nThis litmus test may be insufficient to exercise " + "ordered memory atomics.\n"; + } else { + double overall_behavior_counts = strong_behavior_0 + strong_behavior_1 + + interleaved_behavior + weak_behavior; + std::cerr << "\n Weak behaviors detected in " + << 100 * (weak_behavior / overall_behavior_counts) + << "% of cases.\n"; + } + } else { + // We should not expect any weak behaviors if using a strong ordering. 
+ EXPECT_EQ(weak_behavior, 0); + } + } +}; + +using MPLitmusTestOrderPols = + camp::list, + camp::list, + camp::list, + camp::list >; + +using MPLitmusTestPols = + camp::cartesian_product; + +TYPED_TEST_SUITE_P(ForallAtomicLitmusTest); + +template +class ForallAtomicLitmusTest : public ::testing::Test +{ +}; + +TYPED_TEST_P(ForallAtomicLitmusTest, MessagePassingTest) +{ + using Type = typename camp::at>::type; + using SendRecvPol = typename camp::at>::type; + using SendPol = typename camp::at>::type; + using RecvPol = typename camp::at>::type; + + using MPTest = MessagePassingLitmus; + LitmusTestDriver::run(); +} + +REGISTER_TYPED_TEST_SUITE_P(ForallAtomicLitmusTest, MessagePassingTest); + +using MessagePassingTestTypes = Test::Types; + +INSTANTIATE_TYPED_TEST_SUITE_P(Hip, + ForallAtomicLitmusTest, + MessagePassingTestTypes); diff --git a/test/functional/forall/atomic-basic/test-forall-atomic-litmus-read.cpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-read.cpp new file mode 100644 index 0000000000..16c870e4b8 --- /dev/null +++ b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-read.cpp @@ -0,0 +1,219 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-atomic-types.hpp" +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-forall-data.hpp" +#include "RAJA_test-forall-execpol.hpp" +#include "RAJA_test-index-types.hpp" + +#include "test-forall-atomic-litmus-driver.hpp" + +// "Read" litmus test for DESUL ordered atomic +// --------------------------------------------------- +// Initial state: x = 0 && y = 0 +// +// Thread 1: Thread 2: +// ----------- ----------- +// store(x, 1) store(y, 2) +// store(y, 1) a = load(x) +// +// Allowed results: +// ---------------- +// Strong behaviors: +// - a = 1, x = 2 +// - a = 1, x = 1 +// - a = 0, x = 1 +// Weak behavior: +// - a = 0, x = 2 + +// Atomic policy: Relaxed (Weak), SeqCst +template +struct ReadLitmus { + using DataType = T; + using RelaxedPolicy = RAJA::atomic_relaxed; + size_t m_size; + int m_stride; + T *x; + T *y; + T *a; + + int strong_behavior_0{0}; + int strong_behavior_1{0}; + int interleaved_behavior{0}; + int weak_behavior{0}; + + void allocate(camp::resources::Resource work_res, size_t size, int stride) + { + m_size = size; + m_stride = stride; + x = work_res.allocate(size * stride); + y = work_res.allocate(size * stride); + a = work_res.allocate(size * stride); + } + + void deallocate(camp::resources::Resource work_res) + { + work_res.deallocate(x); + work_res.deallocate(y); + work_res.deallocate(a); + } + + void pre_run(camp::resources::Resource work_res) + { + work_res.memset(x, 0, sizeof(T) * m_size * m_stride); + work_res.memset(y, 0, sizeof(T) * m_size * m_stride); + work_res.memset(a, 0, sizeof(T) * m_size * m_stride); + +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaDeviceSynchronize()); +#endif + +#if defined(RAJA_ENABLE_HIP) + hipErrchk(hipDeviceSynchronize()); +#endif + } + + RAJA_HOST_DEVICE void run(int this_thread, int other_thread, int iter) + { + bool swap = this_thread % 2 == 0; + if (swap) { + read_1(other_thread, iter); + read_2(this_thread, iter); + } else { + read_2(this_thread, iter); + 
read_1(other_thread, iter); + } + } + + RAJA_HOST_DEVICE void read_1(int thread, int iter) + { + int thread_idx = thread * m_stride + iter; + RAJA::atomicExchange(&(x[thread_idx]), T{1}); + RAJA::atomicExchange(&(y[thread_idx]), T{1}); + } + + RAJA_HOST_DEVICE void read_2(int thread, int iter) + { + int thread_idx = thread * m_stride + iter; + RAJA::atomicExchange(&(y[thread_idx]), T{2}); + a[thread_idx] = RAJA::atomicAdd(&(x[thread_idx]), T{0}); + } + + void count_results(camp::resources::Resource work_res) + { + +#ifdef RAJA_ENABLE_HIP + using GPUExec = RAJA::hip_exec<256>; + using ReducePolicy = RAJA::hip_reduce; +#endif + +#ifdef RAJA_ENABLE_CUDA + using GPUExec = RAJA::cuda_exec<256>; + using ReducePolicy = RAJA::cuda_reduce; +#endif + RAJA::ReduceSum strong_cnt_0(0); + RAJA::ReduceSum strong_cnt_1(0); + RAJA::ReduceSum interleaved_cnt(0); + RAJA::ReduceSum weak_cnt(0); + RAJA::ReduceSum unexpected_cnt(0); + + T *a_local = a; + T *y_local = y; + + auto forall_len = RAJA::TypedRangeSegment(0, m_size * m_stride); + + RAJA::forall(forall_len, [=] RAJA_HOST_DEVICE(int i) { + if (a_local[i] == 1 && y_local[i] == 2) { + // Strong behavior: thread 1 happened before thread 2 + strong_cnt_0 += 1; + } else if (a_local[i] == 0 && y_local[i] == 1) { + // Strong behavior: thread 2 happened before thread 1 + strong_cnt_1 += 1; + } else if (a_local[i] == 1 && y_local[i] == 1) { + // Strong behavior: stores interleaved with receives + interleaved_cnt += 1; + } else if (a_local[i] == 0 && y_local[i] == 2) { + // Weak behavior: receives reordered after stores + weak_cnt += 1; + } else { + unexpected_cnt += 1; + } + }); + + EXPECT_EQ(unexpected_cnt.get(), 0); + + strong_behavior_0 += strong_cnt_0.get(); + strong_behavior_1 += strong_cnt_1.get(); + interleaved_behavior += interleaved_cnt.get(); + weak_behavior += weak_cnt.get(); + } + + void verify() + { + std::cerr << " - Strong behavior (a = 1, b = 0) = " << strong_behavior_0 + << "\n"; + std::cerr << " - Strong behavior (a = 0, b = 1) = " << strong_behavior_1 + << "\n"; + std::cerr << " - Strong behavior (a = 1, b = 1) = " << interleaved_behavior + << "\n"; + std::cerr << " - Weak behaviors = " << weak_behavior << "\n"; + + if (std::is_same::value) { + // In the relaxed case, we should observe some weak behaviors. + // Don't fail the test, but do print out a message. + if (weak_behavior == 0) { + std::cerr << "Warning - no weak behaviors detected in the control case." + << "\nThis litmus test may be insufficient to exercise " + "ordered memory atomics.\n"; + } else { + double overall_behavior_counts = strong_behavior_0 + strong_behavior_1 + + interleaved_behavior + weak_behavior; + std::cerr << "\n Weak behaviors detected in " + << 100 * (weak_behavior / overall_behavior_counts) + << "% of cases.\n"; + } + } else { + // We should not expect any weak behaviors if using a strong ordering. 
+ EXPECT_EQ(weak_behavior, 0); + } + } +}; + +using ReadLitmusTestOrderPols = + camp::list; + +using ReadLitmusTestPols = + camp::cartesian_product; + +TYPED_TEST_SUITE_P(ForallAtomicLitmusTest); + +template +class ForallAtomicLitmusTest : public ::testing::Test +{ +}; + +TYPED_TEST_P(ForallAtomicLitmusTest, ReadTest) +{ + using Type = typename camp::at>::type; + using AtomicPol = typename camp::at>::type; + + using ReadTest = ReadLitmus; + LitmusTestDriver::run(); +} + +REGISTER_TYPED_TEST_SUITE_P(ForallAtomicLitmusTest, ReadTest); + +using ReadTestTypes = Test::Types; + +INSTANTIATE_TYPED_TEST_SUITE_P(Hip, + ForallAtomicLitmusTest, + ReadTestTypes); diff --git a/test/functional/forall/atomic-basic/test-forall-atomic-litmus-sb.cpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-sb.cpp new file mode 100644 index 0000000000..666c1470ea --- /dev/null +++ b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-sb.cpp @@ -0,0 +1,228 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-atomic-types.hpp" +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-forall-data.hpp" +#include "RAJA_test-forall-execpol.hpp" +#include "RAJA_test-index-types.hpp" + +#include "test-forall-atomic-litmus-driver.hpp" + +// "Store buffer" litmus test for DESUL ordered atomic +// --------------------------------------------------- +// Initial state: x = 0 && y = 0 +// +// Thread 1: Thread 2: +// ----------- ----------- +// store(x, 1) store(y, 1) +// a = load(y) b = load(x) +// +// Allowed results: +// ---------------- +// Strong behaviors: +// - a = 1, b = 1 +// - a = 0, b = 1 +// - a = 1, b = 0 +// Weak behavior: +// - a = 0, b = 0 +// +// Acquire-release semantics are not enough to disallow the stores to be +// reordered after the load -- full sequential consistency is required in order +// to impose a "single total order" of the stores. 
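Aside (not part of this test file): the comment above notes that acquire/release alone cannot rule out the store-buffer weak outcome and that full sequential consistency is needed to impose a single total order on the stores. The same pattern can be reproduced on the host with std::atomic; a minimal sketch (assuming a -pthread build) is below, where the a == 0 && b == 0 outcome is forbidden under std::memory_order_seq_cst but permitted if order is switched to std::memory_order_relaxed.

#include <atomic>
#include <cstdio>
#include <thread>

// Host-side sketch of the store-buffer litmus pattern using std::atomic.
int main()
{
  int weak = 0;
  constexpr int iters = 10000;
  constexpr auto order = std::memory_order_seq_cst;  // try memory_order_relaxed

  for (int i = 0; i < iters; ++i) {
    std::atomic<int> x{0}, y{0};
    int a = -1, b = -1;

    std::thread t1([&] {
      x.store(1, order);   // store(x, 1)
      a = y.load(order);   // a = load(y)
    });
    std::thread t2([&] {
      y.store(1, order);   // store(y, 1)
      b = x.load(order);   // b = load(x)
    });
    t1.join();
    t2.join();

    if (a == 0 && b == 0) { ++weak; }  // store-buffer weak behavior
  }
  std::printf("weak behaviors: %d / %d\n", weak, iters);
  return 0;
}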
+ +// Send policy: Relaxed, SeqCst (Strong) +// Recv policy: Relaxed, SeqCst (Strong) +template +struct StoreBufferLitmus { + using DataType = T; + using RelaxedPolicy = RAJA::atomic_relaxed; + size_t m_size; + int m_stride; + T *x; + T *y; + T *a; + T *b; + + int strong_behavior_0{0}; + int strong_behavior_1{0}; + int interleaved_behavior{0}; + int weak_behavior{0}; + + void allocate(camp::resources::Resource work_res, size_t size, int stride) + { + m_size = size; + m_stride = stride; + x = work_res.allocate(size * stride); + y = work_res.allocate(size * stride); + a = work_res.allocate(size * stride); + b = work_res.allocate(size * stride); + } + + void deallocate(camp::resources::Resource work_res) + { + work_res.deallocate(x); + work_res.deallocate(y); + work_res.deallocate(a); + work_res.deallocate(b); + } + + void pre_run(camp::resources::Resource work_res) + { + work_res.memset(x, 0, sizeof(T) * m_size * m_stride); + work_res.memset(y, 0, sizeof(T) * m_size * m_stride); + work_res.memset(a, 0, sizeof(T) * m_size * m_stride); + work_res.memset(b, 0, sizeof(T) * m_size * m_stride); + +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaDeviceSynchronize()); +#endif + +#if defined(RAJA_ENABLE_HIP) + hipErrchk(hipDeviceSynchronize()); +#endif + } + + RAJA_HOST_DEVICE void run(int this_thread, int other_thread, int iter) + { + bool swap = this_thread % 2 == 0; + if (swap) { + store_buffer_1(other_thread, iter); + store_buffer_2(this_thread, iter); + } else { + store_buffer_2(this_thread, iter); + store_buffer_1(other_thread, iter); + } + } + + RAJA_HOST_DEVICE void store_buffer_1(int thread, int iter) + { + int thread_idx = thread * m_stride + iter; + RAJA::atomicAdd(&(x[thread_idx]), T{1}); + a[thread_idx] = RAJA::atomicAdd(&(y[thread_idx]), T{0}); + } + + RAJA_HOST_DEVICE void store_buffer_2(int thread, int iter) + { + int thread_idx = thread * m_stride + iter; + RAJA::atomicAdd(&(y[thread_idx]), T{1}); + b[thread_idx] = RAJA::atomicAdd(&(x[thread_idx]), T{0}); + } + + void count_results(camp::resources::Resource work_res) + { + +#ifdef RAJA_ENABLE_HIP + using GPUExec = RAJA::hip_exec<256>; + using ReducePolicy = RAJA::hip_reduce; +#endif + +#ifdef RAJA_ENABLE_CUDA + using GPUExec = RAJA::cuda_exec<256>; + using ReducePolicy = RAJA::cuda_reduce; +#endif + RAJA::ReduceSum strong_cnt_0(0); + RAJA::ReduceSum strong_cnt_1(0); + RAJA::ReduceSum interleaved_cnt(0); + RAJA::ReduceSum weak_cnt(0); + RAJA::ReduceSum unexpected_cnt(0); + + T *a_local = a; + T *b_local = b; + + auto forall_len = RAJA::TypedRangeSegment(0, m_size * m_stride); + + RAJA::forall(forall_len, [=] RAJA_HOST_DEVICE(int i) { + if (a_local[i] == 1 && b_local[i] == 0) { + // Strong behavior: thread 1 happened before thread 2 + strong_cnt_0 += 1; + } else if (a_local[i] == 0 && b_local[i] == 1) { + // Strong behavior: thread 2 happened before thread 1 + strong_cnt_1 += 1; + } else if (a_local[i] == 1 && b_local[i] == 1) { + // Strong behavior: stores interleaved with receives + interleaved_cnt += 1; + } else if (a_local[i] == 0 && b_local[i] == 0) { + // Weak behavior: stores reordered after receives + weak_cnt += 1; + } else { + unexpected_cnt += 1; + } + }); + + EXPECT_EQ(unexpected_cnt.get(), 0); + + strong_behavior_0 += strong_cnt_0.get(); + strong_behavior_1 += strong_cnt_1.get(); + interleaved_behavior += interleaved_cnt.get(); + weak_behavior += weak_cnt.get(); + } + + void verify() + { + std::cerr << " - Strong behavior (a = 1, b = 0) = " << strong_behavior_0 + << "\n"; + std::cerr << " - Strong behavior (a = 0, b = 1) = 
" << strong_behavior_1 + << "\n"; + std::cerr << " - Strong behavior (a = 1, b = 1) = " << interleaved_behavior + << "\n"; + std::cerr << " - Weak behaviors = " << weak_behavior << "\n"; + + if (std::is_same::value) { + // In the relaxed case, we should observe some weak behaviors. + // Don't fail the test, but do print out a message. + if (weak_behavior == 0) { + std::cerr << "Warning - no weak behaviors detected in the control case." + << "\nThis litmus test may be insufficient to exercise " + "ordered memory atomics.\n"; + } else { + double overall_behavior_counts = strong_behavior_0 + strong_behavior_1 + + interleaved_behavior + weak_behavior; + std::cerr << "\n Weak behaviors detected in " + << 100 * (weak_behavior / overall_behavior_counts) + << "% of cases.\n"; + } + } else { + // We should not expect any weak behaviors if using a strong ordering. + EXPECT_EQ(weak_behavior, 0); + } + } +}; + +using SBLitmusTestOrderPols = + camp::list; + +using SBLitmusTestPols = + camp::cartesian_product; + +TYPED_TEST_SUITE_P(ForallAtomicLitmusTest); + +template +class ForallAtomicLitmusTest : public ::testing::Test +{ +}; + +TYPED_TEST_P(ForallAtomicLitmusTest, StoreBufferTest) +{ + using Type = typename camp::at>::type; + using AtomicPol = typename camp::at>::type; + + using SBTest = StoreBufferLitmus; + LitmusTestDriver::run(); +} + +REGISTER_TYPED_TEST_SUITE_P(ForallAtomicLitmusTest, StoreBufferTest); + +using StoreBufferTestTypes = Test::Types; + +INSTANTIATE_TYPED_TEST_SUITE_P(Hip, + ForallAtomicLitmusTest, + StoreBufferTestTypes); diff --git a/test/functional/forall/atomic-basic/test-forall-atomic-litmus-store.cpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-store.cpp new file mode 100644 index 0000000000..10fd9edeb8 --- /dev/null +++ b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-store.cpp @@ -0,0 +1,226 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-atomic-types.hpp" +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-forall-data.hpp" +#include "RAJA_test-forall-execpol.hpp" +#include "RAJA_test-index-types.hpp" + +#include "test-forall-atomic-litmus-driver.hpp" + +// "Store" litmus test for DESUL ordered atomic +// --------------------------------------------------- +// Initial state: x = 0 && y = 0 +// +// Thread 1: Thread 2: +// ----------- ----------- +// store(x, 2) a = load(y) +// store(y, 1) store(x, 1) +// +// Allowed results: +// ---------------- +// Strong behaviors: +// - a = 1, x = 1 +// - a = 0, x = 2 +// - a = 0, x = 1 +// Weak behavior: +// - a = 1, x = 2 + +// Send policy: Relaxed (Weak), Acquire, AcqRel, SeqCst +// Recv policy: Relaxed (Weak), Release, AcqRel, SeqCst +template +struct StoreLitmus { + using DataType = T; + using RelaxedPolicy = RAJA::atomic_relaxed; + size_t m_size; + int m_stride; + T *x; + T *y; + T *a; + + int strong_behavior_0{0}; + int strong_behavior_1{0}; + int interleaved_behavior{0}; + int weak_behavior{0}; + + void allocate(camp::resources::Resource work_res, size_t size, int stride) + { + m_size = size; + m_stride = stride; + x = work_res.allocate(size * stride); + y = work_res.allocate(size * stride); + a = work_res.allocate(size * stride); + } + + void deallocate(camp::resources::Resource work_res) + { + work_res.deallocate(x); + work_res.deallocate(y); + work_res.deallocate(a); + } + + void pre_run(camp::resources::Resource work_res) + { + work_res.memset(x, 0, sizeof(T) * m_size * m_stride); + work_res.memset(y, 0, sizeof(T) * m_size * m_stride); + work_res.memset(a, 0, sizeof(T) * m_size * m_stride); + +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaDeviceSynchronize()); +#endif + +#if defined(RAJA_ENABLE_HIP) + hipErrchk(hipDeviceSynchronize()); +#endif + } + + RAJA_HOST_DEVICE void run(int this_thread, int other_thread, int iter) + { + bool swap = this_thread % 2 == 0; + if (swap) { + store_1(other_thread, iter); + store_2(this_thread, iter); + } else { + store_2(this_thread, iter); + store_1(other_thread, iter); + } + } + + RAJA_HOST_DEVICE void store_1(int thread, int iter) + { + int thread_idx = thread * m_stride + iter; + RAJA::atomicExchange(&(x[thread_idx]), T{2}); + RAJA::atomicAdd(&(y[thread_idx]), T{1}); + } + + RAJA_HOST_DEVICE void store_2(int thread, int iter) + { + int thread_idx = thread * m_stride + iter; + a[thread_idx] = RAJA::atomicAdd(&(y[thread_idx]), T{0}); + RAJA::atomicExchange(&(x[thread_idx]), T{1}); + } + + void count_results(camp::resources::Resource work_res) + { + +#ifdef RAJA_ENABLE_HIP + using GPUExec = RAJA::hip_exec<256>; + using ReducePolicy = RAJA::hip_reduce; +#endif + +#ifdef RAJA_ENABLE_CUDA + using GPUExec = RAJA::cuda_exec<256>; + using ReducePolicy = RAJA::cuda_reduce; +#endif + RAJA::ReduceSum strong_cnt_0(0); + RAJA::ReduceSum strong_cnt_1(0); + RAJA::ReduceSum interleaved_cnt(0); + RAJA::ReduceSum weak_cnt(0); + RAJA::ReduceSum unexpected_cnt(0); + + T *a_local = a; + T *x_local = x; + + auto forall_len = RAJA::TypedRangeSegment(0, m_size * m_stride); + + RAJA::forall(forall_len, [=] RAJA_HOST_DEVICE(int i) { + if (a_local[i] == 1 && x_local[i] == 1) { + // Strong behavior: thread 1 happened before thread 2 + strong_cnt_0 += 1; + } else if (a_local[i] == 0 && x_local[i] == 2) { + // Strong behavior: thread 2 happened before 
thread 1 + strong_cnt_1 += 1; + } else if (a_local[i] == 0 && x_local[i] == 1) { + // Strong behavior: stores interleaved with receives + interleaved_cnt += 1; + } else if (a_local[i] == 1 && x_local[i] == 2) { + // Weak behavior: receives reordered after stores + weak_cnt += 1; + } else { + unexpected_cnt += 1; + } + }); + + EXPECT_EQ(unexpected_cnt.get(), 0); + + strong_behavior_0 += strong_cnt_0.get(); + strong_behavior_1 += strong_cnt_1.get(); + interleaved_behavior += interleaved_cnt.get(); + weak_behavior += weak_cnt.get(); + } + + void verify() + { + std::cerr << " - Strong behavior (a = 1, b = 0) = " << strong_behavior_0 + << "\n"; + std::cerr << " - Strong behavior (a = 0, b = 1) = " << strong_behavior_1 + << "\n"; + std::cerr << " - Strong behavior (a = 1, b = 1) = " << interleaved_behavior + << "\n"; + std::cerr << " - Weak behaviors = " << weak_behavior << "\n"; + + if (std::is_same::value && + std::is_same::value) { + // In the relaxed case, we should observe some weak behaviors. + // Don't fail the test, but do print out a message. + if (weak_behavior == 0) { + std::cerr << "Warning - no weak behaviors detected in the control case." + << "\nThis litmus test may be insufficient to exercise " + "ordered memory atomics.\n"; + } else { + double overall_behavior_counts = strong_behavior_0 + strong_behavior_1 + + interleaved_behavior + weak_behavior; + std::cerr << "\n Weak behaviors detected in " + << 100 * (weak_behavior / overall_behavior_counts) + << "% of cases.\n"; + } + } else { + // We should not expect any weak behaviors if using a strong ordering. + EXPECT_EQ(weak_behavior, 0); + } + } +}; + +using StoreLitmusTestOrderPols = + camp::list, + camp::list, + camp::list, + camp::list >; + +using StoreLitmusTestPols = + camp::cartesian_product; + +TYPED_TEST_SUITE_P(ForallAtomicLitmusTest); + +template +class ForallAtomicLitmusTest : public ::testing::Test +{ +}; + +TYPED_TEST_P(ForallAtomicLitmusTest, StoreTest) +{ + using Type = typename camp::at>::type; + using SendRecvPol = typename camp::at>::type; + using SendPol = typename camp::at>::type; + using RecvPol = typename camp::at>::type; + + using StoreTest = StoreLitmus; + LitmusTestDriver::run(); +} + +REGISTER_TYPED_TEST_SUITE_P(ForallAtomicLitmusTest, StoreTest); + +using StoreTestTypes = Test::Types; + +INSTANTIATE_TYPED_TEST_SUITE_P(Hip, + ForallAtomicLitmusTest, + StoreTestTypes); diff --git a/test/functional/forall/atomic-basic/test-forall-atomic-litmus-write2x2.cpp b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-write2x2.cpp new file mode 100644 index 0000000000..b5dc28ac5f --- /dev/null +++ b/test/functional/forall/atomic-basic/test-forall-atomic-litmus-write2x2.cpp @@ -0,0 +1,222 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-atomic-types.hpp" +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-forall-data.hpp" +#include "RAJA_test-forall-execpol.hpp" +#include "RAJA_test-index-types.hpp" + +#include "test-forall-atomic-litmus-driver.hpp" + +// "2+2 Write" litmus test for DESUL ordered atomic +// --------------------------------------------------- +// Initial state: x = 0 && y = 0 +// +// Thread 1: Thread 2: +// ----------- ----------- +// store(x, 1) store(y, 1) +// store(y, 2) store(x, 2) +// +// Allowed results: +// ---------------- +// Strong behaviors: +// - a = 1, x = 2 +// - a = 2, x = 1 +// - a = 2, x = 2 +// Weak behavior: +// - a = 1, x = 1 + +// Send policy: Relaxed (Weak), AcqRel, SeqCst +// Recv policy: Relaxed (Weak), AcqRel, SeqCst +template +struct Write2x2Litmus { + using DataType = T; + using RelaxedPolicy = RAJA::atomic_relaxed; + size_t m_size; + int m_stride; + T *x; + T *y; + T *a; + + int strong_behavior_0{0}; + int strong_behavior_1{0}; + int interleaved_behavior{0}; + int weak_behavior{0}; + + void allocate(camp::resources::Resource work_res, size_t size, int stride) + { + m_size = size; + m_stride = stride; + x = work_res.allocate(size * stride); + y = work_res.allocate(size * stride); + a = work_res.allocate(size * stride); + } + + void deallocate(camp::resources::Resource work_res) + { + work_res.deallocate(x); + work_res.deallocate(y); + work_res.deallocate(a); + } + + void pre_run(camp::resources::Resource work_res) + { + work_res.memset(x, 0, sizeof(T) * m_size * m_stride); + work_res.memset(y, 0, sizeof(T) * m_size * m_stride); + work_res.memset(a, 0, sizeof(T) * m_size * m_stride); + +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaDeviceSynchronize()); +#endif + +#if defined(RAJA_ENABLE_HIP) + hipErrchk(hipDeviceSynchronize()); +#endif + } + + RAJA_HOST_DEVICE void run(int this_thread, int other_thread, int iter) + { + bool swap = this_thread % 2 == 0; + if (swap) { + store_1(other_thread, iter); + store_2(this_thread, iter); + } else { + store_2(this_thread, iter); + store_1(other_thread, iter); + } + } + + RAJA_HOST_DEVICE void store_1(int thread, int iter) + { + int thread_idx = thread * m_stride + iter; + RAJA::atomicExchange(&(x[thread_idx]), T{1}); + RAJA::atomicExchange(&(y[thread_idx]), T{2}); + } + + RAJA_HOST_DEVICE void store_2(int thread, int iter) + { + int thread_idx = thread * m_stride + iter; + RAJA::atomicExchange(&(y[thread_idx]), T{1}); + RAJA::atomicExchange(&(x[thread_idx]), T{2}); + } + + void count_results(camp::resources::Resource work_res) + { + +#ifdef RAJA_ENABLE_HIP + using GPUExec = RAJA::hip_exec<256>; + using ReducePolicy = RAJA::hip_reduce; +#endif + +#ifdef RAJA_ENABLE_CUDA + using GPUExec = RAJA::cuda_exec<256>; + using ReducePolicy = RAJA::cuda_reduce; +#endif + RAJA::ReduceSum strong_cnt_0(0); + RAJA::ReduceSum strong_cnt_1(0); + RAJA::ReduceSum interleaved_cnt(0); + RAJA::ReduceSum weak_cnt(0); + RAJA::ReduceSum unexpected_cnt(0); + + T *x_local = x; + T *y_local = y; + + auto forall_len = RAJA::TypedRangeSegment(0, m_size * m_stride); + + RAJA::forall(forall_len, [=] RAJA_HOST_DEVICE(int i) { + if (x_local[i] == 1 && y_local[i] == 2) { + // Strong behavior: thread 1 happened before thread 2 + strong_cnt_0 += 1; + } else if (x_local[i] == 2 && y_local[i] == 1) { + // Strong behavior: thread 2 happened before thread 1 + 
strong_cnt_1 += 1; + } else if (x_local[i] == 2 && y_local[i] == 2) { + // Strong behavior: interleaved stores in-order + interleaved_cnt += 1; + } else if (x_local[i] == 1 && y_local[i] == 1) { + // Weak behavior: stores on each thread were reordered + weak_cnt += 1; + } else { + unexpected_cnt += 1; + } + }); + + EXPECT_EQ(unexpected_cnt.get(), 0); + + strong_behavior_0 += strong_cnt_0.get(); + strong_behavior_1 += strong_cnt_1.get(); + interleaved_behavior += interleaved_cnt.get(); + weak_behavior += weak_cnt.get(); + } + + void verify() + { + std::cerr << " - Strong behavior (a = 1, b = 0) = " << strong_behavior_0 + << "\n"; + std::cerr << " - Strong behavior (a = 0, b = 1) = " << strong_behavior_1 + << "\n"; + std::cerr << " - Strong behavior (a = 1, b = 1) = " << interleaved_behavior + << "\n"; + std::cerr << " - Weak behaviors = " << weak_behavior << "\n"; + + if (std::is_same::value) { + // In the relaxed case, we should observe some weak behaviors. + // Don't fail the test, but do print out a message. + if (weak_behavior == 0) { + std::cerr << "Warning - no weak behaviors detected in the control case." + << "\nThis litmus test may be insufficient to exercise " + "ordered memory atomics.\n"; + } else { + double overall_behavior_counts = strong_behavior_0 + strong_behavior_1 + + interleaved_behavior + weak_behavior; + std::cerr << "\n Weak behaviors detected in " + << 100 * (weak_behavior / overall_behavior_counts) + << "% of cases.\n"; + } + } else { + // We should not expect any weak behaviors if using a strong ordering. + EXPECT_EQ(weak_behavior, 0); + } + } +}; + +using Write2x2LitmusTestOrderPols = + camp::list; + +using Write2x2LitmusTestPols = + camp::cartesian_product; + +TYPED_TEST_SUITE_P(ForallAtomicLitmusTest); + +template +class ForallAtomicLitmusTest : public ::testing::Test +{ +}; + +TYPED_TEST_P(ForallAtomicLitmusTest, Write2x2Test) +{ + using Type = typename camp::at>::type; + using AtomicPol = typename camp::at>::type; + + using Write2x2Test = Write2x2Litmus; + LitmusTestDriver::run(); +} + +REGISTER_TYPED_TEST_SUITE_P(ForallAtomicLitmusTest, Write2x2Test); + +using Write2x2TestTypes = Test::Types; + +INSTANTIATE_TYPED_TEST_SUITE_P(Hip, + ForallAtomicLitmusTest, + Write2x2TestTypes); diff --git a/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-driver.hpp b/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-driver.hpp new file mode 100644 index 0000000000..3e6b9520e8 --- /dev/null +++ b/test/functional/forall/atomic-basic/tests/test-forall-atomic-litmus-driver.hpp @@ -0,0 +1,325 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-atomic-types.hpp" +#include "RAJA_test-atomicpol.hpp" +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-forall-data.hpp" +#include "RAJA_test-forall-execpol.hpp" +#include "RAJA_test-index-types.hpp" + +// +// Header for tests in ./tests directory +// +// Note: CMake adds ./tests as an include dir for these tests. 
+// +#include +#include + +using IdxType = size_t; +constexpr int NUM_ITERS = 100; +#ifdef RAJA_ENABLE_CUDA +constexpr int STRIDE = 32; +constexpr bool STRESS_BEFORE_TEST = true; +constexpr bool NONTESTING_BLOCKS = true; +constexpr int WARP_SIZE = 32; +constexpr int BLOCK_SIZE = 128; +#elif defined(RAJA_ENABLE_HIP) +constexpr int STRIDE = 32; +constexpr bool STRESS_BEFORE_TEST = true; +constexpr bool NONTESTING_BLOCKS = true; +constexpr int WARP_SIZE = 64; +constexpr int BLOCK_SIZE = 128; +#endif + +constexpr int STRESS_STRIDE = 64; +constexpr int NUM_STRESS_LINES = 2048; +constexpr int DATA_STRESS_SIZE = NUM_STRESS_LINES * STRESS_STRIDE; + +constexpr int PERMUTE_PRIME_BLOCK = 17; +constexpr int PERMUTE_PRIME_GRID = 47; +constexpr int PERMUTE_STRESS = 977; + +template +__global__ void dummy_kernel(IdxType index, Func func) +{ + func(index); +} + +// Generic test driver for memory litmus tests on the GPU. +template +struct LitmusTestDriver { +public: + using T = typename LitmusPolicy::DataType; + struct TestData { + int block_size; + int grid_size; + + int stride = STRIDE; + + int pre_stress_iterations = 4; + int stress_iterations = 12; + + // Number of blocks to run the message-passing litmus test on. + int testing_blocks; + + // Array to shuffle block indices in a test kernel. + IdxType* shuffle_block; + + // Barrier integers to synchronize testing threads. + IdxType* barriers; + + int* data_stress; + + int num_stress_index; + IdxType* stress_index; + + void allocate(camp::resources::Resource work_res, + int grid_size, + int block_size, + int num_testing_blocks) + { + this->grid_size = grid_size; + this->block_size = block_size; + + testing_blocks = num_testing_blocks; + + shuffle_block = work_res.allocate(grid_size); + barriers = work_res.allocate(STRIDE); + data_stress = work_res.allocate(DATA_STRESS_SIZE); + + num_stress_index = 64; + stress_index = work_res.allocate(num_stress_index); + } + + void pre_run(camp::resources::Resource work_res) + { + std::random_device rand_device; + // Create a random permutation for the range [0, grid_size) + std::vector shuffle_block_host(grid_size); + { + std::iota(shuffle_block_host.begin(), shuffle_block_host.end(), 0); + std::shuffle(shuffle_block_host.begin(), + shuffle_block_host.end(), + std::mt19937{rand_device()}); + } + work_res.memcpy(shuffle_block, + shuffle_block_host.data(), + sizeof(IdxType) * grid_size); + + std::vector stress_index_host(num_stress_index); + { + std::mt19937 gen{rand_device()}; + std::uniform_int_distribution rnd_stress_offset(0, + STRESS_STRIDE); + std::uniform_int_distribution rnd_stress_dist( + 0, NUM_STRESS_LINES - 1); + std::generate(stress_index_host.begin(), + stress_index_host.end(), + [&]() -> IdxType { + return rnd_stress_dist(gen) * STRESS_STRIDE; + }); + } + work_res.memcpy(stress_index, + stress_index_host.data(), + sizeof(IdxType) * num_stress_index); + + work_res.memset(barriers, 0, sizeof(IdxType) * STRIDE); + work_res.memset(data_stress, 0, sizeof(int) * DATA_STRESS_SIZE); + + +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaDeviceSynchronize()); +#endif + +#if defined(RAJA_ENABLE_HIP) + hipErrchk(hipDeviceSynchronize()); +#endif + } + + void deallocate(camp::resources::Resource work_res) + { + work_res.deallocate(shuffle_block); + work_res.deallocate(barriers); + work_res.deallocate(data_stress); + } + }; + + RAJA_HOST_DEVICE LitmusTestDriver() {} + + // Run + static void run() + { + int num_blocks = 0; + { + LitmusPolicy dummy_policy{}; + TestData dummy_test_data{}; + auto lambda = [=] 
RAJA_HOST_DEVICE(IdxType index) { + LitmusTestDriver test_inst{}; + test_inst.test_main(index, dummy_test_data, dummy_policy); + }; + RAJA_UNUSED_VAR(lambda); +#ifdef RAJA_ENABLE_CUDA + cudaErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &num_blocks, dummy_kernel, BLOCK_SIZE, 0)); + num_blocks *= RAJA::cuda::device_prop().multiProcessorCount; +#endif +#ifdef RAJA_ENABLE_HIP + hipErrchk(hipOccupancyMaxActiveBlocksPerMultiprocessor( + &num_blocks, dummy_kernel, BLOCK_SIZE, 0)); + num_blocks *= RAJA::hip::device_prop().multiProcessorCount; +#endif + } + std::cout << "Got num_blocks = " << num_blocks + << ", block_size = " << BLOCK_SIZE << "\n" + << std::flush; + if (num_blocks == 0) { + FAIL() << "Grid size wasn't set to a valid value.\n"; + } + +#ifdef RAJA_ENABLE_CUDA + using ResourcePolicy = camp::resources::Cuda; +#endif +#ifdef RAJA_ENABLE_HIP + using ResourcePolicy = camp::resources::Hip; +#endif + camp::resources::Resource work_res{ResourcePolicy()}; + + int num_testing_blocks = num_blocks; + if (NONTESTING_BLOCKS) { + num_testing_blocks = num_blocks / 4; + } + + TestData test_data; + test_data.allocate(work_res, num_blocks, BLOCK_SIZE, num_testing_blocks); + + LitmusPolicy litmus_test; + litmus_test.allocate(work_res, num_testing_blocks * BLOCK_SIZE, STRIDE); + +#ifdef RAJA_ENABLE_HIP + using GPUExec = RAJA::hip_exec; +#endif + +#ifdef RAJA_ENABLE_CUDA + using GPUExec = RAJA::cuda_exec; +#endif + + for (int iter = 0; iter < NUM_ITERS; iter++) { + test_data.pre_run(work_res); + litmus_test.pre_run(work_res); + + RAJA::forall( + RAJA::TypedRangeSegment(0, num_blocks * BLOCK_SIZE), + [=] RAJA_HOST_DEVICE(IdxType index) { + LitmusTestDriver test_inst{}; + test_inst.test_main(index, test_data, litmus_test); + }); + + litmus_test.count_results(work_res); + } + + litmus_test.verify(); + + litmus_test.deallocate(work_res); + test_data.deallocate(work_res); + } + +private: + using NormalAtomic = RAJA::atomic_relaxed; + + RAJA_HOST_DEVICE void test_main(IdxType index, + TestData param, + LitmusPolicy test) + { + IdxType block_idx = index / param.block_size; + IdxType thread_idx = index % param.block_size; + + // Permute the thread index, to promote scattering of memory accesses + // within a block. + IdxType permute_thread_idx = + (thread_idx * PERMUTE_PRIME_BLOCK) % param.block_size; + + // Shuffle the block ID randomly according to a permutation array. + block_idx = param.shuffle_block[block_idx]; + + IdxType data_idx = block_idx * param.block_size + thread_idx; + + for (int i = 0; i < param.stride; i++) { + + // Synchronize all blocks before testing, to increase the chance of + // interleaved requests. + this->sync(param.grid_size, thread_idx, param.barriers[i]); + + if (block_idx < (IdxType)param.testing_blocks) { + + // Block is a testing block. + // + // Each block acts as a "sender" to a unique "partner" block. This is + // done by permuting the block IDs with a function p(i) = i * k mod n, + // where n is the number of blocks being tested, and k and n are + // coprime. + int partner_idx = + (block_idx * PERMUTE_PRIME_GRID + i) % param.testing_blocks; + + // Run specified test, matching threads between the two paired blocks. 
+ int other_data_idx = + partner_idx * param.block_size + permute_thread_idx; + + // Pre-stress pattern - stressing memory accesses before the test may + // increase the rate of weak memory behaviors + // Helps on AMD, doesn't seem to help on NVIDIA + if (STRESS_BEFORE_TEST) { + this->stress(block_idx, thread_idx, param, true); + } + + test.run(data_idx, other_data_idx, i); + } else { + this->stress(block_idx, thread_idx, param); + } + } + }; + + RAJA_HOST_DEVICE void sync(int num_blocks, int thread_idx, IdxType& barrier) + { + if (thread_idx == 0) { + IdxType result = RAJA::atomicAdd(&barrier, IdxType{1}); + // Busy-wait until all blocks perform the above add. + while (result != num_blocks) { + result = RAJA::atomicAdd(&barrier, IdxType{0}); + } + } + +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_CODE__) + __syncthreads(); +#endif + } + + RAJA_HOST_DEVICE void stress(IdxType block_idx, + IdxType thread_idx, + const TestData& param, + bool pre_stress = false) + { + int num_iters = + (pre_stress ? param.pre_stress_iterations : param.stress_iterations); + int volatile* stress_data = param.data_stress; + if (thread_idx % WARP_SIZE == 0) { + int warp_idx = thread_idx / WARP_SIZE; + // int select_idx = block_idx * NUM_WARPS + warp_idx; + int select_idx = block_idx; + select_idx = (select_idx * PERMUTE_STRESS) % param.num_stress_index; + int stress_line = param.stress_index[select_idx]; + for (int i = 0; i < num_iters; i++) { + int data = stress_data[stress_line]; + stress_data[stress_line] = i + 1; + } + } + } +};
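Usage note (not part of the diff): with RAJA configured for desul atomics (RAJA_ENABLE_DESUL_ATOMICS), the ordering-aware policies introduced in policy.hpp are meant to drop into the existing RAJA atomic free functions exactly where atomic_relaxed appears in the tests above. A minimal host-side sketch of the intended call pattern, assuming only the alias names added by this diff and the standard policy-templated RAJA atomic interface, could look like:

#include "RAJA/RAJA.hpp"

#include <cstdio>

int main()
{
  int data = 0;
  int flag = 0;

  // Producer side of a message-passing pattern: publish the payload, then
  // release-store the flag so the payload cannot be reordered after it.
  RAJA::atomicAdd<RAJA::atomic_relaxed>(&data, 42);
  RAJA::atomicExchange<RAJA::atomic_release>(&flag, 1);

  // Consumer side: acquire-read the flag before reading the payload.
  // (Single-threaded here purely to show the call signatures; the litmus
  // tests above exercise these orderings across GPU thread blocks.)
  if (RAJA::atomicAdd<RAJA::atomic_acquire>(&flag, 0) == 1) {
    int value = RAJA::atomicAdd<RAJA::atomic_relaxed>(&data, 0);
    std::printf("flag observed, data = %d\n", value);
  }
  return 0;
}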