diff --git a/make/autoconf/flags-cflags.m4 b/make/autoconf/flags-cflags.m4
index ab9cd8be19bcb..16ae2ce15855b 100644
--- a/make/autoconf/flags-cflags.m4
+++ b/make/autoconf/flags-cflags.m4
@@ -784,6 +784,14 @@ AC_DEFUN([FLAGS_SETUP_CFLAGS_CPU_DEP],
   elif test "x$TOOLCHAIN_TYPE" = xmicrosoft; then
     if test "x$FLAGS_CPU" = xx86; then
       $1_CFLAGS_CPU_JVM="-arch:IA32"
+    elif test "x$FLAGS_CPU" = xaarch64; then
+      # MSVC defaults to /volatile:iso on ARM64, which makes volatile reads/writes
+      # plain LDR/STR with no acquire/release barriers. HotSpot's C++ runtime code
+      # was written assuming volatile provides acquire/release semantics (as on x86
+      # and GCC/Clang AArch64). Use /volatile:ms to restore those semantics and
+      # prevent memory ordering bugs in ObjectMonitor, ParkEvent, and other
+      # lock-free algorithms that use plain volatile fields.
+      $1_CFLAGS_CPU_JVM="-volatile:ms"
     elif test "x$OPENJDK_TARGET_CPU" = xx86_64; then
       if test "x$DEBUG_LEVEL" != xrelease; then
         # NOTE: This is probably redundant; -homeparams is default on
diff --git a/src/hotspot/os_cpu/windows_aarch64/atomicAccess_windows_aarch64.hpp b/src/hotspot/os_cpu/windows_aarch64/atomicAccess_windows_aarch64.hpp
index f8119654c50b5..70fa932129f23 100644
--- a/src/hotspot/os_cpu/windows_aarch64/atomicAccess_windows_aarch64.hpp
+++ b/src/hotspot/os_cpu/windows_aarch64/atomicAccess_windows_aarch64.hpp
@@ -114,4 +114,201 @@ DEFINE_INTRINSIC_CMPXCHG(InterlockedCompareExchange64, __int64)
 
 #undef DEFINE_INTRINSIC_CMPXCHG
 
+// Override PlatformLoad and PlatformStore to use LDAR/STLR on Windows AArch64.
+//
+// The generic PlatformLoad and PlatformStore use plain volatile dereferences.
+// With /volatile:ms (set in flags-cflags.m4 for AArch64), MSVC already compiles
+// those to LDAR/STLR, so these overrides produce identical codegen. They are
+// retained as defense-in-depth: they guarantee acquire/release semantics for
+// AtomicAccess::load()/AtomicAccess::store() regardless of the compiler flag setting,
+// ensuring correct cross-core visibility for HotSpot's lock-free algorithms
+// (ObjectMonitor Dekker protocols, ParkEvent signaling, etc.) even if
+// /volatile:ms were ever removed or overridden.
+
+template<>
+struct AtomicAccess::PlatformLoad<1> {
+  template<typename T>
+  T operator()(T const volatile* dest) const {
+    STATIC_ASSERT(sizeof(T) == 1);
+    return PrimitiveConversions::cast<T>(
+        __ldar8(reinterpret_cast<volatile unsigned __int8*>(
+            const_cast<T volatile*>(dest))));
+  }
+};
+
+template<>
+struct AtomicAccess::PlatformLoad<2> {
+  template<typename T>
+  T operator()(T const volatile* dest) const {
+    STATIC_ASSERT(sizeof(T) == 2);
+    return PrimitiveConversions::cast<T>(
+        __ldar16(reinterpret_cast<volatile unsigned __int16*>(
+            const_cast<T volatile*>(dest))));
+  }
+};
+
+template<>
+struct AtomicAccess::PlatformLoad<4> {
+  template<typename T>
+  T operator()(T const volatile* dest) const {
+    STATIC_ASSERT(sizeof(T) == 4);
+    return PrimitiveConversions::cast<T>(
+        __ldar32(reinterpret_cast<volatile unsigned __int32*>(
+            const_cast<T volatile*>(dest))));
+  }
+};
+
+template<>
+struct AtomicAccess::PlatformLoad<8> {
+  template<typename T>
+  T operator()(T const volatile* dest) const {
+    STATIC_ASSERT(sizeof(T) == 8);
+    return PrimitiveConversions::cast<T>(
+        __ldar64(reinterpret_cast<volatile unsigned __int64*>(
+            const_cast<T volatile*>(dest))));
+  }
+};
+
+template<>
+struct AtomicAccess::PlatformStore<1> {
+  template<typename T>
+  void operator()(T volatile* dest, T new_value) const {
+    STATIC_ASSERT(sizeof(T) == 1);
+    __stlr8(reinterpret_cast<volatile unsigned __int8*>(dest),
+            PrimitiveConversions::cast<unsigned __int8>(new_value));
+  }
+};
+
+template<>
+struct AtomicAccess::PlatformStore<2> {
+  template<typename T>
+  void operator()(T volatile* dest, T new_value) const {
+    STATIC_ASSERT(sizeof(T) == 2);
+    __stlr16(reinterpret_cast<volatile unsigned __int16*>(dest),
+             PrimitiveConversions::cast<unsigned __int16>(new_value));
+  }
+};
+
+template<>
+struct AtomicAccess::PlatformStore<4> {
+  template<typename T>
+  void operator()(T volatile* dest, T new_value) const {
+    STATIC_ASSERT(sizeof(T) == 4);
+    __stlr32(reinterpret_cast<volatile unsigned __int32*>(dest),
+             PrimitiveConversions::cast<unsigned __int32>(new_value));
+  }
+};
+
+template<>
+struct AtomicAccess::PlatformStore<8> {
+  template<typename T>
+  void operator()(T volatile* dest, T new_value) const {
+    STATIC_ASSERT(sizeof(T) == 8);
+    __stlr64(reinterpret_cast<volatile unsigned __int64*>(dest),
+             PrimitiveConversions::cast<unsigned __int64>(new_value));
+  }
+};
+
+// Specialize PlatformOrderedLoad and PlatformOrderedStore to use MSVC's
+// __ldar/__stlr intrinsics, matching the Linux AArch64 implementation which
+// uses __atomic_load/__atomic_store with __ATOMIC_ACQUIRE/__ATOMIC_RELEASE.
+// These emit single LDAR/STLR instructions that have acquire/release semantics
+// baked in, rather than the generic fallback of separate dmb + plain load/store.
+// On AArch64, LDAR/STLR provide stronger ordering guarantees than dmb + ldr/str
+// for cross-core visibility (Dekker patterns, etc.).
+
+template<>
+struct AtomicAccess::PlatformOrderedLoad<1, X_ACQUIRE> {
+  template<typename T>
+  T operator()(const volatile T* p) const {
+    STATIC_ASSERT(sizeof(T) == 1);
+    return PrimitiveConversions::cast<T>(
+        __ldar8(reinterpret_cast<volatile unsigned __int8*>(
+            const_cast<T volatile*>(p))));
+  }
+};
+
+template<>
+struct AtomicAccess::PlatformOrderedLoad<2, X_ACQUIRE> {
+  template<typename T>
+  T operator()(const volatile T* p) const {
+    STATIC_ASSERT(sizeof(T) == 2);
+    return PrimitiveConversions::cast<T>(
+        __ldar16(reinterpret_cast<volatile unsigned __int16*>(
+            const_cast<T volatile*>(p))));
+  }
+};
+
+template<>
+struct AtomicAccess::PlatformOrderedLoad<4, X_ACQUIRE> {
+  template<typename T>
+  T operator()(const volatile T* p) const {
+    STATIC_ASSERT(sizeof(T) == 4);
+    return PrimitiveConversions::cast<T>(
+        __ldar32(reinterpret_cast<volatile unsigned __int32*>(
+            const_cast<T volatile*>(p))));
+  }
+};
+
+template<>
+struct AtomicAccess::PlatformOrderedLoad<8, X_ACQUIRE> {
+  template<typename T>
+  T operator()(const volatile T* p) const {
+    STATIC_ASSERT(sizeof(T) == 8);
+    return PrimitiveConversions::cast<T>(
+        __ldar64(reinterpret_cast<volatile unsigned __int64*>(
+            const_cast<T volatile*>(p))));
+  }
+};
+
+template<>
+struct AtomicAccess::PlatformOrderedStore<1, RELEASE_X> {
+  template<typename T>
+  void operator()(volatile T* p, T v) const {
+    STATIC_ASSERT(sizeof(T) == 1);
+    __stlr8(reinterpret_cast<volatile unsigned __int8*>(p),
+            PrimitiveConversions::cast<unsigned __int8>(v));
+  }
+};
+
+template<>
+struct AtomicAccess::PlatformOrderedStore<2, RELEASE_X> {
+  template<typename T>
+  void operator()(volatile T* p, T v) const {
+    STATIC_ASSERT(sizeof(T) == 2);
+    __stlr16(reinterpret_cast<volatile unsigned __int16*>(p),
+             PrimitiveConversions::cast<unsigned __int16>(v));
+  }
+};
+
+template<>
+struct AtomicAccess::PlatformOrderedStore<4, RELEASE_X> {
+  template<typename T>
+  void operator()(volatile T* p, T v) const {
+    STATIC_ASSERT(sizeof(T) == 4);
+    __stlr32(reinterpret_cast<volatile unsigned __int32*>(p),
+             PrimitiveConversions::cast<unsigned __int32>(v));
+  }
+};
+
+template<>
+struct AtomicAccess::PlatformOrderedStore<8, RELEASE_X> {
+  template<typename T>
+  void operator()(volatile T* p, T v) const {
+    STATIC_ASSERT(sizeof(T) == 8);
+    __stlr64(reinterpret_cast<volatile unsigned __int64*>(p),
+             PrimitiveConversions::cast<unsigned __int64>(v));
+  }
+};
+
+// release_store + fence combination, matching Linux AArch64
+template<size_t byte_size>
+struct AtomicAccess::PlatformOrderedStore<byte_size, RELEASE_X_FENCE> {
+  template<typename T>
+  void operator()(volatile T* p, T v) const {
+    AtomicAccess::release_store(p, v);
+    OrderAccess::fence();
+  }
+};
+
 #endif // OS_CPU_WINDOWS_AARCH64_ATOMICACCESS_WINDOWS_AARCH64_HPP
diff --git a/src/hotspot/os_cpu/windows_aarch64/orderAccess_windows_aarch64.hpp b/src/hotspot/os_cpu/windows_aarch64/orderAccess_windows_aarch64.hpp
index 5385f3e6a1028..01711aff0e49f 100644
--- a/src/hotspot/os_cpu/windows_aarch64/orderAccess_windows_aarch64.hpp
+++ b/src/hotspot/os_cpu/windows_aarch64/orderAccess_windows_aarch64.hpp
@@ -26,22 +26,29 @@
 #define OS_CPU_WINDOWS_AARCH64_ORDERACCESS_WINDOWS_AARCH64_HPP
 
 // Included in orderAccess.hpp header file.
-#include <atomic>
-using std::atomic_thread_fence;
 #include <intrin.h>
 #include "vm_version_aarch64.hpp"
 #include "runtime/vm_version.hpp"
 
 // Implementation of class OrderAccess.
+//
+// Use the MSVC __dmb() intrinsic directly rather than C++ std::atomic_thread_fence().
+// Microsoft documents that __dmb() "inserts compiler blocks to prevent instruction
+// reordering" in addition to emitting the hardware DMB instruction. This is critical
+// because HotSpot uses volatile (non-std::atomic) fields throughout the runtime, and
+// std::atomic_thread_fence() is only defined by the C++ standard to order std::atomic
+// operations — it may not act as a compiler barrier for volatile/non-atomic accesses
+// on ARM64 with /volatile:iso. Using __dmb() ensures correct ordering for the Dekker
+// protocol in ObjectMonitor::exit() and similar patterns throughout HotSpot.
 
 inline void OrderAccess::loadload()   { acquire(); }
 inline void OrderAccess::storestore() { release(); }
 inline void OrderAccess::loadstore()  { acquire(); }
 inline void OrderAccess::storeload()  { fence(); }
 
-#define READ_MEM_BARRIER atomic_thread_fence(std::memory_order_acquire);
-#define WRITE_MEM_BARRIER atomic_thread_fence(std::memory_order_release);
-#define FULL_MEM_BARRIER atomic_thread_fence(std::memory_order_seq_cst);
+#define READ_MEM_BARRIER __dmb(_ARM64_BARRIER_ISHLD)
+#define WRITE_MEM_BARRIER __dmb(_ARM64_BARRIER_ISH)
+#define FULL_MEM_BARRIER __dmb(_ARM64_BARRIER_ISH)
 
 inline void OrderAccess::acquire() {
   READ_MEM_BARRIER;