-
Notifications
You must be signed in to change notification settings - Fork 20
Windows AArch64 MSVC /volatile:iso memory ordering — HotSpot C++ runtime #48
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: macarte/baselinePR-TestTrampolineFix
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -114,4 +114,201 @@ DEFINE_INTRINSIC_CMPXCHG(InterlockedCompareExchange64, __int64) | |
|
|
||
| #undef DEFINE_INTRINSIC_CMPXCHG | ||
|
|
||
| // Override PlatformLoad and PlatformStore to use LDAR/STLR on Windows AArch64. | ||
| // | ||
| // The generic PlatformLoad and PlatformStore use plain volatile dereferences. | ||
| // With /volatile:ms (set in flags-cflags.m4 for AArch64), MSVC already compiles | ||
| // those to LDAR/STLR, so these overrides produce identical codegen. They are | ||
| // retained as defense-in-depth: they guarantee acquire/release semantics for | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @mo-beck, was the use of |
||
| // AtomicAccess::load()/AtomicAccess::store() regardless of the compiler flag setting, | ||
| // ensuring correct cross-core visibility for HotSpot's lock-free algorithms | ||
| // (ObjectMonitor Dekker protocols, ParkEvent signaling, etc.) even if | ||
| // /volatile:ms were ever removed or overridden. | ||
|
|
||
| template<> | ||
| struct AtomicAccess::PlatformLoad<1> { | ||
| template<typename T> | ||
| T operator()(T const volatile* dest) const { | ||
| STATIC_ASSERT(sizeof(T) == 1); | ||
| return PrimitiveConversions::cast<T>( | ||
| __ldar8(reinterpret_cast<unsigned __int8 volatile*>( | ||
| const_cast<T volatile*>(dest)))); | ||
| } | ||
| }; | ||
|
|
||
| template<> | ||
| struct AtomicAccess::PlatformLoad<2> { | ||
| template<typename T> | ||
| T operator()(T const volatile* dest) const { | ||
| STATIC_ASSERT(sizeof(T) == 2); | ||
| return PrimitiveConversions::cast<T>( | ||
| __ldar16(reinterpret_cast<unsigned __int16 volatile*>( | ||
| const_cast<T volatile*>(dest)))); | ||
| } | ||
| }; | ||
|
|
||
| template<> | ||
| struct AtomicAccess::PlatformLoad<4> { | ||
| template<typename T> | ||
| T operator()(T const volatile* dest) const { | ||
| STATIC_ASSERT(sizeof(T) == 4); | ||
| return PrimitiveConversions::cast<T>( | ||
| __ldar32(reinterpret_cast<unsigned __int32 volatile*>( | ||
| const_cast<T volatile*>(dest)))); | ||
| } | ||
| }; | ||
|
|
||
| template<> | ||
| struct AtomicAccess::PlatformLoad<8> { | ||
| template<typename T> | ||
| T operator()(T const volatile* dest) const { | ||
| STATIC_ASSERT(sizeof(T) == 8); | ||
| return PrimitiveConversions::cast<T>( | ||
| __ldar64(reinterpret_cast<unsigned __int64 volatile*>( | ||
| const_cast<T volatile*>(dest)))); | ||
| } | ||
| }; | ||
|
|
||
| template<> | ||
| struct AtomicAccess::PlatformStore<1> { | ||
| template<typename T> | ||
| void operator()(T volatile* dest, T new_value) const { | ||
| STATIC_ASSERT(sizeof(T) == 1); | ||
| __stlr8(reinterpret_cast<unsigned __int8 volatile*>(dest), | ||
| PrimitiveConversions::cast<unsigned __int8>(new_value)); | ||
| } | ||
| }; | ||
|
|
||
| template<> | ||
| struct AtomicAccess::PlatformStore<2> { | ||
| template<typename T> | ||
| void operator()(T volatile* dest, T new_value) const { | ||
| STATIC_ASSERT(sizeof(T) == 2); | ||
| __stlr16(reinterpret_cast<unsigned __int16 volatile*>(dest), | ||
| PrimitiveConversions::cast<unsigned __int16>(new_value)); | ||
| } | ||
| }; | ||
|
|
||
| template<> | ||
| struct AtomicAccess::PlatformStore<4> { | ||
| template<typename T> | ||
| void operator()(T volatile* dest, T new_value) const { | ||
| STATIC_ASSERT(sizeof(T) == 4); | ||
| __stlr32(reinterpret_cast<unsigned __int32 volatile*>(dest), | ||
| PrimitiveConversions::cast<unsigned __int32>(new_value)); | ||
| } | ||
| }; | ||
|
|
||
| template<> | ||
| struct AtomicAccess::PlatformStore<8> { | ||
| template<typename T> | ||
| void operator()(T volatile* dest, T new_value) const { | ||
| STATIC_ASSERT(sizeof(T) == 8); | ||
| __stlr64(reinterpret_cast<unsigned __int64 volatile*>(dest), | ||
| PrimitiveConversions::cast<unsigned __int64>(new_value)); | ||
| } | ||
| }; | ||
|
|
||
| // Specialize PlatformOrderedLoad and PlatformOrderedStore to use MSVC's | ||
| // __ldar/__stlr intrinsics, matching the Linux AArch64 implementation which | ||
| // uses __atomic_load/__atomic_store with __ATOMIC_ACQUIRE/__ATOMIC_RELEASE. | ||
| // These emit single LDAR/STLR instructions that have acquire/release semantics | ||
| // baked in, rather than the generic fallback of separate dmb + plain load/store. | ||
| // On AArch64, LDAR/STLR provide stronger ordering guarantees than dmb + ldr/str | ||
| // for cross-core visibility (Dekker patterns, etc.). | ||
|
|
||
| template<> | ||
| struct AtomicAccess::PlatformOrderedLoad<1, X_ACQUIRE> { | ||
| template <typename T> | ||
| T operator()(const volatile T* p) const { | ||
| STATIC_ASSERT(sizeof(T) == 1); | ||
| return PrimitiveConversions::cast<T>( | ||
| __ldar8(reinterpret_cast<unsigned __int8 volatile*>( | ||
| const_cast<T volatile*>(p)))); | ||
| } | ||
| }; | ||
|
|
||
| template<> | ||
| struct AtomicAccess::PlatformOrderedLoad<2, X_ACQUIRE> { | ||
| template <typename T> | ||
| T operator()(const volatile T* p) const { | ||
| STATIC_ASSERT(sizeof(T) == 2); | ||
| return PrimitiveConversions::cast<T>( | ||
| __ldar16(reinterpret_cast<unsigned __int16 volatile*>( | ||
| const_cast<T volatile*>(p)))); | ||
| } | ||
| }; | ||
|
|
||
| template<> | ||
| struct AtomicAccess::PlatformOrderedLoad<4, X_ACQUIRE> { | ||
| template <typename T> | ||
| T operator()(const volatile T* p) const { | ||
| STATIC_ASSERT(sizeof(T) == 4); | ||
| return PrimitiveConversions::cast<T>( | ||
| __ldar32(reinterpret_cast<unsigned __int32 volatile*>( | ||
| const_cast<T volatile*>(p)))); | ||
| } | ||
| }; | ||
|
|
||
| template<> | ||
| struct AtomicAccess::PlatformOrderedLoad<8, X_ACQUIRE> { | ||
| template <typename T> | ||
| T operator()(const volatile T* p) const { | ||
| STATIC_ASSERT(sizeof(T) == 8); | ||
| return PrimitiveConversions::cast<T>( | ||
| __ldar64(reinterpret_cast<unsigned __int64 volatile*>( | ||
| const_cast<T volatile*>(p)))); | ||
| } | ||
| }; | ||
|
|
||
| template<> | ||
| struct AtomicAccess::PlatformOrderedStore<1, RELEASE_X> { | ||
| template <typename T> | ||
| void operator()(volatile T* p, T v) const { | ||
| STATIC_ASSERT(sizeof(T) == 1); | ||
| __stlr8(reinterpret_cast<unsigned __int8 volatile*>(p), | ||
| PrimitiveConversions::cast<unsigned __int8>(v)); | ||
| } | ||
| }; | ||
|
|
||
| template<> | ||
| struct AtomicAccess::PlatformOrderedStore<2, RELEASE_X> { | ||
| template <typename T> | ||
| void operator()(volatile T* p, T v) const { | ||
| STATIC_ASSERT(sizeof(T) == 2); | ||
| __stlr16(reinterpret_cast<unsigned __int16 volatile*>(p), | ||
| PrimitiveConversions::cast<unsigned __int16>(v)); | ||
| } | ||
| }; | ||
|
|
||
| template<> | ||
| struct AtomicAccess::PlatformOrderedStore<4, RELEASE_X> { | ||
| template <typename T> | ||
| void operator()(volatile T* p, T v) const { | ||
| STATIC_ASSERT(sizeof(T) == 4); | ||
| __stlr32(reinterpret_cast<unsigned __int32 volatile*>(p), | ||
| PrimitiveConversions::cast<unsigned __int32>(v)); | ||
| } | ||
| }; | ||
|
|
||
| template<> | ||
| struct AtomicAccess::PlatformOrderedStore<8, RELEASE_X> { | ||
| template <typename T> | ||
| void operator()(volatile T* p, T v) const { | ||
| STATIC_ASSERT(sizeof(T) == 8); | ||
| __stlr64(reinterpret_cast<unsigned __int64 volatile*>(p), | ||
| PrimitiveConversions::cast<unsigned __int64>(v)); | ||
| } | ||
| }; | ||
|
|
||
| // release_store + fence combination, matching Linux AArch64 | ||
| template<size_t byte_size> | ||
| struct AtomicAccess::PlatformOrderedStore<byte_size, RELEASE_X_FENCE> { | ||
| template <typename T> | ||
| void operator()(volatile T* p, T v) const { | ||
| AtomicAccess::release_store(p, v); | ||
| OrderAccess::fence(); | ||
| } | ||
| }; | ||
|
|
||
| #endif // OS_CPU_WINDOWS_AARCH64_ATOMICACCESS_WINDOWS_AARCH64_HPP | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -26,22 +26,29 @@ | |
| #define OS_CPU_WINDOWS_AARCH64_ORDERACCESS_WINDOWS_AARCH64_HPP | ||
|
|
||
| // Included in orderAccess.hpp header file. | ||
| #include <atomic> | ||
| using std::atomic_thread_fence; | ||
| #include <arm64intr.h> | ||
| #include "vm_version_aarch64.hpp" | ||
| #include "runtime/vm_version.hpp" | ||
|
|
||
| // Implementation of class OrderAccess. | ||
| // | ||
| // Use the MSVC __dmb() intrinsic directly rather than C++ std::atomic_thread_fence(). | ||
| // Microsoft documents that __dmb() "inserts compiler blocks to prevent instruction | ||
| // reordering" in addition to emitting the hardware DMB instruction. This is critical | ||
| // because HotSpot uses volatile (non-std::atomic) fields throughout the runtime, and | ||
| // std::atomic_thread_fence() is only defined by the C++ standard to order std::atomic | ||
| // operations — it may not act as a compiler barrier for volatile/non-atomic accesses | ||
| // on ARM64 with /volatile:iso. Using __dmb() ensures correct ordering for the Dekker | ||
| // protocol in ObjectMonitor::exit() and similar patterns throughout HotSpot. | ||
|
|
||
| inline void OrderAccess::loadload() { acquire(); } | ||
| inline void OrderAccess::storestore() { release(); } | ||
| inline void OrderAccess::loadstore() { acquire(); } | ||
| inline void OrderAccess::storeload() { fence(); } | ||
|
|
||
| #define READ_MEM_BARRIER atomic_thread_fence(std::memory_order_acquire); | ||
| #define WRITE_MEM_BARRIER atomic_thread_fence(std::memory_order_release); | ||
| #define FULL_MEM_BARRIER atomic_thread_fence(std::memory_order_seq_cst); | ||
| #define READ_MEM_BARRIER __dmb(_ARM64_BARRIER_ISHLD) | ||
| #define WRITE_MEM_BARRIER __dmb(_ARM64_BARRIER_ISH) | ||
| #define FULL_MEM_BARRIER __dmb(_ARM64_BARRIER_ISH) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Shouldn't this be the _ARM64_BARRIER_SY barrier type?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I'm specifically referring to the FULL_MEM_BARRIER based on explanation at https://learn.microsoft.com/en-us/cpp/intrinsics/arm64-intrinsics?view=msvc-170#BarrierRestrictions
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'll check on this more (thanks for bringing it up) ... an initial analysis by Copilot:
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Saint, following your and Monica's input I ran a test on jdk25: microsoft/openjdk-jdk25u#27, where I reverted this change and also the AtomicAccess::PlatformLoad, AtomicAccess::PlatformStore pairs (kept the AtomicAccess::PlatformOrderedLoad, AtomicAccess::PlatformOrderedStore pairs). The winarm64 GHA tests are passing; I will make the same change here on tip.
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Revalidate that dmb is needed (it's a pause vs. sequencing) — this was targeting the same issue that /volatile:ms ended up solving |
||
|
|
||
| inline void OrderAccess::acquire() { | ||
| READ_MEM_BARRIER; | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this change necessary given the atomics changes in atomicAccess_windows_aarch64.hpp?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I ask because https://learn.microsoft.com/en-us/cpp/cpp/volatile-cpp?view=msvc-170 makes it sound like the compiler expects /volatile:iso to be used along with explicit synchronization primitives where the code needs it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So while the atomics changes address many issues, there are still too many weak-memory issues in the JVM C++ source itself that don't use atomics. So setting the flag addresses all compiled JVM code issues.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
To be clearer, the issues in the JVM are only a problem when those code paths are in virtual thread execution or support virtual threads