Skip to content
Open
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions make/autoconf/flags-cflags.m4
Original file line number Diff line number Diff line change
Expand Up @@ -776,6 +776,14 @@ AC_DEFUN([FLAGS_SETUP_CFLAGS_CPU_DEP],
elif test "x$TOOLCHAIN_TYPE" = xmicrosoft; then
if test "x$FLAGS_CPU" = xx86; then
$1_CFLAGS_CPU_JVM="-arch:IA32"
elif test "x$FLAGS_CPU" = xaarch64; then
# MSVC defaults to /volatile:iso on ARM64, which makes volatile reads/writes
# plain LDR/STR with no acquire/release barriers. HotSpot's C++ runtime code
# was written assuming volatile provides acquire/release semantics (as on x86
# and GCC/Clang AArch64). Use /volatile:ms to restore those semantics and
# prevent memory ordering bugs in ObjectMonitor, ParkEvent, and other
# lock-free algorithms that use plain volatile fields.
$1_CFLAGS_CPU_JVM="/volatile:ms"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider using the same style for these flags as the rest of the make files: `-volatile:ms`.

elif test "x$OPENJDK_TARGET_CPU" = xx86_64; then
if test "x$DEBUG_LEVEL" != xrelease; then
# NOTE: This is probably redundant; -homeparams is default on
Expand Down
203 changes: 203 additions & 0 deletions src/hotspot/os_cpu/windows_aarch64/atomic_windows_aarch64.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,4 +108,207 @@ DEFINE_INTRINSIC_CMPXCHG(InterlockedCompareExchange64, __int64)

#undef DEFINE_INTRINSIC_CMPXCHG

// Override PlatformLoad and PlatformStore to use LDAR/STLR on Windows AArch64.
//
// Under MSVC /volatile:iso (the default for ARM64), the generic PlatformLoad
// and PlatformStore use plain volatile dereferences which generate plain LDR/STR
// instructions with NO acquire/release barriers. This is insufficient for ARM64's
// weak memory model in HotSpot's concurrent runtime code (ObjectMonitor Dekker
// protocols, ParkEvent signaling, etc.) where Atomic::load()/Atomic::store() are
// used in cross-thread communication patterns that depend on load/store ordering.
//
// On x86, this works by accident because x86-TSO provides store-release and
// load-acquire semantics for all memory accesses. On ARM64, we must explicitly
// use LDAR (acquire load) and STLR (release store) to get equivalent ordering.
//
// This matches the effective behavior of /volatile:ms but scoped only to
// Atomic:: operations rather than ALL volatile accesses, and ensures correct
// cross-core visibility for HotSpot's lock-free algorithms.

// Acquire loads. Each specialization strips const from the source pointer,
// reinterprets it as a raw unsigned integer of the matching width, performs a
// single LDAR through the MSVC intrinsic, and converts the raw bits back to T.
// NOTE(review): confirm the targeted toolchain actually exposes
// __ldar8/16/32/64 — Microsoft's documented acquire-load intrinsics for ARM64
// are named __load_acquire8/16/32/64; verify against the shipped <intrin.h>.

template<>
struct Atomic::PlatformLoad<1> {
  template<typename T>
  T operator()(T const volatile* dest) const {
    STATIC_ASSERT(sizeof(T) == 1);
    // Drop const, then view the location as a raw byte for the intrinsic.
    unsigned __int8 volatile* addr =
      reinterpret_cast<unsigned __int8 volatile*>(const_cast<T volatile*>(dest));
    unsigned __int8 bits = __ldar8(addr);  // single LDAR: load with acquire
    return PrimitiveConversions::cast<T>(bits);
  }
};

template<>
struct Atomic::PlatformLoad<2> {
  template<typename T>
  T operator()(T const volatile* dest) const {
    STATIC_ASSERT(sizeof(T) == 2);
    unsigned __int16 volatile* addr =
      reinterpret_cast<unsigned __int16 volatile*>(const_cast<T volatile*>(dest));
    unsigned __int16 bits = __ldar16(addr);  // LDAR, halfword
    return PrimitiveConversions::cast<T>(bits);
  }
};

template<>
struct Atomic::PlatformLoad<4> {
  template<typename T>
  T operator()(T const volatile* dest) const {
    STATIC_ASSERT(sizeof(T) == 4);
    unsigned __int32 volatile* addr =
      reinterpret_cast<unsigned __int32 volatile*>(const_cast<T volatile*>(dest));
    unsigned __int32 bits = __ldar32(addr);  // LDAR, word
    return PrimitiveConversions::cast<T>(bits);
  }
};

template<>
struct Atomic::PlatformLoad<8> {
  template<typename T>
  T operator()(T const volatile* dest) const {
    STATIC_ASSERT(sizeof(T) == 8);
    unsigned __int64 volatile* addr =
      reinterpret_cast<unsigned __int64 volatile*>(const_cast<T volatile*>(dest));
    unsigned __int64 bits = __ldar64(addr);  // LDAR, doubleword
    return PrimitiveConversions::cast<T>(bits);
  }
};

// Release stores. Each specialization reinterprets the destination as a raw
// unsigned integer pointer of matching width, converts the value to raw bits,
// and publishes it with a single STLR (store-release) via the MSVC intrinsic.

template<>
struct Atomic::PlatformStore<1> {
  template<typename T>
  void operator()(T volatile* dest, T new_value) const {
    STATIC_ASSERT(sizeof(T) == 1);
    unsigned __int8 volatile* addr =
      reinterpret_cast<unsigned __int8 volatile*>(dest);
    unsigned __int8 bits = PrimitiveConversions::cast<unsigned __int8>(new_value);
    __stlr8(addr, bits);  // single STLR: store with release
  }
};

template<>
struct Atomic::PlatformStore<2> {
  template<typename T>
  void operator()(T volatile* dest, T new_value) const {
    STATIC_ASSERT(sizeof(T) == 2);
    unsigned __int16 volatile* addr =
      reinterpret_cast<unsigned __int16 volatile*>(dest);
    unsigned __int16 bits = PrimitiveConversions::cast<unsigned __int16>(new_value);
    __stlr16(addr, bits);  // STLR, halfword
  }
};

template<>
struct Atomic::PlatformStore<4> {
  template<typename T>
  void operator()(T volatile* dest, T new_value) const {
    STATIC_ASSERT(sizeof(T) == 4);
    unsigned __int32 volatile* addr =
      reinterpret_cast<unsigned __int32 volatile*>(dest);
    unsigned __int32 bits = PrimitiveConversions::cast<unsigned __int32>(new_value);
    __stlr32(addr, bits);  // STLR, word
  }
};

template<>
struct Atomic::PlatformStore<8> {
  template<typename T>
  void operator()(T volatile* dest, T new_value) const {
    STATIC_ASSERT(sizeof(T) == 8);
    unsigned __int64 volatile* addr =
      reinterpret_cast<unsigned __int64 volatile*>(dest);
    unsigned __int64 bits = PrimitiveConversions::cast<unsigned __int64>(new_value);
    __stlr64(addr, bits);  // STLR, doubleword
  }
};

// Specialize PlatformOrderedLoad and PlatformOrderedStore to use MSVC's
// __ldar/__stlr intrinsics, matching the Linux AArch64 implementation which
// uses __atomic_load/__atomic_store with __ATOMIC_ACQUIRE/__ATOMIC_RELEASE.
// These emit single LDAR/STLR instructions that have acquire/release semantics
// baked in, rather than the generic fallback of separate dmb + plain load/store.
// On AArch64, LDAR/STLR attach the ordering to the access itself; additionally,
// a store-release followed in program order by a load-acquire is kept in order
// (RCsc), a guarantee that Dekker-style cross-core handoffs rely on and that
// dmb-based acquire/release sequences of plain ldr/str do not provide.

// Explicit acquire loads (OrderAccess::load_acquire). Same code shape as the
// PlatformLoad specializations above: strip const, reinterpret as a raw
// unsigned integer of the matching width, LDAR, convert back to T.

template<>
struct Atomic::PlatformOrderedLoad<1, X_ACQUIRE> {
  template <typename T>
  T operator()(const volatile T* p) const {
    STATIC_ASSERT(sizeof(T) == 1);
    unsigned __int8 volatile* addr =
      reinterpret_cast<unsigned __int8 volatile*>(const_cast<T volatile*>(p));
    unsigned __int8 bits = __ldar8(addr);  // LDAR: acquire semantics baked in
    return PrimitiveConversions::cast<T>(bits);
  }
};

template<>
struct Atomic::PlatformOrderedLoad<2, X_ACQUIRE> {
  template <typename T>
  T operator()(const volatile T* p) const {
    STATIC_ASSERT(sizeof(T) == 2);
    unsigned __int16 volatile* addr =
      reinterpret_cast<unsigned __int16 volatile*>(const_cast<T volatile*>(p));
    unsigned __int16 bits = __ldar16(addr);
    return PrimitiveConversions::cast<T>(bits);
  }
};

template<>
struct Atomic::PlatformOrderedLoad<4, X_ACQUIRE> {
  template <typename T>
  T operator()(const volatile T* p) const {
    STATIC_ASSERT(sizeof(T) == 4);
    unsigned __int32 volatile* addr =
      reinterpret_cast<unsigned __int32 volatile*>(const_cast<T volatile*>(p));
    unsigned __int32 bits = __ldar32(addr);
    return PrimitiveConversions::cast<T>(bits);
  }
};

template<>
struct Atomic::PlatformOrderedLoad<8, X_ACQUIRE> {
  template <typename T>
  T operator()(const volatile T* p) const {
    STATIC_ASSERT(sizeof(T) == 8);
    unsigned __int64 volatile* addr =
      reinterpret_cast<unsigned __int64 volatile*>(const_cast<T volatile*>(p));
    unsigned __int64 bits = __ldar64(addr);
    return PrimitiveConversions::cast<T>(bits);
  }
};

// Explicit release stores (OrderAccess::release_store). Same code shape as
// the PlatformStore specializations above: reinterpret the destination,
// convert the value to raw bits, publish with a single STLR.

template<>
struct Atomic::PlatformOrderedStore<1, RELEASE_X> {
  template <typename T>
  void operator()(volatile T* p, T v) const {
    STATIC_ASSERT(sizeof(T) == 1);
    unsigned __int8 volatile* addr =
      reinterpret_cast<unsigned __int8 volatile*>(p);
    unsigned __int8 bits = PrimitiveConversions::cast<unsigned __int8>(v);
    __stlr8(addr, bits);  // STLR: release semantics baked in
  }
};

template<>
struct Atomic::PlatformOrderedStore<2, RELEASE_X> {
  template <typename T>
  void operator()(volatile T* p, T v) const {
    STATIC_ASSERT(sizeof(T) == 2);
    unsigned __int16 volatile* addr =
      reinterpret_cast<unsigned __int16 volatile*>(p);
    unsigned __int16 bits = PrimitiveConversions::cast<unsigned __int16>(v);
    __stlr16(addr, bits);
  }
};

template<>
struct Atomic::PlatformOrderedStore<4, RELEASE_X> {
  template <typename T>
  void operator()(volatile T* p, T v) const {
    STATIC_ASSERT(sizeof(T) == 4);
    unsigned __int32 volatile* addr =
      reinterpret_cast<unsigned __int32 volatile*>(p);
    unsigned __int32 bits = PrimitiveConversions::cast<unsigned __int32>(v);
    __stlr32(addr, bits);
  }
};

template<>
struct Atomic::PlatformOrderedStore<8, RELEASE_X> {
  template <typename T>
  void operator()(volatile T* p, T v) const {
    STATIC_ASSERT(sizeof(T) == 8);
    unsigned __int64 volatile* addr =
      reinterpret_cast<unsigned __int64 volatile*>(p);
    unsigned __int64 bits = PrimitiveConversions::cast<unsigned __int64>(v);
    __stlr64(addr, bits);
  }
};

// RELEASE_X_FENCE: publish the value with a release store, then issue a full
// fence so that no later access can be hoisted above the store. This mirrors
// the Linux AArch64 implementation of release_store_fence.
template<size_t byte_size>
struct Atomic::PlatformOrderedStore<byte_size, RELEASE_X_FENCE> {
  template <typename T>
  void operator()(volatile T* p, T v) const {
    Atomic::release_store(p, v);  // release store first...
    OrderAccess::fence();         // ...then a full barrier
  }
};

#endif // OS_CPU_WINDOWS_AARCH64_ATOMIC_WINDOWS_AARCH64_HPP
17 changes: 12 additions & 5 deletions src/hotspot/os_cpu/windows_aarch64/orderAccess_windows_aarch64.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,22 +26,29 @@
#define OS_CPU_WINDOWS_AARCH64_ORDERACCESS_WINDOWS_AARCH64_HPP

// Included in orderAccess.hpp header file.
#include <atomic>
using std::atomic_thread_fence;
#include <arm64intr.h>
#include "vm_version_aarch64.hpp"
#include "runtime/vm_version.hpp"

// Implementation of class OrderAccess.
//
// Use the MSVC __dmb() intrinsic directly rather than C++ std::atomic_thread_fence().
// Microsoft documents that __dmb() "inserts compiler blocks to prevent instruction
// reordering" in addition to emitting the hardware DMB instruction. This is critical
// because HotSpot uses volatile (non-std::atomic) fields throughout the runtime, and
// std::atomic_thread_fence() is only defined by the C++ standard to order std::atomic
// operations — it may not act as a compiler barrier for volatile/non-atomic accesses
// on ARM64 with /volatile:iso. Using __dmb() ensures correct ordering for the Dekker
// protocol in ObjectMonitor::exit() and similar patterns throughout HotSpot.

inline void OrderAccess::loadload() { acquire(); }
inline void OrderAccess::storestore() { release(); }
inline void OrderAccess::loadstore() { acquire(); }
inline void OrderAccess::storeload() { fence(); }

#define READ_MEM_BARRIER atomic_thread_fence(std::memory_order_acquire);
#define WRITE_MEM_BARRIER atomic_thread_fence(std::memory_order_release);
#define FULL_MEM_BARRIER atomic_thread_fence(std::memory_order_seq_cst);
#define READ_MEM_BARRIER __dmb(_ARM64_BARRIER_ISHLD)
#define WRITE_MEM_BARRIER __dmb(_ARM64_BARRIER_ISH)
#define FULL_MEM_BARRIER __dmb(_ARM64_BARRIER_ISH)

inline void OrderAccess::acquire() {
READ_MEM_BARRIER;
Expand Down
8 changes: 8 additions & 0 deletions src/hotspot/share/c1/c1_GraphBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3560,6 +3560,14 @@ const char* GraphBuilder::check_can_parse(ciMethod* callee) const {
const char* GraphBuilder::should_not_inline(ciMethod* callee) const {
if ( compilation()->directive()->should_not_inline(callee)) return "disallowed by CompileCommand";
if ( callee->dont_inline()) return "don't inline by annotation";

// Don't inline a method that changes Thread.currentThread() except
// into another method that is annotated @ChangesCurrentThread.
if (callee->changes_current_thread()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given that we're fixing atomics handling on Windows AArch64, won't we need additional justification for why we're changing code that other platforms are using given that they haven't run into any of these concurrency issues?

Copy link
Contributor Author

@macarte macarte Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[CoPilot] This should not be guarded with an architecture check. Here's why:

The problem this solves is a compiler correctness issue, not a hardware barrier issue. If C1 inlines a @ChangesCurrentThread method (like Continuation.yield() or carrier-thread switching code) into a caller that isn't annotated @ChangesCurrentThread, the compiler can cache the result of Thread.currentCarrierThread() across the inlined call — observing the old carrier thread after the virtual thread has migrated to a new one.

This is broken on all architectures, not just ARM64:

  • On ARM64, it manifests more easily because the stale OopHandle dereference can also be reordered (which the MO_ACQUIRE fix in c1_LIRGenerator/library_call addresses separately).

  • On x86, it could still manifest if C1 keeps the currentCarrierThread result in a register across the inlined yield — TSO doesn't help because there's no memory access to reorder, the value is simply never re-read.

The check is also very low cost — it only runs during C1 inlining decisions, not at runtime. And @ChangesCurrentThread is used on a tiny number of methods, so this almost never triggers.

Bottom line: This is a correct-by-construction inlining guard that prevents a real compiler bug on all platforms. No architecture guard needed. The existing code is right as-is.

&& !compilation()->method()->changes_current_thread()) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For this specific change, for example, wouldn't it be a bug for an inlined method that changes threads to not read the correct state given that all its registers are correct and the load/acquire semantics have been fixed?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[CoPilot] Good question. The concern is specifically about compiler value caching, not memory ordering.

When C1 inlines a @ChangesCurrentThread method into a non-@ChangesCurrentThread caller, the compiler can CSE (common subexpression elimination) or hoist the currentCarrierThread intrinsic across the carrier-change boundary. The result: the caller reuses a register-held value from before the yield/mount/unmount and never re-reads it.

The MO_ACQUIRE fix addresses a different problem — it ensures that when you do load currentCarrierThread, subsequent dependent loads (e.g., Thread.cont) see consistent data. But it can't force a re-load that the compiler optimized away entirely.

Consider this concrete scenario after inlining:

// Caller (not @ChangesCurrentThread):
Thread carrier = Thread.currentCarrierThread();  // C1 intrinsic → LDAR, result in R0
// ... inlined @ChangesCurrentThread code ...
//   yield() freezes frame (saves R0=old_carrier)
//   VM mounts VT on new carrier
//   thaw restores frame (R0 still = old_carrier)
// ... back in caller ...
carrier.cont  // uses stale R0 — wrong carrier!

The MO_ACQUIRE on the LDAR is irrelevant here because there is no second LDAR — C1 reused R0. And the "correct registers" after thaw are correct in the sense that they're faithfully restored from freeze — but they contain the pre-yield values, which is exactly the stale data.

The inlining guard ensures @ChangesCurrentThread methods stay as real calls, which forces C1 to treat currentCarrierThread as potentially invalidated across the call boundary and re-read it.

So: MO_ACQUIRE = correct ordering on loads that happen. Inlining guard = ensuring loads actually happen. They're complementary.

return "method changes current thread";
}

return nullptr;
}

Expand Down
9 changes: 8 additions & 1 deletion src/hotspot/share/c1/c1_LIRGenerator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1381,7 +1381,14 @@ void LIRGenerator::do_JavaThreadField(Intrinsic* x, ByteSize offset) {
LIR_Opr temp = new_register(T_ADDRESS);
LIR_Opr reg = rlock_result(x);
__ move(new LIR_Address(getThreadPointer(), in_bytes(offset), T_ADDRESS), temp);
access_load(IN_NATIVE, T_OBJECT,
// OopHandle stores uncompressed oops in native memory.
// Use IN_NATIVE to ensure a raw 64-bit load without compressed oop handling.
// Use MO_ACQUIRE so that subsequent loads (e.g. Thread.cont used by
// Continuation.yield) cannot float above this load on weakly-ordered
// architectures such as AArch64. Without acquire semantics the hardware
// may reorder a later field load before the OopHandle dereference,
// observing a stale value after a virtual thread migrates between carriers.
access_load(IN_NATIVE | MO_ACQUIRE, T_OBJECT,
LIR_OprFact::address(new LIR_Address(temp, T_OBJECT)), reg);
}

Expand Down
3 changes: 2 additions & 1 deletion src/hotspot/share/gc/shared/c1/barrierSetC1.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ void BarrierSetC1::load_at_resolved(LIRAccess& access, LIR_Opr result) {
LIRGenerator *gen = access.gen();
DecoratorSet decorators = access.decorators();
bool is_volatile = (decorators & MO_SEQ_CST) != 0;
bool is_acquire = (decorators & MO_ACQUIRE) != 0;
bool is_atomic = is_volatile || AlwaysAtomicAccesses;
bool needs_patching = (decorators & C1_NEEDS_PATCHING) != 0;
bool mask_boolean = (decorators & C1_MASK_BOOLEAN) != 0;
Expand All @@ -187,7 +188,7 @@ void BarrierSetC1::load_at_resolved(LIRAccess& access, LIR_Opr result) {
__ load(access.resolved_addr()->as_address_ptr(), result, access.access_emit_info(), patch_code);
}

if (is_volatile) {
if (is_volatile || is_acquire) {
__ membar_acquire();
}

Expand Down
9 changes: 8 additions & 1 deletion src/hotspot/share/opto/library_call.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -980,7 +980,14 @@ Node* LibraryCallKit::current_thread_helper(Node*& tls_output, ByteSize handle_o
: make_load(nullptr, p, p->bottom_type()->is_ptr(), T_ADDRESS, MemNode::unordered));
thread_obj_handle = _gvn.transform(thread_obj_handle);

DecoratorSet decorators = IN_NATIVE;
// OopHandle stores uncompressed oops in native memory.
// Use IN_NATIVE to ensure proper handling without compressed oop decoding.
// Use MO_ACQUIRE so that subsequent loads (e.g. Thread.cont used by
// Continuation.yield) cannot float above this load on weakly-ordered
// architectures such as AArch64. Without acquire semantics the hardware
// may reorder a later field load before the OopHandle dereference,
// observing a stale value after a virtual thread migrates between carriers.
DecoratorSet decorators = IN_NATIVE | MO_ACQUIRE;
if (is_immutable) {
decorators |= C2_IMMUTABLE_MEMORY;
}
Expand Down
Loading
Loading