Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions make/autoconf/flags-cflags.m4
Original file line number Diff line number Diff line change
Expand Up @@ -776,6 +776,14 @@ AC_DEFUN([FLAGS_SETUP_CFLAGS_CPU_DEP],
elif test "x$TOOLCHAIN_TYPE" = xmicrosoft; then
if test "x$FLAGS_CPU" = xx86; then
$1_CFLAGS_CPU_JVM="-arch:IA32"
elif test "x$FLAGS_CPU" = xaarch64; then
# MSVC defaults to /volatile:iso on ARM64, which makes volatile reads/writes
# plain LDR/STR with no acquire/release barriers. HotSpot's C++ runtime code
# was written assuming volatile provides acquire/release semantics (as on x86
# and GCC/Clang AArch64). Use /volatile:ms to restore those semantics and
# prevent memory ordering bugs in ObjectMonitor, ParkEvent, and other
# lock-free algorithms that use plain volatile fields.
$1_CFLAGS_CPU_JVM="/volatile:ms"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider using the same style for these flags as the rest of the make files, i.e. `-volatile:ms` rather than `/volatile:ms`.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

agreed

elif test "x$OPENJDK_TARGET_CPU" = xx86_64; then
if test "x$DEBUG_LEVEL" != xrelease; then
# NOTE: This is probably redundant; -homeparams is default on
Expand Down
102 changes: 102 additions & 0 deletions src/hotspot/os_cpu/windows_aarch64/atomic_windows_aarch64.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,4 +108,106 @@ DEFINE_INTRINSIC_CMPXCHG(InterlockedCompareExchange64, __int64)

#undef DEFINE_INTRINSIC_CMPXCHG

// Specialize PlatformOrderedLoad and PlatformOrderedStore to use MSVC's
// __ldar/__stlr intrinsics, matching the Linux AArch64 implementation which
// uses __atomic_load/__atomic_store with __ATOMIC_ACQUIRE/__ATOMIC_RELEASE.
// These emit single LDAR/STLR instructions that have acquire/release semantics
// baked in, rather than the generic fallback of separate dmb + plain load/store.
// On AArch64, LDAR/STLR provide stronger ordering guarantees than dmb + ldr/str
// for cross-core visibility (Dekker patterns, etc.).

// Acquiring loads, one specialization per access size. Each strips const,
// reinterprets the pointer as the intrinsic's operand type, performs the
// load-acquire, and casts the loaded bits back to T without changing their
// representation (PrimitiveConversions::cast is a bit-preserving cast).
// NOTE(review): confirm the __ldar8/16/32/64 intrinsics are available in the
// minimum MSVC version this port supports — TODO verify against the MSVC
// ARM64 intrinsics reference.

// 1-byte load-acquire (LDARB).
template<>
struct Atomic::PlatformOrderedLoad<1, X_ACQUIRE> {
  template <typename T>
  T operator()(const volatile T* p) const {
    STATIC_ASSERT(sizeof(T) == 1);
    return PrimitiveConversions::cast<T>(
      __ldar8(reinterpret_cast<unsigned __int8 volatile*>(
        const_cast<T volatile*>(p))));
  }
};

// 2-byte load-acquire (LDARH).
template<>
struct Atomic::PlatformOrderedLoad<2, X_ACQUIRE> {
  template <typename T>
  T operator()(const volatile T* p) const {
    STATIC_ASSERT(sizeof(T) == 2);
    return PrimitiveConversions::cast<T>(
      __ldar16(reinterpret_cast<unsigned __int16 volatile*>(
        const_cast<T volatile*>(p))));
  }
};

// 4-byte load-acquire (LDAR, W register).
template<>
struct Atomic::PlatformOrderedLoad<4, X_ACQUIRE> {
  template <typename T>
  T operator()(const volatile T* p) const {
    STATIC_ASSERT(sizeof(T) == 4);
    return PrimitiveConversions::cast<T>(
      __ldar32(reinterpret_cast<unsigned __int32 volatile*>(
        const_cast<T volatile*>(p))));
  }
};

// 8-byte load-acquire (LDAR, X register).
template<>
struct Atomic::PlatformOrderedLoad<8, X_ACQUIRE> {
  template <typename T>
  T operator()(const volatile T* p) const {
    STATIC_ASSERT(sizeof(T) == 8);
    return PrimitiveConversions::cast<T>(
      __ldar64(reinterpret_cast<unsigned __int64 volatile*>(
        const_cast<T volatile*>(p))));
  }
};

// Releasing stores, one specialization per access size. The value is
// bit-cast to the intrinsic's operand type and stored with release
// semantics; prior memory accesses cannot be reordered after the store.
// NOTE(review): as with the loads, confirm the __stlr8/16/32/64 intrinsics
// exist in the minimum supported MSVC version — TODO verify.

// 1-byte store-release (STLRB).
template<>
struct Atomic::PlatformOrderedStore<1, RELEASE_X> {
  template <typename T>
  void operator()(volatile T* p, T v) const {
    STATIC_ASSERT(sizeof(T) == 1);
    __stlr8(reinterpret_cast<unsigned __int8 volatile*>(p),
            PrimitiveConversions::cast<unsigned __int8>(v));
  }
};

// 2-byte store-release (STLRH).
template<>
struct Atomic::PlatformOrderedStore<2, RELEASE_X> {
  template <typename T>
  void operator()(volatile T* p, T v) const {
    STATIC_ASSERT(sizeof(T) == 2);
    __stlr16(reinterpret_cast<unsigned __int16 volatile*>(p),
             PrimitiveConversions::cast<unsigned __int16>(v));
  }
};

// 4-byte store-release (STLR, W register).
template<>
struct Atomic::PlatformOrderedStore<4, RELEASE_X> {
  template <typename T>
  void operator()(volatile T* p, T v) const {
    STATIC_ASSERT(sizeof(T) == 4);
    __stlr32(reinterpret_cast<unsigned __int32 volatile*>(p),
             PrimitiveConversions::cast<unsigned __int32>(v));
  }
};

// 8-byte store-release (STLR, X register).
template<>
struct Atomic::PlatformOrderedStore<8, RELEASE_X> {
  template <typename T>
  void operator()(volatile T* p, T v) const {
    STATIC_ASSERT(sizeof(T) == 8);
    __stlr64(reinterpret_cast<unsigned __int64 volatile*>(p),
             PrimitiveConversions::cast<unsigned __int64>(v));
  }
};

// release_store + fence combination, matching Linux AArch64.
// Delegates to the size-specialized release store above, then issues a
// full fence (StoreLoad included) so the store is globally visible before
// any subsequent load — the ordering required by release_store_fence().
template<size_t byte_size>
struct Atomic::PlatformOrderedStore<byte_size, RELEASE_X_FENCE> {
  template <typename T>
  void operator()(volatile T* p, T v) const {
    Atomic::release_store(p, v);
    OrderAccess::fence();
  }
};

#endif // OS_CPU_WINDOWS_AARCH64_ATOMIC_WINDOWS_AARCH64_HPP
8 changes: 8 additions & 0 deletions src/hotspot/share/c1/c1_GraphBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3560,6 +3560,14 @@ const char* GraphBuilder::check_can_parse(ciMethod* callee) const {
const char* GraphBuilder::should_not_inline(ciMethod* callee) const {
if ( compilation()->directive()->should_not_inline(callee)) return "disallowed by CompileCommand";
if ( callee->dont_inline()) return "don't inline by annotation";

// Don't inline a method that changes Thread.currentThread() except
// into another method that is annotated @ChangesCurrentThread.
if (callee->changes_current_thread()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given that we're fixing atomics handling on Windows AArch64, won't we need additional justification for why we're changing code that other platforms are using given that they haven't run into any of these concurrency issues?

Copy link
Contributor Author

@macarte macarte Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The problem this solves is a compiler correctness issue, not a hardware barrier issue. If C1 inlines a @ChangesCurrentThread method (like Continuation.yield() or carrier-thread switching code) into a caller that isn't annotated @ChangesCurrentThread, the compiler can cache the result of Thread.currentCarrierThread() across the inlined call — observing the old carrier thread after the virtual thread has migrated to a new one.

This is broken on all architectures, not just ARM64:

  • On ARM64, it manifests more easily because the stale OopHandle dereference can also be reordered (which the MO_ACQUIRE fix in c1_LIRGenerator/library_call addresses separately).

  • On x86, it could still manifest if C1 keeps the currentCarrierThread result in a register across the inlined yield — TSO doesn't help because there's no memory access to reorder, the value is simply never re-read.

The check is also very low cost — it only runs during C1 inlining decisions, not at runtime. And @ChangesCurrentThread is used on a tiny number of methods, so this almost never triggers.

Bottom line: This is a correct-by-construction inlining guard that prevents a real compiler bug on all platforms. No architecture guard needed.

&& !compilation()->method()->changes_current_thread()) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For this specific change, for example, wouldn't it be a bug for an inlined method that changes threads to not read the correct state given that all its registers are correct and the load/acquire semantics have been fixed?

Copy link
Contributor Author

@macarte macarte Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When C1 inlines a @ChangesCurrentThread method into a non-@ChangesCurrentThread caller, the compiler can CSE (common subexpression elimination) or hoist the currentCarrierThread intrinsic across the carrier-change boundary. The result: the caller reuses a register-held value from before the yield/mount/unmount and never re-reads it.

The MO_ACQUIRE fix addresses a different problem — it ensures that when you do load currentCarrierThread, subsequent dependent loads (e.g., Thread.cont) see consistent data. But it can't force a re-load that the compiler optimized away entirely.

Consider this concrete scenario after inlining:

// Caller (not @ChangesCurrentThread):
Thread carrier = Thread.currentCarrierThread();  // C1 intrinsic → LDAR, result in R0
// ... inlined @ChangesCurrentThread code ...
//   yield() freezes frame (saves R0=old_carrier)
//   VM mounts VT on new carrier
//   thaw restores frame (R0 still = old_carrier)
// ... back in caller ...
carrier.cont  // uses stale R0 — wrong carrier!

The MO_ACQUIRE on the LDAR is irrelevant here because there is no second LDAR — C1 reused R0. And the "correct registers" after thaw are correct in the sense that they're faithfully restored from freeze — but they contain the pre-yield values, which is exactly the stale data.

The inlining guard ensures @ChangesCurrentThread methods stay as real calls, which forces C1 to treat currentCarrierThread as potentially invalidated across the call boundary and re-read it.

So: MO_ACQUIRE = correct ordering on loads that happen. Inlining guard = ensuring loads actually happen. They're complementary.

return "method changes current thread";
}

return nullptr;
}

Expand Down
9 changes: 8 additions & 1 deletion src/hotspot/share/c1/c1_LIRGenerator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1381,7 +1381,14 @@ void LIRGenerator::do_JavaThreadField(Intrinsic* x, ByteSize offset) {
LIR_Opr temp = new_register(T_ADDRESS);
LIR_Opr reg = rlock_result(x);
__ move(new LIR_Address(getThreadPointer(), in_bytes(offset), T_ADDRESS), temp);
access_load(IN_NATIVE, T_OBJECT,
// OopHandle stores uncompressed oops in native memory.
// Use IN_NATIVE to ensure a raw 64-bit load without compressed oop handling.
// Use MO_ACQUIRE so that subsequent loads (e.g. Thread.cont used by
// Continuation.yield) cannot float above this load on weakly-ordered
// architectures such as AArch64. Without acquire semantics the hardware
// may reorder a later field load before the OopHandle dereference,
// observing a stale value after a virtual thread migrates between carriers.
access_load(IN_NATIVE | MO_ACQUIRE, T_OBJECT,
LIR_OprFact::address(new LIR_Address(temp, T_OBJECT)), reg);
}

Expand Down
3 changes: 2 additions & 1 deletion src/hotspot/share/gc/shared/c1/barrierSetC1.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ void BarrierSetC1::load_at_resolved(LIRAccess& access, LIR_Opr result) {
LIRGenerator *gen = access.gen();
DecoratorSet decorators = access.decorators();
bool is_volatile = (decorators & MO_SEQ_CST) != 0;
bool is_acquire = (decorators & MO_ACQUIRE) != 0;
bool is_atomic = is_volatile || AlwaysAtomicAccesses;
bool needs_patching = (decorators & C1_NEEDS_PATCHING) != 0;
bool mask_boolean = (decorators & C1_MASK_BOOLEAN) != 0;
Expand All @@ -187,7 +188,7 @@ void BarrierSetC1::load_at_resolved(LIRAccess& access, LIR_Opr result) {
__ load(access.resolved_addr()->as_address_ptr(), result, access.access_emit_info(), patch_code);
}

if (is_volatile) {
if (is_volatile || is_acquire) {
__ membar_acquire();
}

Expand Down
9 changes: 8 additions & 1 deletion src/hotspot/share/opto/library_call.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -980,7 +980,14 @@ Node* LibraryCallKit::current_thread_helper(Node*& tls_output, ByteSize handle_o
: make_load(nullptr, p, p->bottom_type()->is_ptr(), T_ADDRESS, MemNode::unordered));
thread_obj_handle = _gvn.transform(thread_obj_handle);

DecoratorSet decorators = IN_NATIVE;
// OopHandle stores uncompressed oops in native memory.
// Use IN_NATIVE to ensure proper handling without compressed oop decoding.
// Use MO_ACQUIRE so that subsequent loads (e.g. Thread.cont used by
// Continuation.yield) cannot float above this load on weakly-ordered
// architectures such as AArch64. Without acquire semantics the hardware
// may reorder a later field load before the OopHandle dereference,
// observing a stale value after a virtual thread migrates between carriers.
DecoratorSet decorators = IN_NATIVE | MO_ACQUIRE;
if (is_immutable) {
decorators |= C2_IMMUTABLE_MEMORY;
}
Expand Down
112 changes: 112 additions & 0 deletions src/hotspot/share/runtime/objectMonitor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
*
*/

#include "classfile/symbolTable.hpp"
#include "classfile/vmSymbols.hpp"
#include "gc/shared/oopStorage.hpp"
#include "gc/shared/oopStorageSet.hpp"
Expand All @@ -42,6 +43,7 @@
#include "runtime/globals.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/javaCalls.hpp"
#include "runtime/javaThread.inline.hpp"
#include "runtime/lightweightSynchronizer.hpp"
#include "runtime/mutexLocker.hpp"
Expand Down Expand Up @@ -523,6 +525,90 @@ void ObjectMonitor::notify_contended_enter(JavaThread* current) {
}
}

// Compensate the ForkJoinPool scheduler when a pinned virtual thread's
// carrier is about to block on a contended monitor. Without compensation
// the pool has no visibility that its worker is blocked, which can lead to
// carrier starvation / deadlock (JDK-8345294).
//
// This calls CarrierThread.beginMonitorBlock() which uses
// ForkJoinPool.tryCompensateForMonitor() to (a) activate an idle worker or
// create a replacement spare carrier, and (b) decrement RC so that signalWork
// can find a carrier for any queued continuation (e.g. the VT holding the
// contested lock) that would otherwise be starved because blocked carriers
// still count as active in the pool's RC field. The RC decrement is restored
// by endCompensatedBlock when this carrier unblocks.
//
// IMPORTANT: At this point notify_contended_enter() has already set
// _current_pending_monitor. The Java call below may itself trigger
// nested monitor contention (e.g., ThreadGroup.addUnstarted acquires a
// monitor when creating a spare worker thread). To avoid:
//   (a) assertion failures (pending_monitor != nullptr on nested entry)
//   (b) corruption of _current_pending_monitor for the outer monitor
// we save and clear the pending monitor before the Java call and restore
// it afterwards.
//
// Returns true if the pool was compensated (the caller must then call
// end_compensate_pinned_carrier() once the carrier unblocks); false if
// compensation was not performed or the Java call failed.
static bool compensate_pinned_carrier(JavaThread* current) {
  oop carrier = current->threadObj();
  if (carrier == nullptr) return false;

  // Save and clear pending monitor so nested monitor enters work cleanly.
  ObjectMonitor* saved_pending = current->current_pending_monitor();
  current->set_current_pending_monitor(nullptr);

  HandleMark hm(current);
  Handle carrier_h(current, carrier);
  Klass* klass = carrier_h->klass();

  // CarrierThread.beginMonitorBlock() — compensates the ForkJoinPool via
  // tryCompensateForMonitor which always activates/creates a real spare
  // carrier (branch 2 of tryCompensate is omitted for this path).
  // Not every carrier thread is a CarrierThread (custom schedulers may
  // use plain threads). NOTE(review): there is no explicit method lookup
  // here — the "method not found" case is handled only via the
  // pending-exception path below (call_virtual raising NoSuchMethodError,
  // which we clear and treat as "not compensated"). Confirm call_virtual
  // reports a missing method as a Java exception rather than asserting,
  // especially in debug builds — TODO verify.
  TempNewSymbol begin_name = SymbolTable::new_symbol("beginMonitorBlock");
  TempNewSymbol begin_sig = SymbolTable::new_symbol("()Z");

  JavaValue result(T_BOOLEAN);
  JavaCalls::call_virtual(&result, carrier_h, klass,
                          begin_name, begin_sig, current);

  bool compensated = false;
  if (current->has_pending_exception()) {
    // Any failure (including a missing method) means no compensation took
    // place; swallow the exception so monitor enter proceeds normally.
    current->clear_pending_exception();
  } else {
    compensated = result.get_jboolean();
  }

  // Restore pending monitor for the outer monitor enter.
  current->set_current_pending_monitor(saved_pending);
  return compensated;
}

// Undo the ForkJoinPool compensation performed by compensate_pinned_carrier()
// once the carrier has unblocked, by calling the carrier's endBlocking()
// method. Mirrors the save/clear/restore of _current_pending_monitor done in
// compensate_pinned_carrier() because the Java call may itself contend on a
// monitor. Any exception from the call is swallowed: failing to restore the
// pool's RC count is preferable to propagating an exception out of monitor
// exit paths.
// NOTE(review): the sibling's header comment says the RC decrement "is
// restored by endCompensatedBlock", but this calls "endBlocking" — confirm
// which Java-side method name is correct.
static void end_compensate_pinned_carrier(JavaThread* current) {
  oop carrier = current->threadObj();
  if (carrier == nullptr) return;

  // Save and clear pending monitor (should be nullptr here, but be safe).
  ObjectMonitor* saved_pending = current->current_pending_monitor();
  current->set_current_pending_monitor(nullptr);

  HandleMark hm(current);
  Handle carrier_h(current, carrier);
  Klass* klass = carrier_h->klass();

  TempNewSymbol end_name = SymbolTable::new_symbol("endBlocking");
  TempNewSymbol end_sig = vmSymbols::void_method_signature();

  JavaValue result(T_VOID);
  JavaCalls::call_virtual(&result, carrier_h, klass,
                          end_name, end_sig, current);

  if (current->has_pending_exception()) {
    current->clear_pending_exception();
  }

  current->set_current_pending_monitor(saved_pending);
}

void ObjectMonitor::enter_with_contention_mark(JavaThread* current, ObjectMonitorContentionMark &cm) {
assert(current == JavaThread::current(), "must be");
assert(!has_owner(current), "must be");
Expand Down Expand Up @@ -570,6 +656,15 @@ void ObjectMonitor::enter_with_contention_mark(JavaThread* current, ObjectMonito
}
}

// Compensate the ForkJoinPool before the carrier blocks so that the
// scheduler can spin up a replacement worker thread. This prevents
// deadlocks where every carrier is pinned and waiting for a monitor
// held by an unmounted virtual thread that can never get a carrier.
bool compensated = false;
if (is_virtual) {
compensated = compensate_pinned_carrier(current);
}

{
// Change java thread status to indicate blocked on monitor enter.
JavaThreadBlockedOnMonitorEnterState jtbmes(current, this);
Expand Down Expand Up @@ -606,6 +701,11 @@ void ObjectMonitor::enter_with_contention_mark(JavaThread* current, ObjectMonito
// the monitor free and clear.
}

// End compensation now that the carrier is no longer blocked.
if (compensated) {
end_compensate_pinned_carrier(current);
}

assert(contentions() >= 0, "must not be negative: contentions=%d", contentions());

// Must either set _recursions = 0 or ASSERT _recursions == 0.
Expand Down Expand Up @@ -2346,6 +2446,16 @@ bool ObjectMonitor::try_spin(JavaThread* current) {
if (!has_successor()) {
set_successor(current);
}
// Dekker/Lamport duality with exit(): ST _succ; MEMBAR; LD _owner.
// The exiter's side (release_clear_owner + storeload) is in exit().
// Here on the spinner's side, we need a StoreLoad barrier between
// setting _succ and reading _owner to prevent the CPU from reordering
// the _owner load before the _succ store. On ARM64 with MSVC
// /volatile:iso, Atomic::store/load are plain STR/LDR with no
// barrier, so without this fence the Dekker protocol is broken and
// the exiter may not see our successor designation while we may not
// see its lock release — leading to missed wakeups and starvation.
OrderAccess::storeload();
int64_t prv = NO_OWNER;

// There are three ways to exit the following loop:
Expand Down Expand Up @@ -2421,6 +2531,8 @@ bool ObjectMonitor::try_spin(JavaThread* current) {

if (!has_successor()) {
set_successor(current);
// See Dekker/storeload comment before the loop.
OrderAccess::storeload();
}
}

Expand Down
Loading
Loading