diff --git a/make/autoconf/flags-cflags.m4 b/make/autoconf/flags-cflags.m4 index 3a61840e8c9..9b8f9cdb505 100644 --- a/make/autoconf/flags-cflags.m4 +++ b/make/autoconf/flags-cflags.m4 @@ -776,6 +776,14 @@ AC_DEFUN([FLAGS_SETUP_CFLAGS_CPU_DEP], elif test "x$TOOLCHAIN_TYPE" = xmicrosoft; then if test "x$FLAGS_CPU" = xx86; then $1_CFLAGS_CPU_JVM="-arch:IA32" + elif test "x$FLAGS_CPU" = xaarch64; then + # MSVC defaults to /volatile:iso on ARM64, which makes volatile reads/writes + # plain LDR/STR with no acquire/release barriers. HotSpot's C++ runtime code + # was written assuming volatile provides acquire/release semantics (as on x86 + # and GCC/Clang AArch64). Use /volatile:ms to restore those semantics and + # prevent memory ordering bugs in ObjectMonitor, ParkEvent, and other + # lock-free algorithms that use plain volatile fields. + $1_CFLAGS_CPU_JVM="/volatile:ms" elif test "x$OPENJDK_TARGET_CPU" = xx86_64; then if test "x$DEBUG_LEVEL" != xrelease; then # NOTE: This is probably redundant; -homeparams is default on diff --git a/src/hotspot/os_cpu/windows_aarch64/atomic_windows_aarch64.hpp b/src/hotspot/os_cpu/windows_aarch64/atomic_windows_aarch64.hpp index 90fc8ecfba4..4f88debe78a 100644 --- a/src/hotspot/os_cpu/windows_aarch64/atomic_windows_aarch64.hpp +++ b/src/hotspot/os_cpu/windows_aarch64/atomic_windows_aarch64.hpp @@ -108,4 +108,201 @@ DEFINE_INTRINSIC_CMPXCHG(InterlockedCompareExchange64, __int64) #undef DEFINE_INTRINSIC_CMPXCHG +// Override PlatformLoad and PlatformStore to use LDAR/STLR on Windows AArch64. +// +// The generic PlatformLoad and PlatformStore use plain volatile dereferences. +// With /volatile:ms (set in flags-cflags.m4 for AArch64), MSVC already compiles +// those to LDAR/STLR, so these overrides produce identical codegen. 
They are +// retained as defense-in-depth: they guarantee acquire/release semantics for +// Atomic::load()/Atomic::store() regardless of the compiler flag setting, +// ensuring correct cross-core visibility for HotSpot's lock-free algorithms +// (ObjectMonitor Dekker protocols, ParkEvent signaling, etc.) even if +// /volatile:ms were ever removed or overridden. + +template<> +struct Atomic::PlatformLoad<1> { + template + T operator()(T const volatile* dest) const { + STATIC_ASSERT(sizeof(T) == 1); + return PrimitiveConversions::cast( + __ldar8(reinterpret_cast( + const_cast(dest)))); + } +}; + +template<> +struct Atomic::PlatformLoad<2> { + template + T operator()(T const volatile* dest) const { + STATIC_ASSERT(sizeof(T) == 2); + return PrimitiveConversions::cast( + __ldar16(reinterpret_cast( + const_cast(dest)))); + } +}; + +template<> +struct Atomic::PlatformLoad<4> { + template + T operator()(T const volatile* dest) const { + STATIC_ASSERT(sizeof(T) == 4); + return PrimitiveConversions::cast( + __ldar32(reinterpret_cast( + const_cast(dest)))); + } +}; + +template<> +struct Atomic::PlatformLoad<8> { + template + T operator()(T const volatile* dest) const { + STATIC_ASSERT(sizeof(T) == 8); + return PrimitiveConversions::cast( + __ldar64(reinterpret_cast( + const_cast(dest)))); + } +}; + +template<> +struct Atomic::PlatformStore<1> { + template + void operator()(T volatile* dest, T new_value) const { + STATIC_ASSERT(sizeof(T) == 1); + __stlr8(reinterpret_cast(dest), + PrimitiveConversions::cast(new_value)); + } +}; + +template<> +struct Atomic::PlatformStore<2> { + template + void operator()(T volatile* dest, T new_value) const { + STATIC_ASSERT(sizeof(T) == 2); + __stlr16(reinterpret_cast(dest), + PrimitiveConversions::cast(new_value)); + } +}; + +template<> +struct Atomic::PlatformStore<4> { + template + void operator()(T volatile* dest, T new_value) const { + STATIC_ASSERT(sizeof(T) == 4); + __stlr32(reinterpret_cast(dest), + 
PrimitiveConversions::cast(new_value)); + } +}; + +template<> +struct Atomic::PlatformStore<8> { + template + void operator()(T volatile* dest, T new_value) const { + STATIC_ASSERT(sizeof(T) == 8); + __stlr64(reinterpret_cast(dest), + PrimitiveConversions::cast(new_value)); + } +}; + +// Specialize PlatformOrderedLoad and PlatformOrderedStore to use MSVC's +// __ldar/__stlr intrinsics, matching the Linux AArch64 implementation which +// uses __atomic_load/__atomic_store with __ATOMIC_ACQUIRE/__ATOMIC_RELEASE. +// These emit single LDAR/STLR instructions that have acquire/release semantics +// baked in, rather than the generic fallback of separate dmb + plain load/store. +// On AArch64, LDAR/STLR provide stronger ordering guarantees than dmb + ldr/str +// for cross-core visibility (Dekker patterns, etc.). + +template<> +struct Atomic::PlatformOrderedLoad<1, X_ACQUIRE> { + template + T operator()(const volatile T* p) const { + STATIC_ASSERT(sizeof(T) == 1); + return PrimitiveConversions::cast( + __ldar8(reinterpret_cast( + const_cast(p)))); + } +}; + +template<> +struct Atomic::PlatformOrderedLoad<2, X_ACQUIRE> { + template + T operator()(const volatile T* p) const { + STATIC_ASSERT(sizeof(T) == 2); + return PrimitiveConversions::cast( + __ldar16(reinterpret_cast( + const_cast(p)))); + } +}; + +template<> +struct Atomic::PlatformOrderedLoad<4, X_ACQUIRE> { + template + T operator()(const volatile T* p) const { + STATIC_ASSERT(sizeof(T) == 4); + return PrimitiveConversions::cast( + __ldar32(reinterpret_cast( + const_cast(p)))); + } +}; + +template<> +struct Atomic::PlatformOrderedLoad<8, X_ACQUIRE> { + template + T operator()(const volatile T* p) const { + STATIC_ASSERT(sizeof(T) == 8); + return PrimitiveConversions::cast( + __ldar64(reinterpret_cast( + const_cast(p)))); + } +}; + +template<> +struct Atomic::PlatformOrderedStore<1, RELEASE_X> { + template + void operator()(volatile T* p, T v) const { + STATIC_ASSERT(sizeof(T) == 1); + __stlr8(reinterpret_cast(p), + 
PrimitiveConversions::cast(v)); + } +}; + +template<> +struct Atomic::PlatformOrderedStore<2, RELEASE_X> { + template + void operator()(volatile T* p, T v) const { + STATIC_ASSERT(sizeof(T) == 2); + __stlr16(reinterpret_cast(p), + PrimitiveConversions::cast(v)); + } +}; + +template<> +struct Atomic::PlatformOrderedStore<4, RELEASE_X> { + template + void operator()(volatile T* p, T v) const { + STATIC_ASSERT(sizeof(T) == 4); + __stlr32(reinterpret_cast(p), + PrimitiveConversions::cast(v)); + } +}; + +template<> +struct Atomic::PlatformOrderedStore<8, RELEASE_X> { + template + void operator()(volatile T* p, T v) const { + STATIC_ASSERT(sizeof(T) == 8); + __stlr64(reinterpret_cast(p), + PrimitiveConversions::cast(v)); + } +}; + +// release_store + fence combination, matching Linux AArch64 +template +struct Atomic::PlatformOrderedStore { + template + void operator()(volatile T* p, T v) const { + Atomic::release_store(p, v); + OrderAccess::fence(); + } +}; + #endif // OS_CPU_WINDOWS_AARCH64_ATOMIC_WINDOWS_AARCH64_HPP diff --git a/src/hotspot/os_cpu/windows_aarch64/orderAccess_windows_aarch64.hpp b/src/hotspot/os_cpu/windows_aarch64/orderAccess_windows_aarch64.hpp index 5385f3e6a10..01711aff0e4 100644 --- a/src/hotspot/os_cpu/windows_aarch64/orderAccess_windows_aarch64.hpp +++ b/src/hotspot/os_cpu/windows_aarch64/orderAccess_windows_aarch64.hpp @@ -26,22 +26,29 @@ #define OS_CPU_WINDOWS_AARCH64_ORDERACCESS_WINDOWS_AARCH64_HPP // Included in orderAccess.hpp header file. -#include -using std::atomic_thread_fence; #include #include "vm_version_aarch64.hpp" #include "runtime/vm_version.hpp" // Implementation of class OrderAccess. +// +// Use the MSVC __dmb() intrinsic directly rather than C++ std::atomic_thread_fence(). +// Microsoft documents that __dmb() "inserts compiler blocks to prevent instruction +// reordering" in addition to emitting the hardware DMB instruction. 
This is critical +// because HotSpot uses volatile (non-std::atomic) fields throughout the runtime, and +// std::atomic_thread_fence() is only defined by the C++ standard to order std::atomic +// operations — it may not act as a compiler barrier for volatile/non-atomic accesses +// on ARM64 with /volatile:iso. Using __dmb() ensures correct ordering for the Dekker +// protocol in ObjectMonitor::exit() and similar patterns throughout HotSpot. inline void OrderAccess::loadload() { acquire(); } inline void OrderAccess::storestore() { release(); } inline void OrderAccess::loadstore() { acquire(); } inline void OrderAccess::storeload() { fence(); } -#define READ_MEM_BARRIER atomic_thread_fence(std::memory_order_acquire); -#define WRITE_MEM_BARRIER atomic_thread_fence(std::memory_order_release); -#define FULL_MEM_BARRIER atomic_thread_fence(std::memory_order_seq_cst); +#define READ_MEM_BARRIER __dmb(_ARM64_BARRIER_ISHLD) +#define WRITE_MEM_BARRIER __dmb(_ARM64_BARRIER_ISH) +#define FULL_MEM_BARRIER __dmb(_ARM64_BARRIER_ISH) inline void OrderAccess::acquire() { READ_MEM_BARRIER; diff --git a/src/hotspot/share/c1/c1_GraphBuilder.cpp b/src/hotspot/share/c1/c1_GraphBuilder.cpp index 8658bebdaee..0dc3034c4af 100644 --- a/src/hotspot/share/c1/c1_GraphBuilder.cpp +++ b/src/hotspot/share/c1/c1_GraphBuilder.cpp @@ -3560,6 +3560,14 @@ const char* GraphBuilder::check_can_parse(ciMethod* callee) const { const char* GraphBuilder::should_not_inline(ciMethod* callee) const { if ( compilation()->directive()->should_not_inline(callee)) return "disallowed by CompileCommand"; if ( callee->dont_inline()) return "don't inline by annotation"; + + // Don't inline a method that changes Thread.currentThread() except + // into another method that is annotated @ChangesCurrentThread. 
+ if (callee->changes_current_thread() + && !compilation()->method()->changes_current_thread()) { + return "method changes current thread"; + } + return nullptr; } diff --git a/src/hotspot/share/c1/c1_LIRGenerator.cpp b/src/hotspot/share/c1/c1_LIRGenerator.cpp index 341de0ac0c2..0fd8644eaee 100644 --- a/src/hotspot/share/c1/c1_LIRGenerator.cpp +++ b/src/hotspot/share/c1/c1_LIRGenerator.cpp @@ -1381,7 +1381,14 @@ void LIRGenerator::do_JavaThreadField(Intrinsic* x, ByteSize offset) { LIR_Opr temp = new_register(T_ADDRESS); LIR_Opr reg = rlock_result(x); __ move(new LIR_Address(getThreadPointer(), in_bytes(offset), T_ADDRESS), temp); - access_load(IN_NATIVE, T_OBJECT, + // OopHandle stores uncompressed oops in native memory. + // Use IN_NATIVE to ensure a raw 64-bit load without compressed oop handling. + // Use MO_ACQUIRE so that subsequent loads (e.g. Thread.cont used by + // Continuation.yield) cannot float above this load on weakly-ordered + // architectures such as AArch64. Without acquire semantics the hardware + // may reorder a later field load before the OopHandle dereference, + // observing a stale value after a virtual thread migrates between carriers. 
+ access_load(IN_NATIVE | MO_ACQUIRE, T_OBJECT, LIR_OprFact::address(new LIR_Address(temp, T_OBJECT)), reg); } diff --git a/src/hotspot/share/gc/shared/c1/barrierSetC1.cpp b/src/hotspot/share/gc/shared/c1/barrierSetC1.cpp index ac640fb88d2..1291f6e1b33 100644 --- a/src/hotspot/share/gc/shared/c1/barrierSetC1.cpp +++ b/src/hotspot/share/gc/shared/c1/barrierSetC1.cpp @@ -169,6 +169,7 @@ void BarrierSetC1::load_at_resolved(LIRAccess& access, LIR_Opr result) { LIRGenerator *gen = access.gen(); DecoratorSet decorators = access.decorators(); bool is_volatile = (decorators & MO_SEQ_CST) != 0; + bool is_acquire = (decorators & MO_ACQUIRE) != 0; bool is_atomic = is_volatile || AlwaysAtomicAccesses; bool needs_patching = (decorators & C1_NEEDS_PATCHING) != 0; bool mask_boolean = (decorators & C1_MASK_BOOLEAN) != 0; @@ -187,7 +188,7 @@ void BarrierSetC1::load_at_resolved(LIRAccess& access, LIR_Opr result) { __ load(access.resolved_addr()->as_address_ptr(), result, access.access_emit_info(), patch_code); } - if (is_volatile) { + if (is_volatile || is_acquire) { __ membar_acquire(); } diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp index f74af38387c..23d30e065c5 100644 --- a/src/hotspot/share/opto/library_call.cpp +++ b/src/hotspot/share/opto/library_call.cpp @@ -980,7 +980,14 @@ Node* LibraryCallKit::current_thread_helper(Node*& tls_output, ByteSize handle_o : make_load(nullptr, p, p->bottom_type()->is_ptr(), T_ADDRESS, MemNode::unordered)); thread_obj_handle = _gvn.transform(thread_obj_handle); - DecoratorSet decorators = IN_NATIVE; + // OopHandle stores uncompressed oops in native memory. + // Use IN_NATIVE to ensure proper handling without compressed oop decoding. + // Use MO_ACQUIRE so that subsequent loads (e.g. Thread.cont used by + // Continuation.yield) cannot float above this load on weakly-ordered + // architectures such as AArch64. 
Without acquire semantics the hardware + // may reorder a later field load before the OopHandle dereference, + // observing a stale value after a virtual thread migrates between carriers. + DecoratorSet decorators = IN_NATIVE | MO_ACQUIRE; if (is_immutable) { decorators |= C2_IMMUTABLE_MEMORY; } diff --git a/src/hotspot/share/runtime/objectMonitor.cpp b/src/hotspot/share/runtime/objectMonitor.cpp index ceac0b42acf..16e9cda0a50 100644 --- a/src/hotspot/share/runtime/objectMonitor.cpp +++ b/src/hotspot/share/runtime/objectMonitor.cpp @@ -22,6 +22,7 @@ * */ +#include "classfile/symbolTable.hpp" #include "classfile/vmSymbols.hpp" #include "gc/shared/oopStorage.hpp" #include "gc/shared/oopStorageSet.hpp" @@ -42,6 +43,7 @@ #include "runtime/globals.hpp" #include "runtime/handles.inline.hpp" #include "runtime/interfaceSupport.inline.hpp" +#include "runtime/javaCalls.hpp" #include "runtime/javaThread.inline.hpp" #include "runtime/lightweightSynchronizer.hpp" #include "runtime/mutexLocker.hpp" @@ -523,6 +525,90 @@ void ObjectMonitor::notify_contended_enter(JavaThread* current) { } } +// Compensate the ForkJoinPool scheduler when a pinned virtual thread's +// carrier is about to block on a contended monitor. Without compensation +// the pool has no visibility that its worker is blocked, which can lead to +// carrier starvation / deadlock (JDK-8345294). +// +// This calls CarrierThread.beginMonitorBlock() which uses +// ForkJoinPool.tryCompensateForMonitor() to (a) activate an idle worker or +// create a replacement spare carrier, and (b) decrement RC so that signalWork +// can find a carrier for any queued continuation (e.g. the VT holding the +// contested lock) that would otherwise be starved because blocked carriers +// still count as active in the pool's RC field. The RC decrement is restored +// by endCompensatedBlock when this carrier unblocks. +// +// IMPORTANT: At this point notify_contended_enter() has already set +// _current_pending_monitor. 
The Java call below may itself trigger +// nested monitor contention (e.g., ThreadGroup.addUnstarted acquires a +// monitor when creating a spare worker thread). To avoid: +// (a) assertion failures (pending_monitor != nullptr on nested entry) +// (b) corruption of _current_pending_monitor for the outer monitor +// we save and clear the pending monitor before the Java call and restore +// it afterwards. +static bool compensate_pinned_carrier(JavaThread* current) { + oop carrier = current->threadObj(); + if (carrier == nullptr) return false; + + // Save and clear pending monitor so nested monitor enters work cleanly. + ObjectMonitor* saved_pending = current->current_pending_monitor(); + current->set_current_pending_monitor(nullptr); + + HandleMark hm(current); + Handle carrier_h(current, carrier); + Klass* klass = carrier_h->klass(); + + // CarrierThread.beginMonitorBlock() — compensates the ForkJoinPool via + // tryCompensateForMonitor which always activates/creates a real spare + // carrier (branch 2 of tryCompensate is omitted for this path). + // Not every carrier thread is a CarrierThread (custom schedulers may + // use plain threads), so we look up the method and silently return + // false if it is not found. + TempNewSymbol begin_name = SymbolTable::new_symbol("beginMonitorBlock"); + TempNewSymbol begin_sig = SymbolTable::new_symbol("()Z"); + + JavaValue result(T_BOOLEAN); + JavaCalls::call_virtual(&result, carrier_h, klass, + begin_name, begin_sig, current); + + bool compensated = false; + if (current->has_pending_exception()) { + current->clear_pending_exception(); + } else { + compensated = result.get_jboolean(); + } + + // Restore pending monitor for the outer monitor enter. 
+ current->set_current_pending_monitor(saved_pending); + return compensated; +} + +static void end_compensate_pinned_carrier(JavaThread* current) { + oop carrier = current->threadObj(); + if (carrier == nullptr) return; + + // Save and clear pending monitor (should be nullptr here, but be safe). + ObjectMonitor* saved_pending = current->current_pending_monitor(); + current->set_current_pending_monitor(nullptr); + + HandleMark hm(current); + Handle carrier_h(current, carrier); + Klass* klass = carrier_h->klass(); + + TempNewSymbol end_name = SymbolTable::new_symbol("endBlocking"); + TempNewSymbol end_sig = vmSymbols::void_method_signature(); + + JavaValue result(T_VOID); + JavaCalls::call_virtual(&result, carrier_h, klass, + end_name, end_sig, current); + + if (current->has_pending_exception()) { + current->clear_pending_exception(); + } + + current->set_current_pending_monitor(saved_pending); +} + void ObjectMonitor::enter_with_contention_mark(JavaThread* current, ObjectMonitorContentionMark &cm) { assert(current == JavaThread::current(), "must be"); assert(!has_owner(current), "must be"); @@ -570,6 +656,15 @@ void ObjectMonitor::enter_with_contention_mark(JavaThread* current, ObjectMonito } } + // Compensate the ForkJoinPool before the carrier blocks so that the + // scheduler can spin up a replacement worker thread. This prevents + // deadlocks where every carrier is pinned and waiting for a monitor + // held by an unmounted virtual thread that can never get a carrier. + bool compensated = false; + if (is_virtual) { + compensated = compensate_pinned_carrier(current); + } + { // Change java thread status to indicate blocked on monitor enter. JavaThreadBlockedOnMonitorEnterState jtbmes(current, this); @@ -606,6 +701,11 @@ void ObjectMonitor::enter_with_contention_mark(JavaThread* current, ObjectMonito // the monitor free and clear. } + // End compensation now that the carrier is no longer blocked. 
+ if (compensated) { + end_compensate_pinned_carrier(current); + } + assert(contentions() >= 0, "must not be negative: contentions=%d", contentions()); // Must either set _recursions = 0 or ASSERT _recursions == 0. @@ -2346,6 +2446,16 @@ bool ObjectMonitor::try_spin(JavaThread* current) { if (!has_successor()) { set_successor(current); } + // Dekker/Lamport duality with exit(): ST _succ; MEMBAR; LD _owner. + // The exiter's side (release_clear_owner + storeload) is in exit(). + // Here on the spinner's side, we need a StoreLoad barrier between + // setting _succ and reading _owner to prevent the CPU from reordering + // the _owner load before the _succ store. On ARM64 with MSVC + // /volatile:iso, Atomic::store/load are plain STR/LDR with no + // barrier, so without this fence the Dekker protocol is broken and + // the exiter may not see our successor designation while we may not + // see its lock release — leading to missed wakeups and starvation. + OrderAccess::storeload(); int64_t prv = NO_OWNER; // There are three ways to exit the following loop: @@ -2421,6 +2531,8 @@ bool ObjectMonitor::try_spin(JavaThread* current) { if (!has_successor()) { set_successor(current); + // See Dekker/storeload comment before the loop. + OrderAccess::storeload(); } } diff --git a/src/java.base/share/classes/java/lang/VirtualThread.java b/src/java.base/share/classes/java/lang/VirtualThread.java index b0b134e69a3..17ce3e552ac 100644 --- a/src/java.base/share/classes/java/lang/VirtualThread.java +++ b/src/java.base/share/classes/java/lang/VirtualThread.java @@ -567,6 +567,13 @@ private void afterYield() { setState(newState = TIMED_PARKED); } + // Full fence (StoreLoad) to ensure the PARKED/TIMED_PARKED state + // is visible before reading parkPermit (Dekker pattern with + // unpark which writes parkPermit then reads state). + // Note: storeFence is insufficient — on ARM64 it only emits + // LoadStore+StoreStore (dmb ishst), not StoreLoad (dmb ish). 
+ U.fullFence(); + // may have been unparked while parking if (parkPermit && compareAndSetState(newState, UNPARKED)) { // lazy submit if local queue is empty @@ -592,6 +599,10 @@ private void afterYield() { if (s == BLOCKING) { setState(BLOCKED); + // Full fence (StoreLoad) for Dekker pattern with unblock + // which writes blockPermit then reads state. + U.fullFence(); + // may have been unblocked while blocking if (blockPermit && compareAndSetState(BLOCKED, UNBLOCKED)) { // lazy submit if local queue is empty @@ -622,6 +633,10 @@ private void afterYield() { } } + // Full fence (StoreLoad) for Dekker pattern with notify + // which writes notified then reads state. + U.fullFence(); + // may have been notified while in transition to wait state if (notified && compareAndSetState(newState, BLOCKED)) { // may have even been unblocked already @@ -659,6 +674,14 @@ private void afterDone(boolean notifyContainer) { assert carrierThread == null; setState(TERMINATED); + // Full fence (StoreLoad) to ensure the TERMINATED state is + // visible to any thread that has stored a termination latch + // (joinNanos). Without this, on ARM64 the volatile write of + // state and the subsequent volatile read of termination can be + // reordered, causing a missed-wakeup where both sides miss + // each other's store (Dekker pattern). + U.fullFence(); + // notify anyone waiting for this virtual thread to terminate CountDownLatch termination = this.termination; if (termination != null) { @@ -847,6 +870,10 @@ private void parkOnCarrierThread(boolean timed, long nanos) { */ private void unpark(boolean lazySubmit) { if (!getAndSetParkPermit(true) && currentThread() != this) { + // Full fence (StoreLoad) to ensure parkPermit=true is visible + // before reading state (Dekker pattern with afterYield PARKING + // path which writes state then reads parkPermit). 
+ U.fullFence(); int s = state(); // unparked while parked @@ -889,6 +916,7 @@ void unpark() { private void unblock() { assert !Thread.currentThread().isVirtual(); blockPermit = true; + U.fullFence(); // Full fence (StoreLoad) for Dekker with afterYield BLOCKING path if (state() == BLOCKED && compareAndSetState(BLOCKED, UNBLOCKED)) { submitRunContinuation(); } @@ -1003,6 +1031,10 @@ boolean joinNanos(long nanos) throws InterruptedException { // ensure termination object exists, then re-check state CountDownLatch termination = getTermination(); + // Full fence (StoreLoad) for Dekker with afterDone: ensures that + // the CAS in getTermination() (storing the latch) is visible to + // afterDone before we read state here. + U.fullFence(); if (state() == TERMINATED) return true; diff --git a/src/java.base/share/classes/java/util/concurrent/ForkJoinPool.java b/src/java.base/share/classes/java/util/concurrent/ForkJoinPool.java index cdc5e0b9ea6..cd2594a6262 100644 --- a/src/java.base/share/classes/java/util/concurrent/ForkJoinPool.java +++ b/src/java.base/share/classes/java/util/concurrent/ForkJoinPool.java @@ -2064,7 +2064,8 @@ private int deactivate(WorkQueue w, int phase) { if ((q = qs[k & (n - 1)]) == null) Thread.onSpinWait(); else if ((a = q.array) != null && (cap = a.length) > 0 && - a[q.base & (cap - 1)] != null && --prechecks < 0 && + a[q.base & (cap - 1)] != null && + --prechecks < 0 && (int)(c = ctl) == activePhase && compareAndSetCtl(c, (sp & LMASK) | ((c + RC_UNIT) & UMASK))) return w.phase = activePhase; // reactivate @@ -4347,14 +4348,114 @@ final long beginCompensatedBlock() { return (c == 0) ? 0L : RC_UNIT; } + /** + * Variant of tryCompensate for use when a carrier thread is about to block + * on a native OS-level monitor (objectMonitor::enter_internal) while pinned + * by a virtual thread. + * + *

<p> Unlike tryCompensate, this method omits the passive RC-only path + * (branch 2 of tryCompensate), so the blocking carrier always results in + * either an idle worker being activated (branch 1) or a new spare being + * created (branch 3). + * + *

<p> Both branches decrement RC so that {@code signalWork} (triggered by + * subsequent VT task submissions) sees the pool as under-active and can + * dispatch the queued VT continuations. Without the RC decrement in + * branch 1, monitor-blocked carriers are counted as "active" by + * {@code signalWork}'s {@code (short)(c >>> RC_SHIFT) >= pc} check, + * preventing it from waking idle workers when tasks are queued. + * + *

<p> The RC decrement in both branches requires {@code endCompensatedBlock} + * to restore two RC_UNITs: one for the decrement here and one for + * the compensator's eventual deactivation (RC--). Both branches return + * {@code 2} so that {@code beginMonitorCompensatedBlock} passes + * {@code 2 * RC_UNIT} to {@code endCompensatedBlock}: + *

+     * <pre>
+     *   branch CAS: RC--                (-1)
+     *   compensator deactivates: RC--   (-1)
+     *   endCompensatedBlock:            (+2)
+     *   net:                             0
+     * </pre>
+ */ + private int tryCompensateForMonitor(long c) { + Predicate sat; + long b = config; + int pc = parallelism, + maxTotal = (short)(b >>> TC_SHIFT) + pc, + total = (short)(c >>> TC_SHIFT), + sp = (int)c, + stat = -1; + if (sp != 0) { // activate idle worker (branch 1) + // RC-- so that signalWork sees the pool as under-active while + // the carrier is blocked on the monitor. Without this, RC + // counts monitor-blocked carriers as "active", preventing + // signalWork from waking idle workers. Return stat=2 so + // that beginMonitorCompensatedBlock passes 2*RC_UNIT to + // endCompensatedBlock, balancing both the RC-- here and the + // compensator's eventual deactivation (RC--). + WorkQueue[] qs; WorkQueue v; int i; + if ((qs = queues) != null && qs.length > (i = sp & SMASK) && + (v = qs[i]) != null && + compareAndSetCtl(c, ((c - RC_UNIT) & UMASK) | + (v.stackPred & LMASK))) { + v.phase = sp; + if (v.parking != 0) + U.unpark(v.owner); + stat = 2; // need 2 * RC_UNIT at unblock + } + } + // Branch 2 (passive RC-only decrement) is intentionally absent. + else if (total < maxTotal && total < MAX_CAP) { // create spare (branch 3) + // TC++ and RC-- so that signalWork sees the pool as under-active. + // Return stat=2 so that beginMonitorCompensatedBlock passes + // 2*RC_UNIT to endCompensatedBlock, balancing both the RC-- + // here and the spare's eventual deactivation (RC--). 
+ long nc = ((c + TC_UNIT) & TC_MASK) | + ((c - RC_UNIT) & RC_MASK) | + (c & LMASK); + if ((runState & STOP) != 0L) + stat = 0; + else if (compareAndSetCtl(c, nc)) { + createWorker(); + stat = 2; // need 2 * RC_UNIT at unblock + } + } + else if (!compareAndSetCtl(c, c)) // validate + ; + else if ((sat = saturate) != null && sat.test(this)) + stat = 0; + else + throw new RejectedExecutionException( + "Thread limit exceeded replacing blocked worker"); + return stat; + } + + /** + * Like beginCompensatedBlock but for carrier threads that are about to + * block on a native OS-level monitor (objectMonitor::enter_internal) while + * pinned by a virtual thread. Uses tryCompensateForMonitor which always + * activates or creates a real replacement carrier (never passive RC-only). + *

<p> tryCompensateForMonitor returns:
+     * <ul>
+     * <li> {@code 2} — branch 1 (activated idle worker with RC--), need 2 × RC_UNIT</li>
+     * <li> {@code 2} — branch 3 (created spare with RC--), need 2 × RC_UNIT</li>
+     * <li> {@code 0} — saturated/stopping, no restoration needed</li>
+     * </ul>
+     *
+ * @return value to pass to endCompensatedBlock + */ + final long beginMonitorCompensatedBlock() { + int c; + do {} while ((c = tryCompensateForMonitor(ctl)) < 0); + return (c == 0) ? 0L : (long)c * RC_UNIT; + } + /** * Re-adjusts parallelism after a blocking operation completes. - * @param post value from beginCompensatedBlock + * @param post value from beginCompensatedBlock or beginMonitorCompensatedBlock */ void endCompensatedBlock(long post) { - if (post > 0L) { + if (post > 0L) getAndAddCtl(post); - } } /** ManagedBlock for external threads */ @@ -4411,9 +4512,14 @@ protected RunnableFuture newTaskFor(Callable callable) { public long beginCompensatedBlock(ForkJoinPool pool) { return pool.beginCompensatedBlock(); } + @Override public void endCompensatedBlock(ForkJoinPool pool, long post) { pool.endCompensatedBlock(post); } + @Override + public long beginMonitorCompensatedBlock(ForkJoinPool pool) { + return pool.beginMonitorCompensatedBlock(); + } }); defaultForkJoinWorkerThreadFactory = new DefaultForkJoinWorkerThreadFactory(); diff --git a/src/java.base/share/classes/java/util/concurrent/LinkedTransferQueue.java b/src/java.base/share/classes/java/util/concurrent/LinkedTransferQueue.java index 118d648c7a2..cff4da50d30 100644 --- a/src/java.base/share/classes/java/util/concurrent/LinkedTransferQueue.java +++ b/src/java.base/share/classes/java/util/concurrent/LinkedTransferQueue.java @@ -590,6 +590,11 @@ final Object xfer(Object e, long ns) { q = p.next; if (p.isData != haveData && haveData != (m != null) && p.cmpExItem(m, e) == m) { + // Full fence (StoreLoad) for Dekker with await() which + // writes waiter then reads item. On ARM64, CAS + // (ldaxr/stlxr) + plain load to a different field does + // NOT provide StoreLoad ordering. + VarHandle.fullFence(); Thread w = p.waiter; // matched complementary node if (p != h && h == cmpExHead(h, (q == null) ? 
p : q)) h.next = h; // advance head; self-link old diff --git a/src/java.base/share/classes/java/util/concurrent/SynchronousQueue.java b/src/java.base/share/classes/java/util/concurrent/SynchronousQueue.java index 49efe5d5c2c..c74a2483aa4 100644 --- a/src/java.base/share/classes/java/util/concurrent/SynchronousQueue.java +++ b/src/java.base/share/classes/java/util/concurrent/SynchronousQueue.java @@ -177,6 +177,11 @@ final Object xferLifo(Object e, long ns) { else if (p.cmpExItem(m, e) != m) p = head; // missed; restart else { // matched complementary node + // Full fence (StoreLoad) for Dekker with await() which + // writes waiter then reads item. On ARM64, CAS + // (ldaxr/stlxr) + plain load to a different field does + // NOT provide StoreLoad ordering. + VarHandle.fullFence(); Thread w = p.waiter; cmpExHead(p, p.next); LockSupport.unpark(w); diff --git a/src/java.base/share/classes/java/util/concurrent/locks/AbstractQueuedSynchronizer.java b/src/java.base/share/classes/java/util/concurrent/locks/AbstractQueuedSynchronizer.java index 0ff216c80a0..1c794032aae 100644 --- a/src/java.base/share/classes/java/util/concurrent/locks/AbstractQueuedSynchronizer.java +++ b/src/java.base/share/classes/java/util/concurrent/locks/AbstractQueuedSynchronizer.java @@ -782,6 +782,13 @@ final int acquire(Node node, int arg, boolean shared, Thread.onSpinWait(); } else if (node.status == 0) { node.status = WAITING; // enable signal and recheck + // Full fence (StoreLoad) to ensure WAITING status is visible + // before re-reading state in tryAcquire/tryAcquireShared + // (Dekker pattern with releaseShared/release which writes + // state then reads node.status in signalNext). + // On ARM64, volatile write (stlr) + volatile read (ldar) to + // different addresses does NOT provide StoreLoad ordering. 
+ U.fullFence(); } else { spins = postSpins = (byte)((postSpins << 1) | 1); try { @@ -1097,6 +1104,13 @@ public final boolean tryAcquireNanos(int arg, long nanosTimeout) */ public final boolean release(int arg) { if (tryRelease(arg)) { + // Full fence (StoreLoad) to ensure the state update from + // tryRelease is visible before reading node.status in signalNext + // (Dekker pattern: release writes state then reads status, + // acquire writes status then reads state). + // On ARM64, CAS (stlxr/release) + ldar to different addresses + // does NOT provide StoreLoad ordering. + U.fullFence(); signalNext(head); return true; } @@ -1184,6 +1198,8 @@ public final boolean tryAcquireSharedNanos(int arg, long nanosTimeout) */ public final boolean releaseShared(int arg) { if (tryReleaseShared(arg)) { + // Full fence (StoreLoad) — see comment in release() + U.fullFence(); signalNext(head); return true; } diff --git a/src/java.base/share/classes/jdk/internal/access/JavaUtilConcurrentFJPAccess.java b/src/java.base/share/classes/jdk/internal/access/JavaUtilConcurrentFJPAccess.java index 28c1b7fddee..5f824dfabbd 100644 --- a/src/java.base/share/classes/jdk/internal/access/JavaUtilConcurrentFJPAccess.java +++ b/src/java.base/share/classes/jdk/internal/access/JavaUtilConcurrentFJPAccess.java @@ -29,4 +29,20 @@ public interface JavaUtilConcurrentFJPAccess { long beginCompensatedBlock(ForkJoinPool pool); void endCompensatedBlock(ForkJoinPool pool, long post); + /** + * Like beginCompensatedBlock but for carrier threads that are about to + * block on a native OS-level monitor (objectMonitor::enter_internal) while + * pinned by a virtual thread. This variant: + *
    + * <ul>
+ *   <li>Never takes the passive RC-only path (branch 2 of tryCompensate);
+ *       it always activates an idle worker or creates a new spare.</li>
+ *   <li>Decrements RC immediately so that signalWork can create a carrier
+ *       for any continuation (e.g. the contested lock's holder) that is
+ *       already in the queue, which would otherwise be starved because
+ *       blocked carriers still count as active in RC.</li>
+ * </ul>
+ * The RC decrement is restored by endCompensatedBlock when the carrier + * unblocks. + */ + long beginMonitorCompensatedBlock(ForkJoinPool pool); } diff --git a/src/java.base/share/classes/jdk/internal/misc/CarrierThread.java b/src/java.base/share/classes/jdk/internal/misc/CarrierThread.java index b314b1823a5..811477c2475 100644 --- a/src/java.base/share/classes/jdk/internal/misc/CarrierThread.java +++ b/src/java.base/share/classes/jdk/internal/misc/CarrierThread.java @@ -89,6 +89,47 @@ public boolean beginBlocking() { } } + /** + * Mark the start of a blocking operation where the carrier thread is about + * to block on a native OS-level monitor (objectMonitor::enter_internal) + * while pinned by a virtual thread. Unlike beginBlocking, this always + * activates an idle worker or creates a new spare carrier (never takes the + * passive RC-only path that creates no worker). endBlocking restores + * the compensateValue (1 or 2 RC_UNITs) when this carrier unblocks. + */ + public boolean beginMonitorBlock() { + assert Thread.currentThread().isVirtual() && JLA.currentCarrierThread() == this; + + // Guard against re-entrant calls. tryCompensateForMonitor may call + // createWorker() which can hit a nested monitor (e.g. ThreadGroup), + // re-entering objectMonitor::enter_with_contention_mark and thus this + // method. Without this check each nested call overwrites + // compensateValue, leaking an RC_UNIT per re-entrant compensation. + if (compensating != NOT_COMPENSATING) { + return false; + } + + // don't preempt when attempting to compensate + Continuation.pin(); + try { + compensating = COMPENSATE_IN_PROGRESS; + + // Uses FJP.tryCompensateForMonitor (branch 2 removed) – always + // activates an idle worker (1*RC_UNIT) or creates a new spare + // with RC-- (2*RC_UNIT to balance both the RC-- and the spare's + // eventual deactivation). 
+ compensateValue = ForkJoinPools.beginMonitorCompensatedBlock(getPool()); + compensating = COMPENSATING; + return true; + } catch (Throwable e) { + // exception starting spare thread + compensating = NOT_COMPENSATING; + throw e; + } finally { + Continuation.unpin(); + } + } + /** * Mark the end of a blocking operation. */ @@ -133,6 +174,9 @@ static long beginCompensatedBlock(ForkJoinPool pool) { static void endCompensatedBlock(ForkJoinPool pool, long post) { FJP_ACCESS.endCompensatedBlock(pool, post); } + static long beginMonitorCompensatedBlock(ForkJoinPool pool) { + return FJP_ACCESS.beginMonitorCompensatedBlock(pool); + } } static { diff --git a/src/java.base/share/classes/jdk/internal/vm/Continuation.java b/src/java.base/share/classes/jdk/internal/vm/Continuation.java index a97f9ac9ea4..6ed31e08bd1 100644 --- a/src/java.base/share/classes/jdk/internal/vm/Continuation.java +++ b/src/java.base/share/classes/jdk/internal/vm/Continuation.java @@ -345,6 +345,7 @@ private boolean isEmpty() { * @throws IllegalStateException if not currently in the given {@code scope}, */ @Hidden + @DontInline @JvmtiHideEvents public static boolean yield(ContinuationScope scope) { Continuation cont = JLA.getContinuation(currentCarrierThread()); diff --git a/test/hotspot/jtreg/compiler/c2/aarch64/TestTrampoline.java b/test/hotspot/jtreg/compiler/c2/aarch64/TestTrampoline.java index 114f7f9bfab..d44440849dd 100644 --- a/test/hotspot/jtreg/compiler/c2/aarch64/TestTrampoline.java +++ b/test/hotspot/jtreg/compiler/c2/aarch64/TestTrampoline.java @@ -23,7 +23,7 @@ */ package compiler.c2.aarch64; - +import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Iterator; import jdk.test.lib.process.OutputAnalyzer; @@ -96,6 +96,26 @@ private static void test(String s, int i) { public static void main(String[] args) { String s = "Returns the char value at the specified index."; + + boolean isWindows = System.getProperty("os.name").startsWith("Windows"); + if (isWindows) { + /* + 
Depending on the Windows code page, the UTF-16 path in the String.charAt method + may be executed during JVM startup. However, it may not execute often enough to + cause it to be inlined by C2. In this case, C2 may not fully inline the + String.charAt method, resulting in the unexpected generation of trampolines. + Therefore, we execute the UTF-16 path in the String.charAt method in a loop + enough times to ensure that it will be inlined by C2. + */ + String jnuEncoding = System.getProperty("sun.jnu.encoding"); + if (jnuEncoding != null && Charset.isSupported(jnuEncoding)) { + byte[] bytes = s.getBytes(Charset.forName(jnuEncoding)); + + for (int i = 0; i < ITERATIONS_TO_HEAT_LOOP; ++i) { + String encoded = new String(bytes, Charset.forName(jnuEncoding)); + } + } + } for (int i = 0; i < ITERATIONS_TO_HEAT_LOOP; ++i) { test(s, i % s.length()); }