From d54a0cf0c4d6381baadf5c6bcc961b296ae147da Mon Sep 17 00:00:00 2001 From: Mat Carter Date: Mon, 23 Feb 2026 11:28:04 -0800 Subject: [PATCH 01/12] Fix ARM64 Virtual Thread currentCarrierThread Intrinsic Bug --- src/hotspot/cpu/aarch64/aarch64.ad | 38 +++++++++++++++++++ src/hotspot/share/c1/c1_GraphBuilder.cpp | 8 ++++ src/hotspot/share/c1/c1_LIRGenerator.cpp | 9 ++++- .../share/gc/shared/c1/barrierSetC1.cpp | 3 +- src/hotspot/share/opto/library_call.cpp | 9 ++++- .../classes/jdk/internal/vm/Continuation.java | 1 + 6 files changed, 65 insertions(+), 3 deletions(-) diff --git a/src/hotspot/cpu/aarch64/aarch64.ad b/src/hotspot/cpu/aarch64/aarch64.ad index 6899526a282..4d81bef329d 100644 --- a/src/hotspot/cpu/aarch64/aarch64.ad +++ b/src/hotspot/cpu/aarch64/aarch64.ad @@ -6676,6 +6676,25 @@ instruct loadP(iRegPNoSp dst, memory8 mem) ins_pipe(iload_reg_mem); %} +// Load Pointer with acquire semantics (load + barrier for complex addressing) +instruct loadP_acq(iRegPNoSp dst, memory8 mem) +%{ + match(Set dst (LoadP mem)); + predicate(needs_acquiring_load(n) && (n->as_Load()->barrier_data() == 0)); + + ins_cost(4 * INSN_COST + VOLATILE_REF_COST); + format %{ "ldr $dst, $mem\t# ptr (acquire)\n\t" + "dmb ishld" %} + + ins_encode %{ + loadStore(masm, &MacroAssembler::ldr, $dst$$Register, $mem->opcode(), + as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp, 8); + __ membar(Assembler::LoadLoad | Assembler::LoadStore); + %} + + ins_pipe(pipe_serial); +%} + // Load Compressed Pointer instruct loadN(iRegNNoSp dst, memory4 mem) %{ @@ -6690,6 +6709,25 @@ instruct loadN(iRegNNoSp dst, memory4 mem) ins_pipe(iload_reg_mem); %} +// Load Compressed Pointer with acquire semantics (load + barrier for complex addressing) +instruct loadN_acq(iRegNNoSp dst, memory4 mem) +%{ + match(Set dst (LoadN mem)); + predicate(needs_acquiring_load(n) && n->as_Load()->barrier_data() == 0); + + ins_cost(4 * INSN_COST + VOLATILE_REF_COST); + format %{ "ldrw $dst, $mem\t# compressed ptr 
(acquire)\n\t" + "dmb ishld" %} + + ins_encode %{ + loadStore(masm, &MacroAssembler::ldrw, $dst$$Register, $mem->opcode(), + as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp, 4); + __ membar(Assembler::LoadLoad | Assembler::LoadStore); + %} + + ins_pipe(pipe_serial); +%} + // Load Klass Pointer instruct loadKlass(iRegPNoSp dst, memory8 mem) %{ diff --git a/src/hotspot/share/c1/c1_GraphBuilder.cpp b/src/hotspot/share/c1/c1_GraphBuilder.cpp index 8658bebdaee..0dc3034c4af 100644 --- a/src/hotspot/share/c1/c1_GraphBuilder.cpp +++ b/src/hotspot/share/c1/c1_GraphBuilder.cpp @@ -3560,6 +3560,14 @@ const char* GraphBuilder::check_can_parse(ciMethod* callee) const { const char* GraphBuilder::should_not_inline(ciMethod* callee) const { if ( compilation()->directive()->should_not_inline(callee)) return "disallowed by CompileCommand"; if ( callee->dont_inline()) return "don't inline by annotation"; + + // Don't inline a method that changes Thread.currentThread() except + // into another method that is annotated @ChangesCurrentThread. + if (callee->changes_current_thread() + && !compilation()->method()->changes_current_thread()) { + return "method changes current thread"; + } + return nullptr; } diff --git a/src/hotspot/share/c1/c1_LIRGenerator.cpp b/src/hotspot/share/c1/c1_LIRGenerator.cpp index 341de0ac0c2..0fd8644eaee 100644 --- a/src/hotspot/share/c1/c1_LIRGenerator.cpp +++ b/src/hotspot/share/c1/c1_LIRGenerator.cpp @@ -1381,7 +1381,14 @@ void LIRGenerator::do_JavaThreadField(Intrinsic* x, ByteSize offset) { LIR_Opr temp = new_register(T_ADDRESS); LIR_Opr reg = rlock_result(x); __ move(new LIR_Address(getThreadPointer(), in_bytes(offset), T_ADDRESS), temp); - access_load(IN_NATIVE, T_OBJECT, + // OopHandle stores uncompressed oops in native memory. + // Use IN_NATIVE to ensure a raw 64-bit load without compressed oop handling. + // Use MO_ACQUIRE so that subsequent loads (e.g. 
Thread.cont used by + // Continuation.yield) cannot float above this load on weakly-ordered + // architectures such as AArch64. Without acquire semantics the hardware + // may reorder a later field load before the OopHandle dereference, + // observing a stale value after a virtual thread migrates between carriers. + access_load(IN_NATIVE | MO_ACQUIRE, T_OBJECT, LIR_OprFact::address(new LIR_Address(temp, T_OBJECT)), reg); } diff --git a/src/hotspot/share/gc/shared/c1/barrierSetC1.cpp b/src/hotspot/share/gc/shared/c1/barrierSetC1.cpp index ac640fb88d2..1291f6e1b33 100644 --- a/src/hotspot/share/gc/shared/c1/barrierSetC1.cpp +++ b/src/hotspot/share/gc/shared/c1/barrierSetC1.cpp @@ -169,6 +169,7 @@ void BarrierSetC1::load_at_resolved(LIRAccess& access, LIR_Opr result) { LIRGenerator *gen = access.gen(); DecoratorSet decorators = access.decorators(); bool is_volatile = (decorators & MO_SEQ_CST) != 0; + bool is_acquire = (decorators & MO_ACQUIRE) != 0; bool is_atomic = is_volatile || AlwaysAtomicAccesses; bool needs_patching = (decorators & C1_NEEDS_PATCHING) != 0; bool mask_boolean = (decorators & C1_MASK_BOOLEAN) != 0; @@ -187,7 +188,7 @@ void BarrierSetC1::load_at_resolved(LIRAccess& access, LIR_Opr result) { __ load(access.resolved_addr()->as_address_ptr(), result, access.access_emit_info(), patch_code); } - if (is_volatile) { + if (is_volatile || is_acquire) { __ membar_acquire(); } diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp index f74af38387c..23d30e065c5 100644 --- a/src/hotspot/share/opto/library_call.cpp +++ b/src/hotspot/share/opto/library_call.cpp @@ -980,7 +980,14 @@ Node* LibraryCallKit::current_thread_helper(Node*& tls_output, ByteSize handle_o : make_load(nullptr, p, p->bottom_type()->is_ptr(), T_ADDRESS, MemNode::unordered)); thread_obj_handle = _gvn.transform(thread_obj_handle); - DecoratorSet decorators = IN_NATIVE; + // OopHandle stores uncompressed oops in native memory. 
+ // Use IN_NATIVE to ensure proper handling without compressed oop decoding. + // Use MO_ACQUIRE so that subsequent loads (e.g. Thread.cont used by + // Continuation.yield) cannot float above this load on weakly-ordered + // architectures such as AArch64. Without acquire semantics the hardware + // may reorder a later field load before the OopHandle dereference, + // observing a stale value after a virtual thread migrates between carriers. + DecoratorSet decorators = IN_NATIVE | MO_ACQUIRE; if (is_immutable) { decorators |= C2_IMMUTABLE_MEMORY; } diff --git a/src/java.base/share/classes/jdk/internal/vm/Continuation.java b/src/java.base/share/classes/jdk/internal/vm/Continuation.java index a97f9ac9ea4..6ed31e08bd1 100644 --- a/src/java.base/share/classes/jdk/internal/vm/Continuation.java +++ b/src/java.base/share/classes/jdk/internal/vm/Continuation.java @@ -345,6 +345,7 @@ private boolean isEmpty() { * @throws IllegalStateException if not currently in the given {@code scope}, */ @Hidden + @DontInline @JvmtiHideEvents public static boolean yield(ContinuationScope scope) { Continuation cont = JLA.getContinuation(currentCarrierThread()); From b8d5fcae31761d919ddfe14c72c0e5338f6f346f Mon Sep 17 00:00:00 2001 From: Mat Carter Date: Mon, 23 Feb 2026 13:40:04 -0800 Subject: [PATCH 02/12] Force MemNode::acquire on all ARM64 pointer loads --- src/hotspot/share/opto/memnode.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/hotspot/share/opto/memnode.cpp b/src/hotspot/share/opto/memnode.cpp index 24b81b894cb..5ad5630a61c 100644 --- a/src/hotspot/share/opto/memnode.cpp +++ b/src/hotspot/share/opto/memnode.cpp @@ -970,17 +970,30 @@ Node* LoadNode::make(PhaseGVN& gvn, Node* ctl, Node* mem, Node* adr, const TypeP case T_LONG: load = new LoadLNode (ctl, mem, adr, adr_type, rt->is_long(), mo, control_dependency, require_atomic_access); break; case T_FLOAT: load = new LoadFNode (ctl, mem, adr, adr_type, rt, mo, control_dependency); break; case T_DOUBLE: load 
= new LoadDNode (ctl, mem, adr, adr_type, rt, mo, control_dependency, require_atomic_access); break; +#ifdef AARCH64 + // On ARM64, use acquire semantics for all pointer loads to ensure proper ordering + case T_ADDRESS: load = new LoadPNode (ctl, mem, adr, adr_type, rt->is_ptr(), MemNode::acquire, control_dependency); break; +#else case T_ADDRESS: load = new LoadPNode (ctl, mem, adr, adr_type, rt->is_ptr(), mo, control_dependency); break; +#endif case T_OBJECT: case T_NARROWOOP: #ifdef _LP64 if (adr->bottom_type()->is_ptr_to_narrowoop()) { +#ifdef AARCH64 + load = new LoadNNode(ctl, mem, adr, adr_type, rt->make_narrowoop(), MemNode::acquire, control_dependency); +#else load = new LoadNNode(ctl, mem, adr, adr_type, rt->make_narrowoop(), mo, control_dependency); +#endif } else #endif { assert(!adr->bottom_type()->is_ptr_to_narrowoop() && !adr->bottom_type()->is_ptr_to_narrowklass(), "should have got back a narrow oop"); +#ifdef AARCH64 + load = new LoadPNode(ctl, mem, adr, adr_type, rt->is_ptr(), MemNode::acquire, control_dependency); +#else load = new LoadPNode(ctl, mem, adr, adr_type, rt->is_ptr(), mo, control_dependency); +#endif } break; default: From 9ac65c9d9ff14844a53f7f2c3b6685a64bcee5dd Mon Sep 17 00:00:00 2001 From: Mat Carter Date: Mon, 23 Feb 2026 13:42:29 -0800 Subject: [PATCH 03/12] ARM64: Force acquire semantics on pointer loads to fix weak memory ordering On ARM64's weakly-ordered memory model, pointer loads (LoadP/LoadN) can be reordered with subsequent dependent loads by the hardware. This causes failures in concurrent data structures used by virtual thread scheduling (SynchronousQueue, LinkedTransferQueue) where a loaded pointer is immediately dereferenced but the dependent field load sees stale data. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Force MemNode::acquire at IR creation time in LoadNode::make() for all LoadP/LoadN nodes on AARCH64 when barrier_data == 0. 
This must happen at IR creation rather than instruction selection because C2's optimizer can eliminate or reorder loads between those phases — a load removed by LoadNode::Identity() or hoisted by Ideal() will never reach the matcher to receive a barrier. Loads with barrier_data != 0 (GC reference barriers) are excluded since GC barrier patterns emit their own ordering sequences and have no acquire variants in the .ad file. Also reverts needs_acquiring_load() to the upstream form (is_acquire() only), since the opcode-based check is now redundant — all pointer loads already have acquire set on the IR node. --- src/hotspot/share/opto/memnode.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/hotspot/share/opto/memnode.cpp b/src/hotspot/share/opto/memnode.cpp index 5ad5630a61c..99dd38ed48f 100644 --- a/src/hotspot/share/opto/memnode.cpp +++ b/src/hotspot/share/opto/memnode.cpp @@ -971,8 +971,9 @@ Node* LoadNode::make(PhaseGVN& gvn, Node* ctl, Node* mem, Node* adr, const TypeP case T_FLOAT: load = new LoadFNode (ctl, mem, adr, adr_type, rt, mo, control_dependency); break; case T_DOUBLE: load = new LoadDNode (ctl, mem, adr, adr_type, rt, mo, control_dependency, require_atomic_access); break; #ifdef AARCH64 - // On ARM64, use acquire semantics for all pointer loads to ensure proper ordering - case T_ADDRESS: load = new LoadPNode (ctl, mem, adr, adr_type, rt->is_ptr(), MemNode::acquire, control_dependency); break; + // On ARM64, use acquire semantics for pointer loads to ensure proper ordering. + // Only when barrier_data == 0; GC barrier patterns handle their own ordering. + case T_ADDRESS: load = new LoadPNode (ctl, mem, adr, adr_type, rt->is_ptr(), barrier_data == 0 ? 
MemNode::acquire : mo, control_dependency); break; #else case T_ADDRESS: load = new LoadPNode (ctl, mem, adr, adr_type, rt->is_ptr(), mo, control_dependency); break; #endif @@ -981,7 +982,7 @@ Node* LoadNode::make(PhaseGVN& gvn, Node* ctl, Node* mem, Node* adr, const TypeP #ifdef _LP64 if (adr->bottom_type()->is_ptr_to_narrowoop()) { #ifdef AARCH64 - load = new LoadNNode(ctl, mem, adr, adr_type, rt->make_narrowoop(), MemNode::acquire, control_dependency); + load = new LoadNNode(ctl, mem, adr, adr_type, rt->make_narrowoop(), barrier_data == 0 ? MemNode::acquire : mo, control_dependency); #else load = new LoadNNode(ctl, mem, adr, adr_type, rt->make_narrowoop(), mo, control_dependency); #endif @@ -990,7 +991,7 @@ Node* LoadNode::make(PhaseGVN& gvn, Node* ctl, Node* mem, Node* adr, const TypeP { assert(!adr->bottom_type()->is_ptr_to_narrowoop() && !adr->bottom_type()->is_ptr_to_narrowklass(), "should have got back a narrow oop"); #ifdef AARCH64 - load = new LoadPNode(ctl, mem, adr, adr_type, rt->is_ptr(), MemNode::acquire, control_dependency); + load = new LoadPNode(ctl, mem, adr, adr_type, rt->is_ptr(), barrier_data == 0 ? MemNode::acquire : mo, control_dependency); #else load = new LoadPNode(ctl, mem, adr, adr_type, rt->is_ptr(), mo, control_dependency); #endif From 73d39aa4e1b9b1ace5fdc7224e845deb0dcae346 Mon Sep 17 00:00:00 2001 From: Mat Carter Date: Mon, 23 Feb 2026 13:43:20 -0800 Subject: [PATCH 04/12] ARM64: Fix loadP_acq/loadN_acq instruction cost for correct pattern matching Lower the cost of loadP_acq and loadN_acq from (4*INSN_COST + VOLATILE_REF_COST) to VOLATILE_REF_COST so the instruction matcher prefers these patterns over the loadP_volatile/loadN_volatile + addP_reg_imm combination when the address has a foldable constant offset. The acq patterns use ldr+dmb with memory8/memory4 operands that support offset addressing, while the volatile patterns use ldar with an indirect operand (no offset). 
At the previous higher cost, the matcher would choose the cheaper ldar variant plus a separate addP_reg_imm instruction rather than folding the offset into a single ldr+dmb. This left unexpected addP_reg_imm nodes in the compiled output, causing TestCombineAddPWithConstantOffsets to fail on AArch64 fastdebug builds. --- src/hotspot/cpu/aarch64/aarch64.ad | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/hotspot/cpu/aarch64/aarch64.ad b/src/hotspot/cpu/aarch64/aarch64.ad index 4d81bef329d..28a5c62bb05 100644 --- a/src/hotspot/cpu/aarch64/aarch64.ad +++ b/src/hotspot/cpu/aarch64/aarch64.ad @@ -6677,12 +6677,15 @@ instruct loadP(iRegPNoSp dst, memory8 mem) %} // Load Pointer with acquire semantics (load + barrier for complex addressing) +// Cost must be <= VOLATILE_REF_COST so the matcher prefers this pattern +// (which supports memory8 offset addressing) over loadP_volatile + addP_reg_imm +// when the address has a foldable offset. instruct loadP_acq(iRegPNoSp dst, memory8 mem) %{ match(Set dst (LoadP mem)); predicate(needs_acquiring_load(n) && (n->as_Load()->barrier_data() == 0)); - ins_cost(4 * INSN_COST + VOLATILE_REF_COST); + ins_cost(VOLATILE_REF_COST); format %{ "ldr $dst, $mem\t# ptr (acquire)\n\t" "dmb ishld" %} @@ -6710,12 +6713,15 @@ instruct loadN(iRegNNoSp dst, memory4 mem) %} // Load Compressed Pointer with acquire semantics (load + barrier for complex addressing) +// Cost must be <= VOLATILE_REF_COST so the matcher prefers this pattern +// (which supports memory4 offset addressing) over loadN_volatile + addP_reg_imm +// when the address has a foldable offset. 
instruct loadN_acq(iRegNNoSp dst, memory4 mem) %{ match(Set dst (LoadN mem)); predicate(needs_acquiring_load(n) && n->as_Load()->barrier_data() == 0); - ins_cost(4 * INSN_COST + VOLATILE_REF_COST); + ins_cost(VOLATILE_REF_COST); format %{ "ldrw $dst, $mem\t# compressed ptr (acquire)\n\t" "dmb ishld" %} From e898b3eb3d9530aae3631d168234c23d54098189 Mon Sep 17 00:00:00 2001 From: Mat Carter Date: Mon, 23 Feb 2026 13:44:27 -0800 Subject: [PATCH 05/12] Fix weak memory ordering for Windows AArch64 with MSVC /volatile:iso MSVC's /volatile:iso means volatile reads/writes have no implicit acquire/release on ARM64, unlike GCC/Clang. This causes memory ordering violations in the interpreter, C1, and C2 JIT tiers. Changes: 1. orderAccess_windows_aarch64.hpp: Use __dmb() intrinsics instead of std::atomic_thread_fence() for READ_MEM_BARRIER (dmb ishld), WRITE_MEM_BARRIER and FULL_MEM_BARRIER (dmb ish). 2. atomic_windows_aarch64.hpp: Implement PlatformOrderedLoad (__ldar) and PlatformOrderedStore (__stlr) for 1/2/4/8-byte widths, ensuring acquire/release semantics for Atomic::load_acquire/store_release. 3. barrierSetAssembler_aarch64.cpp: Add dmb ishld after interpreter oop reference loads to enforce load ordering. 4. barrierSetC1.cpp: Add membar_acquire() after C1 oop reference loads. 5. memnode.cpp: Add MemNode::acquire to C2 LoadP/LoadN for T_OBJECT and T_NARROWOOP when barrier_data == 0. T_ADDRESS is intentionally excluded to avoid breaking C2 TLAB inline allocation ordering. 
--- .../gc/shared/barrierSetAssembler_aarch64.cpp | 15 +++ .../atomic_windows_aarch64.hpp | 102 ++++++++++++++++++ .../orderAccess_windows_aarch64.hpp | 17 ++- .../share/gc/shared/c1/barrierSetC1.cpp | 10 ++ src/hotspot/share/opto/memnode.cpp | 12 +-- 5 files changed, 145 insertions(+), 11 deletions(-) diff --git a/src/hotspot/cpu/aarch64/gc/shared/barrierSetAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/gc/shared/barrierSetAssembler_aarch64.cpp index 302701e1cad..a86eaf23a14 100644 --- a/src/hotspot/cpu/aarch64/gc/shared/barrierSetAssembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/gc/shared/barrierSetAssembler_aarch64.cpp @@ -67,6 +67,21 @@ void BarrierSetAssembler::load_at(MacroAssembler* masm, DecoratorSet decorators, assert(in_native, "why else?"); __ ldr(dst, src); } + // On AArch64's weakly-ordered memory model, a plain LDR of a reference can + // be reordered with subsequent dependent loads (e.g. dereferencing the loaded + // pointer to read a field). This causes failures in concurrent data structures + // such as ForkJoinPool's work queues where a reference load followed by a + // field access may see stale data. + // + // Add a LoadLoad|LoadStore barrier (dmb ishld) after every reference load to + // enforce acquire semantics, matching the C2 fix in memnode.cpp that forces + // MemNode::acquire on all LoadP/LoadN nodes on AArch64. + // + // This is only needed when the caller hasn't already requested acquire or + // stronger ordering through the decorators, to avoid redundant barriers. 
+ if ((decorators & (MO_ACQUIRE | MO_SEQ_CST)) == 0) { + __ membar(Assembler::LoadLoad | Assembler::LoadStore); + } break; } case T_BOOLEAN: __ load_unsigned_byte (dst, src); break; diff --git a/src/hotspot/os_cpu/windows_aarch64/atomic_windows_aarch64.hpp b/src/hotspot/os_cpu/windows_aarch64/atomic_windows_aarch64.hpp index 90fc8ecfba4..117de8b52a4 100644 --- a/src/hotspot/os_cpu/windows_aarch64/atomic_windows_aarch64.hpp +++ b/src/hotspot/os_cpu/windows_aarch64/atomic_windows_aarch64.hpp @@ -108,4 +108,106 @@ DEFINE_INTRINSIC_CMPXCHG(InterlockedCompareExchange64, __int64) #undef DEFINE_INTRINSIC_CMPXCHG +// Specialize PlatformOrderedLoad and PlatformOrderedStore to use MSVC's +// __ldar/__stlr intrinsics, matching the Linux AArch64 implementation which +// uses __atomic_load/__atomic_store with __ATOMIC_ACQUIRE/__ATOMIC_RELEASE. +// These emit single LDAR/STLR instructions that have acquire/release semantics +// baked in, rather than the generic fallback of separate dmb + plain load/store. +// On AArch64, LDAR/STLR provide stronger ordering guarantees than dmb + ldr/str +// for cross-core visibility (Dekker patterns, etc.). 
+ +template<> +struct Atomic::PlatformOrderedLoad<1, X_ACQUIRE> { + template + T operator()(const volatile T* p) const { + STATIC_ASSERT(sizeof(T) == 1); + return PrimitiveConversions::cast( + __ldar8(reinterpret_cast( + const_cast(p)))); + } +}; + +template<> +struct Atomic::PlatformOrderedLoad<2, X_ACQUIRE> { + template + T operator()(const volatile T* p) const { + STATIC_ASSERT(sizeof(T) == 2); + return PrimitiveConversions::cast( + __ldar16(reinterpret_cast( + const_cast(p)))); + } +}; + +template<> +struct Atomic::PlatformOrderedLoad<4, X_ACQUIRE> { + template + T operator()(const volatile T* p) const { + STATIC_ASSERT(sizeof(T) == 4); + return PrimitiveConversions::cast( + __ldar32(reinterpret_cast( + const_cast(p)))); + } +}; + +template<> +struct Atomic::PlatformOrderedLoad<8, X_ACQUIRE> { + template + T operator()(const volatile T* p) const { + STATIC_ASSERT(sizeof(T) == 8); + return PrimitiveConversions::cast( + __ldar64(reinterpret_cast( + const_cast(p)))); + } +}; + +template<> +struct Atomic::PlatformOrderedStore<1, RELEASE_X> { + template + void operator()(volatile T* p, T v) const { + STATIC_ASSERT(sizeof(T) == 1); + __stlr8(reinterpret_cast(p), + PrimitiveConversions::cast(v)); + } +}; + +template<> +struct Atomic::PlatformOrderedStore<2, RELEASE_X> { + template + void operator()(volatile T* p, T v) const { + STATIC_ASSERT(sizeof(T) == 2); + __stlr16(reinterpret_cast(p), + PrimitiveConversions::cast(v)); + } +}; + +template<> +struct Atomic::PlatformOrderedStore<4, RELEASE_X> { + template + void operator()(volatile T* p, T v) const { + STATIC_ASSERT(sizeof(T) == 4); + __stlr32(reinterpret_cast(p), + PrimitiveConversions::cast(v)); + } +}; + +template<> +struct Atomic::PlatformOrderedStore<8, RELEASE_X> { + template + void operator()(volatile T* p, T v) const { + STATIC_ASSERT(sizeof(T) == 8); + __stlr64(reinterpret_cast(p), + PrimitiveConversions::cast(v)); + } +}; + +// release_store + fence combination, matching Linux AArch64 +template +struct 
Atomic::PlatformOrderedStore { + template + void operator()(volatile T* p, T v) const { + Atomic::release_store(p, v); + OrderAccess::fence(); + } +}; + #endif // OS_CPU_WINDOWS_AARCH64_ATOMIC_WINDOWS_AARCH64_HPP diff --git a/src/hotspot/os_cpu/windows_aarch64/orderAccess_windows_aarch64.hpp b/src/hotspot/os_cpu/windows_aarch64/orderAccess_windows_aarch64.hpp index 5385f3e6a10..01711aff0e4 100644 --- a/src/hotspot/os_cpu/windows_aarch64/orderAccess_windows_aarch64.hpp +++ b/src/hotspot/os_cpu/windows_aarch64/orderAccess_windows_aarch64.hpp @@ -26,22 +26,29 @@ #define OS_CPU_WINDOWS_AARCH64_ORDERACCESS_WINDOWS_AARCH64_HPP // Included in orderAccess.hpp header file. -#include -using std::atomic_thread_fence; #include #include "vm_version_aarch64.hpp" #include "runtime/vm_version.hpp" // Implementation of class OrderAccess. +// +// Use the MSVC __dmb() intrinsic directly rather than C++ std::atomic_thread_fence(). +// Microsoft documents that __dmb() "inserts compiler blocks to prevent instruction +// reordering" in addition to emitting the hardware DMB instruction. This is critical +// because HotSpot uses volatile (non-std::atomic) fields throughout the runtime, and +// std::atomic_thread_fence() is only defined by the C++ standard to order std::atomic +// operations — it may not act as a compiler barrier for volatile/non-atomic accesses +// on ARM64 with /volatile:iso. Using __dmb() ensures correct ordering for the Dekker +// protocol in ObjectMonitor::exit() and similar patterns throughout HotSpot. 
inline void OrderAccess::loadload() { acquire(); } inline void OrderAccess::storestore() { release(); } inline void OrderAccess::loadstore() { acquire(); } inline void OrderAccess::storeload() { fence(); } -#define READ_MEM_BARRIER atomic_thread_fence(std::memory_order_acquire); -#define WRITE_MEM_BARRIER atomic_thread_fence(std::memory_order_release); -#define FULL_MEM_BARRIER atomic_thread_fence(std::memory_order_seq_cst); +#define READ_MEM_BARRIER __dmb(_ARM64_BARRIER_ISHLD) +#define WRITE_MEM_BARRIER __dmb(_ARM64_BARRIER_ISH) +#define FULL_MEM_BARRIER __dmb(_ARM64_BARRIER_ISH) inline void OrderAccess::acquire() { READ_MEM_BARRIER; diff --git a/src/hotspot/share/gc/shared/c1/barrierSetC1.cpp b/src/hotspot/share/gc/shared/c1/barrierSetC1.cpp index 1291f6e1b33..04bff8d6936 100644 --- a/src/hotspot/share/gc/shared/c1/barrierSetC1.cpp +++ b/src/hotspot/share/gc/shared/c1/barrierSetC1.cpp @@ -188,6 +188,16 @@ void BarrierSetC1::load_at_resolved(LIRAccess& access, LIR_Opr result) { __ load(access.resolved_addr()->as_address_ptr(), result, access.access_emit_info(), patch_code); } +#ifdef AARCH64 + // On AArch64's weakly-ordered memory model, a plain load of a reference can be + // reordered with subsequent dependent loads (e.g. dereferencing the loaded pointer + // to read a field). Add acquire barrier after all reference loads to match the C2 + // fix in memnode.cpp that forces MemNode::acquire on all LoadP/LoadN on AArch64. 
+ if (!is_volatile && !is_acquire && is_reference_type(access.type())) { + __ membar_acquire(); + } +#endif + if (is_volatile || is_acquire) { __ membar_acquire(); } diff --git a/src/hotspot/share/opto/memnode.cpp b/src/hotspot/share/opto/memnode.cpp index 99dd38ed48f..9cab7b7adf7 100644 --- a/src/hotspot/share/opto/memnode.cpp +++ b/src/hotspot/share/opto/memnode.cpp @@ -970,18 +970,18 @@ Node* LoadNode::make(PhaseGVN& gvn, Node* ctl, Node* mem, Node* adr, const TypeP case T_LONG: load = new LoadLNode (ctl, mem, adr, adr_type, rt->is_long(), mo, control_dependency, require_atomic_access); break; case T_FLOAT: load = new LoadFNode (ctl, mem, adr, adr_type, rt, mo, control_dependency); break; case T_DOUBLE: load = new LoadDNode (ctl, mem, adr, adr_type, rt, mo, control_dependency, require_atomic_access); break; -#ifdef AARCH64 - // On ARM64, use acquire semantics for pointer loads to ensure proper ordering. - // Only when barrier_data == 0; GC barrier patterns handle their own ordering. - case T_ADDRESS: load = new LoadPNode (ctl, mem, adr, adr_type, rt->is_ptr(), barrier_data == 0 ? MemNode::acquire : mo, control_dependency); break; -#else case T_ADDRESS: load = new LoadPNode (ctl, mem, adr, adr_type, rt->is_ptr(), mo, control_dependency); break; -#endif case T_OBJECT: case T_NARROWOOP: #ifdef _LP64 if (adr->bottom_type()->is_ptr_to_narrowoop()) { #ifdef AARCH64 + // On ARM64 with MSVC /volatile:iso, plain volatile loads have no acquire + // semantics. Use acquire for oop reference loads (T_OBJECT/T_NARROWOOP) to + // ensure proper ordering after GC reference loads that bypass GC barriers + // (barrier_data == 0). T_ADDRESS (raw pointer) loads are excluded — they are + // internal JVM loads (TLAB, TLS, metadata) that don't need Java memory model + // ordering and adding acquire causes C2 TLAB allocation corruption. load = new LoadNNode(ctl, mem, adr, adr_type, rt->make_narrowoop(), barrier_data == 0 ? 
MemNode::acquire : mo, control_dependency); #else load = new LoadNNode(ctl, mem, adr, adr_type, rt->make_narrowoop(), mo, control_dependency); From 378cdf15c4c368936483f7f3d3defe10e811a727 Mon Sep 17 00:00:00 2001 From: Mat Carter Date: Tue, 24 Feb 2026 09:22:47 -0800 Subject: [PATCH 06/12] Fix Windows AArch64 memory ordering for MSVC /volatile:iso Add /volatile:ms to JVM_CFLAGS for Windows AArch64 in build config Override PlatformLoad/PlatformStore with LDAR/STLR intrinsics as defense-in-depth Add StoreLoad barrier in ObjectMonitor::try_spin() Dekker protocol Use acquire loads and opaque base update in ForkJoinPool work-stealing --- make/autoconf/flags-cflags.m4 | 8 ++ .../atomic_windows_aarch64.hpp | 101 ++++++++++++++++++ src/hotspot/share/runtime/objectMonitor.cpp | 12 +++ .../java/util/concurrent/ForkJoinPool.java | 15 +-- 4 files changed, 130 insertions(+), 6 deletions(-) diff --git a/make/autoconf/flags-cflags.m4 b/make/autoconf/flags-cflags.m4 index 3a61840e8c9..9b8f9cdb505 100644 --- a/make/autoconf/flags-cflags.m4 +++ b/make/autoconf/flags-cflags.m4 @@ -776,6 +776,14 @@ AC_DEFUN([FLAGS_SETUP_CFLAGS_CPU_DEP], elif test "x$TOOLCHAIN_TYPE" = xmicrosoft; then if test "x$FLAGS_CPU" = xx86; then $1_CFLAGS_CPU_JVM="-arch:IA32" + elif test "x$FLAGS_CPU" = xaarch64; then + # MSVC defaults to /volatile:iso on ARM64, which makes volatile reads/writes + # plain LDR/STR with no acquire/release barriers. HotSpot's C++ runtime code + # was written assuming volatile provides acquire/release semantics (as on x86 + # and GCC/Clang AArch64). Use /volatile:ms to restore those semantics and + # prevent memory ordering bugs in ObjectMonitor, ParkEvent, and other + # lock-free algorithms that use plain volatile fields. 
+ $1_CFLAGS_CPU_JVM="/volatile:ms" elif test "x$OPENJDK_TARGET_CPU" = xx86_64; then if test "x$DEBUG_LEVEL" != xrelease; then # NOTE: This is probably redundant; -homeparams is default on diff --git a/src/hotspot/os_cpu/windows_aarch64/atomic_windows_aarch64.hpp b/src/hotspot/os_cpu/windows_aarch64/atomic_windows_aarch64.hpp index 117de8b52a4..bd727723977 100644 --- a/src/hotspot/os_cpu/windows_aarch64/atomic_windows_aarch64.hpp +++ b/src/hotspot/os_cpu/windows_aarch64/atomic_windows_aarch64.hpp @@ -108,6 +108,107 @@ DEFINE_INTRINSIC_CMPXCHG(InterlockedCompareExchange64, __int64) #undef DEFINE_INTRINSIC_CMPXCHG +// Override PlatformLoad and PlatformStore to use LDAR/STLR on Windows AArch64. +// +// Under MSVC /volatile:iso (the default for ARM64), the generic PlatformLoad +// and PlatformStore use plain volatile dereferences which generate plain LDR/STR +// instructions with NO acquire/release barriers. This is insufficient for ARM64's +// weak memory model in HotSpot's concurrent runtime code (ObjectMonitor Dekker +// protocols, ParkEvent signaling, etc.) where Atomic::load()/Atomic::store() are +// used in cross-thread communication patterns that depend on load/store ordering. +// +// On x86, this works by accident because x86-TSO provides store-release and +// load-acquire semantics for all memory accesses. On ARM64, we must explicitly +// use LDAR (acquire load) and STLR (release store) to get equivalent ordering. +// +// This matches the effective behavior of /volatile:ms but scoped only to +// Atomic:: operations rather than ALL volatile accesses, and ensures correct +// cross-core visibility for HotSpot's lock-free algorithms. 
+ +template<> +struct Atomic::PlatformLoad<1> { + template + T operator()(T const volatile* dest) const { + STATIC_ASSERT(sizeof(T) == 1); + return PrimitiveConversions::cast( + __ldar8(reinterpret_cast( + const_cast(dest)))); + } +}; + +template<> +struct Atomic::PlatformLoad<2> { + template + T operator()(T const volatile* dest) const { + STATIC_ASSERT(sizeof(T) == 2); + return PrimitiveConversions::cast( + __ldar16(reinterpret_cast( + const_cast(dest)))); + } +}; + +template<> +struct Atomic::PlatformLoad<4> { + template + T operator()(T const volatile* dest) const { + STATIC_ASSERT(sizeof(T) == 4); + return PrimitiveConversions::cast( + __ldar32(reinterpret_cast( + const_cast(dest)))); + } +}; + +template<> +struct Atomic::PlatformLoad<8> { + template + T operator()(T const volatile* dest) const { + STATIC_ASSERT(sizeof(T) == 8); + return PrimitiveConversions::cast( + __ldar64(reinterpret_cast( + const_cast(dest)))); + } +}; + +template<> +struct Atomic::PlatformStore<1> { + template + void operator()(T volatile* dest, T new_value) const { + STATIC_ASSERT(sizeof(T) == 1); + __stlr8(reinterpret_cast(dest), + PrimitiveConversions::cast(new_value)); + } +}; + +template<> +struct Atomic::PlatformStore<2> { + template + void operator()(T volatile* dest, T new_value) const { + STATIC_ASSERT(sizeof(T) == 2); + __stlr16(reinterpret_cast(dest), + PrimitiveConversions::cast(new_value)); + } +}; + +template<> +struct Atomic::PlatformStore<4> { + template + void operator()(T volatile* dest, T new_value) const { + STATIC_ASSERT(sizeof(T) == 4); + __stlr32(reinterpret_cast(dest), + PrimitiveConversions::cast(new_value)); + } +}; + +template<> +struct Atomic::PlatformStore<8> { + template + void operator()(T volatile* dest, T new_value) const { + STATIC_ASSERT(sizeof(T) == 8); + __stlr64(reinterpret_cast(dest), + PrimitiveConversions::cast(new_value)); + } +}; + // Specialize PlatformOrderedLoad and PlatformOrderedStore to use MSVC's // __ldar/__stlr intrinsics, matching the 
Linux AArch64 implementation which // uses __atomic_load/__atomic_store with __ATOMIC_ACQUIRE/__ATOMIC_RELEASE. diff --git a/src/hotspot/share/runtime/objectMonitor.cpp b/src/hotspot/share/runtime/objectMonitor.cpp index ceac0b42acf..b3fb30232f5 100644 --- a/src/hotspot/share/runtime/objectMonitor.cpp +++ b/src/hotspot/share/runtime/objectMonitor.cpp @@ -2346,6 +2346,16 @@ bool ObjectMonitor::try_spin(JavaThread* current) { if (!has_successor()) { set_successor(current); } + // Dekker/Lamport duality with exit(): ST _succ; MEMBAR; LD _owner. + // The exiter's side (release_clear_owner + storeload) is in exit(). + // Here on the spinner's side, we need a StoreLoad barrier between + // setting _succ and reading _owner to prevent the CPU from reordering + // the _owner load before the _succ store. On ARM64 with MSVC + // /volatile:iso, Atomic::store/load are plain STR/LDR with no + // barrier, so without this fence the Dekker protocol is broken and + // the exiter may not see our successor designation while we may not + // see its lock release — leading to missed wakeups and starvation. + OrderAccess::storeload(); int64_t prv = NO_OWNER; // There are three ways to exit the following loop: @@ -2421,6 +2431,8 @@ bool ObjectMonitor::try_spin(JavaThread* current) { if (!has_successor()) { set_successor(current); + // See Dekker/storeload comment before the loop. 
+ OrderAccess::storeload(); } } diff --git a/src/java.base/share/classes/java/util/concurrent/ForkJoinPool.java b/src/java.base/share/classes/java/util/concurrent/ForkJoinPool.java index cdc5e0b9ea6..8f466e37aa5 100644 --- a/src/java.base/share/classes/java/util/concurrent/ForkJoinPool.java +++ b/src/java.base/share/classes/java/util/concurrent/ForkJoinPool.java @@ -1993,11 +1993,11 @@ final void runWorker(WorkQueue w) { a, k = slotOffset(m & b)); if (b != (b = q.base) || t == null || !U.compareAndSetReference(a, k, t, null)) { - if (a[b & m] == null) { + if (U.getReferenceAcquire(a, slotOffset(m & b)) == null) { if (rescan) // end of run break scan; - if (a[(b + 1) & m] == null && - a[(b + 2) & m] == null) { + if (U.getReferenceAcquire(a, slotOffset(m & (b + 1))) == null && + U.getReferenceAcquire(a, slotOffset(m & (b + 2))) == null) { break; // probably empty } if (pb == (pb = b)) { // track progress @@ -2008,13 +2008,15 @@ final void runWorker(WorkQueue w) { } else { boolean propagate; - int nb = q.base = b + 1, prevSrc = src; + q.updateBase(b + 1); + int nb = b + 1, prevSrc = src; w.nsteals = ++nsteals; w.source = src = j; // volatile rescan = true; int nh = t.noUserHelp(); if (propagate = - (prevSrc != src || nh != 0) && a[nb & m] != null) + (prevSrc != src || nh != 0) && + U.getReferenceAcquire(a, slotOffset(m & nb)) != null) signalWork(); w.topLevelExec(t, fifo); if ((b = q.base) != nb && !propagate) @@ -2064,7 +2066,8 @@ private int deactivate(WorkQueue w, int phase) { if ((q = qs[k & (n - 1)]) == null) Thread.onSpinWait(); else if ((a = q.array) != null && (cap = a.length) > 0 && - a[q.base & (cap - 1)] != null && --prechecks < 0 && + U.getReferenceAcquire(a, slotOffset(q.base & (cap - 1))) != null && + --prechecks < 0 && (int)(c = ctl) == activePhase && compareAndSetCtl(c, (sp & LMASK) | ((c + RC_UNIT) & UMASK))) return w.phase = activePhase; // reactivate From 01c7ef55144610755a48e2ee6b26b515ffe81d53 Mon Sep 17 00:00:00 2001 From: Mat Carter Date: Wed, 4 
Mar 2026 12:10:32 -0800 Subject: [PATCH 07/12] snapshot of potential fixes --- src/hotspot/share/runtime/objectMonitor.cpp | 100 ++++++++++++++++ .../classes/java/lang/VirtualThread.java | 32 +++++ .../java/util/concurrent/ForkJoinPool.java | 112 +++++++++++++++++- .../locks/AbstractQueuedSynchronizer.java | 16 +++ .../access/JavaUtilConcurrentFJPAccess.java | 16 +++ .../jdk/internal/misc/CarrierThread.java | 44 +++++++ 6 files changed, 317 insertions(+), 3 deletions(-) diff --git a/src/hotspot/share/runtime/objectMonitor.cpp b/src/hotspot/share/runtime/objectMonitor.cpp index b3fb30232f5..16e9cda0a50 100644 --- a/src/hotspot/share/runtime/objectMonitor.cpp +++ b/src/hotspot/share/runtime/objectMonitor.cpp @@ -22,6 +22,7 @@ * */ +#include "classfile/symbolTable.hpp" #include "classfile/vmSymbols.hpp" #include "gc/shared/oopStorage.hpp" #include "gc/shared/oopStorageSet.hpp" @@ -42,6 +43,7 @@ #include "runtime/globals.hpp" #include "runtime/handles.inline.hpp" #include "runtime/interfaceSupport.inline.hpp" +#include "runtime/javaCalls.hpp" #include "runtime/javaThread.inline.hpp" #include "runtime/lightweightSynchronizer.hpp" #include "runtime/mutexLocker.hpp" @@ -523,6 +525,90 @@ void ObjectMonitor::notify_contended_enter(JavaThread* current) { } } +// Compensate the ForkJoinPool scheduler when a pinned virtual thread's +// carrier is about to block on a contended monitor. Without compensation +// the pool has no visibility that its worker is blocked, which can lead to +// carrier starvation / deadlock (JDK-8345294). +// +// This calls CarrierThread.beginMonitorBlock() which uses +// ForkJoinPool.tryCompensateForMonitor() to (a) activate an idle worker or +// create a replacement spare carrier, and (b) decrement RC so that signalWork +// can find a carrier for any queued continuation (e.g. the VT holding the +// contested lock) that would otherwise be starved because blocked carriers +// still count as active in the pool's RC field. 
The RC decrement is restored +// by endCompensatedBlock when this carrier unblocks. +// +// IMPORTANT: At this point notify_contended_enter() has already set +// _current_pending_monitor. The Java call below may itself trigger +// nested monitor contention (e.g., ThreadGroup.addUnstarted acquires a +// monitor when creating a spare worker thread). To avoid: +// (a) assertion failures (pending_monitor != nullptr on nested entry) +// (b) corruption of _current_pending_monitor for the outer monitor +// we save and clear the pending monitor before the Java call and restore +// it afterwards. +static bool compensate_pinned_carrier(JavaThread* current) { + oop carrier = current->threadObj(); + if (carrier == nullptr) return false; + + // Save and clear pending monitor so nested monitor enters work cleanly. + ObjectMonitor* saved_pending = current->current_pending_monitor(); + current->set_current_pending_monitor(nullptr); + + HandleMark hm(current); + Handle carrier_h(current, carrier); + Klass* klass = carrier_h->klass(); + + // CarrierThread.beginMonitorBlock() — compensates the ForkJoinPool via + // tryCompensateForMonitor which always activates/creates a real spare + // carrier (branch 2 of tryCompensate is omitted for this path). + // Not every carrier thread is a CarrierThread (custom schedulers may + // use plain threads), so we look up the method and silently return + // false if it is not found. + TempNewSymbol begin_name = SymbolTable::new_symbol("beginMonitorBlock"); + TempNewSymbol begin_sig = SymbolTable::new_symbol("()Z"); + + JavaValue result(T_BOOLEAN); + JavaCalls::call_virtual(&result, carrier_h, klass, + begin_name, begin_sig, current); + + bool compensated = false; + if (current->has_pending_exception()) { + current->clear_pending_exception(); + } else { + compensated = result.get_jboolean(); + } + + // Restore pending monitor for the outer monitor enter. 
+ current->set_current_pending_monitor(saved_pending); + return compensated; +} + +static void end_compensate_pinned_carrier(JavaThread* current) { + oop carrier = current->threadObj(); + if (carrier == nullptr) return; + + // Save and clear pending monitor (should be nullptr here, but be safe). + ObjectMonitor* saved_pending = current->current_pending_monitor(); + current->set_current_pending_monitor(nullptr); + + HandleMark hm(current); + Handle carrier_h(current, carrier); + Klass* klass = carrier_h->klass(); + + TempNewSymbol end_name = SymbolTable::new_symbol("endBlocking"); + TempNewSymbol end_sig = vmSymbols::void_method_signature(); + + JavaValue result(T_VOID); + JavaCalls::call_virtual(&result, carrier_h, klass, + end_name, end_sig, current); + + if (current->has_pending_exception()) { + current->clear_pending_exception(); + } + + current->set_current_pending_monitor(saved_pending); +} + void ObjectMonitor::enter_with_contention_mark(JavaThread* current, ObjectMonitorContentionMark &cm) { assert(current == JavaThread::current(), "must be"); assert(!has_owner(current), "must be"); @@ -570,6 +656,15 @@ void ObjectMonitor::enter_with_contention_mark(JavaThread* current, ObjectMonito } } + // Compensate the ForkJoinPool before the carrier blocks so that the + // scheduler can spin up a replacement worker thread. This prevents + // deadlocks where every carrier is pinned and waiting for a monitor + // held by an unmounted virtual thread that can never get a carrier. + bool compensated = false; + if (is_virtual) { + compensated = compensate_pinned_carrier(current); + } + { // Change java thread status to indicate blocked on monitor enter. JavaThreadBlockedOnMonitorEnterState jtbmes(current, this); @@ -606,6 +701,11 @@ void ObjectMonitor::enter_with_contention_mark(JavaThread* current, ObjectMonito // the monitor free and clear. } + // End compensation now that the carrier is no longer blocked. 
+ if (compensated) { + end_compensate_pinned_carrier(current); + } + assert(contentions() >= 0, "must not be negative: contentions=%d", contentions()); // Must either set _recursions = 0 or ASSERT _recursions == 0. diff --git a/src/java.base/share/classes/java/lang/VirtualThread.java b/src/java.base/share/classes/java/lang/VirtualThread.java index b0b134e69a3..17ce3e552ac 100644 --- a/src/java.base/share/classes/java/lang/VirtualThread.java +++ b/src/java.base/share/classes/java/lang/VirtualThread.java @@ -567,6 +567,13 @@ private void afterYield() { setState(newState = TIMED_PARKED); } + // Full fence (StoreLoad) to ensure the PARKED/TIMED_PARKED state + // is visible before reading parkPermit (Dekker pattern with + // unpark which writes parkPermit then reads state). + // Note: storeFence is insufficient — on ARM64 it only emits + // LoadStore+StoreStore (dmb ishst), not StoreLoad (dmb ish). + U.fullFence(); + // may have been unparked while parking if (parkPermit && compareAndSetState(newState, UNPARKED)) { // lazy submit if local queue is empty @@ -592,6 +599,10 @@ private void afterYield() { if (s == BLOCKING) { setState(BLOCKED); + // Full fence (StoreLoad) for Dekker pattern with unblock + // which writes blockPermit then reads state. + U.fullFence(); + // may have been unblocked while blocking if (blockPermit && compareAndSetState(BLOCKED, UNBLOCKED)) { // lazy submit if local queue is empty @@ -622,6 +633,10 @@ private void afterYield() { } } + // Full fence (StoreLoad) for Dekker pattern with notify + // which writes notified then reads state. 
+ U.fullFence(); + // may have been notified while in transition to wait state if (notified && compareAndSetState(newState, BLOCKED)) { // may have even been unblocked already @@ -659,6 +674,14 @@ private void afterDone(boolean notifyContainer) { assert carrierThread == null; setState(TERMINATED); + // Full fence (StoreLoad) to ensure the TERMINATED state is + // visible to any thread that has stored a termination latch + // (joinNanos). Without this, on ARM64 the volatile write of + // state and the subsequent volatile read of termination can be + // reordered, causing a missed-wakeup where both sides miss + // each other's store (Dekker pattern). + U.fullFence(); + // notify anyone waiting for this virtual thread to terminate CountDownLatch termination = this.termination; if (termination != null) { @@ -847,6 +870,10 @@ private void parkOnCarrierThread(boolean timed, long nanos) { */ private void unpark(boolean lazySubmit) { if (!getAndSetParkPermit(true) && currentThread() != this) { + // Full fence (StoreLoad) to ensure parkPermit=true is visible + // before reading state (Dekker pattern with afterYield PARKING + // path which writes state then reads parkPermit). + U.fullFence(); int s = state(); // unparked while parked @@ -889,6 +916,7 @@ void unpark() { private void unblock() { assert !Thread.currentThread().isVirtual(); blockPermit = true; + U.fullFence(); // Full fence (StoreLoad) for Dekker with afterYield BLOCKING path if (state() == BLOCKED && compareAndSetState(BLOCKED, UNBLOCKED)) { submitRunContinuation(); } @@ -1003,6 +1031,10 @@ boolean joinNanos(long nanos) throws InterruptedException { // ensure termination object exists, then re-check state CountDownLatch termination = getTermination(); + // Full fence (StoreLoad) for Dekker with afterDone: ensures that + // the CAS in getTermination() (storing the latch) is visible to + // afterDone before we read state here. 
+ U.fullFence(); if (state() == TERMINATED) return true; diff --git a/src/java.base/share/classes/java/util/concurrent/ForkJoinPool.java b/src/java.base/share/classes/java/util/concurrent/ForkJoinPool.java index 8f466e37aa5..e57398e6b91 100644 --- a/src/java.base/share/classes/java/util/concurrent/ForkJoinPool.java +++ b/src/java.base/share/classes/java/util/concurrent/ForkJoinPool.java @@ -1895,6 +1895,9 @@ else if ((v = w) == null) createWorker(); else { v.phase = sp; + // Full fence (StoreLoad) for Dekker with awaitWork which + // writes parking then reads phase. + U.fullFence(); if (v.parking != 0) U.unpark(v.owner); } @@ -1916,6 +1919,7 @@ private void releaseWaiters() { c, ((UMASK & (c + RC_UNIT)) | (c & TC_MASK) | (v.stackPred & LMASK))))) { v.phase = sp; + U.fullFence(); // StoreLoad for Dekker with awaitWork parking/phase if (v.parking != 0) U.unpark(v.owner); } @@ -2096,6 +2100,9 @@ else if ((deadline = if ((p = w.phase) != activePhase && (runState & STOP) == 0L) { LockSupport.setCurrentBlocker(this); w.parking = 1; // enable unpark + // Full fence (StoreLoad) for Dekker with signalWork which + // writes phase then reads parking. + U.fullFence(); while ((p = w.phase) != activePhase) { boolean trimmable = false; int trim; Thread.interrupted(); // clear status @@ -2202,6 +2209,7 @@ private int tryCompensate(long c) { (v = qs[i]) != null && compareAndSetCtl(c, (c & UMASK) | (v.stackPred & LMASK))) { v.phase = sp; + U.fullFence(); // StoreLoad for Dekker with awaitWork parking/phase if (v.parking != 0) U.unpark(v.owner); stat = UNCOMPENSATE; @@ -4350,14 +4358,107 @@ final long beginCompensatedBlock() { return (c == 0) ? 0L : RC_UNIT; } + /** + * Variant of tryCompensate for use when a carrier thread is about to block + * on a native OS-level monitor (objectMonitor::enter_internal) while pinned + * by a virtual thread. + * + *

Unlike tryCompensate, this method omits the passive RC-only path + * (branch 2 of tryCompensate), so the blocking carrier always results in + * either an idle worker being activated (branch 1) or a new spare being + * created (branch 3). + * + *

Branch 3 decrements RC in addition to incrementing TC so that + * {@code signalWork} (triggered by subsequent VT task submissions) sees + * the pool as under-active and can dispatch the queued VT continuations + * to the new spare. Unlike the regular {@code tryCompensate}, this + * extra RC decrement requires {@code endCompensatedBlock} to restore + * two RC_UNITs: one for the decrement here and one for the + * spare's eventual deactivation (RC--). The return value of {@code 2} + * (vs {@code UNCOMPENSATE = 1} for branch 1) signals + * {@code beginMonitorCompensatedBlock} to pass {@code 2 * RC_UNIT} + * to {@code endCompensatedBlock}, keeping the pool's active-worker + * accounting balanced across iterations: + *

+     *   branch 3 CAS: RC--        (-1)
+     *   spare deactivates: RC--   (-1)
+     *   endCompensatedBlock:      (+2)
+     *   net:                       0
+     * 
+ */ + private int tryCompensateForMonitor(long c) { + Predicate sat; + long b = config; + int pc = parallelism, + maxTotal = (short)(b >>> TC_SHIFT) + pc, + active = (short)(c >>> RC_SHIFT), + total = (short)(c >>> TC_SHIFT), + sp = (int)c, + stat = -1; + if (sp != 0 && active <= pc) { // activate idle worker (branch 1) + WorkQueue[] qs; WorkQueue v; int i; + if ((qs = queues) != null && qs.length > (i = sp & SMASK) && + (v = qs[i]) != null && + compareAndSetCtl(c, (c & UMASK) | (v.stackPred & LMASK))) { + v.phase = sp; + U.fullFence(); // StoreLoad for Dekker with awaitWork parking/phase + if (v.parking != 0) + U.unpark(v.owner); + stat = 1; // need 1 * RC_UNIT at unblock + } + } + // Branch 2 (passive RC-only decrement) is intentionally absent. + else if (total < maxTotal && total < MAX_CAP) { // create spare (branch 3) + // TC++ and RC-- so that signalWork sees the pool as under-active. + // Return stat=2 so that beginMonitorCompensatedBlock passes + // 2*RC_UNIT to endCompensatedBlock, balancing both the RC-- + // here and the spare's eventual deactivation (RC--). + long nc = ((c + TC_UNIT) & TC_MASK) | + ((c - RC_UNIT) & RC_MASK) | + (c & LMASK); + if ((runState & STOP) != 0L) + stat = 0; + else if (compareAndSetCtl(c, nc)) { + createWorker(); + stat = 2; // need 2 * RC_UNIT at unblock + } + } + else if (!compareAndSetCtl(c, c)) // validate + ; + else if ((sat = saturate) != null && sat.test(this)) + stat = 0; + else + throw new RejectedExecutionException( + "Thread limit exceeded replacing blocked worker"); + return stat; + } + + /** + * Like beginCompensatedBlock but for carrier threads that are about to + * block on a native OS-level monitor (objectMonitor::enter_internal) while + * pinned by a virtual thread. Uses tryCompensateForMonitor which always + * activates or creates a real replacement carrier (never passive RC-only). + *

tryCompensateForMonitor returns: + *

+     * <ul>
+     *   <li>{@code 1} — branch 1 (activated an idle worker), need 1 × RC_UNIT</li>
+     *   <li>{@code 2} — branch 3 (created spare with RC--), need 2 × RC_UNIT</li>
+     *   <li>{@code 0} — saturated/stopping, no restoration needed</li>
+     * </ul>
+ * @return value to pass to endCompensatedBlock + */ + final long beginMonitorCompensatedBlock() { + int c; + do {} while ((c = tryCompensateForMonitor(ctl)) < 0); + return (c == 0) ? 0L : (long)c * RC_UNIT; + } + /** * Re-adjusts parallelism after a blocking operation completes. - * @param post value from beginCompensatedBlock + * @param post value from beginCompensatedBlock or beginMonitorCompensatedBlock */ void endCompensatedBlock(long post) { - if (post > 0L) { + if (post > 0L) getAndAddCtl(post); - } } /** ManagedBlock for external threads */ @@ -4414,9 +4515,14 @@ protected RunnableFuture newTaskFor(Callable callable) { public long beginCompensatedBlock(ForkJoinPool pool) { return pool.beginCompensatedBlock(); } + @Override public void endCompensatedBlock(ForkJoinPool pool, long post) { pool.endCompensatedBlock(post); } + @Override + public long beginMonitorCompensatedBlock(ForkJoinPool pool) { + return pool.beginMonitorCompensatedBlock(); + } }); defaultForkJoinWorkerThreadFactory = new DefaultForkJoinWorkerThreadFactory(); diff --git a/src/java.base/share/classes/java/util/concurrent/locks/AbstractQueuedSynchronizer.java b/src/java.base/share/classes/java/util/concurrent/locks/AbstractQueuedSynchronizer.java index 0ff216c80a0..1c794032aae 100644 --- a/src/java.base/share/classes/java/util/concurrent/locks/AbstractQueuedSynchronizer.java +++ b/src/java.base/share/classes/java/util/concurrent/locks/AbstractQueuedSynchronizer.java @@ -782,6 +782,13 @@ final int acquire(Node node, int arg, boolean shared, Thread.onSpinWait(); } else if (node.status == 0) { node.status = WAITING; // enable signal and recheck + // Full fence (StoreLoad) to ensure WAITING status is visible + // before re-reading state in tryAcquire/tryAcquireShared + // (Dekker pattern with releaseShared/release which writes + // state then reads node.status in signalNext). 
+ // On ARM64, volatile write (stlr) + volatile read (ldar) to + // different addresses does NOT provide StoreLoad ordering. + U.fullFence(); } else { spins = postSpins = (byte)((postSpins << 1) | 1); try { @@ -1097,6 +1104,13 @@ public final boolean tryAcquireNanos(int arg, long nanosTimeout) */ public final boolean release(int arg) { if (tryRelease(arg)) { + // Full fence (StoreLoad) to ensure the state update from + // tryRelease is visible before reading node.status in signalNext + // (Dekker pattern: release writes state then reads status, + // acquire writes status then reads state). + // On ARM64, CAS (stlxr/release) + ldar to different addresses + // does NOT provide StoreLoad ordering. + U.fullFence(); signalNext(head); return true; } @@ -1184,6 +1198,8 @@ public final boolean tryAcquireSharedNanos(int arg, long nanosTimeout) */ public final boolean releaseShared(int arg) { if (tryReleaseShared(arg)) { + // Full fence (StoreLoad) — see comment in release() + U.fullFence(); signalNext(head); return true; } diff --git a/src/java.base/share/classes/jdk/internal/access/JavaUtilConcurrentFJPAccess.java b/src/java.base/share/classes/jdk/internal/access/JavaUtilConcurrentFJPAccess.java index 28c1b7fddee..5f824dfabbd 100644 --- a/src/java.base/share/classes/jdk/internal/access/JavaUtilConcurrentFJPAccess.java +++ b/src/java.base/share/classes/jdk/internal/access/JavaUtilConcurrentFJPAccess.java @@ -29,4 +29,20 @@ public interface JavaUtilConcurrentFJPAccess { long beginCompensatedBlock(ForkJoinPool pool); void endCompensatedBlock(ForkJoinPool pool, long post); + /** + * Like beginCompensatedBlock but for carrier threads that are about to + * block on a native OS-level monitor (objectMonitor::enter_internal) while + * pinned by a virtual thread. This variant: + *
+     * <ul>
+     *   <li>Never takes the passive RC-only path (branch 2 of tryCompensate);
+     *       it always activates an idle worker or creates a new spare.</li>
+     *   <li>Decrements RC immediately so that signalWork can create a carrier
+     *       for any continuation (e.g. the contested lock's holder) that is
+     *       already in the queue, which would otherwise be starved because
+     *       blocked carriers still count as active in RC.</li>
+     * </ul>
+ * The RC decrement is restored by endCompensatedBlock when the carrier + * unblocks. + */ + long beginMonitorCompensatedBlock(ForkJoinPool pool); } diff --git a/src/java.base/share/classes/jdk/internal/misc/CarrierThread.java b/src/java.base/share/classes/jdk/internal/misc/CarrierThread.java index b314b1823a5..811477c2475 100644 --- a/src/java.base/share/classes/jdk/internal/misc/CarrierThread.java +++ b/src/java.base/share/classes/jdk/internal/misc/CarrierThread.java @@ -89,6 +89,47 @@ public boolean beginBlocking() { } } + /** + * Mark the start of a blocking operation where the carrier thread is about + * to block on a native OS-level monitor (objectMonitor::enter_internal) + * while pinned by a virtual thread. Unlike beginBlocking, this always + * activates an idle worker or creates a new spare carrier (never takes the + * passive RC-only path that creates no worker). endBlocking restores + * the compensateValue (1 or 2 RC_UNITs) when this carrier unblocks. + */ + public boolean beginMonitorBlock() { + assert Thread.currentThread().isVirtual() && JLA.currentCarrierThread() == this; + + // Guard against re-entrant calls. tryCompensateForMonitor may call + // createWorker() which can hit a nested monitor (e.g. ThreadGroup), + // re-entering objectMonitor::enter_with_contention_mark and thus this + // method. Without this check each nested call overwrites + // compensateValue, leaking an RC_UNIT per re-entrant compensation. + if (compensating != NOT_COMPENSATING) { + return false; + } + + // don't preempt when attempting to compensate + Continuation.pin(); + try { + compensating = COMPENSATE_IN_PROGRESS; + + // Uses FJP.tryCompensateForMonitor (branch 2 removed) – always + // activates an idle worker (1*RC_UNIT) or creates a new spare + // with RC-- (2*RC_UNIT to balance both the RC-- and the spare's + // eventual deactivation). 
+ compensateValue = ForkJoinPools.beginMonitorCompensatedBlock(getPool()); + compensating = COMPENSATING; + return true; + } catch (Throwable e) { + // exception starting spare thread + compensating = NOT_COMPENSATING; + throw e; + } finally { + Continuation.unpin(); + } + } + /** * Mark the end of a blocking operation. */ @@ -133,6 +174,9 @@ static long beginCompensatedBlock(ForkJoinPool pool) { static void endCompensatedBlock(ForkJoinPool pool, long post) { FJP_ACCESS.endCompensatedBlock(pool, post); } + static long beginMonitorCompensatedBlock(ForkJoinPool pool) { + return FJP_ACCESS.beginMonitorCompensatedBlock(pool); + } } static { From 368d041575f07a03e8410b9c793d295df8ad6e80 Mon Sep 17 00:00:00 2001 From: Mat Carter Date: Wed, 4 Mar 2026 13:19:49 -0800 Subject: [PATCH 08/12] Starvation test passes --- .../java/util/concurrent/ForkJoinPool.java | 48 +++++++++++-------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/src/java.base/share/classes/java/util/concurrent/ForkJoinPool.java b/src/java.base/share/classes/java/util/concurrent/ForkJoinPool.java index e57398e6b91..aea149289be 100644 --- a/src/java.base/share/classes/java/util/concurrent/ForkJoinPool.java +++ b/src/java.base/share/classes/java/util/concurrent/ForkJoinPool.java @@ -4368,22 +4368,23 @@ final long beginCompensatedBlock() { * either an idle worker being activated (branch 1) or a new spare being * created (branch 3). * - *

Branch 3 decrements RC in addition to incrementing TC so that - * {@code signalWork} (triggered by subsequent VT task submissions) sees - * the pool as under-active and can dispatch the queued VT continuations - * to the new spare. Unlike the regular {@code tryCompensate}, this - * extra RC decrement requires {@code endCompensatedBlock} to restore - * two RC_UNITs: one for the decrement here and one for the - * spare's eventual deactivation (RC--). The return value of {@code 2} - * (vs {@code UNCOMPENSATE = 1} for branch 1) signals - * {@code beginMonitorCompensatedBlock} to pass {@code 2 * RC_UNIT} - * to {@code endCompensatedBlock}, keeping the pool's active-worker - * accounting balanced across iterations: + *

Both branches decrement RC so that {@code signalWork} (triggered by + * subsequent VT task submissions) sees the pool as under-active and can + * dispatch the queued VT continuations. Without the RC decrement in + * branch 1, monitor-blocked carriers are counted as "active" by + * {@code signalWork}'s {@code (short)(c >>> RC_SHIFT) >= pc} check, + * preventing it from waking idle workers when tasks are queued. + * + *

The RC decrement in both branches requires {@code endCompensatedBlock} + * to restore two RC_UNITs: one for the decrement here and one for + * the compensator's eventual deactivation (RC--). Both branches return + * {@code 2} so that {@code beginMonitorCompensatedBlock} passes + * {@code 2 * RC_UNIT} to {@code endCompensatedBlock}: *

-     *   branch 3 CAS: RC--        (-1)
-     *   spare deactivates: RC--   (-1)
-     *   endCompensatedBlock:      (+2)
-     *   net:                       0
+     *   branch CAS: RC--                (-1)
+     *   compensator deactivates: RC--   (-1)
+     *   endCompensatedBlock:            (+2)
+     *   net:                             0
      * 
*/ private int tryCompensateForMonitor(long c) { @@ -4391,20 +4392,27 @@ private int tryCompensateForMonitor(long c) { long b = config; int pc = parallelism, maxTotal = (short)(b >>> TC_SHIFT) + pc, - active = (short)(c >>> RC_SHIFT), total = (short)(c >>> TC_SHIFT), sp = (int)c, stat = -1; - if (sp != 0 && active <= pc) { // activate idle worker (branch 1) + if (sp != 0) { // activate idle worker (branch 1) + // RC-- so that signalWork sees the pool as under-active while + // the carrier is blocked on the monitor. Without this, RC + // counts monitor-blocked carriers as "active", preventing + // signalWork from waking idle workers. Return stat=2 so + // that beginMonitorCompensatedBlock passes 2*RC_UNIT to + // endCompensatedBlock, balancing both the RC-- here and the + // compensator's eventual deactivation (RC--). WorkQueue[] qs; WorkQueue v; int i; if ((qs = queues) != null && qs.length > (i = sp & SMASK) && (v = qs[i]) != null && - compareAndSetCtl(c, (c & UMASK) | (v.stackPred & LMASK))) { + compareAndSetCtl(c, ((c - RC_UNIT) & UMASK) | + (v.stackPred & LMASK))) { v.phase = sp; U.fullFence(); // StoreLoad for Dekker with awaitWork parking/phase if (v.parking != 0) U.unpark(v.owner); - stat = 1; // need 1 * RC_UNIT at unblock + stat = 2; // need 2 * RC_UNIT at unblock } } // Branch 2 (passive RC-only decrement) is intentionally absent. @@ -4440,7 +4448,7 @@ else if ((sat = saturate) != null && sat.test(this)) * activates or creates a real replacement carrier (never passive RC-only). *

tryCompensateForMonitor returns: *

      * <ul>
-     *   <li>{@code 1} — branch 1 (activated an idle worker), need 1 × RC_UNIT</li>
+     *   <li>{@code 2} — branch 1 (activated idle worker with RC--), need 2 × RC_UNIT</li>
      *   <li>{@code 2} — branch 3 (created spare with RC--), need 2 × RC_UNIT</li>
      *   <li>{@code 0} — saturated/stopping, no restoration needed</li>
      * </ul>
From 4acd3a3fe995abf5cd0cd8635232a4857ac555aa Mon Sep 17 00:00:00 2001 From: Mat Carter Date: Wed, 4 Mar 2026 14:50:24 -0800 Subject: [PATCH 09/12] Fix for PingPong test --- .../classes/java/util/concurrent/LinkedTransferQueue.java | 5 +++++ .../share/classes/java/util/concurrent/SynchronousQueue.java | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/src/java.base/share/classes/java/util/concurrent/LinkedTransferQueue.java b/src/java.base/share/classes/java/util/concurrent/LinkedTransferQueue.java index 118d648c7a2..cff4da50d30 100644 --- a/src/java.base/share/classes/java/util/concurrent/LinkedTransferQueue.java +++ b/src/java.base/share/classes/java/util/concurrent/LinkedTransferQueue.java @@ -590,6 +590,11 @@ final Object xfer(Object e, long ns) { q = p.next; if (p.isData != haveData && haveData != (m != null) && p.cmpExItem(m, e) == m) { + // Full fence (StoreLoad) for Dekker with await() which + // writes waiter then reads item. On ARM64, CAS + // (ldaxr/stlxr) + plain load to a different field does + // NOT provide StoreLoad ordering. + VarHandle.fullFence(); Thread w = p.waiter; // matched complementary node if (p != h && h == cmpExHead(h, (q == null) ? p : q)) h.next = h; // advance head; self-link old diff --git a/src/java.base/share/classes/java/util/concurrent/SynchronousQueue.java b/src/java.base/share/classes/java/util/concurrent/SynchronousQueue.java index 49efe5d5c2c..c74a2483aa4 100644 --- a/src/java.base/share/classes/java/util/concurrent/SynchronousQueue.java +++ b/src/java.base/share/classes/java/util/concurrent/SynchronousQueue.java @@ -177,6 +177,11 @@ final Object xferLifo(Object e, long ns) { else if (p.cmpExItem(m, e) != m) p = head; // missed; restart else { // matched complementary node + // Full fence (StoreLoad) for Dekker with await() which + // writes waiter then reads item. On ARM64, CAS + // (ldaxr/stlxr) + plain load to a different field does + // NOT provide StoreLoad ordering. 
+ VarHandle.fullFence(); Thread w = p.waiter; cmpExHead(p, p.next); LockSupport.unpark(w); From dba3bac18967243070baacefa1a096a2119a33a6 Mon Sep 17 00:00:00 2001 From: Mat Carter Date: Wed, 4 Mar 2026 16:10:33 -0800 Subject: [PATCH 10/12] Saints fix for TestTrampoline when testing on windows --- .../compiler/c2/aarch64/TestTrampoline.java | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/test/hotspot/jtreg/compiler/c2/aarch64/TestTrampoline.java b/test/hotspot/jtreg/compiler/c2/aarch64/TestTrampoline.java index 114f7f9bfab..d44440849dd 100644 --- a/test/hotspot/jtreg/compiler/c2/aarch64/TestTrampoline.java +++ b/test/hotspot/jtreg/compiler/c2/aarch64/TestTrampoline.java @@ -23,7 +23,7 @@ */ package compiler.c2.aarch64; - +import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Iterator; import jdk.test.lib.process.OutputAnalyzer; @@ -96,6 +96,26 @@ private static void test(String s, int i) { public static void main(String[] args) { String s = "Returns the char value at the specified index."; + + boolean isWindows = System.getProperty("os.name").startsWith("Windows"); + if (isWindows) { + /* + Depending on the Windows code page, the UTF-16 path in the String.charAt method + may be executed during JVM startup. However, it may not execute often enough to + cause it to be inlined by C2. In this case, C2 may not fully inline the + String.charAt method, resulting in the unexpected generation of trampolines. + Therefore, we execute the UTF-16 path in the String.charAt method in a loop + enough times to ensure that it will be inlined by C2. 
+ */ + String jnuEncoding = System.getProperty("sun.jnu.encoding"); + if (jnuEncoding != null && Charset.isSupported(jnuEncoding)) { + byte[] bytes = s.getBytes(Charset.forName(jnuEncoding)); + + for (int i = 0; i < ITERATIONS_TO_HEAT_LOOP; ++i) { + String encoded = new String(bytes, Charset.forName(jnuEncoding)); + } + } + } for (int i = 0; i < ITERATIONS_TO_HEAT_LOOP; ++i) { test(s, i % s.length()); } From 80ff85a7a43284848a201f7b87e6dbab47fec0b6 Mon Sep 17 00:00:00 2001 From: Mat Carter Date: Thu, 5 Mar 2026 11:24:38 -0800 Subject: [PATCH 11/12] Fixes for issues introduced that impacted JSR166TestCase --- src/hotspot/cpu/aarch64/aarch64.ad | 44 ------------------- .../gc/shared/barrierSetAssembler_aarch64.cpp | 15 ------- .../share/gc/shared/c1/barrierSetC1.cpp | 10 ----- src/hotspot/share/opto/memnode.cpp | 14 ------ .../java/util/concurrent/ForkJoinPool.java | 23 +++------- 5 files changed, 6 insertions(+), 100 deletions(-) diff --git a/src/hotspot/cpu/aarch64/aarch64.ad b/src/hotspot/cpu/aarch64/aarch64.ad index 28a5c62bb05..6899526a282 100644 --- a/src/hotspot/cpu/aarch64/aarch64.ad +++ b/src/hotspot/cpu/aarch64/aarch64.ad @@ -6676,28 +6676,6 @@ instruct loadP(iRegPNoSp dst, memory8 mem) ins_pipe(iload_reg_mem); %} -// Load Pointer with acquire semantics (load + barrier for complex addressing) -// Cost must be <= VOLATILE_REF_COST so the matcher prefers this pattern -// (which supports memory8 offset addressing) over loadP_volatile + addP_reg_imm -// when the address has a foldable offset. 
-instruct loadP_acq(iRegPNoSp dst, memory8 mem) -%{ - match(Set dst (LoadP mem)); - predicate(needs_acquiring_load(n) && (n->as_Load()->barrier_data() == 0)); - - ins_cost(VOLATILE_REF_COST); - format %{ "ldr $dst, $mem\t# ptr (acquire)\n\t" - "dmb ishld" %} - - ins_encode %{ - loadStore(masm, &MacroAssembler::ldr, $dst$$Register, $mem->opcode(), - as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp, 8); - __ membar(Assembler::LoadLoad | Assembler::LoadStore); - %} - - ins_pipe(pipe_serial); -%} - // Load Compressed Pointer instruct loadN(iRegNNoSp dst, memory4 mem) %{ @@ -6712,28 +6690,6 @@ instruct loadN(iRegNNoSp dst, memory4 mem) ins_pipe(iload_reg_mem); %} -// Load Compressed Pointer with acquire semantics (load + barrier for complex addressing) -// Cost must be <= VOLATILE_REF_COST so the matcher prefers this pattern -// (which supports memory4 offset addressing) over loadN_volatile + addP_reg_imm -// when the address has a foldable offset. -instruct loadN_acq(iRegNNoSp dst, memory4 mem) -%{ - match(Set dst (LoadN mem)); - predicate(needs_acquiring_load(n) && n->as_Load()->barrier_data() == 0); - - ins_cost(VOLATILE_REF_COST); - format %{ "ldrw $dst, $mem\t# compressed ptr (acquire)\n\t" - "dmb ishld" %} - - ins_encode %{ - loadStore(masm, &MacroAssembler::ldrw, $dst$$Register, $mem->opcode(), - as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp, 4); - __ membar(Assembler::LoadLoad | Assembler::LoadStore); - %} - - ins_pipe(pipe_serial); -%} - // Load Klass Pointer instruct loadKlass(iRegPNoSp dst, memory8 mem) %{ diff --git a/src/hotspot/cpu/aarch64/gc/shared/barrierSetAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/gc/shared/barrierSetAssembler_aarch64.cpp index a86eaf23a14..302701e1cad 100644 --- a/src/hotspot/cpu/aarch64/gc/shared/barrierSetAssembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/gc/shared/barrierSetAssembler_aarch64.cpp @@ -67,21 +67,6 @@ void BarrierSetAssembler::load_at(MacroAssembler* masm, DecoratorSet decorators, 
assert(in_native, "why else?"); __ ldr(dst, src); } - // On AArch64's weakly-ordered memory model, a plain LDR of a reference can - // be reordered with subsequent dependent loads (e.g. dereferencing the loaded - // pointer to read a field). This causes failures in concurrent data structures - // such as ForkJoinPool's work queues where a reference load followed by a - // field access may see stale data. - // - // Add a LoadLoad|LoadStore barrier (dmb ishld) after every reference load to - // enforce acquire semantics, matching the C2 fix in memnode.cpp that forces - // MemNode::acquire on all LoadP/LoadN nodes on AArch64. - // - // This is only needed when the caller hasn't already requested acquire or - // stronger ordering through the decorators, to avoid redundant barriers. - if ((decorators & (MO_ACQUIRE | MO_SEQ_CST)) == 0) { - __ membar(Assembler::LoadLoad | Assembler::LoadStore); - } break; } case T_BOOLEAN: __ load_unsigned_byte (dst, src); break; diff --git a/src/hotspot/share/gc/shared/c1/barrierSetC1.cpp b/src/hotspot/share/gc/shared/c1/barrierSetC1.cpp index 04bff8d6936..1291f6e1b33 100644 --- a/src/hotspot/share/gc/shared/c1/barrierSetC1.cpp +++ b/src/hotspot/share/gc/shared/c1/barrierSetC1.cpp @@ -188,16 +188,6 @@ void BarrierSetC1::load_at_resolved(LIRAccess& access, LIR_Opr result) { __ load(access.resolved_addr()->as_address_ptr(), result, access.access_emit_info(), patch_code); } -#ifdef AARCH64 - // On AArch64's weakly-ordered memory model, a plain load of a reference can be - // reordered with subsequent dependent loads (e.g. dereferencing the loaded pointer - // to read a field). Add acquire barrier after all reference loads to match the C2 - // fix in memnode.cpp that forces MemNode::acquire on all LoadP/LoadN on AArch64. 
- if (!is_volatile && !is_acquire && is_reference_type(access.type())) { - __ membar_acquire(); - } -#endif - if (is_volatile || is_acquire) { __ membar_acquire(); } diff --git a/src/hotspot/share/opto/memnode.cpp b/src/hotspot/share/opto/memnode.cpp index 9cab7b7adf7..24b81b894cb 100644 --- a/src/hotspot/share/opto/memnode.cpp +++ b/src/hotspot/share/opto/memnode.cpp @@ -975,26 +975,12 @@ Node* LoadNode::make(PhaseGVN& gvn, Node* ctl, Node* mem, Node* adr, const TypeP case T_NARROWOOP: #ifdef _LP64 if (adr->bottom_type()->is_ptr_to_narrowoop()) { -#ifdef AARCH64 - // On ARM64 with MSVC /volatile:iso, plain volatile loads have no acquire - // semantics. Use acquire for oop reference loads (T_OBJECT/T_NARROWOOP) to - // ensure proper ordering after GC reference loads that bypass GC barriers - // (barrier_data == 0). T_ADDRESS (raw pointer) loads are excluded — they are - // internal JVM loads (TLAB, TLS, metadata) that don't need Java memory model - // ordering and adding acquire causes C2 TLAB allocation corruption. - load = new LoadNNode(ctl, mem, adr, adr_type, rt->make_narrowoop(), barrier_data == 0 ? MemNode::acquire : mo, control_dependency); -#else load = new LoadNNode(ctl, mem, adr, adr_type, rt->make_narrowoop(), mo, control_dependency); -#endif } else #endif { assert(!adr->bottom_type()->is_ptr_to_narrowoop() && !adr->bottom_type()->is_ptr_to_narrowklass(), "should have got back a narrow oop"); -#ifdef AARCH64 - load = new LoadPNode(ctl, mem, adr, adr_type, rt->is_ptr(), barrier_data == 0 ? 
MemNode::acquire : mo, control_dependency); -#else load = new LoadPNode(ctl, mem, adr, adr_type, rt->is_ptr(), mo, control_dependency); -#endif } break; default: diff --git a/src/java.base/share/classes/java/util/concurrent/ForkJoinPool.java b/src/java.base/share/classes/java/util/concurrent/ForkJoinPool.java index aea149289be..cd2594a6262 100644 --- a/src/java.base/share/classes/java/util/concurrent/ForkJoinPool.java +++ b/src/java.base/share/classes/java/util/concurrent/ForkJoinPool.java @@ -1895,9 +1895,6 @@ else if ((v = w) == null) createWorker(); else { v.phase = sp; - // Full fence (StoreLoad) for Dekker with awaitWork which - // writes parking then reads phase. - U.fullFence(); if (v.parking != 0) U.unpark(v.owner); } @@ -1919,7 +1916,6 @@ private void releaseWaiters() { c, ((UMASK & (c + RC_UNIT)) | (c & TC_MASK) | (v.stackPred & LMASK))))) { v.phase = sp; - U.fullFence(); // StoreLoad for Dekker with awaitWork parking/phase if (v.parking != 0) U.unpark(v.owner); } @@ -1997,11 +1993,11 @@ final void runWorker(WorkQueue w) { a, k = slotOffset(m & b)); if (b != (b = q.base) || t == null || !U.compareAndSetReference(a, k, t, null)) { - if (U.getReferenceAcquire(a, slotOffset(m & b)) == null) { + if (a[b & m] == null) { if (rescan) // end of run break scan; - if (U.getReferenceAcquire(a, slotOffset(m & (b + 1))) == null && - U.getReferenceAcquire(a, slotOffset(m & (b + 2))) == null) { + if (a[(b + 1) & m] == null && + a[(b + 2) & m] == null) { break; // probably empty } if (pb == (pb = b)) { // track progress @@ -2012,15 +2008,13 @@ final void runWorker(WorkQueue w) { } else { boolean propagate; - q.updateBase(b + 1); - int nb = b + 1, prevSrc = src; + int nb = q.base = b + 1, prevSrc = src; w.nsteals = ++nsteals; w.source = src = j; // volatile rescan = true; int nh = t.noUserHelp(); if (propagate = - (prevSrc != src || nh != 0) && - U.getReferenceAcquire(a, slotOffset(m & nb)) != null) + (prevSrc != src || nh != 0) && a[nb & m] != null) signalWork(); 
w.topLevelExec(t, fifo); if ((b = q.base) != nb && !propagate) @@ -2070,7 +2064,7 @@ private int deactivate(WorkQueue w, int phase) { if ((q = qs[k & (n - 1)]) == null) Thread.onSpinWait(); else if ((a = q.array) != null && (cap = a.length) > 0 && - U.getReferenceAcquire(a, slotOffset(q.base & (cap - 1))) != null && + a[q.base & (cap - 1)] != null && --prechecks < 0 && (int)(c = ctl) == activePhase && compareAndSetCtl(c, (sp & LMASK) | ((c + RC_UNIT) & UMASK))) @@ -2100,9 +2094,6 @@ else if ((deadline = if ((p = w.phase) != activePhase && (runState & STOP) == 0L) { LockSupport.setCurrentBlocker(this); w.parking = 1; // enable unpark - // Full fence (StoreLoad) for Dekker with signalWork which - // writes phase then reads parking. - U.fullFence(); while ((p = w.phase) != activePhase) { boolean trimmable = false; int trim; Thread.interrupted(); // clear status @@ -2209,7 +2200,6 @@ private int tryCompensate(long c) { (v = qs[i]) != null && compareAndSetCtl(c, (c & UMASK) | (v.stackPred & LMASK))) { v.phase = sp; - U.fullFence(); // StoreLoad for Dekker with awaitWork parking/phase if (v.parking != 0) U.unpark(v.owner); stat = UNCOMPENSATE; @@ -4409,7 +4399,6 @@ private int tryCompensateForMonitor(long c) { compareAndSetCtl(c, ((c - RC_UNIT) & UMASK) | (v.stackPred & LMASK))) { v.phase = sp; - U.fullFence(); // StoreLoad for Dekker with awaitWork parking/phase if (v.parking != 0) U.unpark(v.owner); stat = 2; // need 2 * RC_UNIT at unblock From 2baed3c7fc339135b4ba6ec768476d07b4af0c90 Mon Sep 17 00:00:00 2001 From: Mat Carter Date: Thu, 5 Mar 2026 17:30:42 -0800 Subject: [PATCH 12/12] Update comments to reflect current code state --- .../atomic_windows_aarch64.hpp | 22 +++++++------------ 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/src/hotspot/os_cpu/windows_aarch64/atomic_windows_aarch64.hpp b/src/hotspot/os_cpu/windows_aarch64/atomic_windows_aarch64.hpp index bd727723977..4f88debe78a 100644 --- 
a/src/hotspot/os_cpu/windows_aarch64/atomic_windows_aarch64.hpp +++ b/src/hotspot/os_cpu/windows_aarch64/atomic_windows_aarch64.hpp @@ -110,20 +110,14 @@ DEFINE_INTRINSIC_CMPXCHG(InterlockedCompareExchange64, __int64) // Override PlatformLoad and PlatformStore to use LDAR/STLR on Windows AArch64. // -// Under MSVC /volatile:iso (the default for ARM64), the generic PlatformLoad -// and PlatformStore use plain volatile dereferences which generate plain LDR/STR -// instructions with NO acquire/release barriers. This is insufficient for ARM64's -// weak memory model in HotSpot's concurrent runtime code (ObjectMonitor Dekker -// protocols, ParkEvent signaling, etc.) where Atomic::load()/Atomic::store() are -// used in cross-thread communication patterns that depend on load/store ordering. -// -// On x86, this works by accident because x86-TSO provides store-release and -// load-acquire semantics for all memory accesses. On ARM64, we must explicitly -// use LDAR (acquire load) and STLR (release store) to get equivalent ordering. -// -// This matches the effective behavior of /volatile:ms but scoped only to -// Atomic:: operations rather than ALL volatile accesses, and ensures correct -// cross-core visibility for HotSpot's lock-free algorithms. +// The generic PlatformLoad and PlatformStore use plain volatile dereferences. +// With /volatile:ms (set in flags-cflags.m4 for AArch64), MSVC already compiles +// those to LDAR/STLR, so these overrides produce identical codegen. They are +// retained as defense-in-depth: they guarantee acquire/release semantics for +// Atomic::load()/Atomic::store() regardless of the compiler flag setting, +// ensuring correct cross-core visibility for HotSpot's lock-free algorithms +// (ObjectMonitor Dekker protocols, ParkEvent signaling, etc.) even if +// /volatile:ms were ever removed or overridden. template<> struct Atomic::PlatformLoad<1> {