Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions make/autoconf/flags-cflags.m4
Original file line number Diff line number Diff line change
Expand Up @@ -776,6 +776,14 @@ AC_DEFUN([FLAGS_SETUP_CFLAGS_CPU_DEP],
elif test "x$TOOLCHAIN_TYPE" = xmicrosoft; then
if test "x$FLAGS_CPU" = xx86; then
$1_CFLAGS_CPU_JVM="-arch:IA32"
elif test "x$FLAGS_CPU" = xaarch64; then
# MSVC defaults to /volatile:iso on ARM64, which makes volatile reads/writes
# plain LDR/STR with no acquire/release barriers. HotSpot's C++ runtime code
# was written assuming volatile provides acquire/release semantics (as on x86
# and GCC/Clang AArch64). Use /volatile:ms to restore those semantics and
# prevent memory ordering bugs in ObjectMonitor, ParkEvent, and other
# lock-free algorithms that use plain volatile fields.
$1_CFLAGS_CPU_JVM="/volatile:ms"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider using the same style for these flags as the rest of the make files, i.e. `-volatile:ms` rather than `/volatile:ms`.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

agreed

elif test "x$OPENJDK_TARGET_CPU" = xx86_64; then
if test "x$DEBUG_LEVEL" != xrelease; then
# NOTE: This is probably redundant; -homeparams is default on
Expand Down
102 changes: 102 additions & 0 deletions src/hotspot/os_cpu/windows_aarch64/atomic_windows_aarch64.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,4 +108,106 @@ DEFINE_INTRINSIC_CMPXCHG(InterlockedCompareExchange64, __int64)

#undef DEFINE_INTRINSIC_CMPXCHG

// Specialize PlatformOrderedLoad and PlatformOrderedStore to use MSVC's
// __ldar/__stlr intrinsics, matching the Linux AArch64 implementation which
// uses __atomic_load/__atomic_store with __ATOMIC_ACQUIRE/__ATOMIC_RELEASE.
// These emit single LDAR/STLR instructions that have acquire/release semantics
// baked in, rather than the generic fallback of separate dmb + plain load/store.
// On AArch64, LDAR/STLR provide stronger ordering guarantees than dmb + ldr/str
// for cross-core visibility (Dekker patterns, etc.).

// Acquiring loads, one specialization per access size. Each strips const,
// reinterprets the pointer as the intrinsic's operand type, performs the
// load-acquire, and casts the loaded bits back to T without changing their
// representation (PrimitiveConversions::cast is a bit-preserving cast).
// NOTE(review): confirm the __ldar8/16/32/64 intrinsics are available in the
// minimum MSVC version this port supports — TODO verify against the MSVC
// ARM64 intrinsics reference.

// 1-byte load-acquire (LDARB).
template<>
struct Atomic::PlatformOrderedLoad<1, X_ACQUIRE> {
  template <typename T>
  T operator()(const volatile T* p) const {
    STATIC_ASSERT(sizeof(T) == 1);
    return PrimitiveConversions::cast<T>(
      __ldar8(reinterpret_cast<unsigned __int8 volatile*>(
        const_cast<T volatile*>(p))));
  }
};

// 2-byte load-acquire (LDARH).
template<>
struct Atomic::PlatformOrderedLoad<2, X_ACQUIRE> {
  template <typename T>
  T operator()(const volatile T* p) const {
    STATIC_ASSERT(sizeof(T) == 2);
    return PrimitiveConversions::cast<T>(
      __ldar16(reinterpret_cast<unsigned __int16 volatile*>(
        const_cast<T volatile*>(p))));
  }
};

// 4-byte load-acquire (LDAR, W register).
template<>
struct Atomic::PlatformOrderedLoad<4, X_ACQUIRE> {
  template <typename T>
  T operator()(const volatile T* p) const {
    STATIC_ASSERT(sizeof(T) == 4);
    return PrimitiveConversions::cast<T>(
      __ldar32(reinterpret_cast<unsigned __int32 volatile*>(
        const_cast<T volatile*>(p))));
  }
};

// 8-byte load-acquire (LDAR, X register).
template<>
struct Atomic::PlatformOrderedLoad<8, X_ACQUIRE> {
  template <typename T>
  T operator()(const volatile T* p) const {
    STATIC_ASSERT(sizeof(T) == 8);
    return PrimitiveConversions::cast<T>(
      __ldar64(reinterpret_cast<unsigned __int64 volatile*>(
        const_cast<T volatile*>(p))));
  }
};

// Releasing stores, one specialization per access size. The value is
// bit-cast to the intrinsic's operand type and stored with release
// semantics; prior memory accesses cannot be reordered after the store.
// NOTE(review): as with the loads, confirm the __stlr8/16/32/64 intrinsics
// exist in the minimum supported MSVC version — TODO verify.

// 1-byte store-release (STLRB).
template<>
struct Atomic::PlatformOrderedStore<1, RELEASE_X> {
  template <typename T>
  void operator()(volatile T* p, T v) const {
    STATIC_ASSERT(sizeof(T) == 1);
    __stlr8(reinterpret_cast<unsigned __int8 volatile*>(p),
            PrimitiveConversions::cast<unsigned __int8>(v));
  }
};

// 2-byte store-release (STLRH).
template<>
struct Atomic::PlatformOrderedStore<2, RELEASE_X> {
  template <typename T>
  void operator()(volatile T* p, T v) const {
    STATIC_ASSERT(sizeof(T) == 2);
    __stlr16(reinterpret_cast<unsigned __int16 volatile*>(p),
             PrimitiveConversions::cast<unsigned __int16>(v));
  }
};

// 4-byte store-release (STLR, W register).
template<>
struct Atomic::PlatformOrderedStore<4, RELEASE_X> {
  template <typename T>
  void operator()(volatile T* p, T v) const {
    STATIC_ASSERT(sizeof(T) == 4);
    __stlr32(reinterpret_cast<unsigned __int32 volatile*>(p),
             PrimitiveConversions::cast<unsigned __int32>(v));
  }
};

// 8-byte store-release (STLR, X register).
template<>
struct Atomic::PlatformOrderedStore<8, RELEASE_X> {
  template <typename T>
  void operator()(volatile T* p, T v) const {
    STATIC_ASSERT(sizeof(T) == 8);
    __stlr64(reinterpret_cast<unsigned __int64 volatile*>(p),
             PrimitiveConversions::cast<unsigned __int64>(v));
  }
};

// release_store + fence combination, matching Linux AArch64.
// Delegates to the size-specialized release store above, then issues a
// full fence (StoreLoad included) so the store is globally visible before
// any subsequent load — the ordering required by release_store_fence().
template<size_t byte_size>
struct Atomic::PlatformOrderedStore<byte_size, RELEASE_X_FENCE> {
  template <typename T>
  void operator()(volatile T* p, T v) const {
    Atomic::release_store(p, v);
    OrderAccess::fence();
  }
};

#endif // OS_CPU_WINDOWS_AARCH64_ATOMIC_WINDOWS_AARCH64_HPP
8 changes: 8 additions & 0 deletions src/hotspot/share/c1/c1_GraphBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3560,6 +3560,14 @@ const char* GraphBuilder::check_can_parse(ciMethod* callee) const {
const char* GraphBuilder::should_not_inline(ciMethod* callee) const {
if ( compilation()->directive()->should_not_inline(callee)) return "disallowed by CompileCommand";
if ( callee->dont_inline()) return "don't inline by annotation";

// Don't inline a method that changes Thread.currentThread() except
// into another method that is annotated @ChangesCurrentThread.
if (callee->changes_current_thread()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given that we're fixing atomics handling on Windows AArch64, won't we need additional justification for why we're changing code that other platforms are using given that they haven't run into any of these concurrency issues?

Copy link
Contributor Author

@macarte macarte Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The problem this solves is a compiler correctness issue, not a hardware barrier issue. If C1 inlines a @ChangesCurrentThread method (like Continuation.yield() or carrier-thread switching code) into a caller that isn't annotated @ChangesCurrentThread, the compiler can cache the result of Thread.currentCarrierThread() across the inlined call — observing the old carrier thread after the virtual thread has migrated to a new one.

This is broken on all architectures, not just ARM64:

  • On ARM64, it manifests more easily because the stale OopHandle dereference can also be reordered (which the MO_ACQUIRE fix in c1_LIRGenerator/library_call addresses separately).

  • On x86, it could still manifest if C1 keeps the currentCarrierThread result in a register across the inlined yield — TSO doesn't help because there's no memory access to reorder, the value is simply never re-read.

The check is also very low cost — it only runs during C1 inlining decisions, not at runtime. And @ChangesCurrentThread is used on a tiny number of methods, so this almost never triggers.

Bottom line: This is a correct-by-construction inlining guard that prevents a real compiler bug on all platforms. No architecture guard needed.

&& !compilation()->method()->changes_current_thread()) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For this specific change, for example, wouldn't it be a bug for an inlined method that changes threads to not read the correct state given that all its registers are correct and the load/acquire semantics have been fixed?

Copy link
Contributor Author

@macarte macarte Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When C1 inlines a @ChangesCurrentThread method into a non-@ChangesCurrentThread caller, the compiler can CSE (common subexpression elimination) or hoist the currentCarrierThread intrinsic across the carrier-change boundary. The result: the caller reuses a register-held value from before the yield/mount/unmount and never re-reads it.

The MO_ACQUIRE fix addresses a different problem — it ensures that when you do load currentCarrierThread, subsequent dependent loads (e.g., Thread.cont) see consistent data. But it can't force a re-load that the compiler optimized away entirely.

Consider this concrete scenario after inlining:

// Caller (not @ChangesCurrentThread):
Thread carrier = Thread.currentCarrierThread();  // C1 intrinsic → LDAR, result in R0
// ... inlined @ChangesCurrentThread code ...
//   yield() freezes frame (saves R0=old_carrier)
//   VM mounts VT on new carrier
//   thaw restores frame (R0 still = old_carrier)
// ... back in caller ...
carrier.cont  // uses stale R0 — wrong carrier!

The MO_ACQUIRE on the LDAR is irrelevant here because there is no second LDAR — C1 reused R0. And the "correct registers" after thaw are correct in the sense that they're faithfully restored from freeze — but they contain the pre-yield values, which is exactly the stale data.

The inlining guard ensures @ChangesCurrentThread methods stay as real calls, which forces C1 to treat currentCarrierThread as potentially invalidated across the call boundary and re-read it.

So: MO_ACQUIRE = correct ordering on loads that happen. Inlining guard = ensuring loads actually happen. They're complementary.

return "method changes current thread";
}

return nullptr;
}

Expand Down
9 changes: 8 additions & 1 deletion src/hotspot/share/c1/c1_LIRGenerator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1381,7 +1381,14 @@ void LIRGenerator::do_JavaThreadField(Intrinsic* x, ByteSize offset) {
LIR_Opr temp = new_register(T_ADDRESS);
LIR_Opr reg = rlock_result(x);
__ move(new LIR_Address(getThreadPointer(), in_bytes(offset), T_ADDRESS), temp);
access_load(IN_NATIVE, T_OBJECT,
// OopHandle stores uncompressed oops in native memory.
// Use IN_NATIVE to ensure a raw 64-bit load without compressed oop handling.
// Use MO_ACQUIRE so that subsequent loads (e.g. Thread.cont used by
// Continuation.yield) cannot float above this load on weakly-ordered
// architectures such as AArch64. Without acquire semantics the hardware
// may reorder a later field load before the OopHandle dereference,
// observing a stale value after a virtual thread migrates between carriers.
access_load(IN_NATIVE | MO_ACQUIRE, T_OBJECT,
LIR_OprFact::address(new LIR_Address(temp, T_OBJECT)), reg);
}

Expand Down
3 changes: 2 additions & 1 deletion src/hotspot/share/gc/shared/c1/barrierSetC1.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ void BarrierSetC1::load_at_resolved(LIRAccess& access, LIR_Opr result) {
LIRGenerator *gen = access.gen();
DecoratorSet decorators = access.decorators();
bool is_volatile = (decorators & MO_SEQ_CST) != 0;
bool is_acquire = (decorators & MO_ACQUIRE) != 0;
bool is_atomic = is_volatile || AlwaysAtomicAccesses;
bool needs_patching = (decorators & C1_NEEDS_PATCHING) != 0;
bool mask_boolean = (decorators & C1_MASK_BOOLEAN) != 0;
Expand All @@ -187,7 +188,7 @@ void BarrierSetC1::load_at_resolved(LIRAccess& access, LIR_Opr result) {
__ load(access.resolved_addr()->as_address_ptr(), result, access.access_emit_info(), patch_code);
}

if (is_volatile) {
if (is_volatile || is_acquire) {
__ membar_acquire();
}

Expand Down
9 changes: 8 additions & 1 deletion src/hotspot/share/opto/library_call.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -980,7 +980,14 @@ Node* LibraryCallKit::current_thread_helper(Node*& tls_output, ByteSize handle_o
: make_load(nullptr, p, p->bottom_type()->is_ptr(), T_ADDRESS, MemNode::unordered));
thread_obj_handle = _gvn.transform(thread_obj_handle);

DecoratorSet decorators = IN_NATIVE;
// OopHandle stores uncompressed oops in native memory.
// Use IN_NATIVE to ensure proper handling without compressed oop decoding.
// Use MO_ACQUIRE so that subsequent loads (e.g. Thread.cont used by
// Continuation.yield) cannot float above this load on weakly-ordered
// architectures such as AArch64. Without acquire semantics the hardware
// may reorder a later field load before the OopHandle dereference,
// observing a stale value after a virtual thread migrates between carriers.
DecoratorSet decorators = IN_NATIVE | MO_ACQUIRE;
if (is_immutable) {
decorators |= C2_IMMUTABLE_MEMORY;
}
Expand Down
112 changes: 112 additions & 0 deletions src/hotspot/share/runtime/objectMonitor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
*
*/

#include "classfile/symbolTable.hpp"
#include "classfile/vmSymbols.hpp"
#include "gc/shared/oopStorage.hpp"
#include "gc/shared/oopStorageSet.hpp"
Expand All @@ -42,6 +43,7 @@
#include "runtime/globals.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/javaCalls.hpp"
#include "runtime/javaThread.inline.hpp"
#include "runtime/lightweightSynchronizer.hpp"
#include "runtime/mutexLocker.hpp"
Expand Down Expand Up @@ -523,6 +525,90 @@ void ObjectMonitor::notify_contended_enter(JavaThread* current) {
}
}

// Compensate the ForkJoinPool scheduler when a pinned virtual thread's
// carrier is about to block on a contended monitor. Without compensation
// the pool has no visibility that its worker is blocked, which can lead to
// carrier starvation / deadlock (JDK-8345294).
//
// This calls CarrierThread.beginMonitorBlock() which uses
// ForkJoinPool.tryCompensateForMonitor() to (a) activate an idle worker or
// create a replacement spare carrier, and (b) decrement RC so that signalWork
// can find a carrier for any queued continuation (e.g. the VT holding the
// contested lock) that would otherwise be starved because blocked carriers
// still count as active in the pool's RC field. The RC decrement is restored
// by endCompensatedBlock when this carrier unblocks.
//
// IMPORTANT: At this point notify_contended_enter() has already set
// _current_pending_monitor. The Java call below may itself trigger
// nested monitor contention (e.g., ThreadGroup.addUnstarted acquires a
// monitor when creating a spare worker thread). To avoid:
//   (a) assertion failures (pending_monitor != nullptr on nested entry)
//   (b) corruption of _current_pending_monitor for the outer monitor
// we save and clear the pending monitor before the Java call and restore
// it afterwards.
//
// Returns true if the pool was compensated (the caller must then call
// end_compensate_pinned_carrier() once the carrier unblocks); false if
// compensation was not performed or the Java call failed.
static bool compensate_pinned_carrier(JavaThread* current) {
  oop carrier = current->threadObj();
  if (carrier == nullptr) return false;

  // Save and clear pending monitor so nested monitor enters work cleanly.
  ObjectMonitor* saved_pending = current->current_pending_monitor();
  current->set_current_pending_monitor(nullptr);

  HandleMark hm(current);
  Handle carrier_h(current, carrier);
  Klass* klass = carrier_h->klass();

  // CarrierThread.beginMonitorBlock() — compensates the ForkJoinPool via
  // tryCompensateForMonitor which always activates/creates a real spare
  // carrier (branch 2 of tryCompensate is omitted for this path).
  // Not every carrier thread is a CarrierThread (custom schedulers may
  // use plain threads). NOTE(review): there is no explicit method lookup
  // here — the "method not found" case is handled only via the
  // pending-exception path below (call_virtual raising NoSuchMethodError,
  // which we clear and treat as "not compensated"). Confirm call_virtual
  // reports a missing method as a Java exception rather than asserting,
  // especially in debug builds — TODO verify.
  TempNewSymbol begin_name = SymbolTable::new_symbol("beginMonitorBlock");
  TempNewSymbol begin_sig = SymbolTable::new_symbol("()Z");

  JavaValue result(T_BOOLEAN);
  JavaCalls::call_virtual(&result, carrier_h, klass,
                          begin_name, begin_sig, current);

  bool compensated = false;
  if (current->has_pending_exception()) {
    // Any failure (including a missing method) means no compensation took
    // place; swallow the exception so monitor enter proceeds normally.
    current->clear_pending_exception();
  } else {
    compensated = result.get_jboolean();
  }

  // Restore pending monitor for the outer monitor enter.
  current->set_current_pending_monitor(saved_pending);
  return compensated;
}

// Undo the ForkJoinPool compensation performed by compensate_pinned_carrier()
// once the carrier has unblocked, by calling the carrier's endBlocking()
// method. Mirrors the save/clear/restore of _current_pending_monitor done in
// compensate_pinned_carrier() because the Java call may itself contend on a
// monitor. Any exception from the call is swallowed: failing to restore the
// pool's RC count is preferable to propagating an exception out of monitor
// exit paths.
// NOTE(review): the sibling's header comment says the RC decrement "is
// restored by endCompensatedBlock", but this calls "endBlocking" — confirm
// which Java-side method name is correct.
static void end_compensate_pinned_carrier(JavaThread* current) {
  oop carrier = current->threadObj();
  if (carrier == nullptr) return;

  // Save and clear pending monitor (should be nullptr here, but be safe).
  ObjectMonitor* saved_pending = current->current_pending_monitor();
  current->set_current_pending_monitor(nullptr);

  HandleMark hm(current);
  Handle carrier_h(current, carrier);
  Klass* klass = carrier_h->klass();

  TempNewSymbol end_name = SymbolTable::new_symbol("endBlocking");
  TempNewSymbol end_sig = vmSymbols::void_method_signature();

  JavaValue result(T_VOID);
  JavaCalls::call_virtual(&result, carrier_h, klass,
                          end_name, end_sig, current);

  if (current->has_pending_exception()) {
    current->clear_pending_exception();
  }

  current->set_current_pending_monitor(saved_pending);
}

void ObjectMonitor::enter_with_contention_mark(JavaThread* current, ObjectMonitorContentionMark &cm) {
assert(current == JavaThread::current(), "must be");
assert(!has_owner(current), "must be");
Expand Down Expand Up @@ -570,6 +656,15 @@ void ObjectMonitor::enter_with_contention_mark(JavaThread* current, ObjectMonito
}
}

// Compensate the ForkJoinPool before the carrier blocks so that the
// scheduler can spin up a replacement worker thread. This prevents
// deadlocks where every carrier is pinned and waiting for a monitor
// held by an unmounted virtual thread that can never get a carrier.
bool compensated = false;
if (is_virtual) {
compensated = compensate_pinned_carrier(current);
}

{
// Change java thread status to indicate blocked on monitor enter.
JavaThreadBlockedOnMonitorEnterState jtbmes(current, this);
Expand Down Expand Up @@ -606,6 +701,11 @@ void ObjectMonitor::enter_with_contention_mark(JavaThread* current, ObjectMonito
// the monitor free and clear.
}

// End compensation now that the carrier is no longer blocked.
if (compensated) {
end_compensate_pinned_carrier(current);
}

assert(contentions() >= 0, "must not be negative: contentions=%d", contentions());

// Must either set _recursions = 0 or ASSERT _recursions == 0.
Expand Down Expand Up @@ -2346,6 +2446,16 @@ bool ObjectMonitor::try_spin(JavaThread* current) {
if (!has_successor()) {
set_successor(current);
}
// Dekker/Lamport duality with exit(): ST _succ; MEMBAR; LD _owner.
// The exiter's side (release_clear_owner + storeload) is in exit().
// Here on the spinner's side, we need a StoreLoad barrier between
// setting _succ and reading _owner to prevent the CPU from reordering
// the _owner load before the _succ store. On ARM64 with MSVC
// /volatile:iso, Atomic::store/load are plain STR/LDR with no
// barrier, so without this fence the Dekker protocol is broken and
// the exiter may not see our successor designation while we may not
// see its lock release — leading to missed wakeups and starvation.
OrderAccess::storeload();
int64_t prv = NO_OWNER;

// There are three ways to exit the following loop:
Expand Down Expand Up @@ -2421,6 +2531,8 @@ bool ObjectMonitor::try_spin(JavaThread* current) {

if (!has_successor()) {
set_successor(current);
// See Dekker/storeload comment before the loop.
OrderAccess::storeload();
}
}

Expand Down
Loading
Loading