Skip to content
Open
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions make/autoconf/flags-cflags.m4
Original file line number Diff line number Diff line change
Expand Up @@ -776,6 +776,14 @@ AC_DEFUN([FLAGS_SETUP_CFLAGS_CPU_DEP],
elif test "x$TOOLCHAIN_TYPE" = xmicrosoft; then
if test "x$FLAGS_CPU" = xx86; then
$1_CFLAGS_CPU_JVM="-arch:IA32"
elif test "x$FLAGS_CPU" = xaarch64; then
# MSVC defaults to /volatile:iso on ARM64, which makes volatile reads/writes
# plain LDR/STR with no acquire/release barriers. HotSpot's C++ runtime code
# was written assuming volatile provides acquire/release semantics (as on x86
# and GCC/Clang AArch64). Use /volatile:ms to restore those semantics and
# prevent memory ordering bugs in ObjectMonitor, ParkEvent, and other
# lock-free algorithms that use plain volatile fields.
$1_CFLAGS_CPU_JVM="/volatile:ms"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider using the same style for these flags as the rest of the make files: `-volatile:ms`.

elif test "x$OPENJDK_TARGET_CPU" = xx86_64; then
if test "x$DEBUG_LEVEL" != xrelease; then
# NOTE: This is probably redundant; -homeparams is default on
Expand Down
203 changes: 203 additions & 0 deletions src/hotspot/os_cpu/windows_aarch64/atomic_windows_aarch64.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,4 +108,207 @@ DEFINE_INTRINSIC_CMPXCHG(InterlockedCompareExchange64, __int64)

#undef DEFINE_INTRINSIC_CMPXCHG

// Override PlatformLoad and PlatformStore to use LDAR/STLR on Windows AArch64.
//
// Under MSVC /volatile:iso (the default for ARM64), the generic PlatformLoad
// and PlatformStore use plain volatile dereferences which generate plain LDR/STR
// instructions with NO acquire/release barriers. This is insufficient for ARM64's
// weak memory model in HotSpot's concurrent runtime code (ObjectMonitor Dekker
// protocols, ParkEvent signaling, etc.) where Atomic::load()/Atomic::store() are
// used in cross-thread communication patterns that depend on load/store ordering.
//
// On x86, this works by accident because x86-TSO provides store-release and
// load-acquire semantics for all memory accesses. On ARM64, we must explicitly
// use LDAR (acquire load) and STLR (release store) to get equivalent ordering.
//
// This matches the effective behavior of /volatile:ms but scoped only to
// Atomic:: operations rather than ALL volatile accesses, and ensures correct
// cross-core visibility for HotSpot's lock-free algorithms.

// Acquire loads. Each specialization strips const from the source pointer,
// reinterprets it as a raw unsigned integer of the matching width, performs a
// single LDAR through the MSVC intrinsic, and converts the raw bits back to T.
// NOTE(review): confirm the targeted toolchain actually exposes
// __ldar8/16/32/64 — Microsoft's documented acquire-load intrinsics for ARM64
// are named __load_acquire8/16/32/64; verify against the shipped <intrin.h>.

template<>
struct Atomic::PlatformLoad<1> {
  template<typename T>
  T operator()(T const volatile* dest) const {
    STATIC_ASSERT(sizeof(T) == 1);
    // Drop const, then view the location as a raw byte for the intrinsic.
    unsigned __int8 volatile* addr =
      reinterpret_cast<unsigned __int8 volatile*>(const_cast<T volatile*>(dest));
    unsigned __int8 bits = __ldar8(addr);  // single LDAR: load with acquire
    return PrimitiveConversions::cast<T>(bits);
  }
};

template<>
struct Atomic::PlatformLoad<2> {
  template<typename T>
  T operator()(T const volatile* dest) const {
    STATIC_ASSERT(sizeof(T) == 2);
    unsigned __int16 volatile* addr =
      reinterpret_cast<unsigned __int16 volatile*>(const_cast<T volatile*>(dest));
    unsigned __int16 bits = __ldar16(addr);  // LDAR, halfword
    return PrimitiveConversions::cast<T>(bits);
  }
};

template<>
struct Atomic::PlatformLoad<4> {
  template<typename T>
  T operator()(T const volatile* dest) const {
    STATIC_ASSERT(sizeof(T) == 4);
    unsigned __int32 volatile* addr =
      reinterpret_cast<unsigned __int32 volatile*>(const_cast<T volatile*>(dest));
    unsigned __int32 bits = __ldar32(addr);  // LDAR, word
    return PrimitiveConversions::cast<T>(bits);
  }
};

template<>
struct Atomic::PlatformLoad<8> {
  template<typename T>
  T operator()(T const volatile* dest) const {
    STATIC_ASSERT(sizeof(T) == 8);
    unsigned __int64 volatile* addr =
      reinterpret_cast<unsigned __int64 volatile*>(const_cast<T volatile*>(dest));
    unsigned __int64 bits = __ldar64(addr);  // LDAR, doubleword
    return PrimitiveConversions::cast<T>(bits);
  }
};

// Release stores. Each specialization reinterprets the destination as a raw
// unsigned integer pointer of matching width, converts the value to raw bits,
// and publishes it with a single STLR (store-release) via the MSVC intrinsic.

template<>
struct Atomic::PlatformStore<1> {
  template<typename T>
  void operator()(T volatile* dest, T new_value) const {
    STATIC_ASSERT(sizeof(T) == 1);
    unsigned __int8 volatile* addr =
      reinterpret_cast<unsigned __int8 volatile*>(dest);
    unsigned __int8 bits = PrimitiveConversions::cast<unsigned __int8>(new_value);
    __stlr8(addr, bits);  // single STLR: store with release
  }
};

template<>
struct Atomic::PlatformStore<2> {
  template<typename T>
  void operator()(T volatile* dest, T new_value) const {
    STATIC_ASSERT(sizeof(T) == 2);
    unsigned __int16 volatile* addr =
      reinterpret_cast<unsigned __int16 volatile*>(dest);
    unsigned __int16 bits = PrimitiveConversions::cast<unsigned __int16>(new_value);
    __stlr16(addr, bits);  // STLR, halfword
  }
};

template<>
struct Atomic::PlatformStore<4> {
  template<typename T>
  void operator()(T volatile* dest, T new_value) const {
    STATIC_ASSERT(sizeof(T) == 4);
    unsigned __int32 volatile* addr =
      reinterpret_cast<unsigned __int32 volatile*>(dest);
    unsigned __int32 bits = PrimitiveConversions::cast<unsigned __int32>(new_value);
    __stlr32(addr, bits);  // STLR, word
  }
};

template<>
struct Atomic::PlatformStore<8> {
  template<typename T>
  void operator()(T volatile* dest, T new_value) const {
    STATIC_ASSERT(sizeof(T) == 8);
    unsigned __int64 volatile* addr =
      reinterpret_cast<unsigned __int64 volatile*>(dest);
    unsigned __int64 bits = PrimitiveConversions::cast<unsigned __int64>(new_value);
    __stlr64(addr, bits);  // STLR, doubleword
  }
};

// Specialize PlatformOrderedLoad and PlatformOrderedStore to use MSVC's
// __ldar/__stlr intrinsics, matching the Linux AArch64 implementation which
// uses __atomic_load/__atomic_store with __ATOMIC_ACQUIRE/__ATOMIC_RELEASE.
// These emit single LDAR/STLR instructions that have acquire/release semantics
// baked in, rather than the generic fallback of separate dmb + plain load/store.
// On AArch64, LDAR/STLR attach the ordering to the access itself; additionally,
// a store-release followed in program order by a load-acquire is kept in order
// (RCsc), a guarantee that Dekker-style cross-core handoffs rely on and that
// dmb-based acquire/release sequences of plain ldr/str do not provide.

// Explicit acquire loads (OrderAccess::load_acquire). Same code shape as the
// PlatformLoad specializations above: strip const, reinterpret as a raw
// unsigned integer of the matching width, LDAR, convert back to T.

template<>
struct Atomic::PlatformOrderedLoad<1, X_ACQUIRE> {
  template <typename T>
  T operator()(const volatile T* p) const {
    STATIC_ASSERT(sizeof(T) == 1);
    unsigned __int8 volatile* addr =
      reinterpret_cast<unsigned __int8 volatile*>(const_cast<T volatile*>(p));
    unsigned __int8 bits = __ldar8(addr);  // LDAR: acquire semantics baked in
    return PrimitiveConversions::cast<T>(bits);
  }
};

template<>
struct Atomic::PlatformOrderedLoad<2, X_ACQUIRE> {
  template <typename T>
  T operator()(const volatile T* p) const {
    STATIC_ASSERT(sizeof(T) == 2);
    unsigned __int16 volatile* addr =
      reinterpret_cast<unsigned __int16 volatile*>(const_cast<T volatile*>(p));
    unsigned __int16 bits = __ldar16(addr);
    return PrimitiveConversions::cast<T>(bits);
  }
};

template<>
struct Atomic::PlatformOrderedLoad<4, X_ACQUIRE> {
  template <typename T>
  T operator()(const volatile T* p) const {
    STATIC_ASSERT(sizeof(T) == 4);
    unsigned __int32 volatile* addr =
      reinterpret_cast<unsigned __int32 volatile*>(const_cast<T volatile*>(p));
    unsigned __int32 bits = __ldar32(addr);
    return PrimitiveConversions::cast<T>(bits);
  }
};

template<>
struct Atomic::PlatformOrderedLoad<8, X_ACQUIRE> {
  template <typename T>
  T operator()(const volatile T* p) const {
    STATIC_ASSERT(sizeof(T) == 8);
    unsigned __int64 volatile* addr =
      reinterpret_cast<unsigned __int64 volatile*>(const_cast<T volatile*>(p));
    unsigned __int64 bits = __ldar64(addr);
    return PrimitiveConversions::cast<T>(bits);
  }
};

// Explicit release stores (OrderAccess::release_store). Same code shape as
// the PlatformStore specializations above: reinterpret the destination,
// convert the value to raw bits, publish with a single STLR.

template<>
struct Atomic::PlatformOrderedStore<1, RELEASE_X> {
  template <typename T>
  void operator()(volatile T* p, T v) const {
    STATIC_ASSERT(sizeof(T) == 1);
    unsigned __int8 volatile* addr =
      reinterpret_cast<unsigned __int8 volatile*>(p);
    unsigned __int8 bits = PrimitiveConversions::cast<unsigned __int8>(v);
    __stlr8(addr, bits);  // STLR: release semantics baked in
  }
};

template<>
struct Atomic::PlatformOrderedStore<2, RELEASE_X> {
  template <typename T>
  void operator()(volatile T* p, T v) const {
    STATIC_ASSERT(sizeof(T) == 2);
    unsigned __int16 volatile* addr =
      reinterpret_cast<unsigned __int16 volatile*>(p);
    unsigned __int16 bits = PrimitiveConversions::cast<unsigned __int16>(v);
    __stlr16(addr, bits);
  }
};

template<>
struct Atomic::PlatformOrderedStore<4, RELEASE_X> {
  template <typename T>
  void operator()(volatile T* p, T v) const {
    STATIC_ASSERT(sizeof(T) == 4);
    unsigned __int32 volatile* addr =
      reinterpret_cast<unsigned __int32 volatile*>(p);
    unsigned __int32 bits = PrimitiveConversions::cast<unsigned __int32>(v);
    __stlr32(addr, bits);
  }
};

template<>
struct Atomic::PlatformOrderedStore<8, RELEASE_X> {
  template <typename T>
  void operator()(volatile T* p, T v) const {
    STATIC_ASSERT(sizeof(T) == 8);
    unsigned __int64 volatile* addr =
      reinterpret_cast<unsigned __int64 volatile*>(p);
    unsigned __int64 bits = PrimitiveConversions::cast<unsigned __int64>(v);
    __stlr64(addr, bits);
  }
};

// RELEASE_X_FENCE: publish the value with a release store, then issue a full
// fence so that no later access can be hoisted above the store. This mirrors
// the Linux AArch64 implementation of release_store_fence.
template<size_t byte_size>
struct Atomic::PlatformOrderedStore<byte_size, RELEASE_X_FENCE> {
  template <typename T>
  void operator()(volatile T* p, T v) const {
    Atomic::release_store(p, v);  // release store first...
    OrderAccess::fence();         // ...then a full barrier
  }
};

#endif // OS_CPU_WINDOWS_AARCH64_ATOMIC_WINDOWS_AARCH64_HPP
17 changes: 12 additions & 5 deletions src/hotspot/os_cpu/windows_aarch64/orderAccess_windows_aarch64.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,22 +26,29 @@
#define OS_CPU_WINDOWS_AARCH64_ORDERACCESS_WINDOWS_AARCH64_HPP

// Included in orderAccess.hpp header file.
#include <atomic>
using std::atomic_thread_fence;
#include <arm64intr.h>
#include "vm_version_aarch64.hpp"
#include "runtime/vm_version.hpp"

// Implementation of class OrderAccess.
//
// Use the MSVC __dmb() intrinsic directly rather than C++ std::atomic_thread_fence().
// Microsoft documents that __dmb() "inserts compiler blocks to prevent instruction
// reordering" in addition to emitting the hardware DMB instruction. This is critical
// because HotSpot uses volatile (non-std::atomic) fields throughout the runtime, and
// std::atomic_thread_fence() is only defined by the C++ standard to order std::atomic
// operations — it may not act as a compiler barrier for volatile/non-atomic accesses
// on ARM64 with /volatile:iso. Using __dmb() ensures correct ordering for the Dekker
// protocol in ObjectMonitor::exit() and similar patterns throughout HotSpot.

inline void OrderAccess::loadload() { acquire(); }
inline void OrderAccess::storestore() { release(); }
inline void OrderAccess::loadstore() { acquire(); }
inline void OrderAccess::storeload() { fence(); }

#define READ_MEM_BARRIER atomic_thread_fence(std::memory_order_acquire);
#define WRITE_MEM_BARRIER atomic_thread_fence(std::memory_order_release);
#define FULL_MEM_BARRIER atomic_thread_fence(std::memory_order_seq_cst);
#define READ_MEM_BARRIER __dmb(_ARM64_BARRIER_ISHLD)
#define WRITE_MEM_BARRIER __dmb(_ARM64_BARRIER_ISH)
#define FULL_MEM_BARRIER __dmb(_ARM64_BARRIER_ISH)

inline void OrderAccess::acquire() {
READ_MEM_BARRIER;
Expand Down
8 changes: 8 additions & 0 deletions src/hotspot/share/c1/c1_GraphBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3560,6 +3560,14 @@ const char* GraphBuilder::check_can_parse(ciMethod* callee) const {
const char* GraphBuilder::should_not_inline(ciMethod* callee) const {
if ( compilation()->directive()->should_not_inline(callee)) return "disallowed by CompileCommand";
if ( callee->dont_inline()) return "don't inline by annotation";

// Don't inline a method that changes Thread.currentThread() except
// into another method that is annotated @ChangesCurrentThread.
if (callee->changes_current_thread()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given that we're fixing atomics handling on Windows AArch64, won't we need additional justification for why we're changing code that other platforms are using given that they haven't run into any of these concurrency issues?

Copy link
Contributor Author

@macarte macarte Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[CoPilot] This should not be guarded with an architecture check. Here's why:

The problem this solves is a compiler correctness issue, not a hardware barrier issue. If C1 inlines a @ChangesCurrentThread method (like Continuation.yield() or carrier-thread switching code) into a caller that isn't annotated @ChangesCurrentThread, the compiler can cache the result of Thread.currentCarrierThread() across the inlined call — observing the old carrier thread after the virtual thread has migrated to a new one.

This is broken on all architectures, not just ARM64:

  • On ARM64, it manifests more easily because the stale OopHandle dereference can also be reordered (which the MO_ACQUIRE fix in c1_LIRGenerator/library_call addresses separately).

  • On x86, it could still manifest if C1 keeps the currentCarrierThread result in a register across the inlined yield — TSO doesn't help because there's no memory access to reorder, the value is simply never re-read.

The check is also very low cost — it only runs during C1 inlining decisions, not at runtime. And @ChangesCurrentThread is used on a tiny number of methods, so this almost never triggers.

Bottom line: This is a correct-by-construction inlining guard that prevents a real compiler bug on all platforms. No architecture guard needed. The existing code is right as-is.

&& !compilation()->method()->changes_current_thread()) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For this specific change, for example, wouldn't it be a bug for an inlined method that changes threads to not read the correct state given that all its registers are correct and the load/acquire semantics have been fixed?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[CoPilot] Good question. The concern is specifically about compiler value caching, not memory ordering.

When C1 inlines a @ChangesCurrentThread method into a non-@ChangesCurrentThread caller, the compiler can CSE (common subexpression elimination) or hoist the currentCarrierThread intrinsic across the carrier-change boundary. The result: the caller reuses a register-held value from before the yield/mount/unmount and never re-reads it.

The MO_ACQUIRE fix addresses a different problem — it ensures that when you do load currentCarrierThread, subsequent dependent loads (e.g., Thread.cont) see consistent data. But it can't force a re-load that the compiler optimized away entirely.

Consider this concrete scenario after inlining:

// Caller (not @ChangesCurrentThread):
Thread carrier = Thread.currentCarrierThread();  // C1 intrinsic → LDAR, result in R0
// ... inlined @ChangesCurrentThread code ...
//   yield() freezes frame (saves R0=old_carrier)
//   VM mounts VT on new carrier
//   thaw restores frame (R0 still = old_carrier)
// ... back in caller ...
carrier.cont  // uses stale R0 — wrong carrier!

The MO_ACQUIRE on the LDAR is irrelevant here because there is no second LDAR — C1 reused R0. And the "correct registers" after thaw are correct in the sense that they're faithfully restored from freeze — but they contain the pre-yield values, which is exactly the stale data.

The inlining guard ensures @ChangesCurrentThread methods stay as real calls, which forces C1 to treat currentCarrierThread as potentially invalidated across the call boundary and re-read it.

So: MO_ACQUIRE = correct ordering on loads that happen. Inlining guard = ensuring loads actually happen. They're complementary.

return "method changes current thread";
}

return nullptr;
}

Expand Down
9 changes: 8 additions & 1 deletion src/hotspot/share/c1/c1_LIRGenerator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1381,7 +1381,14 @@ void LIRGenerator::do_JavaThreadField(Intrinsic* x, ByteSize offset) {
LIR_Opr temp = new_register(T_ADDRESS);
LIR_Opr reg = rlock_result(x);
__ move(new LIR_Address(getThreadPointer(), in_bytes(offset), T_ADDRESS), temp);
access_load(IN_NATIVE, T_OBJECT,
// OopHandle stores uncompressed oops in native memory.
// Use IN_NATIVE to ensure a raw 64-bit load without compressed oop handling.
// Use MO_ACQUIRE so that subsequent loads (e.g. Thread.cont used by
// Continuation.yield) cannot float above this load on weakly-ordered
// architectures such as AArch64. Without acquire semantics the hardware
// may reorder a later field load before the OopHandle dereference,
// observing a stale value after a virtual thread migrates between carriers.
access_load(IN_NATIVE | MO_ACQUIRE, T_OBJECT,
LIR_OprFact::address(new LIR_Address(temp, T_OBJECT)), reg);
}

Expand Down
3 changes: 2 additions & 1 deletion src/hotspot/share/gc/shared/c1/barrierSetC1.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ void BarrierSetC1::load_at_resolved(LIRAccess& access, LIR_Opr result) {
LIRGenerator *gen = access.gen();
DecoratorSet decorators = access.decorators();
bool is_volatile = (decorators & MO_SEQ_CST) != 0;
bool is_acquire = (decorators & MO_ACQUIRE) != 0;
bool is_atomic = is_volatile || AlwaysAtomicAccesses;
bool needs_patching = (decorators & C1_NEEDS_PATCHING) != 0;
bool mask_boolean = (decorators & C1_MASK_BOOLEAN) != 0;
Expand All @@ -187,7 +188,7 @@ void BarrierSetC1::load_at_resolved(LIRAccess& access, LIR_Opr result) {
__ load(access.resolved_addr()->as_address_ptr(), result, access.access_emit_info(), patch_code);
}

if (is_volatile) {
if (is_volatile || is_acquire) {
__ membar_acquire();
}

Expand Down
9 changes: 8 additions & 1 deletion src/hotspot/share/opto/library_call.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -980,7 +980,14 @@ Node* LibraryCallKit::current_thread_helper(Node*& tls_output, ByteSize handle_o
: make_load(nullptr, p, p->bottom_type()->is_ptr(), T_ADDRESS, MemNode::unordered));
thread_obj_handle = _gvn.transform(thread_obj_handle);

DecoratorSet decorators = IN_NATIVE;
// OopHandle stores uncompressed oops in native memory.
// Use IN_NATIVE to ensure proper handling without compressed oop decoding.
// Use MO_ACQUIRE so that subsequent loads (e.g. Thread.cont used by
// Continuation.yield) cannot float above this load on weakly-ordered
// architectures such as AArch64. Without acquire semantics the hardware
// may reorder a later field load before the OopHandle dereference,
// observing a stale value after a virtual thread migrates between carriers.
DecoratorSet decorators = IN_NATIVE | MO_ACQUIRE;
if (is_immutable) {
decorators |= C2_IMMUTABLE_MEMORY;
}
Expand Down
Loading
Loading