diff --git a/make/autoconf/flags-cflags.m4 b/make/autoconf/flags-cflags.m4 index 3a61840e8c9..9b8f9cdb505 100644 --- a/make/autoconf/flags-cflags.m4 +++ b/make/autoconf/flags-cflags.m4 @@ -776,6 +776,14 @@ AC_DEFUN([FLAGS_SETUP_CFLAGS_CPU_DEP], elif test "x$TOOLCHAIN_TYPE" = xmicrosoft; then if test "x$FLAGS_CPU" = xx86; then $1_CFLAGS_CPU_JVM="-arch:IA32" + elif test "x$FLAGS_CPU" = xaarch64; then + # MSVC defaults to /volatile:iso on ARM64, which makes volatile reads/writes + # plain LDR/STR with no acquire/release barriers. HotSpot's C++ runtime code + # was written assuming volatile provides acquire/release semantics (as on x86 + # and GCC/Clang AArch64). Use /volatile:ms to restore those semantics and + # prevent memory ordering bugs in ObjectMonitor, ParkEvent, and other + # lock-free algorithms that use plain volatile fields. + $1_CFLAGS_CPU_JVM="/volatile:ms" elif test "x$OPENJDK_TARGET_CPU" = xx86_64; then if test "x$DEBUG_LEVEL" != xrelease; then # NOTE: This is probably redundant; -homeparams is default on diff --git a/src/hotspot/os_cpu/windows_aarch64/atomic_windows_aarch64.hpp b/src/hotspot/os_cpu/windows_aarch64/atomic_windows_aarch64.hpp index 90fc8ecfba4..4f88debe78a 100644 --- a/src/hotspot/os_cpu/windows_aarch64/atomic_windows_aarch64.hpp +++ b/src/hotspot/os_cpu/windows_aarch64/atomic_windows_aarch64.hpp @@ -108,4 +108,201 @@ DEFINE_INTRINSIC_CMPXCHG(InterlockedCompareExchange64, __int64) #undef DEFINE_INTRINSIC_CMPXCHG +// Override PlatformLoad and PlatformStore to use LDAR/STLR on Windows AArch64. +// +// The generic PlatformLoad and PlatformStore use plain volatile dereferences. +// With /volatile:ms (set in flags-cflags.m4 for AArch64), MSVC already compiles +// those to LDAR/STLR, so these overrides produce identical codegen. 
They are +// retained as defense-in-depth: they guarantee acquire/release semantics for +// Atomic::load()/Atomic::store() regardless of the compiler flag setting, +// ensuring correct cross-core visibility for HotSpot's lock-free algorithms +// (ObjectMonitor Dekker protocols, ParkEvent signaling, etc.) even if +// /volatile:ms were ever removed or overridden. + +template<> +struct Atomic::PlatformLoad<1> { + template + T operator()(T const volatile* dest) const { + STATIC_ASSERT(sizeof(T) == 1); + return PrimitiveConversions::cast( + __ldar8(reinterpret_cast( + const_cast(dest)))); + } +}; + +template<> +struct Atomic::PlatformLoad<2> { + template + T operator()(T const volatile* dest) const { + STATIC_ASSERT(sizeof(T) == 2); + return PrimitiveConversions::cast( + __ldar16(reinterpret_cast( + const_cast(dest)))); + } +}; + +template<> +struct Atomic::PlatformLoad<4> { + template + T operator()(T const volatile* dest) const { + STATIC_ASSERT(sizeof(T) == 4); + return PrimitiveConversions::cast( + __ldar32(reinterpret_cast( + const_cast(dest)))); + } +}; + +template<> +struct Atomic::PlatformLoad<8> { + template + T operator()(T const volatile* dest) const { + STATIC_ASSERT(sizeof(T) == 8); + return PrimitiveConversions::cast( + __ldar64(reinterpret_cast( + const_cast(dest)))); + } +}; + +template<> +struct Atomic::PlatformStore<1> { + template + void operator()(T volatile* dest, T new_value) const { + STATIC_ASSERT(sizeof(T) == 1); + __stlr8(reinterpret_cast(dest), + PrimitiveConversions::cast(new_value)); + } +}; + +template<> +struct Atomic::PlatformStore<2> { + template + void operator()(T volatile* dest, T new_value) const { + STATIC_ASSERT(sizeof(T) == 2); + __stlr16(reinterpret_cast(dest), + PrimitiveConversions::cast(new_value)); + } +}; + +template<> +struct Atomic::PlatformStore<4> { + template + void operator()(T volatile* dest, T new_value) const { + STATIC_ASSERT(sizeof(T) == 4); + __stlr32(reinterpret_cast(dest), + 
PrimitiveConversions::cast(new_value)); + } +}; + +template<> +struct Atomic::PlatformStore<8> { + template + void operator()(T volatile* dest, T new_value) const { + STATIC_ASSERT(sizeof(T) == 8); + __stlr64(reinterpret_cast(dest), + PrimitiveConversions::cast(new_value)); + } +}; + +// Specialize PlatformOrderedLoad and PlatformOrderedStore to use MSVC's +// __ldar/__stlr intrinsics, matching the Linux AArch64 implementation which +// uses __atomic_load/__atomic_store with __ATOMIC_ACQUIRE/__ATOMIC_RELEASE. +// These emit single LDAR/STLR instructions that have acquire/release semantics +// baked in, rather than the generic fallback of separate dmb + plain load/store. +// On AArch64, LDAR/STLR provide stronger ordering guarantees than dmb + ldr/str +// for cross-core visibility (Dekker patterns, etc.). + +template<> +struct Atomic::PlatformOrderedLoad<1, X_ACQUIRE> { + template + T operator()(const volatile T* p) const { + STATIC_ASSERT(sizeof(T) == 1); + return PrimitiveConversions::cast( + __ldar8(reinterpret_cast( + const_cast(p)))); + } +}; + +template<> +struct Atomic::PlatformOrderedLoad<2, X_ACQUIRE> { + template + T operator()(const volatile T* p) const { + STATIC_ASSERT(sizeof(T) == 2); + return PrimitiveConversions::cast( + __ldar16(reinterpret_cast( + const_cast(p)))); + } +}; + +template<> +struct Atomic::PlatformOrderedLoad<4, X_ACQUIRE> { + template + T operator()(const volatile T* p) const { + STATIC_ASSERT(sizeof(T) == 4); + return PrimitiveConversions::cast( + __ldar32(reinterpret_cast( + const_cast(p)))); + } +}; + +template<> +struct Atomic::PlatformOrderedLoad<8, X_ACQUIRE> { + template + T operator()(const volatile T* p) const { + STATIC_ASSERT(sizeof(T) == 8); + return PrimitiveConversions::cast( + __ldar64(reinterpret_cast( + const_cast(p)))); + } +}; + +template<> +struct Atomic::PlatformOrderedStore<1, RELEASE_X> { + template + void operator()(volatile T* p, T v) const { + STATIC_ASSERT(sizeof(T) == 1); + __stlr8(reinterpret_cast(p), + 
PrimitiveConversions::cast(v)); + } +}; + +template<> +struct Atomic::PlatformOrderedStore<2, RELEASE_X> { + template + void operator()(volatile T* p, T v) const { + STATIC_ASSERT(sizeof(T) == 2); + __stlr16(reinterpret_cast(p), + PrimitiveConversions::cast(v)); + } +}; + +template<> +struct Atomic::PlatformOrderedStore<4, RELEASE_X> { + template + void operator()(volatile T* p, T v) const { + STATIC_ASSERT(sizeof(T) == 4); + __stlr32(reinterpret_cast(p), + PrimitiveConversions::cast(v)); + } +}; + +template<> +struct Atomic::PlatformOrderedStore<8, RELEASE_X> { + template + void operator()(volatile T* p, T v) const { + STATIC_ASSERT(sizeof(T) == 8); + __stlr64(reinterpret_cast(p), + PrimitiveConversions::cast(v)); + } +}; + +// release_store + fence combination, matching Linux AArch64 +template +struct Atomic::PlatformOrderedStore { + template + void operator()(volatile T* p, T v) const { + Atomic::release_store(p, v); + OrderAccess::fence(); + } +}; + #endif // OS_CPU_WINDOWS_AARCH64_ATOMIC_WINDOWS_AARCH64_HPP diff --git a/src/hotspot/os_cpu/windows_aarch64/orderAccess_windows_aarch64.hpp b/src/hotspot/os_cpu/windows_aarch64/orderAccess_windows_aarch64.hpp index 5385f3e6a10..01711aff0e4 100644 --- a/src/hotspot/os_cpu/windows_aarch64/orderAccess_windows_aarch64.hpp +++ b/src/hotspot/os_cpu/windows_aarch64/orderAccess_windows_aarch64.hpp @@ -26,22 +26,29 @@ #define OS_CPU_WINDOWS_AARCH64_ORDERACCESS_WINDOWS_AARCH64_HPP // Included in orderAccess.hpp header file. -#include -using std::atomic_thread_fence; #include #include "vm_version_aarch64.hpp" #include "runtime/vm_version.hpp" // Implementation of class OrderAccess. +// +// Use the MSVC __dmb() intrinsic directly rather than C++ std::atomic_thread_fence(). +// Microsoft documents that __dmb() "inserts compiler blocks to prevent instruction +// reordering" in addition to emitting the hardware DMB instruction. 
This is critical +// because HotSpot uses volatile (non-std::atomic) fields throughout the runtime, and +// std::atomic_thread_fence() is only defined by the C++ standard to order std::atomic +// operations — it may not act as a compiler barrier for volatile/non-atomic accesses +// on ARM64 with /volatile:iso. Using __dmb() ensures correct ordering for the Dekker +// protocol in ObjectMonitor::exit() and similar patterns throughout HotSpot. inline void OrderAccess::loadload() { acquire(); } inline void OrderAccess::storestore() { release(); } inline void OrderAccess::loadstore() { acquire(); } inline void OrderAccess::storeload() { fence(); } -#define READ_MEM_BARRIER atomic_thread_fence(std::memory_order_acquire); -#define WRITE_MEM_BARRIER atomic_thread_fence(std::memory_order_release); -#define FULL_MEM_BARRIER atomic_thread_fence(std::memory_order_seq_cst); +#define READ_MEM_BARRIER __dmb(_ARM64_BARRIER_ISHLD) +#define WRITE_MEM_BARRIER __dmb(_ARM64_BARRIER_ISH) +#define FULL_MEM_BARRIER __dmb(_ARM64_BARRIER_ISH) inline void OrderAccess::acquire() { READ_MEM_BARRIER; diff --git a/src/hotspot/share/c1/c1_GraphBuilder.cpp b/src/hotspot/share/c1/c1_GraphBuilder.cpp index 8658bebdaee..0dc3034c4af 100644 --- a/src/hotspot/share/c1/c1_GraphBuilder.cpp +++ b/src/hotspot/share/c1/c1_GraphBuilder.cpp @@ -3560,6 +3560,14 @@ const char* GraphBuilder::check_can_parse(ciMethod* callee) const { const char* GraphBuilder::should_not_inline(ciMethod* callee) const { if ( compilation()->directive()->should_not_inline(callee)) return "disallowed by CompileCommand"; if ( callee->dont_inline()) return "don't inline by annotation"; + + // Don't inline a method that changes Thread.currentThread() except + // into another method that is annotated @ChangesCurrentThread. 
+ if (callee->changes_current_thread() + && !compilation()->method()->changes_current_thread()) { + return "method changes current thread"; + } + return nullptr; } diff --git a/src/hotspot/share/c1/c1_LIRGenerator.cpp b/src/hotspot/share/c1/c1_LIRGenerator.cpp index 341de0ac0c2..0fd8644eaee 100644 --- a/src/hotspot/share/c1/c1_LIRGenerator.cpp +++ b/src/hotspot/share/c1/c1_LIRGenerator.cpp @@ -1381,7 +1381,14 @@ void LIRGenerator::do_JavaThreadField(Intrinsic* x, ByteSize offset) { LIR_Opr temp = new_register(T_ADDRESS); LIR_Opr reg = rlock_result(x); __ move(new LIR_Address(getThreadPointer(), in_bytes(offset), T_ADDRESS), temp); - access_load(IN_NATIVE, T_OBJECT, + // OopHandle stores uncompressed oops in native memory. + // Use IN_NATIVE to ensure a raw 64-bit load without compressed oop handling. + // Use MO_ACQUIRE so that subsequent loads (e.g. Thread.cont used by + // Continuation.yield) cannot float above this load on weakly-ordered + // architectures such as AArch64. Without acquire semantics the hardware + // may reorder a later field load before the OopHandle dereference, + // observing a stale value after a virtual thread migrates between carriers. 
+ access_load(IN_NATIVE | MO_ACQUIRE, T_OBJECT, LIR_OprFact::address(new LIR_Address(temp, T_OBJECT)), reg); } diff --git a/src/hotspot/share/gc/shared/c1/barrierSetC1.cpp b/src/hotspot/share/gc/shared/c1/barrierSetC1.cpp index ac640fb88d2..1291f6e1b33 100644 --- a/src/hotspot/share/gc/shared/c1/barrierSetC1.cpp +++ b/src/hotspot/share/gc/shared/c1/barrierSetC1.cpp @@ -169,6 +169,7 @@ void BarrierSetC1::load_at_resolved(LIRAccess& access, LIR_Opr result) { LIRGenerator *gen = access.gen(); DecoratorSet decorators = access.decorators(); bool is_volatile = (decorators & MO_SEQ_CST) != 0; + bool is_acquire = (decorators & MO_ACQUIRE) != 0; bool is_atomic = is_volatile || AlwaysAtomicAccesses; bool needs_patching = (decorators & C1_NEEDS_PATCHING) != 0; bool mask_boolean = (decorators & C1_MASK_BOOLEAN) != 0; @@ -187,7 +188,7 @@ void BarrierSetC1::load_at_resolved(LIRAccess& access, LIR_Opr result) { __ load(access.resolved_addr()->as_address_ptr(), result, access.access_emit_info(), patch_code); } - if (is_volatile) { + if (is_volatile || is_acquire) { __ membar_acquire(); } diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp index f74af38387c..23d30e065c5 100644 --- a/src/hotspot/share/opto/library_call.cpp +++ b/src/hotspot/share/opto/library_call.cpp @@ -980,7 +980,14 @@ Node* LibraryCallKit::current_thread_helper(Node*& tls_output, ByteSize handle_o : make_load(nullptr, p, p->bottom_type()->is_ptr(), T_ADDRESS, MemNode::unordered)); thread_obj_handle = _gvn.transform(thread_obj_handle); - DecoratorSet decorators = IN_NATIVE; + // OopHandle stores uncompressed oops in native memory. + // Use IN_NATIVE to ensure proper handling without compressed oop decoding. + // Use MO_ACQUIRE so that subsequent loads (e.g. Thread.cont used by + // Continuation.yield) cannot float above this load on weakly-ordered + // architectures such as AArch64. 
Without acquire semantics the hardware + // may reorder a later field load before the OopHandle dereference, + // observing a stale value after a virtual thread migrates between carriers. + DecoratorSet decorators = IN_NATIVE | MO_ACQUIRE; if (is_immutable) { decorators |= C2_IMMUTABLE_MEMORY; } diff --git a/src/hotspot/share/runtime/objectMonitor.cpp b/src/hotspot/share/runtime/objectMonitor.cpp index ceac0b42acf..16e9cda0a50 100644 --- a/src/hotspot/share/runtime/objectMonitor.cpp +++ b/src/hotspot/share/runtime/objectMonitor.cpp @@ -22,6 +22,7 @@ * */ +#include "classfile/symbolTable.hpp" #include "classfile/vmSymbols.hpp" #include "gc/shared/oopStorage.hpp" #include "gc/shared/oopStorageSet.hpp" @@ -42,6 +43,7 @@ #include "runtime/globals.hpp" #include "runtime/handles.inline.hpp" #include "runtime/interfaceSupport.inline.hpp" +#include "runtime/javaCalls.hpp" #include "runtime/javaThread.inline.hpp" #include "runtime/lightweightSynchronizer.hpp" #include "runtime/mutexLocker.hpp" @@ -523,6 +525,90 @@ void ObjectMonitor::notify_contended_enter(JavaThread* current) { } } +// Compensate the ForkJoinPool scheduler when a pinned virtual thread's +// carrier is about to block on a contended monitor. Without compensation +// the pool has no visibility that its worker is blocked, which can lead to +// carrier starvation / deadlock (JDK-8345294). +// +// This calls CarrierThread.beginMonitorBlock() which uses +// ForkJoinPool.tryCompensateForMonitor() to (a) activate an idle worker or +// create a replacement spare carrier, and (b) decrement RC so that signalWork +// can find a carrier for any queued continuation (e.g. the VT holding the +// contested lock) that would otherwise be starved because blocked carriers +// still count as active in the pool's RC field. The RC decrement is restored +// by endCompensatedBlock when this carrier unblocks. +// +// IMPORTANT: At this point notify_contended_enter() has already set +// _current_pending_monitor. 
The Java call below may itself trigger +// nested monitor contention (e.g., ThreadGroup.addUnstarted acquires a +// monitor when creating a spare worker thread). To avoid: +// (a) assertion failures (pending_monitor != nullptr on nested entry) +// (b) corruption of _current_pending_monitor for the outer monitor +// we save and clear the pending monitor before the Java call and restore +// it afterwards. +static bool compensate_pinned_carrier(JavaThread* current) { + oop carrier = current->threadObj(); + if (carrier == nullptr) return false; + + // Save and clear pending monitor so nested monitor enters work cleanly. + ObjectMonitor* saved_pending = current->current_pending_monitor(); + current->set_current_pending_monitor(nullptr); + + HandleMark hm(current); + Handle carrier_h(current, carrier); + Klass* klass = carrier_h->klass(); + + // CarrierThread.beginMonitorBlock() — compensates the ForkJoinPool via + // tryCompensateForMonitor which always activates/creates a real spare + // carrier (branch 2 of tryCompensate is omitted for this path). + // Not every carrier thread is a CarrierThread (custom schedulers may + // use plain threads), so we look up the method and silently return + // false if it is not found. + TempNewSymbol begin_name = SymbolTable::new_symbol("beginMonitorBlock"); + TempNewSymbol begin_sig = SymbolTable::new_symbol("()Z"); + + JavaValue result(T_BOOLEAN); + JavaCalls::call_virtual(&result, carrier_h, klass, + begin_name, begin_sig, current); + + bool compensated = false; + if (current->has_pending_exception()) { + current->clear_pending_exception(); + } else { + compensated = result.get_jboolean(); + } + + // Restore pending monitor for the outer monitor enter. 
+ current->set_current_pending_monitor(saved_pending); + return compensated; +} + +static void end_compensate_pinned_carrier(JavaThread* current) { + oop carrier = current->threadObj(); + if (carrier == nullptr) return; + + // Save and clear pending monitor (should be nullptr here, but be safe). + ObjectMonitor* saved_pending = current->current_pending_monitor(); + current->set_current_pending_monitor(nullptr); + + HandleMark hm(current); + Handle carrier_h(current, carrier); + Klass* klass = carrier_h->klass(); + + TempNewSymbol end_name = SymbolTable::new_symbol("endBlocking"); + TempNewSymbol end_sig = vmSymbols::void_method_signature(); + + JavaValue result(T_VOID); + JavaCalls::call_virtual(&result, carrier_h, klass, + end_name, end_sig, current); + + if (current->has_pending_exception()) { + current->clear_pending_exception(); + } + + current->set_current_pending_monitor(saved_pending); +} + void ObjectMonitor::enter_with_contention_mark(JavaThread* current, ObjectMonitorContentionMark &cm) { assert(current == JavaThread::current(), "must be"); assert(!has_owner(current), "must be"); @@ -570,6 +656,15 @@ void ObjectMonitor::enter_with_contention_mark(JavaThread* current, ObjectMonito } } + // Compensate the ForkJoinPool before the carrier blocks so that the + // scheduler can spin up a replacement worker thread. This prevents + // deadlocks where every carrier is pinned and waiting for a monitor + // held by an unmounted virtual thread that can never get a carrier. + bool compensated = false; + if (is_virtual) { + compensated = compensate_pinned_carrier(current); + } + { // Change java thread status to indicate blocked on monitor enter. JavaThreadBlockedOnMonitorEnterState jtbmes(current, this); @@ -606,6 +701,11 @@ void ObjectMonitor::enter_with_contention_mark(JavaThread* current, ObjectMonito // the monitor free and clear. } + // End compensation now that the carrier is no longer blocked. 
+ if (compensated) { + end_compensate_pinned_carrier(current); + } + assert(contentions() >= 0, "must not be negative: contentions=%d", contentions()); // Must either set _recursions = 0 or ASSERT _recursions == 0. @@ -2346,6 +2446,16 @@ bool ObjectMonitor::try_spin(JavaThread* current) { if (!has_successor()) { set_successor(current); } + // Dekker/Lamport duality with exit(): ST _succ; MEMBAR; LD _owner. + // The exiter's side (release_clear_owner + storeload) is in exit(). + // Here on the spinner's side, we need a StoreLoad barrier between + // setting _succ and reading _owner to prevent the CPU from reordering + // the _owner load before the _succ store. On ARM64 with MSVC + // /volatile:iso, Atomic::store/load are plain STR/LDR with no + // barrier, so without this fence the Dekker protocol is broken and + // the exiter may not see our successor designation while we may not + // see its lock release — leading to missed wakeups and starvation. + OrderAccess::storeload(); int64_t prv = NO_OWNER; // There are three ways to exit the following loop: @@ -2421,6 +2531,8 @@ bool ObjectMonitor::try_spin(JavaThread* current) { if (!has_successor()) { set_successor(current); + // See Dekker/storeload comment before the loop. + OrderAccess::storeload(); } } diff --git a/src/java.base/share/classes/java/lang/VirtualThread.java b/src/java.base/share/classes/java/lang/VirtualThread.java index b0b134e69a3..17ce3e552ac 100644 --- a/src/java.base/share/classes/java/lang/VirtualThread.java +++ b/src/java.base/share/classes/java/lang/VirtualThread.java @@ -567,6 +567,13 @@ private void afterYield() { setState(newState = TIMED_PARKED); } + // Full fence (StoreLoad) to ensure the PARKED/TIMED_PARKED state + // is visible before reading parkPermit (Dekker pattern with + // unpark which writes parkPermit then reads state). + // Note: storeFence is insufficient — on ARM64 it only emits + // LoadStore+StoreStore (dmb ishst), not StoreLoad (dmb ish). 
+ U.fullFence(); + // may have been unparked while parking if (parkPermit && compareAndSetState(newState, UNPARKED)) { // lazy submit if local queue is empty @@ -592,6 +599,10 @@ private void afterYield() { if (s == BLOCKING) { setState(BLOCKED); + // Full fence (StoreLoad) for Dekker pattern with unblock + // which writes blockPermit then reads state. + U.fullFence(); + // may have been unblocked while blocking if (blockPermit && compareAndSetState(BLOCKED, UNBLOCKED)) { // lazy submit if local queue is empty @@ -622,6 +633,10 @@ private void afterYield() { } } + // Full fence (StoreLoad) for Dekker pattern with notify + // which writes notified then reads state. + U.fullFence(); + // may have been notified while in transition to wait state if (notified && compareAndSetState(newState, BLOCKED)) { // may have even been unblocked already @@ -659,6 +674,14 @@ private void afterDone(boolean notifyContainer) { assert carrierThread == null; setState(TERMINATED); + // Full fence (StoreLoad) to ensure the TERMINATED state is + // visible to any thread that has stored a termination latch + // (joinNanos). Without this, on ARM64 the volatile write of + // state and the subsequent volatile read of termination can be + // reordered, causing a missed-wakeup where both sides miss + // each other's store (Dekker pattern). + U.fullFence(); + // notify anyone waiting for this virtual thread to terminate CountDownLatch termination = this.termination; if (termination != null) { @@ -847,6 +870,10 @@ private void parkOnCarrierThread(boolean timed, long nanos) { */ private void unpark(boolean lazySubmit) { if (!getAndSetParkPermit(true) && currentThread() != this) { + // Full fence (StoreLoad) to ensure parkPermit=true is visible + // before reading state (Dekker pattern with afterYield PARKING + // path which writes state then reads parkPermit). 
+ U.fullFence(); int s = state(); // unparked while parked @@ -889,6 +916,7 @@ void unpark() { private void unblock() { assert !Thread.currentThread().isVirtual(); blockPermit = true; + U.fullFence(); // Full fence (StoreLoad) for Dekker with afterYield BLOCKING path if (state() == BLOCKED && compareAndSetState(BLOCKED, UNBLOCKED)) { submitRunContinuation(); } @@ -1003,6 +1031,10 @@ boolean joinNanos(long nanos) throws InterruptedException { // ensure termination object exists, then re-check state CountDownLatch termination = getTermination(); + // Full fence (StoreLoad) for Dekker with afterDone: ensures that + // the CAS in getTermination() (storing the latch) is visible to + // afterDone before we read state here. + U.fullFence(); if (state() == TERMINATED) return true; diff --git a/src/java.base/share/classes/java/util/concurrent/ForkJoinPool.java b/src/java.base/share/classes/java/util/concurrent/ForkJoinPool.java index cdc5e0b9ea6..cd2594a6262 100644 --- a/src/java.base/share/classes/java/util/concurrent/ForkJoinPool.java +++ b/src/java.base/share/classes/java/util/concurrent/ForkJoinPool.java @@ -2064,7 +2064,8 @@ private int deactivate(WorkQueue w, int phase) { if ((q = qs[k & (n - 1)]) == null) Thread.onSpinWait(); else if ((a = q.array) != null && (cap = a.length) > 0 && - a[q.base & (cap - 1)] != null && --prechecks < 0 && + a[q.base & (cap - 1)] != null && + --prechecks < 0 && (int)(c = ctl) == activePhase && compareAndSetCtl(c, (sp & LMASK) | ((c + RC_UNIT) & UMASK))) return w.phase = activePhase; // reactivate @@ -4347,14 +4348,114 @@ final long beginCompensatedBlock() { return (c == 0) ? 0L : RC_UNIT; } + /** + * Variant of tryCompensate for use when a carrier thread is about to block + * on a native OS-level monitor (objectMonitor::enter_internal) while pinned + * by a virtual thread. + * + *

<p> Unlike tryCompensate, this method omits the passive RC-only path + * (branch 2 of tryCompensate), so the blocking carrier always results in + * either an idle worker being activated (branch 1) or a new spare being + * created (branch 3). + * + *

<p> Both branches decrement RC so that {@code signalWork} (triggered by + * subsequent VT task submissions) sees the pool as under-active and can + * dispatch the queued VT continuations. Without the RC decrement in + * branch 1, monitor-blocked carriers are counted as "active" by + * {@code signalWork}'s {@code (short)(c >>> RC_SHIFT) >= pc} check, + * preventing it from waking idle workers when tasks are queued. + * + *

<p> The RC decrement in both branches requires {@code endCompensatedBlock} + * to restore two RC_UNITs: one for the decrement here and one for + * the compensator's eventual deactivation (RC--). Both branches return + * {@code 2} so that {@code beginMonitorCompensatedBlock} passes + * {@code 2 * RC_UNIT} to {@code endCompensatedBlock}: + *

+     * <pre>
+     *   branch CAS: RC--                (-1)
+     *   compensator deactivates: RC--   (-1)
+     *   endCompensatedBlock:            (+2)
+     *   net:                             0
+     * </pre>
+ */ + private int tryCompensateForMonitor(long c) { + Predicate sat; + long b = config; + int pc = parallelism, + maxTotal = (short)(b >>> TC_SHIFT) + pc, + total = (short)(c >>> TC_SHIFT), + sp = (int)c, + stat = -1; + if (sp != 0) { // activate idle worker (branch 1) + // RC-- so that signalWork sees the pool as under-active while + // the carrier is blocked on the monitor. Without this, RC + // counts monitor-blocked carriers as "active", preventing + // signalWork from waking idle workers. Return stat=2 so + // that beginMonitorCompensatedBlock passes 2*RC_UNIT to + // endCompensatedBlock, balancing both the RC-- here and the + // compensator's eventual deactivation (RC--). + WorkQueue[] qs; WorkQueue v; int i; + if ((qs = queues) != null && qs.length > (i = sp & SMASK) && + (v = qs[i]) != null && + compareAndSetCtl(c, ((c - RC_UNIT) & UMASK) | + (v.stackPred & LMASK))) { + v.phase = sp; + if (v.parking != 0) + U.unpark(v.owner); + stat = 2; // need 2 * RC_UNIT at unblock + } + } + // Branch 2 (passive RC-only decrement) is intentionally absent. + else if (total < maxTotal && total < MAX_CAP) { // create spare (branch 3) + // TC++ and RC-- so that signalWork sees the pool as under-active. + // Return stat=2 so that beginMonitorCompensatedBlock passes + // 2*RC_UNIT to endCompensatedBlock, balancing both the RC-- + // here and the spare's eventual deactivation (RC--). 
+ long nc = ((c + TC_UNIT) & TC_MASK) | + ((c - RC_UNIT) & RC_MASK) | + (c & LMASK); + if ((runState & STOP) != 0L) + stat = 0; + else if (compareAndSetCtl(c, nc)) { + createWorker(); + stat = 2; // need 2 * RC_UNIT at unblock + } + } + else if (!compareAndSetCtl(c, c)) // validate + ; + else if ((sat = saturate) != null && sat.test(this)) + stat = 0; + else + throw new RejectedExecutionException( + "Thread limit exceeded replacing blocked worker"); + return stat; + } + + /** + * Like beginCompensatedBlock but for carrier threads that are about to + * block on a native OS-level monitor (objectMonitor::enter_internal) while + * pinned by a virtual thread. Uses tryCompensateForMonitor which always + * activates or creates a real replacement carrier (never passive RC-only). + *

<p> tryCompensateForMonitor returns:
+     * <ul>
+     * <li> {@code 2} — branch 1 (activated idle worker with RC--), need 2 × RC_UNIT</li>
+     * <li> {@code 2} — branch 3 (created spare with RC--), need 2 × RC_UNIT</li>
+     * <li> {@code 0} — saturated/stopping, no restoration needed</li>
+     * </ul>
+     *
+ * @return value to pass to endCompensatedBlock + */ + final long beginMonitorCompensatedBlock() { + int c; + do {} while ((c = tryCompensateForMonitor(ctl)) < 0); + return (c == 0) ? 0L : (long)c * RC_UNIT; + } + /** * Re-adjusts parallelism after a blocking operation completes. - * @param post value from beginCompensatedBlock + * @param post value from beginCompensatedBlock or beginMonitorCompensatedBlock */ void endCompensatedBlock(long post) { - if (post > 0L) { + if (post > 0L) getAndAddCtl(post); - } } /** ManagedBlock for external threads */ @@ -4411,9 +4512,14 @@ protected RunnableFuture newTaskFor(Callable callable) { public long beginCompensatedBlock(ForkJoinPool pool) { return pool.beginCompensatedBlock(); } + @Override public void endCompensatedBlock(ForkJoinPool pool, long post) { pool.endCompensatedBlock(post); } + @Override + public long beginMonitorCompensatedBlock(ForkJoinPool pool) { + return pool.beginMonitorCompensatedBlock(); + } }); defaultForkJoinWorkerThreadFactory = new DefaultForkJoinWorkerThreadFactory(); diff --git a/src/java.base/share/classes/java/util/concurrent/LinkedTransferQueue.java b/src/java.base/share/classes/java/util/concurrent/LinkedTransferQueue.java index 118d648c7a2..cff4da50d30 100644 --- a/src/java.base/share/classes/java/util/concurrent/LinkedTransferQueue.java +++ b/src/java.base/share/classes/java/util/concurrent/LinkedTransferQueue.java @@ -590,6 +590,11 @@ final Object xfer(Object e, long ns) { q = p.next; if (p.isData != haveData && haveData != (m != null) && p.cmpExItem(m, e) == m) { + // Full fence (StoreLoad) for Dekker with await() which + // writes waiter then reads item. On ARM64, CAS + // (ldaxr/stlxr) + plain load to a different field does + // NOT provide StoreLoad ordering. + VarHandle.fullFence(); Thread w = p.waiter; // matched complementary node if (p != h && h == cmpExHead(h, (q == null) ? 
p : q)) h.next = h; // advance head; self-link old diff --git a/src/java.base/share/classes/java/util/concurrent/SynchronousQueue.java b/src/java.base/share/classes/java/util/concurrent/SynchronousQueue.java index 49efe5d5c2c..c74a2483aa4 100644 --- a/src/java.base/share/classes/java/util/concurrent/SynchronousQueue.java +++ b/src/java.base/share/classes/java/util/concurrent/SynchronousQueue.java @@ -177,6 +177,11 @@ final Object xferLifo(Object e, long ns) { else if (p.cmpExItem(m, e) != m) p = head; // missed; restart else { // matched complementary node + // Full fence (StoreLoad) for Dekker with await() which + // writes waiter then reads item. On ARM64, CAS + // (ldaxr/stlxr) + plain load to a different field does + // NOT provide StoreLoad ordering. + VarHandle.fullFence(); Thread w = p.waiter; cmpExHead(p, p.next); LockSupport.unpark(w); diff --git a/src/java.base/share/classes/java/util/concurrent/locks/AbstractQueuedSynchronizer.java b/src/java.base/share/classes/java/util/concurrent/locks/AbstractQueuedSynchronizer.java index 0ff216c80a0..1c794032aae 100644 --- a/src/java.base/share/classes/java/util/concurrent/locks/AbstractQueuedSynchronizer.java +++ b/src/java.base/share/classes/java/util/concurrent/locks/AbstractQueuedSynchronizer.java @@ -782,6 +782,13 @@ final int acquire(Node node, int arg, boolean shared, Thread.onSpinWait(); } else if (node.status == 0) { node.status = WAITING; // enable signal and recheck + // Full fence (StoreLoad) to ensure WAITING status is visible + // before re-reading state in tryAcquire/tryAcquireShared + // (Dekker pattern with releaseShared/release which writes + // state then reads node.status in signalNext). + // On ARM64, volatile write (stlr) + volatile read (ldar) to + // different addresses does NOT provide StoreLoad ordering. 
+ U.fullFence(); } else { spins = postSpins = (byte)((postSpins << 1) | 1); try { @@ -1097,6 +1104,13 @@ public final boolean tryAcquireNanos(int arg, long nanosTimeout) */ public final boolean release(int arg) { if (tryRelease(arg)) { + // Full fence (StoreLoad) to ensure the state update from + // tryRelease is visible before reading node.status in signalNext + // (Dekker pattern: release writes state then reads status, + // acquire writes status then reads state). + // On ARM64, CAS (stlxr/release) + ldar to different addresses + // does NOT provide StoreLoad ordering. + U.fullFence(); signalNext(head); return true; } @@ -1184,6 +1198,8 @@ public final boolean tryAcquireSharedNanos(int arg, long nanosTimeout) */ public final boolean releaseShared(int arg) { if (tryReleaseShared(arg)) { + // Full fence (StoreLoad) — see comment in release() + U.fullFence(); signalNext(head); return true; } diff --git a/src/java.base/share/classes/jdk/internal/access/JavaUtilConcurrentFJPAccess.java b/src/java.base/share/classes/jdk/internal/access/JavaUtilConcurrentFJPAccess.java index 28c1b7fddee..5f824dfabbd 100644 --- a/src/java.base/share/classes/jdk/internal/access/JavaUtilConcurrentFJPAccess.java +++ b/src/java.base/share/classes/jdk/internal/access/JavaUtilConcurrentFJPAccess.java @@ -29,4 +29,20 @@ public interface JavaUtilConcurrentFJPAccess { long beginCompensatedBlock(ForkJoinPool pool); void endCompensatedBlock(ForkJoinPool pool, long post); + /** + * Like beginCompensatedBlock but for carrier threads that are about to + * block on a native OS-level monitor (objectMonitor::enter_internal) while + * pinned by a virtual thread. This variant: + *
    + * <ul>
+ *   <li>Never takes the passive RC-only path (branch 2 of tryCompensate);
+ *       it always activates an idle worker or creates a new spare.</li>
+ *   <li>Decrements RC immediately so that signalWork can create a carrier
+ *       for any continuation (e.g. the contested lock's holder) that is
+ *       already in the queue, which would otherwise be starved because
+ *       blocked carriers still count as active in RC.</li>
+ * </ul>
+ * The RC decrement is restored by endCompensatedBlock when the carrier + * unblocks. + */ + long beginMonitorCompensatedBlock(ForkJoinPool pool); } diff --git a/src/java.base/share/classes/jdk/internal/misc/CarrierThread.java b/src/java.base/share/classes/jdk/internal/misc/CarrierThread.java index b314b1823a5..811477c2475 100644 --- a/src/java.base/share/classes/jdk/internal/misc/CarrierThread.java +++ b/src/java.base/share/classes/jdk/internal/misc/CarrierThread.java @@ -89,6 +89,47 @@ public boolean beginBlocking() { } } + /** + * Mark the start of a blocking operation where the carrier thread is about + * to block on a native OS-level monitor (objectMonitor::enter_internal) + * while pinned by a virtual thread. Unlike beginBlocking, this always + * activates an idle worker or creates a new spare carrier (never takes the + * passive RC-only path that creates no worker). endBlocking restores + * the compensateValue (1 or 2 RC_UNITs) when this carrier unblocks. + */ + public boolean beginMonitorBlock() { + assert Thread.currentThread().isVirtual() && JLA.currentCarrierThread() == this; + + // Guard against re-entrant calls. tryCompensateForMonitor may call + // createWorker() which can hit a nested monitor (e.g. ThreadGroup), + // re-entering objectMonitor::enter_with_contention_mark and thus this + // method. Without this check each nested call overwrites + // compensateValue, leaking an RC_UNIT per re-entrant compensation. + if (compensating != NOT_COMPENSATING) { + return false; + } + + // don't preempt when attempting to compensate + Continuation.pin(); + try { + compensating = COMPENSATE_IN_PROGRESS; + + // Uses FJP.tryCompensateForMonitor (branch 2 removed) – always + // activates an idle worker (1*RC_UNIT) or creates a new spare + // with RC-- (2*RC_UNIT to balance both the RC-- and the spare's + // eventual deactivation). 
+ compensateValue = ForkJoinPools.beginMonitorCompensatedBlock(getPool()); + compensating = COMPENSATING; + return true; + } catch (Throwable e) { + // exception starting spare thread + compensating = NOT_COMPENSATING; + throw e; + } finally { + Continuation.unpin(); + } + } + /** * Mark the end of a blocking operation. */ @@ -133,6 +174,9 @@ static long beginCompensatedBlock(ForkJoinPool pool) { static void endCompensatedBlock(ForkJoinPool pool, long post) { FJP_ACCESS.endCompensatedBlock(pool, post); } + static long beginMonitorCompensatedBlock(ForkJoinPool pool) { + return FJP_ACCESS.beginMonitorCompensatedBlock(pool); + } } static { diff --git a/src/java.base/share/classes/jdk/internal/vm/Continuation.java b/src/java.base/share/classes/jdk/internal/vm/Continuation.java index a97f9ac9ea4..6ed31e08bd1 100644 --- a/src/java.base/share/classes/jdk/internal/vm/Continuation.java +++ b/src/java.base/share/classes/jdk/internal/vm/Continuation.java @@ -345,6 +345,7 @@ private boolean isEmpty() { * @throws IllegalStateException if not currently in the given {@code scope}, */ @Hidden + @DontInline @JvmtiHideEvents public static boolean yield(ContinuationScope scope) { Continuation cont = JLA.getContinuation(currentCarrierThread()); diff --git a/test/hotspot/jtreg/compiler/c2/aarch64/TestTrampoline.java b/test/hotspot/jtreg/compiler/c2/aarch64/TestTrampoline.java index 114f7f9bfab..d44440849dd 100644 --- a/test/hotspot/jtreg/compiler/c2/aarch64/TestTrampoline.java +++ b/test/hotspot/jtreg/compiler/c2/aarch64/TestTrampoline.java @@ -23,7 +23,7 @@ */ package compiler.c2.aarch64; - +import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Iterator; import jdk.test.lib.process.OutputAnalyzer; @@ -96,6 +96,26 @@ private static void test(String s, int i) { public static void main(String[] args) { String s = "Returns the char value at the specified index."; + + boolean isWindows = System.getProperty("os.name").startsWith("Windows"); + if (isWindows) { + /* + 
Depending on the Windows code page, the UTF-16 path in the String.charAt method + may be executed during JVM startup. However, it may not execute often enough to + cause it to be inlined by C2. In this case, C2 may not fully inline the + String.charAt method, resulting in the unexpected generation of trampolines. + Therefore, we execute the UTF-16 path in the String.charAt method in a loop + enough times to ensure that it will be inlined by C2. + */ + String jnuEncoding = System.getProperty("sun.jnu.encoding"); + if (jnuEncoding != null && Charset.isSupported(jnuEncoding)) { + byte[] bytes = s.getBytes(Charset.forName(jnuEncoding)); + + for (int i = 0; i < ITERATIONS_TO_HEAT_LOOP; ++i) { + String encoded = new String(bytes, Charset.forName(jnuEncoding)); + } + } + } for (int i = 0; i < ITERATIONS_TO_HEAT_LOOP; ++i) { test(s, i % s.length()); }