diff --git a/projects/rocr-runtime/rocrtst/suites/stress/queue_write_index_concurrent_tests.cc b/projects/rocr-runtime/rocrtst/suites/stress/queue_write_index_concurrent_tests.cc index 8d282a6df81..dd3cb537d92 100755 --- a/projects/rocr-runtime/rocrtst/suites/stress/queue_write_index_concurrent_tests.cc +++ b/projects/rocr-runtime/rocrtst/suites/stress/queue_write_index_concurrent_tests.cc @@ -50,6 +50,11 @@ #include #include #include +#include +#include +#include +#include +#include #include "suites/stress/queue_write_index_concurrent_tests.h" #include "common/base_rocr_utils.h" @@ -579,3 +584,233 @@ void QueueWriteIndexConcurrentTest::QueueLoadStoreWriteIndexAtomic(void) { } } +void QueueWriteIndexConcurrentTest::TestCasNoSpuriousWakeup(void) { + hsa_status_t err; + + if (verbosity() > 0) { + PrintDebugSubtestHeader("Signal CAS No Spurious Wakeup - Performance Comparison"); + } + + pthread_t main_tid = pthread_self(); + std::cout << " [MAIN THREAD TID: " << main_tid << "]" << std::endl; + + // Get CPU agent to use as consumer (forces InterruptSignal creation) + hsa_agent_t* cpu_agent_ptr = cpu_device(); + ASSERT_NE(cpu_agent_ptr, nullptr); + + std::cout << "\n ===== TEST 1: CAS Conditional SetEvent =====" << std::endl; + std::cout << " Testing with failing CAS operations (expected != actual)" << std::endl; + std::cout << " Expected: Minimal context switches " << std::endl; + + const int num_wait_iterations = 5; + long total_ctx_switches_test1 = 0; + long total_ctx_switches_test2 = 0; + long test1_duration_ms = 0; + long test2_duration_ms = 0; + + // Test 1: Fixed implementation - CAS with wrong expected value (fails, no SetEvent) + { + hsa_signal_t signal; + err = hsa_signal_create(0, 1, cpu_agent_ptr, &signal); + ASSERT_EQ(err, HSA_STATUS_SUCCESS); + + std::atomic stop_cas{false}; + std::atomic waiter_ready{false}; + const hsa_signal_value_t target_value = 100; + + auto test1_start = std::chrono::steady_clock::now(); + + // Thread that performs failing CAS operations continuously + std::thread cas_thread([&]() { + pthread_t cas_tid = pthread_self(); + std::cout << " [CAS THREAD TID: " << cas_tid << "] Started" << std::endl; + + int cas_count = 0; + while (!stop_cas.load(std::memory_order_acquire)) { + // Attempt CAS with wrong expected value - always fails + // SetEvent NOT called when CAS fails + hsa_signal_cas_relaxed(signal, 50, 75); + cas_count++; + std::this_thread::sleep_for(std::chrono::milliseconds(5)); + } + std::cout << " [CAS THREAD TID: " << cas_tid << "] Performed " << cas_count + << " failing CAS operations" << std::endl; + }); + + // Give CAS thread time to start + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + // Waiter thread that repeatedly waits + std::thread waiter_thread([&]() { + pthread_t waiter_tid = pthread_self(); + std::cout << " [WAITER THREAD TID: " << waiter_tid << "] Started" << std::endl; + + waiter_ready.store(true, std::memory_order_release); + + struct rusage usage_start, usage_end; + getrusage(RUSAGE_THREAD, &usage_start); + + for (int i = 0; i < num_wait_iterations; ++i) { + std::cout << " [WAITER] Wait iteration " << (i+1) << "/" << num_wait_iterations << std::endl; + + auto start = std::chrono::steady_clock::now(); + hsa_signal_value_t observed = hsa_signal_wait_relaxed( + signal, HSA_SIGNAL_CONDITION_EQ, target_value, + 200000000, HSA_WAIT_STATE_BLOCKED); // 200ms timeout + auto elapsed_ms = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start).count(); + + std::cout << " Returned after " << elapsed_ms << "ms (value=" << observed << ")" << std::endl; + } + + getrusage(RUSAGE_THREAD, &usage_end); + long voluntary = usage_end.ru_nvcsw - usage_start.ru_nvcsw; + long involuntary = usage_end.ru_nivcsw - usage_start.ru_nivcsw; + total_ctx_switches_test1 = voluntary + involuntary; + + std::cout << " [WAITER THREAD TID: " << waiter_tid << "] Exiting" << std::endl; + }); + + // Wait for waiter to be ready + while (!waiter_ready.load(std::memory_order_acquire)) { + std::this_thread::yield(); + } + + waiter_thread.join(); + + // Stop CAS thread + stop_cas.store(true, std::memory_order_release); + cas_thread.join(); + + test1_duration_ms = std::chrono::duration_cast( + std::chrono::steady_clock::now() - test1_start).count(); + + std::cout << "\n TEST 1 RESULTS:" << std::endl; + std::cout << " Total context switches: " << total_ctx_switches_test1 << std::endl; + std::cout << " Avg context switches per wait: " + << std::fixed << std::setprecision(1) + << (double)total_ctx_switches_test1 / num_wait_iterations << std::endl; + std::cout << " Total time: " << test1_duration_ms << "ms" << std::endl; + + hsa_signal_destroy(signal); + + // Verify minimal context switches with fixed implementation + EXPECT_LT(total_ctx_switches_test1, 20) + << "Fixed CAS should have minimal context switches (< 20)"; + } + + std::cout << "\n ===== TEST 2: Simulated Behavior: Unconditional Wake-ups =====" << std::endl; + std::cout << " Testing with regular stores that always call SetEvent" << std::endl; + std::cout << " Expected: High context switches " << std::endl; + + // Test 2: Simulate behavior using regular stores (always triggers SetEvent) + { + hsa_signal_t signal; + err = hsa_signal_create(0, 1, cpu_agent_ptr, &signal); + ASSERT_EQ(err, HSA_STATUS_SUCCESS); + + std::atomic stop_stores{false}; + std::atomic waiter_ready{false}; + const hsa_signal_value_t target_value = 100; + + auto test2_start = std::chrono::steady_clock::now(); + + // Thread that performs stores (always calls SetEvent, simulating CAS) + std::thread store_thread([&]() { + pthread_t store_tid = pthread_self(); + std::cout << " [STORE THREAD TID: " << store_tid << "] Started" << std::endl; + + int store_count = 0; + while (!stop_stores.load(std::memory_order_acquire)) { + // Regular store always calls SetEvent (simulates unconditional wake-up) + hsa_signal_store_relaxed(signal, 0); + store_count++; + std::this_thread::sleep_for(std::chrono::milliseconds(5)); + } + + std::cout << " [STORE THREAD TID: " << store_tid << "] Performed " << store_count + << " stores (each triggers SetEvent)" << std::endl; + }); + + // Give store thread time to start + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + // Waiter thread that repeatedly waits + std::thread waiter_thread([&]() { + pthread_t waiter_tid = pthread_self(); + std::cout << " [WAITER THREAD TID: " << waiter_tid << "] Started" << std::endl; + + waiter_ready.store(true, std::memory_order_release); + + struct rusage usage_start, usage_end; + getrusage(RUSAGE_THREAD, &usage_start); + + for (int i = 0; i < num_wait_iterations; ++i) { + std::cout << " [WAITER] Wait iteration " << (i+1) << "/" << num_wait_iterations << std::endl; + + auto start = std::chrono::steady_clock::now(); + hsa_signal_value_t observed = hsa_signal_wait_relaxed( + signal, HSA_SIGNAL_CONDITION_EQ, target_value, + 200000000, HSA_WAIT_STATE_BLOCKED); // 200ms timeout + auto elapsed_ms = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start).count(); + + std::cout << " Returned after " << elapsed_ms << "ms (value=" << observed << ")" << std::endl; + } + + getrusage(RUSAGE_THREAD, &usage_end); + long voluntary = usage_end.ru_nvcsw - usage_start.ru_nvcsw; + long involuntary = usage_end.ru_nivcsw - usage_start.ru_nivcsw; + total_ctx_switches_test2 = voluntary + involuntary; + + std::cout << " [WAITER THREAD TID: " << waiter_tid << "] Exiting" << std::endl; + }); + + // Wait for waiter to be ready + while (!waiter_ready.load(std::memory_order_acquire)) { + std::this_thread::yield(); + } + + waiter_thread.join(); + + // Stop store thread + stop_stores.store(true, std::memory_order_release); + store_thread.join(); + + test2_duration_ms = std::chrono::duration_cast( + std::chrono::steady_clock::now() - test2_start).count(); + + std::cout << "\n TEST 2 RESULTS:" << std::endl; + std::cout << " Total context switches: " << total_ctx_switches_test2 << std::endl; + std::cout << " Avg context switches per wait: " + << std::fixed << std::setprecision(1) + << (double)total_ctx_switches_test2 / num_wait_iterations << std::endl; + std::cout << " Total time: " << test2_duration_ms << "ms" << std::endl; + + hsa_signal_destroy(signal); + + // Verify high context switches with simulated behavior + EXPECT_GT(total_ctx_switches_test2, 50) + << "Simulated behavior should have many context switches (> 50)"; + } + + // Calculate and print efficiency comparison + double efficiency_ratio = (total_ctx_switches_test1 > 0) + ? (double)total_ctx_switches_test2 / (double)total_ctx_switches_test1 + : 0.0; + + std::cout << "\n ===== EFFICIENCY COMPARISON =====" << std::endl; + std::cout << " Test 1 (Conditional SetEvent): " << total_ctx_switches_test1 + << " context switches" << std::endl; + std::cout << " Test 2 (Unconditional Wake-ups): " << total_ctx_switches_test2 + << " context switches" << std::endl; + std::cout << " Efficiency improvement: " << std::fixed << std::setprecision(1) + << efficiency_ratio << "x fewer context switches with fix" << std::endl; + std::cout << " ==================================" << std::endl; + + if (verbosity() > 0) { + std::cout << "subtest Passed" << std::endl; + std::cout << kSubTestSeparator << std::endl; + } +} + diff --git a/projects/rocr-runtime/rocrtst/suites/stress/queue_write_index_concurrent_tests.h b/projects/rocr-runtime/rocrtst/suites/stress/queue_write_index_concurrent_tests.h index c4185c31646..fb9e6753a66 100755 --- a/projects/rocr-runtime/rocrtst/suites/stress/queue_write_index_concurrent_tests.h +++ b/projects/rocr-runtime/rocrtst/suites/stress/queue_write_index_concurrent_tests.h @@ -90,6 +90,10 @@ class QueueWriteIndexConcurrentTest : public TestBase { // concurrently. void QueueLoadStoreWriteIndexAtomic(void); + // @Brief: Verifies that signal CAS operations do not cause spurious wake-ups + // when they fail, ensuring efficient wait/wake behavior. + void TestCasNoSpuriousWakeup(void); + private: void QueueAddWriteIndexAtomic(hsa_agent_t cpuAgent, hsa_agent_t gpuAgent); void QueueCasWriteIndexAtomic(hsa_agent_t cpuAgent, hsa_agent_t gpuAgent); diff --git a/projects/rocr-runtime/rocrtst/suites/test_common/main.cc b/projects/rocr-runtime/rocrtst/suites/test_common/main.cc index 4299caae875..738a2bd4191 100644 --- a/projects/rocr-runtime/rocrtst/suites/test_common/main.cc +++ b/projects/rocr-runtime/rocrtst/suites/test_common/main.cc @@ -780,6 +780,13 @@ TEST(rocrtstStress, Queue_LoadStore_Write_Index_ConcurrentTest) { RunCustomTestEpilog(&Qw); } +TEST(rocrtstStress, Signal_CAS_No_Spurious_Wakeup) { + QueueWriteIndexConcurrentTest Qw(false, false, false); + if (!RunCustomTestProlog(&Qw)) return; + Qw.TestCasNoSpuriousWakeup(); + RunCustomTestEpilog(&Qw); +} + TEST(rocrtstPerf, Memory_Async_Copy) { MemoryAsyncCopy mac; // To do full test, uncomment this: diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/interrupt_signal.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/interrupt_signal.cpp index f35b98edc01..ce9350a907a 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/interrupt_signal.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/interrupt_signal.cpp @@ -340,7 +340,9 @@ hsa_signal_value_t InterruptSignal::CasRelaxed(hsa_signal_value_t expected, hsa_signal_value_t ret = hsa_signal_value_t( atomic::Cas(&signal_.value, int64_t(value), int64_t(expected), std::memory_order_relaxed)); - SetEvent(); + // Only wake waiters if CAS succeeded (signal value actually changed) + if (ret == expected) + SetEvent(); return ret; } @@ -349,7 +351,9 @@ hsa_signal_value_t InterruptSignal::CasAcquire(hsa_signal_value_t expected, hsa_signal_value_t ret = hsa_signal_value_t( atomic::Cas(&signal_.value, int64_t(value), int64_t(expected), std::memory_order_acquire)); - SetEvent(); + // Only wake waiters if CAS succeeded (signal value actually changed) + if (ret == expected) + SetEvent(); return ret; } @@ -358,7 +362,9 @@ hsa_signal_value_t InterruptSignal::CasRelease(hsa_signal_value_t expected, hsa_signal_value_t ret = hsa_signal_value_t( atomic::Cas(&signal_.value, int64_t(value), int64_t(expected), std::memory_order_release)); - SetEvent(); + // Only wake waiters if CAS succeeded (signal value actually changed) + if (ret == expected) + SetEvent(); return ret; } @@ -367,7 +373,9 @@ hsa_signal_value_t InterruptSignal::CasAcqRel(hsa_signal_value_t expected, hsa_signal_value_t ret = hsa_signal_value_t( atomic::Cas(&signal_.value, int64_t(value), int64_t(expected), std::memory_order_acq_rel)); - SetEvent(); + // Only wake waiters if CAS succeeded (signal value actually changed) + if (ret == expected) + SetEvent(); return ret; } /// @brief Notify driver of signal value change if necessary.