Commit 2fbabd7
ggml: Implement yield barrier using futex for improved thread scheduling efficiency
Parent: 42eb248

This commit adds an opt-in GGML_YIELD_BARRIER build option that swaps ggml's pure spin barrier for a hybrid one: each worker spins for a bounded number of iterations, then sleeps in the kernel on a futex (Linux/Android) or on __ulock (macOS) until the last thread through the barrier wakes it. The spin budget is tuned per machine by a new ggml_barrier_spin_count() helper.

5 files changed: +231 −2

ggml/CMakeLists.txt (+1)

@@ -189,6 +189,7 @@ set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
                                             "ggml: metal minimum macOS version")
 set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)")
 option(GGML_OPENMP        "ggml: use OpenMP" ON)
+option(GGML_YIELD_BARRIER "ggml: replace spin barrier with yield barrier to improve scheduling efficiency" OFF)
 option(GGML_RPC           "ggml: use RPC" OFF)
 option(GGML_SYCL          "ggml: use SYCL" OFF)
 option(GGML_SYCL_F16      "ggml: use 16 bit floats for sycl calculations" OFF)
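
The option defaults to OFF, so the yield barrier has to be opted into at configure time. Assuming a standard CMake build of llama.cpp/ggml, that would look like:

  cmake -B build -DGGML_YIELD_BARRIER=ON
  cmake --build build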

ggml/src/ggml-cpu/CMakeLists.txt (+4)

@@ -59,6 +59,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
        endif()
    endif()

+   if (GGML_YIELD_BARRIER)
+       target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_YIELD_BARRIER)
+   endif()
+
    if (GGML_LLAMAFILE)
        target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_LLAMAFILE)

ggml/src/ggml-cpu/ggml-cpu-aarch64.h (+8)

@@ -6,3 +6,11 @@
 // GGML internal header

 ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
+
+__BEGIN_DECLS
+
+#if defined(GGML_YIELD_BARRIER)
+size_t ggml_barrier_spin_count(unsigned int n_threads);
+#endif
+
+__END_DECLS
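
ggml_barrier_spin_count() is implemented in C++ (ggml-cpu.cpp, below) but called from C (ggml-cpu.c), so the declaration needs C linkage when this header is included from C++. That is what the __BEGIN_DECLS/__END_DECLS pair provides; roughly how glibc/BSD sys/cdefs.h defines the wrappers:

/* sketch of the sys/cdefs.h definitions these wrappers rely on */
#ifdef __cplusplus
#define __BEGIN_DECLS extern "C" {
#define __END_DECLS   }
#else
#define __BEGIN_DECLS
#define __END_DECLS
#endif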

ggml/src/ggml-cpu/ggml-cpu.c (+85 −2)

@@ -3,6 +3,7 @@

 #include "ggml-backend-impl.h"
 #include "ggml-backend.h"
+#include "ggml-cpu-aarch64.h"
 #include "ggml-cpu-traits.h"
 #include "ggml-cpu-impl.h"
 #include "ggml-cpu.h"
@@ -118,7 +119,6 @@ struct ggml_arm_arch_features_type {
 } ggml_arm_arch_features = {-1, -1, -1, -1, 0, -1};
 #endif

-
 #if defined(_WIN32)

 #define WIN32_LEAN_AND_MEAN
@@ -1385,6 +1385,9 @@ struct ggml_threadpool {
     struct ggml_compute_state * workers; // per thread state
     int n_threads_max;                   // number of threads in the pool
     atomic_int n_threads_cur;            // number of threads used in the current graph
+#if defined(GGML_YIELD_BARRIER)
+    size_t n_barrier_spin_count;
+#endif

     int32_t prio;  // Scheduling priority
     uint32_t poll; // Polling level (0 - no polling)
@@ -2450,6 +2453,63 @@ struct ggml_state {

 static struct ggml_state g_state = {0};

+#if defined(__gnu_linux__) || defined(__ANDROID__)
+#include <sys/syscall.h>
+#define FUTEX_WAIT 0
+#define FUTEX_WAKE 1
+#define FUTEX_PRIVATE_FLAG 128
+#define FUTEX_WAIT_PRIVATE (FUTEX_WAIT | FUTEX_PRIVATE_FLAG)
+#define FUTEX_WAKE_PRIVATE (FUTEX_WAKE | FUTEX_PRIVATE_FLAG)
+#define futex_wait(uaddr, val) syscall(SYS_futex, uaddr, FUTEX_WAIT_PRIVATE, val, NULL, NULL, 0)
+#define futex_wake(uaddr, n)   syscall(SYS_futex, uaddr, FUTEX_WAKE_PRIVATE, n, NULL, NULL, 0)
+#elif defined(__APPLE__)
+#include <stdatomic.h>
+
+extern int __ulock_wait(uint32_t operation, volatile int *addr, uint64_t value, uint32_t timeout);
+extern int __ulock_wake(uint32_t operation, volatile int *addr, uint64_t wake_value);
+
+#define UL_COMPARE_AND_WAIT 1
+
+#define ULF_WAKE_ALL    0x00000100
+#define ULF_WAKE_THREAD 0x00000200
+
+static int futex_wait(volatile int *addr, int expected) {
+    int op = UL_COMPARE_AND_WAIT;
+    int ret = __ulock_wait(op, (void *)addr, (uint64_t)expected, 0);
+    if (ret == -1) {
+        return -1;
+    }
+    return 0;
+}
+
+static int futex_wake(volatile int *addr, int count) {
+    if (count <= 0) {
+        return 0;
+    }
+    uint32_t op = UL_COMPARE_AND_WAIT;
+    if (count == INT_MAX) {
+        op |= ULF_WAKE_ALL;
+        if (__ulock_wake(op, (void *)addr, 0) == -1) {
+            return -1;
+        }
+        return 0;
+    }
+    int woken = 0;
+    for (int i = 0; i < count; ++i) {
+        if (__ulock_wake(op, (void *)addr, 0) == -1) {
+            if (errno == ENOENT || errno == ESRCH) {
+                break;
+            } else {
+                return -1;
+            }
+        }
+        woken++;
+    }
+    return woken;
+}
+
+#endif
+
 void ggml_barrier(struct ggml_threadpool * tp) {
     int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
     if (n_threads == 1) {
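
On Linux and Android the shim maps directly onto the futex(2) syscall; on macOS it emulates the same interface with the undocumented __ulock_wait/__ulock_wake primitives. The contract in both cases: futex_wait(addr, val) puts the caller to sleep only if *addr still equals val (otherwise it returns immediately), and futex_wake(addr, n) wakes up to n sleepers. A minimal standalone illustration of that protocol (Linux-only, hypothetical demo code, not part of the patch):

// demo_futex.c — build with: gcc -pthread demo_futex.c
#include <limits.h>
#include <linux/futex.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static atomic_int word;

static void * waiter(void * arg) {
    (void) arg;
    int seen = atomic_load(&word);
    while (atomic_load(&word) == seen) {
        // Sleeps only while `word` still equals `seen`; if the waker gets in
        // between our load and the syscall, the kernel returns EAGAIN and we
        // re-check instead of missing the update.
        syscall(SYS_futex, &word, FUTEX_WAIT_PRIVATE, seen, NULL, NULL, 0);
    }
    printf("woke up: word = %d\n", atomic_load(&word));
    return NULL;
}

int main(void) {
    pthread_t t;
    pthread_create(&t, NULL, waiter, NULL);
    usleep(100 * 1000);         // give the waiter time to block
    atomic_fetch_add(&word, 1); // publish the change first ...
    syscall(SYS_futex, &word, FUTEX_WAKE_PRIVATE, INT_MAX, NULL, NULL, 0); // ... then wake everyone
    pthread_join(t, NULL);
    return 0;
}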
@@ -2470,14 +2530,34 @@ void ggml_barrier(struct ggml_threadpool * tp) {

         // exit barrier (full seq-cst fence)
         atomic_fetch_add_explicit(&tp->n_barrier_passed, 1, memory_order_seq_cst);
+#if defined(GGML_YIELD_BARRIER)
+        // wake up all threads
+        futex_wake(&tp->n_barrier_passed, INT_MAX);
+#endif
         return;
     }

+#if !defined(GGML_YIELD_BARRIER)
     // wait for other threads
     while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) {
         ggml_thread_cpu_relax();
     }
+#else
+    size_t spin_count = tp->n_barrier_spin_count;
+    size_t i;
+    do {
+        for (i = 0; i < spin_count; i++) {
+            if (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) != n_passed) {
+                goto exit_barrier;
+            }
+            ggml_thread_cpu_relax();
+        }

+        futex_wait(&tp->n_barrier_passed, n_passed);
+    } while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed);
+    return;
+exit_barrier:
+#endif
     // exit barrier (full seq-cst fence)
     // TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead
 #ifdef GGML_TSAN_ENABLED
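
The structure is a classic spin-then-sleep barrier. Each waiter first spins up to n_barrier_spin_count iterations, which keeps the fast path as cheap as the old pure-spin barrier when every core is making progress; only if the generation counter still has not advanced does it fall into futex_wait(), letting the kernel run something else. A wakeup cannot be lost because futex_wait(addr, val) atomically re-checks *addr == val in the kernel: if the last thread bumps n_barrier_passed and calls futex_wake() between a waiter's spin loop and its sleep, the wait returns immediately. Distilled to its shape (a sketch against the futex shim above, assuming <stdatomic.h>; not the exact ggml code):

// Spin-then-sleep wait for a generation counter to advance past `seen`.
// Assumes the futex_wait() shim from the hunk above is in scope.
static void wait_generation(atomic_int * gen, int seen, size_t spin_count) {
    for (;;) {
        for (size_t i = 0; i < spin_count; i++) {
            if (atomic_load_explicit(gen, memory_order_relaxed) != seen) {
                return; // advanced while we were spinning: the cheap path
            }
        }
        // Spin budget exhausted: sleep until woken. The kernel re-checks
        // *gen == seen before blocking, so a concurrent wake is not lost.
        futex_wait(gen, seen);
    }
}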
@@ -13126,7 +13206,7 @@ static bool ggml_thread_apply_affinity(const bool * mask) {

     for (uint32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
         if (mask[i]) {
-            GGML_PRINT_DEBUG("Thread %lx: adding %d to cpuset\n", pthread_self(), i);
+            printf("Thread %lx: adding %d to cpuset\n", pthread_self(), i);
             CPU_SET(i, &cpuset);
         }
     }
@@ -13680,6 +13760,9 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
         threadpool->poll         = tpp->poll;
         threadpool->prio         = tpp->prio;
         threadpool->ec           = GGML_STATUS_SUCCESS;
+#if defined(GGML_YIELD_BARRIER)
+        threadpool->n_barrier_spin_count = ggml_barrier_spin_count(tpp->n_threads);
+#endif
     }

     // Allocate and init workers state

ggml/src/ggml-cpu/ggml-cpu.cpp (+133)

@@ -572,6 +572,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t reg) {
     #ifdef GGML_USE_ACCELERATE
         features.push_back({ "ACCELERATE", "1" });
     #endif
+    #ifdef GGML_YIELD_BARRIER
+        features.push_back({ "YIELD_BARRIER", "1" });
+    #endif
     #ifdef GGML_USE_CPU_HBM
        features.push_back({ "CPU_HBM", "1" });
    #endif
@@ -633,6 +636,136 @@ static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const char * name) {
     GGML_UNUSED(reg);
 }

+#if defined(GGML_YIELD_BARRIER)
+#include <thread>
+
+#if defined(__x86_64__)
+#if defined(_MSC_VER)
+#include <intrin.h>
+static void cpuid(int leaf, int subleaf, int *eax, int *ebx, int *ecx, int *edx) {
+    int regs[4];
+    __cpuidex(regs, leaf, subleaf);
+    *eax = regs[0];
+    *ebx = regs[1];
+    *ecx = regs[2];
+    *edx = regs[3];
+}
+#elif defined(__GNUC__) || defined(__clang__)
+static void cpuid(int leaf, int subleaf, int *eax, int *ebx, int *ecx, int *edx) {
+    __asm__ volatile (
+        "cpuid"
+        : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx)
+        : "a"(leaf), "c"(subleaf)
+    );
+}
+#else
+#error Unsupported compiler
+#endif
+
+static bool cpu_is_hybrid() {
+    int eax, ebx, ecx, edx;
+    cpuid(7, 0, &eax, &ebx, &ecx, &edx);
+    return !!(edx & (1u << 15)); // CPUID.07H:EDX bit 15 is the hybrid flag
+}
+
+#elif defined(__aarch64__) && defined(__gnu_linux__)
+
+bool cpu_is_hybrid() {
+    FILE *fp = fopen("/proc/cpuinfo", "r");
+    if (!fp) {
+        return false;
+    }
+
+    char line[256];
+    char first_cpu_part[64] = {0};
+    bool found_first = false;
+    bool hybrid = false;
+
+    while (fgets(line, sizeof(line), fp)) {
+        if (strncmp(line, "CPU part", 8) == 0 || strncmp(line, "cpu part", 8) == 0) {
+            char *colon = strchr(line, ':');
+            if (colon) {
+                colon++;
+                while (*colon == ' ' || *colon == '\t') {
+                    colon++;
+                }
+                char *newline = strchr(colon, '\n');
+                if (newline) {
+                    *newline = '\0';
+                }
+
+                if (!found_first) {
+                    strncpy(first_cpu_part, colon, sizeof(first_cpu_part)-1);
+                    found_first = true;
+                } else {
+                    if (strcmp(first_cpu_part, colon) != 0) {
+                        hybrid = true; // two distinct core types seen
+                        break;
+                    }
+                }
+            }
+        }
+    }
+    fclose(fp);
+    return hybrid;
+}
+
+#elif defined(__aarch64__) && defined(__APPLE__)
+
+bool cpu_is_hybrid() {
+    int64_t cpu_count = 0;
+    size_t size = sizeof(cpu_count);
+    if (sysctlbyname("hw.perflevel1.physicalcpu", &cpu_count, &size, NULL, 0) == 0) {
+        return cpu_count > 0; // a populated second perf level implies P+E cores
+    }
+    return false;
+}
+
+#else
+
+bool cpu_is_hybrid() {
+    return false;
+}
+
+#endif
+
+#if defined(__gnu_linux__)
+static size_t get_affinity_cores() {
+    cpu_set_t set;
+    int num_cores = 0;
+
+    CPU_ZERO(&set);
+    if (sched_getaffinity(0, sizeof(cpu_set_t), &set) == -1) {
+        return std::thread::hardware_concurrency();
+    }
+
+    for (int i = 0; i < CPU_SETSIZE; ++i) {
+        if (CPU_ISSET(i, &set)) {
+            num_cores++;
+        }
+    }
+
+    return num_cores;
+}
+#else
+static size_t get_affinity_cores() {
+    return std::thread::hardware_concurrency();
+}
+#endif
+
+extern "C"
+size_t ggml_barrier_spin_count(unsigned int n_threads) {
+    size_t count = 30000;
+    if (n_threads > get_affinity_cores()) {
+        count = 100;
+    }
+    if (cpu_is_hybrid()) {
+        count = 1;
+    }
+    return count;
+}
+#endif

 static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
     /* .get_name         = */ ggml_backend_cpu_reg_get_name,
     /* .get_device_count = */ ggml_backend_cpu_reg_get_device_count,
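
The spin budget at the end of the hunk encodes three regimes: dedicated homogeneous cores spin long (30000 iterations) and almost never enter the kernel; more threads than cores available to the process spin only briefly (100 iterations), since some runnable worker is surely preempted; and hybrid big/little CPUs spin just once, since a worker scheduled on a slow core makes everyone else's spinning pure waste. A hypothetical illustration of how the heuristic resolves (not part of the patch; pick_spin_count mirrors ggml_barrier_spin_count() with the hardware probes stubbed out):

#include <stddef.h>
#include <stdio.h>

static size_t pick_spin_count(unsigned n_threads, size_t n_cores, int hybrid) {
    size_t count = 30000;         // dedicated cores: spin long, sleep rarely
    if (n_threads > n_cores) {
        count = 100;              // oversubscribed: get off the CPU quickly
    }
    if (hybrid) {
        count = 1;                // big/little: sleep almost immediately
    }
    return count;
}

int main(void) {
    printf("16 threads / 16 cores, homogeneous: %zu\n", pick_spin_count(16, 16, 0)); // 30000
    printf("32 threads / 16 cores, homogeneous: %zu\n", pick_spin_count(32, 16, 0)); // 100
    printf(" 8 threads / 16 cores, hybrid:      %zu\n", pick_spin_count(8, 16, 1));  // 1
    return 0;
}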
