diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index d33f843b417cf..beadd55738150 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -189,6 +189,7 @@
 set   (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING "ggml: metal minimum macOS version")
 set   (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)")
 option(GGML_OPENMP "ggml: use OpenMP" ON)
+option(GGML_YIELD_BARRIER "ggml: replace spin barrier with yield barrier to improve scheduling efficiency" OFF)
 option(GGML_RPC "ggml: use RPC" OFF)
 option(GGML_SYCL "ggml: use SYCL" OFF)
 option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt
index c8cc32fa5afcc..7a315e6edc961 100644
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -59,6 +59,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         endif()
     endif()
 
+    if (GGML_YIELD_BARRIER)
+        target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_YIELD_BARRIER)
+    endif()
+
     if (GGML_LLAMAFILE)
         target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_LLAMAFILE)
diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.h b/ggml/src/ggml-cpu/ggml-cpu-aarch64.h
index 6e84c826b4091..e6e9f6d05e4f4 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.h
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.h
@@ -6,3 +6,15 @@
 // GGML internal header
 
 ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(GGML_YIELD_BARRIER)
+size_t ggml_barrier_spin_count(unsigned int n_threads);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index f3925e17a2b8e..1cb48845f127f 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -3,6 +3,7 @@
 
 #include "ggml-backend-impl.h"
 #include "ggml-backend.h"
+#include "ggml-cpu-aarch64.h"
 #include "ggml-cpu-traits.h"
 #include "ggml-cpu-impl.h"
 #include "ggml-cpu.h"
@@ -118,7 +119,6 @@ struct ggml_arm_arch_features_type {
 } ggml_arm_arch_features = {-1, -1, -1, -1, 0, -1};
 #endif
 
-
 #if defined(_WIN32)
 
 #define WIN32_LEAN_AND_MEAN
@@ -1385,6 +1385,9 @@ struct ggml_threadpool {
     struct ggml_compute_state * workers;   // per thread state
    int n_threads_max;                     // number of threads in the pool
     atomic_int n_threads_cur;              // number of threads used in the current graph
+#if defined(GGML_YIELD_BARRIER)
+    size_t n_barrier_spin_count;
+#endif
 
     int32_t  prio;       // Scheduling priority
     uint32_t poll;       // Polling level (0 - no polling)
@@ -2450,6 +2453,63 @@ struct ggml_state {
 
 static struct ggml_state g_state = {0};
 
+#if defined(__gnu_linux__) || defined(__ANDROID__)
+#include <sys/syscall.h>
+#define FUTEX_WAIT 0
+#define FUTEX_WAKE 1
+#define FUTEX_PRIVATE_FLAG 128
+#define FUTEX_WAIT_PRIVATE (FUTEX_WAIT | FUTEX_PRIVATE_FLAG)
+#define FUTEX_WAKE_PRIVATE (FUTEX_WAKE | FUTEX_PRIVATE_FLAG)
+#define futex_wait(uaddr, val) syscall(SYS_futex, uaddr, FUTEX_WAIT_PRIVATE, val, NULL, NULL, 0)
+#define futex_wake(uaddr, n)   syscall(SYS_futex, uaddr, FUTEX_WAKE_PRIVATE, n, NULL, NULL, 0)
+#elif defined(__APPLE__)
+#include <errno.h>
+
+extern int __ulock_wait(uint32_t operation, volatile int *addr, uint64_t value, uint32_t timeout);
+extern int __ulock_wake(uint32_t operation, volatile int *addr, uint64_t wake_value);
+
+#define UL_COMPARE_AND_WAIT 1
+
+#define ULF_WAKE_ALL    0x00000100
+#define ULF_WAKE_THREAD 0x00000200
+
+static int futex_wait(volatile int *addr, int expected) {
+    int op = UL_COMPARE_AND_WAIT;
+    int ret = __ulock_wait(op, (volatile void *)addr, (uint64_t)expected, 0);
+    if (ret == -1) {
+        return -1;
+    }
+    return 0;
+}
+
+static int futex_wake(volatile int *addr, int count) {
+    if (count <= 0) {
+        return 0;
+    }
+    uint32_t op = UL_COMPARE_AND_WAIT;
+    if (count == INT_MAX) {
+        op |= ULF_WAKE_ALL;
+        if (__ulock_wake(op, (volatile void *)addr, 0) == -1) {
+            return -1;
+        }
+        return 0;
+    }
+    int woken = 0;
+    for (int i = 0; i < count; ++i) {
+        if (__ulock_wake(op, (volatile void *)addr, 0) == -1) {
+            if (errno == ENOENT || errno == ESRCH) {
+                break;
+            } else {
+                return -1;
+            }
+        }
+        woken++;
+    }
+    return woken;
+}
+
+#endif
+
 void ggml_barrier(struct ggml_threadpool * tp) {
     int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
     if (n_threads == 1) {
@@ -2470,14 +2530,34 @@ void ggml_barrier(struct ggml_threadpool * tp) {
 
         // exit barrier (full seq-cst fence)
         atomic_fetch_add_explicit(&tp->n_barrier_passed, 1, memory_order_seq_cst);
+#if defined(GGML_YIELD_BARRIER)
+        // wake up all threads
+        futex_wake(&tp->n_barrier_passed, INT_MAX);
+#endif
         return;
     }
 
+#if !defined(GGML_YIELD_BARRIER)
     // wait for other threads
     while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) {
         ggml_thread_cpu_relax();
     }
+#else
+    size_t spin_count = tp->n_barrier_spin_count;
+    size_t i;
+    do {
+        for (i = 0; i < spin_count; i++) {
+            if (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) != n_passed) {
+                goto exit_barrier;
+            }
+            ggml_thread_cpu_relax();
+        }
+        futex_wait(&tp->n_barrier_passed, n_passed);
+    } while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed);
+    return;
+exit_barrier:
+#endif
 
     // exit barrier (full seq-cst fence)
     // TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead
 #ifdef GGML_TSAN_ENABLED
@@ -13126,7 +13206,7 @@ static bool ggml_thread_apply_affinity(const bool * mask) {
 
     for (uint32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
         if (mask[i]) {
-            GGML_PRINT_DEBUG("Thread %lx: adding %d to cpuset\n", pthread_self(), i);
+            printf("Thread %lx: adding %d to cpuset\n", pthread_self(), i);
             CPU_SET(i, &cpuset);
         }
     }
@@ -13680,6 +13760,9 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
         threadpool->poll         = tpp->poll;
         threadpool->prio         = tpp->prio;
         threadpool->ec           = GGML_STATUS_SUCCESS;
+#if defined(GGML_YIELD_BARRIER)
+        threadpool->n_barrier_spin_count = ggml_barrier_spin_count(tpp->n_threads);
+#endif
     }
 
     // Allocate and init workers state
diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
index 09f8382b988a4..496c528e2896f 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -572,6 +572,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
     #ifdef GGML_USE_ACCELERATE
         features.push_back({ "ACCELERATE", "1" });
     #endif
+    #ifdef GGML_YIELD_BARRIER
+        features.push_back({ "YIELD_BARRIER", "1" });
+    #endif
     #ifdef GGML_USE_CPU_HBM
         features.push_back({ "CPU_HBM", "1" });
     #endif
@@ -633,6 +636,136 @@ static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const ch
     GGML_UNUSED(reg);
 }
 
+#if defined(GGML_YIELD_BARRIER)
+#include <thread>
+
+#if defined(__x86_64__)
+#if defined(_MSC_VER)
+#include <intrin.h>
+static void cpuid(int leaf, int subleaf, int *eax, int *ebx, int *ecx, int *edx) {
+    int regs[4];
+    __cpuidex(regs, leaf, subleaf);
+    *eax = regs[0];
+    *ebx = regs[1];
+    *ecx = regs[2];
+    *edx = regs[3];
+}
+#elif defined(__GNUC__) || defined(__clang__)
+static void cpuid(int leaf, int subleaf, int *eax, int *ebx, int *ecx, int *edx) {
+    __asm__ volatile (
+        "cpuid"
+        : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx)
+        : "a"(leaf), "c"(subleaf)
+    );
+}
+#else
+    #error Unsupported compiler
+#endif
+
+static bool cpu_is_hybrid() {
+    int eax, ebx, ecx, edx;
+    cpuid(7, 0, &eax, &ebx, &ecx, &edx);
+    return !!(edx & (1u << 15));
+}
+
+#elif defined(__aarch64__) && defined(__gnu_linux__)
+
+bool cpu_is_hybrid() {
+    FILE *fp = fopen("/proc/cpuinfo", "r");
+    if (!fp) {
+        return false;
+    }
+
+    char line[256];
+    char first_cpu_part[64] = {0};
+    bool found_first = false;
+    bool hybrid = false;
+
+    while (fgets(line, sizeof(line), fp)) {
+        if (strncmp(line, "CPU part", 8) == 0 || strncmp(line, "cpu part", 8) == 0) {
+            char *colon = strchr(line, ':');
+            if (colon) {
+                colon++;
+                while (*colon == ' ' || *colon == '\t') {
+                    colon++;
+                }
+                char *newline = strchr(colon, '\n');
+                if (newline) {
+                    *newline = '\0';
+                }
+
+                if (!found_first) {
+                    strncpy(first_cpu_part, colon, sizeof(first_cpu_part)-1);
+                    found_first = true;
+                } else {
+                    if (strcmp(first_cpu_part, colon) != 0) {
+                        hybrid = true;
+                        break;
+                    }
+                }
+            }
+        }
+    }
+    fclose(fp);
+    return hybrid;
+}
+
+#elif defined(__aarch64__) && defined(__APPLE__)
+
+bool cpu_is_hybrid() {
+    int64_t cpu_count = 0;
+    size_t size = sizeof(cpu_count);
+    if (sysctlbyname("hw.perflevel1.physicalcpu", &cpu_count, &size, NULL, 0) == 0) {
+        return cpu_count > 0;
+    }
+    return false;
+}
+
+#else
+
+bool cpu_is_hybrid() {
+    return false;
+}
+
+#endif
+
+#if defined(__gnu_linux__)
+static size_t get_affinity_cores() {
+    cpu_set_t set;
+    int num_cores = 0;
+
+    CPU_ZERO(&set);
+    if (sched_getaffinity(0, sizeof(cpu_set_t), &set) == -1) {
+        return std::thread::hardware_concurrency();
+    }
+
+    for (int i = 0; i < CPU_SETSIZE; ++i) {
+        if (CPU_ISSET(i, &set)) {
+            num_cores++;
+        }
+    }
+
+    return num_cores;
+}
+#else
+static size_t get_affinity_cores() {
+    return std::thread::hardware_concurrency();
+}
+#endif
+
+extern "C"
+size_t ggml_barrier_spin_count(unsigned int n_threads) {
+    size_t count = 30000;
+    if (n_threads > get_affinity_cores()) {
+        count = 100;
+    }
+    if (cpu_is_hybrid()) {
+        count = 1;
+    }
+    return count;
+}
+#endif
+
 static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
     /* .get_name         = */ ggml_backend_cpu_reg_get_name,
     /* .get_device_count = */ ggml_backend_cpu_reg_get_device_count,
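How the yield barrier works: the patched ggml_barrier() is spin-then-park. Each non-last thread polls tp->n_barrier_passed for up to tp->n_barrier_spin_count iterations; if the barrier still has not opened, it sleeps in the kernel via futex_wait (FUTEX_WAIT on Linux/Android, __ulock_wait on macOS), and the last thread to arrive bumps n_barrier_passed and releases all sleepers with futex_wake(..., INT_MAX). ggml_barrier_spin_count() picks the spin budget: 30000 by default, 100 when the pool is oversubscribed relative to the affinity mask, and 1 on hybrid CPUs (detected via CPUID leaf 7 EDX bit 15 on x86-64, differing "CPU part" values in /proc/cpuinfo on aarch64 Linux, and hw.perflevel1.physicalcpu on Apple silicon). The feature is opt-in at configure time with -DGGML_YIELD_BARRIER=ON.

The sketch below is a standalone illustration of the same protocol, not part of the patch. It is Linux-only; demo_barrier, N_THREADS, and SPIN_COUNT are hypothetical stand-ins for ggml_barrier, the pool size, and tp->n_barrier_spin_count, and the single-thread fast path and TSAN fallback are omitted. Build with: cc -O2 -pthread demo.c

#include <limits.h>
#include <linux/futex.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#define N_THREADS  4
#define SPIN_COUNT 1000   // stand-in for tp->n_barrier_spin_count

static atomic_int n_arrived = 0;   // mirrors tp->n_barrier
static atomic_int n_passed  = 0;   // mirrors tp->n_barrier_passed

static void demo_barrier(void) {
    int passed = atomic_load_explicit(&n_passed, memory_order_relaxed);

    // enter barrier (full seq-cst fence)
    if (atomic_fetch_add_explicit(&n_arrived, 1, memory_order_seq_cst) == N_THREADS - 1) {
        // last thread: reset the counter, open the barrier, wake every sleeper
        atomic_store_explicit(&n_arrived, 0, memory_order_relaxed);
        atomic_fetch_add_explicit(&n_passed, 1, memory_order_seq_cst);
        syscall(SYS_futex, &n_passed, FUTEX_WAKE_PRIVATE, INT_MAX, NULL, NULL, 0);
        return;
    }

    // other threads: spin for a bounded number of iterations, then park
    while (atomic_load_explicit(&n_passed, memory_order_relaxed) == passed) {
        for (int i = 0; i < SPIN_COUNT; i++) {
            if (atomic_load_explicit(&n_passed, memory_order_relaxed) != passed) {
                goto done;
            }
        }
        // returns immediately if n_passed has already changed, so no wakeup is lost
        syscall(SYS_futex, &n_passed, FUTEX_WAIT_PRIVATE, passed, NULL, NULL, 0);
    }
done:
    atomic_thread_fence(memory_order_seq_cst);   // exit barrier
}

static void *worker(void *arg) {
    for (int iter = 0; iter < 3; iter++) {
        printf("thread %ld reached barrier %d\n", (long)arg, iter);
        demo_barrier();
    }
    return NULL;
}

int main(void) {
    pthread_t th[N_THREADS];
    for (long i = 0; i < N_THREADS; i++) {
        pthread_create(&th[i], NULL, worker, (void *)i);
    }
    for (int i = 0; i < N_THREADS; i++) {
        pthread_join(th[i], NULL);
    }
    return 0;
}

The park step is race-free because FUTEX_WAIT compares *uaddr against the expected value inside the kernel and returns immediately if they differ: a wakeup that lands between the last spin check and the futex_wait call is never lost. The bounded spin keeps the fast path cheap when threads arrive close together, while yielding the CPU to the scheduler when they do not.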