Commit 2fbabd7
ggml: Implement yield barrier using futex for improved thread scheduling efficiency
Parent: 42eb248

This commit adds an opt-in GGML_YIELD_BARRIER build option that swaps ggml's pure spin barrier for a hybrid one: each worker spins for a bounded number of iterations, then sleeps in the kernel on a futex (Linux/Android) or on __ulock (macOS) until the last thread through the barrier wakes it. The spin budget is tuned per machine by a new ggml_barrier_spin_count() helper.

5 files changed: +231 −2

ggml/CMakeLists.txt (+1)

@@ -189,6 +189,7 @@ set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
                                             "ggml: metal minimum macOS version")
 set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)")
 option(GGML_OPENMP        "ggml: use OpenMP" ON)
+option(GGML_YIELD_BARRIER "ggml: replace spin barrier with yield barrier to improve scheduling efficiency" OFF)
 option(GGML_RPC           "ggml: use RPC" OFF)
 option(GGML_SYCL          "ggml: use SYCL" OFF)
 option(GGML_SYCL_F16      "ggml: use 16 bit floats for sycl calculations" OFF)
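
The option defaults to OFF, so the yield barrier has to be opted into at configure time. Assuming a standard CMake build of llama.cpp/ggml, that would look like:

  cmake -B build -DGGML_YIELD_BARRIER=ON
  cmake --build build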

ggml/src/ggml-cpu/CMakeLists.txt (+4)

@@ -59,6 +59,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
        endif()
    endif()

+   if (GGML_YIELD_BARRIER)
+       target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_YIELD_BARRIER)
+   endif()
+
    if (GGML_LLAMAFILE)
        target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_LLAMAFILE)

ggml/src/ggml-cpu/ggml-cpu-aarch64.h (+8)

@@ -6,3 +6,11 @@
 // GGML internal header

 ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
+
+__BEGIN_DECLS
+
+#if defined(GGML_YIELD_BARRIER)
+size_t ggml_barrier_spin_count(unsigned int n_threads);
+#endif
+
+__END_DECLS
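
ggml_barrier_spin_count() is implemented in C++ (ggml-cpu.cpp, below) but called from C (ggml-cpu.c), so the declaration needs C linkage when this header is included from C++. That is what the __BEGIN_DECLS/__END_DECLS pair provides; roughly how glibc/BSD sys/cdefs.h defines the wrappers:

/* sketch of the sys/cdefs.h definitions these wrappers rely on */
#ifdef __cplusplus
#define __BEGIN_DECLS extern "C" {
#define __END_DECLS   }
#else
#define __BEGIN_DECLS
#define __END_DECLS
#endif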

ggml/src/ggml-cpu/ggml-cpu.c (+85 −2)

@@ -3,6 +3,7 @@

 #include "ggml-backend-impl.h"
 #include "ggml-backend.h"
+#include "ggml-cpu-aarch64.h"
 #include "ggml-cpu-traits.h"
 #include "ggml-cpu-impl.h"
 #include "ggml-cpu.h"
@@ -118,7 +119,6 @@ struct ggml_arm_arch_features_type {
 } ggml_arm_arch_features = {-1, -1, -1, -1, 0, -1};
 #endif

-
 #if defined(_WIN32)

 #define WIN32_LEAN_AND_MEAN
@@ -1385,6 +1385,9 @@ struct ggml_threadpool {
     struct ggml_compute_state * workers; // per thread state
     int n_threads_max;                   // number of threads in the pool
     atomic_int n_threads_cur;            // number of threads used in the current graph
+#if defined(GGML_YIELD_BARRIER)
+    size_t n_barrier_spin_count;
+#endif

     int32_t prio;  // Scheduling priority
     uint32_t poll; // Polling level (0 - no polling)
@@ -2450,6 +2453,63 @@ struct ggml_state {

 static struct ggml_state g_state = {0};

+#if defined(__gnu_linux__) || defined(__ANDROID__)
+#include <sys/syscall.h>
+#define FUTEX_WAIT 0
+#define FUTEX_WAKE 1
+#define FUTEX_PRIVATE_FLAG 128
+#define FUTEX_WAIT_PRIVATE (FUTEX_WAIT | FUTEX_PRIVATE_FLAG)
+#define FUTEX_WAKE_PRIVATE (FUTEX_WAKE | FUTEX_PRIVATE_FLAG)
+#define futex_wait(uaddr, val) syscall(SYS_futex, uaddr, FUTEX_WAIT_PRIVATE, val, NULL, NULL, 0)
+#define futex_wake(uaddr, n)   syscall(SYS_futex, uaddr, FUTEX_WAKE_PRIVATE, n, NULL, NULL, 0)
+#elif defined(__APPLE__)
+#include <stdatomic.h>
+
+extern int __ulock_wait(uint32_t operation, volatile int *addr, uint64_t value, uint32_t timeout);
+extern int __ulock_wake(uint32_t operation, volatile int *addr, uint64_t wake_value);
+
+#define UL_COMPARE_AND_WAIT 1
+
+#define ULF_WAKE_ALL    0x00000100
+#define ULF_WAKE_THREAD 0x00000200
+
+static int futex_wait(volatile int *addr, int expected) {
+    int op = UL_COMPARE_AND_WAIT;
+    int ret = __ulock_wait(op, (void *)addr, (uint64_t)expected, 0);
+    if (ret == -1) {
+        return -1;
+    }
+    return 0;
+}
+
+static int futex_wake(volatile int *addr, int count) {
+    if (count <= 0) {
+        return 0;
+    }
+    uint32_t op = UL_COMPARE_AND_WAIT;
+    if (count == INT_MAX) {
+        op |= ULF_WAKE_ALL;
+        if (__ulock_wake(op, (void *)addr, 0) == -1) {
+            return -1;
+        }
+        return 0;
+    }
+    int woken = 0;
+    for (int i = 0; i < count; ++i) {
+        if (__ulock_wake(op, (void *)addr, 0) == -1) {
+            if (errno == ENOENT || errno == ESRCH) {
+                break;
+            } else {
+                return -1;
+            }
+        }
+        woken++;
+    }
+    return woken;
+}
+
+#endif
+
 void ggml_barrier(struct ggml_threadpool * tp) {
     int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
     if (n_threads == 1) {
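
On Linux and Android the shim maps directly onto the futex(2) syscall; on macOS it emulates the same interface with the undocumented __ulock_wait/__ulock_wake primitives. The contract in both cases: futex_wait(addr, val) puts the caller to sleep only if *addr still equals val (otherwise it returns immediately), and futex_wake(addr, n) wakes up to n sleepers. A minimal standalone illustration of that protocol (Linux-only, hypothetical demo code, not part of the patch):

// demo_futex.c — build with: gcc -pthread demo_futex.c
#include <limits.h>
#include <linux/futex.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static atomic_int word;

static void * waiter(void * arg) {
    (void) arg;
    int seen = atomic_load(&word);
    while (atomic_load(&word) == seen) {
        // Sleeps only while `word` still equals `seen`; if the waker gets in
        // between our load and the syscall, the kernel returns EAGAIN and we
        // re-check instead of missing the update.
        syscall(SYS_futex, &word, FUTEX_WAIT_PRIVATE, seen, NULL, NULL, 0);
    }
    printf("woke up: word = %d\n", atomic_load(&word));
    return NULL;
}

int main(void) {
    pthread_t t;
    pthread_create(&t, NULL, waiter, NULL);
    usleep(100 * 1000);         // give the waiter time to block
    atomic_fetch_add(&word, 1); // publish the change first ...
    syscall(SYS_futex, &word, FUTEX_WAKE_PRIVATE, INT_MAX, NULL, NULL, 0); // ... then wake everyone
    pthread_join(t, NULL);
    return 0;
}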
@@ -2470,14 +2530,34 @@ void ggml_barrier(struct ggml_threadpool * tp) {

         // exit barrier (full seq-cst fence)
         atomic_fetch_add_explicit(&tp->n_barrier_passed, 1, memory_order_seq_cst);
+#if defined(GGML_YIELD_BARRIER)
+        // wake up all threads
+        futex_wake(&tp->n_barrier_passed, INT_MAX);
+#endif
         return;
     }

+#if !defined(GGML_YIELD_BARRIER)
     // wait for other threads
     while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) {
         ggml_thread_cpu_relax();
     }
+#else
+    size_t spin_count = tp->n_barrier_spin_count;
+    size_t i;
+    do {
+        for (i = 0; i < spin_count; i++) {
+            if (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) != n_passed) {
+                goto exit_barrier;
+            }
+            ggml_thread_cpu_relax();
+        }

+        futex_wait(&tp->n_barrier_passed, n_passed);
+    } while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed);
+    return;
+exit_barrier:
+#endif
     // exit barrier (full seq-cst fence)
     // TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead
 #ifdef GGML_TSAN_ENABLED
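
The structure is a classic spin-then-sleep barrier. Each waiter first spins up to n_barrier_spin_count iterations, which keeps the fast path as cheap as the old pure-spin barrier when every core is making progress; only if the generation counter still has not advanced does it fall into futex_wait(), letting the kernel run something else. A wakeup cannot be lost because futex_wait(addr, val) atomically re-checks *addr == val in the kernel: if the last thread bumps n_barrier_passed and calls futex_wake() between a waiter's spin loop and its sleep, the wait returns immediately. Distilled to its shape (a sketch against the futex shim above, assuming <stdatomic.h>; not the exact ggml code):

// Spin-then-sleep wait for a generation counter to advance past `seen`.
// Assumes the futex_wait() shim from the hunk above is in scope.
static void wait_generation(atomic_int * gen, int seen, size_t spin_count) {
    for (;;) {
        for (size_t i = 0; i < spin_count; i++) {
            if (atomic_load_explicit(gen, memory_order_relaxed) != seen) {
                return; // advanced while we were spinning: the cheap path
            }
        }
        // Spin budget exhausted: sleep until woken. The kernel re-checks
        // *gen == seen before blocking, so a concurrent wake is not lost.
        futex_wait(gen, seen);
    }
}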
@@ -13126,7 +13206,7 @@ static bool ggml_thread_apply_affinity(const bool * mask) {

     for (uint32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
         if (mask[i]) {
-            GGML_PRINT_DEBUG("Thread %lx: adding %d to cpuset\n", pthread_self(), i);
+            printf("Thread %lx: adding %d to cpuset\n", pthread_self(), i);
             CPU_SET(i, &cpuset);
         }
     }
@@ -13680,6 +13760,9 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
         threadpool->poll         = tpp->poll;
         threadpool->prio         = tpp->prio;
         threadpool->ec           = GGML_STATUS_SUCCESS;
+#if defined(GGML_YIELD_BARRIER)
+        threadpool->n_barrier_spin_count = ggml_barrier_spin_count(tpp->n_threads);
+#endif
     }

     // Allocate and init workers state

ggml/src/ggml-cpu/ggml-cpu.cpp (+133)

@@ -572,6 +572,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t reg) {
     #ifdef GGML_USE_ACCELERATE
         features.push_back({ "ACCELERATE", "1" });
     #endif
+    #ifdef GGML_YIELD_BARRIER
+        features.push_back({ "YIELD_BARRIER", "1" });
+    #endif
     #ifdef GGML_USE_CPU_HBM
        features.push_back({ "CPU_HBM", "1" });
    #endif
@@ -633,6 +636,136 @@ static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const char * name) {
     GGML_UNUSED(reg);
 }

+#if defined(GGML_YIELD_BARRIER)
+#include <thread>
+
+#if defined(__x86_64__)
+#if defined(_MSC_VER)
+#include <intrin.h>
+static void cpuid(int leaf, int subleaf, int *eax, int *ebx, int *ecx, int *edx) {
+    int regs[4];
+    __cpuidex(regs, leaf, subleaf);
+    *eax = regs[0];
+    *ebx = regs[1];
+    *ecx = regs[2];
+    *edx = regs[3];
+}
+#elif defined(__GNUC__) || defined(__clang__)
+static void cpuid(int leaf, int subleaf, int *eax, int *ebx, int *ecx, int *edx) {
+    __asm__ volatile (
+        "cpuid"
+        : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx)
+        : "a"(leaf), "c"(subleaf)
+    );
+}
+#else
+#error Unsupported compiler
+#endif
+
+static bool cpu_is_hybrid() {
+    int eax, ebx, ecx, edx;
+    cpuid(7, 0, &eax, &ebx, &ecx, &edx);
+    return !!(edx & (1u << 15)); // CPUID.07H:EDX bit 15 is the hybrid flag
+}
+
+#elif defined(__aarch64__) && defined(__gnu_linux__)
+
+bool cpu_is_hybrid() {
+    FILE *fp = fopen("/proc/cpuinfo", "r");
+    if (!fp) {
+        return false;
+    }
+
+    char line[256];
+    char first_cpu_part[64] = {0};
+    bool found_first = false;
+    bool hybrid = false;
+
+    while (fgets(line, sizeof(line), fp)) {
+        if (strncmp(line, "CPU part", 8) == 0 || strncmp(line, "cpu part", 8) == 0) {
+            char *colon = strchr(line, ':');
+            if (colon) {
+                colon++;
+                while (*colon == ' ' || *colon == '\t') {
+                    colon++;
+                }
+                char *newline = strchr(colon, '\n');
+                if (newline) {
+                    *newline = '\0';
+                }
+
+                if (!found_first) {
+                    strncpy(first_cpu_part, colon, sizeof(first_cpu_part)-1);
+                    found_first = true;
+                } else {
+                    if (strcmp(first_cpu_part, colon) != 0) {
+                        hybrid = true; // two distinct core types seen
+                        break;
+                    }
+                }
+            }
+        }
+    }
+    fclose(fp);
+    return hybrid;
+}
+
+#elif defined(__aarch64__) && defined(__APPLE__)
+
+bool cpu_is_hybrid() {
+    int64_t cpu_count = 0;
+    size_t size = sizeof(cpu_count);
+    if (sysctlbyname("hw.perflevel1.physicalcpu", &cpu_count, &size, NULL, 0) == 0) {
+        return cpu_count > 0; // a populated second perf level implies P+E cores
+    }
+    return false;
+}
+
+#else
+
+bool cpu_is_hybrid() {
+    return false;
+}
+
+#endif
+
+#if defined(__gnu_linux__)
+static size_t get_affinity_cores() {
+    cpu_set_t set;
+    int num_cores = 0;
+
+    CPU_ZERO(&set);
+    if (sched_getaffinity(0, sizeof(cpu_set_t), &set) == -1) {
+        return std::thread::hardware_concurrency();
+    }
+
+    for (int i = 0; i < CPU_SETSIZE; ++i) {
+        if (CPU_ISSET(i, &set)) {
+            num_cores++;
+        }
+    }
+
+    return num_cores;
+}
+#else
+static size_t get_affinity_cores() {
+    return std::thread::hardware_concurrency();
+}
+#endif
+
+extern "C"
+size_t ggml_barrier_spin_count(unsigned int n_threads) {
+    size_t count = 30000;
+    if (n_threads > get_affinity_cores()) {
+        count = 100;
+    }
+    if (cpu_is_hybrid()) {
+        count = 1;
+    }
+    return count;
+}
+#endif

 static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
     /* .get_name         = */ ggml_backend_cpu_reg_get_name,
     /* .get_device_count = */ ggml_backend_cpu_reg_get_device_count,
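
The spin budget at the end of the hunk encodes three regimes: dedicated homogeneous cores spin long (30000 iterations) and almost never enter the kernel; more threads than cores available to the process spin only briefly (100 iterations), since some runnable worker is surely preempted; and hybrid big/little CPUs spin just once, since a worker scheduled on a slow core makes everyone else's spinning pure waste. A hypothetical illustration of how the heuristic resolves (not part of the patch; pick_spin_count mirrors ggml_barrier_spin_count() with the hardware probes stubbed out):

#include <stddef.h>
#include <stdio.h>

static size_t pick_spin_count(unsigned n_threads, size_t n_cores, int hybrid) {
    size_t count = 30000;         // dedicated cores: spin long, sleep rarely
    if (n_threads > n_cores) {
        count = 100;              // oversubscribed: get off the CPU quickly
    }
    if (hybrid) {
        count = 1;                // big/little: sleep almost immediately
    }
    return count;
}

int main(void) {
    printf("16 threads / 16 cores, homogeneous: %zu\n", pick_spin_count(16, 16, 0)); // 30000
    printf("32 threads / 16 cores, homogeneous: %zu\n", pick_spin_count(32, 16, 0)); // 100
    printf(" 8 threads / 16 cores, hybrid:      %zu\n", pick_spin_count(8, 16, 1));  // 1
    return 0;
}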
