
ggml: Implement yield barrier using futex for improved thread scheduling efficiency #13079

Open · wants to merge 1 commit into master
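This change adds an opt-in GGML_YIELD_BARRIER CMake option (OFF by default, enabled with -DGGML_YIELD_BARRIER=ON). When it is set, ggml_barrier no longer busy-waits indefinitely: each waiting thread spins for a bounded number of iterations and then parks on a futex (Linux/Android) or a __ulock wait (macOS) until the last thread to arrive bumps n_barrier_passed and wakes all sleepers. The spin budget comes from ggml_barrier_spin_count(), which returns a large count when every thread can have its own core, a small count when the threadpool is oversubscribed, and 1 on hybrid CPUs.

A minimal sketch of the waiting scheme, assuming Linux and the raw SYS_futex syscall; the yield_barrier struct, its field names, and the omitted error handling are illustrative only, not the code in this diff:

```c
#include <limits.h>
#include <stdatomic.h>
#include <sys/syscall.h>
#include <unistd.h>

#define FUTEX_WAIT_PRIVATE (0 | 128)
#define FUTEX_WAKE_PRIVATE (1 | 128)

struct yield_barrier {
    atomic_int n_arrived;  // threads that reached the barrier in the current round
    atomic_int n_passed;   // round (generation) counter, also used as the futex word
    int        n_threads;  // number of participating threads
    int        spin_count; // iterations to busy-wait before parking in the kernel
};

static void yield_barrier_wait(struct yield_barrier * b) {
    int round = atomic_load_explicit(&b->n_passed, memory_order_relaxed);

    if (atomic_fetch_add_explicit(&b->n_arrived, 1, memory_order_seq_cst) == b->n_threads - 1) {
        // last thread to arrive: reset for the next round, publish the new round, wake sleepers
        atomic_store_explicit(&b->n_arrived, 0, memory_order_relaxed);
        atomic_fetch_add_explicit(&b->n_passed, 1, memory_order_seq_cst);
        syscall(SYS_futex, &b->n_passed, FUTEX_WAKE_PRIVATE, INT_MAX, NULL, NULL, 0);
        return;
    }

    while (atomic_load_explicit(&b->n_passed, memory_order_seq_cst) == round) {
        // spin for a bounded time first: a short barrier never pays for the syscall
        for (int i = 0; i < b->spin_count; i++) {
            if (atomic_load_explicit(&b->n_passed, memory_order_relaxed) != round) {
                return;
            }
        }
        // still blocked: sleep until the futex word changes; the kernel re-checks the value,
        // so a wakeup that races with the increment is not lost
        syscall(SYS_futex, &b->n_passed, FUTEX_WAIT_PRIVATE, round, NULL, NULL, 0);
    }
}
```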
1 change: 1 addition & 0 deletions ggml/CMakeLists.txt
@@ -189,6 +189,7 @@ set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
"ggml: metal minimum macOS version")
set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)")
option(GGML_OPENMP "ggml: use OpenMP" ON)
option(GGML_YIELD_BARRIER "ggml: replace spin barrier with yield barrier to improve scheduling efficiency" OFF)
option(GGML_RPC "ggml: use RPC" OFF)
option(GGML_SYCL "ggml: use SYCL" OFF)
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
4 changes: 4 additions & 0 deletions ggml/src/ggml-cpu/CMakeLists.txt
@@ -59,6 +59,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
endif()
endif()

if (GGML_YIELD_BARRIER)
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_YIELD_BARRIER)
endif()

if (GGML_LLAMAFILE)
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_LLAMAFILE)

12 changes: 12 additions & 0 deletions ggml/src/ggml-cpu/ggml-cpu-aarch64.h
@@ -6,3 +6,15 @@
// GGML internal header

ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);

#ifdef __cplusplus
extern "C" {
#endif

#if defined(GGML_YIELD_BARRIER)
size_t ggml_barrier_spin_count(unsigned int n_threads);
#endif

#ifdef __cplusplus
}
#endif
87 changes: 85 additions & 2 deletions ggml/src/ggml-cpu/ggml-cpu.c
@@ -3,6 +3,7 @@

#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml-cpu-aarch64.h"
#include "ggml-cpu-traits.h"
#include "ggml-cpu-impl.h"
#include "ggml-cpu.h"
@@ -118,7 +119,6 @@ struct ggml_arm_arch_features_type {
} ggml_arm_arch_features = {-1, -1, -1, -1, 0, -1};
#endif


#if defined(_WIN32)

#define WIN32_LEAN_AND_MEAN
@@ -1385,6 +1385,9 @@ struct ggml_threadpool {
struct ggml_compute_state * workers; // per thread state
int n_threads_max; // number of threads in the pool
atomic_int n_threads_cur; // number of threads used in the current graph
#if defined(GGML_YIELD_BARRIER)
size_t n_barrier_spin_count;
#endif

int32_t prio; // Scheduling priority
uint32_t poll; // Polling level (0 - no polling)
@@ -2450,6 +2453,63 @@ struct ggml_state {

static struct ggml_state g_state = {0};

#if defined(__gnu_linux__) || defined(__ANDROID__)
#include <sys/syscall.h>
#define FUTEX_WAIT 0
#define FUTEX_WAKE 1
#define FUTEX_PRIVATE_FLAG 128
#define FUTEX_WAIT_PRIVATE (FUTEX_WAIT | FUTEX_PRIVATE_FLAG)
#define FUTEX_WAKE_PRIVATE (FUTEX_WAKE | FUTEX_PRIVATE_FLAG)
#define futex_wait(uaddr, val) syscall(SYS_futex, uaddr, FUTEX_WAIT_PRIVATE, val, NULL, NULL, 0)
#define futex_wake(uaddr, n) syscall(SYS_futex, uaddr, FUTEX_WAKE_PRIVATE, n, NULL, NULL, 0)
#elif defined(__APPLE__)
#include <stdatomic.h>

extern int __ulock_wait(uint32_t operation, volatile int *addr, uint64_t value, uint32_t timeout);
extern int __ulock_wake(uint32_t operation, volatile int *addr, uint64_t wake_value);

#define UL_COMPARE_AND_WAIT 1

#define ULF_WAKE_ALL 0x00000100
#define ULF_WAKE_THREAD 0x00000200

static int futex_wait(volatile int *addr, int expected) {
int op = UL_COMPARE_AND_WAIT;
int ret = __ulock_wait(op, (volatile void *)addr, (uint64_t)expected, 0);
if (ret == -1) {
return -1;
}
return 0;
}

static int futex_wake(volatile int *addr, int count) {
if (count <= 0) {
return 0;
}
uint32_t op = UL_COMPARE_AND_WAIT;
if (count == INT_MAX) {
op |= ULF_WAKE_ALL;
if (__ulock_wake(op, (volatile void *)addr, 0) == -1) {
return -1;
}
return 0;
}
int woken = 0;
for (int i = 0; i < count; ++i) {
if (__ulock_wake(op, (volatile void *)addr, 0) == -1) {
if (errno == ENOENT || errno == ESRCH) {
break;
} else {
return -1;
}
}
woken++;
}
return woken;
}

#endif

void ggml_barrier(struct ggml_threadpool * tp) {
int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
if (n_threads == 1) {
@@ -2470,14 +2530,34 @@ void ggml_barrier(struct ggml_threadpool * tp) {

// exit barrier (full seq-cst fence)
atomic_fetch_add_explicit(&tp->n_barrier_passed, 1, memory_order_seq_cst);
#if defined(GGML_YIELD_BARRIER)
// wake up all threads
futex_wake(&tp->n_barrier_passed, INT_MAX);
#endif
return;
}

#if !defined(GGML_YIELD_BARRIER)
// wait for other threads
while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) {
ggml_thread_cpu_relax();
}
#else
size_t spin_count = tp->n_barrier_spin_count;
size_t i;
do {
for (i = 0; i < spin_count; i++) {
if (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) != n_passed) {
goto exit_barrier;
}
ggml_thread_cpu_relax();
}

futex_wait(&tp->n_barrier_passed, n_passed);
} while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed);
return;
exit_barrier:
#endif
// exit barrier (full seq-cst fence)
// TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead
#ifdef GGML_TSAN_ENABLED
@@ -13126,7 +13206,7 @@ static bool ggml_thread_apply_affinity(const bool * mask) {

for (uint32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
if (mask[i]) {
GGML_PRINT_DEBUG("Thread %lx: adding %d to cpuset\n", pthread_self(), i);
printf("Thread %lx: adding %d to cpuset\n", pthread_self(), i);
CPU_SET(i, &cpuset);
}
}
@@ -13680,6 +13760,9 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
threadpool->poll = tpp->poll;
threadpool->prio = tpp->prio;
threadpool->ec = GGML_STATUS_SUCCESS;
#if defined(GGML_YIELD_BARRIER)
threadpool->n_barrier_spin_count = ggml_barrier_spin_count(tpp->n_threads);
#endif
}

// Allocate and init workers state
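The futex_wait/futex_wake shims above cover Linux/Android (SYS_futex) and macOS (__ulock_wait/__ulock_wake) only; with GGML_YIELD_BARRIER enabled on any other platform the barrier has nothing to park on. As a hedged sketch of one way to fill that gap, and not part of this diff, Windows could provide the same two helpers on top of the public WaitOnAddress/WakeByAddressAll API (Windows 8+, link with Synchronization.lib):

```c
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
// WaitOnAddress / WakeByAddressAll are declared in synchapi.h (pulled in by windows.h)
// and live in Synchronization.lib

static int futex_wait(volatile int * addr, int expected) {
    // returns once *addr is observed to differ from expected (or on a spurious wakeup);
    // the caller re-checks the barrier counter either way
    return WaitOnAddress((volatile VOID *) addr, &expected, sizeof(expected), INFINITE) ? 0 : -1;
}

static int futex_wake(volatile int * addr, int count) {
    // ggml_barrier only ever wakes every waiter, so wake-all is sufficient
    (void) count;
    WakeByAddressAll((PVOID) addr);
    return 0;
}
#endif
```

Spurious wakeups from WaitOnAddress would be harmless here, since ggml_barrier re-checks n_barrier_passed in the loop wrapped around futex_wait.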
133 changes: 133 additions & 0 deletions ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -572,6 +572,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
#ifdef GGML_USE_ACCELERATE
features.push_back({ "ACCELERATE", "1" });
#endif
#ifdef GGML_YIELD_BARRIER
features.push_back({ "YIELD_BARRIER", "1" });
#endif
#ifdef GGML_USE_CPU_HBM
features.push_back({ "CPU_HBM", "1" });
#endif
@@ -633,6 +636,136 @@ static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const ch
GGML_UNUSED(reg);
}

#if defined(GGML_YIELD_BARRIER)
#include <thread>

#if defined(__x86_64__)
#if defined(_MSC_VER)
#include <intrin.h>
static void cpuid(int leaf, int subleaf, int *eax, int *ebx, int *ecx, int *edx) {
int regs[4];
__cpuidex(regs, leaf, subleaf);
*eax = regs[0];
*ebx = regs[1];
*ecx = regs[2];
*edx = regs[3];
}
#elif defined(__GNUC__) || defined(__clang__)
static void cpuid(int leaf, int subleaf, int *eax, int *ebx, int *ecx, int *edx) {
__asm__ volatile (
"cpuid"
: "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx)
: "a"(leaf), "c"(subleaf)
);
}
#else
#error Unsupported compiler
#endif

static bool cpu_is_hybrid() {
int eax, ebx, ecx, edx;
cpuid(7, 0, &eax, &ebx, &ecx, &edx);
return !!(edx & (1u << 15));
}

#elif defined(__aarch64__) && defined(__gnu_linux__)

static bool cpu_is_hybrid() {
FILE *fp = fopen("/proc/cpuinfo", "r");
if (!fp) {
return false;
}

char line[256];
char first_cpu_part[64] = {0};
bool found_first = false;
bool hybrid = false;

while (fgets(line, sizeof(line), fp)) {
if (strncmp(line, "CPU part", 8) == 0 || strncmp(line, "cpu part", 8) == 0) {
char *colon = strchr(line, ':');
if (colon) {
colon++;
while (*colon == ' ' || *colon == '\t') {
colon++;
}
char *newline = strchr(colon, '\n');
if (newline) {
*newline = '\0';
}

if (!found_first) {
strncpy(first_cpu_part, colon, sizeof(first_cpu_part)-1);
found_first = true;
} else {
if (strcmp(first_cpu_part, colon) != 0) {
hybrid = true;
break;
}
}
}
}
}
fclose(fp);
return hybrid;
}

#elif defined(__aarch64__) && defined(__APPLE__)

static bool cpu_is_hybrid() {
int64_t cpu_count = 0;
size_t size = sizeof(cpu_count);
if (sysctlbyname("hw.perflevel1.physicalcpu", &cpu_count, &size, NULL, 0) == 0) {
return cpu_count > 0;
}
return false;
}

#else

static bool cpu_is_hybrid() {
return false;
}

#endif

#if defined(__gnu_linux__)
static size_t get_affinity_cores() {
cpu_set_t set;
int num_cores = 0;

CPU_ZERO(&set);
if (sched_getaffinity(0, sizeof(cpu_set_t), &set) == -1) {
return std::thread::hardware_concurrency();
}

for (int i = 0; i < CPU_SETSIZE; ++i) {
if (CPU_ISSET(i, &set)) {
num_cores++;
}
}

return num_cores;
}
#else
static size_t get_affinity_cores() {
return std::thread::hardware_concurrency();
}
#endif

extern "C"
size_t ggml_barrier_spin_count(unsigned int n_threads) {
size_t count = 30000;
if (n_threads > get_affinity_cores()) {
count = 100;
}
if (cpu_is_hybrid()) {
count = 1;
}
return count;
}
#endif

static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
/* .get_name = */ ggml_backend_cpu_reg_get_name,
/* .get_device_count = */ ggml_backend_cpu_reg_get_device_count,
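Since the diff registers a "YIELD_BARRIER" entry in the CPU backend's feature list, a build can be checked at runtime rather than by inspecting compile flags. A small sketch, assuming the feature-query plumbing from ggml-backend.h (ggml_backend_reg_get_proc_address, the "ggml_backend_get_features" entry point, and a feature list terminated by a NULL name):

```c
#include <stdio.h>
#include <string.h>

#include "ggml-backend.h"
#include "ggml-cpu.h"

int main(void) {
    ggml_backend_reg_t reg = ggml_backend_cpu_reg();

    // "ggml_backend_get_features" is the proc name handled by ggml_backend_cpu_get_proc_address
    ggml_backend_get_features_t get_features =
        (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
    if (get_features == NULL) {
        return 1;
    }

    for (struct ggml_backend_feature * f = get_features(reg); f->name != NULL; f++) {
        if (strcmp(f->name, "YIELD_BARRIER") == 0) {
            printf("CPU backend built with GGML_YIELD_BARRIER = %s\n", f->value);
        }
    }
    return 0;
}
```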