#include "ggml-backend-impl.h"
#include "ggml-backend.h"
+ #include "ggml-cpu-aarch64.h"
#include "ggml-cpu-traits.h"
#include "ggml-cpu-impl.h"
#include "ggml-cpu.h"
@@ -118,7 +119,6 @@ struct ggml_arm_arch_features_type {
} ggml_arm_arch_features = {-1, -1, -1, -1, 0, -1};
#endif

-
#if defined(_WIN32)

#define WIN32_LEAN_AND_MEAN
@@ -1385,6 +1385,9 @@ struct ggml_threadpool {
    struct ggml_compute_state * workers;   // per thread state
    int          n_threads_max;            // number of threads in the pool
    atomic_int   n_threads_cur;            // number of threads used in the current graph
+ #if defined(GGML_YIELD_BARRIER)
+     size_t       n_barrier_spin_count;
+ #endif

    int32_t      prio;                     // Scheduling priority
    uint32_t     poll;                     // Polling level (0 - no polling)
@@ -2450,6 +2453,63 @@ struct ggml_state {

static struct ggml_state g_state = {0};

+ #if defined(__gnu_linux__) || defined(__ANDROID__)
+ #include <sys/syscall.h>
+ #define FUTEX_WAIT          0
+ #define FUTEX_WAKE          1
+ #define FUTEX_PRIVATE_FLAG  128
+ #define FUTEX_WAIT_PRIVATE  (FUTEX_WAIT | FUTEX_PRIVATE_FLAG)
+ #define FUTEX_WAKE_PRIVATE  (FUTEX_WAKE | FUTEX_PRIVATE_FLAG)
+ #define futex_wait(uaddr, val) syscall(SYS_futex, uaddr, FUTEX_WAIT_PRIVATE, val, NULL, NULL, 0)
+ #define futex_wake(uaddr, n)   syscall(SYS_futex, uaddr, FUTEX_WAKE_PRIVATE, n, NULL, NULL, 0)
+ #elif defined(__APPLE__)
+ #include <stdatomic.h>
+
+ extern int __ulock_wait(uint32_t operation, volatile int * addr, uint64_t value, uint32_t timeout);
+ extern int __ulock_wake(uint32_t operation, volatile int * addr, uint64_t wake_value);
+
+ #define UL_COMPARE_AND_WAIT 1
+
+ #define ULF_WAKE_ALL    0x00000100
+ #define ULF_WAKE_THREAD 0x00000200
+
+ static int futex_wait(volatile int * addr, int expected) {
+     int op = UL_COMPARE_AND_WAIT;
+     int ret = __ulock_wait(op, (void *)addr, (uint64_t)expected, 0);
+     if (ret == -1) {
+         return -1;
+     }
+     return 0;
+ }
+
+ static int futex_wake(volatile int * addr, int count) {
+     if (count <= 0) {
+         return 0;
+     }
+     uint32_t op = UL_COMPARE_AND_WAIT;
+     if (count == INT_MAX) {
+         op |= ULF_WAKE_ALL;
+         if (__ulock_wake(op, (void *)addr, 0) == -1) {
+             return -1;
+         }
+         return 0;
+     }
+     int woken = 0;
+     for (int i = 0; i < count; ++i) {
+         if (__ulock_wake(op, (void *)addr, 0) == -1) {
+             if (errno == ENOENT || errno == ESRCH) {
+                 break;
+             } else {
+                 return -1;
+             }
+         }
+         woken++;
+     }
+     return woken;
+ }
+
+ #endif
+
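Note: the block above gives the barrier a single futex_wait/futex_wake pair on both platforms. On Linux and Android it is a thin macro over the raw SYS_futex syscall; on macOS it is built on Apple's __ulock_wait/__ulock_wake calls, declared extern here because they are not exposed in the usual headers. The semantics these wrappers rely on can be shown with a minimal, Linux-only sketch (illustrative only, not part of this commit; file names and variable names are made up): a waiter sleeps only while the watched 32-bit word still holds the value it passed in, and a waker changes the word first and then wakes the sleepers.

    // Illustrative Linux-only sketch (not from this commit): one thread waits on a
    // 32-bit word with FUTEX_WAIT_PRIVATE, another changes it and wakes all waiters.
    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <stdatomic.h>
    #include <pthread.h>
    #include <limits.h>
    #include <stdio.h>

    static atomic_int flag = 0;

    static void * waiter(void * arg) {
        (void) arg;
        while (atomic_load(&flag) == 0) {
            // Sleeps only if flag still equals 0; returns immediately if it already
            // changed, and may also return spuriously, hence the surrounding loop.
            syscall(SYS_futex, &flag, FUTEX_WAIT_PRIVATE, 0, NULL, NULL, 0);
        }
        printf("woken, flag=%d\n", atomic_load(&flag));
        return NULL;
    }

    int main(void) {
        pthread_t t;
        pthread_create(&t, NULL, waiter, NULL);
        atomic_store(&flag, 1);                                                  // publish the change first
        syscall(SYS_futex, &flag, FUTEX_WAKE_PRIVATE, INT_MAX, NULL, NULL, 0);   // then wake every waiter
        pthread_join(t, NULL);
        return 0;
    }

Whichever order the two threads run in, the waiter either skips the sleep (flag already 1) or is woken, so the sketch cannot deadlock; the barrier code below relies on the same property.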
void ggml_barrier(struct ggml_threadpool * tp) {
    int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
    if (n_threads == 1) {
@@ -2470,14 +2530,34 @@ void ggml_barrier(struct ggml_threadpool * tp) {

        // exit barrier (full seq-cst fence)
        atomic_fetch_add_explicit(&tp->n_barrier_passed, 1, memory_order_seq_cst);
+ #if defined(GGML_YIELD_BARRIER)
+         // wake up all threads
+         futex_wake(&tp->n_barrier_passed, INT_MAX);
+ #endif
        return;
    }

+ #if !defined(GGML_YIELD_BARRIER)
    // wait for other threads
    while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) {
        ggml_thread_cpu_relax();
    }
+ #else
+     size_t spin_count = tp->n_barrier_spin_count;
+     size_t i;
+     do {
+         for (i = 0; i < spin_count; i++) {
+             if (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) != n_passed) {
+                 goto exit_barrier;
+             }
+             ggml_thread_cpu_relax();
+         }

+         futex_wait(&tp->n_barrier_passed, n_passed);
+     } while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed);
+     return;
+ exit_barrier:
+ #endif

    // exit barrier (full seq-cst fence)
    // TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead
#ifdef GGML_TSAN_ENABLED
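Reading the hunk above: with GGML_YIELD_BARRIER enabled, the last thread to arrive still publishes the barrier exit by incrementing n_barrier_passed with a seq-cst RMW, and additionally futex-wakes all sleepers. Every other thread first spins up to n_barrier_spin_count iterations on n_barrier_passed, which keeps the fast path cheap when workers arrive close together, and only then parks on the futex; the outer do/while re-checks the counter, so a spurious or early wakeup simply goes back to spinning. The waiter side reduces to the following pattern (a distilled restatement of the hunk, using a hypothetical helper name and the futex_wait/ggml_thread_cpu_relax helpers already in this file):

    // Hypothetical helper, not part of this commit: spin-then-sleep wait on a
    // generation counter. 'gen' stands for n_barrier_passed and 'seen' is the
    // value observed when the thread entered the barrier.
    static void wait_for_generation(atomic_int * gen, int seen, size_t spin_count) {
        do {
            for (size_t i = 0; i < spin_count; i++) {
                if (atomic_load_explicit(gen, memory_order_relaxed) != seen) {
                    return;                   // generation advanced while spinning
                }
                ggml_thread_cpu_relax();      // pause/yield hint to the CPU
            }
            futex_wait(gen, seen);            // sleep until the word changes (or a spurious wake)
        } while (atomic_load_explicit(gen, memory_order_relaxed) == seen);
    }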
@@ -13126,7 +13206,7 @@ static bool ggml_thread_apply_affinity(const bool * mask) {

    for (uint32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
        if (mask[i]) {
-             GGML_PRINT_DEBUG("Thread %lx: adding %d to cpuset\n", pthread_self(), i);
+             printf("Thread %lx: adding %d to cpuset\n", pthread_self(), i);
            CPU_SET(i, &cpuset);
        }
    }
@@ -13680,6 +13760,9 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
        threadpool->poll         = tpp->poll;
        threadpool->prio         = tpp->prio;
        threadpool->ec           = GGML_STATUS_SUCCESS;
+ #if defined(GGML_YIELD_BARRIER)
+         threadpool->n_barrier_spin_count = ggml_barrier_spin_count(tpp->n_threads);
+ #endif
    }

    // Allocate and init workers state
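The spin budget itself comes from ggml_barrier_spin_count(), which this commit defines elsewhere in the file and which is not shown in these hunks. Purely as an illustration of the kind of heuristic such a function might use (hypothetical, not this commit's actual implementation), the budget could shrink as the thread count grows so that heavily oversubscribed pools fall back to the futex sooner:

    // Hypothetical example only, NOT the ggml_barrier_spin_count() added by this commit.
    // Idea: spin longer for small pools (arrival skew is short), sleep sooner for large ones.
    static size_t example_barrier_spin_count(int n_threads) {
        const size_t base = 1u << 16;        // of the order of 64K spin iterations for 2 threads
        if (n_threads <= 1) {
            return 0;                        // a single thread never waits at the barrier
        }
        size_t count = base / (size_t) n_threads;
        return count < 1024 ? 1024 : count;  // keep a small minimum spin before sleeping
    }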