From 57ccbef5b6cdbe512d790510d4b6be2c8020106f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= <sigbjorn.skjaeret@scala.com>
Date: Sun, 6 Apr 2025 15:15:49 +0200
Subject: [PATCH 1/4] simd implementation of fp32 gelu

Co-authored-by: Kawrakow <48489457+ikawrakow@users.noreply.github.com>
---
 src/ggml-cpu/vec.h | 125 ++++++++++++++++++++++++++++++++++-----------
 1 file changed, 95 insertions(+), 30 deletions(-)

diff --git a/src/ggml-cpu/vec.h b/src/ggml-cpu/vec.h
index 23cbb3051f..9c24cf6d56 100644
--- a/src/ggml-cpu/vec.h
+++ b/src/ggml-cpu/vec.h
@@ -433,36 +433,6 @@ inline static float ggml_gelu_f32(float x) {
     return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
 }
 
-inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
-    const uint16_t * i16 = (const uint16_t *) x;
-    for (int i = 0; i < n; ++i) {
-        y[i] = ggml_table_gelu_f16[i16[i]];
-    }
-}
-
-#ifdef GGML_GELU_FP16
-inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
-    uint16_t t;
-    for (int i = 0; i < n; ++i) {
-        if (x[i] <= -10.0f) {
-            y[i] = 0.0f;
-        } else if (x[i] >= 10.0f) {
-            y[i] = x[i];
-        } else {
-            ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
-            memcpy(&t, &fp16, sizeof(uint16_t));
-            y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]);
-        }
-    }
-}
-#else
-inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
-    for (int i = 0; i < n; ++i) {
-        y[i] = ggml_gelu_f32(x[i]);
-    }
-}
-#endif
-
 inline static float ggml_gelu_quick_f32(float x) {
     return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
 }
@@ -541,6 +511,17 @@ inline static float32x4_t ggml_v_expf(float32x4_t x) {
                      vbslq_f32(c, vmulq_f32(vfmaq_f32(s2, s2, j), s1), vfmaq_f32(k, k, j)));
 }
 
+// Slower than lookup on my M2-Max
+inline static float32x4_t ggml_v_gelu(float32x4_t x, float32x4_t c1, float32x4_t c2) {
+    const float32x4_t one = vdupq_n_f32(1.0f);
+    float32x4_t arg = vfmaq_f32(one, c1, vmulq_f32(x, x));
+    arg = vmulq_f32(arg, vmulq_f32(x, c2));
+    float32x4_t exp_arg = ggml_v_expf(arg);
+    float32x4_t gelu = vmulq_f32(x, vdivq_f32(exp_arg, vaddq_f32(exp_arg, one)));
+    uint32x4_t mask = vcgtq_f32(x, vdupq_n_f32(10.f));
+    return vbslq_f32(mask, x, gelu);
+}
+
 // computes silu x/(1+exp(-x)) in single precision vector
 inline static float32x4_t ggml_v_silu(float32x4_t x) {
     const float32x4_t one = vdupq_n_f32(1.0f);
@@ -584,6 +565,16 @@ inline static __m512 ggml_v_expf(__m512 x) {
   return _mm512_mask_blend_ps(d, res, alt);
 }
 
+inline static __m512 ggml_v_gelu(__m512 x, __m512 c1, __m512 c2) {
+    const __m512 one = _mm512_set1_ps(1.0f);
+    __m512 arg = _mm512_fmadd_ps(x, _mm512_mul_ps(c1, x), one);
+    arg = _mm512_mul_ps(arg, _mm512_mul_ps(c2, x));
+    const __mmask16 mask = _mm512_cmp_ps_mask(arg, _mm512_set1_ps(30.f), _CMP_GT_OQ);
+    const __m512 exp_arg = ggml_v_expf(arg);
+    const __m512 ratio = _mm512_div_ps(exp_arg, _mm512_add_ps(exp_arg, one));
+    return _mm512_mul_ps(x, _mm512_mask_blend_ps(mask, ratio, one));
+}
+
 // computes silu x/(1+exp(-x)) in single precision vector
 inline static __m512 ggml_v_silu(__m512 x) {
     const __m512 one = _mm512_set1_ps(1);
@@ -639,6 +630,16 @@ inline static __m256 ggml_v_expf(__m256 x) {
               _mm256_andnot_ps(_mm256_castsi256_ps(c), _mm256_fmadd_ps(k, j, k)))));
 }
 
+inline static __m256 ggml_v_gelu(__m256 x, __m256 c1, __m256 c2) {
+    const __m256 one = _mm256_set1_ps(1.0f);
+    const __m256 mask = _mm256_cmp_ps(x, _mm256_set1_ps(10.f), _CMP_GT_OQ);
+    __m256 arg = _mm256_add_ps(one, _mm256_mul_ps(_mm256_mul_ps(x, x), c1));
+    arg = _mm256_mul_ps(arg, _mm256_mul_ps(x, c2));
+    __m256 exp_arg = ggml_v_expf(arg);
+    __m256 gelu = _mm256_mul_ps(x, _mm256_div_ps(exp_arg, _mm256_add_ps(exp_arg, one)));
+    return _mm256_or_ps(_mm256_and_ps(mask, x), _mm256_andnot_ps(mask, gelu));
+}
+
 // computes silu x/(1+exp(-x)) in single precision vector
 inline static __m256 ggml_v_silu(__m256 x) {
     const __m256 one = _mm256_set1_ps(1);
@@ -693,6 +694,70 @@ inline static __m128 ggml_v_expf(__m128 x) {
                                 _mm_andnot_ps(_mm_castsi128_ps(c), MADD128(k, j, k)))));
 }
 
+inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    const uint16_t * i16 = (const uint16_t *) x;
+    for (int i = 0; i < n; ++i) {
+        y[i] = ggml_table_gelu_f16[i16[i]];
+    }
+}
+
+//
+// On my AVX512 (Ryzen-7950X) and AVX2 (Ryzen-5975WX) computing gelu directly
+// via SIMD instructions is faster than the fp16-based lookup table.
+// On my M2-Max CPU the lookup table is slightly faster than the SIMD version,
+// hence we use the SIMD version only if GGML_GELU_FP16 is not defined.
+// We do not run into numerical issues for large or small arguments because
+//    0.5f * (1 + tanhf(arg))
+// is computed as
+//    exp(2*arg)/(exp(2*arg) + 1)
+// The ggml_v_expf functions flushes to zero for large enough negative
+// arguments, so the above becomes zero. ggml_v_expf returns INFINITY
+// for large positive arguments, so we would get a NaN if we did nothing. But in the
+// ggml_v_gelu SIMD implementations we override the gelu result with the
+// input argument when the argument is greater than 10, so it is all good.
+//
+inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
+    int i = 0;
+#if defined(__AVX512F__) && defined(__AVX512DQ__)
+    __m512 c1 = _mm512_set1_ps(GELU_COEF_A);
+    __m512 c2 = _mm512_set1_ps(2.f*SQRT_2_OVER_PI);
+    for (; i + 15 < n; i += 16) {
+        _mm512_storeu_ps(y + i, ggml_v_gelu(_mm512_loadu_ps(x + i), c1, c2));
+    }
+#elif defined __AVX2__ && defined __FMA__
+    __m256 c1 = _mm256_set1_ps(GELU_COEF_A);
+    __m256 c2 = _mm256_set1_ps(2.f*SQRT_2_OVER_PI);
+    for (; i + 7 < n; i += 8) {
+        _mm256_storeu_ps(y + i, ggml_v_gelu(_mm256_loadu_ps(x + i), c1, c2));
+    }
+#endif
+#ifdef GGML_GELU_FP16
+    uint16_t t;
+    for (; i < n; ++i) {
+        if (x[i] <= -10.0f) {
+            y[i] = 0.0f;
+        } else if (x[i] >= 10.0f) {
+            y[i] = x[i];
+        } else {
+            ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
+            memcpy(&t, &fp16, sizeof(uint16_t));
+            y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]);
+        }
+    }
+#else
+#if defined __ARM_NEON
+    float32x4_t c1 = vdupq_n_f32(GELU_COEF_A);
+    float32x4_t c2 = vdupq_n_f32(2.f*SQRT_2_OVER_PI);
+    for (; i + 3 < n; i += 4) {
+        vst1q_f32(y + i, ggml_v_gelu(vld1q_f32(x + i), c1, c2));
+    }
+#endif
+    for (; i < n; ++i) {
+        y[i] = ggml_gelu_f32(x[i]);
+    }
+#endif
+}
+
 // computes silu x/(1+exp(-x)) in single precision vector
 inline static __m128 ggml_v_silu(__m128 x) {
     const __m128 one = _mm_set1_ps(1);

From 74c38bc7badca258b80dba6190b3f56d520a33a5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= <sigbjorn.skjaeret@scala.com>
Date: Sun, 6 Apr 2025 16:30:23 +0200
Subject: [PATCH 2/4] add copyright notice

---
 src/ggml-cpu/vec.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/ggml-cpu/vec.h b/src/ggml-cpu/vec.h
index 9c24cf6d56..c8f0016323 100644
--- a/src/ggml-cpu/vec.h
+++ b/src/ggml-cpu/vec.h
@@ -512,6 +512,7 @@ inline static float32x4_t ggml_v_expf(float32x4_t x) {
 }
 
 // Slower than lookup on my M2-Max
+// MIT licensed. Copyright (C) 2024 Iwan Kawrakow
 inline static float32x4_t ggml_v_gelu(float32x4_t x, float32x4_t c1, float32x4_t c2) {
     const float32x4_t one = vdupq_n_f32(1.0f);
     float32x4_t arg = vfmaq_f32(one, c1, vmulq_f32(x, x));
@@ -565,6 +566,7 @@ inline static __m512 ggml_v_expf(__m512 x) {
   return _mm512_mask_blend_ps(d, res, alt);
 }
 
+// MIT licensed. Copyright (C) 2024 Iwan Kawrakow
 inline static __m512 ggml_v_gelu(__m512 x, __m512 c1, __m512 c2) {
     const __m512 one = _mm512_set1_ps(1.0f);
     __m512 arg = _mm512_fmadd_ps(x, _mm512_mul_ps(c1, x), one);
@@ -630,6 +632,7 @@ inline static __m256 ggml_v_expf(__m256 x) {
               _mm256_andnot_ps(_mm256_castsi256_ps(c), _mm256_fmadd_ps(k, j, k)))));
 }
 
+// MIT licensed. Copyright (C) 2024 Iwan Kawrakow
 inline static __m256 ggml_v_gelu(__m256 x, __m256 c1, __m256 c2) {
     const __m256 one = _mm256_set1_ps(1.0f);
     const __m256 mask = _mm256_cmp_ps(x, _mm256_set1_ps(10.f), _CMP_GT_OQ);
@@ -716,6 +719,7 @@ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp
 // ggml_v_gelu SIMD implementations we override the gelu result with the
 // input argument when the argument is greater than 10, so it is all good.
 //
+// MIT licensed. Copyright (C) 2024 Iwan Kawrakow
 inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
     int i = 0;
 #if defined(__AVX512F__) && defined(__AVX512DQ__)

From 847a35bb85305c65f1e12b048889142637f616ea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= <sigbjorn.skjaeret@scala.com>
Date: Sun, 6 Apr 2025 17:03:12 +0200
Subject: [PATCH 3/4] cut'n'paste error

---
 src/ggml-cpu/vec.h | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/ggml-cpu/vec.h b/src/ggml-cpu/vec.h
index c8f0016323..db97957c40 100644
--- a/src/ggml-cpu/vec.h
+++ b/src/ggml-cpu/vec.h
@@ -697,6 +697,18 @@ inline static __m128 ggml_v_expf(__m128 x) {
                                 _mm_andnot_ps(_mm_castsi128_ps(c), MADD128(k, j, k)))));
 }
 
+// computes silu x/(1+exp(-x)) in single precision vector
+inline static __m128 ggml_v_silu(__m128 x) {
+    const __m128 one = _mm_set1_ps(1);
+    const __m128 zero = _mm_setzero_ps();
+    const __m128 neg_x = _mm_sub_ps(zero, x);
+    const __m128 exp_neg_x = ggml_v_expf(neg_x);
+    const __m128 one_plus_exp_neg_x = _mm_add_ps(one, exp_neg_x);
+    return _mm_div_ps(x, one_plus_exp_neg_x);
+}
+
+#endif // __ARM_NEON / __AVX2__ / __SSE2__
+
 inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     const uint16_t * i16 = (const uint16_t *) x;
     for (int i = 0; i < n; ++i) {
@@ -762,18 +774,6 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
 #endif
 }
 
-// computes silu x/(1+exp(-x)) in single precision vector
-inline static __m128 ggml_v_silu(__m128 x) {
-    const __m128 one = _mm_set1_ps(1);
-    const __m128 zero = _mm_setzero_ps();
-    const __m128 neg_x = _mm_sub_ps(zero, x);
-    const __m128 exp_neg_x = ggml_v_expf(neg_x);
-    const __m128 one_plus_exp_neg_x = _mm_add_ps(one, exp_neg_x);
-    return _mm_div_ps(x, one_plus_exp_neg_x);
-}
-
-#endif // __ARM_NEON / __AVX2__ / __SSE2__
-
 inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
         y[i] = ggml_silu_f16(x[i]);

From 88a7f8b08ea6c48d1f1b3d30638127a09f35e881 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= <sigbjorn.skjaeret@scala.com>
Date: Sun, 6 Apr 2025 21:01:15 +0200
Subject: [PATCH 4/4] reference original PR

---
 src/ggml-cpu/vec.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/ggml-cpu/vec.h b/src/ggml-cpu/vec.h
index db97957c40..507342df93 100644
--- a/src/ggml-cpu/vec.h
+++ b/src/ggml-cpu/vec.h
@@ -513,6 +513,7 @@ inline static float32x4_t ggml_v_expf(float32x4_t x) {
 
 // Slower than lookup on my M2-Max
 // MIT licensed. Copyright (C) 2024 Iwan Kawrakow
+// https://github.com/ikawrakow/ik_llama.cpp/pull/9
 inline static float32x4_t ggml_v_gelu(float32x4_t x, float32x4_t c1, float32x4_t c2) {
     const float32x4_t one = vdupq_n_f32(1.0f);
     float32x4_t arg = vfmaq_f32(one, c1, vmulq_f32(x, x));
@@ -567,6 +568,7 @@ inline static __m512 ggml_v_expf(__m512 x) {
 }
 
 // MIT licensed. Copyright (C) 2024 Iwan Kawrakow
+// https://github.com/ikawrakow/ik_llama.cpp/pull/9
 inline static __m512 ggml_v_gelu(__m512 x, __m512 c1, __m512 c2) {
     const __m512 one = _mm512_set1_ps(1.0f);
     __m512 arg = _mm512_fmadd_ps(x, _mm512_mul_ps(c1, x), one);
@@ -633,6 +635,7 @@ inline static __m256 ggml_v_expf(__m256 x) {
 }
 
 // MIT licensed. Copyright (C) 2024 Iwan Kawrakow
+// https://github.com/ikawrakow/ik_llama.cpp/pull/9
 inline static __m256 ggml_v_gelu(__m256 x, __m256 c1, __m256 c2) {
     const __m256 one = _mm256_set1_ps(1.0f);
     const __m256 mask = _mm256_cmp_ps(x, _mm256_set1_ps(10.f), _CMP_GT_OQ);
@@ -732,6 +735,7 @@ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp
 // input argument when the argument is greater than 10, so it is all good.
 //
 // MIT licensed. Copyright (C) 2024 Iwan Kawrakow
+// https://github.com/ikawrakow/ik_llama.cpp/pull/9
 inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
     int i = 0;
 #if defined(__AVX512F__) && defined(__AVX512DQ__)