32 changes: 32 additions & 0 deletions mm/2s2h/CpuHelpers.c
@@ -0,0 +1,32 @@
// This file is only useful on x86(_64) systems. There is no standard way to detect CPU features,
// so each compiler/platform combination needs its own path. https://xkcd.com/927/ comes to mind...
#include "CpuHelpers.h"
#if defined(X86_CPU)

#if defined(__unix__) || defined(__APPLE__) // Apple's compilers do not define __unix__
#include <cpuid.h>
#elif defined(_WIN32)
// From the GCC cpuid.h header
/* Extended Features Leaf (%eax == 7, %ecx == 0) */
/* %ebx */
#define bit_AVX2 (1 << 5)
#include <intrin.h>
#endif

// Other checks can be added as needed.

int Cpu_SupportsAVX2(void) {
#ifdef _WIN32
    int cpuidData[4];
    // Querying a leaf above the CPU's maximum returns stale data rather than zeros on some CPUs,
    // so check the maximum supported leaf first.
    __cpuid(cpuidData, 0);
    if (cpuidData[0] < 7) {
        return 0;
    }
    // Leaf 7 requires the subleaf (ECX) to be 0; __cpuidex sets it explicitly.
    __cpuidex(cpuidData, 7, 0);
#else
    unsigned int cpuidData[4] = { 0, 0, 0, 0 };
    // Leaf 7 requires the subleaf (ECX) to be 0. __get_cpuid leaves ECX unspecified, so use
    // __get_cpuid_count, which also returns 0 if the leaf is unsupported.
    if (!__get_cpuid_count(7, 0, &cpuidData[0], &cpuidData[1], &cpuidData[2], &cpuidData[3])) {
        return 0;
    }
#endif
    if (cpuidData[1] & bit_AVX2) {
        return 1;
    }
    return 0;
}

#endif
27 changes: 27 additions & 0 deletions mm/2s2h/CpuHelpers.h
@@ -0,0 +1,27 @@
#ifndef CPU_HELPERS_H
#define CPU_HELPERS_H

// This file is only useful on x86(_64) systems right now. There is no standard way to detect CPU
// architectures or their features, so each platform needs its own check.
// https://xkcd.com/927/ comes to mind...


#if defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(__i386__) || defined(__i386) || defined(_M_IX86)
#define X86_CPU

#ifdef __cplusplus
extern "C" {
#endif

// Checks if the current CPU supports AVX2 instructions. This function always executes CPUID, so
// callers should cache its result.
int Cpu_SupportsAVX2(void);


#ifdef __cplusplus
}
#endif


#endif

#endif
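Since the header asks callers to cache the result, a minimal caching wrapper could look like the sketch below (hypothetical names, not part of this PR; mixer.c below instead caches the answer in a function pointer set by Mixer_Init):

// Hypothetical example: run CPUID once and reuse the answer on later calls.
static int sHasAVX2 = -1; // -1 means "not queried yet"

int Cpu_HasAVX2Cached(void) {
    if (sHasAVX2 < 0) {
        sHasAVX2 = Cpu_SupportsAVX2();
    }
    return sHasAVX2;
}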
182 changes: 97 additions & 85 deletions mm/2s2h/mixer.c
@@ -6,6 +6,8 @@
#include <stdio.h>

#include "mixer.h"
#include "CpuHelpers.h"

#ifndef __clang__
#pragma GCC optimize("unroll-loops")
#endif
@@ -69,8 +71,13 @@ static int16_t resample_table[64][4] = {
{ 0xffdf, 0x0d46, 0x66ad, 0x0c39 }
};

static void aMixImplRef(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr);
// Only declare the variants that are defined for this target, to avoid "declared 'static' but
// never defined" warnings on the others.
#if defined(X86_CPU)
static void aMixImplSSE2(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr);
static void aMixImpl256(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr);
#elif defined(__ARM_NEON)
static void aMixImplNEON(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr);
#endif

typedef void (*aMixFunc)(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr);
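// The aMix implementation in use on this CPU, selected once by Mixer_Init().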
static aMixFunc sMixFunc;

static inline int16_t clamp16(int32_t v) {
if (v < -0x8000) {
@@ -90,6 +97,26 @@ static inline int32_t clamp32(int64_t v) {
return (int32_t)v;
}

// Selects which implementation of the mixer functions to use. If you want to read a reference
// implementation, look at the function whose name ends in "ImplRef". If you are debugging a crash
// or other issue, set a breakpoint in the function that calls through the function pointer.
void Mixer_Init(void) {
#if defined(X86_CPU)
    if (Cpu_SupportsAVX2()) {
        sMixFunc = aMixImpl256;
        return;
    }
    // If AVX2 isn't supported, fall back to the SSE2 implementation. SSE2 dates back to the early
    // 2000s and is part of the x86-64 baseline, so we can assume it is present.
    sMixFunc = aMixImplSSE2;
#elif defined(__ARM_NEON)
    sMixFunc = aMixImplNEON;
#else
    sMixFunc = aMixImplRef;
#endif
}
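A possible hardening, not part of this change: initializing the dispatch pointer to the reference implementation would make a missed Mixer_Init() call fall back to the slow path instead of calling through a NULL pointer.

// Hypothetical alternative to the plain declaration above.
static aMixFunc sMixFunc = aMixImplRef;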

void aClearBufferImpl(uint16_t addr, int nbytes) {
nbytes = ROUND_UP_16(nbytes);
memset(BUF_U8(addr), 0, nbytes);
@@ -357,13 +384,7 @@ static void aMixImplRef(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr) {
}
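The body of aMixImplRef is collapsed in this diff. Reconstructed from the NEON and AVX2 code below (a sketch under that assumption, not the verbatim source), the per-sample operation that every implementation computes is:

// MixOneSample is a hypothetical name for illustration; clamp16 is defined above.
static inline int16_t MixOneSample(int16_t out, int16_t in, int16_t gain) {
    if (gain == -0x8000) {
        // -0x8000 is -1.0 in Q15 and has no positive counterpart; it selects a saturating subtract.
        return clamp16((int32_t)out - (int32_t)in);
    }
    // Q15 multiply-accumulate with rounding: scale out by ~1.0, add in * gain, round, shift back.
    return clamp16((out * 0x7FFF + in * gain + 0x4000) >> 15);
}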

void aMixImpl(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr) {
#if defined(__SSE2__) || defined(_M_AMD64)
aMixImplSSE2(count, gain, in_addr, out_addr);
#elif defined(__ARM_NEON)
aMixImplNEON(count, gain, in_addr, out_addr);
#else
aMixImplRef(count, gain, in_addr, out_addr);
#endif
sMixFunc(count, gain, in_addr, out_addr);
}

void aS8DecImpl(uint8_t flags, ADPCM_STATE state) {
@@ -607,25 +628,17 @@ void aUnkCmd19Impl(uint8_t f, uint16_t count, uint16_t out_addr, uint16_t in_addr) {
// SIMD operations expect aligned data
#include "align_asset_macro.h"

#if defined(X86_CPU)
#include <immintrin.h>

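// 32-byte alignment lets both the 128-bit loads in aMixImplSSE2 and the 256-bit loads in
// aMixImpl256 use aligned load instructions on the same tables.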
static const ALIGN_ASSET(32) int16_t x7fff[16] = {
    0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF,
    0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF,
};
static const ALIGN_ASSET(32) int32_t x4000[8] = {
    0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0x4000,
};

static void aMixImplSSE2(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr) {
int nbytes = ROUND_UP_32(ROUND_DOWN_16(count << 4));
int16_t* in = BUF_S16(in_addr);
int16_t* out = BUF_S16(out_addr);

if (gain == -0x8000) {
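// A gain of -0x8000 is -1.0 in Q15 and has no positive counterpart, so it is treated as a
// saturating "subtract in from out" instead of a multiply.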
while (nbytes > 0) {
for (unsigned int i = 0; i < 2; i++) {
@@ -638,6 +651,7 @@ static void aMixImplSSE2(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr) {
out += 8;
}
}
return;
}
// Load constants into vectors from aligned memory.
__m128i x7fffVec = _mm_load_si128((__m128i*)x7fff);
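// As in the AVX2 and NEON paths, these constants implement out * 0x7FFF + in * gain + 0x4000,
// which is then shifted right by 15 and saturated back to 16 bits.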
@@ -686,74 +700,10 @@ static void aMixImplSSE2(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr) {
}
}
}

#pragma GCC target("avx2")
// AVX2 version of the SSE2 implementation above. Forcing AVX2 code generation here is safe because
// Mixer_Init only selects this function after confirming AVX2 support at runtime.
void aMixImpl256(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr) {
int nbytes = ROUND_UP_32(ROUND_DOWN_16(count << 4));
int16_t* in = BUF_S16(in_addr);
@@ -770,6 +720,7 @@ void aMixImpl256(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr) {
out += 16;
nbytes -= 16 * sizeof(int16_t);
}
return;
}
// Load constants into vectors from aligned memory.
__m256i x7fffVec = _mm256_load_si256((__m256i*)x7fff);
@@ -817,3 +768,64 @@ void aMixImpl256(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr) {
}
}
#endif

#if defined(__ARM_NEON)
#include <arm_neon.h>
static const int32_t x4000Arr[4] = { 0x4000, 0x4000, 0x4000, 0x4000 };
void aMixImplNEON(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr) {
int nbytes = ROUND_UP_32(ROUND_DOWN_16(count << 4));
int16_t* in = BUF_S16(in_addr);
int16_t* out = BUF_S16(out_addr);

if (gain == -0x8000) {
while (nbytes > 0) {
for (unsigned int i = 0; i < 2; i++) {
int16x8_t outVec = vld1q_s16(out);
int16x8_t inVec = vld1q_s16(in);
int16x8_t subVec = vqsubq_s16(outVec, inVec);
vst1q_s16(out, subVec);
nbytes -= 8 * sizeof(int16_t);
out += 8;
in += 8;
}
}
// Match the early return in the SSE2 and AVX2 gain == -0x8000 paths.
return;
}
int16x8_t gainVec = vdupq_n_s16(gain);
int32x4_t x4000Vec = vld1q_s32(x4000Arr);
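// Strategy: widen each 8-lane vector into two 4-lane 32-bit halves, compute
// out * 0x7FFF + in * gain + 0x4000 with widening multiplies, then shift right by 15 and
// saturate-narrow back to 16-bit lanes.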
while (nbytes > 0) {
for (unsigned int i = 0; i < 2; i++) {
int16x8_t outVec = vld1q_s16(out);
int16x8_t inVec = vld1q_s16(in);
int16x4_t outLoVec = vget_low_s16(outVec);
int16x8_t outLoVec2 = vcombine_s16(outLoVec, outLoVec);
int16x4_t inLoVec = vget_low_s16(inVec);
int16x8_t inLoVec2 = vcombine_s16(inLoVec, inLoVec);
int32x4_t outX7fffHiVec = vmull_high_n_s16(outVec, 0x7FFF);
int32x4_t outX7fffLoVec = vmull_high_n_s16(outLoVec2, 0x7FFF);

int32x4_t inGainLoVec = vmull_high_s16(inLoVec2, gainVec);
int32x4_t inGainHiVec = vmull_high_s16(inVec, gainVec);
int32x4_t addVecLo = vaddq_s32(outX7fffLoVec, inGainLoVec);
int32x4_t addVecHi = vaddq_s32(outX7fffHiVec, inGainHiVec);
addVecHi = vaddq_s32(addVecHi, x4000Vec);
addVecLo = vaddq_s32(addVecLo, x4000Vec);
int32x4_t shiftVecHi = vshrq_n_s32(addVecHi, 15);
int32x4_t shiftVecLo = vshrq_n_s32(addVecLo, 15);
int16x4_t shiftedNarrowHiVec = vqmovn_s32(shiftVecHi);
int16x4_t shiftedNarrowLoVec = vqmovn_s32(shiftVecLo);
vst1_s16(out, shiftedNarrowLoVec);
out += 4;
vst1_s16(out, shiftedNarrowHiVec);
out += 4;
in += 8;

nbytes -= 8 * sizeof(int16_t);
}
}
}
#endif
3 changes: 3 additions & 0 deletions mm/src/audio/lib/load.c
@@ -1142,6 +1142,8 @@ extern AudioContext gAudioCtx;
// #end region
#include "resourcebridge.h"

void Mixer_Init(void);

void AudioLoad_Init(void* heap, size_t heapSize) {
s32 pad1[9];
s32 numFonts;
@@ -1150,6 +1152,7 @@ void AudioLoad_Init(void* heap, size_t heapSize) {
void* addr;
s32 i;
s32 j;
Mixer_Init();

gAudioCustomUpdateFunction = NULL;
gAudioCustomReverbFunction = NULL;