diff --git a/mm/2s2h/CpuHelpers.c b/mm/2s2h/CpuHelpers.c
new file mode 100644
index 0000000000..b401c30bd0
--- /dev/null
+++ b/mm/2s2h/CpuHelpers.c
@@ -0,0 +1,52 @@
+// This file is only useful for x86(_64) systems. There isn't a standard way to detect these.
+// https://xkcd.com/927/ comes to mind...
+#include "CpuHelpers.h"
+#if defined (X86_CPU)
+
+#if defined (_MSC_VER)
+// From the GCC cpuid.h header
+/* Extended Features Leaf (%eax == 7, %ecx == 0) */
+/* %ebx */
+#define bit_AVX2 (1 << 5)
+#include <intrin.h>
+#else
+// GCC and Clang both ship this header on x86 targets; it also defines bit_AVX2.
+#include <cpuid.h>
+#endif
+
+// Other checks can be added as needed.
+
+int Cpu_SupportsAVX2(void) {
+#ifdef _MSC_VER
+    int cpuidData[4];
+    __cpuid(cpuidData, 0); // Basic leaf 0: EAX reports the highest supported leaf.
+    if (cpuidData[0] < 7) {
+        return 0;
+    }
+    // The extended features leaf (EAX == 7) requires ECX == 0, which __cpuidex sets.
+    __cpuidex(cpuidData, 7, 0);
+#else
+    unsigned int cpuidData[4];
+    // __get_cpuid_count zeroes ECX for the subleaf and returns 0 if leaf 7 is unsupported
+    // (in which case AVX2 is absent anyway).
+    if (!__get_cpuid_count(7, 0, &cpuidData[0], &cpuidData[1], &cpuidData[2], &cpuidData[3])) {
+        return 0;
+    }
+#endif
+    if (cpuidData[1] & bit_AVX2) {
+        return 1;
+    }
+    return 0;
+}
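+
+// Example of the caching that CpuHelpers.h asks for. This wrapper is a hypothetical sketch and is not used elsewhere
+// in this change; Mixer_Init in mixer.c instead caches the answer by storing a function pointer once.
+static inline int Cpu_SupportsAVX2Cached(void) {
+    static int sCachedResult = -1; // -1 means "not queried yet"
+    if (sCachedResult < 0) {
+        sCachedResult = Cpu_SupportsAVX2();
+    }
+    return sCachedResult;
+}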
+
+#endif
\ No newline at end of file
diff --git a/mm/2s2h/CpuHelpers.h b/mm/2s2h/CpuHelpers.h
new file mode 100644
index 0000000000..eefdff3bab
--- /dev/null
+++ b/mm/2s2h/CpuHelpers.h
@@ -0,0 +1,24 @@
+#ifndef CPU_HELPERS_H
+#define CPU_HELPERS_H
+
+// This file is only useful for x86(_64) systems right now. There isn't a standard way to detect these.
+// https://xkcd.com/927/ comes to mind...
+
+#if defined (__x86_64__) || defined(_M_X64) || defined(i386) || defined(__i386__) || defined(__i386) || defined(_M_IX86)
+#define X86_CPU
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Checks if the current CPU supports AVX2 instructions. This function always executes CPUID, so its result should be
+// cached by the caller.
+int Cpu_SupportsAVX2(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+#endif
diff --git a/mm/2s2h/mixer.c b/mm/2s2h/mixer.c
index 6364fa5dda..626ecfbf56 100644
--- a/mm/2s2h/mixer.c
+++ b/mm/2s2h/mixer.c
@@ -6,6 +6,8 @@
 #include
 #include "mixer.h"
 
+#include "CpuHelpers.h"
+
 #ifndef __clang__
 #pragma GCC optimize("unroll-loops")
 #endif
@@ -69,8 +71,13 @@ static int16_t resample_table[64][4] = {
     { 0xffdf, 0x0d46, 0x66ad, 0x0c39 }
 };
 
+static void aMixImplRef(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr);
 static void aMixImplSSE2(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr);
 static void aMixImplNEON(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr);
+static void aMixImpl256(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr);
+
+typedef void (*aMixFunc)(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr);
+static aMixFunc sMixFunc;
 
 static inline int16_t clamp16(int32_t v) {
     if (v < -0x8000) {
@@ -90,6 +97,26 @@ static inline int32_t clamp32(int64_t v) {
     return (int32_t)v;
 }
 
+extern int Cpu_SupportsAVX2(void);
+
+// Sets up which version of a function to use. To read a reference implementation, look at the function whose name
+// ends with "ImplRef". When debugging a crash or other issue, break in the function that calls the function pointer.
+void Mixer_Init(void) {
+#if defined(X86_CPU)
+    if (Cpu_SupportsAVX2()) {
+        sMixFunc = aMixImpl256;
+        return;
+    }
+    // If AVX2 isn't supported, fall back to the SSE2 implementation. SSE2 support goes back to the early 2000s, so
+    // we can assume it is present.
+    sMixFunc = aMixImplSSE2;
+#elif defined(__ARM_NEON)
+    sMixFunc = aMixImplNEON;
+#else
+    sMixFunc = aMixImplRef;
+#endif
+}
+
 void aClearBufferImpl(uint16_t addr, int nbytes) {
     nbytes = ROUND_UP_16(nbytes);
     memset(BUF_U8(addr), 0, nbytes);
@@ -357,13 +384,7 @@ static void aMixImplRef(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t
 }
 
 void aMixImpl(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr) {
-#if defined(__SSE2__) || defined(_M_AMD64)
-    aMixImplSSE2(count, gain, in_addr, out_addr);
-#elif defined(__ARM_NEON)
-    aMixImplNEON(count, gain, in_addr, out_addr);
-#else
-    aMixImplRef(count, gain, in_addr, out_addr);
-#endif
+    sMixFunc(count, gain, in_addr, out_addr);
 }
 
 void aS8DecImpl(uint8_t flags, ADPCM_STATE state) {
@@ -607,25 +628,20 @@ void aUnkCmd19Impl(uint8_t f, uint16_t count, uint16_t out_addr, uint16_t in_add
 
 // SIMD operations expect aligned data
 #include "align_asset_macro.h"
-#if defined(__SSE2__) || defined(_M_AMD64)
+#if defined(X86_CPU)
 #include <immintrin.h>
-static const ALIGN_ASSET(16) int16_t x7fff[8] = {
-    0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF,
-};
-static const ALIGN_ASSET(16) int32_t x4000[4] = {
-    0x4000,
-    0x4000,
-    0x4000,
-    0x4000,
-};
+static const ALIGN_ASSET(32) int16_t x7fff[16] = { 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF,
+                                                   0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF };
+static const ALIGN_ASSET(32) int32_t x4000[8] = { 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0x4000 };
 
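+// Per output sample, aMix computes clamp16((out * 0x7FFF + in * gain + 0x4000) >> 15) (see aMixImplRef for the
+// scalar version): x7fff is the Q15 scale (just under 1.0) applied to the existing output, and x4000 is the rounding
+// bias added before the shift right by 15. The constants above replicate those two scalars across a full register.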
 static void aMixImplSSE2(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr) {
     int nbytes = ROUND_UP_32(ROUND_DOWN_16(count << 4));
     int16_t* in = BUF_S16(in_addr);
     int16_t* out = BUF_S16(out_addr);
-    int i;
-    int32_t sample;
+
     if (gain == -0x8000) {
         while (nbytes > 0) {
             for (unsigned int i = 0; i < 2; i++) {
@@ -638,6 +651,7 @@ static void aMixImplSSE2(uint16_t count, int16_t gain, uint16_
                 out += 8;
             }
         }
+        return;
     }
     // Load constants into vectors from aligned memory.
     __m128i x7fffVec = _mm_load_si128((__m128i*)x7fff);
@@ -686,74 +700,12 @@ static void aMixImplSSE2(uint16_t count, int16_t gain, uint16_
         }
     }
 }
-#endif
 
-#if defined(__ARM_NEON)
-#include <arm_neon.h>
-static const int32_t x4000Arr[4] = { 0x4000, 0x4000, 0x4000, 0x4000 };
-void aMixImplNEON(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr) {
-    int nbytes = ROUND_UP_32(ROUND_DOWN_16(count << 4));
-    int16_t* in = BUF_S16(in_addr);
-    int16_t* out = BUF_S16(out_addr);
-    int i;
-    int32_t sample;
-
-    if (gain == -0x8000) {
-        while (nbytes > 0) {
-            for (unsigned int i = 0; i < 2; i++) {
-                int16x8_t outVec = vld1q_s16(out);
-                int16x8_t inVec = vld1q_s16(in);
-                int16x8_t subVec = vqsubq_s16(outVec, inVec);
-                vst1q_s16(out, subVec);
-                nbytes -= 8 * sizeof(int16_t);
-                out += 8;
-                in += 8;
-            }
-        }
-    }
-    int16x8_t gainVec = vdupq_n_s16(gain);
-    int32x4_t x4000Vec = vld1q_s32(x4000Arr);
-    while (nbytes > 0) {
-        for (unsigned int i = 0; i < 2; i++) {
-            // for (i = 0; i < 16; i++) {
-            int16x8_t outVec = vld1q_s16(out);
-            int16x8_t inVec = vld1q_s16(in);
-            int16x4_t outLoVec = vget_low_s16(outVec);
-            int16x8_t outLoVec2 = vcombine_s16(outLoVec, outLoVec);
-            int16x4_t inLoVec = vget_low_s16(inVec);
-            int16x8_t inLoVec2 = vcombine_s16(inLoVec, inLoVec);
-            int32x4_t outX7fffHiVec = vmull_high_n_s16(outVec, 0x7FFF);
-            int32x4_t outX7fffLoVec = vmull_high_n_s16(outLoVec2, 0x7FFF);
-
-            int32x4_t inGainLoVec = vmull_high_s16(inLoVec2, gainVec);
-            int32x4_t inGainHiVec = vmull_high_s16(inVec, gainVec);
-            int32x4_t addVecLo = vaddq_s32(outX7fffLoVec, inGainLoVec);
-            int32x4_t addVecHi = vaddq_s32(outX7fffHiVec, inGainHiVec);
-            addVecHi = vaddq_s32(addVecHi, x4000Vec);
-            addVecLo = vaddq_s32(addVecLo, x4000Vec);
-            int32x4_t shiftVecHi = vshrq_n_s32(addVecHi, 15);
-            int32x4_t shiftVecLo = vshrq_n_s32(addVecLo, 15);
-            int16x4_t shiftedNarrowHiVec = vqmovn_s32(shiftVecHi);
-            int16x4_t shiftedNarrowLoVec = vqmovn_s32(shiftVecLo);
-            vst1_s16(out, shiftedNarrowLoVec);
-            out += 4;
-            vst1_s16(out, shiftedNarrowHiVec);
-            // int16x8_t finalVec = vcombine_s16(shiftedNarrowLoVec, shiftedNarrowHiVec);
-            // vst1q_s16(out, finalVec);
-            out += 4;
-            in += 8;
-
-            nbytes -= 8 * sizeof(int16_t);
-        }
-    }
-}
-#endif
-
-#if 0
-static const ALIGN_ASSET(32) int16_t x7fff[16] = { 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF,};
-static const ALIGN_ASSET(32) int32_t x4000[8] = { 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0x4000};
 #pragma GCC target("avx2")
-// AVX2 version of the SSE2 implementation above. AVX2 wasn't released until 2014 and I don't have a good way of checking for it at compile time.
+// AVX2 version of the SSE2 implementation above. AVX2 code generation can be forced here because Mixer_Init verifies
+// at runtime that the CPU supports AVX2 before selecting this implementation.
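+// ("#pragma GCC target" asks GCC and Clang to emit AVX2 instructions for the functions that follow without building
+// the whole file with -mavx2. MSVC compiles AVX2 intrinsics without any pragma or flag, so it does not need this.)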
 void aMixImpl256(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr) {
     int nbytes = ROUND_UP_32(ROUND_DOWN_16(count << 4));
     int16_t* in = BUF_S16(in_addr);
@@ -770,6 +720,7 @@ void aMixImpl256(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_ad
             out += 16;
             nbytes -= 16 * sizeof(int16_t);
         }
+        return;
     }
     // Load constants into vectors from aligned memory.
     __m256i x7fffVec = _mm256_load_si256((__m256i*)x7fff);
@@ -817,3 +768,60 @@ void aMixImpl256(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_ad
     }
 }
 #endif
+
+#if defined(__ARM_NEON)
+#include <arm_neon.h>
+static const int32_t x4000Arr[4] = { 0x4000, 0x4000, 0x4000, 0x4000 };
+void aMixImplNEON(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr) {
+    int nbytes = ROUND_UP_32(ROUND_DOWN_16(count << 4));
+    int16_t* in = BUF_S16(in_addr);
+    int16_t* out = BUF_S16(out_addr);
+
+    if (gain == -0x8000) {
+        while (nbytes > 0) {
+            for (unsigned int i = 0; i < 2; i++) {
+                int16x8_t outVec = vld1q_s16(out);
+                int16x8_t inVec = vld1q_s16(in);
+                int16x8_t subVec = vqsubq_s16(outVec, inVec);
+                vst1q_s16(out, subVec);
+                nbytes -= 8 * sizeof(int16_t);
+                out += 8;
+                in += 8;
+            }
+        }
+        return; // Nothing further to do; matches the early return in the SSE2 and AVX2 versions.
+    }
+    int16x8_t gainVec = vdupq_n_s16(gain);
+    int32x4_t x4000Vec = vld1q_s32(x4000Arr);
+    while (nbytes > 0) {
+        for (unsigned int i = 0; i < 2; i++) {
+            int16x8_t outVec = vld1q_s16(out);
+            int16x8_t inVec = vld1q_s16(in);
+            int16x4_t outLoVec = vget_low_s16(outVec);
+            int16x8_t outLoVec2 = vcombine_s16(outLoVec, outLoVec);
+            int16x4_t inLoVec = vget_low_s16(inVec);
+            int16x8_t inLoVec2 = vcombine_s16(inLoVec, inLoVec);
+            int32x4_t outX7fffHiVec = vmull_high_n_s16(outVec, 0x7FFF);
+            int32x4_t outX7fffLoVec = vmull_high_n_s16(outLoVec2, 0x7FFF);
+
+            int32x4_t inGainLoVec = vmull_high_s16(inLoVec2, gainVec);
+            int32x4_t inGainHiVec = vmull_high_s16(inVec, gainVec);
+            int32x4_t addVecLo = vaddq_s32(outX7fffLoVec, inGainLoVec);
+            int32x4_t addVecHi = vaddq_s32(outX7fffHiVec, inGainHiVec);
+            addVecHi = vaddq_s32(addVecHi, x4000Vec);
+            addVecLo = vaddq_s32(addVecLo, x4000Vec);
+            int32x4_t shiftVecHi = vshrq_n_s32(addVecHi, 15);
+            int32x4_t shiftVecLo = vshrq_n_s32(addVecLo, 15);
+            int16x4_t shiftedNarrowHiVec = vqmovn_s32(shiftVecHi);
+            int16x4_t shiftedNarrowLoVec = vqmovn_s32(shiftVecLo);
+            vst1_s16(out, shiftedNarrowLoVec);
+            out += 4;
+            vst1_s16(out, shiftedNarrowHiVec);
+            out += 4;
+            in += 8;
+
+            nbytes -= 8 * sizeof(int16_t);
+        }
+    }
+}
+#endif
diff --git a/mm/src/audio/lib/load.c b/mm/src/audio/lib/load.c
index 550e8e82f9..62358d423e 100644
--- a/mm/src/audio/lib/load.c
+++ b/mm/src/audio/lib/load.c
@@ -1142,6 +1142,8 @@ extern AudioContext gAudioCtx;
 // #end region
 #include "resourcebridge.h"
 
+void Mixer_Init(void);
+
 void AudioLoad_Init(void* heap, size_t heapSize) {
     s32 pad1[9];
     s32 numFonts;
@@ -1150,6 +1152,7 @@ void AudioLoad_Init(void* heap, size_t heapSize) {
     void* addr;
     s32 i;
     s32 j;
+    Mixer_Init();
 
     gAudioCustomUpdateFunction = NULL;
     gAudioCustomReverbFunction = NULL;