32 changes: 32 additions & 0 deletions mm/2s2h/CpuHelpers.c
@@ -0,0 +1,32 @@
// This file is only useful on x86(_64) systems. There is no standard way to detect CPU features,
// so each compiler/platform combination needs its own path. https://xkcd.com/927/ comes to mind...
#include "CpuHelpers.h"
#if defined(X86_CPU)

#if defined(__unix__) || defined(__APPLE__) // Apple's compilers do not define __unix__
#include <cpuid.h>
#elif defined(_WIN32)
// From the GCC cpuid.h header
/* Extended Features Leaf (%eax == 7, %ecx == 0) */
/* %ebx */
#define bit_AVX2 (1 << 5)
#include <intrin.h>
#endif

// Other checks can be added as needed.

int Cpu_SupportsAVX2(void) {
#ifdef _WIN32
    int cpuidData[4];
    // Querying a leaf above the CPU's maximum returns stale data rather than zeros on some CPUs,
    // so check the maximum supported leaf first.
    __cpuid(cpuidData, 0);
    if (cpuidData[0] < 7) {
        return 0;
    }
    // Leaf 7 requires the subleaf (ECX) to be 0; __cpuidex sets it explicitly.
    __cpuidex(cpuidData, 7, 0);
#else
    unsigned int cpuidData[4] = { 0, 0, 0, 0 };
    // Leaf 7 requires the subleaf (ECX) to be 0. __get_cpuid leaves ECX unspecified, so use
    // __get_cpuid_count, which also returns 0 if the leaf is unsupported.
    if (!__get_cpuid_count(7, 0, &cpuidData[0], &cpuidData[1], &cpuidData[2], &cpuidData[3])) {
        return 0;
    }
#endif
    if (cpuidData[1] & bit_AVX2) {
        return 1;
    }
    return 0;
}

#endif
27 changes: 27 additions & 0 deletions mm/2s2h/CpuHelpers.h
@@ -0,0 +1,27 @@
#ifndef CPU_HELPERS_H
#define CPU_HELPERS_H

// This file is only useful on x86(_64) systems right now. There is no standard way to detect CPU
// architectures or their features, so each platform needs its own check.
// https://xkcd.com/927/ comes to mind...


#if defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(__i386__) || defined(__i386) || defined(_M_IX86)
#define X86_CPU

#ifdef __cplusplus
extern "C" {
#endif

// Checks if the current CPU supports AVX2 instructions. This function always executes CPUID, so
// callers should cache its result.
int Cpu_SupportsAVX2(void);


#ifdef __cplusplus
}
#endif


#endif

#endif
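Since the header asks callers to cache the result, a minimal caching wrapper could look like the sketch below (hypothetical names, not part of this PR; mixer.c below instead caches the answer in a function pointer set by Mixer_Init):

// Hypothetical example: run CPUID once and reuse the answer on later calls.
static int sHasAVX2 = -1; // -1 means "not queried yet"

int Cpu_HasAVX2Cached(void) {
    if (sHasAVX2 < 0) {
        sHasAVX2 = Cpu_SupportsAVX2();
    }
    return sHasAVX2;
}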
182 changes: 97 additions & 85 deletions mm/2s2h/mixer.c
@@ -6,6 +6,8 @@
#include <stdio.h>

#include "mixer.h"
#include "CpuHelpers.h"

#ifndef __clang__
#pragma GCC optimize("unroll-loops")
#endif
@@ -69,8 +71,13 @@ static int16_t resample_table[64][4] = {
{ 0xffdf, 0x0d46, 0x66ad, 0x0c39 }
};

static void aMixImplRef(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr);
// Only declare the variants that are defined for this target, to avoid "declared 'static' but
// never defined" warnings on the others.
#if defined(X86_CPU)
static void aMixImplSSE2(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr);
static void aMixImpl256(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr);
#elif defined(__ARM_NEON)
static void aMixImplNEON(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr);
#endif

typedef void (*aMixFunc)(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr);
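// The aMix implementation in use on this CPU, selected once by Mixer_Init().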
static aMixFunc sMixFunc;

static inline int16_t clamp16(int32_t v) {
if (v < -0x8000) {
@@ -90,6 +97,26 @@ static inline int32_t clamp32(int64_t v) {
return (int32_t)v;
}

// Selects which implementation of the mixer functions to use. If you want to read a reference
// implementation, look at the function whose name ends in "ImplRef". If you are debugging a crash
// or other issue, set a breakpoint in the function that calls through the function pointer.
void Mixer_Init(void) {
#if defined(X86_CPU)
    if (Cpu_SupportsAVX2()) {
        sMixFunc = aMixImpl256;
        return;
    }
    // If AVX2 isn't supported, fall back to the SSE2 implementation. SSE2 dates back to the early
    // 2000s and is part of the x86-64 baseline, so we can assume it is present.
    sMixFunc = aMixImplSSE2;
#elif defined(__ARM_NEON)
    sMixFunc = aMixImplNEON;
#else
    sMixFunc = aMixImplRef;
#endif
}
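A possible hardening, not part of this change: initializing the dispatch pointer to the reference implementation would make a missed Mixer_Init() call fall back to the slow path instead of calling through a NULL pointer.

// Hypothetical alternative to the plain declaration above.
static aMixFunc sMixFunc = aMixImplRef;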

void aClearBufferImpl(uint16_t addr, int nbytes) {
nbytes = ROUND_UP_16(nbytes);
memset(BUF_U8(addr), 0, nbytes);
@@ -357,13 +384,7 @@ static void aMixImplRef(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr) {
}
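The body of aMixImplRef is collapsed in this diff. Reconstructed from the NEON and AVX2 code below (a sketch under that assumption, not the verbatim source), the per-sample operation that every implementation computes is:

// MixOneSample is a hypothetical name for illustration; clamp16 is defined above.
static inline int16_t MixOneSample(int16_t out, int16_t in, int16_t gain) {
    if (gain == -0x8000) {
        // -0x8000 is -1.0 in Q15 and has no positive counterpart; it selects a saturating subtract.
        return clamp16((int32_t)out - (int32_t)in);
    }
    // Q15 multiply-accumulate with rounding: scale out by ~1.0, add in * gain, round, shift back.
    return clamp16((out * 0x7FFF + in * gain + 0x4000) >> 15);
}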

void aMixImpl(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr) {
#if defined(__SSE2__) || defined(_M_AMD64)
aMixImplSSE2(count, gain, in_addr, out_addr);
#elif defined(__ARM_NEON)
aMixImplNEON(count, gain, in_addr, out_addr);
#else
aMixImplRef(count, gain, in_addr, out_addr);
#endif
sMixFunc(count, gain, in_addr, out_addr);
}

void aS8DecImpl(uint8_t flags, ADPCM_STATE state) {
@@ -607,25 +628,17 @@ void aUnkCmd19Impl(uint8_t f, uint16_t count, uint16_t out_addr, uint16_t in_addr) {
// SIMD operations expect aligned data
#include "align_asset_macro.h"

#if defined(X86_CPU)
#include <immintrin.h>

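// 32-byte alignment lets both the 128-bit loads in aMixImplSSE2 and the 256-bit loads in
// aMixImpl256 use aligned load instructions on the same tables.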
static const ALIGN_ASSET(32) int16_t x7fff[16] = {
    0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF,
    0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF,
};
static const ALIGN_ASSET(32) int32_t x4000[8] = {
    0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0x4000,
};

static void aMixImplSSE2(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr) {
int nbytes = ROUND_UP_32(ROUND_DOWN_16(count << 4));
int16_t* in = BUF_S16(in_addr);
int16_t* out = BUF_S16(out_addr);

if (gain == -0x8000) {
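// A gain of -0x8000 is -1.0 in Q15 and has no positive counterpart, so it is treated as a
// saturating "subtract in from out" instead of a multiply.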
while (nbytes > 0) {
for (unsigned int i = 0; i < 2; i++) {
@@ -638,6 +651,7 @@ static void aMixImplSSE2(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr) {
out += 8;
}
}
return;
}
// Load constants into vectors from aligned memory.
__m128i x7fffVec = _mm_load_si128((__m128i*)x7fff);
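// As in the AVX2 and NEON paths, these constants implement out * 0x7FFF + in * gain + 0x4000,
// which is then shifted right by 15 and saturated back to 16 bits.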
@@ -686,74 +700,10 @@ static void aMixImplSSE2(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr) {
}
}
}

#pragma GCC target("avx2")
// AVX2 version of the SSE2 implementation above. Forcing AVX2 code generation here is safe because
// Mixer_Init only selects this function after confirming AVX2 support at runtime.
void aMixImpl256(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr) {
int nbytes = ROUND_UP_32(ROUND_DOWN_16(count << 4));
int16_t* in = BUF_S16(in_addr);
@@ -770,6 +720,7 @@ void aMixImpl256(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr) {
out += 16;
nbytes -= 16 * sizeof(int16_t);
}
return;
}
// Load constants into vectors from aligned memory.
__m256i x7fffVec = _mm256_load_si256((__m256i*)x7fff);
@@ -817,3 +768,64 @@ void aMixImpl256(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr) {
}
}
#endif

#if defined(__ARM_NEON)
#include <arm_neon.h>
static const int32_t x4000Arr[4] = { 0x4000, 0x4000, 0x4000, 0x4000 };
void aMixImplNEON(uint16_t count, int16_t gain, uint16_t in_addr, uint16_t out_addr) {
int nbytes = ROUND_UP_32(ROUND_DOWN_16(count << 4));
int16_t* in = BUF_S16(in_addr);
int16_t* out = BUF_S16(out_addr);

if (gain == -0x8000) {
while (nbytes > 0) {
for (unsigned int i = 0; i < 2; i++) {
int16x8_t outVec = vld1q_s16(out);
int16x8_t inVec = vld1q_s16(in);
int16x8_t subVec = vqsubq_s16(outVec, inVec);
vst1q_s16(out, subVec);
nbytes -= 8 * sizeof(int16_t);
out += 8;
in += 8;
}
}
// Match the early return in the SSE2 and AVX2 gain == -0x8000 paths.
return;
}
int16x8_t gainVec = vdupq_n_s16(gain);
int32x4_t x4000Vec = vld1q_s32(x4000Arr);
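// Strategy: widen each 8-lane vector into two 4-lane 32-bit halves, compute
// out * 0x7FFF + in * gain + 0x4000 with widening multiplies, then shift right by 15 and
// saturate-narrow back to 16-bit lanes.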
while (nbytes > 0) {
for (unsigned int i = 0; i < 2; i++) {
int16x8_t outVec = vld1q_s16(out);
int16x8_t inVec = vld1q_s16(in);
int16x4_t outLoVec = vget_low_s16(outVec);
int16x8_t outLoVec2 = vcombine_s16(outLoVec, outLoVec);
int16x4_t inLoVec = vget_low_s16(inVec);
int16x8_t inLoVec2 = vcombine_s16(inLoVec, inLoVec);
int32x4_t outX7fffHiVec = vmull_high_n_s16(outVec, 0x7FFF);
int32x4_t outX7fffLoVec = vmull_high_n_s16(outLoVec2, 0x7FFF);

int32x4_t inGainLoVec = vmull_high_s16(inLoVec2, gainVec);
int32x4_t inGainHiVec = vmull_high_s16(inVec, gainVec);
int32x4_t addVecLo = vaddq_s32(outX7fffLoVec, inGainLoVec);
int32x4_t addVecHi = vaddq_s32(outX7fffHiVec, inGainHiVec);
addVecHi = vaddq_s32(addVecHi, x4000Vec);
addVecLo = vaddq_s32(addVecLo, x4000Vec);
int32x4_t shiftVecHi = vshrq_n_s32(addVecHi, 15);
int32x4_t shiftVecLo = vshrq_n_s32(addVecLo, 15);
int16x4_t shiftedNarrowHiVec = vqmovn_s32(shiftVecHi);
int16x4_t shiftedNarrowLoVec = vqmovn_s32(shiftVecLo);
vst1_s16(out, shiftedNarrowLoVec);
out += 4;
vst1_s16(out, shiftedNarrowHiVec);
out += 4;
in += 8;

nbytes -= 8 * sizeof(int16_t);
}
}
}
#endif
3 changes: 3 additions & 0 deletions mm/src/audio/lib/load.c
@@ -1142,6 +1142,8 @@ extern AudioContext gAudioCtx;
// #end region
#include "resourcebridge.h"

void Mixer_Init(void);

void AudioLoad_Init(void* heap, size_t heapSize) {
s32 pad1[9];
s32 numFonts;
@@ -1150,6 +1152,7 @@ void AudioLoad_Init(void* heap, size_t heapSize) {
void* addr;
s32 i;
s32 j;
Mixer_Init();

gAudioCustomUpdateFunction = NULL;
gAudioCustomReverbFunction = NULL;