diff --git a/internal/silk/decoder.go b/internal/silk/decoder.go index f1f9842..24a3d6e 100644 --- a/internal/silk/decoder.go +++ b/internal/silk/decoder.go @@ -40,6 +40,10 @@ type Decoder struct { // https://www.rfc-editor.org/rfc/rfc6716#section-4.2.7.9.1 finalOutValues []float32 + previousGainQ16 int32 + sLPCQ14Buf [maxSilkLPCOrder]int32 + outBuf [maxSilkOutBufferLength]int16 + // n0Q15 are the LSF coefficients decoded for the prior frame // see normalizeLSFInterpolation n0Q15 []int16 @@ -56,14 +60,16 @@ type Decoder struct { // NewDecoder creates a new Silk Decoder. func NewDecoder() Decoder { return Decoder{ - sideDecoder: newChannelDecoder(), - finalOutValues: make([]float32, 306), + sideDecoder: newChannelDecoder(), + finalOutValues: make([]float32, 306), + previousGainQ16: 65536, } } func newChannelDecoder() *Decoder { return &Decoder{ - finalOutValues: make([]float32, 306), + finalOutValues: make([]float32, 306), + previousGainQ16: 65536, } } @@ -74,6 +80,9 @@ func (d *Decoder) resetPredictionState() { d.previousLogGain = 10 d.previousFrameLPCValues = nil clear(d.finalOutValues) + d.previousGainQ16 = 65536 + clear(d.sLPCQ14Buf[:]) + clear(d.outBuf[:]) d.n0Q15 = nil } @@ -1780,7 +1789,7 @@ func (d *Decoder) samplesInSubframe(bandwidth Bandwidth) int { // https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.9.1 // -//nolint:gocognit,cyclop +//nolint:gocognit,cyclop,unused // Kept as the RFC-prose reference path while fixed-point reconstruction is evaluated. func (d *Decoder) ltpSynthesis( out []float32, bQ7 [][]int8, @@ -2002,6 +2011,8 @@ func (d *Decoder) lpcSynthesis( // of -1.0 to 1.0. // // https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.9 +// +//nolint:unused // Kept as the RFC-prose reference path while fixed-point reconstruction is evaluated. func (d *Decoder) silkFrameReconstruction( signalType frameSignalType, bandwidth Bandwidth, subframeCount int, @@ -2078,6 +2089,211 @@ func (d *Decoder) silkFrameReconstruction( } } +func silkLTPMemLength(bandwidth Bandwidth) int { + switch bandwidth { + case BandwidthNarrowband: + return 160 + case BandwidthMediumband: + return 240 + case BandwidthWideband: + return 320 + } + + return 0 +} + +//nolint:cyclop // Mirrors silk_decode_core() setup from the RFC 6716 C reference. +func (d *Decoder) silkFrameReconstructionFixed( + signalType frameSignalType, bandwidth Bandwidth, + subframeCount int, + dLPC int, + bQ7 [][]int8, + pitchLags []int, + eQ23 []int32, + ltpScaleQ14 float32, + wQ2 int16, + aQ12 [][]float32, + gainQ16, out []float32, +) { + frameLength := len(out) + if frameLength == 0 { + return + } + + predCoefQ12 := make([][maxSilkLPCOrder]int16, len(aQ12)) + for setIndex := range aQ12 { + for i := range min(dLPC, len(aQ12[setIndex])) { + predCoefQ12[setIndex][i] = int16(aQ12[setIndex][i]) //nolint:gosec // G115 + } + } + + ltpCoefQ14 := make([][silkLTPOrder]int16, subframeCount) + if signalType == frameSignalTypeVoiced { + for subframe := range min(subframeCount, len(bQ7)) { + for i := range min(silkLTPOrder, len(bQ7[subframe])) { + ltpCoefQ14[subframe][i] = int16(bQ7[subframe][i]) << 7 + } + } + } + + gainsQ16 := make([]int32, len(gainQ16)) + for i, gain := range gainQ16 { + gainsQ16[i] = int32(gain) //nolint:gosec // G115 + } + + excQ14 := make([]int32, len(eQ23)) + for i, excitation := range eQ23 { + excQ14[i] = excitation << 6 + } + + xq := make([]int16, frameLength) + d.decodeCoreFixed( + xq, + excQ14, + signalType, + bandwidth, + dLPC, + predCoefQ12, + ltpCoefQ14, + pitchLags, + gainsQ16, + int32(ltpScaleQ14), + wQ2, + ) + + for i, sample := range xq { + out[i] = float32(sample) / 32768.0 + } + + ltpMemLength := silkLTPMemLength(bandwidth) + mvLen := ltpMemLength - frameLength + if mvLen > 0 { + copy(d.outBuf[:mvLen], d.outBuf[frameLength:frameLength+mvLen]) + copy(d.outBuf[mvLen:ltpMemLength], xq) + } else { + copy(d.outBuf[:ltpMemLength], xq[frameLength-ltpMemLength:]) + } +} + +// decodeCoreFixed mirrors the RFC 6716 fixed-point silk_decode_core() loop. +// +//nolint:cyclop,gocognit,gosec,nestif // Directly mirrors the C reference's fixed-point decode_core loop. +func (d *Decoder) decodeCoreFixed( + xq []int16, + excQ14 []int32, + signalType frameSignalType, + bandwidth Bandwidth, + dLPC int, + predCoefQ12 [][maxSilkLPCOrder]int16, + ltpCoefQ14 [][silkLTPOrder]int16, + pitchLags []int, + gainsQ16 []int32, + ltpScaleQ14 int32, + wQ2 int16, +) { + n := d.samplesInSubframe(bandwidth) + ltpMemLength := silkLTPMemLength(bandwidth) + interpolatedNLSF := wQ2 < 4 + sLPCQ14 := make([]int32, maxSilkSubframeLength+maxSilkLPCOrder) + sLTPQ15 := make([]int32, 2*maxSilkFrameLength) + sLTP := make([]int16, maxSilkFrameLength) + resQ14 := make([]int32, maxSilkSubframeLength) + copy(sLPCQ14[:maxSilkLPCOrder], d.sLPCQ14Buf[:]) + sLTPBufIndex := ltpMemLength + if d.previousGainQ16 == 0 { + d.previousGainQ16 = 65536 + } + + for subframe := 0; subframe*n < len(xq); subframe++ { + subframeOffset := subframe * n + coefIndex := subframe >> 1 + if coefIndex >= len(predCoefQ12) { + coefIndex = len(predCoefQ12) - 1 + } + aQ12 := predCoefQ12[coefIndex][:] + gainQ16 := gainsQ16[min(subframe, len(gainsQ16)-1)] + gainQ10 := gainQ16 >> 6 + invGainQ31 := inverse32VarQ(gainQ16, 47) + gainAdjQ16 := int32(1 << 16) + if gainQ16 != d.previousGainQ16 { + gainAdjQ16 = div32VarQ(d.previousGainQ16, gainQ16, 16) + for i := range maxSilkLPCOrder { + sLPCQ14[i] = smulww(gainAdjQ16, sLPCQ14[i]) + } + } + d.previousGainQ16 = gainQ16 + + if signalType == frameSignalTypeVoiced { + lag := pitchLags[min(subframe, len(pitchLags)-1)] + if subframe == 0 || (subframe == 2 && interpolatedNLSF) { + startIndex := max(0, ltpMemLength-lag-dLPC-silkLTPOrder/2) + if subframe == 2 { + copy(d.outBuf[ltpMemLength:], xq[:2*n]) + } + lpcAnalysisFilterFixed( + sLTP[startIndex:], + d.outBuf[startIndex+subframe*n:], + aQ12, + ltpMemLength-startIndex, + dLPC, + ) + if subframe == 0 { + invGainQ31 = smulwb(invGainQ31, ltpScaleQ14) << 2 + } + for i := 0; i < lag+silkLTPOrder/2; i++ { + sLTPQ15[sLTPBufIndex-i-1] = smulwb(invGainQ31, int32(sLTP[ltpMemLength-i-1])) + } + } else if gainAdjQ16 != 1<<16 { + for i := 0; i < lag+silkLTPOrder/2; i++ { + sLTPQ15[sLTPBufIndex-i-1] = smulww(gainAdjQ16, sLTPQ15[sLTPBufIndex-i-1]) + } + } + + predLagIndex := sLTPBufIndex - lag + silkLTPOrder/2 + for i := range n { + ltpPredQ13 := int32(2) + ltpPredQ13 = smlaWB(ltpPredQ13, sLTPQ15[predLagIndex], int32(ltpCoefQ14[subframe][0])) + ltpPredQ13 = smlaWB(ltpPredQ13, sLTPQ15[predLagIndex-1], int32(ltpCoefQ14[subframe][1])) + ltpPredQ13 = smlaWB(ltpPredQ13, sLTPQ15[predLagIndex-2], int32(ltpCoefQ14[subframe][2])) + ltpPredQ13 = smlaWB(ltpPredQ13, sLTPQ15[predLagIndex-3], int32(ltpCoefQ14[subframe][3])) + ltpPredQ13 = smlaWB(ltpPredQ13, sLTPQ15[predLagIndex-4], int32(ltpCoefQ14[subframe][4])) + predLagIndex++ + + resQ14[i] = excQ14[subframeOffset+i] + (ltpPredQ13 << 1) + sLTPQ15[sLTPBufIndex] = resQ14[i] << 1 + sLTPBufIndex++ + } + } else { + copy(resQ14[:n], excQ14[subframeOffset:subframeOffset+n]) + } + + for i := range n { + lpcPredQ10 := int32(dLPC >> 1) + for k := range dLPC { + lpcPredQ10 = smlaWB(lpcPredQ10, sLPCQ14[maxSilkLPCOrder+i-k-1], int32(aQ12[k])) + } + sLPCQ14[maxSilkLPCOrder+i] = resQ14[i] + (lpcPredQ10 << 4) + xq[subframeOffset+i] = saturate16(rshiftRound32(smulww(sLPCQ14[maxSilkLPCOrder+i], gainQ10), 8)) + } + + copy(sLPCQ14[:maxSilkLPCOrder], sLPCQ14[n:n+maxSilkLPCOrder]) + } + + copy(d.sLPCQ14Buf[:], sLPCQ14[:maxSilkLPCOrder]) +} + +func lpcAnalysisFilterFixed(out, in []int16, b []int16, length, order int) { + for ix := order; ix < length; ix++ { + out32Q12 := smulbb(int32(in[ix-1]), int32(b[0])) + for j := 1; j < order; j++ { + out32Q12 = smlaBB(out32Q12, int32(in[ix-1-j]), int32(b[j])) + } + out32Q12 = (int32(in[ix]) << 12) - out32Q12 + out[ix] = saturate16(rshiftRound32(out32Q12, 12)) + } + clear(out[:min(order, len(out))]) +} + func (d *Decoder) decodeFrame( out []float32, voiceActivityDetected bool, @@ -2121,7 +2337,7 @@ func (d *Decoder) decodeFrame( aQ12 = d.generateAQ12(nlsfQ15, bandwidth, aQ12) // https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.6.1 - lagMax, pitchLags := d.decodePitchLags(signalType, bandwidth, nanoseconds, isFirstSilkFrameInOpusFrame) + _, pitchLags := d.decodePitchLags(signalType, bandwidth, nanoseconds, isFirstSilkFrameInOpusFrame) // https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.6.2 bQ7 := d.decodeLTPFilterCoefficients(signalType, subframeCount) @@ -2145,11 +2361,10 @@ func (d *Decoder) decodeFrame( eQ23 := d.decodeExcitation(signalType, quantizationOffsetType, lcgSeed, pulsecounts, lsbcounts) // https://www.rfc-editor.org/rfc/rfc6716.html#section-4.2.7.9 - d.silkFrameReconstruction( + d.silkFrameReconstructionFixed( signalType, bandwidth, subframeCount, dLPC, - lagMax, bQ7, pitchLags, eQ23, @@ -2231,6 +2446,19 @@ func (d *Decoder) stereoPhaseOneSampleCount(bandwidth Bandwidth) int { return 0 } +func (d *Decoder) stereoPhaseOneDenominatorQ16(bandwidth Bandwidth) int32 { + switch bandwidth { + case BandwidthNarrowband: + return 1024 // (1 << 16) / 64 + case BandwidthMediumband: + return 682 // (1 << 16) / 96 + case BandwidthWideband: + return 512 // (1 << 16) / 128 + } + + return 0 +} + func (d *Decoder) delayMid(out []float32) { if len(out) == 0 { return @@ -2261,37 +2489,61 @@ func (d *Decoder) delayMono(out []float32) { d.wasStereo = false } +func silkSampleToPCM16(sample float32) int16 { + switch { + case sample >= 32767.0/32768.0: + return math.MaxInt16 + case sample <= -1: + return math.MinInt16 + default: + return int16(sample * 32768) //nolint:gosec // Samples come from the fixed-point SILK core. + } +} + // RFC 6716 Section 4.2.8 converts mid-side stereo to left-right stereo. +// Keep this stage fixed-point too: stereo_MS_to_LR.c interpolates Q13 +// predictors and rounds the predicted side sample before the final L/R sum. func (d *Decoder) stereoUnmix(mid, side, out []float32, w0Q13, w1Q13 int32, bandwidth Bandwidth) { phaseOneSampleCount := d.stereoPhaseOneSampleCount(bandwidth) - previousW0Q13 := d.previousStereoWeights[0] - previousW1Q13 := d.previousStereoWeights[1] - midPrev2 := d.previousMidValues[0] - midPrev1 := d.previousMidValues[1] - sidePrev := d.previousSideValue + pred0Q13 := d.previousStereoWeights[0] + pred1Q13 := d.previousStereoWeights[1] + denomQ16 := d.stereoPhaseOneDenominatorQ16(bandwidth) + delta0Q13 := rshiftRound32(smulbb(w0Q13-pred0Q13, denomQ16), 16) + delta1Q13 := rshiftRound32(smulbb(w1Q13-pred1Q13, denomQ16), 16) + midPrev2Q0 := silkSampleToPCM16(d.previousMidValues[0]) + midPrev1Q0 := silkSampleToPCM16(d.previousMidValues[1]) + sidePrevQ0 := silkSampleToPCM16(d.previousSideValue) for i := range mid { - interpSample := min(i, phaseOneSampleCount) + if i < phaseOneSampleCount { + pred0Q13 += delta0Q13 + pred1Q13 += delta1Q13 + } else { + pred0Q13 = w0Q13 + pred1Q13 = w1Q13 + } - w0 := float32(previousW0Q13)/8192.0 + - float32(interpSample)*float32(w0Q13-previousW0Q13)/(8192.0*float32(phaseOneSampleCount)) - w1 := float32(previousW1Q13)/8192.0 + - float32(interpSample)*float32(w1Q13-previousW1Q13)/(8192.0*float32(phaseOneSampleCount)) - p0 := (midPrev2 + 2*midPrev1 + mid[i]) / 4.0 + midQ0 := silkSampleToPCM16(mid[i]) + sideQ0 := silkSampleToPCM16(side[i]) + predictedSideQ8 := int32(sidePrevQ0) << 8 + sumQ11 := (int32(midPrev2Q0) + 2*int32(midPrev1Q0) + int32(midQ0)) << 9 + predictedSideQ8 = smlaWB(predictedSideQ8, sumQ11, pred0Q13) + predictedSideQ8 = smlaWB(predictedSideQ8, int32(midPrev1Q0)<<11, pred1Q13) + predictedSideQ0 := saturate16(rshiftRound32(predictedSideQ8, 8)) - out[i*2] = clampNegativeOneToOne((1+w1)*midPrev1 + sidePrev + w0*p0) - out[i*2+1] = clampNegativeOneToOne((1-w1)*midPrev1 - sidePrev - w0*p0) + out[i*2] = float32(saturate16(int32(midPrev1Q0)+int32(predictedSideQ0))) / 32768 + out[i*2+1] = float32(saturate16(int32(midPrev1Q0)-int32(predictedSideQ0))) / 32768 - midPrev2 = midPrev1 - midPrev1 = mid[i] - sidePrev = side[i] + midPrev2Q0 = midPrev1Q0 + midPrev1Q0 = midQ0 + sidePrevQ0 = sideQ0 } d.previousStereoWeights[0] = w0Q13 d.previousStereoWeights[1] = w1Q13 - d.previousMidValues[0] = midPrev2 - d.previousMidValues[1] = midPrev1 - d.previousSideValue = sidePrev + d.previousMidValues[0] = float32(midPrev2Q0) / 32768 + d.previousMidValues[1] = float32(midPrev1Q0) / 32768 + d.previousSideValue = float32(sidePrevQ0) / 32768 d.wasStereo = true } diff --git a/internal/silk/decoder_test.go b/internal/silk/decoder_test.go index 2ef932d..53cd96e 100644 --- a/internal/silk/decoder_test.go +++ b/internal/silk/decoder_test.go @@ -112,8 +112,8 @@ func TestStereoUnmix(t *testing.T) { d.stereoUnmix(mid, side, out, 0, 0, BandwidthNarrowband) - assert.Equal(t, []float32{0, 0, 1, 1}, out) - assert.Equal(t, [2]float32{1, 0}, d.previousMidValues) + assert.Equal(t, []float32{0, 0, 32767.0 / 32768.0, 32767.0 / 32768.0}, out) + assert.Equal(t, [2]float32{32767.0 / 32768.0, 0}, d.previousMidValues) assert.Equal(t, float32(0), d.previousSideValue) assert.True(t, d.wasStereo) } @@ -774,9 +774,11 @@ func TestDecode(t *testing.T) { if i > 0 { expectedSample = expectedOut[i-1] } - // The decoder keeps this stage in float, so cross-frame LPC state - // updates accumulate a few extra LSBs versus the old fixture. - assert.InDelta(t, expectedSample, out[i], 4*floatEqualityThreshold) + // SILK reconstruction mirrors the fixed-point C decoder and emits + // PCM16-stepped samples. The fixture was generated from the earlier + // RFC-prose float reconstruction, so allow one PCM16 quantization + // interval while still catching larger synthesis regressions. + assert.InDelta(t, expectedSample, out[i], 1.0/32768.0) } previousSample = expectedOut[len(expectedOut)-1] diff --git a/internal/silk/silk.go b/internal/silk/silk.go index a706729..1839d3d 100644 --- a/internal/silk/silk.go +++ b/internal/silk/silk.go @@ -22,6 +22,12 @@ const ( pulsecountLargestPartitionSize = 16 + maxSilkFrameLength = 320 + maxSilkSubframeLength = 80 + maxSilkOutBufferLength = maxSilkFrameLength + 2*maxSilkSubframeLength + maxSilkLPCOrder = 16 + silkLTPOrder = 5 + nanoseconds10Ms = 10000000 nanoseconds20Ms = 20000000 nanoseconds40Ms = 40000000 @@ -200,12 +206,82 @@ func smulwb(a, b int32) int32 { return int32((int64(a) * int64(int16(b))) >> 16) //nolint:gosec // G115 } +func smlaWB(a, b, c int32) int32 { + return a + smulwb(b, c) +} + +func smulww(a, b int32) int32 { + return smulwb(a, b) + a*rshiftRound32(b, 16) +} + +func smulbb(a, b int32) int32 { + return int32(int16(a)) * int32(int16(b)) //nolint:gosec // G115: mirrors silk_SMULBB low-16-bit multiply. +} + +func smlaBB(a, b, c int32) int32 { + return a + smulbb(b, c) +} + // smlaWW mirrors silk_SMLAWW() from the RFC 6716 C macros. The inverse helper // uses it for the Newton-Raphson refinement step in silk_INVERSE32_varQ(). func smlaWW(a, b, c int32) int32 { return a + smulwb(b, c) + b*rshiftRound32(c, 16) } +func lshiftSat32(v int32, shift int) int32 { + if shift <= 0 { + return v + } + low := int32(math.MinInt32 >> shift) + high := int32(math.MaxInt32 >> shift) + v = clamp(low, v, high) + + return v << shift +} + +// div32VarQ mirrors silk_DIV32_varQ() from the RFC 6716 C reference +// (Inlines.h). The fixed-point SILK core uses this approximate normalized +// division when rescaling LPC/LTP state across subframe gain changes. +func div32VarQ(a32, b32 int32, qRes int) int32 { + if b32 == 0 { + if a32 < 0 { + return math.MinInt32 + } + + return math.MaxInt32 + } + + aHeadrm := clz32(absInt32(a32)) - 1 + a32Nrm := a32 << aHeadrm + bHeadrm := clz32(absInt32(b32)) - 1 + b32Nrm := b32 << bHeadrm + b32Inv := (math.MaxInt32 >> 2) / (b32Nrm >> 16) + result := smulwb(a32Nrm, b32Inv) + a32Nrm -= smmul(b32Nrm, result) << 3 + result = smlaWB(result, a32Nrm, b32Inv) + + lshift := 29 + aHeadrm - bHeadrm - qRes + if lshift < 0 { + return lshiftSat32(result, -lshift) + } + if lshift < 32 { + return result >> lshift + } + + return 0 +} + +func saturate16(v int32) int16 { + if v > math.MaxInt16 { + return math.MaxInt16 + } + if v < math.MinInt16 { + return math.MinInt16 + } + + return int16(v) +} + // inverse32VarQ mirrors silk_INVERSE32_varQ() from the RFC 6716 C reference // (Inlines.h). The LPC gain limiter uses this to approximate the reciprocal // of div_Q30 with the same normalization and refinement steps as the