WIP - Speed improvements to resize convolution (no vpermps w/ FMA) #2793


Draft · wants to merge 11 commits into main
75 changes: 75 additions & 0 deletions src/ImageSharp/Common/Helpers/Numerics.cs
@@ -1097,4 +1097,79 @@ public static nuint Vector512Count<TVector>(this Span<float> span)
public static nuint Vector512Count<TVector>(int length)
where TVector : struct
=> (uint)length / (uint)Vector512<TVector>.Count;

/// <summary>
/// Normalizes the values in a given <see cref="Span{T}"/>.
/// </summary>
/// <param name="span">The sequence of <see cref="float"/> values to normalize.</param>
/// <param name="sum">The sum of the values in <paramref name="span"/>.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void Normalize(Span<float> span, float sum)
{
if (Vector512.IsHardwareAccelerated)
{
ref float startRef = ref MemoryMarshal.GetReference(span);
ref float endRef = ref Unsafe.Add(ref startRef, span.Length & ~15);
Vector512<float> sum512 = Vector512.Create(sum);

while (Unsafe.IsAddressLessThan(ref startRef, ref endRef))
{
Unsafe.As<float, Vector512<float>>(ref startRef) /= sum512;
startRef = ref Unsafe.Add(ref startRef, (nuint)16);
}

if ((span.Length & 15) >= 8)
{
Unsafe.As<float, Vector256<float>>(ref startRef) /= sum512.GetLower();
startRef = ref Unsafe.Add(ref startRef, (nuint)8);
}

if ((span.Length & 7) >= 4)
{
Unsafe.As<float, Vector128<float>>(ref startRef) /= sum512.GetLower().GetLower();
startRef = ref Unsafe.Add(ref startRef, (nuint)4);
}

endRef = ref Unsafe.Add(ref startRef, span.Length & 3);

while (Unsafe.IsAddressLessThan(ref startRef, ref endRef))
{
startRef /= sum;
startRef = ref Unsafe.Add(ref startRef, (nuint)1);
}
}
else if (Vector256.IsHardwareAccelerated)
{
ref float startRef = ref MemoryMarshal.GetReference(span);
ref float endRef = ref Unsafe.Add(ref startRef, span.Length & ~7);
Vector256<float> sum256 = Vector256.Create(sum);

while (Unsafe.IsAddressLessThan(ref startRef, ref endRef))
{
Unsafe.As<float, Vector256<float>>(ref startRef) /= sum256;
startRef = ref Unsafe.Add(ref startRef, (nuint)8);
}

if ((span.Length & 7) >= 4)
{
Unsafe.As<float, Vector128<float>>(ref startRef) /= sum256.GetLower();
startRef = ref Unsafe.Add(ref startRef, (nuint)4);
}

endRef = ref Unsafe.Add(ref startRef, span.Length & 3);

while (Unsafe.IsAddressLessThan(ref startRef, ref endRef))
{
startRef /= sum;
startRef = ref Unsafe.Add(ref startRef, (nuint)1);
}
}
else
{
for (int i = 0; i < span.Length; i++)
{
span[i] /= sum;
}
}
}
}
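For reference, a minimal usage sketch of the new helper (not part of the diff; the weight values below are made up). The caller computes the sum up front, so `Normalize` only has to perform the divisions:

```csharp
// Hypothetical standalone usage. Assumes access to the internal
// SixLabors.ImageSharp.Numerics helper (e.g. from the test project).
float[] weights = { 0.1f, 0.4f, 0.4f, 0.1f, 0.2f };

float sum = 0;
foreach (float w in weights)
{
    sum += w;
}

// Divides every element in place by the precomputed sum, using 512/256/128-bit
// vectors when available and falling back to a scalar loop for the remainder.
Numerics.Normalize(weights, sum);
```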
38 changes: 38 additions & 0 deletions src/ImageSharp/Common/Helpers/Vector128Utilities.cs
@@ -245,6 +245,44 @@ public static Vector128<short> PackSignedSaturate(Vector128<int> left, Vector128
return default;
}

/// <summary>
/// Performs a multiply-add operation on three vectors, where each element of the resulting vector is the
/// product of corresponding elements in <paramref name="a"/> and <paramref name="b"/> added to the
/// corresponding element in <paramref name="c"/>.
/// If the CPU supports FMA (Fused Multiply-Add) instructions, the operation is performed as a single
/// fused operation for better performance and precision.
/// </summary>
/// <param name="a">The first vector of single-precision floating-point numbers to be multiplied.</param>
/// <param name="b">The second vector of single-precision floating-point numbers to be multiplied.</param>
/// <param name="c">The vector of single-precision floating-point numbers to be added to the product of
/// <paramref name="a"/> and <paramref name="b"/>.</param>
/// <returns>
/// A <see cref="Vector128{Single}"/> where each element is the result of multiplying the corresponding elements
/// of <paramref name="a"/> and <paramref name="b"/>, and then adding the corresponding element from <paramref name="c"/>.
/// </returns>
/// <remarks>
/// If the FMA (Fused Multiply-Add) instruction set is supported by the CPU, the operation is performed using
/// <see cref="Fma.MultiplyAdd(Vector128{float}, Vector128{float}, Vector128{float})"/>. This approach can result
/// in slightly different results compared to performing the multiplication and addition separately due to
/// differences in how floating-point rounding is handled.
/// <para>
/// If FMA is not supported, the operation is performed as a separate multiplication and addition. This might lead
/// to a minor difference in precision compared to the fused operation, particularly in cases where numerical accuracy
/// is critical.
/// </para>
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<float> MultiplyAddEstimate(Vector128<float> a, Vector128<float> b, Vector128<float> c)
{
if (Fma.IsSupported)
{
return Fma.MultiplyAdd(a, b, c);
}

return (a * b) + c;
}

[DoesNotReturn]
private static void ThrowUnreachableException() => throw new UnreachableException();
}
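The single-rounding difference described in the remarks is easy to reproduce with the scalar FMA API; a hedged illustration (the constants here are chosen to expose the effect and are not taken from the PR):

```csharp
using System;

// x = 1 + 2^-12 and c = -(1 + 2^-11) are both exactly representable as floats.
// x * x = 1 + 2^-11 + 2^-24 exactly, but the 2^-24 term is lost when the
// product is rounded to a float before the addition is performed.
const float x = 1.000244140625f;  // 1 + 2^-12
const float c = -1.00048828125f;  // -(1 + 2^-11)

float fused = MathF.FusedMultiplyAdd(x, x, c); // ~5.96e-8 (2^-24), single rounding
float separate = (x * x) + c;                  // 0, product rounded first

Console.WriteLine($"{fused} vs {separate}");
```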
38 changes: 38 additions & 0 deletions src/ImageSharp/Common/Helpers/Vector256Utilities.cs
@@ -110,6 +110,44 @@ public static Vector256<int> ConvertToInt32RoundToEven(Vector256<float> vector)
return Vector256.ConvertToInt32(val_2p23_f32 | sign);
}

/// <summary>
/// Performs a multiply-add operation on three vectors, where each element of the resulting vector is the
/// product of corresponding elements in <paramref name="a"/> and <paramref name="b"/> added to the
/// corresponding element in <paramref name="c"/>.
/// If the CPU supports FMA (Fused Multiply-Add) instructions, the operation is performed as a single
/// fused operation for better performance and precision.
/// </summary>
/// <param name="a">The first vector of single-precision floating-point numbers to be multiplied.</param>
/// <param name="b">The second vector of single-precision floating-point numbers to be multiplied.</param>
/// <param name="c">The vector of single-precision floating-point numbers to be added to the product of
/// <paramref name="a"/> and <paramref name="b"/>.</param>
/// <returns>
/// A <see cref="Vector256{Single}"/> where each element is the result of multiplying the corresponding elements
/// of <paramref name="a"/> and <paramref name="b"/>, and then adding the corresponding element from <paramref name="c"/>.
/// </returns>
/// <remarks>
/// If the FMA (Fused Multiply-Add) instruction set is supported by the CPU, the operation is performed using
/// <see cref="Fma.MultiplyAdd(Vector256{float}, Vector256{float}, Vector256{float})"/>. This approach can result
/// in slightly different results compared to performing the multiplication and addition separately due to
/// differences in how floating-point rounding is handled.
/// <para>
/// If FMA is not supported, the operation is performed as a separate multiplication and addition. This might lead
/// to a minor difference in precision compared to the fused operation, particularly in cases where numerical accuracy
/// is critical.
/// </para>
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<float> MultiplyAddEstimate(Vector256<float> a, Vector256<float> b, Vector256<float> c)
{
if (Fma.IsSupported)
{
return Fma.MultiplyAdd(a, b, c);
}

return (a * b) + c;
}

[DoesNotReturn]
private static void ThrowUnreachableException() => throw new UnreachableException();
}
38 changes: 38 additions & 0 deletions src/ImageSharp/Common/Helpers/Vector512Utilities.cs
@@ -3,6 +3,7 @@

using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
@@ -110,6 +111,43 @@ public static Vector512<int> ConvertToInt32RoundToEven(Vector512<float> vector)
return Vector512.ConvertToInt32(val_2p23_f32 | sign);
}

/// <summary>
/// Performs a multiply-add operation on three vectors, where each element of the resulting vector is the
/// product of corresponding elements in <paramref name="a"/> and <paramref name="b"/> added to the
/// corresponding element in <paramref name="c"/>.
/// If the CPU supports FMA (Fused Multiply-Add) instructions, the operation is performed as a single
/// fused operation for better performance and precision.
/// </summary>
/// <param name="a">The first vector of single-precision floating-point numbers to be multiplied.</param>
/// <param name="b">The second vector of single-precision floating-point numbers to be multiplied.</param>
/// <param name="c">The vector of single-precision floating-point numbers to be added to the product of
/// <paramref name="a"/> and <paramref name="b"/>.</param>
/// <returns>
/// A <see cref="Vector512{Single}"/> where each element is the result of multiplying the corresponding elements
/// of <paramref name="a"/> and <paramref name="b"/>, and then adding the corresponding element from <paramref name="c"/>.
/// </returns>
/// <remarks>
/// If the AVX-512 instruction set is supported by the CPU, the operation is performed using
/// <see cref="Avx512F.FusedMultiplyAdd(Vector512{float}, Vector512{float}, Vector512{float})"/> as a single fused
/// operation. This approach can result in slightly different results compared to performing the multiplication and
/// addition separately due to differences in how floating-point rounding is handled.
/// <para>
/// If FMA is not supported, the operation is performed as a separate multiplication and addition. This might lead
/// to a minor difference in precision compared to the fused operation, particularly in cases where numerical accuracy
/// is critical.
/// </para>
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector512<float> MultiplyAddEstimate(Vector512<float> a, Vector512<float> b, Vector512<float> c)
{
if (Avx512F.IsSupported)
{
return Avx512F.FusedMultiplyAdd(a, b, c);
}

return (a * b) + c;
}

[DoesNotReturn]
private static void ThrowUnreachableException() => throw new UnreachableException();
}
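Since the AVX-512 path and the portable fallback are meant to agree (up to FMA rounding), a small consistency check may be worth adding. The following is only a sketch, assuming an xunit test project with visibility of the internal helper; the inputs and tolerance are arbitrary:

```csharp
using System;
using System.Runtime.Intrinsics;
using SixLabors.ImageSharp.Common.Helpers;
using Xunit;

public class MultiplyAddEstimateTests
{
    [Fact]
    public void Vector512FusedAndFallbackAgree()
    {
        Vector512<float> a = Vector512.Create(1.5f);
        Vector512<float> b = Vector512.Create(-2.25f);
        Vector512<float> c = Vector512.Create(0.125f);

        // Uses Avx512F.FusedMultiplyAdd when supported, (a * b) + c otherwise.
        Vector512<float> actual = Vector512Utilities.MultiplyAddEstimate(a, b, c);

        // Portable reference: separate multiply and add.
        Vector512<float> expected = (a * b) + c;

        for (int i = 0; i < Vector512<float>.Count; i++)
        {
            Assert.True(MathF.Abs(expected.GetElement(i) - actual.GetElement(i)) < 1e-5f);
        }
    }
}
```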
191 changes: 128 additions & 63 deletions src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
@@ -5,7 +5,7 @@
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using SixLabors.ImageSharp.Common.Helpers;

namespace SixLabors.ImageSharp.Processing.Processors.Transforms;

@@ -14,11 +14,18 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms;
/// </summary>
internal readonly unsafe struct ResizeKernel
{
/// <summary>
/// The buffer with the convolution factors.
/// Note that when <see cref="SupportsVectorization"/> is true, this buffer holds 4x the number of values reported by <see cref="Length"/>.
/// </summary>
private readonly float* bufferPtr;

/// <summary>
/// Initializes a new instance of the <see cref="ResizeKernel"/> struct.
/// </summary>
/// <param name="startIndex">The starting index for the destination row.</param>
/// <param name="bufferPtr">The pointer to the buffer with the convolution factors.</param>
/// <param name="length">The length of the kernel.</param>
[MethodImpl(InliningOptions.ShortMethod)]
internal ResizeKernel(int startIndex, float* bufferPtr, int length)
{
@@ -27,6 +34,15 @@ internal ResizeKernel(int startIndex, float* bufferPtr, int length)
this.Length = length;
}

/// <summary>
/// Gets a value indicating whether vectorization is supported.
/// </summary>
public static bool SupportsVectorization
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get => Vector256.IsHardwareAccelerated;
}

/// <summary>
/// Gets the start index for the destination row.
/// </summary>
@@ -53,7 +69,15 @@ public int Length
public Span<float> Values
{
[MethodImpl(InliningOptions.ShortMethod)]
get => new(this.bufferPtr, this.Length);
get
{
if (Vector256.IsHardwareAccelerated)
{
return new(this.bufferPtr, this.Length * 4);
}

return new(this.bufferPtr, this.Length);
}
}

/// <summary>
@@ -68,73 +92,99 @@ public Vector4 Convolve(Span<Vector4> rowSpan)
[MethodImpl(InliningOptions.ShortMethod)]
public Vector4 ConvolveCore(ref Vector4 rowStartRef)
{
if (Avx2.IsSupported && Fma.IsSupported)
if (SupportsVectorization)
{
float* bufferStart = this.bufferPtr;
float* bufferEnd = bufferStart + (this.Length & ~3);
Vector256<float> result256_0 = Vector256<float>.Zero;
Vector256<float> result256_1 = Vector256<float>.Zero;
ReadOnlySpan<byte> maskBytes = new byte[]
if (Vector512.IsHardwareAccelerated)
{
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
1, 0, 0, 0, 1, 0, 0, 0,
1, 0, 0, 0, 1, 0, 0, 0,
};
Vector256<int> mask = Unsafe.ReadUnaligned<Vector256<int>>(ref MemoryMarshal.GetReference(maskBytes));
float* bufferStart = this.bufferPtr;
ref Vector4 rowEndRef = ref Unsafe.Add(ref rowStartRef, this.Length & ~7);
Vector512<float> result512_0 = Vector512<float>.Zero;
Vector512<float> result512_1 = Vector512<float>.Zero;

while (bufferStart < bufferEnd)
{
// It is important to use a single expression here so that the JIT will correctly use vfmadd231ps
// for the FMA operation, and execute it directly on the target register and reading directly from
// memory for the first parameter. This skips initializing a SIMD register, and an extra copy.
// The code below should compile in the following assembly on .NET 5 x64:
//
// vmovsd xmm2, [rax] ; load *(double*)bufferStart into xmm2 as [ab, _]
// vpermps ymm2, ymm1, ymm2 ; permute as a float YMM register to [a, a, a, a, b, b, b, b]
// vfmadd231ps ymm0, ymm2, [r8] ; result256_0 = FMA(pixels, factors) + result256_0
//
// For tracking the codegen issue with FMA, see: https://github.com/dotnet/runtime/issues/12212.
// Additionally, we're also unrolling two computations per each loop iterations to leverage the
// fact that most CPUs have two ports to schedule multiply operations for FMA instructions.
result256_0 = Fma.MultiplyAdd(
Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef),
Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask),
result256_0);

result256_1 = Fma.MultiplyAdd(
Unsafe.As<Vector4, Vector256<float>>(ref Unsafe.Add(ref rowStartRef, 2)),
Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)(bufferStart + 2)).AsSingle(), mask),
result256_1);

bufferStart += 4;
rowStartRef = ref Unsafe.Add(ref rowStartRef, 4);
}
while (Unsafe.IsAddressLessThan(ref rowStartRef, ref rowEndRef))
{
Vector512<float> pixels512_0 = Unsafe.As<Vector4, Vector512<float>>(ref rowStartRef);
Vector512<float> pixels512_1 = Unsafe.As<Vector4, Vector512<float>>(ref Unsafe.Add(ref rowStartRef, (nuint)4));

result256_0 = Avx.Add(result256_0, result256_1);
result512_0 = Vector512Utilities.MultiplyAddEstimate(Vector512.Load(bufferStart), pixels512_0, result512_0);
result512_1 = Vector512Utilities.MultiplyAddEstimate(Vector512.Load(bufferStart + 16), pixels512_1, result512_1);

if ((this.Length & 3) >= 2)
{
result256_0 = Fma.MultiplyAdd(
Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef),
Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask),
result256_0);
bufferStart += 32;
rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)8);
}

bufferStart += 2;
rowStartRef = ref Unsafe.Add(ref rowStartRef, 2);
}
result512_0 += result512_1;

Vector128<float> result128 = Sse.Add(result256_0.GetLower(), result256_0.GetUpper());
if ((this.Length & 7) >= 4)
{
Vector512<float> pixels512_0 = Unsafe.As<Vector4, Vector512<float>>(ref rowStartRef);
result512_0 = Vector512Utilities.MultiplyAddEstimate(Vector512.Load(bufferStart), pixels512_0, result512_0);

if ((this.Length & 1) != 0)
{
result128 = Fma.MultiplyAdd(
Unsafe.As<Vector4, Vector128<float>>(ref rowStartRef),
Vector128.Create(*bufferStart),
result128);
bufferStart += 16;
rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)4);
}

Vector256<float> result256 = result512_0.GetLower() + result512_0.GetUpper();

if ((this.Length & 3) >= 2)
{
Vector256<float> pixels256_0 = Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef);
result256 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart), pixels256_0, result256);

bufferStart += 8;
rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)2);
}

Vector128<float> result128 = result256.GetLower() + result256.GetUpper();

if ((this.Length & 1) != 0)
{
Vector128<float> pixels128 = Unsafe.As<Vector4, Vector128<float>>(ref rowStartRef);
result128 = Vector128Utilities.MultiplyAddEstimate(Vector128.Load(bufferStart), pixels128, result128);
}

return *(Vector4*)&result128;
}
else
{
float* bufferStart = this.bufferPtr;
ref Vector4 rowEndRef = ref Unsafe.Add(ref rowStartRef, this.Length & ~3);
Vector256<float> result256_0 = Vector256<float>.Zero;
Vector256<float> result256_1 = Vector256<float>.Zero;

while (Unsafe.IsAddressLessThan(ref rowStartRef, ref rowEndRef))
{
Vector256<float> pixels256_0 = Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef);
Vector256<float> pixels256_1 = Unsafe.As<Vector4, Vector256<float>>(ref Unsafe.Add(ref rowStartRef, (nuint)2));

result256_0 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart), pixels256_0, result256_0);
result256_1 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart + 8), pixels256_1, result256_1);

bufferStart += 16;
rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)4);
}

result256_0 += result256_1;

if ((this.Length & 3) >= 2)
{
Vector256<float> pixels256_0 = Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef);
result256_0 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart), pixels256_0, result256_0);

bufferStart += 8;
rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)2);
}

Vector128<float> result128 = result256_0.GetLower() + result256_0.GetUpper();

return *(Vector4*)&result128;
if ((this.Length & 1) != 0)
{
Vector128<float> pixels128 = Unsafe.As<Vector4, Vector128<float>>(ref rowStartRef);
result128 = Vector128Utilities.MultiplyAddEstimate(Vector128.Load(bufferStart), pixels128, result128);
}

return *(Vector4*)&result128;
}
}
else
{
@@ -149,7 +199,7 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef)
result += rowStartRef * *bufferStart;

bufferStart++;
rowStartRef = ref Unsafe.Add(ref rowStartRef, 1);
rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)1);
}

return result;
@@ -160,17 +210,32 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef)
/// Copy the contents of <see cref="ResizeKernel"/> altering <see cref="StartIndex"/>
/// to the value <paramref name="left"/>.
/// </summary>
/// <param name="left">The new value for <see cref="StartIndex"/>.</param>
[MethodImpl(InliningOptions.ShortMethod)]
internal ResizeKernel AlterLeftValue(int left)
=> new(left, this.bufferPtr, this.Length);

internal void Fill(Span<double> values)
internal void FillOrCopyAndExpand(Span<float> values)
{
DebugGuard.IsTrue(values.Length == this.Length, nameof(values), "ResizeKernel.Fill: values.Length != this.Length!");

for (int i = 0; i < this.Length; i++)
if (Vector256.IsHardwareAccelerated)
{
Vector4* bufferStart = (Vector4*)this.bufferPtr;
ref float valuesStart = ref MemoryMarshal.GetReference(values);
ref float valuesEnd = ref Unsafe.Add(ref valuesStart, values.Length);

while (Unsafe.IsAddressLessThan(ref valuesStart, ref valuesEnd))
{
*bufferStart = new Vector4(valuesStart);

bufferStart++;
valuesStart = ref Unsafe.Add(ref valuesStart, (nuint)1);
}
}
else
{
this.Values[i] = (float)values[i];
values.CopyTo(this.Values);
}
}
}
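The straight `Vector512.Load`/`Vector256.Load` path in `ConvolveCore` above is enabled by `FillOrCopyAndExpand`: when vectorization is supported, each scalar weight is written out as a full `Vector4`, so the weights line up one-to-one with the `Vector4` pixels and the old per-iteration `vpermps` broadcast is no longer needed. A rough sketch of the resulting layout (the weights are made up):

```csharp
using System.Numerics;

// Scalar kernel produced by BuildKernel (Length = 3):
float[] weights = { 0.25f, 0.5f, 0.25f };

// Expanded buffer written by FillOrCopyAndExpand (Length * 4 floats):
//   [ w0, w0, w0, w0,  w1, w1, w1, w1,  w2, w2, w2, w2 ]
// Each group of four matches one RGBA pixel (one Vector4), so ConvolveCore
// can load two pixels (Vector256) or four pixels (Vector512) together with
// the corresponding weights using plain loads.
float[] expanded = new float[weights.Length * 4];
for (int i = 0; i < weights.Length; i++)
{
    new Vector4(weights[i]).CopyTo(expanded, i * 4);
}
```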
@@ -54,7 +54,7 @@ protected internal override void Initialize<TResampler>(in TResampler sampler)
int bottomStartDest = this.DestinationLength - this.cornerInterval;
for (int i = startOfFirstRepeatedMosaic; i < bottomStartDest; i++)
{
double center = ((i + .5) * this.ratio) - .5;
float center = (float)(((i + .5) * this.ratio) - .5);
int left = (int)TolerantMath.Ceiling(center - this.radius);
ResizeKernel kernel = this.kernels[i - this.period];
this.kernels[i] = kernel.AlterLeftValue(left);
@@ -33,7 +33,7 @@ internal partial class ResizeKernelMap : IDisposable
private bool isDisposed;

// To avoid both GC allocations, and MemoryAllocator ceremony:
private readonly double[] tempValues;
private readonly float[] tempValues;

private ResizeKernelMap(
MemoryAllocator memoryAllocator,
@@ -50,10 +50,19 @@ private ResizeKernelMap(
this.sourceLength = sourceLength;
this.DestinationLength = destinationLength;
this.MaxDiameter = (radius * 2) + 1;
this.data = memoryAllocator.Allocate2D<float>(this.MaxDiameter, bufferHeight, preferContiguosImageBuffers: true, AllocationOptions.Clean);

if (ResizeKernel.SupportsVectorization)
{
this.data = memoryAllocator.Allocate2D<float>(this.MaxDiameter * 4, bufferHeight, preferContiguosImageBuffers: true);
}
else
{
this.data = memoryAllocator.Allocate2D<float>(this.MaxDiameter, bufferHeight, preferContiguosImageBuffers: true);
}

this.pinHandle = this.data.DangerousGetSingleMemory().Pin();
this.kernels = new ResizeKernel[destinationLength];
this.tempValues = new double[this.MaxDiameter];
this.tempValues = new float[this.MaxDiameter];
}

/// <summary>
@@ -155,23 +164,23 @@ public static ResizeKernelMap Calculate<TResampler>(
bool hasAtLeast2Periods = 2 * (cornerInterval + period) < destinationSize;

ResizeKernelMap result = hasAtLeast2Periods
? new PeriodicKernelMap(
memoryAllocator,
sourceSize,
destinationSize,
ratio,
scale,
radius,
period,
cornerInterval)
: new ResizeKernelMap(
memoryAllocator,
sourceSize,
destinationSize,
destinationSize,
ratio,
scale,
radius);
? new PeriodicKernelMap(
memoryAllocator,
sourceSize,
destinationSize,
ratio,
scale,
radius,
period,
cornerInterval)
: new ResizeKernelMap(
memoryAllocator,
sourceSize,
destinationSize,
destinationSize,
ratio,
scale,
radius);

result.Initialize(in sampler);

@@ -198,7 +207,8 @@ protected internal virtual void Initialize<TResampler>(in TResampler sampler)
private ResizeKernel BuildKernel<TResampler>(in TResampler sampler, int destRowIndex, int dataRowIndex)
where TResampler : struct, IResampler
{
double center = ((destRowIndex + .5) * this.ratio) - .5;
float center = (float)(((destRowIndex + .5) * this.ratio) - .5);
float scale = (float)this.scale;

// Keep inside bounds.
int left = (int)TolerantMath.Ceiling(center - this.radius);
@@ -214,30 +224,25 @@ private ResizeKernel BuildKernel<TResampler>(in TResampler sampler, int destRowI
}

ResizeKernel kernel = this.CreateKernel(dataRowIndex, left, right);

Span<double> kernelValues = this.tempValues.AsSpan(0, kernel.Length);
double sum = 0;
Span<float> kernelValues = this.tempValues.AsSpan(0, kernel.Length);
ref float kernelStart = ref MemoryMarshal.GetReference(kernelValues);
float sum = 0;

for (int j = left; j <= right; j++)
{
double value = sampler.GetValue((float)((j - center) / this.scale));
float value = sampler.GetValue((j - center) / scale);
sum += value;

kernelValues[j - left] = value;
kernelStart = value;
kernelStart = ref Unsafe.Add(ref kernelStart, 1);
}

// Normalize, best to do it here rather than in the pixel loop later on.
if (sum > 0)
{
for (int j = 0; j < kernel.Length; j++)
{
// weights[w] = weights[w] / sum:
ref double kRef = ref kernelValues[j];
kRef /= sum;
}
Numerics.Normalize(kernelValues, sum);
}

kernel.Fill(kernelValues);
kernel.FillOrCopyAndExpand(kernelValues);

return kernel;
}
@@ -16,9 +16,7 @@ internal class ReferenceKernelMap
private readonly ReferenceKernel[] kernels;

public ReferenceKernelMap(ReferenceKernel[] kernels)
{
this.kernels = kernels;
}
=> this.kernels = kernels;

public int DestinationSize => this.kernels.Length;

@@ -28,22 +26,23 @@ public static ReferenceKernelMap Calculate<TResampler>(in TResampler sampler, in
where TResampler : struct, IResampler
{
double ratio = (double)sourceSize / destinationSize;
double scale = ratio;
double scaleD = ratio;

if (scale < 1F)
if (scaleD < 1)
{
scale = 1F;
scaleD = 1;
}

TolerantMath tolerantMath = TolerantMath.Default;

double radius = tolerantMath.Ceiling(scale * sampler.Radius);
double radius = tolerantMath.Ceiling(scaleD * sampler.Radius);

var result = new List<ReferenceKernel>();
List<ReferenceKernel> result = [];

float scale = (float)scaleD;
for (int i = 0; i < destinationSize; i++)
{
double center = ((i + .5) * ratio) - .5;
float center = (float)(((i + .5) * ratio) - .5);

// Keep inside bounds.
int left = (int)tolerantMath.Ceiling(center - radius);
@@ -58,15 +57,14 @@ public static ReferenceKernelMap Calculate<TResampler>(in TResampler sampler, in
right = sourceSize - 1;
}

double sum = 0;
float sum = 0;

double[] values = new double[right - left + 1];
float[] values = new float[right - left + 1];

for (int j = left; j <= right; j++)
{
double weight = sampler.GetValue((float)((j - center) / scale));
float weight = sampler.GetValue((j - center) / scale);
sum += weight;

values[j - left] = weight;
}

@@ -78,16 +76,14 @@ public static ReferenceKernelMap Calculate<TResampler>(in TResampler sampler, in
}
}

float[] floatVals = values.Select(v => (float)v).ToArray();

result.Add(new ReferenceKernel(left, floatVals));
result.Add(new ReferenceKernel(left, values));
}

return new ReferenceKernelMap(result.ToArray());
return new ReferenceKernelMap([.. result]);
}
}

internal struct ReferenceKernel
internal readonly struct ReferenceKernel
{
public ReferenceKernel(int left, float[] values)
{
@@ -102,8 +98,6 @@ public ReferenceKernel(int left, float[] values)
public int Length => this.Values.Length;

public static implicit operator ReferenceKernel(ResizeKernel orig)
{
return new ReferenceKernel(orig.StartIndex, orig.Values.ToArray());
}
=> new(orig.StartIndex, orig.Values.ToArray());
}
}
@@ -124,7 +124,6 @@ private void VerifyKernelMapContentIsCorrect<TResampler>(TResampler resampler, i
this.Output.WriteLine($"Expected KernelMap:\n{PrintKernelMap(referenceMap)}\n");
this.Output.WriteLine($"Actual KernelMap:\n{PrintKernelMap(kernelMap)}\n");
#endif
var comparer = new ApproximateFloatComparer(1e-6f);

for (int i = 0; i < kernelMap.DestinationLength; i++)
{
@@ -139,7 +138,29 @@ private void VerifyKernelMapContentIsCorrect<TResampler>(TResampler resampler, i
referenceKernel.Left == kernel.StartIndex,
$"referenceKernel.Left != kernel.Left: {referenceKernel.Left} != {kernel.StartIndex}");
float[] expectedValues = referenceKernel.Values;
Span<float> actualValues = kernel.Values;
Span<float> actualValues;

ApproximateFloatComparer comparer;
if (ResizeKernel.SupportsVectorization)
{
comparer = new ApproximateFloatComparer(1e-4f);

Assert.Equal(expectedValues.Length, kernel.Values.Length / 4);

int actualLength = referenceKernel.Length / 4;

actualValues = new float[expectedValues.Length];

for (int j = 0; j < expectedValues.Length; j++)
{
actualValues[j] = kernel.Values[j * 4];
}
}
else
{
comparer = new ApproximateFloatComparer(1e-6f);
actualValues = kernel.Values;
}

Assert.Equal(expectedValues.Length, actualValues.Length);
