WIP - Speed improvements to resize convolution (no vpermps w/ FMA) #2793


Draft · wants to merge 11 commits into main
75 changes: 75 additions & 0 deletions src/ImageSharp/Common/Helpers/Numerics.cs
@@ -1097,4 +1097,79 @@ public static nuint Vector512Count<TVector>(this Span<float> span)
public static nuint Vector512Count<TVector>(int length)
where TVector : struct
=> (uint)length / (uint)Vector512<TVector>.Count;

/// <summary>
/// Normalizes the values in a given <see cref="Span{T}"/>.
/// </summary>
/// <param name="span">The sequence of <see cref="float"/> values to normalize.</param>
/// <param name="sum">The sum of the values in <paramref name="span"/>.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void Normalize(Span<float> span, float sum)
{
if (Vector512.IsHardwareAccelerated)
{
ref float startRef = ref MemoryMarshal.GetReference(span);
ref float endRef = ref Unsafe.Add(ref startRef, span.Length & ~15);
Vector512<float> sum512 = Vector512.Create(sum);

while (Unsafe.IsAddressLessThan(ref startRef, ref endRef))
{
Unsafe.As<float, Vector512<float>>(ref startRef) /= sum512;
startRef = ref Unsafe.Add(ref startRef, (nuint)16);
}

if ((span.Length & 15) >= 8)
{
Unsafe.As<float, Vector256<float>>(ref startRef) /= sum512.GetLower();
startRef = ref Unsafe.Add(ref startRef, (nuint)8);
}

if ((span.Length & 7) >= 4)
{
Unsafe.As<float, Vector128<float>>(ref startRef) /= sum512.GetLower().GetLower();
startRef = ref Unsafe.Add(ref startRef, (nuint)4);
}

endRef = ref Unsafe.Add(ref startRef, span.Length & 3);

while (Unsafe.IsAddressLessThan(ref startRef, ref endRef))
{
startRef /= sum;
startRef = ref Unsafe.Add(ref startRef, (nuint)1);
}
}
else if (Vector256.IsHardwareAccelerated)
{
ref float startRef = ref MemoryMarshal.GetReference(span);
ref float endRef = ref Unsafe.Add(ref startRef, span.Length & ~7);
Vector256<float> sum256 = Vector256.Create(sum);

while (Unsafe.IsAddressLessThan(ref startRef, ref endRef))
{
Unsafe.As<float, Vector256<float>>(ref startRef) /= sum256;
startRef = ref Unsafe.Add(ref startRef, (nuint)8);
}

if ((span.Length & 7) >= 4)
{
Unsafe.As<float, Vector128<float>>(ref startRef) /= sum256.GetLower();
startRef = ref Unsafe.Add(ref startRef, (nuint)4);
}

endRef = ref Unsafe.Add(ref startRef, span.Length & 3);

while (Unsafe.IsAddressLessThan(ref startRef, ref endRef))
{
startRef /= sum;
startRef = ref Unsafe.Add(ref startRef, (nuint)1);
}
}
else
{
for (int i = 0; i < span.Length; i++)
{
span[i] /= sum;
}
}
}
}
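For reference, a minimal usage sketch of the new helper (not part of the diff; the weight values below are made up). The caller computes the sum up front, so `Normalize` only has to perform the divisions:

```csharp
// Hypothetical standalone usage. Assumes access to the internal
// SixLabors.ImageSharp.Numerics helper (e.g. from the test project).
float[] weights = { 0.1f, 0.4f, 0.4f, 0.1f, 0.2f };

float sum = 0;
foreach (float w in weights)
{
    sum += w;
}

// Divides every element in place by the precomputed sum, using 512/256/128-bit
// vectors when available and falling back to a scalar loop for the remainder.
Numerics.Normalize(weights, sum);
```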
38 changes: 38 additions & 0 deletions src/ImageSharp/Common/Helpers/Vector128Utilities.cs
@@ -245,6 +245,44 @@ public static Vector128<short> PackSignedSaturate(Vector128<int> left, Vector128
return default;
}

/// <summary>
/// Performs a multiply-add operation on three vectors, where each element of the resulting vector is the
/// product of corresponding elements in <paramref name="a"/> and <paramref name="b"/> added to the
/// corresponding element in <paramref name="c"/>.
/// If the CPU supports FMA (Fused Multiply-Add) instructions, the operation is performed as a single
/// fused operation for better performance and precision.
/// </summary>
/// <param name="a">The first vector of single-precision floating-point numbers to be multiplied.</param>
/// <param name="b">The second vector of single-precision floating-point numbers to be multiplied.</param>
/// <param name="c">The vector of single-precision floating-point numbers to be added to the product of
/// <paramref name="a"/> and <paramref name="b"/>.</param>
/// <returns>
/// A <see cref="Vector128{Single}"/> where each element is the result of multiplying the corresponding elements
/// of <paramref name="a"/> and <paramref name="b"/>, and then adding the corresponding element from <paramref name="c"/>.
/// </returns>
/// <remarks>
/// If the FMA (Fused Multiply-Add) instruction set is supported by the CPU, the operation is performed using
/// <see cref="Fma.MultiplyAdd(Vector128{float}, Vector128{float}, Vector128{float})"/>. This approach can result
/// in slightly different results compared to performing the multiplication and addition separately due to
/// differences in how floating-point rounding is handled.
/// <para>
/// If FMA is not supported, the operation is performed as a separate multiplication and addition. This might lead
/// to a minor difference in precision compared to the fused operation, particularly in cases where numerical accuracy
/// is critical.
/// </para>
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<float> MultiplyAddEstimate(Vector128<float> a, Vector128<float> b, Vector128<float> c)
{
if (Fma.IsSupported)
{
return Fma.MultiplyAdd(a, b, c);
}

return (a * b) + c;
}

[DoesNotReturn]
private static void ThrowUnreachableException() => throw new UnreachableException();
}
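The single-rounding difference described in the remarks is easy to reproduce with the scalar FMA API; a hedged illustration (the constants here are chosen to expose the effect and are not taken from the PR):

```csharp
using System;

// x = 1 + 2^-12 and c = -(1 + 2^-11) are both exactly representable as floats.
// x * x = 1 + 2^-11 + 2^-24 exactly, but the 2^-24 term is lost when the
// product is rounded to a float before the addition is performed.
const float x = 1.000244140625f;  // 1 + 2^-12
const float c = -1.00048828125f;  // -(1 + 2^-11)

float fused = MathF.FusedMultiplyAdd(x, x, c); // ~5.96e-8 (2^-24), single rounding
float separate = (x * x) + c;                  // 0, product rounded first

Console.WriteLine($"{fused} vs {separate}");
```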
38 changes: 38 additions & 0 deletions src/ImageSharp/Common/Helpers/Vector256Utilities.cs
@@ -110,6 +110,44 @@ public static Vector256<int> ConvertToInt32RoundToEven(Vector256<float> vector)
return Vector256.ConvertToInt32(val_2p23_f32 | sign);
}

/// <summary>
/// Performs a multiply-add operation on three vectors, where each element of the resulting vector is the
/// product of corresponding elements in <paramref name="a"/> and <paramref name="b"/> added to the
/// corresponding element in <paramref name="c"/>.
/// If the CPU supports FMA (Fused Multiply-Add) instructions, the operation is performed as a single
/// fused operation for better performance and precision.
/// </summary>
/// <param name="a">The first vector of single-precision floating-point numbers to be multiplied.</param>
/// <param name="b">The second vector of single-precision floating-point numbers to be multiplied.</param>
/// <param name="c">The vector of single-precision floating-point numbers to be added to the product of
/// <paramref name="a"/> and <paramref name="b"/>.</param>
/// <returns>
/// A <see cref="Vector256{Single}"/> where each element is the result of multiplying the corresponding elements
/// of <paramref name="a"/> and <paramref name="b"/>, and then adding the corresponding element from <paramref name="c"/>.
/// </returns>
/// <remarks>
/// If the FMA (Fused Multiply-Add) instruction set is supported by the CPU, the operation is performed using
/// <see cref="Fma.MultiplyAdd(Vector256{float}, Vector256{float}, Vector256{float})"/>. This approach can result
/// in slightly different results compared to performing the multiplication and addition separately due to
/// differences in how floating-point rounding is handled.
/// <para>
/// If FMA is not supported, the operation is performed as a separate multiplication and addition. This might lead
/// to a minor difference in precision compared to the fused operation, particularly in cases where numerical accuracy
/// is critical.
/// </para>
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<float> MultiplyAddEstimate(Vector256<float> a, Vector256<float> b, Vector256<float> c)
{
if (Fma.IsSupported)
{
return Fma.MultiplyAdd(a, b, c);
}

return (a * b) + c;
}

[DoesNotReturn]
private static void ThrowUnreachableException() => throw new UnreachableException();
}
38 changes: 38 additions & 0 deletions src/ImageSharp/Common/Helpers/Vector512Utilities.cs
@@ -3,6 +3,7 @@

using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
@@ -110,6 +111,43 @@ public static Vector512<int> ConvertToInt32RoundToEven(Vector512<float> vector)
return Vector512.ConvertToInt32(val_2p23_f32 | sign);
}

/// <summary>
/// Performs a multiply-add operation on three vectors, where each element of the resulting vector is the
/// product of corresponding elements in <paramref name="a"/> and <paramref name="b"/> added to the
/// corresponding element in <paramref name="c"/>.
/// If the CPU supports FMA (Fused Multiply-Add) instructions, the operation is performed as a single
/// fused operation for better performance and precision.
/// </summary>
/// <param name="a">The first vector of single-precision floating-point numbers to be multiplied.</param>
/// <param name="b">The second vector of single-precision floating-point numbers to be multiplied.</param>
/// <param name="c">The vector of single-precision floating-point numbers to be added to the product of
/// <paramref name="a"/> and <paramref name="b"/>.</param>
/// <returns>
/// A <see cref="Vector512{Single}"/> where each element is the result of multiplying the corresponding elements
/// of <paramref name="a"/> and <paramref name="b"/>, and then adding the corresponding element from <paramref name="c"/>.
/// </returns>
/// <remarks>
/// If the AVX-512 instruction set is supported by the CPU, the operation is performed using
/// <see cref="Avx512F.FusedMultiplyAdd(Vector512{float}, Vector512{float}, Vector512{float})"/> as a single fused
/// operation. This approach can result in slightly different results compared to performing the multiplication and
/// addition separately due to differences in how floating-point rounding is handled.
/// <para>
/// If FMA is not supported, the operation is performed as a separate multiplication and addition. This might lead
/// to a minor difference in precision compared to the fused operation, particularly in cases where numerical accuracy
/// is critical.
/// </para>
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector512<float> MultiplyAddEstimate(Vector512<float> a, Vector512<float> b, Vector512<float> c)
{
if (Avx512F.IsSupported)
{
return Avx512F.FusedMultiplyAdd(a, b, c);
}

return (a * b) + c;
}

[DoesNotReturn]
private static void ThrowUnreachableException() => throw new UnreachableException();
}
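Since the AVX-512 path and the portable fallback are meant to agree (up to FMA rounding), a small consistency check may be worth adding. The following is only a sketch, assuming an xunit test project with visibility of the internal helper; the inputs and tolerance are arbitrary:

```csharp
using System;
using System.Runtime.Intrinsics;
using SixLabors.ImageSharp.Common.Helpers;
using Xunit;

public class MultiplyAddEstimateTests
{
    [Fact]
    public void Vector512FusedAndFallbackAgree()
    {
        Vector512<float> a = Vector512.Create(1.5f);
        Vector512<float> b = Vector512.Create(-2.25f);
        Vector512<float> c = Vector512.Create(0.125f);

        // Uses Avx512F.FusedMultiplyAdd when supported, (a * b) + c otherwise.
        Vector512<float> actual = Vector512Utilities.MultiplyAddEstimate(a, b, c);

        // Portable reference: separate multiply and add.
        Vector512<float> expected = (a * b) + c;

        for (int i = 0; i < Vector512<float>.Count; i++)
        {
            Assert.True(MathF.Abs(expected.GetElement(i) - actual.GetElement(i)) < 1e-5f);
        }
    }
}
```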
191 changes: 128 additions & 63 deletions src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
@@ -5,7 +5,7 @@
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using SixLabors.ImageSharp.Common.Helpers;

namespace SixLabors.ImageSharp.Processing.Processors.Transforms;

@@ -14,11 +14,18 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms;
/// </summary>
internal readonly unsafe struct ResizeKernel
{
/// <summary>
/// The buffer with the convolution factors.
/// Note that when <see cref="SupportsVectorization"/> is true, this buffer holds 4x the number of values reported by <see cref="Length"/>.
/// </summary>
private readonly float* bufferPtr;

/// <summary>
/// Initializes a new instance of the <see cref="ResizeKernel"/> struct.
/// </summary>
/// <param name="startIndex">The starting index for the destination row.</param>
/// <param name="bufferPtr">The pointer to the buffer with the convolution factors.</param>
/// <param name="length">The length of the kernel.</param>
[MethodImpl(InliningOptions.ShortMethod)]
internal ResizeKernel(int startIndex, float* bufferPtr, int length)
{
@@ -27,6 +34,15 @@ internal ResizeKernel(int startIndex, float* bufferPtr, int length)
this.Length = length;
}

/// <summary>
/// Gets a value indicating whether vectorization is supported.
/// </summary>
public static bool SupportsVectorization
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get => Vector256.IsHardwareAccelerated;
}

/// <summary>
/// Gets the start index for the destination row.
/// </summary>
@@ -53,7 +69,15 @@ public int Length
public Span<float> Values
{
[MethodImpl(InliningOptions.ShortMethod)]
get => new(this.bufferPtr, this.Length);
get
{
if (Vector256.IsHardwareAccelerated)
{
return new(this.bufferPtr, this.Length * 4);
}

return new(this.bufferPtr, this.Length);
}
}

/// <summary>
@@ -68,73 +92,99 @@ public Vector4 Convolve(Span<Vector4> rowSpan)
[MethodImpl(InliningOptions.ShortMethod)]
public Vector4 ConvolveCore(ref Vector4 rowStartRef)
{
if (Avx2.IsSupported && Fma.IsSupported)
if (SupportsVectorization)
{
float* bufferStart = this.bufferPtr;
float* bufferEnd = bufferStart + (this.Length & ~3);
Vector256<float> result256_0 = Vector256<float>.Zero;
Vector256<float> result256_1 = Vector256<float>.Zero;
ReadOnlySpan<byte> maskBytes = new byte[]
if (Vector512.IsHardwareAccelerated)
{
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
1, 0, 0, 0, 1, 0, 0, 0,
1, 0, 0, 0, 1, 0, 0, 0,
};
Vector256<int> mask = Unsafe.ReadUnaligned<Vector256<int>>(ref MemoryMarshal.GetReference(maskBytes));
float* bufferStart = this.bufferPtr;
ref Vector4 rowEndRef = ref Unsafe.Add(ref rowStartRef, this.Length & ~7);
Vector512<float> result512_0 = Vector512<float>.Zero;
Vector512<float> result512_1 = Vector512<float>.Zero;

while (bufferStart < bufferEnd)
{
// It is important to use a single expression here so that the JIT will correctly use vfmadd231ps
// for the FMA operation, and execute it directly on the target register and reading directly from
// memory for the first parameter. This skips initializing a SIMD register, and an extra copy.
// The code below should compile in the following assembly on .NET 5 x64:
//
// vmovsd xmm2, [rax] ; load *(double*)bufferStart into xmm2 as [ab, _]
// vpermps ymm2, ymm1, ymm2 ; permute as a float YMM register to [a, a, a, a, b, b, b, b]
// vfmadd231ps ymm0, ymm2, [r8] ; result256_0 = FMA(pixels, factors) + result256_0
//
// For tracking the codegen issue with FMA, see: https://github.com/dotnet/runtime/issues/12212.
// Additionally, we're also unrolling two computations per each loop iterations to leverage the
// fact that most CPUs have two ports to schedule multiply operations for FMA instructions.
result256_0 = Fma.MultiplyAdd(
Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef),
Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask),
result256_0);

result256_1 = Fma.MultiplyAdd(
Unsafe.As<Vector4, Vector256<float>>(ref Unsafe.Add(ref rowStartRef, 2)),
Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)(bufferStart + 2)).AsSingle(), mask),
result256_1);

bufferStart += 4;
rowStartRef = ref Unsafe.Add(ref rowStartRef, 4);
}
while (Unsafe.IsAddressLessThan(ref rowStartRef, ref rowEndRef))
{
Vector512<float> pixels512_0 = Unsafe.As<Vector4, Vector512<float>>(ref rowStartRef);
Vector512<float> pixels512_1 = Unsafe.As<Vector4, Vector512<float>>(ref Unsafe.Add(ref rowStartRef, (nuint)4));

result256_0 = Avx.Add(result256_0, result256_1);
result512_0 = Vector512Utilities.MultiplyAddEstimate(Vector512.Load(bufferStart), pixels512_0, result512_0);
result512_1 = Vector512Utilities.MultiplyAddEstimate(Vector512.Load(bufferStart + 16), pixels512_1, result512_1);

if ((this.Length & 3) >= 2)
{
result256_0 = Fma.MultiplyAdd(
Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef),
Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask),
result256_0);
bufferStart += 32;
rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)8);
}

bufferStart += 2;
rowStartRef = ref Unsafe.Add(ref rowStartRef, 2);
}
result512_0 += result512_1;

Vector128<float> result128 = Sse.Add(result256_0.GetLower(), result256_0.GetUpper());
if ((this.Length & 7) >= 4)
{
Vector512<float> pixels512_0 = Unsafe.As<Vector4, Vector512<float>>(ref rowStartRef);
result512_0 = Vector512Utilities.MultiplyAddEstimate(Vector512.Load(bufferStart), pixels512_0, result512_0);

if ((this.Length & 1) != 0)
{
result128 = Fma.MultiplyAdd(
Unsafe.As<Vector4, Vector128<float>>(ref rowStartRef),
Vector128.Create(*bufferStart),
result128);
bufferStart += 16;
rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)4);
}

Vector256<float> result256 = result512_0.GetLower() + result512_0.GetUpper();

if ((this.Length & 3) >= 2)
{
Vector256<float> pixels256_0 = Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef);
result256 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart), pixels256_0, result256);

bufferStart += 8;
rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)2);
}

Vector128<float> result128 = result256.GetLower() + result256.GetUpper();

if ((this.Length & 1) != 0)
{
Vector128<float> pixels128 = Unsafe.As<Vector4, Vector128<float>>(ref rowStartRef);
result128 = Vector128Utilities.MultiplyAddEstimate(Vector128.Load(bufferStart), pixels128, result128);
}

return *(Vector4*)&result128;
}
else
{
float* bufferStart = this.bufferPtr;
ref Vector4 rowEndRef = ref Unsafe.Add(ref rowStartRef, this.Length & ~3);
Vector256<float> result256_0 = Vector256<float>.Zero;
Vector256<float> result256_1 = Vector256<float>.Zero;

while (Unsafe.IsAddressLessThan(ref rowStartRef, ref rowEndRef))
{
Vector256<float> pixels256_0 = Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef);
Vector256<float> pixels256_1 = Unsafe.As<Vector4, Vector256<float>>(ref Unsafe.Add(ref rowStartRef, (nuint)2));

result256_0 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart), pixels256_0, result256_0);
result256_1 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart + 8), pixels256_1, result256_1);

bufferStart += 16;
rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)4);
}

result256_0 += result256_1;

if ((this.Length & 3) >= 2)
{
Vector256<float> pixels256_0 = Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef);
result256_0 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart), pixels256_0, result256_0);

bufferStart += 8;
rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)2);
}

Vector128<float> result128 = result256_0.GetLower() + result256_0.GetUpper();

return *(Vector4*)&result128;
if ((this.Length & 1) != 0)
{
Vector128<float> pixels128 = Unsafe.As<Vector4, Vector128<float>>(ref rowStartRef);
result128 = Vector128Utilities.MultiplyAddEstimate(Vector128.Load(bufferStart), pixels128, result128);
}

return *(Vector4*)&result128;
}
}
else
{
@@ -149,7 +199,7 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef)
result += rowStartRef * *bufferStart;

bufferStart++;
rowStartRef = ref Unsafe.Add(ref rowStartRef, 1);
rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)1);
}

return result;
@@ -160,17 +210,32 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef)
/// Copy the contents of <see cref="ResizeKernel"/> altering <see cref="StartIndex"/>
/// to the value <paramref name="left"/>.
/// </summary>
/// <param name="left">The new value for <see cref="StartIndex"/>.</param>
[MethodImpl(InliningOptions.ShortMethod)]
internal ResizeKernel AlterLeftValue(int left)
=> new(left, this.bufferPtr, this.Length);

internal void Fill(Span<double> values)
internal void FillOrCopyAndExpand(Span<float> values)
{
DebugGuard.IsTrue(values.Length == this.Length, nameof(values), "ResizeKernel.Fill: values.Length != this.Length!");

for (int i = 0; i < this.Length; i++)
if (Vector256.IsHardwareAccelerated)
{
Vector4* bufferStart = (Vector4*)this.bufferPtr;
ref float valuesStart = ref MemoryMarshal.GetReference(values);
ref float valuesEnd = ref Unsafe.Add(ref valuesStart, values.Length);

while (Unsafe.IsAddressLessThan(ref valuesStart, ref valuesEnd))
{
*bufferStart = new Vector4(valuesStart);

bufferStart++;
valuesStart = ref Unsafe.Add(ref valuesStart, (nuint)1);
}
}
else
{
this.Values[i] = (float)values[i];
values.CopyTo(this.Values);
}
}
}
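The straight `Vector512.Load`/`Vector256.Load` path in `ConvolveCore` above is enabled by `FillOrCopyAndExpand`: when vectorization is supported, each scalar weight is written out as a full `Vector4`, so the weights line up one-to-one with the `Vector4` pixels and the old per-iteration `vpermps` broadcast is no longer needed. A rough sketch of the resulting layout (the weights are made up):

```csharp
using System.Numerics;

// Scalar kernel produced by BuildKernel (Length = 3):
float[] weights = { 0.25f, 0.5f, 0.25f };

// Expanded buffer written by FillOrCopyAndExpand (Length * 4 floats):
//   [ w0, w0, w0, w0,  w1, w1, w1, w1,  w2, w2, w2, w2 ]
// Each group of four matches one RGBA pixel (one Vector4), so ConvolveCore
// can load two pixels (Vector256) or four pixels (Vector512) together with
// the corresponding weights using plain loads.
float[] expanded = new float[weights.Length * 4];
for (int i = 0; i < weights.Length; i++)
{
    new Vector4(weights[i]).CopyTo(expanded, i * 4);
}
```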
@@ -54,7 +54,7 @@ protected internal override void Initialize<TResampler>(in TResampler sampler)
int bottomStartDest = this.DestinationLength - this.cornerInterval;
for (int i = startOfFirstRepeatedMosaic; i < bottomStartDest; i++)
{
double center = ((i + .5) * this.ratio) - .5;
float center = (float)(((i + .5) * this.ratio) - .5);
int left = (int)TolerantMath.Ceiling(center - this.radius);
ResizeKernel kernel = this.kernels[i - this.period];
this.kernels[i] = kernel.AlterLeftValue(left);
@@ -33,7 +33,7 @@ internal partial class ResizeKernelMap : IDisposable
private bool isDisposed;

// To avoid both GC allocations, and MemoryAllocator ceremony:
private readonly double[] tempValues;
private readonly float[] tempValues;

private ResizeKernelMap(
MemoryAllocator memoryAllocator,
@@ -50,10 +50,19 @@ private ResizeKernelMap(
this.sourceLength = sourceLength;
this.DestinationLength = destinationLength;
this.MaxDiameter = (radius * 2) + 1;
this.data = memoryAllocator.Allocate2D<float>(this.MaxDiameter, bufferHeight, preferContiguosImageBuffers: true, AllocationOptions.Clean);

if (ResizeKernel.SupportsVectorization)
{
this.data = memoryAllocator.Allocate2D<float>(this.MaxDiameter * 4, bufferHeight, preferContiguosImageBuffers: true);
}
else
{
this.data = memoryAllocator.Allocate2D<float>(this.MaxDiameter, bufferHeight, preferContiguosImageBuffers: true);
}

this.pinHandle = this.data.DangerousGetSingleMemory().Pin();
this.kernels = new ResizeKernel[destinationLength];
this.tempValues = new double[this.MaxDiameter];
this.tempValues = new float[this.MaxDiameter];
}

/// <summary>
@@ -155,23 +164,23 @@ public static ResizeKernelMap Calculate<TResampler>(
bool hasAtLeast2Periods = 2 * (cornerInterval + period) < destinationSize;

ResizeKernelMap result = hasAtLeast2Periods
? new PeriodicKernelMap(
memoryAllocator,
sourceSize,
destinationSize,
ratio,
scale,
radius,
period,
cornerInterval)
: new ResizeKernelMap(
memoryAllocator,
sourceSize,
destinationSize,
destinationSize,
ratio,
scale,
radius);
? new PeriodicKernelMap(
memoryAllocator,
sourceSize,
destinationSize,
ratio,
scale,
radius,
period,
cornerInterval)
: new ResizeKernelMap(
memoryAllocator,
sourceSize,
destinationSize,
destinationSize,
ratio,
scale,
radius);

result.Initialize(in sampler);

@@ -198,7 +207,8 @@ protected internal virtual void Initialize<TResampler>(in TResampler sampler)
private ResizeKernel BuildKernel<TResampler>(in TResampler sampler, int destRowIndex, int dataRowIndex)
where TResampler : struct, IResampler
{
double center = ((destRowIndex + .5) * this.ratio) - .5;
float center = (float)(((destRowIndex + .5) * this.ratio) - .5);
float scale = (float)this.scale;

// Keep inside bounds.
int left = (int)TolerantMath.Ceiling(center - this.radius);
@@ -214,30 +224,25 @@ private ResizeKernel BuildKernel<TResampler>(in TResampler sampler, int destRowI
}

ResizeKernel kernel = this.CreateKernel(dataRowIndex, left, right);

Span<double> kernelValues = this.tempValues.AsSpan(0, kernel.Length);
double sum = 0;
Span<float> kernelValues = this.tempValues.AsSpan(0, kernel.Length);
ref float kernelStart = ref MemoryMarshal.GetReference(kernelValues);
float sum = 0;

for (int j = left; j <= right; j++)
{
double value = sampler.GetValue((float)((j - center) / this.scale));
float value = sampler.GetValue((j - center) / scale);
sum += value;

kernelValues[j - left] = value;
kernelStart = value;
kernelStart = ref Unsafe.Add(ref kernelStart, 1);
}

// Normalize, best to do it here rather than in the pixel loop later on.
if (sum > 0)
{
for (int j = 0; j < kernel.Length; j++)
{
// weights[w] = weights[w] / sum:
ref double kRef = ref kernelValues[j];
kRef /= sum;
}
Numerics.Normalize(kernelValues, sum);
}

kernel.Fill(kernelValues);
kernel.FillOrCopyAndExpand(kernelValues);

return kernel;
}
@@ -16,9 +16,7 @@ internal class ReferenceKernelMap
private readonly ReferenceKernel[] kernels;

public ReferenceKernelMap(ReferenceKernel[] kernels)
{
this.kernels = kernels;
}
=> this.kernels = kernels;

public int DestinationSize => this.kernels.Length;

@@ -28,22 +26,23 @@ public static ReferenceKernelMap Calculate<TResampler>(in TResampler sampler, in
where TResampler : struct, IResampler
{
double ratio = (double)sourceSize / destinationSize;
double scale = ratio;
double scaleD = ratio;

if (scale < 1F)
if (scaleD < 1)
{
scale = 1F;
scaleD = 1;
}

TolerantMath tolerantMath = TolerantMath.Default;

double radius = tolerantMath.Ceiling(scale * sampler.Radius);
double radius = tolerantMath.Ceiling(scaleD * sampler.Radius);

var result = new List<ReferenceKernel>();
List<ReferenceKernel> result = [];

float scale = (float)scaleD;
for (int i = 0; i < destinationSize; i++)
{
double center = ((i + .5) * ratio) - .5;
float center = (float)(((i + .5) * ratio) - .5);

// Keep inside bounds.
int left = (int)tolerantMath.Ceiling(center - radius);
@@ -58,15 +57,14 @@ public static ReferenceKernelMap Calculate<TResampler>(in TResampler sampler, in
right = sourceSize - 1;
}

double sum = 0;
float sum = 0;

double[] values = new double[right - left + 1];
float[] values = new float[right - left + 1];

for (int j = left; j <= right; j++)
{
double weight = sampler.GetValue((float)((j - center) / scale));
float weight = sampler.GetValue((j - center) / scale);
sum += weight;

values[j - left] = weight;
}

@@ -78,16 +76,14 @@ public static ReferenceKernelMap Calculate<TResampler>(in TResampler sampler, in
}
}

float[] floatVals = values.Select(v => (float)v).ToArray();

result.Add(new ReferenceKernel(left, floatVals));
result.Add(new ReferenceKernel(left, values));
}

return new ReferenceKernelMap(result.ToArray());
return new ReferenceKernelMap([.. result]);
}
}

internal struct ReferenceKernel
internal readonly struct ReferenceKernel
{
public ReferenceKernel(int left, float[] values)
{
@@ -102,8 +98,6 @@ public ReferenceKernel(int left, float[] values)
public int Length => this.Values.Length;

public static implicit operator ReferenceKernel(ResizeKernel orig)
{
return new ReferenceKernel(orig.StartIndex, orig.Values.ToArray());
}
=> new(orig.StartIndex, orig.Values.ToArray());
}
}
@@ -124,7 +124,6 @@ private void VerifyKernelMapContentIsCorrect<TResampler>(TResampler resampler, i
this.Output.WriteLine($"Expected KernelMap:\n{PrintKernelMap(referenceMap)}\n");
this.Output.WriteLine($"Actual KernelMap:\n{PrintKernelMap(kernelMap)}\n");
#endif
var comparer = new ApproximateFloatComparer(1e-6f);

for (int i = 0; i < kernelMap.DestinationLength; i++)
{
@@ -139,7 +138,29 @@ private void VerifyKernelMapContentIsCorrect<TResampler>(TResampler resampler, i
referenceKernel.Left == kernel.StartIndex,
$"referenceKernel.Left != kernel.Left: {referenceKernel.Left} != {kernel.StartIndex}");
float[] expectedValues = referenceKernel.Values;
Span<float> actualValues = kernel.Values;
Span<float> actualValues;

ApproximateFloatComparer comparer;
if (ResizeKernel.SupportsVectorization)
{
comparer = new ApproximateFloatComparer(1e-4f);

Assert.Equal(expectedValues.Length, kernel.Values.Length / 4);

int actualLength = referenceKernel.Length / 4;

actualValues = new float[expectedValues.Length];

for (int j = 0; j < expectedValues.Length; j++)
{
actualValues[j] = kernel.Values[j * 4];
}
}
else
{
comparer = new ApproximateFloatComparer(1e-6f);
actualValues = kernel.Values;
}

Assert.Equal(expectedValues.Length, actualValues.Length);
