From 2195828d46e621f10f937247f57f674fa3c096f7 Mon Sep 17 00:00:00 2001
From: Aleksandr Karpinskii <homm86@gmail.com>
Date: Tue, 24 Sep 2024 15:39:39 +0400
Subject: [PATCH 1/9] Optimize python part

---
 src/blurhash/__init__.py | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/src/blurhash/__init__.py b/src/blurhash/__init__.py
index 8bf74b7..1200a30 100644
--- a/src/blurhash/__init__.py
+++ b/src/blurhash/__init__.py
@@ -1,18 +1,13 @@
-from __future__ import absolute_import
-from itertools import chain
-
+from enum import Enum
 
 from PIL import Image
 
-from six.moves import zip
-from enum import Enum
-
 from ._functions import ffi as _ffi, lib as _lib
 from ._version import version as __version__
 
 
-__all__ = 'encode', 'decode', 'is_valid_blurhash', 'PixelMode', \
-          'BlurhashDecodeError', '__version__'
+__all__ = ('encode', 'decode', 'is_valid_blurhash', 'PixelMode',
+           'BlurhashDecodeError', '__version__')
 
 
 class PixelMode(Enum):
@@ -34,10 +29,7 @@ def encode(image, x_components, y_components):
         image = Image.open(image)
     if image.mode != 'RGB':
         image = image.convert('RGB')
-    red_band = image.getdata(band=0)
-    green_band = image.getdata(band=1)
-    blue_band = image.getdata(band=2)
-    rgb_data = list(chain.from_iterable(zip(red_band, green_band, blue_band)))
+    rgb_data = image.tobytes()
     width, height = image.size
     image.close()
 

From cbe5fbf1184716ebd7d47ffe7eb613c0e43aa428 Mon Sep 17 00:00:00 2001
From: Aleksandr Karpinskii <homm86@gmail.com>
Date: Tue, 24 Sep 2024 15:39:49 +0400
Subject: [PATCH 2/9] Optimize C code

---
 src/common.h | 10 +++++-----
 src/encode.c | 35 +++++++++++++++++++++++++++--------
 2 files changed, 32 insertions(+), 13 deletions(-)

diff --git a/src/common.h b/src/common.h
index ce58144..3cea89b 100644
--- a/src/common.h
+++ b/src/common.h
@@ -1,7 +1,7 @@
 #ifndef __BLURHASH_COMMON_H__
 #define __BLURHASH_COMMON_H__
 
-#include<math.h>
+#include <math.h>
 
 #ifndef M_PI
 #define M_PI 3.14159265358979323846
@@ -13,10 +13,10 @@ static inline int linearTosRGB(float value) {
 	else return (1.055 * powf(v, 1 / 2.4) - 0.055) * 255 + 0.5;
 }
 
-static inline float sRGBToLinear(int value) {
-	float v = (float)value / 255;
-	if(v <= 0.04045) return v / 12.92;
-	else return powf((v + 0.055) / 1.055, 2.4);
+static inline float sRGBToLinear(uint8_t value) {  /* NOTE(review): uint8_t comes from <stdint.h>; the only include visible in this header is <math.h> - confirm every includer pulls in <stdint.h> first */
+	float v = value * (1 / 255.0);  /* scale the 0..255 byte to 0..1; multiplies by a constant reciprocal instead of dividing */
+	if(v <= 0.04045) return v * (1 / 12.92);  /* low values: linear segment of the transfer curve */
+	else return powf((v + 0.055) * (1 /1.055), 2.4);  /* remaining values: power segment with exponent 2.4 */
 }
 
 static inline float signPow(float value, float exp) {
diff --git a/src/encode.c b/src/encode.c
index aa2a990..4628dea 100644
--- a/src/encode.c
+++ b/src/encode.c
@@ -1,13 +1,9 @@
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
 
 #include "common.h"
 
-#ifndef M_PI
-#define M_PI 3.14159265358979323846
-#endif
 
 struct RGB {
 	float r;
@@ -21,6 +17,18 @@ static char *encode_int(int value, int length, char *destination);
 static int encodeDC(float r, float g, float b);
 static int encodeAC(float r, float g, float b, float maximumValue);
 
+static float sRGBToLinear_cache[256];
+static int sRGBToLinear_cache_ready = 0;
+
+/* Fill the sRGB-byte -> linear lookup table on first use. Static storage: no allocation that can fail or leak. */
+static void init_sRGBToLinear_cache(void) {
+	if (sRGBToLinear_cache_ready) return;
+	for (int x = 0; x < 256; x++) {
+		sRGBToLinear_cache[x] = sRGBToLinear(x);
+	}
+	sRGBToLinear_cache_ready = 1;
+}
+
 const char *blurHashForPixels(int xComponents, int yComponents, int width, int height, uint8_t *rgb, size_t bytesPerRow, char *destination) {
 	if(xComponents < 1 || xComponents > 9) return NULL;
 	if(yComponents < 1 || yComponents > 9) return NULL;
@@ -32,6 +40,8 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h
 #endif
 	memset(factors, 0, sizeof(factors));
 
+	init_sRGBToLinear_cache();
+
 	for(int y = 0; y < yComponents; y++) {
 		for(int x = 0; x < xComponents; x++) {
 			struct RGB factor = multiplyBasisFunction(x, y, width, height, rgb, bytesPerRow);
@@ -78,16 +88,25 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h
 static struct RGB multiplyBasisFunction(int xComponent, int yComponent, int width, int height, uint8_t *rgb, size_t bytesPerRow) {
 	struct RGB result = { 0, 0, 0 };
 	float normalisation = (xComponent == 0 && yComponent == 0) ? 1 : 2;
+	float *cosx = (float *)malloc(sizeof(float) * width);
+
+	for(int x = 0; x < width; x++) {
+		cosx[x] = cosf(M_PI * xComponent * x / width);
+	}
 
 	for(int y = 0; y < height; y++) {
+		float cosy = cosf(M_PI * yComponent * y / height);
+		uint8_t *src = rgb + y * bytesPerRow;
 		for(int x = 0; x < width; x++) {
-			float basis = cosf(M_PI * xComponent * x / width) * cosf(M_PI * yComponent * y / height);
-			result.r += basis * sRGBToLinear(rgb[3 * x + 0 + y * bytesPerRow]);
-			result.g += basis * sRGBToLinear(rgb[3 * x + 1 + y * bytesPerRow]);
-			result.b += basis * sRGBToLinear(rgb[3 * x + 2 + y * bytesPerRow]);
+			float basis = cosy * cosx[x];
+			result.r += basis * sRGBToLinear_cache[src[3 * x + 0]];
+			result.g += basis * sRGBToLinear_cache[src[3 * x + 1]];
+			result.b += basis * sRGBToLinear_cache[src[3 * x + 2]];
 		}
 	}
 
+	free(cosx);
+
 	float scale = normalisation / (width * height);
 
 	result.r *= scale;

From 6abedbafaf74d0e3d4b1b263b3ae0fc9ecefa882 Mon Sep 17 00:00:00 2001
From: Aleksandr Karpinskii <homm86@gmail.com>
Date: Tue, 24 Sep 2024 16:47:27 +0400
Subject: [PATCH 3/9] Allocate cosx array in safe function

---
 src/encode.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/encode.c b/src/encode.c
index 4628dea..81d96fc 100644
--- a/src/encode.c
+++ b/src/encode.c
@@ -11,7 +11,7 @@ struct RGB {
 	float b;
 };
 
-static struct RGB multiplyBasisFunction(int xComponent, int yComponent, int width, int height, uint8_t *rgb, size_t bytesPerRow);
+static struct RGB multiplyBasisFunction(int xComponent, int yComponent, int width, int height, uint8_t *rgb, size_t bytesPerRow, float *cosx);
 static char *encode_int(int value, int length, char *destination);
 
 static int encodeDC(float r, float g, float b);
@@ -42,14 +42,17 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h
 
 	init_sRGBToLinear_cache();
 
+	float *cosx = (float *)malloc(sizeof(float) * width);
+	if (! cosx) return NULL;
 	for(int y = 0; y < yComponents; y++) {
 		for(int x = 0; x < xComponents; x++) {
-			struct RGB factor = multiplyBasisFunction(x, y, width, height, rgb, bytesPerRow);
+			struct RGB factor = multiplyBasisFunction(x, y, width, height, rgb, bytesPerRow, cosx);
 			factors[y * xComponents + x][0] = factor.r;
 			factors[y * xComponents + x][1] = factor.g;
 			factors[y * xComponents + x][2] = factor.b;
 		}
 	}
+	free(cosx);
 
 	float *dc = factors[0];
 	float *ac = dc + 3;
@@ -85,10 +88,9 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h
 	return destination;
 }
 
-static struct RGB multiplyBasisFunction(int xComponent, int yComponent, int width, int height, uint8_t *rgb, size_t bytesPerRow) {
+static struct RGB multiplyBasisFunction(int xComponent, int yComponent, int width, int height, uint8_t *rgb, size_t bytesPerRow, float *cosx) {
 	struct RGB result = { 0, 0, 0 };
 	float normalisation = (xComponent == 0 && yComponent == 0) ? 1 : 2;
-	float *cosx = (float *)malloc(sizeof(float) * width);
 
 	for(int x = 0; x < width; x++) {
 		cosx[x] = cosf(M_PI * xComponent * x / width);
@@ -105,8 +107,6 @@ static struct RGB multiplyBasisFunction(int xComponent, int yComponent, int widt
 		}
 	}
 
-	free(cosx);
-
 	float scale = normalisation / (width * height);
 
 	result.r *= scale;

From ab47f4ce4a6f752a92163e647c6f2eeed2503a5c Mon Sep 17 00:00:00 2001
From: Aleksandr Karpinskii <homm86@gmail.com>
Date: Tue, 24 Sep 2024 16:53:17 +0400
Subject: [PATCH 4/9] Remove extra initialization

---
 src/encode.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/encode.c b/src/encode.c
index 81d96fc..7058367 100644
--- a/src/encode.c
+++ b/src/encode.c
@@ -33,12 +33,7 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h
 	if(xComponents < 1 || xComponents > 9) return NULL;
 	if(yComponents < 1 || yComponents > 9) return NULL;
 
-#ifndef _MSC_VER
-	float factors[yComponents * xComponents][3];
-#else
 	float factors[9 * 9][3];
-#endif
-	memset(factors, 0, sizeof(factors));
 
 	init_sRGBToLinear_cache();
 

From 9e744cf678d51a4f3d88bcff6320c32cd806bb12 Mon Sep 17 00:00:00 2001
From: Aleksandr Karpinskii <homm86@gmail.com>
Date: Tue, 24 Sep 2024 17:00:41 +0400
Subject: [PATCH 5/9] Release Pillow image only if we opened it

---
 src/blurhash/__init__.py | 16 ++++++++++------
 tests/test_encode.py     | 10 +++++++---
 2 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/src/blurhash/__init__.py b/src/blurhash/__init__.py
index 1200a30..d0988e0 100644
--- a/src/blurhash/__init__.py
+++ b/src/blurhash/__init__.py
@@ -1,3 +1,4 @@
+from contextlib import nullcontext
 from enum import Enum
 
 from PIL import Image
@@ -25,13 +26,16 @@ def __str__(self):
 
 
 def encode(image, x_components, y_components):
-    if not isinstance(image, Image.Image):
+    if isinstance(image, Image.Image):
+        image_context = nullcontext()
+    else:
         image = Image.open(image)
-    if image.mode != 'RGB':
-        image = image.convert('RGB')
-    rgb_data = image.tobytes()
-    width, height = image.size
-    image.close()
+        image_context = image
+    with image_context:
+        if image.mode != 'RGB':
+            image = image.convert('RGB')
+        rgb_data = image.tobytes()
+        width, height = image.size
 
     rgb = _ffi.new('uint8_t[]', rgb_data)
     bytes_per_row = _ffi.cast('size_t', width * 3)
diff --git a/tests/test_encode.py b/tests/test_encode.py
index 5d7cb27..7c7b931 100644
--- a/tests/test_encode.py
+++ b/tests/test_encode.py
@@ -13,11 +13,15 @@ def test_encode_file():
     assert result == 'LlMF%n00%#MwS|WCWEM{R*bbWBbH'
 
 
-def test_encode_pil_image():
+def test_encode_pil_image_twise():
     with Image.open('tests/pic2.png') as image:
-        result = encode(image, 4, 3)
+        image = image.convert('RGB')  # already RGB, so encode() works on this very object
+        result1 = encode(image, 4, 3)
+        # Regression check: encode() must leave a caller-supplied image open, so a second call must not raise
+        result2 = encode(image, 4, 3)
 
-    assert result == 'LlMF%n00%#MwS|WCWEM{R*bbWBbH'
+    assert result1 == result2
+    assert result1 == 'LlMF%n00%#MwS|WCWEM{R*bbWBbH'
 
 
 def test_encode_with_filename():

From f61730133a72a6870ddaf2e5b6fb6f9f2b8abe9e Mon Sep 17 00:00:00 2001
From: Aleksandr Karpinskii <homm86@gmail.com>
Date: Tue, 24 Sep 2024 17:08:58 +0400
Subject: [PATCH 6/9] Remove extra requirements

---
 setup.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/setup.py b/setup.py
index a39252b..d6a94ce 100755
--- a/setup.py
+++ b/setup.py
@@ -26,7 +26,6 @@
     install_requires=[
         'cffi',
         'Pillow',
-        'six',
     ],
     setup_requires=[
         'cffi',

From f4a1ed0f713cc92a4feaddb052e855e10afa6d49 Mon Sep 17 00:00:00 2001
From: Aleksandr Karpinskii <homm86@gmail.com>
Date: Tue, 24 Sep 2024 18:31:37 +0400
Subject: [PATCH 7/9] Calculate cosx and cosy once

---
 src/encode.c | 33 +++++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/src/encode.c b/src/encode.c
index 7058367..3ede7df 100644
--- a/src/encode.c
+++ b/src/encode.c
@@ -11,7 +11,7 @@ struct RGB {
 	float b;
 };
 
-static struct RGB multiplyBasisFunction(int xComponent, int yComponent, int width, int height, uint8_t *rgb, size_t bytesPerRow, float *cosx);
+static struct RGB multiplyBasisFunction(int xComponent, int yComponent, int width, int height, uint8_t *rgb, size_t bytesPerRow, float *cosx, float *cosy);
 static char *encode_int(int value, int length, char *destination);
 
 static int encodeDC(float r, float g, float b);
@@ -37,17 +37,32 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h
 
 	init_sRGBToLinear_cache();
 
-	float *cosx = (float *)malloc(sizeof(float) * width);
+	float *cosx = (float *)malloc(sizeof(float) * width * xComponents);
 	if (! cosx) return NULL;
+	float *cosy = (float *)malloc(sizeof(float) * height);
+	if (! cosy) {
+		free(cosx);
+		return NULL;
+	}
+	for(int x = 0; x < xComponents; x++) {
+		for(int i = 0; i < width; i++) {
+			cosx[x * width + i] = cosf(M_PI * x * i / width);
+		}
+	}
 	for(int y = 0; y < yComponents; y++) {
+		for(int i = 0; i < height; i++) {
+			cosy[i] = cosf(M_PI * y * i / height);
+		}
 		for(int x = 0; x < xComponents; x++) {
-			struct RGB factor = multiplyBasisFunction(x, y, width, height, rgb, bytesPerRow, cosx);
+			struct RGB factor = multiplyBasisFunction(x, y, width, height, rgb, bytesPerRow,
+				cosx + x * width, cosy);
 			factors[y * xComponents + x][0] = factor.r;
 			factors[y * xComponents + x][1] = factor.g;
 			factors[y * xComponents + x][2] = factor.b;
 		}
 	}
 	free(cosx);
+	free(cosy);
 
 	float *dc = factors[0];
 	float *ac = dc + 3;
@@ -83,19 +98,17 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h
 	return destination;
 }
 
-static struct RGB multiplyBasisFunction(int xComponent, int yComponent, int width, int height, uint8_t *rgb, size_t bytesPerRow, float *cosx) {
+static struct RGB multiplyBasisFunction(
+	int xComponent, int yComponent, int width, int height, uint8_t *rgb, size_t bytesPerRow,
+	float *cosx, float *cosy
+) {
 	struct RGB result = { 0, 0, 0 };
 	float normalisation = (xComponent == 0 && yComponent == 0) ? 1 : 2;
 
-	for(int x = 0; x < width; x++) {
-		cosx[x] = cosf(M_PI * xComponent * x / width);
-	}
-
 	for(int y = 0; y < height; y++) {
-		float cosy = cosf(M_PI * yComponent * y / height);
 		uint8_t *src = rgb + y * bytesPerRow;
 		for(int x = 0; x < width; x++) {
-			float basis = cosy * cosx[x];
+			float basis = cosy[y] * cosx[x];
 			result.r += basis * sRGBToLinear_cache[src[3 * x + 0]];
 			result.g += basis * sRGBToLinear_cache[src[3 * x + 1]];
 			result.b += basis * sRGBToLinear_cache[src[3 * x + 2]];

From ae86c2dc439d21dfc9e200b6f6315ab8fb41913d Mon Sep 17 00:00:00 2001
From: Aleksandr Karpinskii <homm86@gmail.com>
Date: Tue, 24 Sep 2024 21:59:32 +0400
Subject: [PATCH 8/9] Calculate factors in one pass

---
 src/encode.c | 90 ++++++++++++++++++++++++++++------------------------
 1 file changed, 49 insertions(+), 41 deletions(-)

diff --git a/src/encode.c b/src/encode.c
index 3ede7df..b218fb5 100644
--- a/src/encode.c
+++ b/src/encode.c
@@ -11,7 +11,9 @@ struct RGB {
 	float b;
 };
 
-static struct RGB multiplyBasisFunction(int xComponent, int yComponent, int width, int height, uint8_t *rgb, size_t bytesPerRow, float *cosx, float *cosy);
+static void multiplyBasisFunction(
+	struct RGB *factors, int factorsCount, int width, int height, uint8_t *rgb, size_t bytesPerRow,
+	float *cosX, float *cosY);
 static char *encode_int(int value, int length, char *destination);
 
 static int encodeDC(float r, float g, float b);
@@ -33,40 +35,41 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h
 	if(xComponents < 1 || xComponents > 9) return NULL;
 	if(yComponents < 1 || yComponents > 9) return NULL;
 
-	float factors[9 * 9][3];
+	struct RGB factors[9 * 9] = {0};
+	int factorsCount = xComponents * yComponents;
 
 	init_sRGBToLinear_cache();
 
-	float *cosx = (float *)malloc(sizeof(float) * width * xComponents);
-	if (! cosx) return NULL;
-	float *cosy = (float *)malloc(sizeof(float) * height);
-	if (! cosy) {
-		free(cosx);
+	float *cosX = (float *)malloc(sizeof(float) * width * factorsCount);
+	if (! cosX) return NULL;
+	float *cosY = (float *)malloc(sizeof(float) * height * factorsCount);
+	if (! cosY) {
+		free(cosX);
 		return NULL;
 	}
-	for(int x = 0; x < xComponents; x++) {
-		for(int i = 0; i < width; i++) {
-			cosx[x * width + i] = cosf(M_PI * x * i / width);
+	for(int i = 0; i < width; i++) {
+		for(int x = 0; x < xComponents; x++) {
+			float weight = cosf(M_PI * x * i / width);
+			for(int y = 0; y < yComponents; y++) {
+				cosX[i * factorsCount + y * xComponents + x] = weight;
+			}
 		}
 	}
-	for(int y = 0; y < yComponents; y++) {
-		for(int i = 0; i < height; i++) {
-			cosy[i] = cosf(M_PI * y * i / height);
-		}
-		for(int x = 0; x < xComponents; x++) {
-			struct RGB factor = multiplyBasisFunction(x, y, width, height, rgb, bytesPerRow,
-				cosx + x * width, cosy);
-			factors[y * xComponents + x][0] = factor.r;
-			factors[y * xComponents + x][1] = factor.g;
-			factors[y * xComponents + x][2] = factor.b;
+	for(int i = 0; i < height; i++) {
+		for(int y = 0; y < yComponents; y++) {
+			float weight = cosf(M_PI * y * i / height);
+			for(int x = 0; x < xComponents; x++) {
+				cosY[i * factorsCount + y * xComponents + x] = weight;
+			}
 		}
 	}
-	free(cosx);
-	free(cosy);
+	multiplyBasisFunction(factors, factorsCount, width, height, rgb, bytesPerRow, cosX, cosY);
+	free(cosX);
+	free(cosY);
 
-	float *dc = factors[0];
+	float *dc = (float *)factors;
 	float *ac = dc + 3;
-	int acCount = xComponents * yComponents - 1;
+	int acCount = factorsCount - 1;
 	char *ptr = destination;
 
 	int sizeFlag = (xComponents - 1) + (yComponents - 1) * 9;
@@ -98,30 +101,35 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h
 	return destination;
 }
 
-static struct RGB multiplyBasisFunction(
-	int xComponent, int yComponent, int width, int height, uint8_t *rgb, size_t bytesPerRow,
-	float *cosx, float *cosy
+static void multiplyBasisFunction(
+	struct RGB *factors, int factorsCount, int width, int height, uint8_t *rgb, size_t bytesPerRow,
+	float *cosX, float *cosY
 ) {
-	struct RGB result = { 0, 0, 0 };
-	float normalisation = (xComponent == 0 && yComponent == 0) ? 1 : 2;
-
 	for(int y = 0; y < height; y++) {
 		uint8_t *src = rgb + y * bytesPerRow;
+		float *cosYLocal = cosY + y * factorsCount;
 		for(int x = 0; x < width; x++) {
-			float basis = cosy[y] * cosx[x];
-			result.r += basis * sRGBToLinear_cache[src[3 * x + 0]];
-			result.g += basis * sRGBToLinear_cache[src[3 * x + 1]];
-			result.b += basis * sRGBToLinear_cache[src[3 * x + 2]];
+			float pixel[3];
+			float *cosXLocal = cosX + x * factorsCount;
+			pixel[0] = sRGBToLinear_cache[src[3 * x + 0]];
+			pixel[1] = sRGBToLinear_cache[src[3 * x + 1]];
+			pixel[2] = sRGBToLinear_cache[src[3 * x + 2]];
+			for (int i = 0; i < factorsCount; i++) {
+				float basis = cosYLocal[i] * cosXLocal[i];
+				factors[i].r += basis * pixel[0];
+				factors[i].g += basis * pixel[1];
+				factors[i].b += basis * pixel[2];
+			}
 		}
 	}
 
-	float scale = normalisation / (width * height);
-
-	result.r *= scale;
-	result.g *= scale;
-	result.b *= scale;
-
-	return result;
+	for (int i = 0; i < factorsCount; i++) {
+		float normalisation = (i == 0) ? 1 : 2;
+		float scale = normalisation / (width * height);
+		factors[i].r *= scale;
+		factors[i].g *= scale;
+		factors[i].b *= scale;
+	}
 }
 
 static int encodeDC(float r, float g, float b) {

From f077928a520d1861ba8f5afc857802e2052bfe49 Mon Sep 17 00:00:00 2001
From: Aleksandr Karpinskii <homm86@gmail.com>
Date: Fri, 11 Oct 2024 19:58:18 +0400
Subject: [PATCH 9/9] Update to latest versions from optimization branch

---
 src/encode.c | 54 ++++++++++++++++++++++++++++++++--------------------
 1 file changed, 33 insertions(+), 21 deletions(-)

diff --git a/src/encode.c b/src/encode.c
index b218fb5..f329ba3 100644
--- a/src/encode.c
+++ b/src/encode.c
@@ -5,14 +5,8 @@
 #include "common.h"
 
 
-struct RGB {
-	float r;
-	float g;
-	float b;
-};
-
 static void multiplyBasisFunction(
-	struct RGB *factors, int factorsCount, int width, int height, uint8_t *rgb, size_t bytesPerRow,
+	float factors[][4], int factorsCount, int width, int height, uint8_t *rgb, size_t bytesPerRow,
 	float *cosX, float *cosY);
 static char *encode_int(int value, int length, char *destination);
 
@@ -35,8 +29,9 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h
 	if(xComponents < 1 || xComponents > 9) return NULL;
 	if(yComponents < 1 || yComponents > 9) return NULL;
 
-	struct RGB factors[9 * 9] = {0};
+	float factors[9 * 9][4];  /* sized for the 9x9 maximum validated above; a variable-length array would not compile with MSVC */
 	int factorsCount = xComponents * yComponents;
+	memset(factors, 0, sizeof(factors));
 
 	init_sRGBToLinear_cache();
 
@@ -67,8 +62,8 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h
 	free(cosX);
 	free(cosY);
 
-	float *dc = (float *)factors;
-	float *ac = dc + 3;
+	float *dc = factors[0];
+	float *ac = dc + 4;
 	int acCount = factorsCount - 1;
 	char *ptr = destination;
 
@@ -78,7 +73,7 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h
 	float maximumValue;
 	if(acCount > 0) {
 		float actualMaximumValue = 0;
-		for(int i = 0; i < acCount * 3; i++) {
+		for(int i = 0; i < acCount * 4; i++) {
 			actualMaximumValue = fmaxf(fabsf(ac[i]), actualMaximumValue);
 		}
 
@@ -93,7 +88,7 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h
 	ptr = encode_int(encodeDC(dc[0], dc[1], dc[2]), 4, ptr);
 
 	for(int i = 0; i < acCount; i++) {
-		ptr = encode_int(encodeAC(ac[i * 3 + 0], ac[i * 3 + 1], ac[i * 3 + 2], maximumValue), 2, ptr);
+		ptr = encode_int(encodeAC(ac[i * 4 + 0], ac[i * 4 + 1], ac[i * 4 + 2], maximumValue), 2, ptr);
 	}
 
 	*ptr = 0;
@@ -102,23 +97,40 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h
 }
 
 static void multiplyBasisFunction(
-	struct RGB *factors, int factorsCount, int width, int height, uint8_t *rgb, size_t bytesPerRow,
+	float factors[][4], int factorsCount, int width, int height, uint8_t *rgb, size_t bytesPerRow,  /* factors: 4 floats per component; only [0..2] (R, G, B) are accumulated and scaled here */
 	float *cosX, float *cosY
 ) {
 	for(int y = 0; y < height; y++) {
 		uint8_t *src = rgb + y * bytesPerRow;
 		float *cosYLocal = cosY + y * factorsCount;
-		for(int x = 0; x < width; x++) {
-			float pixel[3];
+		int x = 0;  /* declared outside the loops so the tail loop resumes where the unrolled loop stopped */
+		for(; x < width - 3; x += 4) {  /* main loop: four pixels per iteration */
+			float *cosXLocal = cosX + x * factorsCount;  /* weights for pixels x..x+3 lie back to back, factorsCount floats per pixel */
+			float pixel0[4] = {sRGBToLinear_cache[src[3 * (x+0) + 0]], sRGBToLinear_cache[src[3 * (x+0) + 1]], sRGBToLinear_cache[src[3 * (x+0) + 2]]};  /* linearised R, G, B; the 4th element is left zero-initialised */
+			float pixel1[4] = {sRGBToLinear_cache[src[3 * (x+1) + 0]], sRGBToLinear_cache[src[3 * (x+1) + 1]], sRGBToLinear_cache[src[3 * (x+1) + 2]]};
+			float pixel2[4] = {sRGBToLinear_cache[src[3 * (x+2) + 0]], sRGBToLinear_cache[src[3 * (x+2) + 1]], sRGBToLinear_cache[src[3 * (x+2) + 2]]};
+			float pixel3[4] = {sRGBToLinear_cache[src[3 * (x+3) + 0]], sRGBToLinear_cache[src[3 * (x+3) + 1]], sRGBToLinear_cache[src[3 * (x+3) + 2]]};
+			for (int i = 0; i < factorsCount; i++) {
+				float basis0 = cosYLocal[i] * cosXLocal[i + 0 * factorsCount];  /* basis of component i at pixel x+0 */
+				float basis1 = cosYLocal[i] * cosXLocal[i + 1 * factorsCount];
+				float basis2 = cosYLocal[i] * cosXLocal[i + 2 * factorsCount];
+				float basis3 = cosYLocal[i] * cosXLocal[i + 3 * factorsCount];
+				factors[i][0] += basis0 * pixel0[0] + basis1 * pixel1[0] + basis2 * pixel2[0] + basis3 * pixel3[0];  /* the four contributions are summed first, then added to the accumulator */
+				factors[i][1] += basis0 * pixel0[1] + basis1 * pixel1[1] + basis2 * pixel2[1] + basis3 * pixel3[1];
+				factors[i][2] += basis0 * pixel0[2] + basis1 * pixel1[2] + basis2 * pixel2[2] + basis3 * pixel3[2];
+			}
+		}
+		for(; x < width; x++) {  /* tail loop: the pixels left over after the 4-wide loop */
+			float pixel[4];
 			float *cosXLocal = cosX + x * factorsCount;
 			pixel[0] = sRGBToLinear_cache[src[3 * x + 0]];
 			pixel[1] = sRGBToLinear_cache[src[3 * x + 1]];
 			pixel[2] = sRGBToLinear_cache[src[3 * x + 2]];
 			for (int i = 0; i < factorsCount; i++) {
 				float basis = cosYLocal[i] * cosXLocal[i];
-				factors[i].r += basis * pixel[0];
-				factors[i].g += basis * pixel[1];
-				factors[i].b += basis * pixel[2];
+				factors[i][0] += basis * pixel[0];
+				factors[i][1] += basis * pixel[1];
+				factors[i][2] += basis * pixel[2];
 			}
 		}
 	}
@@ -126,9 +138,9 @@ static void multiplyBasisFunction(
 	for (int i = 0; i < factorsCount; i++) {
 		float normalisation = (i == 0) ? 1 : 2;
 		float scale = normalisation / (width * height);
-		factors[i].r *= scale;
-		factors[i].g *= scale;
-		factors[i].b *= scale;
+		factors[i][0] *= scale;  /* average over all pixels; every component except the first (i == 0) is doubled */
+		factors[i][1] *= scale;
+		factors[i][2] *= scale;
 	}
 }