albumentations-team
diff --git a/‎.pre-commit-config.yaml
+2-2 b/‎.pre-commit-config.yaml
+2-2
diff --git a/‎albucore/functions.py
+26-18 b/‎albucore/functions.py
+26-18
diff --git a/‎albucore/utils.py
+3-1 b/‎albucore/utils.py
+3-1
diff --git a/‎benchmark/albucore_benchmark/results/float32_1/AddArray.md
+4-4 b/‎benchmark/albucore_benchmark/results/float32_1/AddArray.md
+4-4
diff --git a/‎benchmark/albucore_benchmark/results/float32_1/AddConstant.md
+4-4 b/‎benchmark/albucore_benchmark/results/float32_1/AddConstant.md
+4-4
diff --git a/‎benchmark/albucore_benchmark/results/float32_1/AddVector.md
+4-4 b/‎benchmark/albucore_benchmark/results/float32_1/AddVector.md
+4-4
diff --git a/‎benchmark/albucore_benchmark/results/float32_1/AddWeighted.md
+4-4 b/‎benchmark/albucore_benchmark/results/float32_1/AddWeighted.md
+4-4
diff --git a/‎benchmark/albucore_benchmark/results/float32_1/FromFloat.md
+4-4 b/‎benchmark/albucore_benchmark/results/float32_1/FromFloat.md
+4-4
diff --git a/‎benchmark/albucore_benchmark/results/float32_1/HorizontalFlip.md
+4-4 b/‎benchmark/albucore_benchmark/results/float32_1/HorizontalFlip.md
+4-4
diff --git a/‎benchmark/albucore_benchmark/results/float32_1/MultiplyAdd.md
+4-4 b/‎benchmark/albucore_benchmark/results/float32_1/MultiplyAdd.md
+4-4
diff --git a/‎benchmark/albucore_benchmark/results/float32_1/MultiplyArray.md
+4-4 b/‎benchmark/albucore_benchmark/results/float32_1/MultiplyArray.md
+4-4
diff --git a/‎benchmark/albucore_benchmark/results/float32_1/MultiplyConstant.md
+4-4 b/‎benchmark/albucore_benchmark/results/float32_1/MultiplyConstant.md
+4-4
diff --git a/‎benchmark/albucore_benchmark/results/float32_1/MultiplyVector.md
+4-4 b/‎benchmark/albucore_benchmark/results/float32_1/MultiplyVector.md
+4-4
diff --git a/‎benchmark/albucore_benchmark/results/float32_1/Normalize.md
+4-4 b/‎benchmark/albucore_benchmark/results/float32_1/Normalize.md
+4-4
@@ -53,13 +53,13 @@ repos:
   #   hooks:
   #     - id: markdownlint
   - repo: https://github.com/tox-dev/pyproject-fmt
-    rev: "v2.4.3"
+    rev: "v2.5.0"
     hooks:
       - id: pyproject-fmt
         additional_dependencies: ["tomli"]
   - repo: https://github.com/astral-sh/ruff-pre-commit
     # Ruff version.
-    rev: v0.7.1
+    rev: v0.7.2
     hooks:
       # Run the linter.
       - id: ruff
 
@@ -32,7 +32,7 @@ def add_weighted_simsimd(img1: np.ndarray, weight1: float, img2: np.ndarray, wei
     original_dtype = img1.dtype
 
     if img2.dtype != original_dtype:
-        img2 = clip(img2.astype(original_dtype), original_dtype)
+        img2 = clip(img2.astype(original_dtype), original_dtype, inplace=True)
 
     return np.frombuffer(
         ss.wsum(img1.reshape(-1), img2.astype(original_dtype).reshape(-1), alpha=weight1, beta=weight2),
@@ -96,7 +96,7 @@ def apply_lut(
 
     num_channels = img.shape[-1]
     luts = create_lut_array(dtype, value, operation)
-    return cv2.merge([sz_lut(img[:, :, i], clip(luts[i], dtype), inplace) for i in range(num_channels)])
+    return cv2.merge([sz_lut(img[:, :, i], clip(luts[i], dtype, inplace=False), inplace) for i in range(num_channels)])
 
 
 def prepare_value_opencv(
@@ -212,7 +212,7 @@ def multiply(img: np.ndarray, value: ValueType, inplace: bool = False) -> np.nda
 
 
 @preserve_channel_dim
-def add_opencv(img: np.ndarray, value: np.ndarray | float) -> np.ndarray:
+def add_opencv(img: np.ndarray, value: np.ndarray | float, inplace: bool = False) -> np.ndarray:
     value = prepare_value_opencv(img, value, "add")
 
     # Convert to float32 if:
@@ -225,7 +225,9 @@ def add_opencv(img: np.ndarray, value: np.ndarray | float) -> np.ndarray:
     if needs_float:
         return cv2.add(img.astype(np.float32), value if isinstance(value, (int, float)) else value.astype(np.float32))
 
-    return cv2.add(img, value)
+    # Use img as the destination array if inplace=True
+    dst = img if inplace else None
+    return cv2.add(img, value, dst=dst)
 
 
 def add_numpy(img: np.ndarray, value: float | np.ndarray) -> np.ndarray:
@@ -237,20 +239,20 @@ def add_lut(img: np.ndarray, value: np.ndarray | float, inplace: bool) -> np.nda
 
 
 @clipped
-def add_constant(img: np.ndarray, value: float) -> np.ndarray:
-    return add_opencv(img, value)
+def add_constant(img: np.ndarray, value: float, inplace: bool = False) -> np.ndarray:
+    return add_opencv(img, value, inplace)
 
 
 @clipped
 def add_vector(img: np.ndarray, value: np.ndarray, inplace: bool) -> np.ndarray:
     if img.dtype == np.uint8:
         return add_lut(img, value, inplace)
-    return add_opencv(img, value)
+    return add_opencv(img, value, inplace)
 
 
 @clipped
-def add_array(img: np.ndarray, value: np.ndarray) -> np.ndarray:
-    return add_opencv(img, value)
+def add_array(img: np.ndarray, value: np.ndarray, inplace: bool = False) -> np.ndarray:
+    return add_opencv(img, value, inplace)
 
 
 def add(img: np.ndarray, value: ValueType, inplace: bool = False) -> np.ndarray:
@@ -264,9 +266,9 @@ def add(img: np.ndarray, value: ValueType, inplace: bool = False) -> np.ndarray:
         if img.dtype == np.uint8:
             value = int(value)
 
-        return add_constant(img, value)
+        return add_constant(img, value, inplace)
 
-    return add_vector(img, value, inplace) if value.ndim == 1 else add_array(img, value)
+    return add_vector(img, value, inplace) if value.ndim == 1 else add_array(img, value, inplace)
 
 
 def normalize_numpy(img: np.ndarray, mean: float | np.ndarray, denominator: float | np.ndarray) -> np.ndarray:
@@ -371,11 +373,17 @@ def add_weighted_numpy(img1: np.ndarray, weight1: float, img2: np.ndarray, weigh
 
 @preserve_channel_dim
 def add_weighted_opencv(img1: np.ndarray, weight1: float, img2: np.ndarray, weight2: float) -> np.ndarray:
-    return cv2.addWeighted(img1.astype(np.float32), weight1, img2.astype(np.float32), weight2, 0)
+    return cv2.addWeighted(img1, weight1, img2, weight2, 0)
 
 
 @preserve_channel_dim
-def add_weighted_lut(img1: np.ndarray, weight1: float, img2: np.ndarray, weight2: float) -> np.ndarray:
+def add_weighted_lut(
+    img1: np.ndarray,
+    weight1: float,
+    img2: np.ndarray,
+    weight2: float,
+    inplace: bool = False,
+) -> np.ndarray:
     dtype = img1.dtype
     max_value = MAX_VALUES_BY_DTYPE[dtype]
 
@@ -389,15 +397,15 @@ def add_weighted_lut(img1: np.ndarray, weight1: float, img2: np.ndarray, weight2
         return np.zeros_like(img1)
 
     if weight1 == 1 and weight2 == 1:
-        return add_array(img1, img2)
+        return add_array(img1, img2, inplace)
 
     lut1 = np.arange(0, max_value + 1, dtype=np.float32) * weight1
     result1 = cv2.LUT(img1, lut1)
 
     lut2 = np.arange(0, max_value + 1, dtype=np.float32) * weight2
     result2 = cv2.LUT(img2, lut2)
 
-    return add_opencv(result1, result2)
+    return add_opencv(result1, result2, inplace)
 
 
 @clipped
@@ -437,7 +445,7 @@ def multiply_add_lut(img: np.ndarray, factor: ValueType, value: ValueType, inpla
     num_channels = get_num_channels(img)
 
     if isinstance(factor, (float, int)) and isinstance(value, (float, int)):
-        lut = clip(np.arange(0, max_value + 1, dtype=np.float32) * factor + value, dtype)
+        lut = clip(np.arange(0, max_value + 1, dtype=np.float32) * factor + value, dtype, inplace=False)
         return sz_lut(img, lut, inplace)
 
     if isinstance(factor, np.ndarray) and factor.shape != ():
@@ -446,7 +454,7 @@ def multiply_add_lut(img: np.ndarray, factor: ValueType, value: ValueType, inpla
     if isinstance(value, np.ndarray) and value.shape != ():
         value = value.reshape(-1, 1)
 
-    luts = clip(np.arange(0, max_value + 1, dtype=np.float32) * factor + value, dtype)
+    luts = clip(np.arange(0, max_value + 1, dtype=np.float32) * factor + value, dtype, inplace=True)
 
     return cv2.merge([sz_lut(img[:, :, i], luts[i], inplace) for i in range(num_channels)])
 
@@ -641,7 +649,7 @@ def to_float(img: np.ndarray, max_value: float | None = None) -> np.ndarray:
 def from_float_numpy(img: np.ndarray, target_dtype: np.dtype, max_value: float | None = None) -> np.ndarray:
     if max_value is None:
         max_value = get_max_value(target_dtype)
-    return clip(np.rint(img * max_value), target_dtype)
+    return clip(np.rint(img * max_value), target_dtype, inplace=True)
 
 
 @preserve_channel_dim
 
@@ -101,8 +101,10 @@ def __process_fn(img: np.ndarray, *process_args: P.args, **process_kwargs: P.kwa
     return __process_fn
 
 
-def clip(img: np.ndarray, dtype: Any) -> np.ndarray:
+def clip(img: np.ndarray, dtype: Any, inplace: bool = False) -> np.ndarray:
     max_value = MAX_VALUES_BY_DTYPE[dtype]
+    if inplace:
+        return np.clip(img, 0, max_value, out=img)
     return np.clip(img, 0, max_value).astype(dtype)
 
 
 
@@ -13,11 +13,11 @@ Number of images: 500
 
 | Python                                | albucore   | opencv-python-headless   | numpy   | torchvision   |
 |:--------------------------------------|:-----------|:-------------------------|:--------|:--------------|
-| 3.9.20 (main, Oct  3 2024, 02:24:59)  | 0.0.19     | 4.10.0.84                | 2.0.2   | 0.19.1        |
+| 3.9.20 (main, Oct  3 2024, 02:24:59)  | 0.0.20     | 4.10.0.84                | 2.0.2   | 0.19.1        |
 | [Clang 14.0.6 ]                       |            |                          |         |               |
 
 ## Performance (images/second)
 
-|          | albucore         | lut   | opencv           | numpy            | simsimd          |
-|:---------|:-----------------|:------|:-----------------|:-----------------|:-----------------|
-| AddArray | 1854.77 ± 415.86 | N/A   | 1852.88 ± 121.24 | 1459.54 ± 375.42 | 1654.07 ± 126.99 |
+|          | albucore        | lut   | opencv          | numpy            | simsimd          |
+|:---------|:----------------|:------|:----------------|:-----------------|:-----------------|
+| AddArray | 1884.89 ± 67.19 | N/A   | 1860.28 ± 59.29 | 1442.33 ± 176.91 | 1310.62 ± 268.11 |
@@ -13,11 +13,11 @@ Number of images: 500
 
 | Python                                | albucore   | opencv-python-headless   | numpy   | torchvision   |
 |:--------------------------------------|:-----------|:-------------------------|:--------|:--------------|
-| 3.9.20 (main, Oct  3 2024, 02:24:59)  | 0.0.19     | 4.10.0.84                | 2.0.2   | 0.19.1        |
+| 3.9.20 (main, Oct  3 2024, 02:24:59)  | 0.0.20     | 4.10.0.84                | 2.0.2   | 0.19.1        |
 | [Clang 14.0.6 ]                       |            |                          |         |               |
 
 ## Performance (images/second)
 
-|             | albucore         | lut   | opencv          | numpy           | simsimd          |
-|:------------|:-----------------|:------|:----------------|:----------------|:-----------------|
-| AddConstant | 1888.37 ± 106.49 | N/A   | 1954.72 ± 42.99 | 1861.73 ± 74.98 | 1219.05 ± 189.20 |
+|             | albucore        | lut   | opencv           | numpy            | simsimd          |
+|:------------|:----------------|:------|:-----------------|:-----------------|:-----------------|
+| AddConstant | 1965.69 ± 83.21 | N/A   | 1632.53 ± 240.98 | 1754.53 ± 219.01 | 1248.80 ± 210.45 |
@@ -13,11 +13,11 @@ Number of images: 500
 
 | Python                                | albucore   | opencv-python-headless   | numpy   | torchvision   |
 |:--------------------------------------|:-----------|:-------------------------|:--------|:--------------|
-| 3.9.20 (main, Oct  3 2024, 02:24:59)  | 0.0.19     | 4.10.0.84                | 2.0.2   | 0.19.1        |
+| 3.9.20 (main, Oct  3 2024, 02:24:59)  | 0.0.20     | 4.10.0.84                | 2.0.2   | 0.19.1        |
 | [Clang 14.0.6 ]                       |            |                          |         |               |
 
 ## Performance (images/second)
 
-|           | albucore         | lut   | opencv          | numpy           | simsimd   |
-|:----------|:-----------------|:------|:----------------|:----------------|:----------|
-| AddVector | 1933.94 ± 176.48 | N/A   | 1802.99 ± 47.72 | 1876.27 ± 73.40 | N/A       |
+|           | albucore         | lut   | opencv           | numpy            | simsimd   |
+|:----------|:-----------------|:------|:-----------------|:-----------------|:----------|
+| AddVector | 1948.10 ± 195.87 | N/A   | 1612.23 ± 308.33 | 1896.24 ± 112.59 | N/A       |
@@ -13,11 +13,11 @@ Number of images: 500
 
 | Python                                | albucore   | opencv-python-headless   | numpy   | torchvision   |
 |:--------------------------------------|:-----------|:-------------------------|:--------|:--------------|
-| 3.9.20 (main, Oct  3 2024, 02:24:59)  | 0.0.19     | 4.10.0.84                | 2.0.2   | 0.19.1        |
+| 3.9.20 (main, Oct  3 2024, 02:24:59)  | 0.0.20     | 4.10.0.84                | 2.0.2   | 0.19.1        |
 | [Clang 14.0.6 ]                       |            |                          |         |               |
 
 ## Performance (images/second)
 
-|             | albucore        | lut   | opencv          | numpy            | simsimd         |
-|:------------|:----------------|:------|:----------------|:-----------------|:----------------|
-| AddWeighted | 1522.93 ± 45.94 | N/A   | 1378.03 ± 37.98 | 1134.65 ± 176.15 | 1555.68 ± 37.54 |
+|             | albucore        | lut   | opencv           | numpy           | simsimd         |
+|:------------|:----------------|:------|:-----------------|:----------------|:----------------|
+| AddWeighted | 1481.93 ± 91.72 | N/A   | 1796.92 ± 292.15 | 967.24 ± 216.82 | 988.38 ± 287.66 |
@@ -13,11 +13,11 @@ Number of images: 500
 
 | Python                                | albucore   | opencv-python-headless   | numpy   | torchvision   |
 |:--------------------------------------|:-----------|:-------------------------|:--------|:--------------|
-| 3.9.20 (main, Oct  3 2024, 02:24:59)  | 0.0.19     | 4.10.0.84                | 2.0.2   | 0.19.1        |
+| 3.9.20 (main, Oct  3 2024, 02:24:59)  | 0.0.20     | 4.10.0.84                | 2.0.2   | 0.19.1        |
 | [Clang 14.0.6 ]                       |            |                          |         |               |
 
 ## Performance (images/second)
 
-|           | albucore        | lut   | opencv         | numpy          | simsimd   |
-|:----------|:----------------|:------|:---------------|:---------------|:----------|
-| FromFloat | 1260.47 ± 41.56 | N/A   | 712.18 ± 88.37 | 734.41 ± 58.55 | N/A       |
+|           | albucore         | lut   | opencv         | numpy            | simsimd   |
+|:----------|:-----------------|:------|:---------------|:-----------------|:----------|
+| FromFloat | 1027.60 ± 247.12 | N/A   | 686.23 ± 40.08 | 1148.59 ± 221.74 | N/A       |
@@ -13,11 +13,11 @@ Number of images: 500
 
 | Python                                | albucore   | opencv-python-headless   | numpy   | torchvision   |
 |:--------------------------------------|:-----------|:-------------------------|:--------|:--------------|
-| 3.9.20 (main, Oct  3 2024, 02:24:59)  | 0.0.19     | 4.10.0.84                | 2.0.2   | 0.19.1        |
+| 3.9.20 (main, Oct  3 2024, 02:24:59)  | 0.0.20     | 4.10.0.84                | 2.0.2   | 0.19.1        |
 | [Clang 14.0.6 ]                       |            |                          |         |               |
 
 ## Performance (images/second)
 
-|                | albucore          | lut   | opencv           | numpy            | simsimd   |
-|:---------------|:------------------|:------|:-----------------|:-----------------|:----------|
-| HorizontalFlip | 4655.65 ± 1173.34 | N/A   | 1665.88 ± 304.01 | 1802.95 ± 195.86 | N/A       |
+|                | albucore         | lut   | opencv           | numpy           | simsimd   |
+|:---------------|:-----------------|:------|:-----------------|:----------------|:----------|
+| HorizontalFlip | 6997.24 ± 616.52 | N/A   | 2097.85 ± 121.23 | 1952.43 ± 83.57 | N/A       |
@@ -13,11 +13,11 @@ Number of images: 500
 
 | Python                                | albucore   | opencv-python-headless   | numpy   | torchvision   |
 |:--------------------------------------|:-----------|:-------------------------|:--------|:--------------|
-| 3.9.20 (main, Oct  3 2024, 02:24:59)  | 0.0.19     | 4.10.0.84                | 2.0.2   | 0.19.1        |
+| 3.9.20 (main, Oct  3 2024, 02:24:59)  | 0.0.20     | 4.10.0.84                | 2.0.2   | 0.19.1        |
 | [Clang 14.0.6 ]                       |            |                          |         |               |
 
 ## Performance (images/second)
 
-|             | albucore       | lut   | opencv          | numpy            | simsimd   |
-|:------------|:---------------|:------|:----------------|:-----------------|:----------|
-| MultiplyAdd | 629.51 ± 81.55 | N/A   | 445.55 ± 178.66 | 1733.97 ± 110.31 | N/A       |
+|             | albucore       | lut   | opencv         | numpy            | simsimd   |
+|:------------|:---------------|:------|:---------------|:-----------------|:----------|
+| MultiplyAdd | 758.16 ± 22.12 | N/A   | 728.14 ± 50.02 | 1856.18 ± 159.34 | N/A       |
@@ -13,11 +13,11 @@ Number of images: 500
 
 | Python                                | albucore   | opencv-python-headless   | numpy   | torchvision   |
 |:--------------------------------------|:-----------|:-------------------------|:--------|:--------------|
-| 3.9.20 (main, Oct  3 2024, 02:24:59)  | 0.0.19     | 4.10.0.84                | 2.0.2   | 0.19.1        |
+| 3.9.20 (main, Oct  3 2024, 02:24:59)  | 0.0.20     | 4.10.0.84                | 2.0.2   | 0.19.1        |
 | [Clang 14.0.6 ]                       |            |                          |         |               |
 
 ## Performance (images/second)
 
-|               | albucore       | lut   | opencv        | numpy          | simsimd   |
-|:--------------|:---------------|:------|:--------------|:---------------|:----------|
-| MultiplyArray | 485.21 ± 13.04 | N/A   | 511.79 ± 3.88 | 446.52 ± 32.21 | N/A       |
+|               | albucore       | lut   | opencv         | numpy          | simsimd   |
+|:--------------|:---------------|:------|:---------------|:---------------|:----------|
+| MultiplyArray | 471.85 ± 77.52 | N/A   | 456.21 ± 49.29 | 309.56 ± 87.87 | N/A       |
@@ -13,11 +13,11 @@ Number of images: 500
 
 | Python                                | albucore   | opencv-python-headless   | numpy   | torchvision   |
 |:--------------------------------------|:-----------|:-------------------------|:--------|:--------------|
-| 3.9.20 (main, Oct  3 2024, 02:24:59)  | 0.0.19     | 4.10.0.84                | 2.0.2   | 0.19.1        |
+| 3.9.20 (main, Oct  3 2024, 02:24:59)  | 0.0.20     | 4.10.0.84                | 2.0.2   | 0.19.1        |
 | [Clang 14.0.6 ]                       |            |                          |         |               |
 
 ## Performance (images/second)
 
-|                  | albucore        | lut   | opencv          | numpy           | simsimd          |
-|:-----------------|:----------------|:------|:----------------|:----------------|:-----------------|
-| MultiplyConstant | 1878.75 ± 49.01 | N/A   | 1416.15 ± 79.34 | 1840.54 ± 78.93 | 1525.54 ± 242.47 |
+|                  | albucore         | lut   | opencv           | numpy            | simsimd          |
+|:-----------------|:-----------------|:------|:-----------------|:-----------------|:-----------------|
+| MultiplyConstant | 1917.92 ± 120.49 | N/A   | 1176.76 ± 307.71 | 1983.32 ± 154.74 | 1813.73 ± 226.46 |
@@ -13,11 +13,11 @@ Number of images: 500
 
 | Python                                | albucore   | opencv-python-headless   | numpy   | torchvision   |
 |:--------------------------------------|:-----------|:-------------------------|:--------|:--------------|
-| 3.9.20 (main, Oct  3 2024, 02:24:59)  | 0.0.19     | 4.10.0.84                | 2.0.2   | 0.19.1        |
+| 3.9.20 (main, Oct  3 2024, 02:24:59)  | 0.0.20     | 4.10.0.84                | 2.0.2   | 0.19.1        |
 | [Clang 14.0.6 ]                       |            |                          |         |               |
 
 ## Performance (images/second)
 
-|                | albucore        | lut   | opencv          | numpy           | simsimd   |
-|:---------------|:----------------|:------|:----------------|:----------------|:----------|
-| MultiplyVector | 1815.01 ± 79.45 | N/A   | 1719.36 ± 94.23 | 1882.30 ± 56.89 | N/A       |
+|                | albucore         | lut   | opencv           | numpy            | simsimd   |
+|:---------------|:-----------------|:------|:-----------------|:-----------------|:----------|
+| MultiplyVector | 1797.16 ± 344.07 | N/A   | 1872.64 ± 210.21 | 1897.21 ± 132.37 | N/A       |
@@ -13,11 +13,11 @@ Number of images: 500
 
 | Python                                | albucore   | opencv-python-headless   | numpy   | torchvision   |
 |:--------------------------------------|:-----------|:-------------------------|:--------|:--------------|
-| 3.9.20 (main, Oct  3 2024, 02:24:59)  | 0.0.19     | 4.10.0.84                | 2.0.2   | 0.19.1        |
+| 3.9.20 (main, Oct  3 2024, 02:24:59)  | 0.0.20     | 4.10.0.84                | 2.0.2   | 0.19.1        |
 | [Clang 14.0.6 ]                       |            |                          |         |               |
 
 ## Performance (images/second)
 
-|           | albucore         | lut   | opencv         | numpy           | simsimd   |
-|:----------|:-----------------|:------|:---------------|:----------------|:----------|
-| Normalize | 1368.75 ± 159.20 | N/A   | 490.46 ± 60.55 | 801.62 ± 112.00 | N/A       |
+|           | albucore         | lut   | opencv          | numpy          | simsimd   |
+|:----------|:-----------------|:------|:----------------|:---------------|:----------|
+| Normalize | 1512.83 ± 151.87 | N/A   | 443.25 ± 100.17 | 816.24 ± 71.20 | N/A       |