google · lhpqaq · Dec 2, 2024 · Dec 4, 2024 · Dec 9, 2024 · Dec 17, 2024
diff --git a/.gitignore b/.gitignore
@@ -6,4 +6,7 @@ build*
 third_party/googletest
 third_party/turbojpeg
 third_party/benchmark
-tests/data
+tests/data
+
+# IDE configuration files
+.vscode/settings.json
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -279,10 +279,20 @@ else()
     add_compile_options(-march=armv8-a)
     add_compile_options(-fno-lax-vector-conversions)
   elseif(ARCH STREQUAL "riscv64")
-    add_compile_options(-march=rv64gc)
+    # Target the V extension only when the RVV kernels are built; otherwise keep
+    # the baseline ISA so the compiler cannot emit vector instructions.
+    if(UHDR_ENABLE_INTRINSICS)
+      add_compile_options(-march=rv64gcv)
+    else()
+      add_compile_options(-march=rv64gc)
+    endif()
     add_compile_options(-mabi=lp64d)
   elseif(ARCH STREQUAL "riscv32")
-    add_compile_options(-march=rv32gc)
+    if(UHDR_ENABLE_INTRINSICS)
+      add_compile_options(-march=rv32gcv)
+    else()
+      add_compile_options(-march=rv32gc)
+    endif()
     add_compile_options(-mabi=ilp32d)
   elseif(ARCH STREQUAL "loong64")
     add_compile_options(-march=loongarch64)
@@ -553,6 +553,11 @@ if(UHDR_ENABLE_INTRINSICS)
     file(GLOB UHDR_CORE_NEON_SRCS_LIST "${SOURCE_DIR}/src/dsp/arm/*.cpp")
     list(APPEND UHDR_CORE_SRCS_LIST ${UHDR_CORE_NEON_SRCS_LIST})
   endif()
+  # Both RISC-V word sizes pull their kernels from the same source directory.
+  if(ARCH STREQUAL "riscv64" OR ARCH STREQUAL "riscv32")
+    file(GLOB UHDR_CORE_RVV_SRCS_LIST "${SOURCE_DIR}/src/dsp/riscv/*.cpp")
+    list(APPEND UHDR_CORE_SRCS_LIST ${UHDR_CORE_RVV_SRCS_LIST})
+  endif()
 endif()
 if(UHDR_ENABLE_GLES)
   file(GLOB UHDR_CORE_GLES_SRCS_LIST "${SOURCE_DIR}/src/gpu/*.cpp")

diff --git a/lib/include/ultrahdr/gainmapmath.h b/lib/include/ultrahdr/gainmapmath.h
@@ -414,14 +414,20 @@ extern const std::array<float, 9> kYuvBt601ToBt2100;
 extern const std::array<float, 9> kYuvBt2100ToBt709;
 extern const std::array<float, 9> kYuvBt2100ToBt601;
 
-#if (defined(UHDR_ENABLE_INTRINSICS) && (defined(__ARM_NEON__) || defined(__ARM_NEON)))
+#ifdef UHDR_ENABLE_INTRINSICS
+
+extern const int16_t kYuv709To601_coeffs_simd[8];
+extern const int16_t kYuv709To2100_coeffs_simd[8];
+extern const int16_t kYuv601To709_coeffs_simd[8];
+extern const int16_t kYuv601To2100_coeffs_simd[8];
+extern const int16_t kYuv2100To709_coeffs_simd[8];
+extern const int16_t kYuv2100To601_coeffs_simd[8];
+
+extern const uint16_t kRgb709ToYuv_coeffs_simd[8];
+extern const uint16_t kRgbDispP3ToYuv_coeffs_simd[8];
+extern const uint16_t kRgb2100ToYuv_coeffs_simd[8];
 
-extern const int16_t kYuv709To601_coeffs_neon[8];
-extern const int16_t kYuv709To2100_coeffs_neon[8];
-extern const int16_t kYuv601To709_coeffs_neon[8];
-extern const int16_t kYuv601To2100_coeffs_neon[8];
-extern const int16_t kYuv2100To709_coeffs_neon[8];
-extern const int16_t kYuv2100To601_coeffs_neon[8];
+#if (defined(__ARM_NEON__) || defined(__ARM_NEON))
 
 /*
  * The Y values are provided at half the width of U & V values to allow use of the widening
@@ -435,6 +441,15 @@ void transformYuv444_neon(uhdr_raw_image_t* image, const int16_t* coeffs_ptr);
 
 uhdr_error_info_t convertYuv_neon(uhdr_raw_image_t* image, uhdr_color_gamut_t src_encoding,
                                   uhdr_color_gamut_t dst_encoding);
+
+#elif defined(__riscv_v_intrinsic)
+
+void transformYuv420_rvv(uhdr_raw_image_t* image, const int16_t* coeffs_ptr);
+
+uhdr_error_info_t convertYuv_rvv(uhdr_raw_image_t* image, uhdr_color_gamut_t src_encoding,
+                                 uhdr_color_gamut_t dst_encoding);
+
+#endif  // __ARM_NEON / __riscv_v_intrinsic
 #endif
 
 // Performs a color gamut transformation on an yuv image.
@@ -588,6 +603,8 @@ std::unique_ptr<uhdr_raw_image_ext_t> convert_raw_input_to_ycbcr(
 
 #if (defined(UHDR_ENABLE_INTRINSICS) && (defined(__ARM_NEON__) || defined(__ARM_NEON)))
 std::unique_ptr<uhdr_raw_image_ext_t> convert_raw_input_to_ycbcr_neon(uhdr_raw_image_t* src);
+#elif (defined(UHDR_ENABLE_INTRINSICS) && defined(__riscv_v_intrinsic))
+std::unique_ptr<uhdr_raw_image_ext_t> convert_raw_input_to_ycbcr_rvv(uhdr_raw_image_t* src);
 #endif
 
 bool floatToSignedFraction(float v, int32_t* numerator, uint32_t* denominator);

diff --git a/lib/src/dsp/arm/gainmapmath_neon.cpp b/lib/src/dsp/arm/gainmapmath_neon.cpp
@@ -27,55 +27,6 @@
 
 namespace ultrahdr {
 
-// Scale all coefficients by 2^14 to avoid needing floating-point arithmetic. This can cause an off
-// by one error compared to the scalar floating-point implementation.
-
-// Removing conversion coefficients 1 and 0 from the group for each standard leaves 6 coefficients.
-// Pack them into a single 128-bit vector as follows, zeroing the remaining elements:
-// {Y1, Y2, U1, U2, V1, V2, 0, 0}
-
-// Yuv Bt709 -> Yuv Bt601
-// Y' = (1.0f * Y) + ( 0.101579f * U) + ( 0.196076f * V)
-// U' = (0.0f * Y) + ( 0.989854f * U) + (-0.110653f * V)
-// V' = (0.0f * Y) + (-0.072453f * U) + ( 0.983398f * V)
-ALIGNED(16)
-const int16_t kYuv709To601_coeffs_neon[8] = {1664, 3213, 16218, -1813, -1187, 16112, 0, 0};
-
-// Yuv Bt709 -> Yuv Bt2100
-// Y' = (1.0f * Y) + (-0.016969f * U) + ( 0.096312f * V)
-// U' = (0.0f * Y) + ( 0.995306f * U) + (-0.051192f * V)
-// V' = (0.0f * Y) + ( 0.011507f * U) + ( 1.002637f * V)
-ALIGNED(16)
-const int16_t kYuv709To2100_coeffs_neon[8] = {-278, 1578, 16307, -839, 189, 16427, 0, 0};
-
-// Yuv Bt601 -> Yuv Bt709
-// Y' = (1.0f * Y) + (-0.118188f * U) + (-0.212685f * V),
-// U' = (0.0f * Y) + ( 1.018640f * U) + ( 0.114618f * V),
-// V' = (0.0f * Y) + ( 0.075049f * U) + ( 1.025327f * V);
-ALIGNED(16)
-const int16_t kYuv601To709_coeffs_neon[8] = {-1936, -3485, 16689, 1878, 1230, 16799, 0, 0};
-
-// Yuv Bt601 -> Yuv Bt2100
-// Y' = (1.0f * Y) + (-0.128245f * U) + (-0.115879f * V)
-// U' = (0.0f * Y) + ( 1.010016f * U) + ( 0.061592f * V)
-// V' = (0.0f * Y) + ( 0.086969f * U) + ( 1.029350f * V)
-ALIGNED(16)
-const int16_t kYuv601To2100_coeffs_neon[8] = {-2101, -1899, 16548, 1009, 1425, 16865, 0, 0};
-
-// Yuv Bt2100 -> Yuv Bt709
-// Y' = (1.0f * Y) + ( 0.018149f * U) + (-0.095132f * V)
-// U' = (0.0f * Y) + ( 1.004123f * U) + ( 0.051267f * V)
-// V' = (0.0f * Y) + (-0.011524f * U) + ( 0.996782f * V)
-ALIGNED(16)
-const int16_t kYuv2100To709_coeffs_neon[8] = {297, -1559, 16452, 840, -189, 16331, 0, 0};
-
-// Yuv Bt2100 -> Yuv Bt601
-// Y' = (1.0f * Y) + ( 0.117887f * U) + ( 0.105521f * V)
-// U' = (0.0f * Y) + ( 0.995211f * U) + (-0.059549f * V)
-// V' = (0.0f * Y) + (-0.084085f * U) + ( 0.976518f * V)
-ALIGNED(16)
-const int16_t kYuv2100To601_coeffs_neon[8] = {1931, 1729, 16306, -976, -1378, 15999, 0, 0};
-
 static inline int16x8_t yConversion_neon(uint8x8_t y, int16x8_t u, int16x8_t v, int16x8_t coeffs) {
   int32x4_t lo = vmull_lane_s16(vget_low_s16(u), vget_low_s16(coeffs), 0);
   int32x4_t hi = vmull_lane_s16(vget_high_s16(u), vget_low_s16(coeffs), 0);
@@ -244,10 +195,10 @@ uhdr_error_info_t convertYuv_neon(uhdr_raw_image_t* image, uhdr_color_gamut_t sr
         case UHDR_CG_BT_709:
           return status;
         case UHDR_CG_DISPLAY_P3:
-          coeffs = kYuv709To601_coeffs_neon;
+          coeffs = kYuv709To601_coeffs_simd;
           break;
         case UHDR_CG_BT_2100:
-          coeffs = kYuv709To2100_coeffs_neon;
+          coeffs = kYuv709To2100_coeffs_simd;
           break;
         default:
           status.error_code = UHDR_CODEC_INVALID_PARAM;
@@ -260,12 +211,12 @@ uhdr_error_info_t convertYuv_neon(uhdr_raw_image_t* image, uhdr_color_gamut_t sr
     case UHDR_CG_DISPLAY_P3:
       switch (dst_encoding) {
         case UHDR_CG_BT_709:
-          coeffs = kYuv601To709_coeffs_neon;
+          coeffs = kYuv601To709_coeffs_simd;
           break;
         case UHDR_CG_DISPLAY_P3:
           return status;
         case UHDR_CG_BT_2100:
-          coeffs = kYuv601To2100_coeffs_neon;
+          coeffs = kYuv601To2100_coeffs_simd;
           break;
         default:
           status.error_code = UHDR_CODEC_INVALID_PARAM;
@@ -278,10 +229,10 @@ uhdr_error_info_t convertYuv_neon(uhdr_raw_image_t* image, uhdr_color_gamut_t sr
     case UHDR_CG_BT_2100:
       switch (dst_encoding) {
         case UHDR_CG_BT_709:
-          coeffs = kYuv2100To709_coeffs_neon;
+          coeffs = kYuv2100To709_coeffs_simd;
           break;
         case UHDR_CG_DISPLAY_P3:
-          coeffs = kYuv2100To601_coeffs_neon;
+          coeffs = kYuv2100To601_coeffs_simd;
           break;
         case UHDR_CG_BT_2100:
           return status;
@@ -317,33 +268,6 @@ uhdr_error_info_t convertYuv_neon(uhdr_raw_image_t* image, uhdr_color_gamut_t sr
   return status;
 }
 
-// Scale all coefficients by 2^14 to avoid needing floating-point arithmetic. This can cause an off
-// by one error compared to the scalar floating-point implementation.
-
-// In the 3x3 conversion matrix, 0.5 is duplicated. But represented as only one entry in lut leaving
-// with an array size of 8 elements.
-
-// RGB Bt709 -> Yuv Bt709
-// Y = 0.212639 * R + 0.715169 * G + 0.072192 * B
-// U = -0.114592135 * R + -0.385407865 * G + 0.5 * B
-// V = 0.5 * R + -0.454155718 * G + -0.045844282 * B
-ALIGNED(16)
-const uint16_t kRgb709ToYuv_coeffs_neon[8] = {3484, 11717, 1183, 1877, 6315, 8192, 7441, 751};
-
-// RGB Display P3 -> Yuv Display P3
-// Y = 0.2289746 * R + 0.6917385 * G + 0.0792869 * B
-// U = -0.124346335 * R + -0.375653665 * G + 0.5 * B
-// V = 0.5 * R + -0.448583471 * G + -0.051416529 * B
-ALIGNED(16)
-const uint16_t kRgbDispP3ToYuv_coeffs_neon[8] = {3752, 11333, 1299, 2037, 6155, 8192, 7350, 842};
-
-// RGB Bt2100 -> Yuv Bt2100
-// Y = 0.2627 * R + 0.677998 * G + 0.059302 * B
-// U = -0.13963036 * R + -0.36036964 * G + 0.5 * B
-// V = 0.5 * R + -0.459784348 * G + -0.040215652 * B
-ALIGNED(16)
-const uint16_t kRgb2100ToYuv_coeffs_neon[8] = {4304, 11108, 972, 2288, 5904, 8192, 7533, 659};
-
 // The core logic is taken from jsimd_rgb_ycc_convert_neon implementation in jccolext-neon.c of
 // libjpeg-turbo
 static void ConvertRgba8888ToYuv444_neon(uhdr_raw_image_t* src, uhdr_raw_image_t* dst,
@@ -460,11 +384,12 @@ std::unique_ptr<uhdr_raw_image_ext_t> convert_raw_input_to_ycbcr_neon(uhdr_raw_i
     const uint16_t* coeffs_ptr = nullptr;
 
+    // Pick the RGB -> YUV coefficient table that matches the gamut of the source image.
     if (src->cg == UHDR_CG_BT_709) {
-      coeffs_ptr = kRgb709ToYuv_coeffs_neon;
+      coeffs_ptr = kRgb709ToYuv_coeffs_simd;
     } else if (src->cg == UHDR_CG_BT_2100) {
-      coeffs_ptr = kRgbDispP3ToYuv_coeffs_neon;
+      coeffs_ptr = kRgb2100ToYuv_coeffs_simd;
     } else if (src->cg == UHDR_CG_DISPLAY_P3) {
-      coeffs_ptr = kRgb2100ToYuv_coeffs_neon;
+      coeffs_ptr = kRgbDispP3ToYuv_coeffs_simd;
     } else {
       return dst;
     }