Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,7 @@ build*
third_party/googletest
third_party/turbojpeg
third_party/benchmark
tests/data
tests/data

# IDE configs
.vscode/settings.json
12 changes: 10 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -279,10 +279,10 @@ else()
add_compile_options(-march=armv8-a)
add_compile_options(-fno-lax-vector-conversions)
elseif(ARCH STREQUAL "riscv64")
add_compile_options(-march=rv64gc)
add_compile_options(-march=rv64gcv)
add_compile_options(-mabi=lp64d)
elseif(ARCH STREQUAL "riscv32")
add_compile_options(-march=rv32gc)
add_compile_options(-march=rv32gcv)
add_compile_options(-mabi=ilp32d)
elseif(ARCH STREQUAL "loong64")
add_compile_options(-march=loongarch64)
Expand Down Expand Up @@ -553,6 +553,14 @@ if(UHDR_ENABLE_INTRINSICS)
file(GLOB UHDR_CORE_NEON_SRCS_LIST "${SOURCE_DIR}/src/dsp/arm/*.cpp")
list(APPEND UHDR_CORE_SRCS_LIST ${UHDR_CORE_NEON_SRCS_LIST})
endif()
# RISC-V kernels: the 32-bit and 64-bit targets collect exactly the same sources from
# src/dsp/riscv, so both ARCH values share one branch instead of two duplicated ones.
if(ARCH STREQUAL "riscv64" OR ARCH STREQUAL "riscv32")
  file(GLOB UHDR_CORE_RVV_SRCS_LIST "${SOURCE_DIR}/src/dsp/riscv/*.cpp")
  list(APPEND UHDR_CORE_SRCS_LIST ${UHDR_CORE_RVV_SRCS_LIST})
endif()
endif()
if(UHDR_ENABLE_GLES)
file(GLOB UHDR_CORE_GLES_SRCS_LIST "${SOURCE_DIR}/src/gpu/*.cpp")
Expand Down
31 changes: 24 additions & 7 deletions lib/include/ultrahdr/gainmapmath.h
Original file line number Diff line number Diff line change
Expand Up @@ -414,14 +414,20 @@ extern const std::array<float, 9> kYuvBt601ToBt2100;
extern const std::array<float, 9> kYuvBt2100ToBt709;
extern const std::array<float, 9> kYuvBt2100ToBt601;

#if (defined(UHDR_ENABLE_INTRINSICS) && (defined(__ARM_NEON__) || defined(__ARM_NEON)))
#ifdef UHDR_ENABLE_INTRINSICS

extern const int16_t kYuv709To601_coeffs_simd[8];
extern const int16_t kYuv709To2100_coeffs_simd[8];
extern const int16_t kYuv601To709_coeffs_simd[8];
extern const int16_t kYuv601To2100_coeffs_simd[8];
extern const int16_t kYuv2100To709_coeffs_simd[8];
extern const int16_t kYuv2100To601_coeffs_simd[8];

extern const uint16_t kRgb709ToYuv_coeffs_simd[8];
extern const uint16_t kRgbDispP3ToYuv_coeffs_simd[8];
extern const uint16_t kRgb2100ToYuv_coeffs_simd[8];

extern const int16_t kYuv709To601_coeffs_neon[8];
extern const int16_t kYuv709To2100_coeffs_neon[8];
extern const int16_t kYuv601To709_coeffs_neon[8];
extern const int16_t kYuv601To2100_coeffs_neon[8];
extern const int16_t kYuv2100To709_coeffs_neon[8];
extern const int16_t kYuv2100To601_coeffs_neon[8];
#if (defined(__ARM_NEON__) || defined(__ARM_NEON))

/*
* The Y values are provided at half the width of U & V values to allow use of the widening
Expand All @@ -435,6 +441,15 @@ void transformYuv444_neon(uhdr_raw_image_t* image, const int16_t* coeffs_ptr);

uhdr_error_info_t convertYuv_neon(uhdr_raw_image_t* image, uhdr_color_gamut_t src_encoding,
uhdr_color_gamut_t dst_encoding);

#elif defined(__riscv_v_intrinsic)

void transformYuv420_rvv(uhdr_raw_image_t* image, const int16_t* coeffs_ptr);

uhdr_error_info_t convertYuv_rvv(uhdr_raw_image_t* image, uhdr_color_gamut_t src_encoding,
uhdr_color_gamut_t dst_encoding);

#endif
#endif

// Performs a color gamut transformation on a YUV image.
Expand Down Expand Up @@ -588,6 +603,8 @@ std::unique_ptr<uhdr_raw_image_ext_t> convert_raw_input_to_ycbcr(

// Architecture-specific entry points that take a raw image and return an owned
// uhdr_raw_image_ext_t (presumably SIMD counterparts of convert_raw_input_to_ycbcr() -- confirm
// against the definitions). At most one prototype is visible per build, and only when
// UHDR_ENABLE_INTRINSICS is defined:
//   - the NEON variant when the compiler defines __ARM_NEON__ or __ARM_NEON;
//   - otherwise the RVV variant when the compiler defines __riscv_v_intrinsic.
#if (defined(UHDR_ENABLE_INTRINSICS) && (defined(__ARM_NEON__) || defined(__ARM_NEON)))
std::unique_ptr<uhdr_raw_image_ext_t> convert_raw_input_to_ycbcr_neon(uhdr_raw_image_t* src);
#elif (defined(UHDR_ENABLE_INTRINSICS) && defined(__riscv_v_intrinsic))
std::unique_ptr<uhdr_raw_image_ext_t> convert_raw_input_to_ycbcr_rvv(uhdr_raw_image_t* src);
#endif

bool floatToSignedFraction(float v, int32_t* numerator, uint32_t* denominator);
Expand Down
94 changes: 9 additions & 85 deletions lib/src/dsp/arm/gainmapmath_neon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,55 +27,6 @@

namespace ultrahdr {

// Scale all coefficients by 2^14 to avoid needing floating-point arithmetic. This can cause an off
// by one error compared to the scalar floating-point implementation.

// Removing conversion coefficients 1 and 0 from the group for each standard leaves 6 coefficients.
// Pack them into a single 128-bit vector as follows, zeroing the remaining elements:
// {Y1, Y2, U1, U2, V1, V2, 0, 0}

// Yuv Bt709 -> Yuv Bt601
// Y' = (1.0f * Y) + ( 0.101579f * U) + ( 0.196076f * V)
// U' = (0.0f * Y) + ( 0.989854f * U) + (-0.110653f * V)
// V' = (0.0f * Y) + (-0.072453f * U) + ( 0.983398f * V)
ALIGNED(16)
const int16_t kYuv709To601_coeffs_neon[8] = {1664, 3213, 16218, -1813, -1187, 16112, 0, 0};

// Yuv Bt709 -> Yuv Bt2100
// Y' = (1.0f * Y) + (-0.016969f * U) + ( 0.096312f * V)
// U' = (0.0f * Y) + ( 0.995306f * U) + (-0.051192f * V)
// V' = (0.0f * Y) + ( 0.011507f * U) + ( 1.002637f * V)
ALIGNED(16)
const int16_t kYuv709To2100_coeffs_neon[8] = {-278, 1578, 16307, -839, 189, 16427, 0, 0};

// Yuv Bt601 -> Yuv Bt709
// Y' = (1.0f * Y) + (-0.118188f * U) + (-0.212685f * V)
// U' = (0.0f * Y) + ( 1.018640f * U) + ( 0.114618f * V)
// V' = (0.0f * Y) + ( 0.075049f * U) + ( 1.025327f * V)
ALIGNED(16)
const int16_t kYuv601To709_coeffs_neon[8] = {-1936, -3485, 16689, 1878, 1230, 16799, 0, 0};

// Yuv Bt601 -> Yuv Bt2100
// Y' = (1.0f * Y) + (-0.128245f * U) + (-0.115879f * V)
// U' = (0.0f * Y) + ( 1.010016f * U) + ( 0.061592f * V)
// V' = (0.0f * Y) + ( 0.086969f * U) + ( 1.029350f * V)
ALIGNED(16)
const int16_t kYuv601To2100_coeffs_neon[8] = {-2101, -1899, 16548, 1009, 1425, 16865, 0, 0};

// Yuv Bt2100 -> Yuv Bt709
// Y' = (1.0f * Y) + ( 0.018149f * U) + (-0.095132f * V)
// U' = (0.0f * Y) + ( 1.004123f * U) + ( 0.051267f * V)
// V' = (0.0f * Y) + (-0.011524f * U) + ( 0.996782f * V)
ALIGNED(16)
const int16_t kYuv2100To709_coeffs_neon[8] = {297, -1559, 16452, 840, -189, 16331, 0, 0};

// Yuv Bt2100 -> Yuv Bt601
// Y' = (1.0f * Y) + ( 0.117887f * U) + ( 0.105521f * V)
// U' = (0.0f * Y) + ( 0.995211f * U) + (-0.059549f * V)
// V' = (0.0f * Y) + (-0.084085f * U) + ( 0.976518f * V)
ALIGNED(16)
const int16_t kYuv2100To601_coeffs_neon[8] = {1931, 1729, 16306, -976, -1378, 15999, 0, 0};

static inline int16x8_t yConversion_neon(uint8x8_t y, int16x8_t u, int16x8_t v, int16x8_t coeffs) {
int32x4_t lo = vmull_lane_s16(vget_low_s16(u), vget_low_s16(coeffs), 0);
int32x4_t hi = vmull_lane_s16(vget_high_s16(u), vget_low_s16(coeffs), 0);
Expand Down Expand Up @@ -244,10 +195,10 @@ uhdr_error_info_t convertYuv_neon(uhdr_raw_image_t* image, uhdr_color_gamut_t sr
case UHDR_CG_BT_709:
return status;
case UHDR_CG_DISPLAY_P3:
coeffs = kYuv709To601_coeffs_neon;
coeffs = kYuv709To601_coeffs_simd;
break;
case UHDR_CG_BT_2100:
coeffs = kYuv709To2100_coeffs_neon;
coeffs = kYuv709To2100_coeffs_simd;
break;
default:
status.error_code = UHDR_CODEC_INVALID_PARAM;
Expand All @@ -260,12 +211,12 @@ uhdr_error_info_t convertYuv_neon(uhdr_raw_image_t* image, uhdr_color_gamut_t sr
case UHDR_CG_DISPLAY_P3:
switch (dst_encoding) {
case UHDR_CG_BT_709:
coeffs = kYuv601To709_coeffs_neon;
coeffs = kYuv601To709_coeffs_simd;
break;
case UHDR_CG_DISPLAY_P3:
return status;
case UHDR_CG_BT_2100:
coeffs = kYuv601To2100_coeffs_neon;
coeffs = kYuv601To2100_coeffs_simd;
break;
default:
status.error_code = UHDR_CODEC_INVALID_PARAM;
Expand All @@ -278,10 +229,10 @@ uhdr_error_info_t convertYuv_neon(uhdr_raw_image_t* image, uhdr_color_gamut_t sr
case UHDR_CG_BT_2100:
switch (dst_encoding) {
case UHDR_CG_BT_709:
coeffs = kYuv2100To709_coeffs_neon;
coeffs = kYuv2100To709_coeffs_simd;
break;
case UHDR_CG_DISPLAY_P3:
coeffs = kYuv2100To601_coeffs_neon;
coeffs = kYuv2100To601_coeffs_simd;
break;
case UHDR_CG_BT_2100:
return status;
Expand Down Expand Up @@ -317,33 +268,6 @@ uhdr_error_info_t convertYuv_neon(uhdr_raw_image_t* image, uhdr_color_gamut_t sr
return status;
}

// Scale all coefficients by 2^14 to avoid needing floating-point arithmetic. This can cause an off
// by one error compared to the scalar floating-point implementation.

// The 3x3 conversion matrix contains the coefficient 0.5 twice; it is stored only once in the
// lookup table, which leaves an array of 8 elements.

// RGB Bt709 -> Yuv Bt709
// Y = 0.212639 * R + 0.715169 * G + 0.072192 * B
// U = -0.114592135 * R + -0.385407865 * G + 0.5 * B
// V = 0.5 * R + -0.454155718 * G + -0.045844282 * B
ALIGNED(16)
const uint16_t kRgb709ToYuv_coeffs_neon[8] = {3484, 11717, 1183, 1877, 6315, 8192, 7441, 751};

// RGB Display P3 -> Yuv Display P3
// Y = 0.2289746 * R + 0.6917385 * G + 0.0792869 * B
// U = -0.124346335 * R + -0.375653665 * G + 0.5 * B
// V = 0.5 * R + -0.448583471 * G + -0.051416529 * B
ALIGNED(16)
const uint16_t kRgbDispP3ToYuv_coeffs_neon[8] = {3752, 11333, 1299, 2037, 6155, 8192, 7350, 842};

// RGB Bt2100 -> Yuv Bt2100
// Y = 0.2627 * R + 0.677998 * G + 0.059302 * B
// U = -0.13963036 * R + -0.36036964 * G + 0.5 * B
// V = 0.5 * R + -0.459784348 * G + -0.040215652 * B
ALIGNED(16)
const uint16_t kRgb2100ToYuv_coeffs_neon[8] = {4304, 11108, 972, 2288, 5904, 8192, 7533, 659};

// The core logic is taken from jsimd_rgb_ycc_convert_neon implementation in jccolext-neon.c of
// libjpeg-turbo
static void ConvertRgba8888ToYuv444_neon(uhdr_raw_image_t* src, uhdr_raw_image_t* dst,
Expand Down Expand Up @@ -460,11 +384,11 @@ std::unique_ptr<uhdr_raw_image_ext_t> convert_raw_input_to_ycbcr_neon(uhdr_raw_i
const uint16_t* coeffs_ptr = nullptr;

// Pick the RGB -> YUV coefficient table that matches the source color gamut. Each gamut must be
// paired with its own table: Display P3 input uses the Display P3 coefficients and BT.2100 input
// uses the BT.2100 coefficients (these two were previously swapped). Any other gamut is
// unsupported here, so dst is returned as-is.
if (src->cg == UHDR_CG_BT_709) {
  coeffs_ptr = kRgb709ToYuv_coeffs_simd;
} else if (src->cg == UHDR_CG_DISPLAY_P3) {
  coeffs_ptr = kRgbDispP3ToYuv_coeffs_simd;
} else if (src->cg == UHDR_CG_BT_2100) {
  coeffs_ptr = kRgb2100ToYuv_coeffs_simd;
} else {
  return dst;
}
Expand Down
Loading
Loading