diff --git a/CMakeLists.txt b/CMakeLists.txt
index 62b99d136..090c6c970 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -252,8 +252,14 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(amd64)|(AMD64)")
   add_subdirectory(third_party/cpu_features EXCLUDE_FROM_ALL)
   set(BUILD_SHARED_LIBS "${BUILD_SHARED_LIBS_SAVED}")
   list(APPEND LIBRARIES cpu_features)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
+  add_definitions(-DCT2_WITH_RVV)
+  set(CT2_BUILD_ARCH "riscv64")
+  message(STATUS "Target architecture is RISC-V with Vector extension")
 endif()
 
+message(STATUS "Current CT2_BUILD_ARCH is: ${CT2_BUILD_ARCH}")
+
 if(ENABLE_CPU_DISPATCH)
   message(STATUS "Compiling for multiple CPU ISA and enabling runtime dispatch")
   add_definitions(-DCT2_WITH_CPU_DISPATCH)
@@ -269,6 +275,9 @@ if(ENABLE_CPU_DISPATCH)
     endif()
   elseif(CT2_BUILD_ARCH STREQUAL "arm64")
     ct2_compile_kernels_for_isa(neon "-DUSE_NEON")
+  elseif(CT2_BUILD_ARCH STREQUAL "riscv64")
+    ct2_compile_kernels_for_isa(rvv "-march=rv64gcv")
+    message(STATUS "Current CT2_BUILD_ARCH is: ${CT2_BUILD_ARCH}")
   endif()
 endif()
 
diff --git a/python/tools/prepare_build_environment_linux.sh b/python/tools/prepare_build_environment_linux.sh
index 61ac171a3..58d9cb52e 100755
--- a/python/tools/prepare_build_environment_linux.sh
+++ b/python/tools/prepare_build_environment_linux.sh
@@ -32,6 +32,7 @@ else
         libcudnn9-devel-cuda-12-9.1.0.70-1 \
         libcublas-devel-12-2-12.2.5.6-1 \
         libnccl-devel-2.19.3-1+cuda12.2
+    yum update -y libstdc++
     ln -s cuda-12.2 /usr/local/cuda
 
     ONEAPI_VERSION=2023.2.0
diff --git a/src/cpu/cpu_info.cc b/src/cpu/cpu_info.cc
index 9030ac7a4..d5f78b5db 100644
--- a/src/cpu/cpu_info.cc
+++ b/src/cpu/cpu_info.cc
@@ -58,4 +58,22 @@ namespace ctranslate2 {
   }
 }
 
+#elif defined(CT2_WITH_RVV)
+
+namespace ctranslate2 {
+  namespace cpu {
+
+    const char* cpu_vendor() {
+      return "RVV";
+    }
+
+    bool cpu_supports_rvv() {
+      return true;
+    }
+
+  }
+}
+
+
+
 #endif
diff --git a/src/cpu/cpu_info.h b/src/cpu/cpu_info.h
index c2951bcc0..1251a1f7a 100644
--- a/src/cpu/cpu_info.h
+++ b/src/cpu/cpu_info.h
@@ -14,6 +14,8 @@ namespace ctranslate2 {
     bool cpu_supports_avx512();
 #elif defined(CT2_ARM64_BUILD)
     bool cpu_supports_neon();
+#elif defined(CT2_WITH_RVV)
+    bool cpu_supports_rvv();
 #endif
 
   }
diff --git a/src/cpu/cpu_isa.cc b/src/cpu/cpu_isa.cc
index c16aeda22..e9ba8fbf9 100644
--- a/src/cpu/cpu_isa.cc
+++ b/src/cpu/cpu_isa.cc
@@ -35,6 +35,9 @@ namespace ctranslate2 {
 #elif defined(CT2_ARM64_BUILD)
     case CpuIsa::NEON:
       return "NEON";
+#elif defined(CT2_WITH_RVV)
+    case CpuIsa::RVV:
+      return "RVV";
 #endif
     default:
       return "GENERIC";
@@ -54,6 +57,9 @@ namespace ctranslate2 {
 #elif defined(CT2_ARM64_BUILD)
       if (env_isa == "NEON")
         return try_isa(env_isa, CpuIsa::NEON, cpu_supports_neon());
+#elif defined(CT2_WITH_RVV)
+      if (env_isa == "RVV")
+        return try_isa(env_isa, CpuIsa::RVV, cpu_supports_rvv());
 #endif
       if (env_isa == "GENERIC")
         return CpuIsa::GENERIC;
@@ -71,6 +77,9 @@ namespace ctranslate2 {
 # elif defined(CT2_ARM64_BUILD)
       if (cpu_supports_neon())
         return CpuIsa::NEON;
+# elif defined(CT2_WITH_RVV)
+      if (cpu_supports_rvv())
+        return CpuIsa::RVV;
 # endif
 #endif
 
diff --git a/src/cpu/cpu_isa.h b/src/cpu/cpu_isa.h
index 4f42bdf26..b5d7cf5e3 100644
--- a/src/cpu/cpu_isa.h
+++ b/src/cpu/cpu_isa.h
@@ -13,6 +13,8 @@ namespace ctranslate2 {
       AVX512,
 #elif defined(CT2_ARM64_BUILD)
       NEON,
+#elif defined(CT2_WITH_RVV)
+      RVV,
 #endif
     };
 
@@ -54,6 +56,12 @@ namespace ctranslate2 {
         CPU_ISA_CASE(cpu::CpuIsa::NEON, SINGLE_ARG(STMTS)) \
         CPU_ISA_DEFAULT(cpu::CpuIsa::GENERIC, SINGLE_ARG(STMTS)) \
       }
+#elif defined(CT2_WITH_RVV)
+# define CPU_ISA_DISPATCH(STMTS) \
+      switch (cpu::get_cpu_isa()) { \
+        CPU_ISA_CASE(cpu::CpuIsa::RVV, SINGLE_ARG(STMTS)) \
+        CPU_ISA_DEFAULT(cpu::CpuIsa::GENERIC, SINGLE_ARG(STMTS)) \
+      }
 #endif
 #elif defined(__AVX512F__)
 # define CPU_ISA_DISPATCH(STMTS) \
@@ -75,6 +83,11 @@ namespace ctranslate2 {
       switch (cpu::get_cpu_isa()) { \
         CPU_ISA_DEFAULT(cpu::CpuIsa::NEON, SINGLE_ARG(STMTS)) \
       }
+#elif defined(__riscv_vector)
+# define CPU_ISA_DISPATCH(STMTS) \
+      switch (cpu::get_cpu_isa()) { \
+        CPU_ISA_DEFAULT(cpu::CpuIsa::RVV, SINGLE_ARG(STMTS)) \
+      }
 #else
 # define CPU_ISA_DISPATCH(STMTS) \
       switch (cpu::get_cpu_isa()) { \
diff --git a/src/cpu/kernels.cc b/src/cpu/kernels.cc
index c1f48553d..5496a1612 100644
--- a/src/cpu/kernels.cc
+++ b/src/cpu/kernels.cc
@@ -14,6 +14,10 @@
 #elif (defined(__ARM_NEON) && !defined(CT2_WITH_CPU_DISPATCH)) || defined(USE_NEON)
 # define TARGET_ISA CpuIsa::NEON
 # include "cpu/vec_neon.h"
+#elif (defined(CT2_WITH_RVV) && defined(__riscv_vector))
+# define USE_RVV
+# define TARGET_ISA CpuIsa::RVV
+# include "cpu/vec_rvv.h"
 #else
 # define TARGET_ISA CpuIsa::GENERIC
 # include "cpu/vec.h"
@@ -213,7 +217,7 @@ namespace ctranslate2 {
 
     template<>
     void exp(const float* x, float* y, dim_t size) {
-      vectorized_unary_transform(x, y, size, Vec<float, TARGET_ISA>::exp);
+      vectorized_unary_transform(x, y, size, Vec<float, TARGET_ISA>::exp);
     }
 
     template<>
@@ -263,11 +267,20 @@ namespace ctranslate2 {
 
     template <typename T>
     void add(T a, const T* x, T* y, dim_t size) {
+#ifdef USE_RVV
+      T a_copy = a;
+      vectorized_unary_transform(x, y, size,
+                                 [a_copy](vec_type<T> v) {
+                                   auto vec_a = Vec<T, TARGET_ISA>::load(a_copy);
+                                   return Vec<T, TARGET_ISA>::add(v, vec_a);
+                                 });
+#else
       auto vec_a = Vec<T, TARGET_ISA>::load(a);
       vectorized_unary_transform(x, y, size,
-                                 [vec_a](vec_type<T> v) {
-                                   return Vec<T, TARGET_ISA>::add(v, vec_a);
-                                 });
+                                 [vec_a](vec_type<T> v) {
+                                   return Vec<T, TARGET_ISA>::add(v, vec_a);
+                                 });
+#endif
     }
 
     template <typename T>
@@ -282,11 +295,20 @@ namespace ctranslate2 {
 
     template <typename T>
     void mul(T a, const T* x, T* y, dim_t size) {
+#ifdef USE_RVV
+      T a_copy = a;
+      vectorized_unary_transform(x, y, size,
+                                 [a_copy](vec_type<T> v) {
+                                   auto vec_a = Vec<T, TARGET_ISA>::load(a_copy);
+                                   return Vec<T, TARGET_ISA>::mul(v, vec_a);
+                                 });
+#else
       auto vec_a = Vec<T, TARGET_ISA>::load(a);
       vectorized_unary_transform(x, y, size,
-                                 [vec_a](vec_type<T> v) {
-                                   return Vec<T, TARGET_ISA>::mul(v, vec_a);
-                                 });
+                                 [vec_a](vec_type<T> v) {
+                                   return Vec<T, TARGET_ISA>::mul(v, vec_a);
+                                 });
+#endif
     }
 
     template <typename T>
@@ -296,11 +318,20 @@ namespace ctranslate2 {
 
     template <typename T>
     void max(T a, const T* x, T* y, dim_t size) {
+#ifdef USE_RVV
+      T a_copy = a;
+      vectorized_unary_transform(x, y, size,
+                                 [a_copy](vec_type<T> v) {
+                                   auto vec_a = Vec<T, TARGET_ISA>::load(a_copy);
+                                   return Vec<T, TARGET_ISA>::max(v, vec_a);
+                                 });
+#else
       auto vec_a = Vec<T, TARGET_ISA>::load(a);
      vectorized_unary_transform(x, y, size,
-                                 [vec_a](vec_type<T> v) {
-                                   return Vec<T, TARGET_ISA>::max(v, vec_a);
-                                 });
+                                 [vec_a](vec_type<T> v) {
+                                   return Vec<T, TARGET_ISA>::max(v, vec_a);
+                                 });
+#endif
     }
 
     template <typename T>
@@ -310,11 +341,20 @@ namespace ctranslate2 {
 
     template <typename T>
     void min(T a, const T* x, T* y, dim_t size) {
+#ifdef USE_RVV
+      T a_copy = a;
+      vectorized_unary_transform(x, y, size,
+                                 [a_copy](vec_type<T> v) {
+                                   auto vec_a = Vec<T, TARGET_ISA>::load(a_copy);
+                                   return Vec<T, TARGET_ISA>::min(v, vec_a);
+                                 });
+#else
       auto vec_a = Vec<T, TARGET_ISA>::load(a);
       vectorized_unary_transform(x, y, size,
-                                 [vec_a](vec_type<T> v) {
-                                   return Vec<T, TARGET_ISA>::min(v, vec_a);
-                                 });
+                                 [vec_a](vec_type<T> v) {
+                                   return Vec<T, TARGET_ISA>::min(v, vec_a);
+                                 });
+#endif
     }
 
     template <typename T>
@@ -349,6 +389,7 @@ namespace ctranslate2 {
                                        static_cast<float>(0),
                                        Vec<float, TARGET_ISA>::abs,
                                        Vec<float, TARGET_ISA>::max,
+                                       Vec<float, TARGET_ISA>::reduce_max,
                                        Vec<float, TARGET_ISA>::abs,
                                        Vec<float, TARGET_ISA>::max);
 
@@ -377,14 +418,22 @@ namespace ctranslate2 {
       using VecType = Vec<float, TARGET_ISA>;
 
       const auto x_max = reduce_max(x, size);
-      const auto vec_x_max = VecType::load(x_max);
-      const auto scalar_exp_func = [x_max](vec_type<float> v) {
-        return Vec<float, TARGET_ISA>::exp(Vec<float, TARGET_ISA>::sub(v, x_max));
+      const auto scalar_exp_func = [x_max](float v) {
+        return std::exp(v - x_max);
       };
 
-      const auto vec_exp_func = [vec_x_max](vec_type<float> v) {
+#ifdef USE_RVV
+      float x_max_copy = x_max;
+      auto vec_exp_func = [x_max_copy](vec_type<float> v) {
+        auto vec_x_max = VecType::load(x_max_copy);
         return VecType::exp(VecType::sub(v, vec_x_max));
       };
+#else
+      const auto vec_x_max = VecType::load(x_max);
+      auto vec_exp_func = [vec_x_max](vec_type<float> v) {
+        return VecType::exp(VecType::sub(v, vec_x_max));
+      };
+#endif
 
       const auto exp_sum = vectorized_map_reduce_all(
         x,
@@ -429,14 +478,21 @@ namespace ctranslate2 {
       }
 
       const auto x_max = reduce_max(x, size);
-      const auto vec_x_max = VecType::load(x_max);
-
-      const auto scalar_exp_func = [x_max](vec_type<float> v) {
-        return Vec<float, TARGET_ISA>::exp(Vec<float, TARGET_ISA>::sub(v, x_max));
+      const auto scalar_exp_func = [x_max](float v) {
+        return std::exp(v - x_max);
+      };
+#ifdef USE_RVV
+      float x_max_copy = x_max;
+      auto vec_exp_func = [x_max_copy](vec_type<float> v) {
+        auto vec_x_max = VecType::load(x_max_copy);
+        return VecType::exp(VecType::sub(v, vec_x_max));
       };
-      const auto vec_exp_func = [vec_x_max](vec_type<float> v) {
+#else
+      const auto vec_x_max = VecType::load(x_max);
+      auto vec_exp_func = [vec_x_max](vec_type<float> v) {
         return VecType::exp(VecType::sub(v, vec_x_max));
       };
+#endif
 
       if (log) {
         const auto exp_sum = vectorized_map_reduce_all(
diff --git a/src/cpu/vec_rvv.h b/src/cpu/vec_rvv.h
new file mode 100644
index 000000000..8a67f3a81
--- /dev/null
+++ b/src/cpu/vec_rvv.h
@@ -0,0 +1,237 @@
+#pragma once
+
+#include <riscv_vector.h>
+#include "vec.h"
+
+namespace ctranslate2 {
+  namespace cpu {
+
+    template<>
+    struct Vec<float, TARGET_ISA> {
+      using value_type = vfloat32m1_t;
+      using mask_type = vbool32_t;
+      static constexpr dim_t width = 4;  // fixed vl of 4 floats: assumes VLEN >= 128 (one f32m1 register)
+      static inline value_type load(float value) {
+        return __riscv_vfmv_v_f_f32m1(value, width);
+      }
+
+      static inline value_type load(const float* ptr) {
+        return __riscv_vle32_v_f32m1(ptr, width);
+      }
+
+      static inline value_type load(const float* ptr, dim_t count, float default_value = 0) {
+        if (count == width) {
+          return __riscv_vle32_v_f32m1(ptr, width);
+        } else {
+          float tmp_values[width];
+          std::fill(tmp_values, tmp_values + width, default_value);
+          std::copy(ptr, ptr + count, tmp_values);
+          return __riscv_vle32_v_f32m1(tmp_values, width);
+        }
+      }
+
+      static inline value_type load_and_convert(const int32_t* ptr) {
+        return __riscv_vfcvt_f_x_v_f32m1(__riscv_vle32_v_i32m1(ptr, width), width);
+      }
+
+      static inline value_type load_and_convert(const int32_t* ptr, dim_t count, int32_t default_value = 0) {
+        if (count == width) {
+          return load_and_convert(ptr);
+        } else {
+          int32_t tmp_values[width];
+          std::fill(tmp_values, tmp_values + width, default_value);
+          std::copy(ptr, ptr + count, tmp_values);
+          return load_and_convert(tmp_values);
+        }
+      }
+
+      static inline void store(value_type value, float* ptr) {
+        __riscv_vse32_v_f32m1(ptr, value, width);
+      }
+
+      static inline void store(value_type value, float* ptr, dim_t count) {
+        if (count == width) {
+          __riscv_vse32_v_f32m1(ptr, value, width);
+        } else {
+          float tmp_values[width];
+          __riscv_vse32_v_f32m1(tmp_values, value, width);
+          std::copy(tmp_values, tmp_values + count, ptr);
+        }
+      }
+
+      static inline value_type bit_and(value_type a, value_type b) {
+        return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vand_vv_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), width));
+      }
+
+      static inline value_type bit_xor(value_type a, value_type b) {
+        return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), width));
+      }
+
+      static inline mask_type lt(value_type a, value_type b) {
+        return __riscv_vmflt_vv_f32m1_b32(a, b, width);
+      }
+
+      static inline value_type select(mask_type mask, value_type a, value_type b) {
+        return __riscv_vmerge_vvm_f32m1(b, a, mask, width);  // vmerge takes a where the mask is set, i.e. mask ? a : b
+      }
+
+      static inline value_type abs(value_type a) {
+        return __riscv_vfabs_v_f32m1(a, width);
+      }
+
+      static inline value_type neg(value_type a) {
+        return __riscv_vfneg_v_f32m1(a, width);
+      }
+
+      static inline value_type rcp(value_type a) {
+        return __riscv_vfrdiv_vf_f32m1(a, 1.f, width);
+      }
+
+      static inline value_type exp(value_type a) {
+        // No RVV exp intrinsic is available: element-wise fallback through std::exp.
+        float temp_a[4];
+        float temp_result[4];
+
+        __riscv_vse32_v_f32m1(temp_a, a, width);
+
+        for (size_t i = 0; i < width; ++i) {
+          temp_result[i] = std::exp(temp_a[i]);
+        }
+
+        return __riscv_vle32_v_f32m1(temp_result, width);
+      }
+
+      static inline value_type log(value_type a) {
+        // No RVV log intrinsic is available: element-wise fallback through std::log.
+        float temp_a[4];
+        float temp_result[4];
+
+        __riscv_vse32_v_f32m1(temp_a, a, width);
+
+        for (size_t i = 0; i < width; ++i) {
+          temp_result[i] = std::log(temp_a[i]);
+        }
+
+        return __riscv_vle32_v_f32m1(temp_result, width);
+      }
+
+      static inline value_type sin(value_type a) {
+        float temp_a[4];
+        float temp_result[4];
+
+        __riscv_vse32_v_f32m1(temp_a, a, width);
+
+        for (size_t i = 0; i < width; ++i) {
+          temp_result[i] = std::sin(temp_a[i]);
+        }
+
+        return __riscv_vle32_v_f32m1(temp_result, width);
+      }
+
+      static inline value_type cos(value_type a) {
+        // Element-wise fallback through std::cos.
+        float temp_a[4];
+        float temp_result[4];
+
+        __riscv_vse32_v_f32m1(temp_a, a, width);
+
+        for (size_t i = 0; i < width; ++i) {
+          temp_result[i] = std::cos(temp_a[i]);
+        }
+
+        return __riscv_vle32_v_f32m1(temp_result, width);
+      }
+
+      static inline value_type tanh(value_type a) {
+        // Element-wise fallback through std::tanh.
+        float temp_a[4];
+        float temp_result[4];
+
+        __riscv_vse32_v_f32m1(temp_a, a, width);
+
+        for (size_t i = 0; i < width; ++i) {
+          temp_result[i] = std::tanh(temp_a[i]);
+        }
+
+        return __riscv_vle32_v_f32m1(temp_result, width);
+      }
+
+      static inline value_type erf(value_type a) {
+        // Element-wise fallback through std::erf.
+        float temp_a[4];
+        float temp_result[4];
+
+        __riscv_vse32_v_f32m1(temp_a, a, width);
+
+        for (size_t i = 0; i < width; ++i) {
+          temp_result[i] = std::erf(temp_a[i]);
+        }
+
+        return __riscv_vle32_v_f32m1(temp_result, width);
+      }
+
+      static inline value_type max(value_type a, value_type b) {
+        return __riscv_vfmax_vv_f32m1(a, b, width);
+      }
+
+      static inline value_type min(value_type a, value_type b) {
+        return __riscv_vfmin_vv_f32m1(a, b, width);
+      }
+
+      static inline value_type add(value_type a, value_type b) {
+        return __riscv_vfadd_vv_f32m1(a, b, width);
+      }
+
+      static inline value_type sub(value_type a, value_type b) {
+        return __riscv_vfsub_vv_f32m1(a, b, width);
+      }
+
+      static inline value_type mul(value_type a, value_type b) {
+        return __riscv_vfmul_vv_f32m1(a, b, width);
+      }
+
+      static inline value_type div(value_type a, value_type b) {
+        return __riscv_vfdiv_vv_f32m1(a, b, width);
+      }
+
+      static inline value_type mul_add(value_type a, value_type b, value_type c) {
+        return __riscv_vfmacc_vv_f32m1(c, a, b, width);
+      }
+
+      static inline float reduce_add(value_type a) {
+        // RVV sum reduction: the scalar operand seeds the accumulator with 0.
+        value_type result = __riscv_vfredusum_vs_f32m1_f32m1(a, __riscv_vfmv_v_f_f32m1(0.f, width), width);
+        return __riscv_vfmv_f_s_f32m1_f32(result);
+      }
+
+      static inline float reduce_max(value_type a) {
+        // RVV max reduction: seeding the scalar operand with the input itself is safe for max.
+        value_type result = __riscv_vfredmax_vs_f32m1_f32m1(a, a, width);
+        return __riscv_vfmv_f_s_f32m1_f32(result);
+      }
+
+      static inline value_type round(value_type a) {
+        // Element-wise fallback through std::round.
+        float temp_a[4];
+        float temp_result[4];
+
+        __riscv_vse32_v_f32m1(temp_a, a, width);
+
+        for (size_t i = 0; i < width; ++i) {
+          temp_result[i] = std::round(temp_a[i]);
+        }
+
+        return __riscv_vle32_v_f32m1(temp_result, width);
+      }
+
+      template <typename T>
+      static void convert_and_store(value_type v, T* a, dim_t count) {
+        auto i32 = __riscv_vfcvt_x_f_v_i32m1(v, width);
+        int32_t tmp[width];
+        __riscv_vse32_v_i32m1(tmp, i32, width);
+        std::copy(tmp, tmp + count, a);
+      }
+    };
+
+  }
+}
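
A minimal sanity check for the new RVV code path, as a hypothetical standalone program that is not part of the patch above: it assumes the header is reachable as "cpu/vec_rvv.h", that the specialization is keyed on TARGET_ISA exactly like the other vec_*.h headers, and that the binary is built with -march=rv64gcv and run on RVV-capable hardware or under qemu-riscv64. Because exp() uses the element-wise std::exp fallback, every lane should match the scalar result exactly, and reduce_add() should equal the plain sum of the four lanes.

// rvv_vec_check.cc -- hypothetical test harness, not part of the patch.
#include <cmath>
#include <cstdio>

#define TARGET_ISA CpuIsa::RVV   // mirrors the define added to kernels.cc
#include "cpu/vec_rvv.h"

int main() {
  using ctranslate2::cpu::CpuIsa;
  using V = ctranslate2::cpu::Vec<float, CpuIsa::RVV>;

  const float in[V::width] = {0.5f, -1.25f, 3.f, 2.5f};
  float out[V::width];

  // exp() is an element-wise std::exp fallback, so each lane should match the scalar value.
  V::store(V::exp(V::load(in)), out);
  for (int i = 0; i < V::width; ++i)
    std::printf("lane %d: exp=%g expected=%g\n", i, out[i], std::exp(in[i]));

  // reduce_add() should equal the plain scalar sum of the four lanes.
  std::printf("reduce_add=%g expected=%g\n",
              V::reduce_add(V::load(in)),
              in[0] + in[1] + in[2] + in[3]);
  return 0;
}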