Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -252,8 +252,14 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(amd64)|(AMD64)")
add_subdirectory(third_party/cpu_features EXCLUDE_FROM_ALL)
set(BUILD_SHARED_LIBS "${BUILD_SHARED_LIBS_SAVED}")
list(APPEND LIBRARIES cpu_features)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
add_definitions(-DCT2_WITH_RVV)
set(CT2_BUILD_ARCH "riscv64")
message(STATUS "Target architecture is RISC-V with Vector extension")
endif()

message(STATUS "Current CT2_BUILD_ARCH is: ${CT2_BUILD_ARCH}")

if(ENABLE_CPU_DISPATCH)
message(STATUS "Compiling for multiple CPU ISA and enabling runtime dispatch")
add_definitions(-DCT2_WITH_CPU_DISPATCH)
Expand All @@ -269,6 +275,9 @@ if(ENABLE_CPU_DISPATCH)
endif()
elseif(CT2_BUILD_ARCH STREQUAL "arm64")
ct2_compile_kernels_for_isa(neon "-DUSE_NEON")
elseif(CT2_BUILD_ARCH STREQUAL "riscv64")
ct2_compile_kernels_for_isa(rvv "-march=rv64gcv")
message(STATUS "Current CT2_BUILD_ARCH is: ${CT2_BUILD_ARCH}")
endif()
endif()

Expand Down
1 change: 1 addition & 0 deletions python/tools/prepare_build_environment_linux.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ else
libcudnn9-devel-cuda-12-9.1.0.70-1 \
libcublas-devel-12-2-12.2.5.6-1 \
libnccl-devel-2.19.3-1+cuda12.2
yum update -y libstdc++
ln -s cuda-12.2 /usr/local/cuda

ONEAPI_VERSION=2023.2.0
Expand Down
18 changes: 18 additions & 0 deletions src/cpu/cpu_info.cc
Original file line number Diff line number Diff line change
Expand Up @@ -58,4 +58,22 @@ namespace ctranslate2 {
}
}

#elif defined(CT2_WITH_RVV)

namespace ctranslate2 {
namespace cpu {

const char* cpu_vendor() {
return "RVV";
}

bool cpu_supports_rvv() {
return true;
}

}
}



#endif
2 changes: 2 additions & 0 deletions src/cpu/cpu_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ namespace ctranslate2 {
bool cpu_supports_avx512();
#elif defined(CT2_ARM64_BUILD)
bool cpu_supports_neon();
#elif defined(CT2_WITH_RVV)
bool cpu_supports_rvv();
#endif

}
Expand Down
9 changes: 9 additions & 0 deletions src/cpu/cpu_isa.cc
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@ namespace ctranslate2 {
#elif defined(CT2_ARM64_BUILD)
case CpuIsa::NEON:
return "NEON";
#elif defined(CT2_WITH_RVV)
case CpuIsa::RVV:
return "RVV";
#endif
default:
return "GENERIC";
Expand All @@ -54,6 +57,9 @@ namespace ctranslate2 {
#elif defined(CT2_ARM64_BUILD)
if (env_isa == "NEON")
return try_isa(env_isa, CpuIsa::NEON, cpu_supports_neon());
#elif defined(CT2_WITH_RVV)
if (env_isa == "RVV")
return try_isa(env_isa, CpuIsa::RVV, cpu_supports_rvv());
#endif
if (env_isa == "GENERIC")
return CpuIsa::GENERIC;
Expand All @@ -71,6 +77,9 @@ namespace ctranslate2 {
# elif defined(CT2_ARM64_BUILD)
if (cpu_supports_neon())
return CpuIsa::NEON;
# elif defined(CT2_WITH_RVV)
if (cpu_supports_rvv())
return CpuIsa::RVV;
# endif
#endif

Expand Down
13 changes: 13 additions & 0 deletions src/cpu/cpu_isa.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ namespace ctranslate2 {
AVX512,
#elif defined(CT2_ARM64_BUILD)
NEON,
#elif defined(CT2_WITH_RVV)
RVV,
#endif
};

Expand Down Expand Up @@ -54,6 +56,12 @@ namespace ctranslate2 {
CPU_ISA_CASE(cpu::CpuIsa::NEON, SINGLE_ARG(STMTS)) \
CPU_ISA_DEFAULT(cpu::CpuIsa::GENERIC, SINGLE_ARG(STMTS)) \
}
#elif defined(CT2_WITH_RVV)
# define CPU_ISA_DISPATCH(STMTS) \
switch (cpu::get_cpu_isa()) { \
CPU_ISA_CASE(cpu::CpuIsa::RVV, SINGLE_ARG(STMTS)) \
CPU_ISA_DEFAULT(cpu::CpuIsa::GENERIC, SINGLE_ARG(STMTS)) \
}
#endif
#elif defined(__AVX512F__)
# define CPU_ISA_DISPATCH(STMTS) \
Expand All @@ -75,6 +83,11 @@ namespace ctranslate2 {
switch (cpu::get_cpu_isa()) { \
CPU_ISA_DEFAULT(cpu::CpuIsa::NEON, SINGLE_ARG(STMTS)) \
}
#elif defined(__riscv_vector)
# define CPU_ISA_DISPATCH(STMTS) \
switch (cpu::get_cpu_isa()) { \
CPU_ISA_DEFAULT(cpu::CpuIsa::RVV, SINGLE_ARG(STMTS)) \
}
#else
# define CPU_ISA_DISPATCH(STMTS) \
switch (cpu::get_cpu_isa()) { \
Expand Down
100 changes: 78 additions & 22 deletions src/cpu/kernels.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@
#elif (defined(__ARM_NEON) && !defined(CT2_WITH_CPU_DISPATCH)) || defined(USE_NEON)
# define TARGET_ISA CpuIsa::NEON
# include "cpu/vec_neon.h"
#elif (defined(CT2_WITH_RVV) && defined(__riscv_vector))
# define USE_RVV
# define TARGET_ISA CpuIsa::RVV
# include "cpu/vec_rvv.h"
#else
# define TARGET_ISA CpuIsa::GENERIC
# include "cpu/vec.h"
Expand Down Expand Up @@ -213,7 +217,7 @@ namespace ctranslate2 {

template<>
void exp<TARGET_ISA>(const float* x, float* y, dim_t size) {
vectorized_unary_transform<TARGET_ISA>(x, y, size, Vec<float, TARGET_ISA>::exp);
vectorized_unary_transform<TARGET_ISA>(x, y, size, Vec<float, TARGET_ISA>::exp);
}

template<>
Expand Down Expand Up @@ -263,11 +267,20 @@ namespace ctranslate2 {

template <CpuIsa ISA, typename T>
void add(T a, const T* x, T* y, dim_t size) {
#ifdef USE_RVV
T a_copy = a;
vectorized_unary_transform<ISA>(x, y, size,
[a_copy](vec_type<T, ISA> v) {
auto vec_a = Vec<T, ISA>::load(a_copy);
return Vec<T, ISA>::add(v, vec_a);
});
#else
auto vec_a = Vec<T, ISA>::load(a);
vectorized_unary_transform<ISA>(x, y, size,
[vec_a](vec_type<T, ISA> v) {
return Vec<T, ISA>::add(v, vec_a);
});
[vec_a](vec_type<T, ISA> v) {
return Vec<T, ISA>::add(v, vec_a);
});
#endif
}

template <CpuIsa ISA, typename T>
Expand All @@ -282,11 +295,20 @@ namespace ctranslate2 {

template <CpuIsa ISA, typename T>
void mul(T a, const T* x, T* y, dim_t size) {
#ifdef USE_RVV
T a_copy = a;
vectorized_unary_transform<ISA>(x, y, size,
[a_copy](vec_type<T, ISA> v) {
auto vec_a = Vec<T, ISA>::load(a_copy);
return Vec<T, ISA>::mul(v, vec_a);
});
#else
auto vec_a = Vec<T, ISA>::load(a);
vectorized_unary_transform<ISA>(x, y, size,
[vec_a](vec_type<T, ISA> v) {
return Vec<T, ISA>::mul(v, vec_a);
});
[vec_a](vec_type<T, ISA> v) {
return Vec<T, ISA>::mul(v, vec_a);
});
#endif
}

template <CpuIsa ISA, typename T>
Expand All @@ -296,11 +318,20 @@ namespace ctranslate2 {

template <CpuIsa ISA, typename T>
void max(T a, const T* x, T* y, dim_t size) {
#ifdef USE_RVV
T a_copy = a;
vectorized_unary_transform<ISA>(x, y, size,
[a_copy](vec_type<T, ISA> v) {
auto vec_a = Vec<T, ISA>::load(a_copy);
return Vec<T, ISA>::max(v, vec_a);
});
#else
auto vec_a = Vec<T, ISA>::load(a);
vectorized_unary_transform<ISA>(x, y, size,
[vec_a](vec_type<T, ISA> v) {
return Vec<T, ISA>::max(v, vec_a);
});
[vec_a](vec_type<T, ISA> v) {
return Vec<T, ISA>::max(v, vec_a);
});
#endif
}

template <CpuIsa ISA, typename T>
Expand All @@ -310,11 +341,20 @@ namespace ctranslate2 {

template <CpuIsa ISA, typename T>
void min(T a, const T* x, T* y, dim_t size) {
#ifdef USE_RVV
T a_copy = a;
vectorized_unary_transform<ISA>(x, y, size,
[a_copy](vec_type<T, ISA> v) {
auto vec_a = Vec<T, ISA>::load(a_copy);
return Vec<T, ISA>::min(v, vec_a);
});
#else
auto vec_a = Vec<T, ISA>::load(a);
vectorized_unary_transform<ISA>(x, y, size,
[vec_a](vec_type<T, ISA> v) {
return Vec<T, ISA>::min(v, vec_a);
});
[vec_a](vec_type<T, ISA> v) {
return Vec<T, ISA>::min(v, vec_a);
});
#endif
}

template <CpuIsa ISA, typename T>
Expand Down Expand Up @@ -349,6 +389,7 @@ namespace ctranslate2 {
static_cast<T>(0),
Vec<T, ISA>::abs,
Vec<T, ISA>::max,

Vec<T, ISA>::reduce_max,
Vec<T>::abs,
Vec<T>::max);
Expand Down Expand Up @@ -377,14 +418,22 @@ namespace ctranslate2 {
using VecType = Vec<float, TARGET_ISA>;

const auto x_max = reduce_max<TARGET_ISA>(x, size);
const auto vec_x_max = VecType::load(x_max);

const auto scalar_exp_func = [x_max](vec_type<float> v) {
return Vec<float>::exp(Vec<float>::sub(v, x_max));
const auto scalar_exp_func = [x_max](float v) {
return std::exp(v - x_max);
};
const auto vec_exp_func = [vec_x_max](vec_type<float, TARGET_ISA> v) {
#ifdef USE_RVV
float x_max_copy = x_max;
auto vec_exp_func = [x_max_copy](vec_type<float, TARGET_ISA> v) {
auto vec_x_max = VecType::load(x_max_copy);
return VecType::exp(VecType::sub(v, vec_x_max));
};
#else
const auto vec_x_max = VecType::load(x_max);
auto vec_exp_func = [vec_x_max](vec_type<float, TARGET_ISA> v) {
return VecType::exp(VecType::sub(v, vec_x_max));
};
#endif

const auto exp_sum = vectorized_map_reduce_all<TARGET_ISA>(
x,
Expand Down Expand Up @@ -429,14 +478,21 @@ namespace ctranslate2 {
}

const auto x_max = reduce_max<TARGET_ISA>(x, size);
const auto vec_x_max = VecType::load(x_max);

const auto scalar_exp_func = [x_max](vec_type<float> v) {
return Vec<float>::exp(Vec<float>::sub(v, x_max));
const auto scalar_exp_func = [x_max](float v) {
return std::exp(v - x_max);
};
#ifdef USE_RVV
float x_max_copy = x_max;
auto vec_exp_func = [x_max_copy](vec_type<float, TARGET_ISA> v) {
auto vec_x_max = VecType::load(x_max_copy);
return VecType::exp(VecType::sub(v, vec_x_max));
};
const auto vec_exp_func = [vec_x_max](vec_type<float, TARGET_ISA> v) {
#else
const auto vec_x_max = VecType::load(x_max);
auto vec_exp_func = [vec_x_max](vec_type<float, TARGET_ISA> v) {
return VecType::exp(VecType::sub(v, vec_x_max));
};
#endif

if (log) {
const auto exp_sum = vectorized_map_reduce_all<TARGET_ISA>(
Expand Down
Loading
Loading