OpenNMT · little-creator · Aug 20, 2025 · Aug 20, 2025 · Aug 22, 2025 · Aug 22, 2025
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -252,8 +252,14 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(amd64)|(AMD64)")
   add_subdirectory(third_party/cpu_features EXCLUDE_FROM_ALL)
   set(BUILD_SHARED_LIBS "${BUILD_SHARED_LIBS_SAVED}")
   list(APPEND LIBRARIES cpu_features)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
+  add_definitions(-DCT2_WITH_RVV)
+  set(CT2_BUILD_ARCH "riscv64")
+  message(STATUS "Target architecture is RISC-V with Vector extension")
 endif()
 
+message(STATUS "Current CT2_BUILD_ARCH is: ${CT2_BUILD_ARCH}")
+
 if(ENABLE_CPU_DISPATCH)
   message(STATUS "Compiling for multiple CPU ISA and enabling runtime dispatch")
   add_definitions(-DCT2_WITH_CPU_DISPATCH)
@@ -269,6 +275,9 @@ if(ENABLE_CPU_DISPATCH)
     endif()
   elseif(CT2_BUILD_ARCH STREQUAL "arm64")
     ct2_compile_kernels_for_isa(neon "-DUSE_NEON")
+  elseif(CT2_BUILD_ARCH STREQUAL "riscv64")
+    ct2_compile_kernels_for_isa(rvv "-march=rv64gcv")
+    message(STATUS "Current CT2_BUILD_ARCH is: ${CT2_BUILD_ARCH}")
   endif()
 endif()
 

diff --git a/python/tools/prepare_build_environment_linux.sh b/python/tools/prepare_build_environment_linux.sh
@@ -32,6 +32,7 @@ else
         libcudnn9-devel-cuda-12-9.1.0.70-1 \
         libcublas-devel-12-2-12.2.5.6-1 \
         libnccl-devel-2.19.3-1+cuda12.2
+    yum update -y libstdc++
     ln -s cuda-12.2 /usr/local/cuda
 
     ONEAPI_VERSION=2023.2.0

diff --git a/src/cpu/cpu_info.cc b/src/cpu/cpu_info.cc
@@ -58,4 +58,22 @@ namespace ctranslate2 {
   }
 }
 
+#elif defined(CT2_WITH_RVV)
+
+namespace ctranslate2 {
+  namespace cpu {
+
+    const char* cpu_vendor() {
+      return "RVV";
+    }
+
+    bool cpu_supports_rvv() {
+      return true;
+    }
+
+  }
+}
+
+
+
 #endif
diff --git a/src/cpu/cpu_info.h b/src/cpu/cpu_info.h
@@ -14,6 +14,8 @@ namespace ctranslate2 {
     bool cpu_supports_avx512();
 #elif defined(CT2_ARM64_BUILD)
     bool cpu_supports_neon();
+#elif defined(CT2_WITH_RVV)
+    bool cpu_supports_rvv();
 #endif
 
   }

diff --git a/src/cpu/cpu_isa.cc b/src/cpu/cpu_isa.cc
@@ -35,6 +35,9 @@ namespace ctranslate2 {
 #elif defined(CT2_ARM64_BUILD)
       case CpuIsa::NEON:
         return "NEON";
+#elif defined(CT2_WITH_RVV)
+      case CpuIsa::RVV:
+        return "RVV";
 #endif
       default:
         return "GENERIC";
@@ -54,6 +57,9 @@ namespace ctranslate2 {
 #elif defined(CT2_ARM64_BUILD)
         if (env_isa == "NEON")
           return try_isa(env_isa, CpuIsa::NEON, cpu_supports_neon());
+#elif defined(CT2_WITH_RVV)
+        if (env_isa == "RVV")
+          return try_isa(env_isa, CpuIsa::RVV, cpu_supports_rvv());
 #endif
         if (env_isa == "GENERIC")
           return CpuIsa::GENERIC;
@@ -71,6 +77,9 @@ namespace ctranslate2 {
 #  elif defined(CT2_ARM64_BUILD)
       if (cpu_supports_neon())
         return CpuIsa::NEON;
+#  elif defined(CT2_WITH_RVV)
+      if (cpu_supports_rvv())
+        return CpuIsa::RVV;
 #  endif
 #endif
 

diff --git a/src/cpu/cpu_isa.h b/src/cpu/cpu_isa.h
@@ -13,6 +13,8 @@ namespace ctranslate2 {
       AVX512,
 #elif defined(CT2_ARM64_BUILD)
       NEON,
+#elif defined(CT2_WITH_RVV)
+      RVV,
 #endif
     };
 
@@ -54,6 +56,12 @@ namespace ctranslate2 {
     CPU_ISA_CASE(cpu::CpuIsa::NEON, SINGLE_ARG(STMTS))        \
     CPU_ISA_DEFAULT(cpu::CpuIsa::GENERIC, SINGLE_ARG(STMTS))  \
   }
+#elif defined(CT2_WITH_RVV)
+#  define CPU_ISA_DISPATCH(STMTS)                             \
+  switch (cpu::get_cpu_isa()) {                               \
+    CPU_ISA_CASE(cpu::CpuIsa::RVV, SINGLE_ARG(STMTS))        \
+    CPU_ISA_DEFAULT(cpu::CpuIsa::GENERIC, SINGLE_ARG(STMTS))  \
+  }  
 #endif
 #elif defined(__AVX512F__)
 #  define CPU_ISA_DISPATCH(STMTS)                             \
@@ -75,6 +83,11 @@ namespace ctranslate2 {
   switch (cpu::get_cpu_isa()) {                               \
     CPU_ISA_DEFAULT(cpu::CpuIsa::NEON, SINGLE_ARG(STMTS))     \
   }
+#elif defined(__riscv_vector)
+#  define CPU_ISA_DISPATCH(STMTS)                             \
+  switch (cpu::get_cpu_isa()) {                               \
+    CPU_ISA_DEFAULT(cpu::CpuIsa::RVV, SINGLE_ARG(STMTS))     \
+  }
 #else
 #  define CPU_ISA_DISPATCH(STMTS)                             \
   switch (cpu::get_cpu_isa()) {                               \

diff --git a/src/cpu/kernels.cc b/src/cpu/kernels.cc
@@ -14,6 +14,10 @@
 #elif (defined(__ARM_NEON) && !defined(CT2_WITH_CPU_DISPATCH)) || defined(USE_NEON)
 #  define TARGET_ISA CpuIsa::NEON
 #  include "cpu/vec_neon.h"
+#elif (defined(CT2_WITH_RVV) && defined(__riscv_vector))
+#  define USE_RVV
+#  define TARGET_ISA CpuIsa::RVV
+#  include "cpu/vec_rvv.h"
 #else
 #  define TARGET_ISA CpuIsa::GENERIC
 #  include "cpu/vec.h"
@@ -213,7 +217,7 @@ namespace ctranslate2 {
 
     template<>
     void exp<TARGET_ISA>(const float* x, float* y, dim_t size) {
-      vectorized_unary_transform<TARGET_ISA>(x, y, size, Vec<float, TARGET_ISA>::exp);
+        vectorized_unary_transform<TARGET_ISA>(x, y, size, Vec<float, TARGET_ISA>::exp);
     }
 
     template<>
@@ -263,11 +267,20 @@ namespace ctranslate2 {
 
     template <CpuIsa ISA, typename T>
     void add(T a, const T* x, T* y, dim_t size) {
+#ifdef USE_RVV
+      T a_copy = a;
+      vectorized_unary_transform<ISA>(x, y, size,
+        [a_copy](vec_type<T, ISA> v) {
+          auto vec_a = Vec<T, ISA>::load(a_copy);
+          return Vec<T, ISA>::add(v, vec_a);
+        });
+#else
       auto vec_a = Vec<T, ISA>::load(a);
       vectorized_unary_transform<ISA>(x, y, size,
-                                      [vec_a](vec_type<T, ISA> v) {
-                                        return Vec<T, ISA>::add(v, vec_a);
-                                      });
+        [vec_a](vec_type<T, ISA> v) {
+          return Vec<T, ISA>::add(v, vec_a);
+        });
+#endif
     }
 
     template <CpuIsa ISA, typename T>
@@ -282,11 +295,20 @@ namespace ctranslate2 {
 
     template <CpuIsa ISA, typename T>
     void mul(T a, const T* x, T* y, dim_t size) {
+#ifdef USE_RVV
+      T a_copy = a;
+      vectorized_unary_transform<ISA>(x, y, size,
+        [a_copy](vec_type<T, ISA> v) {
+          auto vec_a = Vec<T, ISA>::load(a_copy);
+          return Vec<T, ISA>::mul(v, vec_a);
+        });
+#else
       auto vec_a = Vec<T, ISA>::load(a);
       vectorized_unary_transform<ISA>(x, y, size,
-                                      [vec_a](vec_type<T, ISA> v) {
-                                        return Vec<T, ISA>::mul(v, vec_a);
-                                      });
+        [vec_a](vec_type<T, ISA> v) {
+          return Vec<T, ISA>::mul(v, vec_a);
+        });
+#endif
     }
 
     template <CpuIsa ISA, typename T>
@@ -296,11 +318,20 @@ namespace ctranslate2 {
 
     template <CpuIsa ISA, typename T>
     void max(T a, const T* x, T* y, dim_t size) {
+#ifdef USE_RVV
+      T a_copy = a;
+      vectorized_unary_transform<ISA>(x, y, size,
+        [a_copy](vec_type<T, ISA> v) {
+          auto vec_a = Vec<T, ISA>::load(a_copy);
+          return Vec<T, ISA>::max(v, vec_a);
+        });
+#else
       auto vec_a = Vec<T, ISA>::load(a);
       vectorized_unary_transform<ISA>(x, y, size,
-                                      [vec_a](vec_type<T, ISA> v) {
-                                        return Vec<T, ISA>::max(v, vec_a);
-                                      });
+        [vec_a](vec_type<T, ISA> v) {
+          return Vec<T, ISA>::max(v, vec_a);
+        });
+#endif
     }
 
     template <CpuIsa ISA, typename T>
@@ -310,11 +341,20 @@ namespace ctranslate2 {
 
     template <CpuIsa ISA, typename T>
     void min(T a, const T* x, T* y, dim_t size) {
+#ifdef USE_RVV
+      T a_copy = a;
+      vectorized_unary_transform<ISA>(x, y, size,
+        [a_copy](vec_type<T, ISA> v) {
+          auto vec_a = Vec<T, ISA>::load(a_copy);
+          return Vec<T, ISA>::min(v, vec_a);
+        });
+#else
       auto vec_a = Vec<T, ISA>::load(a);
       vectorized_unary_transform<ISA>(x, y, size,
-                                      [vec_a](vec_type<T, ISA> v) {
-                                        return Vec<T, ISA>::min(v, vec_a);
-                                      });
+        [vec_a](vec_type<T, ISA> v) {
+          return Vec<T, ISA>::min(v, vec_a);
+        });
+#endif
     }
 
     template <CpuIsa ISA, typename T>
@@ -349,6 +389,7 @@ namespace ctranslate2 {
                                             static_cast<T>(0),
                                             Vec<T, ISA>::abs,
                                             Vec<T, ISA>::max,
+
                                             Vec<T, ISA>::reduce_max,
                                             Vec<T>::abs,
                                             Vec<T>::max);
@@ -377,14 +418,22 @@ namespace ctranslate2 {
       using VecType = Vec<float, TARGET_ISA>;
 
       const auto x_max = reduce_max<TARGET_ISA>(x, size);
-      const auto vec_x_max = VecType::load(x_max);
 
-      const auto scalar_exp_func = [x_max](vec_type<float> v) {
-        return Vec<float>::exp(Vec<float>::sub(v, x_max));
+      const auto scalar_exp_func = [x_max](float v) {
+        return std::exp(v - x_max);
       };
-      const auto vec_exp_func = [vec_x_max](vec_type<float, TARGET_ISA> v) {
+#ifdef USE_RVV
+      float x_max_copy = x_max;
+      auto vec_exp_func = [x_max_copy](vec_type<float, TARGET_ISA> v) {
+        auto vec_x_max = VecType::load(x_max_copy);
         return VecType::exp(VecType::sub(v, vec_x_max));
       };
+#else
+      const auto vec_x_max = VecType::load(x_max);
+      auto vec_exp_func = [vec_x_max](vec_type<float, TARGET_ISA> v) {
+        return VecType::exp(VecType::sub(v, vec_x_max));
+      };
+#endif
 
       const auto exp_sum = vectorized_map_reduce_all<TARGET_ISA>(
         x,
@@ -429,14 +478,21 @@ namespace ctranslate2 {
           }
 
           const auto x_max = reduce_max<TARGET_ISA>(x, size);
-          const auto vec_x_max = VecType::load(x_max);
-
-          const auto scalar_exp_func = [x_max](vec_type<float> v) {
-            return Vec<float>::exp(Vec<float>::sub(v, x_max));
+          const auto scalar_exp_func = [x_max](float v) {
+            return std::exp(v - x_max);
+          };
+#ifdef USE_RVV
+          float x_max_copy = x_max;
+          auto vec_exp_func = [x_max_copy](vec_type<float, TARGET_ISA> v) {
+            auto vec_x_max = VecType::load(x_max_copy);
+            return VecType::exp(VecType::sub(v, vec_x_max));
           };
-          const auto vec_exp_func = [vec_x_max](vec_type<float, TARGET_ISA> v) {
+#else
+          const auto vec_x_max = VecType::load(x_max);
+          auto vec_exp_func = [vec_x_max](vec_type<float, TARGET_ISA> v) {
             return VecType::exp(VecType::sub(v, vec_x_max));
           };
+#endif
 
           if (log) {
             const auto exp_sum = vectorized_map_reduce_all<TARGET_ISA>(