diff --git a/CMakeLists.txt b/CMakeLists.txt
index 62b99d136..090c6c970 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -252,8 +252,14 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(amd64)|(AMD64)")
   add_subdirectory(third_party/cpu_features EXCLUDE_FROM_ALL)
   set(BUILD_SHARED_LIBS "${BUILD_SHARED_LIBS_SAVED}")
   list(APPEND LIBRARIES cpu_features)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
+  add_definitions(-DCT2_WITH_RVV)
+  set(CT2_BUILD_ARCH "riscv64")
+  message(STATUS "Target architecture is RISC-V with Vector extension")
 endif()
 
+message(STATUS "Current CT2_BUILD_ARCH is: ${CT2_BUILD_ARCH}")
+
 if(ENABLE_CPU_DISPATCH)
   message(STATUS "Compiling for multiple CPU ISA and enabling runtime dispatch")
   add_definitions(-DCT2_WITH_CPU_DISPATCH)
@@ -269,6 +275,9 @@ if(ENABLE_CPU_DISPATCH)
     endif()
   elseif(CT2_BUILD_ARCH STREQUAL "arm64")
     ct2_compile_kernels_for_isa(neon "-DUSE_NEON")
+  elseif(CT2_BUILD_ARCH STREQUAL "riscv64")
+    ct2_compile_kernels_for_isa(rvv "-march=rv64gcv")
+    message(STATUS "Current CT2_BUILD_ARCH is: ${CT2_BUILD_ARCH}")
   endif()
 endif()
 
diff --git a/python/tools/prepare_build_environment_linux.sh b/python/tools/prepare_build_environment_linux.sh
index 61ac171a3..58d9cb52e 100755
--- a/python/tools/prepare_build_environment_linux.sh
+++ b/python/tools/prepare_build_environment_linux.sh
@@ -32,6 +32,7 @@ else
         libcudnn9-devel-cuda-12-9.1.0.70-1 \
         libcublas-devel-12-2-12.2.5.6-1 \
         libnccl-devel-2.19.3-1+cuda12.2
+    yum update -y libstdc++
     ln -s cuda-12.2 /usr/local/cuda
 
     ONEAPI_VERSION=2023.2.0
diff --git a/src/cpu/cpu_info.cc b/src/cpu/cpu_info.cc
index 9030ac7a4..d5f78b5db 100644
--- a/src/cpu/cpu_info.cc
+++ b/src/cpu/cpu_info.cc
@@ -58,4 +58,22 @@ namespace ctranslate2 {
   }
 }
 
+#elif defined(CT2_WITH_RVV)
+
+namespace ctranslate2 {
+  namespace cpu {
+
+    const char* cpu_vendor() {
+      return "RVV";
+    }
+
+    bool cpu_supports_rvv() {
+      return true;
+    }
+
+  }
+}
+
+
+
 #endif
diff --git a/src/cpu/cpu_info.h b/src/cpu/cpu_info.h
index c2951bcc0..1251a1f7a 100644
--- a/src/cpu/cpu_info.h
+++ b/src/cpu/cpu_info.h
@@ -14,6 +14,8 @@ namespace ctranslate2 {
     bool cpu_supports_avx512();
 #elif defined(CT2_ARM64_BUILD)
     bool cpu_supports_neon();
+#elif defined(CT2_WITH_RVV)
+    bool cpu_supports_rvv();
 #endif
 
   }
diff --git a/src/cpu/cpu_isa.cc b/src/cpu/cpu_isa.cc
index c16aeda22..e9ba8fbf9 100644
--- a/src/cpu/cpu_isa.cc
+++ b/src/cpu/cpu_isa.cc
@@ -35,6 +35,9 @@ namespace ctranslate2 {
 #elif defined(CT2_ARM64_BUILD)
     case CpuIsa::NEON:
       return "NEON";
+#elif defined(CT2_WITH_RVV)
+    case CpuIsa::RVV:
+      return "RVV";
 #endif
     default:
       return "GENERIC";
@@ -54,6 +57,9 @@ namespace ctranslate2 {
 #elif defined(CT2_ARM64_BUILD)
       if (env_isa == "NEON")
         return try_isa(env_isa, CpuIsa::NEON, cpu_supports_neon());
+#elif defined(CT2_WITH_RVV)
+      if (env_isa == "RVV")
+        return try_isa(env_isa, CpuIsa::RVV, cpu_supports_rvv());
 #endif
       if (env_isa == "GENERIC")
         return CpuIsa::GENERIC;
@@ -71,6 +77,9 @@ namespace ctranslate2 {
 # elif defined(CT2_ARM64_BUILD)
       if (cpu_supports_neon())
         return CpuIsa::NEON;
+# elif defined(CT2_WITH_RVV)
+      if (cpu_supports_rvv())
+        return CpuIsa::RVV;
 # endif
 #endif
 
diff --git a/src/cpu/cpu_isa.h b/src/cpu/cpu_isa.h
index 4f42bdf26..b5d7cf5e3 100644
--- a/src/cpu/cpu_isa.h
+++ b/src/cpu/cpu_isa.h
@@ -13,6 +13,8 @@ namespace ctranslate2 {
       AVX512,
 #elif defined(CT2_ARM64_BUILD)
       NEON,
+#elif defined(CT2_WITH_RVV)
+      RVV,
 #endif
     };
 
@@ -54,6 +56,12 @@ namespace ctranslate2 {
         CPU_ISA_CASE(cpu::CpuIsa::NEON, SINGLE_ARG(STMTS)) \
         CPU_ISA_DEFAULT(cpu::CpuIsa::GENERIC, SINGLE_ARG(STMTS)) \
       }
+#elif defined(CT2_WITH_RVV)
+# define CPU_ISA_DISPATCH(STMTS) \
+      switch (cpu::get_cpu_isa()) { \
+        CPU_ISA_CASE(cpu::CpuIsa::RVV, SINGLE_ARG(STMTS)) \
+        CPU_ISA_DEFAULT(cpu::CpuIsa::GENERIC, SINGLE_ARG(STMTS)) \
+      }
 #endif
 #elif defined(__AVX512F__)
 # define CPU_ISA_DISPATCH(STMTS) \
@@ -75,6 +83,11 @@ namespace ctranslate2 {
       switch (cpu::get_cpu_isa()) { \
         CPU_ISA_DEFAULT(cpu::CpuIsa::NEON, SINGLE_ARG(STMTS)) \
       }
+#elif defined(__riscv_vector)
+# define CPU_ISA_DISPATCH(STMTS) \
+      switch (cpu::get_cpu_isa()) { \
+        CPU_ISA_DEFAULT(cpu::CpuIsa::RVV, SINGLE_ARG(STMTS)) \
+      }
 #else
 # define CPU_ISA_DISPATCH(STMTS) \
       switch (cpu::get_cpu_isa()) { \
diff --git a/src/cpu/kernels.cc b/src/cpu/kernels.cc
index c1f48553d..5496a1612 100644
--- a/src/cpu/kernels.cc
+++ b/src/cpu/kernels.cc
@@ -14,6 +14,10 @@
 #elif (defined(__ARM_NEON) && !defined(CT2_WITH_CPU_DISPATCH)) || defined(USE_NEON)
 # define TARGET_ISA CpuIsa::NEON
 # include "cpu/vec_neon.h"
+#elif (defined(CT2_WITH_RVV) && defined(__riscv_vector))
+# define USE_RVV
+# define TARGET_ISA CpuIsa::RVV
+# include "cpu/vec_rvv.h"
 #else
 # define TARGET_ISA CpuIsa::GENERIC
 # include "cpu/vec.h"
@@ -213,7 +217,7 @@ namespace ctranslate2 {
 
     template<>
     void exp(const float* x, float* y, dim_t size) {
-      vectorized_unary_transform(x, y, size, Vec<float, TARGET_ISA>::exp);
+      vectorized_unary_transform(x, y, size, Vec<float, TARGET_ISA>::exp);
     }
 
     template<>
@@ -263,11 +267,20 @@ namespace ctranslate2 {
 
     template <typename T>
     void add(T a, const T* x, T* y, dim_t size) {
+#ifdef USE_RVV
+      T a_copy = a;
+      vectorized_unary_transform(x, y, size,
+                                 [a_copy](vec_type<T> v) {
+                                   auto vec_a = Vec<T, TARGET_ISA>::load(a_copy);
+                                   return Vec<T, TARGET_ISA>::add(v, vec_a);
+                                 });
+#else
       auto vec_a = Vec<T, TARGET_ISA>::load(a);
       vectorized_unary_transform(x, y, size,
-                                 [vec_a](vec_type<T> v) {
-                                   return Vec<T, TARGET_ISA>::add(v, vec_a);
-                                 });
+                                 [vec_a](vec_type<T> v) {
+                                   return Vec<T, TARGET_ISA>::add(v, vec_a);
+                                 });
+#endif
     }
 
     template <typename T>
@@ -282,11 +295,20 @@ namespace ctranslate2 {
 
     template <typename T>
     void mul(T a, const T* x, T* y, dim_t size) {
+#ifdef USE_RVV
+      T a_copy = a;
+      vectorized_unary_transform(x, y, size,
+                                 [a_copy](vec_type<T> v) {
+                                   auto vec_a = Vec<T, TARGET_ISA>::load(a_copy);
+                                   return Vec<T, TARGET_ISA>::mul(v, vec_a);
+                                 });
+#else
       auto vec_a = Vec<T, TARGET_ISA>::load(a);
       vectorized_unary_transform(x, y, size,
-                                 [vec_a](vec_type<T> v) {
-                                   return Vec<T, TARGET_ISA>::mul(v, vec_a);
-                                 });
+                                 [vec_a](vec_type<T> v) {
+                                   return Vec<T, TARGET_ISA>::mul(v, vec_a);
+                                 });
+#endif
     }
 
     template <typename T>
@@ -296,11 +318,20 @@ namespace ctranslate2 {
 
     template <typename T>
     void max(T a, const T* x, T* y, dim_t size) {
+#ifdef USE_RVV
+      T a_copy = a;
+      vectorized_unary_transform(x, y, size,
+                                 [a_copy](vec_type<T> v) {
+                                   auto vec_a = Vec<T, TARGET_ISA>::load(a_copy);
+                                   return Vec<T, TARGET_ISA>::max(v, vec_a);
+                                 });
+#else
       auto vec_a = Vec<T, TARGET_ISA>::load(a);
      vectorized_unary_transform(x, y, size,
-                                 [vec_a](vec_type<T> v) {
-                                   return Vec<T, TARGET_ISA>::max(v, vec_a);
-                                 });
+                                 [vec_a](vec_type<T> v) {
+                                   return Vec<T, TARGET_ISA>::max(v, vec_a);
+                                 });
+#endif
     }
 
     template <typename T>
@@ -310,11 +341,20 @@ namespace ctranslate2 {
 
     template <typename T>
     void min(T a, const T* x, T* y, dim_t size) {
+#ifdef USE_RVV
+      T a_copy = a;
+      vectorized_unary_transform(x, y, size,
+                                 [a_copy](vec_type<T> v) {
+                                   auto vec_a = Vec<T, TARGET_ISA>::load(a_copy);
+                                   return Vec<T, TARGET_ISA>::min(v, vec_a);
+                                 });
+#else
       auto vec_a = Vec<T, TARGET_ISA>::load(a);
       vectorized_unary_transform(x, y, size,
-                                 [vec_a](vec_type<T> v) {
-                                   return Vec<T, TARGET_ISA>::min(v, vec_a);
-                                 });
+                                 [vec_a](vec_type<T> v) {
+                                   return Vec<T, TARGET_ISA>::min(v, vec_a);
+                                 });
+#endif
     }
 
     template <typename T>
@@ -349,6 +389,7 @@ namespace ctranslate2 {
                                        static_cast<float>(0),
                                        Vec<float, TARGET_ISA>::abs,
                                        Vec<float, TARGET_ISA>::max,
+                                       Vec<float, TARGET_ISA>::reduce_max,
                                        Vec<float, TARGET_ISA>::abs,
                                        Vec<float, TARGET_ISA>::max);
 
@@ -377,14 +418,22 @@ namespace ctranslate2 {
       using VecType = Vec<float, TARGET_ISA>;
 
       const auto x_max = reduce_max(x, size);
-      const auto vec_x_max = VecType::load(x_max);
-      const auto scalar_exp_func = [x_max](vec_type<float> v) {
-        return Vec<float, TARGET_ISA>::exp(Vec<float, TARGET_ISA>::sub(v, x_max));
+      const auto scalar_exp_func = [x_max](float v) {
+        return std::exp(v - x_max);
       };
 
-      const auto vec_exp_func = [vec_x_max](vec_type<float> v) {
+#ifdef USE_RVV
+      float x_max_copy = x_max;
+      auto vec_exp_func = [x_max_copy](vec_type<float> v) {
+        auto vec_x_max = VecType::load(x_max_copy);
         return VecType::exp(VecType::sub(v, vec_x_max));
       };
+#else
+      const auto vec_x_max = VecType::load(x_max);
+      auto vec_exp_func = [vec_x_max](vec_type<float> v) {
+        return VecType::exp(VecType::sub(v, vec_x_max));
+      };
+#endif
 
       const auto exp_sum = vectorized_map_reduce_all(
         x,
@@ -429,14 +478,21 @@ namespace ctranslate2 {
       }
 
       const auto x_max = reduce_max(x, size);
-      const auto vec_x_max = VecType::load(x_max);
-
-      const auto scalar_exp_func = [x_max](vec_type<float> v) {
-        return Vec<float, TARGET_ISA>::exp(Vec<float, TARGET_ISA>::sub(v, x_max));
+      const auto scalar_exp_func = [x_max](float v) {
+        return std::exp(v - x_max);
+      };
+#ifdef USE_RVV
+      float x_max_copy = x_max;
+      auto vec_exp_func = [x_max_copy](vec_type<float> v) {
+        auto vec_x_max = VecType::load(x_max_copy);
+        return VecType::exp(VecType::sub(v, vec_x_max));
       };
-      const auto vec_exp_func = [vec_x_max](vec_type<float> v) {
+#else
+      const auto vec_x_max = VecType::load(x_max);
+      auto vec_exp_func = [vec_x_max](vec_type<float> v) {
         return VecType::exp(VecType::sub(v, vec_x_max));
       };
+#endif
 
       if (log) {
         const auto exp_sum = vectorized_map_reduce_all(
diff --git a/src/cpu/vec_rvv.h b/src/cpu/vec_rvv.h
new file mode 100644
index 000000000..8a67f3a81
--- /dev/null
+++ b/src/cpu/vec_rvv.h
@@ -0,0 +1,237 @@
+#pragma once
+
+#include <riscv_vector.h>
+#include "vec.h"
+
+namespace ctranslate2 {
+  namespace cpu {
+
+    template<>
+    struct Vec<float, TARGET_ISA> {
+      using value_type = vfloat32m1_t;
+      using mask_type = vbool32_t;
+      static constexpr dim_t width = 4;  // fixed vl of 4 floats: assumes VLEN >= 128 (one f32m1 register)
+      static inline value_type load(float value) {
+        return __riscv_vfmv_v_f_f32m1(value, width);
+      }
+
+      static inline value_type load(const float* ptr) {
+        return __riscv_vle32_v_f32m1(ptr, width);
+      }
+
+      static inline value_type load(const float* ptr, dim_t count, float default_value = 0) {
+        if (count == width) {
+          return __riscv_vle32_v_f32m1(ptr, width);
+        } else {
+          float tmp_values[width];
+          std::fill(tmp_values, tmp_values + width, default_value);
+          std::copy(ptr, ptr + count, tmp_values);
+          return __riscv_vle32_v_f32m1(tmp_values, width);
+        }
+      }
+
+      static inline value_type load_and_convert(const int32_t* ptr) {
+        return __riscv_vfcvt_f_x_v_f32m1(__riscv_vle32_v_i32m1(ptr, width), width);
+      }
+
+      static inline value_type load_and_convert(const int32_t* ptr, dim_t count, int32_t default_value = 0) {
+        if (count == width) {
+          return load_and_convert(ptr);
+        } else {
+          int32_t tmp_values[width];
+          std::fill(tmp_values, tmp_values + width, default_value);
+          std::copy(ptr, ptr + count, tmp_values);
+          return load_and_convert(tmp_values);
+        }
+      }
+
+      static inline void store(value_type value, float* ptr) {
+        __riscv_vse32_v_f32m1(ptr, value, width);
+      }
+
+      static inline void store(value_type value, float* ptr, dim_t count) {
+        if (count == width) {
+          __riscv_vse32_v_f32m1(ptr, value, width);
+        } else {
+          float tmp_values[width];
+          __riscv_vse32_v_f32m1(tmp_values, value, width);
+          std::copy(tmp_values, tmp_values + count, ptr);
+        }
+      }
+
+      static inline value_type bit_and(value_type a, value_type b) {
+        return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vand_vv_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), width));
+      }
+
+      static inline value_type bit_xor(value_type a, value_type b) {
+        return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), width));
+      }
+
+      static inline mask_type lt(value_type a, value_type b) {
+        return __riscv_vmflt_vv_f32m1_b32(a, b, width);
+      }
+
+      static inline value_type select(mask_type mask, value_type a, value_type b) {
+        return __riscv_vmerge_vvm_f32m1(b, a, mask, width);  // vmerge takes a where the mask is set, i.e. mask ? a : b
+      }
+
+      static inline value_type abs(value_type a) {
+        return __riscv_vfabs_v_f32m1(a, width);
+      }
+
+      static inline value_type neg(value_type a) {
+        return __riscv_vfneg_v_f32m1(a, width);
+      }
+
+      static inline value_type rcp(value_type a) {
+        return __riscv_vfrdiv_vf_f32m1(a, 1.f, width);
+      }
+
+      static inline value_type exp(value_type a) {
+        // No RVV exp intrinsic is available: element-wise fallback through std::exp.
+        float temp_a[4];
+        float temp_result[4];
+
+        __riscv_vse32_v_f32m1(temp_a, a, width);
+
+        for (size_t i = 0; i < width; ++i) {
+          temp_result[i] = std::exp(temp_a[i]);
+        }
+
+        return __riscv_vle32_v_f32m1(temp_result, width);
+      }
+
+      static inline value_type log(value_type a) {
+        // No RVV log intrinsic is available: element-wise fallback through std::log.
+        float temp_a[4];
+        float temp_result[4];
+
+        __riscv_vse32_v_f32m1(temp_a, a, width);
+
+        for (size_t i = 0; i < width; ++i) {
+          temp_result[i] = std::log(temp_a[i]);
+        }
+
+        return __riscv_vle32_v_f32m1(temp_result, width);
+      }
+
+      static inline value_type sin(value_type a) {
+        float temp_a[4];
+        float temp_result[4];
+
+        __riscv_vse32_v_f32m1(temp_a, a, width);
+
+        for (size_t i = 0; i < width; ++i) {
+          temp_result[i] = std::sin(temp_a[i]);
+        }
+
+        return __riscv_vle32_v_f32m1(temp_result, width);
+      }
+
+      static inline value_type cos(value_type a) {
+        // Element-wise fallback through std::cos.
+        float temp_a[4];
+        float temp_result[4];
+
+        __riscv_vse32_v_f32m1(temp_a, a, width);
+
+        for (size_t i = 0; i < width; ++i) {
+          temp_result[i] = std::cos(temp_a[i]);
+        }
+
+        return __riscv_vle32_v_f32m1(temp_result, width);
+      }
+
+      static inline value_type tanh(value_type a) {
+        // Element-wise fallback through std::tanh.
+        float temp_a[4];
+        float temp_result[4];
+
+        __riscv_vse32_v_f32m1(temp_a, a, width);
+
+        for (size_t i = 0; i < width; ++i) {
+          temp_result[i] = std::tanh(temp_a[i]);
+        }
+
+        return __riscv_vle32_v_f32m1(temp_result, width);
+      }
+
+      static inline value_type erf(value_type a) {
+        // Element-wise fallback through std::erf.
+        float temp_a[4];
+        float temp_result[4];
+
+        __riscv_vse32_v_f32m1(temp_a, a, width);
+
+        for (size_t i = 0; i < width; ++i) {
+          temp_result[i] = std::erf(temp_a[i]);
+        }
+
+        return __riscv_vle32_v_f32m1(temp_result, width);
+      }
+
+      static inline value_type max(value_type a, value_type b) {
+        return __riscv_vfmax_vv_f32m1(a, b, width);
+      }
+
+      static inline value_type min(value_type a, value_type b) {
+        return __riscv_vfmin_vv_f32m1(a, b, width);
+      }
+
+      static inline value_type add(value_type a, value_type b) {
+        return __riscv_vfadd_vv_f32m1(a, b, width);
+      }
+
+      static inline value_type sub(value_type a, value_type b) {
+        return __riscv_vfsub_vv_f32m1(a, b, width);
+      }
+
+      static inline value_type mul(value_type a, value_type b) {
+        return __riscv_vfmul_vv_f32m1(a, b, width);
+      }
+
+      static inline value_type div(value_type a, value_type b) {
+        return __riscv_vfdiv_vv_f32m1(a, b, width);
+      }
+
+      static inline value_type mul_add(value_type a, value_type b, value_type c) {
+        return __riscv_vfmacc_vv_f32m1(c, a, b, width);
+      }
+
+      static inline float reduce_add(value_type a) {
+        // RVV sum reduction: the scalar operand seeds the accumulator with 0.
+        value_type result = __riscv_vfredusum_vs_f32m1_f32m1(a, __riscv_vfmv_v_f_f32m1(0.f, width), width);
+        return __riscv_vfmv_f_s_f32m1_f32(result);
+      }
+
+      static inline float reduce_max(value_type a) {
+        // RVV max reduction: seeding the scalar operand with the input itself is safe for max.
+        value_type result = __riscv_vfredmax_vs_f32m1_f32m1(a, a, width);
+        return __riscv_vfmv_f_s_f32m1_f32(result);
+      }
+
+      static inline value_type round(value_type a) {
+        // Element-wise fallback through std::round.
+        float temp_a[4];
+        float temp_result[4];
+
+        __riscv_vse32_v_f32m1(temp_a, a, width);
+
+        for (size_t i = 0; i < width; ++i) {
+          temp_result[i] = std::round(temp_a[i]);
+        }
+
+        return __riscv_vle32_v_f32m1(temp_result, width);
+      }
+
+      template <typename T>
+      static void convert_and_store(value_type v, T* a, dim_t count) {
+        auto i32 = __riscv_vfcvt_x_f_v_i32m1(v, width);
+        int32_t tmp[width];
+        __riscv_vse32_v_i32m1(tmp, i32, width);
+        std::copy(tmp, tmp + count, a);
+      }
+    };
+
+  }
+}
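
A minimal sanity check for the new RVV code path, as a hypothetical standalone program that is not part of the patch above: it assumes the header is reachable as "cpu/vec_rvv.h", that the specialization is keyed on TARGET_ISA exactly like the other vec_*.h headers, and that the binary is built with -march=rv64gcv and run on RVV-capable hardware or under qemu-riscv64. Because exp() uses the element-wise std::exp fallback, every lane should match the scalar result exactly, and reduce_add() should equal the plain sum of the four lanes.

// rvv_vec_check.cc -- hypothetical test harness, not part of the patch.
#include <cmath>
#include <cstdio>

#define TARGET_ISA CpuIsa::RVV   // mirrors the define added to kernels.cc
#include "cpu/vec_rvv.h"

int main() {
  using ctranslate2::cpu::CpuIsa;
  using V = ctranslate2::cpu::Vec<float, CpuIsa::RVV>;

  const float in[V::width] = {0.5f, -1.25f, 3.f, 2.5f};
  float out[V::width];

  // exp() is an element-wise std::exp fallback, so each lane should match the scalar value.
  V::store(V::exp(V::load(in)), out);
  for (int i = 0; i < V::width; ++i)
    std::printf("lane %d: exp=%g expected=%g\n", i, out[i], std::exp(in[i]));

  // reduce_add() should equal the plain scalar sum of the four lanes.
  std::printf("reduce_add=%g expected=%g\n",
              V::reduce_add(V::load(in)),
              in[0] + in[1] + in[2] + in[3]);
  return 0;
}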