diff --git a/docs/ppc64le.md b/docs/ppc64le.md
new file mode 100644
index 000000000..4828aad2b
--- /dev/null
+++ b/docs/ppc64le.md
@@ -0,0 +1,56 @@
+# IBM Power10 - ppc64le
+
+CTranslate2 fully supports the IBM Power10 MMA and VSX extensions. Each Power10 core has 4 Matrix Math Accelerator (MMA) units. For optimum performance use at least SMT4; in some cases SMT8 seems to perform better, so it is advisable to try both. A simple way to test this is the `--intra_threads` parameter, which controls the number of threads CTranslate2 executes (see the example after the container instructions below). At maximum this should be 8 × the number of physical cores (SMT8).
+
+Based on preliminary testing, a Power10 core offers 27-42% higher tokens/s than an Intel Gold core.
+
+It should be possible to build for Power9, but the missing MMA units will have a significant impact on performance.
+
+oneDNN is used for the int8 matrix math and fully utilizes the MMA units; it should also be possible to build with OpenBLAS for 16-bit MMA usage.
+
+## Build docker / podman container
+
+This is the easy way:
+```
+git clone --recursive https://github.com/OpenNMT/CTranslate2/
+cd CTranslate2/docker
+podman build -t elinar.ai/ct2-ppc64le -f Dockerfile.ppc64le ..
+```
+
+Then run the CTranslate2 container (substitute the mount point, MODEL_LOCATION and SRC_FILE):
+```
+podman run --security-opt=label=disable --ipc=host --ulimit=host -it --rm -v /tmp:/tmp elinar.ai/ct2-ppc64le --model MODEL_LOCATION --src SRC_FILE --intra_threads 16
+```
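+
+For example, to compare SMT4 and SMT8, run the same job with different thread counts and keep whichever yields more tokens/s. A minimal sketch (the counts assume 4 physical cores, i.e. 16 threads for SMT4 and 32 for SMT8; adjust for your machine):
+```
+# SMT4: 4 threads per physical core
+podman run --security-opt=label=disable --ipc=host --ulimit=host -it --rm -v /tmp:/tmp elinar.ai/ct2-ppc64le --model MODEL_LOCATION --src SRC_FILE --intra_threads 16
+# SMT8: 8 threads per physical core
+podman run --security-opt=label=disable --ipc=host --ulimit=host -it --rm -v /tmp:/tmp elinar.ai/ct2-ppc64le --model MODEL_LOCATION --src SRC_FILE --intra_threads 32
+```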
+
+## Install from sources
+
+This build has been tested on RHEL 9 / ppc64le and requires IBM Advance Toolchain 17.0 (https://www.ibm.com/support/pages/advance-toolchain-linux-power):
+```
+# sleef:
+git clone -b 3.6.1 https://github.com/shibatch/sleef
+cd sleef
+mkdir build && cd build
+cmake -DSLEEF_BUILD_INLINE_HEADERS=TRUE -DCMAKE_CXX_FLAGS='-mcpu=power10 -mtune=power10 -O3 -std=gnu++11 -maltivec -mabi=altivec -mstrict-align' -DCMAKE_C_COMPILER=/opt/at17.0/bin/gcc -DCMAKE_CXX_COMPILER=/opt/at17.0/bin/g++ -DAT_PATH=/opt/at17.0/ -DBUILD_SHARED_LIBS=FALSE -DBUILD_TESTS=FALSE -DENFORCE_VSX3=TRUE -DSLEEF_SHOW_CONFIG=1 -DCMAKE_BUILD_TYPE=Release ..
+cmake --build . -j --clean-first
+sudo cmake --install . --prefix=/usr/
+
+# oneDNN:
+git clone -b v3.2 --recursive https://github.com/oneapi-src/oneDNN
+cd oneDNN
+mkdir build && cd build
+cmake -DCMAKE_CXX_FLAGS='-mcpu=power10 -mtune=power10 -O3 -maltivec' -DOPENMP_RUNTIME=COMP ..
+make -j16
+sudo make install
+
+# CTranslate2:
+git clone --recursive https://github.com/Dagamies/CTranslate2
+cd CTranslate2
+mkdir build && cd build
+cmake -DWITH_CUDA=OFF -DWITH_MKL=OFF -DWITH_OPENBLAS=OFF -DWITH_DNNL=ON -DCMAKE_CXX_FLAGS='-mcpu=power10 -mtune=power10 -O3 -ffp-contract=off' -DOPENMP_RUNTIME=COMP ..
+make -j16
+sudo make install
+sudo ldconfig -v
+export LD_LIBRARY_PATH=/usr/local/lib64/
+```
\ No newline at end of file
diff --git a/src/cpu/vec_power10.h b/src/cpu/vec_power10.h
index c87242184..de8ec6955 100644
--- a/src/cpu/vec_power10.h
+++ b/src/cpu/vec_power10.h
@@ -206,6 +206,63 @@ namespace ctranslate2 {
         float t1 = a[2] > a[3] ? a[2] : a[3];
         return t0 > t1 ? t0 : t1;
       }
+
+      static inline value_type round(value_type a) {
+        return vec_round(a);
+      }
+
+      // Fallback for element types without a dedicated overload.
+      template <typename U>
+      static inline void convert_and_store(value_type v, U* a, dim_t count) {
+        *a = v;
+      }
+
+      static inline void convert_and_store(value_type v, int8_t* a, dim_t count) {
+        // Convert the 4 float lanes to int32 (scale 0), then narrow lane by
+        // lane to int8 (truncating, no saturation) and store `count` values.
+        auto i32 = vec_cts(v, 0);
+        int8_t tmp[4];
+        tmp[0] = i32[0];
+        tmp[1] = i32[1];
+        tmp[2] = i32[2];
+        tmp[3] = i32[3];
+        std::copy(tmp, tmp + count, a);
+      }
+
+      static inline void convert_and_store(value_type v, uint8_t* a, dim_t count) {
+        // Same pattern for unsigned values: float -> uint32 -> uint8.
+        auto u32 = vec_ctu(v, 0);
+        uint8_t tmp[4];
+        tmp[0] = u32[0];
+        tmp[1] = u32[1];
+        tmp[2] = u32[2];
+        tmp[3] = u32[3];
+        std::copy(tmp, tmp + count, a);
+      }
     };
   }
 }
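
A minimal standalone sketch of the round-then-narrow pattern used by the new `convert_and_store` overloads (illustrative only; the file name and `main` are hypothetical, and a Power10/VSX toolchain such as `g++ -mcpu=power10 -maltivec` is assumed):
```
// demo.cc (hypothetical): exercises the same vec_round + vec_cts pattern
// introduced in src/cpu/vec_power10.h above.
#include <altivec.h>
#include <cstdint>
#include <cstdio>

int main() {
  __vector float v = {1.4f, -2.6f, 3.5f, 120.7f};
  __vector float r = vec_round(v);          // round each lane to nearest
  __vector signed int i32 = vec_cts(r, 0);  // float32x4 -> int32x4, scale 0

  // Narrow lane by lane; as in the diff, this truncates without saturation,
  // so values are assumed to already fit in the int8 range.
  int8_t out[4];
  for (int i = 0; i < 4; ++i)
    out[i] = static_cast<int8_t>(i32[i]);

  for (int i = 0; i < 4; ++i)
    std::printf("%d ", out[i]);             // prints: 1 -3 4 121
  std::printf("\n");
  return 0;
}
```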