
Commit acc9c85: Build instructions & fixes

Dagamies committed Jul 25, 2024
1 parent 424d8d1

Showing 2 changed files with 113 additions and 0 deletions.
56 changes: 56 additions & 0 deletions docs/ppc64le.md
@@ -0,0 +1,56 @@
# IBM Power10 (ppc64le)

CTranslate2 fully supports the IBM Power10 MMA and VSX extensions. Each Power10 core has 4 Matrix Math Accelerator (MMA) units. For optimum performance use at least SMT4; in some cases SMT8 performs better, so it is advisable to try both. A simple way to test this is to use the --intra_threads parameter to control the number of threads CTranslate2 runs with. At most this should be 8 * the number of physical cores (SMT-8).
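
As a starting point, the core count and current SMT mode can be checked before picking a thread count (a quick sketch; `ppc64_cpu` comes from the powerpc-utils package and may need to be installed):
```
# inspect the SMT level and core topology before choosing --intra_threads
ppc64_cpu --smt
lscpu | grep -E 'Thread|Core'
nproc   # logical CPUs = physical cores * SMT level
```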

Based on preliminary testing, a Power10 core offers 27-42% higher tokens/s than an Intel Xeon Gold core.

It should be possible to build for Power9, but the missing MMA units will have a significant impact on performance.
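
As a rough sketch of what a Power9 build would change (an untested assumption, not covered by this page), only the target flags in the CMake commands below would need to be swapped:
```
# untested assumption: Power9 target flags to substitute wherever -mcpu=power10 appears below
P9_FLAGS='-mcpu=power9 -mtune=power9 -O3 -maltivec'
# e.g. cmake -DCMAKE_CXX_FLAGS="$P9_FLAGS" ...
```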

oneDNN is used for the int8 matrix math and fully utilizes the MMA units; it should also be possible to build with OpenBLAS for 16-bit MMA usage.
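
As a sketch of that OpenBLAS alternative (untested here, and assuming an OpenBLAS built for Power is already installed), the CTranslate2 configuration shown later on this page would swap the backend options:
```
# untested sketch: configure CTranslate2 against OpenBLAS instead of oneDNN
cmake -DWITH_CUDA=OFF -DWITH_MKL=OFF -DWITH_DNNL=OFF -DWITH_OPENBLAS=ON \
      -DCMAKE_CXX_FLAGS='-mcpu=power10 -mtune=power10 -O3 -ffp-contract=off' \
      -DOPENMP_RUNTIME=COMP ..
```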

## Build a Docker / Podman container

This is the easy way:
```
git clone --recursive https://github.com/OpenNMT/CTranslate2/
cd CTranslate2/docker
podman build -t elinar.ai/ct2-ppc64le -f Dockerfile.ppc64le ..
```

Then run the CTranslate2 container (substitute the mount point, MODEL_LOCATION and SRC_FILE):
```
podman run --security-opt=label=disable --ipc=host --ulimit=host -it --rm -v /tmp:/tmp elinar.ai/ct2-ppc64le --model MODEL_LOCATION --src SRC_FILE --intra_threads 16
```
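
To compare the SMT-4 and SMT-8 style settings mentioned above, the same container run can simply be repeated with different thread counts (a sketch; 16 and 32 assume 4 physical cores, so adjust to 4x and 8x your own core count):
```
# sketch: rerun with different --intra_threads values and compare tokens/s
for t in 16 32; do
  podman run --security-opt=label=disable --ipc=host --ulimit=host -it --rm -v /tmp:/tmp \
    elinar.ai/ct2-ppc64le --model MODEL_LOCATION --src SRC_FILE --intra_threads $t
done
```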

## Install from sources
This build has been tested on RHEL 9 / ppc64le and requires the IBM Advance Toolchain 17.0 (https://www.ibm.com/support/pages/advance-toolchain-linux-power).
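
If the Advance Toolchain compilers should also be used for the oneDNN and CTranslate2 builds below, they can be selected up front; this is an assumption, since the original commands only pass them explicitly to the sleef step:
```
# assumption: make the Advance Toolchain 17.0 compilers the default for all steps below
export CC=/opt/at17.0/bin/gcc
export CXX=/opt/at17.0/bin/g++
```
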
```
# sleef
git clone -b 3.6.1 https://github.com/shibatch/sleef
cd sleef
mkdir build && cd build
cmake -DSLEEF_BUILD_INLINE_HEADERS=TRUE -DCMAKE_CXX_FLAGS='-mcpu=power10 -mtune=power10 -O3 -std=gnu++11 -maltivec -mabi=altivec -mstrict-align ' -DCMAKE_C_COMPILER=/opt/at17.0/bin/gcc -DCMAKE_CXX_COMPILER=/opt/at17.0/bin/g++ -DAT_PATH=/opt/at17.0/ -DBUILD_SHARED_LIBS=FALSE -DBUILD_TESTS=FALSE -DENFORCE_VSX3=TRUE -DSLEEF_SHOW_CONFIG=1 -DCMAKE_BUILD_TYPE=Release ..
cmake --build . -j --clean-first
sudo cmake --install . --prefix=/usr/
cd ../..
# oneDNN
git clone -b v3.2 --recursive https://github.com/oneapi-src/oneDNN
cd oneDNN
mkdir build && cd build
cmake -DCMAKE_CXX_FLAGS='-mcpu=power10 -mtune=power10 -O3 -maltivec' -DOPENMP_RUNTIME=COMP ..
make -j16
sudo make install
cd ../..
# CTranslate2
git clone --recursive https://github.com/Dagamies/CTranslate2
cd CTranslate2
mkdir build
cd build
cmake -DWITH_CUDA=OFF -DWITH_MKL=OFF -DWITH_OPENBLAS=OFF -DWITH_DNNL=ON -DCMAKE_CXX_FLAGS='-mcpu=power10 -mtune=power10 -O3 -ffp-contract=off' -DOPENMP_RUNTIME=COMP ..
make -j16
sudo make install
sudo ldconfig -v
export LD_LIBRARY_PATH=/usr/local/lib64/
```
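
Optionally, a quick sanity check that the freshly installed library is visible to the dynamic loader (a sketch; the library name and the /usr/local prefix assume the default install above):
```
# sketch: confirm the installed CTranslate2 library can be found
ldconfig -p | grep -i ctranslate2 || ls /usr/local/lib64/libctranslate2*
```
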
57 changes: 57 additions & 0 deletions src/cpu/vec_power10.h
@@ -206,6 +206,63 @@ namespace ctranslate2 {
float t1 = a[2] > a[3] ? a[2] : a[3];
return t0 > t1 ? t0 : t1;
}

static inline value_type round(value_type a) {
return vec_round(a);
}

template<typename U>
static inline void convert_and_store(value_type v, U* a, dim_t count) {
*a = v;
}

static inline void convert_and_store(value_type v, int8_t *a, dim_t count) {
// Convert each float lane to a signed 32-bit integer (vec_cts truncates toward zero),
// then narrow lane by lane to int8_t (no extra clamping) and copy the first `count` values out.
auto i32 = vec_cts(v, 0);
int8_t tmp[4];
tmp[0] = i32[0];
tmp[1] = i32[1];
tmp[2] = i32[2];
tmp[3] = i32[3];
std::copy(tmp, tmp + count, a);
}

static inline void convert_and_store(value_type v, uint8_t *a, dim_t count) {
// Convert each float lane to an unsigned 32-bit integer (vec_ctu truncates toward zero),
// then narrow lane by lane to uint8_t (no extra clamping) and copy the first `count` values out.
auto u32 = vec_ctu(v, 0);
uint8_t tmp[4];
tmp[0] = u32[0];
tmp[1] = u32[1];
tmp[2] = u32[2];
tmp[3] = u32[3];
std::copy(tmp, tmp + count, a);
}
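// The commented-out block below uses ARM NEON intrinsics (vcvtq_*, vqmovn_*, vst1_*) and is not
// compiled on Power10; it appears to be left in only as a reference implementation.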

/* static inline void convert_and_store(value_type v, int8_t *a, dim_t count) {
//convert float32x4_t to int32x4_t
auto i32x4 = vcvtq_s32_f32(v);
//then convert to int16x4_t
auto i16x4 = vqmovn_s32(i32x4);
//finally convert to int8x4_t
auto i8x8 = vqmovn_s16(vcombine_s16(i16x4, vdup_n_s16(0)));
int8_t tmp[8];
vst1_s8(tmp, i8x8);
std::copy(tmp, tmp + count, a);
}
static inline void convert_and_store(value_type v, uint8_t *a, dim_t count) {
//convert float32x4_t to uint32x4_t
auto u32x4 = vcvtq_u32_f32(v);
//then convert to uint16x4_t
auto u16x4 = vqmovn_u32(u32x4);
//finally convert to uint8x8_t
auto u8x8 = vqmovn_u16(vcombine_u16(u16x4, vdup_n_u16(0)));
uint8_t tmp[8];
vst1_u8(tmp, u8x8);
std::copy(tmp, tmp + count, a);
}*/
};
}
}
