Skip to content

Commit

Permalink
Update WNGT2020 images (#379)
Browse files (browse the repository at this point in the history)
  • Loading branch information
guillaumekln authored Jan 14, 2021
1 parent 10f4491 commit 8ba81ac
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 46 deletions.
41 changes: 20 additions & 21 deletions examples/wngt2020/Dockerfile.cpu
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM ubuntu:18.04 as builder
FROM ubuntu:20.04 as builder

RUN apt-get update && \
apt-get install -y --no-install-recommends \
Expand All @@ -14,30 +14,28 @@ RUN apt-get update && \

WORKDIR /root

RUN wget https://cmake.org/files/v3.12/cmake-3.12.2-Linux-x86_64.tar.gz
RUN tar xf cmake-3.12.2-Linux-x86_64.tar.gz && \
rm cmake-3.12.2-Linux-x86_64.tar.gz
ENV PATH=$PATH:/root/cmake-3.12.2-Linux-x86_64/bin
ENV CMAKE_VERSION=3.18.4
RUN wget -q https://github.com/Kitware/CMake/releases/download/v$CMAKE_VERSION/cmake-$CMAKE_VERSION-Linux-x86_64.tar.gz && \
tar xf *.tar.gz && \
rm *.tar.gz
ENV PATH=$PATH:/root/cmake-$CMAKE_VERSION-Linux-x86_64/bin

ENV MKL_VERSION=2020
ENV MKL_UPDATE=0
ENV MKL_BUILD=088
RUN wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB && \
apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS-*.PUB && \
rm GPG-PUB-KEY-INTEL-SW-PRODUCTS-*.PUB && \
echo "deb https://apt.repos.intel.com/mkl all main" > /etc/apt/sources.list.d/intel-mkl.list && \
ENV ONEAPI_VERSION=2021.1.1
ENV MKL_BUILD=52
RUN wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB && \
apt-key add *.PUB && \
rm *.PUB && \
echo "deb https://apt.repos.intel.com/oneapi all main" > /etc/apt/sources.list.d/oneAPI.list && \
apt-get update && \
apt-get install -y --no-install-recommends \
intel-mkl-64bit-$MKL_VERSION.$MKL_UPDATE-$MKL_BUILD && \
intel-oneapi-mkl-devel=$ONEAPI_VERSION-$MKL_BUILD \
&& \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

# The target architecture should be cascadelake but this requires updating GCC and libstd.
# We assume that we don't gain much from these compiler flags as all the heavy lifting is
# done by MKL.
ENV CXX_FLAGS="-march=skylake"
ENV CXX_FLAGS="-march=cascadelake"

ENV SENTENCEPIECE_VERSION=v0.1.8
ENV SENTENCEPIECE_VERSION=v0.1.95
RUN wget https://github.com/google/sentencepiece/archive/$SENTENCEPIECE_VERSION.tar.gz && \
tar xf $SENTENCEPIECE_VERSION.tar.gz && \
rm $SENTENCEPIECE_VERSION.tar.gz && \
Expand All @@ -49,14 +47,14 @@ RUN wget https://github.com/google/sentencepiece/archive/$SENTENCEPIECE_VERSION.
cd /root && \
rm -r sentencepiece-*

ENV CTRANSLATE2_VERSION=v1.10.0
ENV CTRANSLATE2_VERSION=v1.17.0
RUN wget https://github.com/OpenNMT/CTranslate2/archive/$CTRANSLATE2_VERSION.tar.gz && \
tar xf $CTRANSLATE2_VERSION.tar.gz && \
rm $CTRANSLATE2_VERSION.tar.gz && \
cd CTranslate2-* && \
mkdir build && \
cd build && \
cmake -DCMAKE_CXX_FLAGS=${CXX_FLAGS} -DLIB_ONLY=ON -DOPENMP_RUNTIME=NONE .. && \
cmake -DCMAKE_CXX_FLAGS=${CXX_FLAGS} -DLIB_ONLY=ON -DOPENMP_RUNTIME=NONE -DENABLE_CPU_DISPATCH=OFF .. && \
VERBOSE=1 make -j4 install && \
cd /root && \
rm -r CTranslate2-*
Expand All @@ -73,11 +71,12 @@ RUN mkdir /opt/wngt2020 && \
cp /root/build/run /opt/wngt2020 && \
cp /usr/local/lib/libctranslate2.so /opt/wngt2020

FROM ubuntu:18.04
FROM ubuntu:20.04

COPY --from=builder /opt/wngt2020 /opt/wngt2020
ENV LD_LIBRARY_PATH=/opt/wngt2020
ENV CT2_USE_EXPERIMENTAL_PACKED_GEMM=1
ENV CT2_TRANSLATORS_CORE_OFFSET=0

ARG MODEL_PATH
COPY ${MODEL_PATH} /model
Expand Down
15 changes: 8 additions & 7 deletions examples/wngt2020/Dockerfile.gpu
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,13 @@ RUN apt-get update && \

WORKDIR /root

RUN wget https://cmake.org/files/v3.12/cmake-3.12.2-Linux-x86_64.tar.gz
RUN tar xf cmake-3.12.2-Linux-x86_64.tar.gz && \
rm cmake-3.12.2-Linux-x86_64.tar.gz
ENV PATH=$PATH:/root/cmake-3.12.2-Linux-x86_64/bin
ENV CMAKE_VERSION=3.18.4
RUN wget -q https://github.com/Kitware/CMake/releases/download/v$CMAKE_VERSION/cmake-$CMAKE_VERSION-Linux-x86_64.tar.gz && \
tar xf *.tar.gz && \
rm *.tar.gz
ENV PATH=$PATH:/root/cmake-$CMAKE_VERSION-Linux-x86_64/bin

ENV SENTENCEPIECE_VERSION=v0.1.8
ENV SENTENCEPIECE_VERSION=v0.1.95
RUN wget https://github.com/google/sentencepiece/archive/$SENTENCEPIECE_VERSION.tar.gz && \
tar xf $SENTENCEPIECE_VERSION.tar.gz && \
rm $SENTENCEPIECE_VERSION.tar.gz && \
Expand All @@ -32,14 +33,14 @@ RUN wget https://github.com/google/sentencepiece/archive/$SENTENCEPIECE_VERSION.

ENV CUDA_ARCH_LIST="7.5"

ENV CTRANSLATE2_VERSION=e73e27fc7d86c0d22be018ea459cf7cedec72278
ENV CTRANSLATE2_VERSION=v1.17.0
RUN git clone https://github.com/OpenNMT/CTranslate2.git && \
cd CTranslate2 && \
git checkout $CTRANSLATE2_VERSION && \
git submodule update --init && \
mkdir build && \
cd build && \
cmake -DCMAKE_CXX_FLAGS=${CXX_FLAGS} -DLIB_ONLY=ON -DWITH_CUDA=ON -DWITH_MKL=OFF -DWITH_TENSORRT=OFF -DOPENMP_RUNTIME=NONE -DCUDA_ARCH_LIST="${CUDA_ARCH_LIST}" .. && \
cmake -DCMAKE_CXX_FLAGS=${CXX_FLAGS} -DLIB_ONLY=ON -DWITH_CUDA=ON -DWITH_MKL=OFF -DOPENMP_RUNTIME=NONE -DCUDA_ARCH_LIST="${CUDA_ARCH_LIST}" .. && \
VERBOSE=1 make -j4 install && \
cd /root && \
rm -r CTranslate2
Expand Down
30 changes: 12 additions & 18 deletions examples/wngt2020/main.cc
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#include <sentencepiece_processor.h>
#include <ctranslate2/translator_pool.h>
#include <ctranslate2/models/sequence_to_sequence.h>
#include <fstream>
#include <regex>

static std::vector<std::string> get_vocabulary_tokens(const ctranslate2::Vocabulary& vocabulary) {
Expand Down Expand Up @@ -50,20 +49,17 @@ int main(int, char* argv[]) {
if (!status.ok())
throw std::runtime_error("Failed to set the SentencePiece vocabulary");

auto reader = [&sp_processor](std::istream& in, std::vector<std::string>& tokens) {
std::string line;
if (!std::getline(in, line))
return false;
sp_processor.Encode(line, &tokens);
return true;
};
auto tokenizer = [&sp_processor](const std::string& text) {
std::vector<std::string> tokens;
sp_processor.Encode(text, &tokens);
return tokens;
};

auto writer = [&sp_processor](std::ostream& out,
const ctranslate2::TranslationResult& result) {
std::string text;
sp_processor.Decode(result.output(), &text);
out << std::regex_replace(text, std::regex("<unk>"), "UNK") << '\n';
};
auto detokenizer = [&sp_processor](const std::vector<std::string>& tokens) {
std::string text;
sp_processor.Decode(tokens, &text);
return std::regex_replace(text, std::regex("<unk>"), "UNK");
};

ctranslate2::TranslationOptions options;
options.beam_size = 1;
Expand All @@ -73,9 +69,7 @@ int main(int, char* argv[]) {
options.use_vmap = true;
options.return_scores = false;

std::ifstream in(in_file);
std::ofstream out(out_file);
pool.consume_stream(in, out, max_batch_size * 8, options, reader, writer);
out.flush();
const size_t read_batch_size = max_batch_size * 16;
pool.consume_raw_text_file(in_file, out_file, tokenizer, detokenizer, read_batch_size, options);
return 0;
}

0 comments on commit 8ba81ac

Please sign in to comment.