Merged PR 32883: Pymarian improvements
List of changes/updates/fixes to pymarian
* Rename model IDs to match Hugging Face (e.g., comet22-da -> wmt22-comet-da)
* Rename the CLI to something shorter: pymarian-evaluate -> pymarian-eval
* Rename pymarian.evaluate.py -> pymarian.eval.py to reflect the CLI name
* Move the functional code from pymarian.eval.py into an `Evaluator` class (goal: allow reuse of an `Evaluator` object for scoring many small files, as in the WMT metrics task; see the sketch after this list)
* Use mmap-able *.bin models instead of *.npz
* Download *.bin and *.spm files individually instead of a .tgz archive. The plan is to support quantized/GEMM models in the future; downloading a .tgz would get too expensive since we do not need all variants of a model (.npz, .bin, fp32, fp16, avx512, ...)
* Use a file-locking mechanism (based on `portalocker`) to avoid race conditions between parallel download processes (see the locking sketch below)
* Added an optional `-v/--vocab` argument to pymarian-eval
* Added a `--fields|-f` argument: supports `src mt ref` or a subsequence of it; raises an error when required fields are missing and ignores extra fields (see the subsequence-check sketch after the script notes)
* Pymarian build improvements: strict about the Python version match between the package and the native extension. Also removed the custom logic for extension detection; uses `EXT_SUFFIX` from `sysconfig` (i.e., `sysconfig.get_config_var("EXT_SUFFIX")`) instead
* Added a `--like` argument for local models
* Ran black and isort to fix code formatting issues
* Use pypdl for parallel downloads
* Added regression tests for pymarian
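A rough sketch of the reuse pattern the `Evaluator` class is meant to enable: load the model once, then score many small files. The constructor arguments, method name, and paths below are illustrative assumptions, not the PR's actual API; it also shows the new `--like` flag for local models:

```python
# Sketch only: reuse one Evaluator so the model is loaded a single time.
# Constructor/method names and paths are assumptions; consult pymarian docs.
from pymarian import Evaluator

args = "-m metric/model.bin -v metric/vocab.spm metric/vocab.spm --like comet-qe"
evaluator = Evaluator(args)  # assumed: accepts marian CLI-style arguments

for path in ["sys1.tsv", "sys2.tsv"]:  # e.g., one small file per WMT system
    with open(path) as f:
        # one example per line, tab-separated fields (here: src, mt)
        batch = [line.rstrip("\n").split("\t") for line in f]
    scores = evaluator.evaluate(batch)  # assumed method name
    print(path, scores[:3])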

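The download locking boils down to taking an exclusive lock on a per-model lock file before checking and populating the cache. A minimal sketch of the pattern, assuming an illustrative cache layout and a placeholder download URL (neither is the PR's actual code):

```python
# Sketch of serializing parallel downloads with portalocker.
import os
import urllib.request

import portalocker

def download_model(model_id: str, model_dir: str) -> None:
    """Placeholder: fetch model.bin and vocab.spm into model_dir (URL is hypothetical)."""
    os.makedirs(model_dir, exist_ok=True)
    base = f"https://example.com/models/{model_id}"  # hypothetical URL
    for name in ("model.bin", "vocab.spm"):
        urllib.request.urlretrieve(f"{base}/{name}", os.path.join(model_dir, name))

def ensure_model(model_id: str, cache_dir: str) -> str:
    model_dir = os.path.join(cache_dir, model_id)
    os.makedirs(cache_dir, exist_ok=True)
    # Exclusive lock: the first process downloads; parallel processes block
    # here, then find the cached files and skip the download.
    with portalocker.Lock(model_dir + ".lock", timeout=600):
        if not os.path.exists(os.path.join(model_dir, "model.bin")):
            download_model(model_id, model_dir)
    return model_dir
```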
--

Other scripts
* Added `convert-all-models.sh`: converts PyTorch models to Marian .npz, converts .npz to .bin, and creates a directory structure compatible with pymarian-eval
* Added `compare.sh` to compare metric scores between the original implementations and pymarian
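The `--fields` validation mentioned in the list above reduces to a subsequence test against `src mt ref`; a sketch of the idea (not the PR's actual code):

```python
# Sketch: validate that the given fields form a subsequence of (src, mt, ref).
EXPECTED = ("src", "mt", "ref")

def parse_fields(arg: str) -> tuple:
    fields = tuple(arg.split())
    expected_iter = iter(EXPECTED)
    # membership tests consume the iterator, so this also enforces order
    if not all(f in expected_iter for f in fields):
        raise ValueError(f"--fields must be a subsequence of {EXPECTED}, got {fields}")
    return fields

print(parse_fields("src mt"))   # ok: ('src', 'mt')
print(parse_fields("mt src"))   # raises ValueError: out of order
```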
Thamme Gowda committed Feb 15, 2024
1 parent 22ed792 commit 9e40ac3
Showing 40 changed files with 1,514 additions and 923 deletions.
7 changes: 7 additions & 0 deletions .dockerignore
@@ -0,0 +1,7 @@
/regression-tests
/build*
/.pytest_cache
/.vscode
/dist
/doc
.history*
4 changes: 4 additions & 0 deletions .gitignore
@@ -69,3 +69,7 @@ examples/mnist/*ubyte
*.whl
*.egg-info
src/python/pymarian/_version.py
src/python/tests/data
__pycache__
.pytest_cache

1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Fixed compilation with clang 16.0.6

### Added
- Added `pymarian-eval`, a CLI for scoring metrics
- Added `--input-reorder pos1 pos2` option to re-order inputs internally when reading in batches. This is mostly a model property.
- Added `pymarian`: python bindings based on pybind11
- Added implementation of COMET-KIWI
13 changes: 11 additions & 2 deletions azure-regression-tests.yml
@@ -42,7 +42,7 @@ stages:
sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1
sudo update-alternatives --set python3 /usr/bin/python3.8
sudo apt-get install -y python3-pip
python3 -m pip install --upgrade Cython
python3 -m pip install --upgrade Cython pip
displayName: Clean and install packages
# Collect details about CPU and GPU.
@@ -105,7 +105,8 @@
-DCOMPILE_SERVER=on \
-DCOMPILE_TESTS=on \
-DCOMPILE_MAXWELL=on -DCOMPILE_PASCAL=off -DCOMPILE_VOLTA=off -DCOMPILE_TURING=off -DCOMPILE_AMPERE=off -DCOMPILE_AMPERE_RTX=off \
-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-11.1
-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-11.1 \
-DPYMARIAN=on -DUSE_TCMALLOC=off -DPYTHON_EXECUTABLE=python3
displayName: Configure CMake
- bash: make -j5
@@ -141,6 +142,14 @@
displayName: Collect outputs
workingDirectory: regression-tests
- bash: |
python3 -m pip install build/pymarian-*.whl
python3 -m pymarian -v
python3 -m pip install pytest
python3 -m pytest src/python/tests/regression
displayName: Pymarian Install and Test
- publish: regression-tests-ci-public_linux-x64-static_cuda_m60.zip
artifact: regression-tests-ci-public_linux-x64-static_cuda_m60
displayName: Publish outputs
1 change: 1 addition & 0 deletions scripts/bleurt/bleurt2marian.py
100644 → 100755
@@ -57,6 +57,7 @@ def load_bleurt_model():
config["bert-type-vocab-size"] = 2
config["comet-prepend-zero"] = True
config["input-join-fields"] = True
config["input-reorder"] = [1, 0] # bleurt expects ref < hyp order while embedding, we are providing hyp < ref, hence the reordering
config["version"] = "bleurt2marian.py conversion"
config["enc-depth"] = 0

6 changes: 4 additions & 2 deletions scripts/metrics/.gitignore
@@ -1,2 +1,4 @@
bins/
tmp.*
/bins
tmp.*
/workspace
/marian-metric
26 changes: 17 additions & 9 deletions scripts/metrics/Dockerfile
@@ -1,10 +1,13 @@
FROM mcr.microsoft.com/azureml/minimal-ubuntu20.04-py38-cuda11.6.2-gpu-inference:20231102.v2
# syntax = docker/dockerfile:experimental
FROM mcr.microsoft.com/azureml/minimal-ubuntu22.04-py39-cuda11.8-gpu-inference:20240205.v2
# use this if microsoft image is not accessible;
#FROM nvidia/cuda:11.1.1-devel-ubuntu20.04
LABEL description="Marian image - Ubuntu 20.04"
LABEL description="Marian image - Ubuntu 22.04"

# required for microsoft cr image
USER root

ARG DEBIAN_FRONTEND=noninteractive
ARG NCPU=24
ARG MARIAN_REPO="https://github.com/marian-nmt/marian-dev"
ARG MARIAN_BRANCH=master

@@ -18,9 +21,10 @@ RUN ln -sf /usr/bin/python3 /usr/bin/python && \

# install unbabel-comet (requires pytorch) and bleurt (requires tensorflow and cudnn)
# note: unbabel-comet 2.x is broken, use 1.x; requires numpy < 1.24
#&& pip install torch==1.13.1+cu117 -f https://download.pytorch.org/whl/torch_stable.html \
RUN pip install --upgrade pip \
&& pip install torch==1.13.1+cu117 -f https://download.pytorch.org/whl/torch_stable.html \
&& pip install sacrebleu unbabel-comet==1.1.3 numpy==1.23.5 nvidia-cudnn-cu11==8.6.0.163 git+https://github.com/google-research/bleurt.git \
&& pip install torch==2.1.2 --index-url https://download.pytorch.org/whl/cu118 \
&& pip install sacrebleu unbabel-comet==2.2.1 git+https://github.com/google-research/bleurt.git \
&& rm -rf ~/.cache/pip/

# Install sentencepiece
@@ -38,7 +42,11 @@ RUN pip3 uninstall -y sentencepiece && \
cd ../../.. && \
rm -rf src

RUN git clone -b ${MARIAN_BRANCH} ${MARIAN_REPO} /marian \
&& mkdir /marian/build && cd /marian/build \
&& cmake .. -DUSE_MPI=on -DUSE_STATIC_LIBS=off -DCOMPILE_PASCAL=on -DCOMPILE_VOLTA=on -DCOMPILE_AMPERE=off -DBUILD_ARCH=x86-64 -DCOMPILE_AVX512=off \
&& make -j $NCPU && cp -v marian spm_encode spm_decode /usr/bin/ \
# add source repo (excluding files matched by .dockerignore)
ADD . /marian-dev
# RUN git clone -b ${MARIAN_BRANCH} ${MARIAN_REPO} /marian \

RUN --mount=type=cache,target=/marian-dev/build mkdir -p /marian-dev/build && cd /marian-dev/build \
&& cmake .. -DUSE_MPI=on -DUSE_STATIC_LIBS=on -DCOMPILE_PASCAL=on -DCOMPILE_VOLTA=on -DCOMPILE_AMPERE=off -DBUILD_ARCH=x86-64 -DCOMPILE_AVX512=off -DPYMARIAN=on \
&& make -j && cp -v marian spm_encode spm_decode /usr/bin/ \
&& pip install -v pymarian-*.whl
53 changes: 29 additions & 24 deletions scripts/metrics/README.md
@@ -1,36 +1,41 @@
# Marian Evaluate
# Marian Metrics

The main script is `compare.sh`; however, it needs to be run in an environment where all three -- marian, unbabel-comet (pytorch), and bleurt (tensorflow) -- are available.
Hence, 1) we create a docker container with all the necessary libs, and 2) run compare.sh inside the docker environment.
Hence we create a new python environment using conda to run comparisons.

## Setup: build docker image
## Setup

```bash
./setup.sh
./run.sh
```
This sets up a conda environment named `metrics` with all the necessary requirements, except pymarian-eval, which you will have to install based on your CMake settings:
```bash
# from the root dir of this repository
conda activate metrics
mkdir build; cd build
cmake .. -DPYMARIAN=on #.. other flags
pip install pymarian-*.whl
```
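A quick sanity check of the install (a sketch, not from the original README; it assumes the generated `_version.py` exposes `__version__`). The CI pipeline in this PR uses `python3 -m pymarian -v` for the same purpose:

```python
# Sanity check: importing pymarian loads the native extension, so a clean
# import plus a version print confirms the package/extension match.
import pymarian
print(pymarian.__version__)
```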

## Run compare.sh in docker container
## Run Compare.sh

```bash
./docker-run.sh

# option 1:
./run.sh

# option 2
conda activate metrics
bash compare.sh
```
The `docker-run.sh` script mounts the cache directory from the host into the container.
The necessary files (weights and vocabularies) will be automatically downloaded and cached for unbabel-comet and Bleurt metrics.
However, `marian-score.sh` expects the cache to be prepared under `$HOME/.cache/marian/metrics`.
The structure/format of the cache directory for marian-score.sh looks as follows:

This script produces reports at `workspace/*.report.txt`, which show the average difference in segment-level scores between the original implementation and `pymarian-eval`.

## Convert Metrics Weights to Marian format

```bash
/home/$USER/.cache/marian/metrics/
├── bleurt20-ref
│ ├── bleurt-20.model.npz
│ ├── bleurt.vocab.spm
├── comet20-da-src
│ ├── comet20-qe-da.model.npz
│ └── roberta.vocab.spm
└── comet20-da-src+ref
├── comet20-da.model.npz
└── roberta.vocab.spm
conda activate metrics
MARIAN=../build/marian ./convert-all-models.sh
```
Each metric subdirectory should have `*model.npz` and `*vocab.spm` files, and the name of the metric directory should end with a `-src|-qe|-ref|-src+ref` suffix to indicate the category of the metric.

> TODO: Upload Marian compatible comet and bleurt models to public blob storage and modify script to automatically download

To add a new model ID, edit the `known-models.txt` file in the same directory as this README.
121 changes: 79 additions & 42 deletions scripts/metrics/compare.sh
@@ -1,20 +1,47 @@
#!/usr/bin/env bash

# This script compares the scores produced by
# original implementation (unbabel-score or BLEURT) and Marian NMT (pymarian-eval).


MYDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
export PATH=$MYDIR:$PATH
OUT_DIR=$MYDIR/workspace
REGEN_ORIG=0 # 1 : clear and regenerate original scores. 0: reuse previous scores
REGEN_MARIAN=0 # 1 : to clear and regenerate marian scores (recommended). 0: reuse / resume from previous scores

DEVICES=0
cd $MYDIR
export CUDA_VISIBLE_DEVICES=0

# add source to python path to test changes before installing
# export PYTHONPATH=$(cd $MYDIR/../../src/python && pwd)

log() {
echo -e "\e[1;32m[$(date '+%Y-%m-%d %H:%M:%S')]\e[0m $@" >&2
}

for tool in comet-score pymarian-eval; do
which $tool > /dev/null || {
log "ERROR: $tool not found in PATH"
exit 1
}
done


METRIC_NAMES=$(cat $MYDIR/known-models.txt | grep -v '^#' | awk '{print $1}')
# exclude xxl, they require more memory
METRIC_NAMES=$(grep -v -i '\-xxl\|xcomet' <<< $METRIC_NAMES)

get_sacrebleu_names(){
set -eu
# using sacrebleu to get the list of systems
testset=wmt21/systems
while read line; do
pair=$(cut -f1 -d':' <<< $line)
refs=()
mts=()
while read name; do
# skip if name starts with $pair or src or docid
# skip if name starts with $pair or src or docidq
if [[ $name == $pair* || $name == src || $name == docid || $name == origlang ]]; then
continue
fi
@@ -29,12 +56,15 @@ get_sacrebleu_names(){
for ref in ${refs[@]}; do
for mt in ${mts[@]}; do
echo -e "$testset\t$pair\t$ref\t$mt"
break # limit to one per lang pair
done
break # limit to one per lang pair
done
done < <(sacrebleu -t $testset --list)
}

unbabel_score(){
set -eu
local metric=$1
local prefix=$2
log "Running $metric"
@@ -45,6 +75,7 @@


bleurt_score() {
set -eu
local metric_name=$1
local prefix=$2
[[ $metric_name == "BLEURT-20" ]] || {
@@ -63,54 +94,60 @@

# to check if cuda libs are configured and GPU is available
# python -c "import tensorflow as tf; print(tf.config.list_physical_devices('GPU'))"
export LD_LIBRARY_PATH=/usr/local/lib/python3.8/dist-packages/nvidia/cudnn/lib/:$LD_LIBRARY_PATH
#export LD_LIBRARY_PATH=/usr/local/lib/python3.8/dist-packages/nvidia/cudnn/lib/:$LD_LIBRARY_PATH
python -m bleurt.score_files --bleurt_checkpoint=$metric_path \
--candidate_file=$prefix.mt --reference_file=$prefix.ref \
--bleurt_batch_size 64 2> /dev/null
}

marian_score() {
local metric=$1
local prefix=$2
case $metric in
wmt20-comet-qe-da) metric="comet20-da-src" ;;
wmt20-comet-da) metric="comet20-da-src+ref" ;;
BLEURT-20) metric="bleurt20-ref" ;;
*) echo "Unknown metric $metric"; exit 1;;
esac
marian-score.sh -d '0' -n $metric --src $prefix.src --ref $prefix.ref --mt $prefix.mt --seg
}

MAX_TESTS=10
MAX_LINES=100 # in each testset
mkdir -p $OUT_DIR

while IFS=$'\t' read tset pair ref mt; do
data=$(sacrebleu -t $tset -l $pair --echo src ref $mt)
prefix=$OUT_DIR/${tset//\//-}.$pair.$MAX_LINES

[[ -s $prefix.src ]] || cut -f1 <<< "$data" | head -n $MAX_LINES > $prefix.src
[[ -s $prefix.ref ]] || cut -f2 <<< "$data" | head -n $MAX_LINES > $prefix.ref
[[ -s $prefix.mt ]] || cut -f3 <<< "$data" | head -n $MAX_LINES > $prefix.mt

report_file=$prefix.report.txt
echo "####$(date '+%Y-%m-%d %H:%M:%S') :: $(pymarian-eval -V) :: Avg diffs" | tee -a $report_file

main() {
cd $MYDIR
local metric_names=(BLEURT-20 wmt20-comet-da wmt20-comet-qe-da)
export CUDA_VISIBLE_DEVICES=0
local max_tests=10
local max_lines=100 # in each testset
while IFS=$'\t' read tset pair ref mt; do
for mn in ${metric_names[@]}; do
log "Comparing >> $mn << on $tset $pair $ref $mt"
local data=$(sacrebleu -t $tset -l $pair --echo src ref $mt)
local tmp_pref=tmp.testset
rm -rf $tmp_pref.{src,ref,mt}
cut -f1 <<< "$data" | head -n $max_lines > $tmp_pref.src
cut -f2 <<< "$data" | head -n $max_lines > $tmp_pref.ref
cut -f3 <<< "$data" | head -n $max_lines > $tmp_pref.mt
for mn in ${METRIC_NAMES[@]}; do
log "Comparing >> $mn << on $tset $pair $ref $mt"
metric_id=$(basename $mn | tr '[:upper:]' '[:lower:]')
score_pref=$prefix.$metric_id
orig_file=$score_pref.orig
if [[ ! -s $orig_file || $REGEN_ORIG -eq 1 ]]; then
rm -f $score_pref # cleanup
log "Generating original scores for $mn :: $prefix"
if [[ $mn =~ BLEURT* ]]; then
local orig_out=$(bleurt_score $mn $tmp_pref)
bleurt_score $mn $prefix > $orig_file
else
local orig_out=$(unbabel_score $mn $tmp_pref 2> /dev/null)
unbabel_score $mn $prefix 2> /dev/null > $orig_file
fi
local marian_out=$(marian_score $mn $tmp_pref)
paste <(echo "$marian_out") <(echo "$orig_out") \
| awk -F '\t' -v OFS='\t' -v mn=$mn \
'BEGIN {tot=0.0} {diff=sqrt(($1-$2)^2); tot+=diff; print diff,$0}
END {printf "\n===Avg diff in %s: %f===\n\n", mn, tot/NR}'
#TODO1: extract averages and write to a report file
#TODO2: benchmark speeds
done
done < <(get_sacrebleu_names | head -n $max_tests)
}
fi

out_file=$score_pref.pymarian
if [[ ! -s $out_file || $REGEN_MARIAN -eq 1 ]]; then
rm -f $out_file $out_file.log # cleanup
log "Generating Marian scores for $mn :: $prefix"
pymarian-eval -d $DEVICES -m $(basename $mn) -s $prefix.src -r $prefix.ref -t $prefix.mt -a skip --fp16 --debug > $out_file 2> $out_file.log || {
log "ERROR: Failed to generate scores for $mn"
cat $out_file.log
continue
}
fi

# compute diffs
paste $out_file $orig_file \
| awk -F '\t' -v OFS='\t' -v mn=$mn -v of=$out_file.diff 'BEGIN {tot=0.0}
{$2 = +sprintf("%.4f", $2); diff=sqrt(($1-$2)^2); tot+=diff; print diff, $0 > of}
END {printf "%s\t%f\n", mn, tot/NR}' | tee -a $report_file
done
done < <(get_sacrebleu_names | head -n $MAX_TESTS)

main "$@"
cat $OUT_DIR/*.report.txt #| column -t