
feat: write a regression test suite that can be sbatch submitted #616

Merged
merged 9 commits into from
Jan 31, 2025
3 changes: 2 additions & 1 deletion .github/workflows/test.yml
@@ -17,7 +17,8 @@ jobs:
uses: actions/checkout@v4
with:
submodules: recursive
- run: sudo apt-get install sox libsox-dev ffmpeg
- run: sudo apt-get update
- run: sudo apt-get install --fix-missing sox libsox-dev ffmpeg
- uses: actions/setup-python@v5
with:
python-version: "3.10"
12 changes: 6 additions & 6 deletions README.md
@@ -12,15 +12,15 @@ This is the Text-to-Speech (TTS) toolkit used by the Small Teams "Speech Generat

## Quickstart from PyPI

- Install Python 3.10 or 3.11 and create a venv or a conda env for EveryVoice.

- Install `sox`. On Ubuntu, `sudo apt-get install sox libsox-dev` should work. If not, use Conda and run `conda install sox -c conda-forge` in your EveryVoice environment.

- Install `ffmpeg`. On Ubuntu, `sudo apt-get install ffmpeg` should work. If not, use Conda and run `conda install ffmpeg` in your EveryVoice environment.

- Install `torch` and `torchaudio` version 2.1.0 for your platform and CUDA version: follow the instructions at https://pytorch.org/get-started/locally/ but specify `torch==2.1.0 torchaudio==2.1.0` in the install command and remove `torchvision`.

- Run `pip install everyvoice==0.2.0a1` (change the version to the current version if needed).

## Quickstart from source

@@ -68,7 +68,7 @@ This repo follows the [Contributor Covenant](http://contributor-covenant.org/ver
Please make sure our standard Git hooks are activated, by running these commands in your sandbox (if you used our `make-everyvoice-env` script then this step is already done for you):

```sh
pip install -r requirements.dev.txt
pip install -e .[dev]
pre-commit install
gitlint install-hook
git submodule foreach 'pre-commit install'
3 changes: 3 additions & 0 deletions everyvoice/tests/regression/.gitignore
@@ -0,0 +1,3 @@
EV-regress.*
EV-r-main.*
regress-*
43 changes: 43 additions & 0 deletions everyvoice/tests/regression/README.md
@@ -0,0 +1,43 @@
# EveryVoice regression test suite

## Preparing the regression training data

- Download LJ 1.1 from https://keithito.com/LJ-Speech-Dataset/
- Download Sinhala TTS from https://openslr.org/30/
- Download High quality TTS data for four South African languages (af, st, tn,
xh) from https://openslr.org/32
- See [`prep-datasets.sh`](prep-datasets.sh) for where these datasets are expected to be found.
- Run this to create the regression testing directory structure:

```sh
export ACTIVATE_SCRIPT=$HOME/start-ev.sh
export SGILE_DATASET_ROOT=$HOME/sgile/data

mkdir regress-1 # or any suffix you want
cd regress-1
../prep-datasets.sh
```

## Running the regression tests

On a Slurm cluster:

```sh
for dir in regress-*; do
pushd $dir
sbatch ../../regression-test.sh
popd
done
```

Or, if you're not on a cluster, run `../../regression-test.sh` with `bash` directly in the loop instead.
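For example, without Slurm the same loop can invoke the script with `bash` (this mirrors what `go.sh` does when `sbatch` is unavailable):

```sh
# No Slurm available: run the regression scripts sequentially with bash
for dir in regress-*; do
    pushd "$dir"
    bash ../../regression-test.sh
    popd
done
```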

## One script to run them all

All the above can be accomplished by running `go.sh`.

## Cluster parameters

The scripts hardcode NRC's default Slurm cluster parameters. Add `--partition=... --account=...`
to the `sbatch` commands to override them, or edit `go.sh` and `regression-test.sh` with
your own partition and account settings to request nodes with GPUs available.
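For example, a hypothetical submission that overrides the hardcoded defaults (the partition and account names below are placeholders; substitute the values for your cluster):

```sh
# Placeholder partition/account names: replace with your cluster's values
sbatch --partition=gpu_a100 --account=my_gpu_account ../../regression-test.sh
```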
7 changes: 7 additions & 0 deletions everyvoice/tests/regression/combine-coverage.sh
@@ -0,0 +1,7 @@
#!/bin/bash

# coverage combine takes data files as arguments, not on stdin
find . -name '.coverage*' -print0 | xargs -0 coverage combine --keep
coverage report --include='*/everyvoice/*' | sed 's/.*EveryVoice\/everyvoice/everyvoice/' > coverage.txt
coverage html --include='*/everyvoice/*'
coverage xml --include='*/everyvoice/*'
sed -i 's/"[^"]*EveryVoice.everyvoice/"everyvoice/g' coverage.xml
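As a sanity check, the path-trimming substitution used on the coverage report can be exercised in isolation on a sample line (the absolute path below is a made-up example):

```sh
# A made-up absolute path as it might appear in the coverage report
line='/home/user/EveryVoice/everyvoice/model/feature_prediction.py  85%'
echo "$line" | sed 's/.*EveryVoice\/everyvoice/everyvoice/'
# prints: everyvoice/model/feature_prediction.py  85%
```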
64 changes: 64 additions & 0 deletions everyvoice/tests/regression/go.sh
@@ -0,0 +1,64 @@
#!/bin/bash

#SBATCH --job-name=EV-r-main
#SBATCH --partition=standard
#SBATCH --account=nrc_ict
Member: Maybe we can do what @SamuelLarkin does and provide the --account and --partition for all clusters.

# On GPSC5
#SBATCH --partition=standard
#SBATCH --account=nrc_ict
# On GPSC7
##SBATCH --partition=gpu_a100
##SBATCH --account=nrc_ict__gpu_a100
# On GPSC-C
##SBATCH --partition=gpu_v100
##SBATCH --account=nrc_ict__gpu_v100
# On Trixie
##SBATCH --partition=TrixieMain,JobTesting
##SBATCH --account=dt-mtp

Member: (I just pulled this from one of the sample scripts that @SamuelLarkin sent - I'm not actually sure if these are the partitions we need)

#SBATCH --qos=low
#SBATCH --time=10080
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --mem=8000M
#SBATCH --output=./%x.o%j
#SBATCH --error=./%x.e%j

# Automated application of the instructions in README.md

set -o errexit

TOP_LEVEL_DIR=$(mktemp --directory regress-$(date +'%Y%m%d')-XXX)
cd "$TOP_LEVEL_DIR"

if sbatch -h >& /dev/null; then
SUBMIT_COMMAND=sbatch
else
SUBMIT_COMMAND=bash
fi

../prep-datasets.sh
for DIR in regress-*; do
pushd "$DIR"
$SUBMIT_COMMAND ../../regression-test.sh
popd
done

coverage run -p -m everyvoice test

JOB_COUNT=$(find . -maxdepth 1 -name regress-\* | wc -l)
# Wait until all but two regression jobs are done, then compute a
# preliminary coverage report while the last jobs finish.
while true; do
DONE_COUNT=$(find . -maxdepth 2 -name DONE | wc -l)
if (( DONE_COUNT + 2 >= JOB_COUNT )); then
break
fi
echo "$DONE_COUNT/$JOB_COUNT regression job(s) done. Still waiting."
date
sleep $(( 60 * 5 ))
done

echo "$DONE_COUNT regression jobs done. Calculating coverage now, but some jobs may still be running."
../combine-coverage.sh
cat coverage.txt

while true; do
DONE_COUNT=$(find . -maxdepth 2 -name DONE | wc -l)
if (( DONE_COUNT >= JOB_COUNT )); then
break
fi
echo "$DONE_COUNT/$JOB_COUNT regression job(s) done. Still waiting."
date
sleep $(( 60 * 5 ))
done

echo "All $DONE_COUNT regression jobs done. Calculating final coverage."
rm .coverage
../combine-coverage.sh
cat coverage.txt
69 changes: 69 additions & 0 deletions everyvoice/tests/regression/prep-datasets.sh
@@ -0,0 +1,69 @@
#!/bin/bash

# Prepare the datasets and directories for our regression test cases

set -o errexit

# Usage: cat my_file | get_slice lines_to_keep > out
# Use a number of lines or full to get all lines
get_slice() {
lines=$1
if [[ $lines == full ]]; then
cat
else
head -"$lines"
fi
}
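A quick self-contained check of `get_slice` (the function is re-declared here so the snippet runs on its own):

```sh
# Same definition as above: keep the first N lines, or all of them for "full"
get_slice() {
    lines=$1
    if [[ $lines == full ]]; then
        cat
    else
        head -"$lines"
    fi
}

printf 'a\nb\nc\nd\n' | get_slice 2      # keeps the first 2 lines: a, b
printf 'a\nb\nc\nd\n' | get_slice full   # keeps all 4 lines
```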

EVERYVOICE_REGRESS_ROOT=$(python -c 'import everyvoice; print(everyvoice.__path__[0])')/tests/regression

SGILE_DATASET_ROOT=${SGILE_DATASET_ROOT:-$HOME/sgile/data}

LJ_SPEECH_DATASET=$SGILE_DATASET_ROOT/LJSpeech-1.1
LJSLICES="160 600 1600 full"
for slice in $LJSLICES; do
dir=regress-lj-$slice
mkdir "$dir"
ln -s "$LJ_SPEECH_DATASET/wavs" "$dir"/
get_slice "$slice" < "$LJ_SPEECH_DATASET/metadata.csv" > "$dir"/metadata.csv
cp "$EVERYVOICE_REGRESS_ROOT"/wizard-resume-lj "$dir"/wizard-resume
cat <<'==EOF==' > "$dir"/test.txt
This is a test.
I am an anvil.
I have no idea what to write here, but it has to be synthesizable text; so here is something!
Boo!
==EOF==
echo spec > "$dir"/test2.txt
done

SinhalaTTS=$SGILE_DATASET_ROOT/SinhalaTTS
dir=regress-si
mkdir $dir
ln -s "$SinhalaTTS/wavs" $dir/
cp "$SinhalaTTS/si_lk.lines.txt" $dir/
cp "$EVERYVOICE_REGRESS_ROOT"/wizard-resume-si "$dir"/wizard-resume
# Source of this sample text: https://en.wikipedia.org/wiki/Sinhala_script CC BY-SA-4.0
# - the first line means Sinhala script, found at the top of the page
# - the rest is the first verse from the Pali Dhammapada lower on the same page
cat <<'==EOF==' > "$dir"/test.txt
සිංහල අක්ෂර මාලාව
මනොපුබ්‌බඞ්‌ගමා ධම්‌මා, මනොසෙට්‌ඨා මනොමයා;
මනසා චෙ පදුට්‌ඨෙන, භාසති වා කරොති වා;
තතො නං දුක්‌ඛමන්‌වෙති, චක්‌කංව වහතො පදං.
==EOF==
echo "අක-ෂර" > "$dir"/test2.txt

isiXhosa=$SGILE_DATASET_ROOT/OpenSLR32-four-South-Afican-languages/xh_za/za/xho
dir=regress-xh
mkdir $dir
ln -s "$isiXhosa/wavs" $dir/
cp "$isiXhosa/line_index.tsv" $dir/
cp "$EVERYVOICE_REGRESS_ROOT"/wizard-resume-xh "$dir"/wizard-resume
# Source of this sample text: individual words copied from
# https://en.wikipedia.org/wiki/Xhosa_language CC BY-SA-4.0
cat <<'==EOF==' > "$dir"/test.txt
ukukrwentshwa
uqeqesho
iimpumlo
==EOF==
echo isiXhosa > "$dir"/test2.txt
102 changes: 102 additions & 0 deletions everyvoice/tests/regression/regression-test.sh
@@ -0,0 +1,102 @@
#!/bin/bash

#SBATCH --job-name=EV-regress
#SBATCH --partition=gpu_a100
Member: same here with the partition/accounts for all clusters

#SBATCH --account=nrc_ict__gpu_a100
#SBATCH --qos=low
#SBATCH --time=180
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=4
#SBATCH --mem=16000M
#SBATCH --gres=gpu:1
#SBATCH --output=./%x.o%j
#SBATCH --error=./%x.e%j

# User env config -- set ACTIVATE_SCRIPT to point to something that will activate the
# right Python environment, or leave it empty if you don't need it.
ACTIVATE_SCRIPT=${ACTIVATE_SCRIPT:-$HOME/start_ev.sh}

# Run a command, logging it first
r() {
cmd="$*"
printf "\n\n======================================================================\n"
printf 'Running "%s"\n' "$cmd"
date
printf "======================================================================\n"
eval "$cmd" 2>&1
rc=$?
if [[ $rc != 0 ]]; then
printf "\n\nXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\n"
echo "Command \"$cmd\" exited with non-zero return code $rc."
date
printf "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\n"
fi
return $rc
}
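To illustrate, here is a trimmed-down sketch of the `r` wrapper (banner and date lines omitted, renamed `r_demo` to avoid suggesting it is the original) showing that the wrapped command's exit status is reported and propagated:

```sh
# Trimmed-down sketch of the r() wrapper above: log the command, run it,
# report a failure, and propagate the command's return code
r_demo() {
    cmd="$*"
    printf 'Running "%s"\n' "$cmd"
    eval "$cmd" 2>&1
    rc=$?
    if [ "$rc" != 0 ]; then
        echo "Command \"$cmd\" exited with non-zero return code $rc."
    fi
    return $rc
}

r_demo "echo hello"                              # logs, then runs the command
r_demo "false" || echo "propagated exit code $?" # prints: propagated exit code 1
```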

echo "Start at $(date)"
date > START

trap 'echo "Failed or killed at $(date)"; date | tee FAILED > DONE' 0

# Regression config
[[ -e "$ACTIVATE_SCRIPT" ]] && source "$ACTIVATE_SCRIPT"
export TQDM_MININTERVAL=5
MAX_STEPS=1000
MAX_EPOCHS=10
# For a production config, use MAX_STEPS=100000, MAX_EPOCHS=1000, and increase the SBATCH --time above

# Run the new-project wizard
r "coverage run -p -m everyvoice new-project --resume-from wizard-resume"

# Enter the directory created by the wizard
cd regress || { echo "ERROR: Cannot cd into regress directory, aborting."; exit 1; }
trap 'echo "Failed or killed at $(date)"; date | tee ../FAILED > ../DONE' 0

# Preprocess
r "coverage run -p -m everyvoice preprocess config/everyvoice-text-to-spec.yaml"

# Train the fs2 model
r "coverage run -p -m everyvoice train text-to-spec config/everyvoice-text-to-spec.yaml --config-args training.max_steps=$MAX_STEPS --config-args training.max_epochs=$MAX_EPOCHS"
FS2=logs_and_checkpoints/FeaturePredictionExperiment/base/checkpoints/last.ckpt
ls $FS2 || { echo ERROR: Training the text-to-spec model failed, aborting.; exit 1; }

# Train the vocoder
r "coverage run -p -m everyvoice train spec-to-wav config/everyvoice-spec-to-wav.yaml --config-args training.max_steps=$MAX_STEPS --config-args training.max_epochs=$MAX_EPOCHS"
VOCODER=logs_and_checkpoints/VocoderExperiment/base/checkpoints/last.ckpt
ls $VOCODER || { echo ERROR: Training the Vocoder failed, aborting.; exit 1; }

# Synthesize some text
r "coverage run -p -m everyvoice synthesize from-text \
--output-type wav --output-type spec --output-type textgrid --output-type readalong-xml --output-type readalong-html \
--filelist ../test.txt \
--vocoder-path '$VOCODER' \
'$FS2'"
# TODO: check the synthesized files, somehow

# Exercise two-step synthesis
ONE_WORD=$(cat ../test2.txt)
r "coverage run -p -m everyvoice synthesize from-text --output-type spec --text '$ONE_WORD' '$FS2'"
r "coverage run -p -m everyvoice synthesize from-spec \
--input synthesis_output/synthesized_spec/'$ONE_WORD'-*.pt \
--model '$VOCODER'"

# TODO: Exercise DeepForceAligner
# Meh, this appears to be broken... train passes on lj-full, not on lj-160 or lj-600
#r "coverage run -p -m dfaligner train config/everyvoice-aligner.yaml --config-args training.max_steps=$MAX_STEPS --config-args training.max_epochs=$MAX_EPOCHS"
#ALIGNER=logs_and_checkpoints/AlignerExperiment/base/checkpoints/last.ckpt
# Even on lj-full, this eventually fails with a stack trace dump with a `KeyError: 'character_tokens'` at `dfaligner/dataset.py:165`
#r "coverage run -p -m dfaligner extract-alignments config/everyvoice-aligner.yaml --model '$ALIGNER'"


# Spin up the demo
# everyvoice demo $FS2 $VOCODER &
Member: this is a TODO I imagine? Maybe mark it with TODO inline? TODO: has a special syntax highlighting with the Python extension, so I like to use that.

Member (Author): Right, that's a good point. Yes, it's a TODO, just like dfa.

# TODO: use playwright to synthesize something using the demo


# TODO: use coverage analysis to flag the next priority things to add here


echo "Done at $(date)"
date > ../DONE
trap - 0
52 changes: 52 additions & 0 deletions everyvoice/tests/regression/wizard-resume-lj
@@ -0,0 +1,52 @@
- - EveryVoice Wizard
- 0.2.0a1
- - Root
- null
- - Name Step
- regress
- - Contact Name Step
- EveryVoice Regressor
- - Contact Email Step
- ev@nrc.ca
- - Output Path Step
- .
- - Filelist Step
- metadata.csv
- - Dataset Permission Step
- Yes, I do have permission to use this data.
- - Filelist Format Step
- psv
- - Filelist Has Header Line Step
- 'no'
- - Basename Header Step
- 0
- - Text Header Step
- 1
- - Filelist Text Representation Step
- characters
- - Text Processing Step
- - 0
- 1
- - Data Has Speaker Step
- 'no'
- - Know Speaker Step
- 'no'
- - Data Has Language Step
- 'no'
- - Select Language Step
- '[eng]: English'
- - Wavs Dir Step
- wavs
- - Validate Wavs Step
- OK
- - Symbol-Set Step
- true
- - SoX Effects Step
- - 0
- 1
- - Dataset Name Step
- lj2k
- - More Datasets Step
- 'no'
- - Config Format Step
- 'yaml'
48 changes: 48 additions & 0 deletions everyvoice/tests/regression/wizard-resume-si
@@ -0,0 +1,48 @@
- - EveryVoice Wizard
- 0.2.0a1
- - Root
- null
- - Name Step
- regress
- - Contact Name Step
- Regression Tester
- - Contact Email Step
- Eric.Joanis@nrc-cnrc.gc.ca
- - Output Path Step
- .
- - Filelist Step
- si_lk.lines.txt
- - Dataset Permission Step
- Yes, I do have permission to use this data.
- - Filelist Format Step
- festival
- - Filelist Text Representation Step
- characters
- - Text Processing Step
- []
- - Data Has Speaker Step
- 'no'
- - Know Speaker Step
- 'yes'
- - Add Speaker Step
- sinhala_speaker
- - Data Has Language Step
- 'no'
- - Select Language Step
- '[und]: my language isn''t here'
- - Wavs Dir Step
- wavs
- - Validate Wavs Step
- OK
- - Symbol-Set Step
- true
- - SoX Effects Step
- - 0
- 1
- 2
- - Dataset Name Step
- sinhala-regress
- - More Datasets Step
- 'no'
- - Config Format Step
- 'yaml'
53 changes: 53 additions & 0 deletions everyvoice/tests/regression/wizard-resume-xh
@@ -0,0 +1,53 @@
- - EveryVoice Wizard
- 0.2.0a1
- - Root
- null
- - Name Step
- regress
- - Contact Name Step
- Regression Tester
- - Contact Email Step
- Eric.Joanis@nrc-cnrc.gc.ca
- - Output Path Step
- .
- - Filelist Step
- line_index.tsv
- - Dataset Permission Step
- Yes, I do have permission to use this data.
- - Filelist Format Step
- tsv
- - Filelist Has Header Line Step
- 'no'
- - Basename Header Step
- 0
- - Text Header Step
- 1
- - Filelist Text Representation Step
- characters
- - Text Processing Step
- - 0
- 1
- - Data Has Speaker Step
- 'no'
- - Know Speaker Step
- 'no'
- - Data Has Language Step
- 'no'
- - Select Language Step
- '[und]: my language isn''t here'
- - Wavs Dir Step
- wavs
- - Validate Wavs Step
- OK
- - Symbol-Set Step
- true
- - SoX Effects Step
- - 0
- 1
- 2
- - Dataset Name Step
- xho-regress
- - More Datasets Step
- 'no'
- - Config Format Step
- 'yaml'
7 changes: 3 additions & 4 deletions everyvoice/tests/test_cli.py
@@ -271,10 +271,9 @@ def test_evaluate(self):
dir_result.stdout,
"should report metrics in terms of averages",
)
self.assertTrue(
(self.data_dir / "lj" / "wavs" / "evaluation.json").exists(),
"should print out results to a file",
)
evaluation_output = self.data_dir / "lj" / "wavs" / "evaluation.json"
self.assertTrue(evaluation_output.exists(), "should print results to a file")
evaluation_output.unlink()

def test_inspect_checkpoint_help(self):
with silence_c_stderr():
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -74,6 +74,7 @@ dependencies = [
"torch==2.3.1",
"torchaudio==2.3.1",
"torchinfo==1.8.0",
"tqdm>=4.66.0",
"typer>=0.12.4",
"yaspin>=3.1.0",
]
@@ -89,6 +90,7 @@ torch = [
]
dev = [
"black~=24.3",
"coverage",
"flake8>=4.0.1",
"gitlint-core>=0.19.0",
"isort>=5.10.1",