From 84dae8e9f9abc9a5f58d4d6ce10918cd27c523c1 Mon Sep 17 00:00:00 2001
From: Isaac Schifferer <isaac@schifferer.com>
Date: Mon, 7 Apr 2025 14:51:59 -0400
Subject: [PATCH 01/17] Add optional alignment step to nmt jobs, temporary
 implementation of eflomal

---
 .devcontainer/dockerfile                     |   4 +-
 machine/jobs/build_nmt_engine.py             |   3 +
 machine/jobs/eflomal_aligner.py              | 153 +++++++++++++++++++
 machine/jobs/nmt_engine_build_job.py         |  27 +++-
 machine/jobs/smt_engine_build_job.py         |   4 +-
 machine/jobs/translation_engine_build_job.py |  64 +++++++-
 machine/jobs/translation_file_service.py     |  24 ++-
 poetry.lock                                  |  92 ++++++++++-
 pyproject.toml                               |   1 +
 tests/jobs/test_nmt_engine_build_job.py      |  25 ++-
 tests/jobs/test_smt_engine_build_job.py      |   7 +-
 11 files changed, 381 insertions(+), 23 deletions(-)
 create mode 100644 machine/jobs/eflomal_aligner.py

diff --git a/.devcontainer/dockerfile b/.devcontainer/dockerfile
index 6212c64c..7b1282fb 100644
--- a/.devcontainer/dockerfile
+++ b/.devcontainer/dockerfile
@@ -22,7 +22,7 @@ RUN apt-get update && \
     python$PYTHON_VERSION-distutils \
     git vim curl gdb ca-certificates gnupg2 tar make gcc libssl-dev zlib1g-dev libncurses5-dev \
     libbz2-dev libreadline-dev libreadline6-dev libxml2-dev xz-utils libgdbm-dev libgdbm-compat-dev tk-dev dirmngr \
-    libxmlsec1-dev libsqlite3-dev libffi-dev liblzma-dev lzma lzma-dev uuid-dev && \
+    libxmlsec1-dev libsqlite3-dev libffi-dev liblzma-dev lzma lzma-dev uuid-dev python3.9-dev && \
     rm -rf /var/lib/apt/lists/*
 
 RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python$PYTHON_VERSION
@@ -39,4 +39,6 @@ RUN pip install -U pip setuptools \
 
 COPY ./.devcontainer/clearml.conf /root/clearml.conf
 
+ENV EFLOMAL_PATH=/workspaces/machine.py/.venv/lib/python3.9/site-packages/eflomal/bin
+
 CMD ["bash"]
diff --git a/machine/jobs/build_nmt_engine.py b/machine/jobs/build_nmt_engine.py
index a047a7ba..2645598c 100644
--- a/machine/jobs/build_nmt_engine.py
+++ b/machine/jobs/build_nmt_engine.py
@@ -92,6 +92,9 @@ def main() -> None:
     parser.add_argument("--clearml", default=False, action="store_true", help="Initializes a ClearML task")
     parser.add_argument("--build-options", default=None, type=str, help="Build configurations")
     parser.add_argument("--save-model", default=None, type=str, help="Save the model using the specified base name")
+    parser.add_argument(
+        "--align-pretranslations", default=False, action="store_true", help="Aligns source and target pretranslations"
+    )
     args = parser.parse_args()
 
     run({k: v for k, v in vars(args).items() if v is not None})
diff --git a/machine/jobs/eflomal_aligner.py b/machine/jobs/eflomal_aligner.py
new file mode 100644
index 00000000..31559847
--- /dev/null
+++ b/machine/jobs/eflomal_aligner.py
@@ -0,0 +1,153 @@
+# NOTE: this is a temporary solution to be able to use the eflomal aligner inside of machine.py.
+# The vast majority of this code is taken from the silnlp repository.
+
+import os
+import subprocess
+from contextlib import ExitStack
+from math import sqrt
+from pathlib import Path
+from tempfile import TemporaryDirectory
+from typing import IO, Iterable, List, Sequence, Tuple
+
+from eflomal import read_text, write_text
+
+from ..corpora import AlignedWordPair
+from ..corpora.token_processors import escape_spaces, lowercase, normalize
+from ..tokenization import LatinWordTokenizer
+from ..translation import SymmetrizationHeuristic, WordAlignmentMatrix
+
+# TODO: resolve the binary location more dynamically (cf. silnlp's get_wsl_path); is there an equivalent in machine?
+EFLOMAL_PATH = Path(os.getenv("EFLOMAL_PATH", "."), "eflomal")
+TOKENIZER = LatinWordTokenizer()
+
+
+# From silnlp.alignment.tools
+def execute_eflomal(
+    source_path: Path,
+    target_path: Path,
+    forward_links_path: Path,
+    reverse_links_path: Path,
+    n_iterations: Tuple[int, int, int],
+) -> None:
+    """Run eflomal on the prepared corpus files; raises if the binary is missing or exits with an error."""
+    if not EFLOMAL_PATH.is_file():
+        raise RuntimeError("eflomal is not installed.")
+
+    args = [
+        str(EFLOMAL_PATH),
+        "-s",
+        str(source_path),
+        "-t",
+        str(target_path),
+        "-f",
+        str(forward_links_path),
+        "-r",
+        str(reverse_links_path),
+        "-m",
+        "3",
+        "-n",
+        "3",
+        "-N",
+        "0.2",
+        "-1",
+        str(n_iterations[0]),
+        "-2",
+        str(n_iterations[1]),
+        "-3",
+        str(n_iterations[2]),
+    ]
+    subprocess.run(args, stderr=subprocess.DEVNULL, check=True)  # check=True: a failed run must not pass silently
+
+
+# From silnlp.alignment.eflomal
+def to_word_alignment_matrix(alignment_str: str) -> WordAlignmentMatrix:
+    """Parse ``alignment_str`` into a matrix just large enough to hold every aligned pair.
+
+    The row/column counts are one more than the largest source/target index, or 0 when there are no pairs.
+    """
+    pairs = AlignedWordPair.from_string(alignment_str)
+    # The leading 0 keeps the counts at zero when the alignment has no pairs.
+    n_rows = max([0, *(pair.source_index + 1 for pair in pairs)])
+    n_cols = max([0, *(pair.target_index + 1 for pair in pairs)])
+    return WordAlignmentMatrix.from_word_pairs(n_rows, n_cols, pairs)
+
+
+# From silnlp.alignment.eflomal
+def to_eflomal_text_file(input: Iterable[str], output_file: IO[bytes], prefix_len: int = 0, suffix_len: int = 0) -> int:
+    """Encode ``input`` with eflomal's ``read_text``, write it to ``output_file``, and return the sentence count."""
+    # The second value returned by read_text is only needed for its length, which write_text takes as an argument.
+    sentences, vocab_index = read_text(input, True, prefix_len, suffix_len)
+    write_text(output_file, tuple(sentences), len(vocab_index))
+    return len(sentences)
+
+
+# From silnlp.alignment.eflomal
+def prepare_files(
+    src_input: Iterable[str], src_output_file: IO[bytes], trg_input: Iterable[str], trg_output_file: IO[bytes]
+) -> int:
+    """Write both sides as eflomal input files; return their common sentence count or raise ValueError."""
+    src_count = to_eflomal_text_file(src_input, src_output_file)
+    if to_eflomal_text_file(trg_input, trg_output_file) != src_count:
+        raise ValueError("Mismatched file sizes")
+    return src_count
+
+
+def tokenize(sent: str) -> Sequence[str]:  # word-tokenize, escape spaces, NFC-normalize, lowercase
+    return lowercase(normalize("NFC", escape_spaces(list(TOKENIZER.tokenize(sent)))))
+
+
+# From silnlp.alignment.eflomal
+class EflomalAligner:
+    """Trains eflomal on a tokenized parallel corpus and symmetrizes the link files it writes to ``model_dir``."""
+
+    def __init__(self, model_dir: Path) -> None:
+        self._model_dir = model_dir
+        self._forward_path = model_dir / "forward-align.txt"
+        self._reverse_path = model_dir / "reverse-align.txt"
+
+    def train(self, src_toks: Sequence[Sequence[str]], trg_toks: Sequence[Sequence[str]]) -> None:
+        """Run eflomal on the tokenized sides, writing forward/reverse link files into the model directory."""
+        self._model_dir.mkdir(parents=True, exist_ok=True)
+        with TemporaryDirectory() as temp_dir:
+            src_eflomal_path = Path(temp_dir, "source")
+            trg_eflomal_path = Path(temp_dir, "target")
+            # Write input files for the eflomal binary
+            with src_eflomal_path.open("wb") as src_file, trg_eflomal_path.open("wb") as trg_file:
+                n_sentences = prepare_files(
+                    [" ".join(s) for s in src_toks], src_file, [" ".join(s) for s in trg_toks], trg_file
+                )
+
+            if n_sentences == 0:
+                # Nothing to align, and sqrt(0) below would divide by zero: leave empty link files for align().
+                self._forward_path.write_text("")
+                self._reverse_path.write_text("")
+                return
+
+            # The iteration count shrinks with corpus size (5000 / sqrt(n)), with a floor of 2.
+            iters = max(2, int(round(1.0 * 5000 / sqrt(n_sentences))))
+            iters4 = max(1, iters // 4)
+            n_iterations = (max(2, iters4), iters4, iters)
+
+            execute_eflomal(src_eflomal_path, trg_eflomal_path, self._forward_path, self._reverse_path, n_iterations)
+
+    def align(self, sym_heuristic: str = "grow-diag-final-and") -> List[str]:
+        """Return one symmetrized alignment string per line pair of the link files written by ``train``.
+
+        ``sym_heuristic`` names a ``SymmetrizationHeuristic`` member; an unknown name raises KeyError.
+        """
+        heuristic = SymmetrizationHeuristic[sym_heuristic.upper().replace("-", "_")]
+        alignments: List[str] = []
+        with self._forward_path.open("r", encoding="utf-8-sig") as forward_file, self._reverse_path.open(
+            "r", encoding="utf-8-sig"
+        ) as reverse_file:
+            for forward_line, reverse_line in zip(forward_file, reverse_file):
+                forward_matrix = to_word_alignment_matrix(forward_line.strip())
+                reverse_matrix = to_word_alignment_matrix(reverse_line.strip())
+                # Give both matrices the same shape before symmetrizing the forward one in place.
+                src_len = max(forward_matrix.row_count, reverse_matrix.row_count)
+                trg_len = max(forward_matrix.column_count, reverse_matrix.column_count)
+                forward_matrix.resize(src_len, trg_len)
+                reverse_matrix.resize(src_len, trg_len)
+                forward_matrix.symmetrize_with(reverse_matrix, heuristic)
+                alignments.append(str(forward_matrix))
+        return alignments
diff --git a/machine/jobs/nmt_engine_build_job.py b/machine/jobs/nmt_engine_build_job.py
index 1ff719a4..d3674120 100644
--- a/machine/jobs/nmt_engine_build_job.py
+++ b/machine/jobs/nmt_engine_build_job.py
@@ -28,12 +28,25 @@ def _get_progress_reporter(
         self, progress: Optional[Callable[[ProgressStatus], None]], corpus_size: int
     ) -> PhasedProgressReporter:
         if corpus_size > 0:
-            phases = [
-                Phase(message="Training NMT model", percentage=0.9),
-                Phase(message="Pretranslating segments", percentage=0.1),
-            ]
+            if "align_pretranslations" in self._config and self._config.align_pretranslations:
+                phases = [
+                    Phase(message="Training NMT model", percentage=0.8),
+                    Phase(message="Pretranslating segments", percentage=0.1),
+                    Phase(message="Aligning segments", percentage=0.1, report_steps=False),
+                ]
+            else:
+                phases = [
+                    Phase(message="Training NMT model", percentage=0.9),
+                    Phase(message="Pretranslating segments", percentage=0.1),
+                ]
         else:
-            phases = [Phase(message="Pretranslating segments", percentage=1.0)]
+            if "align_pretranslations" in self._config and self._config.align_pretranslations:
+                phases = [
+                    Phase(message="Pretranslating segments", percentage=0.9),
+                    Phase(message="Aligning segments", percentage=0.1, report_steps=False),
+                ]
+            else:
+                phases = [Phase(message="Pretranslating segments", percentage=1.0)]
         return PhasedProgressReporter(progress, phases)
 
     def _respond_to_no_training_corpus(self) -> Tuple[int, float]:
@@ -115,7 +128,7 @@ def _translate_batch(
     batch: Sequence[PretranslationInfo],
     writer: DictToJsonWriter,
 ) -> None:
-    source_segments = [pi["translation"] for pi in batch]
+    source_segments = [pi["pretranslation"] for pi in batch]
     for i, result in enumerate(engine.translate_batch(source_segments)):
-        batch[i]["translation"] = result.translation
+        batch[i]["pretranslation"] = result.translation
         writer.write(batch[i])
diff --git a/machine/jobs/smt_engine_build_job.py b/machine/jobs/smt_engine_build_job.py
index 452810f8..35977c60 100644
--- a/machine/jobs/smt_engine_build_job.py
+++ b/machine/jobs/smt_engine_build_job.py
@@ -107,7 +107,7 @@ def _translate_batch(
     batch: Sequence[PretranslationInfo],
     writer: DictToJsonWriter,
 ) -> None:
-    source_segments = [pi["translation"] for pi in batch]
+    source_segments = [pi["pretranslation"] for pi in batch]
     for i, result in enumerate(engine.translate_batch(source_segments)):
-        batch[i]["translation"] = result.translation
+        batch[i]["pretranslation"] = result.translation
         writer.write(batch[i])
diff --git a/machine/jobs/translation_engine_build_job.py b/machine/jobs/translation_engine_build_job.py
index 7effa62f..ec94eff3 100644
--- a/machine/jobs/translation_engine_build_job.py
+++ b/machine/jobs/translation_engine_build_job.py
@@ -1,12 +1,16 @@
 import logging
 from abc import ABC, abstractmethod
+from contextlib import ExitStack
+from pathlib import Path
+from tempfile import TemporaryDirectory
 from typing import Any, Callable, Optional, Tuple
 
 from ..corpora.parallel_text_corpus import ParallelTextCorpus
 from ..corpora.text_corpus import TextCorpus
 from ..utils.phased_progress_reporter import PhasedProgressReporter
 from ..utils.progress_status import ProgressStatus
-from .translation_file_service import TranslationFileService
+from .eflomal_aligner import EflomalAligner, tokenize
+from .translation_file_service import PretranslationInfo, TranslationFileService
 
 logger = logging.getLogger(__name__)
 
@@ -44,6 +48,10 @@ def run(
         logger.info("Pretranslating segments")
         self._batch_inference(progress_reporter, check_canceled)
 
+        if "align_pretranslations" in self._config and self._config.align_pretranslations:
+            logger.info("Aligning source to pretranslations")
+            self._align(progress_reporter, check_canceled)
+
         self._save_model()
         return train_corpus_size, confidence
 
@@ -74,5 +82,59 @@ def _batch_inference(
         check_canceled: Optional[Callable[[], None]],
     ) -> None: ...
 
+    def _align(
+        self,
+        progress_reporter: PhasedProgressReporter,
+        check_canceled: Optional[Callable[[], None]],
+    ) -> None:
+        """Word-align each source segment with its pretranslation and rewrite the target pretranslations.
+
+        Each rewritten record keeps its ids, refs and pretranslation text and gains the source tokens, the
+        pretranslation tokens and the alignment string. ``check_canceled``, if given, is called on entry, after
+        training the aligner and after aligning.
+        """
+        if check_canceled is not None:
+            check_canceled()
+
+        with ExitStack() as stack:
+            stack.enter_context(progress_reporter.start_next_phase())
+
+            src_tokenized = [
+                tokenize(s["pretranslation"])
+                for s in stack.enter_context(self._translation_file_service.get_source_pretranslations())
+            ]
+            # Read the target file once and keep the records, so the write loop below reuses them instead of
+            # fetching and parsing the same file a second time after the writer has been opened.
+            trg_infos = list(stack.enter_context(self._translation_file_service.get_target_pretranslations()))
+            trg_tokenized = [tokenize(pi["pretranslation"]) for pi in trg_infos]
+
+            with TemporaryDirectory() as td:
+                aligner = EflomalAligner(Path(td))
+                logger.info("Training aligner")
+                aligner.train(src_tokenized, trg_tokenized)
+
+                if check_canceled is not None:
+                    check_canceled()
+
+                logger.info("Aligning pretranslations")
+                alignments = aligner.align()
+
+            if check_canceled is not None:
+                check_canceled()
+
+            writer = stack.enter_context(self._translation_file_service.open_target_pretranslation_writer())
+            for trg_pi, src_toks, trg_toks, alignment in zip(trg_infos, src_tokenized, trg_tokenized, alignments):
+                writer.write(
+                    PretranslationInfo(
+                        corpusId=trg_pi["corpusId"],
+                        textId=trg_pi["textId"],
+                        refs=trg_pi["refs"],
+                        pretranslation=trg_pi["pretranslation"],
+                        source_toks=list(src_toks),
+                        pretranslation_toks=list(trg_toks),
+                        alignment=alignment,
+                    )
+                )
+
     @abstractmethod
     def _save_model(self) -> None: ...
diff --git a/machine/jobs/translation_file_service.py b/machine/jobs/translation_file_service.py
index 16e9f2e7..a6942ab4 100644
--- a/machine/jobs/translation_file_service.py
+++ b/machine/jobs/translation_file_service.py
@@ -15,7 +15,10 @@ class PretranslationInfo(TypedDict):
     corpusId: str  # noqa: N815
     textId: str  # noqa: N815
     refs: List[str]
-    translation: str
+    pretranslation: str
+    source_toks: List[str]
+    pretranslation_toks: List[str]
+    alignment: str
 
 
 SOURCE_FILENAME = "train.src.txt"
@@ -49,23 +52,30 @@ def exists_source_corpus(self) -> bool:
     def exists_target_corpus(self) -> bool:
         return self.shared_file_service._exists_file(f"{self.shared_file_service.build_path}/{TARGET_FILENAME}")
 
-    def get_source_pretranslations(self) -> ContextManagedGenerator[PretranslationInfo, None, None]:
-        src_pretranslate_path = self.shared_file_service.download_file(
-            f"{self.shared_file_service.build_path}/{SOURCE_PRETRANSLATION_FILENAME}"
-        )
+    def _get_pretranslations(self, filename: str) -> ContextManagedGenerator[PretranslationInfo, None, None]:
+        pretranslate_path = self.shared_file_service.download_file(f"{self.shared_file_service.build_path}/{filename}")
 
         def generator() -> Generator[PretranslationInfo, None, None]:
-            with src_pretranslate_path.open("r", encoding="utf-8-sig") as file:
+            with pretranslate_path.open("r", encoding="utf-8-sig") as file:
                 for pi in json_stream.load(file):
                     yield PretranslationInfo(
                         corpusId=pi["corpusId"],
                         textId=pi["textId"],
                         refs=list(pi["refs"]),
-                        translation=pi["translation"],
+                        pretranslation=pi["pretranslation"],
+                        source_toks=list(pi["source_toks"]),
+                        pretranslation_toks=list(pi["pretranslation_toks"]),
+                        alignment=pi["alignment"],
                     )
 
         return ContextManagedGenerator(generator())
 
+    def get_source_pretranslations(self) -> ContextManagedGenerator[PretranslationInfo, None, None]:
+        return self._get_pretranslations(SOURCE_PRETRANSLATION_FILENAME)  # source-side pretranslation records
+
+    def get_target_pretranslations(self) -> ContextManagedGenerator[PretranslationInfo, None, None]:
+        return self._get_pretranslations(TARGET_PRETRANSLATION_FILENAME)  # target-side pretranslation records
+
     def save_model(self, model_path: Path, destination: str) -> None:
         self.shared_file_service.upload_path(model_path, destination)
 
diff --git a/poetry.lock b/poetry.lock
index bd6b9039..5850ec36 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -733,6 +733,79 @@ tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.1
 [package.extras]
 toml = ["tomli"]
 
+[[package]]
+name = "cython"
+version = "3.0.12"
+description = "The Cython compiler for writing C extensions in the Python language."
+optional = false
+python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7"
+files = [
+    {file = "Cython-3.0.12-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ba67eee9413b66dd9fbacd33f0bc2e028a2a120991d77b5fd4b19d0b1e4039b9"},
+    {file = "Cython-3.0.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bee2717e5b5f7d966d0c6e27d2efe3698c357aa4d61bb3201997c7a4f9fe485a"},
+    {file = "Cython-3.0.12-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7cffc3464f641c8d0dda942c7c53015291beea11ec4d32421bed2f13b386b819"},
+    {file = "Cython-3.0.12-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d3a8f81980ffbd74e52f9186d8f1654e347d0c44bfea6b5997028977f481a179"},
+    {file = "Cython-3.0.12-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:8d32856716c369d01f2385ad9177cdd1a11079ac89ea0932dc4882de1aa19174"},
+    {file = "Cython-3.0.12-cp310-cp310-win32.whl", hash = "sha256:712c3f31adec140dc60d064a7f84741f50e2c25a8edd7ae746d5eb4d3ef7072a"},
+    {file = "Cython-3.0.12-cp310-cp310-win_amd64.whl", hash = "sha256:d6945694c5b9170cfbd5f2c0d00ef7487a2de7aba83713a64ee4ebce7fad9e05"},
+    {file = "Cython-3.0.12-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:feb86122a823937cc06e4c029d80ff69f082ebb0b959ab52a5af6cdd271c5dc3"},
+    {file = "Cython-3.0.12-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dfdbea486e702c328338314adb8e80f5f9741f06a0ae83aaec7463bc166d12e8"},
+    {file = "Cython-3.0.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:563de1728c8e48869d2380a1b76bbc1b1b1d01aba948480d68c1d05e52d20c92"},
+    {file = "Cython-3.0.12-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:398d4576c1e1f6316282aa0b4a55139254fbed965cba7813e6d9900d3092b128"},
+    {file = "Cython-3.0.12-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1e5eadef80143026944ea8f9904715a008f5108d1d644a89f63094cc37351e73"},
+    {file = "Cython-3.0.12-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:5a93cbda00a5451175b97dea5a9440a3fcee9e54b4cba7a7dbcba9a764b22aec"},
+    {file = "Cython-3.0.12-cp311-cp311-win32.whl", hash = "sha256:3109e1d44425a2639e9a677b66cd7711721a5b606b65867cb2d8ef7a97e2237b"},
+    {file = "Cython-3.0.12-cp311-cp311-win_amd64.whl", hash = "sha256:d4b70fc339adba1e2111b074ee6119fe9fd6072c957d8597bce9a0dd1c3c6784"},
+    {file = "Cython-3.0.12-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:fe030d4a00afb2844f5f70896b7f2a1a0d7da09bf3aa3d884cbe5f73fff5d310"},
+    {file = "Cython-3.0.12-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a7fec4f052b8fe173fe70eae75091389955b9a23d5cec3d576d21c5913b49d47"},
+    {file = "Cython-3.0.12-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0faa5e39e5c8cdf6f9c3b1c3f24972826e45911e7f5b99cf99453fca5432f45e"},
+    {file = "Cython-3.0.12-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2d53de996ed340e9ab0fc85a88aaa8932f2591a2746e1ab1c06e262bd4ec4be7"},
+    {file = "Cython-3.0.12-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ea3a0e19ab77266c738aa110684a753a04da4e709472cadeff487133354d6ab8"},
+    {file = "Cython-3.0.12-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c151082884be468f2f405645858a857298ac7f7592729e5b54788b5c572717ba"},
+    {file = "Cython-3.0.12-cp312-cp312-win32.whl", hash = "sha256:3083465749911ac3b2ce001b6bf17f404ac9dd35d8b08469d19dc7e717f5877a"},
+    {file = "Cython-3.0.12-cp312-cp312-win_amd64.whl", hash = "sha256:c0b91c7ebace030dd558ea28730de8c580680b50768e5af66db2904a3716c3e3"},
+    {file = "Cython-3.0.12-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4ee6f1ea1bead8e6cbc4e64571505b5d8dbdb3b58e679d31f3a84160cebf1a1a"},
+    {file = "Cython-3.0.12-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:57aefa6d3341109e46ec1a13e3a763aaa2cbeb14e82af2485b318194be1d9170"},
+    {file = "Cython-3.0.12-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:879ae9023958d63c0675015369384642d0afb9c9d1f3473df9186c42f7a9d265"},
+    {file = "Cython-3.0.12-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:36fcd584dae547de6f095500a380f4a0cce72b7a7e409e9ff03cb9beed6ac7a1"},
+    {file = "Cython-3.0.12-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:62b79dcc0de49efe9e84b9d0e2ae0a6fc9b14691a65565da727aa2e2e63c6a28"},
+    {file = "Cython-3.0.12-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4aa255781b093a8401109d8f2104bbb2e52de7639d5896aefafddc85c30e0894"},
+    {file = "Cython-3.0.12-cp313-cp313-win32.whl", hash = "sha256:77d48f2d4bab9fe1236eb753d18f03e8b2619af5b6f05d51df0532a92dfb38ab"},
+    {file = "Cython-3.0.12-cp313-cp313-win_amd64.whl", hash = "sha256:86c304b20bd57c727c7357e90d5ba1a2b6f1c45492de2373814d7745ef2e63b4"},
+    {file = "Cython-3.0.12-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:ff5c0b6a65b08117d0534941d404833d516dac422eee88c6b4fd55feb409a5ed"},
+    {file = "Cython-3.0.12-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:680f1d6ed4436ae94805db264d6155ed076d2835d84f20dcb31a7a3ad7f8668c"},
+    {file = "Cython-3.0.12-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ebc24609613fa06d0d896309f7164ba168f7e8d71c1e490ed2a08d23351c3f41"},
+    {file = "Cython-3.0.12-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c1879c073e2b34924ce9b7ca64c212705dcc416af4337c45f371242b2e5f6d32"},
+    {file = "Cython-3.0.12-cp36-cp36m-musllinux_1_2_aarch64.whl", hash = "sha256:bfb75123dd4ff767baa37d7036da0de2dfb6781ff256eef69b11b88b9a0691d1"},
+    {file = "Cython-3.0.12-cp36-cp36m-musllinux_1_2_x86_64.whl", hash = "sha256:f39640f8df0400cde6882e23c734f15bb8196de0a008ae5dc6c8d1ec5957d7c8"},
+    {file = "Cython-3.0.12-cp36-cp36m-win32.whl", hash = "sha256:8c9efe9a0895abee3cadfdad4130b30f7b5e57f6e6a51ef2a44f9fc66a913880"},
+    {file = "Cython-3.0.12-cp36-cp36m-win_amd64.whl", hash = "sha256:63d840f2975e44d74512f8f34f1f7cb8121c9428e26a3f6116ff273deb5e60a2"},
+    {file = "Cython-3.0.12-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:75c5acd40b97cff16fadcf6901a91586cbca5dcdba81f738efaf1f4c6bc8dccb"},
+    {file = "Cython-3.0.12-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e62564457851db1c40399bd95a5346b9bb99e17a819bf583b362f418d8f3457a"},
+    {file = "Cython-3.0.12-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ccd1228cc203b1f1b8a3d403f5a20ad1c40e5879b3fbf5851ce09d948982f2c"},
+    {file = "Cython-3.0.12-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:25529ee948f44d9a165ff960c49d4903267c20b5edf2df79b45924802e4cca6e"},
+    {file = "Cython-3.0.12-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:90cf599372c5a22120609f7d3a963f17814799335d56dd0dcf8fe615980a8ae1"},
+    {file = "Cython-3.0.12-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:9f8c48748a9c94ea5d59c26ab49ad0fad514d36f894985879cf3c3ca0e600bf4"},
+    {file = "Cython-3.0.12-cp37-cp37m-win32.whl", hash = "sha256:3e4fa855d98bc7bd6a2049e0c7dc0dcf595e2e7f571a26e808f3efd84d2db374"},
+    {file = "Cython-3.0.12-cp37-cp37m-win_amd64.whl", hash = "sha256:120681093772bf3600caddb296a65b352a0d3556e962b9b147efcfb8e8c9801b"},
+    {file = "Cython-3.0.12-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:731d719423e041242c9303c80cae4327467299b90ffe62d4cc407e11e9ea3160"},
+    {file = "Cython-3.0.12-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c3238a29f37999e27494d120983eca90d14896b2887a0bd858a381204549137a"},
+    {file = "Cython-3.0.12-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b588c0a089a9f4dd316d2f9275230bad4a7271e5af04e1dc41d2707c816be44b"},
+    {file = "Cython-3.0.12-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8ab9f5198af74eb16502cc143cdde9ca1cbbf66ea2912e67440dd18a36e3b5fa"},
+    {file = "Cython-3.0.12-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:8ee841c0e114efa1e849c281ac9b8df8aa189af10b4a103b1c5fd71cbb799679"},
+    {file = "Cython-3.0.12-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:43c48b5789398b228ea97499f5b864843ba9b1ab837562a9227c6f58d16ede8b"},
+    {file = "Cython-3.0.12-cp38-cp38-win32.whl", hash = "sha256:5e5f17c48a4f41557fbcc7ee660ccfebe4536a34c557f553b6893c1b3c83df2d"},
+    {file = "Cython-3.0.12-cp38-cp38-win_amd64.whl", hash = "sha256:309c081057930bb79dc9ea3061a1af5086c679c968206e9c9c2ec90ab7cb471a"},
+    {file = "Cython-3.0.12-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:54115fcc126840926ff3b53cfd2152eae17b3522ae7f74888f8a41413bd32f25"},
+    {file = "Cython-3.0.12-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:629db614b9c364596d7c975fa3fb3978e8c5349524353dbe11429896a783fc1e"},
+    {file = "Cython-3.0.12-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:af081838b0f9e12a83ec4c3809a00a64c817f489f7c512b0e3ecaf5f90a2a816"},
+    {file = "Cython-3.0.12-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:34ce459808f7d8d5d4007bc5486fe50532529096b43957af6cbffcb4d9cc5c8d"},
+    {file = "Cython-3.0.12-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:d6c6cd6a75c8393e6805d17f7126b96a894f310a1a9ea91c47d141fb9341bfa8"},
+    {file = "Cython-3.0.12-cp39-cp39-win32.whl", hash = "sha256:a4032e48d4734d2df68235d21920c715c451ac9de15fa14c71b378e8986b83be"},
+    {file = "Cython-3.0.12-cp39-cp39-win_amd64.whl", hash = "sha256:dcdc3e5d4ce0e7a4af6903ed580833015641e968d18d528d8371e2435a34132c"},
+    {file = "Cython-3.0.12-py2.py3-none-any.whl", hash = "sha256:0038c9bae46c459669390e53a1ec115f8096b2e4647ae007ff1bf4e6dee92806"},
+    {file = "cython-3.0.12.tar.gz", hash = "sha256:b988bb297ce76c671e28c97d017b95411010f7c77fa6623dd0bb47eed1aee1bc"},
+]
+
 [[package]]
 name = "datasets"
 version = "2.21.0"
@@ -881,6 +954,23 @@ toml = ["toml"]
 vault = ["hvac"]
 yaml = ["ruamel.yaml"]
 
+[[package]]
+name = "eflomal"
+version = "2.0.0"
+description = "pip installable eflomal"
+optional = false
+python-versions = "*"
+files = [
+    {file = "eflomal-2.0.0.tar.gz", hash = "sha256:b71183dcf85bf4f59f44ef7a59f5268df1c17c0c8d8093f77b220025ffdba100"},
+]
+
+[package.dependencies]
+Cython = "*"
+numpy = "*"
+
+[package.extras]
+test = ["pytest"]
+
 [[package]]
 name = "exceptiongroup"
 version = "1.2.2"
@@ -4787,4 +4877,4 @@ thot = ["sil-thot"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9,<3.13"
-content-hash = "ff353baa0a9c4519a6bef585b095c141da9c20b6dad4ef47c0af3ea57c92e6ee"
+content-hash = "d939e11c2c341294ac8cbc3af9b1ae710b188e5ff4af4d39b614fb0a5726eddb"
diff --git a/pyproject.toml b/pyproject.toml
index 8f6527d9..be9ac8fb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -63,6 +63,7 @@ urllib3 = "<2"
 
 sentencepiece = "^0.2.0"
 sil-thot = "^3.4.6"
+eflomal = "^2.0.0"
 
 transformers = ">=4.38.0,<4.46"
 datasets = "^2.4.0"
diff --git a/tests/jobs/test_nmt_engine_build_job.py b/tests/jobs/test_nmt_engine_build_job.py
index a5e416d6..15a38681 100644
--- a/tests/jobs/test_nmt_engine_build_job.py
+++ b/tests/jobs/test_nmt_engine_build_job.py
@@ -35,7 +35,7 @@ def test_run(decoy: Decoy) -> None:
 
     pretranslations = json.loads(env.target_pretranslations)
     assert len(pretranslations) == 1
-    assert pretranslations[0]["translation"] == "Please, I have booked a room."
+    assert pretranslations[0]["pretranslation"] == "Please, I have booked a room."
     decoy.verify(env.translation_file_service.save_model(Path("model.tar.gz"), "models/save-model.tar.gz"), times=1)
 
 
@@ -112,7 +112,28 @@ def __init__(self, decoy: Decoy) -> None:
                             corpusId="corpus1",
                             textId="text1",
                             refs=["ref1"],
-                            translation="Por favor, tengo reservada una habitación.",
+                            pretranslation="Por favor, tengo reservada una habitación.",
+                            source_toks=[],
+                            pretranslation_toks=[],
+                            alignment="",
+                        )
+                    ]
+                )
+            )
+        )
+        decoy.when(self.translation_file_service.get_target_pretranslations()).then_do(
+            lambda: ContextManagedGenerator(
+                (
+                    pi
+                    for pi in [
+                        PretranslationInfo(
+                            corpusId="corpus1",
+                            textId="text1",
+                            refs=["ref1"],
+                            pretranslation="Please, I have booked a room.",
+                            source_toks=[],
+                            pretranslation_toks=[],
+                            alignment="",
                         )
                     ]
                 )
diff --git a/tests/jobs/test_smt_engine_build_job.py b/tests/jobs/test_smt_engine_build_job.py
index 0cf2d948..51def81e 100644
--- a/tests/jobs/test_smt_engine_build_job.py
+++ b/tests/jobs/test_smt_engine_build_job.py
@@ -31,7 +31,7 @@ def test_run(decoy: Decoy) -> None:
 
     pretranslations = json.loads(env.target_pretranslations)
     assert len(pretranslations) == 1
-    assert pretranslations[0]["translation"] == "Please, I have booked a room."
+    assert pretranslations[0]["pretranslation"] == "Please, I have booked a room."
     decoy.verify(
         env.translation_file_service.save_model(matchers.Anything(), f"builds/{env.job._config.build_id}/model.zip"),
         times=1,
@@ -136,7 +136,10 @@ def __init__(self, decoy: Decoy) -> None:
                             corpusId="corpus1",
                             textId="text1",
                             refs=["ref1"],
-                            translation="Por favor, tengo reservada una habitación.",
+                            pretranslation="Por favor, tengo reservada una habitación.",
+                            source_toks=[],
+                            pretranslation_toks=[],
+                            alignment="",
                         )
                     ]
                 )

From c2615978c18c307d456eddc9acb756525694d663 Mon Sep 17 00:00:00 2001
From: Isaac Schifferer <isaac@schifferer.com>
Date: Wed, 9 Apr 2025 21:23:53 -0400
Subject: [PATCH 02/17] make dockerfiles compatible with eflomal

---
 .devcontainer/dockerfile | 3 ++-
 dockerfile               | 4 +++-
 dockerfile.cpu_only      | 2 ++
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/.devcontainer/dockerfile b/.devcontainer/dockerfile
index 7b1282fb..3a7837a0 100644
--- a/.devcontainer/dockerfile
+++ b/.devcontainer/dockerfile
@@ -20,9 +20,10 @@ RUN apt-get update && \
     apt-get install --no-install-recommends -y \
     python$PYTHON_VERSION \
     python$PYTHON_VERSION-distutils \
+    python$PYTHON_VERSION-dev \
     git vim curl gdb ca-certificates gnupg2 tar make gcc libssl-dev zlib1g-dev libncurses5-dev \
     libbz2-dev libreadline-dev libreadline6-dev libxml2-dev xz-utils libgdbm-dev libgdbm-compat-dev tk-dev dirmngr \
-    libxmlsec1-dev libsqlite3-dev libffi-dev liblzma-dev lzma lzma-dev uuid-dev python3.9-dev && \
+    libxmlsec1-dev libsqlite3-dev libffi-dev liblzma-dev lzma lzma-dev uuid-dev && \
     rm -rf /var/lib/apt/lists/*
 
 RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python$PYTHON_VERSION
diff --git a/dockerfile b/dockerfile
index 08f20a61..09f5937b 100755
--- a/dockerfile
+++ b/dockerfile
@@ -25,7 +25,7 @@ COPY poetry.lock pyproject.toml /src
 RUN poetry export --with=gpu --without-hashes -f requirements.txt > requirements.txt
 
 
-FROM nvidia/cuda:$CUDA_VERSION
+FROM python:$PYTHON_VERSION
 ARG PYTHON_VERSION
 
 ENV PIP_DISABLE_PIP_VERSION_CHECK=on
@@ -64,4 +64,6 @@ RUN --mount=type=cache,target=/root/.cache \
 RUN python -m pip install --no-deps . && rm -r /root/*
 ENV CLEARML_AGENT_SKIP_PYTHON_ENV_INSTALL=1
 
+ENV EFLOMAL_PATH=/usr/local/lib/python${PYTHON_VERSION}/site-packages/eflomal/bin
+
 CMD ["bash"]
diff --git a/dockerfile.cpu_only b/dockerfile.cpu_only
index cb41bb7f..aab45898 100755
--- a/dockerfile.cpu_only
+++ b/dockerfile.cpu_only
@@ -43,4 +43,6 @@ RUN --mount=type=cache,target=/root/.cache \
 RUN python -m pip install --no-deps . && rm -r /root/*
 ENV CLEARML_AGENT_SKIP_PYTHON_ENV_INSTALL=1
 
+ENV EFLOMAL_PATH=/usr/local/lib/python${PYTHON_VERSION}/site-packages/eflomal/bin
+
 CMD ["bash"]

From 21ebf239f1113f50a0591974b5735170e85ee8b3 Mon Sep 17 00:00:00 2001
From: Isaac Schifferer <isaac@schifferer.com>
Date: Wed, 9 Apr 2025 21:56:40 -0400
Subject: [PATCH 03/17] fix flake8 error

---
 machine/jobs/translation_engine_build_job.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/machine/jobs/translation_engine_build_job.py b/machine/jobs/translation_engine_build_job.py
index ec94eff3..21e4b45b 100644
--- a/machine/jobs/translation_engine_build_job.py
+++ b/machine/jobs/translation_engine_build_job.py
@@ -92,7 +92,8 @@ def _align(
 
         logger.info("Aligning source to pretranslations")
         with ExitStack() as stack:
-            phase_progress = stack.enter_context(progress_reporter.start_next_phase())
+            # phase_progress = stack.enter_context(progress_reporter.start_next_phase())
+            progress_reporter.start_next_phase()
 
             src_tokenized = [
                 tokenize(s["pretranslation"])

From 0e3a0d5cc0eecb63685ac9043b2f2f7885fa09cd Mon Sep 17 00:00:00 2001
From: Isaac Schifferer <isaac@schifferer.com>
Date: Fri, 11 Apr 2025 16:58:09 -0400
Subject: [PATCH 04/17] Only use eflomal on Linux and other small tweaks

---
 .devcontainer/dockerfile                     |  2 +-
 machine/jobs/build_nmt_engine.py             |  6 ++++-
 machine/jobs/eflomal_aligner.py              | 14 +++++++++---
 machine/jobs/translation_engine_build_job.py | 17 +++++---------
 poetry.lock                                  |  4 ++--
 pyproject.toml                               |  4 ++--
 tests/jobs/test_nmt_engine_build_job.py      | 24 +++++++++++++++++++-
 7 files changed, 50 insertions(+), 21 deletions(-)

diff --git a/.devcontainer/dockerfile b/.devcontainer/dockerfile
index 3a7837a0..06547678 100644
--- a/.devcontainer/dockerfile
+++ b/.devcontainer/dockerfile
@@ -40,6 +40,6 @@ RUN pip install -U pip setuptools \
 
 COPY ./.devcontainer/clearml.conf /root/clearml.conf
 
-ENV EFLOMAL_PATH=/workspaces/machine.py/.venv/lib/python3.9/site-packages/eflomal/bin
+ENV EFLOMAL_PATH=/workspaces/machine.py/.venv/lib/python${PYTHON_VERSION}/site-packages/eflomal/bin
 
 CMD ["bash"]
diff --git a/machine/jobs/build_nmt_engine.py b/machine/jobs/build_nmt_engine.py
index 2645598c..cd0e12ff 100644
--- a/machine/jobs/build_nmt_engine.py
+++ b/machine/jobs/build_nmt_engine.py
@@ -93,7 +93,11 @@ def main() -> None:
     parser.add_argument("--build-options", default=None, type=str, help="Build configurations")
     parser.add_argument("--save-model", default=None, type=str, help="Save the model using the specified base name")
     parser.add_argument(
-        "--align-pretranslations", default=False, action="store_true", help="Aligns source and target pretranslations"
+        "--align-pretranslations",
+        default=False,
+        action="store_true",
+        help="Aligns source and target pretranslations using Eflomal (linux only) "
+        "and returns the alignments as well as the tokenized source and target with the pretranslations.",
     )
     args = parser.parse_args()
 
diff --git a/machine/jobs/eflomal_aligner.py b/machine/jobs/eflomal_aligner.py
index 31559847..cccd6ca9 100644
--- a/machine/jobs/eflomal_aligner.py
+++ b/machine/jobs/eflomal_aligner.py
@@ -4,18 +4,26 @@
 import os
 import subprocess
 from contextlib import ExitStack
+from importlib.util import find_spec
 from math import sqrt
 from pathlib import Path
 from tempfile import TemporaryDirectory
 from typing import IO, Iterable, List, Sequence, Tuple
 
-from eflomal import read_text, write_text
-
 from ..corpora import AlignedWordPair
 from ..corpora.token_processors import escape_spaces, lowercase, normalize
 from ..tokenization import LatinWordTokenizer
 from ..translation import SymmetrizationHeuristic, WordAlignmentMatrix
 
+
+# From silnlp.common.package_utils
+def is_eflomal_available() -> bool:
+    return find_spec("eflomal") is not None
+
+
+if is_eflomal_available():
+    from eflomal import read_text, write_text
+
 # may have to make more dynamic, look at silnlp get_wsl_path, is there something equivalent in machine?
 EFLOMAL_PATH = Path(os.getenv("EFLOMAL_PATH", "."), "eflomal")
 TOKENIZER = LatinWordTokenizer()
@@ -29,7 +37,7 @@ def execute_eflomal(
     reverse_links_path: Path,
     n_iterations: Tuple[int, int, int],
 ) -> None:
-    if not EFLOMAL_PATH.is_file():
+    if not is_eflomal_available():
         raise RuntimeError("eflomal is not installed.")
 
     args = [
diff --git a/machine/jobs/translation_engine_build_job.py b/machine/jobs/translation_engine_build_job.py
index 21e4b45b..1c961cec 100644
--- a/machine/jobs/translation_engine_build_job.py
+++ b/machine/jobs/translation_engine_build_job.py
@@ -9,7 +9,7 @@
 from ..corpora.text_corpus import TextCorpus
 from ..utils.phased_progress_reporter import PhasedProgressReporter
 from ..utils.progress_status import ProgressStatus
-from .eflomal_aligner import EflomalAligner, tokenize
+from .eflomal_aligner import EflomalAligner, is_eflomal_available, tokenize
 from .translation_file_service import PretranslationInfo, TranslationFileService
 
 logger = logging.getLogger(__name__)
@@ -48,7 +48,7 @@ def run(
         logger.info("Pretranslating segments")
         self._batch_inference(progress_reporter, check_canceled)
 
-        if "align_pretranslations" in self._config and self._config.align_pretranslations:
+        if "align_pretranslations" in self._config and self._config.align_pretranslations and is_eflomal_available():
             logger.info("Aligning source to pretranslations")
             self._align(progress_reporter, check_canceled)
 
@@ -99,10 +99,10 @@ def _align(
                 tokenize(s["pretranslation"])
                 for s in stack.enter_context(self._translation_file_service.get_source_pretranslations())
             ]
-            trg_tokenized = [
-                tokenize(s["pretranslation"])
-                for s in stack.enter_context(self._translation_file_service.get_target_pretranslations())
+            trg_info = [
+                pt_info for pt_info in stack.enter_context(self._translation_file_service.get_target_pretranslations())
             ]
+            trg_tokenized = [tokenize(pt_info["pretranslation"]) for pt_info in trg_info]
 
             with TemporaryDirectory() as td:
                 aligner = EflomalAligner(Path(td))
@@ -119,12 +119,7 @@ def _align(
                 check_canceled()
 
             writer = stack.enter_context(self._translation_file_service.open_target_pretranslation_writer())
-            for trg_pi, src_toks, trg_toks, alignment in zip(
-                stack.enter_context(self._translation_file_service.get_target_pretranslations()),
-                src_tokenized,
-                trg_tokenized,
-                alignments,
-            ):
+            for trg_pi, src_toks, trg_toks, alignment in zip(trg_info, src_tokenized, trg_tokenized, alignments):
                 writer.write(
                     PretranslationInfo(
                         corpusId=trg_pi["corpusId"],
diff --git a/poetry.lock b/poetry.lock
index 5850ec36..4d8ded6e 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -4870,11 +4870,11 @@ type = ["pytest-mypy"]
 
 [extras]
 huggingface = ["datasets", "sacremoses", "transformers"]
-jobs = ["clearml", "dynaconf", "json-stream"]
+jobs = ["clearml", "dynaconf", "eflomal", "json-stream"]
 sentencepiece = ["sentencepiece"]
 thot = ["sil-thot"]
 
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9,<3.13"
-content-hash = "d939e11c2c341294ac8cbc3af9b1ae710b188e5ff4af4d39b614fb0a5726eddb"
+content-hash = "b650f3e8499b348a527c5e5f0e89ba90e55fb7df93bb907cc8d8e5fdd6b63cb0"
diff --git a/pyproject.toml b/pyproject.toml
index be9ac8fb..822c5ee5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -63,7 +63,6 @@ urllib3 = "<2"
 
 sentencepiece = "^0.2.0"
 sil-thot = "^3.4.6"
-eflomal = "^2.0.0"
 
 transformers = ">=4.38.0,<4.46"
 datasets = "^2.4.0"
@@ -74,6 +73,7 @@ botocore = "^1.35.41"
 boto3 = "^1.19.41"
 dynaconf = "^3.2.5"
 json-stream = "^1.3.0"
+eflomal = { markers = "sys_platform == 'linux'", version = "^2.0.0" }
 
 [tool.poetry.group.dev.dependencies]
 pytest = "^8.3.2"
@@ -96,7 +96,7 @@ accelerate = { version = "^0.26.1", markers = "sys_platform == 'win32' or sys_pl
 sentencepiece = ["sentencepiece"]
 thot = ["sil-thot"]
 huggingface = ["transformers", "datasets", "sacremoses"]
-jobs = ["clearml", "json-stream", "dynaconf"]
+jobs = ["clearml", "json-stream", "dynaconf", "eflomal"]
 
 [build-system]
 requires = ["poetry-core>=1.0.0"]
diff --git a/tests/jobs/test_nmt_engine_build_job.py b/tests/jobs/test_nmt_engine_build_job.py
index 15a38681..9dee57d0 100644
--- a/tests/jobs/test_nmt_engine_build_job.py
+++ b/tests/jobs/test_nmt_engine_build_job.py
@@ -17,6 +17,7 @@
     PretranslationInfo,
     TranslationFileService,
 )
+from machine.jobs.eflomal_aligner import is_eflomal_available
 from machine.translation import (
     Phrase,
     Trainer,
@@ -36,6 +37,19 @@ def test_run(decoy: Decoy) -> None:
     pretranslations = json.loads(env.target_pretranslations)
     assert len(pretranslations) == 1
     assert pretranslations[0]["pretranslation"] == "Please, I have booked a room."
+    if is_eflomal_available():
+        assert pretranslations[0]["source_toks"] == [
+            "por",
+            "favor",
+            ",",
+            "tengo",
+            "reservada",
+            "una",
+            "habitación",
+            ".",
+        ]
+        assert pretranslations[0]["pretranslation_toks"] == ["please", ",", "i", "have", "booked", "a", "room", "."]
+        assert len(pretranslations[0]["alignment"]) > 0
     decoy.verify(env.translation_file_service.save_model(Path("model.tar.gz"), "models/save-model.tar.gz"), times=1)
 
 
@@ -155,7 +169,15 @@ def open_target_pretranslation_writer(env: _TestEnvironment) -> Iterator[DictToJ
         )
 
         self.job = NmtEngineBuildJob(
-            MockSettings({"src_lang": "es", "trg_lang": "en", "save_model": "save-model", "inference_batch_size": 100}),
+            MockSettings(
+                {
+                    "src_lang": "es",
+                    "trg_lang": "en",
+                    "save_model": "save-model",
+                    "inference_batch_size": 100,
+                    "align_pretranslations": is_eflomal_available(),
+                }
+            ),
             self.nmt_model_factory,
             self.translation_file_service,
         )

From 05fe8ec01ce8000949ec769b8f4afcb62bfa2a3b Mon Sep 17 00:00:00 2001
From: Isaac Schifferer <isaac@schifferer.com>
Date: Fri, 11 Apr 2025 17:06:03 -0400
Subject: [PATCH 05/17] Adjust NMT engine test

---
 tests/jobs/test_nmt_engine_build_job.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tests/jobs/test_nmt_engine_build_job.py b/tests/jobs/test_nmt_engine_build_job.py
index 9dee57d0..112ca1ca 100644
--- a/tests/jobs/test_nmt_engine_build_job.py
+++ b/tests/jobs/test_nmt_engine_build_job.py
@@ -50,6 +50,10 @@ def test_run(decoy: Decoy) -> None:
         ]
         assert pretranslations[0]["pretranslation_toks"] == ["please", ",", "i", "have", "booked", "a", "room", "."]
         assert len(pretranslations[0]["alignment"]) > 0
+    else:
+        assert pretranslations[0]["source_toks"] == []
+        assert pretranslations[0]["pretranslation_toks"] == []
+        assert len(pretranslations[0]["alignment"]) == 0
     decoy.verify(env.translation_file_service.save_model(Path("model.tar.gz"), "models/save-model.tar.gz"), times=1)
 
 
@@ -175,7 +179,7 @@ def open_target_pretranslation_writer(env: _TestEnvironment) -> Iterator[DictToJ
                     "trg_lang": "en",
                     "save_model": "save-model",
                     "inference_batch_size": 100,
-                    "align_pretranslations": is_eflomal_available(),
+                    "align_pretranslations": True,
                 }
             ),
             self.nmt_model_factory,

From 4fd72fa781ba02d2665e8f8417aa08d3d9bbb6c9 Mon Sep 17 00:00:00 2001
From: Isaac Schifferer <isaac@schifferer.com>
Date: Fri, 11 Apr 2025 17:50:00 -0400
Subject: [PATCH 06/17] alternate eflomal check

---
 machine/jobs/eflomal_aligner.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/machine/jobs/eflomal_aligner.py b/machine/jobs/eflomal_aligner.py
index cccd6ca9..6333a3d2 100644
--- a/machine/jobs/eflomal_aligner.py
+++ b/machine/jobs/eflomal_aligner.py
@@ -4,7 +4,6 @@
 import os
 import subprocess
 from contextlib import ExitStack
-from importlib.util import find_spec
 from math import sqrt
 from pathlib import Path
 from tempfile import TemporaryDirectory
@@ -15,19 +14,19 @@
 from ..tokenization import LatinWordTokenizer
 from ..translation import SymmetrizationHeuristic, WordAlignmentMatrix
 
+# may have to make more dynamic, look at silnlp get_wsl_path, is there something equivalent in machine?
+EFLOMAL_PATH = Path(os.getenv("EFLOMAL_PATH", "."), "eflomal")
+TOKENIZER = LatinWordTokenizer()
+
 
 # From silnlp.common.package_utils
 def is_eflomal_available() -> bool:
-    return find_spec("eflomal") is not None
+    return EFLOMAL_PATH.is_file()
 
 
 if is_eflomal_available():
     from eflomal import read_text, write_text
 
-# may have to make more dynamic, look at silnlp get_wsl_path, is there something equivalent in machine?
-EFLOMAL_PATH = Path(os.getenv("EFLOMAL_PATH", "."), "eflomal")
-TOKENIZER = LatinWordTokenizer()
-
 
 # From silnlp.alignment.tools
 def execute_eflomal(

From 73794c47e66a6d75a69c8e11c77456fa1c94c761 Mon Sep 17 00:00:00 2001
From: Isaac Schifferer <isaac@schifferer.com>
Date: Mon, 14 Apr 2025 16:13:12 -0400
Subject: [PATCH 07/17] Alternate EFLOMAL_PATH, revert eflomal check change

---
 dockerfile                      |  4 +---
 dockerfile.cpu_only             |  2 +-
 machine/jobs/eflomal_aligner.py | 10 +++++-----
 3 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/dockerfile b/dockerfile
index 09f5937b..7a1c8c4b 100755
--- a/dockerfile
+++ b/dockerfile
@@ -1,9 +1,7 @@
 # syntax=docker/dockerfile:1.7-labs
-
 ARG PYTHON_VERSION=3.12
 ARG UBUNTU_VERSION=noble
 ARG POETRY_VERSION=1.6.1
-ARG CUDA_VERSION=12.6.1-base-ubuntu24.04
 
 FROM python:$PYTHON_VERSION-slim AS builder
 ARG POETRY_VERSION
@@ -64,6 +62,6 @@ RUN --mount=type=cache,target=/root/.cache \
 RUN python -m pip install --no-deps . && rm -r /root/*
 ENV CLEARML_AGENT_SKIP_PYTHON_ENV_INSTALL=1
 
-ENV EFLOMAL_PATH=/usr/local/lib/python${PYTHON_VERSION}/site-packages/eflomal/bin
+ENV EFLOMAL_PATH=/home/runner/work/machine.py/machine.py/.venv/lib/python${PYTHON_VERSION}/site-packages/eflomal/bin
 
 CMD ["bash"]
diff --git a/dockerfile.cpu_only b/dockerfile.cpu_only
index aab45898..a5c502e9 100755
--- a/dockerfile.cpu_only
+++ b/dockerfile.cpu_only
@@ -43,6 +43,6 @@ RUN --mount=type=cache,target=/root/.cache \
 RUN python -m pip install --no-deps . && rm -r /root/*
 ENV CLEARML_AGENT_SKIP_PYTHON_ENV_INSTALL=1
 
-ENV EFLOMAL_PATH=/usr/local/lib/python${PYTHON_VERSION}/site-packages/eflomal/bin
+ENV EFLOMAL_PATH=/home/runner/work/machine.py/machine.py/.venv/lib/python${PYTHON_VERSION}/site-packages/eflomal/bin
 
 CMD ["bash"]
diff --git a/machine/jobs/eflomal_aligner.py b/machine/jobs/eflomal_aligner.py
index 6333a3d2..6eed6952 100644
--- a/machine/jobs/eflomal_aligner.py
+++ b/machine/jobs/eflomal_aligner.py
@@ -4,6 +4,7 @@
 import os
 import subprocess
 from contextlib import ExitStack
+from importlib.util import find_spec
 from math import sqrt
 from pathlib import Path
 from tempfile import TemporaryDirectory
@@ -14,19 +15,18 @@
 from ..tokenization import LatinWordTokenizer
 from ..translation import SymmetrizationHeuristic, WordAlignmentMatrix
 
-# may have to make more dynamic, look at silnlp get_wsl_path, is there something equivalent in machine?
-EFLOMAL_PATH = Path(os.getenv("EFLOMAL_PATH", "."), "eflomal")
-TOKENIZER = LatinWordTokenizer()
-
 
 # From silnlp.common.package_utils
 def is_eflomal_available() -> bool:
-    return EFLOMAL_PATH.is_file()
+    return find_spec("eflomal") is not None
 
 
 if is_eflomal_available():
     from eflomal import read_text, write_text
 
+EFLOMAL_PATH = Path(os.getenv("EFLOMAL_PATH", "."), "eflomal")
+TOKENIZER = LatinWordTokenizer()
+
 
 # From silnlp.alignment.tools
 def execute_eflomal(

From 2c9db286e1d8a3b6b5fa510277ddd54ecf097bd0 Mon Sep 17 00:00:00 2001
From: Isaac Schifferer <isaac@schifferer.com>
Date: Mon, 14 Apr 2025 16:23:28 -0400
Subject: [PATCH 08/17] Make linter ignore conditional eflomal import

---
 machine/jobs/eflomal_aligner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/machine/jobs/eflomal_aligner.py b/machine/jobs/eflomal_aligner.py
index 6eed6952..551e55c8 100644
--- a/machine/jobs/eflomal_aligner.py
+++ b/machine/jobs/eflomal_aligner.py
@@ -22,7 +22,7 @@ def is_eflomal_available() -> bool:
 
 
 if is_eflomal_available():
-    from eflomal import read_text, write_text
+    from eflomal import read_text, write_text  # type: ignore
 
 EFLOMAL_PATH = Path(os.getenv("EFLOMAL_PATH", "."), "eflomal")
 TOKENIZER = LatinWordTokenizer()

From 52540172d9f089b91cba9ca992d9098d3e2dce85 Mon Sep 17 00:00:00 2001
From: Isaac Schifferer <isaac@schifferer.com>
Date: Mon, 14 Apr 2025 16:58:00 -0400
Subject: [PATCH 09/17] Attempt to fix EFLOMAL_PATH

---
 dockerfile          | 2 +-
 dockerfile.cpu_only | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/dockerfile b/dockerfile
index 7a1c8c4b..b5f512ef 100755
--- a/dockerfile
+++ b/dockerfile
@@ -62,6 +62,6 @@ RUN --mount=type=cache,target=/root/.cache \
 RUN python -m pip install --no-deps . && rm -r /root/*
 ENV CLEARML_AGENT_SKIP_PYTHON_ENV_INSTALL=1
 
-ENV EFLOMAL_PATH=/home/runner/work/machine.py/machine.py/.venv/lib/python${PYTHON_VERSION}/site-packages/eflomal/bin
+ENV EFLOMAL_PATH="/home/runner/work/machine.py/machine.py/.venv/lib/python${PYTHON_VERSION}/site-packages/eflomal/bin"
 
 CMD ["bash"]
diff --git a/dockerfile.cpu_only b/dockerfile.cpu_only
index a5c502e9..764ab6f1 100755
--- a/dockerfile.cpu_only
+++ b/dockerfile.cpu_only
@@ -43,6 +43,6 @@ RUN --mount=type=cache,target=/root/.cache \
 RUN python -m pip install --no-deps . && rm -r /root/*
 ENV CLEARML_AGENT_SKIP_PYTHON_ENV_INSTALL=1
 
-ENV EFLOMAL_PATH=/home/runner/work/machine.py/machine.py/.venv/lib/python${PYTHON_VERSION}/site-packages/eflomal/bin
+ENV EFLOMAL_PATH="/home/runner/work/machine.py/machine.py/.venv/lib/python${PYTHON_VERSION}/site-packages/eflomal/bin"
 
 CMD ["bash"]

From f45b59f4ae631a03b15eca20dcf78377a6e6cede Mon Sep 17 00:00:00 2001
From: Isaac Schifferer <isaac@schifferer.com>
Date: Mon, 14 Apr 2025 17:04:41 -0400
Subject: [PATCH 10/17] Eflomal sanity check

---
 machine/jobs/eflomal_aligner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/machine/jobs/eflomal_aligner.py b/machine/jobs/eflomal_aligner.py
index 551e55c8..c088cf22 100644
--- a/machine/jobs/eflomal_aligner.py
+++ b/machine/jobs/eflomal_aligner.py
@@ -24,7 +24,7 @@ def is_eflomal_available() -> bool:
 if is_eflomal_available():
     from eflomal import read_text, write_text  # type: ignore
 
-EFLOMAL_PATH = Path(os.getenv("EFLOMAL_PATH", "."), "eflomal")
+EFLOMAL_PATH = Path("/home/runner/work/machine.py/machine.py/.venv/lib/python3.12/site-packages/eflomal/bin", "eflomal")
 TOKENIZER = LatinWordTokenizer()
 
 

From dc6bb667b56447f4d446cf139e76b0f3f3a82542 Mon Sep 17 00:00:00 2001
From: Isaac Schifferer <isaac@schifferer.com>
Date: Mon, 14 Apr 2025 17:08:06 -0400
Subject: [PATCH 11/17] Eflomal sanity check take 2

---
 machine/jobs/eflomal_aligner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/machine/jobs/eflomal_aligner.py b/machine/jobs/eflomal_aligner.py
index c088cf22..054b0e4a 100644
--- a/machine/jobs/eflomal_aligner.py
+++ b/machine/jobs/eflomal_aligner.py
@@ -1,7 +1,7 @@
 # NOTE: this is a temporary solution to be able to use the eflomal aligner inside of machine.py.
 # The vast majority of this code is taken from the silnlp repository.
 
-import os
+# import os
 import subprocess
 from contextlib import ExitStack
 from importlib.util import find_spec

From c5b7916a68b42a8af8facd941e9c3c36451fa35e Mon Sep 17 00:00:00 2001
From: Isaac Schifferer <isaac@schifferer.com>
Date: Mon, 14 Apr 2025 17:31:33 -0400
Subject: [PATCH 12/17] Add EFLOMAL_PATH value to pytest step of CI workflow

---
 .github/workflows/ci.yml        | 2 ++
 machine/jobs/eflomal_aligner.py | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index d38f9aac..380c8e10 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -59,6 +59,8 @@ jobs:
           poetry run pyright
       - name: Test with pytest
         run: poetry run pytest --cov --cov-report=xml
+        env:
+          EFLOMAL_PATH: /home/runner/work/machine.py/machine.py/.venv/lib/python${{ matrix.python-version }}/site-packages/eflomal/bin
       - name: Upload coverage reports to Codecov
         uses: codecov/codecov-action@v4
         env:
diff --git a/machine/jobs/eflomal_aligner.py b/machine/jobs/eflomal_aligner.py
index 054b0e4a..551e55c8 100644
--- a/machine/jobs/eflomal_aligner.py
+++ b/machine/jobs/eflomal_aligner.py
@@ -1,7 +1,7 @@
 # NOTE: this is a temporary solution to be able to use the eflomal aligner inside of machine.py.
 # The vast majority of this code is taken from the silnlp repository.
 
-# import os
+import os
 import subprocess
 from contextlib import ExitStack
 from importlib.util import find_spec
@@ -24,7 +24,7 @@ def is_eflomal_available() -> bool:
 if is_eflomal_available():
     from eflomal import read_text, write_text  # type: ignore
 
-EFLOMAL_PATH = Path("/home/runner/work/machine.py/machine.py/.venv/lib/python3.12/site-packages/eflomal/bin", "eflomal")
+EFLOMAL_PATH = Path(os.getenv("EFLOMAL_PATH", "."), "eflomal")
 TOKENIZER = LatinWordTokenizer()
 
 

From b4b413847fd0c0e4733e7bfae79d67ce3ccc7a77 Mon Sep 17 00:00:00 2001
From: Isaac Schifferer <isaac@schifferer.com>
Date: Mon, 14 Apr 2025 17:50:59 -0400
Subject: [PATCH 13/17] Revert EFLOMAL_PATH values to regular Docker container
 paths

---
 dockerfile          | 2 +-
 dockerfile.cpu_only | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/dockerfile b/dockerfile
index b5f512ef..77e49bc0 100755
--- a/dockerfile
+++ b/dockerfile
@@ -62,6 +62,6 @@ RUN --mount=type=cache,target=/root/.cache \
 RUN python -m pip install --no-deps . && rm -r /root/*
 ENV CLEARML_AGENT_SKIP_PYTHON_ENV_INSTALL=1
 
-ENV EFLOMAL_PATH="/home/runner/work/machine.py/machine.py/.venv/lib/python${PYTHON_VERSION}/site-packages/eflomal/bin"
+ENV EFLOMAL_PATH=/usr/local/lib/python${PYTHON_VERSION}/site-packages/eflomal/bin
 
 CMD ["bash"]
diff --git a/dockerfile.cpu_only b/dockerfile.cpu_only
index 764ab6f1..aab45898 100755
--- a/dockerfile.cpu_only
+++ b/dockerfile.cpu_only
@@ -43,6 +43,6 @@ RUN --mount=type=cache,target=/root/.cache \
 RUN python -m pip install --no-deps . && rm -r /root/*
 ENV CLEARML_AGENT_SKIP_PYTHON_ENV_INSTALL=1
 
-ENV EFLOMAL_PATH="/home/runner/work/machine.py/machine.py/.venv/lib/python${PYTHON_VERSION}/site-packages/eflomal/bin"
+ENV EFLOMAL_PATH=/usr/local/lib/python${PYTHON_VERSION}/site-packages/eflomal/bin
 
 CMD ["bash"]

From 881a7954571a23f7bbd19da0821fb0dd559e4b09 Mon Sep 17 00:00:00 2001
From: Isaac Schifferer <isaac@schifferer.com>
Date: Tue, 15 Apr 2025 00:47:33 -0400
Subject: [PATCH 14/17] Only use normalized tokens inside of aligner

---
 machine/jobs/eflomal_aligner.py         | 11 +++++++++--
 tests/jobs/test_nmt_engine_build_job.py |  4 ++--
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/machine/jobs/eflomal_aligner.py b/machine/jobs/eflomal_aligner.py
index 551e55c8..0f5526b5 100644
--- a/machine/jobs/eflomal_aligner.py
+++ b/machine/jobs/eflomal_aligner.py
@@ -100,7 +100,11 @@ def prepare_files(
 
 
 def tokenize(sent: str) -> Sequence[str]:
-    return lowercase(normalize("NFC", escape_spaces(list(TOKENIZER.tokenize(sent)))))
+    return list(TOKENIZER.tokenize(sent))
+
+
+def normalize_for_alignment(sent: Sequence[str]) -> str:
+    return " ".join(lowercase(normalize("NFC", escape_spaces(sent))))
 
 
 # From silnlp.alignment.eflomal
@@ -118,7 +122,10 @@ def train(self, src_toks: Sequence[Sequence[str]], trg_toks: Sequence[Sequence[s
                 trg_output_file = stack.enter_context(trg_eflomal_path.open("wb"))
                 # Write input files for the eflomal binary
                 n_sentences = prepare_files(
-                    [" ".join(s) for s in src_toks], src_output_file, [" ".join(s) for s in trg_toks], trg_output_file
+                    [normalize_for_alignment(s) for s in src_toks],
+                    src_output_file,
+                    [normalize_for_alignment(s) for s in trg_toks],
+                    trg_output_file,
                 )
 
             iters = max(2, int(round(1.0 * 5000 / sqrt(n_sentences))))
diff --git a/tests/jobs/test_nmt_engine_build_job.py b/tests/jobs/test_nmt_engine_build_job.py
index 112ca1ca..29c50cc8 100644
--- a/tests/jobs/test_nmt_engine_build_job.py
+++ b/tests/jobs/test_nmt_engine_build_job.py
@@ -39,7 +39,7 @@ def test_run(decoy: Decoy) -> None:
     assert pretranslations[0]["pretranslation"] == "Please, I have booked a room."
     if is_eflomal_available():
         assert pretranslations[0]["source_toks"] == [
-            "por",
+            "Por",
             "favor",
             ",",
             "tengo",
@@ -48,7 +48,7 @@ def test_run(decoy: Decoy) -> None:
             "habitación",
             ".",
         ]
-        assert pretranslations[0]["pretranslation_toks"] == ["please", ",", "i", "have", "booked", "a", "room", "."]
+        assert pretranslations[0]["pretranslation_toks"] == ["Please", ",", "I", "have", "booked", "a", "room", "."]
         assert len(pretranslations[0]["alignment"]) > 0
     else:
         assert pretranslations[0]["source_toks"] == []

From 123263f0e8f7fdaa62961c17e421320327e7f210 Mon Sep 17 00:00:00 2001
From: Isaac Schifferer <isaac@schifferer.com>
Date: Wed, 23 Apr 2025 14:00:59 -0400
Subject: [PATCH 15/17] Move alignment config option to build options, revert
 to 'translation' in PretranslationInfo

---
 machine/jobs/build_nmt_engine.py             |  7 -------
 machine/jobs/nmt_engine_build_job.py         |  8 ++++----
 machine/jobs/settings.yaml                   |  1 +
 machine/jobs/smt_engine_build_job.py         |  4 ++--
 machine/jobs/translation_engine_build_job.py | 10 +++++-----
 machine/jobs/translation_file_service.py     |  8 ++++----
 tests/jobs/test_nmt_engine_build_job.py      | 14 +++++++-------
 tests/jobs/test_smt_engine_build_job.py      | 15 +++++++++++----
 8 files changed, 34 insertions(+), 33 deletions(-)

diff --git a/machine/jobs/build_nmt_engine.py b/machine/jobs/build_nmt_engine.py
index cd0e12ff..a047a7ba 100644
--- a/machine/jobs/build_nmt_engine.py
+++ b/machine/jobs/build_nmt_engine.py
@@ -92,13 +92,6 @@ def main() -> None:
     parser.add_argument("--clearml", default=False, action="store_true", help="Initializes a ClearML task")
     parser.add_argument("--build-options", default=None, type=str, help="Build configurations")
     parser.add_argument("--save-model", default=None, type=str, help="Save the model using the specified base name")
-    parser.add_argument(
-        "--align-pretranslations",
-        default=False,
-        action="store_true",
-        help="Aligns source and target pretranslations using Eflomal (linux only) "
-        "and returns the alignments as well as the tokenized source and target with the pretranslations.",
-    )
     args = parser.parse_args()
 
     run({k: v for k, v in vars(args).items() if v is not None})
diff --git a/machine/jobs/nmt_engine_build_job.py b/machine/jobs/nmt_engine_build_job.py
index d3674120..5d92aa06 100644
--- a/machine/jobs/nmt_engine_build_job.py
+++ b/machine/jobs/nmt_engine_build_job.py
@@ -28,7 +28,7 @@ def _get_progress_reporter(
         self, progress: Optional[Callable[[ProgressStatus], None]], corpus_size: int
     ) -> PhasedProgressReporter:
         if corpus_size > 0:
-            if "align_pretranslations" in self._config and self._config.align_pretranslations:
+            if self._config.align_pretranslations:
                 phases = [
                     Phase(message="Training NMT model", percentage=0.8),
                     Phase(message="Pretranslating segments", percentage=0.1),
@@ -40,7 +40,7 @@ def _get_progress_reporter(
                     Phase(message="Pretranslating segments", percentage=0.1),
                 ]
         else:
-            if "align_pretranslations" in self._config and self._config.align_pretranslations:
+            if self._config.align_pretranslations:
                 phases = [
                     Phase(message="Pretranslating segments", percentage=0.9),
                     Phase(message="Aligning segments", percentage=0.1, report_steps=False),
@@ -128,7 +128,7 @@ def _translate_batch(
     batch: Sequence[PretranslationInfo],
     writer: DictToJsonWriter,
 ) -> None:
-    source_segments = [pi["pretranslation"] for pi in batch]
+    source_segments = [pi["translation"] for pi in batch]
     for i, result in enumerate(engine.translate_batch(source_segments)):
-        batch[i]["pretranslation"] = result.translation
+        batch[i]["translation"] = result.translation
         writer.write(batch[i])
diff --git a/machine/jobs/settings.yaml b/machine/jobs/settings.yaml
index f937826d..cfc727c8 100644
--- a/machine/jobs/settings.yaml
+++ b/machine/jobs/settings.yaml
@@ -3,6 +3,7 @@ default:
   shared_file_uri: s3:/silnlp/
   shared_file_folder: production
   inference_batch_size: 1024
+  align_pretranslations: false
   huggingface:
     parent_model_name: facebook/nllb-200-distilled-1.3B
     train_params:
diff --git a/machine/jobs/smt_engine_build_job.py b/machine/jobs/smt_engine_build_job.py
index 35977c60..452810f8 100644
--- a/machine/jobs/smt_engine_build_job.py
+++ b/machine/jobs/smt_engine_build_job.py
@@ -107,7 +107,7 @@ def _translate_batch(
     batch: Sequence[PretranslationInfo],
     writer: DictToJsonWriter,
 ) -> None:
-    source_segments = [pi["pretranslation"] for pi in batch]
+    source_segments = [pi["translation"] for pi in batch]
     for i, result in enumerate(engine.translate_batch(source_segments)):
-        batch[i]["pretranslation"] = result.translation
+        batch[i]["translation"] = result.translation
         writer.write(batch[i])
diff --git a/machine/jobs/translation_engine_build_job.py b/machine/jobs/translation_engine_build_job.py
index 1c961cec..703490bb 100644
--- a/machine/jobs/translation_engine_build_job.py
+++ b/machine/jobs/translation_engine_build_job.py
@@ -48,7 +48,7 @@ def run(
         logger.info("Pretranslating segments")
         self._batch_inference(progress_reporter, check_canceled)
 
-        if "align_pretranslations" in self._config and self._config.align_pretranslations and is_eflomal_available():
+        if self._config.align_pretranslations and is_eflomal_available():
             logger.info("Aligning source to pretranslations")
             self._align(progress_reporter, check_canceled)
 
@@ -96,13 +96,13 @@ def _align(
             progress_reporter.start_next_phase()
 
             src_tokenized = [
-                tokenize(s["pretranslation"])
+                tokenize(s["translation"])
                 for s in stack.enter_context(self._translation_file_service.get_source_pretranslations())
             ]
             trg_info = [
                 pt_info for pt_info in stack.enter_context(self._translation_file_service.get_target_pretranslations())
             ]
-            trg_tokenized = [tokenize(pt_info["pretranslation"]) for pt_info in trg_info]
+            trg_tokenized = [tokenize(pt_info["translation"]) for pt_info in trg_info]
 
             with TemporaryDirectory() as td:
                 aligner = EflomalAligner(Path(td))
@@ -125,9 +125,9 @@ def _align(
                         corpusId=trg_pi["corpusId"],
                         textId=trg_pi["textId"],
                         refs=trg_pi["refs"],
-                        pretranslation=trg_pi["pretranslation"],
+                        translation=trg_pi["translation"],
                         source_toks=list(src_toks),
-                        pretranslation_toks=list(trg_toks),
+                        translation_toks=list(trg_toks),
                         alignment=alignment,
                     )
                 )
diff --git a/machine/jobs/translation_file_service.py b/machine/jobs/translation_file_service.py
index a6942ab4..a8e8d513 100644
--- a/machine/jobs/translation_file_service.py
+++ b/machine/jobs/translation_file_service.py
@@ -15,9 +15,9 @@ class PretranslationInfo(TypedDict):
     corpusId: str  # noqa: N815
     textId: str  # noqa: N815
     refs: List[str]
-    pretranslation: str
+    translation: str
     source_toks: List[str]
-    pretranslation_toks: List[str]
+    translation_toks: List[str]
     alignment: str
 
 
@@ -62,9 +62,9 @@ def generator() -> Generator[PretranslationInfo, None, None]:
                         corpusId=pi["corpusId"],
                         textId=pi["textId"],
                         refs=list(pi["refs"]),
-                        pretranslation=pi["pretranslation"],
+                        translation=pi["translation"],
                         source_toks=list(pi["source_toks"]),
-                        pretranslation_toks=list(pi["pretranslation_toks"]),
+                        translation_toks=list(pi["translation_toks"]),
                         alignment=pi["alignment"],
                     )
 
diff --git a/tests/jobs/test_nmt_engine_build_job.py b/tests/jobs/test_nmt_engine_build_job.py
index 29c50cc8..323d0901 100644
--- a/tests/jobs/test_nmt_engine_build_job.py
+++ b/tests/jobs/test_nmt_engine_build_job.py
@@ -36,7 +36,7 @@ def test_run(decoy: Decoy) -> None:
 
     pretranslations = json.loads(env.target_pretranslations)
     assert len(pretranslations) == 1
-    assert pretranslations[0]["pretranslation"] == "Please, I have booked a room."
+    assert pretranslations[0]["translation"] == "Please, I have booked a room."
     if is_eflomal_available():
         assert pretranslations[0]["source_toks"] == [
             "Por",
@@ -48,11 +48,11 @@ def test_run(decoy: Decoy) -> None:
             "habitación",
             ".",
         ]
-        assert pretranslations[0]["pretranslation_toks"] == ["Please", ",", "I", "have", "booked", "a", "room", "."]
+        assert pretranslations[0]["translation_toks"] == ["Please", ",", "I", "have", "booked", "a", "room", "."]
         assert len(pretranslations[0]["alignment"]) > 0
     else:
         assert pretranslations[0]["source_toks"] == []
-        assert pretranslations[0]["pretranslation_toks"] == []
+        assert pretranslations[0]["translation_toks"] == []
         assert len(pretranslations[0]["alignment"]) == 0
     decoy.verify(env.translation_file_service.save_model(Path("model.tar.gz"), "models/save-model.tar.gz"), times=1)
 
@@ -130,9 +130,9 @@ def __init__(self, decoy: Decoy) -> None:
                             corpusId="corpus1",
                             textId="text1",
                             refs=["ref1"],
-                            pretranslation="Por favor, tengo reservada una habitación.",
+                            translation="Por favor, tengo reservada una habitación.",
                             source_toks=[],
-                            pretranslation_toks=[],
+                            translation_toks=[],
                             alignment="",
                         )
                     ]
@@ -148,9 +148,9 @@ def __init__(self, decoy: Decoy) -> None:
                             corpusId="corpus1",
                             textId="text1",
                             refs=["ref1"],
-                            pretranslation="Please, I have booked a room.",
+                            translation="Please, I have booked a room.",
                             source_toks=[],
-                            pretranslation_toks=[],
+                            translation_toks=[],
                             alignment="",
                         )
                     ]
diff --git a/tests/jobs/test_smt_engine_build_job.py b/tests/jobs/test_smt_engine_build_job.py
index 51def81e..16afcacf 100644
--- a/tests/jobs/test_smt_engine_build_job.py
+++ b/tests/jobs/test_smt_engine_build_job.py
@@ -31,7 +31,7 @@ def test_run(decoy: Decoy) -> None:
 
     pretranslations = json.loads(env.target_pretranslations)
     assert len(pretranslations) == 1
-    assert pretranslations[0]["pretranslation"] == "Please, I have booked a room."
+    assert pretranslations[0]["translation"] == "Please, I have booked a room."
     decoy.verify(
         env.translation_file_service.save_model(matchers.Anything(), f"builds/{env.job._config.build_id}/model.zip"),
         times=1,
@@ -136,9 +136,9 @@ def __init__(self, decoy: Decoy) -> None:
                             corpusId="corpus1",
                             textId="text1",
                             refs=["ref1"],
-                            pretranslation="Por favor, tengo reservada una habitación.",
+                            translation="Por favor, tengo reservada una habitación.",
                             source_toks=[],
-                            pretranslation_toks=[],
+                            translation_toks=[],
                             alignment="",
                         )
                     ]
@@ -161,7 +161,14 @@ def open_target_pretranslation_writer(env: _TestEnvironment) -> Iterator[DictToJ
         )
 
         self.job = SmtEngineBuildJob(
-            MockSettings({"build_id": "mybuild", "inference_batch_size": 100, "thot_mt": {"tokenizer": "latin"}}),
+            MockSettings(
+                {
+                    "build_id": "mybuild",
+                    "inference_batch_size": 100,
+                    "thot_mt": {"tokenizer": "latin"},
+                    "align_pretranslations": False,
+                }
+            ),
             self.smt_model_factory,
             self.translation_file_service,
         )

From 3774258723ac54e8d07a0a2a65c76ba137792eb2 Mon Sep 17 00:00:00 2001
From: Isaac Schifferer <isaac@schifferer.com>
Date: Wed, 23 Apr 2025 18:03:27 -0400
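Subject: [PATCH 16/17] Refactor to do alignment during the inference step

Move the optional Eflomal alignment out of
TranslationEngineBuildJob.run() and into
NmtEngineBuildJob._batch_inference(). The source pretranslations are
now read into memory, translated in batches, aligned when
align_pretranslations is set and Eflomal is available, and only then
written to the target pretranslation file, so the target file no longer
has to be read back for alignment.

TranslationFileService.get_target_pretranslations() and its test stub
are removed as part of this change.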
---
 machine/jobs/nmt_engine_build_job.py         | 61 ++++++++++++++++++--
 machine/jobs/translation_engine_build_job.py | 60 +------------------
 machine/jobs/translation_file_service.py     | 14 ++---
 tests/jobs/test_nmt_engine_build_job.py      | 18 ------
 4 files changed, 62 insertions(+), 91 deletions(-)

diff --git a/machine/jobs/nmt_engine_build_job.py b/machine/jobs/nmt_engine_build_job.py
index 5d92aa06..1244fea8 100644
--- a/machine/jobs/nmt_engine_build_job.py
+++ b/machine/jobs/nmt_engine_build_job.py
@@ -1,5 +1,7 @@
 import logging
 from contextlib import ExitStack
+from pathlib import Path
+from tempfile import TemporaryDirectory
 from typing import Any, Callable, Optional, Sequence, Tuple
 
 from ..corpora.corpora_utils import batch
@@ -8,6 +10,7 @@
 from ..translation.translation_engine import TranslationEngine
 from ..utils.phased_progress_reporter import Phase, PhasedProgressReporter
 from ..utils.progress_status import ProgressStatus
+from .eflomal_aligner import EflomalAligner, is_eflomal_available, tokenize
 from .nmt_model_factory import NmtModelFactory
 from .shared_file_service_base import DictToJsonWriter
 from .translation_engine_build_job import TranslationEngineBuildJob
@@ -102,18 +105,66 @@ def _batch_inference(
         with ExitStack() as stack:
             phase_progress = stack.enter_context(progress_reporter.start_next_phase())
             engine = stack.enter_context(self._nmt_model_factory.create_engine())
-            src_pretranslations = stack.enter_context(self._translation_file_service.get_source_pretranslations())
-            writer = stack.enter_context(self._translation_file_service.open_target_pretranslation_writer())
+            pretranslations = [
+                pt_info for pt_info in stack.enter_context(self._translation_file_service.get_source_pretranslations())
+            ]
+            src_segments = [pt_info["translation"] for pt_info in pretranslations]
             current_inference_step = 0
             phase_progress(ProgressStatus.from_step(current_inference_step, inference_step_count))
             batch_size = self._config["inference_batch_size"]
-            for pi_batch in batch(src_pretranslations, batch_size):
+            for seg_batch in batch(iter(src_segments), batch_size):
                 if check_canceled is not None:
                     check_canceled()
-                _translate_batch(engine, pi_batch, writer)
-                current_inference_step += len(pi_batch)
+                for i, result in enumerate(engine.translate_batch(seg_batch)):
+                    pretranslations[current_inference_step + i]["translation"] = result.translation
+                current_inference_step += len(seg_batch)
                 phase_progress(ProgressStatus.from_step(current_inference_step, inference_step_count))
 
+            if self._config.align_pretranslations and is_eflomal_available():
+                logger.info("Aligning source to pretranslations")
+                pretranslations = self._align(src_segments, pretranslations, progress_reporter, check_canceled)
+
+            writer = stack.enter_context(self._translation_file_service.open_target_pretranslation_writer())
+            for pretranslation in pretranslations:
+                writer.write(pretranslation)
+
+    def _align(
+        self,
+        src_segments: Sequence[str],
+        pretranslations: Sequence[PretranslationInfo],
+        progress_reporter: PhasedProgressReporter,
+        check_canceled: Optional[Callable[[], None]],
+    ) -> Sequence[PretranslationInfo]:
+        if check_canceled is not None:
+            check_canceled()
+
+        logger.info("Aligning source to pretranslations")
+        progress_reporter.start_next_phase()
+
+        src_tokenized = [tokenize(s) for s in src_segments]
+        trg_tokenized = [tokenize(pt_info["translation"]) for pt_info in pretranslations]
+
+        with TemporaryDirectory() as td:
+            aligner = EflomalAligner(Path(td))
+            logger.info("Training aligner")
+            aligner.train(src_tokenized, trg_tokenized)
+
+            if check_canceled is not None:
+                check_canceled()
+
+            logger.info("Aligning pretranslations")
+            alignments = aligner.align()
+
+        if check_canceled is not None:
+            check_canceled()
+
+        for i in range(len(pretranslations)):
+            pretranslations[i]["source_toks"] = list(src_tokenized[i])
+            pretranslations[i]["translation_toks"] = list(trg_tokenized[i])
+            pretranslations[i]["alignment"] = alignments[i]
+
+        return pretranslations
+
     def _save_model(self) -> None:
         if "save_model" in self._config and self._config.save_model is not None:
             logger.info("Saving model")
diff --git a/machine/jobs/translation_engine_build_job.py b/machine/jobs/translation_engine_build_job.py
index 703490bb..7effa62f 100644
--- a/machine/jobs/translation_engine_build_job.py
+++ b/machine/jobs/translation_engine_build_job.py
@@ -1,16 +1,12 @@
 import logging
 from abc import ABC, abstractmethod
-from contextlib import ExitStack
-from pathlib import Path
-from tempfile import TemporaryDirectory
 from typing import Any, Callable, Optional, Tuple
 
 from ..corpora.parallel_text_corpus import ParallelTextCorpus
 from ..corpora.text_corpus import TextCorpus
 from ..utils.phased_progress_reporter import PhasedProgressReporter
 from ..utils.progress_status import ProgressStatus
-from .eflomal_aligner import EflomalAligner, is_eflomal_available, tokenize
-from .translation_file_service import PretranslationInfo, TranslationFileService
+from .translation_file_service import TranslationFileService
 
 logger = logging.getLogger(__name__)
 
@@ -48,10 +44,6 @@ def run(
         logger.info("Pretranslating segments")
         self._batch_inference(progress_reporter, check_canceled)
 
-        if self._config.align_pretranslations and is_eflomal_available():
-            logger.info("Aligning source to pretranslations")
-            self._align(progress_reporter, check_canceled)
-
         self._save_model()
         return train_corpus_size, confidence
 
@@ -82,55 +74,5 @@ def _batch_inference(
         check_canceled: Optional[Callable[[], None]],
     ) -> None: ...
 
-    def _align(
-        self,
-        progress_reporter: PhasedProgressReporter,
-        check_canceled: Optional[Callable[[], None]],
-    ) -> None:
-        if check_canceled is not None:
-            check_canceled()
-
-        logger.info("Aligning source to pretranslations")
-        with ExitStack() as stack:
-            # phase_progress = stack.enter_context(progress_reporter.start_next_phase())
-            progress_reporter.start_next_phase()
-
-            src_tokenized = [
-                tokenize(s["translation"])
-                for s in stack.enter_context(self._translation_file_service.get_source_pretranslations())
-            ]
-            trg_info = [
-                pt_info for pt_info in stack.enter_context(self._translation_file_service.get_target_pretranslations())
-            ]
-            trg_tokenized = [tokenize(pt_info["translation"]) for pt_info in trg_info]
-
-            with TemporaryDirectory() as td:
-                aligner = EflomalAligner(Path(td))
-                logger.info("Training aligner")
-                aligner.train(src_tokenized, trg_tokenized)
-
-                if check_canceled is not None:
-                    check_canceled()
-
-                logger.info("Aligning pretranslations")
-                alignments = aligner.align()
-
-            if check_canceled is not None:
-                check_canceled()
-
-            writer = stack.enter_context(self._translation_file_service.open_target_pretranslation_writer())
-            for trg_pi, src_toks, trg_toks, alignment in zip(trg_info, src_tokenized, trg_tokenized, alignments):
-                writer.write(
-                    PretranslationInfo(
-                        corpusId=trg_pi["corpusId"],
-                        textId=trg_pi["textId"],
-                        refs=trg_pi["refs"],
-                        translation=trg_pi["translation"],
-                        source_toks=list(src_toks),
-                        translation_toks=list(trg_toks),
-                        alignment=alignment,
-                    )
-                )
-
     @abstractmethod
     def _save_model(self) -> None: ...
diff --git a/machine/jobs/translation_file_service.py b/machine/jobs/translation_file_service.py
index a8e8d513..54c4ae90 100644
--- a/machine/jobs/translation_file_service.py
+++ b/machine/jobs/translation_file_service.py
@@ -52,11 +52,13 @@ def exists_source_corpus(self) -> bool:
     def exists_target_corpus(self) -> bool:
         return self.shared_file_service._exists_file(f"{self.shared_file_service.build_path}/{TARGET_FILENAME}")
 
-    def _get_pretranslations(self, filename: str) -> ContextManagedGenerator[PretranslationInfo, None, None]:
-        pretranslate_path = self.shared_file_service.download_file(f"{self.shared_file_service.build_path}/{filename}")
+    def get_source_pretranslations(self) -> ContextManagedGenerator[PretranslationInfo, None, None]:
+        src_pretranslate_path = self.shared_file_service.download_file(
+            f"{self.shared_file_service.build_path}/{SOURCE_PRETRANSLATION_FILENAME}"
+        )
 
         def generator() -> Generator[PretranslationInfo, None, None]:
-            with pretranslate_path.open("r", encoding="utf-8-sig") as file:
+            with src_pretranslate_path.open("r", encoding="utf-8-sig") as file:
                 for pi in json_stream.load(file):
                     yield PretranslationInfo(
                         corpusId=pi["corpusId"],
@@ -70,12 +72,6 @@ def generator() -> Generator[PretranslationInfo, None, None]:
 
         return ContextManagedGenerator(generator())
 
-    def get_source_pretranslations(self) -> ContextManagedGenerator[PretranslationInfo, None, None]:
-        return self._get_pretranslations(SOURCE_PRETRANSLATION_FILENAME)
-
-    def get_target_pretranslations(self) -> ContextManagedGenerator[PretranslationInfo, None, None]:
-        return self._get_pretranslations(TARGET_PRETRANSLATION_FILENAME)
-
     def save_model(self, model_path: Path, destination: str) -> None:
         self.shared_file_service.upload_path(model_path, destination)
 
diff --git a/tests/jobs/test_nmt_engine_build_job.py b/tests/jobs/test_nmt_engine_build_job.py
index 323d0901..227b909b 100644
--- a/tests/jobs/test_nmt_engine_build_job.py
+++ b/tests/jobs/test_nmt_engine_build_job.py
@@ -139,24 +139,6 @@ def __init__(self, decoy: Decoy) -> None:
                 )
             )
         )
-        decoy.when(self.translation_file_service.get_target_pretranslations()).then_do(
-            lambda: ContextManagedGenerator(
-                (
-                    pi
-                    for pi in [
-                        PretranslationInfo(
-                            corpusId="corpus1",
-                            textId="text1",
-                            refs=["ref1"],
-                            translation="Please, I have booked a room.",
-                            source_toks=[],
-                            translation_toks=[],
-                            alignment="",
-                        )
-                    ]
-                )
-            )
-        )
 
         self.target_pretranslations = ""
 

From 2556455610fef3ebae9fa03158c6c246542167da Mon Sep 17 00:00:00 2001
From: Isaac Schifferer <isaac@schifferer.com>
Date: Thu, 24 Apr 2025 17:53:47 -0400
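Subject: [PATCH 17/17] Remove unused function

Delete the module-level _translate_batch() helper from
machine/jobs/nmt_engine_build_job.py; it is no longer called now that
translation happens inline in _batch_inference(). Also drop the
TranslationEngine and DictToJsonWriter imports, which are no longer
needed.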
---
 machine/jobs/nmt_engine_build_job.py | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/machine/jobs/nmt_engine_build_job.py b/machine/jobs/nmt_engine_build_job.py
index 1244fea8..b7b2afbc 100644
--- a/machine/jobs/nmt_engine_build_job.py
+++ b/machine/jobs/nmt_engine_build_job.py
@@ -7,12 +7,10 @@
 from ..corpora.corpora_utils import batch
 from ..corpora.parallel_text_corpus import ParallelTextCorpus
 from ..corpora.text_corpus import TextCorpus
-from ..translation.translation_engine import TranslationEngine
 from ..utils.phased_progress_reporter import Phase, PhasedProgressReporter
 from ..utils.progress_status import ProgressStatus
 from .eflomal_aligner import EflomalAligner, is_eflomal_available, tokenize
 from .nmt_model_factory import NmtModelFactory
-from .shared_file_service_base import DictToJsonWriter
 from .translation_engine_build_job import TranslationEngineBuildJob
 from .translation_file_service import PretranslationInfo, TranslationFileService
 
@@ -172,14 +170,3 @@ def _save_model(self) -> None:
             self._translation_file_service.save_model(
                 model_path, f"models/{self._config.save_model + ''.join(model_path.suffixes)}"
             )
-
-
-def _translate_batch(
-    engine: TranslationEngine,
-    batch: Sequence[PretranslationInfo],
-    writer: DictToJsonWriter,
-) -> None:
-    source_segments = [pi["translation"] for pi in batch]
-    for i, result in enumerate(engine.translate_batch(source_segments)):
-        batch[i]["translation"] = result.translation
-        writer.write(batch[i])