From 84dae8e9f9abc9a5f58d4d6ce10918cd27c523c1 Mon Sep 17 00:00:00 2001 From: Isaac Schifferer Date: Mon, 7 Apr 2025 14:51:59 -0400 Subject: [PATCH 01/17] Add optional alignment step to nmt jobs, temporary implementation of eflomal --- .devcontainer/dockerfile | 4 +- machine/jobs/build_nmt_engine.py | 3 + machine/jobs/eflomal_aligner.py | 153 +++++++++++++++++++ machine/jobs/nmt_engine_build_job.py | 27 +++- machine/jobs/smt_engine_build_job.py | 4 +- machine/jobs/translation_engine_build_job.py | 64 +++++++- machine/jobs/translation_file_service.py | 24 ++- poetry.lock | 92 ++++++++++- pyproject.toml | 1 + tests/jobs/test_nmt_engine_build_job.py | 25 ++- tests/jobs/test_smt_engine_build_job.py | 7 +- 11 files changed, 381 insertions(+), 23 deletions(-) create mode 100644 machine/jobs/eflomal_aligner.py diff --git a/.devcontainer/dockerfile b/.devcontainer/dockerfile index 6212c64c..7b1282fb 100644 --- a/.devcontainer/dockerfile +++ b/.devcontainer/dockerfile @@ -22,7 +22,7 @@ RUN apt-get update && \ python$PYTHON_VERSION-distutils \ git vim curl gdb ca-certificates gnupg2 tar make gcc libssl-dev zlib1g-dev libncurses5-dev \ libbz2-dev libreadline-dev libreadline6-dev libxml2-dev xz-utils libgdbm-dev libgdbm-compat-dev tk-dev dirmngr \ - libxmlsec1-dev libsqlite3-dev libffi-dev liblzma-dev lzma lzma-dev uuid-dev && \ + libxmlsec1-dev libsqlite3-dev libffi-dev liblzma-dev lzma lzma-dev uuid-dev python3.9-dev && \ rm -rf /var/lib/apt/lists/* RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python$PYTHON_VERSION @@ -39,4 +39,6 @@ RUN pip install -U pip setuptools \ COPY ./.devcontainer/clearml.conf /root/clearml.conf +ENV EFLOMAL_PATH=/workspaces/machine.py/.venv/lib/python3.9/site-packages/eflomal/bin + CMD ["bash"] diff --git a/machine/jobs/build_nmt_engine.py b/machine/jobs/build_nmt_engine.py index a047a7ba..2645598c 100644 --- a/machine/jobs/build_nmt_engine.py +++ b/machine/jobs/build_nmt_engine.py @@ -92,6 +92,9 @@ def main() -> None: parser.add_argument("--clearml", default=False, action="store_true", help="Initializes a ClearML task") parser.add_argument("--build-options", default=None, type=str, help="Build configurations") parser.add_argument("--save-model", default=None, type=str, help="Save the model using the specified base name") + parser.add_argument( + "--align-pretranslations", default=False, action="store_true", help="Aligns source and target pretranslations" + ) args = parser.parse_args() run({k: v for k, v in vars(args).items() if v is not None}) diff --git a/machine/jobs/eflomal_aligner.py b/machine/jobs/eflomal_aligner.py new file mode 100644 index 00000000..31559847 --- /dev/null +++ b/machine/jobs/eflomal_aligner.py @@ -0,0 +1,153 @@ +# NOTE: this is a temporary solution to be able to use the eflomal aligner inside of machine.py. +# The vast majority of this code is taken from the silnlp repository. + +import os +import subprocess +from contextlib import ExitStack +from math import sqrt +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import IO, Iterable, List, Sequence, Tuple + +from eflomal import read_text, write_text + +from ..corpora import AlignedWordPair +from ..corpora.token_processors import escape_spaces, lowercase, normalize +from ..tokenization import LatinWordTokenizer +from ..translation import SymmetrizationHeuristic, WordAlignmentMatrix + +# may have to make more dynamic, look at silnlp get_wsl_path, is there something equivalent in machine? +EFLOMAL_PATH = Path(os.getenv("EFLOMAL_PATH", "."), "eflomal") +TOKENIZER = LatinWordTokenizer() + + +# From silnlp.alignment.tools +def execute_eflomal( + source_path: Path, + target_path: Path, + forward_links_path: Path, + reverse_links_path: Path, + n_iterations: Tuple[int, int, int], +) -> None: + if not EFLOMAL_PATH.is_file(): + raise RuntimeError("eflomal is not installed.") + + args = [ + str(EFLOMAL_PATH), + "-s", + str(source_path), + "-t", + str(target_path), + "-f", + str(forward_links_path), + "-r", + str(reverse_links_path), + # "-q", + "-m", + "3", + "-n", + "3", + "-N", + "0.2", + "-1", + str(n_iterations[0]), + "-2", + str(n_iterations[1]), + "-3", + str(n_iterations[2]), + ] + subprocess.run(args, stderr=subprocess.DEVNULL) + + +# From silnlp.alignment.eflomal +def to_word_alignment_matrix(alignment_str: str) -> WordAlignmentMatrix: + word_pairs = AlignedWordPair.from_string(alignment_str) + row_count = 0 + column_count = 0 + for pair in word_pairs: + if pair.source_index + 1 > row_count: + row_count = pair.source_index + 1 + if pair.target_index + 1 > column_count: + column_count = pair.target_index + 1 + return WordAlignmentMatrix.from_word_pairs(row_count, column_count, word_pairs) + + +# From silnlp.alignment.eflomal +def to_eflomal_text_file(input: Iterable[str], output_file: IO[bytes], prefix_len: int = 0, suffix_len: int = 0) -> int: + sents, index = read_text(input, True, prefix_len, suffix_len) + n_sents = len(sents) + voc_size = len(index) + write_text(output_file, tuple(sents), voc_size) + return n_sents + + +# From silnlp.alignment.eflomal +def prepare_files( + src_input: Iterable[str], src_output_file: IO[bytes], trg_input: Iterable[str], trg_output_file: IO[bytes] +) -> int: + n_src_sents = to_eflomal_text_file(src_input, src_output_file) + n_trg_sents = to_eflomal_text_file(trg_input, trg_output_file) + if n_src_sents != n_trg_sents: + raise ValueError("Mismatched file sizes") + return n_src_sents + + +def tokenize(sent: str) -> Sequence[str]: + return lowercase(normalize("NFC", escape_spaces(list(TOKENIZER.tokenize(sent))))) + + +# From silnlp.alignment.eflomal +class EflomalAligner: + def __init__(self, model_dir: Path) -> None: + self._model_dir = model_dir + + def train(self, src_toks: Sequence[Sequence[str]], trg_toks: Sequence[Sequence[str]]) -> None: + self._model_dir.mkdir(exist_ok=True) + with TemporaryDirectory() as temp_dir: + src_eflomal_path = Path(temp_dir, "source") + trg_eflomal_path = Path(temp_dir, "target") + with ExitStack() as stack: + src_output_file = stack.enter_context(src_eflomal_path.open("wb")) + trg_output_file = stack.enter_context(trg_eflomal_path.open("wb")) + # Write input files for the eflomal binary + n_sentences = prepare_files( + [" ".join(s) for s in src_toks], src_output_file, [" ".join(s) for s in trg_toks], trg_output_file + ) + + iters = max(2, int(round(1.0 * 5000 / sqrt(n_sentences)))) + iters4 = max(1, iters // 4) + n_iterations = (max(2, iters4), iters4, iters) + + # Run wrapper for the eflomal binary + execute_eflomal( + src_eflomal_path, + trg_eflomal_path, + self._model_dir / "forward-align.txt", + self._model_dir / "reverse-align.txt", + n_iterations, + ) + + def align(self, sym_heuristic: str = "grow-diag-final-and") -> List[str]: + forward_align_path = self._model_dir / "forward-align.txt" + reverse_align_path = self._model_dir / "reverse-align.txt" + + alignments = [] + heuristic = SymmetrizationHeuristic[sym_heuristic.upper().replace("-", "_")] + with ExitStack() as stack: + forward_file = stack.enter_context(forward_align_path.open("r", encoding="utf-8-sig")) + reverse_file = stack.enter_context(reverse_align_path.open("r", encoding="utf-8-sig")) + + for forward_line, reverse_line in zip(forward_file, reverse_file): + forward_matrix = to_word_alignment_matrix(forward_line.strip()) + reverse_matrix = to_word_alignment_matrix(reverse_line.strip()) + src_len = max(forward_matrix.row_count, reverse_matrix.row_count) + trg_len = max(forward_matrix.column_count, reverse_matrix.column_count) + + forward_matrix.resize(src_len, trg_len) + reverse_matrix.resize(src_len, trg_len) + + forward_matrix.symmetrize_with(reverse_matrix, heuristic) + + alignments.append(str(forward_matrix)) + + return alignments diff --git a/machine/jobs/nmt_engine_build_job.py b/machine/jobs/nmt_engine_build_job.py index 1ff719a4..d3674120 100644 --- a/machine/jobs/nmt_engine_build_job.py +++ b/machine/jobs/nmt_engine_build_job.py @@ -28,12 +28,25 @@ def _get_progress_reporter( self, progress: Optional[Callable[[ProgressStatus], None]], corpus_size: int ) -> PhasedProgressReporter: if corpus_size > 0: - phases = [ - Phase(message="Training NMT model", percentage=0.9), - Phase(message="Pretranslating segments", percentage=0.1), - ] + if "align_pretranslations" in self._config and self._config.align_pretranslations: + phases = [ + Phase(message="Training NMT model", percentage=0.8), + Phase(message="Pretranslating segments", percentage=0.1), + Phase(message="Aligning segments", percentage=0.1, report_steps=False), + ] + else: + phases = [ + Phase(message="Training NMT model", percentage=0.9), + Phase(message="Pretranslating segments", percentage=0.1), + ] else: - phases = [Phase(message="Pretranslating segments", percentage=1.0)] + if "align_pretranslations" in self._config and self._config.align_pretranslations: + phases = [ + Phase(message="Pretranslating segments", percentage=0.9), + Phase(message="Aligning segments", percentage=0.1, report_steps=False), + ] + else: + phases = [Phase(message="Pretranslating segments", percentage=1.0)] return PhasedProgressReporter(progress, phases) def _respond_to_no_training_corpus(self) -> Tuple[int, float]: @@ -115,7 +128,7 @@ def _translate_batch( batch: Sequence[PretranslationInfo], writer: DictToJsonWriter, ) -> None: - source_segments = [pi["translation"] for pi in batch] + source_segments = [pi["pretranslation"] for pi in batch] for i, result in enumerate(engine.translate_batch(source_segments)): - batch[i]["translation"] = result.translation + batch[i]["pretranslation"] = result.translation writer.write(batch[i]) diff --git a/machine/jobs/smt_engine_build_job.py b/machine/jobs/smt_engine_build_job.py index 452810f8..35977c60 100644 --- a/machine/jobs/smt_engine_build_job.py +++ b/machine/jobs/smt_engine_build_job.py @@ -107,7 +107,7 @@ def _translate_batch( batch: Sequence[PretranslationInfo], writer: DictToJsonWriter, ) -> None: - source_segments = [pi["translation"] for pi in batch] + source_segments = [pi["pretranslation"] for pi in batch] for i, result in enumerate(engine.translate_batch(source_segments)): - batch[i]["translation"] = result.translation + batch[i]["pretranslation"] = result.translation writer.write(batch[i]) diff --git a/machine/jobs/translation_engine_build_job.py b/machine/jobs/translation_engine_build_job.py index 7effa62f..ec94eff3 100644 --- a/machine/jobs/translation_engine_build_job.py +++ b/machine/jobs/translation_engine_build_job.py @@ -1,12 +1,16 @@ import logging from abc import ABC, abstractmethod +from contextlib import ExitStack +from pathlib import Path +from tempfile import TemporaryDirectory from typing import Any, Callable, Optional, Tuple from ..corpora.parallel_text_corpus import ParallelTextCorpus from ..corpora.text_corpus import TextCorpus from ..utils.phased_progress_reporter import PhasedProgressReporter from ..utils.progress_status import ProgressStatus -from .translation_file_service import TranslationFileService +from .eflomal_aligner import EflomalAligner, tokenize +from .translation_file_service import PretranslationInfo, TranslationFileService logger = logging.getLogger(__name__) @@ -44,6 +48,10 @@ def run( logger.info("Pretranslating segments") self._batch_inference(progress_reporter, check_canceled) + if "align_pretranslations" in self._config and self._config.align_pretranslations: + logger.info("Aligning source to pretranslations") + self._align(progress_reporter, check_canceled) + self._save_model() return train_corpus_size, confidence @@ -74,5 +82,59 @@ def _batch_inference( check_canceled: Optional[Callable[[], None]], ) -> None: ... + def _align( + self, + progress_reporter: PhasedProgressReporter, + check_canceled: Optional[Callable[[], None]], + ) -> None: + if check_canceled is not None: + check_canceled() + + logger.info("Aligning source to pretranslations") + with ExitStack() as stack: + phase_progress = stack.enter_context(progress_reporter.start_next_phase()) + + src_tokenized = [ + tokenize(s["pretranslation"]) + for s in stack.enter_context(self._translation_file_service.get_source_pretranslations()) + ] + trg_tokenized = [ + tokenize(s["pretranslation"]) + for s in stack.enter_context(self._translation_file_service.get_target_pretranslations()) + ] + + with TemporaryDirectory() as td: + aligner = EflomalAligner(Path(td)) + logger.info("Training aligner") + aligner.train(src_tokenized, trg_tokenized) + + if check_canceled is not None: + check_canceled() + + logger.info("Aligning pretranslations") + alignments = aligner.align() + + if check_canceled is not None: + check_canceled() + + writer = stack.enter_context(self._translation_file_service.open_target_pretranslation_writer()) + for trg_pi, src_toks, trg_toks, alignment in zip( + stack.enter_context(self._translation_file_service.get_target_pretranslations()), + src_tokenized, + trg_tokenized, + alignments, + ): + writer.write( + PretranslationInfo( + corpusId=trg_pi["corpusId"], + textId=trg_pi["textId"], + refs=trg_pi["refs"], + pretranslation=trg_pi["pretranslation"], + source_toks=list(src_toks), + pretranslation_toks=list(trg_toks), + alignment=alignment, + ) + ) + @abstractmethod def _save_model(self) -> None: ... diff --git a/machine/jobs/translation_file_service.py b/machine/jobs/translation_file_service.py index 16e9f2e7..a6942ab4 100644 --- a/machine/jobs/translation_file_service.py +++ b/machine/jobs/translation_file_service.py @@ -15,7 +15,10 @@ class PretranslationInfo(TypedDict): corpusId: str # noqa: N815 textId: str # noqa: N815 refs: List[str] - translation: str + pretranslation: str + source_toks: List[str] + pretranslation_toks: List[str] + alignment: str SOURCE_FILENAME = "train.src.txt" @@ -49,23 +52,30 @@ def exists_source_corpus(self) -> bool: def exists_target_corpus(self) -> bool: return self.shared_file_service._exists_file(f"{self.shared_file_service.build_path}/{TARGET_FILENAME}") - def get_source_pretranslations(self) -> ContextManagedGenerator[PretranslationInfo, None, None]: - src_pretranslate_path = self.shared_file_service.download_file( - f"{self.shared_file_service.build_path}/{SOURCE_PRETRANSLATION_FILENAME}" - ) + def _get_pretranslations(self, filename: str) -> ContextManagedGenerator[PretranslationInfo, None, None]: + pretranslate_path = self.shared_file_service.download_file(f"{self.shared_file_service.build_path}/{filename}") def generator() -> Generator[PretranslationInfo, None, None]: - with src_pretranslate_path.open("r", encoding="utf-8-sig") as file: + with pretranslate_path.open("r", encoding="utf-8-sig") as file: for pi in json_stream.load(file): yield PretranslationInfo( corpusId=pi["corpusId"], textId=pi["textId"], refs=list(pi["refs"]), - translation=pi["translation"], + pretranslation=pi["pretranslation"], + source_toks=list(pi["source_toks"]), + pretranslation_toks=list(pi["pretranslation_toks"]), + alignment=pi["alignment"], ) return ContextManagedGenerator(generator()) + def get_source_pretranslations(self) -> ContextManagedGenerator[PretranslationInfo, None, None]: + return self._get_pretranslations(SOURCE_PRETRANSLATION_FILENAME) + + def get_target_pretranslations(self) -> ContextManagedGenerator[PretranslationInfo, None, None]: + return self._get_pretranslations(TARGET_PRETRANSLATION_FILENAME) + def save_model(self, model_path: Path, destination: str) -> None: self.shared_file_service.upload_path(model_path, destination) diff --git a/poetry.lock b/poetry.lock index bd6b9039..5850ec36 100644 --- a/poetry.lock +++ b/poetry.lock @@ -733,6 +733,79 @@ tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.1 [package.extras] toml = ["tomli"] +[[package]] +name = "cython" +version = "3.0.12" +description = "The Cython compiler for writing C extensions in the Python language." +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7" +files = [ + {file = "Cython-3.0.12-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ba67eee9413b66dd9fbacd33f0bc2e028a2a120991d77b5fd4b19d0b1e4039b9"}, + {file = "Cython-3.0.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bee2717e5b5f7d966d0c6e27d2efe3698c357aa4d61bb3201997c7a4f9fe485a"}, + {file = "Cython-3.0.12-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7cffc3464f641c8d0dda942c7c53015291beea11ec4d32421bed2f13b386b819"}, + {file = "Cython-3.0.12-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d3a8f81980ffbd74e52f9186d8f1654e347d0c44bfea6b5997028977f481a179"}, + {file = "Cython-3.0.12-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:8d32856716c369d01f2385ad9177cdd1a11079ac89ea0932dc4882de1aa19174"}, + {file = "Cython-3.0.12-cp310-cp310-win32.whl", hash = "sha256:712c3f31adec140dc60d064a7f84741f50e2c25a8edd7ae746d5eb4d3ef7072a"}, + {file = "Cython-3.0.12-cp310-cp310-win_amd64.whl", hash = "sha256:d6945694c5b9170cfbd5f2c0d00ef7487a2de7aba83713a64ee4ebce7fad9e05"}, + {file = "Cython-3.0.12-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:feb86122a823937cc06e4c029d80ff69f082ebb0b959ab52a5af6cdd271c5dc3"}, + {file = "Cython-3.0.12-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dfdbea486e702c328338314adb8e80f5f9741f06a0ae83aaec7463bc166d12e8"}, + {file = "Cython-3.0.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:563de1728c8e48869d2380a1b76bbc1b1b1d01aba948480d68c1d05e52d20c92"}, + {file = "Cython-3.0.12-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:398d4576c1e1f6316282aa0b4a55139254fbed965cba7813e6d9900d3092b128"}, + {file = "Cython-3.0.12-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1e5eadef80143026944ea8f9904715a008f5108d1d644a89f63094cc37351e73"}, + {file = "Cython-3.0.12-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:5a93cbda00a5451175b97dea5a9440a3fcee9e54b4cba7a7dbcba9a764b22aec"}, + {file = "Cython-3.0.12-cp311-cp311-win32.whl", hash = "sha256:3109e1d44425a2639e9a677b66cd7711721a5b606b65867cb2d8ef7a97e2237b"}, + {file = "Cython-3.0.12-cp311-cp311-win_amd64.whl", hash = "sha256:d4b70fc339adba1e2111b074ee6119fe9fd6072c957d8597bce9a0dd1c3c6784"}, + {file = "Cython-3.0.12-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:fe030d4a00afb2844f5f70896b7f2a1a0d7da09bf3aa3d884cbe5f73fff5d310"}, + {file = "Cython-3.0.12-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a7fec4f052b8fe173fe70eae75091389955b9a23d5cec3d576d21c5913b49d47"}, + {file = "Cython-3.0.12-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0faa5e39e5c8cdf6f9c3b1c3f24972826e45911e7f5b99cf99453fca5432f45e"}, + {file = "Cython-3.0.12-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2d53de996ed340e9ab0fc85a88aaa8932f2591a2746e1ab1c06e262bd4ec4be7"}, + {file = "Cython-3.0.12-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ea3a0e19ab77266c738aa110684a753a04da4e709472cadeff487133354d6ab8"}, + {file = "Cython-3.0.12-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c151082884be468f2f405645858a857298ac7f7592729e5b54788b5c572717ba"}, + {file = "Cython-3.0.12-cp312-cp312-win32.whl", hash = "sha256:3083465749911ac3b2ce001b6bf17f404ac9dd35d8b08469d19dc7e717f5877a"}, + {file = "Cython-3.0.12-cp312-cp312-win_amd64.whl", hash = "sha256:c0b91c7ebace030dd558ea28730de8c580680b50768e5af66db2904a3716c3e3"}, + {file = "Cython-3.0.12-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4ee6f1ea1bead8e6cbc4e64571505b5d8dbdb3b58e679d31f3a84160cebf1a1a"}, + {file = "Cython-3.0.12-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:57aefa6d3341109e46ec1a13e3a763aaa2cbeb14e82af2485b318194be1d9170"}, + {file = "Cython-3.0.12-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:879ae9023958d63c0675015369384642d0afb9c9d1f3473df9186c42f7a9d265"}, + {file = "Cython-3.0.12-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:36fcd584dae547de6f095500a380f4a0cce72b7a7e409e9ff03cb9beed6ac7a1"}, + {file = "Cython-3.0.12-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:62b79dcc0de49efe9e84b9d0e2ae0a6fc9b14691a65565da727aa2e2e63c6a28"}, + {file = "Cython-3.0.12-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4aa255781b093a8401109d8f2104bbb2e52de7639d5896aefafddc85c30e0894"}, + {file = "Cython-3.0.12-cp313-cp313-win32.whl", hash = "sha256:77d48f2d4bab9fe1236eb753d18f03e8b2619af5b6f05d51df0532a92dfb38ab"}, + {file = "Cython-3.0.12-cp313-cp313-win_amd64.whl", hash = "sha256:86c304b20bd57c727c7357e90d5ba1a2b6f1c45492de2373814d7745ef2e63b4"}, + {file = "Cython-3.0.12-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:ff5c0b6a65b08117d0534941d404833d516dac422eee88c6b4fd55feb409a5ed"}, + {file = "Cython-3.0.12-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:680f1d6ed4436ae94805db264d6155ed076d2835d84f20dcb31a7a3ad7f8668c"}, + {file = "Cython-3.0.12-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ebc24609613fa06d0d896309f7164ba168f7e8d71c1e490ed2a08d23351c3f41"}, + {file = "Cython-3.0.12-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c1879c073e2b34924ce9b7ca64c212705dcc416af4337c45f371242b2e5f6d32"}, + {file = "Cython-3.0.12-cp36-cp36m-musllinux_1_2_aarch64.whl", hash = "sha256:bfb75123dd4ff767baa37d7036da0de2dfb6781ff256eef69b11b88b9a0691d1"}, + {file = "Cython-3.0.12-cp36-cp36m-musllinux_1_2_x86_64.whl", hash = "sha256:f39640f8df0400cde6882e23c734f15bb8196de0a008ae5dc6c8d1ec5957d7c8"}, + {file = "Cython-3.0.12-cp36-cp36m-win32.whl", hash = "sha256:8c9efe9a0895abee3cadfdad4130b30f7b5e57f6e6a51ef2a44f9fc66a913880"}, + {file = "Cython-3.0.12-cp36-cp36m-win_amd64.whl", hash = "sha256:63d840f2975e44d74512f8f34f1f7cb8121c9428e26a3f6116ff273deb5e60a2"}, + {file = "Cython-3.0.12-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:75c5acd40b97cff16fadcf6901a91586cbca5dcdba81f738efaf1f4c6bc8dccb"}, + {file = "Cython-3.0.12-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e62564457851db1c40399bd95a5346b9bb99e17a819bf583b362f418d8f3457a"}, + {file = "Cython-3.0.12-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ccd1228cc203b1f1b8a3d403f5a20ad1c40e5879b3fbf5851ce09d948982f2c"}, + {file = "Cython-3.0.12-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:25529ee948f44d9a165ff960c49d4903267c20b5edf2df79b45924802e4cca6e"}, + {file = "Cython-3.0.12-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:90cf599372c5a22120609f7d3a963f17814799335d56dd0dcf8fe615980a8ae1"}, + {file = "Cython-3.0.12-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:9f8c48748a9c94ea5d59c26ab49ad0fad514d36f894985879cf3c3ca0e600bf4"}, + {file = "Cython-3.0.12-cp37-cp37m-win32.whl", hash = "sha256:3e4fa855d98bc7bd6a2049e0c7dc0dcf595e2e7f571a26e808f3efd84d2db374"}, + {file = "Cython-3.0.12-cp37-cp37m-win_amd64.whl", hash = "sha256:120681093772bf3600caddb296a65b352a0d3556e962b9b147efcfb8e8c9801b"}, + {file = "Cython-3.0.12-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:731d719423e041242c9303c80cae4327467299b90ffe62d4cc407e11e9ea3160"}, + {file = "Cython-3.0.12-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c3238a29f37999e27494d120983eca90d14896b2887a0bd858a381204549137a"}, + {file = "Cython-3.0.12-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b588c0a089a9f4dd316d2f9275230bad4a7271e5af04e1dc41d2707c816be44b"}, + {file = "Cython-3.0.12-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8ab9f5198af74eb16502cc143cdde9ca1cbbf66ea2912e67440dd18a36e3b5fa"}, + {file = "Cython-3.0.12-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:8ee841c0e114efa1e849c281ac9b8df8aa189af10b4a103b1c5fd71cbb799679"}, + {file = "Cython-3.0.12-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:43c48b5789398b228ea97499f5b864843ba9b1ab837562a9227c6f58d16ede8b"}, + {file = "Cython-3.0.12-cp38-cp38-win32.whl", hash = "sha256:5e5f17c48a4f41557fbcc7ee660ccfebe4536a34c557f553b6893c1b3c83df2d"}, + {file = "Cython-3.0.12-cp38-cp38-win_amd64.whl", hash = "sha256:309c081057930bb79dc9ea3061a1af5086c679c968206e9c9c2ec90ab7cb471a"}, + {file = "Cython-3.0.12-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:54115fcc126840926ff3b53cfd2152eae17b3522ae7f74888f8a41413bd32f25"}, + {file = "Cython-3.0.12-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:629db614b9c364596d7c975fa3fb3978e8c5349524353dbe11429896a783fc1e"}, + {file = "Cython-3.0.12-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:af081838b0f9e12a83ec4c3809a00a64c817f489f7c512b0e3ecaf5f90a2a816"}, + {file = "Cython-3.0.12-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:34ce459808f7d8d5d4007bc5486fe50532529096b43957af6cbffcb4d9cc5c8d"}, + {file = "Cython-3.0.12-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:d6c6cd6a75c8393e6805d17f7126b96a894f310a1a9ea91c47d141fb9341bfa8"}, + {file = "Cython-3.0.12-cp39-cp39-win32.whl", hash = "sha256:a4032e48d4734d2df68235d21920c715c451ac9de15fa14c71b378e8986b83be"}, + {file = "Cython-3.0.12-cp39-cp39-win_amd64.whl", hash = "sha256:dcdc3e5d4ce0e7a4af6903ed580833015641e968d18d528d8371e2435a34132c"}, + {file = "Cython-3.0.12-py2.py3-none-any.whl", hash = "sha256:0038c9bae46c459669390e53a1ec115f8096b2e4647ae007ff1bf4e6dee92806"}, + {file = "cython-3.0.12.tar.gz", hash = "sha256:b988bb297ce76c671e28c97d017b95411010f7c77fa6623dd0bb47eed1aee1bc"}, +] + [[package]] name = "datasets" version = "2.21.0" @@ -881,6 +954,23 @@ toml = ["toml"] vault = ["hvac"] yaml = ["ruamel.yaml"] +[[package]] +name = "eflomal" +version = "2.0.0" +description = "pip installable eflomal" +optional = false +python-versions = "*" +files = [ + {file = "eflomal-2.0.0.tar.gz", hash = "sha256:b71183dcf85bf4f59f44ef7a59f5268df1c17c0c8d8093f77b220025ffdba100"}, +] + +[package.dependencies] +Cython = "*" +numpy = "*" + +[package.extras] +test = ["pytest"] + [[package]] name = "exceptiongroup" version = "1.2.2" @@ -4787,4 +4877,4 @@ thot = ["sil-thot"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.13" -content-hash = "ff353baa0a9c4519a6bef585b095c141da9c20b6dad4ef47c0af3ea57c92e6ee" +content-hash = "d939e11c2c341294ac8cbc3af9b1ae710b188e5ff4af4d39b614fb0a5726eddb" diff --git a/pyproject.toml b/pyproject.toml index 8f6527d9..be9ac8fb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,6 +63,7 @@ urllib3 = "<2" sentencepiece = "^0.2.0" sil-thot = "^3.4.6" +eflomal = "^2.0.0" transformers = ">=4.38.0,<4.46" datasets = "^2.4.0" diff --git a/tests/jobs/test_nmt_engine_build_job.py b/tests/jobs/test_nmt_engine_build_job.py index a5e416d6..15a38681 100644 --- a/tests/jobs/test_nmt_engine_build_job.py +++ b/tests/jobs/test_nmt_engine_build_job.py @@ -35,7 +35,7 @@ def test_run(decoy: Decoy) -> None: pretranslations = json.loads(env.target_pretranslations) assert len(pretranslations) == 1 - assert pretranslations[0]["translation"] == "Please, I have booked a room." + assert pretranslations[0]["pretranslation"] == "Please, I have booked a room." decoy.verify(env.translation_file_service.save_model(Path("model.tar.gz"), "models/save-model.tar.gz"), times=1) @@ -112,7 +112,28 @@ def __init__(self, decoy: Decoy) -> None: corpusId="corpus1", textId="text1", refs=["ref1"], - translation="Por favor, tengo reservada una habitación.", + pretranslation="Por favor, tengo reservada una habitación.", + source_toks=[], + pretranslation_toks=[], + alignment="", + ) + ] + ) + ) + ) + decoy.when(self.translation_file_service.get_target_pretranslations()).then_do( + lambda: ContextManagedGenerator( + ( + pi + for pi in [ + PretranslationInfo( + corpusId="corpus1", + textId="text1", + refs=["ref1"], + pretranslation="Please, I have booked a room.", + source_toks=[], + pretranslation_toks=[], + alignment="", ) ] ) diff --git a/tests/jobs/test_smt_engine_build_job.py b/tests/jobs/test_smt_engine_build_job.py index 0cf2d948..51def81e 100644 --- a/tests/jobs/test_smt_engine_build_job.py +++ b/tests/jobs/test_smt_engine_build_job.py @@ -31,7 +31,7 @@ def test_run(decoy: Decoy) -> None: pretranslations = json.loads(env.target_pretranslations) assert len(pretranslations) == 1 - assert pretranslations[0]["translation"] == "Please, I have booked a room." + assert pretranslations[0]["pretranslation"] == "Please, I have booked a room." decoy.verify( env.translation_file_service.save_model(matchers.Anything(), f"builds/{env.job._config.build_id}/model.zip"), times=1, @@ -136,7 +136,10 @@ def __init__(self, decoy: Decoy) -> None: corpusId="corpus1", textId="text1", refs=["ref1"], - translation="Por favor, tengo reservada una habitación.", + pretranslation="Por favor, tengo reservada una habitación.", + source_toks=[], + pretranslation_toks=[], + alignment="", ) ] ) From c2615978c18c307d456eddc9acb756525694d663 Mon Sep 17 00:00:00 2001 From: Isaac Schifferer Date: Wed, 9 Apr 2025 21:23:53 -0400 Subject: [PATCH 02/17] make dockerfiles compatible with eflomal --- .devcontainer/dockerfile | 3 ++- dockerfile | 4 +++- dockerfile.cpu_only | 2 ++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/.devcontainer/dockerfile b/.devcontainer/dockerfile index 7b1282fb..3a7837a0 100644 --- a/.devcontainer/dockerfile +++ b/.devcontainer/dockerfile @@ -20,9 +20,10 @@ RUN apt-get update && \ apt-get install --no-install-recommends -y \ python$PYTHON_VERSION \ python$PYTHON_VERSION-distutils \ + python$PYTHON_VERSION-dev \ git vim curl gdb ca-certificates gnupg2 tar make gcc libssl-dev zlib1g-dev libncurses5-dev \ libbz2-dev libreadline-dev libreadline6-dev libxml2-dev xz-utils libgdbm-dev libgdbm-compat-dev tk-dev dirmngr \ - libxmlsec1-dev libsqlite3-dev libffi-dev liblzma-dev lzma lzma-dev uuid-dev python3.9-dev && \ + libxmlsec1-dev libsqlite3-dev libffi-dev liblzma-dev lzma lzma-dev uuid-dev && \ rm -rf /var/lib/apt/lists/* RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python$PYTHON_VERSION diff --git a/dockerfile b/dockerfile index 08f20a61..09f5937b 100755 --- a/dockerfile +++ b/dockerfile @@ -25,7 +25,7 @@ COPY poetry.lock pyproject.toml /src RUN poetry export --with=gpu --without-hashes -f requirements.txt > requirements.txt -FROM nvidia/cuda:$CUDA_VERSION +FROM python:$PYTHON_VERSION ARG PYTHON_VERSION ENV PIP_DISABLE_PIP_VERSION_CHECK=on @@ -64,4 +64,6 @@ RUN --mount=type=cache,target=/root/.cache \ RUN python -m pip install --no-deps . && rm -r /root/* ENV CLEARML_AGENT_SKIP_PYTHON_ENV_INSTALL=1 +ENV EFLOMAL_PATH=/usr/local/lib/python${PYTHON_VERSION}/site-packages/eflomal/bin + CMD ["bash"] diff --git a/dockerfile.cpu_only b/dockerfile.cpu_only index cb41bb7f..aab45898 100755 --- a/dockerfile.cpu_only +++ b/dockerfile.cpu_only @@ -43,4 +43,6 @@ RUN --mount=type=cache,target=/root/.cache \ RUN python -m pip install --no-deps . && rm -r /root/* ENV CLEARML_AGENT_SKIP_PYTHON_ENV_INSTALL=1 +ENV EFLOMAL_PATH=/usr/local/lib/python${PYTHON_VERSION}/site-packages/eflomal/bin + CMD ["bash"] From 21ebf239f1113f50a0591974b5735170e85ee8b3 Mon Sep 17 00:00:00 2001 From: Isaac Schifferer Date: Wed, 9 Apr 2025 21:56:40 -0400 Subject: [PATCH 03/17] fix flake8 error --- machine/jobs/translation_engine_build_job.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/machine/jobs/translation_engine_build_job.py b/machine/jobs/translation_engine_build_job.py index ec94eff3..21e4b45b 100644 --- a/machine/jobs/translation_engine_build_job.py +++ b/machine/jobs/translation_engine_build_job.py @@ -92,7 +92,8 @@ def _align( logger.info("Aligning source to pretranslations") with ExitStack() as stack: - phase_progress = stack.enter_context(progress_reporter.start_next_phase()) + # phase_progress = stack.enter_context(progress_reporter.start_next_phase()) + progress_reporter.start_next_phase() src_tokenized = [ tokenize(s["pretranslation"]) From 0e3a0d5cc0eecb63685ac9043b2f2f7885fa09cd Mon Sep 17 00:00:00 2001 From: Isaac Schifferer Date: Fri, 11 Apr 2025 16:58:09 -0400 Subject: [PATCH 04/17] Only use eflomal on linux and other small tweaks --- .devcontainer/dockerfile | 2 +- machine/jobs/build_nmt_engine.py | 6 ++++- machine/jobs/eflomal_aligner.py | 14 +++++++++--- machine/jobs/translation_engine_build_job.py | 17 +++++--------- poetry.lock | 4 ++-- pyproject.toml | 4 ++-- tests/jobs/test_nmt_engine_build_job.py | 24 +++++++++++++++++++- 7 files changed, 50 insertions(+), 21 deletions(-) diff --git a/.devcontainer/dockerfile b/.devcontainer/dockerfile index 3a7837a0..06547678 100644 --- a/.devcontainer/dockerfile +++ b/.devcontainer/dockerfile @@ -40,6 +40,6 @@ RUN pip install -U pip setuptools \ COPY ./.devcontainer/clearml.conf /root/clearml.conf -ENV EFLOMAL_PATH=/workspaces/machine.py/.venv/lib/python3.9/site-packages/eflomal/bin +ENV EFLOMAL_PATH=/workspaces/machine.py/.venv/lib/python${PYTHON_VERSION}/site-packages/eflomal/bin CMD ["bash"] diff --git a/machine/jobs/build_nmt_engine.py b/machine/jobs/build_nmt_engine.py index 2645598c..cd0e12ff 100644 --- a/machine/jobs/build_nmt_engine.py +++ b/machine/jobs/build_nmt_engine.py @@ -93,7 +93,11 @@ def main() -> None: parser.add_argument("--build-options", default=None, type=str, help="Build configurations") parser.add_argument("--save-model", default=None, type=str, help="Save the model using the specified base name") parser.add_argument( - "--align-pretranslations", default=False, action="store_true", help="Aligns source and target pretranslations" + "--align-pretranslations", + default=False, + action="store_true", + help="Aligns source and target pretranslations using Eflomal (linux only) " + "and returns the alignments as well as the tokenized source and target with the pretranslations.", ) args = parser.parse_args() diff --git a/machine/jobs/eflomal_aligner.py b/machine/jobs/eflomal_aligner.py index 31559847..cccd6ca9 100644 --- a/machine/jobs/eflomal_aligner.py +++ b/machine/jobs/eflomal_aligner.py @@ -4,18 +4,26 @@ import os import subprocess from contextlib import ExitStack +from importlib.util import find_spec from math import sqrt from pathlib import Path from tempfile import TemporaryDirectory from typing import IO, Iterable, List, Sequence, Tuple -from eflomal import read_text, write_text - from ..corpora import AlignedWordPair from ..corpora.token_processors import escape_spaces, lowercase, normalize from ..tokenization import LatinWordTokenizer from ..translation import SymmetrizationHeuristic, WordAlignmentMatrix + +# From silnlp.common.package_utils +def is_eflomal_available() -> bool: + return find_spec("eflomal") is not None + + +if is_eflomal_available(): + from eflomal import read_text, write_text + # may have to make more dynamic, look at silnlp get_wsl_path, is there something equivalent in machine? EFLOMAL_PATH = Path(os.getenv("EFLOMAL_PATH", "."), "eflomal") TOKENIZER = LatinWordTokenizer() @@ -29,7 +37,7 @@ def execute_eflomal( reverse_links_path: Path, n_iterations: Tuple[int, int, int], ) -> None: - if not EFLOMAL_PATH.is_file(): + if not is_eflomal_available(): raise RuntimeError("eflomal is not installed.") args = [ diff --git a/machine/jobs/translation_engine_build_job.py b/machine/jobs/translation_engine_build_job.py index 21e4b45b..1c961cec 100644 --- a/machine/jobs/translation_engine_build_job.py +++ b/machine/jobs/translation_engine_build_job.py @@ -9,7 +9,7 @@ from ..corpora.text_corpus import TextCorpus from ..utils.phased_progress_reporter import PhasedProgressReporter from ..utils.progress_status import ProgressStatus -from .eflomal_aligner import EflomalAligner, tokenize +from .eflomal_aligner import EflomalAligner, is_eflomal_available, tokenize from .translation_file_service import PretranslationInfo, TranslationFileService logger = logging.getLogger(__name__) @@ -48,7 +48,7 @@ def run( logger.info("Pretranslating segments") self._batch_inference(progress_reporter, check_canceled) - if "align_pretranslations" in self._config and self._config.align_pretranslations: + if "align_pretranslations" in self._config and self._config.align_pretranslations and is_eflomal_available(): logger.info("Aligning source to pretranslations") self._align(progress_reporter, check_canceled) @@ -99,10 +99,10 @@ def _align( tokenize(s["pretranslation"]) for s in stack.enter_context(self._translation_file_service.get_source_pretranslations()) ] - trg_tokenized = [ - tokenize(s["pretranslation"]) - for s in stack.enter_context(self._translation_file_service.get_target_pretranslations()) + trg_info = [ + pt_info for pt_info in stack.enter_context(self._translation_file_service.get_target_pretranslations()) ] + trg_tokenized = [tokenize(pt_info["pretranslation"]) for pt_info in trg_info] with TemporaryDirectory() as td: aligner = EflomalAligner(Path(td)) @@ -119,12 +119,7 @@ def _align( check_canceled() writer = stack.enter_context(self._translation_file_service.open_target_pretranslation_writer()) - for trg_pi, src_toks, trg_toks, alignment in zip( - stack.enter_context(self._translation_file_service.get_target_pretranslations()), - src_tokenized, - trg_tokenized, - alignments, - ): + for trg_pi, src_toks, trg_toks, alignment in zip(trg_info, src_tokenized, trg_tokenized, alignments): writer.write( PretranslationInfo( corpusId=trg_pi["corpusId"], diff --git a/poetry.lock b/poetry.lock index 5850ec36..4d8ded6e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -4870,11 +4870,11 @@ type = ["pytest-mypy"] [extras] huggingface = ["datasets", "sacremoses", "transformers"] -jobs = ["clearml", "dynaconf", "json-stream"] +jobs = ["clearml", "dynaconf", "eflomal", "json-stream"] sentencepiece = ["sentencepiece"] thot = ["sil-thot"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.13" -content-hash = "d939e11c2c341294ac8cbc3af9b1ae710b188e5ff4af4d39b614fb0a5726eddb" +content-hash = "b650f3e8499b348a527c5e5f0e89ba90e55fb7df93bb907cc8d8e5fdd6b63cb0" diff --git a/pyproject.toml b/pyproject.toml index be9ac8fb..822c5ee5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,7 +63,6 @@ urllib3 = "<2" sentencepiece = "^0.2.0" sil-thot = "^3.4.6" -eflomal = "^2.0.0" transformers = ">=4.38.0,<4.46" datasets = "^2.4.0" @@ -74,6 +73,7 @@ botocore = "^1.35.41" boto3 = "^1.19.41" dynaconf = "^3.2.5" json-stream = "^1.3.0" +eflomal = { markers = "sys_platform == 'linux'", version = "^2.0.0" } [tool.poetry.group.dev.dependencies] pytest = "^8.3.2" @@ -96,7 +96,7 @@ accelerate = { version = "^0.26.1", markers = "sys_platform == 'win32' or sys_pl sentencepiece = ["sentencepiece"] thot = ["sil-thot"] huggingface = ["transformers", "datasets", "sacremoses"] -jobs = ["clearml", "json-stream", "dynaconf"] +jobs = ["clearml", "json-stream", "dynaconf", "eflomal"] [build-system] requires = ["poetry-core>=1.0.0"] diff --git a/tests/jobs/test_nmt_engine_build_job.py b/tests/jobs/test_nmt_engine_build_job.py index 15a38681..9dee57d0 100644 --- a/tests/jobs/test_nmt_engine_build_job.py +++ b/tests/jobs/test_nmt_engine_build_job.py @@ -17,6 +17,7 @@ PretranslationInfo, TranslationFileService, ) +from machine.jobs.eflomal_aligner import is_eflomal_available from machine.translation import ( Phrase, Trainer, @@ -36,6 +37,19 @@ def test_run(decoy: Decoy) -> None: pretranslations = json.loads(env.target_pretranslations) assert len(pretranslations) == 1 assert pretranslations[0]["pretranslation"] == "Please, I have booked a room." + if is_eflomal_available(): + assert pretranslations[0]["source_toks"] == [ + "por", + "favor", + ",", + "tengo", + "reservada", + "una", + "habitación", + ".", + ] + assert pretranslations[0]["pretranslation_toks"] == ["please", ",", "i", "have", "booked", "a", "room", "."] + assert len(pretranslations[0]["alignment"]) > 0 decoy.verify(env.translation_file_service.save_model(Path("model.tar.gz"), "models/save-model.tar.gz"), times=1) @@ -155,7 +169,15 @@ def open_target_pretranslation_writer(env: _TestEnvironment) -> Iterator[DictToJ ) self.job = NmtEngineBuildJob( - MockSettings({"src_lang": "es", "trg_lang": "en", "save_model": "save-model", "inference_batch_size": 100}), + MockSettings( + { + "src_lang": "es", + "trg_lang": "en", + "save_model": "save-model", + "inference_batch_size": 100, + "align_pretranslations": is_eflomal_available(), + } + ), self.nmt_model_factory, self.translation_file_service, ) From 05fe8ec01ce8000949ec769b8f4afcb62bfa2a3b Mon Sep 17 00:00:00 2001 From: Isaac Schifferer Date: Fri, 11 Apr 2025 17:06:03 -0400 Subject: [PATCH 05/17] Adjust NMT engine test --- tests/jobs/test_nmt_engine_build_job.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/jobs/test_nmt_engine_build_job.py b/tests/jobs/test_nmt_engine_build_job.py index 9dee57d0..112ca1ca 100644 --- a/tests/jobs/test_nmt_engine_build_job.py +++ b/tests/jobs/test_nmt_engine_build_job.py @@ -50,6 +50,10 @@ def test_run(decoy: Decoy) -> None: ] assert pretranslations[0]["pretranslation_toks"] == ["please", ",", "i", "have", "booked", "a", "room", "."] assert len(pretranslations[0]["alignment"]) > 0 + else: + assert pretranslations[0]["source_toks"] == [] + assert pretranslations[0]["pretranslation_toks"] == [] + assert len(pretranslations[0]["alignment"]) == 0 decoy.verify(env.translation_file_service.save_model(Path("model.tar.gz"), "models/save-model.tar.gz"), times=1) @@ -175,7 +179,7 @@ def open_target_pretranslation_writer(env: _TestEnvironment) -> Iterator[DictToJ "trg_lang": "en", "save_model": "save-model", "inference_batch_size": 100, - "align_pretranslations": is_eflomal_available(), + "align_pretranslations": True, } ), self.nmt_model_factory, From 4fd72fa781ba02d2665e8f8417aa08d3d9bbb6c9 Mon Sep 17 00:00:00 2001 From: Isaac Schifferer Date: Fri, 11 Apr 2025 17:50:00 -0400 Subject: [PATCH 06/17] alternate eflomal check --- machine/jobs/eflomal_aligner.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/machine/jobs/eflomal_aligner.py b/machine/jobs/eflomal_aligner.py index cccd6ca9..6333a3d2 100644 --- a/machine/jobs/eflomal_aligner.py +++ b/machine/jobs/eflomal_aligner.py @@ -4,7 +4,6 @@ import os import subprocess from contextlib import ExitStack -from importlib.util import find_spec from math import sqrt from pathlib import Path from tempfile import TemporaryDirectory @@ -15,19 +14,19 @@ from ..tokenization import LatinWordTokenizer from ..translation import SymmetrizationHeuristic, WordAlignmentMatrix +# may have to make more dynamic, look at silnlp get_wsl_path, is there something equivalent in machine? +EFLOMAL_PATH = Path(os.getenv("EFLOMAL_PATH", "."), "eflomal") +TOKENIZER = LatinWordTokenizer() + # From silnlp.common.package_utils def is_eflomal_available() -> bool: - return find_spec("eflomal") is not None + return EFLOMAL_PATH.is_file() if is_eflomal_available(): from eflomal import read_text, write_text -# may have to make more dynamic, look at silnlp get_wsl_path, is there something equivalent in machine? -EFLOMAL_PATH = Path(os.getenv("EFLOMAL_PATH", "."), "eflomal") -TOKENIZER = LatinWordTokenizer() - # From silnlp.alignment.tools def execute_eflomal( From 73794c47e66a6d75a69c8e11c77456fa1c94c761 Mon Sep 17 00:00:00 2001 From: Isaac Schifferer Date: Mon, 14 Apr 2025 16:13:12 -0400 Subject: [PATCH 07/17] Alternate EFLOMAL_PATH, revert eflomal check change --- dockerfile | 4 +--- dockerfile.cpu_only | 2 +- machine/jobs/eflomal_aligner.py | 10 +++++----- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/dockerfile b/dockerfile index 09f5937b..7a1c8c4b 100755 --- a/dockerfile +++ b/dockerfile @@ -1,9 +1,7 @@ # syntax=docker/dockerfile:1.7-labs - ARG PYTHON_VERSION=3.12 ARG UBUNTU_VERSION=noble ARG POETRY_VERSION=1.6.1 -ARG CUDA_VERSION=12.6.1-base-ubuntu24.04 FROM python:$PYTHON_VERSION-slim AS builder ARG POETRY_VERSION @@ -64,6 +62,6 @@ RUN --mount=type=cache,target=/root/.cache \ RUN python -m pip install --no-deps . && rm -r /root/* ENV CLEARML_AGENT_SKIP_PYTHON_ENV_INSTALL=1 -ENV EFLOMAL_PATH=/usr/local/lib/python${PYTHON_VERSION}/site-packages/eflomal/bin +ENV EFLOMAL_PATH=/home/runner/work/machine.py/machine.py/.venv/lib/python${PYTHON_VERSION}/site-packages/eflomal/bin CMD ["bash"] diff --git a/dockerfile.cpu_only b/dockerfile.cpu_only index aab45898..a5c502e9 100755 --- a/dockerfile.cpu_only +++ b/dockerfile.cpu_only @@ -43,6 +43,6 @@ RUN --mount=type=cache,target=/root/.cache \ RUN python -m pip install --no-deps . && rm -r /root/* ENV CLEARML_AGENT_SKIP_PYTHON_ENV_INSTALL=1 -ENV EFLOMAL_PATH=/usr/local/lib/python${PYTHON_VERSION}/site-packages/eflomal/bin +ENV EFLOMAL_PATH=/home/runner/work/machine.py/machine.py/.venv/lib/python${PYTHON_VERSION}/site-packages/eflomal/bin CMD ["bash"] diff --git a/machine/jobs/eflomal_aligner.py b/machine/jobs/eflomal_aligner.py index 6333a3d2..6eed6952 100644 --- a/machine/jobs/eflomal_aligner.py +++ b/machine/jobs/eflomal_aligner.py @@ -4,6 +4,7 @@ import os import subprocess from contextlib import ExitStack +from importlib.util import find_spec from math import sqrt from pathlib import Path from tempfile import TemporaryDirectory @@ -14,19 +15,18 @@ from ..tokenization import LatinWordTokenizer from ..translation import SymmetrizationHeuristic, WordAlignmentMatrix -# may have to make more dynamic, look at silnlp get_wsl_path, is there something equivalent in machine? -EFLOMAL_PATH = Path(os.getenv("EFLOMAL_PATH", "."), "eflomal") -TOKENIZER = LatinWordTokenizer() - # From silnlp.common.package_utils def is_eflomal_available() -> bool: - return EFLOMAL_PATH.is_file() + return find_spec("eflomal") is not None if is_eflomal_available(): from eflomal import read_text, write_text +EFLOMAL_PATH = Path(os.getenv("EFLOMAL_PATH", "."), "eflomal") +TOKENIZER = LatinWordTokenizer() + # From silnlp.alignment.tools def execute_eflomal( From 2c9db286e1d8a3b6b5fa510277ddd54ecf097bd0 Mon Sep 17 00:00:00 2001 From: Isaac Schifferer Date: Mon, 14 Apr 2025 16:23:28 -0400 Subject: [PATCH 08/17] Make linter ignore conditional eflomal import --- machine/jobs/eflomal_aligner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine/jobs/eflomal_aligner.py b/machine/jobs/eflomal_aligner.py index 6eed6952..551e55c8 100644 --- a/machine/jobs/eflomal_aligner.py +++ b/machine/jobs/eflomal_aligner.py @@ -22,7 +22,7 @@ def is_eflomal_available() -> bool: if is_eflomal_available(): - from eflomal import read_text, write_text + from eflomal import read_text, write_text # type: ignore EFLOMAL_PATH = Path(os.getenv("EFLOMAL_PATH", "."), "eflomal") TOKENIZER = LatinWordTokenizer() From 52540172d9f089b91cba9ca992d9098d3e2dce85 Mon Sep 17 00:00:00 2001 From: Isaac Schifferer Date: Mon, 14 Apr 2025 16:58:00 -0400 Subject: [PATCH 09/17] Attempt to fix EFLOMAL_PATH --- dockerfile | 2 +- dockerfile.cpu_only | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dockerfile b/dockerfile index 7a1c8c4b..b5f512ef 100755 --- a/dockerfile +++ b/dockerfile @@ -62,6 +62,6 @@ RUN --mount=type=cache,target=/root/.cache \ RUN python -m pip install --no-deps . && rm -r /root/* ENV CLEARML_AGENT_SKIP_PYTHON_ENV_INSTALL=1 -ENV EFLOMAL_PATH=/home/runner/work/machine.py/machine.py/.venv/lib/python${PYTHON_VERSION}/site-packages/eflomal/bin +ENV EFLOMAL_PATH="/home/runner/work/machine.py/machine.py/.venv/lib/python${PYTHON_VERSION}/site-packages/eflomal/bin" CMD ["bash"] diff --git a/dockerfile.cpu_only b/dockerfile.cpu_only index a5c502e9..764ab6f1 100755 --- a/dockerfile.cpu_only +++ b/dockerfile.cpu_only @@ -43,6 +43,6 @@ RUN --mount=type=cache,target=/root/.cache \ RUN python -m pip install --no-deps . && rm -r /root/* ENV CLEARML_AGENT_SKIP_PYTHON_ENV_INSTALL=1 -ENV EFLOMAL_PATH=/home/runner/work/machine.py/machine.py/.venv/lib/python${PYTHON_VERSION}/site-packages/eflomal/bin +ENV EFLOMAL_PATH="/home/runner/work/machine.py/machine.py/.venv/lib/python${PYTHON_VERSION}/site-packages/eflomal/bin" CMD ["bash"] From f45b59f4ae631a03b15eca20dcf78377a6e6cede Mon Sep 17 00:00:00 2001 From: Isaac Schifferer Date: Mon, 14 Apr 2025 17:04:41 -0400 Subject: [PATCH 10/17] Eflomal sanity check --- machine/jobs/eflomal_aligner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine/jobs/eflomal_aligner.py b/machine/jobs/eflomal_aligner.py index 551e55c8..c088cf22 100644 --- a/machine/jobs/eflomal_aligner.py +++ b/machine/jobs/eflomal_aligner.py @@ -24,7 +24,7 @@ def is_eflomal_available() -> bool: if is_eflomal_available(): from eflomal import read_text, write_text # type: ignore -EFLOMAL_PATH = Path(os.getenv("EFLOMAL_PATH", "."), "eflomal") +EFLOMAL_PATH = Path("/home/runner/work/machine.py/machine.py/.venv/lib/python3.12/site-packages/eflomal/bin", "eflomal") TOKENIZER = LatinWordTokenizer() From dc6bb667b56447f4d446cf139e76b0f3f3a82542 Mon Sep 17 00:00:00 2001 From: Isaac Schifferer Date: Mon, 14 Apr 2025 17:08:06 -0400 Subject: [PATCH 11/17] Eflomal sanity check take 2 --- machine/jobs/eflomal_aligner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine/jobs/eflomal_aligner.py b/machine/jobs/eflomal_aligner.py index c088cf22..054b0e4a 100644 --- a/machine/jobs/eflomal_aligner.py +++ b/machine/jobs/eflomal_aligner.py @@ -1,7 +1,7 @@ # NOTE: this is a temporary solution to be able to use the eflomal aligner inside of machine.py. # The vast majority of this code is taken from the silnlp repository. -import os +# import os import subprocess from contextlib import ExitStack from importlib.util import find_spec From c5b7916a68b42a8af8facd941e9c3c36451fa35e Mon Sep 17 00:00:00 2001 From: Isaac Schifferer Date: Mon, 14 Apr 2025 17:31:33 -0400 Subject: [PATCH 12/17] Add EFLOMAL_PATH value to pytest step of ci workflow --- .github/workflows/ci.yml | 2 ++ machine/jobs/eflomal_aligner.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d38f9aac..380c8e10 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -59,6 +59,8 @@ jobs: poetry run pyright - name: Test with pytest run: poetry run pytest --cov --cov-report=xml + env: + EFLOMAL_PATH: /home/runner/work/machine.py/machine.py/.venv/lib/python${{ matrix.python-version }}/site-packages/eflomal/bin - name: Upload coverage reports to Codecov uses: codecov/codecov-action@v4 env: diff --git a/machine/jobs/eflomal_aligner.py b/machine/jobs/eflomal_aligner.py index 054b0e4a..551e55c8 100644 --- a/machine/jobs/eflomal_aligner.py +++ b/machine/jobs/eflomal_aligner.py @@ -1,7 +1,7 @@ # NOTE: this is a temporary solution to be able to use the eflomal aligner inside of machine.py. # The vast majority of this code is taken from the silnlp repository. -# import os +import os import subprocess from contextlib import ExitStack from importlib.util import find_spec @@ -24,7 +24,7 @@ def is_eflomal_available() -> bool: if is_eflomal_available(): from eflomal import read_text, write_text # type: ignore -EFLOMAL_PATH = Path("/home/runner/work/machine.py/machine.py/.venv/lib/python3.12/site-packages/eflomal/bin", "eflomal") +EFLOMAL_PATH = Path(os.getenv("EFLOMAL_PATH", "."), "eflomal") TOKENIZER = LatinWordTokenizer() From b4b413847fd0c0e4733e7bfae79d67ce3ccc7a77 Mon Sep 17 00:00:00 2001 From: Isaac Schifferer Date: Mon, 14 Apr 2025 17:50:59 -0400 Subject: [PATCH 13/17] Revert EFLOMAL_PATH values to regular docker container paths --- dockerfile | 2 +- dockerfile.cpu_only | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dockerfile b/dockerfile index b5f512ef..77e49bc0 100755 --- a/dockerfile +++ b/dockerfile @@ -62,6 +62,6 @@ RUN --mount=type=cache,target=/root/.cache \ RUN python -m pip install --no-deps . && rm -r /root/* ENV CLEARML_AGENT_SKIP_PYTHON_ENV_INSTALL=1 -ENV EFLOMAL_PATH="/home/runner/work/machine.py/machine.py/.venv/lib/python${PYTHON_VERSION}/site-packages/eflomal/bin" +ENV EFLOMAL_PATH=/usr/local/lib/python${PYTHON_VERSION}/site-packages/eflomal/bin CMD ["bash"] diff --git a/dockerfile.cpu_only b/dockerfile.cpu_only index 764ab6f1..aab45898 100755 --- a/dockerfile.cpu_only +++ b/dockerfile.cpu_only @@ -43,6 +43,6 @@ RUN --mount=type=cache,target=/root/.cache \ RUN python -m pip install --no-deps . && rm -r /root/* ENV CLEARML_AGENT_SKIP_PYTHON_ENV_INSTALL=1 -ENV EFLOMAL_PATH="/home/runner/work/machine.py/machine.py/.venv/lib/python${PYTHON_VERSION}/site-packages/eflomal/bin" +ENV EFLOMAL_PATH=/usr/local/lib/python${PYTHON_VERSION}/site-packages/eflomal/bin CMD ["bash"] From 881a7954571a23f7bbd19da0821fb0dd559e4b09 Mon Sep 17 00:00:00 2001 From: Isaac Schifferer Date: Tue, 15 Apr 2025 00:47:33 -0400 Subject: [PATCH 14/17] Only use normalized tokens inside of aligner --- machine/jobs/eflomal_aligner.py | 11 +++++++++-- tests/jobs/test_nmt_engine_build_job.py | 4 ++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/machine/jobs/eflomal_aligner.py b/machine/jobs/eflomal_aligner.py index 551e55c8..0f5526b5 100644 --- a/machine/jobs/eflomal_aligner.py +++ b/machine/jobs/eflomal_aligner.py @@ -100,7 +100,11 @@ def prepare_files( def tokenize(sent: str) -> Sequence[str]: - return lowercase(normalize("NFC", escape_spaces(list(TOKENIZER.tokenize(sent))))) + return list(TOKENIZER.tokenize(sent)) + + +def normalize_for_alignment(sent: Sequence[str]) -> str: + return " ".join(lowercase(normalize("NFC", escape_spaces(sent)))) # From silnlp.alignment.eflomal @@ -118,7 +122,10 @@ def train(self, src_toks: Sequence[Sequence[str]], trg_toks: Sequence[Sequence[s trg_output_file = stack.enter_context(trg_eflomal_path.open("wb")) # Write input files for the eflomal binary n_sentences = prepare_files( - [" ".join(s) for s in src_toks], src_output_file, [" ".join(s) for s in trg_toks], trg_output_file + [normalize_for_alignment(s) for s in src_toks], + src_output_file, + [normalize_for_alignment(s) for s in trg_toks], + trg_output_file, ) iters = max(2, int(round(1.0 * 5000 / sqrt(n_sentences)))) diff --git a/tests/jobs/test_nmt_engine_build_job.py b/tests/jobs/test_nmt_engine_build_job.py index 112ca1ca..29c50cc8 100644 --- a/tests/jobs/test_nmt_engine_build_job.py +++ b/tests/jobs/test_nmt_engine_build_job.py @@ -39,7 +39,7 @@ def test_run(decoy: Decoy) -> None: assert pretranslations[0]["pretranslation"] == "Please, I have booked a room." if is_eflomal_available(): assert pretranslations[0]["source_toks"] == [ - "por", + "Por", "favor", ",", "tengo", @@ -48,7 +48,7 @@ def test_run(decoy: Decoy) -> None: "habitación", ".", ] - assert pretranslations[0]["pretranslation_toks"] == ["please", ",", "i", "have", "booked", "a", "room", "."] + assert pretranslations[0]["pretranslation_toks"] == ["Please", ",", "I", "have", "booked", "a", "room", "."] assert len(pretranslations[0]["alignment"]) > 0 else: assert pretranslations[0]["source_toks"] == [] From 123263f0e8f7fdaa62961c17e421320327e7f210 Mon Sep 17 00:00:00 2001 From: Isaac Schifferer Date: Wed, 23 Apr 2025 14:00:59 -0400 Subject: [PATCH 15/17] Move alignment config option to build options, revert to 'translation' in PretranslationInfo --- machine/jobs/build_nmt_engine.py | 7 ------- machine/jobs/nmt_engine_build_job.py | 8 ++++---- machine/jobs/settings.yaml | 1 + machine/jobs/smt_engine_build_job.py | 4 ++-- machine/jobs/translation_engine_build_job.py | 10 +++++----- machine/jobs/translation_file_service.py | 8 ++++---- tests/jobs/test_nmt_engine_build_job.py | 14 +++++++------- tests/jobs/test_smt_engine_build_job.py | 15 +++++++++++---- 8 files changed, 34 insertions(+), 33 deletions(-) diff --git a/machine/jobs/build_nmt_engine.py b/machine/jobs/build_nmt_engine.py index cd0e12ff..a047a7ba 100644 --- a/machine/jobs/build_nmt_engine.py +++ b/machine/jobs/build_nmt_engine.py @@ -92,13 +92,6 @@ def main() -> None: parser.add_argument("--clearml", default=False, action="store_true", help="Initializes a ClearML task") parser.add_argument("--build-options", default=None, type=str, help="Build configurations") parser.add_argument("--save-model", default=None, type=str, help="Save the model using the specified base name") - parser.add_argument( - "--align-pretranslations", - default=False, - action="store_true", - help="Aligns source and target pretranslations using Eflomal (linux only) " - "and returns the alignments as well as the tokenized source and target with the pretranslations.", - ) args = parser.parse_args() run({k: v for k, v in vars(args).items() if v is not None}) diff --git a/machine/jobs/nmt_engine_build_job.py b/machine/jobs/nmt_engine_build_job.py index d3674120..5d92aa06 100644 --- a/machine/jobs/nmt_engine_build_job.py +++ b/machine/jobs/nmt_engine_build_job.py @@ -28,7 +28,7 @@ def _get_progress_reporter( self, progress: Optional[Callable[[ProgressStatus], None]], corpus_size: int ) -> PhasedProgressReporter: if corpus_size > 0: - if "align_pretranslations" in self._config and self._config.align_pretranslations: + if self._config.align_pretranslations: phases = [ Phase(message="Training NMT model", percentage=0.8), Phase(message="Pretranslating segments", percentage=0.1), @@ -40,7 +40,7 @@ def _get_progress_reporter( Phase(message="Pretranslating segments", percentage=0.1), ] else: - if "align_pretranslations" in self._config and self._config.align_pretranslations: + if self._config.align_pretranslations: phases = [ Phase(message="Pretranslating segments", percentage=0.9), Phase(message="Aligning segments", percentage=0.1, report_steps=False), @@ -128,7 +128,7 @@ def _translate_batch( batch: Sequence[PretranslationInfo], writer: DictToJsonWriter, ) -> None: - source_segments = [pi["pretranslation"] for pi in batch] + source_segments = [pi["translation"] for pi in batch] for i, result in enumerate(engine.translate_batch(source_segments)): - batch[i]["pretranslation"] = result.translation + batch[i]["translation"] = result.translation writer.write(batch[i]) diff --git a/machine/jobs/settings.yaml b/machine/jobs/settings.yaml index f937826d..cfc727c8 100644 --- a/machine/jobs/settings.yaml +++ b/machine/jobs/settings.yaml @@ -3,6 +3,7 @@ default: shared_file_uri: s3:/silnlp/ shared_file_folder: production inference_batch_size: 1024 + align_pretranslations: false huggingface: parent_model_name: facebook/nllb-200-distilled-1.3B train_params: diff --git a/machine/jobs/smt_engine_build_job.py b/machine/jobs/smt_engine_build_job.py index 35977c60..452810f8 100644 --- a/machine/jobs/smt_engine_build_job.py +++ b/machine/jobs/smt_engine_build_job.py @@ -107,7 +107,7 @@ def _translate_batch( batch: Sequence[PretranslationInfo], writer: DictToJsonWriter, ) -> None: - source_segments = [pi["pretranslation"] for pi in batch] + source_segments = [pi["translation"] for pi in batch] for i, result in enumerate(engine.translate_batch(source_segments)): - batch[i]["pretranslation"] = result.translation + batch[i]["translation"] = result.translation writer.write(batch[i]) diff --git a/machine/jobs/translation_engine_build_job.py b/machine/jobs/translation_engine_build_job.py index 1c961cec..703490bb 100644 --- a/machine/jobs/translation_engine_build_job.py +++ b/machine/jobs/translation_engine_build_job.py @@ -48,7 +48,7 @@ def run( logger.info("Pretranslating segments") self._batch_inference(progress_reporter, check_canceled) - if "align_pretranslations" in self._config and self._config.align_pretranslations and is_eflomal_available(): + if self._config.align_pretranslations and is_eflomal_available(): logger.info("Aligning source to pretranslations") self._align(progress_reporter, check_canceled) @@ -96,13 +96,13 @@ def _align( progress_reporter.start_next_phase() src_tokenized = [ - tokenize(s["pretranslation"]) + tokenize(s["translation"]) for s in stack.enter_context(self._translation_file_service.get_source_pretranslations()) ] trg_info = [ pt_info for pt_info in stack.enter_context(self._translation_file_service.get_target_pretranslations()) ] - trg_tokenized = [tokenize(pt_info["pretranslation"]) for pt_info in trg_info] + trg_tokenized = [tokenize(pt_info["translation"]) for pt_info in trg_info] with TemporaryDirectory() as td: aligner = EflomalAligner(Path(td)) @@ -125,9 +125,9 @@ def _align( corpusId=trg_pi["corpusId"], textId=trg_pi["textId"], refs=trg_pi["refs"], - pretranslation=trg_pi["pretranslation"], + translation=trg_pi["translation"], source_toks=list(src_toks), - pretranslation_toks=list(trg_toks), + translation_toks=list(trg_toks), alignment=alignment, ) ) diff --git a/machine/jobs/translation_file_service.py b/machine/jobs/translation_file_service.py index a6942ab4..a8e8d513 100644 --- a/machine/jobs/translation_file_service.py +++ b/machine/jobs/translation_file_service.py @@ -15,9 +15,9 @@ class PretranslationInfo(TypedDict): corpusId: str # noqa: N815 textId: str # noqa: N815 refs: List[str] - pretranslation: str + translation: str source_toks: List[str] - pretranslation_toks: List[str] + translation_toks: List[str] alignment: str @@ -62,9 +62,9 @@ def generator() -> Generator[PretranslationInfo, None, None]: corpusId=pi["corpusId"], textId=pi["textId"], refs=list(pi["refs"]), - pretranslation=pi["pretranslation"], + translation=pi["translation"], source_toks=list(pi["source_toks"]), - pretranslation_toks=list(pi["pretranslation_toks"]), + translation_toks=list(pi["translation_toks"]), alignment=pi["alignment"], ) diff --git a/tests/jobs/test_nmt_engine_build_job.py b/tests/jobs/test_nmt_engine_build_job.py index 29c50cc8..323d0901 100644 --- a/tests/jobs/test_nmt_engine_build_job.py +++ b/tests/jobs/test_nmt_engine_build_job.py @@ -36,7 +36,7 @@ def test_run(decoy: Decoy) -> None: pretranslations = json.loads(env.target_pretranslations) assert len(pretranslations) == 1 - assert pretranslations[0]["pretranslation"] == "Please, I have booked a room." + assert pretranslations[0]["translation"] == "Please, I have booked a room." if is_eflomal_available(): assert pretranslations[0]["source_toks"] == [ "Por", @@ -48,11 +48,11 @@ def test_run(decoy: Decoy) -> None: "habitación", ".", ] - assert pretranslations[0]["pretranslation_toks"] == ["Please", ",", "I", "have", "booked", "a", "room", "."] + assert pretranslations[0]["translation_toks"] == ["Please", ",", "I", "have", "booked", "a", "room", "."] assert len(pretranslations[0]["alignment"]) > 0 else: assert pretranslations[0]["source_toks"] == [] - assert pretranslations[0]["pretranslation_toks"] == [] + assert pretranslations[0]["translation_toks"] == [] assert len(pretranslations[0]["alignment"]) == 0 decoy.verify(env.translation_file_service.save_model(Path("model.tar.gz"), "models/save-model.tar.gz"), times=1) @@ -130,9 +130,9 @@ def __init__(self, decoy: Decoy) -> None: corpusId="corpus1", textId="text1", refs=["ref1"], - pretranslation="Por favor, tengo reservada una habitación.", + translation="Por favor, tengo reservada una habitación.", source_toks=[], - pretranslation_toks=[], + translation_toks=[], alignment="", ) ] @@ -148,9 +148,9 @@ def __init__(self, decoy: Decoy) -> None: corpusId="corpus1", textId="text1", refs=["ref1"], - pretranslation="Please, I have booked a room.", + translation="Please, I have booked a room.", source_toks=[], - pretranslation_toks=[], + translation_toks=[], alignment="", ) ] diff --git a/tests/jobs/test_smt_engine_build_job.py b/tests/jobs/test_smt_engine_build_job.py index 51def81e..16afcacf 100644 --- a/tests/jobs/test_smt_engine_build_job.py +++ b/tests/jobs/test_smt_engine_build_job.py @@ -31,7 +31,7 @@ def test_run(decoy: Decoy) -> None: pretranslations = json.loads(env.target_pretranslations) assert len(pretranslations) == 1 - assert pretranslations[0]["pretranslation"] == "Please, I have booked a room." + assert pretranslations[0]["translation"] == "Please, I have booked a room." decoy.verify( env.translation_file_service.save_model(matchers.Anything(), f"builds/{env.job._config.build_id}/model.zip"), times=1, @@ -136,9 +136,9 @@ def __init__(self, decoy: Decoy) -> None: corpusId="corpus1", textId="text1", refs=["ref1"], - pretranslation="Por favor, tengo reservada una habitación.", + translation="Por favor, tengo reservada una habitación.", source_toks=[], - pretranslation_toks=[], + translation_toks=[], alignment="", ) ] @@ -161,7 +161,14 @@ def open_target_pretranslation_writer(env: _TestEnvironment) -> Iterator[DictToJ ) self.job = SmtEngineBuildJob( - MockSettings({"build_id": "mybuild", "inference_batch_size": 100, "thot_mt": {"tokenizer": "latin"}}), + MockSettings( + { + "build_id": "mybuild", + "inference_batch_size": 100, + "thot_mt": {"tokenizer": "latin"}, + "align_pretranslations": False, + } + ), self.smt_model_factory, self.translation_file_service, ) From 3774258723ac54e8d07a0a2a65c76ba137792eb2 Mon Sep 17 00:00:00 2001 From: Isaac Schifferer Date: Wed, 23 Apr 2025 18:03:27 -0400 Subject: [PATCH 16/17] Refactor to do alignment during the inference step --- machine/jobs/nmt_engine_build_job.py | 61 ++++++++++++++++++-- machine/jobs/translation_engine_build_job.py | 60 +------------------ machine/jobs/translation_file_service.py | 14 ++--- tests/jobs/test_nmt_engine_build_job.py | 18 ------ 4 files changed, 62 insertions(+), 91 deletions(-) diff --git a/machine/jobs/nmt_engine_build_job.py b/machine/jobs/nmt_engine_build_job.py index 5d92aa06..1244fea8 100644 --- a/machine/jobs/nmt_engine_build_job.py +++ b/machine/jobs/nmt_engine_build_job.py @@ -1,5 +1,7 @@ import logging from contextlib import ExitStack +from pathlib import Path +from tempfile import TemporaryDirectory from typing import Any, Callable, Optional, Sequence, Tuple from ..corpora.corpora_utils import batch @@ -8,6 +10,7 @@ from ..translation.translation_engine import TranslationEngine from ..utils.phased_progress_reporter import Phase, PhasedProgressReporter from ..utils.progress_status import ProgressStatus +from .eflomal_aligner import EflomalAligner, is_eflomal_available, tokenize from .nmt_model_factory import NmtModelFactory from .shared_file_service_base import DictToJsonWriter from .translation_engine_build_job import TranslationEngineBuildJob @@ -102,18 +105,66 @@ def _batch_inference( with ExitStack() as stack: phase_progress = stack.enter_context(progress_reporter.start_next_phase()) engine = stack.enter_context(self._nmt_model_factory.create_engine()) - src_pretranslations = stack.enter_context(self._translation_file_service.get_source_pretranslations()) - writer = stack.enter_context(self._translation_file_service.open_target_pretranslation_writer()) + pretranslations = [ + pt_info for pt_info in stack.enter_context(self._translation_file_service.get_source_pretranslations()) + ] + src_segments = [pt_info["translation"] for pt_info in pretranslations] current_inference_step = 0 phase_progress(ProgressStatus.from_step(current_inference_step, inference_step_count)) batch_size = self._config["inference_batch_size"] - for pi_batch in batch(src_pretranslations, batch_size): + for seg_batch in batch(iter(src_segments), batch_size): if check_canceled is not None: check_canceled() - _translate_batch(engine, pi_batch, writer) - current_inference_step += len(pi_batch) + for i, result in enumerate(engine.translate_batch(seg_batch)): + pretranslations[current_inference_step + i]["translation"] = result.translation + current_inference_step += len(seg_batch) phase_progress(ProgressStatus.from_step(current_inference_step, inference_step_count)) + if self._config.align_pretranslations and is_eflomal_available(): + logger.info("Aligning source to pretranslations") + pretranslations = self._align(src_segments, pretranslations, progress_reporter, check_canceled) + + writer = stack.enter_context(self._translation_file_service.open_target_pretranslation_writer()) + for pretranslation in pretranslations: + writer.write(pretranslation) + + def _align( + self, + src_segments: Sequence[str], + pretranslations: Sequence[PretranslationInfo], + progress_reporter: PhasedProgressReporter, + check_canceled: Optional[Callable[[], None]], + ) -> Sequence[PretranslationInfo]: + if check_canceled is not None: + check_canceled() + + logger.info("Aligning source to pretranslations") + progress_reporter.start_next_phase() + + src_tokenized = [tokenize(s) for s in src_segments] + trg_tokenized = [tokenize(pt_info["translation"]) for pt_info in pretranslations] + + with TemporaryDirectory() as td: + aligner = EflomalAligner(Path(td)) + logger.info("Training aligner") + aligner.train(src_tokenized, trg_tokenized) + + if check_canceled is not None: + check_canceled() + + logger.info("Aligning pretranslations") + alignments = aligner.align() + + if check_canceled is not None: + check_canceled() + + for i in range(len(pretranslations)): + pretranslations[i]["source_toks"] = list(src_tokenized[i]) + pretranslations[i]["translation_toks"] = list(trg_tokenized[i]) + pretranslations[i]["alignment"] = alignments[i] + + return pretranslations + def _save_model(self) -> None: if "save_model" in self._config and self._config.save_model is not None: logger.info("Saving model") diff --git a/machine/jobs/translation_engine_build_job.py b/machine/jobs/translation_engine_build_job.py index 703490bb..7effa62f 100644 --- a/machine/jobs/translation_engine_build_job.py +++ b/machine/jobs/translation_engine_build_job.py @@ -1,16 +1,12 @@ import logging from abc import ABC, abstractmethod -from contextlib import ExitStack -from pathlib import Path -from tempfile import TemporaryDirectory from typing import Any, Callable, Optional, Tuple from ..corpora.parallel_text_corpus import ParallelTextCorpus from ..corpora.text_corpus import TextCorpus from ..utils.phased_progress_reporter import PhasedProgressReporter from ..utils.progress_status import ProgressStatus -from .eflomal_aligner import EflomalAligner, is_eflomal_available, tokenize -from .translation_file_service import PretranslationInfo, TranslationFileService +from .translation_file_service import TranslationFileService logger = logging.getLogger(__name__) @@ -48,10 +44,6 @@ def run( logger.info("Pretranslating segments") self._batch_inference(progress_reporter, check_canceled) - if self._config.align_pretranslations and is_eflomal_available(): - logger.info("Aligning source to pretranslations") - self._align(progress_reporter, check_canceled) - self._save_model() return train_corpus_size, confidence @@ -82,55 +74,5 @@ def _batch_inference( check_canceled: Optional[Callable[[], None]], ) -> None: ... - def _align( - self, - progress_reporter: PhasedProgressReporter, - check_canceled: Optional[Callable[[], None]], - ) -> None: - if check_canceled is not None: - check_canceled() - - logger.info("Aligning source to pretranslations") - with ExitStack() as stack: - # phase_progress = stack.enter_context(progress_reporter.start_next_phase()) - progress_reporter.start_next_phase() - - src_tokenized = [ - tokenize(s["translation"]) - for s in stack.enter_context(self._translation_file_service.get_source_pretranslations()) - ] - trg_info = [ - pt_info for pt_info in stack.enter_context(self._translation_file_service.get_target_pretranslations()) - ] - trg_tokenized = [tokenize(pt_info["translation"]) for pt_info in trg_info] - - with TemporaryDirectory() as td: - aligner = EflomalAligner(Path(td)) - logger.info("Training aligner") - aligner.train(src_tokenized, trg_tokenized) - - if check_canceled is not None: - check_canceled() - - logger.info("Aligning pretranslations") - alignments = aligner.align() - - if check_canceled is not None: - check_canceled() - - writer = stack.enter_context(self._translation_file_service.open_target_pretranslation_writer()) - for trg_pi, src_toks, trg_toks, alignment in zip(trg_info, src_tokenized, trg_tokenized, alignments): - writer.write( - PretranslationInfo( - corpusId=trg_pi["corpusId"], - textId=trg_pi["textId"], - refs=trg_pi["refs"], - translation=trg_pi["translation"], - source_toks=list(src_toks), - translation_toks=list(trg_toks), - alignment=alignment, - ) - ) - @abstractmethod def _save_model(self) -> None: ... diff --git a/machine/jobs/translation_file_service.py b/machine/jobs/translation_file_service.py index a8e8d513..54c4ae90 100644 --- a/machine/jobs/translation_file_service.py +++ b/machine/jobs/translation_file_service.py @@ -52,11 +52,13 @@ def exists_source_corpus(self) -> bool: def exists_target_corpus(self) -> bool: return self.shared_file_service._exists_file(f"{self.shared_file_service.build_path}/{TARGET_FILENAME}") - def _get_pretranslations(self, filename: str) -> ContextManagedGenerator[PretranslationInfo, None, None]: - pretranslate_path = self.shared_file_service.download_file(f"{self.shared_file_service.build_path}/{filename}") + def get_source_pretranslations(self) -> ContextManagedGenerator[PretranslationInfo, None, None]: + src_pretranslate_path = self.shared_file_service.download_file( + f"{self.shared_file_service.build_path}/{SOURCE_PRETRANSLATION_FILENAME}" + ) def generator() -> Generator[PretranslationInfo, None, None]: - with pretranslate_path.open("r", encoding="utf-8-sig") as file: + with src_pretranslate_path.open("r", encoding="utf-8-sig") as file: for pi in json_stream.load(file): yield PretranslationInfo( corpusId=pi["corpusId"], @@ -70,12 +72,6 @@ def generator() -> Generator[PretranslationInfo, None, None]: return ContextManagedGenerator(generator()) - def get_source_pretranslations(self) -> ContextManagedGenerator[PretranslationInfo, None, None]: - return self._get_pretranslations(SOURCE_PRETRANSLATION_FILENAME) - - def get_target_pretranslations(self) -> ContextManagedGenerator[PretranslationInfo, None, None]: - return self._get_pretranslations(TARGET_PRETRANSLATION_FILENAME) - def save_model(self, model_path: Path, destination: str) -> None: self.shared_file_service.upload_path(model_path, destination) diff --git a/tests/jobs/test_nmt_engine_build_job.py b/tests/jobs/test_nmt_engine_build_job.py index 323d0901..227b909b 100644 --- a/tests/jobs/test_nmt_engine_build_job.py +++ b/tests/jobs/test_nmt_engine_build_job.py @@ -139,24 +139,6 @@ def __init__(self, decoy: Decoy) -> None: ) ) ) - decoy.when(self.translation_file_service.get_target_pretranslations()).then_do( - lambda: ContextManagedGenerator( - ( - pi - for pi in [ - PretranslationInfo( - corpusId="corpus1", - textId="text1", - refs=["ref1"], - translation="Please, I have booked a room.", - source_toks=[], - translation_toks=[], - alignment="", - ) - ] - ) - ) - ) self.target_pretranslations = "" From 2556455610fef3ebae9fa03158c6c246542167da Mon Sep 17 00:00:00 2001 From: Isaac Schifferer Date: Thu, 24 Apr 2025 17:53:47 -0400 Subject: [PATCH 17/17] Remove unused function --- machine/jobs/nmt_engine_build_job.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/machine/jobs/nmt_engine_build_job.py b/machine/jobs/nmt_engine_build_job.py index 1244fea8..b7b2afbc 100644 --- a/machine/jobs/nmt_engine_build_job.py +++ b/machine/jobs/nmt_engine_build_job.py @@ -7,12 +7,10 @@ from ..corpora.corpora_utils import batch from ..corpora.parallel_text_corpus import ParallelTextCorpus from ..corpora.text_corpus import TextCorpus -from ..translation.translation_engine import TranslationEngine from ..utils.phased_progress_reporter import Phase, PhasedProgressReporter from ..utils.progress_status import ProgressStatus from .eflomal_aligner import EflomalAligner, is_eflomal_available, tokenize from .nmt_model_factory import NmtModelFactory -from .shared_file_service_base import DictToJsonWriter from .translation_engine_build_job import TranslationEngineBuildJob from .translation_file_service import PretranslationInfo, TranslationFileService @@ -172,14 +170,3 @@ def _save_model(self) -> None: self._translation_file_service.save_model( model_path, f"models/{self._config.save_model + ''.join(model_path.suffixes)}" ) - - -def _translate_batch( - engine: TranslationEngine, - batch: Sequence[PretranslationInfo], - writer: DictToJsonWriter, -) -> None: - source_segments = [pi["translation"] for pi in batch] - for i, result in enumerate(engine.translate_batch(source_segments)): - batch[i]["translation"] = result.translation - writer.write(batch[i])