From f3de9cd34964521d8d9928fe5efb47089233333f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?=
Date: Fri, 31 Jan 2025 14:10:14 +0000
Subject: [PATCH] update nllb recipe and readme

---
Summary: this patch renames recipes/nllb/inference.yaml to inference-hf.yaml
and switches it to the `huggingface_tokenize` transform, adds a new
inference-pyonmttok.yaml, and documents both the conversion and the inference
commands for each tokenizer option in the README.

NOTE(review): inference-pyonmttok.yaml (last hunk) enables
`huggingface_tokenize` and leaves the `sentencepiece` settings commented out,
so its transforms are the same as in inference-hf.yaml, while the README
presents it as the "Sentencepiece with OpenNMT Tokenizer" variant.
Confirm whether this is intended.

NOTE(review): `src_prefix` starts with a space (" eng_Latn") in both configs;
confirm the leading space is deliberate.

 recipes/nllb/README.md                        | 24 ++++++++++++++++++-
 .../{inference.yaml => inference-hf.yaml}     |  8 +++----
 recipes/nllb/inference-pyonmttok.yaml         | 21 ++++++++++++++++
 3 files changed, 47 insertions(+), 6 deletions(-)
 rename recipes/nllb/{inference.yaml => inference-hf.yaml} (52%)
 create mode 100644 recipes/nllb/inference-pyonmttok.yaml

diff --git a/recipes/nllb/README.md b/recipes/nllb/README.md
index 962c42c8..fb5bc09b 100644
--- a/recipes/nllb/README.md
+++ b/recipes/nllb/README.md
@@ -2,12 +2,34 @@
 
 ## Conversion
 
+### 1. Sentencepiece with OpenNMT Tokenizer
+
 ```bash
 eole convert HF --model_dir facebook/nllb-200-1.3B --output ./nllb-1.3b --token $HF_TOKEN --tokenizer onmt
 ```
 
+### 2. HuggingFace Tokenizer
+
+```bash
+eole convert HF --model_dir facebook/nllb-200-1.3B --output ./nllb-1.3b --token $HF_TOKEN
+```
+
+
 ## Inference
 
 ```bash
-eole predict -c inference.yaml
+echo "What is the weather like in Tahiti?" > test.en
+```
+
+
+### 1. Sentencepiece with OpenNMT Tokenizer
+
+```bash
+eole predict -c inference-pyonmttok.yaml
+```
+
+### 2. HuggingFace Tokenizer
+
+```bash
+eole predict -c inference-hf.yaml
 ```
\ No newline at end of file
diff --git a/recipes/nllb/inference.yaml b/recipes/nllb/inference-hf.yaml
similarity index 52%
rename from recipes/nllb/inference.yaml
rename to recipes/nllb/inference-hf.yaml
index 3158ca35..f73b3ad9 100644
--- a/recipes/nllb/inference.yaml
+++ b/recipes/nllb/inference-hf.yaml
@@ -1,13 +1,11 @@
 model_path: "nllb-1.3b"
-transforms: ["sentencepiece", "prefix"]
+transforms: ["prefix", "huggingface_tokenize"]
 transforms_configs:
   prefix:
     src_prefix: " eng_Latn"
     tgt_prefix: "deu_Latn"
-  sentencepiece:
-    src_subword_model: ./flores200_sacrebleu_tokenizer_spm.model
-    tgt_subword_model: ./flores200_sacrebleu_tokenizer_spm.model
-
+  huggingface_tokenize:
+    huggingface_model: facebook/nllb-200-1.3B
 
 tgt_file_prefix: true
 
diff --git a/recipes/nllb/inference-pyonmttok.yaml b/recipes/nllb/inference-pyonmttok.yaml
new file mode 100644
index 00000000..52423fc8
--- /dev/null
+++ b/recipes/nllb/inference-pyonmttok.yaml
@@ -0,0 +1,21 @@
+model_path: "nllb-1.3b"
+# transforms: ["sentencepiece", "prefix"]
+transforms: ["prefix", "huggingface_tokenize"]
+transforms_configs:
+  prefix:
+    src_prefix: " eng_Latn"
+    tgt_prefix: "deu_Latn"
+  # sentencepiece:
+  #   src_subword_model: ./flores200_sacrebleu_tokenizer_spm.model
+  #   tgt_subword_model: ./flores200_sacrebleu_tokenizer_spm.model
+  huggingface_tokenize:
+    huggingface_model: facebook/nllb-200-1.3B
+
+tgt_file_prefix: true
+
+gpu_ranks: [0]
+world_size: 1
+beam_size: 5
+
+src: test.en
+output: test.de
\ No newline at end of file