From 0b149b0010445a90a4b3709cd6aa26c45788c052 Mon Sep 17 00:00:00 2001 From: Ahmed Ahmed Date: Fri, 17 Jan 2025 14:18:42 -0800 Subject: [PATCH] another fix --- src/levanter/data/text.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/levanter/data/text.py b/src/levanter/data/text.py index 67cd514b7..c1ea88576 100644 --- a/src/levanter/data/text.py +++ b/src/levanter/data/text.py @@ -955,6 +955,9 @@ def preprocess_chat_example(batch, tokenizer: PreTrainedTokenizerBase, should_ap # Tokenize sources to get lengths sources_tokenized = tokenizer(sources, padding=False, truncation=True) + + if should_append_eos: + targets = [t + tokenizer.eos_token for t in targets] # Combine for full examples full_examples = [f"{s}{t}" for s, t in zip(sources, targets)]