From 0b149b0010445a90a4b3709cd6aa26c45788c052 Mon Sep 17 00:00:00 2001
From: Ahmed Ahmed <ahmedah@stanford.edu>
Date: Fri, 17 Jan 2025 14:18:42 -0800
Subject: [PATCH] Append EOS token to chat targets when should_append_eos is set

---
 src/levanter/data/text.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/levanter/data/text.py b/src/levanter/data/text.py
index 67cd514b7..c1ea88576 100644
--- a/src/levanter/data/text.py
+++ b/src/levanter/data/text.py
@@ -955,6 +955,9 @@ def preprocess_chat_example(batch, tokenizer: PreTrainedTokenizerBase, should_ap
     
     # Tokenize sources to get lengths
     sources_tokenized = tokenizer(sources, padding=False, truncation=True)
+
+    if should_append_eos:
+        targets = [t + tokenizer.eos_token for t in targets]
     
     # Combine for full examples
     full_examples = [f"{s}{t}" for s, t in zip(sources, targets)]