Skip to content

Commit

Permalink
patch decoder_start_token seq2seq
Browse files (browse the repository at this point in the history)
  • Loading branch information
francoishernandez committed Jan 31, 2025
1 parent 2ac0cdf commit 3fd59e6
Showing 1 changed file with 5 additions and 1 deletion.
6 changes: 5 additions & 1 deletion eole/inputters/text_utils.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -74,6 +74,11 @@ def numericalize(vocabs, example, model_type=ModelType.ENCODER_DECODER):
numeric["src"]["src_ids"] = vocabs["src"](src_text)
if example.get("tgt", None) is not None:
if maybe_tgt_ids != []:
# TODO: handle this better in HF tokenizer templates
if decoder_start_token != "":
decoder_start_token_id = vocabs["tgt"].tokens_to_ids[decoder_start_token]
if maybe_tgt_ids[0] != decoder_start_token_id:
maybe_tgt_ids = [decoder_start_token_id] + maybe_tgt_ids
numeric["tgt"]["tgt_ids"] = maybe_tgt_ids
else:
tgt_text = example["tgt"]["tgt"].split(" ")
Expand All @@ -89,7 +94,6 @@ def numericalize(vocabs, example, model_type=ModelType.ENCODER_DECODER):
numeric["src"]["src_ids"] = vocabs["src"](src_text)
if example["tgt"] is not None:
if maybe_tgt_ids != []:
# decoder_start_token logic is supposedly handled in the tokenizer
numeric["tgt"]["tgt_ids"] = maybe_tgt_ids
else:
tgt_text = example["tgt"]["tgt"].split(" ")
Expand Down

0 comments on commit 3fd59e6

Please sign in to comment.