Skip to content

Enhance Test Output Clarity in Tokenizer Tests #1

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions llama/test_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def test_special_tokens(self):
self.assertEqual(
self.tokenizer.special_tokens["<|begin_of_text|>"],
128000,
"Special token <|begin_of_text|> does not match expected value."
)

def test_encode(self):
Expand All @@ -26,6 +27,7 @@ def test_encode(self):
eos=True
),
[128000, 2028, 374, 264, 1296, 11914, 13, 128001],
"Encoded output does not match expected token sequence."
)

def test_decode(self):
Expand All @@ -34,6 +36,7 @@ def test_decode(self):
[128000, 2028, 374, 264, 1296, 11914, 13, 128001],
),
"<|begin_of_text|>This is a test sentence.<|end_of_text|>",
"Decoded output does not match expected string."
)

def test_encode_message(self):
Expand All @@ -50,7 +53,8 @@ def test_encode_message(self):
271, # "\n\n"
2028, 374, 264, 1296, 11914, 13, # This is a test sentence.
128009, # <|eot_id|>
- ]
+ ],
+ "Encoded message does not match expected token sequence."
)

def test_encode_dialog(self):
Expand Down Expand Up @@ -78,11 +82,13 @@ def test_encode_dialog(self):
882, # "user"
128007, # <|end_header_id|>
271, # "\n\n"
- 2028, 374, 264, 2077, 13, # "This is a response.",
+ 2028, 374, 264, 2077, 13, # "This is a response."
128009, # <|eot_id|>
128006, # <|start_header_id|>
78191, # "assistant"
128007, # <|end_header_id|>
271, # "\n\n"
- ]
+ ],
+ "Encoded dialog prompt does not match expected token sequence."
)