feat: llama3 tokenizer #64

Open · wants to merge 1 commit into base: main
128,000 changes: 128,000 additions & 0 deletions tiktoken-rs/assets/llama3_base.tiktoken

Large diffs are not rendered by default.
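The asset uses the standard tiktoken vocabulary format, which the loader in llama3.rs below parses line by line: a base64-encoded token, a space, and the token's rank. A few illustrative entries (example bytes for "!", "\"", "#" — not necessarily the file's actual first lines):

IQ== 0
Ig== 1
Iw== 2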

3 changes: 2 additions & 1 deletion tiktoken-rs/src/api.rs
@@ -1,7 +1,7 @@
 use anyhow::{anyhow, Result};

 use crate::{
-    cl100k_base,
+    cl100k_base, llama3_base,
     model::get_context_size,
     o200k_base, p50k_base, p50k_edit, r50k_base,
     tokenizer::{get_tokenizer, Tokenizer},
@@ -261,6 +261,7 @@ pub fn get_bpe_from_tokenizer(tokenizer: Tokenizer) -> Result<CoreBPE> {
         Tokenizer::P50kBase => p50k_base(),
         Tokenizer::P50kEdit => p50k_edit(),
         Tokenizer::Gpt2 => r50k_base(),
+        Tokenizer::Llama3Base => llama3_base(),
     }
 }

1 change: 1 addition & 0 deletions tiktoken-rs/src/lib.rs
@@ -8,5 +8,6 @@ pub use api::*;
 pub mod model;
 pub mod tokenizer;
 pub use singleton::*;
+pub use tiktoken_ext::llama3::*;
 pub use tiktoken_ext::openai_public::*;
 pub use vendor_tiktoken::*;
9 changes: 8 additions & 1 deletion tiktoken-rs/src/singleton.rs
@@ -5,7 +5,7 @@ use parking_lot::Mutex;

 use crate::vendor_tiktoken::CoreBPE;

-use crate::{cl100k_base, o200k_base, p50k_base, p50k_edit, r50k_base};
+use crate::{cl100k_base, llama3_base, o200k_base, p50k_base, p50k_edit, r50k_base};

 /// Returns a singleton instance of the r50k_base tokenizer. (also known as `gpt2`)
 /// Use for GPT-3 models like `davinci`
@@ -61,3 +61,10 @@ pub fn o200k_base_singleton() -> Arc<Mutex<CoreBPE>> {
     }
     O200K_BASE.clone()
 }
+
+pub fn llama3_base_singleton() -> Arc<Mutex<CoreBPE>> {
+    lazy_static! {
+        static ref LLAMA3_BASE: Arc<Mutex<CoreBPE>> = Arc::new(Mutex::new(llama3_base().unwrap()));
+    }
+    LLAMA3_BASE.clone()
+}
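A minimal usage sketch for the new singleton (my example, not part of the PR; note that parking_lot's lock() returns the guard directly, without a Result):

use tiktoken_rs::llama3_base_singleton;

fn main() {
    // The first call builds the BPE from the 128k-entry asset; later calls reuse it.
    let bpe = llama3_base_singleton();
    let tokens = bpe.lock().encode_with_special_tokens("<|begin_of_text|>Hello world");
    println!("{} tokens", tokens.len());
}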
68 changes: 68 additions & 0 deletions tiktoken-rs/src/tiktoken_ext/llama3.rs
@@ -0,0 +1,68 @@
+pub const BEGIN_OF_TEXT: &str = "<|begin_of_text|>";
+pub const END_OF_TEXT: &str = "<|end_of_text|>";
+pub const RESERVED_SPECIAL_TOKEN_0: &str = "<|reserved_special_token_0|>";
+pub const RESERVED_SPECIAL_TOKEN_1: &str = "<|reserved_special_token_1|>";
+pub const RESERVED_SPECIAL_TOKEN_2: &str = "<|reserved_special_token_2|>";
+pub const RESERVED_SPECIAL_TOKEN_3: &str = "<|reserved_special_token_3|>";
+pub const START_HEADER_ID: &str = "<|start_header_id|>";
+pub const END_HEADER_ID: &str = "<|end_header_id|>";
+pub const RESERVED_SPECIAL_TOKEN_4: &str = "<|reserved_special_token_4|>";
+pub const EOT_ID: &str = "<|eot_id|>";
+
+/// Adaptation of the tiktoken crate for use in Rust projects
+use anyhow::Result;
+use base64::{engine::general_purpose, Engine as _};
+
+use rustc_hash::FxHashMap as HashMap;
+
+use crate::vendor_tiktoken::CoreBPE;
+
+const NUM_RESERVED_SPECIAL_TOKENS: usize = 256;
+
+pub fn llama3_base() -> Result<CoreBPE> {
+    let llama3_base = include_str!("../../assets/llama3_base.tiktoken");
+
+    let mut encoder: std::collections::HashMap<
+        Vec<u8>,
+        usize,
+        std::hash::BuildHasherDefault<rustc_hash::FxHasher>,
+    > = HashMap::default();
+    for line in llama3_base.lines() {
+        let mut parts = line.split(' ');
+        let raw = parts.next().unwrap();
+        let token = &general_purpose::STANDARD.decode(raw)?;
+        let rank: usize = parts.next().unwrap().parse().unwrap();
+        encoder.insert(token.clone(), rank);
+    }
+
+    let mut special_tokens_list = vec![
+        BEGIN_OF_TEXT.to_string(),
+        END_OF_TEXT.to_string(),
+        RESERVED_SPECIAL_TOKEN_0.to_string(),
+        RESERVED_SPECIAL_TOKEN_1.to_string(),
+        RESERVED_SPECIAL_TOKEN_2.to_string(),
+        RESERVED_SPECIAL_TOKEN_3.to_string(),
+        START_HEADER_ID.to_string(),
+        END_HEADER_ID.to_string(),
+        RESERVED_SPECIAL_TOKEN_4.to_string(),
+        EOT_ID.to_string(),
+    ];
+
+    for i in 5..NUM_RESERVED_SPECIAL_TOKENS - 5 {
+        let token = format!("<|reserved_special_token_{}|>", i);
+        special_tokens_list.push(token);
+    }
+
+    let num_base_tokens = encoder.len();
+    let mut special_tokens = HashMap::default();
+    for (i, token) in special_tokens_list.iter().enumerate() {
+        special_tokens.insert((*token).to_string(), num_base_tokens + i);
+    }
+
+    let bpe = CoreBPE::new(
+        encoder,
+        special_tokens,
+        "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+    )?;
+    Ok(bpe)
+}
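For orientation: the loop above appends the special tokens after the 128,000 mergeable ranks, so each id is num_base_tokens + index, i.e. 128000..=128255 in construction order. The ids exercised by the tests further down line up with that order:

128000  <|begin_of_text|>
128001  <|end_of_text|>
128006  <|start_header_id|>
128007  <|end_header_id|>
128009  <|eot_id|>
128255  <|reserved_special_token_250|>  (index 255: 10 named tokens, then reserved tokens 5 through 250)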
1 change: 1 addition & 0 deletions tiktoken-rs/src/tiktoken_ext/mod.rs
@@ -1 +1,2 @@
+pub mod llama3;
 pub mod openai_public;
1 change: 1 addition & 0 deletions tiktoken-rs/src/tokenizer.rs
@@ -26,6 +26,7 @@ pub enum Tokenizer {
     R50kBase,
     P50kEdit,
     Gpt2,
+    Llama3Base,
 }

 const MODEL_PREFIX_TO_TOKENIZER: &[(&str, Tokenizer)] = &[
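With the new variant wired into get_bpe_from_tokenizer (see the api.rs hunk above), resolving it through the public API is straightforward; a small sketch of my own, assuming the re-exports in this PR:

use tiktoken_rs::{get_bpe_from_tokenizer, tokenizer::Tokenizer};

fn main() {
    // Map the enum variant to a CoreBPE, then tokenize with special tokens enabled.
    let bpe = get_bpe_from_tokenizer(Tokenizer::Llama3Base).unwrap();
    let tokens = bpe.encode_with_special_tokens("<|begin_of_text|>hello");
    println!("{:?}", tokens);
}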
33 changes: 32 additions & 1 deletion tiktoken-rs/tests/tiktoken.rs
@@ -1,7 +1,8 @@
 use rustc_hash::FxHashMap as HashMap;

 use tiktoken_rs::{
-    byte_pair_split, cl100k_base, o200k_base, p50k_base, p50k_base_singleton, r50k_base, CoreBPE,
+    byte_pair_split, cl100k_base, llama3_base, o200k_base, p50k_base, p50k_base_singleton,
+    r50k_base, CoreBPE,
 };

 #[test]
@@ -126,6 +127,35 @@ fn o200k_split_test() {
     );
 }

+#[test]
+fn llama3_base_test() {
+    let bpe = llama3_base().unwrap();
+    test_roundtrip(&bpe, "This is a test with a lot of spaces");
+    test_decode(
+        &bpe,
+        "This is a test with a lot of spaces",
+        vec![2028, 374, 264, 1296, 260, 449, 264, 2763, 315, 12908],
+    );
+    test_decode(
+        &bpe,
+        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>You are an assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>This is a test with a lot of spaces<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n<|reserved_special_token_250|>",
+        vec![128000, 128006, 9125, 128007, 2675, 527, 459, 18328, 13, 128009, 128006, 882, 128007, 2028, 374, 264, 1296, 260, 449, 264, 2763, 315, 12908, 128009, 128006, 78191, 128007, 271, 128255],
+    );
+}
+
+#[test]
+fn llama3_split_test() {
+    let bpe = llama3_base().unwrap();
+    let tokenized: Result<Vec<_>, _> = bpe
+        .split_by_token_iter("This is a test with a lot of spaces", true)
+        .collect();
+    let tokenized = tokenized.unwrap();
+    assert_eq!(
+        tokenized,
+        vec!["This", " is", " a", " test", " ", " with", " a", " lot", " of", " spaces"]
+    );
+}
+
 #[test]
 fn p50k_base_singleton_test() {
     // let now = std::time::Instant::now();
@@ -173,4 +203,5 @@ fn test_unicode_roundtrip() {
     test_roundtrip(&r50k_base().unwrap(), "我想借几本汉语书");
     test_roundtrip(&cl100k_base().unwrap(), "你会说中文吗?");
     test_roundtrip(&o200k_base().unwrap(), "ひらがなカタカナ漢字");
+    test_roundtrip(&llama3_base().unwrap(), "ひらがなカタカナ漢字");
 }