From 03d52abaf6f27c21ce8368f3b0844b2cc392bb20 Mon Sep 17 00:00:00 2001
From: RobinPicard
Date: Mon, 4 Aug 2025 22:10:40 +0200
Subject: [PATCH 1/2] Raise error in the index init if regex and vocab are incompatible

---
 src/error.rs |  6 ++++
 src/index.rs | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 84 insertions(+), 2 deletions(-)

diff --git a/src/error.rs b/src/error.rs
index 3967a789..e0d0eb7b 100644
--- a/src/error.rs
+++ b/src/error.rs
@@ -69,6 +69,12 @@ pub enum Error {
     InvalidRefecencePath(Box),
     #[error("Ref recusion limit reached: {0}")]
     RefRecursionLimitReached(usize),
+    #[error("The vocabulary provided is incompatible with the regex '{regex}'. Found no transitions from state {error_state}, missing tokens corresponding to at least one of the following characters: {missing_tokens:?}. This may be due to an encoding issue in your vocabulary.")]
+    IncompatibleVocabulary {
+        regex: String,
+        error_state: u32,
+        missing_tokens: Vec<String>,
+    },
 }
 
 impl Error {
diff --git a/src/index.rs b/src/index.rs
index a57fabb9..04ca41ca 100644
--- a/src/index.rs
+++ b/src/index.rs
@@ -116,8 +116,11 @@ impl Index {
         let mut next_states: Vec<StateID> = vec![start_state];
 
         while let Some(current_state) = next_states.pop() {
+            let mut has_valid_transitions = false;
+
             if dfa.is_match_state(dfa.next_eoi_state(current_state)) {
                 final_states.insert(current_state.as_u32());
+                has_valid_transitions = true;
             }
 
             'token_loop: for (token, ids) in vocabulary.tokens().iter() {
@@ -136,6 +139,7 @@ impl Index {
                 let is_intermediate_state = !dfa.is_match_state(next_state);
                 let is_full_match_state = dfa.is_match_state(dfa.next_eoi_state(next_state));
                 if is_intermediate_state || is_full_match_state {
+                    has_valid_transitions = true;
                     for token_id in ids {
                         transitions
                             .entry(current_state.as_u32())
@@ -148,6 +152,28 @@ impl Index {
                     next_states.push(next_state);
                 }
             }
+
+            // If the current state has no valid transitions and is not a match state,
+            // it means the vocabulary is incompatible with the regex.
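+            // Probe every possible byte from this state to find which characters
+            // would have allowed progress, so the error can report what the
+            // vocabulary is missing.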
+            if !has_valid_transitions && !dfa.is_match_state(current_state) {
+                let mut valid_characters = Vec::new();
+                for byte in 0..=255u8 {
+                    let test_state = dfa.next_state(current_state, byte);
+                    if !dfa.is_dead_state(test_state) && !dfa.is_quit_state(test_state) {
+                        if byte.is_ascii() {
+                            valid_characters.push(char::from(byte).to_string());
+                        } else {
+                            valid_characters.push(format!("\\x{:02x}", byte));
+                        }
+                    }
+                }
+
+                return Err(Error::IncompatibleVocabulary {
+                    regex: regex.to_string(),
+                    error_state: current_state.as_u32(),
+                    missing_tokens: valid_characters,
+                });
+            }
         }
 
         // Populate `transitions` with mappings from `final_states` to `eos_token_id`
@@ -290,7 +316,7 @@ mod tests {
                 .expect("Insert failed");
         }
         for (token, token_id) in [
-            (vec![32, 240, 159, 152], 7),
+            (vec![32, 240, 159, 152, 136], 7),
             (vec![32, 240, 159, 152, 141], 6),
             (vec![240, 159, 152, 141], 4),
         ] {
@@ -309,10 +335,60 @@
             ),
             (
                 80,
-                HashMap::from_iter([(2, 128), (7, 192), (5, 208), (6, 208)]),
+                HashMap::from_iter([(2, 128), (7, 208), (5, 208), (6, 208)]),
             ),
             (128, HashMap::from_iter([(8, 128)])),
         ]);
         assert_eq!(index.transitions(), &expected);
     }
+
+    #[test]
+    fn index_incompatible_vocabulary_error() {
+        let regex = "0 1";
+        let mut vocabulary = Vocabulary::new(3);
+        for (token, token_id) in [("0", 0), ("0 ", 1), ("1", 2)] {
+            vocabulary
+                .try_insert(token, token_id as u32)
+                .expect("Insert failed");
+        }
+
+        let result = Index::new(regex, &vocabulary);
+        assert!(result.is_err());
+
+        if let Err(Error::IncompatibleVocabulary {
+            regex: _,
+            missing_tokens,
+            ..
+        }) = result
+        {
+            assert!(missing_tokens.contains(&" ".to_string()));
+        } else {
+            panic!("Expected IncompatibleVocabulary error");
+        }
+    }
+
+    #[test]
+    fn index_incompatible_vocabulary_error_non_ascii() {
+        let regex = "😈😍";
+        let mut vocabulary = Vocabulary::new(3);
+        for (token, token_id) in [("😈", 0), (" ", 1), ("b", 2)] {
+            vocabulary
+                .try_insert(token, token_id as u32)
+                .expect("Insert failed");
+        }
+
+        let result = Index::new(regex, &vocabulary);
+        assert!(result.is_err());
+
+        if let Err(Error::IncompatibleVocabulary {
+            regex: _,
+            missing_tokens,
+            ..
+        }) = result
+        {
+            assert!(missing_tokens.contains(&"\\xf0".to_string()));
+        } else {
+            panic!("Expected IncompatibleVocabulary error");
+        }
+    }
 }

From b6a51330a5c04d6cea1b477217403db574cbcdec Mon Sep 17 00:00:00 2001
From: RobinPicard
Date: Mon, 4 Aug 2025 22:11:10 +0200
Subject: [PATCH 2/2] Add documentation in the README on Vocabulary creation

---
 README.md | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/README.md b/README.md
index 757f40af..2ea75bc8 100644
--- a/README.md
+++ b/README.md
@@ -57,6 +57,52 @@ let next_state = index.next_state(&initial_state, token_id);
 let final_states = index.final_states();
 ```
 
+### Vocabulary
+
+You can create a `Vocabulary` in three ways:
+
+1. **`Vocabulary::from_pretrained(model, parameters)`** - Loads from a pretrained model (as in the example above)
+
+2. **Manual creation** - You can create a vocabulary from token mappings:
+
+   1. **`Vocabulary::new(eos_token_id)`** - Creates an empty vocabulary, then add tokens with `try_insert()`:
+
+      ```rust
+      let mut vocabulary = Vocabulary::new(50256);
+      vocabulary.try_insert("hello", 0)?;
+      vocabulary.try_insert(vec![32], 1)?;
+      ```
+
+   2. **`Vocabulary::try_from((eos_token_id, tokens))`** - Creates a vocabulary by directly providing the token mappings.
+
+      - It can be done either with the tokens as strings:
+
+        ```rust
+        use rustc_hash::FxHashMap as HashMap;
+
+        let eos_token_id: u32 = 50256;
+        let mut tokens: HashMap<String, Vec<u32>> = HashMap::default();
+        tokens.insert("hello".to_string(), vec![0]);
+        tokens.insert("world".to_string(), vec![1]);
+
+        let vocabulary = Vocabulary::try_from((eos_token_id, tokens))?;
+        ```
+
+      - Or with the tokens as byte vector keys:
+
+        ```rust
+        use rustc_hash::FxHashMap as HashMap;
+
+        let eos_token_id: u32 = 50256;
+        let mut tokens: HashMap<Vec<u8>, Vec<u32>> = HashMap::default();
+        tokens.insert(b"hello".to_vec(), vec![0]);
+        tokens.insert(b"world".to_vec(), vec![1]);
+
+        let vocabulary = Vocabulary::try_from((eos_token_id, tokens))?;
+        ```
+
+**Important**: When creating a `Vocabulary` manually from tokenizer data, ensure tokens are converted to their actual string representations, replacing any tokenizer-specific special characters (such as byte-level markers) that the DFA would not recognize.
+
 ## Python Bindings
 
 Additionally, project provides interfaces to integrate the crate's functionality with Python.
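For downstream users, the practical effect of patch 1 is that `Index::new` now fails fast with `Error::IncompatibleVocabulary` instead of building an index that can dead-end during generation. Below is a minimal sketch of how a caller might handle the new variant; the import paths and the `build_index` helper are assumptions for illustration, not taken from the patch.

```rust
// Assumed import paths; adjust to the crate's actual module layout.
use outlines_core::{error::Error, index::Index, vocabulary::Vocabulary};

fn build_index(regex: &str, vocabulary: &Vocabulary) -> Option<Index> {
    match Index::new(regex, vocabulary) {
        Ok(index) => Some(index),
        // New variant from patch 1: reports the DFA state with no outgoing
        // transitions and the characters no token in the vocabulary can produce.
        Err(Error::IncompatibleVocabulary { regex, error_state, missing_tokens }) => {
            eprintln!(
                "vocabulary cannot realize '{regex}': state {error_state} needs one of {missing_tokens:?}"
            );
            None
        }
        Err(other) => {
            eprintln!("failed to build index: {other}");
            None
        }
    }
}
```

Because the variant carries `regex`, `error_state`, and `missing_tokens` as structured fields, callers can log or rethrow them directly instead of parsing the formatted error message.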