From 03d52abaf6f27c21ce8368f3b0844b2cc392bb20 Mon Sep 17 00:00:00 2001
From: RobinPicard
Date: Mon, 4 Aug 2025 22:10:40 +0200
Subject: [PATCH 1/2] Raise error in the index init if regex and vocab are incompatible

---
 src/error.rs |  6 ++++
 src/index.rs | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 84 insertions(+), 2 deletions(-)

diff --git a/src/error.rs b/src/error.rs
index 3967a789..e0d0eb7b 100644
--- a/src/error.rs
+++ b/src/error.rs
@@ -69,6 +69,12 @@ pub enum Error {
     InvalidRefecencePath(Box),
     #[error("Ref recusion limit reached: {0}")]
     RefRecursionLimitReached(usize),
+    #[error("The vocabulary provided is incompatible with the regex '{regex}'. Found no transitions from state {error_state}, missing tokens corresponding to at least one of the following characters: {missing_tokens:?}. This may be due to an encoding issue in your vocabulary.")]
+    IncompatibleVocabulary {
+        regex: String,
+        error_state: u32,
+        missing_tokens: Vec<String>,
+    },
 }
 
 impl Error {
diff --git a/src/index.rs b/src/index.rs
index a57fabb9..04ca41ca 100644
--- a/src/index.rs
+++ b/src/index.rs
@@ -116,8 +116,11 @@ impl Index {
         let mut next_states: Vec<StateID> = vec![start_state];
 
         while let Some(current_state) = next_states.pop() {
+            let mut has_valid_transitions = false;
+
             if dfa.is_match_state(dfa.next_eoi_state(current_state)) {
                 final_states.insert(current_state.as_u32());
+                has_valid_transitions = true;
             }
 
             'token_loop: for (token, ids) in vocabulary.tokens().iter() {
@@ -136,6 +139,7 @@ impl Index {
                 let is_intermediate_state = !dfa.is_match_state(next_state);
                 let is_full_match_state = dfa.is_match_state(dfa.next_eoi_state(next_state));
                 if is_intermediate_state || is_full_match_state {
+                    has_valid_transitions = true;
                     for token_id in ids {
                         transitions
                             .entry(current_state.as_u32())
@@ -148,6 +152,28 @@ impl Index {
                     next_states.push(next_state);
                 }
             }
+
+            // If the current state has no valid transitions and is not a match state,
+            // it means the vocabulary is incompatible with the regex.
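+            // Probe every possible byte from this state to find which characters
+            // would have allowed progress, so the error can report what the
+            // vocabulary is missing.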
+            if !has_valid_transitions && !dfa.is_match_state(current_state) {
+                let mut valid_characters = Vec::new();
+                for byte in 0..=255u8 {
+                    let test_state = dfa.next_state(current_state, byte);
+                    if !dfa.is_dead_state(test_state) && !dfa.is_quit_state(test_state) {
+                        if byte.is_ascii() {
+                            valid_characters.push(char::from(byte).to_string());
+                        } else {
+                            valid_characters.push(format!("\\x{:02x}", byte));
+                        }
+                    }
+                }
+
+                return Err(Error::IncompatibleVocabulary {
+                    regex: regex.to_string(),
+                    error_state: current_state.as_u32(),
+                    missing_tokens: valid_characters,
+                });
+            }
         }
 
         // Populate `transitions` with mappings from `final_states` to `eos_token_id`
@@ -290,7 +316,7 @@ mod tests {
                 .expect("Insert failed");
         }
         for (token, token_id) in [
-            (vec![32, 240, 159, 152], 7),
+            (vec![32, 240, 159, 152, 136], 7),
             (vec![32, 240, 159, 152, 141], 6),
             (vec![240, 159, 152, 141], 4),
         ] {
@@ -309,10 +335,60 @@
             ),
             (
                 80,
-                HashMap::from_iter([(2, 128), (7, 192), (5, 208), (6, 208)]),
+                HashMap::from_iter([(2, 128), (7, 208), (5, 208), (6, 208)]),
             ),
             (128, HashMap::from_iter([(8, 128)])),
         ]);
         assert_eq!(index.transitions(), &expected);
     }
+
+    #[test]
+    fn index_incompatible_vocabulary_error() {
+        let regex = "0 1";
+        let mut vocabulary = Vocabulary::new(3);
+        for (token, token_id) in [("0", 0), ("0 ", 1), ("1", 2)] {
+            vocabulary
+                .try_insert(token, token_id as u32)
+                .expect("Insert failed");
+        }
+
+        let result = Index::new(regex, &vocabulary);
+        assert!(result.is_err());
+
+        if let Err(Error::IncompatibleVocabulary {
+            regex: _,
+            missing_tokens,
+            ..
+        }) = result
+        {
+            assert!(missing_tokens.contains(&" ".to_string()));
+        } else {
+            panic!("Expected IncompatibleVocabulary error");
+        }
+    }
+
+    #[test]
+    fn index_incompatible_vocabulary_error_non_ascii() {
+        let regex = "😈😍";
+        let mut vocabulary = Vocabulary::new(3);
+        for (token, token_id) in [("😈", 0), (" ", 1), ("b", 2)] {
+            vocabulary
+                .try_insert(token, token_id as u32)
+                .expect("Insert failed");
+        }
+
+        let result = Index::new(regex, &vocabulary);
+        assert!(result.is_err());
+
+        if let Err(Error::IncompatibleVocabulary {
+            regex: _,
+            missing_tokens,
+            ..
+        }) = result
+        {
+            assert!(missing_tokens.contains(&"\\xf0".to_string()));
+        } else {
+            panic!("Expected IncompatibleVocabulary error");
+        }
+    }
 }

From b6a51330a5c04d6cea1b477217403db574cbcdec Mon Sep 17 00:00:00 2001
From: RobinPicard
Date: Mon, 4 Aug 2025 22:11:10 +0200
Subject: [PATCH 2/2] Add documentation in the README on Vocabulary creation

---
 README.md | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/README.md b/README.md
index 757f40af..2ea75bc8 100644
--- a/README.md
+++ b/README.md
@@ -57,6 +57,52 @@ let next_state = index.next_state(&initial_state, token_id);
 let final_states = index.final_states();
 ```
 
+### Vocabulary
+
+You can create a `Vocabulary` in three ways:
+
+1. **`Vocabulary::from_pretrained(model, parameters)`** - Loads from a pretrained model (as in the example above)
+
+2. **Manual creation** - You can create a vocabulary from token mappings:
+
+   1. **`Vocabulary::new(eos_token_id)`** - Creates an empty vocabulary, then add tokens with `try_insert()`:
+
+      ```rust
+      let mut vocabulary = Vocabulary::new(50256);
+      vocabulary.try_insert("hello", 0)?;
+      vocabulary.try_insert(vec![32], 1)?;
+      ```
+
+   2. **`Vocabulary::try_from((eos_token_id, tokens))`** - Creates a vocabulary by directly providing the token mappings.
+
+      - It can be done either with the tokens as strings:
+
+        ```rust
+        use rustc_hash::FxHashMap as HashMap;
+
+        let eos_token_id: u32 = 50256;
+        let mut tokens: HashMap<String, Vec<u32>> = HashMap::default();
+        tokens.insert("hello".to_string(), vec![0]);
+        tokens.insert("world".to_string(), vec![1]);
+
+        let vocabulary = Vocabulary::try_from((eos_token_id, tokens))?;
+        ```
+
+      - Or with the tokens as byte vector keys:
+
+        ```rust
+        use rustc_hash::FxHashMap as HashMap;
+
+        let eos_token_id: u32 = 50256;
+        let mut tokens: HashMap<Vec<u8>, Vec<u32>> = HashMap::default();
+        tokens.insert(b"hello".to_vec(), vec![0]);
+        tokens.insert(b"world".to_vec(), vec![1]);
+
+        let vocabulary = Vocabulary::try_from((eos_token_id, tokens))?;
+        ```
+
+**Important**: When creating a `Vocabulary` manually from tokenizer data, ensure tokens are converted to their actual string representations, replacing any tokenizer-specific special characters (such as byte-level markers) that the DFA would not recognize.
+
 ## Python Bindings
 
 Additionally, project provides interfaces to integrate the crate's functionality with Python.
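For downstream users, the practical effect of patch 1 is that `Index::new` now fails fast with `Error::IncompatibleVocabulary` instead of building an index that can dead-end during generation. Below is a minimal sketch of how a caller might handle the new variant; the import paths and the `build_index` helper are assumptions for illustration, not taken from the patch.

```rust
// Assumed import paths; adjust to the crate's actual module layout.
use outlines_core::{error::Error, index::Index, vocabulary::Vocabulary};

fn build_index(regex: &str, vocabulary: &Vocabulary) -> Option<Index> {
    match Index::new(regex, vocabulary) {
        Ok(index) => Some(index),
        // New variant from patch 1: reports the DFA state with no outgoing
        // transitions and the characters no token in the vocabulary can produce.
        Err(Error::IncompatibleVocabulary { regex, error_state, missing_tokens }) => {
            eprintln!(
                "vocabulary cannot realize '{regex}': state {error_state} needs one of {missing_tokens:?}"
            );
            None
        }
        Err(other) => {
            eprintln!("failed to build index: {other}");
            None
        }
    }
}
```

Because the variant carries `regex`, `error_state`, and `missing_tokens` as structured fields, callers can log or rethrow them directly instead of parsing the formatted error message.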