Skip to content

Commit bd345d7

Browse files
ishitatsuyuki authored and BurntSushi committed
automata: fix broken universal start states with sparse DFA
The state IDs were not remapped, which will usually result in an index out of range error. Add a test based on is_special_state's doctest, which will validate the start state's behavior with a custom searcher. Closes #1195
1 parent 72618c6 commit bd345d7

File tree

2 files changed

+100
-1
lines changed

2 files changed

+100
-1
lines changed

regex-automata/src/dfa/sparse.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1860,6 +1860,12 @@ impl StartTable<Vec<u8>> {
18601860
let new_start_id = remap[dfa.to_index(old_start_id)];
18611861
sl.set_start(anchored, sty, new_start_id);
18621862
}
1863+
if let Some(ref mut id) = sl.universal_start_anchored {
1864+
*id = remap[dfa.to_index(*id)];
1865+
}
1866+
if let Some(ref mut id) = sl.universal_start_unanchored {
1867+
*id = remap[dfa.to_index(*id)];
1868+
}
18631869
Ok(sl)
18641870
}
18651871
}

regex-automata/tests/dfa/api.rs

Lines changed: 94 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ use std::error::Error;
33
use regex_automata::{
44
dfa::{dense, Automaton, OverlappingState},
55
nfa::thompson,
6-
HalfMatch, Input, MatchError,
6+
Anchored, HalfMatch, Input, MatchError,
77
};
88

99
// Tests that quit bytes in the forward direction work correctly.
@@ -67,3 +67,96 @@ fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> {
6767
assert_eq!(Ok(Some(expected)), dfa.try_search_fwd(&Input::new(b" a")));
6868
Ok(())
6969
}
70+
71+
// A variant of [`Automaton::is_special_state`]'s doctest, but with universal
72+
// start states.
73+
//
74+
// See: https://github.com/rust-lang/regex/pull/1195
75+
#[test]
76+
fn universal_start_search() -> Result<(), Box<dyn Error>> {
77+
fn find<A: Automaton>(
78+
dfa: &A,
79+
haystack: &[u8],
80+
) -> Result<Option<HalfMatch>, MatchError> {
81+
let mut state = dfa
82+
.universal_start_state(Anchored::No)
83+
.expect("regex should not require lookbehind");
84+
let mut last_match = None;
85+
// Walk all the bytes in the haystack. We can quit early if we see
86+
// a dead or a quit state. The former means the automaton will
87+
// never transition to any other state. The latter means that the
88+
// automaton entered a condition in which its search failed.
89+
for (i, &b) in haystack.iter().enumerate() {
90+
state = dfa.next_state(state, b);
91+
if dfa.is_special_state(state) {
92+
if dfa.is_match_state(state) {
93+
last_match =
94+
Some(HalfMatch::new(dfa.match_pattern(state, 0), i));
95+
} else if dfa.is_dead_state(state) {
96+
return Ok(last_match);
97+
} else if dfa.is_quit_state(state) {
98+
// It is possible to enter into a quit state after
99+
// observing a match has occurred. In that case, we
100+
// should return the match instead of an error.
101+
if last_match.is_some() {
102+
return Ok(last_match);
103+
}
104+
return Err(MatchError::quit(b, i));
105+
}
106+
// Implementors may also want to check for start or accel
107+
// states and handle them differently for performance
108+
// reasons. But it is not necessary for correctness.
109+
}
110+
}
111+
// Matches are always delayed by 1 byte, so we must explicitly walk
112+
// the special "EOI" transition at the end of the search.
113+
state = dfa.next_eoi_state(state);
114+
if dfa.is_match_state(state) {
115+
last_match = Some(HalfMatch::new(
116+
dfa.match_pattern(state, 0),
117+
haystack.len(),
118+
));
119+
}
120+
Ok(last_match)
121+
}
122+
123+
fn check_impl(
124+
dfa: impl Automaton,
125+
haystack: &str,
126+
pat: usize,
127+
offset: usize,
128+
) -> Result<(), Box<dyn Error>> {
129+
let haystack = haystack.as_bytes();
130+
let mat = find(&dfa, haystack)?.unwrap();
131+
assert_eq!(mat.pattern().as_usize(), pat);
132+
assert_eq!(mat.offset(), offset);
133+
Ok(())
134+
}
135+
136+
fn check(
137+
dfa: &dense::DFA<Vec<u32>>,
138+
haystack: &str,
139+
pat: usize,
140+
offset: usize,
141+
) -> Result<(), Box<dyn Error>> {
142+
check_impl(dfa, haystack, pat, offset)?;
143+
check_impl(dfa.to_sparse()?, haystack, pat, offset)?;
144+
Ok(())
145+
}
146+
147+
let dfa = dense::DFA::new(r"[a-z]+")?;
148+
let haystack = "123 foobar 4567";
149+
check(&dfa, haystack, 0, 10)?;
150+
151+
let dfa = dense::DFA::new(r"[0-9]{4}")?;
152+
let haystack = "123 foobar 4567";
153+
check(&dfa, haystack, 0, 15)?;
154+
155+
let dfa = dense::DFA::new_many(&[r"[a-z]+", r"[0-9]+"])?;
156+
let haystack = "123 foobar 4567";
157+
check(&dfa, haystack, 1, 3)?;
158+
check(&dfa, &haystack[3..], 0, 7)?;
159+
check(&dfa, &haystack[10..], 1, 5)?;
160+
161+
Ok(())
162+
}

0 commit comments

Comments
 (0)