@@ -3,7 +3,7 @@ use std::error::Error;
3
3
use regex_automata:: {
4
4
dfa:: { dense, Automaton , OverlappingState } ,
5
5
nfa:: thompson,
6
- HalfMatch , Input , MatchError ,
6
+ Anchored , HalfMatch , Input , MatchError ,
7
7
} ;
8
8
9
9
// Tests that quit bytes in the forward direction work correctly.
@@ -67,3 +67,96 @@ fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> {
67
67
assert_eq ! ( Ok ( Some ( expected) ) , dfa. try_search_fwd( & Input :: new( b" a" ) ) ) ;
68
68
Ok ( ( ) )
69
69
}
70
+
71
// Exercises a hand-rolled forward search that begins from the DFA's
// *universal* start state rather than a start state derived from the
// haystack. The search loop mirrors the doctest on
// [`Automaton::is_special_state`]; the same checks are run against both the
// dense DFA and its sparse conversion.
//
// See: https://github.com/rust-lang/regex/pull/1195
#[test]
fn universal_start_search() -> Result<(), Box<dyn Error>> {
    // Runs an unanchored forward search over `haystack`, one byte at a time,
    // and returns the last match observed (if any).
    //
    // Panics if the DFA has no universal unanchored start state.
    fn find<A: Automaton>(
        dfa: &A,
        haystack: &[u8],
    ) -> Result<Option<HalfMatch>, MatchError> {
        let mut sid = dfa
            .universal_start_state(Anchored::No)
            .expect("regex should not require lookbehind");
        let mut found = None;

        for (at, &byte) in haystack.iter().enumerate() {
            sid = dfa.next_state(sid, byte);
            // Ordinary states need no handling. (Start and accel states are
            // special too, but they only matter for performance, so they
            // fall through the checks below untouched.)
            if !dfa.is_special_state(sid) {
                continue;
            }
            if dfa.is_match_state(sid) {
                // Matches are reported one byte late, so `at` is already
                // the end offset of the match.
                found = Some(HalfMatch::new(dfa.match_pattern(sid, 0), at));
                continue;
            }
            if dfa.is_dead_state(sid) {
                // A dead state can never be left; nothing more to find.
                return Ok(found);
            }
            if dfa.is_quit_state(sid) {
                // Quitting after a match was already seen still yields that
                // match; only a quit with nothing found is an error.
                return match found {
                    Some(_) => Ok(found),
                    None => Err(MatchError::quit(byte, at)),
                };
            }
        }

        // Because matches are delayed by one byte, a match ending exactly at
        // the end of the haystack only shows up after the end-of-input
        // transition.
        let eoi = dfa.next_eoi_state(sid);
        if dfa.is_match_state(eoi) {
            found =
                Some(HalfMatch::new(dfa.match_pattern(eoi, 0), haystack.len()));
        }
        Ok(found)
    }

    // Searches `haystack` with `dfa` and asserts that a match exists with
    // the given pattern index and end offset.
    fn check_impl(
        dfa: impl Automaton,
        haystack: &str,
        pat: usize,
        offset: usize,
    ) -> Result<(), Box<dyn Error>> {
        let found = find(&dfa, haystack.as_bytes())?.unwrap();
        assert_eq!(found.pattern().as_usize(), pat);
        assert_eq!(found.offset(), offset);
        Ok(())
    }

    // Runs the same expectation against the dense DFA first and then against
    // its sparse counterpart.
    fn check(
        dfa: &dense::DFA<Vec<u32>>,
        haystack: &str,
        pat: usize,
        offset: usize,
    ) -> Result<(), Box<dyn Error>> {
        check_impl(dfa, haystack, pat, offset)?;
        let sparse = dfa.to_sparse()?;
        check_impl(sparse, haystack, pat, offset)?;
        Ok(())
    }

    const HAYSTACK: &str = "123 foobar 4567";

    let letters = dense::DFA::new(r"[a-z]+")?;
    check(&letters, HAYSTACK, 0, 10)?;

    let four_digits = dense::DFA::new(r"[0-9]{4}")?;
    check(&four_digits, HAYSTACK, 0, 15)?;

    let letters_or_digits = dense::DFA::new_many(&[r"[a-z]+", r"[0-9]+"])?;
    check(&letters_or_digits, HAYSTACK, 1, 3)?;
    check(&letters_or_digits, &HAYSTACK[3..], 0, 7)?;
    check(&letters_or_digits, &HAYSTACK[10..], 1, 5)?;

    Ok(())
}
0 commit comments