@@ -95,65 +95,32 @@ cfg_match! {
95
95
if multibyte_mask == 0 {
96
96
assert!( intra_chunk_offset == 0 ) ;
97
97
98
- // Check if there are any control characters in the chunk. All
99
- // control characters that we can encounter at this point have a
100
- // byte value less than 32 or ...
101
- let control_char_test0 = unsafe { _mm_cmplt_epi8( chunk, _mm_set1_epi8( 32 ) ) } ;
102
- let control_char_mask0 = unsafe { _mm_movemask_epi8( control_char_test0) } ;
103
-
104
- // ... it's the ASCII 'DEL' character with a value of 127.
105
- let control_char_test1 = unsafe { _mm_cmpeq_epi8( chunk, _mm_set1_epi8( 127 ) ) } ;
106
- let control_char_mask1 = unsafe { _mm_movemask_epi8( control_char_test1) } ;
107
-
108
- let control_char_mask = control_char_mask0 | control_char_mask1;
109
-
110
- if control_char_mask != 0 {
111
- // Check for newlines in the chunk
112
- let newlines_test = unsafe { _mm_cmpeq_epi8( chunk, _mm_set1_epi8( b'\n' as i8 ) ) } ;
113
- let newlines_mask = unsafe { _mm_movemask_epi8( newlines_test) } ;
114
-
115
- if control_char_mask == newlines_mask {
116
- // All control characters are newlines, record them
117
- let mut newlines_mask = 0xFFFF0000 | newlines_mask as u32 ;
118
- let output_offset = RelativeBytePos :: from_usize( chunk_index * CHUNK_SIZE + 1 ) ;
119
-
120
- loop {
121
- let index = newlines_mask. trailing_zeros( ) ;
122
-
123
- if index >= CHUNK_SIZE as u32 {
124
- // We have arrived at the end of the chunk.
125
- break ;
126
- }
127
-
128
- lines. push( RelativeBytePos ( index) + output_offset) ;
129
-
130
- // Clear the bit, so we can find the next one.
131
- newlines_mask &= ( !1 ) << index;
132
- }
133
-
134
- // We are done for this chunk. All control characters were
135
- // newlines and we took care of those.
136
- continue ;
137
- } else {
138
- // Some of the control characters are not newlines,
139
- // fall through to the slow path below.
140
- }
141
- } else {
142
- // No control characters, nothing to record for this chunk
143
- continue ;
98
+ // Check for newlines in the chunk
99
+ let newlines_test = unsafe { _mm_cmpeq_epi8( chunk, _mm_set1_epi8( b'\n' as i8 ) ) } ;
100
+ let mut newlines_mask = unsafe { _mm_movemask_epi8( newlines_test) } ;
101
+
102
+ let output_offset = RelativeBytePos :: from_usize( chunk_index * CHUNK_SIZE + 1 ) ;
103
+
104
+ while newlines_mask != 0 {
105
+ let index = newlines_mask. trailing_zeros( ) ;
106
+
107
+ lines. push( RelativeBytePos ( index) + output_offset) ;
108
+
109
+ // Clear the bit, so we can find the next one.
110
+ newlines_mask &= newlines_mask - 1 ;
144
111
}
112
+ } else {
113
+ // The slow path.
114
+ // There are multibyte chars in here, fall back to generic decoding.
115
+ let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
116
+ intra_chunk_offset = analyze_source_file_generic(
117
+ & src[ scan_start..] ,
118
+ CHUNK_SIZE - intra_chunk_offset,
119
+ RelativeBytePos :: from_usize( scan_start) ,
120
+ lines,
121
+ multi_byte_chars,
122
+ ) ;
145
123
}
146
-
147
- // The slow path.
148
- // There are control chars in here, fallback to generic decoding.
149
- let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
150
- intra_chunk_offset = analyze_source_file_generic(
151
- & src[ scan_start..] ,
152
- CHUNK_SIZE - intra_chunk_offset,
153
- RelativeBytePos :: from_usize( scan_start) ,
154
- lines,
155
- multi_byte_chars,
156
- ) ;
157
124
}
158
125
159
126
// There might still be a tail left to analyze
@@ -253,65 +220,32 @@ cfg_match! {
253
220
if multibyte_mask == 0 {
254
221
assert!( intra_chunk_offset == 0 ) ;
255
222
256
- // Check if there are any control characters in the chunk. All
257
- // control characters that we can encounter at this point have a
258
- // byte value less than 32 or ...
259
- let control_char_test0 = unsafe { _mm_cmplt_epi8( chunk, _mm_set1_epi8( 32 ) ) } ;
260
- let control_char_mask0 = unsafe { _mm_movemask_epi8( control_char_test0) } ;
261
-
262
- // ... it's the ASCII 'DEL' character with a value of 127.
263
- let control_char_test1 = unsafe { _mm_cmpeq_epi8( chunk, _mm_set1_epi8( 127 ) ) } ;
264
- let control_char_mask1 = unsafe { _mm_movemask_epi8( control_char_test1) } ;
265
-
266
- let control_char_mask = control_char_mask0 | control_char_mask1;
267
-
268
- if control_char_mask != 0 {
269
- // Check for newlines in the chunk
270
- let newlines_test = unsafe { _mm_cmpeq_epi8( chunk, _mm_set1_epi8( b'\n' as i8 ) ) } ;
271
- let newlines_mask = unsafe { _mm_movemask_epi8( newlines_test) } ;
272
-
273
- if control_char_mask == newlines_mask {
274
- // All control characters are newlines, record them
275
- let mut newlines_mask = 0xFFFF0000 | newlines_mask as u32 ;
276
- let output_offset = RelativeBytePos :: from_usize( chunk_index * CHUNK_SIZE + 1 ) ;
277
-
278
- loop {
279
- let index = newlines_mask. trailing_zeros( ) ;
280
-
281
- if index >= CHUNK_SIZE as u32 {
282
- // We have arrived at the end of the chunk.
283
- break ;
284
- }
285
-
286
- lines. push( RelativeBytePos ( index) + output_offset) ;
287
-
288
- // Clear the bit, so we can find the next one.
289
- newlines_mask &= ( !1 ) << index;
290
- }
291
-
292
- // We are done for this chunk. All control characters were
293
- // newlines and we took care of those.
294
- continue ;
295
- } else {
296
- // Some of the control characters are not newlines,
297
- // fall through to the slow path below.
298
- }
299
- } else {
300
- // No control characters, nothing to record for this chunk
301
- continue ;
223
+ // Check for newlines in the chunk
224
+ let newlines_test = unsafe { _mm_cmpeq_epi8( chunk, _mm_set1_epi8( b'\n' as i8 ) ) } ;
225
+ let mut newlines_mask = unsafe { _mm_movemask_epi8( newlines_test) } ;
226
+
227
+ let output_offset = RelativeBytePos :: from_usize( chunk_index * CHUNK_SIZE + 1 ) ;
228
+
229
+ while newlines_mask != 0 {
230
+ let index = newlines_mask. trailing_zeros( ) ;
231
+
232
+ lines. push( RelativeBytePos ( index) + output_offset) ;
233
+
234
+ // Clear the bit, so we can find the next one.
235
+ newlines_mask &= newlines_mask - 1 ;
302
236
}
237
+ } else {
238
+ // The slow path.
239
+ // There are multibyte chars in here, fall back to generic decoding.
240
+ let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
241
+ intra_chunk_offset = analyze_source_file_generic(
242
+ & src[ scan_start..] ,
243
+ CHUNK_SIZE - intra_chunk_offset,
244
+ RelativeBytePos :: from_usize( scan_start) ,
245
+ lines,
246
+ multi_byte_chars,
247
+ ) ;
303
248
}
304
-
305
- // The slow path.
306
- // There are control chars in here, fallback to generic decoding.
307
- let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
308
- intra_chunk_offset = analyze_source_file_generic(
309
- & src[ scan_start..] ,
310
- CHUNK_SIZE - intra_chunk_offset,
311
- RelativeBytePos :: from_usize( scan_start) ,
312
- lines,
313
- multi_byte_chars,
314
- ) ;
315
249
}
316
250
317
251
// There might still be a tail left to analyze
@@ -369,29 +303,18 @@ fn analyze_source_file_generic(
369
303
// string.
370
304
let mut char_len = 1 ;
371
305
372
- if byte < 32 {
373
- // This is an ASCII control character, it could be one of the cases
374
- // that are interesting to us.
375
-
306
+ if byte == b'\n' {
376
307
let pos = RelativeBytePos :: from_usize ( i) + output_offset;
377
-
378
- if let b'\n' = byte {
379
- lines. push ( pos + RelativeBytePos ( 1 ) ) ;
380
- }
381
- } else if byte >= 127 {
382
- // The slow path:
383
- // This is either ASCII control character "DEL" or the beginning of
384
- // a multibyte char. Just decode to `char`.
308
+ lines. push ( pos + RelativeBytePos ( 1 ) ) ;
309
+ } else if byte >= 128 {
310
+ // This is the beginning of a multibyte char. Just decode to `char`.
385
311
let c = src[ i..] . chars ( ) . next ( ) . unwrap ( ) ;
386
312
char_len = c. len_utf8 ( ) ;
387
313
388
314
let pos = RelativeBytePos :: from_usize ( i) + output_offset;
389
-
390
- if char_len > 1 {
391
- assert ! ( ( 2 ..=4 ) . contains( & char_len) ) ;
392
- let mbc = MultiByteChar { pos, bytes : char_len as u8 } ;
393
- multi_byte_chars. push ( mbc) ;
394
- }
315
+ assert ! ( ( 2 ..=4 ) . contains( & char_len) ) ;
316
+ let mbc = MultiByteChar { pos, bytes : char_len as u8 } ;
317
+ multi_byte_chars. push ( mbc) ;
395
318
}
396
319
397
320
i += char_len;
0 commit comments