@@ -95,59 +95,32 @@ cfg_match! {
95
95
if multibyte_mask == 0 {
96
96
assert!( intra_chunk_offset == 0 ) ;
97
97
98
- // Check if there are any control characters in the chunk. All
99
- // control characters that we can encounter at this point have a
100
- // byte value less than 32 or ...
101
- let control_char_test0 = unsafe { _mm_cmplt_epi8( chunk, _mm_set1_epi8( 32 ) ) } ;
102
- let control_char_mask0 = unsafe { _mm_movemask_epi8( control_char_test0) } ;
103
-
104
- // ... it's the ASCII 'DEL' character with a value of 127.
105
- let control_char_test1 = unsafe { _mm_cmpeq_epi8( chunk, _mm_set1_epi8( 127 ) ) } ;
106
- let control_char_mask1 = unsafe { _mm_movemask_epi8( control_char_test1) } ;
107
-
108
- let control_char_mask = control_char_mask0 | control_char_mask1;
109
-
110
- if control_char_mask != 0 {
111
- // Check for newlines in the chunk
112
- let newlines_test = unsafe { _mm_cmpeq_epi8( chunk, _mm_set1_epi8( b'\n' as i8 ) ) } ;
113
- let mut newlines_mask = unsafe { _mm_movemask_epi8( newlines_test) } ;
114
-
115
- if control_char_mask == newlines_mask {
116
- // All control characters are newlines, record them
117
- let output_offset = RelativeBytePos :: from_usize( chunk_index * CHUNK_SIZE + 1 ) ;
118
-
119
- while newlines_mask != 0 {
120
- let index = newlines_mask. trailing_zeros( ) ;
121
-
122
- lines. push( RelativeBytePos ( index) + output_offset) ;
123
-
124
- // Clear the bit, so we can find the next one.
125
- newlines_mask &= newlines_mask - 1 ;
126
- }
127
-
128
- // We are done for this chunk. All control characters were
129
- // newlines and we took care of those.
130
- continue ;
131
- } else {
132
- // Some of the control characters are not newlines,
133
- // fall through to the slow path below.
134
- }
135
- } else {
136
- // No control characters, nothing to record for this chunk
137
- continue ;
98
+ // Check for newlines in the chunk
99
+ let newlines_test = unsafe { _mm_cmpeq_epi8( chunk, _mm_set1_epi8( b'\n' as i8 ) ) } ;
100
+ let mut newlines_mask = unsafe { _mm_movemask_epi8( newlines_test) } ;
101
+
102
+ let output_offset = RelativeBytePos :: from_usize( chunk_index * CHUNK_SIZE + 1 ) ;
103
+
104
+ while newlines_mask != 0 {
105
+ let index = newlines_mask. trailing_zeros( ) ;
106
+
107
+ lines. push( RelativeBytePos ( index) + output_offset) ;
108
+
109
+ // Clear the bit, so we can find the next one.
110
+ newlines_mask &= newlines_mask - 1 ;
138
111
}
112
+ } else {
113
+ // The slow path.
114
+ // There are multibyte chars in here, fall back to generic decoding.
115
+ let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
116
+ intra_chunk_offset = analyze_source_file_generic(
117
+ & src[ scan_start..] ,
118
+ CHUNK_SIZE - intra_chunk_offset,
119
+ RelativeBytePos :: from_usize( scan_start) ,
120
+ lines,
121
+ multi_byte_chars,
122
+ ) ;
139
123
}
140
-
141
- // The slow path.
142
- // There are control chars in here, fallback to generic decoding.
143
- let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
144
- intra_chunk_offset = analyze_source_file_generic(
145
- & src[ scan_start..] ,
146
- CHUNK_SIZE - intra_chunk_offset,
147
- RelativeBytePos :: from_usize( scan_start) ,
148
- lines,
149
- multi_byte_chars,
150
- ) ;
151
124
}
152
125
153
126
// There might still be a tail left to analyze
@@ -247,59 +220,32 @@ cfg_match! {
247
220
if multibyte_mask == 0 {
248
221
assert!( intra_chunk_offset == 0 ) ;
249
222
250
- // Check if there are any control characters in the chunk. All
251
- // control characters that we can encounter at this point have a
252
- // byte value less than 32 or ...
253
- let control_char_test0 = unsafe { _mm_cmplt_epi8( chunk, _mm_set1_epi8( 32 ) ) } ;
254
- let control_char_mask0 = unsafe { _mm_movemask_epi8( control_char_test0) } ;
255
-
256
- // ... it's the ASCII 'DEL' character with a value of 127.
257
- let control_char_test1 = unsafe { _mm_cmpeq_epi8( chunk, _mm_set1_epi8( 127 ) ) } ;
258
- let control_char_mask1 = unsafe { _mm_movemask_epi8( control_char_test1) } ;
259
-
260
- let control_char_mask = control_char_mask0 | control_char_mask1;
261
-
262
- if control_char_mask != 0 {
263
- // Check for newlines in the chunk
264
- let newlines_test = unsafe { _mm_cmpeq_epi8( chunk, _mm_set1_epi8( b'\n' as i8 ) ) } ;
265
- let mut newlines_mask = unsafe { _mm_movemask_epi8( newlines_test) } ;
266
-
267
- if control_char_mask == newlines_mask {
268
- // All control characters are newlines, record them
269
- let output_offset = RelativeBytePos :: from_usize( chunk_index * CHUNK_SIZE + 1 ) ;
270
-
271
- while newlines_mask != 0 {
272
- let index = newlines_mask. trailing_zeros( ) ;
273
-
274
- lines. push( RelativeBytePos ( index) + output_offset) ;
275
-
276
- // Clear the bit, so we can find the next one.
277
- newlines_mask &= newlines_mask - 1 ;
278
- }
279
-
280
- // We are done for this chunk. All control characters were
281
- // newlines and we took care of those.
282
- continue ;
283
- } else {
284
- // Some of the control characters are not newlines,
285
- // fall through to the slow path below.
286
- }
287
- } else {
288
- // No control characters, nothing to record for this chunk
289
- continue ;
223
+ // Check for newlines in the chunk
224
+ let newlines_test = unsafe { _mm_cmpeq_epi8( chunk, _mm_set1_epi8( b'\n' as i8 ) ) } ;
225
+ let mut newlines_mask = unsafe { _mm_movemask_epi8( newlines_test) } ;
226
+
227
+ let output_offset = RelativeBytePos :: from_usize( chunk_index * CHUNK_SIZE + 1 ) ;
228
+
229
+ while newlines_mask != 0 {
230
+ let index = newlines_mask. trailing_zeros( ) ;
231
+
232
+ lines. push( RelativeBytePos ( index) + output_offset) ;
233
+
234
+ // Clear the bit, so we can find the next one.
235
+ newlines_mask &= newlines_mask - 1 ;
290
236
}
237
+ } else {
238
+ // The slow path.
239
+ // There are multibyte chars in here, fall back to generic decoding.
240
+ let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
241
+ intra_chunk_offset = analyze_source_file_generic(
242
+ & src[ scan_start..] ,
243
+ CHUNK_SIZE - intra_chunk_offset,
244
+ RelativeBytePos :: from_usize( scan_start) ,
245
+ lines,
246
+ multi_byte_chars,
247
+ ) ;
291
248
}
292
-
293
- // The slow path.
294
- // There are control chars in here, fallback to generic decoding.
295
- let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
296
- intra_chunk_offset = analyze_source_file_generic(
297
- & src[ scan_start..] ,
298
- CHUNK_SIZE - intra_chunk_offset,
299
- RelativeBytePos :: from_usize( scan_start) ,
300
- lines,
301
- multi_byte_chars,
302
- ) ;
303
249
}
304
250
305
251
// There might still be a tail left to analyze
@@ -357,29 +303,18 @@ fn analyze_source_file_generic(
357
303
// string.
358
304
let mut char_len = 1 ;
359
305
360
- if byte < 32 {
361
- // This is an ASCII control character, it could be one of the cases
362
- // that are interesting to us.
363
-
306
+ if byte == b'\n' {
364
307
let pos = RelativeBytePos :: from_usize ( i) + output_offset;
365
-
366
- if let b'\n' = byte {
367
- lines. push ( pos + RelativeBytePos ( 1 ) ) ;
368
- }
369
- } else if byte >= 127 {
370
- // The slow path:
371
- // This is either ASCII control character "DEL" or the beginning of
372
- // a multibyte char. Just decode to `char`.
308
+ lines. push ( pos + RelativeBytePos ( 1 ) ) ;
309
+ } else if byte >= 128 {
310
+ // This is the beginning of a multibyte char. Just decode to `char`.
373
311
let c = src[ i..] . chars ( ) . next ( ) . unwrap ( ) ;
374
312
char_len = c. len_utf8 ( ) ;
375
313
376
314
let pos = RelativeBytePos :: from_usize ( i) + output_offset;
377
-
378
- if char_len > 1 {
379
- assert ! ( ( 2 ..=4 ) . contains( & char_len) ) ;
380
- let mbc = MultiByteChar { pos, bytes : char_len as u8 } ;
381
- multi_byte_chars. push ( mbc) ;
382
- }
315
+ assert ! ( ( 2 ..=4 ) . contains( & char_len) ) ;
316
+ let mbc = MultiByteChar { pos, bytes : char_len as u8 } ;
317
+ multi_byte_chars. push ( mbc) ;
383
318
}
384
319
385
320
i += char_len;
0 commit comments