Skip to content

Commit d6ca7ad

Browse files
committed
Remove dead control char logic
Only newlines and multibyte characters are actually relevant
1 parent 3c7c38a commit d6ca7ad

File tree

1 file changed

+55
-120
lines changed

1 file changed

+55
-120
lines changed

compiler/rustc_span/src/analyze_source_file.rs

+55-120
Original file line number | Diff line number | Diff line change
@@ -95,59 +95,32 @@ cfg_match! {
9595
if multibyte_mask == 0 {
9696
assert!(intra_chunk_offset == 0);
9797

98-
// Check if there are any control characters in the chunk. All
99-
// control characters that we can encounter at this point have a
100-
// byte value less than 32 or ...
101-
let control_char_test0 = unsafe { _mm_cmplt_epi8(chunk, _mm_set1_epi8(32)) };
102-
let control_char_mask0 = unsafe { _mm_movemask_epi8(control_char_test0) };
103-
104-
// ... it's the ASCII 'DEL' character with a value of 127.
105-
let control_char_test1 = unsafe { _mm_cmpeq_epi8(chunk, _mm_set1_epi8(127)) };
106-
let control_char_mask1 = unsafe { _mm_movemask_epi8(control_char_test1) };
107-
108-
let control_char_mask = control_char_mask0 | control_char_mask1;
109-
110-
if control_char_mask != 0 {
111-
// Check for newlines in the chunk
112-
let newlines_test = unsafe { _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'\n' as i8)) };
113-
let mut newlines_mask = unsafe { _mm_movemask_epi8(newlines_test) };
114-
115-
if control_char_mask == newlines_mask {
116-
// All control characters are newlines, record them
117-
let output_offset = RelativeBytePos::from_usize(chunk_index * CHUNK_SIZE + 1);
118-
119-
while newlines_mask != 0 {
120-
let index = newlines_mask.trailing_zeros();
121-
122-
lines.push(RelativeBytePos(index) + output_offset);
123-
124-
// Clear the bit, so we can find the next one.
125-
newlines_mask &= newlines_mask - 1;
126-
}
127-
128-
// We are done for this chunk. All control characters were
129-
// newlines and we took care of those.
130-
continue;
131-
} else {
132-
// Some of the control characters are not newlines,
133-
// fall through to the slow path below.
134-
}
135-
} else {
136-
// No control characters, nothing to record for this chunk
137-
continue;
98+
// Check for newlines in the chunk
99+
let newlines_test = unsafe { _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'\n' as i8)) };
100+
let mut newlines_mask = unsafe { _mm_movemask_epi8(newlines_test) };
101+
102+
let output_offset = RelativeBytePos::from_usize(chunk_index * CHUNK_SIZE + 1);
103+
104+
while newlines_mask != 0 {
105+
let index = newlines_mask.trailing_zeros();
106+
107+
lines.push(RelativeBytePos(index) + output_offset);
108+
109+
// Clear the bit, so we can find the next one.
110+
newlines_mask &= newlines_mask - 1;
138111
}
112+
} else {
113+
// The slow path.
114+
// There are multibyte chars in here, fallback to generic decoding.
115+
let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
116+
intra_chunk_offset = analyze_source_file_generic(
117+
&src[scan_start..],
118+
CHUNK_SIZE - intra_chunk_offset,
119+
RelativeBytePos::from_usize(scan_start),
120+
lines,
121+
multi_byte_chars,
122+
);
139123
}
140-
141-
// The slow path.
142-
// There are control chars in here, fallback to generic decoding.
143-
let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
144-
intra_chunk_offset = analyze_source_file_generic(
145-
&src[scan_start..],
146-
CHUNK_SIZE - intra_chunk_offset,
147-
RelativeBytePos::from_usize(scan_start),
148-
lines,
149-
multi_byte_chars,
150-
);
151124
}
152125

153126
// There might still be a tail left to analyze
@@ -247,59 +220,32 @@ cfg_match! {
247220
if multibyte_mask == 0 {
248221
assert!(intra_chunk_offset == 0);
249222

250-
// Check if there are any control characters in the chunk. All
251-
// control characters that we can encounter at this point have a
252-
// byte value less than 32 or ...
253-
let control_char_test0 = unsafe { _mm_cmplt_epi8(chunk, _mm_set1_epi8(32)) };
254-
let control_char_mask0 = unsafe { _mm_movemask_epi8(control_char_test0) };
255-
256-
// ... it's the ASCII 'DEL' character with a value of 127.
257-
let control_char_test1 = unsafe { _mm_cmpeq_epi8(chunk, _mm_set1_epi8(127)) };
258-
let control_char_mask1 = unsafe { _mm_movemask_epi8(control_char_test1) };
259-
260-
let control_char_mask = control_char_mask0 | control_char_mask1;
261-
262-
if control_char_mask != 0 {
263-
// Check for newlines in the chunk
264-
let newlines_test = unsafe { _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'\n' as i8)) };
265-
let mut newlines_mask = unsafe { _mm_movemask_epi8(newlines_test) };
266-
267-
if control_char_mask == newlines_mask {
268-
// All control characters are newlines, record them
269-
let output_offset = RelativeBytePos::from_usize(chunk_index * CHUNK_SIZE + 1);
270-
271-
while newlines_mask != 0 {
272-
let index = newlines_mask.trailing_zeros();
273-
274-
lines.push(RelativeBytePos(index) + output_offset);
275-
276-
// Clear the bit, so we can find the next one.
277-
newlines_mask &= newlines_mask - 1;
278-
}
279-
280-
// We are done for this chunk. All control characters were
281-
// newlines and we took care of those.
282-
continue;
283-
} else {
284-
// Some of the control characters are not newlines,
285-
// fall through to the slow path below.
286-
}
287-
} else {
288-
// No control characters, nothing to record for this chunk
289-
continue;
223+
// Check for newlines in the chunk
224+
let newlines_test = unsafe { _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'\n' as i8)) };
225+
let mut newlines_mask = unsafe { _mm_movemask_epi8(newlines_test) };
226+
227+
let output_offset = RelativeBytePos::from_usize(chunk_index * CHUNK_SIZE + 1);
228+
229+
while newlines_mask != 0 {
230+
let index = newlines_mask.trailing_zeros();
231+
232+
lines.push(RelativeBytePos(index) + output_offset);
233+
234+
// Clear the bit, so we can find the next one.
235+
newlines_mask &= newlines_mask - 1;
290236
}
237+
} else {
238+
// The slow path.
239+
// There are multibyte chars in here, fallback to generic decoding.
240+
let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
241+
intra_chunk_offset = analyze_source_file_generic(
242+
&src[scan_start..],
243+
CHUNK_SIZE - intra_chunk_offset,
244+
RelativeBytePos::from_usize(scan_start),
245+
lines,
246+
multi_byte_chars,
247+
);
291248
}
292-
293-
// The slow path.
294-
// There are control chars in here, fallback to generic decoding.
295-
let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
296-
intra_chunk_offset = analyze_source_file_generic(
297-
&src[scan_start..],
298-
CHUNK_SIZE - intra_chunk_offset,
299-
RelativeBytePos::from_usize(scan_start),
300-
lines,
301-
multi_byte_chars,
302-
);
303249
}
304250

305251
// There might still be a tail left to analyze
@@ -357,29 +303,18 @@ fn analyze_source_file_generic(
357303
// string.
358304
let mut char_len = 1;
359305

360-
if byte < 32 {
361-
// This is an ASCII control character, it could be one of the cases
362-
// that are interesting to us.
363-
306+
if byte == b'\n' {
364307
let pos = RelativeBytePos::from_usize(i) + output_offset;
365-
366-
if let b'\n' = byte {
367-
lines.push(pos + RelativeBytePos(1));
368-
}
369-
} else if byte >= 127 {
370-
// The slow path:
371-
// This is either ASCII control character "DEL" or the beginning of
372-
// a multibyte char. Just decode to `char`.
308+
lines.push(pos + RelativeBytePos(1));
309+
} else if byte >= 128 {
310+
// This is the beginning of a multibyte char. Just decode to `char`.
373311
let c = src[i..].chars().next().unwrap();
374312
char_len = c.len_utf8();
375313

376314
let pos = RelativeBytePos::from_usize(i) + output_offset;
377-
378-
if char_len > 1 {
379-
assert!((2..=4).contains(&char_len));
380-
let mbc = MultiByteChar { pos, bytes: char_len as u8 };
381-
multi_byte_chars.push(mbc);
382-
}
315+
assert!((2..=4).contains(&char_len));
316+
let mbc = MultiByteChar { pos, bytes: char_len as u8 };
317+
multi_byte_chars.push(mbc);
383318
}
384319

385320
i += char_len;

0 commit comments

Comments
 (0)