Skip to content

Commit c248fef

Browse files
Rollup merge of rust-lang#136460 - real-eren:simplify-rustc_span-analyze, r=Noratrieb
Simplify `rustc_span` `analyze_source_file`.
Simplifies the logic to what the code *actually* does, which is to just record newlines and multibyte characters. Checking for other ASCII control characters is unnecessary because the generic fallback doesn't do anything for those cases. Also uses a simpler (and more efficient) means of iterating the set bits of the mask.
2 parents c3e21a2 + d6ca7ad commit c248fef

File tree

1 file changed

+55
-132
lines changed

1 file changed

+55
-132
lines changed

compiler/rustc_span/src/analyze_source_file.rs

+55-132
Original file line numberDiff line numberDiff line change
@@ -95,65 +95,32 @@ cfg_match! {
9595
if multibyte_mask == 0 {
9696
assert!(intra_chunk_offset == 0);
9797

98-
// Check if there are any control characters in the chunk. All
99-
// control characters that we can encounter at this point have a
100-
// byte value less than 32 or ...
101-
let control_char_test0 = unsafe { _mm_cmplt_epi8(chunk, _mm_set1_epi8(32)) };
102-
let control_char_mask0 = unsafe { _mm_movemask_epi8(control_char_test0) };
103-
104-
// ... it's the ASCII 'DEL' character with a value of 127.
105-
let control_char_test1 = unsafe { _mm_cmpeq_epi8(chunk, _mm_set1_epi8(127)) };
106-
let control_char_mask1 = unsafe { _mm_movemask_epi8(control_char_test1) };
107-
108-
let control_char_mask = control_char_mask0 | control_char_mask1;
109-
110-
if control_char_mask != 0 {
111-
// Check for newlines in the chunk
112-
let newlines_test = unsafe { _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'\n' as i8)) };
113-
let newlines_mask = unsafe { _mm_movemask_epi8(newlines_test) };
114-
115-
if control_char_mask == newlines_mask {
116-
// All control characters are newlines, record them
117-
let mut newlines_mask = 0xFFFF0000 | newlines_mask as u32;
118-
let output_offset = RelativeBytePos::from_usize(chunk_index * CHUNK_SIZE + 1);
119-
120-
loop {
121-
let index = newlines_mask.trailing_zeros();
122-
123-
if index >= CHUNK_SIZE as u32 {
124-
// We have arrived at the end of the chunk.
125-
break;
126-
}
127-
128-
lines.push(RelativeBytePos(index) + output_offset);
129-
130-
// Clear the bit, so we can find the next one.
131-
newlines_mask &= (!1) << index;
132-
}
133-
134-
// We are done for this chunk. All control characters were
135-
// newlines and we took care of those.
136-
continue;
137-
} else {
138-
// Some of the control characters are not newlines,
139-
// fall through to the slow path below.
140-
}
141-
} else {
142-
// No control characters, nothing to record for this chunk
143-
continue;
98+
// Check for newlines in the chunk
99+
let newlines_test = unsafe { _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'\n' as i8)) };
100+
let mut newlines_mask = unsafe { _mm_movemask_epi8(newlines_test) };
101+
102+
let output_offset = RelativeBytePos::from_usize(chunk_index * CHUNK_SIZE + 1);
103+
104+
while newlines_mask != 0 {
105+
let index = newlines_mask.trailing_zeros();
106+
107+
lines.push(RelativeBytePos(index) + output_offset);
108+
109+
// Clear the bit, so we can find the next one.
110+
newlines_mask &= newlines_mask - 1;
144111
}
112+
} else {
113+
// The slow path.
114+
// There are multibyte chars in here, fallback to generic decoding.
115+
let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
116+
intra_chunk_offset = analyze_source_file_generic(
117+
&src[scan_start..],
118+
CHUNK_SIZE - intra_chunk_offset,
119+
RelativeBytePos::from_usize(scan_start),
120+
lines,
121+
multi_byte_chars,
122+
);
145123
}
146-
147-
// The slow path.
148-
// There are control chars in here, fallback to generic decoding.
149-
let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
150-
intra_chunk_offset = analyze_source_file_generic(
151-
&src[scan_start..],
152-
CHUNK_SIZE - intra_chunk_offset,
153-
RelativeBytePos::from_usize(scan_start),
154-
lines,
155-
multi_byte_chars,
156-
);
157124
}
158125

159126
// There might still be a tail left to analyze
@@ -253,65 +220,32 @@ cfg_match! {
253220
if multibyte_mask == 0 {
254221
assert!(intra_chunk_offset == 0);
255222

256-
// Check if there are any control characters in the chunk. All
257-
// control characters that we can encounter at this point have a
258-
// byte value less than 32 or ...
259-
let control_char_test0 = unsafe { _mm_cmplt_epi8(chunk, _mm_set1_epi8(32)) };
260-
let control_char_mask0 = unsafe { _mm_movemask_epi8(control_char_test0) };
261-
262-
// ... it's the ASCII 'DEL' character with a value of 127.
263-
let control_char_test1 = unsafe { _mm_cmpeq_epi8(chunk, _mm_set1_epi8(127)) };
264-
let control_char_mask1 = unsafe { _mm_movemask_epi8(control_char_test1) };
265-
266-
let control_char_mask = control_char_mask0 | control_char_mask1;
267-
268-
if control_char_mask != 0 {
269-
// Check for newlines in the chunk
270-
let newlines_test = unsafe { _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'\n' as i8)) };
271-
let newlines_mask = unsafe { _mm_movemask_epi8(newlines_test) };
272-
273-
if control_char_mask == newlines_mask {
274-
// All control characters are newlines, record them
275-
let mut newlines_mask = 0xFFFF0000 | newlines_mask as u32;
276-
let output_offset = RelativeBytePos::from_usize(chunk_index * CHUNK_SIZE + 1);
277-
278-
loop {
279-
let index = newlines_mask.trailing_zeros();
280-
281-
if index >= CHUNK_SIZE as u32 {
282-
// We have arrived at the end of the chunk.
283-
break;
284-
}
285-
286-
lines.push(RelativeBytePos(index) + output_offset);
287-
288-
// Clear the bit, so we can find the next one.
289-
newlines_mask &= (!1) << index;
290-
}
291-
292-
// We are done for this chunk. All control characters were
293-
// newlines and we took care of those.
294-
continue;
295-
} else {
296-
// Some of the control characters are not newlines,
297-
// fall through to the slow path below.
298-
}
299-
} else {
300-
// No control characters, nothing to record for this chunk
301-
continue;
223+
// Check for newlines in the chunk
224+
let newlines_test = unsafe { _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'\n' as i8)) };
225+
let mut newlines_mask = unsafe { _mm_movemask_epi8(newlines_test) };
226+
227+
let output_offset = RelativeBytePos::from_usize(chunk_index * CHUNK_SIZE + 1);
228+
229+
while newlines_mask != 0 {
230+
let index = newlines_mask.trailing_zeros();
231+
232+
lines.push(RelativeBytePos(index) + output_offset);
233+
234+
// Clear the bit, so we can find the next one.
235+
newlines_mask &= newlines_mask - 1;
302236
}
237+
} else {
238+
// The slow path.
239+
// There are multibyte chars in here, fallback to generic decoding.
240+
let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
241+
intra_chunk_offset = analyze_source_file_generic(
242+
&src[scan_start..],
243+
CHUNK_SIZE - intra_chunk_offset,
244+
RelativeBytePos::from_usize(scan_start),
245+
lines,
246+
multi_byte_chars,
247+
);
303248
}
304-
305-
// The slow path.
306-
// There are control chars in here, fallback to generic decoding.
307-
let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
308-
intra_chunk_offset = analyze_source_file_generic(
309-
&src[scan_start..],
310-
CHUNK_SIZE - intra_chunk_offset,
311-
RelativeBytePos::from_usize(scan_start),
312-
lines,
313-
multi_byte_chars,
314-
);
315249
}
316250

317251
// There might still be a tail left to analyze
@@ -369,29 +303,18 @@ fn analyze_source_file_generic(
369303
// string.
370304
let mut char_len = 1;
371305

372-
if byte < 32 {
373-
// This is an ASCII control character, it could be one of the cases
374-
// that are interesting to us.
375-
306+
if byte == b'\n' {
376307
let pos = RelativeBytePos::from_usize(i) + output_offset;
377-
378-
if let b'\n' = byte {
379-
lines.push(pos + RelativeBytePos(1));
380-
}
381-
} else if byte >= 127 {
382-
// The slow path:
383-
// This is either ASCII control character "DEL" or the beginning of
384-
// a multibyte char. Just decode to `char`.
308+
lines.push(pos + RelativeBytePos(1));
309+
} else if byte >= 128 {
310+
// This is the beginning of a multibyte char. Just decode to `char`.
385311
let c = src[i..].chars().next().unwrap();
386312
char_len = c.len_utf8();
387313

388314
let pos = RelativeBytePos::from_usize(i) + output_offset;
389-
390-
if char_len > 1 {
391-
assert!((2..=4).contains(&char_len));
392-
let mbc = MultiByteChar { pos, bytes: char_len as u8 };
393-
multi_byte_chars.push(mbc);
394-
}
315+
assert!((2..=4).contains(&char_len));
316+
let mbc = MultiByteChar { pos, bytes: char_len as u8 };
317+
multi_byte_chars.push(mbc);
395318
}
396319

397320
i += char_len;

0 commit comments

Comments
 (0)