Skip to content

Commit 4a03527

Browse files
committed
refactor: move runtime functions to core
Instead of `include_str!()`ing `range_search.rs`, just make it a normal module under `core::unicode`. This means the same source code doesn't have to be checked in twice, and it plays nicer with IDEs. Also rename it to `rt` since it includes functions for searching the bitsets as well as the range representation.
1 parent ff6dc92 commit 4a03527

File tree

4 files changed

+13
-140
lines changed

4 files changed

+13
-140
lines changed

library/core/src/unicode/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ pub(crate) use unicode_data::white_space::lookup as White_Space;
1818

1919
pub(crate) mod printable;
2020

21+
#[allow(unreachable_pub)]
22+
mod rt;
2123
#[allow(unreachable_pub)]
2224
mod unicode_data;
2325

src/tools/unicode-table-generator/src/range_search.rs renamed to library/core/src/unicode/rt.rs

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
1+
//! Runtime support for `unicode_data`.
2+
13
#[inline(always)]
2-
const fn bitset_search<
4+
pub const fn bitset_search<
35
const N: usize,
46
const CHUNK_SIZE: usize,
57
const N1: usize,
@@ -46,23 +48,23 @@ const fn bitset_search<
4648
}
4749

4850
#[repr(transparent)]
49-
struct ShortOffsetRunHeader(u32);
51+
pub struct ShortOffsetRunHeader(pub u32);
5052

5153
impl ShortOffsetRunHeader {
52-
const fn new(start_index: usize, prefix_sum: u32) -> Self {
54+
pub const fn new(start_index: usize, prefix_sum: u32) -> Self {
5355
assert!(start_index < (1 << 11));
5456
assert!(prefix_sum < (1 << 21));
5557

5658
Self((start_index as u32) << 21 | prefix_sum)
5759
}
5860

5961
#[inline]
60-
const fn start_index(&self) -> usize {
62+
pub const fn start_index(&self) -> usize {
6163
(self.0 >> 21) as usize
6264
}
6365

6466
#[inline]
65-
const fn prefix_sum(&self) -> u32 {
67+
pub const fn prefix_sum(&self) -> u32 {
6668
self.0 & ((1 << 21) - 1)
6769
}
6870
}
@@ -72,7 +74,7 @@ impl ShortOffsetRunHeader {
7274
/// - The last element of `short_offset_runs` must be greater than `std::char::MAX`.
7375
/// - The start indices of all elements in `short_offset_runs` must be less than `OFFSETS`.
7476
#[inline(always)]
75-
unsafe fn skip_search<const SOR: usize, const OFFSETS: usize>(
77+
pub unsafe fn skip_search<const SOR: usize, const OFFSETS: usize>(
7678
needle: char,
7779
short_offset_runs: &[ShortOffsetRunHeader; SOR],
7880
offsets: &[u8; OFFSETS],

library/core/src/unicode/unicode_data.rs

Lines changed: 1 addition & 129 deletions
Original file line numberDiff line numberDiff line change
@@ -11,136 +11,8 @@
1111
// to_upper : 13656 bytes
1212
// Total : 31911 bytes
1313

14-
#[inline(always)]
15-
const fn bitset_search<
16-
const N: usize,
17-
const CHUNK_SIZE: usize,
18-
const N1: usize,
19-
const CANONICAL: usize,
20-
const CANONICALIZED: usize,
21-
>(
22-
needle: u32,
23-
chunk_idx_map: &[u8; N],
24-
bitset_chunk_idx: &[[u8; CHUNK_SIZE]; N1],
25-
bitset_canonical: &[u64; CANONICAL],
26-
bitset_canonicalized: &[(u8, u8); CANONICALIZED],
27-
) -> bool {
28-
let bucket_idx = (needle / 64) as usize;
29-
let chunk_map_idx = bucket_idx / CHUNK_SIZE;
30-
let chunk_piece = bucket_idx % CHUNK_SIZE;
31-
// FIXME(const-hack): Revert to `slice::get` when slice indexing becomes possible in const.
32-
let chunk_idx = if chunk_map_idx < chunk_idx_map.len() {
33-
chunk_idx_map[chunk_map_idx]
34-
} else {
35-
return false;
36-
};
37-
let idx = bitset_chunk_idx[chunk_idx as usize][chunk_piece] as usize;
38-
// FIXME(const-hack): Revert to `slice::get` when slice indexing becomes possible in const.
39-
let word = if idx < bitset_canonical.len() {
40-
bitset_canonical[idx]
41-
} else {
42-
let (real_idx, mapping) = bitset_canonicalized[idx - bitset_canonical.len()];
43-
let mut word = bitset_canonical[real_idx as usize];
44-
let should_invert = mapping & (1 << 6) != 0;
45-
if should_invert {
46-
word = !word;
47-
}
48-
// Lower 6 bits
49-
let quantity = mapping & ((1 << 6) - 1);
50-
if mapping & (1 << 7) != 0 {
51-
// shift
52-
word >>= quantity as u64;
53-
} else {
54-
word = word.rotate_left(quantity as u32);
55-
}
56-
word
57-
};
58-
(word & (1 << (needle % 64) as u64)) != 0
59-
}
60-
61-
#[repr(transparent)]
62-
struct ShortOffsetRunHeader(u32);
63-
64-
impl ShortOffsetRunHeader {
65-
const fn new(start_index: usize, prefix_sum: u32) -> Self {
66-
assert!(start_index < (1 << 11));
67-
assert!(prefix_sum < (1 << 21));
68-
69-
Self((start_index as u32) << 21 | prefix_sum)
70-
}
71-
72-
#[inline]
73-
const fn start_index(&self) -> usize {
74-
(self.0 >> 21) as usize
75-
}
76-
77-
#[inline]
78-
const fn prefix_sum(&self) -> u32 {
79-
self.0 & ((1 << 21) - 1)
80-
}
81-
}
82-
83-
/// # Safety
84-
///
85-
/// - The last element of `short_offset_runs` must be greater than `std::char::MAX`.
86-
/// - The start indices of all elements in `short_offset_runs` must be less than `OFFSETS`.
87-
#[inline(always)]
88-
unsafe fn skip_search<const SOR: usize, const OFFSETS: usize>(
89-
needle: char,
90-
short_offset_runs: &[ShortOffsetRunHeader; SOR],
91-
offsets: &[u8; OFFSETS],
92-
) -> bool {
93-
let needle = needle as u32;
94-
95-
let last_idx =
96-
match short_offset_runs.binary_search_by_key(&(needle << 11), |header| header.0 << 11) {
97-
Ok(idx) => idx + 1,
98-
Err(idx) => idx,
99-
};
100-
// SAFETY: `last_idx` *cannot* be past the end of the array, as the last
101-
// element is greater than `std::char::MAX` (the largest possible needle)
102-
// as guaranteed by the caller.
103-
//
104-
// So, we cannot have found it (i.e. `Ok(idx) => idx + 1 != length`) and the
105-
// correct location cannot be past it, so `Err(idx) => idx != length` either.
106-
//
107-
// This means that we can avoid bounds checking for the accesses below, too.
108-
//
109-
// We need to use `intrinsics::assume` since the `panic_nounwind` contained
110-
// in `hint::assert_unchecked` may not be optimized out.
111-
unsafe { crate::intrinsics::assume(last_idx < SOR) };
112-
113-
let mut offset_idx = short_offset_runs[last_idx].start_index();
114-
let length = if let Some(next) = short_offset_runs.get(last_idx + 1) {
115-
(*next).start_index() - offset_idx
116-
} else {
117-
offsets.len() - offset_idx
118-
};
119-
120-
let prev =
121-
last_idx.checked_sub(1).map(|prev| short_offset_runs[prev].prefix_sum()).unwrap_or(0);
122-
123-
let total = needle - prev;
124-
let mut prefix_sum = 0;
125-
for _ in 0..(length - 1) {
126-
// SAFETY: It is guaranteed that `length <= OFFSETS - offset_idx`,
127-
// so it follows that `length - 1 + offset_idx < OFFSETS`, therefore
128-
// `offset_idx < OFFSETS` is always true in this loop.
129-
//
130-
// We need to use `intrinsics::assume` since the `panic_nounwind` contained
131-
// in `hint::assert_unchecked` may not be optimized out.
132-
unsafe { crate::intrinsics::assume(offset_idx < OFFSETS) };
133-
let offset = offsets[offset_idx];
134-
prefix_sum += offset as u32;
135-
if prefix_sum > total {
136-
break;
137-
}
138-
offset_idx += 1;
139-
}
140-
offset_idx % 2 == 1
141-
}
142-
14314
pub const UNICODE_VERSION: (u8, u8, u8) = (17, 0, 0);
15+
use super::rt::*;
14416

14517
#[rustfmt::skip]
14618
pub mod alphabetic {

src/tools/unicode-table-generator/src/main.rs

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -264,13 +264,9 @@ fn main() {
264264
}
265265
table_file.push_str(&format!("// {:16}: {:5} bytes\n", "Total", total_bytes));
266266

267-
// Include the range search function
268267
table_file.push('\n');
269-
table_file.push_str(include_str!("range_search.rs"));
270-
table_file.push('\n');
271-
272268
table_file.push_str(&version());
273-
269+
table_file.push_str("use super::rt::*;\n");
274270
table_file.push('\n');
275271

276272
modules.push((String::from("conversions"), conversions));
@@ -335,6 +331,7 @@ fn generate_tests(data: &UnicodeData) -> Result<String, fmt::Error> {
335331
writeln!(s, "// ignore-tidy-filelength")?;
336332
writeln!(s, "use std::intrinsics;")?;
337333
writeln!(s, "mod unicode_data;")?;
334+
writeln!(s, "mod rt {{ {} }}", include_str!("../../../../library/core/src/unicode/rt.rs"))?;
338335
writeln!(s, "fn main() {{")?;
339336
for (property, ranges) in &data.ranges {
340337
let prop = property.to_lowercase();

0 commit comments

Comments
 (0)