Skip to content

Commit 6fa8898

Browse files
committed
Split Symbol interner into static and dynamic
This improves performance of Symbol interning in several ways. The main motivation of this work is to prepare rustc for efficient parallel builds. The Symbol lookup table (mapping from strings to Symbol numbers) is now split into two separate tables: a static table and a dynamic table. The static table contains strings that are known to rustc, including all keywords and all `sym::foo` symbols. The dynamic table contains strings that rustc discovers while compiling, such as "my_super_obscure_function_name". Since the static table is known at compile time (that is, when rustc itself is being compiled), this table can be stored entirely in static data structures. We use the `phf` crate to generate this table; `phf` generates perfect hash functions. This allows rustc to perform Symbol lookups for static symbols without any multithreaded synchronization, or accessing any dynamic data whatsoever. I measured the percentage of static symbol lookups in many common Rust crates, including rust/compiler, rust/library, servo, rand, quote, syn, rust-analyzer, rayon, and rsvg. Among these crates, between 35% and 55% of all symbol lookups were resolved using the static lookup table.
1 parent 14265f9 commit 6fa8898

File tree

8 files changed

+212
-56
lines changed

8 files changed

+212
-56
lines changed

Cargo.lock

+2
Original file line numberDiff line numberDiff line change
@@ -3913,6 +3913,7 @@ dependencies = [
39133913
name = "rustc_macros"
39143914
version = "0.1.0"
39153915
dependencies = [
3916+
"phf_codegen",
39163917
"proc-macro2",
39173918
"quote",
39183919
"syn",
@@ -4200,6 +4201,7 @@ version = "0.0.0"
42004201
dependencies = [
42014202
"cfg-if 0.1.10",
42024203
"md-5",
4204+
"phf",
42034205
"rustc_arena",
42044206
"rustc_data_structures",
42054207
"rustc_index",

compiler/rustc_macros/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,4 @@ synstructure = "0.12.1"
1212
syn = { version = "1", features = ["full"] }
1313
proc-macro2 = "1"
1414
quote = "1"
15+
phf_codegen = "0.8"

compiler/rustc_macros/src/symbols.rs

+57-35
Original file line numberDiff line numberDiff line change
@@ -124,10 +124,6 @@ fn symbols_with_errors(input: TokenStream) -> (TokenStream, Vec<syn::Error>) {
124124
}
125125
};
126126

127-
let mut keyword_stream = quote! {};
128-
let mut symbols_stream = quote! {};
129-
let mut prefill_stream = quote! {};
130-
let mut counter = 0u32;
131127
let mut keys =
132128
HashMap::<String, Span>::with_capacity(input.keywords.len() + input.symbols.len() + 10);
133129
let mut prev_key: Option<(Span, String)> = None;
@@ -136,8 +132,10 @@ fn symbols_with_errors(input: TokenStream) -> (TokenStream, Vec<syn::Error>) {
136132
if let Some(prev_span) = keys.get(str) {
137133
errors.error(span, format!("Symbol `{}` is duplicated", str));
138134
errors.error(*prev_span, format!("location of previous definition"));
135+
Err(())
139136
} else {
140137
keys.insert(str.to_string(), span);
138+
Ok(())
141139
}
142140
};
143141

@@ -151,51 +149,75 @@ fn symbols_with_errors(input: TokenStream) -> (TokenStream, Vec<syn::Error>) {
151149
prev_key = Some((span, str.to_string()));
152150
};
153151

152+
let mut symbol_strings: Vec<String> = Vec::new();
153+
154154
// Generate the listed keywords.
155+
let mut keyword_stream = quote! {};
155156
for keyword in input.keywords.iter() {
156157
let name = &keyword.name;
157158
let value = &keyword.value;
158159
let value_string = value.value();
159-
check_dup(keyword.name.span(), &value_string, &mut errors);
160-
prefill_stream.extend(quote! {
161-
#value,
162-
});
160+
let symbol_index = symbol_strings.len() as u32;
161+
if check_dup(keyword.name.span(), &value_string, &mut errors).is_ok() {
162+
// Only add an entry to `symbol_strings` if it is not a duplicate.
163+
// If it is a duplicate, then compilation will fail. However, we still
164+
// want to avoid panicking if a duplicate is detected.
165+
symbol_strings.push(value_string);
166+
}
163167
keyword_stream.extend(quote! {
164-
pub const #name: Symbol = Symbol::new(#counter);
168+
pub const #name: Symbol = Symbol::new(#symbol_index);
165169
});
166-
counter += 1;
170+
}
171+
172+
// Generate symbols for the strings "0", "1", ..., "9".
173+
let digits_base = symbol_strings.len() as u32;
174+
for n in 0..10 {
175+
let n_string = n.to_string();
176+
if check_dup(Span::call_site(), &n_string, &mut errors).is_ok() {
177+
symbol_strings.push(n_string);
178+
}
167179
}
168180

169181
// Generate the listed symbols.
182+
let mut symbols_stream = quote! {};
170183
for symbol in input.symbols.iter() {
171184
let name = &symbol.name;
185+
let name_string = symbol.name.to_string();
186+
check_order(symbol.name.span(), &name_string, &mut errors);
172187
let value = match &symbol.value {
173188
Some(value) => value.value(),
174-
None => name.to_string(),
189+
None => name_string,
175190
};
176-
check_dup(symbol.name.span(), &value, &mut errors);
177-
check_order(symbol.name.span(), &name.to_string(), &mut errors);
178191

179-
prefill_stream.extend(quote! {
180-
#value,
181-
});
192+
let symbol_index = symbol_strings.len() as u32;
193+
if check_dup(symbol.name.span(), &value, &mut errors).is_ok() {
194+
// Only add an entry to `symbol_strings` if it is not a duplicate.
195+
// If it is a duplicate, then compilation will fail. However, we still
196+
// want to avoid panicking if a duplicate is detected.
197+
symbol_strings.push(value);
198+
}
199+
182200
symbols_stream.extend(quote! {
183-
pub const #name: Symbol = Symbol::new(#counter);
201+
pub const #name: Symbol = Symbol::new(#symbol_index);
184202
});
185-
counter += 1;
186203
}
187204

188-
// Generate symbols for the strings "0", "1", ..., "9".
189-
let digits_base = counter;
190-
counter += 10;
191-
for n in 0..10 {
192-
let n = n.to_string();
193-
check_dup(Span::call_site(), &n, &mut errors);
194-
prefill_stream.extend(quote! {
195-
#n,
196-
});
205+
// We have finished collecting symbol strings.
206+
let static_symbols_len = symbol_strings.len();
207+
let dynamic_symbol_base = symbol_strings.len() as u32;
208+
let symbol_strings = symbol_strings; // no more mutation
209+
210+
// Build the body of STATIC_SYMBOLS.
211+
let symbol_strings_tokens: TokenStream = symbol_strings.iter().map(|s| quote!(#s,)).collect();
212+
213+
// Build the PHF map. This translates from strings to Symbol values.
214+
let mut phf_map = phf_codegen::Map::<&str>::new();
215+
for (symbol_index, symbol) in symbol_strings.iter().enumerate() {
216+
phf_map.entry(symbol, format!("Symbol::new({})", symbol_index as u32).as_str());
197217
}
198-
let _ = counter; // for future use
218+
let phf_map_built = phf_map.build();
219+
let phf_map_text = phf_map_built.to_string();
220+
let phf_map_expr = syn::parse_str::<syn::Expr>(&phf_map_text).unwrap();
199221

200222
let output = quote! {
201223
const SYMBOL_DIGITS_BASE: u32 = #digits_base;
@@ -215,13 +237,13 @@ fn symbols_with_errors(input: TokenStream) -> (TokenStream, Vec<syn::Error>) {
215237
#symbols_stream
216238
}
217239

218-
impl Interner {
219-
pub fn fresh() -> Self {
220-
Interner::prefill(&[
221-
#prefill_stream
222-
])
223-
}
224-
}
240+
const DYNAMIC_SYMBOL_BASE: u32 = #dynamic_symbol_base;
241+
242+
static STATIC_SYMBOLS: [&str; #static_symbols_len as usize] = [
243+
#symbol_strings_tokens
244+
];
245+
246+
static STATIC_SYMBOLS_PHF: ::phf::Map<&'static str, Symbol> = #phf_map_expr;
225247
};
226248

227249
(output, errors.list)

compiler/rustc_span/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,4 @@ tracing = "0.1"
2020
sha-1 = "0.9"
2121
sha2 = "0.9"
2222
md-5 = "0.9"
23+
phf = "0.8"

compiler/rustc_span/src/lib.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ pub struct SessionGlobals {
9090
impl SessionGlobals {
9191
pub fn new(edition: Edition) -> SessionGlobals {
9292
SessionGlobals {
93-
symbol_interner: Lock::new(symbol::Interner::fresh()),
93+
symbol_interner: Lock::new(symbol::Interner::default()),
9494
span_interner: Lock::new(span_encoding::SpanInterner::default()),
9595
hygiene_data: Lock::new(hygiene::HygieneData::new(edition)),
9696
source_map: Lock::new(None),

compiler/rustc_span/src/symbol.rs

+74-13
Original file line numberDiff line numberDiff line change
@@ -1452,17 +1452,55 @@ impl Symbol {
14521452
Symbol(SymbolIndex::from_u32(n))
14531453
}
14541454

1455+
/// Maps a string to its interned representation, but only if this string is a known
1456+
/// (static) symbol.
1457+
pub fn intern_static(string: &str) -> Option<Symbol> {
1458+
if let Some(symbol) = STATIC_SYMBOLS_PHF.get(string) { Some(*symbol) } else { None }
1459+
}
1460+
14551461
/// Maps a string to its interned representation.
1462+
// #[inline(never)] - There is no benefit to inlining this function (verified with
1463+
// performance measurements), and disabling inlining reduces overall code size.
1464+
#[inline(never)]
14561465
pub fn intern(string: &str) -> Self {
1457-
with_interner(|interner| interner.intern(string))
1466+
if let Some(symbol) = Symbol::intern_static(string) {
1467+
symbol
1468+
} else {
1469+
with_interner(|interner| interner.intern_dynamic(string))
1470+
}
1471+
}
1472+
1473+
pub fn is_static(self) -> bool {
1474+
self.0.as_u32() < DYNAMIC_SYMBOL_BASE
1475+
}
1476+
1477+
/// Translates the `Symbol` to a string, but only if this `Symbol`
1478+
/// was originally interned as a static symbol.
1479+
pub fn as_str_static(self) -> Option<&'static str> {
1480+
let symbol_index = self.0.as_usize();
1481+
if symbol_index < STATIC_SYMBOLS.len() {
1482+
// This is a well-known symbol. The symbol string is stored in the static `STATIC_SYMBOLS` table.
1483+
// There is no need to lock the interner.
1484+
Some(STATIC_SYMBOLS[symbol_index])
1485+
} else {
1486+
None
1487+
}
14581488
}
14591489

14601490
/// Convert to a `SymbolStr`. This is a slowish operation because it
14611491
/// requires locking the symbol interner.
1492+
///
1493+
/// If the symbol is a statically-interned symbol (interned at rustc compile time),
1494+
/// then this operation is fast, and does not acquire any locks.
14621495
pub fn as_str(self) -> SymbolStr {
1463-
with_interner(|interner| unsafe {
1464-
SymbolStr { string: std::mem::transmute::<&str, &str>(interner.get(self)) }
1465-
})
1496+
if let Some(string) = self.as_str_static() {
1497+
SymbolStr { string }
1498+
} else {
1499+
// This is a dynamic string. The string is stored in the Interner.
1500+
with_interner(|interner| unsafe {
1501+
SymbolStr { string: std::mem::transmute::<&str, &str>(interner.get_dynamic(self)) }
1502+
})
1503+
}
14661504
}
14671505

14681506
pub fn as_u32(self) -> u32 {
@@ -1528,6 +1566,13 @@ impl<CTX> ToStableHashKey<CTX> for Symbol {
15281566
// The `FxHashMap`+`Vec` pair could be replaced by `FxIndexSet`, but #75278
15291567
// found that to regress performance up to 2% in some cases. This might be
15301568
// revisited after further improvements to `indexmap`.
1569+
//
1570+
// `Interner` does not contain any of the statically-known symbol names.
1571+
// It does not contain any of the strings defined in the `Keyword` or
1572+
// `Symbol` sections. Since those strings are statically-known, we just
1573+
// look them up in a (static) table, when needed. See
1574+
// `STATIC_SYMBOLS` and `STATIC_SYMBOLS_PHF`, which are both generated by
1575+
// `compiler/rustc_macros/src/symbols.rs`.
15311576
#[derive(Default)]
15321577
pub struct Interner {
15331578
arena: DroplessArena,
@@ -1536,21 +1581,28 @@ pub struct Interner {
15361581
}
15371582

15381583
impl Interner {
1539-
fn prefill(init: &[&'static str]) -> Self {
1540-
Interner {
1541-
strings: init.into(),
1542-
names: init.iter().copied().zip((0..).map(Symbol::new)).collect(),
1543-
..Default::default()
1584+
pub fn intern(&mut self, string: &str) -> Symbol {
1585+
if let Some(sym) = Symbol::intern_static(string) {
1586+
sym
1587+
} else {
1588+
self.intern_dynamic(string)
15441589
}
15451590
}
15461591

1547-
#[inline]
1548-
pub fn intern(&mut self, string: &str) -> Symbol {
1592+
fn intern_dynamic(&mut self, string: &str) -> Symbol {
1593+
// The caller should have already checked for static symbols.
1594+
// Failure to do so is a bug, since this code will mistakenly
1595+
// intern the static symbol, resulting in a bogus symbol index.
1596+
// (The whole point of this design is that you can do static
1597+
// lookups without acquiring the thread-local Interner, so if
1598+
// we got here with a static symbol, we goofed.)
1599+
debug_assert!(Symbol::intern_static(string).is_none());
1600+
15491601
if let Some(&name) = self.names.get(string) {
15501602
return name;
15511603
}
15521604

1553-
let name = Symbol::new(self.strings.len() as u32);
1605+
let name = Symbol::new(DYNAMIC_SYMBOL_BASE + self.strings.len() as u32);
15541606

15551607
// `from_utf8_unchecked` is safe since we just allocated a `&str` which is known to be
15561608
// UTF-8.
@@ -1567,7 +1619,16 @@ impl Interner {
15671619
// Get the symbol as a string. `Symbol::as_str()` should be used in
15681620
// preference to this function.
15691621
pub fn get(&self, symbol: Symbol) -> &str {
1570-
self.strings[symbol.0.as_usize()]
1622+
if let Some(string) = symbol.as_str_static() {
1623+
string
1624+
} else {
1625+
&self.strings[(symbol.as_u32() - DYNAMIC_SYMBOL_BASE) as usize]
1626+
}
1627+
}
1628+
1629+
fn get_dynamic(&self, symbol: Symbol) -> &str {
1630+
debug_assert!(!symbol.is_static());
1631+
self.strings[(symbol.as_u32() - DYNAMIC_SYMBOL_BASE) as usize]
15711632
}
15721633
}
15731634

compiler/rustc_span/src/symbol/tests.rs

+71-7
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,14 @@ use crate::{edition, SessionGlobals};
55
#[test]
66
fn interner_tests() {
77
let mut i: Interner = Interner::default();
8-
// first one is zero:
9-
assert_eq!(i.intern("dog"), Symbol::new(0));
8+
let dog = i.intern("dog");
109
// re-use gets the same entry:
11-
assert_eq!(i.intern("dog"), Symbol::new(0));
10+
assert_eq!(i.intern("dog").as_u32(), dog.as_u32());
1211
// different string gets a different #:
13-
assert_eq!(i.intern("cat"), Symbol::new(1));
14-
assert_eq!(i.intern("cat"), Symbol::new(1));
15-
// dog is still at zero
16-
assert_eq!(i.intern("dog"), Symbol::new(0));
12+
let cat = i.intern("cat");
13+
assert_ne!(dog.as_u32(), cat.as_u32());
14+
assert_eq!(i.intern("cat").as_u32(), cat.as_u32());
15+
assert_eq!(i.intern("dog").as_u32(), dog.as_u32());
1716
}
1817

1918
#[test]
@@ -23,3 +22,68 @@ fn without_first_quote_test() {
2322
assert_eq!(i.without_first_quote().name, kw::Break);
2423
});
2524
}
25+
26+
#[test]
27+
fn test_static_symbols() {
28+
assert_eq!(Symbol::intern_static(""), Some(kw::Invalid));
29+
assert_eq!(Symbol::intern_static("not in the static table"), None);
30+
assert!(Symbol::intern_static("fn").is_some()); // don't care about exact index
31+
32+
// check round-tripping
33+
for &string in ["as", "fn", "let", "trait", "size_of_val"].iter() {
34+
let sym = Symbol::intern_static(string).unwrap();
35+
assert_eq!(string, &*sym.as_str(), "sym #{}", sym.0.as_u32());
36+
}
37+
}
38+
39+
#[test]
40+
fn test_ident_is_special() {
41+
for &s in [kw::Invalid, kw::PathRoot, kw::DollarCrate, kw::Underscore].iter() {
42+
let ident = Ident::with_dummy_span(s);
43+
assert_eq!(ident.is_special(), true, "s = {:?}", s);
44+
}
45+
46+
for &s in [kw::As, kw::Break, kw::UnderscoreLifetime].iter() {
47+
let ident = Ident::with_dummy_span(s);
48+
assert_eq!(ident.is_special(), false, "s = {:?}", s);
49+
}
50+
}
51+
52+
#[test]
53+
fn test_symbol_as_str() {
54+
SESSION_GLOBALS.set(&SessionGlobals::new(edition::Edition::Edition2018), || {
55+
for &(sym, string) in [
56+
(kw::Invalid, ""),
57+
(kw::PathRoot, "{{root}}"),
58+
(kw::DollarCrate, "$crate"),
59+
(kw::As, "as"),
60+
(kw::Break, "break"),
61+
(kw::While, "while"),
62+
(kw::Union, "union"),
63+
(sym::Alignment, "Alignment"),
64+
(sym::Arc, "Arc"),
65+
(sym::zmm_reg, "zmm_reg"),
66+
(sym::i64, "i64"),
67+
]
68+
.iter()
69+
{
70+
let as_str = sym.as_str();
71+
assert_eq!(&*as_str, string);
72+
73+
let sym2 = Symbol::intern(string);
74+
assert_eq!(sym, sym2, "sym={} sym2={}", sym.as_u32(), sym2.as_u32());
75+
}
76+
77+
let colon = Symbol::intern(":");
78+
assert_eq!(&*colon.as_str(), ":");
79+
});
80+
}
81+
82+
#[test]
83+
fn test_dynamic_symbols() {
84+
crate::with_session_globals(crate::edition::Edition::Edition2018, || {
85+
let s1 = Symbol::intern("fuzzy wuzzy");
86+
assert!(!s1.is_static());
87+
assert_eq!(&*s1.as_str(), "fuzzy wuzzy");
88+
});
89+
}

0 commit comments

Comments
 (0)