1
Fork 0

Auto merge of #114860 - Zoxc:sharded-layout, r=SparrowLii

Make `Sharded` an enum and specialize it for the single thread case

This changes `Sharded` into an enum with a dedicated single-shard variant that is used when the compiler runs with only one thread, reducing the size of `Sharded` for greater cache efficiency.

Performance improvement with 1 thread and `cfg(parallel_compiler)`:
<table><tr><td rowspan="2">Benchmark</td><td colspan="1"><b>Before</b></td><td colspan="2"><b>After</b></td></tr><tr><td align="right">Time</td><td align="right">Time</td><td align="right">%</td></tr><tr><td>🟣 <b>clap</b>:check</td><td align="right">1.7009s</td><td align="right">1.6748s</td><td align="right">💚  -1.53%</td></tr><tr><td>🟣 <b>hyper</b>:check</td><td align="right">0.2525s</td><td align="right">0.2451s</td><td align="right">💚  -2.90%</td></tr><tr><td>🟣 <b>regex</b>:check</td><td align="right">0.9519s</td><td align="right">0.9353s</td><td align="right">💚  -1.74%</td></tr><tr><td>🟣 <b>syn</b>:check</td><td align="right">1.5504s</td><td align="right">1.5280s</td><td align="right">💚  -1.45%</td></tr><tr><td>🟣 <b>syntex_syntax</b>:check</td><td align="right">5.9536s</td><td align="right">5.8873s</td><td align="right">💚  -1.11%</td></tr><tr><td>Total</td><td align="right">10.4092s</td><td align="right">10.2706s</td><td align="right">💚  -1.33%</td></tr><tr><td>Summary</td><td align="right">1.0000s</td><td align="right">0.9825s</td><td align="right">💚  -1.75%</td></tr></table>

I did see an unexpected 0.23% change for the serial compiler, so this could use a perf run to see if that reproduces.

cc `@SparrowLii`
This commit is contained in:
bors 2023-08-24 02:24:25 +00:00
commit 840ed5d133
2 changed files with 49 additions and 41 deletions

View file

@ -1,31 +1,26 @@
use crate::fx::{FxHashMap, FxHasher}; use crate::fx::{FxHashMap, FxHasher};
#[cfg(parallel_compiler)] #[cfg(parallel_compiler)]
use crate::sync::is_dyn_thread_safe; use crate::sync::{is_dyn_thread_safe, CacheAligned};
use crate::sync::{CacheAligned, Lock, LockGuard}; use crate::sync::{Lock, LockGuard};
use std::borrow::Borrow; use std::borrow::Borrow;
use std::collections::hash_map::RawEntryMut; use std::collections::hash_map::RawEntryMut;
use std::hash::{Hash, Hasher}; use std::hash::{Hash, Hasher};
use std::mem; use std::mem;
#[cfg(parallel_compiler)]
// 32 shards is sufficient to reduce contention on an 8-core Ryzen 7 1700, // 32 shards is sufficient to reduce contention on an 8-core Ryzen 7 1700,
// but this should be tested on higher core count CPUs. How the `Sharded` type gets used // but this should be tested on higher core count CPUs. How the `Sharded` type gets used
// may also affect the ideal number of shards. // may also affect the ideal number of shards.
const SHARD_BITS: usize = 5; const SHARD_BITS: usize = 5;
#[cfg(not(parallel_compiler))] #[cfg(parallel_compiler)]
const SHARD_BITS: usize = 0; const SHARDS: usize = 1 << SHARD_BITS;
pub const SHARDS: usize = 1 << SHARD_BITS;
/// An array of cache-line aligned inner locked structures with convenience methods. /// An array of cache-line aligned inner locked structures with convenience methods.
pub struct Sharded<T> { /// A single field is used when the compiler uses only one thread.
/// This mask is used to ensure that accesses are inbounds of `shards`. pub enum Sharded<T> {
/// When dynamic thread safety is off, this field is set to 0 causing only Single(Lock<T>),
/// a single shard to be used for greater cache efficiency.
#[cfg(parallel_compiler)] #[cfg(parallel_compiler)]
mask: usize, Shards(Box<[CacheAligned<Lock<T>>; SHARDS]>),
shards: [CacheAligned<Lock<T>>; SHARDS],
} }
impl<T: Default> Default for Sharded<T> { impl<T: Default> Default for Sharded<T> {
@ -38,35 +33,24 @@ impl<T: Default> Default for Sharded<T> {
impl<T> Sharded<T> { impl<T> Sharded<T> {
#[inline] #[inline]
pub fn new(mut value: impl FnMut() -> T) -> Self { pub fn new(mut value: impl FnMut() -> T) -> Self {
Sharded {
#[cfg(parallel_compiler)] #[cfg(parallel_compiler)]
mask: if is_dyn_thread_safe() { SHARDS - 1 } else { 0 }, if is_dyn_thread_safe() {
shards: [(); SHARDS].map(|()| CacheAligned(Lock::new(value()))), return Sharded::Shards(Box::new(
} [(); SHARDS].map(|()| CacheAligned(Lock::new(value()))),
));
} }
#[inline(always)] Sharded::Single(Lock::new(value()))
fn mask(&self) -> usize {
#[cfg(parallel_compiler)]
{
if SHARDS == 1 { 0 } else { self.mask }
}
#[cfg(not(parallel_compiler))]
{
0
}
}
#[inline(always)]
fn count(&self) -> usize {
// `self.mask` is always one below the used shard count
self.mask() + 1
} }
/// The shard is selected by hashing `val` with `FxHasher`. /// The shard is selected by hashing `val` with `FxHasher`.
#[inline] #[inline]
pub fn get_shard_by_value<K: Hash + ?Sized>(&self, val: &K) -> &Lock<T> { pub fn get_shard_by_value<K: Hash + ?Sized>(&self, _val: &K) -> &Lock<T> {
self.get_shard_by_hash(if SHARDS == 1 { 0 } else { make_hash(val) }) match self {
Self::Single(single) => &single,
#[cfg(parallel_compiler)]
Self::Shards(..) => self.get_shard_by_hash(make_hash(_val)),
}
} }
#[inline] #[inline]
@ -75,18 +59,42 @@ impl<T> Sharded<T> {
} }
#[inline] #[inline]
pub fn get_shard_by_index(&self, i: usize) -> &Lock<T> { pub fn get_shard_by_index(&self, _i: usize) -> &Lock<T> {
// SAFETY: The index get ANDed with the mask, ensuring it is always inbounds. match self {
unsafe { &self.shards.get_unchecked(i & self.mask()).0 } Self::Single(single) => &single,
#[cfg(parallel_compiler)]
Self::Shards(shards) => {
// SAFETY: The index gets ANDed with the shard mask, ensuring it is always inbounds.
unsafe { &shards.get_unchecked(_i & (SHARDS - 1)).0 }
}
}
} }
pub fn lock_shards(&self) -> Vec<LockGuard<'_, T>> { pub fn lock_shards(&self) -> Vec<LockGuard<'_, T>> {
(0..self.count()).map(|i| self.get_shard_by_index(i).lock()).collect() match self {
Self::Single(single) => vec![single.lock()],
#[cfg(parallel_compiler)]
Self::Shards(shards) => shards.iter().map(|shard| shard.0.lock()).collect(),
}
} }
pub fn try_lock_shards(&self) -> Option<Vec<LockGuard<'_, T>>> { pub fn try_lock_shards(&self) -> Option<Vec<LockGuard<'_, T>>> {
(0..self.count()).map(|i| self.get_shard_by_index(i).try_lock()).collect() match self {
Self::Single(single) => Some(vec![single.try_lock()?]),
#[cfg(parallel_compiler)]
Self::Shards(shards) => shards.iter().map(|shard| shard.0.try_lock()).collect(),
} }
}
}
#[inline]
pub fn shards() -> usize {
#[cfg(parallel_compiler)]
if is_dyn_thread_safe() {
return SHARDS;
}
1
} }
pub type ShardedHashMap<K, V> = Sharded<FxHashMap<K, V>>; pub type ShardedHashMap<K, V> = Sharded<FxHashMap<K, V>>;

View file

@ -1166,7 +1166,7 @@ impl<K: DepKind> CurrentDepGraph<K> {
)), )),
new_node_to_index: Sharded::new(|| { new_node_to_index: Sharded::new(|| {
FxHashMap::with_capacity_and_hasher( FxHashMap::with_capacity_and_hasher(
new_node_count_estimate / sharded::SHARDS, new_node_count_estimate / sharded::shards(),
Default::default(), Default::default(),
) )
}), }),