
Auto merge of #136115 - Mark-Simulacrum:shard-alloc-id, r=RalfJung

Shard AllocMap Lock

This improves performance of many-seed parallel (-Zthreads=32) Miri executions from managing to use only ~8 cores to using 27-28 cores, which is about the same as what I see with the data structure proposed in https://github.com/rust-lang/rust/pull/136105. I haven't analyzed this, but I suspect the sharding might actually work out better if we commonly insert "densely", since sharding splits the cache lines while the OnceVec packs locks close together. Of course, we could do something similar with the bitset lock too.

Either way, this seems like a very reasonable starting point that solves the problem about equally well on the workloads I can test locally.

r? `@RalfJung`
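
For readers new to the idea: a sharded map keeps several independently locked sub-maps and routes each key to one of them by hash, so inserts that land in different shards never contend for the same lock. Below is a minimal std-only sketch of that shape; the names (ShardedMap, SHARDS) are hypothetical stand-ins for rustc's actual rustc_data_structures::sharded::ShardedHashMap.

use std::collections::HashMap;
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
use std::sync::Mutex;

// Hypothetical shard count; the real implementation tunes this.
const SHARDS: usize = 16;

// One lock per shard instead of one lock for the whole map.
struct ShardedMap<K, V> {
    shards: Vec<Mutex<HashMap<K, V>>>,
}

impl<K: Hash + Eq, V> ShardedMap<K, V> {
    fn new() -> Self {
        ShardedMap { shards: (0..SHARDS).map(|_| Mutex::new(HashMap::new())).collect() }
    }

    // Hash the key to pick a shard; only that shard's lock is taken, so
    // threads touching keys that hash to different shards run in parallel.
    fn shard_for(&self, key: &K) -> &Mutex<HashMap<K, V>> {
        let mut hasher = DefaultHasher::new();
        key.hash(&mut hasher);
        &self.shards[hasher.finish() as usize % SHARDS]
    }

    fn insert(&self, key: K, value: V) -> Option<V> {
        self.shard_for(&key).lock().unwrap().insert(key, value)
    }

    fn get(&self, key: &K) -> Option<V>
    where
        V: Clone,
    {
        self.shard_for(key).lock().unwrap().get(key).cloned()
    }
}

fn main() {
    let map: ShardedMap<u64, &str> = ShardedMap::new();
    map.insert(1, "alloc");
    assert_eq!(map.get(&1), Some("alloc"));
}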
bors 2025-02-04 23:47:45 +00:00
commit e5f11af042
2 changed files with 44 additions and 26 deletions

compiler/rustc_middle/src/mir/interpret/mod.rs

@@ -15,7 +15,8 @@ use std::{fmt, io};
 use rustc_abi::{AddressSpace, Align, Endian, HasDataLayout, Size};
 use rustc_ast::{LitKind, Mutability};
 use rustc_data_structures::fx::FxHashMap;
-use rustc_data_structures::sync::Lock;
+use rustc_data_structures::sharded::ShardedHashMap;
+use rustc_data_structures::sync::{AtomicU64, Lock};
 use rustc_hir::def::DefKind;
 use rustc_hir::def_id::{DefId, LocalDefId};
 use rustc_macros::{HashStable, TyDecodable, TyEncodable, TypeFoldable, TypeVisitable};
@@ -389,35 +390,39 @@ pub const CTFE_ALLOC_SALT: usize = 0;
 pub(crate) struct AllocMap<'tcx> {
     /// Maps `AllocId`s to their corresponding allocations.
-    alloc_map: FxHashMap<AllocId, GlobalAlloc<'tcx>>,
+    // Note that this map on rustc workloads seems to be rather dense, but in miri workloads should
+    // be pretty sparse. In #136105 we considered replacing it with a (dense) Vec-based map, but
+    // since there are workloads where it can be sparse we decided to go with sharding for now. At
+    // least up to 32 cores the one workload tested didn't exhibit much difference between the two.
+    //
+    // Should be locked *after* locking dedup if locking both to avoid deadlocks.
+    to_alloc: ShardedHashMap<AllocId, GlobalAlloc<'tcx>>,
 
     /// Used to deduplicate global allocations: functions, vtables, string literals, ...
     ///
     /// The `usize` is a "salt" used by Miri to make deduplication imperfect, thus better emulating
     /// the actual guarantees.
-    dedup: FxHashMap<(GlobalAlloc<'tcx>, usize), AllocId>,
+    dedup: Lock<FxHashMap<(GlobalAlloc<'tcx>, usize), AllocId>>,
 
     /// The `AllocId` to assign to the next requested ID.
     /// Always incremented; never gets smaller.
-    next_id: AllocId,
+    next_id: AtomicU64,
 }
 
 impl<'tcx> AllocMap<'tcx> {
     pub(crate) fn new() -> Self {
         AllocMap {
-            alloc_map: Default::default(),
+            to_alloc: Default::default(),
             dedup: Default::default(),
-            next_id: AllocId(NonZero::new(1).unwrap()),
+            next_id: AtomicU64::new(1),
         }
     }
 
-    fn reserve(&mut self) -> AllocId {
-        let next = self.next_id;
-        self.next_id.0 = self.next_id.0.checked_add(1).expect(
-            "You overflowed a u64 by incrementing by 1... \
-             You've just earned yourself a free drink if we ever meet. \
-             Seriously, how did you do that?!",
-        );
-        next
+    fn reserve(&self) -> AllocId {
+        // Technically there is a window here where we overflow and then another thread
+        // increments `next_id` *again* and uses it before we panic and tear down the entire session.
+        // We consider this fine since such overflows cannot realistically occur.
+        let next_id = self.next_id.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+        AllocId(NonZero::new(next_id).unwrap())
     }
 }
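
The reserve rewrite above drops the mutex entirely: fetch_add returns the value *before* the increment, so every caller observes a distinct ID without taking any lock, and Relaxed ordering suffices because the ID is only a unique token, not a point of synchronization with other memory. A self-contained sketch of the same pattern, with a hypothetical IdAllocator standing in for AllocMap's counter:

use std::num::NonZero;
use std::sync::atomic::{AtomicU64, Ordering};

struct IdAllocator {
    next_id: AtomicU64,
}

impl IdAllocator {
    fn new() -> Self {
        // Start at 1 so every handed-out ID fits in NonZero<u64>.
        IdAllocator { next_id: AtomicU64::new(1) }
    }

    // `fetch_add` returns the previous value, so concurrent callers each
    // get a distinct ID lock-free. Relaxed is enough: nothing else is
    // published through this counter, only uniqueness matters.
    fn reserve(&self) -> NonZero<u64> {
        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
        NonZero::new(id).unwrap()
    }
}

fn main() {
    let ids = IdAllocator::new();
    assert_eq!(ids.reserve().get(), 1);
    assert_eq!(ids.reserve().get(), 2);
}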
@@ -428,26 +433,34 @@ impl<'tcx> TyCtxt<'tcx> {
     /// Make sure to call `set_alloc_id_memory` or `set_alloc_id_same_memory` before returning such
     /// an `AllocId` from a query.
     pub fn reserve_alloc_id(self) -> AllocId {
-        self.alloc_map.lock().reserve()
+        self.alloc_map.reserve()
     }
 
     /// Reserves a new ID *if* this allocation has not been dedup-reserved before.
     /// Should not be used for mutable memory.
     fn reserve_and_set_dedup(self, alloc: GlobalAlloc<'tcx>, salt: usize) -> AllocId {
-        let mut alloc_map = self.alloc_map.lock();
         if let GlobalAlloc::Memory(mem) = alloc {
             if mem.inner().mutability.is_mut() {
                 bug!("trying to dedup-reserve mutable memory");
             }
         }
         let alloc_salt = (alloc, salt);
-        if let Some(&alloc_id) = alloc_map.dedup.get(&alloc_salt) {
+        // Locking this *before* `to_alloc` also to ensure correct lock order.
+        let mut dedup = self.alloc_map.dedup.lock();
+        if let Some(&alloc_id) = dedup.get(&alloc_salt) {
             return alloc_id;
         }
-        let id = alloc_map.reserve();
+        let id = self.alloc_map.reserve();
         debug!("creating alloc {:?} with id {id:?}", alloc_salt.0);
-        alloc_map.alloc_map.insert(id, alloc_salt.0.clone());
-        alloc_map.dedup.insert(alloc_salt, id);
+        let had_previous = self
+            .alloc_map
+            .to_alloc
+            .lock_shard_by_value(&id)
+            .insert(id, alloc_salt.0.clone())
+            .is_some();
+        // We just reserved, so should always be unique.
+        assert!(!had_previous);
+        dedup.insert(alloc_salt, id);
         id
     }
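
The comments in this hunk fix a lock order: dedup is always taken before any to_alloc shard. A minimal std-only illustration of why one consistent order matters, with hypothetical stand-in types:

use std::collections::HashMap;
use std::sync::Mutex;

// Hypothetical stand-ins: one lock for a dedup table, one for a map shard.
struct Maps {
    dedup: Mutex<HashMap<String, u64>>,
    shard: Mutex<HashMap<u64, String>>,
}

impl Maps {
    // Every path that needs both locks takes `dedup` first, then `shard`.
    // If some other path took them in the opposite order, two threads could
    // each hold one lock while waiting for the other, and neither would
    // ever proceed (a deadlock cycle). A single global order rules that out.
    fn insert_deduped(&self, key: String, id: u64) {
        let mut dedup = self.dedup.lock().unwrap();
        let mut shard = self.shard.lock().unwrap();
        dedup.insert(key.clone(), id);
        shard.insert(id, key);
    }
}

fn main() {
    let maps = Maps { dedup: Mutex::new(HashMap::new()), shard: Mutex::new(HashMap::new()) };
    maps.insert_deduped("vtable".to_string(), 1);
}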
@@ -497,7 +510,7 @@ impl<'tcx> TyCtxt<'tcx> {
     /// local dangling pointers and allocations in constants/statics.
     #[inline]
     pub fn try_get_global_alloc(self, id: AllocId) -> Option<GlobalAlloc<'tcx>> {
-        self.alloc_map.lock().alloc_map.get(&id).cloned()
+        self.alloc_map.to_alloc.lock_shard_by_value(&id).get(&id).cloned()
     }
 
     #[inline]
@@ -516,7 +529,9 @@ impl<'tcx> TyCtxt<'tcx> {
     /// Freezes an `AllocId` created with `reserve` by pointing it at an `Allocation`. Trying to
     /// call this function twice, even with the same `Allocation` will ICE the compiler.
     pub fn set_alloc_id_memory(self, id: AllocId, mem: ConstAllocation<'tcx>) {
-        if let Some(old) = self.alloc_map.lock().alloc_map.insert(id, GlobalAlloc::Memory(mem)) {
+        if let Some(old) =
+            self.alloc_map.to_alloc.lock_shard_by_value(&id).insert(id, GlobalAlloc::Memory(mem))
+        {
             bug!("tried to set allocation ID {id:?}, but it was already existing as {old:#?}");
         }
     }
@@ -524,8 +539,11 @@ impl<'tcx> TyCtxt<'tcx> {
     /// Freezes an `AllocId` created with `reserve` by pointing it at a static item. Trying to
     /// call this function twice, even with the same `DefId` will ICE the compiler.
    pub fn set_nested_alloc_id_static(self, id: AllocId, def_id: LocalDefId) {
-        if let Some(old) =
-            self.alloc_map.lock().alloc_map.insert(id, GlobalAlloc::Static(def_id.to_def_id()))
+        if let Some(old) = self
+            .alloc_map
+            .to_alloc
+            .lock_shard_by_value(&id)
+            .insert(id, GlobalAlloc::Static(def_id.to_def_id()))
         {
             bug!("tried to set allocation ID {id:?}, but it was already existing as {old:#?}");
         }

compiler/rustc_middle/src/ty/context.rs

@@ -1366,7 +1366,7 @@ pub struct GlobalCtxt<'tcx> {
     pub data_layout: TargetDataLayout,
 
     /// Stores memory for globals (statics/consts).
-    pub(crate) alloc_map: Lock<interpret::AllocMap<'tcx>>,
+    pub(crate) alloc_map: interpret::AllocMap<'tcx>,
 
     current_gcx: CurrentGcx,
 }
@@ -1583,7 +1583,7 @@ impl<'tcx> TyCtxt<'tcx> {
             new_solver_evaluation_cache: Default::default(),
             canonical_param_env_cache: Default::default(),
             data_layout,
-            alloc_map: Lock::new(interpret::AllocMap::new()),
+            alloc_map: interpret::AllocMap::new(),
             current_gcx,
         });
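
Since AllocMap now synchronizes internally (a sharded map, a Lock around dedup, and an atomic counter), GlobalCtxt can store it as a plain field and callers reach it through &self with no outer .lock(). A tiny sketch of that refactor shape, with hypothetical names:

use std::sync::atomic::{AtomicU64, Ordering};

// Before (hypothetically): Context { alloc_map: Mutex<AllocMap> } funneled
// every caller through one global lock. After: the type synchronizes itself,
// so the field is a plain value and all methods take &self.
struct AllocMap {
    next_id: AtomicU64, // synchronization lives inside the type
}

impl AllocMap {
    fn reserve(&self) -> u64 {
        self.next_id.fetch_add(1, Ordering::Relaxed)
    }
}

struct Context {
    alloc_map: AllocMap, // no outer Mutex/Lock wrapper needed anymore
}

fn main() {
    let ctx = Context { alloc_map: AllocMap { next_id: AtomicU64::new(1) } };
    assert_eq!(ctx.alloc_map.reserve(), 1); // direct call, no .lock()
}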