Auto merge of #124780 - Mark-Simulacrum:lockless-cache, r=lcnr

Improve VecCache under parallel frontend This replaces the single Vec allocation with a series of progressively larger buckets. With the cfg for parallel enabled but with -Zthreads=1, this looks like a slight regression in i-count and cycle counts (~1%). With the parallel frontend at -Zthreads=4, this is an improvement (-5% wall-time from 5.788 to 5.4688 on libcore) than our current Lock-based approach, likely due to reducing the bouncing of the cache line holding the lock. At -Zthreads=32 it's a huge improvement (-46%: 8.829 -> 4.7319 seconds). try-job: i686-gnu-nopt try-job: dist-x86_64-linux
2024-11-19 02:07:48 +00:00 · 2024-11-19 02:07:48 +00:00 · 5926e82dd1
commit 5926e82dd1
parent b71fb5edc0 da58efb11d
6 changed files with 458 additions and 65 deletions
--- a/compiler/rustc_query_system/src/lib.rs
+++ b/compiler/rustc_query_system/src/lib.rs
@ -2,6 +2,7 @@
 #![allow(rustc::potential_query_instability, internal_features)]
 #![feature(assert_matches)]
 #![feature(core_intrinsics)]
+#![feature(dropck_eyepatch)]
 #![feature(hash_raw_entry)]
 #![feature(let_chains)]
 #![feature(min_specialization)]
--- a/compiler/rustc_query_system/src/query/caches.rs
+++ b/compiler/rustc_query_system/src/query/caches.rs
@ -3,9 +3,10 @@ use std::hash::Hash;

 use rustc_data_structures::fx::FxHashMap;
 use rustc_data_structures::sharded::{self, Sharded};
-use rustc_data_structures::sync::{Lock, OnceLock};
+use rustc_data_structures::sync::OnceLock;
+pub use rustc_data_structures::vec_cache::VecCache;
 use rustc_hir::def_id::LOCAL_CRATE;
-use rustc_index::{Idx, IndexVec};
+use rustc_index::Idx;
 use rustc_span::def_id::{DefId, DefIndex};

 use crate::dep_graph::DepNodeIndex;
@ -100,52 +101,10 @@ where
    }
 }

-pub struct VecCache<K: Idx, V> {
-    cache: Lock<IndexVec<K, Option<(V, DepNodeIndex)>>>,
-}
-
-impl<K: Idx, V> Default for VecCache<K, V> {
-    fn default() -> Self {
-        VecCache { cache: Default::default() }
-    }
-}
-
-impl<K, V> QueryCache for VecCache<K, V>
-where
-    K: Eq + Idx + Copy + Debug,
-    V: Copy,
-{
-    type Key = K;
-    type Value = V;
-
-    #[inline(always)]
-    fn lookup(&self, key: &K) -> Option<(V, DepNodeIndex)> {
-        let lock = self.cache.lock();
-        if let Some(Some(value)) = lock.get(*key) { Some(*value) } else { None }
-    }
-
-    #[inline]
-    fn complete(&self, key: K, value: V, index: DepNodeIndex) {
-        let mut lock = self.cache.lock();
-        lock.insert(key, (value, index));
-    }
-
-    fn iter(&self, f: &mut dyn FnMut(&Self::Key, &Self::Value, DepNodeIndex)) {
-        for (k, v) in self.cache.lock().iter_enumerated() {
-            if let Some(v) = v {
-                f(&k, &v.0, v.1);
-            }
-        }
-    }
-}
-
 pub struct DefIdCache<V> {
    /// Stores the local DefIds in a dense map. Local queries are much more often dense, so this is
    /// a win over hashing query keys at marginal memory cost (~5% at most) compared to FxHashMap.
-    ///
-    /// The second element of the tuple is the set of keys actually present in the IndexVec, used
-    /// for faster iteration in `iter()`.
-    local: Lock<(IndexVec<DefIndex, Option<(V, DepNodeIndex)>>, Vec<DefIndex>)>,
+    local: VecCache<DefIndex, V, DepNodeIndex>,
    foreign: DefaultCache<DefId, V>,
 }

@ -165,8 +124,7 @@ where
    #[inline(always)]
    fn lookup(&self, key: &DefId) -> Option<(V, DepNodeIndex)> {
        if key.krate == LOCAL_CRATE {
-            let cache = self.local.lock();
-            cache.0.get(key.index).and_then(|v| *v)
+            self.local.lookup(&key.index)
        } else {
            self.foreign.lookup(key)
        }
@ -175,27 +133,39 @@ where
    #[inline]
    fn complete(&self, key: DefId, value: V, index: DepNodeIndex) {
        if key.krate == LOCAL_CRATE {
-            let mut cache = self.local.lock();
-            let (cache, present) = &mut *cache;
-            let slot = cache.ensure_contains_elem(key.index, Default::default);
-            if slot.is_none() {
-                // FIXME: Only store the present set when running in incremental mode. `iter` is not
-                // used outside of saving caches to disk and self-profile.
-                present.push(key.index);
-            }
-            *slot = Some((value, index));
+            self.local.complete(key.index, value, index)
        } else {
            self.foreign.complete(key, value, index)
        }
    }

    fn iter(&self, f: &mut dyn FnMut(&Self::Key, &Self::Value, DepNodeIndex)) {
-        let guard = self.local.lock();
-        let (cache, present) = &*guard;
-        for &idx in present.iter() {
-            let value = cache[idx].unwrap();
-            f(&DefId { krate: LOCAL_CRATE, index: idx }, &value.0, value.1);
-        }
+        self.local.iter(&mut |key, value, index| {
+            f(&DefId { krate: LOCAL_CRATE, index: *key }, value, index);
+        });
        self.foreign.iter(f);
    }
 }
+
+impl<K, V> QueryCache for VecCache<K, V, DepNodeIndex>
+where
+    K: Idx + Eq + Hash + Copy + Debug,
+    V: Copy,
+{
+    type Key = K;
+    type Value = V;
+
+    #[inline(always)]
+    fn lookup(&self, key: &K) -> Option<(V, DepNodeIndex)> {
+        self.lookup(key)
+    }
+
+    #[inline]
+    fn complete(&self, key: K, value: V, index: DepNodeIndex) {
+        self.complete(key, value, index)
+    }
+
+    fn iter(&self, f: &mut dyn FnMut(&Self::Key, &Self::Value, DepNodeIndex)) {
+        self.iter(f)
+    }
+}