Auto merge of #67397 - michaelwoerister:query-keys-in-self-profiling, r=wesleywiser

self-profiling: Support recording query keys This PR makes self-profiling able to record query keys. The implementation is not as efficient as it could be yet (all query keys except for `DefId`s cause string data to be duplicated) and the rendered strings could be nicer too. But the implementation is functional and introduces the basic framework for emitting per-query-invocation event data. I tried to add proper documentation on how everything works. Let me know if more documentation is needed. r? @wesleywiser @Mark-Simulacrum, heads up: This updates `measureme` to 0.7.0 which means that `summarize` on perf.rlo needs to be update accordingly once this is merged.
2020-01-10 12:18:46 +00:00 · 2020-01-10 12:18:46 +00:00 · f795e8a216
commit f795e8a216
parent 2d8d559bbe ad65e3e6bc
11 changed files with 551 additions and 126 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -1995,9 +1995,9 @@ dependencies = [

 [[package]]
 name = "measureme"
-version = "0.5.0"
+version = "0.7.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c420bbc064623934620b5ab2dc0cf96451b34163329e82f95e7fa1b7b99a6ac8"
+checksum = "fef709d3257013bba7cff14fc504e07e80631d3fe0f6d38ce63b8f6510ccb932"
 dependencies = [
 "byteorder",
 "memmap",
@ -3079,6 +3079,7 @@ dependencies = [
 "graphviz",
 "jobserver",
 "log",
+ "measureme",
 "parking_lot",
 "polonius-engine",
 "rustc-rayon",
--- a/src/librustc/Cargo.toml
+++ b/src/librustc/Cargo.toml
@ -36,5 +36,6 @@ parking_lot = "0.9"
 byteorder = { version = "1.3" }
 chalk-engine = { version = "0.9.0", default-features=false }
 smallvec = { version = "1.0", features = ["union", "may_dangle"] }
+measureme = "0.7.1"
 rustc_error_codes = { path = "../librustc_error_codes" }
 rustc_session = { path = "../librustc_session" }
--- a/src/librustc/dep_graph/graph.rs
+++ b/src/librustc/dep_graph/graph.rs
@ -2,6 +2,7 @@ use crate::ty::{self, TyCtxt};
 use errors::Diagnostic;
 use parking_lot::{Condvar, Mutex};
 use rustc_data_structures::fx::{FxHashMap, FxHashSet};
+use rustc_data_structures::profiling::QueryInvocationId;
 use rustc_data_structures::sharded::{self, Sharded};
 use rustc_data_structures::stable_hasher::{HashStable, StableHasher};
 use rustc_data_structures::sync::{AtomicU32, AtomicU64, Lock, Lrc, Ordering};
@ -11,7 +12,7 @@ use std::collections::hash_map::Entry;
 use std::env;
 use std::hash::Hash;
 use std::mem;
-use std::sync::atomic::Ordering::SeqCst;
+use std::sync::atomic::Ordering::Relaxed;

 use crate::ich::{Fingerprint, StableHashingContext, StableHashingContextProvider};

@ -25,6 +26,12 @@ use super::serialized::{SerializedDepGraph, SerializedDepNodeIndex};
 #[derive(Clone)]
 pub struct DepGraph {
    data: Option<Lrc<DepGraphData>>,
+
+    /// This field is used for assigning DepNodeIndices when running in
+    /// non-incremental mode. Even in non-incremental mode we make sure that
+    /// each task has a `DepNodeIndex` that uniquely identifies it. This unique
+    /// ID is used for self-profiling.
+    virtual_dep_node_index: Lrc<AtomicU32>,
 }

 rustc_index::newtype_index! {
@ -35,6 +42,13 @@ impl DepNodeIndex {
    pub const INVALID: DepNodeIndex = DepNodeIndex::MAX;
 }

+impl std::convert::From<DepNodeIndex> for QueryInvocationId {
+    #[inline]
+    fn from(dep_node_index: DepNodeIndex) -> Self {
+        QueryInvocationId(dep_node_index.as_u32())
+    }
+}
+
 #[derive(PartialEq)]
 pub enum DepNodeColor {
    Red,
@ -105,11 +119,12 @@ impl DepGraph {
                previous: prev_graph,
                colors: DepNodeColorMap::new(prev_graph_node_count),
            })),
+            virtual_dep_node_index: Lrc::new(AtomicU32::new(0)),
        }
    }

    pub fn new_disabled() -> DepGraph {
-        DepGraph { data: None }
+        DepGraph { data: None, virtual_dep_node_index: Lrc::new(AtomicU32::new(0)) }
    }

    /// Returns `true` if we are actually building the full dep-graph, and `false` otherwise.
@ -322,7 +337,7 @@ impl DepGraph {

            (result, dep_node_index)
        } else {
-            (task(cx, arg), DepNodeIndex::INVALID)
+            (task(cx, arg), self.next_virtual_depnode_index())
        }
    }

@ -352,7 +367,7 @@ impl DepGraph {
            let dep_node_index = data.current.complete_anon_task(dep_kind, task_deps);
            (result, dep_node_index)
        } else {
-            (op(), DepNodeIndex::INVALID)
+            (op(), self.next_virtual_depnode_index())
        }
    }

@ -478,8 +493,8 @@ impl DepGraph {
            let current_dep_graph = &self.data.as_ref().unwrap().current;

            Some((
-                current_dep_graph.total_read_count.load(SeqCst),
-                current_dep_graph.total_duplicate_read_count.load(SeqCst),
+                current_dep_graph.total_read_count.load(Relaxed),
+                current_dep_graph.total_duplicate_read_count.load(Relaxed),
            ))
        } else {
            None
@ -877,6 +892,11 @@ impl DepGraph {
            }
        }
    }
+
+    fn next_virtual_depnode_index(&self) -> DepNodeIndex {
+        let index = self.virtual_dep_node_index.fetch_add(1, Relaxed);
+        DepNodeIndex::from_u32(index)
+    }
 }

 /// A "work product" is an intermediate result that we save into the
@ -1087,7 +1107,7 @@ impl DepGraphData {
            if let Some(task_deps) = icx.task_deps {
                let mut task_deps = task_deps.lock();
                if cfg!(debug_assertions) {
-                    self.current.total_read_count.fetch_add(1, SeqCst);
+                    self.current.total_read_count.fetch_add(1, Relaxed);
                }
                if task_deps.read_set.insert(source) {
                    task_deps.reads.push(source);
@ -1105,7 +1125,7 @@ impl DepGraphData {
                        }
                    }
                } else if cfg!(debug_assertions) {
-                    self.current.total_duplicate_read_count.fetch_add(1, SeqCst);
+                    self.current.total_duplicate_read_count.fetch_add(1, Relaxed);
                }
            }
        })
--- a/src/librustc/ty/query/config.rs
+++ b/src/librustc/ty/query/config.rs
@ -2,8 +2,7 @@ use crate::dep_graph::SerializedDepNodeIndex;
 use crate::dep_graph::{DepKind, DepNode};
 use crate::ty::query::plumbing::CycleError;
 use crate::ty::query::queries;
-use crate::ty::query::QueryCache;
-use crate::ty::query::{Query, QueryName};
+use crate::ty::query::{Query, QueryCache};
 use crate::ty::TyCtxt;
 use rustc_data_structures::profiling::ProfileCategory;
 use rustc_hir::def_id::{CrateNum, DefId};
@ -20,7 +19,7 @@ use std::hash::Hash;
 // FIXME(eddyb) false positive, the lifetime parameter is used for `Key`/`Value`.
 #[allow(unused_lifetimes)]
 pub trait QueryConfig<'tcx> {
-    const NAME: QueryName;
+    const NAME: &'static str;
    const CATEGORY: ProfileCategory;

    type Key: Eq + Hash + Clone + Debug;
--- a/src/librustc/ty/query/mod.rs
+++ b/src/librustc/ty/query/mod.rs
@ -81,6 +81,9 @@ pub(crate) use self::config::QueryDescription;
 mod on_disk_cache;
 pub use self::on_disk_cache::OnDiskCache;

+mod profiling_support;
+pub use self::profiling_support::{IntoSelfProfilingString, QueryKeyStringBuilder};
+
 // Each of these queries corresponds to a function pointer field in the
 // `Providers` struct for requesting a value of that type, and a method
 // on `tcx: TyCtxt` (and `tcx.at(span)`) for doing that request in a way
--- a/src/librustc/ty/query/plumbing.rs
+++ b/src/librustc/ty/query/plumbing.rs
@ -13,6 +13,8 @@ use errors::{struct_span_err, Diagnostic, DiagnosticBuilder, FatalError, Handler
 #[cfg(not(parallel_compiler))]
 use rustc_data_structures::cold_path;
 use rustc_data_structures::fx::{FxHashMap, FxHasher};
+#[cfg(parallel_compiler)]
+use rustc_data_structures::profiling::TimingGuard;
 use rustc_data_structures::sharded::Sharded;
 use rustc_data_structures::sync::{Lock, Lrc};
 use rustc_data_structures::thin_vec::ThinVec;
@ -82,6 +84,19 @@ impl<'a, 'tcx, Q: QueryDescription<'tcx>> JobOwner<'a, 'tcx, Q> {
    /// for some compile-time benchmarks.
    #[inline(always)]
    pub(super) fn try_get(tcx: TyCtxt<'tcx>, span: Span, key: &Q::Key) -> TryGetJob<'a, 'tcx, Q> {
+        // Handling the `query_blocked_prof_timer` is a bit weird because of the
+        // control flow in this function: Blocking is implemented by
+        // awaiting a running job and, once that is done, entering the loop below
+        // again from the top. In that second iteration we will hit the
+        // cache which provides us with the information we need for
+        // finishing the "query-blocked" event.
+        //
+        // We thus allocate `query_blocked_prof_timer` outside the loop,
+        // initialize it during the first iteration and finish it during the
+        // second iteration.
+        #[cfg(parallel_compiler)]
+        let mut query_blocked_prof_timer: Option<TimingGuard<'_>> = None;
+
        let cache = Q::query_cache(tcx);
        loop {
            // We compute the key's hash once and then use it for both the
@ -95,7 +110,17 @@ impl<'a, 'tcx, Q: QueryDescription<'tcx>> JobOwner<'a, 'tcx, Q> {
            if let Some((_, value)) =
                lock.results.raw_entry().from_key_hashed_nocheck(key_hash, key)
            {
-                tcx.prof.query_cache_hit(Q::NAME);
+                if unlikely!(tcx.prof.enabled()) {
+                    tcx.prof.query_cache_hit(value.index.into());
+
+                    #[cfg(parallel_compiler)]
+                    {
+                        if let Some(prof_timer) = query_blocked_prof_timer.take() {
+                            prof_timer.finish_with_query_invocation_id(value.index.into());
+                        }
+                    }
+                }
+
                let result = (value.value.clone(), value.index);
                #[cfg(debug_assertions)]
                {
@ -104,9 +129,6 @@ impl<'a, 'tcx, Q: QueryDescription<'tcx>> JobOwner<'a, 'tcx, Q> {
                return TryGetJob::JobCompleted(result);
            }

-            #[cfg(parallel_compiler)]
-            let query_blocked_prof_timer;
-
            let job = match lock.active.entry((*key).clone()) {
                Entry::Occupied(entry) => {
                    match *entry.get() {
@ -116,7 +138,7 @@ impl<'a, 'tcx, Q: QueryDescription<'tcx>> JobOwner<'a, 'tcx, Q> {
                            // self-profiler.
                            #[cfg(parallel_compiler)]
                            {
-                                query_blocked_prof_timer = tcx.prof.query_blocked(Q::NAME);
+                                query_blocked_prof_timer = Some(tcx.prof.query_blocked());
                            }

                            job.clone()
@ -153,11 +175,6 @@ impl<'a, 'tcx, Q: QueryDescription<'tcx>> JobOwner<'a, 'tcx, Q> {
            {
                let result = job.r#await(tcx, span);

-                // This `drop()` is not strictly necessary as the binding
-                // would go out of scope anyway. But it's good to have an
-                // explicit marker of how far the measurement goes.
-                drop(query_blocked_prof_timer);
-
                if let Err(cycle) = result {
                    return TryGetJob::Cycle(Q::handle_cycle_error(tcx, cycle));
                }
@ -347,7 +364,7 @@ impl<'tcx> TyCtxt<'tcx> {

    #[inline(never)]
    pub(super) fn get_query<Q: QueryDescription<'tcx>>(self, span: Span, key: Q::Key) -> Q::Value {
-        debug!("ty::query::get_query<{}>(key={:?}, span={:?})", Q::NAME.as_str(), key, span);
+        debug!("ty::query::get_query<{}>(key={:?}, span={:?})", Q::NAME, key, span);

        let job = match JobOwner::try_get(self, span, &key) {
            TryGetJob::NotYetStarted(job) => job,
@ -366,7 +383,7 @@ impl<'tcx> TyCtxt<'tcx> {
        }

        if Q::ANON {
-            let prof_timer = self.prof.query_provider(Q::NAME);
+            let prof_timer = self.prof.query_provider();

            let ((result, dep_node_index), diagnostics) = with_diagnostics(|diagnostics| {
                self.start_query(job.job.clone(), diagnostics, |tcx| {
@ -374,7 +391,7 @@ impl<'tcx> TyCtxt<'tcx> {
                })
            });

-            drop(prof_timer);
+            prof_timer.finish_with_query_invocation_id(dep_node_index.into());

            self.dep_graph.read_index(dep_node_index);

@ -436,8 +453,9 @@ impl<'tcx> TyCtxt<'tcx> {
        let result = if Q::cache_on_disk(self, key.clone(), None)
            && self.sess.opts.debugging_opts.incremental_queries
        {
-            let _prof_timer = self.prof.incr_cache_loading(Q::NAME);
+            let prof_timer = self.prof.incr_cache_loading();
            let result = Q::try_load_from_disk(self, prev_dep_node_index);
+            prof_timer.finish_with_query_invocation_id(dep_node_index.into());

            // We always expect to find a cached result for things that
            // can be forced from `DepNode`.
@ -457,11 +475,13 @@ impl<'tcx> TyCtxt<'tcx> {
        } else {
            // We could not load a result from the on-disk cache, so
            // recompute.
-            let _prof_timer = self.prof.query_provider(Q::NAME);
+            let prof_timer = self.prof.query_provider();

            // The dep-graph for this computation is already in-place.
            let result = self.dep_graph.with_ignore(|| Q::compute(self, key));

+            prof_timer.finish_with_query_invocation_id(dep_node_index.into());
+
            result
        };

@ -523,7 +543,7 @@ impl<'tcx> TyCtxt<'tcx> {
            dep_node
        );

-        let prof_timer = self.prof.query_provider(Q::NAME);
+        let prof_timer = self.prof.query_provider();

        let ((result, dep_node_index), diagnostics) = with_diagnostics(|diagnostics| {
            self.start_query(job.job.clone(), diagnostics, |tcx| {
@ -541,7 +561,7 @@ impl<'tcx> TyCtxt<'tcx> {
            })
        });

-        drop(prof_timer);
+        prof_timer.finish_with_query_invocation_id(dep_node_index.into());

        if unlikely!(!diagnostics.is_empty()) {
            if dep_node.kind != crate::dep_graph::DepKind::Null {
@ -572,17 +592,19 @@ impl<'tcx> TyCtxt<'tcx> {

        let dep_node = Q::to_dep_node(self, &key);

-        if self.dep_graph.try_mark_green_and_read(self, &dep_node).is_none() {
-            // A None return from `try_mark_green_and_read` means that this is either
-            // a new dep node or that the dep node has already been marked red.
-            // Either way, we can't call `dep_graph.read()` as we don't have the
-            // DepNodeIndex. We must invoke the query itself. The performance cost
-            // this introduces should be negligible as we'll immediately hit the
-            // in-memory cache, or another query down the line will.
-
-            let _ = self.get_query::<Q>(DUMMY_SP, key);
-        } else {
-            self.prof.query_cache_hit(Q::NAME);
+        match self.dep_graph.try_mark_green_and_read(self, &dep_node) {
+            None => {
+                // A None return from `try_mark_green_and_read` means that this is either
+                // a new dep node or that the dep node has already been marked red.
+                // Either way, we can't call `dep_graph.read()` as we don't have the
+                // DepNodeIndex. We must invoke the query itself. The performance cost
+                // this introduces should be negligible as we'll immediately hit the
+                // in-memory cache, or another query down the line will.
+                let _ = self.get_query::<Q>(DUMMY_SP, key);
+            }
+            Some((_, dep_node_index)) => {
+                self.prof.query_cache_hit(dep_node_index.into());
+            }
        }
    }

@ -813,36 +835,6 @@ macro_rules! define_queries_inner {
            }
        }

-        #[allow(nonstandard_style)]
-        #[derive(Clone, Copy)]
-        pub enum QueryName {
-            $($name),*
-        }
-
-        impl rustc_data_structures::profiling::QueryName for QueryName {
-            fn discriminant(self) -> std::mem::Discriminant<QueryName> {
-                std::mem::discriminant(&self)
-            }
-
-            fn as_str(self) -> &'static str {
-                QueryName::as_str(&self)
-            }
-        }
-
-        impl QueryName {
-            pub fn register_with_profiler(
-                profiler: &rustc_data_structures::profiling::SelfProfiler,
-            ) {
-                $(profiler.register_query_name(QueryName::$name);)*
-            }
-
-            pub fn as_str(&self) -> &'static str {
-                match self {
-                    $(QueryName::$name => stringify!($name),)*
-                }
-            }
-        }
-
        #[allow(nonstandard_style)]
        #[derive(Clone, Debug)]
        pub enum Query<$tcx> {
@ -883,12 +875,6 @@ macro_rules! define_queries_inner {
                    $(Query::$name(key) => key.default_span(tcx),)*
                }
            }
-
-            pub fn query_name(&self) -> QueryName {
-                match self {
-                    $(Query::$name(_) => QueryName::$name,)*
-                }
-            }
        }

        impl<'a, $tcx> HashStable<StableHashingContext<'a>> for Query<$tcx> {
@ -923,7 +909,7 @@ macro_rules! define_queries_inner {
            type Key = $K;
            type Value = $V;

-            const NAME: QueryName = QueryName::$name;
+            const NAME: &'static str = stringify!($name);
            const CATEGORY: ProfileCategory = $category;
        }

@ -1035,6 +1021,35 @@ macro_rules! define_queries_inner {
            pub fn $name(self, key: $K) -> $V {
                self.at(DUMMY_SP).$name(key)
            })*
+
+            /// All self-profiling events generated by the query engine use
+            /// virtual `StringId`s for their `event_id`. This method makes all
+            /// those virtual `StringId`s point to actual strings.
+            ///
+            /// If we are recording only summary data, the ids will point to
+            /// just the query names. If we are recording query keys too, we
+            /// allocate the corresponding strings here.
+            pub fn alloc_self_profile_query_strings(self) {
+                use crate::ty::query::profiling_support::{
+                    alloc_self_profile_query_strings_for_query_cache,
+                    QueryKeyStringCache,
+                };
+
+                if !self.prof.enabled() {
+                    return;
+                }
+
+                let mut string_cache = QueryKeyStringCache::new();
+
+                $({
+                    alloc_self_profile_query_strings_for_query_cache(
+                        self,
+                        stringify!($name),
+                        &self.queries.$name,
+                        &mut string_cache,
+                    );
+                })*
+            }
        }

        impl TyCtxtAt<$tcx> {
--- a/src/librustc/ty/query/profiling_support.rs
+++ b/src/librustc/ty/query/profiling_support.rs
@ -0,0 +1,235 @@
+use crate::hir::map::definitions::DefPathData;
+use crate::ty::context::TyCtxt;
+use crate::ty::query::config::QueryConfig;
+use crate::ty::query::plumbing::QueryCache;
+use measureme::{StringComponent, StringId};
+use rustc_data_structures::fx::FxHashMap;
+use rustc_data_structures::profiling::SelfProfiler;
+use rustc_data_structures::sharded::Sharded;
+use rustc_hir::def_id::{CrateNum, DefId, DefIndex, CRATE_DEF_INDEX, LOCAL_CRATE};
+use std::fmt::Debug;
+use std::io::Write;
+
+pub struct QueryKeyStringCache {
+    def_id_cache: FxHashMap<DefId, StringId>,
+}
+
+impl QueryKeyStringCache {
+    pub fn new() -> QueryKeyStringCache {
+        QueryKeyStringCache { def_id_cache: Default::default() }
+    }
+}
+
+pub struct QueryKeyStringBuilder<'p, 'c, 'tcx> {
+    profiler: &'p SelfProfiler,
+    tcx: TyCtxt<'tcx>,
+    string_cache: &'c mut QueryKeyStringCache,
+}
+
+impl<'p, 'c, 'tcx> QueryKeyStringBuilder<'p, 'c, 'tcx> {
+    pub fn new(
+        profiler: &'p SelfProfiler,
+        tcx: TyCtxt<'tcx>,
+        string_cache: &'c mut QueryKeyStringCache,
+    ) -> QueryKeyStringBuilder<'p, 'c, 'tcx> {
+        QueryKeyStringBuilder { profiler, tcx, string_cache }
+    }
+
+    // The current implementation is rather crude. In the future it might be a
+    // good idea to base this on `ty::print` in order to get nicer and more
+    // efficient query keys.
+    fn def_id_to_string_id(&mut self, def_id: DefId) -> StringId {
+        if let Some(&string_id) = self.string_cache.def_id_cache.get(&def_id) {
+            return string_id;
+        }
+
+        let def_key = self.tcx.def_key(def_id);
+
+        let (parent_string_id, start_index) = match def_key.parent {
+            Some(parent_index) => {
+                let parent_def_id = DefId { index: parent_index, krate: def_id.krate };
+
+                (self.def_id_to_string_id(parent_def_id), 0)
+            }
+            None => (StringId::INVALID, 2),
+        };
+
+        let dis_buffer = &mut [0u8; 16];
+        let name;
+        let dis;
+        let end_index;
+
+        match def_key.disambiguated_data.data {
+            DefPathData::CrateRoot => {
+                name = self.tcx.original_crate_name(def_id.krate).as_str();
+                dis = "";
+                end_index = 3;
+            }
+            other => {
+                name = other.as_symbol().as_str();
+                if def_key.disambiguated_data.disambiguator == 0 {
+                    dis = "";
+                    end_index = 3;
+                } else {
+                    write!(&mut dis_buffer[..], "[{}]", def_key.disambiguated_data.disambiguator)
+                        .unwrap();
+                    let end_of_dis = dis_buffer.iter().position(|&c| c == b']').unwrap();
+                    dis = std::str::from_utf8(&dis_buffer[..end_of_dis + 1]).unwrap();
+                    end_index = 4;
+                }
+            }
+        }
+
+        let components = [
+            StringComponent::Ref(parent_string_id),
+            StringComponent::Value("::"),
+            StringComponent::Value(&name[..]),
+            StringComponent::Value(dis),
+        ];
+
+        let string_id = self.profiler.alloc_string(&components[start_index..end_index]);
+
+        self.string_cache.def_id_cache.insert(def_id, string_id);
+
+        string_id
+    }
+}
+
+pub trait IntoSelfProfilingString {
+    fn to_self_profile_string(&self, builder: &mut QueryKeyStringBuilder<'_, '_, '_>) -> StringId;
+}
+
+// The default implementation of `IntoSelfProfilingString` just uses `Debug`
+// which is slow and causes lots of duplication of string data.
+// The specialized impls below take care of making the `DefId` case more
+// efficient.
+impl<T: Debug> IntoSelfProfilingString for T {
+    default fn to_self_profile_string(
+        &self,
+        builder: &mut QueryKeyStringBuilder<'_, '_, '_>,
+    ) -> StringId {
+        let s = format!("{:?}", self);
+        builder.profiler.alloc_string(&s[..])
+    }
+}
+
+impl IntoSelfProfilingString for DefId {
+    fn to_self_profile_string(&self, builder: &mut QueryKeyStringBuilder<'_, '_, '_>) -> StringId {
+        builder.def_id_to_string_id(*self)
+    }
+}
+
+impl IntoSelfProfilingString for CrateNum {
+    fn to_self_profile_string(&self, builder: &mut QueryKeyStringBuilder<'_, '_, '_>) -> StringId {
+        builder.def_id_to_string_id(DefId { krate: *self, index: CRATE_DEF_INDEX })
+    }
+}
+
+impl IntoSelfProfilingString for DefIndex {
+    fn to_self_profile_string(&self, builder: &mut QueryKeyStringBuilder<'_, '_, '_>) -> StringId {
+        builder.def_id_to_string_id(DefId { krate: LOCAL_CRATE, index: *self })
+    }
+}
+
+impl<T0, T1> IntoSelfProfilingString for (T0, T1)
+where
+    T0: IntoSelfProfilingString + Debug,
+    T1: IntoSelfProfilingString + Debug,
+{
+    default fn to_self_profile_string(
+        &self,
+        builder: &mut QueryKeyStringBuilder<'_, '_, '_>,
+    ) -> StringId {
+        let val0 = self.0.to_self_profile_string(builder);
+        let val1 = self.1.to_self_profile_string(builder);
+
+        let components = &[
+            StringComponent::Value("("),
+            StringComponent::Ref(val0),
+            StringComponent::Value(","),
+            StringComponent::Ref(val1),
+            StringComponent::Value(")"),
+        ];
+
+        builder.profiler.alloc_string(components)
+    }
+}
+
+/// Allocate the self-profiling query strings for a single query cache. This
+/// method is called from `alloc_self_profile_query_strings` which knows all
+/// the queries via macro magic.
+pub(super) fn alloc_self_profile_query_strings_for_query_cache<'tcx, Q>(
+    tcx: TyCtxt<'tcx>,
+    query_name: &'static str,
+    query_cache: &Sharded<QueryCache<'tcx, Q>>,
+    string_cache: &mut QueryKeyStringCache,
+) where
+    Q: QueryConfig<'tcx>,
+{
+    tcx.prof.with_profiler(|profiler| {
+        let event_id_builder = profiler.event_id_builder();
+
+        // Walk the entire query cache and allocate the appropriate
+        // string representations. Each cache entry is uniquely
+        // identified by its dep_node_index.
+        if profiler.query_key_recording_enabled() {
+            let mut query_string_builder = QueryKeyStringBuilder::new(profiler, tcx, string_cache);
+
+            let query_name = profiler.get_or_alloc_cached_string(query_name);
+
+            // Since building the string representation of query keys might
+            // need to invoke queries itself, we cannot keep the query caches
+            // locked while doing so. Instead we copy out the
+            // `(query_key, dep_node_index)` pairs and release the lock again.
+            let query_keys_and_indices = {
+                let shards = query_cache.lock_shards();
+                let len = shards.iter().map(|shard| shard.results.len()).sum();
+
+                let mut query_keys_and_indices = Vec::with_capacity(len);
+
+                for shard in &shards {
+                    query_keys_and_indices.extend(
+                        shard.results.iter().map(|(q_key, q_val)| (q_key.clone(), q_val.index)),
+                    );
+                }
+
+                query_keys_and_indices
+            };
+
+            // Now actually allocate the strings. If allocating the strings
+            // generates new entries in the query cache, we'll miss them but
+            // we don't actually care.
+            for (query_key, dep_node_index) in query_keys_and_indices {
+                // Translate the DepNodeIndex into a QueryInvocationId
+                let query_invocation_id = dep_node_index.into();
+
+                // Create the string version of the query-key
+                let query_key = query_key.to_self_profile_string(&mut query_string_builder);
+                let event_id = event_id_builder.from_label_and_arg(query_name, query_key);
+
+                // Doing this in bulk might be a good idea:
+                profiler.map_query_invocation_id_to_string(
+                    query_invocation_id,
+                    event_id.to_string_id(),
+                );
+            }
+        } else {
+            // In this branch we don't allocate query keys
+            let query_name = profiler.get_or_alloc_cached_string(query_name);
+            let event_id = event_id_builder.from_label(query_name).to_string_id();
+
+            let shards = query_cache.lock_shards();
+
+            for shard in shards.iter() {
+                let query_invocation_ids = shard
+                    .results
+                    .values()
+                    .map(|v| v.index)
+                    .map(|dep_node_index| dep_node_index.into());
+
+                profiler
+                    .bulk_map_query_invocation_id_to_single_string(query_invocation_ids, event_id);
+            }
+        }
+    });
+}
--- a/src/librustc_codegen_ssa/base.rs
+++ b/src/librustc_codegen_ssa/base.rs
@ -85,7 +85,7 @@ pub fn bin_op_to_icmp_predicate(op: hir::BinOpKind, signed: bool) -> IntPredicat
        }
        op => bug!(
            "comparison_op_to_icmp_predicate: expected comparison operator, \
-                  found {:?}",
+             found {:?}",
            op
        ),
    }
@ -102,7 +102,7 @@ pub fn bin_op_to_fcmp_predicate(op: hir::BinOpKind) -> RealPredicate {
        op => {
            bug!(
                "comparison_op_to_fcmp_predicate: expected comparison operator, \
-                  found {:?}",
+                 found {:?}",
                op
            );
        }
@ -519,7 +519,7 @@ pub fn codegen_crate<B: ExtraBackendMethods>(

        ongoing_codegen.codegen_finished(tcx);

-        assert_and_save_dep_graph(tcx);
+        finalize_tcx(tcx);

        ongoing_codegen.check_for_errors(tcx.sess);

@ -660,7 +660,8 @@ pub fn codegen_crate<B: ExtraBackendMethods>(

    ongoing_codegen.check_for_errors(tcx.sess);

-    assert_and_save_dep_graph(tcx);
+    finalize_tcx(tcx);
+
    ongoing_codegen.into_inner()
 }

@ -711,10 +712,16 @@ impl<B: ExtraBackendMethods> Drop for AbortCodegenOnDrop<B> {
    }
 }

-fn assert_and_save_dep_graph(tcx: TyCtxt<'_>) {
+fn finalize_tcx(tcx: TyCtxt<'_>) {
    tcx.sess.time("assert_dep_graph", || ::rustc_incremental::assert_dep_graph(tcx));
-
    tcx.sess.time("serialize_dep_graph", || ::rustc_incremental::save_dep_graph(tcx));
+
+    // We assume that no queries are run past here. If there are new queries
+    // after this point, they'll show up as "<unknown>" in self-profiling data.
+    {
+        let _prof_timer = tcx.prof.generic_activity("self_profile_alloc_query_strings");
+        tcx.alloc_self_profile_query_strings();
+    }
 }

 impl CrateInfo {
--- a/src/librustc_data_structures/Cargo.toml
+++ b/src/librustc_data_structures/Cargo.toml
@ -26,7 +26,7 @@ rustc-hash = "1.0.1"
 smallvec = { version = "1.0", features = ["union", "may_dangle"] }
 rustc_index = { path = "../librustc_index", package = "rustc_index" }
 bitflags = "1.2.1"
-measureme = "0.5"
+measureme = "0.7.1"

 [dependencies.parking_lot]
 version = "0.9"
--- a/src/librustc_data_structures/profiling.rs
+++ b/src/librustc_data_structures/profiling.rs
@ -1,6 +1,90 @@
+//! # Rust Compiler Self-Profiling
+//!
+//! This module implements the basic framework for the compiler's self-
+//! profiling support. It provides the `SelfProfiler` type which enables
+//! recording "events". An event is something that starts and ends at a given
+//! point in time and has an ID and a kind attached to it. This allows for
+//! tracing the compiler's activity.
+//!
+//! Internally this module uses the custom tailored [measureme][mm] crate for
+//! efficiently recording events to disk in a compact format that can be
+//! post-processed and analyzed by the suite of tools in the `measureme`
+//! project. The highest priority for the tracing framework is on incurring as
+//! little overhead as possible.
+//!
+//!
+//! ## Event Overview
+//!
+//! Events have a few properties:
+//!
+//! - The `event_kind` designates the broad category of an event (e.g. does it
+//!   correspond to the execution of a query provider or to loading something
+//!   from the incr. comp. on-disk cache, etc).
+//! - The `event_id` designates the query invocation or function call it
+//!   corresponds to, possibly including the query key or function arguments.
+//! - Each event stores the ID of the thread it was recorded on.
+//! - The timestamp stores beginning and end of the event, or the single point
+//!   in time it occurred at for "instant" events.
+//!
+//!
+//! ## Event Filtering
+//!
+//! Event generation can be filtered by event kind. Recording all possible
+//! events generates a lot of data, much of which is not needed for most kinds
+//! of analysis. So, in order to keep overhead as low as possible for a given
+//! use case, the `SelfProfiler` will only record the kinds of events that
+//! pass the filter specified as a command line argument to the compiler.
+//!
+//!
+//! ## `event_id` Assignment
+//!
+//! As far as `measureme` is concerned, `event_id`s are just strings. However,
+//! it would incur too much overhead to generate and persist each `event_id`
+//! string at the point where the event is recorded. In order to make this more
+//! efficient `measureme` has two features:
+//!
+//! - Strings can share their content, so that re-occurring parts don't have to
+//!   be copied over and over again. One allocates a string in `measureme` and
+//!   gets back a `StringId`. This `StringId` is then used to refer to that
+//!   string. `measureme` strings are actually DAGs of string components so that
+//!   arbitrary sharing of substrings can be done efficiently. This is useful
+//!   because `event_id`s contain lots of redundant text like query names or
+//!   def-path components.
+//!
+//! - `StringId`s can be "virtual" which means that the client picks a numeric
+//!   ID according to some application-specific scheme and can later make that
+//!   ID be mapped to an actual string. This is used to cheaply generate
+//!   `event_id`s while the events actually occur, causing little timing
+//!   distortion, and then later map those `StringId`s, in bulk, to actual
+//!   `event_id` strings. This way the largest part of the tracing overhead is
+//!   localized to one contiguous chunk of time.
+//!
+//! How are these `event_id`s generated in the compiler? For things that occur
+//! infrequently (e.g. "generic activities"), we just allocate the string the
+//! first time it is used and then keep the `StringId` in a hash table. This
+//! is implemented in `SelfProfiler::get_or_alloc_cached_string()`.
+//!
+//! For queries it gets more interesting: First we need a unique numeric ID for
+//! each query invocation (the `QueryInvocationId`). This ID is used as the
+//! virtual `StringId` we use as `event_id` for a given event. This ID has to
+//! be available both when the query is executed and later, together with the
+//! query key, when we allocate the actual `event_id` strings in bulk.
+//!
+//! We could make the compiler generate and keep track of such an ID for each
+//! query invocation but luckily we already have something that fits all the
+//! the requirements: the query's `DepNodeIndex`. So we use the numeric value
+//! of the `DepNodeIndex` as `event_id` when recording the event and then,
+//! just before the query context is dropped, we walk the entire query cache
+//! (which stores the `DepNodeIndex` along with the query key for each
+//! invocation) and allocate the corresponding strings together with a mapping
+//! for `DepNodeIndex as StringId`.
+//!
+//! [mm]: https://github.com/rust-lang/measureme/
+
+use crate::fx::FxHashMap;
+
 use std::error::Error;
 use std::fs;
-use std::mem::{self, Discriminant};
 use std::path::Path;
 use std::process;
 use std::sync::Arc;
@ -8,7 +92,8 @@ use std::thread::ThreadId;
 use std::time::{Duration, Instant};
 use std::u32;

-use measureme::StringId;
+use measureme::{EventId, EventIdBuilder, SerializableString, StringId};
+use parking_lot::RwLock;

 /// MmapSerializatioSink is faster on macOS and Linux
 /// but FileSerializationSink is faster on Windows
@ -19,11 +104,6 @@ type SerializationSink = measureme::FileSerializationSink;

 type Profiler = measureme::Profiler<SerializationSink>;

-pub trait QueryName: Sized + Copy {
-    fn discriminant(self) -> Discriminant<Self>;
-    fn as_str(self) -> &'static str;
-}
-
 #[derive(Clone, Copy, Debug, PartialEq, Eq, Ord, PartialOrd)]
 pub enum ProfileCategory {
    Parsing,
@ -43,6 +123,8 @@ bitflags::bitflags! {
        const QUERY_BLOCKED      = 1 << 3;
        const INCR_CACHE_LOADS   = 1 << 4;

+        const QUERY_KEYS         = 1 << 5;
+
        const DEFAULT = Self::GENERIC_ACTIVITIES.bits |
                        Self::QUERY_PROVIDERS.bits |
                        Self::QUERY_BLOCKED.bits |
@ -62,12 +144,16 @@ const EVENT_FILTERS_BY_NAME: &[(&str, EventFilter)] = &[
    ("query-cache-hit", EventFilter::QUERY_CACHE_HITS),
    ("query-blocked", EventFilter::QUERY_BLOCKED),
    ("incr-cache-load", EventFilter::INCR_CACHE_LOADS),
+    ("query-keys", EventFilter::QUERY_KEYS),
 ];

 fn thread_id_to_u32(tid: ThreadId) -> u32 {
-    unsafe { mem::transmute::<ThreadId, u64>(tid) as u32 }
+    unsafe { std::mem::transmute::<ThreadId, u64>(tid) as u32 }
 }

+/// Something that uniquely identifies a query invocation.
+pub struct QueryInvocationId(pub u32);
+
 /// A reference to the SelfProfiler. It can be cloned and sent across thread
 /// boundaries at will.
 #[derive(Clone)]
@ -138,7 +224,10 @@ impl SelfProfilerRef {
    /// a measureme event, "verbose" generic activities also print a timing entry to
    /// stdout if the compiler is invoked with -Ztime or -Ztime-passes.
    #[inline(always)]
-    pub fn verbose_generic_activity<'a>(&'a self, event_id: &'a str) -> VerboseTimingGuard<'a> {
+    pub fn verbose_generic_activity<'a>(
+        &'a self,
+        event_id: &'static str,
+    ) -> VerboseTimingGuard<'a> {
        VerboseTimingGuard::start(
            event_id,
            self.print_verbose_generic_activities,
@ -167,9 +256,10 @@ impl SelfProfilerRef {
    /// Start profiling a generic activity. Profiling continues until the
    /// TimingGuard returned from this call is dropped.
    #[inline(always)]
-    pub fn generic_activity(&self, event_id: &str) -> TimingGuard<'_> {
+    pub fn generic_activity(&self, event_id: &'static str) -> TimingGuard<'_> {
        self.exec(EventFilter::GENERIC_ACTIVITIES, |profiler| {
-            let event_id = profiler.profiler.alloc_string(event_id);
+            let event_id = profiler.get_or_alloc_cached_string(event_id);
+            let event_id = EventId::from_label(event_id);
            TimingGuard::start(profiler, profiler.generic_activity_event_kind, event_id)
        })
    }
@ -177,19 +267,18 @@ impl SelfProfilerRef {
    /// Start profiling a query provider. Profiling continues until the
    /// TimingGuard returned from this call is dropped.
    #[inline(always)]
-    pub fn query_provider(&self, query_name: impl QueryName) -> TimingGuard<'_> {
+    pub fn query_provider(&self) -> TimingGuard<'_> {
        self.exec(EventFilter::QUERY_PROVIDERS, |profiler| {
-            let event_id = SelfProfiler::get_query_name_string_id(query_name);
-            TimingGuard::start(profiler, profiler.query_event_kind, event_id)
+            TimingGuard::start(profiler, profiler.query_event_kind, EventId::INVALID)
        })
    }

    /// Record a query in-memory cache hit.
    #[inline(always)]
-    pub fn query_cache_hit(&self, query_name: impl QueryName) {
+    pub fn query_cache_hit(&self, query_invocation_id: QueryInvocationId) {
        self.instant_query_event(
            |profiler| profiler.query_cache_hit_event_kind,
-            query_name,
+            query_invocation_id,
            EventFilter::QUERY_CACHE_HITS,
        );
    }
@ -198,10 +287,9 @@ impl SelfProfilerRef {
    /// Profiling continues until the TimingGuard returned from this call is
    /// dropped.
    #[inline(always)]
-    pub fn query_blocked(&self, query_name: impl QueryName) -> TimingGuard<'_> {
+    pub fn query_blocked(&self) -> TimingGuard<'_> {
        self.exec(EventFilter::QUERY_BLOCKED, |profiler| {
-            let event_id = SelfProfiler::get_query_name_string_id(query_name);
-            TimingGuard::start(profiler, profiler.query_blocked_event_kind, event_id)
+            TimingGuard::start(profiler, profiler.query_blocked_event_kind, EventId::INVALID)
        })
    }

@ -209,10 +297,13 @@ impl SelfProfilerRef {
    /// incremental compilation on-disk cache. Profiling continues until the
    /// TimingGuard returned from this call is dropped.
    #[inline(always)]
-    pub fn incr_cache_loading(&self, query_name: impl QueryName) -> TimingGuard<'_> {
+    pub fn incr_cache_loading(&self) -> TimingGuard<'_> {
        self.exec(EventFilter::INCR_CACHE_LOADS, |profiler| {
-            let event_id = SelfProfiler::get_query_name_string_id(query_name);
-            TimingGuard::start(profiler, profiler.incremental_load_result_event_kind, event_id)
+            TimingGuard::start(
+                profiler,
+                profiler.incremental_load_result_event_kind,
+                EventId::INVALID,
+            )
        })
    }

@ -220,29 +311,41 @@ impl SelfProfilerRef {
    fn instant_query_event(
        &self,
        event_kind: fn(&SelfProfiler) -> StringId,
-        query_name: impl QueryName,
+        query_invocation_id: QueryInvocationId,
        event_filter: EventFilter,
    ) {
        drop(self.exec(event_filter, |profiler| {
-            let event_id = SelfProfiler::get_query_name_string_id(query_name);
+            let event_id = StringId::new_virtual(query_invocation_id.0);
            let thread_id = thread_id_to_u32(std::thread::current().id());

-            profiler.profiler.record_instant_event(event_kind(profiler), event_id, thread_id);
+            profiler.profiler.record_instant_event(
+                event_kind(profiler),
+                EventId::from_virtual(event_id),
+                thread_id,
+            );

            TimingGuard::none()
        }));
    }

-    pub fn register_queries(&self, f: impl FnOnce(&SelfProfiler)) {
+    pub fn with_profiler(&self, f: impl FnOnce(&SelfProfiler)) {
        if let Some(profiler) = &self.profiler {
            f(&profiler)
        }
    }
+
+    #[inline]
+    pub fn enabled(&self) -> bool {
+        self.profiler.is_some()
+    }
 }

 pub struct SelfProfiler {
    profiler: Profiler,
    event_filter_mask: EventFilter,
+
+    string_cache: RwLock<FxHashMap<&'static str, StringId>>,
+
    query_event_kind: StringId,
    generic_activity_event_kind: StringId,
    incremental_load_result_event_kind: StringId,
@ -305,6 +408,7 @@ impl SelfProfiler {
        Ok(SelfProfiler {
            profiler,
            event_filter_mask,
+            string_cache: RwLock::new(FxHashMap::default()),
            query_event_kind,
            generic_activity_event_kind,
            incremental_load_result_event_kind,
@ -313,16 +417,51 @@ impl SelfProfiler {
        })
    }

-    fn get_query_name_string_id(query_name: impl QueryName) -> StringId {
-        let discriminant =
-            unsafe { mem::transmute::<Discriminant<_>, u64>(query_name.discriminant()) };
-
-        StringId::reserved(discriminant as u32)
+    /// Allocates a new string in the profiling data. Does not do any caching
+    /// or deduplication.
+    pub fn alloc_string<STR: SerializableString + ?Sized>(&self, s: &STR) -> StringId {
+        self.profiler.alloc_string(s)
    }

-    pub fn register_query_name(&self, query_name: impl QueryName) {
-        let id = SelfProfiler::get_query_name_string_id(query_name);
-        self.profiler.alloc_string_with_reserved_id(id, query_name.as_str());
+    /// Gets a `StringId` for the given string. This method makes sure that
+    /// any strings going through it will only be allocated once in the
+    /// profiling data.
+    pub fn get_or_alloc_cached_string(&self, s: &'static str) -> StringId {
+        // Only acquire a read-lock first since we assume that the string is
+        // already present in the common case.
+        {
+            let string_cache = self.string_cache.read();
+
+            if let Some(&id) = string_cache.get(s) {
+                return id;
+            }
+        }
+
+        let mut string_cache = self.string_cache.write();
+        // Check if the string has already been added in the small time window
+        // between dropping the read lock and acquiring the write lock.
+        *string_cache.entry(s).or_insert_with(|| self.profiler.alloc_string(s))
+    }
+
+    pub fn map_query_invocation_id_to_string(&self, from: QueryInvocationId, to: StringId) {
+        let from = StringId::new_virtual(from.0);
+        self.profiler.map_virtual_to_concrete_string(from, to);
+    }
+
+    pub fn bulk_map_query_invocation_id_to_single_string<I>(&self, from: I, to: StringId)
+    where
+        I: Iterator<Item = QueryInvocationId> + ExactSizeIterator,
+    {
+        let from = from.map(|qid| StringId::new_virtual(qid.0));
+        self.profiler.bulk_map_virtual_to_single_concrete_string(from, to);
+    }
+
+    pub fn query_key_recording_enabled(&self) -> bool {
+        self.event_filter_mask.contains(EventFilter::QUERY_KEYS)
+    }
+
+    pub fn event_id_builder(&self) -> EventIdBuilder<'_, SerializationSink> {
+        EventIdBuilder::new(&self.profiler)
    }
 }

@ -334,7 +473,7 @@ impl<'a> TimingGuard<'a> {
    pub fn start(
        profiler: &'a SelfProfiler,
        event_kind: StringId,
-        event_id: StringId,
+        event_id: EventId,
    ) -> TimingGuard<'a> {
        let thread_id = thread_id_to_u32(std::thread::current().id());
        let raw_profiler = &profiler.profiler;
@ -343,6 +482,15 @@ impl<'a> TimingGuard<'a> {
        TimingGuard(Some(timing_guard))
    }

+    #[inline]
+    pub fn finish_with_query_invocation_id(self, query_invocation_id: QueryInvocationId) {
+        if let Some(guard) = self.0 {
+            let event_id = StringId::new_virtual(query_invocation_id.0);
+            let event_id = EventId::from_virtual(event_id);
+            guard.finish_with_override_event_id(event_id);
+        }
+    }
+
    #[inline]
    pub fn none() -> TimingGuard<'a> {
        TimingGuard(None)
@ -444,8 +592,8 @@ fn get_resident() -> Option<usize> {
            cb: DWORD,
        ) -> BOOL;
    }
-    let mut pmc: PROCESS_MEMORY_COUNTERS = unsafe { mem::zeroed() };
-    pmc.cb = mem::size_of_val(&pmc) as DWORD;
+    let mut pmc: PROCESS_MEMORY_COUNTERS = unsafe { std::mem::zeroed() };
+    pmc.cb = std::mem::size_of_val(&pmc) as DWORD;
    match unsafe { GetProcessMemoryInfo(GetCurrentProcess(), &mut pmc, pmc.cb) } {
        0 => None,
        _ => Some(pmc.WorkingSetSize as usize),
--- a/src/librustc_interface/util.rs
+++ b/src/librustc_interface/util.rs
@ -71,10 +71,6 @@ pub fn create_session(
        lint_caps,
    );

-    sess.prof.register_queries(|profiler| {
-        rustc::ty::query::QueryName::register_with_profiler(&profiler);
-    });
-
    let codegen_backend = get_codegen_backend(&sess);

    let mut cfg = config::build_configuration(&sess, config::to_crate_config(cfg));