Auto merge of #110083 - saethlin:encode-hashes-as-bytes, r=cjgillot

Encode hashes as bytes, not varint

In a few places, we store hashes as `u64` or `u128` and then apply `derive(Decodable, Encodable)` to the enclosing struct/enum. It is more efficient to encode hashes directly as bytes than to apply a varint encoding to them. This PR adds two new types, `Hash64` and `Hash128`, which are produced by `StableHasher` and replace every use of a stored `u64` or `u128` that represents a hash.
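To see why varint encoding is a poor fit for hashes, here is a minimal sketch (not rustc's actual `leb128` implementation): unsigned LEB128 stores 7 payload bits per byte, so small values encode in one or two bytes, but a high-entropy 64-bit hash usually needs 9 or 10, whereas emitting the raw little-endian bytes always costs exactly 8.

```rust
// Minimal sketch, not rustc's encoder: byte length of an unsigned LEB128 encoding.
fn leb128_len(mut value: u64) -> usize {
    let mut len = 1;
    while value >= 0x80 {
        value >>= 7;
        len += 1;
    }
    len
}

fn main() {
    // Small values (lengths, indices, discriminants) are where varint encoding pays off.
    assert_eq!(leb128_len(42), 1);
    // A hash has high entropy, so varint encoding expands it instead.
    let hash: u64 = 0xf6622fb349898b06; // example value taken from the tests below
    assert_eq!(leb128_len(hash), 10);
    // Encoding the raw bytes, as this PR does, always takes exactly 8 bytes.
    assert_eq!(hash.to_le_bytes().len(), 8);
}
```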

Distribution of the byte lengths of leb128 encodings, from `x build --stage 2` with `incremental = true`

Before:
```
(  1) 373418203 (53.7%, 53.7%): 1
(  2) 196240113 (28.2%, 81.9%): 3
(  3) 108157958 (15.6%, 97.5%): 2
(  4)  17213120 ( 2.5%, 99.9%): 4
(  5)    223614 ( 0.0%,100.0%): 9
(  6)    216262 ( 0.0%,100.0%): 10
(  7)     15447 ( 0.0%,100.0%): 5
(  8)      3633 ( 0.0%,100.0%): 19
(  9)      3030 ( 0.0%,100.0%): 8
( 10)      1167 ( 0.0%,100.0%): 18
( 11)      1032 ( 0.0%,100.0%): 7
( 12)      1003 ( 0.0%,100.0%): 6
( 13)        10 ( 0.0%,100.0%): 16
( 14)        10 ( 0.0%,100.0%): 17
( 15)         5 ( 0.0%,100.0%): 12
( 16)         4 ( 0.0%,100.0%): 14
```

After:
```
(  1) 372939136 (53.7%, 53.7%): 1
(  2) 196240140 (28.3%, 82.0%): 3
(  3) 108014969 (15.6%, 97.5%): 2
(  4)  17192375 ( 2.5%,100.0%): 4
(  5)       435 ( 0.0%,100.0%): 5
(  6)        83 ( 0.0%,100.0%): 18
(  7)        79 ( 0.0%,100.0%): 10
(  8)        50 ( 0.0%,100.0%): 9
(  9)         6 ( 0.0%,100.0%): 19
```

The remaining 9- or 10-byte and 18- or 19-byte encodings are `u64` and `u128` values, respectively, with their high bits set. As far as I can tell, these come primarily from `SwitchTargets`.
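Those residual lengths match the worst case for LEB128; a rough check of the arithmetic (7 payload bits per encoded byte, as in the sketch above):

```rust
fn main() {
    // Worst-case LEB128 length is ceil(bits / 7) bytes.
    assert_eq!((64 + 6) / 7, 10); // a u64 with its top bit set needs 10 bytes
    assert_eq!((128 + 6) / 7, 19); // a u128 with its high bits set needs 19 bytes
}
```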
bors 2023-04-18 22:27:15 +00:00
commit b3f1379509
38 changed files with 289 additions and 138 deletions

View file

@@ -1,4 +1,4 @@
-use crate::stable_hasher;
+use crate::stable_hasher::{Hash64, StableHasher, StableHasherResult};
use rustc_serialize::{Decodable, Decoder, Encodable, Encoder};
use std::hash::{Hash, Hasher};
@@ -9,32 +9,49 @@ mod tests;
#[repr(C)]
pub struct Fingerprint(u64, u64);
+pub trait FingerprintComponent {
+fn as_u64(&self) -> u64;
+}
+impl FingerprintComponent for Hash64 {
+#[inline]
+fn as_u64(&self) -> u64 {
+Hash64::as_u64(*self)
+}
+}
+impl FingerprintComponent for u64 {
+#[inline]
+fn as_u64(&self) -> u64 {
+*self
+}
+}
impl Fingerprint {
pub const ZERO: Fingerprint = Fingerprint(0, 0);
#[inline]
-pub fn new(_0: u64, _1: u64) -> Fingerprint {
-Fingerprint(_0, _1)
+pub fn new<A, B>(_0: A, _1: B) -> Fingerprint
+where
+A: FingerprintComponent,
+B: FingerprintComponent,
+{
+Fingerprint(_0.as_u64(), _1.as_u64())
}
#[inline]
pub fn from_smaller_hash(hash: u64) -> Fingerprint {
Fingerprint(hash, hash)
}
#[inline]
-pub fn to_smaller_hash(&self) -> u64 {
+pub fn to_smaller_hash(&self) -> Hash64 {
// Even though both halves of the fingerprint are expected to be good
// quality hash values, let's still combine the two values because the
// Fingerprints in DefPathHash have the StableCrateId portion which is
// the same for all DefPathHashes from the same crate. Combining the
// two halves makes sure we get a good quality hash in such cases too.
-self.0.wrapping_mul(3).wrapping_add(self.1)
+Hash64::new(self.0.wrapping_mul(3).wrapping_add(self.1))
}
#[inline]
-pub fn as_value(&self) -> (u64, u64) {
-(self.0, self.1)
+pub fn split(&self) -> (Hash64, Hash64) {
+(Hash64::new(self.0), Hash64::new(self.1))
}
#[inline]
@@ -131,9 +148,9 @@ impl FingerprintHasher for crate::unhash::Unhasher {
}
}
-impl stable_hasher::StableHasherResult for Fingerprint {
+impl StableHasherResult for Fingerprint {
#[inline]
-fn finish(hasher: stable_hasher::StableHasher) -> Self {
+fn finish(hasher: StableHasher) -> Self {
let (_0, _1) = hasher.finalize();
Fingerprint(_0, _1)
}

View file

@@ -1,11 +1,12 @@
use super::*;
+use crate::stable_hasher::Hash64;
// Check that `combine_commutative` is order independent.
#[test]
fn combine_commutative_is_order_independent() {
-let a = Fingerprint::new(0xf6622fb349898b06, 0x70be9377b2f9c610);
-let b = Fingerprint::new(0xa9562bf5a2a5303c, 0x67d9b6c82034f13d);
-let c = Fingerprint::new(0x0d013a27811dbbc3, 0x9a3f7b3d9142ec43);
+let a = Fingerprint::new(Hash64::new(0xf6622fb349898b06), Hash64::new(0x70be9377b2f9c610));
+let b = Fingerprint::new(Hash64::new(0xa9562bf5a2a5303c), Hash64::new(0x67d9b6c82034f13d));
+let c = Fingerprint::new(Hash64::new(0x0d013a27811dbbc3), Hash64::new(0x9a3f7b3d9142ec43));
let permutations = [(a, b, c), (a, c, b), (b, a, c), (b, c, a), (c, a, b), (c, b, a)];
let f = a.combine_commutative(b).combine_commutative(c);
for p in &permutations {

View file

@@ -0,0 +1,132 @@
//! rustc encodes a lot of hashes. If hashes are stored as `u64` or `u128`, a `derive(Encodable)`
//! will apply varint encoding to the hashes, which is less efficient than directly encoding the 8
//! or 16 bytes of the hash.
//!
//! The types in this module represent 64-bit or 128-bit hashes produced by a `StableHasher`.
//! `Hash64` and `Hash128` expose some utility functions to encourage users not to extract the inner
//! hash value as an integer type and accidentally apply varint encoding to it.
//!
//! In contrast with `Fingerprint`, users of these types cannot and should not attempt to construct
//! and decompose these types into constituent pieces. The point of these types is only to
//! connect the fact that they can only be produced by a `StableHasher` to their
//! `Encode`/`Decode` impls.
use crate::stable_hasher::{StableHasher, StableHasherResult};
use rustc_serialize::{Decodable, Decoder, Encodable, Encoder};
use std::fmt;
use std::ops::BitXorAssign;
#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Default)]
pub struct Hash64 {
inner: u64,
}
impl Hash64 {
pub const ZERO: Hash64 = Hash64 { inner: 0 };
#[inline]
pub(crate) fn new(n: u64) -> Self {
Self { inner: n }
}
#[inline]
pub fn as_u64(self) -> u64 {
self.inner
}
}
impl BitXorAssign<u64> for Hash64 {
#[inline]
fn bitxor_assign(&mut self, rhs: u64) {
self.inner ^= rhs;
}
}
impl<S: Encoder> Encodable<S> for Hash64 {
#[inline]
fn encode(&self, s: &mut S) {
s.emit_raw_bytes(&self.inner.to_le_bytes());
}
}
impl<D: Decoder> Decodable<D> for Hash64 {
#[inline]
fn decode(d: &mut D) -> Self {
Self { inner: u64::from_le_bytes(d.read_raw_bytes(8).try_into().unwrap()) }
}
}
impl StableHasherResult for Hash64 {
#[inline]
fn finish(hasher: StableHasher) -> Self {
Self { inner: hasher.finalize().0 }
}
}
impl fmt::Debug for Hash64 {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
self.inner.fmt(f)
}
}
impl fmt::LowerHex for Hash64 {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::LowerHex::fmt(&self.inner, f)
}
}
#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Default)]
pub struct Hash128 {
inner: u128,
}
impl Hash128 {
#[inline]
pub fn truncate(self) -> Hash64 {
Hash64 { inner: self.inner as u64 }
}
#[inline]
pub fn wrapping_add(self, other: Self) -> Self {
Self { inner: self.inner.wrapping_add(other.inner) }
}
#[inline]
pub fn as_u128(self) -> u128 {
self.inner
}
}
impl<S: Encoder> Encodable<S> for Hash128 {
#[inline]
fn encode(&self, s: &mut S) {
s.emit_raw_bytes(&self.inner.to_le_bytes());
}
}
impl<D: Decoder> Decodable<D> for Hash128 {
#[inline]
fn decode(d: &mut D) -> Self {
Self { inner: u128::from_le_bytes(d.read_raw_bytes(16).try_into().unwrap()) }
}
}
impl StableHasherResult for Hash128 {
#[inline]
fn finish(hasher: StableHasher) -> Self {
let (_0, _1) = hasher.finalize();
Self { inner: u128::from(_0) | (u128::from(_1) << 64) }
}
}
impl fmt::Debug for Hash128 {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
self.inner.fmt(f)
}
}
impl fmt::LowerHex for Hash128 {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::LowerHex::fmt(&self.inner, f)
}
}
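As a quick orientation before the remaining diffs, here is a hypothetical consumer-side sketch (not part of this commit) of how the new types are meant to be used, modeled on the test helper and the `stable_hash_reduce` change further down: hash a value with `StableHasher`, finish into a `Hash128`, and truncate to a `Hash64` when only 64 bits are needed.

```rust
// Hypothetical usage sketch; assumes the rustc_data_structures crate as modified
// by this PR (the re-exports added to stable_hasher.rs below).
use rustc_data_structures::stable_hasher::{Hash128, Hash64, HashStable, StableHasher};

fn stable_hash_of<T: HashStable<()>>(value: &T) -> Hash128 {
    let mut hasher = StableHasher::new();
    // `()` stands in for a real hashing context (HCX).
    value.hash_stable(&mut (), &mut hasher);
    // Hash128 implements StableHasherResult, so the hasher can produce it directly.
    hasher.finish::<Hash128>()
}

fn short_hash_of<T: HashStable<()>>(value: &T) -> Hash64 {
    // Hash128::truncate keeps the low 64 bits.
    stable_hash_of(value).truncate()
}
```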

View file

@@ -86,6 +86,7 @@ pub mod work_queue;
pub use atomic_ref::AtomicRef;
pub mod aligned;
pub mod frozen;
+mod hashes;
pub mod owned_slice;
pub mod sso;
pub mod steal;

View file

@@ -2,6 +2,7 @@ use crate::sip128::SipHasher128;
use rustc_index::bit_set;
use rustc_index::vec;
use smallvec::SmallVec;
+use std::fmt;
use std::hash::{BuildHasher, Hash, Hasher};
use std::marker::PhantomData;
use std::mem;
@@ -9,6 +10,8 @@ use std::mem;
#[cfg(test)]
mod tests;
+pub use crate::hashes::{Hash128, Hash64};
/// When hashing something that ends up affecting properties like symbol names,
/// we want these symbol names to be calculated independently of other factors
/// like what architecture you're compiling *from*.
@@ -20,8 +23,8 @@ pub struct StableHasher {
state: SipHasher128,
}
-impl ::std::fmt::Debug for StableHasher {
-fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+impl fmt::Debug for StableHasher {
+fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{:?}", self.state)
}
}
@@ -42,21 +45,6 @@ impl StableHasher {
}
}
-impl StableHasherResult for u128 {
-#[inline]
-fn finish(hasher: StableHasher) -> Self {
-let (_0, _1) = hasher.finalize();
-u128::from(_0) | (u128::from(_1) << 64)
-}
-}
-impl StableHasherResult for u64 {
-#[inline]
-fn finish(hasher: StableHasher) -> Self {
-hasher.finalize().0
-}
-}
impl StableHasher {
#[inline]
pub fn finalize(self) -> (u64, u64) {
@@ -287,6 +275,9 @@ impl_stable_traits_for_trivial_type!(i128);
impl_stable_traits_for_trivial_type!(char);
impl_stable_traits_for_trivial_type!(());
+impl_stable_traits_for_trivial_type!(Hash64);
+impl_stable_traits_for_trivial_type!(Hash128);
impl<CTX> HashStable<CTX> for ! {
fn hash_stable(&self, _ctx: &mut CTX, _hasher: &mut StableHasher) {
unreachable!()
@@ -669,7 +660,7 @@ fn stable_hash_reduce<HCX, I, C, F>(
.map(|value| {
let mut hasher = StableHasher::new();
hash_function(&mut hasher, hcx, value);
-hasher.finish::<u128>()
+hasher.finish::<Hash128>()
})
.reduce(|accum, value| accum.wrapping_add(value));
hash.hash_stable(hcx, hasher);

View file

@@ -72,7 +72,7 @@ fn test_hash_isize() {
assert_eq!(h.finalize(), expected);
}
-fn hash<T: HashStable<()>>(t: &T) -> u128 {
+fn hash<T: HashStable<()>>(t: &T) -> Hash128 {
let mut h = StableHasher::new();
let ctx = &mut ();
t.hash_stable(ctx, &mut h);

View file

@@ -24,7 +24,7 @@ impl Svh {
}
pub fn as_u64(&self) -> u64 {
-self.hash.to_smaller_hash()
+self.hash.to_smaller_hash().as_u64()
}
pub fn to_string(&self) -> String {