Auto merge of #110083 - saethlin:encode-hashes-as-bytes, r=cjgillot

Encode hashes as bytes, not varint

In a few places, we store hashes as `u64` or `u128` and then apply `derive(Decodable, Encodable)` to the enclosing struct/enum. It is more efficient to encode hashes directly as bytes than to apply a varint encoding to them. This PR adds two new types, `Hash64` and `Hash128`, which are produced by `StableHasher` and replace every use of a stored `u64` or `u128` that represents a hash.
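To see why varint encoding is a poor fit for hashes, here is a minimal sketch (not rustc's actual `leb128` implementation): unsigned LEB128 stores 7 payload bits per byte, so small values encode in one or two bytes, but a high-entropy 64-bit hash usually needs 9 or 10, whereas emitting the raw little-endian bytes always costs exactly 8.

```rust
// Minimal sketch, not rustc's encoder: byte length of an unsigned LEB128 encoding.
fn leb128_len(mut value: u64) -> usize {
    let mut len = 1;
    while value >= 0x80 {
        value >>= 7;
        len += 1;
    }
    len
}

fn main() {
    // Small values (lengths, indices, discriminants) are where varint encoding pays off.
    assert_eq!(leb128_len(42), 1);
    // A hash has high entropy, so varint encoding expands it instead.
    let hash: u64 = 0xf6622fb349898b06; // example value taken from the tests below
    assert_eq!(leb128_len(hash), 10);
    // Encoding the raw bytes, as this PR does, always takes exactly 8 bytes.
    assert_eq!(hash.to_le_bytes().len(), 8);
}
```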

Distribution of the byte lengths of leb128 encodings, from `x build --stage 2` with `incremental = true`

Before:
```
(  1) 373418203 (53.7%, 53.7%): 1
(  2) 196240113 (28.2%, 81.9%): 3
(  3) 108157958 (15.6%, 97.5%): 2
(  4)  17213120 ( 2.5%, 99.9%): 4
(  5)    223614 ( 0.0%,100.0%): 9
(  6)    216262 ( 0.0%,100.0%): 10
(  7)     15447 ( 0.0%,100.0%): 5
(  8)      3633 ( 0.0%,100.0%): 19
(  9)      3030 ( 0.0%,100.0%): 8
( 10)      1167 ( 0.0%,100.0%): 18
( 11)      1032 ( 0.0%,100.0%): 7
( 12)      1003 ( 0.0%,100.0%): 6
( 13)        10 ( 0.0%,100.0%): 16
( 14)        10 ( 0.0%,100.0%): 17
( 15)         5 ( 0.0%,100.0%): 12
( 16)         4 ( 0.0%,100.0%): 14
```

After:
```
(  1) 372939136 (53.7%, 53.7%): 1
(  2) 196240140 (28.3%, 82.0%): 3
(  3) 108014969 (15.6%, 97.5%): 2
(  4)  17192375 ( 2.5%,100.0%): 4
(  5)       435 ( 0.0%,100.0%): 5
(  6)        83 ( 0.0%,100.0%): 18
(  7)        79 ( 0.0%,100.0%): 10
(  8)        50 ( 0.0%,100.0%): 9
(  9)         6 ( 0.0%,100.0%): 19
```

The remaining 9- or 10-byte and 18- or 19-byte encodings are `u64` and `u128` values, respectively, with their high bits set. As far as I can tell, these come primarily from `SwitchTargets`.
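Those residual lengths match the worst case for LEB128; a rough check of the arithmetic (7 payload bits per encoded byte, as in the sketch above):

```rust
fn main() {
    // Worst-case LEB128 length is ceil(bits / 7) bytes.
    assert_eq!((64 + 6) / 7, 10); // a u64 with its top bit set needs 10 bytes
    assert_eq!((128 + 6) / 7, 19); // a u128 with its high bits set needs 19 bytes
}
```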
bors 2023-04-18 22:27:15 +00:00
commit b3f1379509
38 changed files with 289 additions and 138 deletions

View file

@@ -1,4 +1,4 @@
-use crate::stable_hasher;
+use crate::stable_hasher::{Hash64, StableHasher, StableHasherResult};
use rustc_serialize::{Decodable, Decoder, Encodable, Encoder};
use std::hash::{Hash, Hasher};
@@ -9,32 +9,49 @@ mod tests;
#[repr(C)]
pub struct Fingerprint(u64, u64);
+pub trait FingerprintComponent {
+fn as_u64(&self) -> u64;
+}
+impl FingerprintComponent for Hash64 {
+#[inline]
+fn as_u64(&self) -> u64 {
+Hash64::as_u64(*self)
+}
+}
+impl FingerprintComponent for u64 {
+#[inline]
+fn as_u64(&self) -> u64 {
+*self
+}
+}
impl Fingerprint {
pub const ZERO: Fingerprint = Fingerprint(0, 0);
#[inline]
-pub fn new(_0: u64, _1: u64) -> Fingerprint {
-Fingerprint(_0, _1)
+pub fn new<A, B>(_0: A, _1: B) -> Fingerprint
+where
+A: FingerprintComponent,
+B: FingerprintComponent,
+{
+Fingerprint(_0.as_u64(), _1.as_u64())
}
#[inline]
pub fn from_smaller_hash(hash: u64) -> Fingerprint {
Fingerprint(hash, hash)
}
#[inline]
-pub fn to_smaller_hash(&self) -> u64 {
+pub fn to_smaller_hash(&self) -> Hash64 {
// Even though both halves of the fingerprint are expected to be good
// quality hash values, let's still combine the two values because the
// Fingerprints in DefPathHash have the StableCrateId portion which is
// the same for all DefPathHashes from the same crate. Combining the
// two halves makes sure we get a good quality hash in such cases too.
-self.0.wrapping_mul(3).wrapping_add(self.1)
+Hash64::new(self.0.wrapping_mul(3).wrapping_add(self.1))
}
#[inline]
-pub fn as_value(&self) -> (u64, u64) {
-(self.0, self.1)
+pub fn split(&self) -> (Hash64, Hash64) {
+(Hash64::new(self.0), Hash64::new(self.1))
}
#[inline]
@@ -131,9 +148,9 @@ impl FingerprintHasher for crate::unhash::Unhasher {
}
}
-impl stable_hasher::StableHasherResult for Fingerprint {
+impl StableHasherResult for Fingerprint {
#[inline]
-fn finish(hasher: stable_hasher::StableHasher) -> Self {
+fn finish(hasher: StableHasher) -> Self {
let (_0, _1) = hasher.finalize();
Fingerprint(_0, _1)
}

View file

@@ -1,11 +1,12 @@
use super::*;
+use crate::stable_hasher::Hash64;
// Check that `combine_commutative` is order independent.
#[test]
fn combine_commutative_is_order_independent() {
-let a = Fingerprint::new(0xf6622fb349898b06, 0x70be9377b2f9c610);
-let b = Fingerprint::new(0xa9562bf5a2a5303c, 0x67d9b6c82034f13d);
-let c = Fingerprint::new(0x0d013a27811dbbc3, 0x9a3f7b3d9142ec43);
+let a = Fingerprint::new(Hash64::new(0xf6622fb349898b06), Hash64::new(0x70be9377b2f9c610));
+let b = Fingerprint::new(Hash64::new(0xa9562bf5a2a5303c), Hash64::new(0x67d9b6c82034f13d));
+let c = Fingerprint::new(Hash64::new(0x0d013a27811dbbc3), Hash64::new(0x9a3f7b3d9142ec43));
let permutations = [(a, b, c), (a, c, b), (b, a, c), (b, c, a), (c, a, b), (c, b, a)];
let f = a.combine_commutative(b).combine_commutative(c);
for p in &permutations {

View file

@@ -0,0 +1,132 @@
//! rustc encodes a lot of hashes. If hashes are stored as `u64` or `u128`, a `derive(Encodable)`
//! will apply varint encoding to the hashes, which is less efficient than directly encoding the 8
//! or 16 bytes of the hash.
//!
//! The types in this module represent 64-bit or 128-bit hashes produced by a `StableHasher`.
//! `Hash64` and `Hash128` expose some utility functions to encourage users not to extract the inner
//! hash value as an integer type and accidentally apply varint encoding to it.
//!
//! In contrast with `Fingerprint`, users of these types cannot and should not attempt to construct
//! and decompose these types into constituent pieces. The point of these types is only to
//! connect the fact that they can only be produced by a `StableHasher` to their
//! `Encode`/`Decode` impls.
use crate::stable_hasher::{StableHasher, StableHasherResult};
use rustc_serialize::{Decodable, Decoder, Encodable, Encoder};
use std::fmt;
use std::ops::BitXorAssign;
#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Default)]
pub struct Hash64 {
inner: u64,
}
impl Hash64 {
pub const ZERO: Hash64 = Hash64 { inner: 0 };
#[inline]
pub(crate) fn new(n: u64) -> Self {
Self { inner: n }
}
#[inline]
pub fn as_u64(self) -> u64 {
self.inner
}
}
impl BitXorAssign<u64> for Hash64 {
#[inline]
fn bitxor_assign(&mut self, rhs: u64) {
self.inner ^= rhs;
}
}
impl<S: Encoder> Encodable<S> for Hash64 {
#[inline]
fn encode(&self, s: &mut S) {
s.emit_raw_bytes(&self.inner.to_le_bytes());
}
}
impl<D: Decoder> Decodable<D> for Hash64 {
#[inline]
fn decode(d: &mut D) -> Self {
Self { inner: u64::from_le_bytes(d.read_raw_bytes(8).try_into().unwrap()) }
}
}
impl StableHasherResult for Hash64 {
#[inline]
fn finish(hasher: StableHasher) -> Self {
Self { inner: hasher.finalize().0 }
}
}
impl fmt::Debug for Hash64 {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
self.inner.fmt(f)
}
}
impl fmt::LowerHex for Hash64 {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::LowerHex::fmt(&self.inner, f)
}
}
#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Default)]
pub struct Hash128 {
inner: u128,
}
impl Hash128 {
#[inline]
pub fn truncate(self) -> Hash64 {
Hash64 { inner: self.inner as u64 }
}
#[inline]
pub fn wrapping_add(self, other: Self) -> Self {
Self { inner: self.inner.wrapping_add(other.inner) }
}
#[inline]
pub fn as_u128(self) -> u128 {
self.inner
}
}
impl<S: Encoder> Encodable<S> for Hash128 {
#[inline]
fn encode(&self, s: &mut S) {
s.emit_raw_bytes(&self.inner.to_le_bytes());
}
}
impl<D: Decoder> Decodable<D> for Hash128 {
#[inline]
fn decode(d: &mut D) -> Self {
Self { inner: u128::from_le_bytes(d.read_raw_bytes(16).try_into().unwrap()) }
}
}
impl StableHasherResult for Hash128 {
#[inline]
fn finish(hasher: StableHasher) -> Self {
let (_0, _1) = hasher.finalize();
Self { inner: u128::from(_0) | (u128::from(_1) << 64) }
}
}
impl fmt::Debug for Hash128 {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
self.inner.fmt(f)
}
}
impl fmt::LowerHex for Hash128 {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::LowerHex::fmt(&self.inner, f)
}
}
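As a quick orientation before the remaining diffs, here is a hypothetical consumer-side sketch (not part of this commit) of how the new types are meant to be used, modeled on the test helper and the `stable_hash_reduce` change further down: hash a value with `StableHasher`, finish into a `Hash128`, and truncate to a `Hash64` when only 64 bits are needed.

```rust
// Hypothetical usage sketch; assumes the rustc_data_structures crate as modified
// by this PR (the re-exports added to stable_hasher.rs below).
use rustc_data_structures::stable_hasher::{Hash128, Hash64, HashStable, StableHasher};

fn stable_hash_of<T: HashStable<()>>(value: &T) -> Hash128 {
    let mut hasher = StableHasher::new();
    // `()` stands in for a real hashing context (HCX).
    value.hash_stable(&mut (), &mut hasher);
    // Hash128 implements StableHasherResult, so the hasher can produce it directly.
    hasher.finish::<Hash128>()
}

fn short_hash_of<T: HashStable<()>>(value: &T) -> Hash64 {
    // Hash128::truncate keeps the low 64 bits.
    stable_hash_of(value).truncate()
}
```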

View file

@@ -86,6 +86,7 @@ pub mod work_queue;
pub use atomic_ref::AtomicRef;
pub mod aligned;
pub mod frozen;
+mod hashes;
pub mod owned_slice;
pub mod sso;
pub mod steal;

View file

@@ -2,6 +2,7 @@ use crate::sip128::SipHasher128;
use rustc_index::bit_set;
use rustc_index::vec;
use smallvec::SmallVec;
+use std::fmt;
use std::hash::{BuildHasher, Hash, Hasher};
use std::marker::PhantomData;
use std::mem;
@@ -9,6 +10,8 @@ use std::mem;
#[cfg(test)]
mod tests;
+pub use crate::hashes::{Hash128, Hash64};
/// When hashing something that ends up affecting properties like symbol names,
/// we want these symbol names to be calculated independently of other factors
/// like what architecture you're compiling *from*.
@@ -20,8 +23,8 @@ pub struct StableHasher {
state: SipHasher128,
}
-impl ::std::fmt::Debug for StableHasher {
-fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+impl fmt::Debug for StableHasher {
+fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{:?}", self.state)
}
}
@@ -42,21 +45,6 @@ impl StableHasher {
}
}
-impl StableHasherResult for u128 {
-#[inline]
-fn finish(hasher: StableHasher) -> Self {
-let (_0, _1) = hasher.finalize();
-u128::from(_0) | (u128::from(_1) << 64)
-}
-}
-impl StableHasherResult for u64 {
-#[inline]
-fn finish(hasher: StableHasher) -> Self {
-hasher.finalize().0
-}
-}
impl StableHasher {
#[inline]
pub fn finalize(self) -> (u64, u64) {
@@ -287,6 +275,9 @@ impl_stable_traits_for_trivial_type!(i128);
impl_stable_traits_for_trivial_type!(char);
impl_stable_traits_for_trivial_type!(());
+impl_stable_traits_for_trivial_type!(Hash64);
+impl_stable_traits_for_trivial_type!(Hash128);
impl<CTX> HashStable<CTX> for ! {
fn hash_stable(&self, _ctx: &mut CTX, _hasher: &mut StableHasher) {
unreachable!()
@@ -669,7 +660,7 @@ fn stable_hash_reduce<HCX, I, C, F>(
.map(|value| {
let mut hasher = StableHasher::new();
hash_function(&mut hasher, hcx, value);
-hasher.finish::<u128>()
+hasher.finish::<Hash128>()
})
.reduce(|accum, value| accum.wrapping_add(value));
hash.hash_stable(hcx, hasher);

View file

@@ -72,7 +72,7 @@ fn test_hash_isize() {
assert_eq!(h.finalize(), expected);
}
-fn hash<T: HashStable<()>>(t: &T) -> u128 {
+fn hash<T: HashStable<()>>(t: &T) -> Hash128 {
let mut h = StableHasher::new();
let ctx = &mut ();
t.hash_stable(ctx, &mut h);

View file

@@ -24,7 +24,7 @@ impl Svh {
}
pub fn as_u64(&self) -> u64 {
-self.hash.to_smaller_hash()
+self.hash.to_smaller_hash().as_u64()
}
pub fn to_string(&self) -> String {