1
Fork 0

Auto merge of #110083 - saethlin:encode-hashes-as-bytes, r=cjgillot

Encode hashes as bytes, not varint

In a few places, we store hashes as `u64` or `u128` and then apply `derive(Decodable, Encodable)` to the enclosing struct/enum. It is more efficient to encode hashes directly as raw bytes than to try to apply some varint encoding to them. This PR adds two new types, `Hash64` and `Hash128`, which are produced by `StableHasher` and replace every place where a `u64` or `u128` representing a hash was stored.

Distribution of the byte lengths of leb128 encodings, from `x build --stage 2` with `incremental = true`. Each row reads `(rank) count (share, cumulative share): encoded length in bytes`:

Before:
```
(  1) 373418203 (53.7%, 53.7%): 1
(  2) 196240113 (28.2%, 81.9%): 3
(  3) 108157958 (15.6%, 97.5%): 2
(  4)  17213120 ( 2.5%, 99.9%): 4
(  5)    223614 ( 0.0%,100.0%): 9
(  6)    216262 ( 0.0%,100.0%): 10
(  7)     15447 ( 0.0%,100.0%): 5
(  8)      3633 ( 0.0%,100.0%): 19
(  9)      3030 ( 0.0%,100.0%): 8
( 10)      1167 ( 0.0%,100.0%): 18
( 11)      1032 ( 0.0%,100.0%): 7
( 12)      1003 ( 0.0%,100.0%): 6
( 13)        10 ( 0.0%,100.0%): 16
( 14)        10 ( 0.0%,100.0%): 17
( 15)         5 ( 0.0%,100.0%): 12
( 16)         4 ( 0.0%,100.0%): 14
```

After:
```
(  1) 372939136 (53.7%, 53.7%): 1
(  2) 196240140 (28.3%, 82.0%): 3
(  3) 108014969 (15.6%, 97.5%): 2
(  4)  17192375 ( 2.5%,100.0%): 4
(  5)       435 ( 0.0%,100.0%): 5
(  6)        83 ( 0.0%,100.0%): 18
(  7)        79 ( 0.0%,100.0%): 10
(  8)        50 ( 0.0%,100.0%): 9
(  9)         6 ( 0.0%,100.0%): 19
```

The remaining 9- or 10-byte and 18- or 19-byte encodings are `u64` and `u128` values, respectively, that have their high bits set. As far as I can tell, these come primarily from `SwitchTargets`.
This commit is contained in:
bors 2023-04-18 22:27:15 +00:00
commit b3f1379509
38 changed files with 289 additions and 138 deletions

View file

@ -357,7 +357,7 @@ impl<'tcx> DepNodeParams<TyCtxt<'tcx>> for HirId {
Fingerprint::new(
// `owner` is local, so is completely defined by the local hash
def_path_hash.local_hash(),
local_id.as_u32().into(),
local_id.as_u32() as u64,
)
}
@ -370,7 +370,7 @@ impl<'tcx> DepNodeParams<TyCtxt<'tcx>> for HirId {
#[inline(always)]
fn recover(tcx: TyCtxt<'tcx>, dep_node: &DepNode) -> Option<Self> {
if tcx.fingerprint_style(dep_node.kind) == FingerprintStyle::HirId {
let (local_hash, local_id) = Fingerprint::from(dep_node.hash).as_value();
let (local_hash, local_id) = Fingerprint::from(dep_node.hash).split();
let def_path_hash = DefPathHash::new(tcx.sess.local_stable_crate_id(), local_hash);
let def_id = tcx
.def_path_hash_to_def_id(def_path_hash, &mut || {
@ -378,6 +378,7 @@ impl<'tcx> DepNodeParams<TyCtxt<'tcx>> for HirId {
})
.expect_local();
let local_id = local_id
.as_u64()
.try_into()
.unwrap_or_else(|_| panic!("local id should be u32, found {:?}", local_id));
Some(HirId { owner: OwnerId { def_id }, local_id: ItemLocalId::from_u32(local_id) })

View file

@ -72,6 +72,6 @@ pub fn metadata_symbol_name(tcx: TyCtxt<'_>) -> String {
format!(
"rust_metadata_{}_{:08x}",
tcx.crate_name(LOCAL_CRATE),
tcx.sess.local_stable_crate_id().to_u64(),
tcx.sess.local_stable_crate_id(),
)
}

View file

@ -4,7 +4,7 @@ use rustc_attr::InlineAttr;
use rustc_data_structures::base_n;
use rustc_data_structures::fingerprint::Fingerprint;
use rustc_data_structures::fx::FxHashMap;
use rustc_data_structures::stable_hasher::{HashStable, StableHasher};
use rustc_data_structures::stable_hasher::{Hash128, HashStable, StableHasher};
use rustc_hir::def_id::{CrateNum, DefId, LOCAL_CRATE};
use rustc_hir::ItemId;
use rustc_index::vec::Idx;
@ -313,8 +313,8 @@ impl<'tcx> CodegenUnit<'tcx> {
// avoid collisions and is still reasonably short for filenames.
let mut hasher = StableHasher::new();
human_readable_name.hash(&mut hasher);
let hash: u128 = hasher.finish();
let hash = hash & ((1u128 << 80) - 1);
let hash: Hash128 = hasher.finish();
let hash = hash.as_u128() & ((1u128 << 80) - 1);
base_n::encode(hash, base_n::CASE_INSENSITIVE)
}
@ -505,22 +505,13 @@ impl<'tcx> CodegenUnitNameBuilder<'tcx> {
// instantiating stuff for upstream crates.
let local_crate_id = if cnum != LOCAL_CRATE {
let local_stable_crate_id = tcx.sess.local_stable_crate_id();
format!(
"-in-{}.{:08x}",
tcx.crate_name(LOCAL_CRATE),
local_stable_crate_id.to_u64() as u32,
)
format!("-in-{}.{:08x}", tcx.crate_name(LOCAL_CRATE), local_stable_crate_id)
} else {
String::new()
};
let stable_crate_id = tcx.sess.local_stable_crate_id();
format!(
"{}.{:08x}{}",
tcx.crate_name(cnum),
stable_crate_id.to_u64() as u32,
local_crate_id,
)
format!("{}.{:08x}{}", tcx.crate_name(cnum), stable_crate_id, local_crate_id)
});
write!(cgu_name, "{}", crate_prefix).unwrap();

View file

@ -141,14 +141,18 @@ impl<CTX> crate::ty::HashStable<CTX> for ScalarInt {
impl<S: Encoder> Encodable<S> for ScalarInt {
fn encode(&self, s: &mut S) {
s.emit_u128(self.data);
s.emit_u8(self.size.get());
let size = self.size.get();
s.emit_u8(size);
s.emit_raw_bytes(&self.data.to_le_bytes()[..size as usize]);
}
}
impl<D: Decoder> Decodable<D> for ScalarInt {
fn decode(d: &mut D) -> ScalarInt {
ScalarInt { data: d.read_u128(), size: NonZeroU8::new(d.read_u8()).unwrap() }
let mut data = [0u8; 16];
let size = d.read_u8();
data[..size as usize].copy_from_slice(d.read_raw_bytes(size as usize));
ScalarInt { data: u128::from_le_bytes(data), size: NonZeroU8::new(size).unwrap() }
}
}

View file

@ -925,7 +925,7 @@ impl<'tcx> TyCtxt<'tcx> {
crate_name,
// Don't print the whole stable crate id. That's just
// annoying in debug output.
stable_crate_id.to_u64() >> (8 * 6),
stable_crate_id.as_u64() >> (8 * 6),
self.def_path(def_id).to_string_no_crate_verbose()
)
}

View file

@ -11,7 +11,7 @@ use crate::ty::{
use crate::ty::{GenericArgKind, SubstsRef};
use rustc_apfloat::Float as _;
use rustc_data_structures::fx::{FxHashMap, FxHashSet};
use rustc_data_structures::stable_hasher::{HashStable, StableHasher};
use rustc_data_structures::stable_hasher::{Hash64, HashStable, StableHasher};
use rustc_errors::ErrorGuaranteed;
use rustc_hir as hir;
use rustc_hir::def::{CtorOf, DefKind, Res};
@ -124,7 +124,7 @@ impl IntTypeExt for IntegerType {
impl<'tcx> TyCtxt<'tcx> {
/// Creates a hash of the type `Ty` which will be the same no matter what crate
/// context it's calculated within. This is used by the `type_id` intrinsic.
pub fn type_id_hash(self, ty: Ty<'tcx>) -> u64 {
pub fn type_id_hash(self, ty: Ty<'tcx>) -> Hash64 {
// We want the type_id be independent of the types free regions, so we
// erase them. The erase_regions() call will also anonymize bound
// regions, which is desirable too.