Auto merge of #136410 - saethlin:clean-up-cgu-internal-copy, r=compiler-errors

Remove InstanceKind::generates_cgu_internal_copy

This PR should not contain any behavior changes. Before this PR, the logic for selecting the instantiation mode was spread across all of:
* `instantiation_mode`
* `cross_crate_inlinable`
* `generates_cgu_internal_copy`
* `requires_inline`

The last two of those functions are not well designed. The function that actually decides whether we generate a CGU-internal copy is `instantiation_mode`, _not_ `generates_cgu_internal_copy`. And `requires_inline` documents that it is about the LLVM `inline` attribute and that it is only a hint; but the LLVM attribute is actually called `inlinehint`, the function is also used by codegen backends other than LLVM, and since it feeds instantiation mode selection, it is *not* merely a hint.

The goal of this PR is to start cleaning up this logic into a sequence of checks with a more logical flow, which will be easier to customize in the future (for example, to improve incrementality or optimizations without causing obscure linker errors because another part of the compiler was not updated to match).
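
Condensed, the selection order this PR moves toward looks roughly like the following standalone sketch. Everything here (`Session`, `Item`, the plain booleans) is a simplified stand-in for the rustc-internal queries, not the compiler's actual API, and `None` marks cases decided by checks this PR does not touch; the real code is in the diffs below.

```rust
#[derive(Debug, PartialEq)]
enum InstantiationMode {
    GloballyShared { may_conflict: bool },
    LocalCopy,
}

// Simplified stand-ins for the session options and instance queries.
struct Session {
    optimize: bool,    // stands in for opts.optimize != OptLevel::No
    incremental: bool, // stands in for opts.incremental.is_some()
}

struct Item {
    is_drop_glue: bool,          // stands in for InstanceKind::DropGlue(_, Some(ty))
    cross_crate_inlinable: bool, // stands in for tcx.cross_crate_inlinable(def_id)
    requires_inline: bool,       // stands in for instance.def.requires_inline(tcx)
}

// Decision sequence mirroring the new checks in MonoItem::instantiation_mode.
// `None` means "falls through to logic this PR does not change".
fn instantiation_mode(sess: &Session, item: &Item) -> Option<InstantiationMode> {
    if item.is_drop_glue {
        if !sess.optimize {
            return Some(InstantiationMode::GloballyShared { may_conflict: false });
        }
        if !sess.incremental {
            return Some(InstantiationMode::LocalCopy);
        }
        // Optimized incremental build: defer to opt_incr_drop_glue_mode,
        // shown in the second diff below.
        return None;
    }
    // Exported symbols must never be LocalCopy. Exports are computed from
    // cross_crate_inlinable (and requires_inline), so mirror those decisions.
    if !item.cross_crate_inlinable && !item.requires_inline {
        return Some(InstantiationMode::GloballyShared { may_conflict: false });
    }
    None
}
```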
Merged by bors on 2025-03-25 06:36:41 +00:00 as commit 7d49ae9731.
3 changed files with 68 additions and 64 deletions.


@@ -95,16 +95,11 @@ fn reachable_non_generics_provider(tcx: TyCtxt<'_>, _: LocalCrate) -> DefIdMap<S
                 return None;
             }
 
-            // Functions marked with #[inline] are codegened with "internal"
-            // linkage and are not exported unless marked with an extern
-            // indicator
-            if !Instance::mono(tcx, def_id.to_def_id()).def.generates_cgu_internal_copy(tcx)
-                || tcx.codegen_fn_attrs(def_id.to_def_id()).contains_extern_indicator()
-            {
-                Some(def_id)
-            } else {
-                None
-            }
+            if Instance::mono(tcx, def_id.into()).def.requires_inline(tcx) {
+                return None;
+            }
+
+            if tcx.cross_crate_inlinable(def_id) { None } else { Some(def_id) }
         })
         .map(|def_id| {
             // We won't link right if this symbol is stripped during LTO.
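
In effect, a non-generic `def_id` is now exported exactly when it is neither `requires_inline` nor `cross_crate_inlinable`. A boolean paraphrase of the new arm (assuming the two query results have already been computed as plain booleans):

```rust
// Paraphrase of the new filter_map arm above, for illustration only.
fn is_exported_non_generic(requires_inline: bool, cross_crate_inlinable: bool) -> bool {
    !requires_inline && !cross_crate_inlinable
}
```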


@@ -21,7 +21,7 @@ use tracing::debug;
 
 use crate::dep_graph::{DepNode, WorkProduct, WorkProductId};
 use crate::middle::codegen_fn_attrs::CodegenFnAttrFlags;
-use crate::ty::{GenericArgs, Instance, InstanceKind, SymbolName, TyCtxt};
+use crate::ty::{self, GenericArgs, Instance, InstanceKind, SymbolName, Ty, TyCtxt};
 
 /// Describes how a monomorphization will be instantiated in object files.
 #[derive(PartialEq)]
@@ -55,6 +55,39 @@ pub enum MonoItem<'tcx> {
     GlobalAsm(ItemId),
 }
 
+fn opt_incr_drop_glue_mode<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> InstantiationMode {
+    // Non-ADTs can't have a Drop impl. This case is mostly hit by closures whose captures require
+    // dropping.
+    let ty::Adt(adt_def, _) = ty.kind() else {
+        return InstantiationMode::LocalCopy;
+    };
+
+    // Types that don't have a direct Drop impl, but have fields that require dropping.
+    let Some(dtor) = adt_def.destructor(tcx) else {
+        // We use LocalCopy for drops of enums only; this code is inherited from
+        // https://github.com/rust-lang/rust/pull/67332 and the theory is that we get to optimize
+        // out code like drop_in_place(Option::None) before crate-local ThinLTO, which improves
+        // compile time. At the time of writing, simply removing this entire check does seem to
+        // regress incr-opt compile times. But it sure seems like a more sophisticated check could
+        // do better here.
+        if adt_def.is_enum() {
+            return InstantiationMode::LocalCopy;
+        } else {
+            return InstantiationMode::GloballyShared { may_conflict: true };
+        }
+    };
+
+    // We've gotten to a drop_in_place for a type that directly implements Drop.
+    // The drop glue is a wrapper for the Drop::drop impl, and we are an optimized build, so in an
+    // effort to coordinate with the mode that the actual impl will get, we make the glue also
+    // LocalCopy.
+    if tcx.cross_crate_inlinable(dtor.did) {
+        InstantiationMode::LocalCopy
+    } else {
+        InstantiationMode::GloballyShared { may_conflict: true }
+    }
+}
+
 impl<'tcx> MonoItem<'tcx> {
     /// Returns `true` if the mono item is user-defined (i.e. not compiler-generated, like shims).
     pub fn is_user_defined(&self) -> bool {
@@ -124,16 +157,36 @@ impl<'tcx> MonoItem<'tcx> {
             return InstantiationMode::GloballyShared { may_conflict: false };
         }
 
-        // FIXME: The logic for which functions are permitted to get LocalCopy is actually spread
-        // across 4 functions:
-        // * cross_crate_inlinable(def_id)
-        // * InstanceKind::requires_inline
-        // * InstanceKind::generate_cgu_internal_copy
-        // * MonoItem::instantiation_mode
-        // Since reachable_non_generics calls InstanceKind::generates_cgu_internal_copy to decide
-        // which symbols this crate exports, we are obligated to only generate LocalCopy when
-        // generates_cgu_internal_copy returns true.
-        if !instance.def.generates_cgu_internal_copy(tcx) {
+        // This is technically a heuristic even though it's in the "not a heuristic" part of
+        // instantiation mode selection.
+        // It is surely possible to untangle this; the root problem is that the way we instantiate
+        // InstanceKind other than Item is very complicated.
+        //
+        // The fallback case is to give everything else GloballyShared at OptLevel::No and
+        // LocalCopy at all other opt levels. This is a good default, except for one specific build
+        // configuration: Optimized incremental builds.
+        // In the current compiler architecture there is a fundamental tension between
+        // optimizations (which want big CGUs with as many things LocalCopy as possible) and
+        // incrementality (which wants small CGUs with as many things GloballyShared as possible).
+        // The heuristics implemented here do better than a completely naive approach in the
+        // compiler benchmark suite, but there is no reason to believe they are optimal.
+        if let InstanceKind::DropGlue(_, Some(ty)) = instance.def {
+            if tcx.sess.opts.optimize == OptLevel::No {
+                return InstantiationMode::GloballyShared { may_conflict: false };
+            }
+            if tcx.sess.opts.incremental.is_none() {
+                return InstantiationMode::LocalCopy;
+            }
+            return opt_incr_drop_glue_mode(tcx, ty);
+        }
+
+        // We need to ensure that we do not decide the InstantiationMode of an exported symbol is
+        // LocalCopy. Since exported symbols are computed based on the output of
+        // cross_crate_inlinable, we are beholden to our previous decisions.
+        //
+        // Note that just like above, this check for requires_inline is technically a heuristic
+        // even though it's in the "not a heuristic" part of instantiation mode selection.
+        if !tcx.cross_crate_inlinable(instance.def_id()) && !instance.def.requires_inline(tcx) {
             return InstantiationMode::GloballyShared { may_conflict: false };
         }
 
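
For the optimized-incremental case, `opt_incr_drop_glue_mode` (added above) reduces to a short decision table. A minimal standalone model follows, where `DropTy` is a simplified stand-in for the `ty.kind()`, `AdtDef::destructor`, and `cross_crate_inlinable` queries rather than rustc's real API:

```rust
#[derive(Debug, PartialEq)]
enum Mode {
    LocalCopy,
    GloballyShared { may_conflict: bool },
}

// Simplified classification of the type being dropped.
enum DropTy {
    NonAdt,                               // e.g. a closure whose captures need dropping
    AdtWithoutDtor { is_enum: bool },     // fields need dropping, no direct Drop impl
    AdtWithDtor { dtor_inlinable: bool }, // direct Drop impl; is Drop::drop cross-crate-inlinable?
}

fn opt_incr_drop_glue_mode(ty: DropTy) -> Mode {
    match ty {
        DropTy::NonAdt => Mode::LocalCopy,
        DropTy::AdtWithoutDtor { is_enum } => {
            if is_enum {
                // e.g. drop_in_place(Option::None) can be optimized out before crate-local ThinLTO
                Mode::LocalCopy
            } else {
                Mode::GloballyShared { may_conflict: true }
            }
        }
        DropTy::AdtWithDtor { dtor_inlinable } => {
            if dtor_inlinable {
                // keep the glue in the same mode the Drop impl itself will get
                Mode::LocalCopy
            } else {
                Mode::GloballyShared { may_conflict: true }
            }
        }
    }
}
```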


@@ -302,50 +302,6 @@ impl<'tcx> InstanceKind<'tcx> {
         )
     }
 
-    /// Returns `true` if the machine code for this instance is instantiated in
-    /// each codegen unit that references it.
-    /// Note that this is only a hint! The compiler can globally decide to *not*
-    /// do this in order to speed up compilation. CGU-internal copies are
-    /// only exist to enable inlining. If inlining is not performed (e.g. at
-    /// `-Copt-level=0`) then the time for generating them is wasted and it's
-    /// better to create a single copy with external linkage.
-    pub fn generates_cgu_internal_copy(&self, tcx: TyCtxt<'tcx>) -> bool {
-        if self.requires_inline(tcx) {
-            return true;
-        }
-        if let ty::InstanceKind::DropGlue(.., Some(ty))
-        | ty::InstanceKind::AsyncDropGlueCtorShim(.., Some(ty)) = *self
-        {
-            // Drop glue generally wants to be instantiated at every codegen
-            // unit, but without an #[inline] hint. We should make this
-            // available to normal end-users.
-            if tcx.sess.opts.incremental.is_none() {
-                return true;
-            }
-            // When compiling with incremental, we can generate a *lot* of
-            // codegen units. Including drop glue into all of them has a
-            // considerable compile time cost.
-            //
-            // We include enums without destructors to allow, say, optimizing
-            // drops of `Option::None` before LTO. We also respect the intent of
-            // `#[inline]` on `Drop::drop` implementations.
-            return ty.ty_adt_def().is_none_or(|adt_def| {
-                match *self {
-                    ty::InstanceKind::DropGlue(..) => adt_def.destructor(tcx).map(|dtor| dtor.did),
-                    ty::InstanceKind::AsyncDropGlueCtorShim(..) => {
-                        adt_def.async_destructor(tcx).map(|dtor| dtor.ctor)
-                    }
-                    _ => unreachable!(),
-                }
-                .map_or_else(|| adt_def.is_enum(), |did| tcx.cross_crate_inlinable(did))
-            });
-        }
-        if let ty::InstanceKind::ThreadLocalShim(..) = *self {
-            return false;
-        }
-        tcx.cross_crate_inlinable(self.def_id())
-    }
-
     pub fn requires_caller_location(&self, tcx: TyCtxt<'_>) -> bool {
         match *self {
             InstanceKind::Item(def_id) | InstanceKind::Virtual(def_id, _) => {