
Auto merge of #133250 - DianQK:embed-bitcode-pgo, r=nikic

The embedded bitcode should always be prepared for LTO/ThinLTO

Fixes #115344. Fixes #117220.

There are currently two methods for generating the bitcode used for LTO. One is `-C linker-plugin-lto`, which emits object files that are actually bitcode; this is the typical setting used by cargo. The other is `-C embed-bitcode=yes`.

When using `-C embed-bitcode=yes -C lto=no`, we run the complete non-LTO LLVM pipeline to obtain the bitcode, and that bitcode is later used for LTO. As a result, we run the Call Graph Profile Pass twice on the same module.
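
For concreteness, a minimal sketch of that second setup (the crate names are hypothetical; `rustc()` is the `run_make_support` test helper that the new run-make test below also uses):

```rust
// Sketch only: file names are made up, but the flags match the description above.
use run_make_support::rustc;

fn main() {
    // The complete non-LTO pipeline runs here, and a full copy of the
    // resulting bitcode is embedded in the object files.
    rustc().input("lib.rs").arg("-Cembed-bitcode=yes").arg("-Clto=no").opt().run();

    // A downstream ThinLTO build then re-optimizes that already-optimized
    // embedded bitcode, which is how the Call Graph Profile Pass ends up
    // running twice on the same module.
    rustc().input("main.rs").arg("-Clto=thin").opt().run();
}
```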

This PR does something similar to LLVM's `buildFatLTODefaultPipeline`: it obtains the bitcode for embedding after running `buildThinLTOPreLinkDefaultPipeline`.
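
Conceptually (a sketch with hypothetical stage names, not rustc's actual API), the pre-link pipeline is split so the snapshot for embedding is taken while the module is still LTO-ready:

```rust
// Hypothetical stand-ins for the LLVM pipeline stages involved.
fn run_thin_lto_prelink_pipeline() { /* buildThinLTOPreLinkDefaultPipeline */ }
fn capture_bitcode_for_embedding() { /* serialize the module to a buffer */ }
fn run_module_optimization_pipeline() { /* buildModuleOptimizationPipeline */ }

// A pre-link, non-LTO compile that must embed bitcode now does:
fn optimize_module(embed_bitcode_for_lto: bool) {
    run_thin_lto_prelink_pipeline();
    if embed_bitcode_for_lto {
        // Captured before the post-link-style optimizations, so a later
        // LTO/ThinLTO step will not repeat passes it has already run.
        capture_bitcode_for_embedding();
    }
    run_module_optimization_pipeline();
}

fn main() {
    optimize_module(true);
}
```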

r? nikic
bors 2025-03-01 08:22:18 +00:00
commit 0c72c0d11a
20 changed files with 294 additions and 101 deletions

View file

@@ -632,17 +632,16 @@ pub unsafe fn optimize_thin_module(
            Arc::new(SyncContext::new(context))
        }
    };
-    let module = ModuleCodegen {
-        module_llvm: GccContext {
+    let module = ModuleCodegen::new_regular(
+        thin_module.name().to_string(),
+        GccContext {
            context,
            should_combine_object_files,
            // TODO(antoyo): use the correct relocation model here.
            relocation_model: RelocModel::Pic,
            temp_dir: None,
        },
-        name: thin_module.name().to_string(),
-        kind: ModuleKind::Regular,
-    };
+    );
/*{
let target = &*module.module_llvm.tm;
let llmod = module.module_llvm.llmod();

View file

@@ -4,10 +4,10 @@ use std::sync::Arc;
use std::time::Instant;

use gccjit::{CType, Context, FunctionType, GlobalKind};
+use rustc_codegen_ssa::ModuleCodegen;
use rustc_codegen_ssa::base::maybe_create_entry_wrapper;
use rustc_codegen_ssa::mono_item::MonoItemExt;
use rustc_codegen_ssa::traits::DebugInfoCodegenMethods;
-use rustc_codegen_ssa::{ModuleCodegen, ModuleKind};
use rustc_middle::dep_graph;
use rustc_middle::mir::mono::Linkage;
#[cfg(feature = "master")]
@@ -237,16 +237,15 @@ pub fn compile_codegen_unit(
            }
        }

-        ModuleCodegen {
-            name: cgu_name.to_string(),
-            module_llvm: GccContext {
+        ModuleCodegen::new_regular(
+            cgu_name.to_string(),
+            GccContext {
                context: Arc::new(SyncContext::new(context)),
                relocation_model: tcx.sess.relocation_model(),
                should_combine_object_files: false,
                temp_dir: None,
            },
-            kind: ModuleKind::Regular,
-        }
+        )
}
(module, cost)

View file

@@ -393,7 +393,7 @@ impl WriteBackendMethods for GccCodegenBackend {
    unsafe fn optimize(
        _cgcx: &CodegenContext<Self>,
        _dcx: DiagCtxtHandle<'_>,
-        module: &ModuleCodegen<Self::Module>,
+        module: &mut ModuleCodegen<Self::Module>,
config: &ModuleConfig,
) -> Result<(), FatalError> {
module.module_llvm.context.set_optimization_level(to_gcc_opt_level(config.opt_level));

View file

@@ -2,6 +2,7 @@ use std::collections::BTreeMap;
use std::ffi::{CStr, CString};
use std::fs::File;
use std::path::Path;
+use std::ptr::NonNull;
use std::sync::Arc;
use std::{io, iter, slice};
@@ -305,11 +306,8 @@ fn fat_lto(
            assert!(!serialized_modules.is_empty(), "must have at least one serialized module");
            let (buffer, name) = serialized_modules.remove(0);
            info!("no in-memory regular modules to choose from, parsing {:?}", name);
-            ModuleCodegen {
-                module_llvm: ModuleLlvm::parse(cgcx, &name, buffer.data(), dcx)?,
-                name: name.into_string().unwrap(),
-                kind: ModuleKind::Regular,
-            }
+            let llvm_module = ModuleLlvm::parse(cgcx, &name, buffer.data(), dcx)?;
+            ModuleCodegen::new_regular(name.into_string().unwrap(), llvm_module)
}
};
{
@@ -655,14 +653,14 @@ pub(crate) fn run_pass_manager(
    }
    unsafe {
-        write::llvm_optimize(cgcx, dcx, module, config, opt_level, opt_stage, stage)?;
+        write::llvm_optimize(cgcx, dcx, module, None, config, opt_level, opt_stage, stage)?;
    }
    if cfg!(llvm_enzyme) && enable_ad {
        let opt_stage = llvm::OptStage::FatLTO;
        let stage = write::AutodiffStage::PostAD;
        unsafe {
-            write::llvm_optimize(cgcx, dcx, module, config, opt_level, opt_stage, stage)?;
+            write::llvm_optimize(cgcx, dcx, module, None, config, opt_level, opt_stage, stage)?;
}
// This is the final IR, so people should be able to inspect the optimized autodiff output.
@@ -729,6 +727,11 @@ impl ThinBuffer {
            ThinBuffer(buffer)
        }
    }
+
+    pub unsafe fn from_raw_ptr(ptr: *mut llvm::ThinLTOBuffer) -> ThinBuffer {
+        let mut ptr = NonNull::new(ptr).unwrap();
+        ThinBuffer(unsafe { ptr.as_mut() })
+    }
}
impl ThinBufferMethods for ThinBuffer {
@@ -772,11 +775,11 @@ pub(crate) unsafe fn optimize_thin_module(
    // crates but for locally codegened modules we may be able to reuse
    // that LLVM Context and Module.
    let module_llvm = ModuleLlvm::parse(cgcx, module_name, thin_module.data(), dcx)?;
-    let mut module = ModuleCodegen {
-        module_llvm,
-        name: thin_module.name().to_string(),
-        kind: ModuleKind::Regular,
-    };
+    let mut module = ModuleCodegen::new_regular(thin_module.name(), module_llvm);
+    // Given that the newly created module lacks a thinlto buffer for embedding, we need to re-add it here.
+    if cgcx.config(ModuleKind::Regular).embed_bitcode() {
+        module.thin_lto_buffer = Some(thin_module.data().to_vec());
+    }
{
let target = &*module.module_llvm.tm;
let llmod = module.module_llvm.llmod();

View file

@@ -1,6 +1,7 @@
use std::ffi::{CStr, CString};
use std::io::{self, Write};
use std::path::{Path, PathBuf};
+use std::ptr::null_mut;
use std::sync::Arc;
use std::{fs, slice, str};
@@ -15,7 +16,7 @@ use rustc_codegen_ssa::back::write::{
    TargetMachineFactoryFn,
};
use rustc_codegen_ssa::traits::*;
-use rustc_codegen_ssa::{CompiledModule, ModuleCodegen};
+use rustc_codegen_ssa::{CompiledModule, ModuleCodegen, ModuleKind};
use rustc_data_structures::profiling::SelfProfilerRef;
use rustc_data_structures::small_c_str::SmallCStr;
use rustc_errors::{DiagCtxtHandle, FatalError, Level};
@@ -551,6 +552,7 @@ pub(crate) unsafe fn llvm_optimize(
    cgcx: &CodegenContext<LlvmCodegenBackend>,
    dcx: DiagCtxtHandle<'_>,
    module: &ModuleCodegen<ModuleLlvm>,
+    thin_lto_buffer: Option<&mut *mut llvm::ThinLTOBuffer>,
config: &ModuleConfig,
opt_level: config::OptLevel,
opt_stage: llvm::OptStage,
@@ -584,7 +586,17 @@
        vectorize_loop = config.vectorize_loop;
    }
    trace!(?unroll_loops, ?vectorize_slp, ?vectorize_loop, ?run_enzyme);
-    let using_thin_buffers = opt_stage == llvm::OptStage::PreLinkThinLTO || config.bitcode_needed();
+    if thin_lto_buffer.is_some() {
+        assert!(
+            matches!(
+                opt_stage,
+                llvm::OptStage::PreLinkNoLTO
+                    | llvm::OptStage::PreLinkFatLTO
+                    | llvm::OptStage::PreLinkThinLTO
+            ),
+            "the bitcode for LTO can only be obtained at the pre-link stage"
+        );
+    }
let pgo_gen_path = get_pgo_gen_path(config);
let pgo_use_path = get_pgo_use_path(config);
let pgo_sample_use_path = get_pgo_sample_use_path(config);
@@ -644,7 +656,9 @@
            config.no_prepopulate_passes,
            config.verify_llvm_ir,
            config.lint_llvm_ir,
-            using_thin_buffers,
+            thin_lto_buffer,
+            config.emit_thin_lto,
+            config.emit_thin_lto_summary,
config.merge_functions,
unroll_loops,
vectorize_slp,
@@ -675,7 +689,7 @@
pub(crate) unsafe fn optimize(
    cgcx: &CodegenContext<LlvmCodegenBackend>,
    dcx: DiagCtxtHandle<'_>,
-    module: &ModuleCodegen<ModuleLlvm>,
+    module: &mut ModuleCodegen<ModuleLlvm>,
config: &ModuleConfig,
) -> Result<(), FatalError> {
let _timer = cgcx.prof.generic_activity_with_arg("LLVM_module_optimize", &*module.name);
@@ -705,9 +719,53 @@
        // Otherwise we pretend AD is already done and run the normal opt pipeline (=PostAD).
        let consider_ad = cfg!(llvm_enzyme) && config.autodiff.contains(&config::AutoDiff::Enable);
        let autodiff_stage = if consider_ad { AutodiffStage::PreAD } else { AutodiffStage::PostAD };
-        return unsafe {
-            llvm_optimize(cgcx, dcx, module, config, opt_level, opt_stage, autodiff_stage)
-        };
+        // The embedded bitcode is used to run LTO/ThinLTO.
+        // The bitcode obtained during the `codegen` phase is no longer suitable for performing LTO.
+        // It may have undergone LTO due to ThinLocal, so we need to obtain the embedded bitcode at
+        // this point.
+        let mut thin_lto_buffer = if (module.kind == ModuleKind::Regular
+            && config.emit_obj == EmitObj::ObjectCode(BitcodeSection::Full))
+            || config.emit_thin_lto_summary
+        {
+            Some(null_mut())
+        } else {
+            None
+        };
+        unsafe {
+            llvm_optimize(
+                cgcx,
+                dcx,
+                module,
+                thin_lto_buffer.as_mut(),
+                config,
+                opt_level,
+                opt_stage,
+                autodiff_stage,
+            )
+        }?;
+        if let Some(thin_lto_buffer) = thin_lto_buffer {
+            let thin_lto_buffer = unsafe { ThinBuffer::from_raw_ptr(thin_lto_buffer) };
+            module.thin_lto_buffer = Some(thin_lto_buffer.data().to_vec());
+            let bc_summary_out =
+                cgcx.output_filenames.temp_path(OutputType::ThinLinkBitcode, module_name);
+            if config.emit_thin_lto_summary
+                && let Some(thin_link_bitcode_filename) = bc_summary_out.file_name()
+            {
+                let summary_data = thin_lto_buffer.thin_link_data();
+                cgcx.prof.artifact_size(
+                    "llvm_bitcode_summary",
+                    thin_link_bitcode_filename.to_string_lossy(),
+                    summary_data.len() as u64,
+                );
+                let _timer = cgcx.prof.generic_activity_with_arg(
+                    "LLVM_module_codegen_emit_bitcode_summary",
+                    &*module.name,
+                );
+                if let Err(err) = fs::write(&bc_summary_out, summary_data) {
+                    dcx.emit_err(WriteBytecode { path: &bc_summary_out, err });
+                }
+            }
+        }
}
Ok(())
}
@@ -760,17 +818,21 @@ pub(crate) unsafe fn codegen(
        // otherwise requested.
        let bc_out = cgcx.output_filenames.temp_path(OutputType::Bitcode, module_name);
-        let bc_summary_out =
-            cgcx.output_filenames.temp_path(OutputType::ThinLinkBitcode, module_name);
        let obj_out = cgcx.output_filenames.temp_path(OutputType::Object, module_name);
-        if config.bitcode_needed() {
+        if config.emit_bc || config.emit_obj == EmitObj::Bitcode {
+            let thin = {
+                let _timer = cgcx.prof.generic_activity_with_arg(
+                    "LLVM_module_codegen_make_bitcode",
+                    &*module.name,
+                );
+                ThinBuffer::new(llmod, config.emit_thin_lto, false)
+            };
+            let data = thin.data();
            let _timer = cgcx
                .prof
-                .generic_activity_with_arg("LLVM_module_codegen_make_bitcode", &*module.name);
-            let thin = ThinBuffer::new(llmod, config.emit_thin_lto, config.emit_thin_lto_summary);
-            let data = thin.data();
+                .generic_activity_with_arg("LLVM_module_codegen_emit_bitcode", &*module.name);
if let Some(bitcode_filename) = bc_out.file_name() {
cgcx.prof.artifact_size(
"llvm_bitcode",
@@ -778,41 +840,19 @@ pub(crate) unsafe fn codegen(
                    data.len() as u64,
                );
            }
-            if config.emit_thin_lto_summary
-                && let Some(thin_link_bitcode_filename) = bc_summary_out.file_name()
-            {
-                let summary_data = thin.thin_link_data();
-                cgcx.prof.artifact_size(
-                    "llvm_bitcode_summary",
-                    thin_link_bitcode_filename.to_string_lossy(),
-                    summary_data.len() as u64,
-                );
-                let _timer = cgcx.prof.generic_activity_with_arg(
-                    "LLVM_module_codegen_emit_bitcode_summary",
-                    &*module.name,
-                );
-                if let Err(err) = fs::write(&bc_summary_out, summary_data) {
-                    dcx.emit_err(WriteBytecode { path: &bc_summary_out, err });
-                }
-            }
-            if config.emit_bc || config.emit_obj == EmitObj::Bitcode {
-                let _timer = cgcx
-                    .prof
-                    .generic_activity_with_arg("LLVM_module_codegen_emit_bitcode", &*module.name);
            if let Err(err) = fs::write(&bc_out, data) {
                dcx.emit_err(WriteBytecode { path: &bc_out, err });
            }
        }
-        if config.emit_obj == EmitObj::ObjectCode(BitcodeSection::Full) {
+        if config.embed_bitcode() && module.kind == ModuleKind::Regular {
            let _timer = cgcx
                .prof
                .generic_activity_with_arg("LLVM_module_codegen_embed_bitcode", &*module.name);
+            let thin_bc =
+                module.thin_lto_buffer.as_deref().expect("cannot find embedded bitcode");
            unsafe {
-                embed_bitcode(cgcx, llcx, llmod, &config.bc_cmdline, data);
+                embed_bitcode(cgcx, llcx, llmod, &config.bc_cmdline, &thin_bc);
            }
        }
}

View file

@@ -13,10 +13,10 @@
use std::time::Instant;

+use rustc_codegen_ssa::ModuleCodegen;
use rustc_codegen_ssa::base::maybe_create_entry_wrapper;
use rustc_codegen_ssa::mono_item::MonoItemExt;
use rustc_codegen_ssa::traits::*;
-use rustc_codegen_ssa::{ModuleCodegen, ModuleKind};
use rustc_data_structures::small_c_str::SmallCStr;
use rustc_middle::dep_graph;
use rustc_middle::middle::codegen_fn_attrs::CodegenFnAttrs;
@@ -133,11 +133,7 @@ pub(crate) fn compile_codegen_unit(
            }
        }

-        ModuleCodegen {
-            name: cgu_name.to_string(),
-            module_llvm: llvm_module,
-            kind: ModuleKind::Regular,
-        }
+        ModuleCodegen::new_regular(cgu_name.to_string(), llvm_module)
}
(module, cost)

View file

@@ -194,7 +194,7 @@ impl WriteBackendMethods for LlvmCodegenBackend {
    unsafe fn optimize(
        cgcx: &CodegenContext<Self>,
        dcx: DiagCtxtHandle<'_>,
-        module: &ModuleCodegen<Self::Module>,
+        module: &mut ModuleCodegen<Self::Module>,
config: &ModuleConfig,
) -> Result<(), FatalError> {
unsafe { back::write::optimize(cgcx, dcx, module, config) }

View file

@@ -2425,7 +2425,9 @@ unsafe extern "C" {
        NoPrepopulatePasses: bool,
        VerifyIR: bool,
        LintIR: bool,
-        UseThinLTOBuffers: bool,
+        ThinLTOBuffer: Option<&mut *mut ThinLTOBuffer>,
+        EmitThinLTO: bool,
+        EmitThinLTOSummary: bool,
MergeFunctions: bool,
UnrollLoops: bool,
SLPVectorize: bool,

View file

@@ -278,6 +278,10 @@ impl ModuleConfig {
            || self.emit_obj == EmitObj::Bitcode
            || self.emit_obj == EmitObj::ObjectCode(BitcodeSection::Full)
    }
+
+    pub fn embed_bitcode(&self) -> bool {
+        self.emit_obj == EmitObj::ObjectCode(BitcodeSection::Full)
+    }
}
/// Configuration passed to the function returned by the `target_machine_factory`.
@@ -877,14 +881,14 @@ pub(crate) fn compute_per_cgu_lto_type(
fn execute_optimize_work_item<B: ExtraBackendMethods>(
    cgcx: &CodegenContext<B>,
-    module: ModuleCodegen<B::Module>,
+    mut module: ModuleCodegen<B::Module>,
    module_config: &ModuleConfig,
) -> Result<WorkItemResult<B>, FatalError> {
    let dcx = cgcx.create_dcx();
    let dcx = dcx.handle();
    unsafe {
-        B::optimize(cgcx, dcx, &module, module_config)?;
+        B::optimize(cgcx, dcx, &mut module, module_config)?;
}
// After we've done the initial round of optimizations we need to

View file

@@ -687,7 +687,7 @@ pub fn codegen_crate<B: ExtraBackendMethods>(
        submit_codegened_module_to_llvm(
            &backend,
            &ongoing_codegen.coordinator.sender,
-            ModuleCodegen { name: llmod_id, module_llvm, kind: ModuleKind::Allocator },
+            ModuleCodegen::new_allocator(llmod_id, module_llvm),
cost,
);
}

View file

@@ -75,9 +75,29 @@ pub struct ModuleCodegen<M> {
    pub name: String,
    pub module_llvm: M,
    pub kind: ModuleKind,
+    /// Saving the ThinLTO buffer for embedding in the object file.
+    pub thin_lto_buffer: Option<Vec<u8>>,
}

impl<M> ModuleCodegen<M> {
+    pub fn new_regular(name: impl Into<String>, module: M) -> Self {
+        Self {
+            name: name.into(),
+            module_llvm: module,
+            kind: ModuleKind::Regular,
+            thin_lto_buffer: None,
+        }
+    }
+
+    pub fn new_allocator(name: impl Into<String>, module: M) -> Self {
+        Self {
+            name: name.into(),
+            module_llvm: module,
+            kind: ModuleKind::Allocator,
+            thin_lto_buffer: None,
+        }
+    }
+
pub fn into_compiled_module(
self,
emit_obj: bool,

View file

@@ -40,7 +40,7 @@ pub trait WriteBackendMethods: 'static + Sized + Clone {
    unsafe fn optimize(
        cgcx: &CodegenContext<Self>,
        dcx: DiagCtxtHandle<'_>,
-        module: &ModuleCodegen<Self::Module>,
+        module: &mut ModuleCodegen<Self::Module>,
config: &ModuleConfig,
) -> Result<(), FatalError>;
fn optimize_fat(

View file

@@ -7,6 +7,7 @@
#include "llvm/Analysis/Lint.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/Bitcode/BitcodeWriterPass.h"
#include "llvm/CodeGen/CommandFlags.h"
#include "llvm/IR/AssemblyAnnotationWriter.h"
#include "llvm/IR/AutoUpgrade.h"
@@ -37,6 +38,7 @@
#include "llvm/Transforms/Instrumentation/InstrProfiling.h"
#include "llvm/Transforms/Instrumentation/MemorySanitizer.h"
#include "llvm/Transforms/Instrumentation/ThreadSanitizer.h"
+#include "llvm/Transforms/Scalar/AnnotationRemarks.h"
#include "llvm/Transforms/Utils/CanonicalizeAliases.h"
#include "llvm/Transforms/Utils/FunctionImportUtils.h"
#include "llvm/Transforms/Utils/NameAnonGlobals.h"
@@ -195,6 +197,19 @@ extern "C" void LLVMRustTimeTraceProfilerFinish(const char *FileName) {
GEN_SUBTARGETS
#undef SUBTARGET

+// This struct and various functions are sort of a hack right now, but the
+// problem is that we've got in-memory LLVM modules after we generate and
+// optimize all codegen-units for one compilation in rustc. To be compatible
+// with the LTO support above we need to serialize the modules plus their
+// ThinLTO summary into memory.
+//
+// This structure is basically an owned version of a serialize module, with
+// a ThinLTO summary attached.
+struct LLVMRustThinLTOBuffer {
+  std::string data;
+  std::string thin_link_data;
+};
+
extern "C" bool LLVMRustHasFeature(LLVMTargetMachineRef TM,
const char *Feature) {
TargetMachine *Target = unwrap(TM);
@@ -704,7 +719,8 @@ extern "C" LLVMRustResult LLVMRustOptimize(
    LLVMModuleRef ModuleRef, LLVMTargetMachineRef TMRef,
    LLVMRustPassBuilderOptLevel OptLevelRust, LLVMRustOptStage OptStage,
    bool IsLinkerPluginLTO, bool NoPrepopulatePasses, bool VerifyIR,
-    bool LintIR, bool UseThinLTOBuffers, bool MergeFunctions, bool UnrollLoops,
+    bool LintIR, LLVMRustThinLTOBuffer **ThinLTOBufferRef, bool EmitThinLTO,
+    bool EmitThinLTOSummary, bool MergeFunctions, bool UnrollLoops,
bool SLPVectorize, bool LoopVectorize, bool DisableSimplifyLibCalls,
bool EmitLifetimeMarkers, bool RunEnzyme,
LLVMRustSanitizerOptions *SanitizerOptions, const char *PGOGenPath,
@@ -952,7 +968,10 @@ extern "C" LLVMRustResult LLVMRustOptimize(
  }
  ModulePassManager MPM;
-  bool NeedThinLTOBufferPasses = UseThinLTOBuffers;
+  bool NeedThinLTOBufferPasses = EmitThinLTO;
+  auto ThinLTOBuffer = std::make_unique<LLVMRustThinLTOBuffer>();
+  raw_string_ostream ThinLTODataOS(ThinLTOBuffer->data);
+  raw_string_ostream ThinLinkDataOS(ThinLTOBuffer->thin_link_data);
if (!NoPrepopulatePasses) {
// The pre-link pipelines don't support O0 and require using
// buildO0DefaultPipeline() instead. At the same time, the LTO pipelines do
@@ -976,7 +995,25 @@
  switch (OptStage) {
  case LLVMRustOptStage::PreLinkNoLTO:
-    MPM = PB.buildPerModuleDefaultPipeline(OptLevel);
+    if (ThinLTOBufferRef) {
+      // This is similar to LLVM's `buildFatLTODefaultPipeline`, where the
+      // bitcode for embedding is obtained after performing
+      // `ThinLTOPreLinkDefaultPipeline`.
+      MPM.addPass(PB.buildThinLTOPreLinkDefaultPipeline(OptLevel));
+      if (EmitThinLTO) {
+        MPM.addPass(ThinLTOBitcodeWriterPass(
+            ThinLTODataOS, EmitThinLTOSummary ? &ThinLinkDataOS : nullptr));
+      } else {
+        MPM.addPass(BitcodeWriterPass(ThinLTODataOS));
+      }
+      *ThinLTOBufferRef = ThinLTOBuffer.release();
+      MPM.addPass(PB.buildModuleOptimizationPipeline(
+          OptLevel, ThinOrFullLTOPhase::None));
+      MPM.addPass(
+          createModuleToFunctionPassAdaptor(AnnotationRemarksPass()));
+    } else {
+      MPM = PB.buildPerModuleDefaultPipeline(OptLevel);
+    }
break;
case LLVMRustOptStage::PreLinkThinLTO:
MPM = PB.buildThinLTOPreLinkDefaultPipeline(OptLevel);
@@ -1022,6 +1059,16 @@
    MPM.addPass(CanonicalizeAliasesPass());
    MPM.addPass(NameAnonGlobalPass());
  }
+  // For `-Copt-level=0`, ThinLTO, or LTO.
+  if (ThinLTOBufferRef && *ThinLTOBufferRef == nullptr) {
+    if (EmitThinLTO) {
+      MPM.addPass(ThinLTOBitcodeWriterPass(
+          ThinLTODataOS, EmitThinLTOSummary ? &ThinLinkDataOS : nullptr));
+    } else {
+      MPM.addPass(BitcodeWriterPass(ThinLTODataOS));
+    }
+    *ThinLTOBufferRef = ThinLTOBuffer.release();
+  }
// now load "-enzyme" pass:
#ifdef ENZYME
@@ -1500,19 +1547,6 @@ extern "C" bool LLVMRustPrepareThinLTOImport(const LLVMRustThinLTOData *Data,
  return true;
}

-// This struct and various functions are sort of a hack right now, but the
-// problem is that we've got in-memory LLVM modules after we generate and
-// optimize all codegen-units for one compilation in rustc. To be compatible
-// with the LTO support above we need to serialize the modules plus their
-// ThinLTO summary into memory.
-//
-// This structure is basically an owned version of a serialize module, with
-// a ThinLTO summary attached.
-struct LLVMRustThinLTOBuffer {
-  std::string data;
-  std::string thin_link_data;
-};
extern "C" LLVMRustThinLTOBuffer *
LLVMRustThinLTOBufferCreate(LLVMModuleRef M, bool is_thin, bool emit_summary) {
auto Ret = std::make_unique<LLVMRustThinLTOBuffer>();

View file

@@ -539,7 +539,10 @@ impl FromStr for SplitDwarfKind {
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug, PartialOrd, Ord, HashStable_Generic)]
#[derive(Encodable, Decodable)]
pub enum OutputType {
+    /// This is the optimized bitcode, which could be either pre-LTO or non-LTO bitcode,
+    /// depending on the specific request type.
    Bitcode,
+    /// This is the summary or index data part of the ThinLTO bitcode.
ThinLinkBitcode,
Assembly,
LlvmAssembly,

View file

@@ -9,14 +9,14 @@ struct S(i32);
struct SmallStruct(f32, Option<S>, &'static [f32]);

-// CHECK: @0 = private unnamed_addr constant
+// CHECK: [[const:@.*]] = private unnamed_addr constant
// CHECK-SAME: , align 8

#[no_mangle]
pub fn overaligned_constant() {
    // CHECK-LABEL: @overaligned_constant
    // CHECK: [[full:%_.*]] = alloca [32 x i8], align 8
-    // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[full]], ptr align 8 @0, i64 32, i1 false)
+    // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[full]], ptr align 8 [[const]], i64 32, i1 false)
let mut s = S(1);
s.0 = 3;

View file

@@ -11,15 +11,15 @@ pub struct PartiallyUninit {
    y: MaybeUninit<[u8; 10]>,
}

-// CHECK: [[FULLY_UNINIT:@[0-9]+]] = private unnamed_addr constant <{ [10 x i8] }> undef
+// CHECK: [[FULLY_UNINIT:@.*]] = private unnamed_addr constant <{ [10 x i8] }> undef

-// CHECK: [[PARTIALLY_UNINIT:@[0-9]+]] = private unnamed_addr constant <{ [4 x i8], [12 x i8] }> <{ [4 x i8] c"{{\\EF\\BE\\AD\\DE|\\DE\\AD\\BE\\EF}}", [12 x i8] undef }>, align 4
+// CHECK: [[PARTIALLY_UNINIT:@.*]] = private unnamed_addr constant <{ [4 x i8], [12 x i8] }> <{ [4 x i8] c"{{\\EF\\BE\\AD\\DE|\\DE\\AD\\BE\\EF}}", [12 x i8] undef }>, align 4

// This shouldn't contain undef, since it contains more chunks
// than the default value of uninit_const_chunk_threshold.
-// CHECK: [[UNINIT_PADDING_HUGE:@[0-9]+]] = private unnamed_addr constant <{ [32768 x i8] }> <{ [32768 x i8] c"{{.+}}" }>, align 4
+// CHECK: [[UNINIT_PADDING_HUGE:@.*]] = private unnamed_addr constant <{ [32768 x i8] }> <{ [32768 x i8] c"{{.+}}" }>, align 4

-// CHECK: [[FULLY_UNINIT_HUGE:@[0-9]+]] = private unnamed_addr constant <{ [16384 x i8] }> undef
+// CHECK: [[FULLY_UNINIT_HUGE:@.*]] = private unnamed_addr constant <{ [16384 x i8] }> undef
// CHECK-LABEL: @fully_uninit
#[no_mangle]

View file

@@ -0,0 +1,16 @@
#![crate_name = "interesting"]
#![crate_type = "rlib"]
extern crate opaque;
#[no_mangle]
#[inline(never)]
pub fn function_called_once() {
opaque::foo();
}
// CHECK-LABEL: @function_called_once
// CHECK-SAME: !prof [[function_called_once_id:![0-9]+]] {
// CHECK: "CG Profile"
// CHECK-NOT: "CG Profile"
// CHECK-DAG: [[function_called_once_id]] = !{!"function_entry_count", i64 1}

View file

@@ -0,0 +1,5 @@
extern crate interesting;
fn main() {
interesting::function_called_once();
}

View file

@@ -0,0 +1,5 @@
#![crate_name = "opaque"]
#![crate_type = "rlib"]
#[inline(never)]
pub fn foo() {}

View file

@@ -0,0 +1,67 @@
// This test case verifies that we successfully complete an LTO build with PGO
// using the embedded bitcode.
// It also ensures that the generated IR correctly includes the call results.
//@ needs-profiler-runtime
//@ ignore-cross-compile
use std::path::Path;
use run_make_support::{
has_extension, has_prefix, llvm_filecheck, llvm_profdata, rfs, run, rustc, shallow_find_files,
};
fn run_test(cg_units: usize) {
let path_prof_data_dir = Path::new("prof_data_dir");
if path_prof_data_dir.exists() {
rfs::remove_dir_all(path_prof_data_dir);
}
rfs::create_dir_all(&path_prof_data_dir);
let path_merged_profdata = path_prof_data_dir.join("merged.profdata");
rustc().input("opaque.rs").codegen_units(1).run();
rustc()
.input("interesting.rs")
.profile_generate(&path_prof_data_dir)
.opt()
.crate_type("lib,cdylib")
.codegen_units(cg_units)
.run();
rustc()
.input("main.rs")
.arg("-Clto=thin")
.opt()
.codegen_units(cg_units)
.profile_generate(&path_prof_data_dir)
.opt()
.run();
run("main");
llvm_profdata().merge().output(&path_merged_profdata).input(path_prof_data_dir).run();
rustc()
.input("interesting.rs")
.profile_use(&path_merged_profdata)
.opt()
.crate_type("lib,cdylib")
.codegen_units(cg_units)
.emit("link")
.run();
rustc()
.input("main.rs")
.arg("-Clto=thin")
.opt()
.codegen_units(cg_units)
.profile_use(&path_merged_profdata)
.emit("llvm-ir,link")
.opt()
.run();
let files = shallow_find_files(".", |path| {
has_prefix(path, "main.interesting.interesting") && has_extension(path, "ll")
});
assert_eq!(files.len(), 1);
let llvm_ir = &files[0];
llvm_filecheck().patterns("interesting.rs").stdin_buf(rfs::read(llvm_ir)).run();
}
fn main() {
run_test(1);
run_test(16);
}