Merge commit '3e50cf6502' into sync_cg_clif-2024-01-26

commit 37018026f0

24 changed files with 434 additions and 323 deletions
@@ -13,17 +13,14 @@ use gimli::write::{
 };
 use gimli::{Encoding, Format, LineEncoding, RunTimeEndian};
 use indexmap::IndexSet;
+use rustc_session::Session;
 
 pub(crate) use self::emit::{DebugReloc, DebugRelocName};
 pub(crate) use self::unwind::UnwindContext;
 use crate::prelude::*;
 
-pub(crate) fn producer() -> String {
-    format!(
-        "rustc version {} with cranelift {}",
-        rustc_interface::util::rustc_version_str().unwrap_or("unknown version"),
-        cranelift_codegen::VERSION,
-    )
+pub(crate) fn producer(sess: &Session) -> String {
+    format!("rustc version {} with cranelift {}", sess.cfg_version, cranelift_codegen::VERSION)
 }
 
 pub(crate) struct DebugContext {
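
Note: the new producer(sess) reads the version string from Session::cfg_version instead of calling into rustc_interface, which lets a later hunk drop the extern crate rustc_interface declaration. A minimal sketch of the string being assembled, with illustrative version values:

// Sketch of the DWARF producer string format used above; version values are illustrative.
fn producer(rustc_version: &str, cranelift_version: &str) -> String {
    format!("rustc version {} with cranelift {}", rustc_version, cranelift_version)
}

fn main() {
    // Prints e.g. "rustc version 1.77.0-nightly with cranelift 0.104"
    println!("{}", producer("1.77.0-nightly", "0.104"));
}
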
@@ -67,7 +64,7 @@ impl DebugContext {
 
         let should_remap_filepaths = tcx.sess.should_prefer_remapped_for_codegen();
 
-        let producer = producer();
+        let producer = producer(tcx.sess);
         let comp_dir = tcx
             .sess
             .opts
@@ -143,6 +143,7 @@ fn emit_cgu(
     debug: Option<DebugContext>,
     unwind_context: UnwindContext,
     global_asm_object_file: Option<PathBuf>,
+    producer: &str,
 ) -> Result<ModuleCodegenResult, String> {
     let mut product = module.finish();
 
@@ -152,8 +153,14 @@ fn emit_cgu(
 
     unwind_context.emit(&mut product);
 
-    let module_regular =
-        emit_module(output_filenames, prof, product.object, ModuleKind::Regular, name.clone())?;
+    let module_regular = emit_module(
+        output_filenames,
+        prof,
+        product.object,
+        ModuleKind::Regular,
+        name.clone(),
+        producer,
+    )?;
 
     Ok(ModuleCodegenResult {
         module_regular,
@@ -174,6 +181,7 @@ fn emit_module(
     mut object: cranelift_object::object::write::Object<'_>,
     kind: ModuleKind,
     name: String,
+    producer_str: &str,
 ) -> Result<CompiledModule, String> {
     if object.format() == cranelift_object::object::BinaryFormat::Elf {
         let comment_section = object.add_section(
@@ -182,7 +190,7 @@ fn emit_module(
             cranelift_object::object::SectionKind::OtherString,
         );
         let mut producer = vec![0];
-        producer.extend(crate::debuginfo::producer().as_bytes());
+        producer.extend(producer_str.as_bytes());
         producer.push(0);
         object.set_section_data(comment_section, producer, 1);
     }
@@ -321,6 +329,8 @@ fn module_codegen(
         (cgu_name, cx, module, codegened_functions)
     });
 
+    let producer = crate::debuginfo::producer(tcx.sess);
+
     OngoingModuleCodegen::Async(std::thread::spawn(move || {
         cx.profiler.clone().generic_activity_with_arg("compile functions", &*cgu_name).run(|| {
             cranelift_codegen::timing::set_thread_profiler(Box::new(super::MeasuremeProfiler(
@@ -348,6 +358,7 @@ fn module_codegen(
                 cx.debug_context,
                 cx.unwind_context,
                 global_asm_object_file,
+                &producer,
             )
         });
         std::mem::drop(token);
@@ -453,6 +464,7 @@ pub(crate) fn run_aot(
                 product.object,
                 ModuleKind::Allocator,
                 "allocator_shim".to_owned(),
+                &crate::debuginfo::producer(tcx.sess),
             ) {
                 Ok(allocator_module) => Some(allocator_module),
                 Err(err) => tcx.dcx().fatal(err),
@@ -467,7 +479,7 @@ pub(crate) fn run_aot(
 
         let cgu_name_builder = &mut CodegenUnitNameBuilder::new(tcx);
         let metadata_cgu_name = cgu_name_builder
-            .build_cgu_name(LOCAL_CRATE, &["crate"], Some("metadata"))
+            .build_cgu_name(LOCAL_CRATE, ["crate"], Some("metadata"))
             .as_str()
             .to_string();
 
@@ -321,10 +321,9 @@ fn dep_symbol_lookup_fn(
             Linkage::NotLinked | Linkage::IncludedFromDylib => {}
             Linkage::Static => {
                 let name = crate_info.crate_name[&cnum];
-                sess.dcx()
-                    .struct_err(format!("Can't load static lib {}", name))
-                    .note("rustc_codegen_cranelift can only load dylibs in JIT mode.")
-                    .emit();
+                let mut diag = sess.dcx().struct_err(format!("Can't load static lib {}", name));
+                diag.note("rustc_codegen_cranelift can only load dylibs in JIT mode.");
+                diag.emit();
             }
             Linkage::Dynamic => {
                 dylib_paths.push(src.dylib.as_ref().unwrap().0.clone());
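
Note: this hunk tracks a rustc diagnostics API change in which note() no longer returns the builder, so the old method chain has to become bind-mutate-emit. A standalone sketch of the same shape, using a hypothetical builder type (not rustc's real DiagnosticBuilder):

// Hypothetical stand-in for rustc's diagnostic builder, showing why chaining breaks.
struct DiagBuilder { msg: String, notes: Vec<String> }

impl DiagBuilder {
    fn new(msg: impl Into<String>) -> Self {
        DiagBuilder { msg: msg.into(), notes: Vec::new() }
    }
    // Takes &mut self and returns (), so .note(..).emit() can no longer be chained.
    fn note(&mut self, note: impl Into<String>) {
        self.notes.push(note.into());
    }
    fn emit(self) {
        eprintln!("error: {}", self.msg);
        for note in self.notes {
            eprintln!("note: {}", note);
        }
    }
}

fn main() {
    // After the change: bind the builder, mutate it, then emit.
    let mut diag = DiagBuilder::new("Can't load static lib example");
    diag.note("rustc_codegen_cranelift can only load dylibs in JIT mode.");
    diag.emit();
}
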
@@ -52,7 +52,7 @@ pub(crate) fn codegen_inline_asm_terminator<'tcx>(
     }
 
     let operands = operands
-        .into_iter()
+        .iter()
        .map(|operand| match *operand {
            InlineAsmOperand::In { reg, ref value } => CInlineAsmOperand::In {
                reg,
@@ -506,10 +506,34 @@ impl<'tcx> InlineAssemblyGenerator<'_, 'tcx> {
                         if self.options.contains(InlineAsmOptions::ATT_SYNTAX) {
                             generated_asm.push('%');
                         }
-                        self.registers[*operand_idx]
-                            .unwrap()
-                            .emit(&mut generated_asm, self.arch, *modifier)
-                            .unwrap();
+
+                        let reg = self.registers[*operand_idx].unwrap();
+                        match self.arch {
+                            InlineAsmArch::X86_64 => match reg {
+                                InlineAsmReg::X86(reg)
+                                    if reg as u32 >= X86InlineAsmReg::xmm0 as u32
+                                        && reg as u32 <= X86InlineAsmReg::xmm15 as u32 =>
+                                {
+                                    // rustc emits x0 rather than xmm0
+                                    let class = match *modifier {
+                                        None | Some('x') => "xmm",
+                                        Some('y') => "ymm",
+                                        Some('z') => "zmm",
+                                        _ => unreachable!(),
+                                    };
+                                    write!(
+                                        generated_asm,
+                                        "{class}{}",
+                                        reg as u32 - X86InlineAsmReg::xmm0 as u32
+                                    )
+                                    .unwrap();
+                                }
+                                _ => reg
+                                    .emit(&mut generated_asm, InlineAsmArch::X86_64, *modifier)
+                                    .unwrap(),
+                            },
+                            _ => reg.emit(&mut generated_asm, self.arch, *modifier).unwrap(),
+                        }
                     }
                     CInlineAsmOperand::Const { ref value } => {
                         generated_asm.push_str(value);
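
Note: rustc names SIMD registers generically (x0..x15), while the emitted assembly needs a width-specific name, so the new arm picks the register class from the template modifier. A minimal sketch of that mapping, independent of the compiler's types (the function name is an illustration, not an API):

// Sketch: map a generic SIMD register index plus template modifier to an x86 name.
fn x86_simd_reg_name(index: u32, modifier: Option<char>) -> String {
    let class = match modifier {
        None | Some('x') => "xmm", // 128-bit
        Some('y') => "ymm",        // 256-bit
        Some('z') => "zmm",        // 512-bit
        _ => unreachable!("unsupported modifier"),
    };
    format!("{class}{index}")
}

fn main() {
    assert_eq!(x86_simd_reg_name(0, None), "xmm0");
    assert_eq!(x86_simd_reg_name(7, Some('y')), "ymm7");
    assert_eq!(x86_simd_reg_name(15, Some('z')), "zmm15");
}
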
@@ -739,7 +763,7 @@ fn call_inline_asm<'tcx>(
         },
     )
     .unwrap();
-    let inline_asm_func = fx.module.declare_func_in_func(inline_asm_func, &mut fx.bcx.func);
+    let inline_asm_func = fx.module.declare_func_in_func(inline_asm_func, fx.bcx.func);
    if fx.clif_comments.enabled() {
        fx.add_comment(inline_asm_func, asm_name);
    }
@@ -35,6 +35,10 @@ pub(crate) fn codegen_llvm_intrinsic_call<'tcx>(
     }
 
     match intrinsic {
+        "llvm.prefetch" => {
+            // Nothing to do. This is merely a perf hint.
+        }
+
         _ if intrinsic.starts_with("llvm.ctlz.v") => {
             intrinsic_args!(fx, args => (a); intrinsic);
 
@@ -243,6 +243,20 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>(
         }
 
         // FIXME generalize vector types
+        "llvm.aarch64.neon.tbl1.v8i8" => {
+            intrinsic_args!(fx, args => (t, idx); intrinsic);
+
+            let zero = fx.bcx.ins().iconst(types::I8, 0);
+            for i in 0..8 {
+                let idx_lane = idx.value_lane(fx, i).load_scalar(fx);
+                let is_zero =
+                    fx.bcx.ins().icmp_imm(IntCC::UnsignedGreaterThanOrEqual, idx_lane, 16);
+                let t_idx = fx.bcx.ins().uextend(fx.pointer_type, idx_lane);
+                let t_lane = t.value_lane_dyn(fx, t_idx).load_scalar(fx);
+                let res = fx.bcx.ins().select(is_zero, zero, t_lane);
+                ret.place_lane(fx, i).to_ptr().store(fx, res, MemFlags::trusted());
+            }
+        }
         "llvm.aarch64.neon.tbl1.v16i8" => {
            intrinsic_args!(fx, args => (t, idx); intrinsic);
 
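
Note: NEON tbl1 is a byte table lookup where every result lane selects one byte from the 16-byte table and any out-of-range index yields zero; that is exactly what the icmp_imm/select pair above implements lane by lane. A scalar sketch of the same semantics:

// Sketch: scalar semantics of AArch64 NEON tbl1 for an 8-lane result.
fn tbl1_v8i8(table: [u8; 16], idx: [u8; 8]) -> [u8; 8] {
    let mut out = [0u8; 8];
    for i in 0..8 {
        // Out-of-range indices (>= 16) produce 0, matching the select on is_zero.
        out[i] = if idx[i] < 16 { table[idx[i] as usize] } else { 0 };
    }
    out
}

fn main() {
    let table: [u8; 16] = core::array::from_fn(|i| (i * 10) as u8);
    assert_eq!(tbl1_v8i8(table, [0, 1, 2, 3, 15, 16, 200, 7]), [0, 10, 20, 30, 150, 0, 0, 70]);
}
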
@@ -610,230 +610,56 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16&ig_expand=4903
             intrinsic_args!(fx, args => (a, b); intrinsic);
 
-            assert_eq!(a.layout(), b.layout());
-            let layout = a.layout();
-
-            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
-            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
-            assert_eq!(lane_ty, fx.tcx.types.i16);
-            assert_eq!(ret_lane_ty, fx.tcx.types.u8);
-            assert_eq!(lane_count * 2, ret_lane_count);
-
-            let zero = fx.bcx.ins().iconst(types::I16, 0);
-            let max_u8 = fx.bcx.ins().iconst(types::I16, 255);
-            let ret_lane_layout = fx.layout_of(fx.tcx.types.u8);
-
-            for idx in 0..lane_count {
-                let lane = a.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, zero);
-                let sat = fx.bcx.ins().umin(sat, max_u8);
-                let res = fx.bcx.ins().ireduce(types::I8, sat);
-
-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
-            }
-
-            for idx in 0..lane_count {
-                let lane = b.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, zero);
-                let sat = fx.bcx.ins().umin(sat, max_u8);
-                let res = fx.bcx.ins().ireduce(types::I8, sat);
-
-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane);
-            }
-        }
-
-        "llvm.x86.avx2.packuswb" => {
-            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16&ig_expand=4906
-            intrinsic_args!(fx, args => (a, b); intrinsic);
-
-            assert_eq!(a.layout(), b.layout());
-            let layout = a.layout();
-
-            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
-            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
-            assert_eq!(lane_ty, fx.tcx.types.i16);
-            assert_eq!(ret_lane_ty, fx.tcx.types.u8);
-            assert_eq!(lane_count * 2, ret_lane_count);
-
-            let zero = fx.bcx.ins().iconst(types::I16, 0);
-            let max_u8 = fx.bcx.ins().iconst(types::I16, 255);
-            let ret_lane_layout = fx.layout_of(fx.tcx.types.u8);
-
-            for idx in 0..lane_count / 2 {
-                let lane = a.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, zero);
-                let sat = fx.bcx.ins().umin(sat, max_u8);
-                let res = fx.bcx.ins().ireduce(types::I8, sat);
-
-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
-            }
-
-            for idx in 0..lane_count / 2 {
-                let lane = b.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, zero);
-                let sat = fx.bcx.ins().umin(sat, max_u8);
-                let res = fx.bcx.ins().ireduce(types::I8, sat);
-
-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, lane_count / 2 + idx).write_cvalue(fx, res_lane);
-            }
-
-            for idx in 0..lane_count / 2 {
-                let lane = a.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, zero);
-                let sat = fx.bcx.ins().umin(sat, max_u8);
-                let res = fx.bcx.ins().ireduce(types::I8, sat);
-
-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, lane_count / 2 * 2 + idx).write_cvalue(fx, res_lane);
-            }
-
-            for idx in 0..lane_count / 2 {
-                let lane = b.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, zero);
-                let sat = fx.bcx.ins().umin(sat, max_u8);
-                let res = fx.bcx.ins().ireduce(types::I8, sat);
-
-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, lane_count / 2 * 3 + idx).write_cvalue(fx, res_lane);
-            }
-        }
-
-        "llvm.x86.sse2.packssdw.128" => {
-            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32&ig_expand=4889
-            intrinsic_args!(fx, args => (a, b); intrinsic);
-
-            assert_eq!(a.layout(), b.layout());
-            let layout = a.layout();
-
-            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
-            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
-            assert_eq!(lane_ty, fx.tcx.types.i32);
-            assert_eq!(ret_lane_ty, fx.tcx.types.i16);
-            assert_eq!(lane_count * 2, ret_lane_count);
-
-            let min_i16 = fx.bcx.ins().iconst(types::I32, i32::from(i16::MIN) as u32 as i64);
-            let max_i16 = fx.bcx.ins().iconst(types::I32, i32::from(i16::MAX) as u32 as i64);
-            let ret_lane_layout = fx.layout_of(fx.tcx.types.i16);
-
-            for idx in 0..lane_count {
-                let lane = a.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, min_i16);
-                let sat = fx.bcx.ins().smin(sat, max_i16);
-                let res = fx.bcx.ins().ireduce(types::I16, sat);
-
-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
-            }
-
-            for idx in 0..lane_count {
-                let lane = b.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, min_i16);
-                let sat = fx.bcx.ins().smin(sat, max_i16);
-                let res = fx.bcx.ins().ireduce(types::I16, sat);
-
-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane);
-            }
-        }
-
-        "llvm.x86.sse41.packusdw" => {
-            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32&ig_expand=4912
-            intrinsic_args!(fx, args => (a, b); intrinsic);
-
-            assert_eq!(a.layout(), b.layout());
-            let layout = a.layout();
-
-            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
-            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
-            assert_eq!(lane_ty, fx.tcx.types.i32);
-            assert_eq!(ret_lane_ty, fx.tcx.types.u16);
-            assert_eq!(lane_count * 2, ret_lane_count);
-
-            let min_u16 = fx.bcx.ins().iconst(types::I32, i64::from(u16::MIN));
-            let max_u16 = fx.bcx.ins().iconst(types::I32, i64::from(u16::MAX));
-            let ret_lane_layout = fx.layout_of(fx.tcx.types.u16);
-
-            for idx in 0..lane_count {
-                let lane = a.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, min_u16);
-                let sat = fx.bcx.ins().smin(sat, max_u16);
-                let res = fx.bcx.ins().ireduce(types::I16, sat);
-
-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
-            }
-
-            for idx in 0..lane_count {
-                let lane = b.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, min_u16);
-                let sat = fx.bcx.ins().smin(sat, max_u16);
-                let res = fx.bcx.ins().ireduce(types::I16, sat);
-
-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane);
-            }
-        }
-
-        "llvm.x86.avx2.packssdw" => {
-            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32&ig_expand=4892
-            intrinsic_args!(fx, args => (a, b); intrinsic);
-
-            assert_eq!(a.layout(), b.layout());
-            let layout = a.layout();
-
-            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
-            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
-            assert_eq!(lane_ty, fx.tcx.types.i32);
-            assert_eq!(ret_lane_ty, fx.tcx.types.i16);
-            assert_eq!(lane_count * 2, ret_lane_count);
-
-            let min_i16 = fx.bcx.ins().iconst(types::I32, i32::from(i16::MIN) as u32 as i64);
-            let max_i16 = fx.bcx.ins().iconst(types::I32, i32::from(i16::MAX) as u32 as i64);
-            let ret_lane_layout = fx.layout_of(fx.tcx.types.i16);
-
-            for idx in 0..lane_count / 2 {
-                let lane = a.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, min_i16);
-                let sat = fx.bcx.ins().smin(sat, max_i16);
-                let res = fx.bcx.ins().ireduce(types::I16, sat);
-
-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
-            }
-
-            for idx in 0..lane_count / 2 {
-                let lane = b.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, min_i16);
-                let sat = fx.bcx.ins().smin(sat, max_i16);
-                let res = fx.bcx.ins().ireduce(types::I16, sat);
-
-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, lane_count / 2 + idx).write_cvalue(fx, res_lane);
-            }
-
-            for idx in 0..lane_count / 2 {
-                let lane = a.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, min_i16);
-                let sat = fx.bcx.ins().smin(sat, max_i16);
-                let res = fx.bcx.ins().ireduce(types::I16, sat);
-
-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, lane_count / 2 * 2 + idx).write_cvalue(fx, res_lane);
-            }
-
-            for idx in 0..lane_count / 2 {
-                let lane = b.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, min_i16);
-                let sat = fx.bcx.ins().smin(sat, max_i16);
-                let res = fx.bcx.ins().ireduce(types::I16, sat);
-
-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, lane_count / 2 * 3 + idx).write_cvalue(fx, res_lane);
-            }
-        }
+            pack_instruction(fx, a, b, ret, PackSize::U8, PackWidth::Sse);
+        }
+
+        "llvm.x86.sse2.packsswb.128" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16&ig_expand=4848
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            pack_instruction(fx, a, b, ret, PackSize::S8, PackWidth::Sse);
+        }
+
+        "llvm.x86.avx2.packuswb" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16&ig_expand=4906
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            pack_instruction(fx, a, b, ret, PackSize::U8, PackWidth::Avx);
+        }
+
+        "llvm.x86.avx2.packsswb" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi16&ig_expand=4851
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            pack_instruction(fx, a, b, ret, PackSize::S8, PackWidth::Avx);
+        }
+
+        "llvm.x86.sse41.packusdw" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32&ig_expand=4912
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            pack_instruction(fx, a, b, ret, PackSize::U16, PackWidth::Sse);
+        }
+
+        "llvm.x86.sse2.packssdw.128" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32&ig_expand=4889
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            pack_instruction(fx, a, b, ret, PackSize::S16, PackWidth::Sse);
+        }
+
+        "llvm.x86.avx2.packusdw" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi32&ig_expand=4883
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            pack_instruction(fx, a, b, ret, PackSize::U16, PackWidth::Avx);
+        }
+
+        "llvm.x86.avx2.packssdw" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32&ig_expand=4892
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            pack_instruction(fx, a, b, ret, PackSize::S16, PackWidth::Avx);
+        }
 
         "llvm.x86.fma.vfmaddsub.ps"
@@ -1407,3 +1233,115 @@ fn llvm_add_sub<'tcx>(
 
     (cb_out, c)
 }
+
+enum PackSize {
+    U8,
+    U16,
+    S8,
+    S16,
+}
+
+impl PackSize {
+    fn ret_clif_type(&self) -> Type {
+        match self {
+            Self::U8 | Self::S8 => types::I8,
+            Self::U16 | Self::S16 => types::I16,
+        }
+    }
+    fn src_clif_type(&self) -> Type {
+        match self {
+            Self::U8 | Self::S8 => types::I16,
+            Self::U16 | Self::S16 => types::I32,
+        }
+    }
+    fn src_ty<'tcx>(&self, tcx: TyCtxt<'tcx>) -> Ty<'tcx> {
+        match self {
+            Self::U8 | Self::S8 => tcx.types.i16,
+            Self::U16 | Self::S16 => tcx.types.i32,
+        }
+    }
+    fn ret_ty<'tcx>(&self, tcx: TyCtxt<'tcx>) -> Ty<'tcx> {
+        match self {
+            Self::U8 => tcx.types.u8,
+            Self::S8 => tcx.types.i8,
+            Self::U16 => tcx.types.u16,
+            Self::S16 => tcx.types.i16,
+        }
+    }
+    fn max(&self) -> i64 {
+        match self {
+            Self::U8 => u8::MAX as u64 as i64,
+            Self::S8 => i8::MAX as u8 as u64 as i64,
+            Self::U16 => u16::MAX as u64 as i64,
+            Self::S16 => i16::MAX as u16 as u64 as i64,
+        }
+    }
+    fn min(&self) -> i64 {
+        match self {
+            Self::U8 | Self::U16 => 0,
+            Self::S8 => i16::from(i8::MIN) as u16 as i64,
+            Self::S16 => i32::from(i16::MIN) as u32 as i64,
+        }
+    }
+}
+
+enum PackWidth {
+    Sse = 1,
+    Avx = 2,
+}
+impl PackWidth {
+    fn divisor(&self) -> u64 {
+        match self {
+            Self::Sse => 1,
+            Self::Avx => 2,
+        }
+    }
+}
+
+/// Implement an x86 pack instruction with the intrinsic `_mm{,256}pack{us,s}_epi{16,32}`.
+/// Validated for correctness against LLVM, see commit `c8f5d35508e062bd2d95e6c03429bfec831db6d3`.
+fn pack_instruction<'tcx>(
+    fx: &mut FunctionCx<'_, '_, 'tcx>,
+    a: CValue<'tcx>,
+    b: CValue<'tcx>,
+    ret: CPlace<'tcx>,
+    ret_size: PackSize,
+    width: PackWidth,
+) {
+    assert_eq!(a.layout(), b.layout());
+    let layout = a.layout();
+
+    let (src_lane_count, src_lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+    let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+    assert_eq!(src_lane_ty, ret_size.src_ty(fx.tcx));
+    assert_eq!(ret_lane_ty, ret_size.ret_ty(fx.tcx));
+    assert_eq!(src_lane_count * 2, ret_lane_count);
+
+    let min = fx.bcx.ins().iconst(ret_size.src_clif_type(), ret_size.min());
+    let max = fx.bcx.ins().iconst(ret_size.src_clif_type(), ret_size.max());
+    let ret_lane_layout = fx.layout_of(ret_size.ret_ty(fx.tcx));
+
+    let mut round = |source: CValue<'tcx>, source_offset: u64, dest_offset: u64| {
+        let step_amount = src_lane_count / width.divisor();
+        let dest_offset = step_amount * dest_offset;
+        for idx in 0..step_amount {
+            let lane = source.value_lane(fx, step_amount * source_offset + idx).load_scalar(fx);
+            let sat = fx.bcx.ins().smax(lane, min);
+            let sat = match ret_size {
+                PackSize::U8 | PackSize::U16 => fx.bcx.ins().umin(sat, max),
+                PackSize::S8 | PackSize::S16 => fx.bcx.ins().smin(sat, max),
+            };
+            let res = fx.bcx.ins().ireduce(ret_size.ret_clif_type(), sat);
+            let res_lane = CValue::by_val(res, ret_lane_layout);
+            ret.place_lane(fx, dest_offset + idx).write_cvalue(fx, res_lane);
+        }
+    };
+
+    round(a, 0, 0);
+    round(b, 0, 1);
+
+    if let PackWidth::Avx = width {
+        round(a, 1, 2);
+        round(b, 1, 3);
+    }
+}
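
Note: the pack instructions saturate each source lane into the narrower return type and write a's lanes before b's; on AVX this happens independently per 128-bit half, which is what the extra round(a, 1, 2)/round(b, 1, 3) calls handle. A scalar sketch of the SSE packuswb case (i16 to u8 with unsigned saturation), the shape the round closure generalizes:

// Sketch: scalar semantics of SSE packuswb (_mm_packus_epi16): two i16x8
// inputs are clamped to 0..=255 and concatenated into one u8x16 result.
fn packuswb(a: [i16; 8], b: [i16; 8]) -> [u8; 16] {
    let saturate = |x: i16| -> u8 { x.clamp(0, 255) as u8 };
    let mut out = [0u8; 16];
    for (i, &lane) in a.iter().enumerate() {
        out[i] = saturate(lane); // a fills lanes 0..8, like round(a, 0, 0)
    }
    for (i, &lane) in b.iter().enumerate() {
        out[8 + i] = saturate(lane); // b fills lanes 8..16, like round(b, 0, 1)
    }
    out
}

fn main() {
    let a = [-5, 0, 127, 255, 256, 1000, 42, -32768];
    let b = [1; 8];
    let packed = packuswb(a, b);
    assert_eq!(&packed[..8], &[0, 0, 127, 255, 255, 255, 42, 0]);
    assert_eq!(&packed[8..], &[1; 8]);
}
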
@@ -293,7 +293,7 @@ pub(super) fn codegen_simd_intrinsic_call<'tcx>(
             }
 
             ret.write_cvalue(fx, base);
-            let ret_lane = ret.place_lane(fx, idx.try_into().unwrap());
+            let ret_lane = ret.place_lane(fx, idx.into());
             ret_lane.write_cvalue(fx, val);
         }
 
@@ -340,7 +340,7 @@ pub(super) fn codegen_simd_intrinsic_call<'tcx>(
                 );
             }
 
-            let ret_lane = v.value_lane(fx, idx.try_into().unwrap());
+            let ret_lane = v.value_lane(fx, idx.into());
             ret.write_cvalue(fx, ret_lane);
         }
 
@@ -822,7 +822,35 @@ pub(super) fn codegen_simd_intrinsic_call<'tcx>(
             let (lane_count, lane_ty) = a.layout().ty.simd_size_and_type(fx.tcx);
             let lane_layout = fx.layout_of(lane_ty);
 
-            let m = m.load_scalar(fx);
+            let expected_int_bits = lane_count.max(8);
+            let expected_bytes = expected_int_bits / 8 + ((expected_int_bits % 8 > 0) as u64);
+
+            let m = match m.layout().ty.kind() {
+                ty::Uint(i) if i.bit_width() == Some(expected_int_bits) => m.load_scalar(fx),
+                ty::Array(elem, len)
+                    if matches!(elem.kind(), ty::Uint(ty::UintTy::U8))
+                        && len.try_eval_target_usize(fx.tcx, ty::ParamEnv::reveal_all())
+                            == Some(expected_bytes) =>
+                {
+                    m.force_stack(fx).0.load(
+                        fx,
+                        Type::int(expected_int_bits as u16).unwrap(),
+                        MemFlags::trusted(),
+                    )
+                }
+                _ => {
+                    fx.tcx.dcx().span_fatal(
+                        span,
+                        format!(
+                            "invalid monomorphization of `simd_select_bitmask` intrinsic: \
+                             cannot accept `{}` as mask, expected `u{}` or `[u8; {}]`",
+                            ret.layout().ty,
+                            expected_int_bits,
+                            expected_bytes
+                        ),
+                    );
+                }
+            };
 
             for lane in 0..lane_count {
                 let m_lane = fx.bcx.ins().ushr_imm(m, u64::from(lane) as i64);
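
Note: the mask for simd_select_bitmask may now arrive either as an unsigned integer of at least lane_count bits or as a byte array of the matching size. A tiny sketch of the width arithmetic used above:

// Sketch: the mask-width arithmetic used for simd_select_bitmask.
fn expected_mask_widths(lane_count: u64) -> (u64, u64) {
    let expected_int_bits = lane_count.max(8);
    let expected_bytes = expected_int_bits / 8 + ((expected_int_bits % 8 > 0) as u64);
    (expected_int_bits, expected_bytes)
}

fn main() {
    assert_eq!(expected_mask_widths(4), (8, 1));   // accepts u8 or [u8; 1]
    assert_eq!(expected_mask_widths(16), (16, 2)); // accepts u16 or [u8; 2]
    assert_eq!(expected_mask_widths(32), (32, 4)); // accepts u32 or [u8; 4]
}
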
@@ -18,7 +18,6 @@ extern crate rustc_fs_util;
 extern crate rustc_hir;
 extern crate rustc_incremental;
 extern crate rustc_index;
-extern crate rustc_interface;
 extern crate rustc_metadata;
 extern crate rustc_session;
 extern crate rustc_span;
@@ -42,7 +41,7 @@ use rustc_metadata::EncodedMetadata;
 use rustc_middle::dep_graph::{WorkProduct, WorkProductId};
 use rustc_session::config::OutputFilenames;
 use rustc_session::Session;
-use rustc_span::Symbol;
+use rustc_span::{sym, Symbol};
 
 pub use crate::config::*;
 use crate::prelude::*;
@@ -190,8 +189,17 @@ impl CodegenBackend for CraneliftCodegenBackend {
         }
     }
 
-    fn target_features(&self, _sess: &Session, _allow_unstable: bool) -> Vec<rustc_span::Symbol> {
-        vec![] // FIXME necessary for #[cfg(target_feature]
+    fn target_features(&self, sess: &Session, _allow_unstable: bool) -> Vec<rustc_span::Symbol> {
+        // FIXME return the actually used target features. this is necessary for #[cfg(target_feature)]
+        if sess.target.arch == "x86_64" && sess.target.os != "none" {
+            // x86_64 mandates SSE2 support
+            vec![Symbol::intern("fxsr"), sym::sse, Symbol::intern("sse2")]
+        } else if sess.target.arch == "aarch64" && sess.target.os != "none" {
+            // AArch64 mandates Neon support
+            vec![sym::neon]
+        } else {
+            vec![]
+        }
     }
 
     fn print_version(&self) {
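
Note: reporting the mandated baseline features matters because user code gates on them at compile time; a backend that reports nothing silently disables every such cfg. A small sketch of that gating (the function body is illustrative):

// Sketch: code that is only compiled in when the backend reports sse2,
// which is why target_features() must include the mandated baseline.
#[cfg(target_feature = "sse2")]
fn simd_path() -> &'static str {
    "sse2 path"
}

#[cfg(not(target_feature = "sse2"))]
fn simd_path() -> &'static str {
    "scalar fallback"
}

fn main() {
    println!("selected: {}", simd_path());
}
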
@@ -305,16 +313,13 @@ fn build_isa(sess: &Session, backend_config: &BackendConfig) -> Arc<dyn isa::Tar
     let flags = settings::Flags::new(flags_builder);
 
     let isa_builder = match sess.opts.cg.target_cpu.as_deref() {
-        Some("native") => {
-            let builder = cranelift_native::builder_with_options(true).unwrap();
-            builder
-        }
+        Some("native") => cranelift_native::builder_with_options(true).unwrap(),
         Some(value) => {
             let mut builder =
                 cranelift_codegen::isa::lookup(target_triple.clone()).unwrap_or_else(|err| {
                     sess.dcx().fatal(format!("can't compile for {}: {}", target_triple, err));
                 });
-            if let Err(_) = builder.enable(value) {
+            if builder.enable(value).is_err() {
                 sess.dcx()
                     .fatal("the specified target cpu isn't currently supported by Cranelift.");
             }
@@ -28,10 +28,9 @@ pub(crate) fn unsized_info<'tcx>(
             .bcx
             .ins()
             .iconst(fx.pointer_type, len.eval_target_usize(fx.tcx, ParamEnv::reveal_all()) as i64),
-        (
-            &ty::Dynamic(ref data_a, _, src_dyn_kind),
-            &ty::Dynamic(ref data_b, _, target_dyn_kind),
-        ) if src_dyn_kind == target_dyn_kind => {
+        (&ty::Dynamic(data_a, _, src_dyn_kind), &ty::Dynamic(data_b, _, target_dyn_kind))
+            if src_dyn_kind == target_dyn_kind =>
+        {
             let old_info =
                 old_info.expect("unsized_info: missing old info for trait upcasting coercion");
             if data_a.principal_def_id() == data_b.principal_def_id() {
@@ -95,7 +95,7 @@ pub(crate) fn get_vtable<'tcx>(
     let alloc_id = fx.tcx.vtable_allocation((ty, trait_ref));
     let data_id =
         data_id_for_alloc_id(&mut fx.constants_cx, &mut *fx.module, alloc_id, Mutability::Not);
-    let local_data_id = fx.module.declare_data_in_func(data_id, &mut fx.bcx.func);
+    let local_data_id = fx.module.declare_data_in_func(data_id, fx.bcx.func);
     if fx.clif_comments.enabled() {
         fx.add_comment(local_data_id, format!("vtable: {:?}", alloc_id));
     }