Merge commit 'c84d1871dc' into sync_cg_clif-2023-11-10

parent 8eca01f4b6, commit d49fd9f877

13 changed files with 961 additions and 67 deletions
@@ -76,8 +76,6 @@ configuration options.

 ## Not yet supported

-* Inline assembly ([no cranelift support](https://github.com/bytecodealliance/wasmtime/issues/1041))
-  * On UNIX there is support for invoking an external assembler for `global_asm!` and `asm!`.
 * SIMD ([tracked here](https://github.com/rust-lang/rustc_codegen_cranelift/issues/171), `std::simd` fully works, `std::arch` is partially supported)
 * Unwinding on panics ([no cranelift support](https://github.com/bytecodealliance/wasmtime/issues/1677), `-Cpanic=abort` is enabled by default)

@@ -99,6 +99,7 @@ const BASE_SYSROOT_SUITE: &[TestCase] = &[
     TestCase::build_bin_and_run("aot.mod_bench", "example/mod_bench.rs", &[]),
     TestCase::build_bin_and_run("aot.issue-72793", "example/issue-72793.rs", &[]),
     TestCase::build_bin("aot.issue-59326", "example/issue-59326.rs"),
+    TestCase::build_bin_and_run("aot.neon", "example/neon.rs", &[]),
 ];

 pub(crate) static RAND_REPO: GitRepo = GitRepo::github(
@@ -42,6 +42,7 @@ aot.float-minmax-pass
 aot.mod_bench
 aot.issue-72793
 aot.issue-59326
+aot.neon

 testsuite.extended_sysroot
 test.rust-random/rand
example/neon.rs (new file, 234 lines)
@@ -0,0 +1,234 @@
+// Most of these tests are copied from https://github.com/japaric/stdsimd/blob/0f4413d01c4f0c3ffbc5a69e9a37fbc7235b31a9/coresimd/arm/neon.rs
+
+#![feature(portable_simd)]
+
+#[cfg(target_arch = "aarch64")]
+use std::arch::aarch64::*;
+use std::mem::transmute;
+use std::simd::*;
+
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vpmin_s8() {
+    let a = i8x8::from([1, -2, 3, -4, 5, 6, 7, 8]);
+    let b = i8x8::from([0, 3, 2, 5, 4, 7, 6, 9]);
+    let e = i8x8::from([-2, -4, 5, 7, 0, 2, 4, 6]);
+    let r: i8x8 = transmute(vpmin_s8(transmute(a), transmute(b)));
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vpmin_s16() {
+    let a = i16x4::from([1, 2, 3, -4]);
+    let b = i16x4::from([0, 3, 2, 5]);
+    let e = i16x4::from([1, -4, 0, 2]);
+    let r: i16x4 = transmute(vpmin_s16(transmute(a), transmute(b)));
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vpmin_s32() {
+    let a = i32x2::from([1, -2]);
+    let b = i32x2::from([0, 3]);
+    let e = i32x2::from([-2, 0]);
+    let r: i32x2 = transmute(vpmin_s32(transmute(a), transmute(b)));
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vpmin_u8() {
+    let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 8]);
+    let b = u8x8::from([0, 3, 2, 5, 4, 7, 6, 9]);
+    let e = u8x8::from([1, 3, 5, 7, 0, 2, 4, 6]);
+    let r: u8x8 = transmute(vpmin_u8(transmute(a), transmute(b)));
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vpmin_u16() {
+    let a = u16x4::from([1, 2, 3, 4]);
+    let b = u16x4::from([0, 3, 2, 5]);
+    let e = u16x4::from([1, 3, 0, 2]);
+    let r: u16x4 = transmute(vpmin_u16(transmute(a), transmute(b)));
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vpmin_u32() {
+    let a = u32x2::from([1, 2]);
+    let b = u32x2::from([0, 3]);
+    let e = u32x2::from([1, 0]);
+    let r: u32x2 = transmute(vpmin_u32(transmute(a), transmute(b)));
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vpmin_f32() {
+    let a = f32x2::from([1., -2.]);
+    let b = f32x2::from([0., 3.]);
+    let e = f32x2::from([-2., 0.]);
+    let r: f32x2 = transmute(vpmin_f32(transmute(a), transmute(b)));
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vpmax_s8() {
+    let a = i8x8::from([1, -2, 3, -4, 5, 6, 7, 8]);
+    let b = i8x8::from([0, 3, 2, 5, 4, 7, 6, 9]);
+    let e = i8x8::from([1, 3, 6, 8, 3, 5, 7, 9]);
+    let r: i8x8 = transmute(vpmax_s8(transmute(a), transmute(b)));
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vpmax_s16() {
+    let a = i16x4::from([1, 2, 3, -4]);
+    let b = i16x4::from([0, 3, 2, 5]);
+    let e = i16x4::from([2, 3, 3, 5]);
+    let r: i16x4 = transmute(vpmax_s16(transmute(a), transmute(b)));
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vpmax_s32() {
+    let a = i32x2::from([1, -2]);
+    let b = i32x2::from([0, 3]);
+    let e = i32x2::from([1, 3]);
+    let r: i32x2 = transmute(vpmax_s32(transmute(a), transmute(b)));
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vpmax_u8() {
+    let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 8]);
+    let b = u8x8::from([0, 3, 2, 5, 4, 7, 6, 9]);
+    let e = u8x8::from([2, 4, 6, 8, 3, 5, 7, 9]);
+    let r: u8x8 = transmute(vpmax_u8(transmute(a), transmute(b)));
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vpmax_u16() {
+    let a = u16x4::from([1, 2, 3, 4]);
+    let b = u16x4::from([0, 3, 2, 5]);
+    let e = u16x4::from([2, 4, 3, 5]);
+    let r: u16x4 = transmute(vpmax_u16(transmute(a), transmute(b)));
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vpmax_u32() {
+    let a = u32x2::from([1, 2]);
+    let b = u32x2::from([0, 3]);
+    let e = u32x2::from([2, 3]);
+    let r: u32x2 = transmute(vpmax_u32(transmute(a), transmute(b)));
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vpmax_f32() {
+    let a = f32x2::from([1., -2.]);
+    let b = f32x2::from([0., 3.]);
+    let e = f32x2::from([1., 3.]);
+    let r: f32x2 = transmute(vpmax_f32(transmute(a), transmute(b)));
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vpadd_s16() {
+    let a = i16x4::from([1, 2, 3, 4]);
+    let b = i16x4::from([0, -1, -2, -3]);
+    let r: i16x4 = transmute(vpadd_s16(transmute(a), transmute(b)));
+    let e = i16x4::from([3, 7, -1, -5]);
+    assert_eq!(r, e);
+}
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vpadd_s32() {
+    let a = i32x2::from([1, 2]);
+    let b = i32x2::from([0, -1]);
+    let r: i32x2 = transmute(vpadd_s32(transmute(a), transmute(b)));
+    let e = i32x2::from([3, -1]);
+    assert_eq!(r, e);
+}
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vpadd_s8() {
+    let a = i8x8::from([1, 2, 3, 4, 5, 6, 7, 8]);
+    let b = i8x8::from([0, -1, -2, -3, -4, -5, -6, -7]);
+    let r: i8x8 = transmute(vpadd_s8(transmute(a), transmute(b)));
+    let e = i8x8::from([3, 7, 11, 15, -1, -5, -9, -13]);
+    assert_eq!(r, e);
+}
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vpadd_u16() {
+    let a = u16x4::from([1, 2, 3, 4]);
+    let b = u16x4::from([30, 31, 32, 33]);
+    let r: u16x4 = transmute(vpadd_u16(transmute(a), transmute(b)));
+    let e = u16x4::from([3, 7, 61, 65]);
+    assert_eq!(r, e);
+}
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vpadd_u32() {
+    let a = u32x2::from([1, 2]);
+    let b = u32x2::from([30, 31]);
+    let r: u32x2 = transmute(vpadd_u32(transmute(a), transmute(b)));
+    let e = u32x2::from([3, 61]);
+    assert_eq!(r, e);
+}
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vpadd_u8() {
+    let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 8]);
+    let b = u8x8::from([30, 31, 32, 33, 34, 35, 36, 37]);
+    let r: u8x8 = transmute(vpadd_u8(transmute(a), transmute(b)));
+    let e = u8x8::from([3, 7, 11, 15, 61, 65, 69, 73]);
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vqsub_u8() {
+    let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 0xff]);
+    let b = u8x8::from([30, 1, 1, 1, 34, 0xff, 36, 37]);
+    let r: u8x8 = transmute(vqsub_u8(transmute(a), transmute(b)));
+    let e = u8x8::from([0, 1, 2, 3, 0, 0, 0, 218]);
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vqadd_u8() {
+    let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 0xff]);
+    let b = u8x8::from([30, 1, 1, 1, 34, 0xff, 36, 37]);
+    let r: u8x8 = transmute(vqadd_u8(transmute(a), transmute(b)));
+    let e = u8x8::from([31, 3, 4, 5, 39, 0xff, 43, 0xff]);
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+fn main() {
+    unsafe {
+        test_vpmin_s8();
+        test_vpmin_s16();
+        test_vpmin_s32();
+        test_vpmin_u8();
+        test_vpmin_u16();
+        test_vpmin_u32();
+        test_vpmin_f32();
+        test_vpmax_s8();
+        test_vpmax_s16();
+        test_vpmax_s32();
+        test_vpmax_u8();
+        test_vpmax_u16();
+        test_vpmax_u32();
+        test_vpmax_f32();
+
+        test_vpadd_s16();
+        test_vpadd_s32();
+        test_vpadd_s8();
+        test_vpadd_u16();
+        test_vpadd_u32();
+        test_vpadd_u8();
+
+        test_vqsub_u8();
+        test_vqadd_u8();
+    }
+}
+
+#[cfg(not(target_arch = "aarch64"))]
+fn main() {}
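Note: the NEON "pairwise" operations exercised above reduce adjacent pairs within each input and concatenate the results, with pairs of `a` filling the low half of the output and pairs of `b` the high half. A minimal scalar sketch of that semantics (not part of the commit; names are illustrative):

```rust
// Scalar model of the pairwise-min semantics checked by test_vpmin_s8 above.
fn pairwise_min_i8x8(a: [i8; 8], b: [i8; 8]) -> [i8; 8] {
    let mut r = [0i8; 8];
    for i in 0..4 {
        r[i] = a[2 * i].min(a[2 * i + 1]); // pairs of `a` -> low half
        r[4 + i] = b[2 * i].min(b[2 * i + 1]); // pairs of `b` -> high half
    }
    r
}

fn main() {
    // Same inputs and expected output as test_vpmin_s8.
    let a = [1, -2, 3, -4, 5, 6, 7, 8];
    let b = [0, 3, 2, 5, 4, 7, 6, 9];
    assert_eq!(pairwise_min_i8x8(a, b), [-2, -4, 5, 7, 0, 2, 4, 6]);
}
```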
@@ -58,9 +58,9 @@ dependencies = [

 [[package]]
 name = "compiler_builtins"
-version = "0.1.100"
+version = "0.1.103"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d6c0f24437059853f0fa64afc51f338f93647a3de4cf3358ba1bb4171a199775"
+checksum = "a3b73c3443a5fd2438d7ba4853c64e4c8efc2404a9e28a9234cc2d5eebc6c242"
 dependencies = [
  "cc",
  "rustc-std-workspace-core",

@@ -158,9 +158,9 @@ dependencies = [

 [[package]]
 name = "libc"
-version = "0.2.149"
+version = "0.2.150"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b"
+checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c"
 dependencies = [
  "rustc-std-workspace-core",
 ]

@@ -415,7 +415,6 @@ dependencies = [
 name = "unwind"
 version = "0.0.0"
 dependencies = [
- "cc",
  "cfg-if",
  "compiler_builtins",
  "core",
@@ -1,3 +1,3 @@
 [toolchain]
-channel = "nightly-2023-10-29"
+channel = "nightly-2023-11-10"
 components = ["rust-src", "rustc-dev", "llvm-tools"]
@@ -146,6 +146,11 @@ rm tests/ui/process/nofile-limit.rs # TODO some AArch64 linking issue

 rm tests/ui/stdio-is-blocking.rs # really slow with unoptimized libstd

+# rustc bugs
+# ==========
+# https://github.com/rust-lang/rust/pull/116447#issuecomment-1790451463
+rm tests/ui/coroutine/gen_block_*.rs
+
 cp ../dist/bin/rustdoc-clif ../dist/bin/rustdoc # some tests expect bin/rustdoc to exist

 # prevent $(RUSTDOC) from picking up the sysroot built by x.py. It conflicts with the one used by
@@ -13,7 +13,7 @@ use crate::prelude::*;
 enum CInlineAsmOperand<'tcx> {
     In {
         reg: InlineAsmRegOrRegClass,
-        value: CValue<'tcx>,
+        value: Value,
     },
     Out {
         reg: InlineAsmRegOrRegClass,

@@ -23,7 +23,7 @@ enum CInlineAsmOperand<'tcx> {
     InOut {
         reg: InlineAsmRegOrRegClass,
         _late: bool,
-        in_value: CValue<'tcx>,
+        in_value: Value,
         out_place: Option<CPlace<'tcx>>,
     },
     Const {

@@ -47,7 +47,9 @@ pub(crate) fn codegen_inline_asm<'tcx>(
     // Used by panic_abort on Windows, but uses a syntax which only happens to work with
     // asm!() by accident and breaks with the GNU assembler as well as global_asm!() for
     // the LLVM backend.
-    if template[0] == InlineAsmTemplatePiece::String("int $$0x29".to_string()) {
+    if template.len() == 1
+        && template[0] == InlineAsmTemplatePiece::String("int $$0x29".to_string())
+    {
         fx.bcx.ins().trap(TrapCode::User(1));
         return;
     }

@@ -55,9 +57,10 @@ pub(crate) fn codegen_inline_asm<'tcx>(
     let operands = operands
         .into_iter()
         .map(|operand| match *operand {
-            InlineAsmOperand::In { reg, ref value } => {
-                CInlineAsmOperand::In { reg, value: crate::base::codegen_operand(fx, value) }
-            }
+            InlineAsmOperand::In { reg, ref value } => CInlineAsmOperand::In {
+                reg,
+                value: crate::base::codegen_operand(fx, value).load_scalar(fx),
+            },
             InlineAsmOperand::Out { reg, late, ref place } => CInlineAsmOperand::Out {
                 reg,
                 late,

@@ -67,7 +70,7 @@ pub(crate) fn codegen_inline_asm<'tcx>(
                 CInlineAsmOperand::InOut {
                     reg,
                     _late: late,
-                    in_value: crate::base::codegen_operand(fx, in_value),
+                    in_value: crate::base::codegen_operand(fx, in_value).load_scalar(fx),
                     out_place: out_place.map(|place| crate::base::codegen_place(fx, place)),
                 }
             }

@@ -165,7 +168,7 @@ pub(crate) fn codegen_inline_asm<'tcx>(
    for (i, operand) in operands.iter().enumerate() {
        match operand {
            CInlineAsmOperand::In { reg: _, value } => {
-                inputs.push((asm_gen.stack_slots_input[i].unwrap(), value.load_scalar(fx)));
+                inputs.push((asm_gen.stack_slots_input[i].unwrap(), *value));
            }
            CInlineAsmOperand::Out { reg: _, late: _, place } => {
                if let Some(place) = place {

@@ -173,7 +176,7 @@ pub(crate) fn codegen_inline_asm<'tcx>(
                }
            }
            CInlineAsmOperand::InOut { reg: _, _late: _, in_value, out_place } => {
-                inputs.push((asm_gen.stack_slots_input[i].unwrap(), in_value.load_scalar(fx)));
+                inputs.push((asm_gen.stack_slots_input[i].unwrap(), *in_value));
                if let Some(out_place) = out_place {
                    outputs.push((asm_gen.stack_slots_output[i].unwrap(), *out_place));
                }

@@ -726,3 +729,83 @@ fn call_inline_asm<'tcx>(
         place.write_cvalue(fx, CValue::by_val(value, place.layout()));
     }
 }
+
+pub(crate) fn codegen_xgetbv<'tcx>(
+    fx: &mut FunctionCx<'_, '_, 'tcx>,
+    xcr_no: Value,
+    ret: CPlace<'tcx>,
+) {
+    // FIXME add .eh_frame unwind info directives
+
+    let operands = vec![
+        CInlineAsmOperand::In {
+            reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::cx)),
+            value: xcr_no,
+        },
+        CInlineAsmOperand::Out {
+            reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::ax)),
+            late: true,
+            place: Some(ret),
+        },
+        CInlineAsmOperand::Out {
+            reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::dx)),
+            late: true,
+            place: None,
+        },
+    ];
+    let options = InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM;
+
+    let mut inputs = Vec::new();
+    let mut outputs = Vec::new();
+
+    let mut asm_gen = InlineAssemblyGenerator {
+        tcx: fx.tcx,
+        arch: fx.tcx.sess.asm_arch.unwrap(),
+        enclosing_def_id: fx.instance.def_id(),
+        template: &[InlineAsmTemplatePiece::String(
+            "
+    xgetbv
+    // out = rdx << 32 | rax
+    shl rdx, 32
+    or rax, rdx
+    "
+            .to_string(),
+        )],
+        operands: &operands,
+        options,
+        registers: Vec::new(),
+        stack_slots_clobber: Vec::new(),
+        stack_slots_input: Vec::new(),
+        stack_slots_output: Vec::new(),
+        stack_slot_size: Size::from_bytes(0),
+    };
+    asm_gen.allocate_registers();
+    asm_gen.allocate_stack_slots();
+
+    let inline_asm_index = fx.cx.inline_asm_index.get();
+    fx.cx.inline_asm_index.set(inline_asm_index + 1);
+    let asm_name = format!(
+        "__inline_asm_{}_n{}",
+        fx.cx.cgu_name.as_str().replace('.', "__").replace('-', "_"),
+        inline_asm_index
+    );
+
+    let generated_asm = asm_gen.generate_asm_wrapper(&asm_name);
+    fx.cx.global_asm.push_str(&generated_asm);
+
+    for (i, operand) in operands.iter().enumerate() {
+        match operand {
+            CInlineAsmOperand::In { reg: _, value } => {
+                inputs.push((asm_gen.stack_slots_input[i].unwrap(), *value));
+            }
+            CInlineAsmOperand::Out { reg: _, late: _, place } => {
+                if let Some(place) = place {
+                    outputs.push((asm_gen.stack_slots_output[i].unwrap(), *place));
+                }
+            }
+            _ => unreachable!(),
+        }
+    }
+
+    call_inline_asm(fx, &asm_name, asm_gen.stack_slot_size, inputs, outputs);
+}
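The `xgetbv` instruction reads the extended control register selected by ECX into the EDX:EAX register pair; the template above folds the pair into the single 64-bit return value with `shl rdx, 32; or rax, rdx`. A standalone sketch of that folding step (illustrative only, not from the commit):

```rust
// How `shl rdx, 32; or rax, rdx` combines the two 32-bit halves of the
// xgetbv result into the single 64-bit value the intrinsic returns.
fn combine_edx_eax(edx: u32, eax: u32) -> u64 {
    ((edx as u64) << 32) | eax as u64
}

fn main() {
    assert_eq!(combine_edx_eax(0x0000_0001, 0x0000_0007), 0x0000_0001_0000_0007);
}
```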
@@ -51,6 +51,21 @@ pub(crate) fn codegen_llvm_intrinsic_call<'tcx>(
             });
         }

+        _ if intrinsic.starts_with("llvm.fma.v") => {
+            intrinsic_args!(fx, args => (x,y,z); intrinsic);
+
+            simd_trio_for_each_lane(
+                fx,
+                x,
+                y,
+                z,
+                ret,
+                &|fx, _lane_ty, _res_lane_ty, lane_x, lane_y, lane_z| {
+                    fx.bcx.ins().fma(lane_x, lane_y, lane_z)
+                },
+            );
+        }
+
         _ => {
             fx.tcx
                 .sess
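`llvm.fma.v*` computes `x * y + z` per lane with a single rounding step, which is exactly what Cranelift's `fma` instruction provides. Rust's standard library exposes the same operation as `mul_add`; a small demonstration of the single- vs. double-rounding difference (illustrative only):

```rust
fn main() {
    let (x, y, z) = (0.1_f64, 10.0_f64, -1.0_f64);
    let fused = x.mul_add(y, z); // one rounding: fma(x, y, z)
    let unfused = x * y + z; // two roundings: round(x * y), then round(.. + z)
    // The two results can differ in the low bits; here `unfused` is exactly 0.0
    // while `fused` keeps the residual of 0.1 * 10 not being exactly 1.
    println!("fused = {fused:e}, unfused = {unfused:e}");
}
```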
@@ -44,7 +44,9 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>(
             });
         }

-        _ if intrinsic.starts_with("llvm.aarch64.neon.sqadd.v") => {
+        _ if intrinsic.starts_with("llvm.aarch64.neon.sqadd.v")
+            || intrinsic.starts_with("llvm.aarch64.neon.uqadd.v") =>
+        {
             intrinsic_args!(fx, args => (x, y); intrinsic);

             simd_pair_for_each_lane_typed(fx, x, y, ret, &|fx, x_lane, y_lane| {

@@ -52,7 +54,9 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>(
             });
         }

-        _ if intrinsic.starts_with("llvm.aarch64.neon.sqsub.v") => {
+        _ if intrinsic.starts_with("llvm.aarch64.neon.sqsub.v")
+            || intrinsic.starts_with("llvm.aarch64.neon.uqsub.v") =>
+        {
             intrinsic_args!(fx, args => (x, y); intrinsic);

             simd_pair_for_each_lane_typed(fx, x, y, ret, &|fx, x_lane, y_lane| {

@@ -156,6 +160,90 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>(
             });
         }

+        _ if intrinsic.starts_with("llvm.aarch64.neon.umaxp.v") => {
+            intrinsic_args!(fx, args => (x, y); intrinsic);
+
+            simd_horizontal_pair_for_each_lane(
+                fx,
+                x,
+                y,
+                ret,
+                &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().umax(x_lane, y_lane),
+            );
+        }
+
+        _ if intrinsic.starts_with("llvm.aarch64.neon.smaxp.v") => {
+            intrinsic_args!(fx, args => (x, y); intrinsic);
+
+            simd_horizontal_pair_for_each_lane(
+                fx,
+                x,
+                y,
+                ret,
+                &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().smax(x_lane, y_lane),
+            );
+        }
+
+        _ if intrinsic.starts_with("llvm.aarch64.neon.uminp.v") => {
+            intrinsic_args!(fx, args => (x, y); intrinsic);
+
+            simd_horizontal_pair_for_each_lane(
+                fx,
+                x,
+                y,
+                ret,
+                &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().umin(x_lane, y_lane),
+            );
+        }
+
+        _ if intrinsic.starts_with("llvm.aarch64.neon.sminp.v") => {
+            intrinsic_args!(fx, args => (x, y); intrinsic);
+
+            simd_horizontal_pair_for_each_lane(
+                fx,
+                x,
+                y,
+                ret,
+                &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().smin(x_lane, y_lane),
+            );
+        }
+
+        _ if intrinsic.starts_with("llvm.aarch64.neon.fminp.v") => {
+            intrinsic_args!(fx, args => (x, y); intrinsic);
+
+            simd_horizontal_pair_for_each_lane(
+                fx,
+                x,
+                y,
+                ret,
+                &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().fmin(x_lane, y_lane),
+            );
+        }
+
+        _ if intrinsic.starts_with("llvm.aarch64.neon.fmaxp.v") => {
+            intrinsic_args!(fx, args => (x, y); intrinsic);
+
+            simd_horizontal_pair_for_each_lane(
+                fx,
+                x,
+                y,
+                ret,
+                &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().fmax(x_lane, y_lane),
+            );
+        }
+
+        _ if intrinsic.starts_with("llvm.aarch64.neon.addp.v") => {
+            intrinsic_args!(fx, args => (x, y); intrinsic);
+
+            simd_horizontal_pair_for_each_lane(
+                fx,
+                x,
+                y,
+                ret,
+                &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().iadd(x_lane, y_lane),
+            );
+        }
+
         // FIXME generalize vector types
         "llvm.aarch64.neon.tbl1.v16i8" => {
             intrinsic_args!(fx, args => (t, idx); intrinsic);

@@ -172,25 +260,6 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>(
             }
         }

-        // FIXME generalize vector types
-        "llvm.aarch64.neon.umaxp.v16i8" => {
-            intrinsic_args!(fx, args => (a, b); intrinsic);
-
-            // FIXME add helper for horizontal pairwise operations
-            for i in 0..8 {
-                let lane1 = a.value_lane(fx, i * 2).load_scalar(fx);
-                let lane2 = a.value_lane(fx, i * 2 + 1).load_scalar(fx);
-                let res = fx.bcx.ins().umax(lane1, lane2);
-                ret.place_lane(fx, i).to_ptr().store(fx, res, MemFlags::trusted());
-            }
-            for i in 0..8 {
-                let lane1 = b.value_lane(fx, i * 2).load_scalar(fx);
-                let lane2 = b.value_lane(fx, i * 2 + 1).load_scalar(fx);
-                let res = fx.bcx.ins().umax(lane1, lane2);
-                ret.place_lane(fx, 8 + i).to_ptr().store(fx, res, MemFlags::trusted());
-            }
-        }
-
         /*
         _ if intrinsic.starts_with("llvm.aarch64.neon.sshl.v")
             || intrinsic.starts_with("llvm.aarch64.neon.sqshl.v")
@@ -20,16 +20,21 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(

         // Used by is_x86_feature_detected!();
         "llvm.x86.xgetbv" => {
-            // FIXME use the actual xgetbv instruction
-            intrinsic_args!(fx, args => (v); intrinsic);
-
-            let v = v.load_scalar(fx);
-
-            // As of writing on XCR0 exists
-            fx.bcx.ins().trapnz(v, TrapCode::UnreachableCodeReached);
-
-            let res = fx.bcx.ins().iconst(types::I64, 1 /* bit 0 must be set */);
-            ret.write_cvalue(fx, CValue::by_val(res, fx.layout_of(fx.tcx.types.i64)));
+            intrinsic_args!(fx, args => (xcr_no); intrinsic);
+
+            let xcr_no = xcr_no.load_scalar(fx);
+
+            crate::inline_asm::codegen_xgetbv(fx, xcr_no, ret);
+        }
+
+        "llvm.x86.sse3.ldu.dq" | "llvm.x86.avx.ldu.dq.256" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128&ig_expand=4009
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_lddqu_si256&ig_expand=4010
+            intrinsic_args!(fx, args => (ptr); intrinsic);
+
+            // FIXME correctly handle unalignedness
+            let val = CValue::by_ref(Pointer::new(ptr.load_scalar(fx)), ret.layout());
+            ret.write_cvalue(fx, val);
         }

         "llvm.x86.sse.cmp.ps" | "llvm.x86.sse2.cmp.pd" => {

@@ -177,8 +182,12 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
                 }
             }
         }
-        "llvm.x86.avx2.vperm2i128" => {
+        "llvm.x86.avx2.vperm2i128"
+        | "llvm.x86.avx.vperm2f128.ps.256"
+        | "llvm.x86.avx.vperm2f128.pd.256" => {
             // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_ps
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_pd
             let (a, b, imm8) = match args {
                 [a, b, imm8] => (a, b, imm8),
                 _ => bug!("wrong number of args for intrinsic {intrinsic}"),

@@ -187,19 +196,11 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             let b = codegen_operand(fx, b);
             let imm8 = codegen_operand(fx, imm8).load_scalar(fx);

-            let a_0 = a.value_lane(fx, 0).load_scalar(fx);
-            let a_1 = a.value_lane(fx, 1).load_scalar(fx);
-            let a_low = fx.bcx.ins().iconcat(a_0, a_1);
-            let a_2 = a.value_lane(fx, 2).load_scalar(fx);
-            let a_3 = a.value_lane(fx, 3).load_scalar(fx);
-            let a_high = fx.bcx.ins().iconcat(a_2, a_3);
+            let a_low = a.value_typed_lane(fx, fx.tcx.types.u128, 0).load_scalar(fx);
+            let a_high = a.value_typed_lane(fx, fx.tcx.types.u128, 1).load_scalar(fx);

-            let b_0 = b.value_lane(fx, 0).load_scalar(fx);
-            let b_1 = b.value_lane(fx, 1).load_scalar(fx);
-            let b_low = fx.bcx.ins().iconcat(b_0, b_1);
-            let b_2 = b.value_lane(fx, 2).load_scalar(fx);
-            let b_3 = b.value_lane(fx, 3).load_scalar(fx);
-            let b_high = fx.bcx.ins().iconcat(b_2, b_3);
+            let b_low = b.value_typed_lane(fx, fx.tcx.types.u128, 0).load_scalar(fx);
+            let b_high = b.value_typed_lane(fx, fx.tcx.types.u128, 1).load_scalar(fx);

             fn select4(
                 fx: &mut FunctionCx<'_, '_, '_>,

@@ -224,16 +225,20 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(

             let control0 = imm8;
             let res_low = select4(fx, a_high, a_low, b_high, b_low, control0);
-            let (res_0, res_1) = fx.bcx.ins().isplit(res_low);

             let control1 = fx.bcx.ins().ushr_imm(imm8, 4);
             let res_high = select4(fx, a_high, a_low, b_high, b_low, control1);
-            let (res_2, res_3) = fx.bcx.ins().isplit(res_high);

-            ret.place_lane(fx, 0).to_ptr().store(fx, res_0, MemFlags::trusted());
-            ret.place_lane(fx, 1).to_ptr().store(fx, res_1, MemFlags::trusted());
-            ret.place_lane(fx, 2).to_ptr().store(fx, res_2, MemFlags::trusted());
-            ret.place_lane(fx, 3).to_ptr().store(fx, res_3, MemFlags::trusted());
+            ret.place_typed_lane(fx, fx.tcx.types.u128, 0).to_ptr().store(
+                fx,
+                res_low,
+                MemFlags::trusted(),
+            );
+            ret.place_typed_lane(fx, fx.tcx.types.u128, 1).to_ptr().store(
+                fx,
+                res_high,
+                MemFlags::trusted(),
+            );
         }
         "llvm.x86.ssse3.pabs.b.128" | "llvm.x86.ssse3.pabs.w.128" | "llvm.x86.ssse3.pabs.d.128" => {
             let a = match args {

@@ -309,7 +314,9 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
                 fx.bcx.ins().sshr(a_lane, saturated_count)
             });
         }
-        "llvm.x86.sse2.psad.bw" => {
+        "llvm.x86.sse2.psad.bw" | "llvm.x86.avx2.psad.bw" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8&ig_expand=5770
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sad_epu8&ig_expand=5771
             intrinsic_args!(fx, args => (a, b); intrinsic);

             assert_eq!(a.layout(), b.layout());

@@ -340,7 +347,9 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
                 ret.place_lane(fx, out_lane_idx).write_cvalue(fx, res_lane);
             }
         }
-        "llvm.x86.ssse3.pmadd.ub.sw.128" => {
+        "llvm.x86.ssse3.pmadd.ub.sw.128" | "llvm.x86.avx2.pmadd.ub.sw" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16&ig_expand=4267
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maddubs_epi16&ig_expand=4270
             intrinsic_args!(fx, args => (a, b); intrinsic);

             let (lane_count, lane_ty) = a.layout().ty.simd_size_and_type(fx.tcx);

@@ -379,7 +388,9 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
                 ret.place_lane(fx, out_lane_idx).write_cvalue(fx, res_lane);
             }
         }
-        "llvm.x86.sse2.pmadd.wd" => {
+        "llvm.x86.sse2.pmadd.wd" | "llvm.x86.avx2.pmadd.wd" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16&ig_expand=4231
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd_epi16&ig_expand=4234
             intrinsic_args!(fx, args => (a, b); intrinsic);

             assert_eq!(a.layout(), b.layout());
@@ -412,6 +423,369 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
                 ret.place_lane(fx, out_lane_idx).write_cvalue(fx, res_lane);
             }
         }
+
+        "llvm.x86.ssse3.pmul.hr.sw.128" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16&ig_expand=4782
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert_eq!(lane_ty, fx.tcx.types.i16);
+            assert_eq!(ret_lane_ty, fx.tcx.types.i16);
+            assert_eq!(lane_count, ret_lane_count);
+
+            let ret_lane_layout = fx.layout_of(fx.tcx.types.i16);
+            for out_lane_idx in 0..lane_count {
+                let a_lane = a.value_lane(fx, out_lane_idx).load_scalar(fx);
+                let a_lane = fx.bcx.ins().sextend(types::I32, a_lane);
+                let b_lane = b.value_lane(fx, out_lane_idx).load_scalar(fx);
+                let b_lane = fx.bcx.ins().sextend(types::I32, b_lane);
+
+                let mul: Value = fx.bcx.ins().imul(a_lane, b_lane);
+                let shifted = fx.bcx.ins().ushr_imm(mul, 14);
+                let incremented = fx.bcx.ins().iadd_imm(shifted, 1);
+                let shifted_again = fx.bcx.ins().ushr_imm(incremented, 1);
+
+                let res_lane = fx.bcx.ins().ireduce(types::I16, shifted_again);
+                let res_lane = CValue::by_val(res_lane, ret_lane_layout);
+
+                ret.place_lane(fx, out_lane_idx).write_cvalue(fx, res_lane);
+            }
+        }
+
+        "llvm.x86.sse2.packuswb.128" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16&ig_expand=4903
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert_eq!(lane_ty, fx.tcx.types.i16);
+            assert_eq!(ret_lane_ty, fx.tcx.types.u8);
+            assert_eq!(lane_count * 2, ret_lane_count);
+
+            let zero = fx.bcx.ins().iconst(types::I16, 0);
+            let max_u8 = fx.bcx.ins().iconst(types::I16, 255);
+            let ret_lane_layout = fx.layout_of(fx.tcx.types.u8);
+
+            for idx in 0..lane_count {
+                let lane = a.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, zero);
+                let sat = fx.bcx.ins().umin(sat, max_u8);
+                let res = fx.bcx.ins().ireduce(types::I8, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
+            }
+
+            for idx in 0..lane_count {
+                let lane = b.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, zero);
+                let sat = fx.bcx.ins().umin(sat, max_u8);
+                let res = fx.bcx.ins().ireduce(types::I8, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane);
+            }
+        }
+
+        "llvm.x86.avx2.packuswb" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16&ig_expand=4906
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert_eq!(lane_ty, fx.tcx.types.i16);
+            assert_eq!(ret_lane_ty, fx.tcx.types.u8);
+            assert_eq!(lane_count * 2, ret_lane_count);
+
+            let zero = fx.bcx.ins().iconst(types::I16, 0);
+            let max_u8 = fx.bcx.ins().iconst(types::I16, 255);
+            let ret_lane_layout = fx.layout_of(fx.tcx.types.u8);
+
+            for idx in 0..lane_count / 2 {
+                let lane = a.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, zero);
+                let sat = fx.bcx.ins().umin(sat, max_u8);
+                let res = fx.bcx.ins().ireduce(types::I8, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
+            }
+
+            for idx in 0..lane_count / 2 {
+                let lane = b.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, zero);
+                let sat = fx.bcx.ins().umin(sat, max_u8);
+                let res = fx.bcx.ins().ireduce(types::I8, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, lane_count / 2 + idx).write_cvalue(fx, res_lane);
+            }
+
+            for idx in 0..lane_count / 2 {
+                let lane = a.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, zero);
+                let sat = fx.bcx.ins().umin(sat, max_u8);
+                let res = fx.bcx.ins().ireduce(types::I8, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, lane_count / 2 * 2 + idx).write_cvalue(fx, res_lane);
+            }
+
+            for idx in 0..lane_count / 2 {
+                let lane = b.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, zero);
+                let sat = fx.bcx.ins().umin(sat, max_u8);
+                let res = fx.bcx.ins().ireduce(types::I8, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, lane_count / 2 * 3 + idx).write_cvalue(fx, res_lane);
+            }
+        }
+
+        "llvm.x86.sse2.packssdw.128" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32&ig_expand=4889
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert_eq!(lane_ty, fx.tcx.types.i32);
+            assert_eq!(ret_lane_ty, fx.tcx.types.i16);
+            assert_eq!(lane_count * 2, ret_lane_count);
+
+            let min_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MIN as u16));
+            let max_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MAX as u16));
+            let ret_lane_layout = fx.layout_of(fx.tcx.types.i16);
+
+            for idx in 0..lane_count {
+                let lane = a.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, min_i16);
+                let sat = fx.bcx.ins().umin(sat, max_i16);
+                let res = fx.bcx.ins().ireduce(types::I16, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
+            }
+
+            for idx in 0..lane_count {
+                let lane = b.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, min_i16);
+                let sat = fx.bcx.ins().umin(sat, max_i16);
+                let res = fx.bcx.ins().ireduce(types::I16, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane);
+            }
+        }
+
+        "llvm.x86.sse41.packusdw" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32&ig_expand=4912
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert_eq!(lane_ty, fx.tcx.types.i32);
+            assert_eq!(ret_lane_ty, fx.tcx.types.u16);
+            assert_eq!(lane_count * 2, ret_lane_count);
+
+            let min_u16 = fx.bcx.ins().iconst(types::I32, i64::from(u16::MIN));
+            let max_u16 = fx.bcx.ins().iconst(types::I32, i64::from(u16::MAX));
+            let ret_lane_layout = fx.layout_of(fx.tcx.types.u16);
+
+            for idx in 0..lane_count {
+                let lane = a.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().umax(lane, min_u16);
+                let sat = fx.bcx.ins().umin(sat, max_u16);
+                let res = fx.bcx.ins().ireduce(types::I16, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
+            }
+
+            for idx in 0..lane_count {
+                let lane = b.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().umax(lane, min_u16);
+                let sat = fx.bcx.ins().umin(sat, max_u16);
+                let res = fx.bcx.ins().ireduce(types::I16, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane);
+            }
+        }
+
+        "llvm.x86.avx2.packssdw" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32&ig_expand=4892
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert_eq!(lane_ty, fx.tcx.types.i32);
+            assert_eq!(ret_lane_ty, fx.tcx.types.i16);
+            assert_eq!(lane_count * 2, ret_lane_count);
+
+            let min_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MIN as u16));
+            let max_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MAX as u16));
+            let ret_lane_layout = fx.layout_of(fx.tcx.types.i16);
+
+            for idx in 0..lane_count / 2 {
+                let lane = a.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, min_i16);
+                let sat = fx.bcx.ins().umin(sat, max_i16);
+                let res = fx.bcx.ins().ireduce(types::I16, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
+            }
+
+            for idx in 0..lane_count / 2 {
+                let lane = b.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, min_i16);
+                let sat = fx.bcx.ins().umin(sat, max_i16);
+                let res = fx.bcx.ins().ireduce(types::I16, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, lane_count / 2 + idx).write_cvalue(fx, res_lane);
+            }
+
+            for idx in 0..lane_count / 2 {
+                let lane = a.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, min_i16);
+                let sat = fx.bcx.ins().umin(sat, max_i16);
+                let res = fx.bcx.ins().ireduce(types::I16, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, lane_count / 2 * 2 + idx).write_cvalue(fx, res_lane);
+            }
+
+            for idx in 0..lane_count / 2 {
+                let lane = b.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, min_i16);
+                let sat = fx.bcx.ins().umin(sat, max_i16);
+                let res = fx.bcx.ins().ireduce(types::I16, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, lane_count / 2 * 3 + idx).write_cvalue(fx, res_lane);
+            }
+        }
+
+        "llvm.x86.pclmulqdq" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128&ig_expand=772
+            intrinsic_args!(fx, args => (a, b, imm8); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert_eq!(lane_ty, fx.tcx.types.i64);
+            assert_eq!(ret_lane_ty, fx.tcx.types.i64);
+            assert_eq!(lane_count, 2);
+            assert_eq!(ret_lane_count, 2);
+
+            let imm8 = imm8.load_scalar(fx);
+
+            let control0 = fx.bcx.ins().band_imm(imm8, 0b0000_0001);
+            let a_lane0 = a.value_lane(fx, 0).load_scalar(fx);
+            let a_lane1 = a.value_lane(fx, 1).load_scalar(fx);
+            let temp1 = fx.bcx.ins().select(control0, a_lane1, a_lane0);
+
+            let control4 = fx.bcx.ins().band_imm(imm8, 0b0001_0000);
+            let b_lane0 = b.value_lane(fx, 0).load_scalar(fx);
+            let b_lane1 = b.value_lane(fx, 1).load_scalar(fx);
+            let temp2 = fx.bcx.ins().select(control4, b_lane1, b_lane0);
+
+            fn extract_bit(fx: &mut FunctionCx<'_, '_, '_>, val: Value, bit: i64) -> Value {
+                let tmp = fx.bcx.ins().ushr_imm(val, bit);
+                fx.bcx.ins().band_imm(tmp, 1)
+            }
+
+            let mut res1 = fx.bcx.ins().iconst(types::I64, 0);
+            for i in 0..=63 {
+                let x = extract_bit(fx, temp1, 0);
+                let y = extract_bit(fx, temp2, i);
+                let mut temp = fx.bcx.ins().band(x, y);
+                for j in 1..=i {
+                    let x = extract_bit(fx, temp1, j);
+                    let y = extract_bit(fx, temp2, i - j);
+                    let z = fx.bcx.ins().band(x, y);
+                    temp = fx.bcx.ins().bxor(temp, z);
+                }
+                let temp = fx.bcx.ins().ishl_imm(temp, i);
+                res1 = fx.bcx.ins().bor(res1, temp);
+            }
+            ret.place_lane(fx, 0).to_ptr().store(fx, res1, MemFlags::trusted());
+
+            let mut res2 = fx.bcx.ins().iconst(types::I64, 0);
+            for i in 64..=127 {
+                let mut temp = fx.bcx.ins().iconst(types::I64, 0);
+                for j in i - 63..=63 {
+                    let x = extract_bit(fx, temp1, j);
+                    let y = extract_bit(fx, temp2, i - j);
+                    let z = fx.bcx.ins().band(x, y);
+                    temp = fx.bcx.ins().bxor(temp, z);
+                }
+                let temp = fx.bcx.ins().ishl_imm(temp, i);
+                res2 = fx.bcx.ins().bor(res2, temp);
+            }
+            ret.place_lane(fx, 1).to_ptr().store(fx, res2, MemFlags::trusted());
+        }
+
+        "llvm.x86.avx.ptestz.256" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256&ig_expand=6945
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            assert_eq!(lane_ty, fx.tcx.types.i64);
+            assert_eq!(ret.layout().ty, fx.tcx.types.i32);
+            assert_eq!(lane_count, 4);
+
+            let a_lane0 = a.value_lane(fx, 0).load_scalar(fx);
+            let a_lane1 = a.value_lane(fx, 1).load_scalar(fx);
+            let a_lane2 = a.value_lane(fx, 2).load_scalar(fx);
+            let a_lane3 = a.value_lane(fx, 3).load_scalar(fx);
+            let b_lane0 = b.value_lane(fx, 0).load_scalar(fx);
+            let b_lane1 = b.value_lane(fx, 1).load_scalar(fx);
+            let b_lane2 = b.value_lane(fx, 2).load_scalar(fx);
+            let b_lane3 = b.value_lane(fx, 3).load_scalar(fx);
+
+            let zero0 = fx.bcx.ins().band(a_lane0, b_lane0);
+            let zero1 = fx.bcx.ins().band(a_lane1, b_lane1);
+            let zero2 = fx.bcx.ins().band(a_lane2, b_lane2);
+            let zero3 = fx.bcx.ins().band(a_lane3, b_lane3);
+
+            let all_zero0 = fx.bcx.ins().bor(zero0, zero1);
+            let all_zero1 = fx.bcx.ins().bor(zero2, zero3);
+            let all_zero = fx.bcx.ins().bor(all_zero0, all_zero1);
+
+            let res = fx.bcx.ins().icmp_imm(IntCC::Equal, all_zero, 0);
+            let res = CValue::by_val(
+                fx.bcx.ins().uextend(types::I32, res),
+                fx.layout_of(fx.tcx.types.i32),
+            );
+            ret.write_cvalue(fx, res);
+        }
+
         _ => {
             fx.tcx
                 .sess
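Among the additions above, `llvm.x86.pclmulqdq` is lowered as a bit-by-bit carry-less multiply: bit `i` of each 64-bit result lane is the XOR over `j` of `temp1[j] & temp2[i - j]`. A scalar model of the low half of that product (illustrative, not from the commit):

```rust
// Carry-less (XOR) multiplication over GF(2)[x]: low 64 bits of the product.
fn clmul_low(a: u64, b: u64) -> u64 {
    let mut res = 0;
    for i in 0..64 {
        if (b >> i) & 1 == 1 {
            res ^= a << i; // bits shifted out belong to the high half
        }
    }
    res
}

fn main() {
    // (x + 1) * (x + 1) = x^2 + 1 over GF(2): 0b11 clmul 0b11 = 0b101.
    assert_eq!(clmul_low(0b11, 0b11), 0b101);
}
```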
@@ -132,6 +132,65 @@ fn simd_pair_for_each_lane<'tcx>(
     }
 }

+fn simd_horizontal_pair_for_each_lane<'tcx>(
+    fx: &mut FunctionCx<'_, '_, 'tcx>,
+    x: CValue<'tcx>,
+    y: CValue<'tcx>,
+    ret: CPlace<'tcx>,
+    f: &dyn Fn(&mut FunctionCx<'_, '_, 'tcx>, Ty<'tcx>, Ty<'tcx>, Value, Value) -> Value,
+) {
+    assert_eq!(x.layout(), y.layout());
+    let layout = x.layout();
+
+    let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+    let lane_layout = fx.layout_of(lane_ty);
+    let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+    let ret_lane_layout = fx.layout_of(ret_lane_ty);
+    assert_eq!(lane_count, ret_lane_count);
+
+    for lane_idx in 0..lane_count {
+        let src = if lane_idx < (lane_count / 2) { x } else { y };
+        let src_idx = lane_idx % (lane_count / 2);
+
+        let lhs_lane = src.value_lane(fx, src_idx * 2).load_scalar(fx);
+        let rhs_lane = src.value_lane(fx, src_idx * 2 + 1).load_scalar(fx);
+
+        let res_lane = f(fx, lane_layout.ty, ret_lane_layout.ty, lhs_lane, rhs_lane);
+        let res_lane = CValue::by_val(res_lane, ret_lane_layout);
+
+        ret.place_lane(fx, lane_idx).write_cvalue(fx, res_lane);
+    }
+}
+
+fn simd_trio_for_each_lane<'tcx>(
+    fx: &mut FunctionCx<'_, '_, 'tcx>,
+    x: CValue<'tcx>,
+    y: CValue<'tcx>,
+    z: CValue<'tcx>,
+    ret: CPlace<'tcx>,
+    f: &dyn Fn(&mut FunctionCx<'_, '_, 'tcx>, Ty<'tcx>, Ty<'tcx>, Value, Value, Value) -> Value,
+) {
+    assert_eq!(x.layout(), y.layout());
+    let layout = x.layout();
+
+    let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+    let lane_layout = fx.layout_of(lane_ty);
+    let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+    let ret_lane_layout = fx.layout_of(ret_lane_ty);
+    assert_eq!(lane_count, ret_lane_count);
+
+    for lane_idx in 0..lane_count {
+        let x_lane = x.value_lane(fx, lane_idx).load_scalar(fx);
+        let y_lane = y.value_lane(fx, lane_idx).load_scalar(fx);
+        let z_lane = z.value_lane(fx, lane_idx).load_scalar(fx);
+
+        let res_lane = f(fx, lane_layout.ty, ret_lane_layout.ty, x_lane, y_lane, z_lane);
+        let res_lane = CValue::by_val(res_lane, ret_lane_layout);
+
+        ret.place_lane(fx, lane_idx).write_cvalue(fx, res_lane);
+    }
+}
+
 fn simd_reduce<'tcx>(
     fx: &mut FunctionCx<'_, '_, 'tcx>,
     val: CValue<'tcx>,
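The lane mapping implemented by `simd_horizontal_pair_for_each_lane` above: result lanes in the low half combine adjacent lanes of `x`, result lanes in the high half combine adjacent lanes of `y`. A scalar model of that indexing (illustrative only):

```rust
// Scalar model of the lane indexing in simd_horizontal_pair_for_each_lane.
fn horizontal_pair<T: Copy, const N: usize>(
    x: [T; N],
    y: [T; N],
    f: impl Fn(T, T) -> T,
) -> [T; N] {
    std::array::from_fn(|lane_idx| {
        let src = if lane_idx < N / 2 { &x } else { &y };
        let src_idx = lane_idx % (N / 2);
        f(src[src_idx * 2], src[src_idx * 2 + 1])
    })
}

fn main() {
    // Pairwise add, as used by the llvm.aarch64.neon.addp.v* lowering.
    let r = horizontal_pair([1, 2, 3, 4], [10, 20, 30, 40], |a, b| a + b);
    assert_eq!(r, [3, 7, 30, 70]);
}
```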
@@ -243,6 +243,34 @@ impl<'tcx> CValue<'tcx> {
         let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
         let lane_layout = fx.layout_of(lane_ty);
         assert!(lane_idx < lane_count);
+
+        match self.0 {
+            CValueInner::ByVal(_) | CValueInner::ByValPair(_, _) => unreachable!(),
+            CValueInner::ByRef(ptr, None) => {
+                let field_offset = lane_layout.size * lane_idx;
+                let field_ptr = ptr.offset_i64(fx, i64::try_from(field_offset.bytes()).unwrap());
+                CValue::by_ref(field_ptr, lane_layout)
+            }
+            CValueInner::ByRef(_, Some(_)) => unreachable!(),
+        }
+    }
+
+    /// Like [`CValue::value_field`] except using the passed type as lane type instead of the one
+    /// specified by the vector type.
+    pub(crate) fn value_typed_lane(
+        self,
+        fx: &mut FunctionCx<'_, '_, 'tcx>,
+        lane_ty: Ty<'tcx>,
+        lane_idx: u64,
+    ) -> CValue<'tcx> {
+        let layout = self.1;
+        assert!(layout.ty.is_simd());
+        let (orig_lane_count, orig_lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+        let lane_layout = fx.layout_of(lane_ty);
+        assert!(
+            (lane_idx + 1) * lane_layout.size <= orig_lane_count * fx.layout_of(orig_lane_ty).size
+        );
+
         match self.0 {
             CValueInner::ByVal(_) | CValueInner::ByValPair(_, _) => unreachable!(),
             CValueInner::ByRef(ptr, None) => {

@@ -734,6 +762,34 @@ impl<'tcx> CPlace<'tcx> {
             }
         }

+    /// Like [`CPlace::place_field`] except using the passed type as lane type instead of the one
+    /// specified by the vector type.
+    pub(crate) fn place_typed_lane(
+        self,
+        fx: &mut FunctionCx<'_, '_, 'tcx>,
+        lane_ty: Ty<'tcx>,
+        lane_idx: u64,
+    ) -> CPlace<'tcx> {
+        let layout = self.layout();
+        assert!(layout.ty.is_simd());
+        let (orig_lane_count, orig_lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+        let lane_layout = fx.layout_of(lane_ty);
+        assert!(
+            (lane_idx + 1) * lane_layout.size <= orig_lane_count * fx.layout_of(orig_lane_ty).size
+        );
+
+        match self.inner {
+            CPlaceInner::Var(_, _) => unreachable!(),
+            CPlaceInner::VarPair(_, _, _) => unreachable!(),
+            CPlaceInner::Addr(ptr, None) => {
+                let field_offset = lane_layout.size * lane_idx;
+                let field_ptr = ptr.offset_i64(fx, i64::try_from(field_offset.bytes()).unwrap());
+                CPlace::for_ptr(field_ptr, lane_layout)
+            }
+            CPlaceInner::Addr(_, Some(_)) => unreachable!(),
+        }
+    }
+
     pub(crate) fn place_index(
         self,
         fx: &mut FunctionCx<'_, '_, 'tcx>,
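`value_typed_lane` and `place_typed_lane` reinterpret a SIMD value in memory as lanes of a caller-chosen type (for example, two `u128` lanes of a 256-bit vector, as the vperm2f128 lowering does) via plain offset arithmetic, guarded by the size assertion shown above. The computation reduces to the following (illustrative sketch):

```rust
// The offset arithmetic behind {value,place}_typed_lane: lane_idx lanes of
// lane_size bytes each, bounds-checked against the vector's total size.
fn typed_lane_offset(lane_size: u64, lane_idx: u64, vector_size: u64) -> u64 {
    assert!((lane_idx + 1) * lane_size <= vector_size);
    lane_idx * lane_size
}

fn main() {
    // A 256-bit (32-byte) vector viewed as two u128 lanes sits at offsets 0 and 16.
    assert_eq!(typed_lane_offset(16, 0, 32), 0);
    assert_eq!(typed_lane_offset(16, 1, 32), 16);
}
```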