//@ add-core-stubs
//@ revisions: x86-avx2 x86-avx512
//@ [x86-avx2] compile-flags: --target=x86_64-unknown-linux-gnu -C llvm-args=-x86-asm-syntax=intel
//@ [x86-avx2] compile-flags: -C target-feature=+avx2
//@ [x86-avx2] needs-llvm-components: x86
//@ [x86-avx512] compile-flags: --target=x86_64-unknown-linux-gnu -C llvm-args=-x86-asm-syntax=intel
//@ [x86-avx512] compile-flags: -C target-feature=+avx512f,+avx512vl,+avx512bw,+avx512dq
//@ [x86-avx512] needs-llvm-components: x86
//@ assembly-output: emit-asm
//@ compile-flags: --crate-type=lib -Copt-level=3 -C panic=abort
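// Check that `simd_masked_load` lowers to hardware masked loads and that the mask is
// consumed through its most significant bit, so no extra shift into the sign position
// (vpsllw/vpslld/vpsllq) is emitted before the mask is used.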
#![feature(no_core, lang_items, repr_simd, intrinsics)]
#![no_core]
#![allow(non_camel_case_types)]

extern crate minicore;
use minicore::*;

#[repr(simd)]
pub struct i8x16([i8; 16]);

#[repr(simd)]
pub struct m8x16([i8; 16]);

#[repr(simd)]
pub struct f32x8([f32; 8]);

#[repr(simd)]
pub struct m32x8([i32; 8]);

#[repr(simd)]
pub struct f64x4([f64; 4]);

#[repr(simd)]
pub struct m64x4([i64; 4]);
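// `simd_masked_load(mask, pointer, values)` loads one element per enabled mask lane from
// `pointer` and takes the corresponding lane of `values` for disabled lanes.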
#[rustc_intrinsic]
unsafe fn simd_masked_load<M, P, T>(mask: M, pointer: P, values: T) -> T;

// CHECK-LABEL: load_i8x16
#[no_mangle]
pub unsafe extern "C" fn load_i8x16(mask: m8x16, pointer: *const i8) -> i8x16 {
    // Since avx2 does not support masked loads for bytes, the code tests each individual
    // mask bit and jumps to code that inserts individual bytes.
    // x86-avx2-NOT: vpsllw
    // x86-avx2-DAG: vpmovmskb eax
    // x86-avx2-DAG: vpxor
    // x86-avx2-NEXT: test al, 1
    // x86-avx2-NEXT: jne
    // x86-avx2-NEXT: test al, 2
    // x86-avx2-NEXT: jne
    // x86-avx2-DAG: movzx [[REG:[a-z]+]], byte ptr [rdi]
    // x86-avx2-NEXT: vmovd xmm0, [[REG]]
    // x86-avx2-DAG: vpinsrb xmm0, xmm0, byte ptr [rdi + 1], 1
    //
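    // With avx512, the byte mask is moved into a `k` mask register (vpmovb2m) and a single
    // zero-masking load replaces the bit-by-bit fallback.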
    // x86-avx512-NOT: vpsllw
    // x86-avx512: vpmovb2m k1, xmm0
    // x86-avx512-NEXT: vmovdqu8 xmm0 {k1} {z}, xmmword ptr [rdi]
    simd_masked_load(mask, pointer, i8x16([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))
}

// CHECK-LABEL: load_f32x8
#[no_mangle]
pub unsafe extern "C" fn load_f32x8(mask: m32x8, pointer: *const f32) -> f32x8 {
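    // avx2 has a dedicated masked load for 32-bit lanes, so the mask's sign bits feed
    // vmaskmovps directly and no per-bit fallback is needed.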
    // x86-avx2-NOT: vpslld
    // x86-avx2: vmaskmovps ymm0, ymm0, ymmword ptr [rdi]
    //
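    // avx512 moves the dword mask into a `k` register and performs a zero-masking load.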
    // x86-avx512-NOT: vpslld
    // x86-avx512: vpmovd2m k1, ymm0
    // x86-avx512-NEXT: vmovups ymm0 {k1} {z}, ymmword ptr [rdi]
    simd_masked_load(mask, pointer, f32x8([0_f32, 0_f32, 0_f32, 0_f32, 0_f32, 0_f32, 0_f32, 0_f32]))
}

// CHECK-LABEL: load_f64x4
#[no_mangle]
pub unsafe extern "C" fn load_f64x4(mask: m64x4, pointer: *const f64) -> f64x4 {
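    // 64-bit lanes follow the same pattern: vmaskmovpd on avx2, vpmovq2m plus a masked
    // load on avx512.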
    // x86-avx2-NOT: vpsllq
    // x86-avx2: vmaskmovpd ymm0, ymm0, ymmword ptr [rdi]
    //
    // x86-avx512-NOT: vpsllq
    // x86-avx512: vpmovq2m k1, ymm0
    simd_masked_load(mask, pointer, f64x4([0_f64, 0_f64, 0_f64, 0_f64]))
}