// Verify that SIMD mask reductions do not introduce additional bit shift operations.
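// The functions below are compiled for two revisions (x86_64 at its default SSE2
// baseline and aarch64), and the FileCheck patterns inspect the assembly emitted
// for each revision.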
//@ add-core-stubs
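// `add-core-stubs` makes the `minicore` stub crate (used below in place of `core`)
// available to this `#![no_core]` test.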
//@ revisions: x86 aarch64
//@ [x86] compile-flags: --target=x86_64-unknown-linux-gnu -C llvm-args=-x86-asm-syntax=intel
// Set the base cpu explicitly, in case the default has been changed.
//@ [x86] compile-flags: -C target-cpu=x86-64
//@ [x86] needs-llvm-components: x86
//@ [aarch64] compile-flags: --target=aarch64-unknown-linux-gnu
//@ [aarch64] needs-llvm-components: aarch64
//@ assembly-output: emit-asm
//@ compile-flags: --crate-type=lib -Copt-level=3 -C panic=abort

#![feature(no_core, lang_items, repr_simd, intrinsics)]
#![no_core]
#![allow(non_camel_case_types)]

extern crate minicore;
use minicore::*;
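// A 16-lane byte mask: the reductions below only look at each lane's most significant
// (sign) bit, so a lane counts as "set" when its value is negative.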
#[repr(simd)]
pub struct mask8x16([i8; 16]);
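// Local declarations of the mask reduction intrinsics, since `#![no_core]` code
// cannot pull them in from `core`.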
#[rustc_intrinsic]
unsafe fn simd_reduce_all<T>(x: T) -> bool;
#[rustc_intrinsic]
unsafe fn simd_reduce_any<T>(x: T) -> bool;
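// A hedged sketch, not part of the original test: in a normal crate the same
// reductions are reachable through the portable-SIMD `Mask` API, roughly as below
// (`all_set`/`any_set` are illustrative names, and `mask8x16` here is the
// `core::simd` alias rather than the local struct). It is written as a comment
// because this file is `#![no_core]`.
//
//     // assumes `#![feature(portable_simd)]` and a crate that links `core`
//     use core::simd::prelude::*;
//     fn all_set(m: mask8x16) -> bool { m.all() }
//     fn any_set(m: mask8x16) -> bool { m.any() }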
// CHECK-LABEL: mask_reduce_all:
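// On x86 the reduction should be a single `pmovmskb` (which already collects the sign
// bit of every byte) followed by a check that all 16 bits are set, with no preparatory
// `psllw`; on aarch64, `cmge` plus `umaxv` detects whether any sign bit is clear,
// again without a `shl`.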
#[no_mangle]
pub unsafe extern "C" fn mask_reduce_all(m: mask8x16) -> bool {
    // x86-NOT: psllw
    // x86: pmovmskb eax, xmm0
    // x86-NEXT: {{cmp ax, -1|cmp eax, 65535|xor eax, 65535}}
    // x86-NEXT: sete al
    //
    // aarch64-NOT: shl
    // aarch64: cmge v0.16b, v0.16b, #0
    // aarch64-DAG: mov [[REG1:[a-z0-9]+]], #1
    // aarch64-DAG: umaxv b0, v0.16b
    // aarch64-NEXT: fmov [[REG2:[a-z0-9]+]], s0
    // aarch64-NEXT: bic w0, [[REG1]], [[REG2]]
    simd_reduce_all(m)
}
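// For `any`, a nonzero `pmovmskb` bitmask (checked with `test eax, eax`) suffices on
// x86, while aarch64 uses `cmlt` plus `umaxv` to see whether any lane has its sign bit
// set; neither revision should need an extra shift.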
// CHECK-LABEL: mask_reduce_any:
#[no_mangle]
pub unsafe extern "C" fn mask_reduce_any(m: mask8x16) -> bool {
    // x86-NOT: psllw
    // x86: pmovmskb
    // x86-NEXT: test eax, eax
    // x86-NEXT: setne al
    //
    // aarch64-NOT: shl
    // aarch64: cmlt v0.16b, v0.16b, #0
    // aarch64-NEXT: umaxv b0, v0.16b
    // aarch64-NEXT: fmov [[REG:[a-z0-9]+]], s0
    // aarch64-NEXT: and w0, [[REG]], #0x1
    simd_reduce_any(m)
}