Add more SIMD
This commit is contained in:
parent
2fa6a9080b
commit
06073c9dfc
3 changed files with 362 additions and 83 deletions
|
@ -286,10 +286,12 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
|
|||
|
||||
if return_type != void_type {
|
||||
unsafe { RETURN_VALUE_COUNT += 1 };
|
||||
let result = current_func.new_local(None, return_type, &format!("ptrReturnValue{}", unsafe { RETURN_VALUE_COUNT }));
|
||||
let func_name = format!("{:?}", func_ptr);
|
||||
let args = llvm::adjust_intrinsic_arguments(&self, gcc_func, args, &func_name);
|
||||
self.block.add_assignment(None, result, self.cx.context.new_call_through_ptr(None, func_ptr, &args));
|
||||
let return_value = self.cx.context.new_call_through_ptr(None, func_ptr, &args);
|
||||
let return_value = llvm::adjust_intrinsic_return_value(&self, return_value, &func_name, &args);
|
||||
let result = current_func.new_local(None, return_value.get_type(), &format!("ptrReturnValue{}", unsafe { RETURN_VALUE_COUNT }));
|
||||
self.block.add_assignment(None, result, return_value);
|
||||
result.to_rvalue()
|
||||
}
|
||||
else {
|
||||
|
|
|
@ -15,7 +15,17 @@ pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc
|
|||
| "__builtin_ia32_pmaxsq128_mask" | "__builtin_ia32_pmaxud512_mask" | "__builtin_ia32_pmaxuq512_mask"
|
||||
| "__builtin_ia32_pminsd512_mask" | "__builtin_ia32_pminsq512_mask" | "__builtin_ia32_pminsq256_mask"
|
||||
| "__builtin_ia32_pminsq128_mask" | "__builtin_ia32_pminud512_mask" | "__builtin_ia32_pminuq512_mask"
|
||||
| "__builtin_ia32_prolq512_mask" | "__builtin_ia32_prorq512_mask"
|
||||
| "__builtin_ia32_prolq512_mask" | "__builtin_ia32_prorq512_mask" | "__builtin_ia32_pslldi512_mask"
|
||||
| "__builtin_ia32_psrldi512_mask" | "__builtin_ia32_psllqi512_mask" | "__builtin_ia32_psrlqi512_mask"
|
||||
| "__builtin_ia32_pslld512_mask" | "__builtin_ia32_psrld512_mask" | "__builtin_ia32_psllq512_mask"
|
||||
| "__builtin_ia32_psrlq512_mask" | "__builtin_ia32_psrad512_mask" | "__builtin_ia32_psraq512_mask"
|
||||
| "__builtin_ia32_psradi512_mask" | "__builtin_ia32_psraqi512_mask" | "__builtin_ia32_psrav16si_mask"
|
||||
| "__builtin_ia32_psrav8di_mask" | "__builtin_ia32_prolvd512_mask" | "__builtin_ia32_prorvd512_mask"
|
||||
| "__builtin_ia32_prolvq512_mask" | "__builtin_ia32_prorvq512_mask" | "__builtin_ia32_psllv16si_mask"
|
||||
| "__builtin_ia32_psrlv16si_mask" | "__builtin_ia32_psllv8di_mask" | "__builtin_ia32_psrlv8di_mask"
|
||||
| "__builtin_ia32_permvarsi512_mask" | "__builtin_ia32_vpermilvarps512_mask"
|
||||
| "__builtin_ia32_vpermilvarpd512_mask" | "__builtin_ia32_permvardi512_mask"
|
||||
| "__builtin_ia32_permvarsf512_mask"
|
||||
=> {
|
||||
let mut new_args = args.to_vec();
|
||||
let arg3_type = gcc_func.get_param_type(2);
|
||||
|
@ -30,7 +40,12 @@ pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc
|
|||
| "__builtin_ia32_pminuq128_mask" | "__builtin_ia32_prold256_mask" | "__builtin_ia32_prold128_mask"
|
||||
| "__builtin_ia32_prord512_mask" | "__builtin_ia32_prord256_mask" | "__builtin_ia32_prord128_mask"
|
||||
| "__builtin_ia32_prolq256_mask" | "__builtin_ia32_prolq128_mask" | "__builtin_ia32_prorq256_mask"
|
||||
| "__builtin_ia32_prorq128_mask"
|
||||
| "__builtin_ia32_prorq128_mask" | "__builtin_ia32_psraq256_mask" | "__builtin_ia32_psraq128_mask"
|
||||
| "__builtin_ia32_psraqi256_mask" | "__builtin_ia32_psraqi128_mask" | "__builtin_ia32_psravq256_mask"
|
||||
| "__builtin_ia32_psravq128_mask" | "__builtin_ia32_prolvd256_mask" | "__builtin_ia32_prolvd128_mask"
|
||||
| "__builtin_ia32_prorvd256_mask" | "__builtin_ia32_prorvd128_mask" | "__builtin_ia32_prolvq256_mask"
|
||||
| "__builtin_ia32_prolvq128_mask" | "__builtin_ia32_prorvq256_mask" | "__builtin_ia32_prorvq128_mask"
|
||||
| "__builtin_ia32_permvardi256_mask" | "__builtin_ia32_permvardf512_mask" | "__builtin_ia32_permvardf256_mask"
|
||||
=> {
|
||||
let mut new_args = args.to_vec();
|
||||
let arg3_type = gcc_func.get_param_type(2);
|
||||
|
@ -105,6 +120,18 @@ pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc
|
|||
new_args.push(last_arg);
|
||||
args = new_args.into();
|
||||
},
|
||||
"__builtin_ia32_vpermi2vard512_mask" | "__builtin_ia32_vpermi2vard256_mask"
|
||||
| "__builtin_ia32_vpermi2vard128_mask" | "__builtin_ia32_vpermi2varq512_mask"
|
||||
| "__builtin_ia32_vpermi2varq256_mask" | "__builtin_ia32_vpermi2varq128_mask"
|
||||
| "__builtin_ia32_vpermi2varps512_mask" | "__builtin_ia32_vpermi2varps256_mask"
|
||||
| "__builtin_ia32_vpermi2varps128_mask" | "__builtin_ia32_vpermi2varpd512_mask"
|
||||
| "__builtin_ia32_vpermi2varpd256_mask" | "__builtin_ia32_vpermi2varpd128_mask" => {
|
||||
let mut new_args = args.to_vec();
|
||||
let arg4_type = gcc_func.get_param_type(3);
|
||||
let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1);
|
||||
new_args.push(minus_one);
|
||||
args = new_args.into();
|
||||
},
|
||||
"__builtin_ia32_cvtdq2ps512_mask" | "__builtin_ia32_cvtudq2ps512_mask"
|
||||
| "__builtin_ia32_sqrtps512_mask" | "__builtin_ia32_sqrtpd512_mask" => {
|
||||
let mut new_args = args.to_vec();
|
||||
|
@ -118,6 +145,52 @@ pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc
|
|||
new_args.push(last_arg);
|
||||
args = new_args.into();
|
||||
},
|
||||
"__builtin_ia32_stmxcsr" => {
|
||||
args = vec![].into();
|
||||
},
|
||||
"__builtin_ia32_addcarryx_u64" => {
|
||||
let mut new_args = args.to_vec();
|
||||
let arg2_type = gcc_func.get_param_type(1);
|
||||
let variable = builder.current_func().new_local(None, arg2_type, "addcarryResult");
|
||||
new_args.push(variable.get_address(None));
|
||||
args = new_args.into();
|
||||
},
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
else {
|
||||
match &*func_name {
|
||||
"__builtin_ia32_rndscaless_mask_round" | "__builtin_ia32_rndscalesd_mask_round" => {
|
||||
let new_args = args.to_vec();
|
||||
let arg3_type = gcc_func.get_param_type(2);
|
||||
let arg3 = builder.context.new_cast(None, new_args[4], arg3_type);
|
||||
let arg4_type = gcc_func.get_param_type(3);
|
||||
let arg4 = builder.context.new_bitcast(None, new_args[2], arg4_type);
|
||||
args = vec![new_args[0], new_args[1], arg3, arg4, new_args[3], new_args[5]].into();
|
||||
},
|
||||
// NOTE: the LLVM intrinsic receives 3 floats, but the GCC builtin requires 3 vectors.
|
||||
// FIXME: the intrinsics like _mm_mask_fmadd_sd should probably directly call the GCC
|
||||
// intrinsic to avoid this.
|
||||
"__builtin_ia32_vfmaddss3_round" => {
|
||||
let new_args = args.to_vec();
|
||||
let arg1_type = gcc_func.get_param_type(0);
|
||||
let arg2_type = gcc_func.get_param_type(1);
|
||||
let arg3_type = gcc_func.get_param_type(2);
|
||||
let a = builder.context.new_rvalue_from_vector(None, arg1_type, &[new_args[0]; 4]);
|
||||
let b = builder.context.new_rvalue_from_vector(None, arg2_type, &[new_args[1]; 4]);
|
||||
let c = builder.context.new_rvalue_from_vector(None, arg3_type, &[new_args[2]; 4]);
|
||||
args = vec![a, b, c, new_args[3]].into();
|
||||
},
|
||||
"__builtin_ia32_vfmaddsd3_round" => {
|
||||
let new_args = args.to_vec();
|
||||
let arg1_type = gcc_func.get_param_type(0);
|
||||
let arg2_type = gcc_func.get_param_type(1);
|
||||
let arg3_type = gcc_func.get_param_type(2);
|
||||
let a = builder.context.new_rvalue_from_vector(None, arg1_type, &[new_args[0]; 2]);
|
||||
let b = builder.context.new_rvalue_from_vector(None, arg2_type, &[new_args[1]; 2]);
|
||||
let c = builder.context.new_rvalue_from_vector(None, arg3_type, &[new_args[2]; 2]);
|
||||
args = vec![a, b, c, new_args[3]].into();
|
||||
},
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
|
@ -125,11 +198,30 @@ pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc
|
|||
args
|
||||
}
|
||||
|
||||
// Post-processes the value returned by a call to a GCC builtin so that it has the shape the
// caller expects for the corresponding intrinsic.
//
// `return_value` is the raw result of the builtin call, `func_name` selects which adjustment (if
// any) applies, and `args` are the arguments that were passed to the call (only the last one is
// read, and only for `__builtin_ia32_addcarryx_u64`). For any other builtin name, `return_value`
// is returned unchanged.
pub fn adjust_intrinsic_return_value<'a, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc, 'tcx>, mut return_value: RValue<'gcc>, func_name: &str, args: &[RValue<'gcc>]) -> RValue<'gcc> {
|
||||
match func_name {
|
||||
// The builtin returns a vector: keep only its element at index 0.
// NOTE(review): presumably this mirrors the scalar-to-vector argument expansion done for
// these builtins in adjust_intrinsic_arguments; confirm against the caller.
"__builtin_ia32_vfmaddss3_round" | "__builtin_ia32_vfmaddsd3_round" => {
|
||||
let zero = builder.context.new_rvalue_zero(builder.int_type);
|
||||
return_value = builder.context.new_vector_access(None, return_value, zero).to_rvalue();
|
||||
},
|
||||
// Pack the builtin's return value together with the value pointed to by the last argument
// into a two-field struct (`carryFlag`: u8, `carryResult`: unsigned long long).
"__builtin_ia32_addcarryx_u64" => {
|
||||
// Panics with "last arg" if `args` is empty.
let last_arg = args.last().expect("last arg");
|
||||
let field1 = builder.context.new_field(None, builder.u8_type, "carryFlag");
|
||||
let field2 = builder.context.new_field(None, builder.ulonglong_type, "carryResult");
|
||||
let struct_type = builder.context.new_struct_type(None, "addcarryResult", &[field1, field2]);
|
||||
// The last argument is dereferenced, so it is expected to be a pointer to the location the
// builtin wrote its result into.
return_value = builder.context.new_struct_constructor(None, struct_type.as_type(), None, &[return_value, last_arg.dereference(None).to_rvalue()]);
|
||||
},
|
||||
_ => (),
|
||||
}
|
||||
|
||||
return_value
|
||||
}
|
||||
|
||||
pub fn ignore_arg_cast(func_name: &str, index: usize, args_len: usize) -> bool {
|
||||
// NOTE: these intrinsics have missing parameters before the last one, so ignore the
|
||||
// last argument type check.
|
||||
// FIXME(antoyo): find a way to refactor in order to avoid this hack.
|
||||
match func_name {
|
||||
// NOTE: these intrinsics have missing parameters before the last one, so ignore the
|
||||
// last argument type check.
|
||||
"__builtin_ia32_maxps512_mask" | "__builtin_ia32_maxpd512_mask"
|
||||
| "__builtin_ia32_minps512_mask" | "__builtin_ia32_minpd512_mask" | "__builtin_ia32_sqrtps512_mask"
|
||||
| "__builtin_ia32_sqrtpd512_mask" | "__builtin_ia32_addps512_mask" | "__builtin_ia32_addpd512_mask"
|
||||
|
@ -142,6 +234,11 @@ pub fn ignore_arg_cast(func_name: &str, index: usize, args_len: usize) -> bool {
|
|||
return true;
|
||||
}
|
||||
},
|
||||
"__builtin_ia32_rndscaless_mask_round" | "__builtin_ia32_rndscalesd_mask_round" => {
|
||||
if index == 2 || index == 3 {
|
||||
return true;
|
||||
}
|
||||
},
|
||||
"__builtin_ia32_vfmaddps512_mask" | "__builtin_ia32_vfmaddpd512_mask" => {
|
||||
// Since there are two LLVM intrinsics that map to each of these GCC builtins and only
|
||||
// one of them has a missing parameter before the last one, we check the number of
|
||||
|
@ -150,6 +247,8 @@ pub fn ignore_arg_cast(func_name: &str, index: usize, args_len: usize) -> bool {
|
|||
return true;
|
||||
}
|
||||
},
|
||||
// NOTE: the LLVM intrinsic receives 3 floats, but the GCC builtin requires 3 vectors.
|
||||
"__builtin_ia32_vfmaddss3_round" | "__builtin_ia32_vfmaddsd3_round" => return true,
|
||||
_ => (),
|
||||
}
|
||||
|
||||
|
@ -203,6 +302,47 @@ pub fn intrinsic<'gcc, 'tcx>(name: &str, cx: &CodegenCx<'gcc, 'tcx>) -> Function
|
|||
"llvm.x86.avx512.vfmadd.pd.512" => "__builtin_ia32_vfmaddpd512_mask",
|
||||
"llvm.x86.avx512.sitofp.round.v16f32.v16i32" => "__builtin_ia32_cvtdq2ps512_mask",
|
||||
"llvm.x86.avx512.uitofp.round.v16f32.v16i32" => "__builtin_ia32_cvtudq2ps512_mask",
|
||||
"llvm.x86.avx512.mask.ucmp.d.512" => "__builtin_ia32_ucmpd512_mask",
|
||||
"llvm.x86.avx512.mask.ucmp.d.256" => "__builtin_ia32_ucmpd256_mask",
|
||||
"llvm.x86.avx512.mask.ucmp.d.128" => "__builtin_ia32_ucmpd128_mask",
|
||||
"llvm.x86.avx512.mask.cmp.d.512" => "__builtin_ia32_cmpd512_mask",
|
||||
"llvm.x86.avx512.mask.cmp.d.256" => "__builtin_ia32_cmpd256_mask",
|
||||
"llvm.x86.avx512.mask.cmp.d.128" => "__builtin_ia32_cmpd128_mask",
|
||||
"llvm.x86.avx512.mask.ucmp.q.512" => "__builtin_ia32_ucmpq512_mask",
|
||||
"llvm.x86.avx512.mask.ucmp.q.256" => "__builtin_ia32_ucmpq256_mask",
|
||||
"llvm.x86.avx512.mask.ucmp.q.128" => "__builtin_ia32_ucmpq128_mask",
|
||||
"llvm.x86.avx512.mask.cmp.q.512" => "__builtin_ia32_cmpq512_mask",
|
||||
"llvm.x86.avx512.mask.cmp.q.256" => "__builtin_ia32_cmpq256_mask",
|
||||
"llvm.x86.avx512.mask.cmp.q.128" => "__builtin_ia32_cmpq128_mask",
|
||||
"llvm.x86.avx512.mask.max.ss.round" => "__builtin_ia32_maxss_mask_round",
|
||||
"llvm.x86.avx512.mask.max.sd.round" => "__builtin_ia32_maxsd_mask_round",
|
||||
"llvm.x86.avx512.mask.min.ss.round" => "__builtin_ia32_minss_mask_round",
|
||||
"llvm.x86.avx512.mask.min.sd.round" => "__builtin_ia32_minsd_mask_round",
|
||||
"llvm.x86.avx512.mask.sqrt.ss" => "__builtin_ia32_sqrtss_mask_round",
|
||||
"llvm.x86.avx512.mask.sqrt.sd" => "__builtin_ia32_sqrtsd_mask_round",
|
||||
"llvm.x86.avx512.mask.getexp.ss" => "__builtin_ia32_getexpss_mask_round",
|
||||
"llvm.x86.avx512.mask.getexp.sd" => "__builtin_ia32_getexpsd_mask_round",
|
||||
"llvm.x86.avx512.mask.getmant.ss" => "__builtin_ia32_getmantss_mask_round",
|
||||
"llvm.x86.avx512.mask.getmant.sd" => "__builtin_ia32_getmantsd_mask_round",
|
||||
"llvm.x86.avx512.mask.rndscale.ss" => "__builtin_ia32_rndscaless_mask_round",
|
||||
"llvm.x86.avx512.mask.rndscale.sd" => "__builtin_ia32_rndscalesd_mask_round",
|
||||
"llvm.x86.avx512.mask.scalef.ss" => "__builtin_ia32_scalefss_mask_round",
|
||||
"llvm.x86.avx512.mask.scalef.sd" => "__builtin_ia32_scalefsd_mask_round",
|
||||
"llvm.x86.avx512.vfmadd.f32" => "__builtin_ia32_vfmaddss3_round",
|
||||
"llvm.x86.avx512.vfmadd.f64" => "__builtin_ia32_vfmaddsd3_round",
|
||||
"llvm.ceil.v4f64" => "__builtin_ia32_ceilpd256",
|
||||
"llvm.ceil.v8f32" => "__builtin_ia32_ceilps256",
|
||||
"llvm.floor.v4f64" => "__builtin_ia32_floorpd256",
|
||||
"llvm.floor.v8f32" => "__builtin_ia32_floorps256",
|
||||
"llvm.sqrt.v4f64" => "__builtin_ia32_sqrtpd256",
|
||||
"llvm.x86.sse.stmxcsr" => "__builtin_ia32_stmxcsr",
|
||||
"llvm.x86.sse.ldmxcsr" => "__builtin_ia32_ldmxcsr",
|
||||
"llvm.ctpop.v16i32" => "__builtin_ia32_vpopcountd_v16si",
|
||||
"llvm.ctpop.v8i32" => "__builtin_ia32_vpopcountd_v8si",
|
||||
"llvm.ctpop.v4i32" => "__builtin_ia32_vpopcountd_v4si",
|
||||
"llvm.ctpop.v8i64" => "__builtin_ia32_vpopcountq_v8di",
|
||||
"llvm.ctpop.v4i64" => "__builtin_ia32_vpopcountq_v4di",
|
||||
"llvm.ctpop.v2i64" => "__builtin_ia32_vpopcountq_v2di",
|
||||
|
||||
// The above doc points to unknown builtins for the following, so override them:
|
||||
"llvm.x86.avx2.gather.d.d" => "__builtin_ia32_gathersiv4si",
|
||||
|
@ -221,7 +361,70 @@ pub fn intrinsic<'gcc, 'tcx>(name: &str, cx: &CodegenCx<'gcc, 'tcx>) -> Function
|
|||
"llvm.x86.avx2.gather.q.q.256" => "__builtin_ia32_gatherdiv4di",
|
||||
"llvm.x86.avx2.gather.q.pd" => "__builtin_ia32_gatherdiv2df",
|
||||
"llvm.x86.avx2.gather.q.pd.256" => "__builtin_ia32_gatherdiv4df",
|
||||
"" => "",
|
||||
"llvm.x86.avx512.pslli.d.512" => "__builtin_ia32_pslldi512_mask",
|
||||
"llvm.x86.avx512.psrli.d.512" => "__builtin_ia32_psrldi512_mask",
|
||||
"llvm.x86.avx512.pslli.q.512" => "__builtin_ia32_psllqi512_mask",
|
||||
"llvm.x86.avx512.psrli.q.512" => "__builtin_ia32_psrlqi512_mask",
|
||||
"llvm.x86.avx512.psll.d.512" => "__builtin_ia32_pslld512_mask",
|
||||
"llvm.x86.avx512.psrl.d.512" => "__builtin_ia32_psrld512_mask",
|
||||
"llvm.x86.avx512.psll.q.512" => "__builtin_ia32_psllq512_mask",
|
||||
"llvm.x86.avx512.psrl.q.512" => "__builtin_ia32_psrlq512_mask",
|
||||
"llvm.x86.avx512.psra.d.512" => "__builtin_ia32_psrad512_mask",
|
||||
"llvm.x86.avx512.psra.q.512" => "__builtin_ia32_psraq512_mask",
|
||||
"llvm.x86.avx512.psra.q.256" => "__builtin_ia32_psraq256_mask",
|
||||
"llvm.x86.avx512.psra.q.128" => "__builtin_ia32_psraq128_mask",
|
||||
"llvm.x86.avx512.psrai.d.512" => "__builtin_ia32_psradi512_mask",
|
||||
"llvm.x86.avx512.psrai.q.512" => "__builtin_ia32_psraqi512_mask",
|
||||
"llvm.x86.avx512.psrai.q.256" => "__builtin_ia32_psraqi256_mask",
|
||||
"llvm.x86.avx512.psrai.q.128" => "__builtin_ia32_psraqi128_mask",
|
||||
"llvm.x86.avx512.psrav.d.512" => "__builtin_ia32_psrav16si_mask",
|
||||
"llvm.x86.avx512.psrav.q.512" => "__builtin_ia32_psrav8di_mask",
|
||||
"llvm.x86.avx512.psrav.q.256" => "__builtin_ia32_psravq256_mask",
|
||||
"llvm.x86.avx512.psrav.q.128" => "__builtin_ia32_psravq128_mask",
|
||||
"llvm.x86.avx512.psllv.d.512" => "__builtin_ia32_psllv16si_mask",
|
||||
"llvm.x86.avx512.psrlv.d.512" => "__builtin_ia32_psrlv16si_mask",
|
||||
"llvm.x86.avx512.psllv.q.512" => "__builtin_ia32_psllv8di_mask",
|
||||
"llvm.x86.avx512.psrlv.q.512" => "__builtin_ia32_psrlv8di_mask",
|
||||
"llvm.x86.avx512.permvar.si.512" => "__builtin_ia32_permvarsi512_mask",
|
||||
"llvm.x86.avx512.vpermilvar.ps.512" => "__builtin_ia32_vpermilvarps512_mask",
|
||||
"llvm.x86.avx512.vpermilvar.pd.512" => "__builtin_ia32_vpermilvarpd512_mask",
|
||||
"llvm.x86.avx512.permvar.di.512" => "__builtin_ia32_permvardi512_mask",
|
||||
"llvm.x86.avx512.permvar.di.256" => "__builtin_ia32_permvardi256_mask",
|
||||
"llvm.x86.avx512.permvar.sf.512" => "__builtin_ia32_permvarsf512_mask",
|
||||
"llvm.x86.avx512.permvar.df.512" => "__builtin_ia32_permvardf512_mask",
|
||||
"llvm.x86.avx512.permvar.df.256" => "__builtin_ia32_permvardf256_mask",
|
||||
"llvm.x86.avx512.vpermi2var.d.512" => "__builtin_ia32_vpermi2vard512_mask",
|
||||
"llvm.x86.avx512.vpermi2var.d.256" => "__builtin_ia32_vpermi2vard256_mask",
|
||||
"llvm.x86.avx512.vpermi2var.d.128" => "__builtin_ia32_vpermi2vard128_mask",
|
||||
"llvm.x86.avx512.vpermi2var.q.512" => "__builtin_ia32_vpermi2varq512_mask",
|
||||
"llvm.x86.avx512.vpermi2var.q.256" => "__builtin_ia32_vpermi2varq256_mask",
|
||||
"llvm.x86.avx512.vpermi2var.q.128" => "__builtin_ia32_vpermi2varq128_mask",
|
||||
"llvm.x86.avx512.vpermi2var.ps.512" => "__builtin_ia32_vpermi2varps512_mask",
|
||||
"llvm.x86.avx512.vpermi2var.ps.256" => "__builtin_ia32_vpermi2varps256_mask",
|
||||
"llvm.x86.avx512.vpermi2var.ps.128" => "__builtin_ia32_vpermi2varps128_mask",
|
||||
"llvm.x86.avx512.vpermi2var.pd.512" => "__builtin_ia32_vpermi2varpd512_mask",
|
||||
"llvm.x86.avx512.vpermi2var.pd.256" => "__builtin_ia32_vpermi2varpd256_mask",
|
||||
"llvm.x86.avx512.vpermi2var.pd.128" => "__builtin_ia32_vpermi2varpd128_mask",
|
||||
"llvm.x86.avx512.mask.add.ss.round" => "__builtin_ia32_addss_mask_round",
|
||||
"llvm.x86.avx512.mask.add.sd.round" => "__builtin_ia32_addsd_mask_round",
|
||||
"llvm.x86.avx512.mask.sub.ss.round" => "__builtin_ia32_subss_mask_round",
|
||||
"llvm.x86.avx512.mask.sub.sd.round" => "__builtin_ia32_subsd_mask_round",
|
||||
"llvm.x86.avx512.mask.mul.ss.round" => "__builtin_ia32_mulss_mask_round",
|
||||
"llvm.x86.avx512.mask.mul.sd.round" => "__builtin_ia32_mulsd_mask_round",
|
||||
"llvm.x86.avx512.mask.div.ss.round" => "__builtin_ia32_divss_mask_round",
|
||||
"llvm.x86.avx512.mask.div.sd.round" => "__builtin_ia32_divsd_mask_round",
|
||||
"llvm.x86.avx512.mask.cvtss2sd.round" => "__builtin_ia32_cvtss2sd_mask_round",
|
||||
"llvm.x86.avx512.mask.cvtsd2ss.round" => "__builtin_ia32_cvtsd2ss_mask_round",
|
||||
"llvm.x86.aesni.aesenc.256" => "__builtin_ia32_vaesenc_v32qi",
|
||||
"llvm.x86.aesni.aesenclast.256" => "__builtin_ia32_vaesenclast_v32qi",
|
||||
"llvm.x86.aesni.aesdec.256" => "__builtin_ia32_vaesdec_v32qi",
|
||||
"llvm.x86.aesni.aesdeclast.256" => "__builtin_ia32_vaesdeclast_v32qi",
|
||||
"llvm.x86.aesni.aesenc.512" => "__builtin_ia32_vaesenc_v64qi",
|
||||
"llvm.x86.aesni.aesenclast.512" => "__builtin_ia32_vaesenclast_v64qi",
|
||||
"llvm.x86.aesni.aesdec.512" => "__builtin_ia32_vaesdec_v64qi",
|
||||
"llvm.x86.aesni.aesdeclast.512" => "__builtin_ia32_vaesdeclast_v64qi",
|
||||
"llvm.x86.addcarry.64" => "__builtin_ia32_addcarryx_u64",
|
||||
|
||||
// NOTE: this file is generated by https://github.com/GuillaumeGomez/llvmint/blob/master/generate_list.py
|
||||
_ => include!("archs.rs"),
|
||||
};
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
use gccjit::{BinaryOp, RValue, Type, ToRValue};
|
||||
use gccjit::{BinaryOp, RValue, Type, ToRValue, ComparisonOp, UnaryOp};
|
||||
use rustc_codegen_ssa::base::compare_simd_types;
|
||||
use rustc_codegen_ssa::common::{TypeKind, span_invalid_monomorphization_error};
|
||||
use rustc_codegen_ssa::mir::operand::OperandRef;
|
||||
|
@ -213,48 +213,12 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(bx: &mut Builder<'a, 'gcc, 'tcx>,
|
|||
let vector = args[0].immediate();
|
||||
let index = args[1].immediate();
|
||||
let value = args[2].immediate();
|
||||
// TODO(antoyo): use a recursive unqualified() here.
|
||||
let vector_type = vector.get_type().unqualified().dyncast_vector().expect("vector type");
|
||||
let element_type = vector_type.get_element_type();
|
||||
// NOTE: we cannot cast to an array and assign to its element here because the value might
|
||||
// not be an l-value. So, call a builtin to set the element.
|
||||
// TODO(antoyo): perhaps we could create a new vector or maybe there's a GIMPLE instruction for that?
|
||||
// TODO(antoyo): don't use target specific builtins here.
|
||||
let func_name =
|
||||
match in_len {
|
||||
2 => {
|
||||
if element_type == bx.i64_type {
|
||||
"__builtin_ia32_vec_set_v2di"
|
||||
}
|
||||
else {
|
||||
unimplemented!();
|
||||
}
|
||||
},
|
||||
4 => {
|
||||
if element_type == bx.i32_type {
|
||||
"__builtin_ia32_vec_set_v4si"
|
||||
}
|
||||
else {
|
||||
unimplemented!();
|
||||
}
|
||||
},
|
||||
8 => {
|
||||
if element_type == bx.i16_type {
|
||||
"__builtin_ia32_vec_set_v8hi"
|
||||
}
|
||||
else {
|
||||
unimplemented!();
|
||||
}
|
||||
},
|
||||
_ => unimplemented!("Len: {}", in_len),
|
||||
};
|
||||
let builtin = bx.context.get_target_builtin_function(func_name);
|
||||
let param1_type = builtin.get_param(0).to_rvalue().get_type();
|
||||
// TODO(antoyo): perhaps use __builtin_convertvector for vector casting.
|
||||
let vector = bx.cx.bitcast_if_needed(vector, param1_type);
|
||||
let result = bx.context.new_call(None, builtin, &[vector, value, bx.context.new_cast(None, index, bx.int_type)]);
|
||||
// TODO(antoyo): perhaps use __builtin_convertvector for vector casting.
|
||||
return Ok(bx.context.new_bitcast(None, result, vector.get_type()));
|
||||
let variable = bx.current_func().new_local(None, vector.get_type(), "new_vector");
|
||||
bx.llbb().add_assignment(None, variable, vector);
|
||||
let lvalue = bx.context.new_vector_access(None, variable.to_rvalue(), index);
|
||||
// TODO: if simd_insert is constant, use BIT_REF…
|
||||
bx.llbb().add_assignment(None, lvalue, value);
|
||||
return Ok(variable.to_rvalue());
|
||||
}
|
||||
|
||||
#[cfg(feature="master")]
|
||||
|
@ -357,6 +321,67 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(bx: &mut Builder<'a, 'gcc, 'tcx>,
|
|||
}
|
||||
}
|
||||
|
||||
if name == sym::simd_bitmask {
|
||||
// The `fn simd_bitmask(vector) -> unsigned integer` intrinsic takes a
|
||||
// vector mask and returns the most significant bit (MSB) of each lane in the form
|
||||
// of either:
|
||||
// * an unsigned integer
|
||||
// * an array of `u8`
|
||||
// If the vector has less than 8 lanes, a u8 is returned with zeroed trailing bits.
|
||||
//
|
||||
// The bit order of the result depends on the byte endianness, LSB-first for little
|
||||
// endian and MSB-first for big endian.
|
||||
|
||||
let vector = args[0].immediate();
|
||||
let vector_type = vector.get_type().dyncast_vector().expect("vector type");
|
||||
let elem_type = vector_type.get_element_type();
|
||||
let mut shifts = vec![];
|
||||
let mut masks = vec![];
|
||||
let mut mask = 1;
|
||||
for i in 0..in_len {
|
||||
shifts.push(bx.context.new_rvalue_from_int(elem_type, i as i32));
|
||||
masks.push(bx.context.new_rvalue_from_int(elem_type, mask));
|
||||
mask <<= 1;
|
||||
}
|
||||
masks.reverse();
|
||||
let shifts = bx.context.new_rvalue_from_vector(None, vector.get_type(), &shifts);
|
||||
let shifted = vector >> shifts;
|
||||
let masks = bx.context.new_rvalue_from_vector(None, vector.get_type(), &masks);
|
||||
let masked = shifted & masks;
|
||||
let reduced = bx.vector_reduce_op(masked, BinaryOp::BitwiseOr);
|
||||
|
||||
let expected_int_bits = in_len.max(8);
|
||||
let expected_bytes = expected_int_bits / 8 + ((expected_int_bits % 8 > 0) as u64);
|
||||
|
||||
match ret_ty.kind() {
|
||||
ty::Uint(i) if i.bit_width() == Some(expected_int_bits) => {
|
||||
// Zero-extend iN to the bitmask type:
|
||||
return Ok(bx.zext(reduced, bx.type_ix(expected_int_bits)));
|
||||
}
|
||||
ty::Array(elem, len)
|
||||
if matches!(elem.kind(), ty::Uint(ty::UintTy::U8))
|
||||
&& len.try_eval_usize(bx.tcx, ty::ParamEnv::reveal_all())
|
||||
== Some(expected_bytes) =>
|
||||
{
|
||||
// Zero-extend iN to the array length:
|
||||
let ze = bx.zext(reduced, bx.type_ix(expected_bytes * 8));
|
||||
|
||||
// Convert the integer to a byte array
|
||||
let ptr = bx.alloca(bx.type_ix(expected_bytes * 8), Align::ONE);
|
||||
bx.store(ze, ptr, Align::ONE);
|
||||
let array_ty = bx.type_array(bx.type_i8(), expected_bytes);
|
||||
let ptr = bx.pointercast(ptr, bx.cx.type_ptr_to(array_ty));
|
||||
return Ok(bx.load(array_ty, ptr, Align::ONE));
|
||||
}
|
||||
_ => return_error!(
|
||||
"cannot return `{}`, expected `u{}` or `[u8; {}]`",
|
||||
ret_ty,
|
||||
expected_int_bits,
|
||||
expected_bytes
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
fn simd_simple_float_intrinsic<'gcc, 'tcx>(
|
||||
name: Symbol,
|
||||
in_elem: Ty<'_>,
|
||||
|
@ -496,9 +521,10 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(bx: &mut Builder<'a, 'gcc, 'tcx>,
|
|||
let rhs = args[1].immediate();
|
||||
let is_add = name == sym::simd_saturating_add;
|
||||
let ptr_bits = bx.tcx().data_layout.pointer_size.bits() as _;
|
||||
let (signed, elem_width, elem_ty) = match *in_elem.kind() {
|
||||
ty::Int(i) => (true, i.bit_width().unwrap_or(ptr_bits), bx.cx.type_int_from_ty(i)),
|
||||
ty::Uint(i) => (false, i.bit_width().unwrap_or(ptr_bits), bx.cx.type_uint_from_ty(i)),
|
||||
let (signed, elem_width, elem_ty) =
|
||||
match *in_elem.kind() {
|
||||
ty::Int(i) => (true, i.bit_width().unwrap_or(ptr_bits) / 8, bx.cx.type_int_from_ty(i)),
|
||||
ty::Uint(i) => (false, i.bit_width().unwrap_or(ptr_bits) / 8, bx.cx.type_uint_from_ty(i)),
|
||||
_ => {
|
||||
return_error!(
|
||||
"expected element type `{}` of vector type `{}` \
|
||||
|
@ -508,30 +534,78 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(bx: &mut Builder<'a, 'gcc, 'tcx>,
|
|||
);
|
||||
}
|
||||
};
|
||||
// TODO(antoyo): don't use target specific builtins here.
|
||||
// Not sure how easy it would be to avoid them here.
|
||||
let builtin_name =
|
||||
match (signed, is_add, in_len, elem_width) {
|
||||
(true, true, 32, 8) => "__builtin_ia32_paddsb256", // TODO(antoyo): cast arguments to unsigned.
|
||||
(false, true, 32, 8) => "__builtin_ia32_paddusb256",
|
||||
(true, true, 16, 16) => "__builtin_ia32_paddsw256",
|
||||
(false, true, 16, 16) => "__builtin_ia32_paddusw256",
|
||||
(true, false, 16, 16) => "__builtin_ia32_psubsw256",
|
||||
(false, false, 16, 16) => "__builtin_ia32_psubusw256",
|
||||
(true, false, 32, 8) => "__builtin_ia32_psubsb256",
|
||||
(false, false, 32, 8) => "__builtin_ia32_psubusb256",
|
||||
_ => unimplemented!("signed: {}, is_add: {}, in_len: {}, elem_width: {}", signed, is_add, in_len, elem_width),
|
||||
};
|
||||
let vec_ty = bx.cx.type_vector(elem_ty, in_len as u64);
|
||||
|
||||
let func = bx.context.get_target_builtin_function(builtin_name);
|
||||
let param1_type = func.get_param(0).to_rvalue().get_type();
|
||||
let param2_type = func.get_param(1).to_rvalue().get_type();
|
||||
let lhs = bx.cx.bitcast_if_needed(lhs, param1_type);
|
||||
let rhs = bx.cx.bitcast_if_needed(rhs, param2_type);
|
||||
let result = bx.context.new_call(None, func, &[lhs, rhs]);
|
||||
// TODO(antoyo): perhaps use __builtin_convertvector for vector casting.
|
||||
return Ok(bx.context.new_bitcast(None, result, vec_ty));
|
||||
let result =
|
||||
match (signed, is_add) {
|
||||
(false, true) => {
|
||||
let res = lhs + rhs;
|
||||
let cmp = bx.context.new_comparison(None, ComparisonOp::LessThan, res, lhs);
|
||||
res | cmp
|
||||
},
|
||||
(true, true) => {
|
||||
// Algorithm from: https://codereview.stackexchange.com/questions/115869/saturated-signed-addition
|
||||
// TODO: improve using conditional operators if possible.
|
||||
let arg_type = lhs.get_type();
|
||||
// TODO: convert lhs and rhs to unsigned.
|
||||
let sum = lhs + rhs;
|
||||
let vector_type = arg_type.dyncast_vector().expect("vector type");
|
||||
let unit = vector_type.get_num_units();
|
||||
let a = bx.context.new_rvalue_from_int(elem_ty, ((elem_width as i32) << 3) - 1);
|
||||
let width = bx.context.new_rvalue_from_vector(None, lhs.get_type(), &vec![a; unit]);
|
||||
|
||||
let xor1 = lhs ^ rhs;
|
||||
let xor2 = lhs ^ sum;
|
||||
let and = bx.context.new_unary_op(None, UnaryOp::BitwiseNegate, arg_type, xor1) & xor2;
|
||||
let mask = and >> width;
|
||||
|
||||
let one = bx.context.new_rvalue_one(elem_ty);
|
||||
let ones = bx.context.new_rvalue_from_vector(None, lhs.get_type(), &vec![one; unit]);
|
||||
let shift1 = ones << width;
|
||||
let shift2 = sum >> width;
|
||||
let mask_min = shift1 ^ shift2;
|
||||
|
||||
let and1 = bx.context.new_unary_op(None, UnaryOp::BitwiseNegate, arg_type, mask) & sum;
|
||||
let and2 = mask & mask_min;
|
||||
|
||||
and1 + and2
|
||||
},
|
||||
(false, false) => {
|
||||
let res = lhs - rhs;
|
||||
let cmp = bx.context.new_comparison(None, ComparisonOp::LessThanEquals, res, lhs);
|
||||
res & cmp
|
||||
},
|
||||
(true, false) => {
|
||||
let arg_type = lhs.get_type();
|
||||
// TODO(antoyo): this uses the same algorithm from saturating add, but add the
|
||||
// negative of the right operand. Find a proper subtraction algorithm.
|
||||
let rhs = bx.context.new_unary_op(None, UnaryOp::Minus, arg_type, rhs);
|
||||
|
||||
// TODO: convert lhs and rhs to unsigned.
|
||||
let sum = lhs + rhs;
|
||||
let vector_type = arg_type.dyncast_vector().expect("vector type");
|
||||
let unit = vector_type.get_num_units();
|
||||
let a = bx.context.new_rvalue_from_int(elem_ty, ((elem_width as i32) << 3) - 1);
|
||||
let width = bx.context.new_rvalue_from_vector(None, lhs.get_type(), &vec![a; unit]);
|
||||
|
||||
let xor1 = lhs ^ rhs;
|
||||
let xor2 = lhs ^ sum;
|
||||
let and = bx.context.new_unary_op(None, UnaryOp::BitwiseNegate, arg_type, xor1) & xor2;
|
||||
let mask = and >> width;
|
||||
|
||||
let one = bx.context.new_rvalue_one(elem_ty);
|
||||
let ones = bx.context.new_rvalue_from_vector(None, lhs.get_type(), &vec![one; unit]);
|
||||
let shift1 = ones << width;
|
||||
let shift2 = sum >> width;
|
||||
let mask_min = shift1 ^ shift2;
|
||||
|
||||
let and1 = bx.context.new_unary_op(None, UnaryOp::BitwiseNegate, arg_type, mask) & sum;
|
||||
let and2 = mask & mask_min;
|
||||
|
||||
and1 + and2
|
||||
}
|
||||
};
|
||||
|
||||
return Ok(result);
|
||||
}
|
||||
|
||||
macro_rules! arith_red {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue