Auto merge of #111999 - scottmcm:codegen-less-memcpy, r=compiler-errors
Use `load`+`store` instead of `memcpy` for small integer arrays
I was inspired by #98892 to see whether, rather than making `mem::swap` do something smart in the library, we could update MIR assignments like `*_1 = *_2` to do something smarter than `memcpy` for sufficiently-small types that doing it inline is going to be better than a `memcpy` call in assembly anyway. After all, special code may help `mem::swap`, but if the "obvious" MIR can just result in the correct thing that helps everything -- other code like `mem::replace`, people doing it manually, and just passing around by value in general -- as well as makes MIR inlining happier since it doesn't need to deal with all the complicated library code if it just sees a couple assignments.
LLVM will turn the short, known-length `memcpy`s into direct instructions in the backend, but that's too late for it to be able to remove `alloca`s. In general, replacing `memcpy`s with typed instructions is hard in the middle-end -- even for `memcpy.inline` where it knows it won't be a function call -- is hard [due to poison propagation issues](https://rust-lang.zulipchat.com/#narrow/stream/187780-t-compiler.2Fwg-llvm/topic/memcpy.20vs.20load-store.20for.20MIR.20assignments/near/360376712). So because we know more about the type invariants -- these are typed copies -- rustc can emit something more specific, allowing LLVM to `mem2reg` away the `alloca`s in some situations.
#52051 previously did something like this in the library for `mem::swap`, but it ended up regressing during enabling mir inlining (cbbf06b0cd
), so this has been suboptimal on stable for ≈5 releases now.
The code in this PR is narrowly targeted at just integer arrays in LLVM, but works via a new method on the [`LayoutTypeMethods`](https://doc.rust-lang.org/nightly/nightly-rustc/rustc_codegen_ssa/traits/trait.LayoutTypeMethods.html) trait, so specific backends based on cg_ssa can enable this for more situations over time, as we find them. I don't want to try to bite off too much in this PR, though. (Transparent newtypes and simple things like the 3×usize `String` would be obvious candidates for a follow-up.)
Codegen demonstrations: <https://llvm.godbolt.org/z/fK8hT9aqv>
Before:
```llvm
define void `@swap_rgb48_old(ptr` noalias nocapture noundef align 2 dereferenceable(6) %x, ptr noalias nocapture noundef align 2 dereferenceable(6) %y) unnamed_addr #1 {
%a.i = alloca [3 x i16], align 2
call void `@llvm.lifetime.start.p0(i64` 6, ptr nonnull %a.i)
call void `@llvm.memcpy.p0.p0.i64(ptr` noundef nonnull align 2 dereferenceable(6) %a.i, ptr noundef nonnull align 2 dereferenceable(6) %x, i64 6, i1 false)
tail call void `@llvm.memcpy.p0.p0.i64(ptr` noundef nonnull align 2 dereferenceable(6) %x, ptr noundef nonnull align 2 dereferenceable(6) %y, i64 6, i1 false)
call void `@llvm.memcpy.p0.p0.i64(ptr` noundef nonnull align 2 dereferenceable(6) %y, ptr noundef nonnull align 2 dereferenceable(6) %a.i, i64 6, i1 false)
call void `@llvm.lifetime.end.p0(i64` 6, ptr nonnull %a.i)
ret void
}
```
Note it going to stack:
```nasm
swap_rgb48_old: # `@swap_rgb48_old`
movzx eax, word ptr [rdi + 4]
mov word ptr [rsp - 4], ax
mov eax, dword ptr [rdi]
mov dword ptr [rsp - 8], eax
movzx eax, word ptr [rsi + 4]
mov word ptr [rdi + 4], ax
mov eax, dword ptr [rsi]
mov dword ptr [rdi], eax
movzx eax, word ptr [rsp - 4]
mov word ptr [rsi + 4], ax
mov eax, dword ptr [rsp - 8]
mov dword ptr [rsi], eax
ret
```
Now:
```llvm
define void `@swap_rgb48(ptr` noalias nocapture noundef align 2 dereferenceable(6) %x, ptr noalias nocapture noundef align 2 dereferenceable(6) %y) unnamed_addr #0 {
start:
%0 = load <3 x i16>, ptr %x, align 2
%1 = load <3 x i16>, ptr %y, align 2
store <3 x i16> %1, ptr %x, align 2
store <3 x i16> %0, ptr %y, align 2
ret void
}
```
still lowers to `dword`+`word` operations, but has no stack traffic:
```nasm
swap_rgb48: # `@swap_rgb48`
mov eax, dword ptr [rdi]
movzx ecx, word ptr [rdi + 4]
movzx edx, word ptr [rsi + 4]
mov r8d, dword ptr [rsi]
mov dword ptr [rdi], r8d
mov word ptr [rdi + 4], dx
mov word ptr [rsi + 4], cx
mov dword ptr [rsi], eax
ret
```
And as a demonstration that this isn't just `mem::swap`, a `mem::replace` on a small array (since replace doesn't use swap since #83022), which used to be `memcpy`s in LLVM changes in IR
```llvm
define void `@replace_short_array(ptr` noalias nocapture noundef sret([3 x i32]) dereferenceable(12) %0, ptr noalias noundef align 4 dereferenceable(12) %r, ptr noalias nocapture noundef readonly dereferenceable(12) %v) unnamed_addr #0 {
start:
%1 = load <3 x i32>, ptr %r, align 4
store <3 x i32> %1, ptr %0, align 4
%2 = load <3 x i32>, ptr %v, align 4
store <3 x i32> %2, ptr %r, align 4
ret void
}
```
but that lowers to reasonable `dword`+`qword` instructions still
```nasm
replace_short_array: # `@replace_short_array`
mov rax, rdi
mov rcx, qword ptr [rsi]
mov edi, dword ptr [rsi + 8]
mov dword ptr [rax + 8], edi
mov qword ptr [rax], rcx
mov rcx, qword ptr [rdx]
mov edx, dword ptr [rdx + 8]
mov dword ptr [rsi + 8], edx
mov qword ptr [rsi], rcx
ret
```
This commit is contained in:
commit
fd9bf59436
8 changed files with 146 additions and 6 deletions
|
@ -380,7 +380,19 @@ pub fn memcpy_ty<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>(
|
|||
return;
|
||||
}
|
||||
|
||||
bx.memcpy(dst, dst_align, src, src_align, bx.cx().const_usize(size), flags);
|
||||
if flags == MemFlags::empty()
|
||||
&& let Some(bty) = bx.cx().scalar_copy_backend_type(layout)
|
||||
{
|
||||
// I look forward to only supporting opaque pointers
|
||||
let pty = bx.type_ptr_to(bty);
|
||||
let src = bx.pointercast(src, pty);
|
||||
let dst = bx.pointercast(dst, pty);
|
||||
|
||||
let temp = bx.load(bty, src, src_align);
|
||||
bx.store(temp, dst, dst_align);
|
||||
} else {
|
||||
bx.memcpy(dst, dst_align, src, src_align, bx.cx().const_usize(size), flags);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn codegen_instance<'a, 'tcx: 'a, Bx: BuilderMethods<'a, 'tcx>>(
|
||||
|
|
|
@ -126,6 +126,28 @@ pub trait LayoutTypeMethods<'tcx>: Backend<'tcx> {
|
|||
index: usize,
|
||||
immediate: bool,
|
||||
) -> Self::Type;
|
||||
|
||||
/// A type that can be used in a [`super::BuilderMethods::load`] +
|
||||
/// [`super::BuilderMethods::store`] pair to implement a *typed* copy,
|
||||
/// such as a MIR `*_0 = *_1`.
|
||||
///
|
||||
/// It's always legal to return `None` here, as the provided impl does,
|
||||
/// in which case callers should use [`super::BuilderMethods::memcpy`]
|
||||
/// instead of the `load`+`store` pair.
|
||||
///
|
||||
/// This can be helpful for things like arrays, where the LLVM backend type
|
||||
/// `[3 x i16]` optimizes to three separate loads and stores, but it can
|
||||
/// instead be copied via an `i48` that stays as the single `load`+`store`.
|
||||
/// (As of 2023-05 LLVM cannot necessarily optimize away a `memcpy` in these
|
||||
/// cases, due to `poison` handling, but in codegen we have more information
|
||||
/// about the type invariants, so can emit something better instead.)
|
||||
///
|
||||
/// This *should* return `None` for particularly-large types, where leaving
|
||||
/// the `memcpy` may well be important to avoid code size explosion.
|
||||
fn scalar_copy_backend_type(&self, layout: TyAndLayout<'tcx>) -> Option<Self::Type> {
|
||||
let _ = layout;
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
// For backends that support CFI using type membership (i.e., testing whether a given pointer is
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue