diff options
Diffstat (limited to 'zap/src/mem/memcp.S')
-rw-r--r-- | zap/src/mem/memcp.S | 107 |
1 files changed, 107 insertions, 0 deletions
/*
	Copyright 2022 Gabriel Jensen.
	This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
	If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/

#include <zap/priv.h>

.globl zap_memcp

/*
	zap_memcp: Copy num bytes from in to out, lowest address first.

	Arguments, in the registers the code reads them from:
		rdi: void const * in  (address of the current input element)
		rsi: zap_sz       num (number of remaining bytes)
		rdx: void *       out (address of the current output element)

	Clobbers rdi, rsi, rdx, rcx, xmm0 (ymm0 when built with AVX) and the flags.
	rax is never written, so no return value is produced.

	The copy always runs forwards in chunks of 32 (AVX only), 16, 8 and 1 bytes,
	using unaligned loads and stores, so no alignment is required.
	NOTE(review): a forward copy presumably means the regions must not overlap
	with out above in; confirm against the C declaration.

	NOTE(review): all instructions are inside the __amd64__ conditional; on any
	other target only the bare label is emitted.

	C-style comments are used throughout because this file is run through the C
	preprocessor, which strips them before the assembler sees the text.
*/
zap_memcp:

#if defined(__amd64__)

#if defined(__AVX__)
	/* AVX supports 256-bit moves, so start with 32-byte chunks. */

.big20cp:

	/*
		Leave the loop once fewer than 32 bytes remain. The count is unsigned,
		so the branch must be jb (below) and not jl (signed less).
	*/
	cmpq    $0x20,%rsi
	jb      .big20end

	/* Memory -> ymm0 -> memory. */
	vmovups (%rdi),%ymm0
	vmovups %ymm0,(%rdx)

	/* Advance both pointers and shrink the remaining count. */
	addq    $0x20,%rdi
	addq    $0x20,%rdx
	subq    $0x20,%rsi
	jmp     .big20cp

.big20end:

	/*
		Clear the upper halves of the ymm registers before running the
		legacy-encoded SSE code below (and before returning to the caller);
		this avoids AVX/SSE transition penalties.
	*/
	vzeroupper

#endif

	/* AMD64 guarantees SSE2, so 16-byte chunks are always available. */

.big10cp:

	/* Fewer than 16 bytes remaining (unsigned)? Then go on with words. */
	cmpq    $0x10,%rsi
	jb      .wrdcp

	/* Memory -> xmm0 -> memory. */
	movdqu  (%rdi),%xmm0
	movdqu  %xmm0,(%rdx)

	addq    $0x10,%rdi
	addq    $0x10,%rdx
	subq    $0x10,%rsi
	jmp     .big10cp

	/* Copy one word (8 bytes) at a time. */

.wrdcp:

	/* Fewer than 8 bytes remaining (unsigned)? Then go on with bytes. */
	cmpq    $0x8,%rsi
	jb      .bytecp

	/* Memory -> rcx -> memory. */
	movq    (%rdi),%rcx
	movq    %rcx,(%rdx)

	addq    $0x8,%rdi
	addq    $0x8,%rdx
	subq    $0x8,%rsi
	jmp     .wrdcp

	/* Copy the tail one byte at a time. */

.bytecp:

	/* Nothing left to copy? */
	testq   %rsi,%rsi
	jz      .done

	/* Memory -> cl -> memory. */
	movb    (%rdi),%cl
	movb    %cl,(%rdx)

	incq    %rdi
	incq    %rdx
	decq    %rsi
	jmp     .bytecp

.done:

	ret

#endif