/*
Copyright 2022 Gabriel Jensen.
This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/
#include <zap/priv.h>
.globl zap_memcp
/*
	zap_memcp: copy num bytes from in to out, front to back.

	Arguments (registers as used below, i.e. the System V AMD64 order):
		rdi: void const * in   Address of the current input element.
		rsi: zap_sz       num  Number of bytes remaining (treated as unsigned).
		rdx: void *       out  Address of the current output element.

	Scratch:
		rcx / cl:    Current element (8-byte and 1-byte stages).
		xmm0 / ymm0: Current element (16-byte and 32-byte stages).

	Clobbers: rcx, rdx, rsi, rdi, xmm0 (ymm0 with AVX), flags.
	rax is never written.

	The copy runs in stages of shrinking width (32, 16, 8, then 1 byte). Each
	stage loops while at least one full chunk remains. All vector accesses are
	unaligned, so no alignment is required of either pointer. The regions are
	copied in ascending order with no overlap handling.

	NOTE(review): only an AMD64 body is visible here; confirm that other targets
	are handled elsewhere, as the label is otherwise left without a body.
*/
zap_memcp:
#if defined(__amd64__)
#if defined(__AVX__)
	/* AVX supports 256-bit moves. Copy 32 (0x20) bytes at a time. */
.Lzap_memcp_cp32:
	cmpq	$0x20,%rsi		# At least 32 bytes remaining?
	jb	.Lzap_memcp_cp32end	# Unsigned compare: num is a size.
	vmovups	(%rdi),%ymm0		# Move into a register.
	vmovups	%ymm0,(%rdx)		# And then back into memory.
	addq	$0x20,%rdi
	addq	$0x20,%rdx
	subq	$0x20,%rsi
	jmp	.Lzap_memcp_cp32
.Lzap_memcp_cp32end:
	/*
		Clear the upper halves of the ymm registers before running the legacy
		SSE code below and before returning, so neither this function nor the
		caller pays an AVX/SSE transition penalty.
	*/
	vzeroupper
#endif
	/* AMD64 always provides SSE2. Copy 16 (0x10) bytes at a time. */
.Lzap_memcp_cp16:
	cmpq	$0x10,%rsi		# At least 16 bytes remaining?
	jb	.Lzap_memcp_cp8
	movdqu	(%rdi),%xmm0
	movdqu	%xmm0,(%rdx)
	addq	$0x10,%rdi
	addq	$0x10,%rdx
	subq	$0x10,%rsi
	jmp	.Lzap_memcp_cp16
	/* Copy one word (8 bytes) at a time. */
.Lzap_memcp_cp8:
	cmpq	$0x8,%rsi		# At least 8 bytes remaining?
	jb	.Lzap_memcp_cp1
	movq	(%rdi),%rcx
	movq	%rcx,(%rdx)
	addq	$0x8,%rdi
	addq	$0x8,%rdx
	subq	$0x8,%rsi
	jmp	.Lzap_memcp_cp8
	/* Copy the remaining tail one byte at a time. */
.Lzap_memcp_cp1:
	testq	%rsi,%rsi		# Anything left at all?
	jz	.Lzap_memcp_done
	movb	(%rdi),%cl
	movb	%cl,(%rdx)
	incq	%rdi
	incq	%rdx
	decq	%rsi
	jmp	.Lzap_memcp_cp1
	/* Finish. */
.Lzap_memcp_done:
	ret
#endif