# Copyright 2022 Gabriel Jensen.
# This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
# If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/.
.globl zap_memcp
# zap_memcp
# Copies a run of bytes from an input buffer to an output buffer, walking both
# buffers in ascending address order. The largest chunks go first: 32 bytes
# (AVX builds only), then 16 bytes, then 8 bytes, then single bytes.
#
# Registers on entry:
#   %rdi  address of the first input byte
#   %rsi  number of bytes to copy (treated as an unsigned count)
#   %rdx  address of the first output byte
# NOTE(review): this order matches the first three System V AMD64 integer
# argument registers, so the prototype is presumably zap_memcp(in, num, out);
# confirm against the C declaration.
#
# Clobbers: %rdi, %rsi, %rdx, %rcx, %xmm0 and the flags. In AVX builds %ymm0 is
# written as well and vzeroupper clears the upper halves of every ymm register.
# %rax is never written, so no return value is produced here.
# No provision is made for overlapping buffers.
zap_memcp:
# Address of the current input element:
#define iaddr %rdi
# Number of remaining bytes:
#define rem %rsi
# Address of the current output element:
#define oaddr %rdx
# Current element, viewed at the width being copied:
#define val1 %cl
#define val8 %rcx
#define val01 %xmm0
#define val02 %ymm0
#if defined(__AVX__)
# AVX supports 256-bit moves, so start with 32-byte chunks.
.Lmemcp_big02cp:
	# Unsigned compare: a byte count is never negative.
	cmpq	$0x20,rem		# if (rem < 0x20)
	jb	.Lmemcp_big02end	#     goto big02end
	# Unaligned moves, since nothing is known about buffer alignment.
	vmovups	(iaddr),val02		# val02 = *iaddr
	vmovups	val02,(oaddr)		# *oaddr = val02
	addq	$0x20,iaddr		# iaddr += 0x20
	addq	$0x20,oaddr		# oaddr += 0x20
	subq	$0x20,rem		# rem -= 0x20
	jmp	.Lmemcp_big02cp		# goto big02cp
.Lmemcp_big02end:
	# Clear the upper ymm halves before the legacy SSE moves below and
	# before returning, which avoids the AVX/SSE transition penalty.
	vzeroupper
#endif
# AMD64 always provides SSE2, so no feature test is needed for 16-byte chunks.
.Lmemcp_big01cp:
	cmpq	$0x10,rem		# if (rem < 0x10)
	jb	.Lmemcp_wrdcp		#     goto wrdcp
	movdqu	(iaddr),val01		# val01 = *iaddr
	movdqu	val01,(oaddr)		# *oaddr = val01
	addq	$0x10,iaddr		# iaddr += 0x10
	addq	$0x10,oaddr		# oaddr += 0x10
	subq	$0x10,rem		# rem -= 0x10
	jmp	.Lmemcp_big01cp		# goto big01cp
# Copy one word (8 bytes) at a time:
.Lmemcp_wrdcp:
	cmpq	$0x8,rem		# if (rem < 0x8)
	jb	.Lmemcp_bytecp		#     goto bytecp
	movq	(iaddr),val8		# val8 = *iaddr
	movq	val8,(oaddr)		# *oaddr = val8
	addq	$0x8,iaddr		# iaddr += 0x8
	addq	$0x8,oaddr		# oaddr += 0x8
	subq	$0x8,rem		# rem -= 0x8
	jmp	.Lmemcp_wrdcp		# goto wrdcp
# Copy the tail one byte at a time:
.Lmemcp_bytecp:
	testq	rem,rem			# if (rem == 0x0)
	jz	.Lmemcp_done		#     goto done
	movb	(iaddr),val1		# val1 = *iaddr
	movb	val1,(oaddr)		# *oaddr = val1
	incq	iaddr			# ++iaddr
	incq	oaddr			# ++oaddr
	decq	rem			# --rem
	jmp	.Lmemcp_bytecp		# goto bytecp
# Return:
.Lmemcp_done:
	ret				# return