summaryrefslogblamecommitdiff
path: root/zap/source/amd64/mem/memcp.S
blob: 5691446b4fa62f47ef05643d5901ae7e4bbc244e (plain) (tree)
1
2
3
4
5
6
7


                                                                                                              



                





















































































                                                          
       
# Copyright 2022 Gabriel Jensen.
# This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
# If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/.

.globl zap_memcp

# zap_memcp: Copy a run of bytes from an input buffer to an output buffer.
#
# Syntax:   GNU as, AT&T operand order, preprocessed by the C preprocessor.
# Input:    rdi = Address of the first input byte.
#           rsi = Number of bytes to copy (handled as an unsigned quantity).
#           rdx = Address of the first output byte.
# Output:   None (rax is never written).
# Clobbers: rcx, rdx, rsi, rdi, xmm0 (all of ymm0 when built with AVX), flags.
# Notes:    The copy walks forward through memory in chunks of 32 bytes (AVX builds only), 16 bytes, 8 bytes, and finally single bytes.
#           There is no overlap check, so an output range that starts inside the input range is not copied like a memmove would.
#           NOTE(review): The C prototype is not visible here; presumably (input pointer, byte count, output pointer) - confirm against the header.

zap_memcp:
	# Register roles during the copy:
	# rdi:  Address of the current input element.
	# rsi:  Number of remaining elements.
	# rdx:  Address of the current output element.
	# rcx:  Current element.
	# xmm0: Current element.
	# ymm0: Current element.

#if defined(__AVX__)
	# AVX supports 256-bit moves.

	# Copy 32 bytes at a time:
.Lzap_memcp_cp32:

	# Check that at least 32 bytes remain (unsigned compare, as the count is a size):
	cmpq $0x20,%rsi
	jb .Lzap_memcp_avxend # Fewer than 32 left, so leave the AVX loop.

	# Copy:
	vmovups (%rdi),%ymm0 # Move into a register.
	vmovups %ymm0,(%rdx) # And then back into memory.

	# Continue:
	addq $0x20,%rdi
	addq $0x20,%rdx
	subq $0x20,%rsi
	jmp .Lzap_memcp_cp32

.Lzap_memcp_avxend:

	# The code below (and most likely the caller) uses legacy SSE encodings.
	# Clearing the upper ymm halves first avoids the AVX/SSE transition penalty.
	vzeroupper

#endif

	# AMD64 requires SSE(2).

	# Copy 16 bytes at a time:
.Lzap_memcp_cp16:

	# Check that at least 16 bytes remain:
	cmpq $0x10,%rsi
	jb .Lzap_memcp_cp8

	# Copy:
	movdqu (%rdi),%xmm0
	movdqu %xmm0,(%rdx)

	# Continue:
	addq $0x10,%rdi
	addq $0x10,%rdx
	subq $0x10,%rsi
	jmp .Lzap_memcp_cp16

	# Copy one word (8 bytes) at a time:
.Lzap_memcp_cp8:

	# Check that at least 8 bytes remain:
	cmpq $0x8,%rsi
	jb .Lzap_memcp_cp1

	# Copy:
	movq (%rdi),%rcx
	movq %rcx,(%rdx)

	# Continue:
	addq $0x8,%rdi
	addq $0x8,%rdx
	subq $0x8,%rsi
	jmp .Lzap_memcp_cp8

	# Copy one byte at a time:
.Lzap_memcp_cp1:

	# Check that any bytes remain at all:
	testq %rsi,%rsi
	jz .Lzap_memcp_done

	# Copy:
	movb (%rdi),%cl
	movb %cl,(%rdx)

	# Continue:
	incq %rdi
	incq %rdx
	decq %rsi
	jmp .Lzap_memcp_cp1

	# Finish:
.Lzap_memcp_done:

	ret