summaryrefslogtreecommitdiff
path: root/zap/source/amd64/mem/memcp.S
blob: 5691446b4fa62f47ef05643d5901ae7e4bbc244e (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# Copyright 2022 Gabriel Jensen.
# This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
# If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/.

.globl zap_memcp

zap_memcp:
	# Copy a run of bytes from the input buffer to the output buffer.
	# The copy walks from low to high addresses in steps of 32 (AVX builds only), 16, 8 and 1 bytes.
	# No overlap handling is done: every step loads a full chunk and then stores it.

	# Register roles (NOTE(review): this matches the first three System V AMD64 integer arguments - confirm against the C declaration):
	# rdi:  Address of the current input element.
	# rsi:  Number of remaining bytes (an unsigned count).
	# rdx:  Address of the current output element.
	# rcx:  Current element (scratch for the 8-byte and 1-byte steps).
	# xmm0: Current element (scratch for the 16-byte step).
	# ymm0: Current element (scratch for the 32-byte step).

	# Nothing is written to rax.
	# Clobbers rdi, rsi, rdx, rcx, xmm0 (ymm0 in AVX builds) and the flags.

#if defined(__AVX__)
	# AVX supports 256-bit moves.

	# Check once up front whether there are at least 32 bytes remaining.
	# The loop below is bottom-tested so that vzeroupper is only reached when ymm0 was really used.
	cmpq $0x20,%rsi
	jb .Lbig10cp # Unsigned compare: the count is a size, not a signed value.

	# Copy 32 bytes:
.Lbig20cp:

	# Copy:
	vmovups (%rdi),%ymm0 # Move into a register.
	vmovups %ymm0,(%rdx) # And then back into memory.

	# Continue:
	addq $0x20,%rdi
	addq $0x20,%rdx
	subq $0x20,%rsi
	cmpq $0x20,%rsi
	jae .Lbig20cp # Keep going while a whole 32-byte chunk is left.

	# Clear the upper halves of the ymm registers.
	# The code that follows (and the caller) may run legacy SSE instructions,
	# which are penalised on some processors while the upper halves are dirty.
	vzeroupper

#endif

	# AMD64 requires SSE(2).

	# Copy 16 bytes:
.Lbig10cp:

	# Check that there are at least 16 bytes remaining:
	cmpq $0x10,%rsi
	jb .Lwrdcp # Otherwise, skip to the 8-byte copying.

	# Copy:
	movdqu (%rdi),%xmm0
	movdqu %xmm0,(%rdx)

	# Continue:
	addq $0x10,%rdi
	addq $0x10,%rdx
	subq $0x10,%rsi
	jmp .Lbig10cp

	# Copy one quadword (8 bytes):
.Lwrdcp:

	# Check that there are at least 8 bytes remaining:
	cmpq $0x8,%rsi
	jb .Lbytecp # Otherwise, skip to the byte-wise copying.

	# Copy:
	movq (%rdi),%rcx
	movq %rcx,(%rdx)

	# Continue:
	addq $0x8,%rdi
	addq $0x8,%rdx
	subq $0x8,%rsi
	jmp .Lwrdcp

	# Copy one byte:
.Lbytecp:

	# Check that we have any bytes remaining:
	testq %rsi,%rsi
	jz .Ldone

	# Copy:
	movb (%rdi),%cl
	movb %cl,(%rdx)

	# Continue:
	incq %rdi
	incq %rdx
	decq %rsi
	jmp .Lbytecp

	# Finish:
.Ldone:

	ret