1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
|
/*
Copyright 2022 Gabriel Jensen.
This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/
#include <zap/priv.h>
.globl zap_memcp
/*
	zap_memcp: copy num bytes from in to out.

	Parameters:
		void const * in  - address of the first byte to read
		zap_sz       num - number of bytes to copy (handled as an unsigned count)
		void *       out - address of the first byte to write

	Register use in the amd64 implementation (System V AMD64 argument order):
		rdi       - address of the current input element
		rsi       - number of bytes still to be copied
		rdx       - address of the current output element
		rcx / cl  - scratch for the 8-byte and 1-byte copies
		xmm0      - scratch for the 16-byte copies
		ymm0      - scratch for the 32-byte copies (only when __AVX__ is defined)

	The copy runs front to back in progressively smaller chunks: 32 bytes (AVX
	builds only), then 16, then 8, then single bytes.  All wide accesses use
	unaligned moves, so neither pointer needs any particular alignment.  No
	overlap handling is performed.

	Clobbers: rcx, rdx, rsi, rdi, xmm0 (ymm0 in AVX builds), flags.
	rax is never written.
*/
zap_memcp:
#if defined(__amd64__)
#if defined(__AVX__)
	# AVX supports 256-bit moves, so start with 32-byte chunks.
.Lbig20cp:
	# Unsigned compare: a byte count is never negative.
	cmpq	$0x20,%rsi
	jb	.Lbig20done		# Fewer than 32 bytes remain.
	# Copy:
	vmovups	(%rdi),%ymm0		# Move into a register.
	vmovups	%ymm0,(%rdx)		# And then back into memory.
	# Continue:
	addq	$0x20,%rdi
	addq	$0x20,%rdx
	subq	$0x20,%rsi
	jmp	.Lbig20cp
.Lbig20done:
	# Clear the upper YMM halves before the legacy SSE moves below and before
	# returning, so that neither this routine nor the caller pays an AVX/SSE
	# transition penalty.
	vzeroupper
#endif
	# AMD64 guarantees SSE2, so 16-byte chunks are always available.
.Lbig10cp:
	cmpq	$0x10,%rsi
	jb	.Lwrdcp			# Fewer than 16 bytes remain.
	# Copy:
	movdqu	(%rdi),%xmm0
	movdqu	%xmm0,(%rdx)
	# Continue:
	addq	$0x10,%rdi
	addq	$0x10,%rdx
	subq	$0x10,%rsi
	jmp	.Lbig10cp
	# Copy one word (8 bytes) at a time.
.Lwrdcp:
	cmpq	$0x8,%rsi
	jb	.Lbytecp		# Fewer than 8 bytes remain.
	# Copy:
	movq	(%rdi),%rcx
	movq	%rcx,(%rdx)
	# Continue:
	addq	$0x8,%rdi
	addq	$0x8,%rdx
	subq	$0x8,%rsi
	jmp	.Lwrdcp
	# Copy whatever is left one byte at a time.
.Lbytecp:
	testq	%rsi,%rsi
	jz	.Ldone			# Nothing left to copy.
	# Copy:
	movb	(%rdi),%cl
	movb	%cl,(%rdx)
	# Continue:
	incq	%rdi
	incq	%rdx
	decq	%rsi
	jmp	.Lbytecp
	# Finish:
.Ldone:
	ret
#endif
|