summaryrefslogtreecommitdiff
path: root/zap/source/amd64/mem/memcp.S
diff options
context:
space:
mode:
Diffstat (limited to 'zap/source/amd64/mem/memcp.S')
-rw-r--r--zap/source/amd64/mem/memcp.S94
1 files changed, 94 insertions, 0 deletions
diff --git a/zap/source/amd64/mem/memcp.S b/zap/source/amd64/mem/memcp.S
new file mode 100644
index 0000000..5691446
--- /dev/null
+++ b/zap/source/amd64/mem/memcp.S
@@ -0,0 +1,94 @@
+# Copyright 2022 Gabriel Jensen.
+# This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
+# If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+.globl zap_memcp
+
+# Copies rsi bytes from the buffer at rdi to the buffer at rdx.
+# Chunks of 32 (AVX builds only), 16 and 8 bytes are moved first, then single bytes.
+# The copy always runs from low to high addresses; overlap gets no special handling.
+zap_memcp:
+ # rdi: Address of the current input element.
+ # rsi: Number of remaining bytes, treated as unsigned.
+ # rdx: Address of the current output element.
+ # rcx, xmm0, ymm0: Current element (whichever matches the chunk size).
+ # These registers and the flags are clobbered (AVX builds also zero every upper ymm half); rax is never written.
+
+#if defined(__AVX__)
+ # AVX supports 256-bit moves, so copy 32 bytes at a time:
+.Lcp32:
+ # Stop once fewer than 32 bytes remain (unsigned compare, the count is a size):
+ cmpq $0x20,%rsi
+ jb .Lcp32done
+
+ # Copy:
+ vmovups (%rdi),%ymm0 # Load into a register.
+ vmovups %ymm0,(%rdx) # And store back into memory.
+
+ # Continue:
+ addq $0x20,%rdi
+ addq $0x20,%rdx
+ subq $0x20,%rsi
+ jmp .Lcp32
+
+.Lcp32done:
+ # Zero the upper ymm halves so that the legacy SSE code below, and the
+ # caller, do not pay AVX-to-SSE transition penalties.
+ vzeroupper
+
+#endif
+
+ # AMD64 requires SSE(2).
+
+ # Copy 16 bytes at a time:
+.Lcp16:
+ # Stop once fewer than 16 bytes remain:
+ cmpq $0x10,%rsi
+ jb .Lcp8
+
+ # Copy:
+ movdqu (%rdi),%xmm0
+ movdqu %xmm0,(%rdx)
+
+ # Continue:
+ addq $0x10,%rdi
+ addq $0x10,%rdx
+ subq $0x10,%rsi
+ jmp .Lcp16
+
+ # Copy one word (8 bytes) at a time:
+.Lcp8:
+ # Stop once fewer than 8 bytes remain:
+ cmpq $0x8,%rsi
+ jb .Lcp1
+
+ # Copy:
+ movq (%rdi),%rcx
+ movq %rcx,(%rdx)
+
+ # Continue:
+ addq $0x8,%rdi
+ addq $0x8,%rdx
+ subq $0x8,%rsi
+ jmp .Lcp8
+
+ # Copy one byte at a time:
+.Lcp1:
+ # Stop once no bytes remain:
+ testq %rsi,%rsi
+ jz .Ldone
+
+ # Copy:
+ movb (%rdi),%cl
+ movb %cl,(%rdx)
+
+ # Continue:
+ incq %rdi
+ incq %rdx
+ decq %rsi
+ jmp .Lcp1
+
+ # Finish:
+.Ldone:
+ ret
+ \ No newline at end of file