summaryrefslogtreecommitdiff
path: root/rgo/src/memcpy.c
diff options
context:
space:
mode:
Diffstat (limited to 'rgo/src/memcpy.c')
-rw-r--r--rgo/src/memcpy.c138
1 files changed, 138 insertions, 0 deletions
diff --git a/rgo/src/memcpy.c b/rgo/src/memcpy.c
new file mode 100644
index 0000000..89adc14
--- /dev/null
+++ b/rgo/src/memcpy.c
@@ -0,0 +1,138 @@
+/*
+ Copyright 2022 Gabriel Jensen.
+ This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
+ If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/.
+*/
+
+#include <rgo-priv.h>
+
+#if defined(rgo_priv_fastimpl)
+/*
+ Hand-written implementation of rgo_memcpy(in,num,out): copies num bytes from in to out.
+
+ The copy runs from the lowest address upwards, using the widest element available first
+ (32 bytes with AVX, 16 bytes with SSE, then one machine word) and finishing byte by byte.
+ All vector accesses use the unaligned move instructions, so no alignment is required.
+
+ The byte count is unsigned, so every size check uses the unsigned condition (jb) rather
+ than the signed one (jl); a count with the top bit set would otherwise look negative and
+ skip straight to the byte loop.
+
+ When the AVX loop is assembled it is followed by vzeroupper, so that the legacy SSE
+ instructions below (and the caller) do not run with dirty upper ymm halves.
+*/
+__asm__ (
+ ".global rgo_memcpy\n"
+
+ "rgo_memcpy:\n"
+ /*
+  Parameters, in declaration order:
+  void const * in
+  sus_typ_usz num
+  void * out
+ */
+#if defined(sus_arch_amd64)
+ /* The arguments are used directly from rdi, rsi and rdx (the System V AMD64 argument registers). */
+ /* rdi: Address of the current input element. */
+ /* rsi: Number of remaining bytes. */
+ /* rdx: Address of the current output element. */
+ /* rcx: Current element (8-byte and 1-byte loops). */
+ /* xmm0: Current element (16-byte loop). */
+ /* ymm0: Current element (32-byte loop). */
+#if defined(sus_archfeat_avx)
+ /* Copy 32 bytes at a time while at least 32 remain. */
+ ".big256cpy:\n"
+ "cmpq $0x20,%rsi\n"
+ "jb .big256done\n"
+ "vmovups (%rdi),%ymm0\n"
+ "vmovups %ymm0,(%rdx)\n"
+ "addq $0x20,%rdi\n"
+ "addq $0x20,%rdx\n"
+ "subq $0x20,%rsi\n"
+ "jmp .big256cpy\n"
+ ".big256done:\n"
+ "vzeroupper\n" /* Clear the upper ymm halves before any legacy SSE instruction runs. */
+#endif
+ /* Copy 16 bytes at a time while at least 16 remain. */
+ ".big128cpy:\n"
+ "cmpq $0x10,%rsi\n"
+ "jb .wrdcpy\n"
+ "movdqu (%rdi),%xmm0\n"
+ "movdqu %xmm0,(%rdx)\n"
+ "addq $0x10,%rdi\n"
+ "addq $0x10,%rdx\n"
+ "subq $0x10,%rsi\n"
+ "jmp .big128cpy\n"
+ /* Copy 8 bytes at a time while at least 8 remain. */
+ ".wrdcpy:\n"
+ "cmpq $0x8,%rsi\n"
+ "jb .bytecpy\n"
+ "movq (%rdi),%rcx\n"
+ "movq %rcx,(%rdx)\n"
+ "addq $0x8,%rdi\n"
+ "addq $0x8,%rdx\n"
+ "subq $0x8,%rsi\n"
+ "jmp .wrdcpy\n"
+ /* Copy the remaining bytes one at a time. */
+ ".bytecpy:\n"
+ "testq %rsi,%rsi\n"
+ "jz .done\n"
+ "movb (%rdi),%cl\n"
+ "movb %cl,(%rdx)\n"
+ "incq %rdi\n"
+ "incq %rdx\n"
+ "decq %rsi\n"
+ "jmp .bytecpy\n"
+ ".done:\n"
+ "ret\n"
+#elif defined(sus_arch_ia32)
+ /* The arguments are read from the stack before ebx is pushed, so the offsets are relative to esp on entry. */
+ /* eax: Address of the current input element. */
+ "movl 0x4(%esp),%eax\n"
+ /* ecx: Number of remaining bytes. */
+ "movl 0x8(%esp),%ecx\n"
+ /* edx: Address of the current output element. */
+ "movl 0xC(%esp),%edx\n"
+ /* ebx: Current element (4-byte and 1-byte loops). */
+ "pushl %ebx\n" /* ebx must be restored. */
+ /* xmm0: Current element (16-byte loop). */
+ /* ymm0: Current element (32-byte loop). */
+#if defined(sus_archfeat_avx)
+ /* Copy 32 bytes at a time while at least 32 remain. */
+ ".big256cpy:\n"
+ "cmpl $0x20,%ecx\n"
+ "jb .big256done\n"
+ "vmovdqu (%eax),%ymm0\n"
+ "vmovdqu %ymm0,(%edx)\n"
+ "addl $0x20,%eax\n"
+ "addl $0x20,%edx\n"
+ "subl $0x20,%ecx\n"
+ "jmp .big256cpy\n"
+ /* Falls through to the 16-byte loop when SSE is enabled, otherwise to the word loop. */
+ ".big256done:\n"
+ "vzeroupper\n" /* Clear the upper ymm halves before any legacy SSE instruction runs. */
+#endif
+#if defined(sus_archfeat_sse)
+ /* Copy 16 bytes at a time while at least 16 remain. */
+ ".big128cpy:\n"
+ "cmpl $0x10,%ecx\n"
+ "jb .wrdcpy\n"
+#if defined(sus_archfeat_sse2)
+ "movdqu (%eax),%xmm0\n"
+ "movdqu %xmm0,(%edx)\n"
+#else
+ /* movdqu is only assembled when sus_archfeat_sse2 is defined; movups is used otherwise. */
+ "movups (%eax),%xmm0\n"
+ "movups %xmm0,(%edx)\n"
+#endif
+ "addl $0x10,%eax\n"
+ "addl $0x10,%edx\n"
+ "subl $0x10,%ecx\n"
+ "jmp .big128cpy\n"
+#endif
+ /* Copy 4 bytes at a time while at least 4 remain. */
+ ".wrdcpy:\n"
+ "cmpl $0x4,%ecx\n"
+ "jb .bytecpy\n"
+ "movl (%eax),%ebx\n"
+ "movl %ebx,(%edx)\n"
+ "addl $0x4,%eax\n"
+ "addl $0x4,%edx\n"
+ "subl $0x4,%ecx\n"
+ "jmp .wrdcpy\n"
+ /* Copy the remaining bytes one at a time. */
+ ".bytecpy:\n"
+ "testl %ecx,%ecx\n"
+ "jz .done\n"
+ "movb (%eax),%bl\n"
+ "movb %bl,(%edx)\n"
+ "incl %eax\n"
+ "incl %edx\n"
+ "decl %ecx\n"
+ "jmp .bytecpy\n"
+ ".done:\n"
+ "popl %ebx\n"
+ "ret\n"
+#endif
+);
+#else
+/*
+ Portable fallback for rgo_memcpy, compiled when rgo_priv_fastimpl is not defined.
+
+ Copies _num elements of type sus_typ_u8 from _in to _out, one at a time, starting at the
+ lowest address. Nothing is read or written when _num is zero.
+
+ NOTE(review): both pointers carry sus_restr, which presumably expands to restrict - if so,
+ the two buffers must not overlap. Confirm against the macro's definition.
+*/
+void rgo_memcpy(void const * const sus_restr _in,sus_typ_usz const _num,void * const sus_restr _out) {
+ sus_typ_u8 const * src = (sus_typ_u8 const *)_in;
+ sus_typ_u8 * sus_restr dst = (sus_typ_u8 *)_out;
+ sus_typ_usz left = _num; /* Elements still to be copied. */
+ while (left != 0x0) {
+  *dst++ = *src++;
+  --left;
+ }
+}
+#endif