diff options
Diffstat (limited to 'rgo/src/memcpy.c')
-rw-r--r-- | rgo/src/memcpy.c | 138 |
1 files changed, 138 insertions, 0 deletions
diff --git a/rgo/src/memcpy.c b/rgo/src/memcpy.c new file mode 100644 index 0000000..89adc14 --- /dev/null +++ b/rgo/src/memcpy.c @@ -0,0 +1,138 @@ +/* + Copyright 2022 Gabriel Jensen. + This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0. + If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/. +*/ + +#include <rgo-priv.h> + +#if defined(rgo_priv_fastimpl) +__asm__ ( + ".global rgo_memcpy\n" + + "rgo_memcpy:\n" + /* + void const * in + sus_typ_usz num + void * out + */ +#if defined(sus_arch_amd64) + /* rdi: Address of the current input element. */ + /* rsi: Number of remaining elements. */ + /* rdx: Address of the current output element. */ + /* rcx: Current element. */ + /* xmm0: Current element. */ + /* ymm0: Current element. */ +#if defined(sus_archfeat_avx) + ".big256cpy:\n" + "cmpq $0x20,%rsi\n" + "jl .big128cpy\n" + "vmovups (%rdi),%ymm0\n" + "vmovups %ymm0,(%rdx)\n" + "addq $0x20,%rdi\n" + "addq $0x20,%rdx\n" + "subq $0x20,%rsi\n" + "jmp .big256cpy\n" +#endif + ".big128cpy:\n" + "cmpq $0x10,%rsi\n" + "jl .wrdcpy\n" + "movdqu (%rdi),%xmm0\n" + "movdqu %xmm0,(%rdx)\n" + "addq $0x10,%rdi\n" + "addq $0x10,%rdx\n" + "subq $0x10,%rsi\n" + "jmp .big128cpy\n" + ".wrdcpy:\n" + "cmpq $0x8,%rsi\n" + "jl .bytecpy\n" + "movq (%rdi),%rcx\n" + "movq %rcx,(%rdx)\n" + "addq $0x8,%rdi\n" + "addq $0x8,%rdx\n" + "subq $0x8,%rsi\n" + "jmp .wrdcpy\n" + ".bytecpy:\n" + "testq %rsi,%rsi\n" + "jz .done\n" + "movb (%rdi),%cl\n" + "movb %cl,(%rdx)\n" + "incq %rdi\n" + "incq %rdx\n" + "decq %rsi\n" + "jmp .bytecpy\n" + ".done:\n" + "ret\n" +#elif defined(sus_arch_ia32) + /* eax: Address of the current input element. */ + "movl 0x4(%esp),%eax\n" + /* ecx: Number of remaining elements. */ + "movl 0x8(%esp),%ecx\n" + /* edx: Address of the current output element. */ + "movl 0xC(%esp),%edx\n" + /* ebx: Current element. */ + "pushl %ebx\n" /* ebx must be restored. */ + /* xmm0: Current element. */ + /* ymm0: Current element. */ +#if defined(sus_archfeat_avx) + ".big256cpy:\n" + "cmpl $0x20,%ecx\n" +#if defined(sus_archfeat_sse) + "jl .big128cpy\n" +#else + "jl .wrdcpy\n" +#endif + "vmovdqu (%eax),%ymm0\n" + "vmovdqu %ymm0,(%edx)\n" + "addl $0x20,%eax\n" + "addl $0x20,%edx\n" + "subl $0x20,%ecx\n" + "jmp .big256cpy\n" +#endif +#if defined(sus_archfeat_sse) + ".big128cpy:\n" + "cmpl $0x10,%ecx\n" + "jl .wrdcpy\n" +#if defined(sus_archfeat_sse2) + "movdqu (%eax),%xmm0\n" + "movdqu %xmm0,(%edx)\n" +#else + "movups (%eax),%xmm0\n" + "movups %xmm0,(%edx)\n" +#endif + "addl $0x10,%eax\n" + "addl $0x10,%edx\n" + "subl $0x10,%ecx\n" + "jmp .big128cpy\n" +#endif + ".wrdcpy:\n" + "cmpl $0x4,%ecx\n" + "jl .bytecpy\n" + "movl (%eax),%ebx\n" + "movl %ebx,(%edx)\n" + "addl $0x4,%eax\n" + "addl $0x4,%edx\n" + "subl $0x4,%ecx\n" + "jmp .wrdcpy\n" + ".bytecpy:\n" + "testl %ecx,%ecx\n" + "jz .done\n" + "movb (%eax),%bl\n" + "movb %bl,(%edx)\n" + "incl %eax\n" + "incl %edx\n" + "decl %ecx\n" + "jmp .bytecpy\n" + ".done:\n" + "popl %ebx\n" + "ret\n" +#endif +); +#else +void rgo_memcpy(void const * const sus_restr _in,sus_typ_usz const _num,void * const sus_restr _out) { + sus_typ_u8 const * in = (sus_typ_u8 const *)_in; + sus_typ_u8 * sus_restr out = (sus_typ_u8 *)_out; + sus_typ_u8 const * const afterbuf = in + _num; + for (;in != afterbuf;++in,++out) {*out = *in;} +} +#endif |