/*
Copyright 2022 Gabriel Jensen.
This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/
#include <zap/priv.h>
#include <zap/mem.h>
#include <stddef.h>
#include <stdint.h>
#if zap_priv_fastimpl
/*
	Hand-written implementation of

		void zap_memcpy(void const * in,size_t num,void * out)

	Copies num bytes from in to out, front to back, using the widest chunk that still fits in the remaining count:
	32 bytes (AVX, if enabled), then 16 bytes (SSE), then one machine word, then single bytes.
	All vector loads and stores use the unaligned move instructions, so neither buffer needs any particular alignment.

	num is unsigned, so every size check branches with the unsigned condition (jb).
	A signed branch would treat counts with the top bit set as negative and skip all of the wide loops.
*/
__asm__ (
".globl zap_memcpy\n"
"zap_memcpy:\n"
/*
	Parameters, in order:

	void const * in
	size_t       num
	void *       out
*/
#if defined(sus_arch_amd64)
/* The arguments are read directly from the registers they are passed in: */
/* rdi: Address of the current input element. */
/* rsi: Number of remaining elements. */
/* rdx: Address of the current output element. */
/* rcx: Current element (word and byte loops). */
/* xmm0: Current element (128-bit loop). */
/* ymm0: Current element (256-bit loop). */
#if defined(sus_archfeat_avx)
/* Copy 32 bytes at a time for as long as at least 32 remain. */
".big256cpy:\n"
"cmpq $0x20,%rsi\n"
"jb .big256done\n"
"vmovups (%rdi),%ymm0\n"
"vmovups %ymm0,(%rdx)\n"
"addq $0x20,%rdi\n"
"addq $0x20,%rdx\n"
"subq $0x20,%rsi\n"
"jmp .big256cpy\n"
".big256done:\n"
/* Clear the upper ymm halves so that the legacy SSE code below (and in the caller) does not pay for an AVX-SSE transition. */
"vzeroupper\n"
#endif
/* Copy 16 bytes at a time for as long as at least 16 remain. */
".big128cpy:\n"
"cmpq $0x10,%rsi\n"
"jb .wrdcpy\n"
"movdqu (%rdi),%xmm0\n"
"movdqu %xmm0,(%rdx)\n"
"addq $0x10,%rdi\n"
"addq $0x10,%rdx\n"
"subq $0x10,%rsi\n"
"jmp .big128cpy\n"
/* Copy 8 bytes at a time for as long as at least 8 remain. */
".wrdcpy:\n"
"cmpq $0x8,%rsi\n"
"jb .bytecpy\n"
"movq (%rdi),%rcx\n"
"movq %rcx,(%rdx)\n"
"addq $0x8,%rdi\n"
"addq $0x8,%rdx\n"
"subq $0x8,%rsi\n"
"jmp .wrdcpy\n"
/* Copy the remaining bytes one by one. */
".bytecpy:\n"
"testq %rsi,%rsi\n"
"jz .done\n"
"movb (%rdi),%cl\n"
"movb %cl,(%rdx)\n"
"incq %rdi\n"
"incq %rdx\n"
"decq %rsi\n"
"jmp .bytecpy\n"
".done:\n"
"ret\n"
#elif defined(sus_arch_ia32)
/* The arguments are loaded from the stack before ebx is pushed, so the offsets are relative to the return address. */
/* eax: Address of the current input element. */
"movl 0x4(%esp),%eax\n"
/* ecx: Number of remaining elements. */
"movl 0x8(%esp),%ecx\n"
/* edx: Address of the current output element. */
"movl 0xC(%esp),%edx\n"
/* ebx: Current element (word and byte loops). */
"pushl %ebx\n" /* ebx must be restored. */
/* xmm0: Current element (128-bit loop). */
/* ymm0: Current element (256-bit loop). */
#if defined(sus_archfeat_avx)
/* Copy 32 bytes at a time for as long as at least 32 remain. */
".big256cpy:\n"
"cmpl $0x20,%ecx\n"
"jb .big256done\n"
"vmovdqu (%eax),%ymm0\n"
"vmovdqu %ymm0,(%edx)\n"
"addl $0x20,%eax\n"
"addl $0x20,%edx\n"
"subl $0x20,%ecx\n"
"jmp .big256cpy\n"
".big256done:\n"
/* Clear the upper ymm halves before any legacy SSE code runs; execution then falls through to the next enabled loop. */
"vzeroupper\n"
#endif
#if defined(sus_archfeat_sse)
/* Copy 16 bytes at a time for as long as at least 16 remain. */
".big128cpy:\n"
"cmpl $0x10,%ecx\n"
"jb .wrdcpy\n"
#if defined(sus_archfeat_sse2)
"movdqu (%eax),%xmm0\n"
"movdqu %xmm0,(%edx)\n"
#else
/* movdqu is an SSE2 instruction; plain SSE only has movups. */
"movups (%eax),%xmm0\n"
"movups %xmm0,(%edx)\n"
#endif
"addl $0x10,%eax\n"
"addl $0x10,%edx\n"
"subl $0x10,%ecx\n"
"jmp .big128cpy\n"
#endif
/* Copy 4 bytes at a time for as long as at least 4 remain. */
".wrdcpy:\n"
"cmpl $0x4,%ecx\n"
"jb .bytecpy\n"
"movl (%eax),%ebx\n"
"movl %ebx,(%edx)\n"
"addl $0x4,%eax\n"
"addl $0x4,%edx\n"
"subl $0x4,%ecx\n"
"jmp .wrdcpy\n"
/* Copy the remaining bytes one by one. */
".bytecpy:\n"
"testl %ecx,%ecx\n"
"jz .done\n"
"movb (%eax),%bl\n"
"movb %bl,(%edx)\n"
"incl %eax\n"
"incl %edx\n"
"decl %ecx\n"
"jmp .bytecpy\n"
".done:\n"
"popl %ebx\n"
"ret\n"
#endif
);
#else
/*
	Portable fallback for zap_memcpy.

	Copies _num bytes from the buffer at _in to the buffer at _out.
	The bytes are transferred one at a time, from the lowest address to the highest.
	Nothing is read or written when _num is zero.
*/
void zap_memcpy(void const * const _in,size_t const _num,void * const _out) {
	unsigned char const * src       = _in;
	unsigned char *       dst       = _out;
	size_t                remaining = _num;
	while (remaining != 0x0) {
		*dst = *src;
		++src;
		++dst;
		--remaining;
	}
}
#endif