summaryrefslogblamecommitdiff
path: root/rgo/src/memcpy.c
blob: 89adc14d16df8dd7ff8676e8f822030ae4716418 (plain) (tree)









































































































































                                                                                                                    
/*
	Copyright 2022 Gabriel Jensen.
	This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
	If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/

#include <rgo-priv.h>

#if defined(rgo_priv_fastimpl)
/*
	Hand-written implementation of rgo_memcpy, emitted as file-scope assembly when rgo_priv_fastimpl is defined.
	Copies num bytes from in to out in descending chunk sizes: 32 bytes (only with AVX), 16 bytes, one machine word, and finally single bytes.
	Each stage loops until fewer bytes than its chunk size remain and then branches to the next smaller stage.
	NOTE(review): The count checks use jl, which is a signed comparison. A count with the top bit set therefore skips every wide stage and is copied by the byte loop alone. Confirm that this is intended.
	NOTE(review): The labels start with a single dot and are not of the .L form. Verify how the assembler in use treats them (symbol table visibility, clashes with other assembly in the same unit).
*/
__asm__ (
	/* Export the symbol so that other translation units can link against it. */
	".global rgo_memcpy\n"

	"rgo_memcpy:\n"
	/*
		Parameters, in order:
		void const * in:  Buffer to copy from.
		sus_typ_usz  num: Number of bytes to copy.
		void *       out: Buffer to copy to.
	*/
#if defined(sus_arch_amd64)
	/*
		The arguments are used directly from rdi, rsi, and rdx without any stack access.
		This presumably relies on the System V calling convention - TODO confirm for other ABIs.
	*/
	/* rdi: Address of the current input element. */
	/* rsi: Number of remaining elements. */
	/* rdx: Address of the current output element. */
	/* rcx: Current element. */
	/* xmm0: Current element. */
	/* ymm0: Current element. */
#if defined(sus_archfeat_avx)
	/*
		32-byte stage: Move one ymm register's worth per iteration using unaligned loads and stores.
		NOTE(review): No vzeroupper is issued before the code continues with legacy SSE instructions or returns. Check whether that matters for the targets in question.
	*/
	".big256cpy:\n"
		"cmpq $0x20,%rsi\n"
		"jl .big128cpy\n"
		"vmovups (%rdi),%ymm0\n"
		"vmovups %ymm0,(%rdx)\n"
		"addq $0x20,%rdi\n"
		"addq $0x20,%rdx\n"
		"subq $0x20,%rsi\n"
		"jmp .big256cpy\n"
#endif
	/* 16-byte stage: Always assembled on amd64. Uses unaligned moves through xmm0. */
	".big128cpy:\n"
		"cmpq $0x10,%rsi\n"
		"jl .wrdcpy\n"
		"movdqu (%rdi),%xmm0\n"
		"movdqu %xmm0,(%rdx)\n"
		"addq $0x10,%rdi\n"
		"addq $0x10,%rdx\n"
		"subq $0x10,%rsi\n"
		"jmp .big128cpy\n"
	/* Word stage: 8 bytes per iteration through rcx. */
	".wrdcpy:\n"
		"cmpq $0x8,%rsi\n"
		"jl .bytecpy\n"
		"movq (%rdi),%rcx\n"
		"movq %rcx,(%rdx)\n"
		"addq $0x8,%rdi\n"
		"addq $0x8,%rdx\n"
		"subq $0x8,%rsi\n"
		"jmp .wrdcpy\n"
	/* Byte stage: Copies the remaining bytes one at a time through cl until the count reaches zero. */
	".bytecpy:\n"
		"testq %rsi,%rsi\n"
		"jz .done\n"
		"movb (%rdi),%cl\n"
		"movb %cl,(%rdx)\n"
		"incq %rdi\n"
		"incq %rdx\n"
		"decq %rsi\n"
		"jmp .bytecpy\n"
	".done:\n"
		"ret\n"
#elif defined(sus_arch_ia32)
		/*
			All three arguments are loaded from the stack, above the return address.
			The loads happen before ebx is pushed, so the offsets are relative to the stack pointer at entry.
		*/
		/* eax: Address of the current input element. */
		"movl 0x4(%esp),%eax\n"
		/* ecx: Number of remaining elements. */
		"movl 0x8(%esp),%ecx\n"
		/* edx: Address of the current output element. */
		"movl 0xC(%esp),%edx\n"
		/* ebx: Current element. */
		"pushl %ebx\n" /* ebx must be restored. */
		/* xmm0: Current element. */
		/* ymm0: Current element. */
#if defined(sus_archfeat_avx)
	/*
		32-byte stage: Move one ymm register's worth per iteration using unaligned loads and stores.
		NOTE(review): As on amd64, no vzeroupper follows this loop. Check whether that matters for the targets in question.
	*/
	".big256cpy:\n"
		"cmpl $0x20,%ecx\n"
	/* The 16-byte stage is only assembled with SSE; without it, fall straight to the word stage. */
#if defined(sus_archfeat_sse)
		"jl .big128cpy\n"
#else
		"jl .wrdcpy\n"
#endif
		"vmovdqu (%eax),%ymm0\n"
		"vmovdqu %ymm0,(%edx)\n"
		"addl $0x20,%eax\n"
		"addl $0x20,%edx\n"
		"subl $0x20,%ecx\n"
		"jmp .big256cpy\n"
#endif
#if defined(sus_archfeat_sse)
	/* 16-byte stage: Unaligned moves through xmm0. */
	".big128cpy:\n"
		"cmpl $0x10,%ecx\n"
		"jl .wrdcpy\n"
	/* movdqu is used when SSE2 is available; otherwise the SSE instruction movups performs the same 16-byte move. */
#if defined(sus_archfeat_sse2)
		"movdqu (%eax),%xmm0\n"
		"movdqu %xmm0,(%edx)\n"
#else
		"movups (%eax),%xmm0\n"
		"movups %xmm0,(%edx)\n"
#endif
		"addl $0x10,%eax\n"
		"addl $0x10,%edx\n"
		"subl $0x10,%ecx\n"
		"jmp .big128cpy\n"
#endif
	/* Word stage: 4 bytes per iteration through ebx. */
	".wrdcpy:\n"
		"cmpl $0x4,%ecx\n"
		"jl .bytecpy\n"
		"movl (%eax),%ebx\n"
		"movl %ebx,(%edx)\n"
		"addl $0x4,%eax\n"
		"addl $0x4,%edx\n"
		"subl $0x4,%ecx\n"
		"jmp .wrdcpy\n"
	/* Byte stage: Copies the remaining bytes one at a time through bl until the count reaches zero. */
	".bytecpy:\n"
		"testl %ecx,%ecx\n"
		"jz .done\n"
		"movb (%eax),%bl\n"
		"movb %bl,(%edx)\n"
		"incl %eax\n"
		"incl %edx\n"
		"decl %ecx\n"
		"jmp .bytecpy\n"
	/* Undo the push of ebx from the prologue before returning. */
	".done:\n"
		"popl %ebx\n"
		"ret\n"
#endif
);
#else
/*
	Portable fallback for rgo_memcpy, used when rgo_priv_fastimpl is not defined.
	Copies _num bytes from the buffer at _in to the buffer at _out, one byte at a time, in ascending address order.
	Nothing is read or written when _num is zero.
	Both pointers are qualified with sus_restr (presumably restrict - TODO confirm), so the buffers are expected not to overlap.
*/
void rgo_memcpy(void const * const sus_restr _in,sus_typ_usz const _num,void * const sus_restr _out) {
	sus_typ_u8 const *     src       = (sus_typ_u8 const *)_in;
	sus_typ_u8 * sus_restr dst       = (sus_typ_u8 *)_out;
	sus_typ_usz            remaining = _num;
	/* Count the remaining bytes down instead of comparing against an end pointer. */
	while (remaining != 0x0) {
		*dst = *src;
		++src;
		++dst;
		--remaining;
	}
}
#endif