summaryrefslogblamecommitdiff
path: root/zap/src/memcpy.c
blob: ae923c3136c1e682475a2535f01a328b08bc04f6 (plain) (tree)
1
2
3
4
5
6
7
8
9
10





                                                                                                                    
                     
 

                    


                   
                     
         
                             
 
                       




                                        
















































































































                                                                 
                                                                             


                                                         


                                                      
/*
	Copyright 2022 Gabriel Jensen.
	This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
	If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/

#include <zap/priv.h>

#include <zap/mem.h>

#include <stddef.h>
#include <stdint.h>

#if zap_priv_fastimpl
/*
	Hand-written x86 implementation of zap_memcpy, emitted as file-scope assembly.
	Copies num bytes from in to out, front to back, in progressively narrower steps:
	0x20 bytes at a time (only when sus_archfeat_avx is defined), then 0x10 bytes (on ia32 only when sus_archfeat_sse is defined), then one machine word, then single bytes.
	Every vector move used here is an unaligned-access variant (vmovups, vmovdqu, movdqu, movups).
	NOTE(review): There is no #else after the two architecture branches; if neither sus_arch_amd64 nor sus_arch_ia32 is defined, the zap_memcpy label is emitted with no instructions behind it. Presumably zap_priv_fastimpl is only set for these two architectures - confirm in zap/priv.h.
	NOTE(review): The loop labels start with "." but not ".L", so on ELF targets the assembler likely keeps them as ordinary file-local symbols instead of discarding them - confirm that this is intended.
*/
__asm__ (
	/* Export the symbol so that callers in other translation units can link against it. */
	".globl zap_memcpy\n"

	"zap_memcpy:\n"
		/*
			Parameters, in declaration order:
			void const * in   (source buffer)
			size_t       num  (number of bytes to copy)
			void *       out  (destination buffer)
		*/
#if defined(sus_arch_amd64)
	/* The register assignments below assume the System V AMD64 calling convention (first three integer arguments in rdi, rsi, rdx) - TODO confirm for every supported target. */
	/* rdi: Address of the current input element. */
	/* rsi: Number of remaining elements. */
	/* rdx: Address of the current output element. */
	/* rcx: Current element (scratch for the word and byte loops; the byte loop uses cl). */
	/* xmm0: Current element (0x10-byte loop). */
	/* ymm0: Current element (0x20-byte loop). */
#if defined(sus_archfeat_avx)
	/* Copy 0x20 bytes per iteration for as long as at least 0x20 bytes remain. */
	/* NOTE(review): jl is a signed comparison whereas num is a size_t; a count with the top bit set drops straight through to the byte loop (still copied, just one byte at a time). jb would compare unsigned - confirm before changing. */
	/* NOTE(review): No vzeroupper is issued after this loop, so ymm0 is still dirty when the legacy-SSE moves below and the final ret execute. Intel optimisation guidance recommends vzeroupper to avoid AVX/SSE transition penalties - confirm whether one should be added. */
	".big256cpy:\n"
		"cmpq $0x20,%rsi\n"
		"jl .big128cpy\n"
		"vmovups (%rdi),%ymm0\n"
		"vmovups %ymm0,(%rdx)\n"
		"addq $0x20,%rdi\n"
		"addq $0x20,%rdx\n"
		"subq $0x20,%rsi\n"
		"jmp .big256cpy\n"
#endif
	/* Copy 0x10 bytes per iteration for as long as at least 0x10 bytes remain. */
	".big128cpy:\n"
		"cmpq $0x10,%rsi\n"
		"jl .wrdcpy\n"
		"movdqu (%rdi),%xmm0\n"
		"movdqu %xmm0,(%rdx)\n"
		"addq $0x10,%rdi\n"
		"addq $0x10,%rdx\n"
		"subq $0x10,%rsi\n"
		"jmp .big128cpy\n"
	/* Copy one 0x8-byte word per iteration for as long as at least 0x8 bytes remain. */
	".wrdcpy:\n"
		"cmpq $0x8,%rsi\n"
		"jl .bytecpy\n"
		"movq (%rdi),%rcx\n"
		"movq %rcx,(%rdx)\n"
		"addq $0x8,%rdi\n"
		"addq $0x8,%rdx\n"
		"subq $0x8,%rsi\n"
		"jmp .wrdcpy\n"
	/* Copy the remaining bytes one at a time until the count reaches zero. */
	".bytecpy:\n"
		"testq %rsi,%rsi\n"
		"jz .done\n"
		"movb (%rdi),%cl\n"
		"movb %cl,(%rdx)\n"
		"incq %rdi\n"
		"incq %rdx\n"
		"decq %rsi\n"
		"jmp .bytecpy\n"
	/* Nothing was pushed on this path, so return directly. */
	".done:\n"
		"ret\n"
#elif defined(sus_arch_ia32)
		/* The arguments are read from the stack before ebx is pushed, so the offsets are relative to esp as it was on entry. This assumes a cdecl-style convention with the return address at 0x0(%esp) - TODO confirm. */
		/* eax: Address of the current input element. */
		"movl 0x4(%esp),%eax\n"
		/* ecx: Number of remaining elements. */
		"movl 0x8(%esp),%ecx\n"
		/* edx: Address of the current output element. */
		"movl 0xC(%esp),%edx\n"
		/* ebx: Current element (scratch for the word and byte loops; the byte loop uses bl). */
		"pushl %ebx\n" /* ebx must be restored. */
		/* xmm0: Current element (0x10-byte loop). */
		/* ymm0: Current element (0x20-byte loop). */
#if defined(sus_archfeat_avx)
	/* Copy 0x20 bytes per iteration for as long as at least 0x20 bytes remain. */
	/* NOTE(review): As on amd64, jl compares signed although num is a size_t, and no vzeroupper follows this loop - confirm whether either should change. */
	".big256cpy:\n"
		"cmpl $0x20,%ecx\n"
	/* The 0x10-byte loop only exists when sus_archfeat_sse is defined; otherwise continue with the word loop. */
#if defined(sus_archfeat_sse)
		"jl .big128cpy\n"
#else
		"jl .wrdcpy\n"
#endif
		"vmovdqu (%eax),%ymm0\n"
		"vmovdqu %ymm0,(%edx)\n"
		"addl $0x20,%eax\n"
		"addl $0x20,%edx\n"
		"subl $0x20,%ecx\n"
		"jmp .big256cpy\n"
#endif
#if defined(sus_archfeat_sse)
	/* Copy 0x10 bytes per iteration for as long as at least 0x10 bytes remain. */
	".big128cpy:\n"
		"cmpl $0x10,%ecx\n"
		"jl .wrdcpy\n"
	/* movdqu is used when sus_archfeat_sse2 is defined; otherwise movups performs the same 0x10-byte unaligned move. */
#if defined(sus_archfeat_sse2)
		"movdqu (%eax),%xmm0\n"
		"movdqu %xmm0,(%edx)\n"
#else
		"movups (%eax),%xmm0\n"
		"movups %xmm0,(%edx)\n"
#endif
		"addl $0x10,%eax\n"
		"addl $0x10,%edx\n"
		"subl $0x10,%ecx\n"
		"jmp .big128cpy\n"
#endif
	/* Copy one 0x4-byte word per iteration for as long as at least 0x4 bytes remain. */
	".wrdcpy:\n"
		"cmpl $0x4,%ecx\n"
		"jl .bytecpy\n"
		"movl (%eax),%ebx\n"
		"movl %ebx,(%edx)\n"
		"addl $0x4,%eax\n"
		"addl $0x4,%edx\n"
		"subl $0x4,%ecx\n"
		"jmp .wrdcpy\n"
	/* Copy the remaining bytes one at a time until the count reaches zero. */
	".bytecpy:\n"
		"testl %ecx,%ecx\n"
		"jz .done\n"
		"movb (%eax),%bl\n"
		"movb %bl,(%edx)\n"
		"incl %eax\n"
		"incl %edx\n"
		"decl %ecx\n"
		"jmp .bytecpy\n"
	/* Restore the ebx value saved on entry, then return. */
	".done:\n"
		"popl %ebx\n"
		"ret\n"
#endif
);
#else
/*
	Portable fallback for zap_memcpy, used when zap_priv_fastimpl is false.
	Copies _num bytes from the buffer at _in to the buffer at _out, one byte at a time, starting at the lowest address.
	A count of zero copies nothing.
*/
void zap_memcpy(void const * const _in,size_t const _num,void * const _out) {
	/* Raw bytes are accessed through unsigned char pointers. */
	unsigned char const * const src = _in;
	unsigned char * const       dst = _out;
	for (size_t pos = 0;pos < _num;++pos) {
		dst[pos] = src[pos];
	}
}
#endif