/* Copyright 2022 Gabriel Jensen. This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0. If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/. */

/*
	NOTE(review): the operand (header name) of the #include below is missing
	from the copy of this file that was reviewed. Restore the original header
	name. The code in this file only tests compiler-predefined macros
	(__i386__, __x86_64__, __AVX__, __SSE__, __SSE2__, __ELF__).
*/
#include

/*
	rgo_memcpy(void const * in, size_t num, void * out)

	Syntax: GNU as, AT&T operand order, passed through the C preprocessor.

	Copies num bytes from in to out. Both buffers are walked from the lowest
	address upwards, and the remaining count is consumed in progressively
	smaller chunks:

		32 bytes	(only when __AVX__ is defined)
		16 bytes	(i386: only when __SSE__ is defined; x86-64: always)
		one word	(4 bytes on i386, 8 bytes on x86-64)
		 1 byte

	All loads and stores use unaligned-capable instructions, so neither
	pointer needs any particular alignment. No overlap check is made.

	Arguments:
		i386:	in = 4(%esp), num = 8(%esp), out = 12(%esp)
		x86-64:	in = rdi,     num = rsi,     out = rdx

	Return: no return value is set up. On i386 eax is left holding the
	advanced input pointer; on x86-64 rax is not touched.
	NOTE(review): the C prototype is not visible from this file - confirm
	that callers expect no result.

	Clobbers:
		i386:	eax, ecx, edx, xmm0/ymm0 (when used), flags
			ebx is used as scratch but saved and restored.
		x86-64:	rdi, rsi, rdx, rcx, xmm0/ymm0 (when used), flags

	num is a size_t, so every comparison against a chunk size uses the
	unsigned condition codes (jb/jae). With the signed ones, a count with the
	top bit set would look negative and skip every wide loop.
*/

	.text
	.global	rgo_memcpy
#if defined(__ELF__)
	.type	rgo_memcpy,@function
#endif
rgo_memcpy:

#if defined(__i386__)

	movl	0x4(%esp),%eax			/* eax: address of the current input element */
	movl	0x8(%esp),%ecx			/* ecx: number of remaining bytes */
	movl	0xC(%esp),%edx			/* edx: address of the current output element */
	pushl	%ebx				/* ebx: current element (scratch); must be restored */

#if defined(__AVX__)
	/* 32 bytes at a time through ymm0. */
	cmpl	$0x20,%ecx
	jb	.Lskip256cpy
.Lbig256cpy:
	vmovdqu	(%eax),%ymm0
	vmovdqu	%ymm0,(%edx)
	addl	$0x20,%eax
	addl	$0x20,%edx
	subl	$0x20,%ecx
	cmpl	$0x20,%ecx
	jae	.Lbig256cpy
	/* ymm0 was written: clear the upper halves before any legacy-SSE code
	   (below, or in the caller) runs, to avoid AVX/SSE transition stalls. */
	vzeroupper
.Lskip256cpy:
#endif

#if defined(__SSE__)
	/* 16 bytes at a time through xmm0. */
.Lbig128cpy:
	cmpl	$0x10,%ecx
	jb	.Lwrdcpy
#if defined(__SSE2__)
	movdqu	(%eax),%xmm0
	movdqu	%xmm0,(%edx)
#else
	movups	(%eax),%xmm0
	movups	%xmm0,(%edx)
#endif
	addl	$0x10,%eax
	addl	$0x10,%edx
	subl	$0x10,%ecx
	jmp	.Lbig128cpy
#endif

	/* 4 bytes at a time through ebx. */
.Lwrdcpy:
	cmpl	$0x4,%ecx
	jb	.Lbytecpy
	movl	(%eax),%ebx
	movl	%ebx,(%edx)
	addl	$0x4,%eax
	addl	$0x4,%edx
	subl	$0x4,%ecx
	jmp	.Lwrdcpy

	/* Remaining 0-3 bytes, one at a time through bl. */
.Lbytecpy:
	testl	%ecx,%ecx
	jz	.Ldone
	movb	(%eax),%bl
	movb	%bl,(%edx)
	incl	%eax
	incl	%edx
	decl	%ecx
	jmp	.Lbytecpy

.Ldone:
	popl	%ebx
	ret

#elif defined(__x86_64__)

	/*
		rdi:  address of the current input element
		rsi:  number of remaining bytes
		rdx:  address of the current output element
		rcx:  current element (word and byte loops)
		xmm0: current element (16-byte loop)
		ymm0: current element (32-byte loop)
	*/

#if defined(__AVX__)
	/* 32 bytes at a time through ymm0. */
	cmpq	$0x20,%rsi
	jb	.Lbig128cpy
.Lbig256cpy:
	vmovups	(%rdi),%ymm0
	vmovups	%ymm0,(%rdx)
	addq	$0x20,%rdi
	addq	$0x20,%rdx
	subq	$0x20,%rsi
	cmpq	$0x20,%rsi
	jae	.Lbig256cpy
	/* ymm0 was written: clear the upper halves before the legacy-SSE loop
	   below (and the caller) runs, to avoid AVX/SSE transition stalls. */
	vzeroupper
#endif

	/* 16 bytes at a time through xmm0. */
.Lbig128cpy:
	cmpq	$0x10,%rsi
	jb	.Lwrdcpy
	movdqu	(%rdi),%xmm0
	movdqu	%xmm0,(%rdx)
	addq	$0x10,%rdi
	addq	$0x10,%rdx
	subq	$0x10,%rsi
	jmp	.Lbig128cpy

	/* 8 bytes at a time through rcx. */
.Lwrdcpy:
	cmpq	$0x8,%rsi
	jb	.Lbytecpy
	movq	(%rdi),%rcx
	movq	%rcx,(%rdx)
	addq	$0x8,%rdi
	addq	$0x8,%rdx
	subq	$0x8,%rsi
	jmp	.Lwrdcpy

	/* Remaining 0-7 bytes, one at a time through cl. */
.Lbytecpy:
	testq	%rsi,%rsi
	jz	.Ldone
	movb	(%rdi),%cl
	movb	%cl,(%rdx)
	incq	%rdi
	incq	%rdx
	decq	%rsi
	jmp	.Lbytecpy

.Ldone:
	ret

#else
	/* Without this, the label above would be emitted with no body at all. */
#error "rgo_memcpy: unsupported architecture (only i386 and x86-64 are implemented)"
#endif

#if defined(__ELF__)
	.size	rgo_memcpy,.-rgo_memcpy
	/* Declare that this object does not need an executable stack; linkers
	   may otherwise assume one for hand-written assembly. */
	.section	.note.GNU-stack,"",@progbits
#endif