/* Copyright 2022 Gabriel Jensen. This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0. If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/. */

/*
	NOTE(review): this file had three #include directives whose header names
	were lost.  <stddef.h> (size_t) and <stdint.h> (uint_least8_t) are what the
	code below visibly needs.  The third is presumably the project header that
	declares zap_memcpy and provides the zap_priv_fastimpl / sus_arch_* /
	sus_archfeat_* macros -- TODO restore it here.
*/
#include <stddef.h>
#include <stdint.h>

/*
	zap_memcpy: Copy _num bytes from the buffer at _in to the buffer at _out.

	Parameter order is (in, num, out).  Nothing is returned.  Every variant
	copies front to back and does no special handling of overlapping buffers.

	The hand-written assembly is only used when zap_priv_fastimpl is defined AND
	one of the supported architectures is selected; in every other case the
	portable C loop at the bottom of the file is compiled, so the symbol always
	has a real body.
*/
#if defined(zap_priv_fastimpl) && (defined(sus_arch_amd64) || defined(sus_arch_ia32))

/*
	Notes that apply to both assembly variants:

	* The remaining count is a size_t, so all size comparisons branch with the
	  unsigned condition (jb), not the signed one (jl).
	* Labels use the .L prefix and are qualified with the function name so they
	  stay assembler-local and cannot clash with labels of other routines.
	* In AVX builds vzeroupper is executed once the 256-bit loop is finished,
	  before any legacy SSE instruction runs and before returning.
*/
__asm__ (
	".globl zap_memcpy\n"
	"zap_memcpy:\n"
#if defined(sus_arch_amd64)
	/* rdi:  Address of the current input element.  */
	/* rsi:  Number of remaining elements.          */
	/* rdx:  Address of the current output element. */
	/* rcx:  Current element (8 bytes / 1 byte).    */
	/* xmm0: Current element (16 bytes).            */
	/* ymm0: Current element (32 bytes).            */
#if defined(sus_archfeat_avx)
	/* Copy 32 bytes at a time while at least 32 remain. */
	".Lzap_memcpy_big256cpy:\n"
	"cmpq $0x20,%rsi\n"
	"jb .Lzap_memcpy_big256end\n"
	"vmovups (%rdi),%ymm0\n"
	"vmovups %ymm0,(%rdx)\n"
	"addq $0x20,%rdi\n"
	"addq $0x20,%rdx\n"
	"subq $0x20,%rsi\n"
	"jmp .Lzap_memcpy_big256cpy\n"
	".Lzap_memcpy_big256end:\n"
	"vzeroupper\n"
#endif
	/* Copy 16 bytes at a time while at least 16 remain. */
	".Lzap_memcpy_big128cpy:\n"
	"cmpq $0x10,%rsi\n"
	"jb .Lzap_memcpy_wrdcpy\n"
	"movdqu (%rdi),%xmm0\n"
	"movdqu %xmm0,(%rdx)\n"
	"addq $0x10,%rdi\n"
	"addq $0x10,%rdx\n"
	"subq $0x10,%rsi\n"
	"jmp .Lzap_memcpy_big128cpy\n"
	/* Copy 8 bytes at a time while at least 8 remain. */
	".Lzap_memcpy_wrdcpy:\n"
	"cmpq $0x8,%rsi\n"
	"jb .Lzap_memcpy_bytecpy\n"
	"movq (%rdi),%rcx\n"
	"movq %rcx,(%rdx)\n"
	"addq $0x8,%rdi\n"
	"addq $0x8,%rdx\n"
	"subq $0x8,%rsi\n"
	"jmp .Lzap_memcpy_wrdcpy\n"
	/* Copy the remaining bytes one at a time. */
	".Lzap_memcpy_bytecpy:\n"
	"testq %rsi,%rsi\n"
	"jz .Lzap_memcpy_done\n"
	"movb (%rdi),%cl\n"
	"movb %cl,(%rdx)\n"
	"incq %rdi\n"
	"incq %rdx\n"
	"decq %rsi\n"
	"jmp .Lzap_memcpy_bytecpy\n"
	".Lzap_memcpy_done:\n"
	"ret\n"
#elif defined(sus_arch_ia32)
	/* The arguments are read from the stack BEFORE ebx is pushed, so the
	   offsets below are relative to the entry value of esp. */
	/* eax: Address of the current input element. */
	"movl 0x4(%esp),%eax\n"
	/* ecx: Number of remaining elements. */
	"movl 0x8(%esp),%ecx\n"
	/* edx: Address of the current output element. */
	"movl 0xC(%esp),%edx\n"
	/* ebx: Current element (4 bytes / 1 byte). */
	"pushl %ebx\n" /* ebx must be restored. */
	/* xmm0: Current element (16 bytes). */
	/* ymm0: Current element (32 bytes). */
#if defined(sus_archfeat_avx)
	/* Copy 32 bytes at a time while at least 32 remain.  Leaving the loop
	   falls through to the next enabled stage (128-bit if SSE is enabled,
	   otherwise the 4-byte loop). */
	".Lzap_memcpy_big256cpy:\n"
	"cmpl $0x20,%ecx\n"
	"jb .Lzap_memcpy_big256end\n"
	"vmovdqu (%eax),%ymm0\n"
	"vmovdqu %ymm0,(%edx)\n"
	"addl $0x20,%eax\n"
	"addl $0x20,%edx\n"
	"subl $0x20,%ecx\n"
	"jmp .Lzap_memcpy_big256cpy\n"
	".Lzap_memcpy_big256end:\n"
	"vzeroupper\n"
#endif
#if defined(sus_archfeat_sse)
	/* Copy 16 bytes at a time while at least 16 remain. */
	".Lzap_memcpy_big128cpy:\n"
	"cmpl $0x10,%ecx\n"
	"jb .Lzap_memcpy_wrdcpy\n"
#if defined(sus_archfeat_sse2)
	"movdqu (%eax),%xmm0\n"
	"movdqu %xmm0,(%edx)\n"
#else
	"movups (%eax),%xmm0\n"
	"movups %xmm0,(%edx)\n"
#endif
	"addl $0x10,%eax\n"
	"addl $0x10,%edx\n"
	"subl $0x10,%ecx\n"
	"jmp .Lzap_memcpy_big128cpy\n"
#endif
	/* Copy 4 bytes at a time while at least 4 remain. */
	".Lzap_memcpy_wrdcpy:\n"
	"cmpl $0x4,%ecx\n"
	"jb .Lzap_memcpy_bytecpy\n"
	"movl (%eax),%ebx\n"
	"movl %ebx,(%edx)\n"
	"addl $0x4,%eax\n"
	"addl $0x4,%edx\n"
	"subl $0x4,%ecx\n"
	"jmp .Lzap_memcpy_wrdcpy\n"
	/* Copy the remaining bytes one at a time. */
	".Lzap_memcpy_bytecpy:\n"
	"testl %ecx,%ecx\n"
	"jz .Lzap_memcpy_done\n"
	"movb (%eax),%bl\n"
	"movb %bl,(%edx)\n"
	"incl %eax\n"
	"incl %edx\n"
	"decl %ecx\n"
	"jmp .Lzap_memcpy_bytecpy\n"
	".Lzap_memcpy_done:\n"
	"popl %ebx\n"
	"ret\n"
#endif
);

#else

/* Portable variant: a plain byte-by-byte copy from the front of the buffers to
   the back.  A count of zero copies nothing. */
void zap_memcpy(void const * const _in,size_t const _num,void * const _out) {
	uint_least8_t const *       in       = (uint_least8_t const *)_in;
	uint_least8_t *             out      = (uint_least8_t *)_out;
	uint_least8_t const * const afterbuf = in + _num; /* One past the last input byte. */
	for (;in != afterbuf;++in,++out) {
		*out = *in;
	}
}

#endif