diff options
Diffstat (limited to 'rgo/src/memcpy.S')
-rw-r--r-- | rgo/src/memcpy.S | 101 |
1 files changed, 70 insertions, 31 deletions
diff --git a/rgo/src/memcpy.S b/rgo/src/memcpy.S index 51d82f9..475da57 100644 --- a/rgo/src/memcpy.S +++ b/rgo/src/memcpy.S @@ -3,7 +3,7 @@ This file is part of rgo. - rgo is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. + rgo is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. rgo is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. @@ -15,46 +15,85 @@ .global rgo_memcpy rgo_memcpy: -#if defined(__x86_64__) /* - rdi: void const * in - rsi: size_t num - rdx: void * out + void const * in + size_t num + void * out */ - /* rcx: Address of the current input element. */ - movq %rdi,%rcx - /* r8: Address of the current output element. */ - movq %rdx,%r8 - /* r9: Number of remaining elements. */ - movq %rsi,%r9 - /* r10: Temporary. */ - /* xmm0: Temporary. */ +#if defined(__i386__) + /* eax: Address of the current input element. */ + movl 0x4(%esp),%eax + /* ecx: Number of remaining elements. */ + movl 0x8(%esp),%ecx + /* edx: Address of the current output element. */ + movl 0xC(%esp),%edx + /* ebx: Current element. */ + pushl %ebx /* ebx must be restored. */ + /* xmm0: Current element. 
*/ +#if defined(__SSE__) .big128cpy: - cmpq $0x10,%r9 + cmpl $0x10,%ecx jl .wrdcpy - movups (%rcx),%xmm0 - movups %xmm0,(%r8) - addq $0x10,%rcx - addq $0x10,%r8 - subq $0x10,%r9 + movups (%eax),%xmm0 + movups %xmm0,(%edx) + addl $0x10,%eax + addl $0x10,%edx + subl $0x10,%ecx + jmp .big128cpy +#endif +.wrdcpy: + cmpl $0x4,%ecx + jl .bytecpy + movl (%eax),%ebx + movl %ebx,(%edx) + addl $0x4,%eax + addl $0x4,%edx + subl $0x4,%ecx + jmp .wrdcpy +.bytecpy: + testl %ecx,%ecx + jz .done + movb (%eax),%bl + movb %bl,(%edx) + incl %eax + incl %edx + decl %ecx + jmp .bytecpy +.done: + popl %ebx + ret +#elif defined(__x86_64__) + /* rdi: Address of the current input element. */ + /* rsi: Number of remaining elements. */ + /* rdx: Address of the current output element. */ + /* rcx: Current element. */ + /* xmm0: Current element. */ +.big128cpy: + cmpq $0x10,%rsi + jl .wrdcpy + movups (%rdi),%xmm0 + movups %xmm0,(%rdx) + addq $0x10,%rdi + addq $0x10,%rdx + subq $0x10,%rsi jmp .big128cpy .wrdcpy: - cmpq $0x8,%r9 + cmpq $0x8,%rsi jl .bytecpy - movq (%rcx),%r10 - movq %r10,(%r8) - addq $0x8,%rcx - addq $0x8,%r8 - subq $0x8,%r9 + movq (%rdi),%rcx + movq %rcx,(%rdx) + addq $0x8,%rdi + addq $0x8,%rdx + subq $0x8,%rsi jmp .wrdcpy .bytecpy: - testq %r9,%r9 + testq %rsi,%rsi jz .done - movb (%rcx),%r10b - movb %r10b,(%r8) - incq %rcx - incq %r8 - decq %r9 + movb (%rdi),%cl + movb %cl,(%rdx) + incq %rdi + incq %rdx + decq %rsi jmp .bytecpy .done: ret |