summaryrefslogtreecommitdiff
path: root/rgo/src/memcpy.S
diff options
context:
space:
mode:
Diffstat (limited to 'rgo/src/memcpy.S')
-rw-r--r--rgo/src/memcpy.S101
1 files changed, 70 insertions, 31 deletions
diff --git a/rgo/src/memcpy.S b/rgo/src/memcpy.S
index 51d82f9..475da57 100644
--- a/rgo/src/memcpy.S
+++ b/rgo/src/memcpy.S
@@ -3,7 +3,7 @@
This file is part of rgo.
- rgo is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
+ rgo is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
rgo is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.
@@ -15,46 +15,85 @@
.global rgo_memcpy
rgo_memcpy:
-#if defined(__x86_64__)
/*
- rdi: void const * in
- rsi: size_t num
- rdx: void * out
+ void const * in
+ size_t num
+ void * out
*/
- /* rcx: Address of the current input element. */
- movq %rdi,%rcx
- /* r8: Address of the current output element. */
- movq %rdx,%r8
- /* r9: Number of remaining elements. */
- movq %rsi,%r9
- /* r10: Temporary. */
- /* xmm0: Temporary. */
+#if defined(__i386__)
+ /* eax: Address of the current input element. */
+ movl 0x4(%esp),%eax
+ /* ecx: Number of remaining elements. */
+ movl 0x8(%esp),%ecx
+ /* edx: Address of the current output element. */
+ movl 0xC(%esp),%edx
+ /* ebx: Current element. */
+ pushl %ebx /* ebx must be restored. */
+ /* xmm0: Current element. */
+#if defined(__SSE__)
.big128cpy:
- cmpq $0x10,%r9
+ cmpl $0x10,%ecx
jl .wrdcpy
- movups (%rcx),%xmm0
- movups %xmm0,(%r8)
- addq $0x10,%rcx
- addq $0x10,%r8
- subq $0x10,%r9
+ movups (%eax),%xmm0
+ movups %xmm0,(%edx)
+ addl $0x10,%eax
+ addl $0x10,%edx
+ subl $0x10,%ecx
+ jmp .big128cpy
+#endif
+.wrdcpy:
+ cmpl $0x4,%ecx
+ jl .bytecpy
+ movl (%eax),%ebx
+ movl %ebx,(%edx)
+ addl $0x4,%eax
+ addl $0x4,%edx
+ subl $0x4,%ecx
+ jmp .wrdcpy
+.bytecpy:
+ testl %ecx,%ecx
+ jz .done
+ movb (%eax),%bl
+ movb %bl,(%edx)
+ incl %eax
+ incl %edx
+ decl %ecx
+ jmp .bytecpy
+.done:
+ popl %ebx
+ ret
+#elif defined(__x86_64__)
+ /* rdi: Address of the current input element. */
+ /* rsi: Number of remaining elements. */
+ /* rdx: Address of the current output element. */
+ /* rcx: Current element. */
+ /* xmm0: Current element. */
+.big128cpy:
+ cmpq $0x10,%rsi
+ jl .wrdcpy
+ movups (%rdi),%xmm0
+ movups %xmm0,(%rdx)
+ addq $0x10,%rdi
+ addq $0x10,%rdx
+ subq $0x10,%rsi
jmp .big128cpy
.wrdcpy:
- cmpq $0x8,%r9
+ cmpq $0x8,%rsi
jl .bytecpy
- movq (%rcx),%r10
- movq %r10,(%r8)
- addq $0x8,%rcx
- addq $0x8,%r8
- subq $0x8,%r9
+ movq (%rdi),%rcx
+ movq %rcx,(%rdx)
+ addq $0x8,%rdi
+ addq $0x8,%rdx
+ subq $0x8,%rsi
jmp .wrdcpy
.bytecpy:
- testq %r9,%r9
+ testq %rsi,%rsi
jz .done
- movb (%rcx),%r10b
- movb %r10b,(%r8)
- incq %rcx
- incq %r8
- decq %r9
+ movb (%rdi),%cl
+ movb %cl,(%rdx)
+ incq %rdi
+ incq %rdx
+ decq %rsi
jmp .bytecpy
.done:
ret