summaryrefslogtreecommitdiff
path: root/zap/source/amd64/mem/memcp.S
diff options
context:
space:
mode:
Diffstat (limited to 'zap/source/amd64/mem/memcp.S')
-rw-r--r--zap/source/amd64/mem/memcp.S91
1 files changed, 48 insertions, 43 deletions
diff --git a/zap/source/amd64/mem/memcp.S b/zap/source/amd64/mem/memcp.S
index 5691446..ac310ae 100644
--- a/zap/source/amd64/mem/memcp.S
+++ b/zap/source/amd64/mem/memcp.S
@@ -5,90 +5,95 @@
.globl zap_memcp
zap_memcp:
- # rdi: Address of the current input element.
- # rsi: Number of remaining elements.
- # rdx: Address of the current output element.
- # rcx: Current element.
- # xmm0: Current element.
- # ymm0: Current element.
+# Address of the current input element:
+#define iaddr %rdi
+# Number of remaining bytes:
+#define rem %rsi
+# Address of the current output element:
+#define oaddr %rdx
+# Current element:
+#define val1 %cl
+#define val8 %rcx
+#define val01 %xmm0
+#define val02 %ymm0
#if defined(__AVX__)
# AVX support 256-bit moves.
# Copy 32 bytes:
-.big20cp:
+.big02cp:
# Check if there are at least 32 bytes remaining:
- cmpq $0x20,%rsi
- jl .big10cp # If not, skip to the 10 byte copying.
+ cmpq $0x20,rem # if (rem < 0x20)
+ jl .big01cp # goto big01cp // If not, skip to the 16-byte (0x10) copying.
# Copy:
- vmovups (%rdi),%ymm0 # Move into a register.
- vmovups %ymm0,(%rdx) # And then back into memory.
+ vmovups (iaddr),val02 # val02 = *iaddr
+ vmovups val02,(oaddr) # *oaddr = val02
# Continue:
- addq $0x20,%rdi
- addq $0x20,%rdx
- subq $0x20,%rsi
- jmp .big20cp
+ addq $0x20,iaddr # iaddr += 0x20
+ addq $0x20,oaddr # oaddr += 0x20
+ subq $0x20,rem # rem -= 0x20
+ jmp .big02cp # goto big02cp
#endif
- # AMD64 requires SSE(2).
+ # AMD64 always provides SSE2, so no feature test is needed here.
# Copy 16 bytes:
-.big10cp:
+.big01cp:
# Check if there are at least 16 bytes remaining:
- cmpq $0x10,%rsi
- jl .wrdcp
+ cmpq $0x10,rem # if (rem < 0x10)
+ jl .wrdcp # goto wrdcp
# Copy:
- movdqu (%rdi),%xmm0
- movdqu %xmm0,(%rdx)
+ movdqu (iaddr),val01 # val01 = *iaddr
+ movdqu val01,(oaddr) # *oaddr = val01
# Continue:
- addq $0x10,%rdi
- addq $0x10,%rdx
- subq $0x10,%rsi
- jmp .big10cp
+ addq $0x10,iaddr # iaddr += 0x10
+ addq $0x10,oaddr # oaddr += 0x10
+ subq $0x10,rem # rem -= 0x10
+ jmp .big01cp # goto big01cp
# Copy one word (8 bytes):
.wrdcp:
# Check if there are at least 8 bytes remaining:
- cmpq $0x8,%rsi
- jl .bytecp
+ cmpq $0x8,rem # if (rem < 8)
+ jl .bytecp # goto bytecp
# Copy:
- movq (%rdi),%rcx
- movq %rcx,(%rdx)
+ movq (iaddr),val8 # val8 = *iaddr
+ movq val8,(oaddr) # *oaddr = val8
# Continue:
- addq $0x8,%rdi
- addq $0x8,%rdx
- subq $0x8,%rsi
- jmp .wrdcp
+ addq $0x8,iaddr # iaddr += 0x8
+ addq $0x8,oaddr # oaddr += 0x8
+ subq $0x8,rem # rem -= 0x8
+ jmp .wrdcp # goto wrdcp
# Copy one byte:
.bytecp:
# Check if we have any bytes remaining:
- testq %rsi,%rsi
- jz .done
+ testq rem,rem # if (rem == 0x0)
+ jz .done # goto done
# Copy:
- movb (%rdi),%cl
- movb %cl,(%rdx)
+ movb (iaddr),val1 # val1 = *iaddr
+ movb val1,(oaddr) # *oaddr = val1
# Continue:
- incq %rdi
- incq %rdx
- decq %rsi
- jmp .bytecp
+ incq iaddr # ++iaddr
+ incq oaddr # ++oaddr
+ decq rem # --rem
+ jmp .bytecp # goto bytecp
- # Finish:
+ # Return:
.done:
- ret
+ ret # return
\ No newline at end of file