diff options
Diffstat (limited to 'zap/source/amd64/mem/memcp.S')
-rw-r--r-- | zap/source/amd64/mem/memcp.S | 91 |
1 files changed, 48 insertions, 43 deletions
# Source file: zap/source/amd64/mem/memcp.S
#
# zap_memcp: copy a run of bytes from an input buffer to an output buffer.
#
# Syntax: GNU as, AT&T operand order, run through the C preprocessor.
#         Comment lines must not begin with a preprocessor directive name,
#         and comments should avoid apostrophes, because the preprocessor
#         still scans them.
# In:     %rdi = address of the first input byte
#         %rsi = number of bytes to copy
#         %rdx = address of the first output byte
# Out:    no return register is written.
# Clobb:  %rdi, %rsi, %rdx, %rcx, %xmm0 and the flags. AVX builds also use
#         %ymm0 and zero the upper halves of all YMM registers.
# Stack:  unused.
#
# The copy runs front to back in chunks of 32 bytes (AVX builds only),
# then 16 bytes, then 8 bytes, then single bytes.
#
# NOTE(review): the byte count is compared as an unsigned value; presumably
# callers pass a size -- confirm.
# NOTE(review): the copy always runs forward, so it is presumably meant for
# buffers that do not overlap -- confirm with callers.

# Address of the current input element:
#define iaddr %rdi
# Number of remaining bytes:
#define rem   %rsi
# Address of the current output element:
#define oaddr %rdx
# Current element (1, 8, 16 and 32 byte views):
#define val1  %cl
#define val8  %rcx
#define val01 %xmm0
#define val02 %ymm0

.globl zap_memcp
zap_memcp:

#if defined(__AVX__)
	# AVX provides 256-bit moves.

	# Copy 32 bytes at a time:
.big02cp:
	# Check that at least 32 bytes remain (unsigned compare):
	cmpq	$0x20,rem		# if (rem < 0x20)
	jb	.big02end		#     goto big02end

	# Copy:
	vmovups	(iaddr),val02		# val02 = *iaddr
	vmovups	val02,(oaddr)		# *oaddr = val02

	# Continue:
	addq	$0x20,iaddr		# iaddr += 0x20
	addq	$0x20,oaddr		# oaddr += 0x20
	subq	$0x20,rem		# rem   -= 0x20
	jmp	.big02cp		# goto big02cp

.big02end:
	# The moves below (and possibly the caller) use legacy SSE encodings.
	# Clearing the upper YMM halves first avoids the AVX/SSE transition
	# penalty. The upper halves are scratch, so nothing live is lost.
	vzeroupper
#endif

	# SSE2 is part of the AMD64 baseline, so no feature test is needed.

	# Copy 16 bytes at a time:
.big01cp:
	# Check that at least 16 bytes remain (unsigned compare):
	cmpq	$0x10,rem		# if (rem < 0x10)
	jb	.wrdcp			#     goto wrdcp

	# Copy:
	movdqu	(iaddr),val01		# val01 = *iaddr
	movdqu	val01,(oaddr)		# *oaddr = val01

	# Continue:
	addq	$0x10,iaddr		# iaddr += 0x10
	addq	$0x10,oaddr		# oaddr += 0x10
	subq	$0x10,rem		# rem   -= 0x10
	jmp	.big01cp		# goto big01cp

	# Copy one word (8 bytes) at a time:
.wrdcp:
	# Check that at least 8 bytes remain (unsigned compare):
	cmpq	$0x8,rem		# if (rem < 0x8)
	jb	.bytecp			#     goto bytecp

	# Copy:
	movq	(iaddr),val8		# val8 = *iaddr
	movq	val8,(oaddr)		# *oaddr = val8

	# Continue:
	addq	$0x8,iaddr		# iaddr += 0x8
	addq	$0x8,oaddr		# oaddr += 0x8
	subq	$0x8,rem		# rem   -= 0x8
	jmp	.wrdcp			# goto wrdcp

	# Copy one byte at a time:
.bytecp:
	# Check whether any bytes remain:
	testq	rem,rem			# if (rem == 0x0)
	jz	.done			#     goto done

	# Copy:
	movb	(iaddr),val1		# val1 = *iaddr
	movb	val1,(oaddr)		# *oaddr = val1

	# Continue:
	incq	iaddr			# ++iaddr
	incq	oaddr			# ++oaddr
	decq	rem			# --rem
	jmp	.bytecp			# goto bytecp

	# Return:
.done:
	ret				# return
\ No newline at end of file |