# Source path: root/zap/source/amd64/mem/memcp.S
# Blob: ac310ae3563275eafd2cd66c868ca1107a3ef4c1 (plain)


                                                                                                              



                










                                        




                                    
         

                                                         

                                                                              

               

                                              

                   



                                        


      
                                                             

                        
         

                                                         

                                      

               

                                              

                   



                                        




                                                        

                                    

               

                                         

                   



                                      




                                               

                                       

               

                                         

                   



                                 
 
                 

      
                    
       
# Copyright 2022 Gabriel Jensen.
# This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
# If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/.

.globl zap_memcp

zap_memcp:
# Copies rem bytes from iaddr to oaddr, lowest address first, always using
# the widest move that still fits in the bytes that remain.
#
# Inputs (the registers named by the macros below):
#   rdi: address of the first input byte
#   rsi: number of bytes to copy (treated as an unsigned count)
#   rdx: address of the first output byte
# NOTE(review): these are the first three integer argument registers of the
# System V AMD64 ABI, so the C side is presumably zap_memcp(in,num,out) -
# confirm against the C declaration.
#
# On return: rdi and rdx point one past the last byte copied and rsi is zero.
# Clobbers:  rcx, xmm0 and the flags; when built with AVX, ymm0 is used and
#            the upper halves of all ymm registers are zeroed (vzeroupper).
# Nothing is written to rax.

# Address of the current input element:
#define iaddr %rdi
# Number of remaining bytes:
#define rem   %rsi
# Address of the current output element:
#define oaddr %rdx
# Current element:
#define val1  %cl
#define val8  %rcx
#define val01 %xmm0
#define val02 %ymm0

#if defined(__AVX__)
	# AVX supports 256-bit moves.

	# Copy 32 bytes:
.Lzap_memcp_big02cp:

	# Check that at least 32 bytes remain (unsigned, rem is a byte count):
	cmpq $0x20,rem           # if (rem < 0x20)
	jb .Lzap_memcp_big02done # goto big02done  // Otherwise fall back to the 16 byte copying.

	# Copy:
	vmovups (iaddr),val02 # val02 = *iaddr
	vmovups val02,(oaddr) # *oaddr = val02

	# Continue:
	addq $0x20,iaddr        # iaddr += 0x20
	addq $0x20,oaddr        # oaddr += 0x20
	subq $0x20,rem          # rem -= 0x20
	jmp .Lzap_memcp_big02cp # goto big02cp

.Lzap_memcp_big02done:

	# Clear the upper ymm halves so that neither the legacy SSE moves below
	# nor the caller pay the AVX/SSE transition penalty:
	vzeroupper

#endif

	# AMD64 requires SSE(2), so there is no need to test for it.

	# Copy 16 bytes:
.Lzap_memcp_big01cp:

	# Check that at least 16 bytes remain:
	cmpq $0x10,rem      # if (rem < 0x10)
	jb .Lzap_memcp_wrdcp # goto wrdcp

	# Copy (unaligned moves, no alignment is assumed for either buffer):
	movdqu (iaddr),val01 # val01 = *iaddr
	movdqu val01,(oaddr) # *oaddr = val01

	# Continue:
	addq $0x10,iaddr        # iaddr += 0x10
	addq $0x10,oaddr        # oaddr += 0x10
	subq $0x10,rem          # rem -= 0x10
	jmp .Lzap_memcp_big01cp # goto big01cp

	# Copy one word (8 bytes):
.Lzap_memcp_wrdcp:

	# Check that at least 8 bytes remain:
	cmpq $0x8,rem         # if (rem < 0x8)
	jb .Lzap_memcp_bytecp # goto bytecp

	# Copy:
	movq (iaddr),val8 # val8 = *iaddr
	movq val8,(oaddr) # *oaddr = val8

	# Continue:
	addq $0x8,iaddr       # iaddr += 0x8
	addq $0x8,oaddr       # oaddr += 0x8
	subq $0x8,rem         # rem -= 0x8
	jmp .Lzap_memcp_wrdcp # goto wrdcp

	# Copy one byte:
.Lzap_memcp_bytecp:

	# Check whether any bytes remain:
	testq rem,rem       # if (rem == 0x0)
	jz .Lzap_memcp_done # goto done

	# Copy:
	movb (iaddr),val1 # val1 = *iaddr
	movb val1,(oaddr) # *oaddr = val1

	# Continue:
	incq iaddr              # ++iaddr
	incq oaddr              # ++oaddr
	decq rem                # --rem
	jmp .Lzap_memcp_bytecp  # goto bytecp

	# Return:
.Lzap_memcp_done:

	ret # return