diff options
Diffstat (limited to 'zap/source/amd64/mem')
-rw-r--r-- | zap/source/amd64/mem/cp.s | 76 | ||||
-rw-r--r-- | zap/source/amd64/mem/fill.s | 52 |
2 files changed, 56 insertions, 72 deletions
# =====================================================================
# zap/source/amd64/mem/cp.s
#
# Syntax: GNU as, Intel operand order (dst, src), no register prefix.
# NOTE(review): the lines above the first hunk are not visible here;
# this listing assumes they already select `.intel_syntax noprefix`.
# =====================================================================

        .globl  zap_cp

# ---------------------------------------------------------------------
# zap_cp - copy num bytes from src to dest.
#
# In:    rdi = dest   (first byte to write)
#        rsi = src    (first byte to read)
#        rdx = num    (byte count, compared as an unsigned quantity)
# Out:   nothing is written to rax.  On return rdi and rsi point one
#        past the last byte copied and rdx is zero.
# Clobb: rcx, rdx, rsi, rdi, ymm0, flags
# Notes: * Requires AVX (256-bit vmovups is used unconditionally).
#        * Unaligned loads/stores are used throughout, so neither
#          pointer needs any particular alignment.
#        * Bytes are moved from low to high addresses and no overlap
#          check is made.
#        * The register assignment matches the System V AMD64 argument
#          order - TODO confirm against the C prototype, which is not
#          part of this listing.
# ---------------------------------------------------------------------
zap_cp:
.Lcp_ymm:                               # while (num >= 0x20u) {
        cmp     rdx,0x20
        jb      short .Lcp_ymm_done     #     (unsigned: num is a size)

        vmovups ymm0,[rsi]              #     val02 = *src;
        vmovups [rdi],ymm0              #     *dest = val02;

        add     rsi,0x20                #     src  += 0x20u;
        add     rdi,0x20                #     dest += 0x20u;
        sub     rdx,0x20                #     num  -= 0x20u;
        jmp     short .Lcp_ymm          # }

.Lcp_ymm_done:
        vzeroupper                      # leave the upper YMM halves clean
                                        # for any SSE code in the caller

        # Invariant: num < 0x20 here, so the 16-byte and 8-byte steps
        # below can each run at most once.

        cmp     rdx,0x10                # if (num >= 0x10u) {
        jb      short .Lcp_qword

        vmovdqu xmm0,[rsi]              #     val01 = *src;
        vmovdqu [rdi],xmm0              #     *dest = val01;

        add     rsi,0x10                #     src  += 0x10u;
        add     rdi,0x10                #     dest += 0x10u;
        sub     rdx,0x10                #     num  -= 0x10u;
                                        # }

.Lcp_qword:
        cmp     rdx,0x8                 # if (num >= 0x8u) {
        jb      short .Lcp_bytes

        mov     rcx,[rsi]               #     val8 = *src;
        mov     [rdi],rcx               #     *dest = val8;

        add     rsi,0x8                 #     src  += 0x8u;
        add     rdi,0x8                 #     dest += 0x8u;
        sub     rdx,0x8                 #     num  -= 0x8u;
                                        # }

.Lcp_bytes:                             # while (num != 0x0u) {
        test    rdx,rdx
        jz      short .Lcp_done

        movzx   ecx,byte ptr [rsi]      #     val1 = *src;
        mov     [rdi],cl                #     *dest = val1;  (store goes
                                        #     to dest, not back to src)

        inc     rsi                     #     ++src;
        inc     rdi                     #     ++dest;
        dec     rdx                     #     --num;
        jmp     short .Lcp_bytes        # }

.Lcp_done:
        ret                             # return;

# =====================================================================
# zap/source/amd64/mem/fill.s
# =====================================================================

        .globl  zap_fill

# ---------------------------------------------------------------------
# zap_fill - set num bytes starting at dest to the byte value val.
#
# In:    rdi = dest   (first byte to write)
#        sil = val    (fill byte; upper bits of rsi are ignored)
#        rdx = num    (byte count, compared as an unsigned quantity)
# Out:   rax is left holding val replicated into all eight bytes; this
#        is a by-product of the implementation - whether callers treat
#        it as a return value is not visible here.  On return rdi
#        points one past the last byte written and rdx is zero.
# Clobb: rax, rcx, rdx, rdi, flags
# Notes: Register assignment matches the System V AMD64 argument order
#        - TODO confirm against the C prototype.
# ---------------------------------------------------------------------
zap_fill:
        # Broadcast the byte into every lane of a 64-bit word:
        # 0x0101010101010101 * 0xNN == 0xNNNNNNNNNNNNNNNN.
        movzx   eax,sil                 # val8 = (zap_i04)val;
        mov     rcx,0x0101010101010101
        imul    rax,rcx                 # val8 *= 0x0101010101010101u;

.Lfill_qword:                           # while (num >= 0x8u) {
        cmp     rdx,0x8
        jb      short .Lfill_bytes      #     (unsigned: num is a size)

        mov     [rdi],rax               #     *dest = val8;

        add     rdi,0x8                 #     dest += 0x8u;
        sub     rdx,0x8                 #     num  -= 0x8u;
        jmp     short .Lfill_qword      # }

.Lfill_bytes:                           # while (num != 0x0u) {
        test    rdx,rdx
        jz      short .Lfill_done

        mov     [rdi],al                #     *dest = val1;  (al is the
                                        #     low byte of val8; the
                                        #     store goes through dest)

        inc     rdi                     #     ++dest;
        dec     rdx                     #     --num;
        jmp     short .Lfill_bytes      # }

.Lfill_done:
        ret                             # return;