diff options
-rw-r--r-- | CHANGELOG.txt | 5 | ||||
-rw-r--r-- | test.c | 61 | ||||
-rw-r--r-- | zap/GNUmakefile | 7 | ||||
-rw-r--r-- | zap/source/amd64/mem/cp.s | 76 | ||||
-rw-r--r-- | zap/source/amd64/mem/fill.s | 52 |
5 files changed, 68 insertions, 133 deletions
diff --git a/CHANGELOG.txt b/CHANGELOG.txt index f2acff8..eae9020 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -1,3 +1,8 @@ +# 16.5 + +* Optimise fill (amd64); +* Optimise cp (amd64); + # 16.4 * Update assembly comments; @@ -33,60 +33,13 @@ int main(void) { /* mem */ { zap_i8 const src[0x3Fu] = { -0x02u, -0x03u, -0x05u, -0x07u, -0x0Bu, -0x0Du, -0x11u, -0x13u, -0x17u, -0x1Du, -0x1Fu, -0x25u, -0x29u, -0x2Bu, -0x2Fu, -0x35u, -0x3Bu, -0x3Du, -0x43u, -0x47u, -0x49u, -0x4Fu, -0x53u, -0x59u, -0x61u, -0x65u, -0x67u, -0x6Bu, -0x6Du, -0x71u, -0x7Fu, -0x83u, -0x89u, -0x8Bu, -0x95u, -0x97u, -0x9Du, -0xA3u, -0xA7u, -0xADu, -0xB3u, -0xB5u, -0xBFu, -0xC1u, -0xC5u, -0xC7u, -0xD3u, -0xDFu, -0xE3u, -0xE5u, -0xE9u, -0xEFu, -0xF1u, -0xFBu, + 0x02u,0x03u,0x05u,0x07u,0x0Bu,0x0Du,0x11u,0x13u, + 0x17u,0x1Du,0x1Fu,0x25u,0x29u,0x2Bu,0x2Fu,0x35u, + 0x3Bu,0x3Du,0x43u,0x47u,0x49u,0x4Fu,0x53u,0x59u, + 0x61u,0x65u,0x67u,0x6Bu,0x6Du,0x71u,0x7Fu,0x83u, + 0x89u,0x8Bu,0x95u,0x97u,0x9Du,0xA3u,0xA7u,0xADu, + 0xB3u,0xB5u,0xBFu,0xC1u,0xC5u,0xC7u,0xD3u,0xDFu, + 0xE3u,0xE5u,0xE9u,0xEFu,0xF1u,0xFBu, }; zap_i8 dest[sizeof (src)]; zap_cp(dest,src,sizeof (src)); diff --git a/zap/GNUmakefile b/zap/GNUmakefile index 473fb0c..ade3245 100644 --- a/zap/GNUmakefile +++ b/zap/GNUmakefile @@ -68,18 +68,11 @@ CFLAGS := \ -Wpadded \ -Wpedantic \ -ffreestanding \ - -fno-strict-aliasing \ -fshort-enums \ -nostdlib \ -pipe \ -std=c99 -ifeq "$(trapdiv0)" "true" -CFLAGS := \ - $(CFLAGS) \ - -Dzap_priv_trapdiv0 -endif - .PHONY: clean install purge $(LIB): $(OBJS) diff --git a/zap/source/amd64/mem/cp.s b/zap/source/amd64/mem/cp.s index 57e716f..ef333e0 100644 --- a/zap/source/amd64/mem/cp.s +++ b/zap/source/amd64/mem/cp.s @@ -7,58 +7,58 @@ .globl zap_cp zap_cp: - # zap_i8 val1; - # zap_i04 val8; - # unsigned int128_t val01; - # unsigned int256_t val02; + # zap_i8 val1; + # zap_i04 val8; + # unsigned int128_t val01; + # unsigned int256_t val02; -.big02cp: +.big02cp: # // We assume AVX. cmp rdx,0x20 - jl .big01cp # if (num < 0x20u) goto big01cp; + jl short .big01cp # if (num < 0x20u) goto big01cp; - vmovups ymm0,[rsi] # val02 = *src; - vmovups [rdi],ymm0 # *dest = val02; + vmovups ymm0,[rsi] # val02 = *src; + vmovups [rdi],ymm0 # *dest = val02; - add rsi,0x20 # dest += 0x20u; - add rdi,0x20 # src += 0x20u; - sub rdx,0x20 # num -= 0x20u; - jmp .big02cp # goto big02cp; + add rsi,0x20 # dest += 0x20u; + add rdi,0x20 # src += 0x20u; + sub rdx,0x20 # num -= 0x20u; + jmp short .big02cp # goto big02cp; -.big01cp: # big01cp:; +.big01cp: # big01cp:; cmp rdx,0x10 - jl .wrdcp # if (num < 0x10u) goto wrdcp; + jl short .wrdcp # if (num < 0x10u) goto wrdcp; - movdqu xmm0,[rsi] # val01 = *src; - movdqu [rdi],xmm0 # *dest = val01; + movdqu xmm0,[rsi] # val01 = *src; + movdqu [rdi],xmm0 # *dest = val01; - add rsi,0x10 # dest += 0x10u; - add rdi,0x10 # src += 0x10u; - sub rdx,0x10 # num -= 0x10u; - jmp .big01cp # goto big01cp; + add rsi,0x10 # dest += 0x10u; + add rdi,0x10 # src += 0x10u; + sub rdx,0x10 # num -= 0x10u; + jmp short .big01cp # goto big01cp; -.wrdcp: # wrdcp:; +.wrdcp: # wrdcp:; cmp rdx,0x8 - jl .bytecp # if (num < 0x8u) goto bytecp; + jl short .bytecp # if (num < 0x8u) goto bytecp; - mov rcx,[rsi] # val8 = *src; - mov [rdi],rcx # *dest = val8; + mov rcx,[rsi] # val8 = *src; + mov [rdi],rcx # *dest = val8; - add rdi,0x8 # dest += 0x8u; - add rsi,0x8 # src += 0x8u; - sub rdx,0x8 # num -= 0x8u; - jmp .wrdcp # goto wrdcp + add rdi,0x8 # dest += 0x8u; + add rsi,0x8 # src += 0x8u; + sub rdx,0x8 # num -= 0x8u; + jmp short .wrdcp # goto wrdcp -.bytecp: # bytecp:; - test rdx,rdx # if (rem == 0x0) - jz .done # goto done +.bytecp: # bytecp:; + test rdx,rdx # if (rem == 0x0) + jz short .done # goto done - mov cl,[rsi] # val1 = *src; - mov [rsi],cl # *dest = val1; + mov cl,[rsi] # val1 = *src; + mov [rsi],cl # *dest = val1; - inc rdi # ++dest; - inc rsi # ++src; - dec rdx # --rem; - jmp .bytecp # goto bytecp; + inc rdi # ++dest; + inc rsi # ++src; + dec rdx # --rem; + jmp short .bytecp # goto bytecp; .done: - ret # return + ret # return diff --git a/zap/source/amd64/mem/fill.s b/zap/source/amd64/mem/fill.s index 0abd2be..f4022d0 100644 --- a/zap/source/amd64/mem/fill.s +++ b/zap/source/amd64/mem/fill.s @@ -7,45 +7,29 @@ .globl zap_fill zap_fill: - # zap_i8 val1; - # zap_i04 val8; - - movzx rax,sil # val8 = val; - shl rax,0x8 # val8 <<= 0x8u; - mov al,sil # val8 |= val; - shl rax,0x8 # val8 <<= 0x8u; - mov al,sil # val8 |= val; - shl rax,0x8 # val8 <<= 0x8u; - mov al,sil # val8 |= val; - shl rax,0x8 # val8 <<= 0x8u; - mov al,sil # val8 |= val; - shl rax,0x8 # val8 <<= 0x8u; - mov al,sil # val8 |= val; - shl rax,0x8 # val8 <<= 0x8u; - mov al,sil # val8 |= val; - shl rax,0x8 # val8 <<= 0x8u; - mov al,sil # val8 |= val; - # val1 = val8; - -.wrdfill: # wrdfill:; + movzx rsi,sil # zap_i04 extval = val; + mov rax,0x0101010101010101 # zap_i04 val = 0x0101010101010101u; + imul rax,rsi # val *= extval; + +.wrdfill: # wrdfill:; cmp rdx,0x8 - jl .bytefill # if (num < 0x8u) goto bytefill; + jl short .bytefill # if (num < 0x8u) goto bytefill; - mov [rdi],rax # *dest = val8; + mov [rdi],rax # *dest = val8; - add rdi,0x8 # dest += 0x8u; - sub rdx,0x8 # num -= 0x8u; - jmp .wrdfill # goto wrdfill + add rdi,0x8 # dest += 0x8u; + sub rdx,0x8 # num -= 0x8u; + jmp short .wrdfill # goto wrdfill -.bytefill: # bytefill:; - test rdx,rdx # if (rem == 0x0) - jz .done # goto done +.bytefill: # bytefill:; + test rdx,rdx # if (rem == 0x0) + jz short .done # goto done - mov [rsi],al # *dest = val1; + mov [rsi],al # *dest = val1; - inc rdi # ++dest; - dec rdx # --rem; - jmp .bytefill # goto bytefill; + inc rdi # ++dest; + dec rdx # --rem; + jmp short .bytefill # goto bytefill; .done: - ret # return + ret # return |