summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- CHANGELOG.txt 5
-rw-r--r-- test.c 61
-rw-r--r-- zap/GNUmakefile 7
-rw-r--r-- zap/source/amd64/mem/cp.s 76
-rw-r--r-- zap/source/amd64/mem/fill.s 52
5 files changed, 68 insertions, 133 deletions
diff --git a/CHANGELOG.txt b/CHANGELOG.txt
index f2acff8..eae9020 100644
--- a/CHANGELOG.txt
+++ b/CHANGELOG.txt
@@ -1,3 +1,8 @@
+# 16.5
+
+* Optimise fill (amd64);
+* Optimise cp (amd64);
+
# 16.4
* Update assembly comments;
diff --git a/test.c b/test.c
index 0d78c99..71a0942 100644
--- a/test.c
+++ b/test.c
@@ -33,60 +33,13 @@ int main(void) {
/* mem */
{
zap_i8 const src[0x3Fu] = {
-0x02u,
-0x03u,
-0x05u,
-0x07u,
-0x0Bu,
-0x0Du,
-0x11u,
-0x13u,
-0x17u,
-0x1Du,
-0x1Fu,
-0x25u,
-0x29u,
-0x2Bu,
-0x2Fu,
-0x35u,
-0x3Bu,
-0x3Du,
-0x43u,
-0x47u,
-0x49u,
-0x4Fu,
-0x53u,
-0x59u,
-0x61u,
-0x65u,
-0x67u,
-0x6Bu,
-0x6Du,
-0x71u,
-0x7Fu,
-0x83u,
-0x89u,
-0x8Bu,
-0x95u,
-0x97u,
-0x9Du,
-0xA3u,
-0xA7u,
-0xADu,
-0xB3u,
-0xB5u,
-0xBFu,
-0xC1u,
-0xC5u,
-0xC7u,
-0xD3u,
-0xDFu,
-0xE3u,
-0xE5u,
-0xE9u,
-0xEFu,
-0xF1u,
-0xFBu,
+ 0x02u,0x03u,0x05u,0x07u,0x0Bu,0x0Du,0x11u,0x13u,
+ 0x17u,0x1Du,0x1Fu,0x25u,0x29u,0x2Bu,0x2Fu,0x35u,
+ 0x3Bu,0x3Du,0x43u,0x47u,0x49u,0x4Fu,0x53u,0x59u,
+ 0x61u,0x65u,0x67u,0x6Bu,0x6Du,0x71u,0x7Fu,0x83u,
+ 0x89u,0x8Bu,0x95u,0x97u,0x9Du,0xA3u,0xA7u,0xADu,
+ 0xB3u,0xB5u,0xBFu,0xC1u,0xC5u,0xC7u,0xD3u,0xDFu,
+ 0xE3u,0xE5u,0xE9u,0xEFu,0xF1u,0xFBu,
};
zap_i8 dest[sizeof (src)];
zap_cp(dest,src,sizeof (src));
diff --git a/zap/GNUmakefile b/zap/GNUmakefile
index 473fb0c..ade3245 100644
--- a/zap/GNUmakefile
+++ b/zap/GNUmakefile
@@ -68,18 +68,11 @@ CFLAGS := \
-Wpadded \
-Wpedantic \
-ffreestanding \
- -fno-strict-aliasing \
-fshort-enums \
-nostdlib \
-pipe \
-std=c99
-ifeq "$(trapdiv0)" "true"
-CFLAGS := \
- $(CFLAGS) \
- -Dzap_priv_trapdiv0
-endif
-
.PHONY: clean install purge
$(LIB): $(OBJS)
diff --git a/zap/source/amd64/mem/cp.s b/zap/source/amd64/mem/cp.s
index 57e716f..ef333e0 100644
--- a/zap/source/amd64/mem/cp.s
+++ b/zap/source/amd64/mem/cp.s
@@ -7,58 +7,58 @@
.globl zap_cp
zap_cp:
- # zap_i8 val1;
- # zap_i04 val8;
- # unsigned int128_t val01;
- # unsigned int256_t val02;
+ # zap_i8 val1; // rdi = dest, rsi = src, rdx = num (bytes left)
+ # zap_i04 val8;
+ # unsigned int128_t val01;
+ # unsigned int256_t val02;
-.big02cp:
+.big02cp: # big02cp:; // We assume AVX (ymm0 is used below).
cmp rdx,0x20
- jl .big01cp # if (num < 0x20u) goto big01cp;
+ jb short .big01cp # if (num < 0x20u) goto big01cp; // num is unsigned
- vmovups ymm0,[rsi] # val02 = *src;
- vmovups [rdi],ymm0 # *dest = val02;
+ vmovups ymm0,[rsi] # val02 = *src;
+ vmovups [rdi],ymm0 # *dest = val02;
- add rsi,0x20 # dest += 0x20u;
- add rdi,0x20 # src += 0x20u;
- sub rdx,0x20 # num -= 0x20u;
- jmp .big02cp # goto big02cp;
+ add rsi,0x20 # src += 0x20u;
+ add rdi,0x20 # dest += 0x20u;
+ sub rdx,0x20 # num -= 0x20u;
+ jmp short .big02cp # goto big02cp;
-.big01cp: # big01cp:;
+.big01cp: # big01cp:;
cmp rdx,0x10
- jl .wrdcp # if (num < 0x10u) goto wrdcp;
+ jb short .wrdcp # if (num < 0x10u) goto wrdcp; // num is unsigned
- movdqu xmm0,[rsi] # val01 = *src;
- movdqu [rdi],xmm0 # *dest = val01;
+ movdqu xmm0,[rsi] # val01 = *src;
+ movdqu [rdi],xmm0 # *dest = val01;
- add rsi,0x10 # dest += 0x10u;
- add rdi,0x10 # src += 0x10u;
- sub rdx,0x10 # num -= 0x10u;
- jmp .big01cp # goto big01cp;
+ add rsi,0x10 # src += 0x10u;
+ add rdi,0x10 # dest += 0x10u;
+ sub rdx,0x10 # num -= 0x10u;
+ jmp short .big01cp # goto big01cp;
-.wrdcp: # wrdcp:;
+.wrdcp: # wrdcp:;
cmp rdx,0x8
- jl .bytecp # if (num < 0x8u) goto bytecp;
+ jb short .bytecp # if (num < 0x8u) goto bytecp; // num is unsigned
- mov rcx,[rsi] # val8 = *src;
- mov [rdi],rcx # *dest = val8;
+ mov rcx,[rsi] # val8 = *src;
+ mov [rdi],rcx # *dest = val8;
- add rdi,0x8 # dest += 0x8u;
- add rsi,0x8 # src += 0x8u;
- sub rdx,0x8 # num -= 0x8u;
- jmp .wrdcp # goto wrdcp
+ add rdi,0x8 # dest += 0x8u;
+ add rsi,0x8 # src += 0x8u;
+ sub rdx,0x8 # num -= 0x8u;
+ jmp short .wrdcp # goto wrdcp;
-.bytecp: # bytecp:;
- test rdx,rdx # if (rem == 0x0)
- jz .done # goto done
+.bytecp: # bytecp:;
+ test rdx,rdx # if (num == 0x0u)
+ jz short .done # goto done;
- mov cl,[rsi] # val1 = *src;
- mov [rsi],cl # *dest = val1;
+ mov cl,[rsi] # val1 = *src;
+ mov [rdi],cl # *dest = val1; // store to dest (rdi), never back to src
- inc rdi # ++dest;
- inc rsi # ++src;
- dec rdx # --rem;
- jmp .bytecp # goto bytecp;
+ inc rdi # ++dest;
+ inc rsi # ++src;
+ dec rdx # --num;
+ jmp short .bytecp # goto bytecp;
.done:
- ret # return
+ ret # return;
diff --git a/zap/source/amd64/mem/fill.s b/zap/source/amd64/mem/fill.s
index 0abd2be..f4022d0 100644
--- a/zap/source/amd64/mem/fill.s
+++ b/zap/source/amd64/mem/fill.s
@@ -7,45 +7,29 @@
.globl zap_fill
zap_fill:
- # zap_i8 val1;
- # zap_i04 val8;
-
- movzx rax,sil # val8 = val;
- shl rax,0x8 # val8 <<= 0x8u;
- mov al,sil # val8 |= val;
- shl rax,0x8 # val8 <<= 0x8u;
- mov al,sil # val8 |= val;
- shl rax,0x8 # val8 <<= 0x8u;
- mov al,sil # val8 |= val;
- shl rax,0x8 # val8 <<= 0x8u;
- mov al,sil # val8 |= val;
- shl rax,0x8 # val8 <<= 0x8u;
- mov al,sil # val8 |= val;
- shl rax,0x8 # val8 <<= 0x8u;
- mov al,sil # val8 |= val;
- shl rax,0x8 # val8 <<= 0x8u;
- mov al,sil # val8 |= val;
- # val1 = val8;
-
-.wrdfill: # wrdfill:;
+ movzx rsi,sil # zap_i04 extval = val; // rsi no longer holds a pointer after this
+ mov rax,0x0101010101010101 # zap_i04 val8 = 0x0101010101010101u;
+ imul rax,rsi # val8 *= extval; // replicates the byte into all eight byte lanes
+
+.wrdfill: # wrdfill:;
cmp rdx,0x8
- jl .bytefill # if (num < 0x8u) goto bytefill;
+ jb short .bytefill # if (num < 0x8u) goto bytefill; // num is unsigned
- mov [rdi],rax # *dest = val8;
+ mov [rdi],rax # *dest = val8;
- add rdi,0x8 # dest += 0x8u;
- sub rdx,0x8 # num -= 0x8u;
- jmp .wrdfill # goto wrdfill
+ add rdi,0x8 # dest += 0x8u;
+ sub rdx,0x8 # num -= 0x8u;
+ jmp short .wrdfill # goto wrdfill;
-.bytefill: # bytefill:;
- test rdx,rdx # if (rem == 0x0)
- jz .done # goto done
+.bytefill: # bytefill:;
+ test rdx,rdx # if (num == 0x0u)
+ jz short .done # goto done;
- mov [rsi],al # *dest = val1;
+ mov [rdi],al # *dest = (zap_i8)val8; // store through dest (rdi); rsi is the fill value
- inc rdi # ++dest;
- dec rdx # --rem;
- jmp .bytefill # goto bytefill;
+ inc rdi # ++dest;
+ dec rdx # --num;
+ jmp short .bytefill # goto bytefill;
.done:
- ret # return
+ ret # return;