summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- CHANGELOG.txt 4
-rw-r--r-- zap/source/amd64/mem/cp.s 18
-rw-r--r-- zap/source/amd64/mem/fill.s 4
3 files changed, 15 insertions, 11 deletions
diff --git a/CHANGELOG.txt b/CHANGELOG.txt
index eae9020..098834a 100644
--- a/CHANGELOG.txt
+++ b/CHANGELOG.txt
@@ -1,3 +1,7 @@
+# 16.6
+
+* cp: Use vmovdqu instead of vmovups (amd64);
+
# 16.5
* Optimise fill (amd64);
diff --git a/zap/source/amd64/mem/cp.s b/zap/source/amd64/mem/cp.s
index ef333e0..2860acf 100644
--- a/zap/source/amd64/mem/cp.s
+++ b/zap/source/amd64/mem/cp.s
@@ -12,12 +12,12 @@ zap_cp:
# unsigned int128_t val01;
# unsigned int256_t val02;
-.big02cp: # // We assume AVX.
+.big02cp: # big02cp:; // We assume AVX.
cmp rdx,0x20
jl short .big01cp # if (num < 0x20u) goto big01cp;
- vmovups ymm0,[rsi] # val02 = *src;
- vmovups [rdi],ymm0 # *dest = val02;
+ vmovdqu ymm0,[rsi] # val02 = *(unsigned int256_t *)src;
+ vmovdqu [rdi],ymm0 # *(unsigned int256_t *)dest = val02;
add rsi,0x20 # dest += 0x20u;
add rdi,0x20 # src += 0x20u;
@@ -28,8 +28,8 @@ zap_cp:
cmp rdx,0x10
jl short .wrdcp # if (num < 0x10u) goto wrdcp;
- movdqu xmm0,[rsi] # val01 = *src;
- movdqu [rdi],xmm0 # *dest = val01;
+ movdqu xmm0,[rsi] # val01 = *(unsigned int128_t *)src;
+ movdqu [rdi],xmm0 # *(unsigned int128_t *)dest = val01;
add rsi,0x10 # dest += 0x10u;
add rdi,0x10 # src += 0x10u;
@@ -40,8 +40,8 @@ zap_cp:
cmp rdx,0x8
jl short .bytecp # if (num < 0x8u) goto bytecp;
- mov rcx,[rsi] # val8 = *src;
- mov [rdi],rcx # *dest = val8;
+ mov rcx,[rsi] # val8 = *(zap_i04 *)src;
+ mov [rdi],rcx # *(zap_i04 *)dest = val8;
add rdi,0x8 # dest += 0x8u;
add rsi,0x8 # src += 0x8u;
@@ -52,8 +52,8 @@ zap_cp:
test rdx,rdx # if (rem == 0x0)
jz short .done # goto done
- mov cl,[rsi] # val1 = *src;
- mov [rsi],cl # *dest = val1;
+ mov cl,[rsi] # val1 = *(zap_i8 *)src;
+ mov [rsi],cl # *(zap_i8 *)dest = val1;
inc rdi # ++dest;
inc rsi # ++src;
diff --git a/zap/source/amd64/mem/fill.s b/zap/source/amd64/mem/fill.s
index f4022d0..7edd36b 100644
--- a/zap/source/amd64/mem/fill.s
+++ b/zap/source/amd64/mem/fill.s
@@ -15,7 +15,7 @@ zap_fill:
cmp rdx,0x8
jl short .bytefill # if (num < 0x8u) goto bytefill;
- mov [rdi],rax # *dest = val8;
+ mov [rdi],rax # *(zap_i04 *)dest = val8;
add rdi,0x8 # dest += 0x8u;
sub rdx,0x8 # num -= 0x8u;
@@ -25,7 +25,7 @@ zap_fill:
test rdx,rdx # if (rem == 0x0)
jz short .done # goto done
- mov [rsi],al # *dest = val1;
+ mov [rsi],al # *(zap_i8 *)dest = val1;
inc rdi # ++dest;
dec rdx # --rem;