summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- CHANGELOG.txt 11
-rw-r--r-- README.html 20
-rw-r--r-- rgo/Makefile 23
-rw-r--r-- rgo/include/rgo.h 8
-rw-r--r-- rgo/src/fndbyte.S 28
-rw-r--r-- rgo/src/fndchr.S 30
-rw-r--r-- rgo/src/memcpy.S 38
-rw-r--r-- rgo/src/memfill.S 14
-rw-r--r-- rgo/src/strlen.S 17
-rw-r--r-- test.c 18
10 files changed, 164 insertions, 43 deletions
diff --git a/CHANGELOG.txt b/CHANGELOG.txt
index 656fdcd..7c7e495 100644
--- a/CHANGELOG.txt
+++ b/CHANGELOG.txt
@@ -1,3 +1,14 @@
+| 3
+
+- Enable compiler optimisations;
+- Optimise memfill;
+- Optimise memcpy: Use movdqu instead of movups (AMD64, i386 SSE2), add 256-bit copy (AMD64 AVX, i386 AVX);
+- Update makefile;
+- Implement fndbyte, fndchr, strlen in IA-32;
+- Fix bug in test;
+- Update readme;
+- Add new planned architectures: Motorola 68000, Power ISA, RISC-V, Sparc;
+
| 2
- Fix target purge in makefile not being labeled phony;
diff --git a/README.html b/README.html
index 6a732ac..ac68374 100644
--- a/README.html
+++ b/README.html
@@ -11,14 +11,26 @@
<p>rgo is written in GNU C and GNU assembly for the following machine architectures:</p>
<ul>
<li>
- <p>AMD64 (x86-64), including (Planned) AVX;</p>
+ <p>AMD64, including AVX;</p>
</li>
<li>
- <p>IA-32 (i386), including SSE and (Planned) AVX;</p>
- <p><i>Note: Support is currently limited to: memcpy, memeq, memfill.</i></p>
+ <p>IA-32, including SSE and AVX;</p>
+ <p><i>Note: Support is currently limited to: fndbyte, fndchr, memcpy, memeq, memfill, strlen.</i></p>
</li>
<li>
- <p><i>(Planned) Aarch64 (ARM64), including SVE;</i></p>
+ <p><i>(Planned) Aarch64, including Neon and SVE;</i></p>
+ </li>
+ <li>
+ <p><i>(Planned) Motorola 68000;</i></p>
+ </li>
+ <li>
+ <p><i>(Planned) Power ISA, including AltiVec;</i></p>
+ </li>
+ <li>
+ <p><i>(Planned) RISC-V, including Q extension;</i></p>
+ </li>
+ <li>
+ <p><i>(Planned) Sparc;</i></p>
</li>
</ul>
<br />
diff --git a/rgo/Makefile b/rgo/Makefile
index 02800e0..4118fe4 100644
--- a/rgo/Makefile
+++ b/rgo/Makefile
@@ -1,29 +1,30 @@
-SRCS_ASM = \
+SRCS = \
src/fndbyte.S \
src/fndchr.S \
src/memcpy.S \
+ src/memdup.c \
src/memeq.S \
src/memfill.S \
+ src/strdup.c \
src/streq.S \
+ src/strfill.c \
src/strcpy.S \
src/strlen.S
-SRCS_C = \
- src/memdup.c \
- src/strdup.c \
- src/strfill.c
-OBJS_ASM := $(SRCS_ASM:.S=.o)
-OBJS_C := $(SRCS_C:.c=.o)
-OBJS := $(OBJS_ASM) $(OBJS_C)
-LIB := librgo.a
+OBJS := $(SRCS:.S=.o)
+OBJS := $(OBJS:.c=.o)
+LIB := librgo.a
ASFLAGS = \
-Iinclude \
- -g
+ -g \
+ -march=native
CFLAGS = \
-Iinclude \
- -g
+ -O3 \
+ -g \
+ -march=native
.PHONY: clean purge
diff --git a/rgo/include/rgo.h b/rgo/include/rgo.h
index 276c8dc..f575436 100644
--- a/rgo/include/rgo.h
+++ b/rgo/include/rgo.h
@@ -10,15 +10,15 @@
You should have received a copy of the GNU Lesser General Public License along with rgo. If not, see <https://www.gnu.org/licenses/>.
*/
-#if !defined(__i386__) && !defined(__x86_64__)
-#error Unsupported machine architecture! Support: AMD64, IA-32.
+#if !defined(__x86_64__) && !defined(__i386__)
+#error Unsupported machine architecture! Supported: AMD64, IA-32;
#endif
#if !defined(rgo_ver)
#if defined(__ASSEMBLER__)
-#define rgo_ver $0x2
+#define rgo_ver $0x3
#else
-#define rgo_ver (0x2)
+#define rgo_ver (0x3)
#endif
#if defined(__ASSEMBLER__)
diff --git a/rgo/src/fndbyte.S b/rgo/src/fndbyte.S
index c0e4382..4d1e482 100644
--- a/rgo/src/fndbyte.S
+++ b/rgo/src/fndbyte.S
@@ -20,7 +20,33 @@ rgo_fndbyte:
size_t num
uint8_t byte
*/
-#if defined(__x86_64__)
+#if defined(__i386__)
+ /* eax: Address of the current element. */
+ movl 0x4(%esp),%eax
+ /* ecx: Address of the element after the last element. */
+ movl 0x8(%esp),%ecx
+ addl %eax,%ecx
+ /* edx: Byte value. */
+ movb 0xC(%esp),%dl
+ /* ebx: Current element. */
+ pushl %ebx
+.loop:
+ cmpl %eax,%ecx
+ je .nfnd /* We have gone through the entire array without finding the byte. */
+ movb (%eax),%bl
+ cmpb %bl,%dl
+ je .fnd /* We have found the byte. */
+ incl %eax
+ jmp .loop
+.fnd:
+ popl %ebx
+ subl 0x4(%esp),%eax
+ ret
+.nfnd:
+ popl %ebx
+ movl $0xFFFFFFFF,%eax
+ ret
+#elif defined(__x86_64__)
/* rax: Address of the current element. */
movq %rdi,%rax
/* rsi: Address of the element after the last element. */
diff --git a/rgo/src/fndchr.S b/rgo/src/fndchr.S
index 1008e52..f12f4c5 100644
--- a/rgo/src/fndchr.S
+++ b/rgo/src/fndchr.S
@@ -19,22 +19,42 @@ rgo_fndchr:
char const * str
char chr
*/
-#if defined(__x86_64__)
+#if defined(__i386__)
+ /* eax: Address of the current character. */
+ movl 0x4(%esp),%eax
+ /* ecx: Character. */
+ movb 0x8(%esp),%cl
+ /* edx: Current character. */
+.loop:
+ movb (%eax),%dl
+ cmpb %dl,%cl
+ je .fnd /* Exit loop if we have found the character. */
+ testb %dl,%dl
+ je .nfnd /* We encountered the null-terminator but not the specified character. */
+ incl %eax
+ jmp .loop
+.fnd:
+ subl 0x4(%esp),%eax
+ ret
+.nfnd:
+ movl $0xFFFFFFFF,%eax
+ ret
+#elif defined(__x86_64__)
/* rax: Address of the current character. */
movq %rdi,%rax
/* rdx: Current character. */
.loop:
movb (%rax),%dl
cmpb %dl,%sil
- je .done /* Exit loop if we have found the character. */
+ je .fnd /* Exit loop if we have found the character. */
testb %dl,%dl
- je .err /* We encountered the null-terminator but not the specified character. */
+ je .nfnd /* We encountered the null-terminator but not the specified character. */
incq %rax
jmp .loop
-.done:
+.fnd:
subq %rdi,%rax
ret
-.err:
+.nfnd:
movq $0xFFFFFFFFFFFFFFFF,%rax
ret
#endif
diff --git a/rgo/src/memcpy.S b/rgo/src/memcpy.S
index 475da57..820781d 100644
--- a/rgo/src/memcpy.S
+++ b/rgo/src/memcpy.S
@@ -30,12 +30,33 @@ rgo_memcpy:
/* ebx: Current element. */
pushl %ebx /* ebx must be restored. */
/* xmm0: Current element. */
+ /* ymm0: Current element. */
+#if defined(__AVX__)
+.big256cpy:
+ cmpl $0x20,%ecx
+#if defined(__SSE__)
+ jl .big128cpy
+#else
+ jl .wrdcpy
+#endif
+ vmovdqu (%eax),%ymm0
+ vmovdqu %ymm0,(%edx)
+ addl $0x20,%eax
+ addl $0x20,%edx
+ subl $0x20,%ecx
+ jmp .big256cpy
+#endif
#if defined(__SSE__)
.big128cpy:
cmpl $0x10,%ecx
jl .wrdcpy
+#if defined(__SSE2__)
+ movdqu (%eax),%xmm0
+ movdqu %xmm0,(%edx)
+#else
movups (%eax),%xmm0
movups %xmm0,(%edx)
+#endif
addl $0x10,%eax
addl $0x10,%edx
subl $0x10,%ecx
@@ -68,11 +89,24 @@ rgo_memcpy:
/* rdx: Address of the current output element. */
/* rcx: Current element. */
/* xmm0: Current element. */
+ /* ymm0: Current element. */
+#if defined(__AVX__)
+.big256cpy:
+ cmpq $0x20,%rsi
+ jl .big128cpy
+ vmovups (%rdi),%ymm0
+ vmovups %ymm0,(%rdx)
+ addq $0x20,%rdi
+ addq $0x20,%rdx
+ subq $0x20,%rsi
+ jmp .big256cpy
+#endif
.big128cpy:
+ ret
cmpq $0x10,%rsi
jl .wrdcpy
- movups (%rdi),%xmm0
- movups %xmm0,(%rdx)
+ movdqu (%rdi),%xmm0
+ movdqu %xmm0,(%rdx)
addq $0x10,%rdi
addq $0x10,%rdx
subq $0x10,%rsi
diff --git a/rgo/src/memfill.S b/rgo/src/memfill.S
index c22547e..7dc00c3 100644
--- a/rgo/src/memfill.S
+++ b/rgo/src/memfill.S
@@ -37,16 +37,14 @@ rgo_memfill:
.done:
ret
#elif defined(__x86_64__)
- /* rax: Address of the current element. */
- movq %rdi,%rax
- /* rax: Address of the element after the last element. */
- movq %rdi,%rcx
- addq %rsi,%rcx
+ /* rdi: Address of the current element. */
+ /* rsi: Address of the element after the last element. */
+ addq %rdi,%rsi
.loop:
- cmpq %rcx,%rax
+ cmpq %rsi,%rdi
je .done /* Exit loop if we have reached the final element. */
- movb %dl,(%rax)
- incq %rax
+ movb %dl,(%rdi)
+ incq %rdi
jmp .loop /* Continue to next element. */
.done:
ret
diff --git a/rgo/src/strlen.S b/rgo/src/strlen.S
index d7ad03e..37b5780 100644
--- a/rgo/src/strlen.S
+++ b/rgo/src/strlen.S
@@ -18,7 +18,22 @@ rgo_strlen:
/*
char const * str
*/
-#if defined(__x86_64__)
+#if defined(__i386__)
+ /* eax: Address of the current character. */
+ movl 0x4(%esp),%eax
+ /* ecx: Address of the first character. */
+ movl %eax,%ecx
+ /* edx: Current character. */
+.loop:
+ movb (%eax),%dl
+ testb %dl,%dl
+ jz .done /* Exit loop if we have reached the null-terminator. */
+ incl %eax /* Continue to the next character. */
+ jmp .loop
+.done:
+ subl %ecx,%eax
+ ret
+#elif defined(__x86_64__)
/* rax: Address of the current character. */
movq %rdi,%rax
/* rdx: Current character. */
diff --git a/test.c b/test.c
index 8892dea..385c0ec 100644
--- a/test.c
+++ b/test.c
@@ -11,7 +11,7 @@ int main(void) {
fprintf(stderr,"\n");
{
#undef arrsz
-#define arrsz ((size_t)0x7)
+#define arrsz ((size_t)0x8)
uint64_t arr0[arrsz] = {0x0};
rgo_memfill(arr0,arrsz * sizeof (uint64_t),UINT8_C(0x0));
fprintf(stderr,"arr0[0]: %" PRIX64 "\n",arr0[(size_t)0x0]);
@@ -65,7 +65,6 @@ int main(void) {
#undef arrsz
}
fprintf(stderr,"\n");
-#if defined(__x86_64__)
{
char const * str0 = "Hello there! General Kenobi?";
fprintf(stderr,"str0: \"%s\"\n",str0);
@@ -75,22 +74,26 @@ int main(void) {
}
fprintf(stderr,"\n");
{
- char const * restrict str = "Oh my science!";
- size_t const len = rgo_strlen(str);
- size_t pos0 = rgo_fndchr(str,' ');
- size_t pos1 = rgo_fndbyte(str,len,(uint8_t)' ');
+ char const * restrict str = "Oh my science!";
+ fprintf(stderr,"str: \"%s\"\n",str);
+ size_t len = rgo_strlen(str);
+ fprintf(stderr,"len: %zX\n",len);
+ size_t pos0 = rgo_fndchr(str,' ');
+ size_t pos1 = rgo_fndbyte(str,len,(uint8_t)' ');
fprintf(stderr,"pos0: %zX\n",pos0);
fprintf(stderr,"pos1: %zX\n",pos1);
assert(pos0 == (size_t)0x2);
assert(pos1 == pos0);
str += pos0 + (size_t)0x1;
+ len = rgo_strlen(str);
pos0 = rgo_fndchr(str,' ');
pos1 = rgo_fndbyte(str,len,(uint8_t)' ');
- fprintf(stderr,"pos0: zX\n",pos0);
+ fprintf(stderr,"pos0: %zX\n",pos0);
fprintf(stderr,"pos1: %zX\n",pos1);
assert(pos0 == (size_t)0x2);
assert(pos1 == pos0);
str += pos0 + (size_t)0x1;
+ len = rgo_strlen(str);
pos0 = rgo_fndchr(str,' ');
pos1 = rgo_fndbyte(str,len,(uint8_t)' ');
fprintf(stderr,"pos0: %zX\n",pos0);
@@ -99,6 +102,7 @@ int main(void) {
assert(pos1 == pos0);
}
fprintf(stderr,"\n");
+#if defined(__x86_64__)
{
char const str0[] = "What's up, my guy?";
fprintf(stderr,"str0: \"%s\"\n",str0);