diff options
-rw-r--r-- | CHANGELOG.txt | 11 | ||||
-rw-r--r-- | README.html | 20 | ||||
-rw-r--r-- | rgo/Makefile | 23 | ||||
-rw-r--r-- | rgo/include/rgo.h | 8 | ||||
-rw-r--r-- | rgo/src/fndbyte.S | 28 | ||||
-rw-r--r-- | rgo/src/fndchr.S | 30 | ||||
-rw-r--r-- | rgo/src/memcpy.S | 38 | ||||
-rw-r--r-- | rgo/src/memfill.S | 14 | ||||
-rw-r--r-- | rgo/src/strlen.S | 17 | ||||
-rw-r--r-- | test.c | 18 |
10 files changed, 164 insertions, 43 deletions
diff --git a/CHANGELOG.txt b/CHANGELOG.txt index 656fdcd..7c7e495 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -1,3 +1,14 @@ +| 3 + +- Enable compiler optimisations; +- Optimise memfill; +- Optimise memcpy: Use movdqu instead of movups (AMD64, i386 SSE2), add 256-bit copy (AMD64 AVX, i386 AVX); +- Update makefile; +- Implement fndbyte, fndchr, strlen in IA-32; +- Fix bug in test; +- Update readme; +- Add new planned architectures: Motorola 68000, Power ISA, RISC-V, Sparc; + | 2 - Fix target purge in makefile not being labeled phony; diff --git a/README.html b/README.html index 6a732ac..ac68374 100644 --- a/README.html +++ b/README.html @@ -11,14 +11,26 @@ <p>rgo is written in GNU C and GNU assembly for the following machine architectures:</p> <ul> <li> - <p>AMD64 (x86-64), including (Planned) AVX;</p> + <p>AMD64, including AVX;</p> </li> <li> - <p>IA-32 (i386), including SSE and (Planned) AVX;</p> - <p><i>Note: Support is currently limited to: memcpy, memeq, memfill.</i></p> + <p>IA-32, including SSE and AVX;</p> + <p><i>Note: Support is currently limited to: fndbyte, fndchr, memcpy, memeq, memfill, strlen.</i></p> </li> <li> - <p><i>(Planned) Aarch64 (ARM64), including SVE;</i></p> + <p><i>(Planned) Aarch64, including Neon and SVE;</i></p> + </li> + <li> + <p><i>(Planned) Motorola 68000;</i></p> + </li> + <li> + <p><i>(Planned) Power ISA, including AltiVec;</i></p> + </li> + <li> + <p><i>(Planned) RISC-V, including Q extension;</i></p> + </li> + <li> + <p><i>(Planned) Sparc;</i></p> </li> </ul> <br /> diff --git a/rgo/Makefile b/rgo/Makefile index 02800e0..4118fe4 100644 --- a/rgo/Makefile +++ b/rgo/Makefile @@ -1,29 +1,30 @@ -SRCS_ASM = \ +SRCS = \ src/fndbyte.S \ src/fndchr.S \ src/memcpy.S \ + src/memdup.c \ src/memeq.S \ src/memfill.S \ + src/strdup.c \ src/streq.S \ + src/strfill.c \ src/strcpy.S \ src/strlen.S -SRCS_C = \ - src/memdup.c \ - src/strdup.c \ - src/strfill.c -OBJS_ASM := $(SRCS_ASM:.S=.o) -OBJS_C := $(SRCS_C:.c=.o) -OBJS := $(OBJS_ASM) 
$(OBJS_C) -LIB := librgo.a +OBJS := $(SRCS:.S=.o) +OBJS := $(OBJS:.c=.o) +LIB := librgo.a ASFLAGS = \ -Iinclude \ - -g + -g \ + -march=native CFLAGS = \ -Iinclude \ - -g + -O3 \ + -g \ + -march=native .PHONY: clean purge diff --git a/rgo/include/rgo.h b/rgo/include/rgo.h index 276c8dc..f575436 100644 --- a/rgo/include/rgo.h +++ b/rgo/include/rgo.h @@ -10,15 +10,15 @@ You should have received a copy of the GNU Lesser General Public License along with rgo. If not, see <https://www.gnu.org/licenses/>. */ -#if !defined(__i386__) && !defined(__x86_64__) -#error Unsupported machine architecture! Support: AMD64, IA-32. +#if !defined(__x86_64__) && !defined(__i386__) +#error Unsupported machine architecture! Supported: AMD64, IA-32; #endif #if !defined(rgo_ver) #if defined(__ASSEMBLER__) -#define rgo_ver $0x2 +#define rgo_ver $0x3 #else -#define rgo_ver (0x2) +#define rgo_ver (0x3) #endif #if defined(__ASSEMBLER__) diff --git a/rgo/src/fndbyte.S b/rgo/src/fndbyte.S index c0e4382..4d1e482 100644 --- a/rgo/src/fndbyte.S +++ b/rgo/src/fndbyte.S @@ -20,7 +20,33 @@ rgo_fndbyte: size_t num uint8_t byte */ -#if defined(__x86_64__) +#if defined(__i386__) + /* eax: Address of the current element. */ + movl 0x4(%esp),%eax + /* ecx: Address of the element after the last element. */ + movl 0x8(%esp),%ecx + addl %eax,%ecx + /* edx: Byte value. */ + movb 0xC(%esp),%dl + /* ebx: Current element. */ + pushl %ebx +.loop: + cmpl %eax,%ecx + je .nfnd /* We have went through the entire array without finding the byte. */ + movb (%eax),%bl + cmpb %bl,%dl + je .fnd /* We have found the byte. */ + incl %eax + jmp .loop +.fnd: + popl %ebx + subl 0x4(%esp),%eax + ret +.nfnd: + popl %ebx + movl $0xFFFFFFFF,%eax + ret +#elif defined(__x86_64__) /* rax: Address of the current element. */ movq %rdi,%rax /* rsi: Address of the element after the last element. 
*/ diff --git a/rgo/src/fndchr.S b/rgo/src/fndchr.S index 1008e52..f12f4c5 100644 --- a/rgo/src/fndchr.S +++ b/rgo/src/fndchr.S @@ -19,22 +19,42 @@ rgo_fndchr: char const * str char chr */ -#if defined(__x86_64__) +#if defined(__i386__) + /* eax: Address of the current character. */ + movl 0x4(%esp),%eax + /* ecx: Character. */ + movb 0x8(%esp),%cl + /* edx: Current character. */ +.loop: + movb (%eax),%dl + cmpb %dl,%cl + je .fnd /* Exit loop if we have found the character. */ + testb %dl,%dl + je .nfnd /* We encountered the null-terminator but not the specified character. */ + incl %eax + jmp .loop +.fnd: + subl 0x4(%esp),%eax + ret +.nfnd: + movl $0xFFFFFFFF,%eax + ret +#elif defined(__x86_64__) /* rax: Address of the current character. */ movq %rdi,%rax /* rdx: Current character. */ .loop: movb (%rax),%dl cmpb %dl,%sil - je .done /* Exit loop if we have found the character. */ + je .fnd /* Exit loop if we have found the character. */ testb %dl,%dl - je .err /* We encountered the null-terminator but not the specified character. */ + je .nfnd /* We encountered the null-terminator but not the specified character. */ incq %rax jmp .loop -.done: +.fnd: subq %rdi,%rax ret -.err: +.nfnd: movq $0xFFFFFFFFFFFFFFFF,%rax ret #endif diff --git a/rgo/src/memcpy.S b/rgo/src/memcpy.S index 475da57..820781d 100644 --- a/rgo/src/memcpy.S +++ b/rgo/src/memcpy.S @@ -30,12 +30,33 @@ rgo_memcpy: /* ebx: Current element. */ pushl %ebx /* ebx must be restored. */ /* xmm0: Current element. */ + /* ymm0: Current element. 
*/ +#if defined(__AVX__) +.big256cpy: + cmpl $0x20,%ecx +#if defined(__SSE__) + jl .big128cpy +#else + jl .wrdcpy +#endif + vmovdqu (%eax),%ymm0 + vmovdqu %ymm0,(%edx) + addl $0x20,%eax + addl $0x20,%edx + subl $0x20,%ecx + jmp .big256cpy +#endif #if defined(__SSE__) .big128cpy: cmpl $0x10,%ecx jl .wrdcpy +#if defined(__SSE2__) + movdqu (%eax),%xmm0 + movdqu %xmm0,(%edx) +#else movups (%eax),%xmm0 movups %xmm0,(%edx) +#endif addl $0x10,%eax addl $0x10,%edx subl $0x10,%ecx @@ -68,11 +89,24 @@ rgo_memcpy: /* rdx: Address of the current output element. */ /* rcx: Current element. */ /* xmm0: Current element. */ + /* ymm0: Current element. */ +#if defined(__AVX__) +.big256cpy: + cmpq $0x20,%rsi + jl .big128cpy + vmovups (%rdi),%ymm0 + vmovups %ymm0,(%rdx) + addq $0x20,%rdi + addq $0x20,%rdx + subq $0x20,%rsi + jmp .big256cpy +#endif .big128cpy: + ret cmpq $0x10,%rsi jl .wrdcpy - movups (%rdi),%xmm0 - movups %xmm0,(%rdx) + movdqu (%rdi),%xmm0 + movdqu %xmm0,(%rdx) addq $0x10,%rdi addq $0x10,%rdx subq $0x10,%rsi diff --git a/rgo/src/memfill.S b/rgo/src/memfill.S index c22547e..7dc00c3 100644 --- a/rgo/src/memfill.S +++ b/rgo/src/memfill.S @@ -37,16 +37,14 @@ rgo_memfill: .done: ret #elif defined(__x86_64__) - /* rax: Address of the current element. */ - movq %rdi,%rax - /* rax: Address of the element after the last element. */ - movq %rdi,%rcx - addq %rsi,%rcx + /* rdi: Address of the current element. */ + /* rsi: Address of the element after the last element. */ + addq %rdi,%rsi .loop: - cmpq %rcx,%rax + cmpq %rsi,%rdi je .done /* Exit loop if we have reached the final element. */ - movb %dl,(%rax) - incq %rax + movb %dl,(%rdi) + incq %rdi jmp .loop /* Continue to next element. */ .done: ret diff --git a/rgo/src/strlen.S b/rgo/src/strlen.S index d7ad03e..37b5780 100644 --- a/rgo/src/strlen.S +++ b/rgo/src/strlen.S @@ -18,7 +18,22 @@ rgo_strlen: /* char const * str */ -#if defined(__x86_64__) +#if defined(__i386__) + /* eax: Address of the current character. 
*/ + movl 0x4(%esp),%eax + /* ecx: Address of the first character. */ + movl %eax,%ecx + /* edx: Current character. */ +.loop: + movb (%eax),%dl + testb %dl,%dl + jz .done /* Exit loop if we have reached the null-terminator. */ + incl %eax /* Continue to the next character. */ + jmp .loop +.done: + subl %ecx,%eax + ret +#elif defined(__x86_64__) /* rax: Address of the current character. */ movq %rdi,%rax /* rdx: Current character. */ diff --git a/test.c b/test.c --- a/test.c +++ b/test.c @@ -11,7 +11,7 @@ int main(void) { fprintf(stderr,"\n"); { #undef arrsz -#define arrsz ((size_t)0x7) +#define arrsz ((size_t)0x8) uint64_t arr0[arrsz] = {0x0}; rgo_memfill(arr0,arrsz * sizeof (uint64_t),UINT8_C(0x0)); fprintf(stderr,"arr0[0]: %" PRIX64 "\n",arr0[(size_t)0x0]); @@ -65,7 +65,6 @@ int main(void) { #undef arrsz } fprintf(stderr,"\n"); -#if defined(__x86_64__) { char const * str0 = "Hello there! General Kenobi?"; fprintf(stderr,"str0: \"%s\"\n",str0); @@ -75,22 +74,26 @@ int main(void) { } fprintf(stderr,"\n"); { - char const * restrict str = "Oh my science!"; - size_t const len = rgo_strlen(str); - size_t pos0 = rgo_fndchr(str,' '); - size_t pos1 = rgo_fndbyte(str,len,(uint8_t)' '); + char const * restrict str = "Oh my science!"; + fprintf(stderr,"str: \"%s\"\n",str); + size_t len = rgo_strlen(str); + fprintf(stderr,"len: %zX\n",len); + size_t pos0 = rgo_fndchr(str,' '); + size_t pos1 = rgo_fndbyte(str,len,(uint8_t)' '); fprintf(stderr,"pos0: %zX\n",pos0); fprintf(stderr,"pos1: %zX\n",pos1); assert(pos0 == (size_t)0x2); assert(pos1 == pos0); str += pos0 + (size_t)0x1; + len = rgo_strlen(str); pos0 = rgo_fndchr(str,' '); pos1 = rgo_fndbyte(str,len,(uint8_t)' '); - fprintf(stderr,"pos0: zX\n",pos0); + fprintf(stderr,"pos0: %zX\n",pos0); fprintf(stderr,"pos1: %zX\n",pos1); assert(pos0 == (size_t)0x2); assert(pos1 == pos0); str += pos0 + (size_t)0x1; + len = rgo_strlen(str); pos0 = rgo_fndchr(str,' '); pos1 = rgo_fndbyte(str,len,(uint8_t)' '); fprintf(stderr,"pos0: %zX\n",pos0); @@ -99,6 +102,7 @@ int main(void) { assert(pos1 
== pos0); } fprintf(stderr,"\n"); +#if defined(__x86_64__) { char const str0[] = "What's up, my guy?"; fprintf(stderr,"str0: \"%s\"\n",str0); |