diff options
-rw-r--r-- | CHANGELOG.txt | 10 | ||||
-rw-r--r-- | Makefile | 2 | ||||
-rw-r--r-- | README.html | 15 | ||||
-rw-r--r-- | rgo/Makefile | 2 | ||||
-rw-r--r-- | rgo/include/rgo.h | 8 | ||||
-rw-r--r-- | rgo/src/fndbyte.S | 21 | ||||
-rw-r--r-- | rgo/src/fndchr.S | 8 | ||||
-rw-r--r-- | rgo/src/memcpy.S | 101 | ||||
-rw-r--r-- | rgo/src/memeq.S | 102 | ||||
-rw-r--r-- | rgo/src/memfill.S | 41 | ||||
-rw-r--r-- | rgo/src/strcpy.S | 22 | ||||
-rw-r--r-- | rgo/src/streq.S | 30 | ||||
-rw-r--r-- | rgo/src/strlen.S | 17 | ||||
-rw-r--r-- | test.c | 8 |
14 files changed, 248 insertions, 139 deletions
diff --git a/CHANGELOG.txt b/CHANGELOG.txt index face77e..656fdcd 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -1,3 +1,13 @@ +| 2 + +- Fix target purge in makefile not being labeled phony; +- Add machine architecture check in header; +- Implement memcpy, memeq, memfill in IA-32; +- Fix some incorrect comments; +- Use a different register order for temporaries and optimise register usage; +- Fix bug in memeq: Should jump if zero, not if equal; +- Update readme; + | 1 - Fix indentation in license notices; @@ -1,4 +1,4 @@ -.PHONY: clean rgo +.PHONY: clean purge rgo rgo: make -C rgo diff --git a/README.html b/README.html index 7c1a794..6a732ac 100644 --- a/README.html +++ b/README.html @@ -1,12 +1,12 @@ <!DOCTYPE html> <html> <h1>rgo</h1> - <p>rgo (<b>R</b>untime-al<b>GO</b>rithmic, pronounced <i>are-go</i>) is a C/C++ library for runtime algorithmics on memory sequences.</p> + <p>rgo (<b>R</b>untime-al<b>GO</b>rithmic, pronounced as <i>are-go</i>) is a C/C++ library for runtime algorithmics on memory sequences.</p> <p><i>Note: This library is still in its early stages and is NOT anywhere near being fully optimised.</i></p> <br /> <h2>Supported Platforms</h2> <p>rgo is written (mostly) in assembly, and we therefore can't possibly support every platform in existence.</p> - <p>Currently, it's only compatible with the UNIX System-V ABI. Systems using this ABI include FreeBSD, Linux, macOS, OpenBSD, and any other System-V derivative. Support for Windows NT will be reflected.</p> + <p>Currently, it's only compatible with the UNIX System-V ABI. Systems using this ABI include FreeBSD, Linux, macOS, OpenBSD, and any other System-V derivative. 
Support for Windows is being reflected for a future release.</p> <br /> <p>rgo is written in GNU C and GNU assembly for the following machine architectures:</p> <ul> @@ -14,17 +14,18 @@ <p>AMD64 (x86-64), including (Planned) AVX;</p> </li> <li> - <p><i>(Planned) Aarch64 (ARM64), including SVE;</i></p> + <p>IA-32 (i386), including SSE and (Planned) AVX;</p> + <p><i>Note: Support is currently limited to: memcpy, memeq, memfill.</i></p> </li> <li> - <p><i>(Planned) IA-32 (i386), including SSE and AVX;</i></p> + <p><i>(Planned) Aarch64 (ARM64), including SVE;</i></p> </li> </ul> <br /> <h2>Building</h2> - <p>The provided makefile has been tested to work with GNU make and BSD make and should work on other make implementations.</p> - <p>The target <i>rgo</i> builds the static library file (<i>rgo/librgo.a</i>). The target <i>clean</i> removes all object files, whilst <i>purge</i> removes all object files and the static library file.</p> - <p>Instructions for building the test program may be found on the first line in <i>test.c</i>.</p> + <p>The provided makefile has been tested to work with GNU make and BSD make and should work with other make implementations.</p> + <p>The target <i>rgo</i> builds the static library file (located at <i>rgo/librgo.a</i>). 
The target <i>clean</i> removes all object files, whilst <i>purge</i> removes all object files and the static library file.</p> + <p>Instructions for building the test program may be found on the first line of <i>test.c</i>.</p> <br /> <h2>Copyright and License</h2> <p>Copyright 2022 Gabriel Jensen</p> diff --git a/rgo/Makefile b/rgo/Makefile index 76282be..02800e0 100644 --- a/rgo/Makefile +++ b/rgo/Makefile @@ -25,7 +25,7 @@ CFLAGS = \ -Iinclude \ -g -.PHONY: clean +.PHONY: clean purge $(LIB): $(OBJS) ar r $@ $^ diff --git a/rgo/include/rgo.h b/rgo/include/rgo.h index 38c4672..276c8dc 100644 --- a/rgo/include/rgo.h +++ b/rgo/include/rgo.h @@ -10,11 +10,15 @@ You should have received a copy of the GNU Lesser General Public License along with rgo. If not, see <https://www.gnu.org/licenses/>. */ +#if !defined(__i386__) && !defined(__x86_64__) +#error Unsupported machine architecture! Support: AMD64, IA-32. +#endif + #if !defined(rgo_ver) #if defined(__ASSEMBLER__) -#define rgo_ver $0x1 +#define rgo_ver $0x2 #else -#define rgo_ver (0x1) +#define rgo_ver (0x2) #endif #if defined(__ASSEMBLER__) diff --git a/rgo/src/fndbyte.S b/rgo/src/fndbyte.S index 2e0ed8b..c0e4382 100644 --- a/rgo/src/fndbyte.S +++ b/rgo/src/fndbyte.S @@ -15,23 +15,22 @@ .global rgo_fndbyte rgo_fndbyte: -#if defined(__x86_64__) /* - rdi: void const * ptr - rsi: size_t num - dl: uint8_t byte + void const * ptr + size_t num + uint8_t byte */ +#if defined(__x86_64__) /* rax: Address of the current element. */ movq %rdi,%rax - /* rcx: Address of the element after the last element. */ - movq %rdi,%rcx - addq %rsi,%rcx - /* r8b: Current element. */ + /* rsi: Address of the element after the last element. */ + addq %rdi,%rsi + /* rcx: Current element. */ .loop: - cmpq %rax,%rcx + cmpq %rax,%rsi je .nfnd /* We have went through the entire array without finding the byte. */ - movb (%rax),%r8b - cmpb %r8b,%dl + movb (%rax),%cl + cmpb %cl,%dl je .fnd /* We have found the byte. 
*/ incq %rax jmp .loop diff --git a/rgo/src/fndchr.S b/rgo/src/fndchr.S index cacea5e..1008e52 100644 --- a/rgo/src/fndchr.S +++ b/rgo/src/fndchr.S @@ -15,14 +15,14 @@ .global rgo_fndchr rgo_fndchr: -#if defined(__x86_64__) /* - rdi: char const * str - sil: char chr + char const * str + char chr */ +#if defined(__x86_64__) /* rax: Address of the current character. */ movq %rdi,%rax - /* dl: Current character. */ + /* rdx: Current character. */ .loop: movb (%rax),%dl cmpb %dl,%sil diff --git a/rgo/src/memcpy.S b/rgo/src/memcpy.S index 51d82f9..475da57 100644 --- a/rgo/src/memcpy.S +++ b/rgo/src/memcpy.S @@ -3,7 +3,7 @@ This file is part of rgo. - rgo is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. + rgo is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. rgo is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. @@ -15,46 +15,85 @@ .global rgo_memcpy rgo_memcpy: -#if defined(__x86_64__) /* - rdi: void const * in - rsi: size_t num - rdx: void * out + void const * in + size_t num + void * out */ - /* rcx: Address of the current input element. */ - movq %rdi,%rcx - /* r8: Address of the current output element. */ - movq %rdx,%r8 - /* r9: Number of remaining elements. */ - movq %rsi,%r9 - /* r10: Temporary. */ - /* xmm0: Temporary. */ +#if defined(__i386__) + /* eax: Address of the current input element. */ + movl 0x4(%esp),%eax + /* ecx: Number of remaining elements. */ + movl 0x8(%esp),%ecx + /* edx: Address of the current output element. 
*/ + movl 0xC(%esp),%edx + /* ebx: Current element. */ + pushl %ebx /* ebx must be restored. */ + /* xmm0: Current element. */ +#if defined(__SSE__) .big128cpy: - cmpq $0x10,%r9 + cmpl $0x10,%ecx jl .wrdcpy - movups (%rcx),%xmm0 - movups %xmm0,(%r8) - addq $0x10,%rcx - addq $0x10,%r8 - subq $0x10,%r9 + movups (%eax),%xmm0 + movups %xmm0,(%edx) + addl $0x10,%eax + addl $0x10,%edx + subl $0x10,%ecx + jmp .big128cpy +#endif +.wrdcpy: + cmpl $0x4,%ecx + jl .bytecpy + movl (%eax),%ebx + movl %ebx,(%edx) + addl $0x4,%eax + addl $0x4,%edx + subl $0x4,%ecx + jmp .wrdcpy +.bytecpy: + testl %ecx,%ecx + jz .done + movb (%eax),%bl + movb %bl,(%edx) + incl %eax + incl %edx + decl %ecx + jmp .bytecpy +.done: + popl %ebx + ret +#elif defined(__x86_64__) + /* rdi: Address of the current input element. */ + /* rsi: Number of remaining elements. */ + /* rdx: Address of the current output element. */ + /* rcx: Current element. */ + /* xmm0: Current element. */ +.big128cpy: + cmpq $0x10,%rsi + jl .wrdcpy + movups (%rdi),%xmm0 + movups %xmm0,(%rdx) + addq $0x10,%rdi + addq $0x10,%rdx + subq $0x10,%rsi jmp .big128cpy .wrdcpy: - cmpq $0x8,%r9 + cmpq $0x8,%rsi jl .bytecpy - movq (%rcx),%r10 - movq %r10,(%r8) - addq $0x8,%rcx - addq $0x8,%r8 - subq $0x8,%r9 + movq (%rdi),%rcx + movq %rcx,(%rdx) + addq $0x8,%rdi + addq $0x8,%rdx + subq $0x8,%rsi jmp .wrdcpy .bytecpy: - testq %r9,%r9 + testq %rsi,%rsi jz .done - movb (%rcx),%r10b - movb %r10b,(%r8) - incq %rcx - incq %r8 - decq %r9 + movb (%rdi),%cl + movb %cl,(%rdx) + incq %rdi + incq %rdx + decq %rsi jmp .bytecpy .done: ret diff --git a/rgo/src/memeq.S b/rgo/src/memeq.S index c3a9a63..d106804 100644 --- a/rgo/src/memeq.S +++ b/rgo/src/memeq.S @@ -3,7 +3,7 @@ This file is part of rgo. - rgo is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
+ rgo is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. rgo is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. @@ -15,46 +15,86 @@ .global rgo_memeq rgo_memeq: -#if defined(__x86_64__) /* - rdi: void const * lptr - rsi: size_t num - rdx: void const * rptr + void const * lptr + size_t num + void const * rptr */ - /* rcx: Address of the current left element. */ - movq %rdi,%rcx - /* r8: Address of the current right element. */ - movq %rdx,%r8 - /* r9: Number of remaining elements. */ - movq %rsi,%r9 - /* r10: Temporary. */ - /* r11: Temporary. */ +#if defined(__i386__) + /* eax: Address of the current left element. */ + movl 0x4(%esp),%eax + /* ecx: Number of remaining elements. */ + movl 0x8(%esp),%ecx + /* edx: Address of the current right element. */ + movl 0xC(%esp),%edx + /* ebx: Current left element. */ + pushl %ebx + /* ebx/esi: Current right element. */ + pushl %esi .wrdeq: - cmpq $0x8,%r9 + cmpl $0x4,%ecx jl .byteeq - movq (%rcx),%r10 - movq (%r8),%r11 - cmpq %r10,%r11 - jz .neq - addq $0x8,%rcx - addq $0x8,%r8 - subq $0x8,%r9 + movl (%eax),%ebx + movl (%edx),%esi + cmpl %ebx,%esi + jne .neq + addl $0x4,%eax + addl $0x4,%edx + subl $0x4,%ecx + jmp .wrdeq +.byteeq: + testl %ecx,%ecx + jne .eq /* If we have reached the final element, all previous elements have compared equal, and the memory sequences are equal. */ + movb (%eax),%bl + movb (%edx),%bh + cmpb %bl,%bh + jne .neq + incl %eax + incl %edx + decl %ecx + jmp .byteeq +.eq: + popl %ebx + popl %esi + movb $0x1,%al + ret +.neq: + popl %ebx + popl %esi + movb $0x0,%al + ret +#elif defined(__x86_64__) + /* rdi: Address of the current left element. 
*/ + /* rsi: Number of remaining elements. */ + /* rdx: Address of the current right element. */ + /* rax: Current left element. */ + /* rcx: Current right element. */ +.wrdeq: + cmpq $0x8,%rsi + jl .byteeq + movq (%rdi),%rax + movq (%rdx),%rcx + cmpq %rax,%rcx + jne .neq + addq $0x8,%rdi + addq $0x8,%rdx + subq $0x8,%rsi jmp .wrdeq .byteeq: - testq %r9,%r9 - jz .eq /* If we have reached the final element, all previous elements have compared equal, and the memory sequences are equal. */ - movb (%rcx),%r10b - movb (%r8),%r11b - cmpb %r10b,%r11b + testq %rsi,%rsi + jne .eq /* If we have reached the final element, all previous elements have compared equal, and the memory sequences are equal. */ + movb (%rdi),%al + movb (%rdx),%cl + cmpb %al,%cl jne .neq - incq %rcx - incq %r8 - decq %r9 + incq %rdi + incq %rdx + decq %rsi jmp .byteeq .eq: - mov $0x1,%rax + movb $0x1,%al ret .neq: - mov $0x0,%rax + movb $0x0,%al ret #endif diff --git a/rgo/src/memfill.S b/rgo/src/memfill.S index d131c48..c22547e 100644 --- a/rgo/src/memfill.S +++ b/rgo/src/memfill.S @@ -15,24 +15,39 @@ .global rgo_memfill rgo_memfill: -#if defined(__x86_64__) /* - rdi: void const * ptr - rsi: size_t num - dl: int_least8_t val + void const * ptr + size_t num + uint8_t val */ - /* We don't need to preserve any of the registers we use according to the ABI. */ - /* rcx: Address of the current element. */ +#if defined(__i386__) + /* eax: Address of the current element. */ + movl 0x4(%esp),%eax + /* ecx: Address of the element after the last element. */ + movl 0x4(%esp),%ecx + addl 0x8(%esp),%ecx + /* edx: Byte value. */ + movb 0xC(%esp),%dl +.loop: + cmpl %eax,%ecx + je .done /* Exit loop if we have reached the final element. */ + movb %dl,(%eax) + incl %eax + jmp .loop /* Continue to next element. */ +.done: + ret +#elif defined(__x86_64__) + /* rax: Address of the current element. */ + movq %rdi,%rax + /* rcx: Address of the element after the last element. 
*/ movq %rdi,%rcx - /* rcx: Address of the element after the last element. */ - movq %rdi,%r8 - addq %rsi,%r8 + addq %rsi,%rcx .loop: - cmpq %r8,%rcx + cmpq %rcx,%rax je .done /* Exit loop if we have reached the final element. */ - movb %dl,(%rcx) - incq %rcx /* Continue to next element. */ - jmp .loop + movb %dl,(%rax) + incq %rax + jmp .loop /* Continue to next element. */ .done: ret #endif diff --git a/rgo/src/strcpy.S b/rgo/src/strcpy.S index f2fbc36..8750295 100644 --- a/rgo/src/strcpy.S +++ b/rgo/src/strcpy.S @@ -10,28 +10,28 @@ You should have received a copy of the GNU Lesser General Public License along with rgo. If not, see <https://www.gnu.org/licenses/>. */ -#include <rgo.h> +#indlude <rgo.h> .global rgo_strcpy rgo_strcpy: -#if defined(__x86_64__) /* - rdi: char const * lstr - rsi: char const * rstr + char const * lstr + char const * rstr */ +#if defined(__x86_64__) /* rax: Address of the current input character. */ movq %rdi,%rax - /* rdx: Address of the current output character. */ - movq %rsi,%rdx - /* cl: Current character. */ + /* rsi: Address of the current output character. */ + movq %rsi,%rsi + /* rdx: Current character. */ .loop: - movb (%rax),%cl - movb %cl,(%rdx) - testb %cl,%cl + movb (%rax),%dl + movb %dl,(%rsi) + testb %dl,%dl jz .done incq %rax - incq %rdx + incq %rsi jmp .loop .done: subq %rdi,%rax diff --git a/rgo/src/streq.S b/rgo/src/streq.S index f530d54..8969e41 100644 --- a/rgo/src/streq.S +++ b/rgo/src/streq.S @@ -15,26 +15,26 @@ .global rgo_streq rgo_streq: -#if defined(__x86_64__) /* - rdi: char const * lstr - rsi: char const * rstr + char const * lstr + char const * rstr */ - /* rdx: Address of the current input character. */ - movq %rdi,%rdx - /* rcx: Address of the current output character. */ - movq %rsi,%rcx - /* r8b: Current input character. */ - /* r9b: Current output character. */ +#if defined(__x86_64__) + /* rax: Address of the current input character. 
*/ + movq %rdi,%rax + /* rsi: Address of the current output character. */ + movq %rsi,%rsi + /* rdx: Current input character. */ + /* rcx: Current output character. */ .loop: - movb (%rdx),%r8b - movb (%rcx),%r9b - cmpb %r8b,%r9b + movb (%rax),%dl + movb (%rsi),%cl + cmpb %dl,%cl jne .neq - testb %r8b,%r8b /* Check if we have reached the null-terminator. */ + testb %dl,%dl /* Check if we have reached the null-terminator. */ jz .eq - incq %rdx - incq %rcx + incq %rax + incq %rsi jmp .loop .eq: mov $0x1,%rax diff --git a/rgo/src/streq.S b/rgo/src/streq.S index f530d54..8969e41 100644 --- a/rgo/src/strlen.S +++ b/rgo/src/strlen.S @@ -3,7 +3,7 @@ This file is part of rgo. - rgo is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. + rgo is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. rgo is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. @@ -15,21 +15,20 @@ .global rgo_strlen rgo_strlen: -#if defined(__x86_64__) /* - rdi: char const * str + char const * str */ - /* rsi: Address of the current character. */ - movq %rdi,%rsi - /* dl: Current character. */ +#if defined(__x86_64__) + /* rax: Address of the current character. */ + movq %rdi,%rax + /* rdx: Current character. */ .loop: - movb (%rsi),%dl + movb (%rax),%dl testb %dl,%dl jz .done /* Exit loop if we have reached the null-terminator. */ - incq %rsi /* Continue to the next character. */ + incq %rax /* Continue to the next character. 
*/ jmp .loop .done: - movq %rsi,%rax subq %rdi,%rax ret #endif @@ -59,12 +59,13 @@ int main(void) { assert(arr1[(size_t)0x4] == arr0[(size_t)0x4]); assert(arr1[(size_t)0x5] == arr0[(size_t)0x5]); assert(arr1[(size_t)0x6] == arr0[(size_t)0x6]); - uint8_t const cmp = rgo_memeq(arr1,arrsz,arr0); - fprintf(stderr,"cmp: %u\n",cmp); - assert(cmp); + uint8_t const eq = rgo_memeq(arr1,arrsz,arr0); + fprintf(stderr,"eq: %u\n",eq); + assert(eq); #undef arrsz } fprintf(stderr,"\n"); +#if defined(__x86_64__) { char const * str0 = "Hello there! General Kenobi?"; fprintf(stderr,"str0: \"%s\"\n",str0); @@ -125,5 +126,6 @@ int main(void) { assert(rgo_streq(str0,str1)); } fprintf(stderr,"\n"); +#endif printf("All tests have passed!\n"); } |