diff options
Diffstat (limited to 'zap/src/fma.c')
-rw-r--r-- | zap/src/fma.c | 171 |
1 files changed, 171 insertions, 0 deletions
diff --git a/zap/src/fma.c b/zap/src/fma.c new file mode 100644 index 0000000..b2f45ad --- /dev/null +++ b/zap/src/fma.c @@ -0,0 +1,171 @@ +/* + Copyright 2022 Gabriel Jensen. + This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0. + If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/. +*/ + +#include <zap/priv.h> + +#include <zap/math.h> + +#include <stdint.h> + +#if zap_priv_fastimpl +__asm__ ( + ".globl zap_fma_c\n" + ".globl zap_fma_i\n" + ".globl zap_fma_l\n" + ".globl zap_fma_ll\n" + ".globl zap_fma_s\n" + ".globl zap_fma_uc\n" + ".globl zap_fma_ui\n" + ".globl zap_fma_ul\n" + ".globl zap_fma_ull\n" + ".globl zap_fma_us\n" + + "zap_fma_c:\n" + /* + signed char a + signed char b + signed char c + */ +#if defined(sus_arch_amd64) + "movb %sil,%al\n" + "imulb %dl\n" + "addb %dil,%al\n" + "ret\n" +#endif + + "zap_fma_i:\n" + /* + int a + int b + int c + */ +#if defined(sus_arch_amd64) + "movl %edx,%eax\n" + "imull %esi\n" + "addl %edi,%eax\n" + "ret\n" +#endif + + "zap_fma_l:\n" + /* + long a + long b + long c + */ +#if defined(sus_arch_amd64) + "movq %rdx,%rax\n" + "imulq %rsi\n" + "addq %rdi,%rax\n" + "ret\n" +#endif + + "zap_fma_ll:\n" + /* + long long a + long long b + long long c + */ +#if defined(sus_arch_amd64) + "movq %rdx,%rax\n" + "imulq %rsi\n" + "addq %rdi,%rax\n" + "ret\n" +#endif + + "zap_fma_s:\n" + /* + short a + short b + short c + */ +#if defined(sus_arch_amd64) + "movw %dx,%ax\n" + "imulw %si\n" + "addw %di,%ax\n" + "ret\n" +#endif + + "zap_fma_uc:\n" + /* + unsigned char a + unsigned char b + unsigned char c + */ +#if defined(sus_arch_amd64) + "movb %sil,%al\n" /* mulb uses ax instead of al:dl (like the other variants), so we don't need to worry about it overwritting dl. */ + "mulb %dl\n" + "addb %dil,%al\n" + "ret\n" +#endif + + "zap_fma_ui:\n" + /* + unsigned int a + unsigned int b + unsigned int c + */ +#if defined(sus_arch_amd64) + "movl %edx,%eax\n" + "mull %esi\n" + "addl %edi,%eax\n" + "ret\n" +#endif + + "zap_fma_ul:\n" + /* + unsigned long a + unsigned long b + unsigned long c + */ +#if defined(sus_arch_amd64) + "movq %rdx,%rax\n" + "mulq %rsi\n" + "addq %rdi,%rax\n" + "ret\n" +#endif + + "zap_fma_ull:\n" + /* + unsigned long long a + unsigned long long b + unsigned long long c + */ +#if defined(sus_arch_amd64) + "movq %rdx,%rax\n" /* rdx get overwritten by mulq, so might as well just make it the first operand (in multiplication, the order is meaningless). */ + "mulq %rsi\n" + "addq %rdi,%rax\n" + "ret\n" +#endif + + "zap_fma_us:\n" + /* + unsigned short a + unsigned short b + unsigned short c + */ +#if defined(sus_arch_amd64) + "movw %dx,%ax\n" + "mulw %si\n" + "addw %di,%ax\n" + "ret\n" +#endif +); +#else +#define zap_local_fma(_typ,_sufx) \ + _typ zap_fma_ ## _sufx (_typ const _a,_typ const _b,_typ const _c) {return _a + _b * _c;} + +zap_local_fma(signed char,c) +zap_local_fma(int,i) +zap_local_fma(long,l) +zap_local_fma(long long,ll) +zap_local_fma(short,s) +zap_local_fma(unsigned char,uc) +zap_local_fma(unsigned int,ui) +zap_local_fma(unsigned long,ul) +zap_local_fma(unsigned long long,ull) +zap_local_fma(unsigned short,us) + +#endif |