summaryrefslogtreecommitdiff
path: root/zap/src/fma.c
diff options
context:
space:
mode:
Diffstat (limited to 'zap/src/fma.c')
-rw-r--r--zap/src/fma.c171
1 files changed, 171 insertions, 0 deletions
diff --git a/zap/src/fma.c b/zap/src/fma.c
new file mode 100644
index 0000000..b2f45ad
--- /dev/null
+++ b/zap/src/fma.c
@@ -0,0 +1,171 @@
+/*
+ Copyright 2022 Gabriel Jensen.
+ This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
+ If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/.
+*/
+
+#include <zap/priv.h>
+
+#include <zap/math.h>
+
+#include <stdint.h>
+
+#if zap_priv_fastimpl
+__asm__ ( /*
+ Hand-written definitions of the integer multiply-add family zap_fma_*: each routine returns a + b * c, the same expression the portable C definitions in the #else branch evaluate.
+ The per-instruction notes below assume the System V AMD64 calling convention (a in rdi, b in rsi, c in rdx, result in rax or the matching sub-register) -- TODO confirm that this is the only ABI the fast path is built for.
+ The one-operand forms of imul/mul multiply the accumulator (al/ax/eax/rax) by the operand and write a double-width product: into ax for the byte form, into dx:ax, edx:eax or rdx:rax for the wider forms. Only the low half, which is left in the accumulator, is used afterwards; the high half is discarded.
+ NOTE(review): every body is guarded by sus_arch_amd64 and there is no #else, so if zap_priv_fastimpl is true while sus_arch_amd64 is undefined, the labels are emitted without any instructions. Whether that combination can occur depends on zap/priv.h, which is not visible here -- confirm.
+ */
+ ".globl zap_fma_c\n"
+ ".globl zap_fma_i\n"
+ ".globl zap_fma_l\n"
+ ".globl zap_fma_ll\n"
+ ".globl zap_fma_s\n"
+ ".globl zap_fma_uc\n"
+ ".globl zap_fma_ui\n"
+ ".globl zap_fma_ul\n"
+ ".globl zap_fma_ull\n"
+ ".globl zap_fma_us\n"
+
+ "zap_fma_c:\n"
+ /*
+ signed char a
+ signed char b
+ signed char c
+ */
+#if defined(sus_arch_amd64)
+ "movb %sil,%al\n" /* al = b */
+ "imulb %dl\n" /* ax = al * dl (signed); the byte form does not write dx */
+ "addb %dil,%al\n" /* al += a */
+ "ret\n"
+#endif
+
+ "zap_fma_i:\n"
+ /*
+ int a
+ int b
+ int c
+ */
+#if defined(sus_arch_amd64)
+ "movl %edx,%eax\n" /* eax = c; edx is about to be overwritten by imull anyway */
+ "imull %esi\n" /* edx:eax = eax * esi (signed); the low half stays in eax */
+ "addl %edi,%eax\n" /* eax += a */
+ "ret\n"
+#endif
+
+ "zap_fma_l:\n"
+ /*
+ long a
+ long b
+ long c
+ */
+#if defined(sus_arch_amd64)
+ "movq %rdx,%rax\n" /* rax = c; rdx is about to be overwritten by imulq anyway */
+ "imulq %rsi\n" /* rdx:rax = rax * rsi (signed); the low half stays in rax */
+ "addq %rdi,%rax\n" /* rax += a */
+ "ret\n"
+#endif
+
+ "zap_fma_ll:\n"
+ /*
+ long long a
+ long long b
+ long long c
+ Same instruction sequence as zap_fma_l.
+ */
+#if defined(sus_arch_amd64)
+ "movq %rdx,%rax\n" /* rax = c */
+ "imulq %rsi\n" /* rdx:rax = rax * rsi (signed); the low half stays in rax */
+ "addq %rdi,%rax\n" /* rax += a */
+ "ret\n"
+#endif
+
+ "zap_fma_s:\n"
+ /*
+ short a
+ short b
+ short c
+ */
+#if defined(sus_arch_amd64)
+ "movw %dx,%ax\n" /* ax = c; dx is about to be overwritten by imulw anyway */
+ "imulw %si\n" /* dx:ax = ax * si (signed); the low half stays in ax */
+ "addw %di,%ax\n" /* ax += a */
+ "ret\n"
+#endif
+
+ "zap_fma_uc:\n"
+ /*
+ unsigned char a
+ unsigned char b
+ unsigned char c
+ */
+#if defined(sus_arch_amd64)
+ "movb %sil,%al\n" /* al = b. Unlike the wider variants, mulb puts the whole product in ax instead of splitting it across a dx:ax pair, so dl is not overwritten and can serve directly as the source operand. */
+ "mulb %dl\n" /* ax = al * dl (unsigned) */
+ "addb %dil,%al\n" /* al += a */
+ "ret\n"
+#endif
+
+ "zap_fma_ui:\n"
+ /*
+ unsigned int a
+ unsigned int b
+ unsigned int c
+ */
+#if defined(sus_arch_amd64)
+ "movl %edx,%eax\n" /* eax = c; edx is about to be overwritten by mull anyway */
+ "mull %esi\n" /* edx:eax = eax * esi (unsigned); the low half stays in eax */
+ "addl %edi,%eax\n" /* eax += a */
+ "ret\n"
+#endif
+
+ "zap_fma_ul:\n"
+ /*
+ unsigned long a
+ unsigned long b
+ unsigned long c
+ */
+#if defined(sus_arch_amd64)
+ "movq %rdx,%rax\n" /* rax = c; rdx is about to be overwritten by mulq anyway */
+ "mulq %rsi\n" /* rdx:rax = rax * rsi (unsigned); the low half stays in rax */
+ "addq %rdi,%rax\n" /* rax += a */
+ "ret\n"
+#endif
+
+ "zap_fma_ull:\n"
+ /*
+ unsigned long long a
+ unsigned long long b
+ unsigned long long c
+ Same instruction sequence as zap_fma_ul.
+ */
+#if defined(sus_arch_amd64)
+ "movq %rdx,%rax\n" /* rdx gets overwritten by mulq, so it might as well be the operand that is moved into rax first (the order of the factors does not matter in a multiplication). */
+ "mulq %rsi\n" /* rdx:rax = rax * rsi (unsigned); the low half stays in rax */
+ "addq %rdi,%rax\n" /* rax += a */
+ "ret\n"
+#endif
+
+ "zap_fma_us:\n"
+ /*
+ unsigned short a
+ unsigned short b
+ unsigned short c
+ */
+#if defined(sus_arch_amd64)
+ "movw %dx,%ax\n" /* ax = c; dx is about to be overwritten by mulw anyway */
+ "mulw %si\n" /* dx:ax = ax * si (unsigned); the low half stays in ax */
+ "addw %di,%ax\n" /* ax += a */
+ "ret\n"
+#endif
+);
+#else
+/*
+ Portable definitions of the zap_fma_* family. zap_local_fma(_typ,_utyp,_sufx) defines
+ _typ zap_fma_<_sufx>(_typ _a,_typ _b,_typ _c), which returns _a + _b * _c.
+
+ The arithmetic is carried out in the unsigned type _utyp and the result is converted back to _typ, so that an overflowing result wraps instead of being undefined:
+ - for signed _typ, overflow in the plain expression _a + _b * _c is undefined behaviour;
+ - for unsigned short, the operands are promoted to (signed) int first, so 65535 * 65535 overflows int where int is 32 bits wide.
+ Wrapping is also what the assembly definitions in the other preprocessor branch do, as they keep only the low half of the product.
+
+ _utyp must be an unsigned type at least as wide as both _typ and unsigned int; the latter keeps the computation itself free of further integer promotion.
+ Converting an out-of-range value back to a signed _typ is implementation-defined rather than undefined (GCC and Clang reduce it modulo 2^N).
+*/
+#define zap_local_fma(_typ,_utyp,_sufx) \
+ _typ zap_fma_ ## _sufx (_typ const _a,_typ const _b,_typ const _c) {return (_typ)((_utyp)_a + (_utyp)_b * (_utyp)_c);}
+
+zap_local_fma(signed char,unsigned int,c)
+zap_local_fma(int,unsigned int,i)
+zap_local_fma(long,unsigned long,l)
+zap_local_fma(long long,unsigned long long,ll)
+zap_local_fma(short,unsigned int,s)
+zap_local_fma(unsigned char,unsigned int,uc)
+zap_local_fma(unsigned int,unsigned int,ui)
+zap_local_fma(unsigned long,unsigned long,ul)
+zap_local_fma(unsigned long long,unsigned long long,ull)
+zap_local_fma(unsigned short,unsigned int,us)
+
+#endif