/*
Copyright 2022 Gabriel Jensen.
This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/
#include <zap/priv.h>
#include <zap/math.h>
#include <stdint.h>
#if zap_priv_fastimpl
/*
Hand-written implementations of zap_fma_<suffix>(a,b,c), each returning a + b * c in its own integer type.

Syntax: AT&T (GNU as).
Register use: a is read from rdi, b from rsi, c from rdx and the result is left in rax (or the matching sub-registers), i.e. the System V AMD64 argument and return registers.

The whole statement sits behind the architecture check: the labels and .globl directives must never be emitted without the instructions that follow them, otherwise every symbol would resolve to the same empty address and a call would run straight into unrelated code. On an architecture without an implementation here no symbol is defined at all, so any use fails at link time instead of at run time.

The signed and unsigned variants differ only in the mnemonic (imul/mul); the low half of the product, which is all that is returned, is the same for both.
*/
#if defined(sus_arch_amd64)
__asm__ (
".globl zap_fma_c\n"
".globl zap_fma_i\n"
".globl zap_fma_l\n"
".globl zap_fma_ll\n"
".globl zap_fma_s\n"
".globl zap_fma_uc\n"
".globl zap_fma_ui\n"
".globl zap_fma_ul\n"
".globl zap_fma_ull\n"
".globl zap_fma_us\n"
/*
signed char zap_fma_c(signed char a,signed char b,signed char c)
In:       dil = a, sil = b, dl = c
Out:      al = a + b * c
Clobbers: ah, flags
The byte multiply writes its product to ax only, so dl (c) is not overwritten and b can be the accumulator operand.
*/
"zap_fma_c:\n"
"\tmovb\t%sil,%al\n"   /* al = b */
"\timulb\t%dl\n"       /* ax = al * c */
"\taddb\t%dil,%al\n"   /* al += a */
"\tret\n"
/*
int zap_fma_i(int a,int b,int c)
In:       edi = a, esi = b, edx = c
Out:      eax = a + b * c
Clobbers: rdx, flags
The one-operand multiply writes the high half of the product to edx, so c is moved into the accumulator first and multiplied by b.
*/
"zap_fma_i:\n"
"\tmovl\t%edx,%eax\n"  /* eax = c */
"\timull\t%esi\n"      /* edx:eax = eax * b */
"\taddl\t%edi,%eax\n"  /* eax += a */
"\tret\n"
/*
long zap_fma_l(long a,long b,long c)
In:       rdi = a, rsi = b, rdx = c
Out:      rax = a + b * c
Clobbers: rdx, flags
*/
"zap_fma_l:\n"
"\tmovq\t%rdx,%rax\n"  /* rax = c (rdx is overwritten by the multiply) */
"\timulq\t%rsi\n"      /* rdx:rax = rax * b */
"\taddq\t%rdi,%rax\n"  /* rax += a */
"\tret\n"
/*
long long zap_fma_ll(long long a,long long b,long long c)
In:       rdi = a, rsi = b, rdx = c
Out:      rax = a + b * c
Clobbers: rdx, flags
*/
"zap_fma_ll:\n"
"\tmovq\t%rdx,%rax\n"  /* rax = c (rdx is overwritten by the multiply) */
"\timulq\t%rsi\n"      /* rdx:rax = rax * b */
"\taddq\t%rdi,%rax\n"  /* rax += a */
"\tret\n"
/*
short zap_fma_s(short a,short b,short c)
In:       di = a, si = b, dx = c
Out:      ax = a + b * c
Clobbers: dx, flags
*/
"zap_fma_s:\n"
"\tmovw\t%dx,%ax\n"    /* ax = c (dx is overwritten by the multiply) */
"\timulw\t%si\n"       /* dx:ax = ax * b */
"\taddw\t%di,%ax\n"    /* ax += a */
"\tret\n"
/*
unsigned char zap_fma_uc(unsigned char a,unsigned char b,unsigned char c)
In:       dil = a, sil = b, dl = c
Out:      al = a + b * c
Clobbers: ah, flags
mulb writes its product to ax rather than to a dl:al pair (unlike the wider variants), so there is no need to worry about it overwriting dl.
*/
"zap_fma_uc:\n"
"\tmovb\t%sil,%al\n"   /* al = b */
"\tmulb\t%dl\n"        /* ax = al * c */
"\taddb\t%dil,%al\n"   /* al += a */
"\tret\n"
/*
unsigned int zap_fma_ui(unsigned int a,unsigned int b,unsigned int c)
In:       edi = a, esi = b, edx = c
Out:      eax = a + b * c
Clobbers: rdx, flags
*/
"zap_fma_ui:\n"
"\tmovl\t%edx,%eax\n"  /* eax = c (edx is overwritten by the multiply) */
"\tmull\t%esi\n"       /* edx:eax = eax * b */
"\taddl\t%edi,%eax\n"  /* eax += a */
"\tret\n"
/*
unsigned long zap_fma_ul(unsigned long a,unsigned long b,unsigned long c)
In:       rdi = a, rsi = b, rdx = c
Out:      rax = a + b * c
Clobbers: rdx, flags
*/
"zap_fma_ul:\n"
"\tmovq\t%rdx,%rax\n"  /* rax = c (rdx is overwritten by the multiply) */
"\tmulq\t%rsi\n"       /* rdx:rax = rax * b */
"\taddq\t%rdi,%rax\n"  /* rax += a */
"\tret\n"
/*
unsigned long long zap_fma_ull(unsigned long long a,unsigned long long b,unsigned long long c)
In:       rdi = a, rsi = b, rdx = c
Out:      rax = a + b * c
Clobbers: rdx, flags
rdx gets overwritten by mulq, so it might as well be the first operand (the order of the factors does not matter).
*/
"zap_fma_ull:\n"
"\tmovq\t%rdx,%rax\n"  /* rax = c */
"\tmulq\t%rsi\n"       /* rdx:rax = rax * b */
"\taddq\t%rdi,%rax\n"  /* rax += a */
"\tret\n"
/*
unsigned short zap_fma_us(unsigned short a,unsigned short b,unsigned short c)
In:       di = a, si = b, dx = c
Out:      ax = a + b * c
Clobbers: dx, flags
*/
"zap_fma_us:\n"
"\tmovw\t%dx,%ax\n"    /* ax = c (dx is overwritten by the multiply) */
"\tmulw\t%si\n"        /* dx:ax = ax * b */
"\taddw\t%di,%ax\n"    /* ax += a */
"\tret\n"
);
#endif
#else
/*
Portable fallback: zap_local_fma(_typ,_sufx,_wrk) defines
	_typ zap_fma_<_sufx>(_typ a,_typ b,_typ c)
which returns a + b * c.

_typ:  parameter and return type.
_sufx: suffix appended to zap_fma_ to form the function name.
_wrk:  unsigned type, at least as wide as both int and _typ, in which the arithmetic is carried out.

The operands are converted to _wrk before multiplying so that the computation wraps instead of overflowing. Without this, unsigned short operands are promoted to (signed) int, where 65535 * 65535 overflows, and the int/long/long long variants overflow directly; both are undefined behaviour. Converting the wrapped result back to a signed _typ is implementation-defined rather than undefined (GCC and Clang reduce it modulo 2^N).
*/
#define zap_local_fma(_typ,_sufx,_wrk) \
_typ zap_fma_ ## _sufx (_typ const _a,_typ const _b,_typ const _c) {return (_typ)((_wrk)_a + (_wrk)_b * (_wrk)_c);}
zap_local_fma(signed char,c,unsigned int)
zap_local_fma(int,i,unsigned int)
zap_local_fma(long,l,unsigned long)
zap_local_fma(long long,ll,unsigned long long)
zap_local_fma(short,s,unsigned int)
zap_local_fma(unsigned char,uc,unsigned int)
zap_local_fma(unsigned int,ui,unsigned int)
zap_local_fma(unsigned long,ul,unsigned long)
zap_local_fma(unsigned long long,ull,unsigned long long)
zap_local_fma(unsigned short,us,unsigned int)
#endif