Auto merge of #41302 - rkruppe:dec2flt-assoc-consts, r=BurntSushi
Use associated constants in core::num::dec2flt
This commit is contained in:
commit
bbdaad0dc8
4 changed files with 99 additions and 152 deletions
|
@ -70,6 +70,7 @@
|
||||||
#![feature(allow_internal_unstable)]
|
#![feature(allow_internal_unstable)]
|
||||||
#![feature(asm)]
|
#![feature(asm)]
|
||||||
#![feature(associated_type_defaults)]
|
#![feature(associated_type_defaults)]
|
||||||
|
#![feature(associated_consts)]
|
||||||
#![feature(cfg_target_feature)]
|
#![feature(cfg_target_feature)]
|
||||||
#![feature(cfg_target_has_atomic)]
|
#![feature(cfg_target_has_atomic)]
|
||||||
#![feature(concat_idents)]
|
#![feature(concat_idents)]
|
||||||
|
|
|
@ -106,17 +106,17 @@ mod fpu_precision {
|
||||||
/// a bignum.
|
/// a bignum.
|
||||||
pub fn fast_path<T: RawFloat>(integral: &[u8], fractional: &[u8], e: i64) -> Option<T> {
|
pub fn fast_path<T: RawFloat>(integral: &[u8], fractional: &[u8], e: i64) -> Option<T> {
|
||||||
let num_digits = integral.len() + fractional.len();
|
let num_digits = integral.len() + fractional.len();
|
||||||
// log_10(f64::max_sig) ~ 15.95. We compare the exact value to max_sig near the end,
|
// log_10(f64::MAX_SIG) ~ 15.95. We compare the exact value to MAX_SIG near the end,
|
||||||
// this is just a quick, cheap rejection (and also frees the rest of the code from
|
// this is just a quick, cheap rejection (and also frees the rest of the code from
|
||||||
// worrying about underflow).
|
// worrying about underflow).
|
||||||
if num_digits > 16 {
|
if num_digits > 16 {
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
if e.abs() >= T::ceil_log5_of_max_sig() as i64 {
|
if e.abs() >= T::CEIL_LOG5_OF_MAX_SIG as i64 {
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
let f = num::from_str_unchecked(integral.iter().chain(fractional.iter()));
|
let f = num::from_str_unchecked(integral.iter().chain(fractional.iter()));
|
||||||
if f > T::max_sig() {
|
if f > T::MAX_SIG {
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -154,14 +154,14 @@ pub fn fast_path<T: RawFloat>(integral: &[u8], fractional: &[u8], e: i64) -> Opt
|
||||||
/// > the best possible approximation that uses p bits of significand.)
|
/// > the best possible approximation that uses p bits of significand.)
|
||||||
pub fn bellerophon<T: RawFloat>(f: &Big, e: i16) -> T {
|
pub fn bellerophon<T: RawFloat>(f: &Big, e: i16) -> T {
|
||||||
let slop;
|
let slop;
|
||||||
if f <= &Big::from_u64(T::max_sig()) {
|
if f <= &Big::from_u64(T::MAX_SIG) {
|
||||||
// The cases abs(e) < log5(2^N) are in fast_path()
|
// The cases abs(e) < log5(2^N) are in fast_path()
|
||||||
slop = if e >= 0 { 0 } else { 3 };
|
slop = if e >= 0 { 0 } else { 3 };
|
||||||
} else {
|
} else {
|
||||||
slop = if e >= 0 { 1 } else { 4 };
|
slop = if e >= 0 { 1 } else { 4 };
|
||||||
}
|
}
|
||||||
let z = rawfp::big_to_fp(f).mul(&power_of_ten(e)).normalize();
|
let z = rawfp::big_to_fp(f).mul(&power_of_ten(e)).normalize();
|
||||||
let exp_p_n = 1 << (P - T::sig_bits() as u32);
|
let exp_p_n = 1 << (P - T::SIG_BITS as u32);
|
||||||
let lowbits: i64 = (z.f % exp_p_n) as i64;
|
let lowbits: i64 = (z.f % exp_p_n) as i64;
|
||||||
// Is the slop large enough to make a difference when
|
// Is the slop large enough to make a difference when
|
||||||
// rounding to n bits?
|
// rounding to n bits?
|
||||||
|
@ -210,14 +210,14 @@ fn algorithm_r<T: RawFloat>(f: &Big, e: i16, z0: T) -> T {
|
||||||
if d2 < y {
|
if d2 < y {
|
||||||
let mut d2_double = d2;
|
let mut d2_double = d2;
|
||||||
d2_double.mul_pow2(1);
|
d2_double.mul_pow2(1);
|
||||||
if m == T::min_sig() && d_negative && d2_double > y {
|
if m == T::MIN_SIG && d_negative && d2_double > y {
|
||||||
z = prev_float(z);
|
z = prev_float(z);
|
||||||
} else {
|
} else {
|
||||||
return z;
|
return z;
|
||||||
}
|
}
|
||||||
} else if d2 == y {
|
} else if d2 == y {
|
||||||
if m % 2 == 0 {
|
if m % 2 == 0 {
|
||||||
if m == T::min_sig() && d_negative {
|
if m == T::MIN_SIG && d_negative {
|
||||||
z = prev_float(z);
|
z = prev_float(z);
|
||||||
} else {
|
} else {
|
||||||
return z;
|
return z;
|
||||||
|
@ -303,12 +303,12 @@ pub fn algorithm_m<T: RawFloat>(f: &Big, e: i16) -> T {
|
||||||
quick_start::<T>(&mut u, &mut v, &mut k);
|
quick_start::<T>(&mut u, &mut v, &mut k);
|
||||||
let mut rem = Big::from_small(0);
|
let mut rem = Big::from_small(0);
|
||||||
let mut x = Big::from_small(0);
|
let mut x = Big::from_small(0);
|
||||||
let min_sig = Big::from_u64(T::min_sig());
|
let min_sig = Big::from_u64(T::MIN_SIG);
|
||||||
let max_sig = Big::from_u64(T::max_sig());
|
let max_sig = Big::from_u64(T::MAX_SIG);
|
||||||
loop {
|
loop {
|
||||||
u.div_rem(&v, &mut x, &mut rem);
|
u.div_rem(&v, &mut x, &mut rem);
|
||||||
if k == T::min_exp_int() {
|
if k == T::MIN_EXP_INT {
|
||||||
// We have to stop at the minimum exponent, if we wait until `k < T::min_exp_int()`,
|
// We have to stop at the minimum exponent, if we wait until `k < T::MIN_EXP_INT`,
|
||||||
// then we'd be off by a factor of two. Unfortunately this means we have to special-
|
// then we'd be off by a factor of two. Unfortunately this means we have to special-
|
||||||
// case normal numbers with the minimum exponent.
|
// case normal numbers with the minimum exponent.
|
||||||
// FIXME find a more elegant formulation, but run the `tiny-pow10` test to make sure
|
// FIXME find a more elegant formulation, but run the `tiny-pow10` test to make sure
|
||||||
|
@ -318,8 +318,8 @@ pub fn algorithm_m<T: RawFloat>(f: &Big, e: i16) -> T {
|
||||||
}
|
}
|
||||||
return underflow(x, v, rem);
|
return underflow(x, v, rem);
|
||||||
}
|
}
|
||||||
if k > T::max_exp_int() {
|
if k > T::MAX_EXP_INT {
|
||||||
return T::infinity2();
|
return T::INFINITY;
|
||||||
}
|
}
|
||||||
if x < min_sig {
|
if x < min_sig {
|
||||||
u.mul_pow2(1);
|
u.mul_pow2(1);
|
||||||
|
@ -345,18 +345,18 @@ fn quick_start<T: RawFloat>(u: &mut Big, v: &mut Big, k: &mut i16) {
|
||||||
// The target ratio is one where u/v is in an in-range significand. Thus our termination
|
// The target ratio is one where u/v is in an in-range significand. Thus our termination
|
||||||
// condition is log2(u / v) being the significand bits, plus/minus one.
|
// condition is log2(u / v) being the significand bits, plus/minus one.
|
||||||
// FIXME Looking at the second bit could improve the estimate and avoid some more divisions.
|
// FIXME Looking at the second bit could improve the estimate and avoid some more divisions.
|
||||||
let target_ratio = T::sig_bits() as i16;
|
let target_ratio = T::SIG_BITS as i16;
|
||||||
let log2_u = u.bit_length() as i16;
|
let log2_u = u.bit_length() as i16;
|
||||||
let log2_v = v.bit_length() as i16;
|
let log2_v = v.bit_length() as i16;
|
||||||
let mut u_shift: i16 = 0;
|
let mut u_shift: i16 = 0;
|
||||||
let mut v_shift: i16 = 0;
|
let mut v_shift: i16 = 0;
|
||||||
assert!(*k == 0);
|
assert!(*k == 0);
|
||||||
loop {
|
loop {
|
||||||
if *k == T::min_exp_int() {
|
if *k == T::MIN_EXP_INT {
|
||||||
// Underflow or subnormal. Leave it to the main function.
|
// Underflow or subnormal. Leave it to the main function.
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if *k == T::max_exp_int() {
|
if *k == T::MAX_EXP_INT {
|
||||||
// Overflow. Leave it to the main function.
|
// Overflow. Leave it to the main function.
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -376,7 +376,7 @@ fn quick_start<T: RawFloat>(u: &mut Big, v: &mut Big, k: &mut i16) {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn underflow<T: RawFloat>(x: Big, v: Big, rem: Big) -> T {
|
fn underflow<T: RawFloat>(x: Big, v: Big, rem: Big) -> T {
|
||||||
if x < Big::from_u64(T::min_sig()) {
|
if x < Big::from_u64(T::MIN_SIG) {
|
||||||
let q = num::to_u64(&x);
|
let q = num::to_u64(&x);
|
||||||
let z = rawfp::encode_subnormal(q);
|
let z = rawfp::encode_subnormal(q);
|
||||||
return round_by_remainder(v, rem, q, z);
|
return round_by_remainder(v, rem, q, z);
|
||||||
|
@ -395,9 +395,9 @@ fn underflow<T: RawFloat>(x: Big, v: Big, rem: Big) -> T {
|
||||||
// needs to be rounded up. Only when the rounded off bits are 1/2 and the remainder
|
// needs to be rounded up. Only when the rounded off bits are 1/2 and the remainder
|
||||||
// is zero, we have a half-to-even situation.
|
// is zero, we have a half-to-even situation.
|
||||||
let bits = x.bit_length();
|
let bits = x.bit_length();
|
||||||
let lsb = bits - T::sig_bits() as usize;
|
let lsb = bits - T::SIG_BITS as usize;
|
||||||
let q = num::get_bits(&x, lsb, bits);
|
let q = num::get_bits(&x, lsb, bits);
|
||||||
let k = T::min_exp_int() + lsb as i16;
|
let k = T::MIN_EXP_INT + lsb as i16;
|
||||||
let z = rawfp::encode_normal(Unpacked::new(q, k));
|
let z = rawfp::encode_normal(Unpacked::new(q, k));
|
||||||
let q_even = q % 2 == 0;
|
let q_even = q % 2 == 0;
|
||||||
match num::compare_with_half_ulp(&x, lsb) {
|
match num::compare_with_half_ulp(&x, lsb) {
|
||||||
|
|
|
@ -214,11 +214,11 @@ fn dec2flt<T: RawFloat>(s: &str) -> Result<T, ParseFloatError> {
|
||||||
let (sign, s) = extract_sign(s);
|
let (sign, s) = extract_sign(s);
|
||||||
let flt = match parse_decimal(s) {
|
let flt = match parse_decimal(s) {
|
||||||
ParseResult::Valid(decimal) => convert(decimal)?,
|
ParseResult::Valid(decimal) => convert(decimal)?,
|
||||||
ParseResult::ShortcutToInf => T::infinity2(),
|
ParseResult::ShortcutToInf => T::INFINITY,
|
||||||
ParseResult::ShortcutToZero => T::zero2(),
|
ParseResult::ShortcutToZero => T::ZERO,
|
||||||
ParseResult::Invalid => match s {
|
ParseResult::Invalid => match s {
|
||||||
"inf" => T::infinity2(),
|
"inf" => T::INFINITY,
|
||||||
"NaN" => T::nan2(),
|
"NaN" => T::NAN,
|
||||||
_ => { return Err(pfe_invalid()); }
|
_ => { return Err(pfe_invalid()); }
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -254,7 +254,7 @@ fn convert<T: RawFloat>(mut decimal: Decimal) -> Result<T, ParseFloatError> {
|
||||||
// FIXME These bounds are rather conservative. A more careful analysis of the failure modes
|
// FIXME These bounds are rather conservative. A more careful analysis of the failure modes
|
||||||
// of Bellerophon could allow using it in more cases for a massive speed up.
|
// of Bellerophon could allow using it in more cases for a massive speed up.
|
||||||
let exponent_in_range = table::MIN_E <= e && e <= table::MAX_E;
|
let exponent_in_range = table::MIN_E <= e && e <= table::MAX_E;
|
||||||
let value_in_range = upper_bound <= T::max_normal_digits() as u64;
|
let value_in_range = upper_bound <= T::MAX_NORMAL_DIGITS as u64;
|
||||||
if exponent_in_range && value_in_range {
|
if exponent_in_range && value_in_range {
|
||||||
Ok(algorithm::bellerophon(&f, e))
|
Ok(algorithm::bellerophon(&f, e))
|
||||||
} else {
|
} else {
|
||||||
|
@ -315,17 +315,17 @@ fn bound_intermediate_digits(decimal: &Decimal, e: i64) -> u64 {
|
||||||
fn trivial_cases<T: RawFloat>(decimal: &Decimal) -> Option<T> {
|
fn trivial_cases<T: RawFloat>(decimal: &Decimal) -> Option<T> {
|
||||||
// There were zeros but they were stripped by simplify()
|
// There were zeros but they were stripped by simplify()
|
||||||
if decimal.integral.is_empty() && decimal.fractional.is_empty() {
|
if decimal.integral.is_empty() && decimal.fractional.is_empty() {
|
||||||
return Some(T::zero2());
|
return Some(T::ZERO);
|
||||||
}
|
}
|
||||||
// This is a crude approximation of ceil(log10(the real value)). We don't need to worry too
|
// This is a crude approximation of ceil(log10(the real value)). We don't need to worry too
|
||||||
// much about overflow here because the input length is tiny (at least compared to 2^64) and
|
// much about overflow here because the input length is tiny (at least compared to 2^64) and
|
||||||
// the parser already handles exponents whose absolute value is greater than 10^18
|
// the parser already handles exponents whose absolute value is greater than 10^18
|
||||||
// (which is still 10^19 short of 2^64).
|
// (which is still 10^19 short of 2^64).
|
||||||
let max_place = decimal.exp + decimal.integral.len() as i64;
|
let max_place = decimal.exp + decimal.integral.len() as i64;
|
||||||
if max_place > T::inf_cutoff() {
|
if max_place > T::INF_CUTOFF {
|
||||||
return Some(T::infinity2());
|
return Some(T::INFINITY);
|
||||||
} else if max_place < T::zero_cutoff() {
|
} else if max_place < T::ZERO_CUTOFF {
|
||||||
return Some(T::zero2());
|
return Some(T::ZERO);
|
||||||
}
|
}
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
|
|
|
@ -56,24 +56,12 @@ impl Unpacked {
|
||||||
///
|
///
|
||||||
/// Should **never ever** be implemented for other types or be used outside the dec2flt module.
|
/// Should **never ever** be implemented for other types or be used outside the dec2flt module.
|
||||||
/// Inherits from `Float` because there is some overlap, but all the reused methods are trivial.
|
/// Inherits from `Float` because there is some overlap, but all the reused methods are trivial.
|
||||||
/// The "methods" (pseudo-constants) with default implementation should not be overriden.
|
|
||||||
pub trait RawFloat : Float + Copy + Debug + LowerExp
|
pub trait RawFloat : Float + Copy + Debug + LowerExp
|
||||||
+ Mul<Output=Self> + Div<Output=Self> + Neg<Output=Self>
|
+ Mul<Output=Self> + Div<Output=Self> + Neg<Output=Self>
|
||||||
{
|
{
|
||||||
// suffix of "2" because Float::infinity is deprecated
|
const INFINITY: Self;
|
||||||
#[allow(deprecated)]
|
const NAN: Self;
|
||||||
fn infinity2() -> Self {
|
const ZERO: Self;
|
||||||
Float::infinity()
|
|
||||||
}
|
|
||||||
|
|
||||||
// suffix of "2" because Float::nan is deprecated
|
|
||||||
#[allow(deprecated)]
|
|
||||||
fn nan2() -> Self {
|
|
||||||
Float::nan()
|
|
||||||
}
|
|
||||||
|
|
||||||
// suffix of "2" because Float::zero is deprecated
|
|
||||||
fn zero2() -> Self;
|
|
||||||
|
|
||||||
// suffix of "2" because Float::integer_decode is deprecated
|
// suffix of "2" because Float::integer_decode is deprecated
|
||||||
#[allow(deprecated)]
|
#[allow(deprecated)]
|
||||||
|
@ -94,94 +82,83 @@ pub trait RawFloat : Float + Copy + Debug + LowerExp
|
||||||
/// represented, the other code in this module makes sure to never let that happen.
|
/// represented, the other code in this module makes sure to never let that happen.
|
||||||
fn from_int(x: u64) -> Self;
|
fn from_int(x: u64) -> Self;
|
||||||
|
|
||||||
/// Get the value 10<sup>e</sup> from a pre-computed table. Panics for e >=
|
/// Get the value 10<sup>e</sup> from a pre-computed table.
|
||||||
/// ceil_log5_of_max_sig().
|
/// Panics for `e >= CEIL_LOG5_OF_MAX_SIG`.
|
||||||
fn short_fast_pow10(e: usize) -> Self;
|
fn short_fast_pow10(e: usize) -> Self;
|
||||||
|
|
||||||
// FIXME Everything that follows should be associated constants, but taking the value of an
|
|
||||||
// associated constant from a type parameter does not work (yet?)
|
|
||||||
// A possible workaround is having a `FloatInfo` struct for all the constants, but so far
|
|
||||||
// the methods aren't painful enough to rewrite.
|
|
||||||
|
|
||||||
/// What the name says. It's easier to hard code than juggling intrinsics and
|
/// What the name says. It's easier to hard code than juggling intrinsics and
|
||||||
/// hoping LLVM constant folds it.
|
/// hoping LLVM constant folds it.
|
||||||
fn ceil_log5_of_max_sig() -> i16;
|
const CEIL_LOG5_OF_MAX_SIG: i16;
|
||||||
|
|
||||||
/// A conservative bound on the decimal digits of inputs that can't produce overflow or zero or
|
/// A conservative bound on the decimal digits of inputs that can't produce overflow or zero or
|
||||||
/// subnormals. Probably the decimal exponent of the maximum normal value, hence the name.
|
/// subnormals. Probably the decimal exponent of the maximum normal value, hence the name.
|
||||||
fn max_normal_digits() -> usize;
|
const MAX_NORMAL_DIGITS: usize;
|
||||||
|
|
||||||
/// When the most significant decimal digit has a place value greater than this, the number
|
/// When the most significant decimal digit has a place value greater than this, the number
|
||||||
/// is certainly rounded to infinity.
|
/// is certainly rounded to infinity.
|
||||||
fn inf_cutoff() -> i64;
|
const INF_CUTOFF: i64;
|
||||||
|
|
||||||
/// When the most significant decimal digit has a place value less than this, the number
|
/// When the most significant decimal digit has a place value less than this, the number
|
||||||
/// is certainly rounded to zero.
|
/// is certainly rounded to zero.
|
||||||
fn zero_cutoff() -> i64;
|
const ZERO_CUTOFF: i64;
|
||||||
|
|
||||||
/// The number of bits in the exponent.
|
/// The number of bits in the exponent.
|
||||||
fn exp_bits() -> u8;
|
const EXP_BITS: u8;
|
||||||
|
|
||||||
/// The number of bits in the significand, *including* the hidden bit.
|
/// The number of bits in the significand, *including* the hidden bit.
|
||||||
fn sig_bits() -> u8;
|
const SIG_BITS: u8;
|
||||||
|
|
||||||
/// The number of bits in the significand, *excluding* the hidden bit.
|
/// The number of bits in the significand, *excluding* the hidden bit.
|
||||||
fn explicit_sig_bits() -> u8 {
|
const EXPLICIT_SIG_BITS: u8;
|
||||||
Self::sig_bits() - 1
|
|
||||||
}
|
|
||||||
|
|
||||||
/// The maximum legal exponent in fractional representation.
|
/// The maximum legal exponent in fractional representation.
|
||||||
fn max_exp() -> i16 {
|
const MAX_EXP: i16;
|
||||||
(1 << (Self::exp_bits() - 1)) - 1
|
|
||||||
}
|
|
||||||
|
|
||||||
/// The minimum legal exponent in fractional representation, excluding subnormals.
|
/// The minimum legal exponent in fractional representation, excluding subnormals.
|
||||||
fn min_exp() -> i16 {
|
const MIN_EXP: i16;
|
||||||
-Self::max_exp() + 1
|
|
||||||
}
|
|
||||||
|
|
||||||
/// `MAX_EXP` for integral representation, i.e., with the shift applied.
|
/// `MAX_EXP` for integral representation, i.e., with the shift applied.
|
||||||
fn max_exp_int() -> i16 {
|
const MAX_EXP_INT: i16;
|
||||||
Self::max_exp() - (Self::sig_bits() as i16 - 1)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// `MAX_EXP` encoded (i.e., with offset bias)
|
/// `MAX_EXP` encoded (i.e., with offset bias)
|
||||||
fn max_encoded_exp() -> i16 {
|
const MAX_ENCODED_EXP: i16;
|
||||||
(1 << Self::exp_bits()) - 1
|
|
||||||
}
|
|
||||||
|
|
||||||
/// `MIN_EXP` for integral representation, i.e., with the shift applied.
|
/// `MIN_EXP` for integral representation, i.e., with the shift applied.
|
||||||
fn min_exp_int() -> i16 {
|
const MIN_EXP_INT: i16;
|
||||||
Self::min_exp() - (Self::sig_bits() as i16 - 1)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// The maximum normalized significand in integral representation.
|
/// The maximum normalized significand in integral representation.
|
||||||
fn max_sig() -> u64 {
|
const MAX_SIG: u64;
|
||||||
(1 << Self::sig_bits()) - 1
|
|
||||||
}
|
|
||||||
|
|
||||||
/// The minimal normalized significand in integral representation.
|
/// The minimal normalized significand in integral representation.
|
||||||
fn min_sig() -> u64 {
|
const MIN_SIG: u64;
|
||||||
1 << (Self::sig_bits() - 1)
|
}
|
||||||
|
|
||||||
|
// Mostly a workaround for #34344.
|
||||||
|
macro_rules! other_constants {
|
||||||
|
($type: ident) => {
|
||||||
|
const EXPLICIT_SIG_BITS: u8 = Self::SIG_BITS - 1;
|
||||||
|
const MAX_EXP: i16 = (1 << (Self::EXP_BITS - 1)) - 1;
|
||||||
|
const MIN_EXP: i16 = -Self::MAX_EXP + 1;
|
||||||
|
const MAX_EXP_INT: i16 = Self::MAX_EXP - (Self::SIG_BITS as i16 - 1);
|
||||||
|
const MAX_ENCODED_EXP: i16 = (1 << Self::EXP_BITS) - 1;
|
||||||
|
const MIN_EXP_INT: i16 = Self::MIN_EXP - (Self::SIG_BITS as i16 - 1);
|
||||||
|
const MAX_SIG: u64 = (1 << Self::SIG_BITS) - 1;
|
||||||
|
const MIN_SIG: u64 = 1 << (Self::SIG_BITS - 1);
|
||||||
|
|
||||||
|
const INFINITY: Self = $crate::$type::INFINITY;
|
||||||
|
const NAN: Self = $crate::$type::NAN;
|
||||||
|
const ZERO: Self = 0.0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl RawFloat for f32 {
|
impl RawFloat for f32 {
|
||||||
fn zero2() -> Self {
|
const SIG_BITS: u8 = 24;
|
||||||
0.0
|
const EXP_BITS: u8 = 8;
|
||||||
}
|
const CEIL_LOG5_OF_MAX_SIG: i16 = 11;
|
||||||
|
const MAX_NORMAL_DIGITS: usize = 35;
|
||||||
fn sig_bits() -> u8 {
|
const INF_CUTOFF: i64 = 40;
|
||||||
24
|
const ZERO_CUTOFF: i64 = -48;
|
||||||
}
|
other_constants!(f32);
|
||||||
|
|
||||||
fn exp_bits() -> u8 {
|
|
||||||
8
|
|
||||||
}
|
|
||||||
|
|
||||||
fn ceil_log5_of_max_sig() -> i16 {
|
|
||||||
11
|
|
||||||
}
|
|
||||||
|
|
||||||
fn transmute(self) -> u64 {
|
fn transmute(self) -> u64 {
|
||||||
let bits: u32 = unsafe { transmute(self) };
|
let bits: u32 = unsafe { transmute(self) };
|
||||||
|
@ -207,37 +184,17 @@ impl RawFloat for f32 {
|
||||||
fn short_fast_pow10(e: usize) -> Self {
|
fn short_fast_pow10(e: usize) -> Self {
|
||||||
table::F32_SHORT_POWERS[e]
|
table::F32_SHORT_POWERS[e]
|
||||||
}
|
}
|
||||||
|
|
||||||
fn max_normal_digits() -> usize {
|
|
||||||
35
|
|
||||||
}
|
|
||||||
|
|
||||||
fn inf_cutoff() -> i64 {
|
|
||||||
40
|
|
||||||
}
|
|
||||||
|
|
||||||
fn zero_cutoff() -> i64 {
|
|
||||||
-48
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
impl RawFloat for f64 {
|
impl RawFloat for f64 {
|
||||||
fn zero2() -> Self {
|
const SIG_BITS: u8 = 53;
|
||||||
0.0
|
const EXP_BITS: u8 = 11;
|
||||||
}
|
const CEIL_LOG5_OF_MAX_SIG: i16 = 23;
|
||||||
|
const MAX_NORMAL_DIGITS: usize = 305;
|
||||||
fn sig_bits() -> u8 {
|
const INF_CUTOFF: i64 = 310;
|
||||||
53
|
const ZERO_CUTOFF: i64 = -326;
|
||||||
}
|
other_constants!(f64);
|
||||||
|
|
||||||
fn exp_bits() -> u8 {
|
|
||||||
11
|
|
||||||
}
|
|
||||||
|
|
||||||
fn ceil_log5_of_max_sig() -> i16 {
|
|
||||||
23
|
|
||||||
}
|
|
||||||
|
|
||||||
fn transmute(self) -> u64 {
|
fn transmute(self) -> u64 {
|
||||||
let bits: u64 = unsafe { transmute(self) };
|
let bits: u64 = unsafe { transmute(self) };
|
||||||
|
@ -262,38 +219,27 @@ impl RawFloat for f64 {
|
||||||
fn short_fast_pow10(e: usize) -> Self {
|
fn short_fast_pow10(e: usize) -> Self {
|
||||||
table::F64_SHORT_POWERS[e]
|
table::F64_SHORT_POWERS[e]
|
||||||
}
|
}
|
||||||
|
|
||||||
fn max_normal_digits() -> usize {
|
|
||||||
305
|
|
||||||
}
|
|
||||||
|
|
||||||
fn inf_cutoff() -> i64 {
|
|
||||||
310
|
|
||||||
}
|
|
||||||
|
|
||||||
fn zero_cutoff() -> i64 {
|
|
||||||
-326
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Convert an Fp to the closest f64. Only handles number that fit into a normalized f64.
|
/// Convert an Fp to the closest machine float type.
|
||||||
|
/// Does not handle subnormal results.
|
||||||
pub fn fp_to_float<T: RawFloat>(x: Fp) -> T {
|
pub fn fp_to_float<T: RawFloat>(x: Fp) -> T {
|
||||||
let x = x.normalize();
|
let x = x.normalize();
|
||||||
// x.f is 64 bit, so x.e has a mantissa shift of 63
|
// x.f is 64 bit, so x.e has a mantissa shift of 63
|
||||||
let e = x.e + 63;
|
let e = x.e + 63;
|
||||||
if e > T::max_exp() {
|
if e > T::MAX_EXP {
|
||||||
panic!("fp_to_float: exponent {} too large", e)
|
panic!("fp_to_float: exponent {} too large", e)
|
||||||
} else if e > T::min_exp() {
|
} else if e > T::MIN_EXP {
|
||||||
encode_normal(round_normal::<T>(x))
|
encode_normal(round_normal::<T>(x))
|
||||||
} else {
|
} else {
|
||||||
panic!("fp_to_float: exponent {} too small", e)
|
panic!("fp_to_float: exponent {} too small", e)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Round the 64-bit significand to 53 bit with half-to-even. Does not handle exponent overflow.
|
/// Round the 64-bit significand to T::SIG_BITS bits with half-to-even.
|
||||||
|
/// Does not handle exponent overflow.
|
||||||
pub fn round_normal<T: RawFloat>(x: Fp) -> Unpacked {
|
pub fn round_normal<T: RawFloat>(x: Fp) -> Unpacked {
|
||||||
let excess = 64 - T::sig_bits() as i16;
|
let excess = 64 - T::SIG_BITS as i16;
|
||||||
let half: u64 = 1 << (excess - 1);
|
let half: u64 = 1 << (excess - 1);
|
||||||
let (q, rem) = (x.f >> excess, x.f & ((1 << excess) - 1));
|
let (q, rem) = (x.f >> excess, x.f & ((1 << excess) - 1));
|
||||||
assert_eq!(q << excess | rem, x.f);
|
assert_eq!(q << excess | rem, x.f);
|
||||||
|
@ -303,8 +249,8 @@ pub fn round_normal<T: RawFloat>(x: Fp) -> Unpacked {
|
||||||
Unpacked::new(q, k)
|
Unpacked::new(q, k)
|
||||||
} else if rem == half && (q % 2) == 0 {
|
} else if rem == half && (q % 2) == 0 {
|
||||||
Unpacked::new(q, k)
|
Unpacked::new(q, k)
|
||||||
} else if q == T::max_sig() {
|
} else if q == T::MAX_SIG {
|
||||||
Unpacked::new(T::min_sig(), k + 1)
|
Unpacked::new(T::MIN_SIG, k + 1)
|
||||||
} else {
|
} else {
|
||||||
Unpacked::new(q + 1, k)
|
Unpacked::new(q + 1, k)
|
||||||
}
|
}
|
||||||
|
@ -313,22 +259,22 @@ pub fn round_normal<T: RawFloat>(x: Fp) -> Unpacked {
|
||||||
/// Inverse of `RawFloat::unpack()` for normalized numbers.
|
/// Inverse of `RawFloat::unpack()` for normalized numbers.
|
||||||
/// Panics if the significand or exponent are not valid for normalized numbers.
|
/// Panics if the significand or exponent are not valid for normalized numbers.
|
||||||
pub fn encode_normal<T: RawFloat>(x: Unpacked) -> T {
|
pub fn encode_normal<T: RawFloat>(x: Unpacked) -> T {
|
||||||
debug_assert!(T::min_sig() <= x.sig && x.sig <= T::max_sig(),
|
debug_assert!(T::MIN_SIG <= x.sig && x.sig <= T::MAX_SIG,
|
||||||
"encode_normal: significand not normalized");
|
"encode_normal: significand not normalized");
|
||||||
// Remove the hidden bit
|
// Remove the hidden bit
|
||||||
let sig_enc = x.sig & !(1 << T::explicit_sig_bits());
|
let sig_enc = x.sig & !(1 << T::EXPLICIT_SIG_BITS);
|
||||||
// Adjust the exponent for exponent bias and mantissa shift
|
// Adjust the exponent for exponent bias and mantissa shift
|
||||||
let k_enc = x.k + T::max_exp() + T::explicit_sig_bits() as i16;
|
let k_enc = x.k + T::MAX_EXP + T::EXPLICIT_SIG_BITS as i16;
|
||||||
debug_assert!(k_enc != 0 && k_enc < T::max_encoded_exp(),
|
debug_assert!(k_enc != 0 && k_enc < T::MAX_ENCODED_EXP,
|
||||||
"encode_normal: exponent out of range");
|
"encode_normal: exponent out of range");
|
||||||
// Leave sign bit at 0 ("+"), our numbers are all positive
|
// Leave sign bit at 0 ("+"), our numbers are all positive
|
||||||
let bits = (k_enc as u64) << T::explicit_sig_bits() | sig_enc;
|
let bits = (k_enc as u64) << T::EXPLICIT_SIG_BITS | sig_enc;
|
||||||
T::from_bits(bits)
|
T::from_bits(bits)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Construct the subnormal. A mantissa of 0 is allowed and constructs zero.
|
/// Construct a subnormal. A mantissa of 0 is allowed and constructs zero.
|
||||||
pub fn encode_subnormal<T: RawFloat>(significand: u64) -> T {
|
pub fn encode_subnormal<T: RawFloat>(significand: u64) -> T {
|
||||||
assert!(significand < T::min_sig(), "encode_subnormal: not actually subnormal");
|
assert!(significand < T::MIN_SIG, "encode_subnormal: not actually subnormal");
|
||||||
// Encoded exponent is 0, the sign bit is 0, so we just have to reinterpret the bits.
|
// Encoded exponent is 0, the sign bit is 0, so we just have to reinterpret the bits.
|
||||||
T::from_bits(significand)
|
T::from_bits(significand)
|
||||||
}
|
}
|
||||||
|
@ -364,8 +310,8 @@ pub fn prev_float<T: RawFloat>(x: T) -> T {
|
||||||
Zero => panic!("prev_float: argument is zero"),
|
Zero => panic!("prev_float: argument is zero"),
|
||||||
Normal => {
|
Normal => {
|
||||||
let Unpacked { sig, k } = x.unpack();
|
let Unpacked { sig, k } = x.unpack();
|
||||||
if sig == T::min_sig() {
|
if sig == T::MIN_SIG {
|
||||||
encode_normal(Unpacked::new(T::max_sig(), k - 1))
|
encode_normal(Unpacked::new(T::MAX_SIG, k - 1))
|
||||||
} else {
|
} else {
|
||||||
encode_normal(Unpacked::new(sig - 1, k))
|
encode_normal(Unpacked::new(sig - 1, k))
|
||||||
}
|
}
|
||||||
|
@ -380,7 +326,7 @@ pub fn prev_float<T: RawFloat>(x: T) -> T {
|
||||||
pub fn next_float<T: RawFloat>(x: T) -> T {
|
pub fn next_float<T: RawFloat>(x: T) -> T {
|
||||||
match x.classify() {
|
match x.classify() {
|
||||||
Nan => panic!("next_float: argument is NaN"),
|
Nan => panic!("next_float: argument is NaN"),
|
||||||
Infinite => T::infinity2(),
|
Infinite => T::INFINITY,
|
||||||
// This seems too good to be true, but it works.
|
// This seems too good to be true, but it works.
|
||||||
// 0.0 is encoded as the all-zero word. Subnormals are 0x000m...m where m is the mantissa.
|
// 0.0 is encoded as the all-zero word. Subnormals are 0x000m...m where m is the mantissa.
|
||||||
// In particular, the smallest subnormal is 0x0...01 and the largest is 0x000F...F.
|
// In particular, the smallest subnormal is 0x0...01 and the largest is 0x000F...F.
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue