diff --git a/crates/std_float/benches/bench_libm.rs b/crates/std_float/benches/bench_libm.rs new file mode 100644 index 00000000000..5e3ceeaffdb --- /dev/null +++ b/crates/std_float/benches/bench_libm.rs @@ -0,0 +1,177 @@ +#![feature(test)] +#![feature(portable_simd)] +#![feature(concat_idents)] + +extern crate test; +use std_float::StdLibm; + +use test::{black_box, Bencher}; + +use core_simd::{f32x16, f32x4, f64x4, f64x8}; + +const N: usize = 1024; + +fn init_f32x4() -> Vec { + vec![f32x4::splat(black_box(0.5)); N / 4] +} + +fn init_f32x16() -> Vec { + vec![f32x16::splat(black_box(0.5)); N / 16] +} + +fn init_f32() -> Vec { + vec![black_box(0.5); N] +} + +fn init_f64x4() -> Vec { + vec![f64x4::splat(black_box(0.5)); N / 4] +} + +fn init_f64x8() -> Vec { + vec![f64x8::splat(black_box(0.5)); N / 8] +} + +fn init_f64() -> Vec { + vec![black_box(1.0); N] +} + +// These fuctions are not inlined to make it easier to check the asm. +// +// Build with: +// +// RUSTFLAGS="-C target-cpu=native --emit asm" cargo bench + +macro_rules! benchmark_libm { + ( + functions ($( + $names : ident, + $functions : expr, + $init : expr + )*) + ) => { + + $( + #[bench] + #[inline(never)] + fn $names(b: &mut Bencher) { + let x = $init; + let mut y = $init; + b.iter(|| { + for (x, y) in x.iter().zip(y.iter_mut()) { + *y = ($functions)(*x); + } + }) + } + )* + } +} + +benchmark_libm! { + functions ( + sin_f32x4, |x : f32x4| x.sin(), init_f32x4() + sin_f32x16, |x : f32x16| x.sin(), init_f32x16() + sin_f32, |x : f32| x.sin(), init_f32() + sin_f64x4, |x : f64x4| x.sin(), init_f64x4() + sin_f64x8, |x : f64x8| x.sin(), init_f64x8() + sin_f64, |x : f64| x.sin(), init_f64() + ) +} + +benchmark_libm! { + functions ( + cos_f32x4, |x : f32x4| x.cos(), init_f32x4() + cos_f32x16, |x : f32x16| x.cos(), init_f32x16() + cos_f32, |x : f32| x.cos(), init_f32() + cos_f64x4, |x : f64x4| x.cos(), init_f64x4() + cos_f64x8, |x : f64x8| x.cos(), init_f64x8() + cos_f64, |x : f64| x.cos(), init_f64() + ) +} + +benchmark_libm! { + functions ( + tan_f32x4, |x : f32x4| x.tan(), init_f32x4() + tan_f32x16, |x : f32x16| x.tan(), init_f32x16() + tan_f32, |x : f32| x.tan(), init_f32() + tan_f64x4, |x : f64x4| x.tan(), init_f64x4() + tan_f64x8, |x : f64x8| x.tan(), init_f64x8() + tan_f64, |x : f64| x.tan(), init_f64() + ) +} + +benchmark_libm! { + functions ( + asin_f32x4, |x : f32x4| x.asin(), init_f32x4() + asin_f32x16, |x : f32x16| x.asin(), init_f32x16() + asin_f32, |x : f32| x.asin(), init_f32() + asin_f64x4, |x : f64x4| x.asin(), init_f64x4() + asin_f64x8, |x : f64x8| x.asin(), init_f64x8() + asin_f64, |x : f64| x.asin(), init_f64() + ) +} + +benchmark_libm! { + functions ( + acos_f32x4, |x : f32x4| x.acos(), init_f32x4() + acos_f32x16, |x : f32x16| x.acos(), init_f32x16() + acos_f32, |x : f32| x.acos(), init_f32() + acos_f64x4, |x : f64x4| x.acos(), init_f64x4() + acos_f64x8, |x : f64x8| x.acos(), init_f64x8() + acos_f64, |x : f64| x.acos(), init_f64() + ) +} + +benchmark_libm! { + functions ( + atan_f32x4, |x : f32x4| x.atan(), init_f32x4() + atan_f32x16, |x : f32x16| x.atan(), init_f32x16() + atan_f32, |x : f32| x.atan(), init_f32() + atan_f64x4, |x : f64x4| x.atan(), init_f64x4() + atan_f64x8, |x : f64x8| x.atan(), init_f64x8() + atan_f64, |x : f64| x.atan(), init_f64() + ) +} + +benchmark_libm! { + functions ( + exp2_f32x4, |x : f32x4| x.exp2(), init_f32x4() + exp2_f32x16, |x : f32x16| x.exp2(), init_f32x16() + exp2_f32, |x : f32| x.exp2(), init_f32() + exp2_f64x4, |x : f64x4| x.exp2(), init_f64x4() + exp2_f64x8, |x : f64x8| x.exp2(), init_f64x8() + exp2_f64, |x : f64| x.exp2(), init_f64() + ) +} + +benchmark_libm! { + functions ( + exp_f32x4, |x : f32x4| x.exp(), init_f32x4() + exp_f32x16, |x : f32x16| x.exp(), init_f32x16() + exp_f32, |x : f32| x.exp(), init_f32() + exp_f64x4, |x : f64x4| x.exp(), init_f64x4() + exp_f64x8, |x : f64x8| x.exp(), init_f64x8() + exp_f64, |x : f64| x.exp(), init_f64() + ) +} + +benchmark_libm! { + functions ( + log2_f32x4, |x : f32x4| x.log2(), init_f32x4() + log2_f32x16, |x : f32x16| x.log2(), init_f32x16() + log2_f32, |x : f32| x.log2(), init_f32() + log2_f64x4, |x : f64x4| x.log2(), init_f64x4() + log2_f64x8, |x : f64x8| x.log2(), init_f64x8() + log2_f64, |x : f64| x.log2(), init_f64() + ) +} + +benchmark_libm! { + functions ( + ln_f32x4, |x : f32x4| x.ln(), init_f32x4() + ln_f32x16, |x : f32x16| x.ln(), init_f32x16() + ln_f32, |x : f32| x.ln(), init_f32() + ln_f64x4, |x : f64x4| x.ln(), init_f64x4() + ln_f64x8, |x : f64x8| x.ln(), init_f64x8() + ln_f64, |x : f64| x.ln(), init_f64() + ) +} diff --git a/crates/std_float/src/lib.rs b/crates/std_float/src/lib.rs index 4bd4d4c05e3..6bfd91d4730 100644 --- a/crates/std_float/src/lib.rs +++ b/crates/std_float/src/lib.rs @@ -11,6 +11,12 @@ use core_simd::simd; use simd::{LaneCount, Simd, SupportedLaneCount}; +mod libm32; +mod libm64; + +#[cfg(test)] +mod test_libm; + #[cfg(feature = "as_crate")] mod experimental { pub trait Sealed {} @@ -117,6 +123,102 @@ pub trait StdFloat: Sealed + Sized { fn fract(self) -> Self; } +pub trait StdLibm: StdFloat { + /// Signed integer type with the same number of bits as this floating point type. + type IntType; + + /// Unsigned integer type with the same number of bits as this floating point type. + type UintType; + + /// Computes the sine of a number (in radians). + fn sin(self) -> Self; + + /// Computes the cosine of a number (in radians). + fn cos(self) -> Self; + + /// Computes the tangent of a number (in radians). + fn tan(self) -> Self; + + /// Computes the arcsine of a number. Return value is in radians in + /// the range [-pi/2, pi/2] or NaN if the number is outside the range + /// [-1, 1]. + fn asin(self) -> Self; + + /// Computes the arccosine of a number. Return value is in radians in + /// the range [0, pi] or NaN if the number is outside the range + /// [-1, 1]. + fn acos(self) -> Self; + + /// Computes the arctangent of a number. Return value is in radians in the + /// range [-pi/2, pi/2]; + fn atan(self) -> Self; + + /// Computes the four quadrant arctangent of `self` (`y`) and `other` (`x`) in radians. + /// + /// * `x = 0`, `y = 0`: `0` + /// * `x >= 0`: `arctan(y/x)` -> `[-pi/2, pi/2]` + /// * `y >= 0`: `arctan(y/x) + pi` -> `(pi/2, pi]` + /// * `y < 0`: `arctan(y/x) - pi` -> `(-pi, -pi/2)` + fn atan2(self, x: Self) -> Self; + + /// Returns `2^(self)`. + fn exp2(self) -> Self; + + /// Returns `e^(self)`, (the exponential function). + fn exp(self) -> Self; + + /// Returns `e^(self) - 1` in a way that is accurate even if the + /// number is close to zero. + fn exp_m1(self) -> Self; + + /// Returns the base 2 logarithm of the number. + fn log2(self) -> Self; + + /// Returns `ln(1+n)` (natural logarithm) more accurately than if + /// the operations were performed separately. + fn ln_1p(self) -> Self; + + /// Returns the natural logarithm of the number. + fn ln(self) -> Self; + + /// Returns the base 10 logarithm of the number. + fn log10(self) -> Self; + + /// Returns the logarithm of the number with respect to an arbitrary base. + fn log(self, base: Self) -> Self; + + /// Raises a number to a floating point power. + fn powf(self, y: Self) -> Self; + + /// Raises a number to an integer power. + fn powi(self, y: Self::IntType) -> Self; + + /// Hyperbolic sine function. + fn sinh(self) -> Self; + + /// Hyperbolic cosine function. + fn cosh(self) -> Self; + + /// Hyperbolic tangent function. + fn tanh(self) -> Self; + + /// Inverse hyperbolic sine function. + fn asinh(self) -> Self; + + /// Inverse hyperbolic cosine function. + fn acosh(self) -> Self; + + /// Inverse hyperbolic tangent function. + fn atanh(self) -> Self; + + /// Returns the cube root of a number. + fn cbrt(self) -> Self; + + /// Calculates the length of the hypotenuse of a right-angle triangle given + /// legs of length `x` and `y`. + fn hypot(self, other: Self) -> Self; +} + impl Sealed for Simd where LaneCount: SupportedLaneCount {} impl Sealed for Simd where LaneCount: SupportedLaneCount {} @@ -161,5 +263,6 @@ mod tests { let _xfma = x.mul_add(x, x); let _xsqrt = x.sqrt(); let _ = x2.abs() * x2; + let _ = x.sin(); } } diff --git a/crates/std_float/src/libm32.rs b/crates/std_float/src/libm32.rs new file mode 100644 index 00000000000..41c19d5a9cf --- /dev/null +++ b/crates/std_float/src/libm32.rs @@ -0,0 +1,324 @@ +#![allow(non_snake_case)] +#![allow(clippy::excessive_precision)] +#![allow(clippy::approx_constant)] +use super::StdLibm; + +use super::StdFloat; + +use super::simd::{LaneCount, Simd, SupportedLaneCount}; + +impl StdLibm for Simd +where + LaneCount: SupportedLaneCount, +{ + type IntType = Simd; + type UintType = Simd; + #[inline] + fn sinh(self) -> Self { + let LOG2_E =Self ::splat (1.442695040888963407359769137464649992339735961996202908859290566914912486673985594186422766333708408); + let x = self; + let a = x.mul_add(LOG2_E, -Self::splat(1.0)); + let b = x.mul_add(-LOG2_E, -Self::splat(1.0)); + (a).exp2() - (b).exp2() + } + #[inline] + fn cosh(self) -> Self { + let LOG2_E =Self ::splat (1.442695040888963407359769137464649992339735961996202908859290566914912486673985594186422766333708408); + let x = self; + let a = x.mul_add(LOG2_E, -Self::splat(1.0)); + let b = x.mul_add(-LOG2_E, -Self::splat(1.0)); + (a).exp2() + (b).exp2() + } + #[inline] + fn tanh(self) -> Self { + let LOG2_E =Self ::splat (1.442695040888963407359769137464649992339735961996202908859290566914912486673985594186422766333708408); + let x = self; + let exp2x = (x * (LOG2_E * Self::splat(2.0))).exp2(); + (exp2x - Self::splat(1.0)) / (exp2x + Self::splat(1.0)) + } + #[inline] + fn asinh(self) -> Self { + let x = self; + (x + (x * x + Self::splat(1.0)).sqrt()).ln() + } + #[inline] + fn acosh(self) -> Self { + let NAN = Self::splat(f32::NAN); + let x = self; + ((x).lanes_lt(Self::splat(1.0))).select(NAN, (x + (x * x - Self::splat(1.0)).sqrt()).ln()) + } + #[inline] + fn atanh(self) -> Self { + let x = self; + ((Self::splat(1.0) + x).ln() - (Self::splat(1.0) - x).ln()) * Self::splat(0.5) + } + #[inline] + fn asin(self) -> Self { + let PI_BY_2 = Self::splat(1.57079632679489661923); + let arg = self; + let LIM = Self::splat(0.70710678118654752440); + let c = ((arg).lanes_lt(Self::splat(0.0))).select(-PI_BY_2, PI_BY_2); + let s = ((arg).lanes_lt(Self::splat(0.0))).select(-Self::splat(1.0), Self::splat(1.0)); + let x = + ((arg * arg).lanes_lt(LIM * LIM)).select(arg, (Self::splat(1.0) - arg * arg).sqrt()); + let y = (Self::splat(0.12778643f32)) + .mul_add(x * x, -Self::splat(0.12145509f32)) + .mul_add(x * x, Self::splat(0.09684546f32)) + .mul_add(x * x, Self::splat(0.009571692f32)) + .mul_add(x * x, Self::splat(0.047712374f32)) + .mul_add(x * x, Self::splat(0.07478066f32)) + .mul_add(x * x, Self::splat(0.1666726f32)) + .mul_add(x * x, Self::splat(1f32)) + * x; + ((arg * arg).lanes_lt(LIM * LIM)).select(y, c - y * s) + } + #[inline] + fn acos(self) -> Self { + let PI_BY_2 = Self::splat(1.57079632679489661923); + let PI = Self::splat(3.14159265358979323846); + let arg = self; + let LIM = Self::splat(0.70710678118654752440); + let c = ((arg).lanes_lt(Self::splat(0.0))).select(PI, Self::splat(0.0)); + let s = ((arg).lanes_lt(Self::splat(0.0))).select(Self::splat(1.0), -Self::splat(1.0)); + let x = + ((arg * arg).lanes_lt(LIM * LIM)).select(arg, (Self::splat(1.0) - arg * arg).sqrt()); + let y = (Self::splat(0.12778643f32)) + .mul_add(x * x, -Self::splat(0.12145509f32)) + .mul_add(x * x, Self::splat(0.09684546f32)) + .mul_add(x * x, Self::splat(0.009571692f32)) + .mul_add(x * x, Self::splat(0.047712374f32)) + .mul_add(x * x, Self::splat(0.07478066f32)) + .mul_add(x * x, Self::splat(0.1666726f32)) + .mul_add(x * x, Self::splat(1f32)) + * x; + ((arg * arg).lanes_lt(LIM * LIM)).select(PI_BY_2 - y, c - y * s) + } + #[inline] + fn atan(self) -> Self { + let PI_BY_2 = Self::splat(1.57079632679489661923); + let arg = self; + let LIM = Self::splat(1.0); + let c = ((arg).lanes_lt(Self::splat(0.0))).select(-PI_BY_2, PI_BY_2); + let x = ((arg.abs()).lanes_lt(LIM)).select(arg, arg.recip()); + let y = (-Self::splat(0.0039602574f32)) + .mul_add(x * x, Self::splat(0.021659138f32)) + .mul_add(x * x, -Self::splat(0.05587457f32)) + .mul_add(x * x, Self::splat(0.09664151f32)) + .mul_add(x * x, -Self::splat(0.13930209f32)) + .mul_add(x * x, Self::splat(0.19954468f32)) + .mul_add(x * x, -Self::splat(0.33331004f32)) + .mul_add(x * x, Self::splat(0.9999998f32)) + * x; + ((arg.abs()).lanes_lt(LIM)).select(y, c - y) + } + #[inline] + fn atan2(self, x: Self) -> Self { + let PI_BY_8 = Self::splat(0.39269908169872415481); + let TAN_PI_BY_8 =Self ::splat (0.4142135623730950488038860984269714564297583406547512313284044241429748710410687354958479747260924362); + let PI_BY_2 = Self::splat(1.57079632679489661923); + let PI = Self::splat(3.14159265358979323846); + let y = self; + let x1 = x.abs(); + let y1 = y.abs(); + let x2 = ((x1).lanes_gt(y1)).select(x1, y1); + let y2 = ((x1).lanes_gt(y1)).select(y1, x1); + let x3 = TAN_PI_BY_8 * y2 + x2; + let y3 = (-TAN_PI_BY_8) * x2 + y2; + let z = y3 / x3; + let a = (-Self::splat(0.036384862f32)) + .mul_add(z * z, Self::splat(0.06897726f32)) + .mul_add(z * z, -Self::splat(0.08974458f32)) + .mul_add(z * z, Self::splat(0.11101332f32)) + .mul_add(z * z, -Self::splat(0.14285259f32)) + .mul_add(z * z, Self::splat(0.1999999f32)) + .mul_add(z * z, -Self::splat(0.33333334f32)) + .mul_add(z * z, Self::splat(1f32)) + * z + + PI_BY_8; + let q1 = ((x1).lanes_gt(y1)).select(a, PI_BY_2 - a); + let q12 = ((x).lanes_ge(Self::splat(0.0))).select(q1, PI - q1); + ((y).lanes_ge(Self::splat(0.0))).select(q12, -q12) + } + #[inline] + fn exp2(self) -> Self { + let EXP2_MAX = Self::splat(127.0f32); + let EXP2_MIN = -Self::splat(127.0f32); + let EXP2_SCALE = Self::splat(8388608.0f32); + let EXP2_ONE = Self::splat(1065353216.0f32); + let INFINITY = Self::splat(f32::INFINITY); + let arg = self; + let r = arg.round(); + let mul = Self::from_bits((r.mul_add(EXP2_SCALE, EXP2_ONE)).cast::()); + let x = arg - r; + let y = (Self::splat(0.000015310081f32)) + .mul_add(x, Self::splat(0.0001547802f32)) + .mul_add(x, Self::splat(0.0013333454f32)) + .mul_add(x, Self::splat(0.009617995f32)) + .mul_add(x, Self::splat(0.05550411f32)) + .mul_add(x, Self::splat(0.24022652f32)) + .mul_add(x, Self::splat(0.6931472f32)) + .mul_add(x, Self::splat(1f32)) + * mul; + let y1 = ((arg).lanes_gt(EXP2_MAX)).select(INFINITY, y); + ((r).lanes_lt(EXP2_MIN)).select(Self::splat(0.0), y1) + } + #[inline] + fn exp(self) -> Self { + let LOG2_E =Self ::splat (1.442695040888963407359769137464649992339735961996202908859290566914912486673985594186422766333708408); + let arg = self; + (arg * LOG2_E).exp2() + } + #[inline] + fn exp_m1(self) -> Self { + let LOG2_E =Self ::splat (1.442695040888963407359769137464649992339735961996202908859290566914912486673985594186422766333708408); + let EXP2_SCALE = Self::splat(8388608.0f32); + let EXP2_ONE = Self::splat(1065353216.0f32); + let arg = self; + let scaled = arg * LOG2_E; + let r = scaled.round(); + let mul = Self::from_bits((r.mul_add(EXP2_SCALE, EXP2_ONE)).cast::()); + let x = scaled - r; + (Self::splat(0.000015310081f32)) + .mul_add(x, Self::splat(0.0001547802f32)) + .mul_add(x, Self::splat(0.0013333454f32)) + .mul_add(x, Self::splat(0.009617995f32)) + .mul_add(x, Self::splat(0.05550411f32)) + .mul_add(x, Self::splat(0.24022652f32)) + .mul_add(x, Self::splat(0.6931472f32)) + .mul_add(x, -Self::splat(0.00000000008090348f32)) + * mul + + (mul - Self::splat(1.0)) + } + #[inline] + fn log2(self) -> Self { + let ONE_BITS = Self::UintType::splat(0x3f800000_u32); + let ONE_MASK = Self::UintType::splat(0x007fffff_u32); + let MIN_POSITIVE = Self::splat(f32::MIN_POSITIVE); + let INFINITY = Self::splat(f32::INFINITY); + let NAN = Self::splat(f32::NAN); + let LOG2_OFFSET = Self::IntType::splat(127_i32); + let LOG2_SHIFT = Self::IntType::splat(23_i32); + let arg = self; + let arg_bits = arg.to_bits(); + let exponent = (arg_bits.cast::() >> LOG2_SHIFT) - LOG2_OFFSET; + let x = Self::from_bits((arg_bits & ONE_MASK) | ONE_BITS) - Self::splat(1.5); + let y = (Self::splat(0.005413892f32)) + .mul_add(x, -Self::splat(0.009083694f32)) + .mul_add(x, Self::splat(0.011741556f32)) + .mul_add(x, -Self::splat(0.020581678f32)) + .mul_add(x, Self::splat(0.038030382f32)) + .mul_add(x, -Self::splat(0.07129922f32)) + .mul_add(x, Self::splat(0.14248715f32)) + .mul_add(x, -Self::splat(0.32059687f32)) + .mul_add(x, Self::splat(0.9617967f32)) + .mul_add(x, Self::splat(0.5849625f32)); + ((arg).lanes_lt(Self::splat(0.0))).select( + -NAN, + ((arg).lanes_lt(MIN_POSITIVE)).select(-INFINITY, y + (exponent.cast::())), + ) + } + #[inline] + fn ln_1p(self) -> Self { + let arg = self; + (Self::splat(1.0) + arg).ln() + } + #[inline] + fn ln(self) -> Self { + let RECIP_LOG2_E = Self::splat(0.69314718055994530942); + let arg = self; + (arg).log2() * RECIP_LOG2_E + } + #[inline] + fn log10(self) -> Self { + let RECIP_LOG2_10 = Self::splat(0.30102999566398119521); + let arg = self; + (arg).log2() * RECIP_LOG2_10 + } + #[inline] + fn log(self, base: Self) -> Self { + let arg = self; + (arg).log2() / (base).log2() + } + #[inline] + fn powf(self, y: Self) -> Self { + let arg = self; + ((arg).log2() * y).exp2() + } + #[inline] + fn powi(self, y: Self::IntType) -> Self { + let x = self; + (x).powf(y.cast::()) + } + #[inline] + fn cbrt(self) -> Self { + let TWO_THIRDS = Self::splat(0.66666667f32); + let ONE_THIRD = Self::splat(0.33333333f32); + let EXP2_ONE = Self::splat(1065353216.0f32); + let x = self; + let r = Self::from_bits( + ((x.abs().to_bits().cast::()).mul_add(ONE_THIRD, EXP2_ONE * TWO_THIRDS)) + .cast::(), + ); + let r = r + (x.abs() - r * r * r) / (Self::splat(3.0) * r * r); + let r = r + (x.abs() - r * r * r) / (Self::splat(3.0) * r * r); + let r = r + (x.abs() - r * r * r) / (Self::splat(3.0) * r * r); + let r = r + (x.abs() - r * r * r) / (Self::splat(3.0) * r * r); + r.copysign(x) + } + #[inline] + fn hypot(self, y: Self) -> Self { + let MIN_POSITIVE = Self::splat(f32::MIN_POSITIVE); + let x = self; + let xgty = (x.abs()).lanes_gt(y.abs()); + let x2 = (xgty).select(x, y); + let y2 = (xgty).select(y, x); + ((x2.abs()).lanes_le(MIN_POSITIVE)).select( + x2, + x2.abs() * (Self::splat(1.0) + (y2 / x2) * (y2 / x2)).sqrt(), + ) + } + #[inline] + fn sin(self) -> Self { + let RECIP_2PI = Self::splat(0.15915494309189533577); + let arg = self; + let scaled = arg * RECIP_2PI; + let x = scaled - scaled.round(); + (-Self::splat(12.26886f32)) + .mul_add(x * x, Self::splat(41.21624f32)) + .mul_add(x * x, -Self::splat(76.58672f32)) + .mul_add(x * x, Self::splat(81.59746f32)) + .mul_add(x * x, -Self::splat(41.34151f32)) + .mul_add(x * x, Self::splat(6.2831845f32)) + * x + } + #[inline] + fn cos(self) -> Self { + let RECIP_2PI = Self::splat(0.15915494309189533577); + let arg = self; + let scaled = arg * RECIP_2PI; + let x = scaled - scaled.round(); + (Self::splat(6.5286584f32)) + .mul_add(x * x, -Self::splat(25.973276f32)) + .mul_add(x * x, Self::splat(60.17118f32)) + .mul_add(x * x, -Self::splat(85.45092f32)) + .mul_add(x * x, Self::splat(64.939186f32)) + .mul_add(x * x, -Self::splat(19.739206f32)) + .mul_add(x * x, Self::splat(1f32)) + } + #[inline] + fn tan(self) -> Self { + let RECIP_PI = Self::splat(0.31830988618379067154); + let arg = self; + let scaled = arg * RECIP_PI; + let x = scaled - scaled.round(); + let recip = Self::splat(1.0) / (x * x - Self::splat(0.25)); + let y = (Self::splat(0.014397301f32)) + .mul_add(x * x, Self::splat(0.021017345f32)) + .mul_add(x * x, Self::splat(0.05285888f32)) + .mul_add(x * x, Self::splat(0.13475448f32)) + .mul_add(x * x, Self::splat(0.55773664f32)) + .mul_add(x * x, -Self::splat(0.7853982f32)) + * x; + y * recip + } +} diff --git a/crates/std_float/src/libm64.rs b/crates/std_float/src/libm64.rs new file mode 100644 index 00000000000..314a89c180e --- /dev/null +++ b/crates/std_float/src/libm64.rs @@ -0,0 +1,399 @@ +#![allow(non_snake_case)] +#![allow(clippy::excessive_precision)] +#![allow(clippy::approx_constant)] +use super::StdLibm; + +use super::StdFloat; + +use super::simd::{LaneCount, Simd, SupportedLaneCount}; + +impl StdLibm for Simd +where + LaneCount: SupportedLaneCount, +{ + type IntType = Simd; + type UintType = Simd; + #[inline] + fn sinh(self) -> Self { + let LOG2_E =Self ::splat (1.442695040888963407359924681001892137426660660756662389692043961734752676409787833023288579071980359); + let x = self; + let a = x.mul_add(LOG2_E, -Self::splat(1.0)); + let b = x.mul_add(-LOG2_E, -Self::splat(1.0)); + (a).exp2() - (b).exp2() + } + #[inline] + fn cosh(self) -> Self { + let LOG2_E =Self ::splat (1.442695040888963407359924681001892137426660660756662389692043961734752676409787833023288579071980359); + let x = self; + let a = x.mul_add(LOG2_E, -Self::splat(1.0)); + let b = x.mul_add(-LOG2_E, -Self::splat(1.0)); + (a).exp2() + (b).exp2() + } + #[inline] + fn tanh(self) -> Self { + let LOG2_E =Self ::splat (1.442695040888963407359924681001892137426660660756662389692043961734752676409787833023288579071980359); + let x = self; + let exp2x = (x * (LOG2_E * Self::splat(2.0))).exp2(); + (exp2x - Self::splat(1.0)) / (exp2x + Self::splat(1.0)) + } + #[inline] + fn asinh(self) -> Self { + let x = self; + (x + (x * x + Self::splat(1.0)).sqrt()).ln() + } + #[inline] + fn acosh(self) -> Self { + let NAN = Self::splat(f64::NAN); + let x = self; + ((x).lanes_lt(Self::splat(1.0))).select(NAN, (x + (x * x - Self::splat(1.0)).sqrt()).ln()) + } + #[inline] + fn atanh(self) -> Self { + let x = self; + ((Self::splat(1.0) + x).ln() - (Self::splat(1.0) - x).ln()) * Self::splat(0.5) + } + #[inline] + fn asin(self) -> Self { + let PI_BY_2 = Self::splat(1.5707963267948966192313216916397514420986); + let arg = self; + let LIM = Self::splat(0.70710678118654752440); + let c = ((arg).lanes_lt(Self::splat(0.0))).select(-PI_BY_2, PI_BY_2); + let s = ((arg).lanes_lt(Self::splat(0.0))).select(-Self::splat(1.0), Self::splat(1.0)); + let x = + ((arg * arg).lanes_lt(LIM * LIM)).select(arg, (Self::splat(1.0) - arg * arg).sqrt()); + let y = (Self::splat(0.8373648093412319f64)) + .mul_add(x * x, -Self::splat(2.980106295592163f64)) + .mul_add(x * x, Self::splat(5.042442367613399f64)) + .mul_add(x * x, -Self::splat(5.227353021050702f64)) + .mul_add(x * x, Self::splat(3.7146677455757984f64)) + .mul_add(x * x, -Self::splat(1.8827672802928515f64)) + .mul_add(x * x, Self::splat(0.7180951142924303f64)) + .mul_add(x * x, -Self::splat(0.19178725657932066f64)) + .mul_add(x * x, Self::splat(0.05210781979238637f64)) + .mul_add(x * x, Self::splat(0.00485554931570699f64)) + .mul_add(x * x, Self::splat(0.014746118856810628f64)) + .mul_add(x * x, Self::splat(0.017287003548468568f64)) + .mul_add(x * x, Self::splat(0.022376015418082405f64)) + .mul_add(x * x, Self::splat(0.030381795054318782f64)) + .mul_add(x * x, Self::splat(0.04464286065908419f64)) + .mul_add(x * x, Self::splat(0.07499999995639162f64)) + .mul_add(x * x, Self::splat(0.1666666666668809f64)) + .mul_add(x * x, Self::splat(0.9999999999999997f64)) + * x; + ((arg * arg).lanes_lt(LIM * LIM)).select(y, c - y * s) + } + #[inline] + fn acos(self) -> Self { + let PI_BY_2 = Self::splat(1.5707963267948966192313216916397514420986); + let PI = Self::splat(3.1415926535897932384626433832795028841972); + let arg = self; + let LIM = Self::splat(0.70710678118654752440); + let c = ((arg).lanes_lt(Self::splat(0.0))).select(PI, Self::splat(0.0)); + let s = ((arg).lanes_lt(Self::splat(0.0))).select(Self::splat(1.0), -Self::splat(1.0)); + let x = + ((arg * arg).lanes_lt(LIM * LIM)).select(arg, (Self::splat(1.0) - arg * arg).sqrt()); + let y = (Self::splat(0.6668533325236312f64)) + .mul_add(x * x, -Self::splat(2.203633342583737f64)) + .mul_add(x * x, Self::splat(3.4682293590554205f64)) + .mul_add(x * x, -Self::splat(3.31825365991194f64)) + .mul_add(x * x, Self::splat(2.1679686827931266f64)) + .mul_add(x * x, -Self::splat(0.9934711561764131f64)) + .mul_add(x * x, Self::splat(0.34673516466685284f64)) + .mul_add(x * x, -Self::splat(0.07465114063751678f64)) + .mul_add(x * x, Self::splat(0.02708987879711642f64)) + .mul_add(x * x, Self::splat(0.011875258490214528f64)) + .mul_add(x * x, Self::splat(0.01755397524017199f64)) + .mul_add(x * x, Self::splat(0.022358737646075745f64)) + .mul_add(x * x, Self::splat(0.03038253331569182f64)) + .mul_add(x * x, Self::splat(0.04464284149373235f64)) + .mul_add(x * x, Self::splat(0.07500000021866425f64)) + .mul_add(x * x, Self::splat(0.16666666666545776f64)) + .mul_add(x * x, Self::splat(1.000000000000001f64)) + * x; + ((arg * arg).lanes_lt(LIM * LIM)).select(PI_BY_2 - y, c - y * s) + } + #[inline] + fn atan(self) -> Self { + let PI_BY_2 = Self::splat(1.5707963267948966192313216916397514420986); + let arg = self; + let LIM = Self::splat(1.0); + let c = ((arg).lanes_lt(Self::splat(0.0))).select(-PI_BY_2, PI_BY_2); + let x = ((arg.abs()).lanes_lt(LIM)).select(arg, arg.recip()); + let y = (-Self::splat(0.000039339860635465445f64)) + .mul_add(x * x, Self::splat(0.0004066164434836197f64)) + .mul_add(x * x, -Self::splat(0.001986001768572495f64)) + .mul_add(x * x, Self::splat(0.006143174006145858f64)) + .mul_add(x * x, -Self::splat(0.013667536945096575f64)) + .mul_add(x * x, Self::splat(0.023696745325204483f64)) + .mul_add(x * x, -Self::splat(0.03413639435272701f64)) + .mul_add(x * x, Self::splat(0.043317460873511335f64)) + .mul_add(x * x, -Self::splat(0.05106904370972279f64)) + .mul_add(x * x, Self::splat(0.058384099848191776f64)) + .mul_add(x * x, -Self::splat(0.0665730796562759f64)) + .mul_add(x * x, Self::splat(0.07690840682218662f64)) + .mul_add(x * x, -Self::splat(0.09090746301914292f64)) + .mul_add(x * x, Self::splat(0.11111099019519444f64)) + .mul_add(x * x, -Self::splat(0.1428571373381894f64)) + .mul_add(x * x, Self::splat(0.19999999986592576f64)) + .mul_add(x * x, -Self::splat(0.3333333333320309f64)) + .mul_add(x * x, Self::splat(0.9999999999999978f64)) + * x; + ((arg.abs()).lanes_lt(LIM)).select(y, c - y) + } + #[inline] + fn atan2(self, x: Self) -> Self { + let PI_BY_8 = Self::splat(0.3926990816987241548078304229099378605247); + let TAN_PI_BY_8 =Self ::splat (0.4142135623730950488016887242096980785697285106532352473741716556853198927224463663590653778617382114); + let PI_BY_2 = Self::splat(1.5707963267948966192313216916397514420986); + let PI = Self::splat(3.1415926535897932384626433832795028841972); + let y = self; + let x1 = x.abs(); + let y1 = y.abs(); + let x2 = ((x1).lanes_gt(y1)).select(x1, y1); + let y2 = ((x1).lanes_gt(y1)).select(y1, x1); + let x3 = TAN_PI_BY_8 * y2 + x2; + let y3 = (-TAN_PI_BY_8) * x2 + y2; + let z = y3 / x3; + let a = (-Self::splat(0.006954938736046998f64)) + .mul_add(z * z, Self::splat(0.018449085999592458f64)) + .mul_add(z * z, -Self::splat(0.027729176254205356f64)) + .mul_add(z * z, Self::splat(0.0332359412804085f64)) + .mul_add(z * z, -Self::splat(0.03678285477572582f64)) + .mul_add(z * z, Self::splat(0.03996091862436943f64)) + .mul_add(z * z, -Self::splat(0.043473681490793725f64)) + .mul_add(z * z, Self::splat(0.04761863716846753f64)) + .mul_add(z * z, -Self::splat(0.05263155087567416f64)) + .mul_add(z * z, Self::splat(0.05882352795939899f64)) + .mul_add(z * z, -Self::splat(0.06666666661070236f64)) + .mul_add(z * z, Self::splat(0.07692307692150929f64)) + .mul_add(z * z, -Self::splat(0.0909090909090601f64)) + .mul_add(z * z, Self::splat(0.1111111111111107f64)) + .mul_add(z * z, -Self::splat(0.14285714285714282f64)) + .mul_add(z * z, Self::splat(0.19999999999999998f64)) + .mul_add(z * z, -Self::splat(0.33333333333333326f64)) + .mul_add(z * z, Self::splat(1f64)) + * z + + PI_BY_8; + let q1 = ((x1).lanes_gt(y1)).select(a, PI_BY_2 - a); + let q12 = ((x).lanes_ge(Self::splat(0.0))).select(q1, PI - q1); + ((y).lanes_ge(Self::splat(0.0))).select(q12, -q12) + } + #[inline] + fn exp2(self) -> Self { + let EXP2_MAX = Self::splat(1023.0f64); + let EXP2_MIN = -Self::splat(1023.0f64); + let EXP2_SCALE = Self::splat(4503599627370496.0f64); + let EXP2_ONE = Self::splat(4607182418800017408.0f64); + let INFINITY = Self::splat(f64::INFINITY); + let arg = self; + let r = arg.round(); + let mul = Self::from_bits((r.mul_add(EXP2_SCALE, EXP2_ONE)).cast::()); + let x = arg - r; + let y = (Self::splat(0.00000000044566754636398603f64)) + .mul_add(x, Self::splat(0.000000007075803175956986f64)) + .mul_add(x, Self::splat(0.00000010178051728089911f64)) + .mul_add(x, Self::splat(0.0000013215422480586546f64)) + .mul_add(x, Self::splat(0.000015252733853280778f64)) + .mul_add(x, Self::splat(0.00015403530485719912f64)) + .mul_add(x, Self::splat(0.0013333558146396002f64)) + .mul_add(x, Self::splat(0.009618129107567618f64)) + .mul_add(x, Self::splat(0.05550410866482166f64)) + .mul_add(x, Self::splat(0.2402265069591022f64)) + .mul_add(x, Self::splat(0.6931471805599452f64)) + .mul_add(x, Self::splat(1f64)) + * mul; + let y1 = ((arg).lanes_gt(EXP2_MAX)).select(INFINITY, y); + ((r).lanes_lt(EXP2_MIN)).select(Self::splat(0.0), y1) + } + #[inline] + fn exp(self) -> Self { + let LOG2_E =Self ::splat (1.442695040888963407359924681001892137426660660756662389692043961734752676409787833023288579071980359); + let arg = self; + (arg * LOG2_E).exp2() + } + #[inline] + fn exp_m1(self) -> Self { + let LOG2_E =Self ::splat (1.442695040888963407359924681001892137426660660756662389692043961734752676409787833023288579071980359); + let EXP2_SCALE = Self::splat(4503599627370496.0f64); + let EXP2_ONE = Self::splat(4607182418800017408.0f64); + let arg = self; + let scaled = arg * LOG2_E; + let r = scaled.round(); + let mul = Self::from_bits((r.mul_add(EXP2_SCALE, EXP2_ONE)).cast::()); + let x = scaled - r; + (Self::splat(0.00000000044566754636398603f64)) + .mul_add(x, Self::splat(0.000000007075803175956986f64)) + .mul_add(x, Self::splat(0.00000010178051728089911f64)) + .mul_add(x, Self::splat(0.0000013215422480586546f64)) + .mul_add(x, Self::splat(0.000015252733853280778f64)) + .mul_add(x, Self::splat(0.00015403530485719912f64)) + .mul_add(x, Self::splat(0.0013333558146396002f64)) + .mul_add(x, Self::splat(0.009618129107567618f64)) + .mul_add(x, Self::splat(0.05550410866482166f64)) + .mul_add(x, Self::splat(0.2402265069591022f64)) + .mul_add(x, Self::splat(0.6931471805599452f64)) + .mul_add(x, -Self::splat(0.0000000000000000061353609179806035f64)) + * mul + + (mul - Self::splat(1.0)) + } + #[inline] + fn log2(self) -> Self { + let ONE_BITS = Self::UintType::splat(0x3ff0000000000000_u64); + let ONE_MASK = Self::UintType::splat(0x000fffffffffffff_u64); + let MIN_POSITIVE = Self::splat(f64::MIN_POSITIVE); + let INFINITY = Self::splat(f64::INFINITY); + let NAN = Self::splat(f64::NAN); + let LOG2_OFFSET = Self::IntType::splat(1023_i64); + let LOG2_SHIFT = Self::IntType::splat(52_i64); + let arg = self; + let arg_bits = arg.to_bits(); + let exponent = (arg_bits.cast::() >> LOG2_SHIFT) - LOG2_OFFSET; + let x = Self::from_bits((arg_bits & ONE_MASK) | ONE_BITS) - Self::splat(1.5); + let y = (Self::splat(0.000059440811569894275f64)) + .mul_add(x, -Self::splat(0.00009384549305785918f64)) + .mul_add(x, Self::splat(0.00007056268243091807f64)) + .mul_add(x, -Self::splat(0.00011279762643562555f64)) + .mul_add(x, Self::splat(0.00022472329897976745f64)) + .mul_add(x, -Self::splat(0.00036098242513245754f64)) + .mul_add(x, Self::splat(0.0005692370613966115f64)) + .mul_add(x, -Self::splat(0.0009250629378630191f64)) + .mul_add(x, Self::splat(0.0015163928320102102f64)) + .mul_add(x, -Self::splat(0.002502038922613527f64)) + .mul_add(x, Self::splat(0.004169747986750192f64)) + .mul_add(x, -Self::splat(0.007036450720529529f64)) + .mul_add(x, Self::splat(0.01206251033391092f64)) + .mul_add(x, -Self::splat(0.021109393020516138f64)) + .mul_add(x, Self::splat(0.03799690641909651f64)) + .mul_add(x, -Self::splat(0.07124419953811696f64)) + .mul_add(x, Self::splat(0.14248839910020777f64)) + .mul_add(x, -Self::splat(0.3205988979754245f64)) + .mul_add(x, Self::splat(0.9617966939259754f64)) + .mul_add(x, Self::splat(0.5849625007211563f64)); + ((arg).lanes_lt(Self::splat(0.0))).select( + -NAN, + ((arg).lanes_lt(MIN_POSITIVE)).select(-INFINITY, y + (exponent.cast::())), + ) + } + #[inline] + fn ln_1p(self) -> Self { + let arg = self; + (Self::splat(1.0) + arg).ln() + } + #[inline] + fn ln(self) -> Self { + let RECIP_LOG2_E = Self::splat(0.6931471805599453094172321214581765680755); + let arg = self; + (arg).log2() * RECIP_LOG2_E + } + #[inline] + fn log10(self) -> Self { + let RECIP_LOG2_10 = Self::splat(0.3010299956639811952137388947244930267682); + let arg = self; + (arg).log2() * RECIP_LOG2_10 + } + #[inline] + fn log(self, base: Self) -> Self { + let arg = self; + (arg).log2() / (base).log2() + } + #[inline] + fn powf(self, y: Self) -> Self { + let arg = self; + ((arg).log2() * y).exp2() + } + #[inline] + fn powi(self, y: Self::IntType) -> Self { + let x = self; + (x).powf(y.cast::()) + } + #[inline] + fn cbrt(self) -> Self { + let TWO_THIRDS = Self::splat(0.666666666666666667f64); + let ONE_THIRD = Self::splat(0.333333333333333333f64); + let EXP2_ONE = Self::splat(4607182418800017408.0f64); + let x = self; + let r = Self::from_bits( + ((x.abs().to_bits().cast::()).mul_add(ONE_THIRD, EXP2_ONE * TWO_THIRDS)) + .cast::(), + ); + let r = r + (x.abs() - r * r * r) / (Self::splat(3.0) * r * r); + let r = r + (x.abs() - r * r * r) / (Self::splat(3.0) * r * r); + let r = r + (x.abs() - r * r * r) / (Self::splat(3.0) * r * r); + let r = r + (x.abs() - r * r * r) / (Self::splat(3.0) * r * r); + r.copysign(x) + } + #[inline] + fn hypot(self, y: Self) -> Self { + let MIN_POSITIVE = Self::splat(f64::MIN_POSITIVE); + let x = self; + let xgty = (x.abs()).lanes_gt(y.abs()); + let x2 = (xgty).select(x, y); + let y2 = (xgty).select(y, x); + ((x2.abs()).lanes_le(MIN_POSITIVE)).select( + x2, + x2.abs() * (Self::splat(1.0) + (y2 / x2) * (y2 / x2)).sqrt(), + ) + } + #[inline] + fn sin(self) -> Self { + let RECIP_2PI = Self::splat(0.1591549430918953357688837633725143620345); + let arg = self; + let scaled = arg * RECIP_2PI; + let x = scaled - scaled.round(); + (-Self::splat(0.00007959781355646816f64)) + .mul_add(x * x, Self::splat(0.0011251039233483632f64)) + .mul_add(x * x, -Self::splat(0.012029309381583758f64)) + .mul_add(x * x, Self::splat(0.10422859417031961f64)) + .mul_add(x * x, -Self::splat(0.718122207748485f64)) + .mul_add(x * x, Self::splat(3.8199525744232106f64)) + .mul_add(x * x, -Self::splat(15.094642576059076f64)) + .mul_add(x * x, Self::splat(42.058693944862014f64)) + .mul_add(x * x, -Self::splat(76.7058597530604f64)) + .mul_add(x * x, Self::splat(81.60524927607504f64)) + .mul_add(x * x, -Self::splat(41.34170224039976f64)) + .mul_add(x * x, Self::splat(6.283185307179586f64)) + * x + } + #[inline] + fn cos(self) -> Self { + let RECIP_2PI = Self::splat(0.1591549430918953357688837633725143620345); + let arg = self; + let scaled = arg * RECIP_2PI; + let x = scaled - scaled.round(); + (Self::splat(0.00002092503869007534f64)) + .mul_add(x * x, -Self::splat(0.0003214576104012376f64)) + .mul_add(x * x, Self::splat(0.003779202401314546f64)) + .mul_add(x * x, -Self::splat(0.03638267368288368f64)) + .mul_add(x * x, Self::splat(0.28200593868080975f64)) + .mul_add(x * x, -Self::splat(1.7143907074899654f64)) + .mul_add(x * x, Self::splat(7.903536371025055f64)) + .mul_add(x * x, -Self::splat(26.426256783358706f64)) + .mul_add(x * x, Self::splat(60.244641371876135f64)) + .mul_add(x * x, -Self::splat(85.45681720669371f64)) + .mul_add(x * x, Self::splat(64.9393940226683f64)) + .mul_add(x * x, -Self::splat(19.739208802178716f64)) + .mul_add(x * x, Self::splat(1f64)) + } + #[inline] + fn tan(self) -> Self { + let RECIP_PI = Self::splat(0.3183098861837906715377675267450287240689); + let arg = self; + let scaled = arg * RECIP_PI; + let x = scaled - scaled.round(); + let recip = Self::splat(1.0) / (x * x - Self::splat(0.25)); + let y = (Self::splat(0.00015634929503112444f64)) + .mul_add(x * x, Self::splat(0.00010749666907629025f64)) + .mul_add(x * x, Self::splat(0.00040923484089717195f64)) + .mul_add(x * x, Self::splat(0.0008549505315816931f64)) + .mul_add(x * x, Self::splat(0.0019412482440671268f64)) + .mul_add(x * x, Self::splat(0.004371782765072613f64)) + .mul_add(x * x, Self::splat(0.009879869124007078f64)) + .mul_add(x * x, Self::splat(0.02251293831770456f64)) + .mul_add(x * x, Self::splat(0.05263664423645279f64)) + .mul_add(x * x, Self::splat(0.13476940059037382f64)) + .mul_add(x * x, Self::splat(0.5577362635648092f64)) + .mul_add(x * x, -Self::splat(0.7853981633974483f64)) + * x; + y * recip + } +} diff --git a/crates/std_float/src/test_libm.rs b/crates/std_float/src/test_libm.rs new file mode 100644 index 00000000000..55bd9f266fa --- /dev/null +++ b/crates/std_float/src/test_libm.rs @@ -0,0 +1,715 @@ +const NUM_ITER: usize = 0x10000; + +macro_rules! test_vec { + ( + scalar_type: $scalar_type: ty, + vector_type: $vector_type: ty, + int_vector_type: $int_vector_type: ty, + scalar_fn: $scalar_fn: expr, + vector_fn: $vector_fn: expr, + limit: $limit: expr, + x: $x: expr, + ) => ({ + { + #![allow(non_camel_case_types)] + use crate::StdLibm; + type scalar_type = $scalar_type; + type vector_type = $vector_type; + let sf = $scalar_fn; + let vf = $vector_fn; + let yref = <$vector_type>::from_array([sf($x[0]), sf($x[1]), sf($x[2]), sf($x[3])]); + let y = vf($x); + let e = (y - yref); + // Note: is e is NaN then the comparison will fail. + let bad_val = e.abs().lanes_gt($limit).any(); + if bad_val || y.is_nan() != yref.is_nan() || y.is_infinite() != yref.is_infinite() { + panic!("\nx ={:20.16?}\ne ={:20.16?}\nlimit ={:20.16?}\nvector={:20.16?}\nscalar={:20.16?}\nvector={:020x?}\nscalar={:020x?}\nvector_fn={}", + $x, + e, + $limit, + y, yref, + y.to_bits(), yref.to_bits(), + stringify!($vector_fn) + ); + } + } + }); +} + +macro_rules! test_range { + ( + min: $min: expr, + max: $max: expr, + limit: $limit: expr, + scalar_fn: $scalar_fn: expr, + vector_fn: $vector_fn: expr, + ) => {{ + #![allow(non_camel_case_types)] + #![allow(dead_code)] + #![allow(clippy::approx_constant)] + type scalar_type = f32; + type vector_type = core_simd::f32x4; + type int_vector_type = core_simd::i32x4; + const PI: scalar_type = 3.1415926535897932384626433832795028841972; + + let limit = vector_type::splat($limit); + let b = (($max) - ($min)) * (1.0 / NUM_ITER as scalar_type); + let a = $min; + for i in (0..NUM_ITER / 4) { + let fi = (i * 4) as scalar_type; + let x = vector_type::from_array([ + (fi + 0.0) * b + a, + (fi + 1.0) * b + a, + (fi + 2.0) * b + a, + (fi + 3.0) * b + a, + ]); + test_vec!( + scalar_type: f32, + vector_type: core_simd::f32x4, + int_vector_type: core_simd::i32x4, + scalar_fn: $scalar_fn, + vector_fn: $vector_fn, + limit: limit, + x: x, + ) + } + } + { + #![allow(non_camel_case_types)] + #![allow(dead_code)] + #![allow(unused)] + #![allow(clippy::approx_constant)] + type scalar_type = f64; + type vector_type = core_simd::f64x4; + type int_vector_type = core_simd::i64x4; + const PI: scalar_type = 3.1415926535897932384626433832795028841972; + + let limit = vector_type::splat($limit); + let b = (($max) - ($min)) * (1.0 / NUM_ITER as scalar_type); + let a = $min; + for i in (0..NUM_ITER / 4) { + let fi = (i * 4) as scalar_type; + let x = vector_type::from_array([ + (fi + 0.0) * b + a, + (fi + 1.0) * b + a, + (fi + 2.0) * b + a, + (fi + 3.0) * b + a, + ]); + test_vec!( + scalar_type: f64, + vector_type: core_simd::f64x4, + int_vector_type: core_simd::i64x4, + scalar_fn: $scalar_fn, + vector_fn: $vector_fn, + limit: limit, + x: x, + ) + } + }}; + ( + value: $value: expr, + limit: $limit: expr, + scalar_fn: $scalar_fn: expr, + vector_fn: $vector_fn: expr, + ) => {{ + #![allow(non_camel_case_types)] + #![allow(dead_code)] + { + type scalar_type = f32; + type vector_type = core_simd::f32x4; + let limit = ::splat($limit); + let x = ::splat($value); + test_vec!( + scalar_type: f32, + vector_type: core_simd::f32x4, + int_vector_type: core_simd::i32x4, + scalar_fn: $scalar_fn, + vector_fn: $vector_fn, + limit: limit, + x: x, + ) + } + { + type scalar_type = f64; + type vector_type = core_simd::f64x4; + let limit = ::splat($limit); + let x = ::splat($value); + test_vec!( + scalar_type: f64, + vector_type: core_simd::f64x4, + int_vector_type: core_simd::i64x4, + scalar_fn: $scalar_fn, + vector_fn: $vector_fn, + limit: limit, + x: x, + ) + } + }}; +} + +#[test] +fn sin() { + test_range!( + min: -PI/4.0, + max: PI/4.0, + limit: scalar_type::EPSILON * 4.0, + scalar_fn: |x : scalar_type| x.sin(), + vector_fn: |x : vector_type| x.sin(), + ); + + test_range!( + min: -PI/2.0, + max: PI/2.0, + limit: scalar_type::EPSILON * 4.0, + scalar_fn: |x : scalar_type| x.sin(), + vector_fn: |x : vector_type| x.sin(), + ); + + test_range!( + min: -PI, + max: PI, + limit: scalar_type::EPSILON * 10.0, + scalar_fn: |x : scalar_type| x.sin(), + vector_fn: |x : vector_type| x.sin(), + ); +} + +#[test] +fn cos() { + // In the range +/- pi/4 the input has 1 ulp of error. + test_range!( + min: -PI/4.0, + max: PI/4.0, + limit: scalar_type::EPSILON * 2.0, + scalar_fn: |x : scalar_type| x.cos(), + vector_fn: |x : vector_type| x.cos(), + ); + + // In the range +/- pi/2 the input and output has 2 ulp of error. + test_range!( + min: -PI/2.0, + max: PI/2.0, + limit: scalar_type::EPSILON * 4.0, + scalar_fn: |x : scalar_type| x.cos(), + vector_fn: |x : vector_type| x.cos(), + ); + + // In the range +/- pi the input has 4 ulp of error and the output has 5. + // Note that the scalar cos also has this error but the implementation + // is different. + test_range!( + min: -PI, + max: PI, + limit: scalar_type::EPSILON * 10.0, + scalar_fn: |x : scalar_type| x.cos(), + vector_fn: |x : vector_type| x.cos(), + ); +} + +#[test] +fn tan() { + // For the outsides, reciprocal accuracy is important. + // Note that the vector function correctly gets -inf for -PI/2 + // but the scalar function does not. + test_range!( + min: -PI/2.0 + 0.00001, + max: -PI/4.0, + limit: scalar_type::EPSILON * 4.0, + scalar_fn: |x : scalar_type| x.tan().recip(), + vector_fn: |x : vector_type| x.tan().recip(), + ); + + // For the insides, absolute accuracy is important. + test_range!( + min: -PI/4.0, + max: PI/4.0, + limit: scalar_type::EPSILON * 3.0, + scalar_fn: |x : scalar_type| x.tan(), + vector_fn: |x : vector_type| x.tan(), + ); + + test_range!( + min: PI/4.0, + max: PI/2.0 - 0.00001, + limit: scalar_type::EPSILON * 4.0, + scalar_fn: |x : scalar_type| x.tan().recip(), + vector_fn: |x : vector_type| x.tan().recip(), + ); +} + +#[test] +fn asin() { + test_range!( + min: -1.0, + max: 1.0, + limit: scalar_type::EPSILON * 8.0, + scalar_fn: |x : scalar_type| x.asin(), + vector_fn: |x : vector_type| x.asin(), + ); + + test_range!( + min: -0.5, + max: 0.5, + limit: scalar_type::EPSILON * 2.0, + scalar_fn: |x : scalar_type| x.asin(), + vector_fn: |x : vector_type| x.asin(), + ); +} + +#[test] +fn atan() { + test_range!( + min: -1.0, + max: 1.0, + limit: scalar_type::EPSILON * 8.0, + scalar_fn: |x : scalar_type| x.atan(), + vector_fn: |x : vector_type| x.atan(), + ); + + test_range!( + min: -1.0, + max: 1.0, + limit: scalar_type::EPSILON * 8.0, + scalar_fn: |x : scalar_type| x.recip().atan(), + vector_fn: |x : vector_type| x.recip().atan(), + ); +} + +#[test] +fn acos() { + test_range!( + min: -1.0, + max: 1.0, + limit: scalar_type::EPSILON * 6.0, + scalar_fn: |x : scalar_type| x.acos(), + vector_fn: |x : vector_type| x.acos(), + ); +} + +#[test] +fn exp2() { + test_range!( + value: -126.0, + limit: scalar_type::EPSILON * 2.0, + scalar_fn: |x : scalar_type| x.exp2(), + vector_fn: |x : vector_type| x.exp2(), + ); + + test_range!( + value: 127.0, + limit: scalar_type::EPSILON * 2.0, + scalar_fn: |x : scalar_type| x.exp2(), + vector_fn: |x : vector_type| x.exp2(), + ); + + // Denormals not supported. + // + // test_range!( + // value: -127.0, + // limit: scalar_type::EPSILON * 2.0, + // scalar_fn: |x : scalar_type| x.exp2(), + // vector_fn: |x : vector_type| x.exp2(), + // ); + + // Large negatives give zero + test_range!( + value: -200.0, + limit: scalar_type::EPSILON * 2.0, + scalar_fn: |x : scalar_type| x.exp2(), + vector_fn: |x : vector_type| x.exp2(), + ); + + test_range!( + min: -1.0, + max: 1.0, + limit: scalar_type::EPSILON * 2.0, + scalar_fn: |x : scalar_type| x.exp2(), + vector_fn: |x : vector_type| x.exp2(), + ); + + // Accuracy is good over the entire range. + // (Range expanded because windows exp->log is less accurate) + test_range!( + min: -126.0, + max: 126.0, + limit: scalar_type::EPSILON * 4.0, + scalar_fn: |x : scalar_type| x.exp2().log2(), + vector_fn: |x : vector_type| x.exp2().log2(), + ); +} + +#[test] +fn exp() { + test_range!( + min: -2.0, + max: 0.0, + limit: scalar_type::EPSILON * 2.0, + scalar_fn: |x : scalar_type| x.exp(), + vector_fn: |x : vector_type| x.exp(), + ); + + test_range!( + min: 0.0, + max: 2.0, + limit: scalar_type::EPSILON * 8.0, + scalar_fn: |x : scalar_type| x.exp(), + vector_fn: |x : vector_type| x.exp(), + ); +} + +#[test] +fn log2() { + // Both should give NaN. + test_range!( + value: -1.0, + limit: scalar_type::EPSILON * 2.0, + scalar_fn: |x : scalar_type| x.log2(), + vector_fn: |x : vector_type| x.log2(), + ); + + // Both should give Inf. + test_range!( + value: 0.0, + limit: scalar_type::EPSILON * 2.0, + scalar_fn: |x : scalar_type| x.log2(), + vector_fn: |x : vector_type| x.log2(), + ); + + // Note that the std library may accept denormals. + test_range!( + value: scalar_type::MIN_POSITIVE, + limit: scalar_type::EPSILON * 2.0, + scalar_fn: |x : scalar_type| x.log2(), + vector_fn: |x : vector_type| x.log2(), + ); + + test_range!( + min: 1.0, + max: 2.0, + limit: scalar_type::EPSILON * 2.0, + scalar_fn: |x : scalar_type| x.log2(), + vector_fn: |x : vector_type| x.log2(), + ); + + test_range!( + min: 2.0, + max: 4.0, + limit: scalar_type::EPSILON * 2.0, + scalar_fn: |x : scalar_type| x.log2(), + vector_fn: |x : vector_type| x.log2(), + ); + + test_range!( + min: 4.0, + max: 8.0, + limit: scalar_type::EPSILON * 2.0, + scalar_fn: |x : scalar_type| x.log2(), + vector_fn: |x : vector_type| x.log2(), + ); +} + +#[test] +fn ln() { + // test_range!( + // value: -1.0, + // limit: scalar_type::EPSILON * 2.0, + // scalar_fn: |x : scalar_type| x.ln(), + // vector_fn: |x : vector_type| x.ln(), + // ); + + // test_range!( + // value: 0.0, + // limit: scalar_type::EPSILON * 2.0, + // scalar_fn: |x : scalar_type| x.ln(), + // vector_fn: |x : vector_type| x.ln(), + // ); + + // test_range!( + // value: scalar_type::MIN_POSITIVE, + // limit: scalar_type::EPSILON * 2.0, + // scalar_fn: |x : scalar_type| x.ln(), + // vector_fn: |x : vector_type| x.ln(), + // ); + + test_range!( + min: 1.0, + max: 2.0, + limit: scalar_type::EPSILON * 2.0, + scalar_fn: |x : scalar_type| x.ln(), + vector_fn: |x : vector_type| x.ln(), + ); + + test_range!( + min: 2.0, + max: 4.0, + limit: scalar_type::EPSILON * 2.0, + scalar_fn: |x : scalar_type| x.ln(), + vector_fn: |x : vector_type| x.ln(), + ); + + test_range!( + min: 4.0, + max: 8.0, + limit: scalar_type::EPSILON * 2.0, + scalar_fn: |x : scalar_type| x.ln(), + vector_fn: |x : vector_type| x.ln(), + ); +} + +#[test] +fn log10() { + test_range!( + min: 1.0, + max: 2.0, + limit: scalar_type::EPSILON * 2.0, + scalar_fn: |x : scalar_type| x.log10(), + vector_fn: |x : vector_type| x.log10(), + ); + + test_range!( + min: 2.0, + max: 4.0, + limit: scalar_type::EPSILON * 2.0, + scalar_fn: |x : scalar_type| x.log10(), + vector_fn: |x : vector_type| x.log10(), + ); + + test_range!( + min: 4.0, + max: 8.0, + limit: scalar_type::EPSILON * 2.0, + scalar_fn: |x : scalar_type| x.log10(), + vector_fn: |x : vector_type| x.log10(), + ); +} + +#[test] +fn ln_1p() { + test_range!( + min: 0.0, + max: 1.0, + limit: scalar_type::EPSILON * 2.0, + scalar_fn: |x : scalar_type| x.ln_1p(), + vector_fn: |x : vector_type| x.ln_1p(), + ); +} + +#[test] +fn log() { + test_range!( + min: 1.0, + max: 2.0, + limit: scalar_type::EPSILON * 2.0, + scalar_fn: |x : scalar_type| x.log(2.0), + vector_fn: |x : vector_type| x.log(vector_type::splat(2.0)), + ); +} + +#[test] +fn powf() { + test_range!( + min: 0.5, + max: 1.0, + limit: scalar_type::EPSILON * 2.0, + scalar_fn: |x : scalar_type| x.powf(2.0), + vector_fn: |x : vector_type| x.powf(vector_type::splat(2.0)), + ); + + test_range!( + min: 1.0, + max: 2.0, + limit: scalar_type::EPSILON * 5.0, + scalar_fn: |x : scalar_type| x.powf(2.0), + vector_fn: |x : vector_type| x.powf(vector_type::splat(2.0)), + ); +} + +#[test] +fn powi() { + test_range!( + min: 0.5, + max: 1.0, + limit: scalar_type::EPSILON * 2.0, + scalar_fn: |x : scalar_type| x.powi(2), + vector_fn: |x : vector_type| x.powi(int_vector_type::splat(2)), + ); + + test_range!( + min: 1.0, + max: 2.0, + limit: scalar_type::EPSILON * 5.0, + scalar_fn: |x : scalar_type| x.powi(2), + vector_fn: |x : vector_type| x.powi(int_vector_type::splat(2)), + ); +} + +#[test] +fn sinh() { + test_range!( + min: -1.0, + max: 1.0, + limit: scalar_type::EPSILON * 2.0, + scalar_fn: |x : scalar_type| x.sinh(), + vector_fn: |x : vector_type| x.sinh(), + ); +} + +#[test] +fn cosh() { + test_range!( + min: -1.0, + max: 1.0, + limit: scalar_type::EPSILON * 2.0, + scalar_fn: |x : scalar_type| x.cosh(), + vector_fn: |x : vector_type| x.cosh(), + ); +} + +#[test] +fn tanh() { + test_range!( + min: -1.0, + max: 1.0, + limit: scalar_type::EPSILON * 2.0, + scalar_fn: |x : scalar_type| x.tanh(), + vector_fn: |x : vector_type| x.tanh(), + ); +} + +#[test] +fn asinh() { + test_range!( + min: -1.0, + max: 1.0, + limit: scalar_type::EPSILON * 3.0, + scalar_fn: |x : scalar_type| x.asinh(), + vector_fn: |x : vector_type| x.asinh(), + ); +} + +#[test] +fn acosh() { + // Will be NAN in this range. + test_range!( + min: 0.0, + max: 1.0, + limit: scalar_type::EPSILON * 2.0, + scalar_fn: |x : scalar_type| x.acosh(), + vector_fn: |x : vector_type| x.acosh(), + ); + + test_range!( + min: -1.0, + max: 1.0, + limit: scalar_type::EPSILON * 2.0, + scalar_fn: |x : scalar_type| x.acosh(), + vector_fn: |x : vector_type| x.acosh(), + ); +} + +#[test] +fn atanh() { + test_range!( + value: -1.0, + limit: scalar_type::EPSILON * 2.0, + scalar_fn: |x : scalar_type| x.atanh(), + vector_fn: |x : vector_type| x.atanh(), + ); + + test_range!( + value: 1.0, + limit: scalar_type::EPSILON * 2.0, + scalar_fn: |x : scalar_type| x.atanh(), + vector_fn: |x : vector_type| x.atanh(), + ); + + test_range!( + min: -0.75, + max: 0.75, + limit: scalar_type::EPSILON * 2.0, + scalar_fn: |x : scalar_type| x.atanh(), + vector_fn: |x : vector_type| x.atanh(), + ); +} + +#[test] +fn cbrt() { + test_range!( + min: -8.0, + max: 8.0, + limit: scalar_type::EPSILON * 3.0, + scalar_fn: |x : scalar_type| x.cbrt(), + vector_fn: |x : vector_type| x.cbrt(), + ); +} + +#[test] +fn hypot() { + test_range!( + min: 0.0, + max: 8.0, + limit: scalar_type::EPSILON * 6.0, + scalar_fn: |x : scalar_type| x.cos().hypot(x.sin()), + vector_fn: |x : vector_type| x.cos().hypot(x.sin()), + ); + + // Large values will mostly not overflow. + // test_range!( + // value: scalar_type::MAX/8.0, + // limit: scalar_type::MAX * scalar_type::EPSILON * 6.0, + // scalar_fn: |x : scalar_type| x.hypot(x), + // vector_fn: |x : vector_type| x.hypot(x), + // ); + + // Except for MAX. + // test_range!( + // value: scalar_type::MAX, + // limit: scalar_type::EPSILON * 6.0, + // scalar_fn: |x : scalar_type| x.hypot(x), + // vector_fn: |x : vector_type| x.hypot(x), + // ); +} + +#[test] +fn atan2() { + // Studiously ignore -PI and PI where signs change erraticly. + test_range!( + min: -3.141, + max: 3.141, + limit: scalar_type::EPSILON * 8.0, + scalar_fn: |x : scalar_type| x.sin().atan2(x.cos()), + vector_fn: |x : vector_type| x.sin().atan2(x.cos()), + ); + + // East + test_range!( + value: 0.0, + limit: scalar_type::EPSILON * 6.0, + scalar_fn: |x : scalar_type| x.atan2(scalar_type::from(1.0)), + vector_fn: |x : vector_type| x.atan2(vector_type::splat(1.0)), + ); + + // West + test_range!( + value: 0.0, + limit: scalar_type::EPSILON * 6.0, + scalar_fn: |x : scalar_type| x.atan2(scalar_type::from(-1.0)), + vector_fn: |x : vector_type| x.atan2(vector_type::splat(-1.0)), + ); + + // North + test_range!( + value: 1.0, + limit: scalar_type::EPSILON * 6.0, + scalar_fn: |x : scalar_type| x.atan2(scalar_type::from(0.0)), + vector_fn: |x : vector_type| x.atan2(vector_type::splat(0.0)), + ); + + // South + test_range!( + value: -1.0, + limit: scalar_type::EPSILON * 6.0, + scalar_fn: |x : scalar_type| x.atan2(scalar_type::from(0.0)), + vector_fn: |x : vector_type| x.atan2(vector_type::splat(0.0)), + ); +}